diff --git a/example.py b/example.py index 85c0ae19..3917fbc6 100644 --- a/example.py +++ b/example.py @@ -674,7 +674,7 @@ def core(self, slothy): class ntt_kyber_1234_567(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72, timeout=None): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): name = "ntt_kyber_1234_567" infile = name @@ -724,7 +724,7 @@ def core(self, slothy): class ntt_kyber_1234(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "ntt_kyber_1234" infile = "ntt_kyber_1234_567" @@ -749,7 +749,7 @@ def core(self, slothy): class ntt_kyber_567(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72, timeout=None): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): name = "ntt_kyber_567" infile = "ntt_kyber_1234_567" @@ -1136,7 +1136,7 @@ def core(self, slothy): class ntt_dilithium_1234_5678(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72, timeout=None): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): name = f"ntt_dilithium_1234_5678" infile = name @@ -1226,7 +1226,7 @@ def core(self, slothy): class ntt_dilithium_1234(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "ntt_dilithium_1234" infile = "ntt_dilithium_1234_5678" @@ -1250,7 +1250,7 @@ def core(self, slothy): class ntt_dilithium_5678(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "ntt_dilithium_5678" infile = "ntt_dilithium_1234_5678" @@ -1399,6 +1399,7 @@ def main(): ntt_kyber_1234_567(), intt_kyber_123_4567(), intt_kyber_123_4567(var="manual_ld4"), + # 
Cortex-A72 ntt_kyber_123_4567(target=Target_CortexA72), ntt_kyber_123_4567(var="scalar_load", target=Target_CortexA72), @@ -1408,6 +1409,7 @@ def main(): ntt_kyber_1234_567(target=Target_CortexA72), intt_kyber_123_4567(target=Target_CortexA72), intt_kyber_123_4567(var="manual_ld4", target=Target_CortexA72), + # # Apple M1 Firestorm ntt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_firestorm, timeout=3600), @@ -1453,6 +1455,7 @@ def main(): intt_dilithium_123_45678(var="manual_ld4"), intt_dilithium_1234_5678(), intt_dilithium_1234_5678(var="manual_ld4"), + # Cortex-A72 ntt_dilithium_123_45678(target=Target_CortexA72), ntt_dilithium_123_45678(var="w_scalar", target=Target_CortexA72), @@ -1463,6 +1466,7 @@ def main(): intt_dilithium_123_45678(var="manual_ld4", target=Target_CortexA72), intt_dilithium_1234_5678(target=Target_CortexA72), intt_dilithium_1234_5678(var="manual_ld4", target=Target_CortexA72), + # Apple M1 Firestorm ntt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600), ntt_dilithium_123_45678(var="w_scalar", target=Target_AppleM1_firestorm, timeout=3600), diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678.s b/examples/naive/aarch64/intt_dilithium_1234_5678.s index 1e8008fe..ebe3df6a 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. 
-.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -132,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -334,12 +322,6 @@ _intt_dilithium_1234_5678: .p2align 2 
layer5678_start: - // manual_ld4 - // ldr_vo data0, inp, (16*0) - // ldr_vo data1, inp, (16*1) - // ldr_vo data2, inp, (16*2) - // ldr_vo data3, inp, (16*3) - // transpose4 data ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] @@ -363,10 +345,10 @@ layer5678_start: barrett_reduce_single data0 barrett_reduce_single data1 - str_vi data0, inp, (16*4) - str_vo data1, inp, (-16*4 + 1*16) - str_vo data2, inp, (-16*4 + 2*16) - str_vo data3, inp, (-16*4 + 3*16) + str qform_data0, [inp], #(16*4) + str qform_data1, [inp, #(-16*4 + 1*16)] + str qform_data2, [inp, #(-16*4 + 2*16)] + str qform_data3, [inp, #(-16*4 + 3*16)] // layer5678_end: subs count, count, #1 cbnz count, layer5678_start @@ -411,22 +393,22 @@ layer5678_start: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer4 gs_butterfly data0, data1, root3, 2, 3 @@ -477,14 +459,14 @@ layer1234_start: canonical_reduce data14, 
modulus_half, neg_modulus_half, t2, t3 canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - str_vo data8, in, (8*(512/8)) - str_vo data9, in, (9*(512/8)) - str_vo data10, in, (10*(512/8)) - str_vo data11, in, (11*(512/8)) - str_vo data12, in, (12*(512/8)) - str_vo data13, in, (13*(512/8)) - str_vo data14, in, (14*(512/8)) - str_vo data15, in, (15*(512/8)) + str qform_data8, [in, #(8*(512/8))] + str qform_data9, [in, #(9*(512/8))] + str qform_data10, [in, #(10*(512/8))] + str qform_data11, [in, #(11*(512/8))] + str qform_data12, [in, #(12*(512/8))] + str qform_data13, [in, #(13*(512/8))] + str qform_data14, [in, #(14*(512/8))] + str qform_data15, [in, #(15*(512/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the @@ -500,14 +482,14 @@ layer1234_start: canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s index 22d24757..f5851e31 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s @@ -35,18 +35,6 @@ // Eventually, 
NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -132,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 
5*16)] .endm .macro transpose4 data @@ -335,10 +323,10 @@ _intt_dilithium_1234_5678_manual_ld4: .p2align 2 layer5678_start: // manual_ld4 - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] transpose4 data load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 @@ -361,10 +349,10 @@ layer5678_start: barrett_reduce_single data0 barrett_reduce_single data1 - str_vi data0, inp, (16*4) - str_vo data1, inp, (-16*4 + 1*16) - str_vo data2, inp, (-16*4 + 2*16) - str_vo data3, inp, (-16*4 + 3*16) + str qform_data0, [inp], #(16*4) + str qform_data1, [inp, #(-16*4 + 1*16)] + str qform_data2, [inp, #(-16*4 + 2*16)] + str qform_data3, [inp, #(-16*4 + 3*16)] // layer5678_end: subs count, count, #1 cbnz count, layer5678_start @@ -409,22 +397,22 @@ layer5678_start: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, 
#(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer4 gs_butterfly data0, data1, root3, 2, 3 @@ -475,14 +463,14 @@ layer1234_start: canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - str_vo data8, in, (8*(512/8)) - str_vo data9, in, (9*(512/8)) - str_vo data10, in, (10*(512/8)) - str_vo data11, in, (11*(512/8)) - str_vo data12, in, (12*(512/8)) - str_vo data13, in, (13*(512/8)) - str_vo data14, in, (14*(512/8)) - str_vo data15, in, (15*(512/8)) + str qform_data8, [in, #(8*(512/8))] + str qform_data9, [in, #(9*(512/8))] + str qform_data10, [in, #(10*(512/8))] + str qform_data11, [in, #(11*(512/8))] + str qform_data12, [in, #(12*(512/8))] + str qform_data13, [in, #(13*(512/8))] + str qform_data14, [in, #(14*(512/8))] + str qform_data15, [in, #(15*(512/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the @@ -498,14 +486,14 @@ layer1234_start: canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/intt_dilithium_123_45678.s 
b/examples/naive/aarch64/intt_dilithium_123_45678.s index 598a1a9c..95d4105b 100644 --- a/examples/naive/aarch64/intt_dilithium_123_45678.s +++ b/examples/naive/aarch64/intt_dilithium_123_45678.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr 
qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -447,15 +433,15 @@ layer45678_start: // Standard way using vector instructions - str_vi data0, inp, (16*4) - str_vo data1, inp, (-16*4 + 1*16) - str_vo data2, inp, (-16*4 + 2*16) - str_vo data3, inp, (-16*4 + 3*16) + str qform_data0, [inp], #(16*4) + str qform_data1, [inp, #(-16*4 + 1*16)] + str qform_data2, [inp, #(-16*4 + 2*16)] + str qform_data3, [inp, #(-16*4 + 3*16)] - str_vi data4, 
inpp, (16*4) - str_vo data5, inpp, (-16*4 + 1*16) - str_vo data6, inpp, (-16*4 + 2*16) - str_vo data7, inpp, (-16*4 + 3*16) + str qform_data4, [inpp], #(16*4) + str qform_data5, [inpp, #(-16*4 + 1*16)] + str qform_data6, [inpp, #(-16*4 + 2*16)] + str qform_data7, [inpp, #(-16*4 + 3*16)] add inp, inp, #64 add inpp, inpp, #64 @@ -485,14 +471,14 @@ layer45678_start: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] gs_butterfly data0, data1, root1, 2, 3 gs_butterfly data2, data3, root2, 0, 1 @@ -515,10 +501,10 @@ layer123_start: canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 - str_vo data4, in, (4*(1024/8)) - str_vo data5, in, (5*(1024/8)) - str_vo data6, in, (6*(1024/8)) - str_vo data7, in, (7*(1024/8)) + str qform_data4, [in, #(4*(1024/8))] + str qform_data5, [in, #(5*(1024/8))] + str qform_data6, [in, #(6*(1024/8))] + str qform_data7, [in, #(7*(1024/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the @@ -530,10 +516,10 @@ layer123_start: canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + 
str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s index 69dc2c2a..bee3bb74 100644 --- a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s +++ b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in 
@@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -395,16 +381,16 @@ _intt_dilithium_123_45678_manual_ld4: .p2align 2 layer45678_start: // Manual ld4 using vector instructions - ldr_vo data0, inp, 0 - 
ldr_vo data1, inp, 16 - ldr_vo data2, inp, 32 - ldr_vo data3, inp, 48 + ldr qform_data0, [inp, #0] + ldr qform_data1, [inp, #16] + ldr qform_data2, [inp, #32] + ldr qform_data3, [inp, #48] transpose4 data0, data1, data2, data3 - ldr_vo data4, inpp, 0 - ldr_vo data5, inpp, 16 - ldr_vo data6, inpp, 32 - ldr_vo data7, inpp, 48 + ldr qform_data4, [inpp, #0] + ldr qform_data5, [inpp, #16] + ldr qform_data6, [inpp, #32] + ldr qform_data7, [inpp, #48] transpose4 data4, data5, data6, data7 load_roots_78_part1 @@ -456,15 +442,15 @@ layer45678_start: // Standard way using vector instructions - str_vi data0, inp, (16*4) - str_vo data1, inp, (-16*4 + 1*16) - str_vo data2, inp, (-16*4 + 2*16) - str_vo data3, inp, (-16*4 + 3*16) + str qform_data0, [inp], #(16*4) + str qform_data1, [inp, #(-16*4 + 1*16)] + str qform_data2, [inp, #(-16*4 + 2*16)] + str qform_data3, [inp, #(-16*4 + 3*16)] - str_vi data4, inpp, (16*4) - str_vo data5, inpp, (-16*4 + 1*16) - str_vo data6, inpp, (-16*4 + 2*16) - str_vo data7, inpp, (-16*4 + 3*16) + str qform_data4, [inpp], #(16*4) + str qform_data5, [inpp, #(-16*4 + 1*16)] + str qform_data6, [inpp, #(-16*4 + 2*16)] + str qform_data7, [inpp, #(-16*4 + 3*16)] add inp, inp, #64 add inpp, inpp, #64 @@ -494,14 +480,14 @@ layer45678_start: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] gs_butterfly data0, data1, root1, 2, 3 gs_butterfly data2, data3, root2, 0, 1 @@ -524,10 +510,10 @@ layer123_start: canonical_reduce data6, 
modulus_half, neg_modulus_half, t2, t3 canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 - str_vo data4, in, (4*(1024/8)) - str_vo data5, in, (5*(1024/8)) - str_vo data6, in, (6*(1024/8)) - str_vo data7, in, (7*(1024/8)) + str qform_data4, [in, #(4*(1024/8))] + str qform_data5, [in, #(5*(1024/8))] + str qform_data6, [in, #(6*(1024/8))] + str qform_data7, [in, #(7*(1024/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the @@ -539,10 +525,10 @@ layer123_start: canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/intt_kyber_123_4567.s b/examples/naive/aarch64/intt_kyber_123_4567.s index 9cb0b6c4..61a3c52a 100644 --- a/examples/naive/aarch64/intt_kyber_123_4567.s +++ b/examples/naive/aarch64/intt_kyber_123_4567.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. 
-.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -104,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -355,10 +340,10 @@ _intt_kyber_123_4567: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] transpose4 data // manual ld4 @@ -388,10 +373,10 @@ layer4567_start: gs_butterfly data0, data2, root0, 0, 1 gs_butterfly data1, data3, root0, 0, 1 - str_vi data0, inp, (64) - str_vo data1, inp, (-64 + 16*1) - str_vo data2, inp, (-64 + 16*2) - str_vo data3, inp, (-64 + 16*3) + str qform_data0, [inp], #(64) + str qform_data1, [inp, #(-64 + 16*1)] + str qform_data2, [inp, #(-64 + 16*2)] + str qform_data3, [inp, #(-64 + 16*3)] subs count, count, #1 cbnz count, layer4567_start @@ 
-414,14 +399,14 @@ layer4567_start: layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] gs_butterfly data0, data1, root0, 6, 7 gs_butterfly data2, data3, root1, 0, 1 @@ -438,20 +423,20 @@ layer123_start: gs_butterfly data2, data6, root0, 0, 1 gs_butterfly data3, data7, root0, 0, 1 - str_vo data4, in, (4*(512/8)) - str_vo data5, in, (5*(512/8)) - str_vo data6, in, (6*(512/8)) - str_vo data7, in, (7*(512/8)) + str qform_data4, [in, #(4*(512/8))] + str qform_data5, [in, #(5*(512/8))] + str qform_data6, [in, #(6*(512/8))] + str qform_data7, [in, #(7*(512/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the // last layer. 
mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] subs count, count, #1 diff --git a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s index 2bcc941d..1b659511 100644 --- a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s +++ b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -104,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro 
transpose4 data @@ -383,10 +368,10 @@ layer4567_start: gs_butterfly data0, data2, root0, 0, 1 gs_butterfly data1, data3, root0, 0, 1 - str_vi data0, inp, (64) - str_vo data1, inp, (-64 + 16*1) - str_vo data2, inp, (-64 + 16*2) - str_vo data3, inp, (-64 + 16*3) + str qform_data0, [inp], #(64) + str qform_data1, [inp, #(-64 + 16*1)] + str qform_data2, [inp, #(-64 + 16*2)] + str qform_data3, [inp, #(-64 + 16*3)] subs count, count, #1 cbnz count, layer4567_start @@ -409,14 +394,14 @@ layer4567_start: layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] gs_butterfly data0, data1, root0, 6, 7 gs_butterfly data2, data3, root1, 0, 1 @@ -433,20 +418,20 @@ layer123_start: gs_butterfly data2, data6, root0, 0, 1 gs_butterfly data3, data7, root0, 0, 1 - str_vo data4, in, (4*(512/8)) - str_vo data5, in, (5*(512/8)) - str_vo data6, in, (6*(512/8)) - str_vo data7, in, (7*(512/8)) + str qform_data4, [in, #(4*(512/8))] + str qform_data5, [in, #(5*(512/8))] + str qform_data6, [in, #(6*(512/8))] + str qform_data7, [in, #(7*(512/8))] // Scale half the coeffs by 1/n; for the other half, the scaling has // been merged into the multiplication with the twiddle factor on the // last layer. 
mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] subs count, count, #1 diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678.s b/examples/naive/aarch64/ntt_dilithium_1234_5678.s index c607f61a..1b85e1cd 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -92,31 +71,31 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -318,22 +297,22 @@ _ntt_dilithium_1234_5678: .p2align 2 
layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer 1 ct_butterfly data0, data8, root0, 0, 1 @@ -375,22 +354,22 @@ layer1234_start: ct_butterfly data12, data13, root6, 2, 3 ct_butterfly data14, data15, root7, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) - str_vo data8, in, (-16 + 8*(512/8)) - str_vo data9, in, (-16 + 9*(512/8)) - str_vo data10, in, (-16 + 10*(512/8)) - str_vo data11, in, (-16 + 11*(512/8)) - str_vo data12, in, (-16 + 12*(512/8)) - str_vo data13, in, (-16 + 13*(512/8)) - str_vo data14, in, (-16 + 14*(512/8)) - str_vo data15, in, (-16 + 15*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str 
qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] + str qform_data8, [in, #(-16 + 8*(512/8))] + str qform_data9, [in, #(-16 + 9*(512/8))] + str qform_data10, [in, #(-16 + 10*(512/8))] + str qform_data11, [in, #(-16 + 11*(512/8))] + str qform_data12, [in, #(-16 + 12*(512/8))] + str qform_data13, [in, #(-16 + 13*(512/8))] + str qform_data14, [in, #(-16 + 14*(512/8))] + str qform_data15, [in, #(-16 + 15*(512/8))] // layer1234_end: subs count, count, #1 cbnz count, layer1234_start @@ -417,10 +396,10 @@ layer1234_start: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_56 root0, r_ptr0 load_next_roots_6 root1, r_ptr0 diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s index 2f5d42a8..ac2f52d0 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -91,38 +70,38 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - 
str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data0, data1, data2, data3 @@ -291,22 +270,22 @@ _ntt_dilithium_1234_5678_manual_st4: .p2align 2 layer1234_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) - ldr_vo data8, in, (8*(512/8)) - ldr_vo data9, in, (9*(512/8)) - ldr_vo data10, in, (10*(512/8)) - ldr_vo data11, in, (11*(512/8)) - ldr_vo data12, in, (12*(512/8)) - ldr_vo data13, in, (13*(512/8)) - ldr_vo data14, in, (14*(512/8)) - ldr_vo data15, in, (15*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] + ldr qform_data8, [in, #(8*(512/8))] + ldr qform_data9, [in, #(9*(512/8))] + ldr qform_data10, [in, #(10*(512/8))] + ldr qform_data11, [in, #(11*(512/8))] + ldr qform_data12, [in, #(12*(512/8))] + ldr qform_data13, [in, #(13*(512/8))] + ldr qform_data14, [in, #(14*(512/8))] + ldr qform_data15, [in, #(15*(512/8))] // layer 1 ct_butterfly data0, data8, root0, 0, 1 @@ -348,22 +327,22 @@ layer1234_start: ct_butterfly data12, data13, root6, 2, 3 ct_butterfly data14, data15, root7, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) 
- str_vo data8, in, (-16 + 8*(512/8)) - str_vo data9, in, (-16 + 9*(512/8)) - str_vo data10, in, (-16 + 10*(512/8)) - str_vo data11, in, (-16 + 11*(512/8)) - str_vo data12, in, (-16 + 12*(512/8)) - str_vo data13, in, (-16 + 13*(512/8)) - str_vo data14, in, (-16 + 14*(512/8)) - str_vo data15, in, (-16 + 15*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] + str qform_data8, [in, #(-16 + 8*(512/8))] + str qform_data9, [in, #(-16 + 9*(512/8))] + str qform_data10, [in, #(-16 + 10*(512/8))] + str qform_data11, [in, #(-16 + 11*(512/8))] + str qform_data12, [in, #(-16 + 12*(512/8))] + str qform_data13, [in, #(-16 + 13*(512/8))] + str qform_data14, [in, #(-16 + 14*(512/8))] + str qform_data15, [in, #(-16 + 15*(512/8))] // layer1234_end: subs count, count, #1 cbnz count, layer1234_start @@ -390,10 +369,10 @@ layer1234_start: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_56 root0, r_ptr0 load_next_roots_6 root1, r_ptr0 diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678.s b/examples/naive/aarch64/ntt_dilithium_123_45678.s index b460c4cf..fdd8b9b6 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. 
the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -83,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -130,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 
+ ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -371,14 +349,14 @@ _ntt_dilithium_123_45678: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, 
#(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -395,14 +373,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s index 75c1274b..01b06ce0 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -83,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -130,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, 
r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -371,14 +349,14 @@ _ntt_dilithium_123_45678_manual_st4: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr 
qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -395,14 +373,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s index e8c63491..880f9041 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s @@ -2,32 +2,10 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -84,49 +62,49 @@ .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, 
r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -339,14 +317,14 @@ _ntt_dilithium_123_45678_red: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -363,14 +341,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, 
(-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s index 209d6473..fa9dee01 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s @@ -2,41 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -93,24 +65,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -140,35 +112,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr 
qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -381,14 +353,14 @@ _ntt_dilithium_123_45678_w_scalar: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, 
[in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -405,14 +377,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s index 519251cc..48143723 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s @@ -2,37 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -89,17 +65,17 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -129,35 +105,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, 
r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -370,14 +346,14 @@ _ntt_dilithium_123_45678_w_scalar_red: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(1024/8))] + ldr qform_data2, [in, #(2*(1024/8))] + ldr qform_data3, [in, #(3*(1024/8))] + ldr qform_data4, [in, #(4*(1024/8))] + ldr qform_data5, [in, #(5*(1024/8))] + ldr qform_data6, [in, #(6*(1024/8))] + ldr qform_data7, [in, #(7*(1024/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -394,14 +370,14 @@ layer123_start: ct_butterfly data4, data5, root2, 2, 3 ct_butterfly data6, data7, root3, 0, 1 - str_vi data0, in, (16) - str_vo 
data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(1024/8))] + str qform_data2, [in, #(-16 + 2*(1024/8))] + str qform_data3, [in, #(-16 + 3*(1024/8))] + str qform_data4, [in, #(-16 + 4*(1024/8))] + str qform_data5, [in, #(-16 + 5*(1024/8))] + str qform_data6, [in, #(-16 + 6*(1024/8))] + str qform_data7, [in, #(-16 + 7*(1024/8))] subs count, count, #1 cbnz count, layer123_start diff --git a/examples/naive/aarch64/ntt_kyber_1234_567.s b/examples/naive/aarch64/ntt_kyber_1234_567.s index 7c211a3b..49146c8f 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567.s @@ -26,33 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- .macro trn1_s d,a,b trn1 \d\().4s, \a\().4s, \b\().4s .endm .macro trn2_s d,a,b trn2 \d\().4s, \a\().4s, \b\().4s .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -103,21 +82,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -347,23 +326,23 @@ _ntt_kyber_1234_567: .p2align 2 layer1234_start: - ldr_vo data0, src0, 0 - ldr_vo data1, src0, 1*32 - ldr_vo data2, src0, 2*32 - ldr_vo data3, src0, 3*32 - ldr_vo data4, src0, 4*32 - ldr_vo data5, src0, 5*32 - ldr_vo data6, src0, 6*32 - ldr_vo data7, src0, 7*32 - - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr qform_data0, [src0, #0] + ldr qform_data1, [src0, #1*32] + ldr qform_data2, 
[src0, #2*32] + ldr qform_data3, [src0, #3*32] + ldr qform_data4, [src0, #4*32] + ldr qform_data5, [src0, #5*32] + ldr qform_data6, [src0, #6*32] + ldr qform_data7, [src0, #7*32] + + ldr qform_data8, [src8, #0] + ldr qform_data9, [src8, #1*32] + ldr qform_data10, [src8, #2*32] + ldr qform_data11, [src8, #3*32] + ldr qform_data12, [src8, #4*32] + ldr qform_data13, [src8, #5*32] + ldr qform_data14, [src8, #6*32] + ldr qform_data15, [src8, #7*32] ct_butterfly data0, data8, root0, 0, 1 ct_butterfly data1, data9, root0, 0, 1 @@ -401,23 +380,23 @@ layer1234_start: ct_butterfly data12, data13, root3, 2, 3 ct_butterfly data14, data15, root3, 4, 5 - str_vi data0, src0, 16 - str_vo data1, src0, -16+1*32 - str_vo data2, src0, -16+2*32 - str_vo data3, src0, -16+3*32 - str_vo data4, src0, -16+4*32 - str_vo data5, src0, -16+5*32 - str_vo data6, src0, -16+6*32 - str_vo data7, src0, -16+7*32 - - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str qform_data0, [src0], #16 + str qform_data1, [src0, #-16+1*32] + str qform_data2, [src0, #-16+2*32] + str qform_data3, [src0, #-16+3*32] + str qform_data4, [src0, #-16+4*32] + str qform_data5, [src0, #-16+5*32] + str qform_data6, [src0, #-16+6*32] + str qform_data7, [src0, #-16+7*32] + + str qform_data8, [src8], #16 + str qform_data9, [src8, #-16+1*32] + str qform_data10, [src8, #-16+2*32] + str qform_data11, [src8, #-16+3*32] + str qform_data12, [src8, #-16+4*32] + str qform_data13, [src8, #-16+5*32] + str qform_data14, [src8, #-16+6*32] + str qform_data15, [src8, #-16+7*32] subs count, count, #1 cbnz count, layer1234_start @@ -446,21 +425,21 @@ layer567_start: trn2_s data7, data11, data15 // load twiddle factors - ldr_vi root0, r_ptr1, 16*14 - ldr_vo root0_tw, r_ptr1, -16*14+16*1 - ldr_vo root1, r_ptr1, -16*14+16*2 - ldr_vo root1_tw, r_ptr1, 
-16*14+16*3 - ldr_vo root2, r_ptr1, -16*14+16*4 - ldr_vo root2_tw, r_ptr1, -16*14+16*5 - ldr_vo root3, r_ptr1, -16*14+16*6 - ldr_vo root3_tw, r_ptr1, -16*14+16*7 - - ldr_vo data8, r_ptr1, -16*14+16*8 - ldr_vo data9, r_ptr1, -16*14+16*9 - ldr_vo data10, r_ptr1, -16*14+16*10 - ldr_vo data11, r_ptr1, -16*14+16*11 - ldr_vo data12, r_ptr1, -16*14+16*12 - ldr_vo data13, r_ptr1, -16*14+16*13 + ldr qform_root0, [ r_ptr1], #16*14 + ldr qform_root0_tw, [r_ptr1, #-16*14+16*1] + ldr qform_root1, [ r_ptr1, #-16*14+16*2] + ldr qform_root1_tw, [r_ptr1, #-16*14+16*3] + ldr qform_root2, [ r_ptr1, #-16*14+16*4] + ldr qform_root2_tw, [r_ptr1, #-16*14+16*5] + ldr qform_root3, [ r_ptr1, #-16*14+16*6] + ldr qform_root3_tw, [r_ptr1, #-16*14+16*7] + + ldr qform_data8, [ r_ptr1, #-16*14+16*8] + ldr qform_data9, [ r_ptr1, #-16*14+16*9] + ldr qform_data10, [ r_ptr1, #-16*14+16*10] + ldr qform_data11, [ r_ptr1, #-16*14+16*11] + ldr qform_data12, [ r_ptr1, #-16*14+16*12] + ldr qform_data13, [ r_ptr1, #-16*14+16*13] // butterflies ct_butterfly_v data0, data4, root0, root0_tw diff --git a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s index c0dc85fd..f5c6a8ea 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- .macro trn1_s d,a,b trn1 \d\().4s, \a\().4s, \b\().4s .endm @@ -42,18 +33,6 @@ trn2 \d\().4s, \a\().4s, \b\().4s .endm -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -104,21 +83,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -134,10 +113,10 @@ .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro save_gprs // @slothy:no-unfold @@ -355,23 +334,23 @@ _ntt_kyber_1234_567: .p2align 2 layer1234_start: - ldr_vo data0, src0, 0 - ldr_vo data1, src0, 1*32 - ldr_vo 
data2, src0, 2*32 - ldr_vo data3, src0, 3*32 - ldr_vo data4, src0, 4*32 - ldr_vo data5, src0, 5*32 - ldr_vo data6, src0, 6*32 - ldr_vo data7, src0, 7*32 - - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr qform_data0, [src0, #0] + ldr qform_data1, [src0, #1*32] + ldr qform_data2, [src0, #2*32] + ldr qform_data3, [src0, #3*32] + ldr qform_data4, [src0, #4*32] + ldr qform_data5, [src0, #5*32] + ldr qform_data6, [src0, #6*32] + ldr qform_data7, [src0, #7*32] + + ldr qform_data8, [src8, #0] + ldr qform_data9, [src8, #1*32] + ldr qform_data10, [src8, #2*32] + ldr qform_data11, [src8, #3*32] + ldr qform_data12, [src8, #4*32] + ldr qform_data13, [src8, #5*32] + ldr qform_data14, [src8, #6*32] + ldr qform_data15, [src8, #7*32] ct_butterfly data0, data8, root0, 0, 1 ct_butterfly data1, data9, root0, 0, 1 @@ -409,23 +388,23 @@ layer1234_start: ct_butterfly data12, data13, root3, 2, 3 ct_butterfly data14, data15, root3, 4, 5 - str_vi data0, src0, 16 - str_vo data1, src0, -16+1*32 - str_vo data2, src0, -16+2*32 - str_vo data3, src0, -16+3*32 - str_vo data4, src0, -16+4*32 - str_vo data5, src0, -16+5*32 - str_vo data6, src0, -16+6*32 - str_vo data7, src0, -16+7*32 - - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str qform_data0, [src0], #16 + str qform_data1, [src0, #-16+1*32] + str qform_data2, [src0, #-16+2*32] + str qform_data3, [src0, #-16+3*32] + str qform_data4, [src0, #-16+4*32] + str qform_data5, [src0, #-16+5*32] + str qform_data6, [src0, #-16+6*32] + str qform_data7, [src0, #-16+7*32] + + str qform_data8, [src8], #16 + str qform_data9, [src8, #-16+1*32] + str qform_data10, [src8, #-16+2*32] + str 
qform_data11, [src8, #-16+3*32] + str qform_data12, [src8, #-16+4*32] + str qform_data13, [src8, #-16+5*32] + str qform_data14, [src8, #-16+6*32] + str qform_data15, [src8, #-16+7*32] subs count, count, #1 cbnz count, layer1234_start @@ -454,21 +433,21 @@ layer567_start: trn2_s data7, data11, data15 // load twiddle factors - ldr_vi root0, r_ptr1, 16*14 - ldr_vo root0_tw, r_ptr1, -16*14+16*1 - ldr_vo root1, r_ptr1, -16*14+16*2 - ldr_vo root1_tw, r_ptr1, -16*14+16*3 - ldr_vo root2, r_ptr1, -16*14+16*4 - ldr_vo root2_tw, r_ptr1, -16*14+16*5 - ldr_vo root3, r_ptr1, -16*14+16*6 - ldr_vo root3_tw, r_ptr1, -16*14+16*7 - - ldr_vo data8, r_ptr1, -16*14+16*8 - ldr_vo data9, r_ptr1, -16*14+16*9 - ldr_vo data10, r_ptr1, -16*14+16*10 - ldr_vo data11, r_ptr1, -16*14+16*11 - ldr_vo data12, r_ptr1, -16*14+16*12 - ldr_vo data13, r_ptr1, -16*14+16*13 + ldr qform_root0, [ r_ptr1], #16*14 + ldr qform_root0_tw, [r_ptr1, #-16*14+16*1] + ldr qform_root1, [ r_ptr1, #-16*14+16*2] + ldr qform_root1_tw, [r_ptr1, #-16*14+16*3] + ldr qform_root2, [ r_ptr1, #-16*14+16*4] + ldr qform_root2_tw, [r_ptr1, #-16*14+16*5] + ldr qform_root3, [ r_ptr1, #-16*14+16*6] + ldr qform_root3_tw, [r_ptr1, #-16*14+16*7] + + ldr qform_data8, [ r_ptr1, #-16*14+16*8] + ldr qform_data9, [ r_ptr1, #-16*14+16*9] + ldr qform_data10, [ r_ptr1, #-16*14+16*10] + ldr qform_data11, [ r_ptr1, #-16*14+16*11] + ldr qform_data12, [ r_ptr1, #-16*14+16*12] + ldr qform_data13, [ r_ptr1, #-16*14+16*13] // butterflies ct_butterfly_v data0, data4, root0, root0_tw diff --git a/examples/naive/aarch64/ntt_kyber_123_4567.s b/examples/naive/aarch64/ntt_kyber_123_4567.s index b6bc21a7..f72f8a0c 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which 
are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -97,21 +74,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -326,14 +303,14 @@ _ntt_kyber_123_4567: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr 
qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -350,14 +327,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -367,10 +344,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s index c699d9ab..51ab6892 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. 
-// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -97,28 +73,28 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data @@ -333,14 +309,14 @@ _ntt_kyber_123_4567_manual_st4: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo 
data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -357,14 +333,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -374,10 +350,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s index face162b..6e20512e 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s @@ -26,42 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by 
Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -109,21 +79,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -338,14 +308,14 @@ _ntt_kyber_123_4567_scalar_load: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, 
(4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -362,14 +332,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -379,10 +349,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s index 90beaa50..82e46502 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We 
use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane @@ -45,27 +36,6 @@ xtmp1 .req x11 umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -113,21 +83,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -365,14 +335,14 @@ _ntt_kyber_123_4567_scalar_load_store: .p2align 2 
layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -389,14 +359,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -406,10 +376,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s index 523fe947..18030440 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s +++ 
b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -96,21 +73,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -352,14 +329,14 @@ _ntt_kyber_123_4567_scalar_store: .p2align 2 layer123_start: - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(512/8)) - ldr_vo data2, in, (2*(512/8)) - ldr_vo data3, in, (3*(512/8)) - ldr_vo data4, in, (4*(512/8)) - 
ldr_vo data5, in, (5*(512/8)) - ldr_vo data6, in, (6*(512/8)) - ldr_vo data7, in, (7*(512/8)) + ldr qform_data0, [in, #0] + ldr qform_data1, [in, #(1*(512/8))] + ldr qform_data2, [in, #(2*(512/8))] + ldr qform_data3, [in, #(3*(512/8))] + ldr qform_data4, [in, #(4*(512/8))] + ldr qform_data5, [in, #(5*(512/8))] + ldr qform_data6, [in, #(6*(512/8))] + ldr qform_data7, [in, #(7*(512/8))] ct_butterfly data0, data4, root0, 0, 1 ct_butterfly data1, data5, root0, 0, 1 @@ -376,14 +353,14 @@ layer123_start: ct_butterfly data4, data5, root1, 2, 3 ct_butterfly data6, data7, root1, 4, 5 - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(512/8)) - str_vo data2, in, (-16 + 2*(512/8)) - str_vo data3, in, (-16 + 3*(512/8)) - str_vo data4, in, (-16 + 4*(512/8)) - str_vo data5, in, (-16 + 5*(512/8)) - str_vo data6, in, (-16 + 6*(512/8)) - str_vo data7, in, (-16 + 7*(512/8)) + str qform_data0, [in], #(16) + str qform_data1, [in, #(-16 + 1*(512/8))] + str qform_data2, [in, #(-16 + 2*(512/8))] + str qform_data3, [in, #(-16 + 3*(512/8))] + str qform_data4, [in, #(-16 + 4*(512/8))] + str qform_data5, [in, #(-16 + 5*(512/8))] + str qform_data6, [in, #(-16 + 6*(512/8))] + str qform_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 cbnz count, layer123_start @@ -393,10 +370,10 @@ layer123_start: .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr qform_data0, [inp, #(16*0)] + ldr qform_data1, [inp, #(16*1)] + ldr qform_data2, [inp, #(16*2)] + ldr qform_data3, [inp, #(16*3)] load_next_roots_45 diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s index 568e8ff2..e41943da 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for 
AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, 
\r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs 
save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,590 +321,623 @@ _intt_dilithium_1234_5678_manual_ld4_opt_a72: mov count, #16 .p2align 2 - ldr q4, [x0, #48] // .*................................................. - ldr q2, [x0, #32] // *.................................................. - // gap // ................................................... - ldr q22, [x0, #0] // ..*................................................ - ldr q3, [x0, #16] // ...*............................................... - // gap // ................................................... - ldr q30, [x4], #8 // ........................................*.......... - ldr q10, [x3, #80] // ......*............................................ - // gap // ................................................... - ldr q12, [x3, #32] // ....*.............................................. - // gap // ................................................... - // gap // ................................................... - trn2 v14.4S, v2.4S, v4.4S // ...........*....................................... - trn1 v2.4S, v2.4S, v4.4S // ........*.......................................... - ldr q19, [x3, #48] // .............*..................................... - trn2 v0.4S, v22.4S, v3.4S // ..........*........................................ - trn1 v25.4S, v22.4S, v3.4S // .........*......................................... - ldr q7, [x4], #16 // ..........................................*........ 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - trn2 v5.2D, v0.2D, v14.2D // ...............*................................... - trn2 v11.2D, v25.2D, v2.2D // .................*................................. - // gap // ................................................... - trn1 v28.2D, v25.2D, v2.2D // ............*...................................... - trn1 v3.2D, v0.2D, v14.2D // ..............*.................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v15.4S, v11.4S, v5.4S // ....................*.............................. - // gap // ................................................... - // gap // ................................................... - add v8.4S, v28.4S, v3.4S // ...................*............................... - // gap // ................................................... - // gap // ................................................... - sub v2.4S, v28.4S, v3.4S // ................*.................................. - // gap // ................................................... - // gap // ................................................... - sqrdmulh v13.4S, v15.4S, v10.4S // .......................*........................... - ldr q10, [x3, #64] // .....*............................................. - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - sqrdmulh v18.4S, v2.4S, v19.4S // .....................*............................. - // gap // ................................................... - // gap // ................................................... - add v28.4S, v11.4S, v5.4S // ......................*............................ - // gap // ................................................... - // gap // ................................................... - mul v2.4S, v2.4S, v12.4S // ..................*................................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v15.4S, v15.4S, v10.4S // .........................*......................... - ldr q10, [x3, #16] // .......*........................................... - sub v3.4S, v8.4S, v28.4S // ..........................*........................ - add v31.4S, v8.4S, v28.4S // ........................*.......................... - // gap // ................................................... - // gap // ................................................... - mls v2.4S, v18.4S, v29.4S // ...........................*....................... - ldr q18, [x3], #(6*16) // .............................*..................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v15.4S, v13.4S, v29.4S // ............................*...................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sqrdmulh v22.4S, v3.4S, v10.4S // ..............................*.................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v21.4S, v3.4S, v18.4S // ................................*.................. - // gap // ................................................... - // gap // ................................................... - sub v5.4S, v2.4S, v15.4S // ...............................*................... - // gap // ................................................... - // gap // ................................................... - add v11.4S, v2.4S, v15.4S // ....................................*.............. - // gap // ................................................... - // gap // ................................................... - mls v21.4S, v22.4S, v29.4S // ..................................*................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sqrdmulh v2.4S, v5.4S, v10.4S // .................................*................. - trn1 v0.4S, v31.4S, v11.4S // .......................................*........... - // gap // ................................................... 
- trn2 v11.4S, v31.4S, v11.4S // ......................................*............ - // gap // ................................................... - // gap // ................................................... - mul v18.4S, v5.4S, v18.4S // ...................................*............... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v18.4S, v2.4S, v29.4S // .....................................*............. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - trn2 v2.4S, v21.4S, v18.4S // ...........................................*....... - trn1 v27.4S, v21.4S, v18.4S // .........................................*......... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - trn2 v15.2D, v11.2D, v2.2D // .............................................*..... - trn2 v24.2D, v0.2D, v27.2D // ............................................*...... - trn1 v13.2D, v0.2D, v27.2D // ..............................................*.... - // gap // ................................................... - // gap // ................................................... - trn1 v27.2D, v11.2D, v2.2D // ...............................................*... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v19.4S, v24.4S, v15.4S // ................................................*.. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v18.4S, v19.4S, v7.S[2] // ..................................................* - add v11.4S, v13.4S, v27.4S // .................................................*. 
+ // Instructions: 53 + // Expected cycles: 53 + // Expected IPC: 1.00 + // + // Wall time: 1.41s + // User time: 1.41s + // + // ---------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + ldr q20, [x0, #32] // *.................................................... + ldr q18, [x0, #48] // .*................................................... + // gap // ..................................................... + ldr q19, [x0, #16] // ..*.................................................. + ldr q11, [x0, #0] // ...*................................................. + // gap // ..................................................... + ldr q0, [x3, #48] // ....*................................................ + ldr q22, [x3, #80] // .....*............................................... + // gap // ..................................................... + ldr q30, [x3, #64] // .............*....................................... + ldr q13, [x3], #(6*16) // ......*.............................................. + // gap // ..................................................... + trn1 v6.4S, v20.4S, v18.4S // .......*............................................. + trn2 v20.4S, v20.4S, v18.4S // ........*............................................ + ldr q18, [x3, #-80] // .....................*............................... + trn1 v27.4S, v11.4S, v19.4S // .........*........................................... + trn2 v19.4S, v11.4S, v19.4S // ..........*.......................................... + ldr q11, [x3, #-64] // ......................*.............................. + ldr q31, [x4], #8 // ..........................................*.......... + ldr q24, [x4], #16 // ............................................*........ + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + trn1 v12.2D, v19.2D, v20.2D // ...............*..................................... + trn1 v17.2D, v27.2D, v6.2D // ..............*...................................... + // gap // ..................................................... + trn2 v6.2D, v27.2D, v6.2D // ...........*......................................... + // gap // ..................................................... + // gap // ..................................................... + trn2 v20.2D, v19.2D, v20.2D // ............*........................................ + // gap // ..................................................... + // gap // ..................................................... + add v19.4S, v17.4S, v12.4S // ....................*................................ + sub v27.4S, v17.4S, v12.4S // ..................*.................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + add v17.4S, v6.4S, v20.4S // .................*................................... + sub v20.4S, v6.4S, v20.4S // ................*.................................... + // gap // ..................................................... + sqrdmulh v0.4S, v27.4S, v0.4S // ........................*............................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v22.4S, v20.4S, v22.4S // .......................*............................. 
+ sub v6.4S, v19.4S, v17.4S // ............................*........................ + // gap // ..................................................... + add v19.4S, v19.4S, v17.4S // .............................*....................... + // gap // ..................................................... + // gap // ..................................................... + mul v20.4S, v20.4S, v30.4S // ...................*................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v11.4S, v27.4S, v11.4S // .........................*........................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v20.4S, v22.4S, v29.4S // ..........................*.......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v11.4S, v0.4S, v29.4S // ...........................*......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + sqrdmulh v0.4S, v6.4S, v18.4S // ..............................*...................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v22.4S, v6.4S, v13.4S // .................................*................... + // gap // ..................................................... + // gap // ..................................................... + sub v30.4S, v11.4S, v20.4S // ...............................*..................... + // gap // ..................................................... + // gap // ..................................................... + add v20.4S, v11.4S, v20.4S // ................................*.................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v18.4S, v30.4S, v18.4S // ..................................*.................. + // gap // ..................................................... + // gap // ..................................................... + trn2 v11.4S, v19.4S, v20.4S // ...................................*................. + // gap // ..................................................... + // gap // ..................................................... + mul v30.4S, v30.4S, v13.4S // ....................................*................ + trn1 v20.4S, v19.4S, v20.4S // .......................................*............. 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v22.4S, v0.4S, v29.4S // .....................................*............... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v30.4S, v18.4S, v29.4S // ......................................*.............. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn1 v18.4S, v22.4S, v30.4S // ........................................*............ + trn2 v19.4S, v22.4S, v30.4S // .........................................*........... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn1 v22.2D, v11.2D, v19.2D // .............................................*....... + trn1 v0.2D, v20.2D, v18.2D // ...........................................*......... + // gap // ..................................................... + trn2 v20.2D, v20.2D, v18.2D // ..............................................*...... + trn2 v18.2D, v11.2D, v19.2D // ...............................................*..... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + add v30.4S, v0.4S, v22.4S // .................................................*... + sub v22.4S, v0.4S, v22.4S // ................................................*.... + // gap // ..................................................... + sub v28.4S, v20.4S, v18.4S // ..................................................*.. + add v0.4S, v20.4S, v18.4S // ...................................................*. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v5.4S, v22.4S, v24.S[0] // ....................................................* + // gap // ..................................................... + // gap // ..................................................... 
- // original source code - // ldr q21, [x0, #32] // .*................................................. - // ldr q16, [x0, #48] // *.................................................. - // ldr q25, [x0, #0] // ..*................................................ - // ldr q2, [x0, #16] // ...*............................................... - // ldr q28, [x3, #32] // ......*............................................ - // ldr q4, [x3, #64] // .....................*............................. - // ldr q13, [x3, #80] // .....*............................................. - // ldr q23, [x3, #16] // ..........................*........................ - // trn1 v14.4S, v21.4S, v16.4S // ........*.......................................... - // trn1 v19.4S, v25.4S, v2.4S // ...........*....................................... - // trn2 v9.4S, v25.4S, v2.4S // ..........*........................................ - // trn2 v17.4S, v21.4S, v16.4S // .......*........................................... - // trn1 v12.2D, v19.2D, v14.2D // ...............*................................... - // ldr q22, [x3, #48] // .........*......................................... - // trn1 v20.2D, v9.2D, v17.2D // ................*.................................. - // trn2 v1.2D, v9.2D, v17.2D // .............*..................................... - // sub v17.4S, v12.4S, v20.4S // ...................*............................... - // trn2 v2.2D, v19.2D, v14.2D // ..............*.................................... - // mul v8.4S, v17.4S, v28.4S // ........................*.......................... - // add v7.4S, v12.4S, v20.4S // ..................*................................ - // sub v6.4S, v2.4S, v1.4S // .................*................................. - // sqrdmulh v10.4S, v17.4S, v22.4S // ......................*............................ - // add v5.4S, v2.4S, v1.4S // .......................*........................... 
- // sqrdmulh v2.4S, v6.4S, v13.4S // ....................*.............................. - // add v31.4S, v7.4S, v5.4S // ............................*...................... - // mul v6.4S, v6.4S, v4.4S // .........................*......................... - // sub v28.4S, v7.4S, v5.4S // ...........................*....................... - // mls v8.4S, v10.4S, v29.4S // .............................*..................... - // mls v6.4S, v2.4S, v29.4S // ...............................*................... - // ldr q2, [x3], #(6*16) // ..............................*.................... - // sqrdmulh v10.4S, v28.4S, v23.4S // ................................*.................. - // sub v16.4S, v8.4S, v6.4S // ..................................*................ - // mul v22.4S, v28.4S, v2.4S // .................................*................. - // sqrdmulh v12.4S, v16.4S, v23.4S // .....................................*............. - // mls v22.4S, v10.4S, v29.4S // ....................................*.............. - // mul v23.4S, v16.4S, v2.4S // ........................................*.......... - // add v2.4S, v8.4S, v6.4S // ...................................*............... - // mls v23.4S, v12.4S, v29.4S // .........................................*......... - // trn2 v19.4S, v31.4S, v2.4S // .......................................*........... - // trn1 v2.4S, v31.4S, v2.4S // ......................................*............ - // ldr q30, [x4], #8 // ....*.............................................. - // trn1 v18.4S, v22.4S, v23.4S // ...........................................*....... - // ldr q7, [x4], #16 // ............*...................................... - // trn2 v14.4S, v22.4S, v23.4S // ..........................................*........ - // trn2 v24.2D, v2.2D, v18.2D // .............................................*..... - // trn2 v15.2D, v19.2D, v14.2D // ............................................*...... 
- // trn1 v13.2D, v2.2D, v18.2D // ..............................................*.... - // trn1 v27.2D, v19.2D, v14.2D // ...............................................*... - // sub v19.4S, v24.4S, v15.4S // ................................................*.. - // add v11.4S, v13.4S, v27.4S // ..................................................* - // mul v18.4S, v19.4S, v7.S[2] // .................................................*. + // ------------------- new position -------------------> + // 0 25 50 + // |------------------------|------------------------|-- + // ldr q12, [x0, #32] // *.................................................... + // ldr q7, [x0, #48] // .*................................................... + // ldr q3, [x0, #16] // ..*.................................................. + // ldr q21, [x0, #0] // ...*................................................. + // ldr q17, [x3, #48] // ....*................................................ + // ldr q8, [x3, #80] // .....*............................................... + // ldr q4, [x3], #(6*16) // .......*............................................. + // trn1 v23.4S, v12.4S, v7.4S // ........*............................................ + // trn2 v9.4S, v12.4S, v7.4S // .........*........................................... + // trn1 v0.4S, v21.4S, v3.4S // ...........*......................................... + // trn2 v19.4S, v21.4S, v3.4S // ............*........................................ + // trn2 v13.2D, v0.2D, v23.2D // ..................*.................................. + // trn2 v14.2D, v19.2D, v9.2D // ...................*................................. + // ldr q15, [x3, #-32] // ......*.............................................. + // trn1 v22.2D, v0.2D, v23.2D // .................*................................... + // trn1 v12.2D, v19.2D, v9.2D // ................*.................................... 
+ // sub v21.4S, v13.4S, v14.4S // .......................*............................. + // add v9.4S, v13.4S, v14.4S // ......................*.............................. + // sub v27.4S, v22.4S, v12.4S // .....................*............................... + // mul v28.4S, v21.4S, v15.4S // ............................*........................ + // add v6.4S, v22.4S, v12.4S // ....................*................................ + // ldr q12, [x3, #-80] // ..........*.......................................... + // ldr q25, [x3, #-64] // .............*....................................... + // sqrdmulh v23.4S, v21.4S, v8.4S // .........................*........................... + // sqrdmulh v3.4S, v27.4S, v17.4S // ........................*............................ + // mul v18.4S, v27.4S, v25.4S // .............................*....................... + // mls v28.4S, v23.4S, v29.4S // ..............................*...................... + // mls v18.4S, v3.4S, v29.4S // ...............................*..................... + // sub v21.4S, v6.4S, v9.4S // ..........................*.......................... + // add v2.4S, v6.4S, v9.4S // ...........................*......................... + // sqrdmulh v22.4S, v21.4S, v12.4S // ................................*.................... + // sub v9.4S, v18.4S, v28.4S // ..................................*.................. + // add v19.4S, v18.4S, v28.4S // ...................................*................. + // mul v11.4S, v21.4S, v4.4S // .................................*................... + // sqrdmulh v13.4S, v9.4S, v12.4S // ....................................*................ + // trn2 v25.4S, v2.4S, v19.4S // .....................................*............... + // mul v21.4S, v9.4S, v4.4S // ......................................*.............. + // mls v11.4S, v22.4S, v29.4S // ........................................*............ 
+ // mls v21.4S, v13.4S, v29.4S // .........................................*........... + // trn1 v22.4S, v2.4S, v19.4S // .......................................*............. + // trn1 v6.4S, v11.4S, v21.4S // ..........................................*.......... + // trn2 v17.4S, v11.4S, v21.4S // ...........................................*......... + // ldr q31, [x4], #8 // ..............*...................................... + // trn1 v16.2D, v22.2D, v6.2D // .............................................*....... + // ldr q24, [x4], #16 // ...............*..................................... + // trn1 v28.2D, v25.2D, v17.2D // ............................................*........ + // trn2 v0.2D, v22.2D, v6.2D // ..............................................*...... + // trn2 v17.2D, v25.2D, v17.2D // ...............................................*..... + // sub v22.4S, v16.4S, v28.4S // .................................................*... + // add v30.4S, v16.4S, v28.4S // ................................................*.... + // sub v28.4S, v0.4S, v17.4S // ..................................................*.. + // add v0.4S, v0.4S, v17.4S // ...................................................*. + // mul v5.4S, v22.4S, v24.S[0] // ....................................................* sub count, count, #1 layer5678_start: - sub v26.4S, v13.4S, v27.4S // ................................................*........................... - ldr q21, [x0, #96] // ..e......................................................................... - ldr q16, [x0, #112] // ...e........................................................................ - ldr q25, [x0, #64] // e........................................................................... - ldr q2, [x0, #80] // .e.......................................................................... - add v8.4S, v24.4S, v15.4S // ......................................................*..................... 
- sqrdmulh v10.4S, v19.4S, v7.S[3] // ........................................................*................... - ldr q28, [x3, #32] // ..............e............................................................. - ldr q4, [x3, #64] // ................e........................................................... - ldr q13, [x3, #80] // .................e.......................................................... - ldr q23, [x3, #16] // .............e.............................................................. + // Instructions: 76 + // Expected cycles: 56 + // Expected IPC: 1.36 + // + // Wall time: 125.35s + // User time: 125.35s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v2.4S, v22.4S, v24.S[1] // ..................................................*......................... + ldr q12, [x0, #96] // ..e......................................................................... + ldr q7, [x0, #112] // ...e........................................................................ + sub v15.4S, v30.4S, v0.4S // ..........................................................*................. + ldr q3, [x0, #80] // .e.......................................................................... + ldr q21, [x0, #64] // e........................................................................... + ldr q17, [x3, #48] // ...............e............................................................ + ldr q8, [x3, #80] // .................e.......................................................... + mul v16.4S, v28.4S, v24.S[2] // ........................................................*................... + ldr q4, [x3], #(6*16) // ............e............................................................... + add v25.4S, v30.4S, v0.4S // ...........................................................*................ 
// gap // ............................................................................ - trn1 v14.4S, v21.4S, v16.4S // ......e..................................................................... - sqrdmulh v22.4S, v26.4S, v7.S[1] // ...................................................*........................ + trn1 v23.4S, v12.4S, v7.4S // ......e..................................................................... + trn2 v9.4S, v12.4S, v7.4S // .......e.................................................................... // gap // ............................................................................ - trn1 v19.4S, v25.4S, v2.4S // ....e....................................................................... + mls v5.4S, v2.4S, v29.4S // ....................................................*....................... + trn1 v0.4S, v21.4S, v3.4S // ....e....................................................................... // gap // ............................................................................ + trn2 v19.4S, v21.4S, v3.4S // .....e...................................................................... // gap // ............................................................................ - trn2 v9.4S, v25.4S, v2.4S // .....e...................................................................... - mul v26.4S, v26.4S, v7.S[0] // ..................................................*......................... // gap // ............................................................................ - trn2 v17.4S, v21.4S, v16.4S // .......e.................................................................... + srshr v2.4S, v25.4S, #23 // ....................................................................*....... + sqrdmulh v10.4S, v15.4S, v31.S[1] // ............................................................*............... // gap // ............................................................................ 
+ trn2 v13.2D, v0.2D, v23.2D // ........e................................................................... // gap // ............................................................................ - mls v18.4S, v10.4S, v29.4S // .........................................................*.................. - trn1 v12.2D, v19.2D, v14.2D // ..........e................................................................. // gap // ............................................................................ - sub v24.4S, v11.4S, v8.4S // ..........................................................*................. + mul v30.4S, v15.4S, v31.S[0] // .............................................................*.............. + trn2 v14.2D, v19.2D, v9.2D // .........e.................................................................. + ldr q15, [x3, #-32] // ................e........................................................... + trn1 v22.2D, v0.2D, v23.2D // ..........e................................................................. // gap // ............................................................................ // gap // ............................................................................ - mls v26.4S, v22.4S, v29.4S // ....................................................*....................... - ldr q22, [x3, #48] // ...............e............................................................ - trn1 v20.2D, v9.2D, v17.2D // ...........e................................................................ - trn2 v1.2D, v9.2D, v17.2D // .........e.................................................................. + mls v25.4S, v2.4S, v29.4S // .....................................................................*...... + trn1 v12.2D, v19.2D, v9.2D // ...........e................................................................ // gap // ............................................................................ 
+ sub v21.4S, v13.4S, v14.4S // .......................e.................................................... // gap // ............................................................................ - add v3.4S, v11.4S, v8.4S // ...........................................................*................ - mul v0.4S, v24.4S, v30.S[0] // ............................................................*............... // gap // ............................................................................ - sub v17.4S, v12.4S, v20.4S // ..................e......................................................... + add v9.4S, v13.4S, v14.4S // ........................e................................................... + sqrdmulh v2.4S, v28.4S, v24.S[3] // .......................................................*.................... // gap // ............................................................................ + sub v27.4S, v22.4S, v12.4S // ..................e......................................................... // gap // ............................................................................ - sqrdmulh v15.4S, v24.4S, v30.S[1] // .............................................................*.............. - trn2 v2.2D, v19.2D, v14.2D // ........e................................................................... // gap // ............................................................................ - srshr v21.4S, v3.4S, #23 // ....................................................................*....... + mul v28.4S, v21.4S, v15.4S // ..........................e................................................. + add v6.4S, v22.4S, v12.4S // ...................e........................................................ + ldr q12, [x3, #-80] // .............e.............................................................. + str q25, [x0], #(16*4) // ........................................................................*... 
+ ldr q25, [x3, #-64] // ..............e............................................................. // gap // ............................................................................ + sqrdmulh v23.4S, v21.4S, v8.4S // .........................e.................................................. // gap // ............................................................................ - mul v8.4S, v17.4S, v28.4S // ....................e....................................................... - add v7.4S, v12.4S, v20.4S // ...................e........................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v6.4S, v2.4S, v1.4S // .......................e.................................................... - sqrdmulh v10.4S, v17.4S, v22.4S // .....................e...................................................... - add v5.4S, v2.4S, v1.4S // ........................e................................................... // gap // ............................................................................ + sqrdmulh v3.4S, v27.4S, v17.4S // ....................e....................................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v2.4S, v6.4S, v13.4S // ..........................e................................................. // gap // ............................................................................ // gap // ............................................................................ 
- add v31.4S, v7.4S, v5.4S // .............................e.............................................. + mul v18.4S, v27.4S, v25.4S // .....................e...................................................... // gap // ............................................................................ // gap // ............................................................................ - mul v6.4S, v6.4S, v4.4S // .........................e.................................................. - sub v28.4S, v7.4S, v5.4S // ............................e............................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v28.4S, v23.4S, v29.4S // ...........................e................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mls v8.4S, v10.4S, v29.4S // ......................e..................................................... // gap // ............................................................................ // gap // ............................................................................ + mls v18.4S, v3.4S, v29.4S // ......................e..................................................... // gap // ............................................................................ - mls v6.4S, v2.4S, v29.4S // ...........................e................................................ - ldr q2, [x3], #(6*16) // ............e............................................................... 
// gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + sub v21.4S, v6.4S, v9.4S // ............................e............................................... + mls v16.4S, v2.4S, v29.4S // .........................................................*.................. + add v2.4S, v6.4S, v9.4S // .............................e.............................................. // gap // ............................................................................ - mls v0.4S, v15.4S, v29.4S // ..............................................................*............. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v10.4S, v28.4S, v23.4S // ...............................e............................................ + sqrdmulh v22.4S, v21.4S, v12.4S // ..............................e............................................. + sub v9.4S, v18.4S, v28.4S // .................................e.......................................... // gap // ............................................................................ // gap // ............................................................................ - sub v16.4S, v8.4S, v6.4S // .................................e.......................................... + add v19.4S, v18.4S, v28.4S // ..................................e......................................... 
+ mul v11.4S, v21.4S, v4.4S // ...............................e............................................ // gap // ............................................................................ // gap // ............................................................................ - mul v22.4S, v28.4S, v2.4S // ..............................e............................................. // gap // ............................................................................ + sub v15.4S, v5.4S, v16.4S // ...............................................................*............ + add v3.4S, v5.4S, v16.4S // ................................................................*........... + sqrdmulh v13.4S, v9.4S, v12.4S // ...................................e........................................ // gap // ............................................................................ - str q0, [x0, #32] // ..........................................................................*. + trn2 v25.4S, v2.4S, v19.4S // .......................................e.................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v12.4S, v16.4S, v23.4S // ....................................e....................................... + mul v21.4S, v9.4S, v4.4S // ....................................e....................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v11.4S, v26.4S, v18.4S // ...............................................................*............ 
- mls v22.4S, v10.4S, v29.4S // ................................e........................................... - add v10.4S, v26.4S, v18.4S // ................................................................*........... // gap // ............................................................................ + mls v11.4S, v22.4S, v29.4S // ................................e........................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v23.4S, v16.4S, v2.4S // ...................................e........................................ - add v2.4S, v8.4S, v6.4S // ..................................e......................................... // gap // ............................................................................ // gap // ............................................................................ + mls v21.4S, v13.4S, v29.4S // .....................................e...................................... // gap // ............................................................................ // gap // ............................................................................ - mls v23.4S, v12.4S, v29.4S // .....................................e...................................... // gap // ............................................................................ // gap // ............................................................................ - trn2 v19.4S, v31.4S, v2.4S // .......................................e.................................... // gap // ............................................................................ + mls v30.4S, v10.4S, v29.4S // ..............................................................*............. 
// gap // ............................................................................ - trn1 v2.4S, v31.4S, v2.4S // ......................................e..................................... // gap // ............................................................................ - sqrdmulh v8.4S, v11.4S, v30.S[1] // ..................................................................*......... // gap // ............................................................................ // gap // ............................................................................ + srshr v20.4S, v3.4S, #23 // ......................................................................*..... + trn1 v22.4S, v2.4S, v19.4S // ......................................e..................................... // gap // ............................................................................ - mul v1.4S, v11.4S, v30.S[0] // .................................................................*.......... // gap // ............................................................................ - ldr q30, [x4], #8 // ..............................................e............................. - trn1 v18.4S, v22.4S, v23.4S // ........................................e................................... - ldr q7, [x4], #16 // ...............................................e............................ + trn1 v6.4S, v11.4S, v21.4S // ........................................e................................... // gap // ............................................................................ - mls v3.4S, v21.4S, v29.4S // .....................................................................*...... - trn2 v14.4S, v22.4S, v23.4S // .........................................e.................................. + sqrdmulh v9.4S, v15.4S, v31.S[1] // .................................................................*.......... 
+ trn2 v17.4S, v11.4S, v21.4S // .........................................e.................................. // gap // ............................................................................ - srshr v22.4S, v10.4S, #23 // ......................................................................*..... // gap // ............................................................................ + mul v26.4S, v15.4S, v31.S[0] // ..................................................................*......... + str q30, [x0, #-32] // ..........................................................................*. + ldr q31, [x4], #8 // ..............................................e............................. + trn1 v16.2D, v22.2D, v6.2D // ............................................e............................... + ldr q24, [x4], #16 // ...............................................e............................ // gap // ............................................................................ - mls v1.4S, v8.4S, v29.4S // ...................................................................*........ - trn2 v24.2D, v2.2D, v18.2D // ..........................................e................................. + mls v3.4S, v20.4S, v29.4S // .......................................................................*.... + trn1 v28.2D, v25.2D, v17.2D // .............................................e.............................. // gap // ............................................................................ - trn2 v15.2D, v19.2D, v14.2D // ...........................................e................................ + trn2 v0.2D, v22.2D, v6.2D // ..........................................e................................. // gap // ............................................................................ // gap // ............................................................................ 
- mls v10.4S, v22.4S, v29.4S // .......................................................................*.... - trn1 v13.2D, v2.2D, v18.2D // ............................................e............................... + trn2 v17.2D, v25.2D, v17.2D // ...........................................e................................ + mls v26.4S, v9.4S, v29.4S // ...................................................................*........ // gap // ............................................................................ - trn1 v27.2D, v19.2D, v14.2D // .............................................e.............................. - str q3, [x0], #(16*4) // ........................................................................*... + sub v22.4S, v16.4S, v28.4S // ................................................e........................... // gap // ............................................................................ // gap // ............................................................................ + add v30.4S, v16.4S, v28.4S // .................................................e.......................... // gap // ............................................................................ - sub v19.4S, v24.4S, v15.4S // .....................................................e...................... - str q1, [x0, #-16] // ...........................................................................* // gap // ............................................................................ + str q3, [x0, #-48] // .........................................................................*.. // gap // ............................................................................ - add v11.4S, v13.4S, v27.4S // .................................................e.......................... + sub v28.4S, v0.4S, v17.4S // .....................................................e...................... 
+ add v0.4S, v0.4S, v17.4S // ......................................................e..................... + mul v5.4S, v22.4S, v24.S[0] // ...................................................e........................ // gap // ............................................................................ + str q26, [x0, #-16] // ...........................................................................* // gap // ............................................................................ - str q10, [x0, #-48] // .........................................................................*.. - mul v18.4S, v19.4S, v7.S[2] // .......................................................e.................... // gap // ............................................................................ - // original source code - // ldr q8, [x0, #(16*0)] // ..e........................................................................|..e....................................................................... - // ldr q9, [x0, #(16*1)] // ...e.......................................................................|...e...................................................................... - // ldr q10, [x0, #(16*2)] // e..........................................................................|e......................................................................... - // ldr q11, [x0, #(16*3)] // .e.........................................................................|.e........................................................................ - // trn1 v25.4s, v8.4s, v9.4s // ............e..............................................................|............e............................................................. - // trn2 v26.4s, v8.4s, v9.4s // .............e.............................................................|.............e............................................................ 
- // trn1 v27.4s, v10.4s, v11.4s // ..........e................................................................|..........e............................................................... - // trn2 v28.4s, v10.4s, v11.4s // ...............e...........................................................|...............e.......................................................... - // trn2 v10.2d, v25.2d, v27.2d // ...........................e...............................................|...........................e.............................................. - // trn2 v11.2d, v26.2d, v28.2d // ......................e....................................................|......................e................................................... - // trn1 v8.2d, v25.2d, v27.2d // .................e.........................................................|.................e........................................................ - // trn1 v9.2d, v26.2d, v28.2d // .....................e.....................................................|.....................e.................................................... - // ldr q0, [x3], #(6*16) // ........................................e..................................|........................................e................................. - // ldr q4, [x3, #(-6*16 + 1*16)] // .........e.................................................................|.........e................................................................ - // ldr q1, [x3, #(-6*16 + 2*16)] // ......e....................................................................|......e................................................................... - // ldr q5, [x3, #(-6*16 + 3*16)] // ....................e......................................................|....................e..................................................... 
- // ldr q2, [x3, #(-6*16 + 4*16)] // .......e...................................................................|.......e.................................................................. - // ldr q6, [x3, #(-6*16 + 5*16)] // ........e..................................................................|........e................................................................. - // sub v24.4s, v8.4s, v9.4s // .........................e.................................................|.........................e................................................ - // add v8.4s, v8.4s, v9.4s // ..............................e............................................|..............................e........................................... - // mul v9.4s, v24.4s, v1.4s // .............................e.............................................|.............................e............................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ................................e..........................................|................................e......................................... - // mls v9.4s, v24.4s, v29.4s // ......................................e....................................|......................................e................................... - // sub v24.4s, v10.4s, v11.4s // ...............................e...........................................|...............................e.......................................... - // add v10.4s, v10.4s, v11.4s // .................................e.........................................|.................................e........................................ - // mul v11.4s, v24.4s, v2.4s // ....................................e......................................|....................................e..................................... 
- // sqrdmulh v24.4s, v24.4s, v6.4s // ..................................e........................................|..................................e....................................... - // mls v11.4s, v24.4s, v29.4s // .......................................e...................................|.......................................e.................................. - // sub v24.4s, v8.4s, v10.4s // .....................................e.....................................|.....................................e.................................... - // add v8.4s, v8.4s, v10.4s // ...................................e.......................................|...................................e...................................... - // mul v10.4s, v24.4s, v0.4s // ............................................e..............................|............................................e............................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................e................................|..........................................e............................... - // mls v10.4s, v24.4s, v29.4s // ................................................e..........................|................................................e......................... - // sub v24.4s, v9.4s, v11.4s // ...........................................e...............................|...........................................e.............................. - // add v9.4s, v9.4s, v11.4s // ...................................................e.......................|...................................................e...................... - // mul v11.4s, v24.4s, v0.4s // ..................................................e........................|..................................................e....................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................e............................|..............................................e........................... - // mls v11.4s, v24.4s, v29.4s // ....................................................e......................|....................................................e..................... - // trn1 v25.4s, v8.4s, v9.4s // ......................................................e....................|......................................................e................... - // trn2 v26.4s, v8.4s, v9.4s // .....................................................e.....................|.....................................................e.................... - // trn1 v27.4s, v10.4s, v11.4s // ..........................................................e................|..........................................................e............... - // trn2 v28.4s, v10.4s, v11.4s // .............................................................e.............|.............................................................e............ - // trn2 v10.2d, v25.2d, v27.2d // ................................................................e..........|................................................................e......... - // trn2 v11.2d, v26.2d, v28.2d // .................................................................e.........|.................................................................e........ - // trn1 v8.2d, v25.2d, v27.2d // ...................................................................e.......|...................................................................e...... - // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e......|....................................................................e..... 
- // ldr q1, [x4], #8 // .........................................................e.................|.........................................................e................ - // ldr q0, [x4], #16 // ...........................................................e...............|...........................................................e.............. - // sub v24.4s, v8.4s, v9.4s // ...........................................................................*.......................................................................... - // add v8.4s, v8.4s, v9.4s // ........................................................................e..|........................................................................e. - // mul v9.4s, v24.4s, v0.s[0] // ..............*............................................................|..............*........................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........*...............................................................|...........*.............................................................. - // mls v9.4s, v24.4s, v29.4s // ...................*.......................................................|...................*...................................................... - // sub v24.4s, v10.4s, v11.4s // ......................................................................e....|......................................................................e... - // add v10.4s, v10.4s, v11.4s // ....*......................................................................|....*..................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ..........................................................................e|.......................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*.....................................................................|.....*.................................................................... - // mls v11.4s, v24.4s, v29.4s // ................*..........................................................|................*......................................................... - // sub v24.4s, v8.4s, v10.4s // ..................*........................................................|..................*....................................................... - // add v8.4s, v8.4s, v10.4s // .......................*...................................................|.......................*.................................................. - // mul v10.4s, v24.4s, v1.s[0] // ........................*..................................................|........................*................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................*................................................|..........................*............................................... - // mls v10.4s, v24.4s, v29.4s // .........................................*.................................|.........................................*................................ - // sub v24.4s, v9.4s, v11.4s // ...............................................*...........................|...............................................*.......................... - // add v9.4s, v9.4s, v11.4s // .................................................*.........................|.................................................*........................ - // mul v11.4s, v24.4s, v1.s[0] // ........................................................*..................|........................................................*................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................*...................|.......................................................*.................. - // mls v11.4s, v24.4s, v29.4s // ...............................................................*...........|...............................................................*.......... - // srshr v24.4S, v8.4S, #23 // ............................*..............................................|............................*............................................. - // mls v8.4s, v24.4s, v29.4s // ............................................................*..............|............................................................*............. - // srshr v24.4S, v9.4S, #23 // ..............................................................*............|..............................................................*........... - // mls v9.4s, v24.4s, v29.4s // ..................................................................*........|..................................................................*....... - // str q8, [x0], #(16*4) // .....................................................................*.....|.....................................................................*.... - // str q9, [x0, #(-16*4 + 1*16)] // .........................................................................*.|.........................................................................* - // str q10, [x0, #(-16*4 + 2*16)] // .............................................*.............................|.............................................*............................ - // str q11, [x0, #(-16*4 + 3*16)] // .......................................................................*...|.......................................................................*.. 
+ // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #(16*0)] // ....e......................................................................'....~...................................................................... + // ldr q9, [x0, #(16*1)] // ...e.......................................................................'...~....................................................................... + // ldr q10, [x0, #(16*2)] // e..........................................................................'~.......................................................................... + // ldr q11, [x0, #(16*3)] // .e.........................................................................'.~......................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .............e.............................................................'.............~............................................................. + // trn2 v26.4s, v8.4s, v9.4s // ..............e............................................................'..............~............................................................ + // trn1 v27.4s, v10.4s, v11.4s // ..........e................................................................'..........~................................................................ + // trn2 v28.4s, v10.4s, v11.4s // ...........e...............................................................'...........~............................................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // .................e.........................................................'.................~......................................................... + // trn2 v11.2d, v26.2d, v28.2d // ...................e.......................................................'...................~....................................................... + // trn1 v8.2d, v25.2d, v27.2d // .....................e.....................................................'.....................~..................................................... + // trn1 v9.2d, v26.2d, v28.2d // .......................e...................................................'.......................~................................................... + // ldr q0, [x3], #(6*16) // ........e..................................................................'........~.................................................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // ..............................e............................................'..............................~............................................ + // ldr q1, [x3, #(-6*16 + 2*16)] // ................................e..........................................'................................~.......................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // .....e.....................................................................'.....~..................................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ....................e......................................................'....................~...................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ......e....................................................................'......~.................................................................... 
+ // sub v24.4s, v8.4s, v9.4s // ...........................e...............................................'...........................~............................................... + // add v8.4s, v8.4s, v9.4s // .............................e.............................................'.............................~............................................. + // sqrdmulh v27.4s, v24.4s, v5.4s // ..................................e........................................'..................................~........................................ + // mul v9.4s, v24.4s, v1.4s // ...................................e.......................................'...................................~....................................... + // mls v9.4s, v27.4s, v29.4s // .....................................e.....................................'.....................................~..................................... + // sub v24.4s, v10.4s, v11.4s // ........................e..................................................'........................~.................................................. + // add v10.4s, v10.4s, v11.4s // .........................e.................................................'.........................~................................................. + // sqrdmulh v27.4s, v24.4s, v6.4s // .................................e.........................................'.................................~......................................... + // mul v11.4s, v24.4s, v2.4s // ............................e..............................................'............................~.............................................. + // mls v11.4s, v27.4s, v29.4s // ....................................e......................................'....................................~...................................... 
+ // sub v24.4s, v8.4s, v10.4s // ......................................e....................................'......................................~.................................... + // add v8.4s, v8.4s, v10.4s // ........................................e..................................'........................................~.................................. + // sqrdmulh v27.4s, v24.4s, v4.4s // .........................................e.................................'.........................................~................................. + // mul v10.4s, v24.4s, v0.4s // ............................................e..............................'............................................~.............................. + // mls v10.4s, v27.4s, v29.4s // ..................................................e........................'..................................................~........................ + // sub v24.4s, v9.4s, v11.4s // ..........................................e................................'..........................................~................................ + // add v9.4s, v9.4s, v11.4s // ...........................................e...............................'...........................................~............................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...............................................e...........................'...............................................~........................... + // mul v11.4s, v24.4s, v0.4s // .................................................e.........................'.................................................~......................... + // mls v11.4s, v27.4s, v29.4s // ...................................................e.......................'...................................................~....................... 
+ // trn1 v25.4s, v8.4s, v9.4s // ......................................................e....................'......................................................~.................... + // trn2 v26.4s, v8.4s, v9.4s // ................................................e..........................'................................................~.......................... + // trn1 v27.4s, v10.4s, v11.4s // .......................................................e...................'.......................................................~................... + // trn2 v28.4s, v10.4s, v11.4s // .........................................................e.................'.........................................................~................. + // trn2 v10.2d, v25.2d, v27.2d // .................................................................e.........'.................................................................~......... + // trn2 v11.2d, v26.2d, v28.2d // ..................................................................e........'..................................................................~........ + // trn1 v8.2d, v25.2d, v27.2d // .............................................................e.............'.............................................................~............. + // trn1 v9.2d, v26.2d, v28.2d // ................................................................e..........'................................................................~.......... + // ldr q1, [x4], #8 // ............................................................e..............'............................................................~.............. + // ldr q0, [x4], #16 // ..............................................................e............'..............................................................~............ 
+ // sub v24.4s, v8.4s, v9.4s // ....................................................................e......'....................................................................~...... + // add v8.4s, v8.4s, v9.4s // .....................................................................e.....'.....................................................................~..... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...........................................................................*........................................................................... + // mul v9.4s, v24.4s, v0.s[0] // .........................................................................e.'.........................................................................~. + // mls v9.4s, v27.4s, v29.4s // ............~..............................................................'............*.............................................................. + // sub v24.4s, v10.4s, v11.4s // .......................................................................e...'.......................................................................~... + // add v10.4s, v10.4s, v11.4s // ........................................................................e..'........................................................................~.. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..........................~................................................'..........................*................................................ + // mul v11.4s, v24.4s, v0.s[2] // .......~...................................................................'.......*................................................................... + // mls v11.4s, v27.4s, v29.4s // .......................................~...................................'.......................................*................................... 
+ // sub v24.4s, v8.4s, v10.4s // ..~........................................................................'..*........................................................................ + // add v8.4s, v8.4s, v10.4s // .........~.................................................................'.........*................................................................. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................~..........................................................'................*.......................................................... + // mul v10.4s, v24.4s, v1.s[0] // ..................~........................................................'..................*........................................................ + // mls v10.4s, v27.4s, v29.4s // ....................................................~......................'....................................................*...................... + // sub v24.4s, v9.4s, v11.4s // .............................................~.............................'.............................................*............................. + // add v9.4s, v9.4s, v11.4s // ..............................................~............................'..............................................*............................ + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ........................................................~..................'........................................................*.................. + // mul v11.4s, v24.4s, v1.s[0] // ..........................................................~................'..........................................................*................ + // mls v11.4s, v27.4s, v29.4s // ...................................................................~.......'...................................................................*....... 
+ // srshr v24.4S, v8.4S, #23 // ...............~...........................................................'...............*........................................................... + // mls v8.4s, v24.4s, v29.4s // ......................~....................................................'......................*.................................................... + // srshr v24.4S, v9.4S, #23 // .....................................................~.....................'.....................................................*..................... + // mls v9.4s, v24.4s, v29.4s // ...............................................................~...........'...............................................................*........... + // str q8, [x0], #(16*4) // ...............................~...........................................'...............................*........................................... + // str q9, [x0, #(-16*4 + 1*16)] // ......................................................................~....'......................................................................*.... + // str q10, [x0, #(-16*4 + 2*16)] // ...........................................................~...............'...........................................................*............... + // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................................~'..........................................................................* sub count, count, #1 cbnz count, layer5678_start - sub v25.4S, v13.4S, v27.4S // *........................ - sqrdmulh v14.4S, v19.4S, v7.S[3] // ..*...................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - add v15.4S, v24.4S, v15.4S // .*....................... - // gap // ......................... - // gap // ......................... 
- sqrdmulh v17.4S, v25.4S, v7.S[1] // ...*..................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - mul v26.4S, v25.4S, v7.S[0] // ....*.................... - sub v5.4S, v11.4S, v15.4S // ......*.................. - // gap // ......................... - add v24.4S, v11.4S, v15.4S // ........*................ - // gap // ......................... - // gap // ......................... - // gap // ......................... - mls v18.4S, v14.4S, v29.4S // .....*................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - mls v26.4S, v17.4S, v29.4S // .......*................. - srshr v15.4S, v24.4S, #23 // ...........*............. - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - mul v13.4S, v5.4S, v30.S[0] // .........*............... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - sqrdmulh v23.4S, v5.4S, v30.S[1] // ..........*.............. - // gap // ......................... - // gap // ......................... - sub v20.4S, v26.4S, v18.4S // ..............*.......... - // gap // ......................... - // gap // ......................... - mls v24.4S, v15.4S, v29.4S // ..................*...... - add v8.4S, v26.4S, v18.4S // ...............*......... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - sqrdmulh v18.4S, v20.4S, v30.S[1] // ................*........ - // gap // ......................... - // gap // ......................... 
- srshr v21.4S, v8.4S, #23 // ...................*..... - // gap // ......................... - // gap // ......................... - mul v9.4S, v20.4S, v30.S[0] // .................*....... - // gap // ......................... - // gap // ......................... - str q24, [x0], #(16*4) // ......................*.. - // gap // ......................... - // gap // ......................... - mls v13.4S, v23.4S, v29.4S // ............*............ - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - mls v9.4S, v18.4S, v29.4S // ....................*.... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - mls v8.4S, v21.4S, v29.4S // .....................*... - // gap // ......................... - // gap // ......................... - str q13, [x0, #-32] // .............*........... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - str q9, [x0, #-16] // .......................*. - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - str q8, [x0, #-48] // ........................* - // gap // ......................... - // gap // ......................... + // Instructions: 23 + // Expected cycles: 30 + // Expected IPC: 0.77 + // + // Wall time: 0.18s + // User time: 0.18s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sqrdmulh v20.4S, v22.4S, v24.S[1] // *............................. + sub v18.4S, v30.4S, v0.4S // .*............................ 
+ // gap // .............................. + add v19.4S, v30.4S, v0.4S // ...*.......................... + // gap // .............................. + // gap // .............................. + sqrdmulh v11.4S, v28.4S, v24.S[3] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v0.4S, v28.4S, v24.S[2] // ..*........................... + srshr v22.4S, v19.4S, #23 // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v5.4S, v20.4S, v29.4S // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v0.4S, v11.4S, v29.4S // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v18.4S, v31.S[1] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v18.4S, v18.4S, v31.S[0] // .......*...................... + // gap // .............................. + // gap // .............................. + sub v11.4S, v5.4S, v0.4S // ............*................. + // gap // .............................. + // gap // .............................. + mls v19.4S, v22.4S, v29.4S // ........*..................... 
+ add v0.4S, v5.4S, v0.4S // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v22.4S, v11.4S, v31.S[1] // ................*............. + // gap // .............................. + // gap // .............................. + srshr v30.4S, v0.4S, #23 // ...............*.............. + // gap // .............................. + // gap // .............................. + mul v11.4S, v11.4S, v31.S[0] // .................*............ + // gap // .............................. + // gap // .............................. + str q19, [x0], #(16*4) // ..........*................... + // gap // .............................. + // gap // .............................. + mls v18.4S, v20.4S, v29.4S // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v0.4S, v30.4S, v29.4S // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v11.4S, v22.4S, v29.4S // ....................*......... + // gap // .............................. + // gap // .............................. + str q18, [x0, #-32] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q0, [x0, #-48] // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + str q11, [x0, #-16] // ......................*....... + // gap // .............................. + // gap // .............................. - // original source code - // sub v26.4S, v13.4S, v27.4S // *........................ - // add v8.4S, v24.4S, v15.4S // ..*...................... - // sqrdmulh v10.4S, v19.4S, v7.S[3] // .*....................... - // sqrdmulh v22.4S, v26.4S, v7.S[1] // ...*..................... - // mul v26.4S, v26.4S, v7.S[0] // ....*.................... - // mls v18.4S, v10.4S, v29.4S // .......*................. - // sub v24.4S, v11.4S, v8.4S // .....*................... - // mls v26.4S, v22.4S, v29.4S // ........*................ - // add v3.4S, v11.4S, v8.4S // ......*.................. - // mul v0.4S, v24.4S, v30.S[0] // ..........*.............. - // sqrdmulh v15.4S, v24.4S, v30.S[1] // ...........*............. - // srshr v21.4S, v3.4S, #23 // .........*............... - // mls v0.4S, v15.4S, v29.4S // ...................*..... - // str q0, [x0, #32] // ......................*.. - // sub v11.4S, v26.4S, v18.4S // ............*............ - // add v10.4S, v26.4S, v18.4S // ..............*.......... - // sqrdmulh v8.4S, v11.4S, v30.S[1] // ...............*......... - // mul v1.4S, v11.4S, v30.S[0] // .................*....... - // mls v3.4S, v21.4S, v29.4S // .............*........... - // srshr v22.4S, v10.4S, #23 // ................*........ - // mls v1.4S, v8.4S, v29.4S // ....................*.... - // mls v10.4S, v22.4S, v29.4S // .....................*... - // str q3, [x0], #(16*4) // ..................*...... - // str q1, [x0, #-16] // .......................*. - // str q10, [x0, #-48] // ........................* + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v2.4S, v22.4S, v24.S[1] // *.............................. + // sub v15.4S, v30.4S, v0.4S // .*............................. 
+ // mul v16.4S, v28.4S, v24.S[2] // ....*.......................... + // add v25.4S, v30.4S, v0.4S // ..*............................ + // mls v5.4S, v2.4S, v29.4S // ......*........................ + // srshr v2.4S, v25.4S, #23 // .....*......................... + // sqrdmulh v10.4S, v15.4S, v31.S[1] // ........*...................... + // mul v30.4S, v15.4S, v31.S[0] // .........*..................... + // mls v25.4S, v2.4S, v29.4S // ...........*................... + // sqrdmulh v2.4S, v28.4S, v24.S[3] // ...*........................... + // str q25, [x0], #(16*4) // ................*.............. + // mls v16.4S, v2.4S, v29.4S // .......*....................... + // sub v15.4S, v5.4S, v16.4S // ..........*.................... + // add v3.4S, v5.4S, v16.4S // ............*.................. + // mls v30.4S, v10.4S, v29.4S // .................*............. + // srshr v20.4S, v3.4S, #23 // ..............*................ + // sqrdmulh v9.4S, v15.4S, v31.S[1] // .............*................. + // mul v26.4S, v15.4S, v31.S[0] // ...............*............... + // str q30, [x0, #-32] // ....................*.......... + // mls v3.4S, v20.4S, v29.4S // ..................*............ + // mls v26.4S, v9.4S, v29.4S // ...................*........... + // str q3, [x0, #-48] // .....................*......... + // str q26, [x0, #-16] // ......................*........ .unreq root0_tw @@ -958,853 +979,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. 
- ldr q10, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - ldr q22, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. - ldr q27, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. - ldr q13, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ - ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - ldr q24, [x1, #0] // *....................................................................................................................................................................................................................................................................................... 
- ldr q8, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... - sub v11.4S, v18.4S, v10.4S // .........................................*.............................................................................................................................................................................................................................................. - add v18.4S, v18.4S, v10.4S // ..........................................*............................................................................................................................................................................................................................................. + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q19, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. 
+ ldr q16, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q22, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q11, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + ldr q13, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + sub v17.4S, v20.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... 
+ add v18.4S, v20.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q14, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... ldr q10, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... - sub v9.4S, v22.4S, v13.4S // ...............................*........................................................................................................................................................................................................................................................ 
- add v16.4S, v22.4S, v13.4S // ................................*....................................................................................................................................................................................................................................................... - ldr q20, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... - ldr q22, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... - sqrdmulh v17.4S, v11.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... - sub v28.4S, v24.4S, v8.4S // ................*....................................................................................................................................................................................................................................................................... - add v24.4S, v24.4S, v8.4S // .................*...................................................................................................................................................................................................................................................................... 
- mul v19.4S, v11.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ - sub v8.4S, v23.4S, v27.4S // ....................................*................................................................................................................................................................................................................................................... - add v21.4S, v23.4S, v27.4S // .....................................*.................................................................................................................................................................................................................................................. - add v27.4S, v22.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. - sqrdmulh v11.4S, v9.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - sub v12.4S, v22.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. 
- mul v13.4S, v9.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... - sub v23.4S, v20.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. - add v15.4S, v20.4S, v10.4S // ...........................*............................................................................................................................................................................................................................................................ - sqrdmulh v20.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - add v10.4S, v24.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. - sub v24.4S, v24.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... 
- mul v23.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... - add v27.4S, v15.4S, v16.4S // ...................................................................*.................................................................................................................................................................................................................... - sub v22.4S, v15.4S, v16.4S // ..................................................................*..................................................................................................................................................................................................................... - mls v19.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - mls v13.4S, v11.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... - sqrdmulh v11.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ 
- mul v8.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. - mul v14.4S, v22.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... - sqrdmulh v15.4S, v22.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - sqrdmulh v17.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... - mul v22.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ - sub v12.4S, v10.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... 
- add v27.4S, v10.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... - sqrdmulh v16.4S, v28.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... - mul v9.4S, v28.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... - add v28.4S, v21.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... - sub v21.4S, v21.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... - mls v23.4S, v20.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... 
- sqrdmulh v18.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - mul v10.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - ldr q24, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... - sub v20.4S, v23.4S, v13.4S // .......................................................................*................................................................................................................................................................................................................ - add v13.4S, v23.4S, v13.4S // ........................................................................*............................................................................................................................................................................................................... - mls v8.4S, v11.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... 
- ldr q11, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... - ldr q23, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... - mls v22.4S, v17.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - add v17.4S, v8.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... - sub v19.4S, v8.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... - ldr q8, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
- mls v9.4S, v16.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... - mls v14.4S, v15.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - add v15.4S, v24.4S, v23.4S // ...............................................*........................................................................................................................................................................................................................................ - sub v23.4S, v24.4S, v23.4S // ..............................................*......................................................................................................................................................................................................................................... - mls v10.4S, v18.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - sub v16.4S, v11.4S, v8.4S // ...................................................*.................................................................................................................................................................................................................................... 
- add v8.4S, v11.4S, v8.4S // ....................................................*................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v23.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... - add v18.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... - sub v11.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... - mul v22.4S, v23.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... - sqrdmulh v9.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. 
- mls v22.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - mul v16.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - mls v16.4S, v9.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ - sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... - mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... - add v23.4S, v22.4S, v16.4S // ............................................................................................*........................................................................................................................................................................................... 
- sub v9.4S, v22.4S, v16.4S // ...........................................................................................*............................................................................................................................................................................................ - sqrdmulh v16.4S, v19.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - mul v19.4S, v19.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... - add v22.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. - sub v23.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. - mls v12.4S, v24.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
- add v24.4S, v15.4S, v8.4S // .......................................................................................*................................................................................................................................................................................................ - sub v17.4S, v18.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. - add v13.4S, v18.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. - mls v19.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - add v18.4S, v28.4S, v24.4S // .....................................................................................................................*.................................................................................................................................................................. - sub v16.4S, v28.4S, v24.4S // ....................................................................................................................*................................................................................................................................................................... 
- mul v24.4S, v9.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - sub v28.4S, v15.4S, v8.4S // ......................................................................................*................................................................................................................................................................................................. - sqrdmulh v8.4S, v9.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... - add v9.4S, v13.4S, v22.4S // ..............................................................................................................................................*......................................................................................................................................... - sub v22.4S, v13.4S, v22.4S // .............................................................................................................................................*.......................................................................................................................................... - mul v13.4S, v28.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... 
- sqrdmulh v28.4S, v28.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. - mls v24.4S, v8.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ - sqrdmulh v8.4S, v21.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ - mul v15.4S, v21.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - sqrdmulh v21.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... - mls v15.4S, v8.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
- add v8.4S, v27.4S, v18.4S // .........................................................................................................................................*.............................................................................................................................................. - sub v18.4S, v27.4S, v18.4S // ........................................................................................................................................*............................................................................................................................................... - mul v27.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ - mul v11.4S, v17.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - mls v13.4S, v28.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. 
- mls v27.4S, v21.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... - sqrdmulh v21.4S, v20.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. - add v28.4S, v15.4S, v13.4S // ...............................................................................................................................*........................................................................................................................................................ - sub v13.4S, v15.4S, v13.4S // ..............................................................................................................................*......................................................................................................................................................... - mul v15.4S, v22.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ - sqrdmulh v22.4S, v22.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... 
- mls v11.4S, v17.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. - sub v17.4S, v19.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... - add v19.4S, v19.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... - sqrdmulh v24.4S, v16.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ - mls v15.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. 
- mls v16.4S, v24.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... - mul v24.4S, v20.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - cmge v20.4S, v31.4S, v15.4S // ....................................................................................................................................................................................*................................................................................................... - mls v24.4S, v21.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ - cmge v21.4S, v15.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... 
- sub v21.4S, v20.4S, v21.4S // ......................................................................................................................................................................................*................................................................................................. - sqrdmulh v20.4S, v13.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - sub v13.4S, v27.4S, v24.4S // ...............................................................................................................*........................................................................................................................................................................ - add v27.4S, v27.4S, v24.4S // ................................................................................................................*....................................................................................................................................................................... - sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... - mls v15.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
- sub v21.4S, v10.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. - mls v22.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... - sqrdmulh v20.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - str q15, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - mul v23.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
- mls v23.4S, v20.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + add v28.4S, v19.4S, v11.4S // ..........................................*............................................................................................................................................................................................................................................. + sub v9.4S, v19.4S, v11.4S // .........................................*.............................................................................................................................................................................................................................................. + ldr q11, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + sub v24.4S, v13.4S, v27.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v21.4S, v17.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + add v27.4S, v13.4S, v27.4S // ......................*................................................................................................................................................................................................................................................................. + ldr q19, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v17.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + add v20.4S, v22.4S, v14.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v23.4S, v22.4S, v14.4S // ..............................................*......................................................................................................................................................................................................................................... 
+ sqrdmulh v22.4S, v24.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + add v8.4S, v18.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. + sub v17.4S, v12.4S, v11.4S // ...............................*........................................................................................................................................................................................................................................................ + add v14.4S, v12.4S, v11.4S // ................................*....................................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v9.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + sub v12.4S, v15.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. 
+ add v15.4S, v15.4S, v10.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v9.4S, v9.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... + sub v10.4S, v16.4S, v19.4S // ....................................*................................................................................................................................................................................................................................................... + add v19.4S, v16.4S, v19.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v27.4S, v18.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... + mls v21.4S, v13.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ add v18.4S, v15.4S, v14.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v14.4S, v15.4S, v14.4S // ..................................................................*..................................................................................................................................................................................................................... + mul v16.4S, v24.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... + sub v15.4S, v19.4S, v28.4S // ............................................................................*........................................................................................................................................................................................................... + mls v16.4S, v22.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v10.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. 
+ mls v9.4S, v11.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + add v11.4S, v19.4S, v28.4S // .............................................................................*.......................................................................................................................................................................................................... + mul v28.4S, v10.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ + mls v28.4S, v13.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v17.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... + add v13.4S, v21.4S, v16.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mul v24.4S, v12.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... + sub v19.4S, v28.4S, v9.4S // .................................................................................*...................................................................................................................................................................................................... + add v10.4S, v28.4S, v9.4S // ..................................................................................*..................................................................................................................................................................................................... + mul v28.4S, v27.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ + sub v9.4S, v21.4S, v16.4S // .............................................................*.......................................................................................................................................................................................................................... + ldr q16, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v12.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... + mul v12.4S, v17.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... + mls v24.4S, v21.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sub v17.4S, v8.4S, v18.4S // ................................................................................................*....................................................................................................................................................................................... + mls v12.4S, v22.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. 
+ sqrdmulh v21.4S, v19.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... + mul v22.4S, v9.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + mls v28.4S, v27.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + mul v19.4S, v19.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + mls v19.4S, v21.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v8.4S, v8.4S, v18.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v9.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + sqrdmulh v27.4S, v17.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + sub v9.4S, v24.4S, v12.4S // .......................................................................*................................................................................................................................................................................................................ + add v12.4S, v24.4S, v12.4S // ........................................................................*............................................................................................................................................................................................................... + mul v21.4S, v17.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... + mls v22.4S, v18.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ mul v18.4S, v15.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + sqrdmulh v24.4S, v15.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v17.4S, v23.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... + ldr q15, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + mls v21.4S, v27.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
+ sub v24.4S, v16.4S, v15.4S // ...................................................*.................................................................................................................................................................................................................................... + add v16.4S, v16.4S, v15.4S // ....................................................*................................................................................................................................................................................................................................... + mul v15.4S, v9.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. + sqrdmulh v27.4S, v9.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. + sub v9.4S, v20.4S, v16.4S // ......................................................................................*................................................................................................................................................................................................. + add v16.4S, v20.4S, v16.4S // .......................................................................................*................................................................................................................................................................................................ 
+ add v20.4S, v13.4S, v12.4S // ......................................................................................................*................................................................................................................................................................................. + mul v23.4S, v23.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + sub v12.4S, v13.4S, v12.4S // .....................................................................................................*.................................................................................................................................................................................. + mls v23.4S, v17.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + add v17.4S, v11.4S, v16.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v16.4S, v11.4S, v16.4S // ....................................................................................................................*................................................................................................................................................................... 
+ mls v15.4S, v27.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sqrdmulh v27.4S, v24.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. + mul v24.4S, v24.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + add v11.4S, v22.4S, v15.4S // ................................................................................................................*....................................................................................................................................................................... + sub v15.4S, v22.4S, v15.4S // ...............................................................................................................*........................................................................................................................................................................ + sqrdmulh v13.4S, v16.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. 
+ mls v24.4S, v27.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v27.4S, v15.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... + mul v15.4S, v15.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... + add v22.4S, v23.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... + sub v23.4S, v23.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ + mul v24.4S, v16.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ 
+ mls v15.4S, v27.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v27.4S, v10.4S, v22.4S // .........................................................................................................................*.............................................................................................................................................................. + add v22.4S, v10.4S, v22.4S // ..........................................................................................................................*............................................................................................................................................................. + sqrdmulh v16.4S, v9.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... + mul v10.4S, v9.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. + add v9.4S, v20.4S, v22.4S // ..............................................................................................................................................*......................................................................................................................................... 
+ sub v20.4S, v20.4S, v22.4S // .............................................................................................................................................*.......................................................................................................................................... + sqrdmulh v22.4S, v14.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... + mls v10.4S, v16.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + mul v16.4S, v14.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. + mls v24.4S, v13.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v14.4S, v18.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v18.4S, v18.4S, v10.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v13.4S, v12.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... + sqrdmulh v10.4S, v12.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + add v12.4S, v21.4S, v24.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v24.4S, v21.4S, v24.4S // ............................................................................................................................................................*........................................................................................................................... + sqrdmulh v21.4S, v23.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... 
+ mul v23.4S, v23.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... + mls v16.4S, v22.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + mls v23.4S, v21.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v22.4S, v20.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ + sub v21.4S, v28.4S, v16.4S // ..........................................................................................................*............................................................................................................................................................................. + add v28.4S, v28.4S, v16.4S // ...........................................................................................................*............................................................................................................................................................................ 
+ mls v13.4S, v10.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + add v10.4S, v19.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v19.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + mul v20.4S, v20.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... + mls v20.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sub v19.4S, v11.4S, v10.4S // .......................................................................................................................................................*................................................................................................................................ 
+ add v11.4S, v11.4S, v10.4S // ........................................................................................................................................................*............................................................................................................................... + sqrdmulh v10.4S, v21.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... + mul v16.4S, v21.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + sqrdmulh v22.4S, v18.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... + sub v21.4S, v28.4S, v14.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v16.4S, v10.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ add v10.4S, v28.4S, v14.4S // ...................................................................................................................................................*.................................................................................................................................... + cmge v14.4S, v20.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. sub count, count, #1 layer1234_start: - add v10.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ - mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - sub v20.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... 
- sqrdmulh v14.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - sub v24.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... - add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... - mul v17.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... - sub v15.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... 
- mul v28.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... - sqrdmulh v21.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - sub v27.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
- sqrdmulh v19.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - mul v18.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - mls v28.4S, v21.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mls v18.4S, v19.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - sub v19.4S, v28.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. 
- add v14.4S, v28.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ - sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - cmge v28.4S, v31.4S, v18.4S // ................................................................................................................................................................................*....................................................................................................... - mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - cmge v8.4S, v18.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... 
- sub v8.4S, v28.4S, v8.4S // ..................................................................................................................................................................................*..................................................................................................... - sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mls v10.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - mls v18.4S, v8.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - cmge v28.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... 
- sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - cmge v8.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - str q18, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - mul v18.4S, v15.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - sub v28.4S, v28.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. - sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
- mul v8.4S, v19.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - sqrdmulh v19.4S, v19.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mls v10.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - mls v18.4S, v15.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
- mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - str q10, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... - mul v10.4S, v27.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sub v27.4S, v28.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... - cmge v20.4S, v18.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - mls v10.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
- cmge v22.4S, v31.4S, v18.4S // ....................................................................................................................................................................................................*................................................................................... - cmge v28.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v16.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v15.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... - sub v27.4S, v15.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- sqrdmulh v15.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - sub v16.4S, v22.4S, v20.4S // ......................................................................................................................................................................................................*................................................................................. - mls v8.4S, v19.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - mls v18.4S, v16.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - sqrdmulh v28.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
- mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - str q18, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. - sqrdmulh v18.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - cmge v14.4S, v31.4S, v10.4S // ............................................................................................................................................................................................*........................................................................................... - mul v24.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - mls v21.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
- mls v24.4S, v28.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - cmge v28.4S, v10.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mls v19.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - sub v15.4S, v14.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... - str q21, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... - mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
- sub v18.4S, v23.4S, v17.4S // ...........................................................................................................................................................................*............................................................................................................ - cmge v14.4S, v31.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... - mls v10.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - add v15.4S, v23.4S, v17.4S // ............................................................................................................................................................................*........................................................................................................... - sqrdmulh v21.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ 
- cmge v9.4S, v8.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - str q10, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... - mls v17.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - sub v28.4S, v14.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. - ldr q9, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... - cmge v20.4S, v24.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
- ldr q27, [x1, #0] // e....................................................................................................................................................................................................................................................................................... - cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................*............................................................................................... - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - cmge v11.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - cmge v10.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
- cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - ldr q14, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ - mls v8.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - sub v16.4S, v16.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. - sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... 
- sub v11.4S, v11.4S, v10.4S // ..............................................................................................................................................................................................................................................................*......................... - sub v28.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................................................................*................................. - mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - str q8, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. - cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... 
- mls v17.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - sub v28.4S, v27.4S, v9.4S // ................e....................................................................................................................................................................................................................................................................... - mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sub v23.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. - mls v15.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v8.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
- sqrdmulh v10.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - add v20.4S, v27.4S, v9.4S // .................e...................................................................................................................................................................................................................................................................... - ldr q27, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... - ldr q9, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... - cmge v17.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... 
- mul v12.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - mls v12.4S, v10.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - ldr q10, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - sub v16.4S, v9.4S, v27.4S // .....................e.................................................................................................................................................................................................................................................................. 
- add v9.4S, v9.4S, v27.4S // ......................e................................................................................................................................................................................................................................................................. - sub v13.4S, v13.4S, v8.4S // ......................................................................................................................................................................................................................................................................*................. - mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - str q24, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... - cmge v27.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - sqrdmulh v24.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
- cmge v8.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... - mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - cmge v23.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - sub v17.4S, v17.4S, v27.4S // ..............................................................................................................................................................................................................................................................................*......... - sub v11.4S, v10.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. 
- mul v27.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sub v8.4S, v8.4S, v23.4S // ..................................................................................................................................................................................................................................................................*..................... - ldr q23, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. - mls v27.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - mls v12.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - add v18.4S, v10.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. 
- str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - ldr q19, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. - cmge v10.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... - mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - cmge v17.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ 
- sub v24.4S, v20.4S, v9.4S // ........................................................e............................................................................................................................................................................................................................... - mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - ldr q13, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... - add v14.4S, v20.4S, v9.4S // .........................................................e.............................................................................................................................................................................................................................. - str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - sub v12.4S, v10.4S, v17.4S // ..............................................................................................................................................................................................................*......................................................................... 
- sqrdmulh v8.4S, v11.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... - add v10.4S, v19.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... - sub v17.4S, v19.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ - str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - ldr q22, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... - mls v27.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
- str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - mul v19.4S, v11.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - sub v15.4S, v22.4S, v23.4S // ..........................e............................................................................................................................................................................................................................................................. - mul v12.4S, v28.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... - str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ - sqrdmulh v28.4S, v28.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
- sqrdmulh v9.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - mul v27.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - mls v12.4S, v28.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... - add v11.4S, v22.4S, v23.4S // ...........................e............................................................................................................................................................................................................................................................ - ldr q16, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. - mls v27.4S, v9.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. 
- add v22.4S, v13.4S, v16.4S // .....................................e.................................................................................................................................................................................................................................................. - mls v19.4S, v8.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - sub v28.4S, v13.4S, v16.4S // ....................................e................................................................................................................................................................................................................................................... - sqrdmulh v8.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - sub v23.4S, v11.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... - add v20.4S, v12.4S, v27.4S // ..............................................................e......................................................................................................................................................................................................................... 
- sub v12.4S, v12.4S, v27.4S // .............................................................e.......................................................................................................................................................................................................................... - sqrdmulh v21.4S, v15.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... - mul v9.4S, v15.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - add v10.4S, v11.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... - sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ - add v11.4S, v14.4S, v10.4S // .................................................................................................e...................................................................................................................................................................................... 
- sub v14.4S, v14.4S, v10.4S // ................................................................................................e....................................................................................................................................................................................... - mul v10.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... - mls v9.4S, v21.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - mls v10.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - mul v17.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - sqrdmulh v13.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... 
- add v28.4S, v22.4S, v18.4S // .............................................................................e.......................................................................................................................................................................................................... - mls v17.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - sub v8.4S, v9.4S, v10.4S // .......................................................................e................................................................................................................................................................................................................ - mul v15.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ - sqrdmulh v21.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - sqrdmulh v16.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... 
- mul v12.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... - sub v22.4S, v22.4S, v18.4S // ............................................................................e........................................................................................................................................................................................................... - mul v14.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... - add v23.4S, v9.4S, v10.4S // ........................................................................e............................................................................................................................................................................................................... - ldr q10, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - ldr q9, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... 
- mls v14.4S, v21.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. - ldr q21, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... - mls v12.4S, v16.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... - mls v15.4S, v13.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... - sub v27.4S, v21.4S, v9.4S // ..............................................e......................................................................................................................................................................................................................................... - sqrdmulh v16.4S, v8.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
- mul v13.4S, v8.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. - sqrdmulh v18.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... - ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... - mls v13.4S, v16.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ - add v16.4S, v21.4S, v9.4S // ...............................................e........................................................................................................................................................................................................................................ - mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... 
- sub v21.4S, v8.4S, v10.4S // ...................................................e.................................................................................................................................................................................................................................... - add v8.4S, v8.4S, v10.4S // ....................................................e................................................................................................................................................................................................................................... - mls v27.4S, v18.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - add v18.4S, v16.4S, v8.4S // .......................................................................................e................................................................................................................................................................................................ - sqrdmulh v9.4S, v22.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ - sub v10.4S, v16.4S, v8.4S // ......................................................................................e................................................................................................................................................................................................. 
- mul v8.4S, v22.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - sub v16.4S, v28.4S, v18.4S // ....................................................................................................................e................................................................................................................................................................... - add v18.4S, v28.4S, v18.4S // .....................................................................................................................e.................................................................................................................................................................. - sqrdmulh v28.4S, v21.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - mul v21.4S, v21.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. - sub v22.4S, v20.4S, v23.4S // .....................................................................................................e.................................................................................................................................................................................. 
- add v20.4S, v20.4S, v23.4S // ......................................................................................................e................................................................................................................................................................................. - add v23.4S, v17.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... - mls v21.4S, v28.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ - sub v17.4S, v17.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... - sqrdmulh v19.4S, v10.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. - mls v8.4S, v9.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
- add v28.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... - sub v27.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ - mul v10.4S, v10.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... - sub v21.4S, v15.4S, v13.4S // ...............................................................................................................e........................................................................................................................................................................ - mls v10.4S, v19.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. - add v19.4S, v23.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. 
- sub v23.4S, v23.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. - sqrdmulh v28.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... - add v9.4S, v20.4S, v19.4S // ..............................................................................................................................................e......................................................................................................................................... - sub v19.4S, v20.4S, v19.4S // .............................................................................................................................................e.......................................................................................................................................... - mul v20.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... - add v27.4S, v15.4S, v13.4S // ................................................................................................................e....................................................................................................................................................................... 
- sqrdmulh v15.4S, v16.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. - sub v13.4S, v8.4S, v10.4S // ..............................................................................................................................e......................................................................................................................................................... - mls v20.4S, v28.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - add v28.4S, v8.4S, v10.4S // ...............................................................................................................................e........................................................................................................................................................ - add v8.4S, v11.4S, v18.4S // .........................................................................................................................................e.............................................................................................................................................. 
- sub v18.4S, v11.4S, v18.4S // ........................................................................................................................................e............................................................................................................................................... - mul v10.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. - mul v11.4S, v19.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ - sqrdmulh v19.4S, v19.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... - mls v16.4S, v15.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... - sqrdmulh v15.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ 
- mls v11.4S, v19.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... - sqrdmulh v19.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - mls v10.4S, v15.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... - cmge v24.4S, v31.4S, v11.4S // ....................................................................................................................................................................................e................................................................................................... - mul v15.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... - cmge v17.4S, v11.4S, v30.4S // .....................................................................................................................................................................................e.................................................................................................. 
- mls v15.4S, v19.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. - sub v17.4S, v24.4S, v17.4S // ......................................................................................................................................................................................e................................................................................................. - sqrdmulh v24.4S, v22.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - mls v11.4S, v17.4S, v29.4S // .......................................................................................................................................................................................e................................................................................................ - add v19.4S, v15.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... - sub v17.4S, v15.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... 
- mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ - sqrdmulh v20.4S, v21.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... - str q11, [x1, #576] // .................................................................................................................................................................................................................e...................................................................... - mul v11.4S, v22.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... - sqrdmulh v13.4S, v13.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... 
- mls v11.4S, v24.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - mul v23.4S, v21.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... - mls v23.4S, v20.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... - mls v22.4S, v13.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... - sub v21.4S, v10.4S, v14.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ // Instructions: 280 + // Expected cycles: 93 + // Expected IPC: 3.01 - // original source code - // ldr q8, [x1, #0] // ..e...............................................................................................................................................................................................|.......................................................................................e............................................................................ - // ldr q9, [x1, #(1*(512/8))] // e.................................................................................................................................................................................................|.....................................................................................e.............................................................................. - // ldr q10, [x1, #(2*(512/8))] // .............................e....................................................................................................................................................................|..................................................................................................................e................................................. - // ldr q11, [x1, #(3*(512/8))] // ............................e.....................................................................................................................................................................|.................................................................................................................e.................................................. 
- // ldr q12, [x1, #(4*(512/8))] // ........................................................................e.........................................................................................................................|.............................................................................................................................................................e...... - // ldr q13, [x1, #(5*(512/8))] // ....................................................e.............................................................................................................................................|.........................................................................................................................................e.......................... - // ldr q14, [x1, #(6*(512/8))] // .........................................................e........................................................................................................................................|..............................................................................................................................................e..................... - // ldr q15, [x1, #(7*(512/8))] // .............................................................e....................................................................................................................................|..................................................................................................................................................e................. - // ldr q16, [x1, #(8*(512/8))] // ................................................................e.................................................................................................................................|.....................................................................................................................................................e.............. 
- // ldr q17, [x1, #(9*(512/8))] // ....................................................................................e.............................................................................................................|.................................................................................................................................................................... - // ldr q18, [x1, #(10*(512/8))] // ....................................e.............................................................................................................................................................|.........................................................................................................................e.......................................... - // ldr q19, [x1, #(11*(512/8))] // .........e........................................................................................................................................................................................|..............................................................................................e..................................................................... - // ldr q20, [x1, #(12*(512/8))] // .....................................................................................................................e............................................................................|.................................................................................................................................................................... - // ldr q21, [x1, #(13*(512/8))] // ...................................................................................................................e..............................................................................|.................................................................................................................................................................... 
- // ldr q22, [x1, #(14*(512/8))] // ............................................................................................................................e.....................................................................|.................................................................................................................................................................... - // ldr q23, [x1, #(15*(512/8))] // ..................................................................................................................e...............................................................................|.................................................................................................................................................................... - // sub v24.4s, v8.4s, v9.4s // .....................e............................................................................................................................................................................|..........................................................................................................e......................................................... - // add v8.4s, v8.4s, v9.4s // ...........................e......................................................................................................................................................................|................................................................................................................e................................................... - // mul v9.4s, v24.4s, v3.s[2] // .............................................................................e....................................................................................................................|..................................................................................................................................................................e. 
- // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...............................................................................e..................................................................................................................|.................................................................................................................................................................... - // mls v9.4s, v24.4s, v29.4s // ..................................................................................e...............................................................................................................|.................................................................................................................................................................... - // sub v24.4s, v10.4s, v11.4s // .....................................e............................................................................................................................................................|..........................................................................................................................e......................................... - // add v10.4s, v10.4s, v11.4s // ......................................e...........................................................................................................................................................|...........................................................................................................................e........................................ - // mul v11.4s, v24.4s, v4.s[0] // .................................................................................e................................................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................................................e.................................................................................................................|.................................................................................................................................................................... - // mls v11.4s, v24.4s, v29.4s // .....................................................................................e............................................................................................................|.................................................................................................................................................................... - // sub v24.4s, v12.4s, v13.4s // ............................................................................e.....................................................................................................................|.................................................................................................................................................................e.. - // add v12.4s, v12.4s, v13.4s // ...................................................................................e..............................................................................................................|.................................................................................................................................................................... - // mul v13.4s, v24.4s, v4.s[2] // ..............................................................................................e...................................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................................................................e....................................................................................................|.................................................................................................................................................................... - // mls v13.4s, v24.4s, v29.4s // ....................................................................................................e.............................................................................................|.................................................................................................................................................................... - // sub v24.4s, v14.4s, v15.4s // ......................................................................e...........................................................................................................................|...........................................................................................................................................................e........ - // add v14.4s, v14.4s, v15.4s // .....................................................................e............................................................................................................................|..........................................................................................................................................................e......... - // mul v15.4s, v24.4s, v5.s[0] // ...................................................................................................e..............................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v5.s[1] // .........................................................................................e........................................................................................................|.................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // .....................................................................................................e............................................................................................|.................................................................................................................................................................... - // sub v24.4s, v16.4s, v17.4s // ........................................................................................e.........................................................................................................|.................................................................................................................................................................... - // add v16.4s, v16.4s, v17.4s // ......................................................................................e...........................................................................................................|.................................................................................................................................................................... - // mul v17.4s, v24.4s, v5.s[2] // ......................................................................................................e...........................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v5.s[3] // ................................................................................................e.................................................................................................|.................................................................................................................................................................... - // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e........................................................................................|.................................................................................................................................................................... - // sub v24.4s, v18.4s, v19.4s // .................................................e................................................................................................................................................|......................................................................................................................................e............................. - // add v18.4s, v18.4s, v19.4s // .......................................................e..........................................................................................................................................|............................................................................................................................................e....................... - // mul v19.4s, v24.4s, v6.s[0] // ...........................................................................e......................................................................................................................|................................................................................................................................................................e... 
- // sqrdmulh v24.4s, v24.4s, v6.s[1] // ....................................................................e.............................................................................................................................|.........................................................................................................................................................e.......... - // mls v19.4s, v24.4s, v29.4s // .......................................................................................e..........................................................................................................|.................................................................................................................................................................... - // sub v24.4s, v20.4s, v21.4s // ........................................................................................................................e.........................................................................|.................................................................................................................................................................... - // add v20.4s, v20.4s, v21.4s // ..............................................................................................................................e...................................................................|.................................................................................................................................................................... - // mul v21.4s, v24.4s, v6.s[2] // ...............................................................................................................................e..................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v6.s[3] // ...........................................................................................................................e......................................................................|.................................................................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................e...............................................................|.................................................................................................................................................................... - // sub v24.4s, v22.4s, v23.4s // ................................................................................................................................e.................................................................|.................................................................................................................................................................... - // add v22.4s, v22.4s, v23.4s // .................................................................................................................................e................................................................|.................................................................................................................................................................... - // mul v23.4s, v24.4s, v7.s[0] // ..........................................................................................................................................e.......................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v7.s[1] // .........................................................................................................................................e........................................................|.................................................................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // ..............................................................................................................................................e...................................................|.................................................................................................................................................................... - // sub v24.4s, v8.4s, v10.4s // ..............................................................e...................................................................................................................................|...................................................................................................................................................e................ - // add v8.4s, v8.4s, v10.4s // .................................................................e................................................................................................................................|......................................................................................................................................................e............. - // mul v10.4s, v24.4s, v1.s[2] // .....................................................................................................................................................................e............................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................................................................................e........................|.................................................................................................................................................................... - // mls v10.4s, v24.4s, v29.4s // ............................................................................................................................................................................e.....................|.................................................................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ............................................................................................e.....................................................................................................|.................................................................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ...........................................................................................e......................................................................................................|.................................................................................................................................................................... - // mul v11.4s, v24.4s, v1.s[2] // ...........................................................................................................e......................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .......................................................................................................e..........................................................................................|.................................................................................................................................................................... - // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................e..........................................................................|.................................................................................................................................................................... - // sub v24.4s, v12.4s, v14.4s // ..........................................................................................e.......................................................................................................|.................................................................................................................................................................... - // add v12.4s, v12.4s, v14.4s // ...............................................................................................e..................................................................................................|.................................................................................................................................................................... - // mul v14.4s, v24.4s, v2.s[0] // ................................................................................................................e.................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e.....................................................................................|.................................................................................................................................................................... - // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................e.............................................................................|.................................................................................................................................................................... - // sub v24.4s, v13.4s, v15.4s // ..........................................................................................................e.......................................................................................|.................................................................................................................................................................... - // add v13.4s, v13.4s, v15.4s // .................................................................................................................e................................................................................|.................................................................................................................................................................... - // mul v15.4s, v24.4s, v2.s[0] // ..........................................................................................................................e.......................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // .........................................................................................................................e........................................................................|.................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // .............................................................................................................................e....................................................................|.................................................................................................................................................................... - // sub v24.4s, v16.4s, v18.4s // ...............................................................................................................e..................................................................................|.................................................................................................................................................................... - // add v16.4s, v16.4s, v18.4s // ........................................................................................................e.........................................................................................|.................................................................................................................................................................... - // mul v18.4s, v24.4s, v2.s[2] // ......................................................................................................................................e...........................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ....................................................................................................................................e.............................................................|.................................................................................................................................................................... - // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e................................................|.................................................................................................................................................................... - // sub v24.4s, v17.4s, v19.4s // ...............................................................................................................................................e..................................................|.................................................................................................................................................................... - // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................e....................................................|.................................................................................................................................................................... - // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................................................e...................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................................................................e......................|.................................................................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // ................................................................................................................................................................................e.................|.................................................................................................................................................................... - // sub v24.4s, v20.4s, v22.4s // .....................................................................................................................................e............................................................|.................................................................................................................................................................... - // add v20.4s, v20.4s, v22.4s // ...................................................................................................................................e..............................................................|.................................................................................................................................................................... - // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................e.............................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e.................................................|.................................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e...........................................|.................................................................................................................................................................... - // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................e..............................................|.................................................................................................................................................................... - // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................e...............................................|.................................................................................................................................................................... - // mul v23.4s, v24.4s, v3.s[0] // ............................................................................................................................................................e.....................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................e........................................|.................................................................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................................e................................|.................................................................................................................................................................... - // sub v24.4s, v8.4s, v12.4s // ..................................................................................................e...............................................................................................|.................................................................................................................................................................... - // add v8.4s, v8.4s, v12.4s // .................................................................................................e................................................................................................|.................................................................................................................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................e...................................................................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................e....................................................................................|.................................................................................................................................................................... - // mls v12.4s, v24.4s, v29.4s // ......................................................................................................................e...........................................................................|.................................................................................................................................................................... - // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................................e......................................................|.................................................................................................................................................................... - // add v9.4s, v9.4s, v13.4s // ............................................................................................................................................e.....................................................|.................................................................................................................................................................... - // mul v13.4s, v24.4s, v0.s[2] // .........................................................................................................................................................................................e........|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................e...............|.................................................................................................................................................................... - // mls v13.4s, v24.4s, v29.4s // ............................................................................................................................................................................................e.....|.................................................................................................................................................................... - // sub v24.4s, v10.4s, v14.4s // .................................................................................................................................................................................................e|.................................................................................................................................................................... - // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................................................................................*.................................................................................................................................................................... - // mul v14.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|.........*.......................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|...........*........................................................................................................................................................ - // mls v14.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.................*.................................................................................................................................................. - // sub v24.4s, v11.4s, v15.4s // .....................................................................................................................................................e............................................|.................................................................................................................................................................... - // add v11.4s, v11.4s, v15.4s // .............................................................................................................................................................e....................................|.................................................................................................................................................................... - // mul v15.4s, v24.4s, v0.s[2] // ..............................................................................................................................................................................................e...|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................................e..........|.................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................................................e..|.................................................................................................................................................................... - // sub v24.4s, v16.4s, v20.4s // .......................................................................................................................................e..........................................................|.................................................................................................................................................................... - // add v16.4s, v16.4s, v20.4s // ........................................................................................................................................e.........................................................|.................................................................................................................................................................... - // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e..................................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e...................................|.................................................................................................................................................................... - // mls v20.4s, v24.4s, v29.4s // ........................................................................................................................................................................e.........................|.................................................................................................................................................................... - // sub v24.4s, v17.4s, v21.4s // ........................................................................................................................................................e.........................................|.................................................................................................................................................................... - // add v17.4s, v17.4s, v21.4s // .......................................................................................................................................................e..........................................|.................................................................................................................................................................... - // mul v21.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................e...........|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................................................................................................e....|.................................................................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|*................................................................................................................................................................... - // sub v24.4s, v18.4s, v22.4s // ................................................................................................................................................................e.................................|.................................................................................................................................................................... - // add v18.4s, v18.4s, v22.4s // ..................................................................................................................................................................e...............................|.................................................................................................................................................................... - // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................e.......|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................................................................e......|.................................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ................................................................................................................................................................................................e.|.................................................................................................................................................................... - // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e............|.................................................................................................................................................................... - // add v19.4s, v19.4s, v23.4s // ....................................................................................................................................................................................e.............|.................................................................................................................................................................... - // mul v23.4s, v24.4s, v1.s[0] // ..................................................................................................................................................................................................|......*............................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................................................................................................................................................|...*................................................................................................................................................................ - // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................*................................................................................................................................................. - // sub v24.4s, v8.4s, v16.4s // ....................................................................................................................................................................e.............................|.................................................................................................................................................................... - // add v8.4s, v8.4s, v16.4s // ...................................................................................................................................................................e..............................|.................................................................................................................................................................... - // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|................*................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|...............*.................................................................................................................................................... - // mls v16.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...................*................................................................................................................................................ - // sub v24.4s, v9.4s, v17.4s // ...........................................................................................................................................................e......................................|.................................................................................................................................................................... - // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................e.......................................|.................................................................................................................................................................... - // mul v17.4s, v24.4s, v0.s[0] // ......................................................................................................................................................................e...........................|.................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................e..........................|.................................................................................................................................................................... - // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.......................|.................................................................................................................................................................... - // sub v24.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|....*............................................................................................................................................................... - // add v10.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|.....*.............................................................................................................................................................. - // mul v18.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...................................................................*................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|..............................................................*..................................................................................................... - // mls v18.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................................*.............................................................................................. - // sub v24.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|............*....................................................................................................................................................... - // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|..........*......................................................................................................................................................... - // mul v19.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................................*.................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|............................*....................................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................................................*................................................................................................................. - // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|.*.................................................................................................................................................................. - // add v12.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|..*................................................................................................................................................................. - // mul v20.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................*.................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|.................................*.................................................................................................................................. - // mls v20.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.............................................*...................................................................................................................... - // sub v24.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|........*........................................................................................................................................................... - // add v13.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|.......*............................................................................................................................................................ - // mul v21.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|....................................*............................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|......................................*............................................................................................................................. - // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................*........................................................................................................................ - // sub v24.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|....................*............................................................................................................................................... - // add v14.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|.....................*.............................................................................................................................................. - // mul v22.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|.......................................*............................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|........................................*........................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................................*........................................................................................................ - // sub v24.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|...........................................................................*........................................................................................ - // add v15.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|..............................................................................*..................................................................................... - // mul v23.4s, v24.4s, v0.s[0] // ..................................................*...............................................................................................................................................|.......................................................................................................................................*............................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................*......................................................................................................................................................|................................................................................................................................*................................... - // mls v23.4s, v24.4s, v29.4s // .....................................................*............................................................................................................................................|..........................................................................................................................................*......................... - // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|........................*........................................................................................................................................... - // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|..........................*......................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........................*........................................................................................................................................ 
- // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|..............................*..................................................................................................................................... - // cmge v27.4s, v31.4s, v17.4s // .............................................................................................................................................................................e....................|.................................................................................................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ...............................................................................................................................................................................e..................|.................................................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................e................|.................................................................................................................................................................... - // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................................e..............|.................................................................................................................................................................... 
- // cmge v27.4s, v31.4s, v18.4s // ...*..............................................................................................................................................................................................|........................................................................................*........................................................................... - // cmge v28.4s, v18.4s, v30.4s // .*................................................................................................................................................................................................|......................................................................................*............................................................................. - // sub v28.4s, v27.4s, v28.4s // ............*.....................................................................................................................................................................................|.................................................................................................*.................................................................. - // mls v18.4s, v28.4s, v29.4s // .................................*................................................................................................................................................................|......................................................................................................................*............................................. - // cmge v27.4s, v31.4s, v19.4s // ..................................................................................................................................................................................................|..................................................................*................................................................................................. 
- // cmge v28.4s, v19.4s, v30.4s // ..................................................................................................................................................................................................|......................................................................*............................................................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|........................................................................*........................................................................................... - // mls v19.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................................*...................................................................................... - // cmge v27.4s, v31.4s, v20.4s // ..................................................................................................................................................................................................|......................................................*............................................................................................................. - // cmge v28.4s, v20.4s, v30.4s // ..................................................................................................................................................................................................|....................................................*............................................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.......................................................*............................................................................................................ - // mls v20.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|....................................................................*............................................................................................... - // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|...................................................*................................................................................................................ - // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.................................................*.................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|..........................................................*......................................................................................................... 
- // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................*...................................................................................................... - // cmge v27.4s, v31.4s, v22.4s // ..................................................................................................................................................................................................|............................................................................*....................................................................................... - // cmge v28.4s, v22.4s, v30.4s // ..................................................................................................................................................................................................|.................................................................................*.................................................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|....................................................................................*............................................................................... - // mls v22.4s, v28.4s, v29.4s // ..........*.......................................................................................................................................................................................|...............................................................................................*.................................................................... 
- // cmge v27.4s, v31.4s, v23.4s // ..........................................................*.......................................................................................................................................|...............................................................................................................................................*.................... - // cmge v28.4s, v23.4s, v30.4s // ............................................................*.....................................................................................................................................|.................................................................................................................................................*.................. - // sub v28.4s, v27.4s, v28.4s // ...................................................................*..............................................................................................................................|........................................................................................................................................................*........... - // mls v23.4s, v28.4s, v29.4s // .........................................................................*........................................................................................................................|..............................................................................................................................................................*..... - // str q16, [x1, #(8*(512/8))] // ..................................................................................................................................................................................................|...................................*................................................................................................................................ 
- // str q17, [x1, #(9*(512/8))] // ........................................................................................................................................................................................e.........|.................................................................................................................................................................... - // str q18, [x1, #(10*(512/8))] // .........................................*........................................................................................................................................................|..............................................................................................................................*..................................... - // str q19, [x1, #(11*(512/8))] // ..................................................................................................................................................................................................|..................................................................................*................................................................................. - // str q20, [x1, #(12*(512/8))] // ..................................................................................................................................................................................................|.........................................................................*.......................................................................................... - // str q21, [x1, #(13*(512/8))] // ..................................................................................................................................................................................................|................................................................*................................................................................................... 
- // str q22, [x1, #(14*(512/8))] // ..................*...............................................................................................................................................................................|.......................................................................................................*............................................................ - // str q23, [x1, #(15*(512/8))] // ..............................................................................*...................................................................................................................|...................................................................................................................................................................* - // mul v16.4s, v8.4s, v25.4s // ..................................................................................................................................................................................................|.............*...................................................................................................................................................... - // sqrdmulh v8.4s, v8.4s, v26.4s // ..................................................................................................................................................................................................|..............*..................................................................................................................................................... - // mls v16.4s, v8.4s, v29.4s // ..................................................................................................................................................................................................|.........................*.......................................................................................................................................... 
- // mul v17.4s, v9.4s, v25.4s // ..................................................................................................................................................................................................|................................................................................*................................................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ..................................................................................................................................................................................................|...............................................................................*.................................................................................... - // mls v17.4s, v9.4s, v29.4s // ..................................................................................................................................................................................................|...................................................................................*................................................................................ - // mul v18.4s, v10.4s, v25.4s // ..................................................................................................................................................................................................|.......................*............................................................................................................................................ - // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................................................................................................................................|......................*............................................................................................................................................. 
- // mls v18.4s, v10.4s, v29.4s // ..................................................................................................................................................................................................|.............................*...................................................................................................................................... - // mul v19.4s, v11.4s, v25.4s // ..................................................................................................................................................................................................|............................................................*....................................................................................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................................................................................................................|.........................................................*.......................................................................................................... - // mls v19.4s, v11.4s, v29.4s // ..................................................................................................................................................................................................|.......................................................................*............................................................................................ - // mul v20.4s, v12.4s, v25.4s // ................................*.................................................................................................................................................................|.....................................................................................................................*.............................................. 
- // sqrdmulh v12.4s, v12.4s, v26.4s // ..........................*.......................................................................................................................................................................|...............................................................................................................*.................................................... - // mls v20.4s, v12.4s, v29.4s // ...................................*..............................................................................................................................................................|........................................................................................................................*........................................... - // mul v21.4s, v13.4s, v25.4s // ....*.............................................................................................................................................................................................|.........................................................................................*.......................................................................... - // sqrdmulh v13.4s, v13.4s, v26.4s // ......*...........................................................................................................................................................................................|...........................................................................................*........................................................................ - // mls v21.4s, v13.4s, v29.4s // ................*.................................................................................................................................................................................|.....................................................................................................*.............................................................. 
- // mul v22.4s, v14.4s, v25.4s // ..................................................................................................................................................................................................|...............................................................*.................................................................................................... - // sqrdmulh v14.4s, v14.4s, v26.4s // ..................................................................................................................................................................................................|.................................................................*.................................................................................................. - // mls v22.4s, v14.4s, v29.4s // ..................................................................................................................................................................................................|..........................................................................*......................................................................................... - // mul v23.4s, v15.4s, v25.4s // ......................*...........................................................................................................................................................................|...........................................................................................................*........................................................ - // sqrdmulh v15.4s, v15.4s, v26.4s // .............*....................................................................................................................................................................................|..................................................................................................*................................................................. 
- // mls v23.4s, v15.4s, v29.4s // ........................*.........................................................................................................................................................................|.............................................................................................................*...................................................... - // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|..........................................*......................................................................................................................... - // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|............................................*....................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|................................................*................................................................................................................... - // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................*.............................................................................................................. 
- // cmge v27.4s, v31.4s, v17.4s // ........*.........................................................................................................................................................................................|.............................................................................................*...................................................................... - // cmge v28.4s, v17.4s, v30.4s // ...........*......................................................................................................................................................................................|................................................................................................*................................................................... - // sub v28.4s, v27.4s, v28.4s // ...............*..................................................................................................................................................................................|....................................................................................................*............................................................... - // mls v17.4s, v28.4s, v29.4s // ....................*.............................................................................................................................................................................|.........................................................................................................*.......................................................... - // cmge v27.4s, v31.4s, v18.4s // ..................................................................................................................................................................................................|................................*................................................................................................................................... 
- // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|..................................*................................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.....................................*.............................................................................................................................. - // mls v18.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.........................................*.......................................................................................................................... - // cmge v27.4s, v31.4s, v19.4s // .....*............................................................................................................................................................................................|..........................................................................................*......................................................................... - // cmge v28.4s, v19.4s, v30.4s // .......*..........................................................................................................................................................................................|............................................................................................*....................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ..............*...................................................................................................................................................................................|...................................................................................................*................................................................ - // mls v19.4s, v28.4s, v29.4s // .............................................*....................................................................................................................................................|..................................................................................................................................*................................. - // cmge v27.4s, v31.4s, v20.4s // ............................................*.....................................................................................................................................................|.................................................................................................................................*.................................. - // cmge v28.4s, v20.4s, v30.4s // ..............................................*...................................................................................................................................................|...................................................................................................................................*................................ - // sub v28.4s, v27.4s, v28.4s // ...................................................*..............................................................................................................................................|........................................................................................................................................*........................... 
- // mls v20.4s, v28.4s, v29.4s // ......................................................*...........................................................................................................................................|...........................................................................................................................................*........................ - // cmge v27.4s, v31.4s, v21.4s // ..................................*...............................................................................................................................................................|.......................................................................................................................*............................................ - // cmge v28.4s, v21.4s, v30.4s // .........................*........................................................................................................................................................................|..............................................................................................................*..................................................... - // sub v28.4s, v27.4s, v28.4s // .......................................*..........................................................................................................................................................|............................................................................................................................*....................................... - // mls v21.4s, v28.4s, v29.4s // ...............................................................*..................................................................................................................................|....................................................................................................................................................*............... 
- // cmge v27.4s, v31.4s, v22.4s // ...................*..............................................................................................................................................................................|........................................................................................................*........................................................... - // cmge v28.4s, v22.4s, v30.4s // .................*................................................................................................................................................................................|......................................................................................................*............................................................. - // sub v28.4s, v27.4s, v28.4s // .......................*..........................................................................................................................................................................|............................................................................................................*....................................................... - // mls v22.4s, v28.4s, v29.4s // ........................................*.........................................................................................................................................................|.............................................................................................................................*...................................... - // cmge v27.4s, v31.4s, v23.4s // ...............................*..................................................................................................................................................................|....................................................................................................................*............................................... 
- // cmge v28.4s, v23.4s, v30.4s // ..........................................*.......................................................................................................................................................|...............................................................................................................................*.................................... - // sub v28.4s, v27.4s, v28.4s // ................................................*.................................................................................................................................................|.....................................................................................................................................*.............................. - // mls v23.4s, v28.4s, v29.4s // ...........................................................*......................................................................................................................................|................................................................................................................................................*................... - // str q16, [x1], #(16) // ..................................................................................................................................................................................................|........................................................*........................................................................................................... - // str q17, [x1, #(-16 + 1*(512/8))] // ..............................*...................................................................................................................................................................|...................................................................................................................*................................................ 
- // str q18, [x1, #(-16 + 2*(512/8))] // ..................................................................................................................................................................................................|..............................................*..................................................................................................................... - // str q19, [x1, #(-16 + 3*(512/8))] // ........................................................*.........................................................................................................................................|.............................................................................................................................................*...................... - // str q20, [x1, #(-16 + 4*(512/8))] // ..................................................................*...............................................................................................................................|.......................................................................................................................................................*............ - // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................*.......................................................................................................................|...............................................................................................................................................................*.... 
- // str q22, [x1, #(-16 + 6*(512/8))] // ...............................................*..................................................................................................................................................|....................................................................................................................................*............................... - // str q23, [x1, #(-16 + 7*(512/8))] // .......................................................................*..........................................................................................................................|............................................................................................................................................................*....... + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + sqrdmulh v28.4S, v12.4S, v26.4S // .........................................*.............................................................................................................................................................................................................................................. 
+ mul v18.4S, v18.4S, v1.S[0] // *....................................................................................................................................................................................................................................................................................... + mul v12.4S, v12.4S, v25.4S // ..........................................*............................................................................................................................................................................................................................................. + mls v12.4S, v28.4S, v29.4S // .................................................*...................................................................................................................................................................................................................................... + cmge v28.4S, v31.4S, v20.4S // ....*................................................................................................................................................................................................................................................................................... + mls v18.4S, v22.4S, v29.4S // ...*.................................................................................................................................................................................................................................................................................... + sub v22.4S, v8.4S, v17.4S // .*...................................................................................................................................................................................................................................................................................... 
+ sub v28.4S, v28.4S, v14.4S // ......*................................................................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v23.4S, v1.S[1] // .....*.................................................................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v1.S[0] // .......*................................................................................................................................................................................................................................................................................ + mls v20.4S, v28.4S, v29.4S // .........*.............................................................................................................................................................................................................................................................................. + add v28.4S, v16.4S, v18.4S // ........*............................................................................................................................................................................................................................................................................... + mls v23.4S, v14.4S, v29.4S // ..................*..................................................................................................................................................................................................................................................................... 
+ sub v14.4S, v16.4S, v18.4S // ..........*............................................................................................................................................................................................................................................................................. + add v18.4S, v8.4S, v17.4S // ..*..................................................................................................................................................................................................................................................................................... + mul v17.4S, v21.4S, v0.S[0] // ............*........................................................................................................................................................................................................................................................................... + str q20, [x1, #576] // .............*.......................................................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v21.4S, v0.S[1] // ...........*............................................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v9.4S, v26.4S // ..........................................................*............................................................................................................................................................................................................................. 
+ mul v9.4S, v9.4S, v25.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v8.4S, v24.4S, v0.S[1] // ..............*......................................................................................................................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[0] // ...............*........................................................................................................................................................................................................................................................................ + mls v9.4S, v20.4S, v29.4S // .............................................................................................*.......................................................................................................................................................................................... + sqrdmulh v20.4S, v27.4S, v1.S[1] // .................*...................................................................................................................................................................................................................................................................... + mul v21.4S, v27.4S, v1.S[0] // ................*....................................................................................................................................................................................................................................................................... 
+ sqrdmulh v27.4S, v18.4S, v26.4S // ......................................*................................................................................................................................................................................................................................................. + mls v21.4S, v20.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mls v24.4S, v8.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mls v17.4S, v16.4S, v29.4S // ...................*.................................................................................................................................................................................................................................................................... + sub v20.4S, v13.4S, v21.4S // ...........................................*............................................................................................................................................................................................................................................ + add v21.4S, v13.4S, v21.4S // ............................................*........................................................................................................................................................................................................................................... 
+ mul v8.4S, v18.4S, v25.4S // .......................................*................................................................................................................................................................................................................................................ + cmge v18.4S, v31.4S, v24.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v8.4S, v27.4S, v29.4S // ....................................................*................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v22.4S, v0.S[1] // .......................*................................................................................................................................................................................................................................................................ + mul v13.4S, v22.4S, v0.S[0] // ....................*................................................................................................................................................................................................................................................................... + cmge v22.4S, v24.4S, v30.4S // ................................................*....................................................................................................................................................................................................................................... 
+ mul v27.4S, v11.4S, v25.4S // ....................................................................*................................................................................................................................................................................................................... + sub v22.4S, v18.4S, v22.4S // ...................................................*.................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v11.4S, v26.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v24.4S, v22.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mul v22.4S, v28.4S, v25.4S // .................................*...................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v26.4S // ...............................*........................................................................................................................................................................................................................................................ 
+ str q24, [x1, #768] // .........................................................*.............................................................................................................................................................................................................................. + sub v24.4S, v15.4S, v23.4S // .....................*.................................................................................................................................................................................................................................................................. + add v15.4S, v15.4S, v23.4S // ......................*................................................................................................................................................................................................................................................................. + mls v13.4S, v16.4S, v29.4S // ............................*........................................................................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v17.4S // ........................*............................................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v19.4S, v0.S[1] // ...............................................................................*........................................................................................................................................................................................................ 
+ mls v22.4S, v28.4S, v29.4S // ...............................................*........................................................................................................................................................................................................................................ + cmge v11.4S, v31.4S, v13.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v28.4S, v19.4S, v0.S[0] // ..................................................................................*..................................................................................................................................................................................................... + cmge v19.4S, v13.4S, v30.4S // ......................................................*................................................................................................................................................................................................................................. + mls v28.4S, v23.4S, v29.4S // .......................................................................................*................................................................................................................................................................................................ + sub v11.4S, v11.4S, v19.4S // ........................................................*............................................................................................................................................................................................................................... 
+ cmge v19.4S, v17.4S, v30.4S // ..............................*......................................................................................................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ...............................................................*........................................................................................................................................................................................................................ + sub v19.4S, v16.4S, v19.4S // ................................*....................................................................................................................................................................................................................................................... + cmge v11.4S, v22.4S, v30.4S // ......................................................................*................................................................................................................................................................................................................. + mls v27.4S, v18.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + cmge v18.4S, v31.4S, v22.4S // ................................................................................................*....................................................................................................................................................................................... 
+ cmge v16.4S, v28.4S, v30.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v17.4S, v19.4S, v29.4S // ..................................*..................................................................................................................................................................................................................................................... + cmge v19.4S, v31.4S, v28.4S // ........................................................................................................*............................................................................................................................................................................... + str q13, [x1, #512] // .....................................................................*.................................................................................................................................................................................................................. + sub v11.4S, v18.4S, v11.4S // .....................................................................................................*.................................................................................................................................................................................. + sqrdmulh v23.4S, v21.4S, v26.4S // .........................................................................*.............................................................................................................................................................................................................. 
+ cmge v18.4S, v8.4S, v30.4S // ........................................................................................*............................................................................................................................................................................................... + sub v16.4S, v19.4S, v16.4S // ................................................................................................................*....................................................................................................................................................................... + mul v13.4S, v21.4S, v25.4S // ...........................................................................*............................................................................................................................................................................................................ + cmge v19.4S, v12.4S, v30.4S // .................................................................*...................................................................................................................................................................................................................... + str q17, [x1, #640] // .....................................*.................................................................................................................................................................................................................................................. + cmge v17.4S, v31.4S, v12.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ mls v22.4S, v11.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v11.4S, v17.4S, v19.4S // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v19.4S, v24.4S, v0.S[1] // ....................................*................................................................................................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[0] // ...................................*.................................................................................................................................................................................................................................................... + str q22, [x1, #384] // ............................................................................................................................*........................................................................................................................................................... + cmge v22.4S, v31.4S, v8.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ sqrdmulh v21.4S, v14.4S, v0.S[1] // ..........................*............................................................................................................................................................................................................................................................. + mul v17.4S, v14.4S, v0.S[0] // ...........................*............................................................................................................................................................................................................................................................ + mls v24.4S, v19.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mul v19.4S, v20.4S, v0.S[0] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ................................................................................*....................................................................................................................................................................................................... + mls v28.4S, v16.4S, v29.4S // ......................................................................................................................*................................................................................................................................................................. 
+ cmge v14.4S, v31.4S, v24.4S // .............................................................*.......................................................................................................................................................................................................................... + cmge v16.4S, v24.4S, v30.4S // ..............................................................*......................................................................................................................................................................................................................... + mls v19.4S, v20.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + sub v18.4S, v22.4S, v18.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v22.4S, v14.4S, v16.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v20.4S, v15.4S, v26.4S // .................................................................................................*...................................................................................................................................................................................... 
+ mul v15.4S, v15.4S, v25.4S // ...................................................................................................*.................................................................................................................................................................................... + cmge v14.4S, v31.4S, v19.4S // ..............................................................................................*......................................................................................................................................................................................... + mls v24.4S, v22.4S, v29.4S // ........................................................................*............................................................................................................................................................................................................... + mls v13.4S, v23.4S, v29.4S // .............................................................................................................*.......................................................................................................................................................................... + mls v8.4S, v18.4S, v29.4S // ....................................................................................................................................*................................................................................................................................................... + cmge v18.4S, v31.4S, v27.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ str q24, [x1, #960] // ............................................................................*........................................................................................................................................................................................................... + mls v17.4S, v21.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + cmge v22.4S, v13.4S, v30.4S // ...........................................................................................................................*............................................................................................................................................................ + mls v12.4S, v11.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + cmge v11.4S, v31.4S, v13.4S // ...............................................................................................................................*........................................................................................................................................................ + sqrdmulh v16.4S, v10.4S, v26.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ cmge v23.4S, v19.4S, v30.4S // ............................................................................................*........................................................................................................................................................................................... + cmge v21.4S, v31.4S, v17.4S // ...........................................................*............................................................................................................................................................................................................................ + mls v15.4S, v20.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + cmge v20.4S, v17.4S, v30.4S // .....................................................*.................................................................................................................................................................................................................................. + sub v11.4S, v11.4S, v22.4S // ...................................................................................................................................*.................................................................................................................................................... + str q12, [x1, #256] // ...........................................................................................................................................*............................................................................................................................................ 
+ mul v24.4S, v10.4S, v25.4S // ..............................................................................*......................................................................................................................................................................................................... + cmge v22.4S, v27.4S, v30.4S // ............................................................................................................*........................................................................................................................................................................... + sub v21.4S, v21.4S, v20.4S // ................................................................*....................................................................................................................................................................................................................... + ldr q10, [x1, #464] // ...........................................................................................................................................................*............................................................................................................................ + mls v24.4S, v16.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... + cmge v20.4S, v15.4S, v30.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ ldr q12, [x1, #400] // ......................................................................................................................................................*................................................................................................................................. + str q8, [x1], #(16) // .........................................................................................................................................*.............................................................................................................................................. + cmge v8.4S, v31.4S, v15.4S // .......................................................................................................................*................................................................................................................................................................ + sub v16.4S, v18.4S, v22.4S // ..................................................................................................................*..................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ......................................................................................................................................*................................................................................................................................................. + str q28, [x1, #688] // ................................................................................................................................*....................................................................................................................................................... 
+ sub v28.4S, v14.4S, v23.4S // ..................................................................................................*..................................................................................................................................................................................... + ldr q11, [x1, #192] // ...................................................................................................................................................*.................................................................................................................................... + ldr q22, [x1, #0] // ............................................................................................................................................*........................................................................................................................................... + sub v14.4S, v8.4S, v20.4S // .............................................................................................................................*.......................................................................................................................................................... + ldr q23, [x1, #704] // .................................................................................................................................................*...................................................................................................................................... + mls v17.4S, v21.4S, v29.4S // .......................................................................*................................................................................................................................................................................................................ 
+ cmge v21.4S, v31.4S, v24.4S // ....................................................................................................*................................................................................................................................................................................... + mls v27.4S, v16.4S, v29.4S // ..........................................................................................................................*............................................................................................................................................................. + cmge v8.4S, v24.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + cmge v18.4S, v31.4S, v9.4S // ....................................................................................................................*................................................................................................................................................................... + mls v15.4S, v14.4S, v29.4S // .................................................................................................................................*...................................................................................................................................................... + ldr q20, [x1, #128] // ..................................................................................................................................................*..................................................................................................................................... 
+ cmge v14.4S, v9.4S, v30.4S // .......................................................................................................*................................................................................................................................................................................ + str q17, [x1, #880] // ..........................................................................*............................................................................................................................................................................................................. + sub v17.4S, v21.4S, v8.4S // ...........................................................................................................*............................................................................................................................................................................ + ldr q21, [x1, #640] // ..............................................................................................................................................*......................................................................................................................................... + mls v19.4S, v28.4S, v29.4S // ......................................................................................................*................................................................................................................................................................................. + sub v16.4S, v12.4S, v10.4S // ......................................................................................................................................................................*................................................................................................................. 
+ ldr q8, [x1, #64] // .............................................................................................................................................*.......................................................................................................................................... + ldr q28, [x1, #512] // ...............................................................................................................................................*........................................................................................................................................ + mls v24.4S, v17.4S, v29.4S // .................................................................................................................*...................................................................................................................................................................... + add v17.4S, v20.4S, v11.4S // ...............................................................................................................................................................*........................................................................................................................ + str q15, [x1, #432] // .......................................................................................................................................*................................................................................................................................................ + sub v20.4S, v20.4S, v11.4S // ............................................................................................................................................................*........................................................................................................................... 
+ sub v15.4S, v22.4S, v8.4S // ....................................................................................................................................................*................................................................................................................................... + str q13, [x1, #304] // ..........................................................................................................................................*............................................................................................................................................. + mul v13.4S, v16.4S, v5.S[0] // ...................................................................................................................................................................................................*.................................................................................... + add v22.4S, v22.4S, v8.4S // .....................................................................................................................................................*.................................................................................................................................. + str q27, [x1, #176] // ..................................................................................................................................*..................................................................................................................................................... + ldr q27, [x1, #256] // ..............................................................................................................................................................*......................................................................................................................... 
+ mul v11.4S, v20.4S, v4.S[0] // ..................................................................................................................................................................................*..................................................................................................... + str q19, [x1, #816] // ...............................................................................................................*........................................................................................................................................................................ + ldr q19, [x1, #320] // ........................................................................................................................................................*............................................................................................................................... + str q24, [x1, #112] // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v8.4S, v20.4S, v4.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v20.4S, v18.4S, v14.4S // ........................................................................................................................*............................................................................................................................................................... 
+ add v18.4S, v12.4S, v10.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v14.4S, v15.4S, v3.S[3] // .................................................................................................................................................................*...................................................................................................................... + add v12.4S, v27.4S, v19.4S // ..........................................................................................................................................................................*............................................................................................................. + mls v9.4S, v20.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v20.4S, v21.4S, v23.4S // ..........................................................................................................................................................*............................................................................................................................. + mul v24.4S, v15.4S, v3.S[2] // .............................................................................................................................................................*.......................................................................................................................... 
+ sub v15.4S, v22.4S, v17.4S // ..............................................................................................................................................................................*......................................................................................................... + ldr q10, [x1, #576] // ................................................................................................................................................................*....................................................................................................................... + mls v11.4S, v8.4S, v29.4S // ....................................................................................................................................................................................*................................................................................................... + add v8.4S, v22.4S, v17.4S // .....................................................................................................................................................................*.................................................................................................................. + sub v17.4S, v27.4S, v19.4S // .........................................................................................................................................................................*.............................................................................................................. + add v19.4S, v21.4S, v23.4S // .........................................................................................................................................................*.............................................................................................................................. 
+ str q9, [x1, #48] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v22.4S, v20.4S, v6.S[1] // ........................................................................................................................................................................*............................................................................................................... + add v23.4S, v12.4S, v18.4S // ................................................................................................................................................................................*....................................................................................................... + mul v9.4S, v20.4S, v6.S[0] // ...........................................................................................................................................................................*............................................................................................................ + mls v24.4S, v14.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v14.4S, v12.4S, v18.4S // .................................................................................................................................................................................*...................................................................................................... 
+ sub v27.4S, v8.4S, v23.4S // .....................................................................................................................................................................................................*.................................................................................. + add v8.4S, v8.4S, v23.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v12.4S, v15.4S, v1.S[3] // .......................................................................................................................................................................................................*................................................................................ + add v20.4S, v28.4S, v10.4S // .............................................................................................................................................................................*.......................................................................................................... + sub v10.4S, v28.4S, v10.4S // ............................................................................................................................................................................*........................................................................................................... + mul v28.4S, v15.4S, v1.S[2] // ...............................................................................................................................................................................................*........................................................................................ 
+ sqrdmulh v16.4S, v16.4S, v5.S[1] // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v15.4S, v17.4S, v4.S[3] // ..................................................................................................................................................................................................*..................................................................................... + sub v18.4S, v20.4S, v19.4S // ...................................................................................................................................................................................*.................................................................................................... + add v21.4S, v24.4S, v11.4S // ...........................................................................................................................................................................................*............................................................................................ + mul v23.4S, v17.4S, v4.S[2] // ............................................................................................................................................................................................*........................................................................................... + sub v17.4S, v24.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... 
+ add v11.4S, v20.4S, v19.4S // .......................................................................................................................................................................................*................................................................................................ + ldr q20, [x1, #768] // ................................................................................................................................................*....................................................................................................................................... + mls v13.4S, v16.4S, v29.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v24.4S, v27.4S, v0.S[3] // ...............................................................................................................................................................................................................*........................................................................ + mul v19.4S, v27.4S, v0.S[2] // ..................................................................................................................................................................................................................*..................................................................... + mls v9.4S, v22.4S, v29.4S // ......................................................................................................................................................................................*................................................................................................. 
+ mls v28.4S, v12.4S, v29.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v22.4S, v10.4S, v5.S[3] // .....................................................................................................................................................................................*.................................................................................................. + mul v27.4S, v10.4S, v5.S[2] // ........................................................................................................................................................................................*............................................................................................... + mls v27.4S, v22.4S, v29.4S // .........................................................................................................................................................................................*.............................................................................................. + ldr q22, [x1, #832] // .......................................................................................................................................................*................................................................................................................................ + mls v19.4S, v24.4S, v29.4S // ........................................................................................................................................................................................................................*............................................................... 
+ ldr q24, [x1, #896] // .................................................................................................................................................................................................*...................................................................................... + sub v12.4S, v20.4S, v22.4S // ...................................................................................................................................................................*.................................................................................................................... + mul v10.4S, v17.4S, v1.S[2] // .........................................................................................................................................................................................................*.............................................................................. + add v22.4S, v20.4S, v22.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v20.4S, v17.4S, v1.S[3] // ..............................................................................................................................................................................................................*......................................................................... + sub v16.4S, v27.4S, v9.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ ldr q17, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + add v27.4S, v27.4S, v9.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v9.4S, v18.4S, v2.S[3] // .....................................................................................................................................................................................................................*.................................................................. + mul v18.4S, v18.4S, v2.S[2] // ....................................................................................................................................................................................................................*................................................................... + mls v10.4S, v20.4S, v29.4S // ...................................................................................................................................................................................................................*.................................................................... + add v20.4S, v24.4S, v17.4S // ...........................................................................................................................................................................................................................*............................................................ 
+ mls v23.4S, v15.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + sub v15.4S, v24.4S, v17.4S // ..........................................................................................................................................................................................................................*............................................................. + add v17.4S, v22.4S, v20.4S // ...............................................................................................................................................................................................................................*........................................................ + sub v24.4S, v22.4S, v20.4S // ..............................................................................................................................................................................................................................*......................................................... + sqrdmulh v22.4S, v12.4S, v6.S[3] // ......................................................................................................................................................................................................................*................................................................. + mul v20.4S, v12.4S, v6.S[2] // .................................................................................................................................................................................................................................*...................................................... 
+ sub v12.4S, v11.4S, v17.4S // .....................................................................................................................................................................................................................................*.................................................. + add v17.4S, v11.4S, v17.4S // ....................................................................................................................................................................................................................................*................................................... + mls v18.4S, v9.4S, v29.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v11.4S, v23.4S, v13.4S // ................................................................................................................................................................................................................*....................................................................... + add v13.4S, v23.4S, v13.4S // .................................................................................................................................................................................................................*...................................................................... + mul v23.4S, v15.4S, v7.S[0] // ........................................................................................................................................................................................................................................*............................................... 
+ sqrdmulh v9.4S, v15.4S, v7.S[1] // .......................................................................................................................................................................................................................................*................................................ + mls v23.4S, v9.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v20.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sqrdmulh v15.4S, v14.4S, v2.S[1] // .........................................................................................................................................................................................................................................................*.............................. + mul v9.4S, v14.4S, v2.S[0] // ...........................................................................................................................................................................................................................................................*............................ + add v14.4S, v20.4S, v23.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ sub v22.4S, v20.4S, v23.4S // ................................................................................................................................................................................................................................................*....................................... + mul v20.4S, v11.4S, v2.S[0] // ............................................................................................................................................................................................................................*........................................................... + add v23.4S, v27.4S, v14.4S // ....................................................................................................................................................................................................................................................*................................... + mls v9.4S, v15.4S, v29.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v27.4S, v27.4S, v14.4S // ...................................................................................................................................................................................................................................................*.................................... + sqrdmulh v11.4S, v11.4S, v2.S[1] // .............................................................................................................................................................................................................................*.......................................................... 
+ sqrdmulh v15.4S, v16.4S, v2.S[3] // ........................................................................................................................................................................................................*............................................................................... + mul v14.4S, v16.4S, v2.S[2] // ...........................................................................................................................................................................................................*............................................................................ + mls v20.4S, v11.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sqrdmulh v16.4S, v22.4S, v3.S[1] // ...................................................................................................................................................................................................................................................................*.................... + mls v14.4S, v15.4S, v29.4S // ............................................................................................................................................................................................................*........................................................................... + sub v15.4S, v10.4S, v20.4S // ..........................................................................................................................................................................................................................................*............................................. 
+ mul v22.4S, v22.4S, v3.S[0] // ....................................................................................................................................................................................................................................................................*................... + add v11.4S, v10.4S, v20.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v10.4S, v28.4S, v9.4S // ........................................................................................................................................................................................................................................................................*............... + sqrdmulh v20.4S, v15.4S, v0.S[3] // .............................................................................................................................................................................................................................................*.......................................... + add v28.4S, v28.4S, v9.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v22.4S, v16.4S, v29.4S // ......................................................................................................................................................................................................................................................................*................. 
+ add v9.4S, v21.4S, v13.4S // ................................................................................................................................................................................................................................*....................................................... + mul v15.4S, v15.4S, v0.S[2] // ..............................................................................................................................................................................................................................................*......................................... + mls v15.4S, v20.4S, v29.4S // ..................................................................................................................................................................................................................................................*..................................... + sub v20.4S, v9.4S, v23.4S // ........................................................................................................................................................................................................................................................*............................... + add v9.4S, v9.4S, v23.4S // .......................................................................................................................................................................................................................................................*................................ + sub v13.4S, v21.4S, v13.4S // ..................................................................................................................................................................................................................................*..................................................... 
+ sqrdmulh v21.4S, v20.4S, v0.S[1] // .......................................................................................................................................................................................................................................................................*................ + sub v23.4S, v14.4S, v22.4S // ............................................................................................................................................................................................................................................................................*........... + mul v20.4S, v20.4S, v0.S[0] // .............................................................................................................................................................................................................................................................................*.......... + add v14.4S, v14.4S, v22.4S // ...........................................................................................................................................................................................................................................................................*............ + sqrdmulh v22.4S, v12.4S, v1.S[1] // ...........................................................................................................................................................................................................................................*............................................ + mul v12.4S, v12.4S, v1.S[0] // .................................................................................................................................................................................................................................................*...................................... 
+ sqrdmulh v16.4S, v24.4S, v3.S[1] // .....................................................................................................................................................................................................................................................*.................................. + mls v12.4S, v22.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... + mul v22.4S, v24.4S, v3.S[0] // ......................................................................................................................................................................................................................................................*................................. + mls v22.4S, v16.4S, v29.4S // ..........................................................................................................................................................................................................................................................*............................. + sqrdmulh v24.4S, v10.4S, v0.S[3] // .................................................................................................................................................................................................................................................................................*...... + mul v16.4S, v10.4S, v0.S[2] // ..................................................................................................................................................................................................................................................................................*..... 
+ add v10.4S, v18.4S, v22.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v20.4S, v21.4S, v29.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v18.4S, v18.4S, v22.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v16.4S, v24.4S, v29.4S // .....................................................................................................................................................................................................................................................................................*.. + sub v21.4S, v28.4S, v10.4S // ....................................................................................................................................................................................................................................................................................*... + add v10.4S, v28.4S, v10.4S // ......................................................................................................................................................................................................................................................................................*. 
+ sqrdmulh v28.4S, v13.4S, v0.S[3] // ................................................................................................................................................................................................................................................................*....................... + sub v24.4S, v19.4S, v12.4S // ..................................................................................................................................................................................................................................................................*..................... + add v12.4S, v19.4S, v12.4S // .................................................................................................................................................................................................................................................................*...................... + mul v13.4S, v13.4S, v0.S[2] // ...............................................................................................................................................................................................................................................................*........................ + sub v19.4S, v11.4S, v14.4S // ...............................................................................................................................................................................................................................................................................*........ + add v11.4S, v11.4S, v14.4S // ................................................................................................................................................................................................................................................................................*....... 
+ cmge v14.4S, v20.4S, v30.4S // .......................................................................................................................................................................................................................................................................................* + sqrdmulh v22.4S, v18.4S, v1.S[1] // ...................................................................................................................................................................................................................................................................................*.... + mls v13.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................................................................*............. + + // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // mul v28.4S, v18.4S, v1.S[0] // .*...................................................................................................................................................................................................................................................................................... 
+ // sub v18.4S, v8.4S, v17.4S // ......*................................................................................................................................................................................................................................................................................. + // add v8.4S, v8.4S, v17.4S // ..............*......................................................................................................................................................................................................................................................................... + // mls v28.4S, v22.4S, v29.4S // .....*.................................................................................................................................................................................................................................................................................. + // cmge v22.4S, v31.4S, v20.4S // ....*................................................................................................................................................................................................................................................................................... + // sqrdmulh v17.4S, v23.4S, v1.S[1] // ........*............................................................................................................................................................................................................................................................................... + // sub v22.4S, v22.4S, v14.4S // .......*................................................................................................................................................................................................................................................................................ 
+ // mul v14.4S, v23.4S, v1.S[0] // .........*.............................................................................................................................................................................................................................................................................. + // add v23.4S, v16.4S, v28.4S // ...........*............................................................................................................................................................................................................................................................................ + // mls v20.4S, v22.4S, v29.4S // ..........*............................................................................................................................................................................................................................................................................. + // sub v22.4S, v16.4S, v28.4S // .............*.......................................................................................................................................................................................................................................................................... + // sqrdmulh v16.4S, v21.4S, v0.S[1] // .................*...................................................................................................................................................................................................................................................................... + // mul v21.4S, v21.4S, v0.S[0] // ...............*........................................................................................................................................................................................................................................................................ 
+ // str q20, [x1, #576] // ................*....................................................................................................................................................................................................................................................................... + // sqrdmulh v20.4S, v24.4S, v0.S[1] // ....................*................................................................................................................................................................................................................................................................... + // mul v24.4S, v24.4S, v0.S[0] // .....................*.................................................................................................................................................................................................................................................................. + // mul v28.4S, v27.4S, v1.S[0] // ........................*............................................................................................................................................................................................................................................................... + // sqrdmulh v27.4S, v27.4S, v1.S[1] // .......................*................................................................................................................................................................................................................................................................ + // mls v14.4S, v17.4S, v29.4S // ............*........................................................................................................................................................................................................................................................................... 
+ // mls v21.4S, v16.4S, v29.4S // ............................*........................................................................................................................................................................................................................................................... + // mul v16.4S, v18.4S, v0.S[0] // ...................................*.................................................................................................................................................................................................................................................... + // sub v17.4S, v15.4S, v14.4S // ............................................*........................................................................................................................................................................................................................................... + // add v15.4S, v15.4S, v14.4S // .............................................*.......................................................................................................................................................................................................................................... + // sqrdmulh v18.4S, v18.4S, v0.S[1] // ..................................*..................................................................................................................................................................................................................................................... + // cmge v14.4S, v31.4S, v21.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ // mls v24.4S, v20.4S, v29.4S // ...........................*............................................................................................................................................................................................................................................................ + // sqrdmulh v20.4S, v22.4S, v0.S[1] // ...............................................................................*........................................................................................................................................................................................................ + // mul v22.4S, v22.4S, v0.S[0] // ................................................................................*....................................................................................................................................................................................................... + // mls v16.4S, v18.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + // mls v22.4S, v20.4S, v29.4S // ..................................................................................................*..................................................................................................................................................................................... + // cmge v20.4S, v21.4S, v30.4S // .......................................................*................................................................................................................................................................................................................................ 
+ // sqrdmulh v18.4S, v23.4S, v26.4S // ..........................................*............................................................................................................................................................................................................................................. + // sub v20.4S, v14.4S, v20.4S // .........................................................*.............................................................................................................................................................................................................................. + // mul v14.4S, v23.4S, v25.4S // .........................................*.............................................................................................................................................................................................................................................. + // mls v21.4S, v20.4S, v29.4S // ..............................................................*......................................................................................................................................................................................................................... + // mul v23.4S, v17.4S, v0.S[0] // ............................................................................*........................................................................................................................................................................................................... + // sqrdmulh v20.4S, v17.4S, v0.S[1] // ...........................................................................*............................................................................................................................................................................................................ 
+ // str q21, [x1, #640] // .......................................................................*................................................................................................................................................................................................................ + // sqrdmulh v17.4S, v8.4S, v26.4S // .........................*.............................................................................................................................................................................................................................................................. + // mul v8.4S, v8.4S, v25.4S // ...............................*........................................................................................................................................................................................................................................................ + // mls v28.4S, v27.4S, v29.4S // ..........................*............................................................................................................................................................................................................................................................. + // sqrdmulh v27.4S, v12.4S, v26.4S // *....................................................................................................................................................................................................................................................................................... + // mul v12.4S, v12.4S, v25.4S // ..*..................................................................................................................................................................................................................................................................................... 
+ // sub v21.4S, v13.4S, v28.4S // .............................*.......................................................................................................................................................................................................................................................... + // add v13.4S, v13.4S, v28.4S // ..............................*......................................................................................................................................................................................................................................................... + // mls v23.4S, v20.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... + // cmge v20.4S, v31.4S, v24.4S // ................................*....................................................................................................................................................................................................................................................... + // mls v14.4S, v18.4S, v29.4S // .................................................*...................................................................................................................................................................................................................................... + // cmge v18.4S, v24.4S, v30.4S // ....................................*................................................................................................................................................................................................................................................... 
+ // mls v12.4S, v27.4S, v29.4S // ...*.................................................................................................................................................................................................................................................................................... + // cmge v27.4S, v31.4S, v16.4S // ..................................................*..................................................................................................................................................................................................................................... + // sub v28.4S, v20.4S, v18.4S // ......................................*................................................................................................................................................................................................................................................. + // mls v8.4S, v17.4S, v29.4S // .................................*...................................................................................................................................................................................................................................................... + // cmge v20.4S, v22.4S, v30.4S // ..........................................................................................................*............................................................................................................................................................................. + // cmge v18.4S, v16.4S, v30.4S // ....................................................*................................................................................................................................................................................................................................... 
+ // mls v24.4S, v28.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + // sub v18.4S, v27.4S, v18.4S // ......................................................*................................................................................................................................................................................................................................. + // str q24, [x1, #768] // ...........................................*............................................................................................................................................................................................................................................ + // sqrdmulh v27.4S, v9.4S, v26.4S // ..................*..................................................................................................................................................................................................................................................................... + // cmge v17.4S, v31.4S, v22.4S // ........................................................................................................*............................................................................................................................................................................... + // mul v9.4S, v9.4S, v25.4S // ...................*.................................................................................................................................................................................................................................................................... 
+ // cmge v28.4S, v31.4S, v23.4S // .....................................................................................*.................................................................................................................................................................................................. + // cmge v24.4S, v23.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + // mls v16.4S, v18.4S, v29.4S // ........................................................*............................................................................................................................................................................................................................... + // sub v20.4S, v17.4S, v20.4S // ...............................................................................................................*........................................................................................................................................................................ + // cmge v17.4S, v12.4S, v30.4S // ......................................................................*................................................................................................................................................................................................................. + // sub v28.4S, v28.4S, v24.4S // .........................................................................................*.............................................................................................................................................................................................. 
+ // sqrdmulh v18.4S, v11.4S, v26.4S // .......................................*................................................................................................................................................................................................................................................ + // mul v11.4S, v11.4S, v25.4S // .....................................*.................................................................................................................................................................................................................................................. + // str q16, [x1, #512] // ................................................................*....................................................................................................................................................................................................................... + // cmge v16.4S, v14.4S, v30.4S // ..........................................................*............................................................................................................................................................................................................................. + // mls v22.4S, v20.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... + // mls v23.4S, v28.4S, v29.4S // .............................................................................................*.......................................................................................................................................................................................... 
+ // sqrdmulh v24.4S, v13.4S, v26.4S // ..................................................................*..................................................................................................................................................................................................................... + // str q22, [x1, #896] // ......................................................................................................................................*................................................................................................................................................. + // mul v13.4S, v13.4S, v25.4S // .....................................................................*.................................................................................................................................................................................................................. + // str q23, [x1, #960] // .................................................................................................*...................................................................................................................................................................................... + // sqrdmulh v20.4S, v10.4S, v26.4S // ......................................................................................................*................................................................................................................................................................................. + // mul v22.4S, v10.4S, v25.4S // .............................................................................................................*.......................................................................................................................................................................... 
+ // sqrdmulh v10.4S, v19.4S, v0.S[1] // ................................................*....................................................................................................................................................................................................................................... + // sqrdmulh v23.4S, v21.4S, v0.S[1] // ...................................................................................*.................................................................................................................................................................................................... + // mls v22.4S, v20.4S, v29.4S // .................................................................................................................*...................................................................................................................................................................... + // mul v28.4S, v19.4S, v0.S[0] // ...................................................*.................................................................................................................................................................................................................................... + // mul v19.4S, v21.4S, v0.S[0] // ..................................................................................*..................................................................................................................................................................................................... + // mls v19.4S, v23.4S, v29.4S // .......................................................................................*................................................................................................................................................................................................ 
+ // cmge v20.4S, v31.4S, v12.4S // ........................................................................*............................................................................................................................................................................................................... + // cmge v21.4S, v22.4S, v30.4S // .................................................................................................................................*...................................................................................................................................................... + // mls v28.4S, v10.4S, v29.4S // .....................................................*.................................................................................................................................................................................................................................. + // cmge v10.4S, v8.4S, v30.4S // ...................................................................*.................................................................................................................................................................................................................... + // sub v20.4S, v20.4S, v17.4S // ..........................................................................*............................................................................................................................................................................................................. + // mls v11.4S, v18.4S, v29.4S // ...........................................................*............................................................................................................................................................................................................................ 
+ // cmge v18.4S, v31.4S, v8.4S // ..............................................................................*......................................................................................................................................................................................................... + // cmge v23.4S, v19.4S, v30.4S // .......................................................................................................*................................................................................................................................................................................ + // mls v9.4S, v27.4S, v29.4S // ......................*................................................................................................................................................................................................................................................................. + // cmge v27.4S, v31.4S, v19.4S // ............................................................................................*........................................................................................................................................................................................... + // sub v10.4S, v18.4S, v10.4S // ........................................................................................*............................................................................................................................................................................................... + // cmge v18.4S, v31.4S, v14.4S // ............................................................*........................................................................................................................................................................................................................... 
+ // sqrdmulh v17.4S, v15.4S, v26.4S // ..........................................................................................*............................................................................................................................................................................................. + // sub v27.4S, v27.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. + // mul v15.4S, v15.4S, v25.4S // ...........................................................................................*............................................................................................................................................................................................ + // cmge v23.4S, v31.4S, v22.4S // ...............................................................................................................................*........................................................................................................................................................ + // sub v16.4S, v18.4S, v16.4S // .................................................................*...................................................................................................................................................................................................................... + // mls v19.4S, v27.4S, v29.4S // .........................................................................................................................................*.............................................................................................................................................. 
+ // cmge v18.4S, v9.4S, v30.4S // .....................................................................................................................................*.................................................................................................................................................. + // cmge v27.4S, v31.4S, v28.4S // ...............................................................*........................................................................................................................................................................................................................ + // mls v15.4S, v17.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + // cmge v17.4S, v28.4S, v30.4S // .............................................................*.......................................................................................................................................................................................................................... + // sub v23.4S, v23.4S, v21.4S // .......................................................................................................................................*................................................................................................................................................ + // cmge v21.4S, v11.4S, v30.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ // mls v13.4S, v24.4S, v29.4S // ..............................................................................................*......................................................................................................................................................................................... + // cmge v24.4S, v31.4S, v11.4S // ................................................................................................*....................................................................................................................................................................................... + // str q19, [x1, #832] // ........................................................................................................................................................*............................................................................................................................... + // sub v19.4S, v27.4S, v17.4S // ....................................................................*................................................................................................................................................................................................................... + // mls v22.4S, v23.4S, v29.4S // .............................................................................................................................................*.......................................................................................................................................... + // sub v27.4S, v24.4S, v21.4S // ......................................................................................................................*................................................................................................................................................................. 
+ // mls v14.4S, v16.4S, v29.4S // .........................................................................*.............................................................................................................................................................................................................. + // cmge v17.4S, v31.4S, v9.4S // ..................................................................................................................................*..................................................................................................................................................... + // cmge v16.4S, v15.4S, v30.4S // ..................................................................................................................*..................................................................................................................................................................... + // mls v28.4S, v19.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + // cmge v19.4S, v31.4S, v15.4S // .....................................................................................................................*.................................................................................................................................................................. + // sub v18.4S, v17.4S, v18.4S // ............................................................................................................................................................*........................................................................................................................... 
+ // str q22, [x1, #128] // ..........................................................................................................................................................*............................................................................................................................. + // mls v11.4S, v27.4S, v29.4S // ................................................................................................................................*....................................................................................................................................................... + // cmge v22.4S, v13.4S, v30.4S // ...................................................................................................*.................................................................................................................................................................................... + // str q14, [x1, #384] // .............................................................................*.......................................................................................................................................................................................................... + // sub v19.4S, v19.4S, v16.4S // ............................................................................................................................*........................................................................................................................................................... + // mls v9.4S, v18.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ // cmge v18.4S, v31.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + // str q28, [x1, #704] // ........................................................................................................................*............................................................................................................................................................... + // mls v15.4S, v19.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + // str q11, [x1, #192] // .....................................................................................................................................................*.................................................................................................................................. + // sub v18.4S, v18.4S, v22.4S // ...........................................................................................................*............................................................................................................................................................................ + // mls v8.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ 
+ // str q9, [x1, #64] // .........................................................................................................................................................................*.............................................................................................................. + // mls v13.4S, v18.4S, v29.4S // .......................................................................................................................*................................................................................................................................................................ + // str q15, [x1, #448] // ...............................................................................................................................................*........................................................................................................................................ + // mls v12.4S, v20.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + // str q8, [x1], #(16) // ....................................................................................................................*................................................................................................................................................................... + // str q13, [x1, #304] // ..................................................................................................................................................*..................................................................................................................................... 
+ // str q12, [x1, #240] // ............................................................................................................*........................................................................................................................................................................... + // ldr q20, [x1, #0] // ...........................................................................................................................*............................................................................................................................................................ + // ldr q18, [x1, #64] // ...........................................................................................................................................*............................................................................................................................................ + // ldr q19, [x1, #640] // ........................................................................................................................................*............................................................................................................................................... + // ldr q16, [x1, #512] // ............................................................................................................................................*........................................................................................................................................... + // ldr q22, [x1, #768] // ............................................................................................................................................................................................*........................................................................................... 
+ // ldr q11, [x1, #704] // .............................................................................................................................*.......................................................................................................................................................... + // ldr q13, [x1, #128] // ....................................................................................................................................*................................................................................................................................................... + // ldr q27, [x1, #192] // ..........................................................................................................................*............................................................................................................................................................. + // sub v17.4S, v20.4S, v18.4S // .................................................................................................................................................*...................................................................................................................................... + // add v18.4S, v20.4S, v18.4S // ....................................................................................................................................................*................................................................................................................................... + // ldr q12, [x1, #384] // ...................................................................................................................*.................................................................................................................................................................... 
+ // ldr q14, [x1, #832] // .....................................................................................................................................................................................................*.................................................................................. + // ldr q10, [x1, #320] // .........................................................................................................................................................*.............................................................................................................................. + // add v28.4S, v19.4S, v11.4S // ........................................................................................................................................................................*............................................................................................................... + // sub v9.4S, v19.4S, v11.4S // .................................................................................................................................................................*...................................................................................................................... + // ldr q11, [x1, #448] // ................................................................................................................*....................................................................................................................................................................... + // sub v24.4S, v13.4S, v27.4S // ................................................................................................................................................*....................................................................................................................................... 
+ // mul v21.4S, v17.4S, v3.S[2] // ..................................................................................................................................................................*..................................................................................................................... + // ldr q15, [x1, #256] // ......................................................................................................................................................*................................................................................................................................. + // add v27.4S, v13.4S, v27.4S // ..............................................................................................................................................*......................................................................................................................................... + // ldr q19, [x1, #576] // ....................................................................................................................................................................*................................................................................................................... + // sqrdmulh v13.4S, v17.4S, v3.S[3] // ..............................................................................................................................................................*......................................................................................................................... + // add v20.4S, v22.4S, v14.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ // sub v23.4S, v22.4S, v14.4S // ........................................................................................................................................................................................................*............................................................................... + // sqrdmulh v22.4S, v24.4S, v4.S[1] // ...........................................................................................................................................................*............................................................................................................................ + // add v8.4S, v18.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. + // sub v17.4S, v12.4S, v11.4S // ..........................................................................................................................................*............................................................................................................................................. + // add v14.4S, v12.4S, v11.4S // .............................................................................................................................................................*.......................................................................................................................... + // sqrdmulh v11.4S, v9.4S, v6.S[1] // ..........................................................................................................................................................................*............................................................................................................. 
+ // sub v12.4S, v15.4S, v10.4S // .......................................................................................................................................................................*................................................................................................................ + // add v15.4S, v15.4S, v10.4S // ...............................................................................................................................................................*........................................................................................................................ + // mul v9.4S, v9.4S, v6.S[0] // ............................................................................................................................................................................*........................................................................................................... + // sub v10.4S, v16.4S, v19.4S // ...................................................................................................................................................................................*.................................................................................................... + // add v19.4S, v16.4S, v19.4S // ..................................................................................................................................................................................*..................................................................................................... + // sub v27.4S, v18.4S, v27.4S // ...................................................................................................................................................................*.................................................................................................................... 
+ // mls v21.4S, v13.4S, v29.4S // .............................................................................................................................................................................*.......................................................................................................... + // add v18.4S, v15.4S, v14.4S // ...........................................................................................................................................................................*............................................................................................................ + // sub v14.4S, v15.4S, v14.4S // ..............................................................................................................................................................................*......................................................................................................... + // mul v16.4S, v24.4S, v4.S[0] // .......................................................................................................................................................*................................................................................................................................ + // sub v15.4S, v19.4S, v28.4S // .......................................................................................................................................................................................*................................................................................................ + // mls v16.4S, v22.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ // sqrdmulh v13.4S, v10.4S, v5.S[3] // ..................................................................................................................................................................................................*..................................................................................... + // mls v9.4S, v11.4S, v29.4S // ................................................................................................................................................................................................*....................................................................................... + // add v11.4S, v19.4S, v28.4S // ...........................................................................................................................................................................................*............................................................................................ + // mul v28.4S, v10.4S, v5.S[2] // ...................................................................................................................................................................................................*.................................................................................... + // mls v28.4S, v13.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + // sqrdmulh v22.4S, v17.4S, v5.S[1] // .....................................................................................................................................................................................*.................................................................................................. 
+ // add v13.4S, v21.4S, v16.4S // ........................................................................................................................................................................................*............................................................................................... + // mul v24.4S, v12.4S, v4.S[2] // .........................................................................................................................................................................................*.............................................................................................. + // sub v19.4S, v28.4S, v9.4S // ............................................................................................................................................................................................................*........................................................................... + // add v10.4S, v28.4S, v9.4S // ..............................................................................................................................................................................................................*......................................................................... + // mul v28.4S, v27.4S, v1.S[2] // ....................................................................................................................................................................................*................................................................................................... + // sub v9.4S, v21.4S, v16.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ // ldr q16, [x1, #896] // .......................................................................................................................................................................................................*................................................................................ + // sqrdmulh v21.4S, v12.4S, v4.S[3] // ......................................................................................................................................................................................*................................................................................................. + // mul v12.4S, v17.4S, v5.S[0] // ...................................................................................................................................................*.................................................................................................................................... + // mls v24.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................*.................................................................... + // sub v17.4S, v8.4S, v18.4S // ...............................................................................................................................................................................*........................................................................................................ + // mls v12.4S, v22.4S, v29.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ // sqrdmulh v27.4S, v27.4S, v1.S[3] // .................................................................................................................................................................................*...................................................................................................... + // sqrdmulh v21.4S, v19.4S, v2.S[3] // ...........................................................................................................................................................................................................................................*............................................ + // mul v22.4S, v9.4S, v1.S[2] // .........................................................................................................................................................................................................*.............................................................................. + // mls v28.4S, v27.4S, v29.4S // .................................................................................................................................................................................................*...................................................................................... + // mul v19.4S, v19.4S, v2.S[2] // ............................................................................................................................................................................................................................................*........................................... + // mls v19.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ // add v8.4S, v8.4S, v18.4S // ................................................................................................................................................................................*....................................................................................................... + // sqrdmulh v18.4S, v9.4S, v1.S[3] // ...........................................................................................................................................................................................................*............................................................................ + // sqrdmulh v27.4S, v17.4S, v0.S[3] // ..............................................................................................................................................................................................*......................................................................................... + // sub v9.4S, v24.4S, v12.4S // ............................................................................................................................................................................................................................*........................................................... + // add v12.4S, v24.4S, v12.4S // .............................................................................................................................................................................................................................*.......................................................... + // mul v21.4S, v17.4S, v0.S[2] // ...............................................................................................................................................................................................*........................................................................................ 
+ // mls v22.4S, v18.4S, v29.4S // .................................................................................................................................................................................................................*...................................................................... + // mul v18.4S, v15.4S, v2.S[2] // ................................................................................................................................................................................................................*....................................................................... + // sqrdmulh v24.4S, v15.4S, v2.S[3] // ...............................................................................................................................................................................................................*........................................................................ + // sqrdmulh v17.4S, v23.4S, v6.S[3] // .......................................................................................................................................................................................................................*................................................................ + // ldr q15, [x1, #960] // .............................................................................................................................................................................................................*.......................................................................... + // mls v21.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................*................................................................................. 
+ // mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................*............................................................ + // sub v24.4S, v16.4S, v15.4S // ....................................................................................................................................................................................................................*................................................................... + // add v16.4S, v16.4S, v15.4S // ..................................................................................................................................................................................................................*..................................................................... + // mul v15.4S, v9.4S, v2.S[0] // ......................................................................................................................................................................................................................................*................................................. + // sqrdmulh v27.4S, v9.4S, v2.S[1] // ..........................................................................................................................................................................................................................................*............................................. + // sub v9.4S, v20.4S, v16.4S // ......................................................................................................................................................................................................................*................................................................. 
+ // add v16.4S, v20.4S, v16.4S // .....................................................................................................................................................................................................................*.................................................................. + // add v20.4S, v13.4S, v12.4S // .......................................................................................................................................................................................................................................................*................................ + // mul v23.4S, v23.4S, v6.S[2] // ........................................................................................................................................................................................................................*............................................................... + // sub v12.4S, v13.4S, v12.4S // ............................................................................................................................................................................................................................................................*........................... + // mls v23.4S, v17.4S, v29.4S // .................................................................................................................................................................................................................................*...................................................... + // add v17.4S, v11.4S, v16.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ // sub v16.4S, v11.4S, v16.4S // .........................................................................................................................................................................................................................*.............................................................. + // mls v15.4S, v27.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... + // sqrdmulh v27.4S, v24.4S, v7.S[1] // ...............................................................................................................................................................................................................................*........................................................ + // mul v24.4S, v24.4S, v7.S[0] // ..............................................................................................................................................................................................................................*......................................................... + // add v11.4S, v22.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + // sub v15.4S, v22.4S, v15.4S // ................................................................................................................................................................................................................................................*....................................... 
+ // sqrdmulh v13.4S, v16.4S, v1.S[1] // .................................................................................................................................................................................................................................................................*...................... + // mls v24.4S, v27.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + // sqrdmulh v27.4S, v15.4S, v0.S[3] // ....................................................................................................................................................................................................................................................*................................... + // mul v15.4S, v15.4S, v0.S[2] // ........................................................................................................................................................................................................................................................*............................... + // add v22.4S, v23.4S, v24.4S // ....................................................................................................................................................................................................................................*................................................... + // sub v23.4S, v23.4S, v24.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ // mul v24.4S, v16.4S, v1.S[0] // ..................................................................................................................................................................................................................................................................*..................... + // mls v15.4S, v27.4S, v29.4S // .........................................................................................................................................................................................................................................................*.............................. + // sub v27.4S, v10.4S, v22.4S // .........................................................................................................................................................................................................................................*.............................................. + // add v22.4S, v10.4S, v22.4S // .......................................................................................................................................................................................................................................*................................................ + // sqrdmulh v16.4S, v9.4S, v3.S[1] // ...................................................................................................................................................................................................................................................................*.................... + // mul v10.4S, v9.4S, v3.S[0] // .....................................................................................................................................................................................................................................................................*.................. 
+ // add v9.4S, v20.4S, v22.4S // ...........................................................................................................................................................................................................................................................*............................ + // sub v20.4S, v20.4S, v22.4S // ..........................................................................................................................................................................................................................................................*............................. + // sqrdmulh v22.4S, v14.4S, v2.S[1] // ..................................................................................................................................................................................................................................*..................................................... + // mls v10.4S, v16.4S, v29.4S // ......................................................................................................................................................................................................................................................................*................. + // mul v16.4S, v14.4S, v2.S[0] // ...................................................................................................................................................................................................................................*.................................................... + // mls v24.4S, v13.4S, v29.4S // ....................................................................................................................................................................................................................................................................*................... 
+ // add v14.4S, v18.4S, v10.4S // .........................................................................................................................................................................................................................................................................*.............. + // sub v18.4S, v18.4S, v10.4S // ...........................................................................................................................................................................................................................................................................*............ + // mul v13.4S, v12.4S, v0.S[2] // ..................................................................................................................................................................................................................................................................................*..... + // sqrdmulh v10.4S, v12.4S, v0.S[3] // ...............................................................................................................................................................................................................................................................................*........ + // add v12.4S, v21.4S, v24.4S // .................................................................................................................................................................................................................................................................................*...... + // sub v24.4S, v21.4S, v24.4S // ................................................................................................................................................................................................................................................................................*....... 
+ // sqrdmulh v21.4S, v23.4S, v3.S[1] // ..............................................................................................................................................................................................................................................*......................................... + // mul v23.4S, v23.4S, v3.S[0] // .................................................................................................................................................................................................................................................*...................................... + // mls v16.4S, v22.4S, v29.4S // ........................................................................................................................................................................................................................................*............................................... + // mls v23.4S, v21.4S, v29.4S // ......................................................................................................................................................................................................................................................*................................. + // sqrdmulh v22.4S, v20.4S, v0.S[1] // .............................................................................................................................................................................................................................................................*.......................... + // sub v21.4S, v28.4S, v16.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ // add v28.4S, v28.4S, v16.4S // .....................................................................................................................................................................................................................................................*.................................. + // mls v13.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................................................* + // add v10.4S, v19.4S, v23.4S // ................................................................................................................................................................................................................................................................*....................... + // sub v23.4S, v19.4S, v23.4S // ..............................................................................................................................................................................................................................................................*......................... + // mul v20.4S, v20.4S, v0.S[0] // ...............................................................................................................................................................................................................................................................*........................ + // mls v20.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ // sub v19.4S, v11.4S, v10.4S // ...................................................................................................................................................................................................................................................................................*.... + // add v11.4S, v11.4S, v10.4S // ....................................................................................................................................................................................................................................................................................*... + // sqrdmulh v10.4S, v21.4S, v0.S[3] // .......................................................................................................................................................................................................................................................................*................ + // mul v16.4S, v21.4S, v0.S[2] // ........................................................................................................................................................................................................................................................................*............... + // sqrdmulh v22.4S, v18.4S, v1.S[1] // ......................................................................................................................................................................................................................................................................................*. + // sub v21.4S, v28.4S, v14.4S // .............................................................................................................................................................................................................................................................................*.......... 
+ // mls v16.4S, v10.4S, v29.4S // ............................................................................................................................................................................................................................................................................*........... + // add v10.4S, v28.4S, v14.4S // ..............................................................................................................................................................................................................................................................................*......... + // cmge v14.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................................................................................................*.. sub count, count, #1 cbnz count, layer1234_start - mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - add v24.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ - sub v20.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ 
- sub v10.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... - sqrdmulh v14.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... - mul v16.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - sub v18.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... 
- sqrdmulh v15.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... - mul v19.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - mul v27.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - mls v19.4S, v15.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - mls v16.4S, v14.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
- sqrdmulh v15.4S, v10.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - sub v21.4S, v19.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. - add v14.4S, v19.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ - sqrdmulh v22.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - cmge v19.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - mul v20.4S, v10.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... 
- cmge v10.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - mls v20.4S, v15.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - sub v15.4S, v24.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... - sub v10.4S, v19.4S, v10.4S // ..................................................................................................................................................................................*..................................................................................................... - mul v19.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
- add v10.4S, v24.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... + mul v28.4S, v18.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + sub v18.4S, v8.4S, v17.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v17.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v28.4S, v22.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + cmge v22.4S, v31.4S, v20.4S // ....................................................................................................................................................................................*................................................................................................... 
+ sqrdmulh v17.4S, v23.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. + sub v22.4S, v22.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. + mul v14.4S, v23.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. + add v23.4S, v16.4S, v28.4S // .......................................................................................................................................................................*................................................................................................................ + mls v20.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v22.4S, v16.4S, v28.4S // ......................................................................................................................................................................*................................................................................................................. 
+ sqrdmulh v16.4S, v21.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... + mul v21.4S, v21.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + str q20, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + sqrdmulh v20.4S, v24.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mul v24.4S, v24.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ + mul v28.4S, v27.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... 
+ sqrdmulh v27.4S, v27.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + mls v14.4S, v17.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v21.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v16.4S, v18.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + sub v17.4S, v15.4S, v14.4S // ...........................................................................................................................................................................*............................................................................................................ + add v15.4S, v15.4S, v14.4S // ............................................................................................................................................................................*........................................................................................................... 
+ sqrdmulh v18.4S, v18.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. + cmge v14.4S, v31.4S, v21.4S // ........................................................................................................................................................................................*............................................................................................... + mls v24.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sqrdmulh v20.4S, v22.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... + mul v22.4S, v22.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + mls v16.4S, v18.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ mls v22.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v20.4S, v21.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + sqrdmulh v18.4S, v23.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. + sub v20.4S, v14.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v14.4S, v23.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ + mls v21.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ mul v23.4S, v17.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... + sqrdmulh v20.4S, v17.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + str q21, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v17.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... + mul v8.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + mls v28.4S, v27.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ sqrdmulh v27.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v21.4S, v13.4S, v28.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v28.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v20.4S, v31.4S, v24.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v14.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v18.4S, v24.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v12.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v27.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + sub v28.4S, v20.4S, v18.4S // ..................................................................................................................................................................................................*..................................................................................... + mls v8.4S, v17.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ cmge v20.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v18.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v24.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + sub v18.4S, v27.4S, v18.4S // ..................................................................................................................................................................................*..................................................................................................... + str q24, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sqrdmulh v27.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ 
+ cmge v17.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... + cmge v28.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v16.4S, v18.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v20.4S, v17.4S, v20.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ cmge v17.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sub v28.4S, v28.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v18.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - sqrdmulh v24.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
- cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... - mls v27.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - sub v22.4S, v17.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... - sqrdmulh v17.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - mul v28.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... 
- cmge v18.4S, v31.4S, v27.4S // ............................................................................................................................................................................................*........................................................................................... - mls v20.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - cmge v22.4S, v27.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mls v19.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - sub v22.4S, v18.4S, v22.4S // ..............................................................................................................................................................................................*......................................................................................... - sqrdmulh v24.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
- str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - mul v20.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - sqrdmulh v15.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
- mul v18.4S, v21.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - sqrdmulh v21.4S, v21.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mls v20.4S, v24.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - mls v27.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - mls v16.4S, v15.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - add v15.4S, v23.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... 
- cmge v22.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - mls v28.4S, v17.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - str q27, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - sub v27.4S, v23.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ - mls v18.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v24.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... 
- mls v10.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - cmge v8.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v23.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................*................................................................................... - sub v22.4S, v24.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
- sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - cmge v19.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - sub v8.4S, v23.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. - mls v20.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - sub v24.4S, v24.4S, v19.4S // ..................................................................................................................................................................................................................................................*..................................... 
- mls v28.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................*............................................................................... - cmge v23.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v24.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... - str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
- sub v20.4S, v8.4S, v22.4S // ..........................................................................................................................................................................................................*............................................................................. - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - str q28, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - sub v8.4S, v24.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. - sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- sqrdmulh v16.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - mls v10.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - sqrdmulh v23.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - cmge v8.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. 
- cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - sqrdmulh v28.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
- sqrdmulh v9.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - str q18, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. - cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - sqrdmulh v12.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... 
- mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sub v24.4S, v24.4S, v18.4S // ......................................................................................................................................................................................................................................................*................................. - mul v27.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - mls v27.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - mls v20.4S, v9.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - sub v9.4S, v14.4S, v8.4S // ..............................................................................................................................................................................................................................................................*......................... 
- mls v21.4S, v28.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v14.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... - mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - cmge v10.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... 
- cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - mls v15.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v16.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - cmge v12.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - sub v28.4S, v17.4S, v28.4S // ..................................................................................................................................................................................................................................................................*..................... - mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
- sub v10.4S, v14.4S, v10.4S // ..............................................................................................................................................................................................................*......................................................................... - mls v19.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - sub v14.4S, v16.4S, v12.4S // ......................................................................................................................................................................................................................................................................*................. - cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - cmge v16.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... 
- cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - cmge v14.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - sub v19.4S, v16.4S, v17.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v27.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
- str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - sub v14.4S, v14.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. + cmge v16.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v22.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sqrdmulh v24.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + sqrdmulh v20.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v22.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ + sqrdmulh v10.4S, v19.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v23.4S, v21.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... + mls v22.4S, v20.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v28.4S, v19.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + mul v19.4S, v21.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... + mls v19.4S, v23.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v20.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... 
+ cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v28.4S, v10.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v10.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v20.4S, v20.4S, v17.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v11.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v18.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... 
+ cmge v23.4S, v19.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v9.4S, v27.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + cmge v27.4S, v31.4S, v19.4S // ....................................................................................................................................................................................................*................................................................................... + sub v10.4S, v18.4S, v10.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v18.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... + sqrdmulh v17.4S, v15.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... 
+ sub v27.4S, v27.4S, v23.4S // ......................................................................................................................................................................................................*................................................................................. + mul v15.4S, v15.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + cmge v23.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................*............................... + sub v16.4S, v18.4S, v16.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v18.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
+ cmge v27.4S, v31.4S, v28.4S // ............................................................................................................................................................................................*........................................................................................... + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v17.4S, v28.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + sub v23.4S, v23.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. + cmge v21.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v13.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
+ cmge v24.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + str q19, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sub v19.4S, v27.4S, v17.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v27.4S, v24.4S, v21.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v14.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
+ cmge v17.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v16.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v28.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + cmge v19.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + sub v18.4S, v17.4S, v18.4S // ......................................................................................................................................................................................................................................................*................................. + str q22, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... 
+ mls v11.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v22.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q14, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + sub v19.4S, v19.4S, v16.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v9.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + cmge v18.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... 
+ str q28, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... mls v15.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - mls v22.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ 
- str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q11, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + sub v18.4S, v18.4S, v22.4S // ......................................................................................................................................................................................................................................................................*................. + mls v8.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + str q9, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
+ mls v13.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + str q15, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + mls v12.4S, v20.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + str q13, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s index c23659d1..4187d60d 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, 
\r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs 
// @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,29 +321,540 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: mov count, #16 .p2align 2 + // Instructions: 36 + // Expected cycles: 20 + // Expected IPC: 1.80 + // + // Wall time: 0.89s + // User time: 0.89s + // + // -------- original position --------> + // 0 25 + // |------------------------|---------- + ldr q31, [x0, #32] // *................................... + ldr q6, [x3], #(6*16) // ...*................................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + ldr q24, [x3, #-80] // .....*.............................. + ldr q30, [x0, #0] // .*.................................. + ldr q17, [x0, #16] // ..*................................. + ldr q9, [x0, #48] // ....*............................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + ldr q15, [x3, #-16] // ........*........................... + ldr q2, [x3, #-48] // .................*.................. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + ldr q8, [x3, #-64] // ...............*.................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + ldr q10, [x3, #-32] // ................*................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + trn2 v3.4S, v31.4S, v9.4S // .........*.......................... + trn1 v9.4S, v31.4S, v9.4S // ..........*......................... + trn1 v16.4S, v30.4S, v17.4S // ......*............................. + trn2 v31.4S, v30.4S, v17.4S // .......*............................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + trn2 v14.2D, v31.2D, v3.2D // ...........*........................ + trn1 v31.2D, v31.2D, v3.2D // ............*....................... + trn1 v0.2D, v16.2D, v9.2D // .............*...................... + trn2 v30.2D, v16.2D, v9.2D // ..............*..................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v19.4S, v0.4S, v31.4S // ..................*................. + add v21.4S, v0.4S, v31.4S // .......................*............ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v17.4S, v30.4S, v14.4S // ...................*................ + add v30.4S, v30.4S, v14.4S // ......................*............. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + sqrdmulh v9.4S, v17.4S, v15.4S // ....................*............... + mul v3.4S, v17.4S, v10.4S // ........................*........... + mul v16.4S, v19.4S, v8.4S // .....................*.............. + sqrdmulh v12.4S, v19.4S, v2.4S // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + add v7.4S, v21.4S, v30.4S // ...........................*........ + sub v15.4S, v21.4S, v30.4S // ..........................*......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v3.4S, v9.4S, v29.4S // .............................*...... + mls v16.4S, v12.4S, v29.4S // ............................*....... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v13.4S, v16.4S, v3.4S // ...............................*.... + add v10.4S, v16.4S, v3.4S // ...................................* + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sqrdmulh v20.4S, v15.4S, v24.4S // ..............................*..... + mul v17.4S, v15.4S, v6.4S // ..................................*. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sqrdmulh v0.4S, v13.4S, v24.4S // ................................*... + mul v6.4S, v13.4S, v6.4S // .................................*.. 
+ + // ---------- new position -----------> + // 0 25 + // |------------------------|---------- + // ldr q2, [x0, #32] // *................................... + // ldr q1, [x0, #0] // ...*................................ + // ldr q5, [x0, #16] // ....*............................... + // ldr q4, [x3], #(6*16) // .*.................................. + // ldr q7, [x0, #48] // .....*.............................. + // ldr q13, [x3, #-80] // ..*................................. + // trn1 v30.4S, v1.4S, v5.4S // ............*....................... + // trn2 v21.4S, v1.4S, v5.4S // .............*...................... + // ldr q25, [x3, #-16] // ......*............................. + // trn2 v23.4S, v2.4S, v7.4S // ..........*......................... + // trn1 v12.4S, v2.4S, v7.4S // ...........*........................ + // trn2 v10.2D, v21.2D, v23.2D // ..............*..................... + // trn1 v28.2D, v21.2D, v23.2D // ...............*.................... + // trn1 v5.2D, v30.2D, v12.2D // ................*................... + // trn2 v16.2D, v30.2D, v12.2D // .................*.................. + // ldr q12, [x3, #-64] // ........*........................... + // ldr q30, [x3, #-32] // .........*.......................... + // ldr q23, [x3, #-48] // .......*............................ + // sub v14.4S, v5.4S, v28.4S // ..................*................. + // sub v7.4S, v16.4S, v10.4S // ....................*............... + // sqrdmulh v19.4S, v7.4S, v25.4S // ......................*............. + // mul v3.4S, v14.4S, v12.4S // ........................*........... + // add v17.4S, v16.4S, v10.4S // .....................*.............. + // add v10.4S, v5.4S, v28.4S // ...................*................ + // mul v12.4S, v7.4S, v30.4S // .......................*............ + // sqrdmulh v27.4S, v14.4S, v23.4S // .........................*.......... + // sub v30.4S, v10.4S, v17.4S // ...........................*........ 
+ // add v7.4S, v10.4S, v17.4S // ..........................*......... + // mls v3.4S, v27.4S, v29.4S // .............................*...... + // mls v12.4S, v19.4S, v29.4S // ............................*....... + // sqrdmulh v20.4S, v30.4S, v13.4S // ................................*... + // sub v17.4S, v3.4S, v12.4S // ..............................*..... + // sqrdmulh v0.4S, v17.4S, v13.4S // ..................................*. + // mul v6.4S, v17.4S, v4.4S // ...................................* + // mul v17.4S, v30.4S, v4.4S // .................................*.. + // add v10.4S, v3.4S, v12.4S // ...............................*.... + + sub count, count, #1 +layer5678_start: + // Instructions: 76 + // Expected cycles: 24 + // Expected IPC: 3.17 + // + // Wall time: 22.04s + // User time: 22.04s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + mls v17.4S, v20.4S, v29.4S // ................................*........................................... + ldr q2, [x0, #96] // ..e......................................................................... + ldr q1, [x0, #64] // e........................................................................... + mls v6.4S, v0.4S, v29.4S // .....................................*...................................... + ldr q5, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v23.4S, v7.4S, v10.4S // ......................................*..................................... + ldr q4, [x3], #(6*16) // ............e............................................................... 
+ trn2 v10.4S, v7.4S, v10.4S // .......................................*.................................... + ldr q7, [x0, #112] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q11, [x4], #8 // ..............................................*............................. + ldr q31, [x4], #16 // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v20.4S, v17.4S, v6.4S // ........................................*................................... + ldr q13, [x3, #-80] // .............e.............................................................. + trn2 v9.4S, v17.4S, v6.4S // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + trn1 v30.4S, v1.4S, v5.4S // ....e....................................................................... + trn2 v21.4S, v1.4S, v5.4S // .....e...................................................................... + ldr q25, [x3, #-16] // .................e.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v17.2D, v23.2D, v20.2D // ..........................................*................................. + trn1 v3.2D, v10.2D, v9.2D // .............................................*.............................. + trn1 v5.2D, v23.2D, v20.2D // ............................................*............................... + trn2 v0.2D, v10.2D, v9.2D // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v23.4S, v2.4S, v7.4S // .......e.................................................................... + trn1 v12.4S, v2.4S, v7.4S // ......e..................................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v6.4S, v17.4S, v0.4S // .....................................................*...................... + add v2.4S, v17.4S, v0.4S // ......................................................*..................... + sub v20.4S, v5.4S, v3.4S // ................................................*........................... + add v1.4S, v5.4S, v3.4S // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v10.2D, v21.2D, v23.2D // .........e.................................................................. + trn1 v28.2D, v21.2D, v23.2D // ...........e................................................................ + trn1 v5.2D, v30.2D, v12.2D // ..........e................................................................. + trn2 v16.2D, v30.2D, v12.2D // ........e................................................................... + ldr q12, [x3, #-64] // ..............e............................................................. + ldr q30, [x3, #-32] // ................e........................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + sqrdmulh v0.4S, v6.4S, v31.S[3] // .......................................................*.................... + mul v9.4S, v6.4S, v31.S[2] // ........................................................*................... + sqrdmulh v17.4S, v20.4S, v31.S[1] // ..................................................*......................... + mul v31.4S, v20.4S, v31.S[0] // ...................................................*........................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q23, [x3, #-48] // ...............e............................................................ + sub v14.4S, v5.4S, v28.4S // ..................e......................................................... + add v22.4S, v1.4S, v2.4S // ...........................................................*................ + sub v1.4S, v1.4S, v2.4S // ..........................................................*................. + sub v7.4S, v16.4S, v10.4S // .......................e.................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v19.4S, v7.4S, v25.4S // .........................e.................................................. + mul v3.4S, v14.4S, v12.4S // .....................e...................................................... + mls v9.4S, v0.4S, v29.4S // .........................................................*.................. + mls v31.4S, v17.4S, v29.4S // ....................................................*....................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v17.4S, v16.4S, v10.4S // ........................e................................................... + add v10.4S, v5.4S, v28.4S // ...................e........................................................ + mul v12.4S, v7.4S, v30.4S // ..........................e................................................. + sqrdmulh v27.4S, v14.4S, v23.4S // ....................e....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sqrdmulh v8.4S, v1.4S, v11.S[1] // ............................................................*............... + mul v21.4S, v1.4S, v11.S[0] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v6.4S, v31.4S, v9.4S // ...............................................................*............ + sub v30.4S, v10.4S, v17.4S // ............................e............................................... + add v7.4S, v10.4S, v17.4S // .............................e.............................................. + add v31.4S, v31.4S, v9.4S // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v3.4S, v27.4S, v29.4S // ......................e..................................................... + mls v12.4S, v19.4S, v29.4S // ...........................e................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v17.4S, v22.4S, #23 // ....................................................................*....... + sqrdmulh v0.4S, v6.4S, v11.S[1] // .................................................................*.......... + mul v10.4S, v6.4S, v11.S[0] // ..................................................................*......... + mls v21.4S, v8.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v11.4S, v31.4S, #23 // ......................................................................*..... + sqrdmulh v20.4S, v30.4S, v13.4S // ..............................e............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v22.4S, v17.4S, v29.4S // .....................................................................*...... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v17.4S, v3.4S, v12.4S // .................................e.......................................... + mls v31.4S, v11.4S, v29.4S // .......................................................................*.... + mls v10.4S, v0.4S, v29.4S // ...................................................................*........ + str q21, [x0, #32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v0.4S, v17.4S, v13.4S // ...................................e........................................ + mul v6.4S, v17.4S, v4.4S // ....................................e....................................... + mul v17.4S, v30.4S, v4.4S // ...............................e............................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q22, [x0], #(16*4) // ........................................................................*... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q10, [x0, #-16] // ...........................................................................* + add v10.4S, v3.4S, v12.4S // ..................................e......................................... + str q31, [x0, #-48] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #(16*0)] // .e.........................................................................'.~......................................................................... + // ldr q9, [x0, #(16*1)] // ...e.......................................................................'...~....................................................................... + // ldr q10, [x0, #(16*2)] // e..........................................................................'~.......................................................................... + // ldr q11, [x0, #(16*3)] // .......e...................................................................'.......~................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .............e.............................................................'.............~............................................................. + // trn2 v26.4s, v8.4s, v9.4s // ..............e............................................................'..............~............................................................ + // trn1 v27.4s, v10.4s, v11.4s // .....................e.....................................................'.....................~..................................................... + // trn2 v28.4s, v10.4s, v11.4s // ....................e......................................................'....................~...................................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // .............................e.............................................'.............................~............................................. + // trn2 v11.2d, v26.2d, v28.2d // ..........................e................................................'..........................~................................................ + // trn1 v8.2d, v25.2d, v27.2d // ............................e..............................................'............................~.............................................. + // trn1 v9.2d, v26.2d, v28.2d // ...........................e...............................................'...........................~............................................... + // ldr q0, [x3], #(6*16) // .....e.....................................................................'.....~..................................................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ...........e...............................................................'...........~............................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ..............................e............................................'..............................~............................................ + // ldr q5, [x3, #(-6*16 + 3*16)] // ....................................e......................................'....................................~...................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...............................e...........................................'...............................~........................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ...............e...........................................................'...............~........................................................... 
+ // sub v24.4s, v8.4s, v9.4s // .....................................e.....................................'.....................................~..................................... + // add v8.4s, v8.4s, v9.4s // ..............................................e............................'..............................................~............................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ................................................e..........................'................................................~.......................... + // mul v9.4s, v24.4s, v1.4s // ..........................................e................................'..........................................~................................ + // mls v9.4s, v27.4s, v29.4s // .......................................................e...................'.......................................................~................... + // sub v24.4s, v10.4s, v11.4s // ........................................e..................................'........................................~.................................. + // add v10.4s, v10.4s, v11.4s // .............................................e.............................'.............................................~............................. + // sqrdmulh v27.4s, v24.4s, v6.4s // .........................................e.................................'.........................................~................................. + // mul v11.4s, v24.4s, v2.4s // ...............................................e...........................'...............................................~........................... + // mls v11.4s, v27.4s, v29.4s // ........................................................e..................'........................................................~.................. 
+ // sub v24.4s, v8.4s, v10.4s // ....................................................e......................'....................................................~...................... + // add v8.4s, v8.4s, v10.4s // .....................................................e.....................'.....................................................~..................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ..............................................................e............'..............................................................~............ + // mul v10.4s, v24.4s, v0.4s // ......................................................................e....'......................................................................~.... + // mls v10.4s, v27.4s, v29.4s // ...........................................................................*........................................................................... + // sub v24.4s, v9.4s, v11.4s // ................................................................e..........'................................................................~.......... + // add v9.4s, v9.4s, v11.4s // .........................................................................e.'.........................................................................~. + // sqrdmulh v27.4s, v24.4s, v4.4s // ....................................................................e......'....................................................................~...... + // mul v11.4s, v24.4s, v0.4s // .....................................................................e.....'.....................................................................~..... + // mls v11.4s, v27.4s, v29.4s // ..~........................................................................'..*........................................................................ 
+ // trn1 v25.4s, v8.4s, v9.4s // ....~......................................................................'....*...................................................................... + // trn2 v26.4s, v8.4s, v9.4s // ......~....................................................................'......*.................................................................... + // trn1 v27.4s, v10.4s, v11.4s // ..........~................................................................'..........*................................................................ + // trn2 v28.4s, v10.4s, v11.4s // ............~..............................................................'............*.............................................................. + // trn2 v10.2d, v25.2d, v27.2d // ................~..........................................................'................*.......................................................... + // trn2 v11.2d, v26.2d, v28.2d // ...................~.......................................................'...................*....................................................... + // trn1 v8.2d, v25.2d, v27.2d // ..................~........................................................'..................*........................................................ + // trn1 v9.2d, v26.2d, v28.2d // .................~.........................................................'.................*......................................................... + // ldr q1, [x4], #8 // ........~..................................................................'........*.................................................................. + // ldr q0, [x4], #16 // .........~.................................................................'.........*................................................................. 
+ // sub v24.4s, v8.4s, v9.4s // ........................~..................................................'........................*.................................................. + // add v8.4s, v8.4s, v9.4s // .........................~.................................................'.........................*................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..................................~........................................'..................................*........................................ + // mul v9.4s, v24.4s, v0.s[0] // ...................................~.......................................'...................................*....................................... + // mls v9.4s, v27.4s, v29.4s // ............................................~..............................'............................................*.............................. + // sub v24.4s, v10.4s, v11.4s // ......................~....................................................'......................*.................................................... + // add v10.4s, v10.4s, v11.4s // .......................~...................................................'.......................*................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................................~..........................................'................................*.......................................... + // mul v11.4s, v24.4s, v0.s[2] // .................................~.........................................'.................................*......................................... + // mls v11.4s, v27.4s, v29.4s // ...........................................~...............................'...........................................*............................... 
+ // sub v24.4s, v8.4s, v10.4s // .......................................~...................................'.......................................*................................... + // add v8.4s, v8.4s, v10.4s // ......................................~....................................'......................................*.................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .................................................~.........................'.................................................*......................... + // mul v10.4s, v24.4s, v1.s[0] // ..................................................~........................'..................................................*........................ + // mls v10.4s, v27.4s, v29.4s // ............................................................~..............'............................................................*.............. + // sub v24.4s, v9.4s, v11.4s // ...................................................~.......................'...................................................*....................... + // add v9.4s, v9.4s, v11.4s // ......................................................~....................'......................................................*.................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..........................................................~................'..........................................................*................ + // mul v11.4s, v24.4s, v1.s[0] // ...........................................................~...............'...........................................................*............... + // mls v11.4s, v27.4s, v29.4s // ..................................................................~........'..................................................................*........ 
+ // srshr v24.4S, v8.4S, #23 // .........................................................~.................'.........................................................*................. + // mls v8.4s, v24.4s, v29.4s // ...............................................................~...........'...............................................................*........... + // srshr v24.4S, v9.4S, #23 // .............................................................~.............'.............................................................*............. + // mls v9.4s, v24.4s, v29.4s // .................................................................~.........'.................................................................*......... + // str q8, [x0], #(16*4) // .......................................................................~...'.......................................................................*... + // str q9, [x0, #(-16*4 + 1*16)] // ..........................................................................~'..........................................................................* + // str q10, [x0, #(-16*4 + 2*16)] // ...................................................................~.......'...................................................................*....... + // str q11, [x0, #(-16*4 + 3*16)] // ........................................................................~..'........................................................................*.. + + sub count, count, #1 + cbnz count, layer5678_start + // Instructions: 40 + // Expected cycles: 24 + // Expected IPC: 1.67 + // + // Wall time: 0.75s + // User time: 0.75s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + mls v17.4S, v20.4S, v29.4S // *....................................... + mls v6.4S, v0.4S, v29.4S // .*...................................... + trn1 v21.4S, v7.4S, v10.4S // ..*..................................... 
+ trn2 v1.4S, v7.4S, v10.4S // ...*.................................... + ldr q22, [x4], #8 // ....*................................... + ldr q18, [x4], #16 // .....*.................................. // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - ldr q11, [x0, #0] // ..*..................................... - ldr q9, [x0, #48] // ...*.................................... // gap // ........................................ // gap // ........................................ // gap // ........................................ - ldr q17, [x0, #16] // .*...................................... - ldr q2, [x0, #32] // ....*................................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - ldr q1, [x3, #32] // *....................................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + trn1 v8.4S, v17.4S, v6.4S // ......*................................. + trn2 v9.4S, v17.4S, v6.4S // .......*................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -370,13 +869,13 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ 
+ trn2 v2.2D, v21.2D, v8.2D // ........*............................... + trn1 v21.2D, v21.2D, v8.2D // ..........*............................. + trn2 v8.2D, v1.2D, v9.2D // ...........*............................ + trn1 v1.2D, v1.2D, v9.2D // .........*.............................. // gap // ........................................ // gap // ........................................ // gap // ........................................ - trn1 v28.4S, v11.4S, v17.4S // .......*................................ - trn1 v12.4S, v2.4S, v9.4S // .........*.............................. - trn2 v8.4S, v11.4S, v17.4S // ........*............................... - trn2 v23.4S, v2.4S, v9.4S // ..........*............................. // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -386,16 +885,13 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ + sub v9.4S, v2.4S, v8.4S // ............*........................... + add v8.4S, v2.4S, v8.4S // .............*.......................... + sub v2.4S, v21.4S, v1.4S // ..............*......................... + add v21.4S, v21.4S, v1.4S // ...............*........................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - trn2 v17.2D, v8.2D, v23.2D // ..............*......................... - trn2 v11.2D, v28.2D, v12.2D // ...............*........................ - trn1 v15.2D, v28.2D, v12.2D // ...........*............................ - ldr q0, [x3, #80] // .....*.................................. - trn1 v20.2D, v8.2D, v23.2D // ............*........................... - ldr q16, [x3, #64] // ......*................................. 
- ldr q25, [x3, #48] // .............*.......................... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -405,15 +901,18 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ - sub v10.4S, v11.4S, v17.4S // ...................*.................... - sub v18.4S, v15.4S, v20.4S // ................*....................... + sqrdmulh v1.4S, v9.4S, v18.S[3] // ................*....................... + mul v9.4S, v9.4S, v18.S[2] // .................*...................... + sqrdmulh v3.4S, v2.4S, v18.S[1] // ..................*..................... + mul v18.4S, v2.4S, v18.S[0] // ...................*.................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + sub v2.4S, v21.4S, v8.4S // .....................*.................. + add v21.4S, v21.4S, v8.4S // ....................*................... // gap // ........................................ // gap // ........................................ - add v28.4S, v11.4S, v17.4S // .........................*.............. // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -421,23 +920,22 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mul v24.4S, v18.4S, v1.4S // .....................*.................. - sqrdmulh v0.4S, v10.4S, v0.4S // .......................*................ 
- mul v30.4S, v10.4S, v16.4S // ......................*................. - sqrdmulh v17.4S, v18.4S, v25.4S // ....................*................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - ldr q31, [x3], #(6*16) // ..................*..................... // gap // ........................................ + mls v9.4S, v1.4S, v29.4S // ......................*................. + mls v18.4S, v3.4S, v29.4S // .......................*................ + sqrdmulh v1.4S, v2.4S, v22.S[1] // ........................*............... + mul v8.4S, v2.4S, v22.S[0] // .........................*.............. // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + srshr v2.4S, v21.4S, #23 // ............................*........... // gap // ........................................ // gap // ........................................ - add v7.4S, v15.4S, v20.4S // .................*...................... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -445,55 +943,54 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v30.4S, v0.4S, v29.4S // ...........................*............ - mls v24.4S, v17.4S, v29.4S // ........................*............... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ 
// gap // ........................................ // gap // ........................................ - ldr q15, [x3, #-80] // ..........................*............. + sub v3.4S, v18.4S, v9.4S // ..........................*............. + add v18.4S, v18.4S, v9.4S // ...........................*............ + mls v8.4S, v1.4S, v29.4S // ...............................*........ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + mls v21.4S, v2.4S, v29.4S // .................................*...... // gap // ........................................ // gap // ........................................ - sub v6.4S, v7.4S, v28.4S // ............................*........... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + sqrdmulh v1.4S, v3.4S, v22.S[1] // .............................*.......... + mul v22.4S, v3.4S, v22.S[0] // ..............................*......... + srshr v9.4S, v18.4S, #23 // ................................*....... // gap // ........................................ // gap // ........................................ - sub v27.4S, v24.4S, v30.4S // ...............................*........ - add v4.4S, v24.4S, v30.4S // ................................*....... - add v24.4S, v7.4S, v28.4S // ..............................*......... // gap // ........................................ // gap // ........................................ // gap // ........................................ + str q8, [x0, #32] // ....................................*... // gap // ........................................ // gap // ........................................ 
- mul v10.4S, v6.4S, v31.4S // .................................*...... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + str q21, [x0], #(16*4) // .....................................*.. // gap // ........................................ // gap // ........................................ - mul v0.4S, v27.4S, v31.4S // ..................................*..... - sqrdmulh v26.4S, v6.4S, v15.4S // .............................*.......... - sqrdmulh v16.4S, v27.4S, v15.4S // ...................................*.... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - trn2 v5.4S, v24.4S, v4.4S // .....................................*.. + mls v18.4S, v9.4S, v29.4S // ..................................*..... + mls v22.4S, v1.4S, v29.4S // ...................................*.... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -501,7 +998,6 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ // gap // ........................................ - trn1 v4.4S, v24.4S, v4.4S // ....................................*... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -509,8 +1005,16 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ 
// gap // ........................................ - mls v10.4S, v26.4S, v29.4S // ......................................*. - mls v0.4S, v16.4S, v29.4S // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q18, [x0, #-48] // .......................................* + str q22, [x0, #-16] // ......................................*. // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -518,529 +1022,49 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: // gap // ........................................ // gap // ........................................ - // original source code - // ldr q21, [x3, #32] // ....*................................... - // ldr q20, [x0, #16] // ..*..................................... - // ldr q19, [x0, #0] // *....................................... - // ldr q7, [x0, #48] // .*...................................... - // ldr q1, [x0, #32] // ...*.................................... - // ldr q0, [x3, #80] // ............*........................... - // ldr q16, [x3, #64] // ..............*......................... - // trn1 v30.4S, v19.4S, v20.4S // .....*.................................. - // trn2 v25.4S, v19.4S, v20.4S // .......*................................ - // trn1 v3.4S, v1.4S, v7.4S // ......*................................. - // trn2 v2.4S, v1.4S, v7.4S // ........*............................... - // trn1 v26.2D, v30.2D, v3.2D // ...........*............................ 
- // trn1 v23.2D, v25.2D, v2.2D // .............*.......................... - // ldr q6, [x3, #48] // ...............*........................ - // trn2 v1.2D, v25.2D, v2.2D // .........*.............................. - // trn2 v24.2D, v30.2D, v3.2D // ..........*............................. - // sub v18.4S, v26.4S, v23.4S // .................*...................... - // add v25.4S, v26.4S, v23.4S // ........................*............... - // ldr q20, [x3], #(6*16) // .......................*................ - // sub v4.4S, v24.4S, v1.4S // ................*....................... - // sqrdmulh v27.4S, v18.4S, v6.4S // ......................*................. - // mul v3.4S, v18.4S, v21.4S // ...................*.................... - // mul v14.4S, v4.4S, v16.4S // .....................*.................. - // sqrdmulh v16.4S, v4.4S, v0.4S // ....................*................... - // mls v3.4S, v27.4S, v29.4S // ..........................*............. - // add v27.4S, v24.4S, v1.4S // ..................*..................... - // ldr q1, [x3, #-80] // ...........................*............ - // mls v14.4S, v16.4S, v29.4S // .........................*.............. - // sub v9.4S, v25.4S, v27.4S // ............................*........... - // sqrdmulh v16.4S, v9.4S, v1.4S // ..................................*..... - // add v30.4S, v25.4S, v27.4S // ...............................*........ - // sub v22.4S, v3.4S, v14.4S // .............................*.......... - // add v13.4S, v3.4S, v14.4S // ..............................*......... - // mul v10.4S, v9.4S, v20.4S // ................................*....... - // mul v0.4S, v22.4S, v20.4S // .................................*...... - // sqrdmulh v12.4S, v22.4S, v1.4S // ...................................*.... - // trn1 v4.4S, v30.4S, v13.4S // .....................................*.. - // trn2 v5.4S, v30.4S, v13.4S // ....................................*... 
- // mls v10.4S, v16.4S, v29.4S // ......................................*. - // mls v0.4S, v12.4S, v29.4S // .......................................* - - sub count, count, #1 -layer5678_start: - ldr q21, [x3, #32] // ..............e............................................................. - ldr q20, [x0, #80] // .e.......................................................................... - ldr q19, [x0, #64] // e........................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr q7, [x0, #112] // ...e........................................................................ - ldr q1, [x0, #96] // ..e......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - trn2 v28.4S, v10.4S, v0.4S // .........................................*.................................. - trn1 v23.4S, v10.4S, v0.4S // ........................................*................................... - ldr q0, [x3, #80] // .................e.......................................................... 
- ldr q16, [x3, #64] // ................e........................................................... - ldr q22, [x4], #8 // ..............................................*............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - trn1 v30.4S, v19.4S, v20.4S // ....e....................................................................... - trn2 v25.4S, v19.4S, v20.4S // .....e...................................................................... - ldr q20, [x4], #16 // ...............................................*............................ - trn2 v10.2D, v5.2D, v28.2D // ...........................................*................................ - trn2 v8.2D, v4.2D, v23.2D // ..........................................*................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- trn1 v3.4S, v1.4S, v7.4S // ......e..................................................................... - trn2 v2.4S, v1.4S, v7.4S // .......e.................................................................... - trn1 v7.2D, v4.2D, v23.2D // ............................................*............................... - trn1 v6.2D, v5.2D, v28.2D // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v17.4S, v8.4S, v10.4S // ......................................................*..................... - sub v18.4S, v8.4S, v10.4S // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v28.4S, v7.4S, v6.4S // ................................................*........................... - add v5.4S, v7.4S, v6.4S // .................................................*.......................... - trn1 v26.2D, v30.2D, v3.2D // ..........e................................................................. - trn1 v23.2D, v25.2D, v2.2D // ...........e................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr q6, [x3, #48] // ...............e............................................................ - trn2 v1.2D, v25.2D, v2.2D // .........e.................................................................. - mul v31.4S, v18.4S, v20.S[2] // .......................................................*.................... - sqrdmulh v7.4S, v18.4S, v20.S[3] // ........................................................*................... - trn2 v24.2D, v30.2D, v3.2D // ........e................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v18.4S, v26.4S, v23.4S // ..................e......................................................... - add v25.4S, v26.4S, v23.4S // ...................e........................................................ - mul v2.4S, v28.4S, v20.S[0] // ..................................................*......................... - sqrdmulh v23.4S, v28.4S, v20.S[1] // ...................................................*........................ - ldr q20, [x3], #(6*16) // ............e............................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- sub v4.4S, v24.4S, v1.4S // .......................e.................................................... - sub v8.4S, v5.4S, v17.4S // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v10.4S, v5.4S, v17.4S // ...........................................................*................ - mls v31.4S, v7.4S, v29.4S // .........................................................*.................. - sqrdmulh v27.4S, v18.4S, v6.4S // .....................e...................................................... - mul v3.4S, v18.4S, v21.4S // ....................e....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v14.4S, v4.4S, v16.4S // .........................e.................................................. - sqrdmulh v16.4S, v4.4S, v0.4S // ..........................e................................................. - mls v2.4S, v23.4S, v29.4S // ....................................................*....................... - mul v17.4S, v8.4S, v22.S[0] // ............................................................*............... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v12.4S, v8.4S, v22.S[1] // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - srshr v30.4S, v10.4S, #23 // ....................................................................*....... - mls v3.4S, v27.4S, v29.4S // ......................e..................................................... - add v27.4S, v24.4S, v1.4S // ........................e................................................... - ldr q1, [x3, #-80] // .............e.............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v4.4S, v2.4S, v31.4S // ...............................................................*............ 
- add v6.4S, v2.4S, v31.4S // ................................................................*........... - mls v14.4S, v16.4S, v29.4S // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v17.4S, v12.4S, v29.4S // ..............................................................*............. - sub v9.4S, v25.4S, v27.4S // ............................e............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v10.4S, v30.4S, v29.4S // .....................................................................*...... - mul v19.4S, v4.4S, v22.S[0] // .................................................................*.......... - sqrdmulh v4.4S, v4.4S, v22.S[1] // ..................................................................*......... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - srshr v7.4S, v6.4S, #23 // ......................................................................*..... - sqrdmulh v16.4S, v9.4S, v1.4S // ...............................e............................................ - add v30.4S, v25.4S, v27.4S // .............................e.............................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.4S, v3.4S, v14.4S // .................................e.......................................... - add v13.4S, v3.4S, v14.4S // ..................................e......................................... - str q10, [x0], #(16*4) // ........................................................................*... - mul v10.4S, v9.4S, v20.4S // ..............................e............................................. - str q17, [x0, #-32] // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v0.4S, v22.4S, v20.4S // ...................................e........................................ 
- mls v6.4S, v7.4S, v29.4S // .......................................................................*.... - sqrdmulh v12.4S, v22.4S, v1.4S // ....................................e....................................... - mls v19.4S, v4.4S, v29.4S // ...................................................................*........ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - trn1 v4.4S, v30.4S, v13.4S // ......................................e..................................... - trn2 v5.4S, v30.4S, v13.4S // .......................................e.................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v10.4S, v16.4S, v29.4S // ................................e........................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - mls v0.4S, v12.4S, v29.4S // .....................................e...................................... - str q6, [x0, #-48] // .........................................................................*.. - str q19, [x0, #-16] // ...........................................................................* - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - - // original source code - // ldr q8, [x0, #(16*0)] // ..e.........................................................................|.e......................................................................... - // ldr q9, [x0, #(16*1)] // .e..........................................................................|e.......................................................................... - // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... - // ldr q11, [x0, #(16*3)] // ...e........................................................................|..e........................................................................ - // trn1 v25.4s, v8.4s, v9.4s // ..........e.................................................................|.........e................................................................. 
- // trn2 v26.4s, v8.4s, v9.4s // ...........e................................................................|..........e................................................................ - // trn1 v27.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ - // trn2 v28.4s, v10.4s, v11.4s // ................e...........................................................|...............e........................................................... - // trn2 v10.2d, v25.2d, v27.2d // .............................e..............................................|............................e.............................................. - // trn2 v11.2d, v26.2d, v28.2d // ..........................e.................................................|.........................e................................................. - // trn1 v8.2d, v25.2d, v27.2d // .......................e....................................................|......................e.................................................... - // trn1 v9.2d, v26.2d, v28.2d // ........................e...................................................|.......................e................................................... - // ldr q0, [x3], #(6*16) // ..................................e.........................................|.................................e......................................... - // ldr q4, [x3, #(-6*16 + 1*16)] // .................................................e..........................|................................................e.......................... - // ldr q1, [x3, #(-6*16 + 2*16)] // e...........................................................................e........................................................................... 
- // ldr q5, [x3, #(-6*16 + 3*16)] // .........................e..................................................|........................e.................................................. - // ldr q2, [x3, #(-6*16 + 4*16)] // ........e...................................................................|.......e................................................................... - // ldr q6, [x3, #(-6*16 + 5*16)] // .......e....................................................................|......e.................................................................... - // sub v24.4s, v8.4s, v9.4s // ..............................e.............................................|.............................e............................................. - // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ - // mul v9.4s, v24.4s, v1.4s // ........................................e...................................|.......................................e................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e....................................|......................................e.................................... - // mls v9.4s, v24.4s, v29.4s // ...............................................e............................|..............................................e............................ - // sub v24.4s, v10.4s, v11.4s // ...................................e........................................|..................................e........................................ - // add v10.4s, v10.4s, v11.4s // ................................................e...........................|...............................................e........................... 
- // mul v11.4s, v24.4s, v2.4s // .........................................e..................................|........................................e.................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ..........................................e.................................|.........................................e................................. - // mls v11.4s, v24.4s, v29.4s // ....................................................e.......................|...................................................e....................... - // sub v24.4s, v8.4s, v10.4s // ......................................................e.....................|.....................................................e..................... - // add v8.4s, v8.4s, v10.4s // ............................................................e...............|...........................................................e............... - // mul v10.4s, v24.4s, v0.4s // ................................................................e...........|...............................................................e........... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................................................e................|..........................................................e................ - // mls v10.4s, v24.4s, v29.4s // ........................................................................e...|.......................................................................e... - // sub v24.4s, v9.4s, v11.4s // .............................................................e..............|............................................................e.............. - // add v9.4s, v9.4s, v11.4s // ..............................................................e.............|.............................................................e............. 
- // mul v11.4s, v24.4s, v0.4s // ..................................................................e.........|.................................................................e......... - // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e.......|...................................................................e....... - // mls v11.4s, v24.4s, v29.4s // .........................................................................e..|........................................................................e.. - // trn1 v25.4s, v8.4s, v9.4s // ......................................................................e.....|.....................................................................e..... - // trn2 v26.4s, v8.4s, v9.4s // .......................................................................e....|......................................................................e.... - // trn1 v27.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... - // trn2 v28.4s, v10.4s, v11.4s // .....*......................................................................|....*...................................................................... - // trn2 v10.2d, v25.2d, v27.2d // ..............*.............................................................|.............*............................................................. - // trn2 v11.2d, v26.2d, v28.2d // .............*..............................................................|............*.............................................................. - // trn1 v8.2d, v25.2d, v27.2d // .................*..........................................................|................*.......................................................... 
- // trn1 v9.2d, v26.2d, v28.2d // ..................*.........................................................|.................*......................................................... - // ldr q1, [x4], #8 // .........*..................................................................|........*.................................................................. - // ldr q0, [x4], #16 // ............*...............................................................|...........*............................................................... - // sub v24.4s, v8.4s, v9.4s // .....................*......................................................|....................*...................................................... - // add v8.4s, v8.4s, v9.4s // ......................*.....................................................|.....................*..................................................... - // mul v9.4s, v24.4s, v0.s[0] // ................................*...........................................|...............................*........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..........................................|................................*.......................................... - // mls v9.4s, v24.4s, v29.4s // ...........................................*................................|..........................................*................................ - // sub v24.4s, v10.4s, v11.4s // ....................*.......................................................|...................*....................................................... - // add v10.4s, v10.4s, v11.4s // ...................*........................................................|..................*........................................................ 
- // mul v11.4s, v24.4s, v0.s[2] // ...........................*................................................|..........................*................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................*...............................................|...........................*............................................... - // mls v11.4s, v24.4s, v29.4s // ......................................*.....................................|.....................................*..................................... - // sub v24.4s, v8.4s, v10.4s // ....................................*.......................................|...................................*....................................... - // add v8.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... - // mul v10.4s, v24.4s, v1.s[0] // ............................................*...............................|...........................................*............................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................*..............................|............................................*.............................. - // mls v10.4s, v24.4s, v29.4s // .....................................................*......................|....................................................*...................... - // sub v24.4s, v9.4s, v11.4s // ..................................................*.........................|.................................................*......................... - // add v9.4s, v9.4s, v11.4s // ...................................................*........................|..................................................*........................ 
- // mul v11.4s, v24.4s, v1.s[0] // ........................................................*...................|.......................................................*................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................*..................|........................................................*.................. - // mls v11.4s, v24.4s, v29.4s // .....................................................................*......|....................................................................*...... - // srshr v24.4S, v8.4S, #23 // ..............................................*.............................|.............................................*............................. - // mls v8.4s, v24.4s, v29.4s // .......................................................*....................|......................................................*.................... - // srshr v24.4S, v9.4S, #23 // ..........................................................*.................|.........................................................*................. - // mls v9.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ - // str q8, [x0], #(16*4) // ...............................................................*............|..............................................................*............ - // str q9, [x0, #(-16*4 + 1*16)] // ..........................................................................*.|.........................................................................*. - // str q10, [x0, #(-16*4 + 2*16)] // .................................................................*..........|................................................................*.......... 
- // str q11, [x0, #(-16*4 + 3*16)] // ...........................................................................*|..........................................................................* - - sub count, count, #1 - cbnz count, layer5678_start - trn1 v30.4S, v10.4S, v0.4S // .*.................................. - trn2 v10.4S, v10.4S, v0.4S // *................................... - ldr q14, [x4], #8 // ..*................................. - ldr q19, [x4], #16 // ...*................................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - trn2 v1.2D, v4.2D, v30.2D // .....*.............................. - trn1 v30.2D, v4.2D, v30.2D // ......*............................. - trn2 v24.2D, v5.2D, v10.2D // ....*............................... - trn1 v4.2D, v5.2D, v10.2D // .......*............................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- sub v10.4S, v1.4S, v24.4S // .........*.......................... - sub v3.4S, v30.4S, v4.4S // ..........*......................... - add v30.4S, v30.4S, v4.4S // ...........*........................ - add v17.4S, v1.4S, v24.4S // ........*........................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v27.4S, v10.4S, v19.S[2] // ............*....................... - sqrdmulh v7.4S, v10.4S, v19.S[3] // .............*...................... - mul v0.4S, v3.4S, v19.S[0] // ..............*..................... - sqrdmulh v24.4S, v3.4S, v19.S[1] // ...............*.................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v4.4S, v30.4S, v17.4S // ................*................... - add v16.4S, v30.4S, v17.4S // .................*.................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v27.4S, v7.4S, v29.4S // ..................*................. - mls v0.4S, v24.4S, v29.4S // ...................*................ - srshr v26.4S, v16.4S, #23 // ......................*............. - sqrdmulh v30.4S, v4.4S, v14.S[1] // .....................*.............. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v6.4S, v4.4S, v14.S[0] // ....................*............... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v4.4S, v0.4S, v27.4S // .......................*............ - mls v16.4S, v26.4S, v29.4S // ..........................*......... - add v24.4S, v0.4S, v27.4S // ........................*........... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v6.4S, v30.4S, v29.4S // .........................*.......... 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v10.4S, v4.4S, v14.S[0] // ...........................*........ - sqrdmulh v30.4S, v4.4S, v14.S[1] // ............................*....... - srshr v28.4S, v24.4S, #23 // .............................*...... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q16, [x0], #(16*4) // ..............................*..... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q6, [x0, #-32] // ...............................*.... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v24.4S, v28.4S, v29.4S // ................................*... - mls v10.4S, v30.4S, v29.4S // .................................*.. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q24, [x0, #-48] // ..................................*. - str q10, [x0, #-16] // ...................................* - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - - // original source code - // trn2 v28.4S, v10.4S, v0.4S // .*.................................. - // trn1 v23.4S, v10.4S, v0.4S // *................................... - // ldr q22, [x4], #8 // ..*................................. - // ldr q20, [x4], #16 // ...*................................ - // trn2 v10.2D, v5.2D, v28.2D // ......*............................. - // trn2 v8.2D, v4.2D, v23.2D // ....*............................... - // trn1 v7.2D, v4.2D, v23.2D // .....*.............................. - // trn1 v6.2D, v5.2D, v28.2D // .......*............................ - // add v17.4S, v8.4S, v10.4S // ...........*........................ - // sub v18.4S, v8.4S, v10.4S // ........*........................... 
- // sub v28.4S, v7.4S, v6.4S // .........*.......................... - // add v5.4S, v7.4S, v6.4S // ..........*......................... - // mul v31.4S, v18.4S, v20.S[2] // ............*....................... - // sqrdmulh v7.4S, v18.4S, v20.S[3] // .............*...................... - // mul v2.4S, v28.4S, v20.S[0] // ..............*..................... - // sqrdmulh v23.4S, v28.4S, v20.S[1] // ...............*.................... - // sub v8.4S, v5.4S, v17.4S // ................*................... - // add v10.4S, v5.4S, v17.4S // .................*.................. - // mls v31.4S, v7.4S, v29.4S // ..................*................. - // mls v2.4S, v23.4S, v29.4S // ...................*................ - // mul v17.4S, v8.4S, v22.S[0] // ......................*............. - // sqrdmulh v12.4S, v8.4S, v22.S[1] // .....................*.............. - // srshr v30.4S, v10.4S, #23 // ....................*............... - // sub v4.4S, v2.4S, v31.4S // .......................*............ - // add v6.4S, v2.4S, v31.4S // .........................*.......... - // mls v17.4S, v12.4S, v29.4S // ..........................*......... - // mls v10.4S, v30.4S, v29.4S // ........................*........... - // mul v19.4S, v4.4S, v22.S[0] // ...........................*........ - // sqrdmulh v4.4S, v4.4S, v22.S[1] // ............................*....... - // srshr v7.4S, v6.4S, #23 // .............................*...... - // str q10, [x0], #(16*4) // ..............................*..... - // str q17, [x0, #-32] // ...............................*.... - // mls v6.4S, v7.4S, v29.4S // ................................*... - // mls v19.4S, v4.4S, v29.4S // .................................*.. - // str q6, [x0, #-48] // ..................................*. 
- // str q19, [x0, #-16] // ...................................* + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // mls v17.4S, v20.4S, v29.4S // *....................................... + // mls v6.4S, v0.4S, v29.4S // .*...................................... + // trn1 v23.4S, v7.4S, v10.4S // ..*..................................... + // trn2 v10.4S, v7.4S, v10.4S // ...*.................................... + // ldr q11, [x4], #8 // ....*................................... + // ldr q31, [x4], #16 // .....*.................................. + // trn1 v20.4S, v17.4S, v6.4S // ......*................................. + // trn2 v9.4S, v17.4S, v6.4S // .......*................................ + // trn2 v17.2D, v23.2D, v20.2D // ........*............................... + // trn1 v3.2D, v10.2D, v9.2D // ...........*............................ + // trn1 v5.2D, v23.2D, v20.2D // .........*.............................. + // trn2 v0.2D, v10.2D, v9.2D // ..........*............................. + // sub v6.4S, v17.4S, v0.4S // ............*........................... + // add v2.4S, v17.4S, v0.4S // .............*.......................... + // sub v20.4S, v5.4S, v3.4S // ..............*......................... + // add v1.4S, v5.4S, v3.4S // ...............*........................ + // sqrdmulh v0.4S, v6.4S, v31.S[3] // ................*....................... + // mul v9.4S, v6.4S, v31.S[2] // .................*...................... + // sqrdmulh v17.4S, v20.4S, v31.S[1] // ..................*..................... + // mul v31.4S, v20.4S, v31.S[0] // ...................*.................... + // add v22.4S, v1.4S, v2.4S // .....................*.................. + // sub v1.4S, v1.4S, v2.4S // ....................*................... + // mls v9.4S, v0.4S, v29.4S // ......................*................. + // mls v31.4S, v17.4S, v29.4S // .......................*................ 
+ // sqrdmulh v8.4S, v1.4S, v11.S[1] // ........................*............... + // mul v21.4S, v1.4S, v11.S[0] // .........................*.............. + // sub v6.4S, v31.4S, v9.4S // ...........................*............ + // add v31.4S, v31.4S, v9.4S // ............................*........... + // srshr v17.4S, v22.4S, #23 // ..........................*............. + // sqrdmulh v0.4S, v6.4S, v11.S[1] // ...............................*........ + // mul v10.4S, v6.4S, v11.S[0] // ................................*....... + // mls v21.4S, v8.4S, v29.4S // .............................*.......... + // srshr v11.4S, v31.4S, #23 // .................................*...... + // mls v22.4S, v17.4S, v29.4S // ..............................*......... + // mls v31.4S, v11.4S, v29.4S // ....................................*... + // mls v10.4S, v0.4S, v29.4S // .....................................*.. + // str q21, [x0, #32] // ..................................*..... + // str q22, [x0], #(16*4) // ...................................*.... + // str q10, [x0, #-16] // .......................................* + // str q31, [x0, #-48] // ......................................*. .unreq root0_tw @@ -1082,853 +1106,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q17, [x1, #0] // *....................................................................................................................................................................................................................................................................................... - ldr q12, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ 
- ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. - ldr q16, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... - ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... - ldr q9, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. - ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - ldr q11, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
- ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. - ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - ldr q28, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... - sub v8.4S, v27.4S, v12.4S // ...............................*........................................................................................................................................................................................................................................................ - ldr q19, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - add v23.4S, v27.4S, v12.4S // ................................*....................................................................................................................................................................................................................................................... 
- sub v27.4S, v17.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... - sub v21.4S, v16.4S, v9.4S // ....................................*................................................................................................................................................................................................................................................... - add v16.4S, v16.4S, v9.4S // .....................................*.................................................................................................................................................................................................................................................. - sub v12.4S, v11.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. - add v20.4S, v11.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ - mul v11.4S, v8.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
- sqrdmulh v10.4S, v8.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - mul v9.4S, v27.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v27.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... - sub v27.4S, v28.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. - sqrdmulh v14.4S, v21.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ - add v8.4S, v28.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. 
- mul v21.4S, v21.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. - sub v28.4S, v22.4S, v19.4S // .........................................*.............................................................................................................................................................................................................................................. - add v19.4S, v22.4S, v19.4S // ..........................................*............................................................................................................................................................................................................................................. - sqrdmulh v15.4S, v27.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... - mul v22.4S, v27.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ - ldr q13, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
- mls v21.4S, v14.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + ldr q21, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q18, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q14, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. 
+ ldr q27, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q24, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + ldr q15, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q13, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q19, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q16, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ ldr q9, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + sub v20.4S, v22.4S, v21.4S // ....................................*................................................................................................................................................................................................................................................... + add v23.4S, v22.4S, v21.4S // .....................................*.................................................................................................................................................................................................................................................. + ldr q10, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q28, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + sub v11.4S, v18.4S, v8.4S // .....................*.................................................................................................................................................................................................................................................................. 
+ add v21.4S, v18.4S, v8.4S // ......................*................................................................................................................................................................................................................................................................. + add v8.4S, v14.4S, v27.4S // ................................*....................................................................................................................................................................................................................................................... + sub v14.4S, v14.4S, v27.4S // ...............................*........................................................................................................................................................................................................................................................ + add v18.4S, v15.4S, v24.4S // ..........................................*............................................................................................................................................................................................................................................. + sub v22.4S, v15.4S, v24.4S // .........................................*.............................................................................................................................................................................................................................................. + mul v17.4S, v20.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ 
+ sqrdmulh v12.4S, v20.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. + add v27.4S, v19.4S, v13.4S // .................*...................................................................................................................................................................................................................................................................... + sub v19.4S, v19.4S, v13.4S // ................*....................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v11.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + mul v11.4S, v11.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... + mul v20.4S, v22.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... 
+ sqrdmulh v15.4S, v22.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v22.4S, v19.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + mul v19.4S, v19.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... + mls v11.4S, v24.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v14.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... + mul v24.4S, v14.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... 
+ sub v14.4S, v16.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. + mls v17.4S, v12.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v12.4S, v28.4S, v9.4S // ...................................................*.................................................................................................................................................................................................................................... + add v28.4S, v28.4S, v9.4S // ....................................................*................................................................................................................................................................................................................................... + mls v19.4S, v22.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v14.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... 
+ mul v22.4S, v14.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... ldr q14, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... - mls v11.4S, v10.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... - ldr q10, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... - add v27.4S, v20.4S, v23.4S // ...................................................................*.................................................................................................................................................................................................................... - mls v9.4S, v24.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
- sub v23.4S, v20.4S, v23.4S // ..................................................................*..................................................................................................................................................................................................................... - ldr q20, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ - add v24.4S, v17.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... - mul v18.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ - sqrdmulh v17.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... - add v28.4S, v16.4S, v19.4S // .............................................................................*.......................................................................................................................................................................................................... 
- mls v22.4S, v15.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - mul v15.4S, v12.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... - sub v19.4S, v16.4S, v19.4S // ............................................................................*........................................................................................................................................................................................................... - sqrdmulh v12.4S, v12.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - sub v16.4S, v14.4S, v13.4S // ..............................................*......................................................................................................................................................................................................................................... - add v13.4S, v14.4S, v13.4S // ...............................................*........................................................................................................................................................................................................................................ 
- mul v14.4S, v23.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - mls v18.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - sub v17.4S, v10.4S, v20.4S // ...................................................*.................................................................................................................................................................................................................................... - add v20.4S, v10.4S, v20.4S // ....................................................*................................................................................................................................................................................................................................... - sub v10.4S, v24.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... 
- mls v14.4S, v23.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - add v23.4S, v13.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ - sub v20.4S, v13.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. - mls v15.4S, v12.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... - add v12.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... - sub v13.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... 
- add v24.4S, v24.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. - mul v22.4S, v17.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - sqrdmulh v8.4S, v17.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. - add v17.4S, v15.4S, v11.4S // ........................................................................*............................................................................................................................................................................................................... - sub v11.4S, v15.4S, v11.4S // .......................................................................*................................................................................................................................................................................................................ - sqrdmulh v15.4S, v19.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ 
- mul v19.4S, v19.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - sub v9.4S, v24.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... - add v24.4S, v24.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... - mul v27.4S, v20.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... - mls v22.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ - sqrdmulh v8.4S, v20.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
- sub v20.4S, v21.4S, v18.4S // .................................................................................*...................................................................................................................................................................................................... - add v21.4S, v21.4S, v18.4S // ..................................................................................*..................................................................................................................................................................................................... - mls v19.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... - mul v18.4S, v16.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... - sqrdmulh v15.4S, v16.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... - add v16.4S, v28.4S, v23.4S // .....................................................................................................................*.................................................................................................................................................................. 
- sub v28.4S, v28.4S, v23.4S // ....................................................................................................................*................................................................................................................................................................... - mul v23.4S, v10.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - mls v27.4S, v8.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. - sqrdmulh v8.4S, v10.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - mls v18.4S, v15.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - sub v15.4S, v19.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... 
- mls v23.4S, v8.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - add v27.4S, v19.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ - add v8.4S, v24.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. - sub v16.4S, v24.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... - sqrdmulh v24.4S, v13.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... - mul v13.4S, v13.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
- mul v19.4S, v11.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - add v10.4S, v23.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ - sub v14.4S, v23.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. - sqrdmulh v23.4S, v11.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. - sub v11.4S, v18.4S, v22.4S // ...........................................................................................*............................................................................................................................................................................................ - add v18.4S, v18.4S, v22.4S // ............................................................................................*........................................................................................................................................................................................... 
- mls v13.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... - mul v24.4S, v14.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - mls v19.4S, v23.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ - sqrdmulh v22.4S, v16.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
- mls v24.4S, v14.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - mul v23.4S, v11.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - sqrdmulh v14.4S, v11.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... - add v11.4S, v13.4S, v19.4S // ................................................................................................................*....................................................................................................................................................................... - sub v13.4S, v13.4S, v19.4S // ...............................................................................................................*........................................................................................................................................................................ - sub v19.4S, v12.4S, v17.4S // .....................................................................................................*.................................................................................................................................................................................. 
- mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - add v22.4S, v12.4S, v17.4S // ......................................................................................................*................................................................................................................................................................................. - sqrdmulh v17.4S, v9.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... - sub v12.4S, v21.4S, v18.4S // .........................................................................................................................*.............................................................................................................................................................. - add v18.4S, v21.4S, v18.4S // ..........................................................................................................................*............................................................................................................................................................. - mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... 
- sqrdmulh v21.4S, v20.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - mul v20.4S, v20.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... - mls v23.4S, v14.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ - mul v14.4S, v15.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - mls v9.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
- sqrdmulh v17.4S, v12.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... - mls v20.4S, v21.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - mul v21.4S, v12.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - mul v12.4S, v28.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. - sqrdmulh v28.4S, v28.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ - mls v14.4S, v15.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
- mul v15.4S, v19.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ - sqrdmulh v19.4S, v19.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - mls v21.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - mls v12.4S, v28.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... - cmge v17.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
- mls v15.4S, v19.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. - sub v19.4S, v10.4S, v27.4S // ..................................................................................................................................................*..................................................................................................................................... - add v10.4S, v10.4S, v27.4S // ...................................................................................................................................................*.................................................................................................................................... - add v27.4S, v20.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... - sub v23.4S, v20.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... - sub v17.4S, v28.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ mls v20.4S, v15.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + add v15.4S, v16.4S, v10.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v10.4S, v27.4S, v21.4S // ........................................................*............................................................................................................................................................................................................................... + add v27.4S, v27.4S, v21.4S // .........................................................*.............................................................................................................................................................................................................................. + ldr q21, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mls v24.4S, v13.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
+ sub v13.4S, v23.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... + add v16.4S, v23.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v18.4S, v15.4S, v8.4S // ..................................................................*..................................................................................................................................................................................................................... + mls v22.4S, v9.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + add v8.4S, v15.4S, v8.4S // ...................................................................*.................................................................................................................................................................................................................... + add v15.4S, v19.4S, v11.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ sub v19.4S, v19.4S, v11.4S // .............................................................*.......................................................................................................................................................................................................................... + mul v23.4S, v10.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ + sub v11.4S, v14.4S, v21.4S // ..............................................*......................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. + add v21.4S, v14.4S, v21.4S // ...............................................*........................................................................................................................................................................................................................................ + sqrdmulh v14.4S, v18.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... 
+ mul v18.4S, v18.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. + add v9.4S, v21.4S, v28.4S // .......................................................................................*................................................................................................................................................................................................ + sub v28.4S, v21.4S, v28.4S // ......................................................................................*................................................................................................................................................................................................. + mul v21.4S, v11.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + mls v23.4S, v10.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v10.4S, v11.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... 
+ sub v11.4S, v22.4S, v24.4S // .......................................................................*................................................................................................................................................................................................................ + add v22.4S, v22.4S, v24.4S // ........................................................................*............................................................................................................................................................................................................... + mls v18.4S, v14.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + mul v24.4S, v12.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v12.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. + sub v12.4S, v27.4S, v8.4S // ................................................................................................*....................................................................................................................................................................................... 
+ mls v21.4S, v10.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v19.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + add v8.4S, v27.4S, v8.4S // .................................................................................................*...................................................................................................................................................................................... + mul v27.4S, v19.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + mls v24.4S, v14.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v14.4S, v28.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... 
+ mul v19.4S, v28.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. + add v28.4S, v16.4S, v9.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v16.4S, v16.4S, v9.4S // ....................................................................................................................*................................................................................................................................................................... + sub v9.4S, v17.4S, v20.4S // .................................................................................*...................................................................................................................................................................................................... + add v17.4S, v17.4S, v20.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v20.4S, v21.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ add v21.4S, v21.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... + mul v24.4S, v9.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + sqrdmulh v9.4S, v9.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mul v10.4S, v11.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. + sqrdmulh v11.4S, v11.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. 
+ mls v19.4S, v14.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v14.4S, v12.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... + mls v24.4S, v9.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + mul v9.4S, v13.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + sqrdmulh v13.4S, v13.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... 
+ mls v10.4S, v11.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sub v11.4S, v8.4S, v28.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v28.4S // .........................................................................................................................................*.............................................................................................................................................. + mul v28.4S, v16.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ + mls v12.4S, v14.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sqrdmulh v14.4S, v16.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. 
+ sub v16.4S, v15.4S, v22.4S // .....................................................................................................*.................................................................................................................................................................................. + add v22.4S, v15.4S, v22.4S // ......................................................................................................*................................................................................................................................................................................. + add v15.4S, v23.4S, v18.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v23.4S, v23.4S, v18.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v9.4S, v13.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + mul v13.4S, v16.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... 
+ sqrdmulh v18.4S, v16.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + mul v16.4S, v11.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. + mls v13.4S, v18.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + mls v28.4S, v14.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + mul v18.4S, v20.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... 
+ sqrdmulh v20.4S, v20.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... + mls v16.4S, v11.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mul v14.4S, v23.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... + add v11.4S, v27.4S, v10.4S // ................................................................................................................*....................................................................................................................................................................... + sub v27.4S, v27.4S, v10.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ add v10.4S, v9.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ + sub v19.4S, v9.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... + add v9.4S, v17.4S, v21.4S // ..........................................................................................................................*............................................................................................................................................................. + mls v18.4S, v20.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v20.4S, v12.4S, v28.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v28.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ sub v21.4S, v17.4S, v21.4S // .........................................................................................................................*.............................................................................................................................................................. + sub v28.4S, v22.4S, v9.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v22.4S, v9.4S // ..............................................................................................................................................*......................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v23.4S, v27.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... + mul v22.4S, v21.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v17.4S, v27.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... + add v27.4S, v24.4S, v18.4S // ....................................................................................................................................*................................................................................................................................................... + sub v18.4S, v24.4S, v18.4S // ...................................................................................................................................*.................................................................................................................................................... + mul v24.4S, v19.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... 
+ mls v23.4S, v17.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... sub count, count, #1 layer1234_start: - mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... - sub v20.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... - add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... 
- mul v28.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... - sqrdmulh v22.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - sub v18.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... - mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
- sub v15.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. - add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ - str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - mul v24.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - sqrdmulh v16.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - mls v28.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
- sqrdmulh v22.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - sub v23.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ - add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... - mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - mul v27.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
- sqrdmulh v20.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - sqrdmulh v19.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ - mls v24.4S, v22.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mul v22.4S, v15.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - mls v27.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
- mls v17.4S, v19.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - cmge v19.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... - cmge v16.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v18.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - sqrdmulh v20.4S, v15.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - add v15.4S, v28.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... 
- sub v24.4S, v28.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ - sub v28.4S, v19.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... - mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sqrdmulh v16.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mul v23.4S, v24.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sqrdmulh v24.4S, v24.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
- mls v21.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - mls v22.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v20.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... - mls v19.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... 
- sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - sub v20.4S, v20.4S, v28.4S // ..........................................................................................................................................................................................*............................................................................................. - cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - cmge v24.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... 
- mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - sub v21.4S, v28.4S, v24.4S // ......................................................................................................................................................................................*................................................................................................. - cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... - cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
- cmge v24.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... - mls v17.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - sub v28.4S, v24.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... - sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - sub v13.4S, v20.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... 
- mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - cmge v8.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... - mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
- sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
- sub v24.4S, v18.4S, v8.4S // ..........................................................................................................................................................................................................*............................................................................. - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - cmge v10.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
- str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - cmge v12.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... 
- mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - cmge v24.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - sub v10.4S, v12.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. - cmge v12.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - mls v18.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
- sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - sub v11.4S, v11.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... - cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
- mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - sub v14.4S, v12.4S, v24.4S // ..................................................................................................................................................................................................................................................................*..................... - mls v27.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
- mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - cmge v10.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - sub v13.4S, v9.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. 
- sub v24.4S, v8.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. - str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - ldr q9, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... - cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
- ldr q16, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. - mls v20.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - sub v14.4S, v12.4S, v10.4S // ..............................................................................................................................................................................................................................................................................*......... - ldr q10, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. - cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - ldr q12, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ 
- cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - ldr q11, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... - mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - sub v24.4S, v27.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. - cmge v8.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - cmge v27.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... 
- mls v23.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - ldr q14, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. - mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - add v24.4S, v9.4S, v16.4S // ...........................e............................................................................................................................................................................................................................................................ - sub v13.4S, v9.4S, v16.4S // ..........................e............................................................................................................................................................................................................................................................. 
- sub v16.4S, v27.4S, v28.4S // ..............................................................................................................................................................................................................................................................*......................... - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - sub v27.4S, v10.4S, v12.4S // ...............................e........................................................................................................................................................................................................................................................ - sub v15.4S, v8.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. - sub v18.4S, v11.4S, v14.4S // ....................................e................................................................................................................................................................................................................................................... - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
- add v20.4S, v11.4S, v14.4S // .....................................e.................................................................................................................................................................................................................................................. - add v8.4S, v10.4S, v12.4S // ................................e....................................................................................................................................................................................................................................................... - ldr q10, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ - ldr q14, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... - ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - ldr q12, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... 
- mls v19.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - sqrdmulh v9.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... - ldr q17, [x1, #0] // e....................................................................................................................................................................................................................................................................................... - ldr q16, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... - mul v11.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... 
- str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - add v27.4S, v24.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... - mul v21.4S, v18.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - sub v23.4S, v24.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... - sqrdmulh v8.4S, v18.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ 
- mul v18.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... - mls v22.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - add v15.4S, v28.4S, v10.4S // ..........................................e............................................................................................................................................................................................................................................. - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - sub v19.4S, v28.4S, v10.4S // .........................................e.............................................................................................................................................................................................................................................. 
- mul v10.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... - add v13.4S, v16.4S, v14.4S // ......................e................................................................................................................................................................................................................................................................. - mls v11.4S, v9.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - sub v9.4S, v16.4S, v14.4S // .....................e.................................................................................................................................................................................................................................................................. - ldr q28, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... 
- str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - mul v14.4S, v19.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - mls v18.4S, v24.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - mls v21.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - sqrdmulh v16.4S, v19.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... - ldr q24, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
- mul v19.4S, v9.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - sqrdmulh v22.4S, v9.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - add v9.4S, v17.4S, v12.4S // .................e...................................................................................................................................................................................................................................................................... - sub v17.4S, v17.4S, v12.4S // ................e....................................................................................................................................................................................................................................................................... - ldr q12, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... 
- mls v10.4S, v23.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. - sub v23.4S, v20.4S, v15.4S // ............................................................................e........................................................................................................................................................................................................... - mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - add v16.4S, v20.4S, v15.4S // .............................................................................e.......................................................................................................................................................................................................... - add v20.4S, v9.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. - mls v19.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. 
- add v22.4S, v24.4S, v28.4S // ...............................................e........................................................................................................................................................................................................................................ - sub v15.4S, v24.4S, v28.4S // ..............................................e......................................................................................................................................................................................................................................... - sub v24.4S, v9.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... - add v13.4S, v8.4S, v12.4S // ....................................................e................................................................................................................................................................................................................................... - sub v8.4S, v8.4S, v12.4S // ...................................................e.................................................................................................................................................................................................................................... - sub v28.4S, v22.4S, v13.4S // ......................................................................................e................................................................................................................................................................................................. 
- add v13.4S, v22.4S, v13.4S // .......................................................................................e................................................................................................................................................................................................ - mul v22.4S, v17.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... - mul v12.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... - sub v9.4S, v20.4S, v27.4S // ................................................................................................e....................................................................................................................................................................................... 
- add v27.4S, v20.4S, v27.4S // .................................................................................................e...................................................................................................................................................................................... - sub v20.4S, v16.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... - add v16.4S, v16.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. - mul v13.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. - sqrdmulh v24.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ - mls v22.4S, v17.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... 
- add v17.4S, v18.4S, v11.4S // ........................................................................e............................................................................................................................................................................................................... - mls v12.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - sub v18.4S, v18.4S, v11.4S // .......................................................................e................................................................................................................................................................................................................ - mul v15.4S, v8.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. - sqrdmulh v11.4S, v8.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - sqrdmulh v8.4S, v28.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
- mls v13.4S, v24.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... - add v24.4S, v22.4S, v19.4S // ..............................................................e......................................................................................................................................................................................................................... - sub v19.4S, v22.4S, v19.4S // .............................................................e.......................................................................................................................................................................................................................... - mul v22.4S, v28.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... - mls v15.4S, v11.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ - mul v11.4S, v19.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ 
- sqrdmulh v28.4S, v19.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... - sqrdmulh v19.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ - mul v23.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - mls v11.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... - mls v23.4S, v19.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... - add v28.4S, v12.4S, v15.4S // ............................................................................................e........................................................................................................................................................................................... 
- sub v15.4S, v12.4S, v15.4S // ...........................................................................................e............................................................................................................................................................................................ - sub v12.4S, v27.4S, v16.4S // ........................................................................................................................................e............................................................................................................................................... - mls v22.4S, v8.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. - add v8.4S, v27.4S, v16.4S // .........................................................................................................................................e.............................................................................................................................................. - add v19.4S, v13.4S, v10.4S // ...........................................................................................................e............................................................................................................................................................................ - sub v13.4S, v13.4S, v10.4S // ..........................................................................................................e............................................................................................................................................................................. 
- mul v16.4S, v12.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. - sqrdmulh v12.4S, v12.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ - sub v10.4S, v24.4S, v17.4S // .....................................................................................................e.................................................................................................................................................................................. - add v27.4S, v23.4S, v22.4S // ...............................................................................................................................e........................................................................................................................................................ - sub v23.4S, v23.4S, v22.4S // ..............................................................................................................................e......................................................................................................................................................... - add v22.4S, v24.4S, v17.4S // ......................................................................................................e................................................................................................................................................................................. 
- mul v24.4S, v13.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... - sqrdmulh v17.4S, v18.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. - mul v18.4S, v18.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. - sqrdmulh v13.4S, v13.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... - mls v16.4S, v12.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... - mul v12.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. 
- sqrdmulh v20.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - mls v24.4S, v13.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... - mls v18.4S, v17.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ - sqrdmulh v13.4S, v9.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... - mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... - sub v17.4S, v21.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... 
- mls v12.4S, v20.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... - mul v20.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... - add v21.4S, v21.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... - mul v14.4S, v23.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... 
- mls v9.4S, v13.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... - sub v13.4S, v11.4S, v18.4S // ...............................................................................................................e........................................................................................................................................................................ - add v11.4S, v11.4S, v18.4S // ................................................................................................................e....................................................................................................................................................................... - add v18.4S, v21.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. - sub v21.4S, v21.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. - mul v28.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... 
- sqrdmulh v17.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - mls v20.4S, v15.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - mul v15.4S, v10.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - sqrdmulh v10.4S, v10.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - mls v14.4S, v23.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... - mls v28.4S, v17.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. 
- sqrdmulh v23.4S, v21.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - mul v21.4S, v21.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ - cmge v17.4S, v31.4S, v16.4S // ................................................................................................................................................................................e....................................................................................................... - mls v15.4S, v10.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - add v10.4S, v19.4S, v27.4S // ...................................................................................................................................................e.................................................................................................................................... - sub v19.4S, v19.4S, v27.4S // ..................................................................................................................................................e..................................................................................................................................... 
- cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................e...................................................................................................... - mls v21.4S, v23.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... - sub v17.4S, v17.4S, v27.4S // ..................................................................................................................................................................................e..................................................................................................... - add v27.4S, v28.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... - sub v23.4S, v28.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ // Instructions: 280 + // Expected cycles: 35 + // Expected IPC: 8.00 + + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + mls v24.4S, v19.4S, v29.4S // *....................................................................................................................................................................................................................................................................................... + cmge v17.4S, v16.4S, v30.4S // ...*.................................................................................................................................................................................................................................................................................... + mls v22.4S, v21.4S, v29.4S // .*...................................................................................................................................................................................................................................................................................... + cmge v19.4S, v31.4S, v16.4S // ..*..................................................................................................................................................................................................................................................................................... 
+ sub v21.4S, v15.4S, v10.4S // .........................*.............................................................................................................................................................................................................................................................. + add v15.4S, v15.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. + mul v10.4S, v28.4S, v0.S[0] // .....*.................................................................................................................................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v0.S[1] // ....*................................................................................................................................................................................................................................................................................... + sub v19.4S, v19.4S, v17.4S // ......*................................................................................................................................................................................................................................................................................. + add v17.4S, v14.4S, v24.4S // ........*............................................................................................................................................................................................................................................................................... 
+ sub v24.4S, v14.4S, v24.4S // .......*................................................................................................................................................................................................................................................................................ + mls v16.4S, v19.4S, v29.4S // ...........*............................................................................................................................................................................................................................................................................ + sqrdmulh v14.4S, v20.4S, v0.S[1] // .....................*.................................................................................................................................................................................................................................................................. + mul v20.4S, v20.4S, v0.S[0] // ......................*................................................................................................................................................................................................................................................................. + mls v10.4S, v28.4S, v29.4S // ............*........................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v18.4S, v1.S[1] // ..............*......................................................................................................................................................................................................................................................................... 
+ mul v19.4S, v18.4S, v1.S[0] // .............*.......................................................................................................................................................................................................................................................................... + sub v18.4S, v13.4S, v22.4S // .........*.............................................................................................................................................................................................................................................................................. + add v13.4S, v13.4S, v22.4S // ..........*............................................................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v8.4S, v26.4S // .................................................................*...................................................................................................................................................................................................................... + mul v8.4S, v8.4S, v25.4S // ..................................................................*..................................................................................................................................................................................................................... + str q16, [x1, #512] // .................*...................................................................................................................................................................................................................................................................... 
+ mls v20.4S, v14.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v10.4S // ....................*................................................................................................................................................................................................................................................................... + cmge v14.4S, v10.4S, v30.4S // ...................*.................................................................................................................................................................................................................................................................... + mls v19.4S, v28.4S, v29.4S // ..................*..................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v17.4S, v26.4S // ......................................*................................................................................................................................................................................................................................................. + sub v16.4S, v16.4S, v14.4S // ........................*............................................................................................................................................................................................................................................................... 
+ mul v14.4S, v17.4S, v25.4S // .......................................*................................................................................................................................................................................................................................................ + mls v10.4S, v16.4S, v29.4S // ............................*........................................................................................................................................................................................................................................................... + cmge v17.4S, v31.4S, v20.4S // ..........................................*............................................................................................................................................................................................................................................. + cmge v16.4S, v20.4S, v30.4S // .........................................*.............................................................................................................................................................................................................................................. + mls v8.4S, v22.4S, v29.4S // ..............................................................................*......................................................................................................................................................................................................... + mul v22.4S, v24.4S, v0.S[0] // ...............*........................................................................................................................................................................................................................................................................ 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // ................*....................................................................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v16.4S // ..................................................*..................................................................................................................................................................................................................................... + add v16.4S, v23.4S, v19.4S // ................................*....................................................................................................................................................................................................................................................... + sub v23.4S, v23.4S, v19.4S // .................................*...................................................................................................................................................................................................................................................... + mul v19.4S, v18.4S, v0.S[0] // ..................................*..................................................................................................................................................................................................................................................... + mls v14.4S, v28.4S, v29.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ sqrdmulh v28.4S, v18.4S, v0.S[1] // ...................................*.................................................................................................................................................................................................................................................... + str q10, [x1, #576] // ........................................*............................................................................................................................................................................................................................................... + sub v10.4S, v11.4S, v27.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v18.4S, v21.4S, v0.S[0] // .............................................*.......................................................................................................................................................................................................................................... + mls v22.4S, v24.4S, v29.4S // .......................*................................................................................................................................................................................................................................................................ + add v11.4S, v11.4S, v27.4S // ....................................*................................................................................................................................................................................................................................................... 
+ mls v20.4S, v17.4S, v29.4S // .........................................................*.............................................................................................................................................................................................................................. + sqrdmulh v17.4S, v21.4S, v0.S[1] // ............................................*........................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v10.4S, v0.S[1] // .................................................*...................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v13.4S, v26.4S // ..............................................................*......................................................................................................................................................................................................................... + mul v21.4S, v10.4S, v0.S[0] // ................................................*....................................................................................................................................................................................................................................... + mul v10.4S, v15.4S, v25.4S // ....................................................*................................................................................................................................................................................................................................... 
+ mls v19.4S, v28.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v11.4S, v26.4S // ........................................................*............................................................................................................................................................................................................................... + mul v11.4S, v11.4S, v25.4S // ...................................................................*.................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v25.4S // ...............................................................*........................................................................................................................................................................................................................ + str q20, [x1, #768] // .......................................................................*................................................................................................................................................................................................................ + mls v13.4S, v24.4S, v29.4S // .........................................................................*.............................................................................................................................................................................................................. 
+ mls v21.4S, v27.4S, v29.4S // ...........................................................*............................................................................................................................................................................................................................ + sqrdmulh v27.4S, v23.4S, v0.S[1] // .....................................................*.................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v0.S[0] // ......................................................*................................................................................................................................................................................................................................. + mls v18.4S, v17.4S, v29.4S // ................................................................*....................................................................................................................................................................................................................... + sqrdmulh v17.4S, v15.4S, v26.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v15.4S, v16.4S, v25.4S // ..........................................................................*............................................................................................................................................................................................................. 
+ sqrdmulh v24.4S, v16.4S, v26.4S // .........................................................................................*.............................................................................................................................................................................................. + mls v11.4S, v28.4S, v29.4S // ...................................................................................*.................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v22.4S // ...............................*........................................................................................................................................................................................................................................................ + cmge v28.4S, v22.4S, v30.4S // ..............................*......................................................................................................................................................................................................................................................... + mls v23.4S, v27.4S, v29.4S // .....................................................................*.................................................................................................................................................................................................................. + cmge v27.4S, v19.4S, v30.4S // ..........................................................*............................................................................................................................................................................................................................. 
+ cmge v20.4S, v31.4S, v19.4S // .............................................................*.......................................................................................................................................................................................................................... + sub v16.4S, v16.4S, v28.4S // .....................................*.................................................................................................................................................................................................................................................. + mls v10.4S, v17.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + cmge v17.4S, v18.4S, v30.4S // ................................................................................*....................................................................................................................................................................................................... + sub v28.4S, v20.4S, v27.4S // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v20.4S, v12.4S, v26.4S // ...........................................................................*............................................................................................................................................................................................................ 
+ mul v12.4S, v12.4S, v25.4S // ............................................................................*........................................................................................................................................................................................................... + cmge v27.4S, v31.4S, v18.4S // ...............................................................................*........................................................................................................................................................................................................ + mls v22.4S, v16.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ + mls v15.4S, v24.4S, v29.4S // .......................................................................................................*................................................................................................................................................................................ + cmge v24.4S, v23.4S, v30.4S // ..................................................................................*..................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v23.4S // ........................................................................................*............................................................................................................................................................................................... 
+ mls v19.4S, v28.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... + sub v27.4S, v27.4S, v17.4S // .....................................................................................*.................................................................................................................................................................................................. + cmge v17.4S, v31.4S, v15.4S // .......................................................................................................................*................................................................................................................................................................ + cmge v28.4S, v31.4S, v21.4S // ........................................................................*............................................................................................................................................................................................................... + str q19, [x1, #832] // .......................................................................................*................................................................................................................................................................................................ + cmge v19.4S, v21.4S, v30.4S // ......................................................................*................................................................................................................................................................................................................. 
+ str q22, [x1, #896] // .......................................................*................................................................................................................................................................................................................................ + cmge v22.4S, v13.4S, v30.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v16.4S, v16.4S, v24.4S // .............................................................................................*.......................................................................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v24.4S, v9.4S, v26.4S // ..............................................................................................*......................................................................................................................................................................................... + mul v9.4S, v9.4S, v25.4S // ...............................................................................................*........................................................................................................................................................................................ 
+ sub v19.4S, v28.4S, v19.4S // .............................................................................*.......................................................................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ............................................................................................*........................................................................................................................................................................................... + cmge v28.4S, v10.4S, v30.4S // ...................................................................................................*.................................................................................................................................................................................... + cmge v27.4S, v31.4S, v10.4S // ........................................................................................................*............................................................................................................................................................................... + cmge v20.4S, v11.4S, v30.4S // .........................................................................................................*.............................................................................................................................................................................. + mls v23.4S, v16.4S, v29.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ sub v28.4S, v27.4S, v28.4S // .............................................................................................................*.......................................................................................................................................................................... + str q18, [x1, #640] // ..............................................................................................................*......................................................................................................................................................................... + cmge v16.4S, v31.4S, v11.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v9.4S, v24.4S, v29.4S // ............................................................................................................*........................................................................................................................................................................... + cmge v24.4S, v31.4S, v13.4S // ...........................................................................................*............................................................................................................................................................................................ + cmge v27.4S, v31.4S, v14.4S // ..................................................................................................................*..................................................................................................................................................................... 
+ cmge v18.4S, v31.4S, v12.4S // ...........................................................................................................*............................................................................................................................................................................ + mls v21.4S, v19.4S, v29.4S // ......................................................................................*................................................................................................................................................................................................. + cmge v19.4S, v12.4S, v30.4S // ...............................................................................................................*........................................................................................................................................................................ + str q23, [x1, #960] // ................................................................................................................*....................................................................................................................................................................... + sub v23.4S, v16.4S, v20.4S // ...........................................................................................................................*............................................................................................................................................................ + mls v10.4S, v28.4S, v29.4S // .................................................................................................................*...................................................................................................................................................................... 
+ cmge v20.4S, v31.4S, v9.4S // ......................................................................................................................*................................................................................................................................................................. + sub v18.4S, v18.4S, v19.4S // .....................................................................................................................*.................................................................................................................................................................. + str q21, [x1, #704] // ................................................................................................*....................................................................................................................................................................................... + str q10, [x1, #128] // ..............................................................................................................................*......................................................................................................................................................... + cmge v19.4S, v14.4S, v30.4S // ......................................................................................................*................................................................................................................................................................................. + ldr q16, [x1, #720] // ..................................................................................................................................................*..................................................................................................................................... 
+ cmge v28.4S, v15.4S, v30.4S // ........................................................................................................................*............................................................................................................................................................... + cmge v10.4S, v9.4S, v30.4S // ....................................................................................................................*................................................................................................................................................................... + mls v11.4S, v23.4S, v29.4S // .................................................................................................................................*...................................................................................................................................................... + sub v24.4S, v24.4S, v22.4S // .................................................................................................*...................................................................................................................................................................................... + ldr q22, [x1, #656] // ...................................................................................................................................................*.................................................................................................................................... + ldr q23, [x1, #592] // ............................................................................................................................................*........................................................................................................................................... 
+ ldr q21, [x1, #528] // .............................................................................................................................................*.......................................................................................................................................... + mls v12.4S, v18.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v19.4S, v27.4S, v19.4S // ..........................................................................................................................*............................................................................................................................................................. + cmge v18.4S, v8.4S, v30.4S // ....................................................................................*................................................................................................................................................................................................... + sub v28.4S, v17.4S, v28.4S // ................................................................................................................................*....................................................................................................................................................... + ldr q27, [x1, #16] // .....................................................................................................................................................*.................................................................................................................................. 
+ ldr q17, [x1, #80] // ....................................................................................................................................................*................................................................................................................................... + sub v20.4S, v20.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ + cmge v10.4S, v31.4S, v8.4S // ....................................................................................................*................................................................................................................................................................................... + mls v13.4S, v24.4S, v29.4S // ............................................................................................................................*........................................................................................................................................................... + str q11, [x1, #192] // ........................................................................................................................................*............................................................................................................................................... + mls v14.4S, v19.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ mls v15.4S, v28.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + mls v9.4S, v20.4S, v29.4S // ....................................................................................................................................*................................................................................................................................................... + sub v20.4S, v21.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + str q12, [x1, #256] // .......................................................................................................................................*................................................................................................................................................ + ldr q24, [x1, #464] // .................................................................................................................................................*...................................................................................................................................... + add v21.4S, v21.4S, v23.4S // .........................................................................................................................................................*.............................................................................................................................. 
+ str q15, [x1, #448] // ..........................................................................................................................................*............................................................................................................................................. + str q9, [x1, #64] // ...........................................................................................................................................*............................................................................................................................................ + ldr q9, [x1, #272] // ......................................................................................................................................................*................................................................................................................................. + sub v19.4S, v27.4S, v17.4S // .....................................................................................................................................................................*.................................................................................................................. + sub v28.4S, v22.4S, v16.4S // .................................................................................................................................................................*...................................................................................................................... + sub v11.4S, v10.4S, v18.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ add v15.4S, v27.4S, v17.4S // ....................................................................................................................................................................*................................................................................................................... + mul v17.4S, v20.4S, v5.S[2] // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v18.4S, v20.4S, v5.S[3] // ...................................................................................................................................................................*.................................................................................................................... + add v23.4S, v22.4S, v16.4S // ................................................................................................................................................................*....................................................................................................................... + ldr q27, [x1, #336] // ..........................................................................................................................................................*............................................................................................................................. + str q14, [x1, #384] // .........................................................................................................................................*.............................................................................................................................................. 
+ ldr q12, [x1, #144] // ..............................................................................................................................................*......................................................................................................................................... + ldr q10, [x1, #208] // ...............................................................................................................................................*........................................................................................................................................ + str q13, [x1, #320] // ......................................................................................................................................*................................................................................................................................................. + sqrdmulh v22.4S, v19.4S, v3.S[3] // ..........................................................................................................................................................................*............................................................................................................. + sqrdmulh v13.4S, v28.4S, v6.S[1] // .........................................................................................................................................................................*.............................................................................................................. + ldr q14, [x1, #400] // ................................................................................................................................................*....................................................................................................................................... 
+ mul v20.4S, v28.4S, v6.S[0] // ........................................................................................................................................................................*............................................................................................................... + ldr q16, [x1, #912] // ...........................................................................................................................................................*............................................................................................................................ + mls v8.4S, v11.4S, v29.4S // .........................................................................................................................*.............................................................................................................................................................. + ldr q28, [x1, #976] // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v21.4S, v23.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v17.4S, v18.4S, v29.4S // ................................................................................................................................................................................*....................................................................................................... 
+ mul v19.4S, v19.4S, v3.S[2] // ...........................................................................................................................................................................*............................................................................................................ + str q8, [x1], #(16) // .....................................................................................................................................*.................................................................................................................................................. + sub v18.4S, v16.4S, v28.4S // .................................................................................................................................................................................*...................................................................................................... + add v16.4S, v16.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... + add v28.4S, v12.4S, v10.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v8.4S, v12.4S, v10.4S // ............................................................................................................................................................*........................................................................................................................... 
+ add v10.4S, v9.4S, v27.4S // ........................................................................................................................................................................................*............................................................................................... + sub v27.4S, v9.4S, v27.4S // ...............................................................................................................................................................................*........................................................................................................ + mls v20.4S, v13.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v13.4S, v15.4S, v28.4S // .........................................................................................................................................................................................*.............................................................................................. + add v9.4S, v15.4S, v28.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v15.4S, v8.4S, v4.S[0] // .......................................................................................................................................................................*................................................................................................................ 
+ sqrdmulh v28.4S, v8.4S, v4.S[1] // ......................................................................................................................................................................*................................................................................................................. + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v8.4S, v21.4S, v23.4S // .............................................................................................................................................................................................*.......................................................................................... + ldr q21, [x1, #832] // ...........................................................................................................................................................................................*............................................................................................ + mul v22.4S, v27.4S, v4.S[2] // .....................................................................................................................................................................................*.................................................................................................. + sqrdmulh v27.4S, v27.4S, v4.S[3] // ....................................................................................................................................................................................*................................................................................................... 
+ ldr q12, [x1, #768] // ......................................................................................................................................................................................*................................................................................................. + add v23.4S, v14.4S, v24.4S // ..............................................................................................................................................................*......................................................................................................................... + sub v24.4S, v14.4S, v24.4S // ...............................................................................................................................................................*........................................................................................................................ + sub v14.4S, v17.4S, v20.4S // ..............................................................................................................................................................................................................................*......................................................... + add v20.4S, v17.4S, v20.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v15.4S, v28.4S, v29.4S // ............................................................................................................................................................................*........................................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v7.S[1] // ...................................................................................................................................................................................................................*.................................................................... + mul v28.4S, v18.4S, v7.S[0] // ..................................................................................................................................................................................................................*..................................................................... + mls v22.4S, v27.4S, v29.4S // ................................................................................................................................................................................................*....................................................................................... + sub v18.4S, v10.4S, v23.4S // ...............................................................................................................................................................................................*........................................................................................ + add v10.4S, v10.4S, v23.4S // .................................................................................................................................................................................................*...................................................................................... + mul v23.4S, v13.4S, v1.S[2] // ....................................................................................................................................................................................................*................................................................................... 
+ mls v28.4S, v17.4S, v29.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v17.4S, v12.4S, v21.4S // .....................................................................................................................................................................................................*.................................................................................. + add v27.4S, v12.4S, v21.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v21.4S, v13.4S, v1.S[3] // ......................................................................................................................................................................................................*................................................................................. + sub v13.4S, v9.4S, v10.4S // ....................................................................................................................................................................................................................*................................................................... + add v10.4S, v9.4S, v10.4S // .......................................................................................................................................................................................................................*................................................................ 
+ sqrdmulh v12.4S, v24.4S, v5.S[1] // .............................................................................................................................................................................*.......................................................................................................... + mul v9.4S, v24.4S, v5.S[0] // ..............................................................................................................................................................................*......................................................................................................... + add v24.4S, v27.4S, v16.4S // ..........................................................................................................................................................................................................*............................................................................. + sub v16.4S, v27.4S, v16.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v27.4S, v17.4S, v6.S[2] // ............................................................................................................................................................................................................*........................................................................... + sqrdmulh v17.4S, v17.4S, v6.S[3] // ..............................................................................................................................................................................................................*......................................................................... 
+ mls v23.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v21.4S, v13.4S, v0.S[3] // ........................................................................................................................................................................................................................................*............................................... + mls v9.4S, v12.4S, v29.4S // ............................................................................................................................................................................................*........................................................................................... + sub v12.4S, v11.4S, v24.4S // .............................................................................................................................................................................................................................*.......................................................... + add v24.4S, v11.4S, v24.4S // ............................................................................................................................................................................................................................*........................................................... + mul v11.4S, v13.4S, v0.S[2] // .........................................................................................................................................................................................................................................*.............................................. 
+ sub v13.4S, v19.4S, v15.4S // ...................................................................................................................................................................................................*.................................................................................... + add v15.4S, v19.4S, v15.4S // ..................................................................................................................................................................................................*..................................................................................... + mls v27.4S, v17.4S, v29.4S // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v19.4S, v8.4S, v2.S[3] // ............................................................................................................................................................................................................................................*........................................... + mul v17.4S, v8.4S, v2.S[2] // ...........................................................................................................................................................................................................................................*............................................ + add v8.4S, v10.4S, v24.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ sub v24.4S, v10.4S, v24.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v11.4S, v21.4S, v29.4S // .................................................................................................................................................................................................................................................*...................................... + sqrdmulh v21.4S, v12.4S, v1.S[1] // ..................................................................................................................................................................................................................................................*..................................... + mul v12.4S, v12.4S, v1.S[0] // ................................................................................................................................................................................................................................................*....................................... + sub v10.4S, v22.4S, v9.4S // ...............................................................................................................................................................................................................*........................................................................ + add v22.4S, v22.4S, v9.4S // ................................................................................................................................................................................................................*....................................................................... 
+ sub v9.4S, v27.4S, v28.4S // ................................................................................................................................................................................................................................*....................................................... + add v28.4S, v27.4S, v28.4S // .................................................................................................................................................................................................................................*...................................................... + mul v27.4S, v16.4S, v3.S[0] // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v16.4S, v16.4S, v3.S[1] // ..........................................................................................................................................................................................................................*............................................................. + mls v17.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + mls v12.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ sub v21.4S, v15.4S, v22.4S // ...................................................................................................................................................................................................................................................*.................................... + add v22.4S, v15.4S, v22.4S // ....................................................................................................................................................................................................................................................*................................... + sqrdmulh v15.4S, v10.4S, v2.S[1] // ......................................................................................................................................................................................................................................*................................................. + mul v19.4S, v10.4S, v2.S[0] // .....................................................................................................................................................................................................................................*.................................................. + sqrdmulh v10.4S, v24.4S, v0.S[1] // ...........................................................................................................................................................................................................................................................*............................ + mls v27.4S, v16.4S, v29.4S // .......................................................................................................................................................................................................................................*................................................ 
+ mul v16.4S, v24.4S, v0.S[0] // ..........................................................................................................................................................................................................................................................*............................. + mul v24.4S, v13.4S, v1.S[2] // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v13.4S, v13.4S, v1.S[3] // ......................................................................................................................................................................................................................*................................................................. + mls v19.4S, v15.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v15.4S, v18.4S, v2.S[0] // .........................................................................................................................................................................................................*.............................................................................. + sqrdmulh v18.4S, v18.4S, v2.S[1] // ........................................................................................................................................................................................................*............................................................................... 
+ mls v24.4S, v13.4S, v29.4S // ....................................................................................................................................................................................................................................*................................................... + mul v13.4S, v21.4S, v0.S[2] // ........................................................................................................................................................................................................................................................*............................... + mls v16.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v18.4S, v29.4S // .................................................................................................................................................................................................................*...................................................................... + mul v18.4S, v14.4S, v2.S[2] // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v14.4S, v14.4S, v2.S[3] // ...................................................................................................................................................................................................................................*.................................................... 
+ sqrdmulh v10.4S, v21.4S, v0.S[3] // .........................................................................................................................................................................................................................................................*.............................. + sub v21.4S, v20.4S, v28.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v13.4S, v10.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... + add v10.4S, v17.4S, v27.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v27.4S, v17.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + mul v17.4S, v9.4S, v3.S[0] // ..............................................................................................................................................................................................................................................................*......................... 
+ add v28.4S, v20.4S, v28.4S // .......................................................................................................................................................................................................................................................................*................ + mls v18.4S, v14.4S, v29.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v14.4S, v9.4S, v3.S[1] // ...............................................................................................................................................................................................................................................................*........................ + sub v9.4S, v24.4S, v19.4S // ....................................................................................................................................................................................................................................................................*................... + sub v20.4S, v11.4S, v12.4S // .........................................................................................................................................................................................................................................................................*.............. + add v12.4S, v11.4S, v12.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ add v11.4S, v24.4S, v19.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v24.4S, v23.4S, v15.4S // ......................................................................................................................................................................................................................................................*................................. + add v15.4S, v23.4S, v15.4S // .....................................................................................................................................................................................................................................................*.................................. + sqrdmulh v19.4S, v9.4S, v0.S[3] // ..................................................................................................................................................................................................................................................................................*..... + mul v23.4S, v9.4S, v0.S[2] // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v14.4S, v29.4S // ........................................................................................................................................................................................................................................................................*............... 
+ mul v14.4S, v24.4S, v0.S[2] // .................................................................................................................................................................................................................................................................*...................... + sqrdmulh v24.4S, v24.4S, v0.S[3] // ..................................................................................................................................................................................................................................................................*..................... + add v9.4S, v22.4S, v28.4S // .............................................................................................................................................................................................................................................................................*.......... + sub v28.4S, v22.4S, v28.4S // ............................................................................................................................................................................................................................................................................*........... + mul v22.4S, v21.4S, v1.S[0] // ................................................................................................................................................................................................................................................................................*....... + sqrdmulh v21.4S, v21.4S, v1.S[1] // .................................................................................................................................................................................................................................................................................*...... 
+ mls v23.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................................................* + sqrdmulh v19.4S, v27.4S, v1.S[1] // ......................................................................................................................................................................................................................................................................................*. + mls v14.4S, v24.4S, v29.4S // ..............................................................................................................................................................................................................................................................................*......... + mul v24.4S, v27.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................................*.. + add v27.4S, v18.4S, v17.4S // ...................................................................................................................................................................................................................................................................................*.... + sub v18.4S, v18.4S, v17.4S // ....................................................................................................................................................................................................................................................................................*... 
- // original source code - // ldr q8, [x1, #0] // ...................................e...............................................................................................................................|.......................................................................................................................................................e..................... - // ldr q9, [x1, #(1*(512/8))] // ...............................e...................................................................................................................................|...................................................................................................................................................e......................... - // ldr q10, [x1, #(2*(512/8))] // ....................................e..............................................................................................................................|........................................................................................................................................................e.................... - // ldr q11, [x1, #(3*(512/8))] // .............................e.....................................................................................................................................|.................................................................................................................................................e........................... - // ldr q12, [x1, #(4*(512/8))] // e..................................................................................................................................................................|....................................................................................................................e........................................................ 
- // ldr q13, [x1, #(5*(512/8))] // ..e................................................................................................................................................................|......................................................................................................................e...................................................... - // ldr q14, [x1, #(6*(512/8))] // .....e.............................................................................................................................................................|.........................................................................................................................e................................................... - // ldr q15, [x1, #(7*(512/8))] // .......e...........................................................................................................................................................|...........................................................................................................................e................................................. - // ldr q16, [x1, #(8*(512/8))] // .........e.........................................................................................................................................................|.............................................................................................................................e............................................... - // ldr q17, [x1, #(9*(512/8))] // ...............e...................................................................................................................................................|...................................................................................................................................e......................................... 
- // ldr q18, [x1, #(10*(512/8))] // ..............................e....................................................................................................................................|..................................................................................................................................................e.......................... - // ldr q19, [x1, #(11*(512/8))] // ............................e......................................................................................................................................|................................................................................................................................................e............................ - // ldr q20, [x1, #(12*(512/8))] // .............................................................e.....................................................................................................|............................................................................................................................................................................. - // ldr q21, [x1, #(13*(512/8))] // .......................................................e...........................................................................................................|...........................................................................................................................................................................e. - // ldr q22, [x1, #(14*(512/8))] // ...................................................................e...............................................................................................|............................................................................................................................................................................. 
- // ldr q23, [x1, #(15*(512/8))] // ..................................................................e................................................................................................|............................................................................................................................................................................. - // sub v24.4s, v8.4s, v9.4s // .................................................................e.................................................................................................|............................................................................................................................................................................. - // add v8.4s, v8.4s, v9.4s // ................................................................e..................................................................................................|............................................................................................................................................................................. - // mul v9.4s, v24.4s, v3.s[2] // .................................................................................e.................................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..................................................................................e................................................................................|............................................................................................................................................................................. 
- // mls v9.4s, v24.4s, v29.4s // ...........................................................................................e.......................................................................|............................................................................................................................................................................. - // sub v24.4s, v10.4s, v11.4s // ......................................................e............................................................................................................|..........................................................................................................................................................................e.. - // add v10.4s, v10.4s, v11.4s // ...................................................e...............................................................................................................|.......................................................................................................................................................................e..... - // mul v11.4s, v24.4s, v4.s[0] // ..............................................................e....................................................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.s[1] // ...............................................................e...................................................................................................|............................................................................................................................................................................. 
- // mls v11.4s, v24.4s, v29.4s // .........................................................................e.........................................................................................|............................................................................................................................................................................. - // sub v24.4s, v12.4s, v13.4s // ...................e...............................................................................................................................................|.......................................................................................................................................e..................................... - // add v12.4s, v12.4s, v13.4s // ..................e................................................................................................................................................|......................................................................................................................................e...................................... - // mul v13.4s, v24.4s, v4.s[2] // ............................................e......................................................................................................................|................................................................................................................................................................e............ - // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................e.....................................................................................................................|.................................................................................................................................................................e........... 
- // mls v13.4s, v24.4s, v29.4s // ..........................................................e........................................................................................................|............................................................................................................................................................................. - // sub v24.4s, v14.4s, v15.4s // ......................e............................................................................................................................................|..........................................................................................................................................e.................................. - // add v14.4s, v14.4s, v15.4s // ...........................e.......................................................................................................................................|...............................................................................................................................................e............................. - // mul v15.4s, v24.4s, v5.s[0] // .....................................e.............................................................................................................................|.........................................................................................................................................................e................... - // sqrdmulh v24.4s, v24.4s, v5.s[1] // .................................e.................................................................................................................................|.....................................................................................................................................................e....................... 
- // mls v15.4s, v24.4s, v29.4s // ....................................................e..............................................................................................................|........................................................................................................................................................................e.... - // sub v24.4s, v16.4s, v17.4s // ........................e..........................................................................................................................................|............................................................................................................................................e................................ - // add v16.4s, v16.4s, v17.4s // ..........................e........................................................................................................................................|..............................................................................................................................................e.............................. - // mul v17.4s, v24.4s, v5.s[2] // ........................................e..........................................................................................................................|............................................................................................................................................................e................ - // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................e.......................................................................................................................|...............................................................................................................................................................e............. 
- // mls v17.4s, v24.4s, v29.4s // ...........................................................e.......................................................................................................|............................................................................................................................................................................. - // sub v24.4s, v18.4s, v19.4s // .................................................e.................................................................................................................|.....................................................................................................................................................................e....... - // add v18.4s, v18.4s, v19.4s // ...............................................e...................................................................................................................|...................................................................................................................................................................e......... - // mul v19.4s, v24.4s, v6.s[0] // .........................................................e.........................................................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.s[1] // ............................................................e......................................................................................................|............................................................................................................................................................................. 
- // mls v19.4s, v24.4s, v29.4s // ......................................................................e............................................................................................|............................................................................................................................................................................. - // sub v24.4s, v20.4s, v21.4s // ...........................................................................e.......................................................................................|............................................................................................................................................................................. - // add v20.4s, v20.4s, v21.4s // ..........................................................................e........................................................................................|............................................................................................................................................................................. - // mul v21.4s, v24.4s, v6.s[2] // ...................................................................................e...............................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................e..............................................................................|............................................................................................................................................................................. 
- // mls v21.4s, v24.4s, v29.4s // .............................................................................................e.....................................................................|............................................................................................................................................................................. - // sub v24.4s, v22.4s, v23.4s // ..............................................................................e....................................................................................|............................................................................................................................................................................. - // add v22.4s, v22.4s, v23.4s // .............................................................................e.....................................................................................|............................................................................................................................................................................. - // mul v23.4s, v24.4s, v7.s[0] // ...............................................................................................e...................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................e..................................................................|............................................................................................................................................................................. 
- // mls v23.4s, v24.4s, v29.4s // ......................................................................................................e............................................................|............................................................................................................................................................................. - // sub v24.4s, v8.4s, v10.4s // ............................................................................e......................................................................................|............................................................................................................................................................................. - // add v8.4s, v8.4s, v10.4s // ........................................................................e..........................................................................................|............................................................................................................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // .........................................................................................e.........................................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........................................................................................e........................................................................|............................................................................................................................................................................. 
- // mls v10.4s, v24.4s, v29.4s // ..................................................................................................e................................................................|............................................................................................................................................................................. - // sub v24.4s, v9.4s, v11.4s // ....................................................................................................e..............................................................|............................................................................................................................................................................. - // add v9.4s, v9.4s, v11.4s // ...................................................................................................e...............................................................|............................................................................................................................................................................. - // mul v11.4s, v24.4s, v1.s[2] // .......................................................................................................e...........................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................................e..........................................................|............................................................................................................................................................................. 
- // mls v11.4s, v24.4s, v29.4s // ...........................................................................................................e.......................................................|............................................................................................................................................................................. - // sub v24.4s, v12.4s, v14.4s // ..........................................e........................................................................................................................|..............................................................................................................................................................e.............. - // add v12.4s, v12.4s, v14.4s // .......................................e...........................................................................................................................|...........................................................................................................................................................e................. - // mul v14.4s, v24.4s, v2.s[0] // ..................................................e................................................................................................................|......................................................................................................................................................................e...... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................e.............................................................................................................|.........................................................................................................................................................................e... 
- // mls v14.4s, v24.4s, v29.4s // ....................................................................e..............................................................................................|............................................................................................................................................................................. - // sub v24.4s, v13.4s, v15.4s // ..............................................................................................e....................................................................|............................................................................................................................................................................. - // add v13.4s, v13.4s, v15.4s // ............................................................................................e......................................................................|............................................................................................................................................................................. - // mul v15.4s, v24.4s, v2.s[0] // ............................................................................................................................e......................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................................e.......................................|............................................................................................................................................................................. 
- // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................e................................|............................................................................................................................................................................. - // sub v24.4s, v16.4s, v18.4s // .....................................................................e.............................................................................................|............................................................................................................................................................................. - // add v16.4s, v16.4s, v18.4s // .......................................................................e...........................................................................................|............................................................................................................................................................................. - // mul v18.4s, v24.4s, v2.s[2] // ..........................................................................................................e........................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................................................................................e.........................................................|............................................................................................................................................................................. 
- // mls v18.4s, v24.4s, v29.4s // ............................................................................................................e......................................................|............................................................................................................................................................................. - // sub v24.4s, v17.4s, v19.4s // .....................................................................................................................................e.............................|............................................................................................................................................................................. - // add v17.4s, v17.4s, v19.4s // .........................................................................................................................................e.........................|............................................................................................................................................................................. - // mul v19.4s, v24.4s, v2.s[2] // .................................................................................................................................................e.................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..................................................................................................................................................e................|............................................................................................................................................................................. 
- // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................e...........|............................................................................................................................................................................. - // sub v24.4s, v20.4s, v22.4s // ...............................................................................e...................................................................................|............................................................................................................................................................................. - // add v20.4s, v20.4s, v22.4s // ................................................................................e..................................................................................|............................................................................................................................................................................. - // mul v22.4s, v24.4s, v3.s[0] // .....................................................................................................e.............................................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................e.................................................................|............................................................................................................................................................................. 
- // mls v22.4s, v24.4s, v29.4s // ................................................................................................................e..................................................|............................................................................................................................................................................. - // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................e....................................................|............................................................................................................................................................................. - // add v21.4s, v21.4s, v23.4s // .............................................................................................................e.....................................................|............................................................................................................................................................................. - // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e...........................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................................e..........................|............................................................................................................................................................................. 
- // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................e...............|............................................................................................................................................................................. - // sub v24.4s, v8.4s, v12.4s // .....................................................................................e.............................................................................|............................................................................................................................................................................. - // add v8.4s, v8.4s, v12.4s // ......................................................................................e............................................................................|............................................................................................................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e..............................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................e...............................|............................................................................................................................................................................. 
- // mls v12.4s, v24.4s, v29.4s // ............................................................................................................................................e......................|............................................................................................................................................................................. - // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................e............................................|............................................................................................................................................................................. - // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e.........................................|............................................................................................................................................................................. - // mul v13.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e..............|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................e.............|............................................................................................................................................................................. 
- // mls v13.4s, v24.4s, v29.4s // ...........................................................................................................................................................e.......|............................................................................................................................................................................. - // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................e...............................................|............................................................................................................................................................................. - // add v10.4s, v10.4s, v14.4s // ..................................................................................................................e................................................|............................................................................................................................................................................. - // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................e........................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e.....................................|............................................................................................................................................................................. 
- // mls v14.4s, v24.4s, v29.4s // .................................................................................................................................e.................................|............................................................................................................................................................................. - // sub v24.4s, v11.4s, v15.4s // .............................................................................................................................................e.....................|............................................................................................................................................................................. - // add v11.4s, v11.4s, v15.4s // ..............................................................................................................................................e....................|............................................................................................................................................................................. - // mul v15.4s, v24.4s, v0.s[2] // ...................................................................................................................................................................|....*........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................................|.....*....................................................................................................................................................................... 
- // mls v15.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............*............................................................................................................................................................. - // sub v24.4s, v16.4s, v20.4s // .......................................................................................e...........................................................................|............................................................................................................................................................................. - // add v16.4s, v16.4s, v20.4s // ........................................................................................e..........................................................................|............................................................................................................................................................................. - // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................e...................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................................................e..................................|............................................................................................................................................................................. 
- // mls v20.4s, v24.4s, v29.4s // ......................................................................................................................................e............................|............................................................................................................................................................................. - // sub v24.4s, v17.4s, v21.4s // ................................................................................................................................................e..................|............................................................................................................................................................................. - // add v17.4s, v17.4s, v21.4s // ...............................................................................................................................................e...................|............................................................................................................................................................................. - // mul v21.4s, v24.4s, v1.s[0] // .........................................................................................................................................................e.........|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................................................e..........|............................................................................................................................................................................. 
- // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................................................................e...|............................................................................................................................................................................. - // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................e..........................................|............................................................................................................................................................................. - // add v18.4s, v18.4s, v22.4s // .......................................................................................................................e...........................................|............................................................................................................................................................................. - // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e........................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................e.......................|............................................................................................................................................................................. 
- // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e............|............................................................................................................................................................................. - // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e|............................................................................................................................................................................. - // add v19.4s, v19.4s, v23.4s // .................................................................................................................................................................e.|............................................................................................................................................................................. - // mul v23.4s, v24.4s, v1.s[0] // ...................................................................................................................................................................|.............*............................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................................|................*............................................................................................................................................................ 
- // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|.........................*................................................................................................................................................... - // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................e...................................................|............................................................................................................................................................................. - // add v8.4s, v8.4s, v16.4s // .................................................................................................................e.................................................|............................................................................................................................................................................. - // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................e..............................................|............................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................e.............................................|............................................................................................................................................................................. 
- // mls v16.4s, v24.4s, v29.4s // ..............................................................................................................................e....................................|............................................................................................................................................................................. - // sub v24.4s, v9.4s, v17.4s // ...................................................................................................................................................................|*............................................................................................................................................................................ - // add v9.4s, v9.4s, v17.4s // ...................................................................................................................................................................|...*......................................................................................................................................................................... - // mul v17.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........................*.................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................*..................................................................................................................................................... 
- // mls v17.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................*................................................................................................................................................ - // sub v24.4s, v10.4s, v18.4s // .............................................................................................................................................................e.....|............................................................................................................................................................................. - // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................e......|............................................................................................................................................................................. - // mul v18.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|.....................*....................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|......................*...................................................................................................................................................... 
- // mls v18.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................*............................................................................................................................................. - // sub v24.4s, v11.4s, v19.4s // ...................................................................................................................................................................|.................*........................................................................................................................................................... - // add v11.4s, v11.4s, v19.4s // ...................................................................................................................................................................|..................*.......................................................................................................................................................... - // mul v19.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................................*........................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.....................................*....................................................................................................................................... 
- // mls v19.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................................*................................................................................................................................ - // sub v24.4s, v12.4s, v20.4s // ...................................................................................................................................................................|.*........................................................................................................................................................................... - // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................|..*.......................................................................................................................................................................... - // mul v20.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........*.................................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.........*................................................................................................................................................................... 
- // mls v20.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...................*......................................................................................................................................................... - // sub v24.4s, v13.4s, v21.4s // ...................................................................................................................................................................|......*...................................................................................................................................................................... - // add v13.4s, v13.4s, v21.4s // ...................................................................................................................................................................|.......*..................................................................................................................................................................... - // mul v21.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................*........................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|..............*.............................................................................................................................................................. 
- // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...........................*................................................................................................................................................. - // sub v24.4s, v14.4s, v22.4s // ...................................................................................................................................................................|..........*.................................................................................................................................................................. - // add v14.4s, v14.4s, v22.4s // ...................................................................................................................................................................|...........*................................................................................................................................................................. - // mul v22.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|..........................*.................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|................................*............................................................................................................................................ 
- // mls v22.4s, v24.4s, v29.4s // ...................................................................................................................................................................|..........................................*.................................................................................................................................. - // sub v24.4s, v15.4s, v23.4s // ...................................................................................................................................................................|..................................*.......................................................................................................................................... - // add v15.4s, v15.4s, v23.4s // ...................................................................................................................................................................|.................................*........................................................................................................................................... - // mul v23.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|......................................*...................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................................*..................................................................................................................................... 
- // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................................*............................................................................................................................. - // cmge v27.4s, v31.4s, v16.4s // ..........................................................................................................................................................e........|............................................................................................................................................................................. - // cmge v28.4s, v16.4s, v30.4s // ..............................................................................................................................................................e....|............................................................................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ................................................................................................................................................................e..|............................................................................................................................................................................. - // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................*............................................................................................................................................................................. 
- // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|.................................................*........................................................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ...................................................................................................................................................................|..................................................*.......................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................*....................................................................................................................... - // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................|...........................................................*................................................................................................................. - // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................*................................................................................................................................. 
- // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|.........................................*................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................*............................................................................................................................ - // mls v18.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.......................................................*..................................................................................................................... - // cmge v27.4s, v31.4s, v19.4s // ...................................................................................................................................................................|..........................................................*.................................................................................................................. - // cmge v28.4s, v19.4s, v30.4s // ...................................................................................................................................................................|......................................................*...................................................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.............................................................*............................................................................................................... - // mls v19.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.........................................................................*................................................................................................... - // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|.............................*............................................................................................................................................... - // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|..............................*.............................................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...................................*......................................................................................................................................... 
- // mls v20.4s, v28.4s, v29.4s // ...................................................................................................................................................................|........................................*.................................................................................................................................... - // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.......................................................................................*..................................................................................... - // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|..............................................................................*.............................................................................................. - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...........................................................................................*................................................................................. - // mls v21.4s, v28.4s, v29.4s // ...................................................................................................................................................................|......................................................................................................*...................................................................... 
- // cmge v27.4s, v31.4s, v22.4s // ...................................................................................................................................................................|....................................................................*........................................................................................................ - // cmge v28.4s, v22.4s, v30.4s // ...................................................................................................................................................................|...................................................................*......................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|............................................................................*................................................................................................ - // mls v22.4s, v28.4s, v29.4s // ...................................................................................................................................................................|....................................................................................*........................................................................................ - // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|........................................................*.................................................................................................................... 
- // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|.........................................................*................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................*............................................................................................................. - // mls v23.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.....................................................................*....................................................................................................... - // str q16, [x1, #(8*(512/8))] // ...................................................................................................................................................................|............*................................................................................................................................................................ - // str q17, [x1, #(9*(512/8))] // ...................................................................................................................................................................|.......................................................................*..................................................................................................... 
- // str q18, [x1, #(10*(512/8))] // ...................................................................................................................................................................|.................................................................*........................................................................................................... - // str q19, [x1, #(11*(512/8))] // ...................................................................................................................................................................|...................................................................................*......................................................................................... - // str q20, [x1, #(12*(512/8))] // ...................................................................................................................................................................|...................................................*......................................................................................................................... - // str q21, [x1, #(13*(512/8))] // ...................................................................................................................................................................|.................................................................................................................*........................................................... - // str q22, [x1, #(14*(512/8))] // ...................................................................................................................................................................|.................................................................................................*........................................................................... 
- // str q23, [x1, #(15*(512/8))] // ...................................................................................................................................................................|..................................................................................*.......................................................................................... - // mul v16.4s, v8.4s, v25.4s // ...................................................................................................................................................................|.............................................*............................................................................................................................... - // sqrdmulh v8.4s, v8.4s, v26.4s // ...................................................................................................................................................................|..............................................*.............................................................................................................................. - // mls v16.4s, v8.4s, v29.4s // ...................................................................................................................................................................|....................................................*........................................................................................................................ - // mul v17.4s, v9.4s, v25.4s // ...................................................................................................................................................................|........................................................................*.................................................................................................... 
- // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................................................................................................................................|..........................................................................*.................................................................................................. - // mls v17.4s, v9.4s, v29.4s // ...................................................................................................................................................................|....................................................................................................*........................................................................ - // mul v18.4s, v10.4s, v25.4s // ...................................................................................................................................................................|.............................................................................*............................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ...................................................................................................................................................................|......................................................................*...................................................................................................... - // mls v18.4s, v10.4s, v29.4s // ...................................................................................................................................................................|.............................................................................................*............................................................................... 
- // mul v19.4s, v11.4s, v25.4s // ...................................................................................................................................................................|........................................................................................*.................................................................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ...................................................................................................................................................................|...............................................................................*............................................................................................. - // mls v19.4s, v11.4s, v29.4s // ...................................................................................................................................................................|.......................................................................................................*..................................................................... - // mul v20.4s, v12.4s, v25.4s // ...................................................................................................................................................................|................................................................*............................................................................................................ - // sqrdmulh v12.4s, v12.4s, v26.4s // ...................................................................................................................................................................|..................................................................*.......................................................................................................... 
- // mls v20.4s, v12.4s, v29.4s // ...................................................................................................................................................................|.................................................................................*........................................................................................... - // mul v21.4s, v13.4s, v25.4s // ...................................................................................................................................................................|............................................................*................................................................................................................ - // sqrdmulh v13.4s, v13.4s, v26.4s // ...................................................................................................................................................................|..............................................................*.............................................................................................................. - // mls v21.4s, v13.4s, v29.4s // ...................................................................................................................................................................|...........................................................................*................................................................................................. - // mul v22.4s, v14.4s, v25.4s // ...................................................................................................................................................................|..................................................................................................*.......................................................................... 
- // sqrdmulh v14.4s, v14.4s, v26.4s // ...................................................................................................................................................................|..............................................................................................*.............................................................................. - // mls v22.4s, v14.4s, v29.4s // ...................................................................................................................................................................|..............................................................................................................*.............................................................. - // mul v23.4s, v15.4s, v25.4s // ...................................................................................................................................................................|.....................................................................................*....................................................................................... - // sqrdmulh v15.4s, v15.4s, v26.4s // ...................................................................................................................................................................|................................................................................*............................................................................................ - // mls v23.4s, v15.4s, v29.4s // ...................................................................................................................................................................|...................................................................................................*......................................................................... 
- // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................................................................................|..........................................................................................*.................................................................................. - // cmge v28.4s, v16.4s, v30.4s // ...................................................................................................................................................................|......................................................................................*...................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................*............................................................................. - // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................|..........................................................................................................*.................................................................. - // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|...................................................................................................................*......................................................... 
- // cmge v28.4s, v17.4s, v30.4s // .*.................................................................................................................................................................|.....................................................................................................................*....................................................... - // sub v28.4s, v27.4s, v28.4s // ...........*.......................................................................................................................................................|...............................................................................................................................*............................................. - // mls v17.4s, v28.4s, v29.4s // .................*.................................................................................................................................................|.....................................................................................................................................*....................................... - // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................................................................................*................................................................. - // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|........................................................................................................*.................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................................................................................*............................................................ - // mls v18.4s, v28.4s, v29.4s // ..........*........................................................................................................................................................|..............................................................................................................................*.............................................. - // cmge v27.4s, v31.4s, v19.4s // .............*.....................................................................................................................................................|.................................................................................................................................*........................................... - // cmge v28.4s, v19.4s, v30.4s // ........*..........................................................................................................................................................|............................................................................................................................*................................................ - // sub v28.4s, v27.4s, v28.4s // ....................*..............................................................................................................................................|........................................................................................................................................*.................................... 
- // mls v19.4s, v28.4s, v29.4s // ................................*..................................................................................................................................|....................................................................................................................................................*........................ - // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|............................................................................................*................................................................................ - // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|.........................................................................................*................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................................................................*....................................................................... - // mls v20.4s, v28.4s, v29.4s // ...*...............................................................................................................................................................|.......................................................................................................................*..................................................... 
- // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.........................................................................................................*................................................................... - // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|................................................................................................*............................................................................ - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................................*............................................................. - // mls v21.4s, v28.4s, v29.4s // ................*..................................................................................................................................................|....................................................................................................................................*........................................ - // cmge v27.4s, v31.4s, v22.4s // ............*......................................................................................................................................................|................................................................................................................................*............................................ 
- // cmge v28.4s, v22.4s, v30.4s // ......*............................................................................................................................................................|..........................................................................................................................*.................................................. - // sub v28.4s, v27.4s, v28.4s // .......................*...........................................................................................................................................|...........................................................................................................................................*................................. - // mls v22.4s, v28.4s, v29.4s // ..............................................*....................................................................................................................|..................................................................................................................................................................*.......... - // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|.............................................................................................................*............................................................... - // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|............................................................................................................*................................................................ 
- // sub v28.4s, v27.4s, v28.4s // ....*..............................................................................................................................................................|........................................................................................................................*.................................................... - // mls v23.4s, v28.4s, v29.4s // ..............*....................................................................................................................................................|..................................................................................................................................*.......................................... - // str q16, [x1], #(16) // ...................................................................................................................................................................|..................................................................................................................*.......................................................... - // str q17, [x1, #(-16 + 1*(512/8))] // ..................................*................................................................................................................................|......................................................................................................................................................*...................... - // str q18, [x1, #(-16 + 2*(512/8))] // .....................*.............................................................................................................................................|.........................................................................................................................................*................................... 
- // str q19, [x1, #(-16 + 3*(512/8))] // ................................................*..................................................................................................................|....................................................................................................................................................................*........ - // str q20, [x1, #(-16 + 4*(512/8))] // .........................*.........................................................................................................................................|.............................................................................................................................................*............................... - // str q21, [x1, #(-16 + 5*(512/8))] // ......................................*............................................................................................................................|..........................................................................................................................................................*.................. - // str q22, [x1, #(-16 + 6*(512/8))] // ........................................................*..........................................................................................................|............................................................................................................................................................................* - // str q23, [x1, #(-16 + 7*(512/8))] // .........................................*.........................................................................................................................|.............................................................................................................................................................*............... 
+ // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // mls v24.4S, v19.4S, v29.4S // *....................................................................................................................................................................................................................................................................................... + // mls v22.4S, v21.4S, v29.4S // ..*..................................................................................................................................................................................................................................................................................... + // cmge v21.4S, v31.4S, v16.4S // ...*.................................................................................................................................................................................................................................................................................... + // cmge v17.4S, v16.4S, v30.4S // .*...................................................................................................................................................................................................................................................................................... 
+ // sqrdmulh v19.4S, v28.4S, v0.S[1] // .......*................................................................................................................................................................................................................................................................................ + // mul v28.4S, v28.4S, v0.S[0] // ......*................................................................................................................................................................................................................................................................................. + // sub v21.4S, v21.4S, v17.4S // ........*............................................................................................................................................................................................................................................................................... + // sub v17.4S, v14.4S, v24.4S // ..........*............................................................................................................................................................................................................................................................................. + // add v14.4S, v14.4S, v24.4S // .........*.............................................................................................................................................................................................................................................................................. + // sub v24.4S, v13.4S, v22.4S // .................*...................................................................................................................................................................................................................................................................... 
+ // add v13.4S, v13.4S, v22.4S // ..................*..................................................................................................................................................................................................................................................................... + // mls v16.4S, v21.4S, v29.4S // ...........*............................................................................................................................................................................................................................................................................ + // mls v28.4S, v19.4S, v29.4S // ..............*......................................................................................................................................................................................................................................................................... + // mul v19.4S, v18.4S, v1.S[0] // ................*....................................................................................................................................................................................................................................................................... + // sqrdmulh v21.4S, v18.4S, v1.S[1] // ...............*........................................................................................................................................................................................................................................................................ + // mul v22.4S, v17.4S, v0.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ // sqrdmulh v17.4S, v17.4S, v0.S[1] // ..................................*..................................................................................................................................................................................................................................................... + // str q16, [x1, #512] // .....................*.................................................................................................................................................................................................................................................................. + // mls v19.4S, v21.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + // cmge v21.4S, v28.4S, v30.4S // ........................*............................................................................................................................................................................................................................................................... + // cmge v18.4S, v31.4S, v28.4S // .......................*................................................................................................................................................................................................................................................................ + // sqrdmulh v16.4S, v20.4S, v0.S[1] // ............*........................................................................................................................................................................................................................................................................... 
+ // mul v20.4S, v20.4S, v0.S[0] // .............*.......................................................................................................................................................................................................................................................................... + // mls v22.4S, v17.4S, v29.4S // ............................................*........................................................................................................................................................................................................................................... + // sub v21.4S, v18.4S, v21.4S // ...........................*............................................................................................................................................................................................................................................................ + // sub v18.4S, v15.4S, v10.4S // ....*................................................................................................................................................................................................................................................................................... + // add v10.4S, v15.4S, v10.4S // .....*.................................................................................................................................................................................................................................................................................. + // sub v15.4S, v11.4S, v27.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ // mls v28.4S, v21.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + // mls v20.4S, v16.4S, v29.4S // ......................*................................................................................................................................................................................................................................................................. + // cmge v21.4S, v22.4S, v30.4S // ...................................................................*.................................................................................................................................................................................................................... + // cmge v17.4S, v31.4S, v22.4S // ..................................................................*..................................................................................................................................................................................................................... + // add v16.4S, v23.4S, v19.4S // ....................................*................................................................................................................................................................................................................................................... + // sub v23.4S, v23.4S, v19.4S // .....................................*.................................................................................................................................................................................................................................................. 
+ // mul v19.4S, v24.4S, v0.S[0] // ......................................*................................................................................................................................................................................................................................................. + // sqrdmulh v24.4S, v24.4S, v0.S[1] // ........................................*............................................................................................................................................................................................................................................... + // add v11.4S, v11.4S, v27.4S // .............................................*.......................................................................................................................................................................................................................................... + // sub v27.4S, v17.4S, v21.4S // .......................................................................*................................................................................................................................................................................................................ + // sqrdmulh v21.4S, v14.4S, v26.4S // ..........................*............................................................................................................................................................................................................................................................. + // mul v14.4S, v14.4S, v25.4S // ............................*........................................................................................................................................................................................................................................................... 
+ // str q28, [x1, #576] // .........................................*.............................................................................................................................................................................................................................................. + // cmge v28.4S, v20.4S, v30.4S // ...............................*........................................................................................................................................................................................................................................................ + // cmge v17.4S, v31.4S, v20.4S // ..............................*......................................................................................................................................................................................................................................................... + // mls v22.4S, v27.4S, v29.4S // ..............................................................................*......................................................................................................................................................................................................... + // sqrdmulh v27.4S, v18.4S, v0.S[1] // ...............................................*........................................................................................................................................................................................................................................ + // mul v18.4S, v18.4S, v0.S[0] // ...........................................*............................................................................................................................................................................................................................................ 
+ // mls v19.4S, v24.4S, v29.4S // ....................................................*................................................................................................................................................................................................................................... + // mls v14.4S, v21.4S, v29.4S // .......................................*................................................................................................................................................................................................................................................ + // mul v21.4S, v15.4S, v0.S[0] // ..................................................*..................................................................................................................................................................................................................................... + // sqrdmulh v15.4S, v15.4S, v0.S[1] // ................................................*....................................................................................................................................................................................................................................... + // sub v17.4S, v17.4S, v28.4S // ...................................*.................................................................................................................................................................................................................................................... + // sqrdmulh v24.4S, v10.4S, v26.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ // mul v10.4S, v10.4S, v25.4S // ...................................................*.................................................................................................................................................................................................................................... + // sqrdmulh v28.4S, v23.4S, v0.S[1] // ...........................................................*............................................................................................................................................................................................................................ + // mul v23.4S, v23.4S, v0.S[0] // ............................................................*........................................................................................................................................................................................................................... + // str q22, [x1, #896] // ........................................................................................*............................................................................................................................................................................................... + // sqrdmulh v22.4S, v11.4S, v26.4S // .....................................................*.................................................................................................................................................................................................................................. + // mls v20.4S, v17.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... 
+ // cmge v17.4S, v19.4S, v30.4S // .....................................................................*.................................................................................................................................................................................................................. + // mls v21.4S, v15.4S, v29.4S // ..........................................................*............................................................................................................................................................................................................................. + // mls v10.4S, v24.4S, v29.4S // ........................................................................*............................................................................................................................................................................................................... + // cmge v24.4S, v31.4S, v19.4S // ......................................................................*................................................................................................................................................................................................................. + // sqrdmulh v15.4S, v13.4S, v26.4S // .................................................*...................................................................................................................................................................................................................................... + // mul v13.4S, v13.4S, v25.4S // .......................................................*................................................................................................................................................................................................................................ 
+ // mls v18.4S, v27.4S, v29.4S // .............................................................*.......................................................................................................................................................................................................................... + // sqrdmulh v27.4S, v8.4S, v26.4S // ...................*.................................................................................................................................................................................................................................................................... + // mul v8.4S, v8.4S, v25.4S // ....................*................................................................................................................................................................................................................................................................... + // mul v11.4S, v11.4S, v25.4S // ......................................................*................................................................................................................................................................................................................................. + // sub v24.4S, v24.4S, v17.4S // ..........................................................................*............................................................................................................................................................................................................. + // mls v23.4S, v28.4S, v29.4S // ....................................................................*................................................................................................................................................................................................................... 
+ // cmge v28.4S, v21.4S, v30.4S // .......................................................................................*................................................................................................................................................................................................ + // str q20, [x1, #768] // ........................................................*............................................................................................................................................................................................................................... + // cmge v20.4S, v31.4S, v21.4S // .....................................................................................*.................................................................................................................................................................................................. + // mls v13.4S, v15.4S, v29.4S // .........................................................*.............................................................................................................................................................................................................................. + // mul v15.4S, v16.4S, v25.4S // ...............................................................*........................................................................................................................................................................................................................ + // sqrdmulh v17.4S, v12.4S, v26.4S // ...........................................................................*............................................................................................................................................................................................................ 
+ // mul v12.4S, v12.4S, v25.4S // ............................................................................*........................................................................................................................................................................................................... + // sub v28.4S, v20.4S, v28.4S // ..............................................................................................*......................................................................................................................................................................................... + // mls v8.4S, v27.4S, v29.4S // ................................*....................................................................................................................................................................................................................................................... + // cmge v27.4S, v31.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... + // cmge v20.4S, v18.4S, v30.4S // .........................................................................*.............................................................................................................................................................................................................. + // mls v19.4S, v24.4S, v29.4S // ..................................................................................*..................................................................................................................................................................................................... 
+ // cmge v24.4S, v23.4S, v30.4S // ................................................................................*....................................................................................................................................................................................................... + // mls v11.4S, v22.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + // cmge v22.4S, v8.4S, v30.4S // ...............................................................................................................................*........................................................................................................................................................ + // sub v27.4S, v27.4S, v20.4S // ...................................................................................*.................................................................................................................................................................................................... + // mls v21.4S, v28.4S, v29.4S // ...........................................................................................................*............................................................................................................................................................................ + // str q19, [x1, #832] // ......................................................................................*................................................................................................................................................................................................. 
+ // cmge v20.4S, v31.4S, v23.4S // .................................................................................*...................................................................................................................................................................................................... + // sqrdmulh v19.4S, v16.4S, v26.4S // ................................................................*....................................................................................................................................................................................................................... + // cmge v16.4S, v13.4S, v30.4S // .........................................................................................*.............................................................................................................................................................................................. + // cmge v28.4S, v31.4S, v13.4S // ........................................................................................................*............................................................................................................................................................................... + // mls v12.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + // sub v17.4S, v20.4S, v24.4S // ..........................................................................................*............................................................................................................................................................................................. 
+ // sqrdmulh v24.4S, v9.4S, v26.4S // ............................................................................................*........................................................................................................................................................................................... + // mul v9.4S, v9.4S, v25.4S // .............................................................................................*.......................................................................................................................................................................................... + // str q21, [x1, #704] // ..................................................................................................................*..................................................................................................................................................................... + // sub v21.4S, v28.4S, v16.4S // .........................................................................................................................*.............................................................................................................................................................. + // mls v18.4S, v27.4S, v29.4S // ...........................................................................................*............................................................................................................................................................................................ + // cmge v16.4S, v10.4S, v30.4S // ................................................................................................*....................................................................................................................................................................................... 
+ // cmge v20.4S, v31.4S, v8.4S // ....................................................................................................................................*................................................................................................................................................... + // mls v23.4S, v17.4S, v29.4S // ...................................................................................................*.................................................................................................................................................................................... + // cmge v17.4S, v14.4S, v30.4S // ....................................................................................................................*................................................................................................................................................................... + // mls v15.4S, v19.4S, v29.4S // ...............................................................................*........................................................................................................................................................................................................ + // cmge v19.4S, v31.4S, v10.4S // .................................................................................................*...................................................................................................................................................................................... + // cmge v28.4S, v11.4S, v30.4S // ..................................................................................................*..................................................................................................................................................................................... 
+ // sub v27.4S, v20.4S, v22.4S // ...................................................................................................................................................*.................................................................................................................................... + // cmge v20.4S, v31.4S, v12.4S // ..........................................................................................................*............................................................................................................................................................................. + // mls v9.4S, v24.4S, v29.4S // .......................................................................................................*................................................................................................................................................................................ + // sub v16.4S, v19.4S, v16.4S // ....................................................................................................*................................................................................................................................................................................... + // str q18, [x1, #640] // .....................................................................................................*.................................................................................................................................................................................. + // cmge v18.4S, v12.4S, v30.4S // ............................................................................................................*........................................................................................................................................................................... 
+ // str q23, [x1, #960] // .............................................................................................................*.......................................................................................................................................................................... + // mls v10.4S, v16.4S, v29.4S // ...............................................................................................................*........................................................................................................................................................................ + // cmge v22.4S, v31.4S, v14.4S // .........................................................................................................*.............................................................................................................................................................................. + // cmge v16.4S, v31.4S, v11.4S // ......................................................................................................*................................................................................................................................................................................. + // cmge v23.4S, v9.4S, v30.4S // .......................................................................................................................*................................................................................................................................................................ + // sub v18.4S, v20.4S, v18.4S // .................................................................................................................*...................................................................................................................................................................... 
+ // cmge v20.4S, v31.4S, v9.4S // ................................................................................................................*....................................................................................................................................................................... + // cmge v19.4S, v31.4S, v15.4S // ....................................................................................*................................................................................................................................................................................................... + // cmge v24.4S, v15.4S, v30.4S // ......................................................................................................................*................................................................................................................................................................. + // mls v8.4S, v27.4S, v29.4S // ..................................................................................................................................................................*..................................................................................................................... + // sub v27.4S, v22.4S, v17.4S // ..............................................................................................................................*......................................................................................................................................................... + // sub v22.4S, v16.4S, v28.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ // mls v13.4S, v21.4S, v29.4S // .....................................................................................................................................*.................................................................................................................................................. + // mls v12.4S, v18.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + // str q10, [x1, #128] // ...................................................................................................................*.................................................................................................................................................................... + // sub v21.4S, v20.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + // sub v18.4S, v19.4S, v24.4S // ................................................................................................................................*....................................................................................................................................................... + // mls v11.4S, v22.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... 
+ // mls v14.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + // mls v15.4S, v18.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + // mls v9.4S, v21.4S, v29.4S // .........................................................................................................................................*.............................................................................................................................................. + // str q8, [x1], #(16) // .......................................................................................................................................................................*................................................................................................................ + // str q13, [x1, #304] // ............................................................................................................................................................*........................................................................................................................... + // str q12, [x1, #240] // ...........................................................................................................................................*............................................................................................................................................ 
+ // str q11, [x1, #176] // ......................................................................................................................................*................................................................................................................................................. + // str q14, [x1, #368] // .........................................................................................................................................................*.............................................................................................................................. + // str q15, [x1, #432] // ..............................................................................................................................................*......................................................................................................................................... + // str q9, [x1, #48] // ...............................................................................................................................................*........................................................................................................................................ + // ldr q21, [x1, #576] // ...........................................................................................................................*............................................................................................................................................................ + // ldr q22, [x1, #512] // ............................................................................................................................*........................................................................................................................................................... 
+ // ldr q18, [x1, #128] // ..........................................................................................................................................................*............................................................................................................................. + // ldr q8, [x1, #192] // ...........................................................................................................................................................*............................................................................................................................ + // ldr q14, [x1, #384] // ...............................................................................................................................................................*........................................................................................................................ + // ldr q27, [x1, #448] // ............................................................................................................................................*........................................................................................................................................... + // ldr q24, [x1, #704] // .....................................................................................................................*.................................................................................................................................................................. + // ldr q15, [x1, #640] // ..........................................................................................................................*............................................................................................................................................................. 
+ // ldr q13, [x1, #64] // ..................................................................................................................................*..................................................................................................................................................... + // ldr q19, [x1, #0] // .................................................................................................................................*...................................................................................................................................................... + // ldr q16, [x1, #256] // ................................................................................................................................................*....................................................................................................................................... + // ldr q9, [x1, #960] // ...................................................................................................................................................................*.................................................................................................................... + // sub v20.4S, v22.4S, v21.4S // ..........................................................................................................................................*............................................................................................................................................. + // add v23.4S, v22.4S, v21.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ // ldr q10, [x1, #320] // ........................................................................................................................................................*............................................................................................................................... + // ldr q28, [x1, #896] // .................................................................................................................................................................*...................................................................................................................... + // sub v11.4S, v18.4S, v8.4S // ...........................................................................................................................................................................*............................................................................................................ + // add v21.4S, v18.4S, v8.4S // ..........................................................................................................................................................................*............................................................................................................. + // add v8.4S, v14.4S, v27.4S // .........................................................................................................................................................................................*.............................................................................................. + // sub v14.4S, v14.4S, v27.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ // add v18.4S, v15.4S, v24.4S // .......................................................................................................................................................*................................................................................................................................ + // sub v22.4S, v15.4S, v24.4S // ..................................................................................................................................................*..................................................................................................................................... + // mul v17.4S, v20.4S, v5.S[2] // .....................................................................................................................................................*.................................................................................................................................. + // sqrdmulh v12.4S, v20.4S, v5.S[3] // ......................................................................................................................................................*................................................................................................................................. + // add v27.4S, v19.4S, v13.4S // ....................................................................................................................................................*................................................................................................................................... + // sub v19.4S, v19.4S, v13.4S // .................................................................................................................................................*...................................................................................................................................... 
+ // sqrdmulh v24.4S, v11.4S, v4.S[1] // ..................................................................................................................................................................................*..................................................................................................... + // mul v11.4S, v11.4S, v4.S[0] // .................................................................................................................................................................................*...................................................................................................... + // mul v20.4S, v22.4S, v6.S[0] // ................................................................................................................................................................*....................................................................................................................... + // sqrdmulh v15.4S, v22.4S, v6.S[1] // ..............................................................................................................................................................*......................................................................................................................... + // sqrdmulh v22.4S, v19.4S, v3.S[3] // .............................................................................................................................................................*.......................................................................................................................... + // mul v19.4S, v19.4S, v3.S[2] // ......................................................................................................................................................................*................................................................................................................. 
+ // mls v11.4S, v24.4S, v29.4S // .............................................................................................................................................................................................*.......................................................................................... + // sqrdmulh v13.4S, v14.4S, v5.S[1] // ..........................................................................................................................................................................................................*............................................................................. + // mul v24.4S, v14.4S, v5.S[0] // ...........................................................................................................................................................................................................*............................................................................ + // sub v14.4S, v16.4S, v10.4S // .............................................................................................................................................................................*.......................................................................................................... + // mls v17.4S, v12.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + // sub v12.4S, v28.4S, v9.4S // ........................................................................................................................................................................*............................................................................................................... 
+ // add v28.4S, v28.4S, v9.4S // .........................................................................................................................................................................*.............................................................................................................. + // mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + // sqrdmulh v9.4S, v14.4S, v4.S[3] // .......................................................................................................................................................................................*................................................................................................ + // mul v22.4S, v14.4S, v4.S[2] // ......................................................................................................................................................................................*................................................................................................. + // ldr q14, [x1, #768] // ........................................................................................................................................................................................*............................................................................................... + // mls v20.4S, v15.4S, v29.4S // ..............................................................................................................................................................................*......................................................................................................... 
+ // add v15.4S, v16.4S, v10.4S // ............................................................................................................................................................................*........................................................................................................... + // sub v10.4S, v27.4S, v21.4S // ...............................................................................................................................................................................*........................................................................................................ + // add v27.4S, v27.4S, v21.4S // ................................................................................................................................................................................*....................................................................................................... + // ldr q21, [x1, #832] // .....................................................................................................................................................................................*.................................................................................................. + // mls v24.4S, v13.4S, v29.4S // ..................................................................................................................................................................................................................*..................................................................... + // sub v13.4S, v23.4S, v18.4S // ....................................................................................................................................................................................*................................................................................................... 
+ // add v16.4S, v23.4S, v18.4S // ....................................................................................................................................................................*................................................................................................................... + // sub v18.4S, v15.4S, v8.4S // .................................................................................................................................................................................................*...................................................................................... + // mls v22.4S, v9.4S, v29.4S // ................................................................................................................................................................................................*....................................................................................... + // add v8.4S, v15.4S, v8.4S // ..................................................................................................................................................................................................*..................................................................................... + // add v15.4S, v19.4S, v11.4S // .......................................................................................................................................................................................................................*................................................................ + // sub v19.4S, v19.4S, v11.4S // ......................................................................................................................................................................................................................*................................................................. 
+ // mul v23.4S, v10.4S, v1.S[2] // ...................................................................................................................................................................................................*.................................................................................... + // sub v11.4S, v14.4S, v21.4S // .....................................................................................................................................................................................................*.................................................................................. + // sqrdmulh v10.4S, v10.4S, v1.S[3] // .......................................................................................................................................................................................................*................................................................................ + // add v21.4S, v14.4S, v21.4S // ......................................................................................................................................................................................................*................................................................................. + // sqrdmulh v14.4S, v18.4S, v2.S[1] // ...................................................................................................................................................................................................................................................*.................................... + // mul v18.4S, v18.4S, v2.S[0] // ..................................................................................................................................................................................................................................................*..................................... 
+ // add v9.4S, v21.4S, v28.4S // ............................................................................................................................................................................................................*........................................................................... + // sub v28.4S, v21.4S, v28.4S // .............................................................................................................................................................................................................*.......................................................................... + // mul v21.4S, v11.4S, v6.S[2] // ..............................................................................................................................................................................................................*......................................................................... + // mls v23.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................*....................................................................... + // sqrdmulh v10.4S, v11.4S, v6.S[3] // ...............................................................................................................................................................................................................*........................................................................ + // sub v11.4S, v22.4S, v24.4S // ................................................................................................................................................................................................................................*....................................................... 
+ // add v22.4S, v22.4S, v24.4S // .................................................................................................................................................................................................................................*...................................................... + // mls v18.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + // mul v24.4S, v12.4S, v7.S[0] // ...............................................................................................................................................................................................*........................................................................................ + // sqrdmulh v14.4S, v12.4S, v7.S[1] // ..............................................................................................................................................................................................*......................................................................................... + // sub v12.4S, v27.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... + // mls v21.4S, v10.4S, v29.4S // ........................................................................................................................................................................................................................*............................................................... 
+ // sqrdmulh v10.4S, v19.4S, v1.S[3] // ................................................................................................................................................................................................................................................*....................................... + // add v8.4S, v27.4S, v8.4S // .........................................................................................................................................................................................................*.............................................................................. + // mul v27.4S, v19.4S, v1.S[2] // ...............................................................................................................................................................................................................................................*........................................ + // mls v24.4S, v14.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + // sqrdmulh v14.4S, v28.4S, v3.S[1] // .....................................................................................................................................................................................................................................*.................................................. + // mul v19.4S, v28.4S, v3.S[0] // ....................................................................................................................................................................................................................................*................................................... 
+ // add v28.4S, v16.4S, v9.4S // ....................................................................................................................................................................................................................*................................................................... + // sub v16.4S, v16.4S, v9.4S // ...................................................................................................................................................................................................................*.................................................................... + // sub v9.4S, v17.4S, v20.4S // ...........................................................................................................................................................................................*............................................................................................ + // add v17.4S, v17.4S, v20.4S // ............................................................................................................................................................................................*........................................................................................... + // sub v20.4S, v21.4S, v24.4S // ..................................................................................................................................................................................................................................*..................................................... + // add v21.4S, v21.4S, v24.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ // mul v24.4S, v9.4S, v2.S[2] // ........................................................................................................................................................................................................................................................*............................... + // sqrdmulh v9.4S, v9.4S, v2.S[3] // .........................................................................................................................................................................................................................................................*.............................. + // mls v27.4S, v10.4S, v29.4S // ....................................................................................................................................................................................................................................................*................................... + // mul v10.4S, v11.4S, v2.S[0] // ...........................................................................................................................................................................................................................................*............................................ + // sqrdmulh v11.4S, v11.4S, v2.S[1] // ..........................................................................................................................................................................................................................................*............................................. + // mls v19.4S, v14.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... 
+ // sqrdmulh v14.4S, v12.4S, v0.S[3] // .................................................................................................................................................................................................................*...................................................................... + // mul v12.4S, v12.4S, v0.S[2] // .....................................................................................................................................................................................................................*.................................................................. + // mls v24.4S, v9.4S, v29.4S // .................................................................................................................................................................................................................................................................*...................... + // mul v9.4S, v13.4S, v2.S[2] // ..........................................................................................................................................................................................................................*............................................................. + // sqrdmulh v13.4S, v13.4S, v2.S[3] // .........................................................................................................................................................................................................................*.............................................................. + // mls v10.4S, v11.4S, v29.4S // .................................................................................................................................................................................................................................................*...................................... 
+ // sub v11.4S, v8.4S, v28.4S // ............................................................................................................................................................................................................................*........................................................... + // add v8.4S, v8.4S, v28.4S // ...........................................................................................................................................................................................................................*............................................................ + // mul v28.4S, v16.4S, v1.S[0] // ...............................................................................................................................................................................................................................*........................................................ + // mls v12.4S, v14.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + // sqrdmulh v14.4S, v16.4S, v1.S[1] // ..............................................................................................................................................................................................................................*......................................................... + // sub v16.4S, v15.4S, v22.4S // ........................................................................................................................................................................................................................................*............................................... 
+ // add v22.4S, v15.4S, v22.4S // .........................................................................................................................................................................................................................................*.............................................. + // add v15.4S, v23.4S, v18.4S // ........................................................................................................................................................................................................................................................................*............... + // sub v23.4S, v23.4S, v18.4S // .......................................................................................................................................................................................................................................................................*................ + // mls v9.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + // mul v13.4S, v16.4S, v0.S[2] // .....................................................................................................................................................................................................................................................*.................................. + // sqrdmulh v18.4S, v16.4S, v0.S[3] // ..........................................................................................................................................................................................................................................................*............................. 
+ // mul v16.4S, v11.4S, v0.S[0] // ..............................................................................................................................................................................................................................................*......................................... + // sqrdmulh v11.4S, v11.4S, v0.S[1] // ............................................................................................................................................................................................................................................*........................................... + // mls v13.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... + // mls v28.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................*................................................ + // mul v18.4S, v20.4S, v3.S[0] // ...............................................................................................................................................................................................................................................................*........................ + // sqrdmulh v20.4S, v20.4S, v3.S[1] // ..................................................................................................................................................................................................................................................................*..................... 
+ // mls v16.4S, v11.4S, v29.4S // ......................................................................................................................................................................................................................................................*................................. + // mul v14.4S, v23.4S, v0.S[2] // ............................................................................................................................................................................................................................................................................*........... + // sqrdmulh v23.4S, v23.4S, v0.S[3] // .............................................................................................................................................................................................................................................................................*.......... + // add v11.4S, v27.4S, v10.4S // ......................................................................................................................................................................................................................................................................*................. + // sub v27.4S, v27.4S, v10.4S // ...................................................................................................................................................................................................................................................................*.................... + // add v10.4S, v9.4S, v19.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ // sub v19.4S, v9.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... + // add v9.4S, v17.4S, v21.4S // ................................................................................................................................................................................................................................................................*....................... + // mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + // sub v20.4S, v12.4S, v28.4S // ....................................................................................................................................................................................................................................................................*................... + // add v12.4S, v12.4S, v28.4S // .....................................................................................................................................................................................................................................................................*.................. + // sub v21.4S, v17.4S, v21.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ // sub v28.4S, v22.4S, v9.4S // ...............................................................................................................................................................................................................................................................................*........ + // add v9.4S, v22.4S, v9.4S // ..............................................................................................................................................................................................................................................................................*......... + // mls v14.4S, v23.4S, v29.4S // ....................................................................................................................................................................................................................................................................................*... + // mul v23.4S, v27.4S, v0.S[2] // ..........................................................................................................................................................................................................................................................................*............. + // mul v22.4S, v21.4S, v1.S[0] // ................................................................................................................................................................................................................................................................................*....... + // sqrdmulh v21.4S, v21.4S, v1.S[1] // .................................................................................................................................................................................................................................................................................*...... 
+ // sqrdmulh v17.4S, v27.4S, v0.S[3] // .........................................................................................................................................................................................................................................................................*.............. + // add v27.4S, v24.4S, v18.4S // ......................................................................................................................................................................................................................................................................................*. + // sub v18.4S, v24.4S, v18.4S // .......................................................................................................................................................................................................................................................................................* + // mul v24.4S, v19.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................................*.. + // sqrdmulh v19.4S, v19.4S, v1.S[1] // ...................................................................................................................................................................................................................................................................................*.... + // mls v23.4S, v17.4S, v29.4S // ..................................................................................................................................................................................................................................................................................*..... 
sub count, count, #1 cbnz count, layer1234_start - mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... - sub v28.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... - add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... - mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
- sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - sub v20.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ - sub v22.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. - add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ - sqrdmulh v24.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
+ mls v24.4S, v19.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mls v22.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v17.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + sqrdmulh v19.4S, v28.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ + mul v28.4S, v28.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... 
+ sub v21.4S, v21.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... + sub v17.4S, v14.4S, v24.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v14.4S, v24.4S // .......................................................................................................................................................................*................................................................................................................ + sub v24.4S, v13.4S, v22.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v22.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v16.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
+ mls v28.4S, v19.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mul v19.4S, v18.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. + sqrdmulh v21.4S, v18.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. + mul v22.4S, v17.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + sqrdmulh v17.4S, v17.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
- mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mls v19.4S, v21.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v21.4S, v28.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v18.4S, v31.4S, v28.4S // ....................................................................................................................................................................................*................................................................................................... + sqrdmulh v16.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mul v20.4S, v20.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ 
+ mls v22.4S, v17.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v21.4S, v18.4S, v21.4S // ......................................................................................................................................................................................*................................................................................................. + sub v18.4S, v15.4S, v10.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v15.4S, v10.4S // ...................................................................................................................................................*.................................................................................................................................... + sub v15.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ + mls v28.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ mls v20.4S, v16.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v17.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + add v16.4S, v23.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + sub v23.4S, v23.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + mul v19.4S, v24.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... - sqrdmulh v13.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - sqrdmulh v27.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - mul v19.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
- sqrdmulh v20.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mul v23.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - mls v17.4S, v24.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - sqrdmulh v24.4S, v22.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mul v22.4S, v22.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - mls v16.4S, v13.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
- add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... - sub v21.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... - mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - mls v23.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mls v22.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - sqrdmulh v24.4S, v28.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
- mul v28.4S, v28.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - mul v27.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - sqrdmulh v20.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - cmge v15.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - cmge v21.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - mls v28.4S, v24.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
- sub v24.4S, v21.4S, v15.4S // ......................................................................................................................................................................................*................................................................................................. - add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... - sub v23.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ - mls v27.4S, v20.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v21.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + sub v27.4S, v17.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ sqrdmulh v21.4S, v14.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. + mul v14.4S, v14.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ + str q28, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + mls v22.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
+ sqrdmulh v27.4S, v18.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... + mul v18.4S, v18.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + mls v19.4S, v24.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v14.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v21.4S, v15.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v15.4S, v15.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. 
+ sub v17.4S, v17.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... + sqrdmulh v24.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v10.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ + sqrdmulh v28.4S, v23.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ sqrdmulh v22.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... + mls v20.4S, v17.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v17.4S, v19.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v21.4S, v15.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v10.4S, v24.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v24.4S, v31.4S, v19.4S // ....................................................................................................................................................................................................*................................................................................... 
+ sqrdmulh v15.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... + mls v18.4S, v27.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sqrdmulh v27.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... + mul v8.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... 
+ sub v24.4S, v24.4S, v17.4S // ......................................................................................................................................................................................................*................................................................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v28.4S, v21.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + cmge v20.4S, v31.4S, v21.4S // ............................................................................................................................................................................................*........................................................................................... + mls v13.4S, v15.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
+ mul v15.4S, v16.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + sqrdmulh v17.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v28.4S, v20.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v8.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - sqrdmulh v16.4S, v23.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - mul v23.4S, v23.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sub v20.4S, v21.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. - cmge v24.4S, v28.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
- cmge v21.4S, v31.4S, v28.4S // ................................................................................................................................................................................................*....................................................................................... - mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - sub v24.4S, v21.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... - cmge v20.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... - cmge v17.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
- mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - sub v17.4S, v20.4S, v17.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v19.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v11.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ cmge v22.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v27.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v21.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + str q19, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + sqrdmulh v19.4S, v16.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... 
+ cmge v16.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v28.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... + mls v12.4S, v17.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v17.4S, v20.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v24.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ + mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... 
+ str q21, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v21.4S, v28.4S, v16.4S // ......................................................................................................................................................................................................................................................................*................. + mls v18.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v16.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v20.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... + mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ cmge v17.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v15.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v19.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v28.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + sub v27.4S, v20.4S, v22.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v20.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... 
+ mls v9.4S, v24.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sub v16.4S, v19.4S, v16.4S // ..........................................................................................................................................................................................................................................................*............................. str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - sqrdmulh v21.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - cmge v18.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... - cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
- cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - mls v28.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - mls v22.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - sub v20.4S, v18.4S, v20.4S // ..............................................................................................................................................................................................*......................................................................................... - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... 
- mls v16.4S, v21.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - str q28, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - sub v28.4S, v24.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... 
- sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - sqrdmulh v13.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mls v19.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - cmge v8.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
- sqrdmulh v9.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - cmge v10.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
- sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v18.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... 
- cmge v11.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - cmge v15.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... - sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - mls v17.4S, v13.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - mls v22.4S, v9.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - cmge v9.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
- cmge v13.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - sub v10.4S, v10.4S, v11.4S // ..................................................................................................................................................................................................................................................*..................................... - mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - sub v15.4S, v15.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
- sub v24.4S, v13.4S, v9.4S // ..........................................................................................................................................................................................................................................................*............................. - cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - cmge v9.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
- mls v27.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - cmge v15.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- cmge v27.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - cmge v8.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - sub v15.4S, v15.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. - sub v11.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... 
- cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - sub v14.4S, v14.4S, v9.4S // ......................................................................................................................................................................................................................................................*................................. - mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - sub v9.4S, v27.4S, v16.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v21.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
- mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - sub v16.4S, v10.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. - sub v27.4S, v8.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... - mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - mls v22.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
- mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... 
- str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v10.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v22.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v16.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v23.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
+ sub v18.4S, v20.4S, v18.4S // ..................................................................................................................................................................................................................................................................*..................... + cmge v20.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v19.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v24.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v8.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sub v27.4S, v22.4S, v17.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ sub v22.4S, v16.4S, v28.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v13.4S, v21.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v12.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + str q10, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... + sub v21.4S, v20.4S, v23.4S // ......................................................................................................................................................................................................................................................*................................. + sub v18.4S, v19.4S, v24.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ mls v11.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mls v14.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v15.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v9.4S, v21.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + str q13, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + str q11, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q14, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... 
pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s index e23094cc..a3557456 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, 
\r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs 
// @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,490 +321,526 @@ _intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm: mov count, #16 .p2align 2 - // gap // ................................................. - ldr q6, [x0, #32] // ..*.............................................. - ldr q18, [x0, #48] // ...*............................................. - // gap // ................................................. - ldr q11, [x0, #16] // .*............................................... - ldr q5, [x0, #0] // *................................................ - // gap // ................................................. - // gap // ................................................. - ldr q4, [x3, #64] // ..........*...................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr q1, [x3, #80] // ....*............................................ 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn2 v12.4S, v6.4S, v18.4S // .......*......................................... - trn1 v7.4S, v6.4S, v18.4S // .........*....................................... - ldr q28, [x3, #48] // ........*........................................ - // gap // ................................................. - trn2 v3.4S, v5.4S, v11.4S // ......*.......................................... - trn1 v16.4S, v5.4S, v11.4S // .....*........................................... - ldr q5, [x3, #32] // ...........*..................................... - // gap // ................................................. - ldr q30, [x4], #8 // ...................................*............. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn2 v27.2D, v16.2D, v7.2D // ............*.................................... - ldr q19, [x3, #16] // ....................*............................ - trn2 v14.2D, v3.2D, v12.2D // .............*................................... - // gap // ................................................. - trn1 v22.2D, v3.2D, v12.2D // ...............*................................. - trn1 v10.2D, v16.2D, v7.2D // ..............*.................................. - ldr q12, [x3], #(6*16) // .................*............................... - // gap // ................................................. - sub v2.4S, v27.4S, v14.4S // ................*................................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v17.4S, v10.4S, v22.4S // ...................*............................. 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v21.4S, v2.4S, v4.4S // .......................*......................... - sqrdmulh v11.4S, v2.4S, v1.4S // ......................*.......................... - // gap // ................................................. - // gap // ................................................. - sqrdmulh v24.4S, v17.4S, v28.4S // ........................*........................ - mul v28.4S, v17.4S, v5.4S // .........................*....................... - ldr q17, [x4], #16 // ....................................*............ - // gap // ................................................. - add v8.4S, v27.4S, v14.4S // ..................*.............................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - add v9.4S, v10.4S, v22.4S // .....................*........................... - mls v21.4S, v11.4S, v29.4S // ..........................*...................... - // gap // ................................................. - // gap // ................................................. - mls v28.4S, v24.4S, v29.4S // ...........................*..................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v23.4S, v9.4S, v8.4S // ............................*.................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - add v9.4S, v9.4S, v8.4S // .....................................*........... 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v26.4S, v28.4S, v21.4S // .............................*................... - sqrdmulh v6.4S, v23.4S, v19.4S // ..............................*.................. - // gap // ................................................. - // gap // ................................................. - mul v5.4S, v23.4S, v12.4S // ...............................*................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sqrdmulh v2.4S, v26.4S, v19.4S // .................................*............... - mul v20.4S, v26.4S, v12.4S // ................................*................ - // gap // ................................................. - // gap // ................................................. - add v13.4S, v28.4S, v21.4S // ..................................*.............. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v5.4S, v6.4S, v29.4S // ......................................*.......... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v20.4S, v2.4S, v29.4S // .......................................*......... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn1 v21.4S, v9.4S, v13.4S // ........................................*........ - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - trn2 v2.4S, v9.4S, v13.4S // .........................................*....... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn2 v22.4S, v5.4S, v20.4S // ...........................................*..... - trn1 v14.4S, v5.4S, v20.4S // ..........................................*...... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn1 v18.2D, v2.2D, v22.2D // ............................................*.... - trn1 v31.2D, v21.2D, v14.2D // .............................................*... - // gap // ................................................. - // gap // ................................................. - trn2 v5.2D, v2.2D, v22.2D // ...............................................*. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn2 v22.2D, v21.2D, v14.2D // ..............................................*.. - sub v25.4S, v31.4S, v18.4S // ................................................* - // gap // ................................................. - // gap // ................................................. 
+ // Instructions: 50 + // Expected cycles: 31 + // Expected IPC: 1.61 + // + // Wall time: 1.10s + // User time: 1.10s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + ldr q1, [x0, #16] // .*................................................ + ldr q21, [x0, #0] // *................................................. + // gap // .................................................. + // gap // .................................................. + ldr q22, [x0, #32] // ..*............................................... + ldr q18, [x0, #48] // ...*.............................................. + // gap // .................................................. + // gap // .................................................. + ldr q8, [x3, #80] // ....*............................................. + ldr q9, [x3, #64] // .....*............................................ + // gap // .................................................. + // gap // .................................................. + ldr q2, [x3, #48] // ......*........................................... + ldr q3, [x3, #32] // ...........*...................................... + // gap // .................................................. + // gap // .................................................. + trn2 v26.4S, v21.4S, v1.4S // .......*.......................................... + trn1 v21.4S, v21.4S, v1.4S // ........*......................................... + ldr q14, [x3], #(6*16) // .....................*............................ + ldr q27, [x3, #-80] // ......................*........................... + trn2 v24.4S, v22.4S, v18.4S // ..........*....................................... + trn1 v22.4S, v22.4S, v18.4S // .........*........................................ + ldr q1, [x4], #8 // .......................................*.......... + ldr q19, [x4], #16 // ........................................*......... 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v25.2D, v26.2D, v24.2D // .............*.................................... + trn1 v18.2D, v21.2D, v22.2D // ............*..................................... + // gap // .................................................. + // gap // .................................................. + trn2 v26.2D, v26.2D, v24.2D // ..............*................................... + trn2 v21.2D, v21.2D, v22.2D // ...............*.................................. + // gap // .................................................. + // gap // .................................................. + sub v22.4S, v18.4S, v25.4S // ................*................................. + add v18.4S, v18.4S, v25.4S // ..................*............................... + // gap // .................................................. + // gap // .................................................. + sub v24.4S, v21.4S, v26.4S // .................*................................ + add v21.4S, v21.4S, v26.4S // .........................*........................ + // gap // .................................................. + // gap // .................................................. + sqrdmulh v2.4S, v22.4S, v2.4S // ...................*.............................. + mul v22.4S, v22.4S, v3.4S // ....................*............................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v8.4S, v24.4S, v8.4S // ........................*......................... + mul v9.4S, v24.4S, v9.4S // .......................*.......................... + // gap // .................................................. 
+ // gap // .................................................. + sub v3.4S, v18.4S, v21.4S // ............................*..................... + add v21.4S, v18.4S, v21.4S // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + mls v22.4S, v2.4S, v29.4S // ...........................*...................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v9.4S, v8.4S, v29.4S // ..........................*....................... + sqrdmulh v18.4S, v3.4S, v27.4S // .............................*.................... + // gap // .................................................. + // gap // .................................................. + mul v8.4S, v3.4S, v14.4S // ................................*................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v2.4S, v22.4S, v9.4S // ..............................*................... + add v22.4S, v22.4S, v9.4S // ...............................*.................. + // gap // .................................................. + // gap // .................................................. + mls v8.4S, v18.4S, v29.4S // ....................................*............. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + mul v18.4S, v2.4S, v14.4S // .................................*................ + sqrdmulh v9.4S, v2.4S, v27.4S // ..................................*............... + // gap // .................................................. + // gap // .................................................. + trn1 v2.4S, v21.4S, v22.4S // .....................................*............ + trn2 v21.4S, v21.4S, v22.4S // .........................................*........ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v18.4S, v9.4S, v29.4S // ......................................*........... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v22.4S, v8.4S, v18.4S // ...........................................*...... + trn2 v18.4S, v8.4S, v18.4S // ..........................................*....... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn2 v9.2D, v2.2D, v22.2D // .............................................*.... + trn2 v8.2D, v21.2D, v18.2D // ............................................*..... + // gap // .................................................. + // gap // .................................................. + trn1 v11.2D, v2.2D, v22.2D // ...............................................*.. + trn1 v17.2D, v21.2D, v18.2D // ..............................................*... + // gap // .................................................. + // gap // .................................................. + add v13.4S, v9.4S, v8.4S // ................................................*. + sub v10.4S, v9.4S, v8.4S // .................................................* + // gap // .................................................. + // gap // .................................................. - // original source code - // ldr q14, [x0, #0] // ...*............................................. - // ldr q15, [x0, #16] // ..*.............................................. - // ldr q18, [x0, #32] // *................................................ - // ldr q13, [x0, #48] // .*............................................... - // ldr q24, [x3, #80] // .....*........................................... - // trn1 v12.4S, v14.4S, v15.4S // ..........*...................................... - // trn2 v31.4S, v14.4S, v15.4S // .........*....................................... - // trn2 v15.4S, v18.4S, v13.4S // ......*.......................................... - // ldr q28, [x3, #48] // ........*........................................ - // trn1 v14.4S, v18.4S, v13.4S // .......*......................................... 
- // ldr q21, [x3, #64] // ....*............................................ - // ldr q23, [x3, #32] // ...........*..................................... - // trn2 v25.2D, v12.2D, v14.2D // .............*................................... - // trn2 v27.2D, v31.2D, v15.2D // ...............*................................. - // trn1 v19.2D, v12.2D, v14.2D // .................*............................... - // trn1 v1.2D, v31.2D, v15.2D // ................*................................ - // sub v11.4S, v25.4S, v27.4S // ...................*............................. - // ldr q20, [x3], #(6*16) // ..................*.............................. - // add v3.4S, v25.4S, v27.4S // ..........................*...................... - // sub v18.4S, v19.4S, v1.4S // ....................*............................ - // ldr q6, [x3, #-80] // ..............*.................................. - // add v7.4S, v19.4S, v1.4S // ...........................*..................... - // sqrdmulh v31.4S, v11.4S, v24.4S // ......................*.......................... - // mul v19.4S, v11.4S, v21.4S // .....................*........................... - // sqrdmulh v28.4S, v18.4S, v28.4S // .......................*......................... - // mul v14.4S, v18.4S, v23.4S // ........................*........................ - // mls v19.4S, v31.4S, v29.4S // ............................*.................... - // mls v14.4S, v28.4S, v29.4S // .............................*................... - // sub v16.4S, v7.4S, v3.4S // ..............................*.................. - // sub v31.4S, v14.4S, v19.4S // ................................*................ - // sqrdmulh v25.4S, v16.4S, v6.4S // .................................*............... - // mul v5.4S, v16.4S, v20.4S // ..................................*.............. - // mul v8.4S, v31.4S, v20.4S // ....................................*............ 
- // sqrdmulh v1.4S, v31.4S, v6.4S // ...................................*............. - // add v31.4S, v14.4S, v19.4S // .....................................*........... - // ldr q30, [x4], #8 // ............*.................................... - // ldr q17, [x4], #16 // .........................*....................... - // add v6.4S, v7.4S, v3.4S // ...............................*................. - // mls v5.4S, v25.4S, v29.4S // ......................................*.......... - // mls v8.4S, v1.4S, v29.4S // .......................................*......... - // trn1 v7.4S, v6.4S, v31.4S // ........................................*........ - // trn2 v4.4S, v6.4S, v31.4S // .........................................*....... - // trn1 v9.4S, v5.4S, v8.4S // ...........................................*..... - // trn2 v14.4S, v5.4S, v8.4S // ..........................................*...... - // trn1 v18.2D, v4.2D, v14.2D // ............................................*.... - // trn1 v31.2D, v7.2D, v9.2D // .............................................*... - // trn2 v22.2D, v7.2D, v9.2D // ...............................................*. - // trn2 v5.2D, v4.2D, v14.2D // ..............................................*.. - // sub v25.4S, v31.4S, v18.4S // ................................................* + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // ldr q6, [x0, #0] // .*................................................ + // ldr q21, [x0, #16] // *................................................. + // ldr q27, [x0, #32] // ..*............................................... + // ldr q3, [x0, #48] // ...*.............................................. + // ldr q22, [x3, #80] // ....*............................................. + // ldr q28, [x3, #64] // .....*............................................ + // ldr q20, [x3, #48] // ......*........................................... 
+ // trn2 v26.4S, v6.4S, v21.4S // ........*......................................... + // trn1 v15.4S, v6.4S, v21.4S // .........*........................................ + // trn1 v24.4S, v27.4S, v3.4S // .............*.................................... + // trn2 v4.4S, v27.4S, v3.4S // ............*..................................... + // ldr q27, [x3, #32] // .......*.......................................... + // trn1 v6.2D, v15.2D, v24.2D // .................*................................ + // trn1 v25.2D, v26.2D, v4.2D // ................*................................. + // trn2 v14.2D, v26.2D, v4.2D // ..................*............................... + // trn2 v13.2D, v15.2D, v24.2D // ...................*.............................. + // sub v11.4S, v6.4S, v25.4S // ....................*............................. + // sub v31.4S, v13.4S, v14.4S // ......................*........................... + // add v17.4S, v6.4S, v25.4S // .....................*............................ + // sqrdmulh v7.4S, v11.4S, v20.4S // ........................*......................... + // mul v6.4S, v11.4S, v27.4S // .........................*........................ + // ldr q15, [x3], #(6*16) // ..........*....................................... + // ldr q16, [x3, #-80] // ...........*...................................... + // mul v21.4S, v31.4S, v28.4S // ...........................*...................... + // sqrdmulh v4.4S, v31.4S, v22.4S // ..........................*....................... + // add v10.4S, v13.4S, v14.4S // .......................*.......................... + // mls v21.4S, v4.4S, v29.4S // ...............................*.................. + // mls v6.4S, v7.4S, v29.4S // ..............................*................... + // sub v22.4S, v17.4S, v10.4S // ............................*..................... + // sqrdmulh v0.4S, v22.4S, v16.4S // ................................*................. 
+ // sub v26.4S, v6.4S, v21.4S // ..................................*............... + // add v13.4S, v6.4S, v21.4S // ...................................*.............. + // mul v21.4S, v22.4S, v15.4S // .................................*................ + // mul v22.4S, v26.4S, v15.4S // .....................................*............ + // sqrdmulh v27.4S, v26.4S, v16.4S // ......................................*........... + // add v17.4S, v17.4S, v10.4S // .............................*.................... + // mls v21.4S, v0.4S, v29.4S // ....................................*............. + // trn1 v0.4S, v17.4S, v13.4S // .......................................*.......... + // mls v22.4S, v27.4S, v29.4S // .........................................*........ + // ldr q1, [x4], #8 // ..............*................................... + // ldr q19, [x4], #16 // ...............*.................................. + // trn2 v6.4S, v17.4S, v13.4S // ........................................*......... + // trn2 v5.4S, v21.4S, v22.4S // ...........................................*...... + // trn1 v20.4S, v21.4S, v22.4S // ..........................................*....... + // trn2 v10.2D, v6.2D, v5.2D // .............................................*.... + // trn2 v30.2D, v0.2D, v20.2D // ............................................*..... + // trn1 v17.2D, v6.2D, v5.2D // ...............................................*.. + // trn1 v11.2D, v0.2D, v20.2D // ..............................................*... + // add v13.4S, v30.4S, v10.4S // ................................................*. + // sub v10.4S, v30.4S, v10.4S // .................................................* sub count, count, #1 layer5678_start: - ldr q14, [x0, #64] // e........................................................................... - add v10.4S, v31.4S, v18.4S // .................................................*.......................... 
- ldr q15, [x0, #80] // .e.......................................................................... - sub v0.4S, v22.4S, v5.4S // .....................................................*...................... - ldr q18, [x0, #96] // ..e......................................................................... - sqrdmulh v2.4S, v25.4S, v17.S[1] // ...................................................*........................ - add v5.4S, v22.4S, v5.4S // ......................................................*..................... - ldr q13, [x0, #112] // ...e........................................................................ - mul v8.4S, v25.4S, v17.S[0] // ..................................................*......................... + // Instructions: 76 + // Expected cycles: 31 + // Expected IPC: 2.45 + // + // Wall time: 47.50s + // User time: 47.50s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sub v31.4S, v11.4S, v17.4S // ................................................*........................... + ldr q6, [x0, #64] // e........................................................................... + ldr q21, [x0, #80] // .e.......................................................................... + add v18.4S, v11.4S, v17.4S // .................................................*.......................... + ldr q27, [x0, #96] // ..e......................................................................... + sqrdmulh v30.4S, v10.4S, v19.S[3] // .......................................................*.................... + mul v8.4S, v10.4S, v19.S[2] // ........................................................*................... + ldr q3, [x0, #112] // ...e........................................................................ 
+ add v9.4S, v18.4S, v13.4S // ...........................................................*................ + sub v23.4S, v18.4S, v13.4S // ..........................................................*................. // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v19.4S, v0.4S, v17.S[3] // ........................................................*................... - mul v9.4S, v0.4S, v17.S[2] // .......................................................*.................... - add v0.4S, v10.4S, v5.4S // ...........................................................*................ + mul v18.4S, v31.4S, v19.S[0] // ...................................................*........................ + ldr q22, [x3, #80] // .................e.......................................................... + ldr q28, [x3, #64] // ................e........................................................... + sqrdmulh v17.4S, v31.4S, v19.S[1] // ..................................................*......................... + ldr q20, [x3, #48] // ...............e............................................................ // gap // ............................................................................ - ldr q24, [x3, #80] // .................e.......................................................... + trn2 v26.4S, v6.4S, v21.4S // .....e...................................................................... + trn1 v15.4S, v6.4S, v21.4S // ....e....................................................................... // gap // ............................................................................ + trn1 v24.4S, v27.4S, v3.4S // ......e..................................................................... + trn2 v4.4S, v27.4S, v3.4S // .......e.................................................................... 
+ ldr q27, [x3, #32] // ..............e............................................................. + mls v8.4S, v30.4S, v29.4S // .........................................................*.................. + mls v18.4S, v17.4S, v29.4S // ....................................................*....................... // gap // ............................................................................ - trn1 v12.4S, v14.4S, v15.4S // ....e....................................................................... - trn2 v31.4S, v14.4S, v15.4S // .....e...................................................................... - trn2 v15.4S, v18.4S, v13.4S // .......e.................................................................... // gap // ............................................................................ - ldr q28, [x3, #48] // ...............e............................................................ - trn1 v14.4S, v18.4S, v13.4S // ......e..................................................................... - mls v9.4S, v19.4S, v29.4S // .........................................................*.................. - ldr q21, [x3, #64] // ................e........................................................... - ldr q23, [x3, #32] // ..............e............................................................. - mls v8.4S, v2.4S, v29.4S // ....................................................*....................... // gap // ............................................................................ - trn2 v25.2D, v12.2D, v14.2D // ........e................................................................... // gap // ............................................................................ - trn2 v27.2D, v31.2D, v15.2D // .........e.................................................................. + trn1 v6.2D, v15.2D, v24.2D // ..........e................................................................. 
+ trn1 v25.2D, v26.2D, v4.2D // ...........e................................................................ + trn2 v14.2D, v26.2D, v4.2D // .........e.................................................................. + trn2 v13.2D, v15.2D, v24.2D // ........e................................................................... // gap // ............................................................................ - trn1 v19.2D, v12.2D, v14.2D // ..........e................................................................. - trn1 v1.2D, v31.2D, v15.2D // ...........e................................................................ // gap // ............................................................................ - sub v11.4S, v25.4S, v27.4S // .......................e.................................................... // gap // ............................................................................ - ldr q20, [x3], #(6*16) // ............e............................................................... - add v3.4S, v25.4S, v27.4S // ........................e................................................... - sub v18.4S, v19.4S, v1.4S // ..................e......................................................... - ldr q6, [x3, #-80] // .............e.............................................................. - add v7.4S, v19.4S, v1.4S // ...................e........................................................ // gap // ............................................................................ + sub v11.4S, v6.4S, v25.4S // ..................e......................................................... + sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................*............... + sub v31.4S, v13.4S, v14.4S // .......................e.................................................... + add v17.4S, v6.4S, v25.4S // ...................e........................................................ 
// gap // ............................................................................ - sqrdmulh v31.4S, v11.4S, v24.4S // ..........................e................................................. - mul v19.4S, v11.4S, v21.4S // .........................e.................................................. // gap // ............................................................................ - sqrdmulh v28.4S, v18.4S, v28.4S // .....................e...................................................... - mul v14.4S, v18.4S, v23.4S // ....................e....................................................... + sqrdmulh v7.4S, v11.4S, v20.4S // ....................e....................................................... + mul v6.4S, v11.4S, v27.4S // .....................e...................................................... + ldr q15, [x3], #(6*16) // ............e............................................................... + ldr q16, [x3, #-80] // .............e.............................................................. + mul v21.4S, v31.4S, v28.4S // ..........................e................................................. + sqrdmulh v4.4S, v31.4S, v22.4S // .........................e.................................................. // gap // ............................................................................ // gap // ............................................................................ - sub v2.4S, v8.4S, v9.4S // ...............................................................*............ + add v10.4S, v13.4S, v14.4S // ........................e................................................... // gap // ............................................................................ // gap // ............................................................................ - sub v15.4S, v10.4S, v5.4S // ..........................................................*................. 
+ srshr v11.4S, v9.4S, #23 // ....................................................................*....... + sub v14.4S, v18.4S, v8.4S // ...............................................................*............ + add v3.4S, v18.4S, v8.4S // ................................................................*........... // gap // ............................................................................ // gap // ............................................................................ - mls v19.4S, v31.4S, v29.4S // ...........................e................................................ - add v12.4S, v8.4S, v9.4S // ................................................................*........... - mls v14.4S, v28.4S, v29.4S // ......................e..................................................... + mls v21.4S, v4.4S, v29.4S // ...........................e................................................ + mls v6.4S, v7.4S, v29.4S // ......................e..................................................... // gap // ............................................................................ // gap // ............................................................................ - sub v16.4S, v7.4S, v3.4S // ............................e............................................... - srshr v9.4S, v12.4S, #23 // ......................................................................*..... + sub v22.4S, v17.4S, v10.4S // ............................e............................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v27.4S, v15.4S, v30.S[1] // .............................................................*.............. + mul v2.4S, v14.4S, v1.S[0] // ..................................................................*......... 
// gap // ............................................................................ + srshr v28.4S, v3.4S, #23 // ......................................................................*..... // gap // ............................................................................ - srshr v18.4S, v0.4S, #23 // ....................................................................*....... - mul v13.4S, v2.4S, v30.S[0] // .................................................................*.......... + sqrdmulh v30.4S, v14.4S, v1.S[1] // .................................................................*.......... + sqrdmulh v0.4S, v22.4S, v16.4S // ..............................e............................................. + sub v26.4S, v6.4S, v21.4S // .................................e.......................................... // gap // ............................................................................ // gap // ............................................................................ - sub v31.4S, v14.4S, v19.4S // .................................e.......................................... - sqrdmulh v25.4S, v16.4S, v6.4S // ...............................e............................................ - mls v12.4S, v9.4S, v29.4S // .......................................................................*.... + add v13.4S, v6.4S, v21.4S // ..................................e......................................... + mul v21.4S, v22.4S, v15.4S // ...............................e............................................ // gap // ............................................................................ // gap // ............................................................................ - mul v5.4S, v16.4S, v20.4S // ..............................e............................................. - mul v8.4S, v31.4S, v20.4S // ...................................e........................................ 
+ mul v22.4S, v26.4S, v15.4S // ....................................e....................................... + sqrdmulh v27.4S, v26.4S, v16.4S // ...................................e........................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v1.4S, v31.4S, v6.4S // ....................................e....................................... - add v31.4S, v14.4S, v19.4S // ..................................e......................................... + add v17.4S, v17.4S, v10.4S // .............................e.............................................. + mls v3.4S, v28.4S, v29.4S // .......................................................................*.... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v2.4S, v2.4S, v30.S[1] // ..................................................................*......... - mul v19.4S, v15.4S, v30.S[0] // ............................................................*............... - ldr q30, [x4], #8 // ..............................................e............................. - ldr q17, [x4], #16 // ...............................................e............................ - add v6.4S, v7.4S, v3.4S // .............................e.............................................. - mls v5.4S, v25.4S, v29.4S // ................................e........................................... + mls v21.4S, v0.4S, v29.4S // ................................e........................................... + mls v2.4S, v30.4S, v29.4S // ...................................................................*........ // gap // ............................................................................ 
// gap // ............................................................................ - mls v8.4S, v1.4S, v29.4S // .....................................e...................................... + trn1 v0.4S, v17.4S, v13.4S // ......................................e..................................... + mls v22.4S, v27.4S, v29.4S // .....................................e...................................... // gap // ............................................................................ - mls v0.4S, v18.4S, v29.4S // .....................................................................*...... // gap // ............................................................................ - mls v13.4S, v2.4S, v29.4S // ...................................................................*........ - trn1 v7.4S, v6.4S, v31.4S // ......................................e..................................... - trn2 v4.4S, v6.4S, v31.4S // .......................................e.................................... + mul v8.4S, v23.4S, v1.S[0] // .............................................................*.............. + ldr q1, [x4], #8 // ..............................................e............................. + mls v9.4S, v11.4S, v29.4S // .....................................................................*...... + ldr q19, [x4], #16 // ...............................................e............................ + str q2, [x0, #48] // ...........................................................................* + trn2 v6.4S, v17.4S, v13.4S // .......................................e.................................... // gap // ............................................................................ // gap // ............................................................................ - trn1 v9.4S, v5.4S, v8.4S // ........................................e................................... 
- trn2 v14.4S, v5.4S, v8.4S // .........................................e.................................. + trn2 v5.4S, v21.4S, v22.4S // .........................................e.................................. + trn1 v20.4S, v21.4S, v22.4S // ........................................e................................... // gap // ............................................................................ // gap // ............................................................................ - str q0, [x0], #(16*4) // ........................................................................*... - mls v19.4S, v27.4S, v29.4S // ..............................................................*............. + mls v8.4S, v24.4S, v29.4S // ..............................................................*............. + str q9, [x0], #(16*4) // ........................................................................*... // gap // ............................................................................ // gap // ............................................................................ + str q3, [x0, #-48] // .........................................................................*.. // gap // ............................................................................ - trn1 v18.2D, v4.2D, v14.2D // .............................................e.............................. - trn1 v31.2D, v7.2D, v9.2D // ............................................e............................... - str q12, [x0, #-48] // .........................................................................*.. - trn2 v22.2D, v7.2D, v9.2D // ..........................................e................................. - trn2 v5.2D, v4.2D, v14.2D // ...........................................e................................ 
- str q13, [x0, #-16] // ...........................................................................* + trn2 v10.2D, v6.2D, v5.2D // ...........................................e................................ + trn2 v30.2D, v0.2D, v20.2D // ..........................................e................................. + trn1 v17.2D, v6.2D, v5.2D // .............................................e.............................. + trn1 v11.2D, v0.2D, v20.2D // ............................................e............................... // gap // ............................................................................ - sub v25.4S, v31.4S, v18.4S // ................................................e........................... - str q19, [x0, #-32] // ..........................................................................*. // gap // ............................................................................ + add v13.4S, v30.4S, v10.4S // ......................................................e..................... + sub v10.4S, v30.4S, v10.4S // .....................................................e...................... // gap // ............................................................................ + str q8, [x0, #-32] // ..........................................................................*. - // original source code - // ldr q8, [x0, #(16*0)] // e...........................................................................e........................................................................... - // ldr q9, [x0, #(16*1)] // ..e.........................................................................|.e......................................................................... - // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... 
- // ldr q11, [x0, #(16*3)] // .......e....................................................................|......e.................................................................... - // trn1 v25.4s, v8.4s, v9.4s // .............e..............................................................|............e.............................................................. - // trn2 v26.4s, v8.4s, v9.4s // ..............e.............................................................|.............e............................................................. - // trn1 v27.4s, v10.4s, v11.4s // .................e..........................................................|................e.......................................................... - // trn2 v28.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ - // trn2 v10.2d, v25.2d, v27.2d // ......................e.....................................................|.....................e..................................................... - // trn2 v11.2d, v26.2d, v28.2d // .......................e....................................................|......................e.................................................... - // trn1 v8.2d, v25.2d, v27.2d // ........................e...................................................|.......................e................................................... - // trn1 v9.2d, v26.2d, v28.2d // .........................e..................................................|........................e.................................................. - // ldr q0, [x3], #(6*16) // ...........................e................................................|..........................e................................................ 
- // ldr q4, [x3, #(-6*16 + 1*16)] // ..............................e.............................................|.............................e............................................. - // ldr q1, [x3, #(-6*16 + 2*16)] // ....................e.......................................................|...................e....................................................... - // ldr q5, [x3, #(-6*16 + 3*16)] // ................e...........................................................|...............e........................................................... - // ldr q2, [x3, #(-6*16 + 4*16)] // ...................e........................................................|..................e........................................................ - // ldr q6, [x3, #(-6*16 + 5*16)] // ............e...............................................................|...........e............................................................... - // sub v24.4s, v8.4s, v9.4s // .............................e..............................................|............................e.............................................. - // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ - // mul v9.4s, v24.4s, v1.4s // ...................................e........................................|..................................e........................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ..................................e.........................................|.................................e......................................... - // mls v9.4s, v24.4s, v29.4s // ........................................e...................................|.......................................e................................... 
- // sub v24.4s, v10.4s, v11.4s // ..........................e.................................................|.........................e................................................. - // add v10.4s, v10.4s, v11.4s // ............................e...............................................|...........................e............................................... - // mul v11.4s, v24.4s, v2.4s // .................................e..........................................|................................e.......................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ................................e...........................................|...............................e........................................... - // mls v11.4s, v24.4s, v29.4s // ......................................e.....................................|.....................................e..................................... - // sub v24.4s, v8.4s, v10.4s // .........................................e..................................|........................................e.................................. - // add v8.4s, v8.4s, v10.4s // .........................................................e..................|........................................................e.................. - // mul v10.4s, v24.4s, v0.4s // .................................................e..........................|................................................e.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e............................|..............................................e............................ - // mls v10.4s, v24.4s, v29.4s // ..........................................................e.................|.........................................................e................. 
- // sub v24.4s, v9.4s, v11.4s // ..............................................e.............................|.............................................e............................. - // add v9.4s, v9.4s, v11.4s // ....................................................e.......................|...................................................e....................... - // mul v11.4s, v24.4s, v0.4s // ..................................................e.........................|.................................................e......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e........................|..................................................e........................ - // mls v11.4s, v24.4s, v29.4s // ...........................................................e................|..........................................................e................ - // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e.............|.............................................................e............. - // trn2 v26.4s, v8.4s, v9.4s // ...............................................................e............|..............................................................e............ - // trn1 v27.4s, v10.4s, v11.4s // ................................................................e...........|...............................................................e........... - // trn2 v28.4s, v10.4s, v11.4s // .................................................................e..........|................................................................e.......... - // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e....|......................................................................e.... 
- // trn2 v11.2d, v26.2d, v28.2d // ........................................................................e...|.......................................................................e... - // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e......|....................................................................e...... - // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.......|...................................................................e....... - // ldr q1, [x4], #8 // .......................................................e....................|......................................................e.................... - // ldr q0, [x4], #16 // ........................................................e...................|.......................................................e................... - // sub v24.4s, v8.4s, v9.4s // ..........................................................................e.|.........................................................................e. - // add v8.4s, v8.4s, v9.4s // .*..........................................................................|*.......................................................................... - // mul v9.4s, v24.4s, v0.s[0] // ........*...................................................................|.......*................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....*......................................................................|....*...................................................................... - // mls v9.4s, v24.4s, v29.4s // .....................*......................................................|....................*...................................................... 
- // sub v24.4s, v10.4s, v11.4s // ...*........................................................................|..*........................................................................ - // add v10.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ..........*.................................................................|.........*................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........*..................................................................|........*.................................................................. - // mls v11.4s, v24.4s, v29.4s // ..................*.........................................................|.................*......................................................... - // sub v24.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... - // add v8.4s, v8.4s, v10.4s // ...........*................................................................|..........*................................................................ - // mul v10.4s, v24.4s, v1.s[0] // ......................................................*.....................|.....................................................*..................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................*................................|..........................................*................................ - // mls v10.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ 
- // sub v24.4s, v9.4s, v11.4s // ....................................*.......................................|...................................*....................................... - // add v9.4s, v9.4s, v11.4s // .......................................*....................................|......................................*.................................... - // mul v11.4s, v24.4s, v1.s[0] // .............................................*..............................|............................................*.............................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................*......................|....................................................*...................... - // mls v11.4s, v24.4s, v29.4s // .............................................................*..............|............................................................*.............. - // srshr v24.4S, v8.4S, #23 // ............................................*...............................|...........................................*............................... - // mls v8.4s, v24.4s, v29.4s // ............................................................*...............|...........................................................*............... - // srshr v24.4S, v9.4S, #23 // ..........................................*.................................|.........................................*................................. - // mls v9.4s, v24.4s, v29.4s // ................................................*...........................|...............................................*........................... - // str q8, [x0], #(16*4) // ..................................................................*.........|.................................................................*......... 
- // str q9, [x0, #(-16*4 + 1*16)] // ......................................................................*.....|.....................................................................*..... - // str q10, [x0, #(-16*4 + 2*16)] // ...........................................................................*|..........................................................................* - // str q11, [x0, #(-16*4 + 3*16)] // .........................................................................*..|........................................................................*.. + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #(16*0)] // e..........................................................................'~.......................................................................... + // ldr q9, [x0, #(16*1)] // .e.........................................................................'.~......................................................................... + // ldr q10, [x0, #(16*2)] // ...e.......................................................................'...~....................................................................... + // ldr q11, [x0, #(16*3)] // ......e....................................................................'......~.................................................................... + // trn1 v25.4s, v8.4s, v9.4s // ...............e...........................................................'...............~........................................................... 
+ // trn2 v26.4s, v8.4s, v9.4s // ..............e............................................................'..............~............................................................ + // trn1 v27.4s, v10.4s, v11.4s // ................e..........................................................'................~.......................................................... + // trn2 v28.4s, v10.4s, v11.4s // .................e.........................................................'.................~......................................................... + // trn2 v10.2d, v25.2d, v27.2d // ........................e..................................................'........................~.................................................. + // trn2 v11.2d, v26.2d, v28.2d // .......................e...................................................'.......................~................................................... + // trn1 v8.2d, v25.2d, v27.2d // .....................e.....................................................'.....................~..................................................... + // trn1 v9.2d, v26.2d, v28.2d // ......................e....................................................'......................~.................................................... + // ldr q0, [x3], #(6*16) // ...............................e...........................................'...............................~........................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ................................e..........................................'................................~.......................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ..................e........................................................'..................~........................................................ 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // .............e.............................................................'.............~............................................................. + // ldr q2, [x3, #(-6*16 + 4*16)] // ...........e...............................................................'...........~............................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ..........e................................................................'..........~................................................................ + // sub v24.4s, v8.4s, v9.4s // .........................e.................................................'.........................~................................................. + // add v8.4s, v8.4s, v9.4s // ............................e..............................................'............................~.............................................. + // sqrdmulh v27.4s, v24.4s, v5.4s // .............................e.............................................'.............................~............................................. + // mul v9.4s, v24.4s, v1.4s // ..............................e............................................'..............................~............................................ + // mls v9.4s, v27.4s, v29.4s // ........................................e..................................'........................................~.................................. + // sub v24.4s, v10.4s, v11.4s // ...........................e...............................................'...........................~............................................... + // add v10.4s, v10.4s, v11.4s // ...................................e.......................................'...................................~....................................... 
+ // sqrdmulh v27.4s, v24.4s, v6.4s // ..................................e........................................'..................................~........................................ + // mul v11.4s, v24.4s, v2.4s // .................................e.........................................'.................................~......................................... + // mls v11.4s, v27.4s, v29.4s // .......................................e...................................'.......................................~................................... + // sub v24.4s, v8.4s, v10.4s // .........................................e.................................'.........................................~................................. + // add v8.4s, v8.4s, v10.4s // ...................................................e.......................'...................................................~....................... + // sqrdmulh v27.4s, v24.4s, v4.4s // .............................................e.............................'.............................................~............................. + // mul v10.4s, v24.4s, v0.4s // ................................................e..........................'................................................~.......................... + // mls v10.4s, v27.4s, v29.4s // .....................................................e.....................'.....................................................~..................... + // sub v24.4s, v9.4s, v11.4s // ..............................................e............................'..............................................~............................ + // add v9.4s, v9.4s, v11.4s // ...............................................e...........................'...............................................~........................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ..................................................e........................'..................................................~........................ + // mul v11.4s, v24.4s, v0.4s // .................................................e.........................'.................................................~......................... + // mls v11.4s, v27.4s, v29.4s // ........................................................e..................'........................................................~.................. + // trn1 v25.4s, v8.4s, v9.4s // .......................................................e...................'.......................................................~................... + // trn2 v26.4s, v8.4s, v9.4s // ..............................................................e............'..............................................................~............ + // trn1 v27.4s, v10.4s, v11.4s // ................................................................e..........'................................................................~.......... + // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e...........'...............................................................~........... + // trn2 v10.2d, v25.2d, v27.2d // .....................................................................e.....'.....................................................................~..... + // trn2 v11.2d, v26.2d, v28.2d // ....................................................................e......'....................................................................~...... + // trn1 v8.2d, v25.2d, v27.2d // .......................................................................e...'.......................................................................~... 
+ // trn1 v9.2d, v26.2d, v28.2d // ......................................................................e....'......................................................................~.... + // ldr q1, [x4], #8 // ..........................................................e................'..........................................................~................ + // ldr q0, [x4], #16 // ............................................................e..............'............................................................~.............. + // sub v24.4s, v8.4s, v9.4s // ...........................................................................*........................................................................... + // add v8.4s, v8.4s, v9.4s // ..~........................................................................'..*........................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............~..............................................................'............*.............................................................. + // mul v9.4s, v24.4s, v0.s[0] // .........~.................................................................'.........*................................................................. + // mls v9.4s, v27.4s, v29.4s // ....................~......................................................'....................*...................................................... + // sub v24.4s, v10.4s, v11.4s // .........................................................................e.'.........................................................................~. + // add v10.4s, v10.4s, v11.4s // ........................................................................e..'........................................................................~.. 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // ....~......................................................................'....*...................................................................... + // mul v11.4s, v24.4s, v0.s[2] // .....~.....................................................................'.....*..................................................................... + // mls v11.4s, v27.4s, v29.4s // ...................~.......................................................'...................*....................................................... + // sub v24.4s, v8.4s, v10.4s // ........~..................................................................'........*.................................................................. + // add v8.4s, v8.4s, v10.4s // .......~...................................................................'.......*................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..........................~................................................'..........................*................................................ + // mul v10.4s, v24.4s, v1.s[0] // .........................................................~.................'.........................................................*................. + // mls v10.4s, v27.4s, v29.4s // .................................................................~.........'.................................................................*......... + // sub v24.4s, v9.4s, v11.4s // .....................................~.....................................'.....................................*..................................... + // add v9.4s, v9.4s, v11.4s // ......................................~....................................'......................................*.................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // ............................................~..............................'............................................*.............................. + // mul v11.4s, v24.4s, v1.s[0] // ..........................................~................................'..........................................*................................ + // mls v11.4s, v27.4s, v29.4s // ......................................................~....................'......................................................*.................... + // srshr v24.4S, v8.4S, #23 // ....................................~......................................'....................................*...................................... + // mls v8.4s, v24.4s, v29.4s // ...........................................................~...............'...........................................................*............... + // srshr v24.4S, v9.4S, #23 // ...........................................~...............................'...........................................*............................... + // mls v9.4s, v24.4s, v29.4s // ....................................................~......................'....................................................*...................... + // str q8, [x0], #(16*4) // ..................................................................~........'..................................................................*........ + // str q9, [x0, #(-16*4 + 1*16)] // ...................................................................~.......'...................................................................*....... 
+ // str q10, [x0, #(-16*4 + 2*16)] // ..........................................................................~'..........................................................................* + // str q11, [x0, #(-16*4 + 3*16)] // .............................................................~.............'.............................................................*............. sub count, count, #1 cbnz count, layer5678_start - sub v23.4S, v22.4S, v5.4S // .*......................... - sqrdmulh v21.4S, v25.4S, v17.S[1] // ..*........................ - // gap // ........................... - // gap // ........................... - add v3.4S, v31.4S, v18.4S // *.......................... - mul v11.4S, v25.4S, v17.S[0] // ....*...................... - // gap // ........................... - // gap // ........................... - sqrdmulh v7.4S, v23.4S, v17.S[3] // .....*..................... - mul v8.4S, v23.4S, v17.S[2] // ......*.................... - // gap // ........................... - // gap // ........................... - add v23.4S, v22.4S, v5.4S // ...*....................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v11.4S, v21.4S, v29.4S // .........*................. - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v8.4S, v7.4S, v29.4S // ........*.................. - add v10.4S, v3.4S, v23.4S // .......*................... - // gap // ........................... - // gap // ........................... - sub v12.4S, v3.4S, v23.4S // ...........*............... - // gap // ........................... - // gap // ........................... - // gap // ........................... - srshr v23.4S, v10.4S, #23 // ...............*........... - // gap // ........................... - // gap // ........................... - // gap // ........................... 
- sub v18.4S, v11.4S, v8.4S // ..........*................ - mul v21.4S, v12.4S, v30.S[0] // ...................*....... - // gap // ........................... - // gap // ........................... - add v3.4S, v11.4S, v8.4S // ............*.............. - sqrdmulh v11.4S, v12.4S, v30.S[1] // ..............*............ - // gap // ........................... - // gap // ........................... - sqrdmulh v2.4S, v18.4S, v30.S[1] // ..................*........ - mul v18.4S, v18.4S, v30.S[0] // ................*.......... - // gap // ........................... - // gap // ........................... - mls v10.4S, v23.4S, v29.4S // ....................*...... - srshr v30.4S, v3.4S, #23 // .............*............. - // gap // ........................... - // gap // ........................... - mls v21.4S, v11.4S, v29.4S // .......................*... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v18.4S, v2.4S, v29.4S // .....................*..... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v3.4S, v30.4S, v29.4S // .................*......... - str q10, [x0], #(16*4) // ......................*.... - // gap // ........................... - // gap // ........................... - str q21, [x0, #-32] // ..........................* - // gap // ........................... - // gap // ........................... - // gap // ........................... - str q18, [x0, #-16] // .........................*. - // gap // ........................... - // gap // ........................... - // gap // ........................... - str q3, [x0, #-48] // ........................*.. - // gap // ........................... - // gap // ........................... - // gap // ........................... 
+ // Instructions: 26 + // Expected cycles: 18 + // Expected IPC: 1.44 + // + // Wall time: 0.29s + // User time: 0.29s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v21.4S, v11.4S, v17.4S // *............................. + add v22.4S, v11.4S, v17.4S // .*............................ + // gap // .............................. + // gap // .............................. + sqrdmulh v18.4S, v10.4S, v19.S[3] // ..*........................... + mul v8.4S, v10.4S, v19.S[2] // ...*.......................... + // gap // .............................. + // gap // .............................. + mul v9.4S, v21.4S, v19.S[0] // ......*....................... + sqrdmulh v21.4S, v21.4S, v19.S[1] // .......*...................... + // gap // .............................. + // gap // .............................. + sub v2.4S, v22.4S, v13.4S // .....*........................ + add v22.4S, v22.4S, v13.4S // ....*......................... + // gap // .............................. + // gap // .............................. + mls v8.4S, v18.4S, v29.4S // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v9.4S, v21.4S, v29.4S // .........*.................... + sqrdmulh v21.4S, v2.4S, v1.S[1] // ..........*................... + // gap // .............................. + // gap // .............................. + mul v18.4S, v2.4S, v1.S[0] // ...................*.......... + srshr v2.4S, v22.4S, #23 // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v3.4S, v9.4S, v8.4S // ............*................. + add v8.4S, v9.4S, v8.4S // .............*................ 
+ // gap // .............................. + // gap // .............................. + mls v22.4S, v2.4S, v29.4S // ....................*......... + mls v18.4S, v21.4S, v29.4S // ......................*....... + // gap // .............................. + // gap // .............................. + mul v21.4S, v3.4S, v1.S[0] // ..............*............... + sqrdmulh v1.4S, v3.4S, v1.S[1] // ................*............. + // gap // .............................. + // gap // .............................. + srshr v9.4S, v8.4S, #23 // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q22, [x0], #(16*4) // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q18, [x0, #-32] // .........................*.... + mls v21.4S, v1.4S, v29.4S // ..................*........... + // gap // .............................. + // gap // .............................. + mls v8.4S, v9.4S, v29.4S // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q21, [x0, #-16] // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q8, [x0, #-48] // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. - // original source code - // add v10.4S, v31.4S, v18.4S // ..*........................ - // sub v0.4S, v22.4S, v5.4S // *.......................... 
- // sqrdmulh v2.4S, v25.4S, v17.S[1] // .*......................... - // add v5.4S, v22.4S, v5.4S // ......*.................... - // mul v8.4S, v25.4S, v17.S[0] // ...*....................... - // sqrdmulh v19.4S, v0.4S, v17.S[3] // ....*...................... - // mul v9.4S, v0.4S, v17.S[2] // .....*..................... - // add v0.4S, v10.4S, v5.4S // .........*................. - // mls v9.4S, v19.4S, v29.4S // ........*.................. - // mls v8.4S, v2.4S, v29.4S // .......*................... - // sub v2.4S, v8.4S, v9.4S // ............*.............. - // sub v15.4S, v10.4S, v5.4S // ..........*................ - // add v12.4S, v8.4S, v9.4S // ..............*............ - // srshr v9.4S, v12.4S, #23 // ...................*....... - // sqrdmulh v27.4S, v15.4S, v30.S[1] // ...............*........... - // srshr v18.4S, v0.4S, #23 // ...........*............... - // mul v13.4S, v2.4S, v30.S[0] // .................*......... - // mls v12.4S, v9.4S, v29.4S // ......................*.... - // sqrdmulh v2.4S, v2.4S, v30.S[1] // ................*.......... - // mul v19.4S, v15.4S, v30.S[0] // .............*............. - // mls v0.4S, v18.4S, v29.4S // ..................*........ - // mls v13.4S, v2.4S, v29.4S // .....................*..... - // str q0, [x0], #(16*4) // .......................*... - // mls v19.4S, v27.4S, v29.4S // ....................*...... - // str q12, [x0, #-48] // ..........................* - // str q13, [x0, #-16] // .........................*. - // str q19, [x0, #-32] // ........................*.. + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v31.4S, v11.4S, v17.4S // *.............................. + // add v18.4S, v11.4S, v17.4S // .*............................. + // sqrdmulh v30.4S, v10.4S, v19.S[3] // ..*............................ + // mul v8.4S, v10.4S, v19.S[2] // ...*........................... + // add v9.4S, v18.4S, v13.4S // .......*....................... 
+ // sub v23.4S, v18.4S, v13.4S // ......*........................ + // mul v18.4S, v31.4S, v19.S[0] // ....*.......................... + // sqrdmulh v17.4S, v31.4S, v19.S[1] // .....*......................... + // mls v8.4S, v30.4S, v29.4S // ........*...................... + // mls v18.4S, v17.4S, v29.4S // .........*..................... + // sqrdmulh v24.4S, v23.4S, v1.S[1] // ..........*.................... + // srshr v11.4S, v9.4S, #23 // ............*.................. + // sub v14.4S, v18.4S, v8.4S // .............*................. + // add v3.4S, v18.4S, v8.4S // ..............*................ + // mul v2.4S, v14.4S, v1.S[0] // .................*............. + // srshr v28.4S, v3.4S, #23 // ...................*........... + // sqrdmulh v30.4S, v14.4S, v1.S[1] // ..................*............ + // mls v3.4S, v28.4S, v29.4S // .......................*....... + // mls v2.4S, v30.4S, v29.4S // ......................*........ + // mul v8.4S, v23.4S, v1.S[0] // ...........*................... + // mls v9.4S, v11.4S, v29.4S // ...............*............... + // str q2, [x0, #48] // ........................*...... + // mls v8.4S, v24.4S, v29.4S // ................*.............. + // str q9, [x0], #(16*4) // ....................*.......... + // str q3, [x0, #-48] // .........................*..... + // str q8, [x0, #-32] // .....................*......... .unreq root0_tw @@ -858,853 +882,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q9, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
- ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... - ldr q10, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... - ldr q11, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - ldr q15, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q13, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q18, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. 
+ ldr q14, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + ldr q16, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q27, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. - ldr q16, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - sub v14.4S, v20.4S, v9.4S // ................*....................................................................................................................................................................................................................................................................... 
- add v13.4S, v20.4S, v9.4S // .................*...................................................................................................................................................................................................................................................................... - ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... - sub v22.4S, v10.4S, v11.4S // .....................*.................................................................................................................................................................................................................................................................. - ldr q28, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ - ldr q24, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... - mul v27.4S, v14.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... 
- sqrdmulh v14.4S, v14.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... - sqrdmulh v9.4S, v22.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... - mul v8.4S, v22.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ - ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... - add v17.4S, v12.4S, v16.4S // ...........................*............................................................................................................................................................................................................................................................ - sub v21.4S, v12.4S, v16.4S // ..........................*............................................................................................................................................................................................................................................................. 
- ldr q16, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. - mls v27.4S, v14.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... - sub v20.4S, v19.4S, v28.4S // ...............................*........................................................................................................................................................................................................................................................ - ldr q18, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - mls v8.4S, v9.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - mul v9.4S, v21.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... 
- ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. - mul v12.4S, v20.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v21.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - sqrdmulh v21.4S, v20.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - add v20.4S, v15.4S, v24.4S // ...............................................*........................................................................................................................................................................................................................................ - add v10.4S, v10.4S, v11.4S // ......................*................................................................................................................................................................................................................................................................. 
- sub v24.4S, v15.4S, v24.4S // ..............................................*......................................................................................................................................................................................................................................... - add v15.4S, v27.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... - mls v12.4S, v21.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... - mls v9.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... - mul v21.4S, v24.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v24.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... 
- sub v24.4S, v13.4S, v10.4S // ........................................................*............................................................................................................................................................................................................................... - sub v11.4S, v27.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... - add v27.4S, v9.4S, v12.4S // ........................................................................*............................................................................................................................................................................................................... - sub v12.4S, v9.4S, v12.4S // .......................................................................*................................................................................................................................................................................................................ - mls v21.4S, v14.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - sub v14.4S, v23.4S, v16.4S // ....................................*................................................................................................................................................................................................................................................... 
- add v8.4S, v13.4S, v10.4S // .........................................................*.............................................................................................................................................................................................................................. - sub v13.4S, v15.4S, v27.4S // .....................................................................................................*.................................................................................................................................................................................. - add v9.4S, v15.4S, v27.4S // ......................................................................................................*................................................................................................................................................................................. - mul v27.4S, v14.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. - mul v15.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - sqrdmulh v10.4S, v14.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ 
- sqrdmulh v14.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - add v24.4S, v22.4S, v18.4S // ..........................................*............................................................................................................................................................................................................................................. - add v28.4S, v19.4S, v28.4S // ................................*....................................................................................................................................................................................................................................................... - sqrdmulh v19.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... - mls v27.4S, v10.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... - sub v22.4S, v22.4S, v18.4S // .........................................*.............................................................................................................................................................................................................................................. 
- mls v15.4S, v14.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - sub v14.4S, v17.4S, v28.4S // ..................................................................*..................................................................................................................................................................................................................... - sqrdmulh v18.4S, v22.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... - mul v10.4S, v22.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ - sqrdmulh v22.4S, v14.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - mul v14.4S, v14.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... 
- add v23.4S, v23.4S, v16.4S // .....................................*.................................................................................................................................................................................................................................................. - add v28.4S, v17.4S, v28.4S // ...................................................................*.................................................................................................................................................................................................................... - mls v10.4S, v18.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - sqrdmulh v18.4S, v12.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. - mls v14.4S, v22.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - mul v22.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
- mul v11.4S, v12.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - add v16.4S, v23.4S, v24.4S // .............................................................................*.......................................................................................................................................................................................................... - sub v12.4S, v27.4S, v10.4S // .................................................................................*...................................................................................................................................................................................................... - add v17.4S, v27.4S, v10.4S // ..................................................................................*..................................................................................................................................................................................................... - mls v22.4S, v19.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... - add v10.4S, v15.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ 
- mul v19.4S, v12.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... - sqrdmulh v27.4S, v12.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... - mls v11.4S, v18.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ - add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... - sub v28.4S, v15.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. 
- ldr q18, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ - ldr q14, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + ldr q15, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + ldr q12, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q23, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v28.4S, v13.4S, v18.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ sub v18.4S, v13.4S, v18.4S // ..........................*............................................................................................................................................................................................................................................................. + add v21.4S, v16.4S, v14.4S // ..........................................*............................................................................................................................................................................................................................................. + sub v14.4S, v16.4S, v14.4S // .........................................*.............................................................................................................................................................................................................................................. + ldr q16, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sub v20.4S, v19.4S, v27.4S // ...............................*........................................................................................................................................................................................................................................................ + add v8.4S, v19.4S, v27.4S // ................................*....................................................................................................................................................................................................................................................... 
+ ldr q11, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + mul v13.4S, v18.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... + ldr q10, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + add v22.4S, v15.4S, v23.4S // ....................................................*................................................................................................................................................................................................................................... + sub v23.4S, v15.4S, v23.4S // ...................................................*.................................................................................................................................................................................................................................... 
+ ldr q9, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + mul v19.4S, v14.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v14.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + mls v13.4S, v18.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sub v24.4S, v11.4S, v12.4S // ................*....................................................................................................................................................................................................................................................................... + ldr q18, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. 
+ sub v14.4S, v16.4S, v10.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v27.4S, v24.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... + add v10.4S, v16.4S, v10.4S // ......................*................................................................................................................................................................................................................................................................. + add v16.4S, v9.4S, v18.4S // .....................................*.................................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v14.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + mul v14.4S, v14.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... 
+ add v11.4S, v11.4S, v12.4S // .................*...................................................................................................................................................................................................................................................................... + sub v12.4S, v16.4S, v21.4S // ............................................................................*........................................................................................................................................................................................................... + sub v9.4S, v9.4S, v18.4S // ....................................*................................................................................................................................................................................................................................................... + add v16.4S, v16.4S, v21.4S // .............................................................................*.......................................................................................................................................................................................................... + mls v14.4S, v15.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + add v21.4S, v11.4S, v10.4S // .........................................................*.............................................................................................................................................................................................................................. 
+ sqrdmulh v18.4S, v9.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. + mul v9.4S, v9.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ + sub v15.4S, v11.4S, v10.4S // ........................................................*............................................................................................................................................................................................................................... + sub v11.4S, v28.4S, v8.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v10.4S, v24.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + add v28.4S, v28.4S, v8.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ mls v19.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v9.4S, v18.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mul v24.4S, v15.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ + mul v8.4S, v20.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v20.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... 
+ add v17.4S, v9.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + sqrdmulh v20.4S, v12.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... + mul v18.4S, v12.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + sub v19.4S, v9.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + mls v8.4S, v10.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sub v9.4S, v27.4S, v14.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ sqrdmulh v10.4S, v15.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. + add v12.4S, v27.4S, v14.4S // ..............................................................*......................................................................................................................................................................................................................... + mul v15.4S, v9.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + sqrdmulh v14.4S, v9.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + sub v27.4S, v13.4S, v8.4S // .......................................................................*................................................................................................................................................................................................................ + add v13.4S, v13.4S, v8.4S // ........................................................................*............................................................................................................................................................................................................... 
+ ldr q9, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + ldr q8, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v18.4S, v20.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + mls v15.4S, v14.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mls v24.4S, v10.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v20.4S, v23.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. 
+ mul v10.4S, v23.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + mul v23.4S, v27.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. + sqrdmulh v14.4S, v27.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. + add v27.4S, v8.4S, v9.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v8.4S, v8.4S, v9.4S // ..............................................*......................................................................................................................................................................................................................................... + add v9.4S, v12.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. 
+ mls v10.4S, v20.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mul v20.4S, v8.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + mls v23.4S, v14.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sub v14.4S, v27.4S, v22.4S // ......................................................................................*................................................................................................................................................................................................. + sub v13.4S, v12.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v12.4S, v11.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. 
+ add v27.4S, v27.4S, v22.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v22.4S, v8.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v11.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... + add v8.4S, v21.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... + sub v21.4S, v21.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + sub v28.4S, v16.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... 
+ mls v20.4S, v22.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + add v22.4S, v16.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. + mls v12.4S, v11.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v11.4S, v15.4S, v23.4S // ................................................................................................................*....................................................................................................................................................................... + sqrdmulh v16.4S, v13.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + mul v13.4S, v13.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... 
+ sqrdmulh v27.4S, v19.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... + mul v19.4S, v19.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + sub v15.4S, v15.4S, v23.4S // ...............................................................................................................*........................................................................................................................................................................ + add v23.4S, v20.4S, v10.4S // ............................................................................................*........................................................................................................................................................................................... + sub v20.4S, v20.4S, v10.4S // ...........................................................................................*............................................................................................................................................................................................ + mls v13.4S, v16.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ mul v16.4S, v14.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v14.4S, v14.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... + add v10.4S, v24.4S, v12.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v12.4S, v24.4S, v12.4S // ..........................................................................................................*............................................................................................................................................................................. + add v24.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v23.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. 
mls v19.4S, v27.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - sub v15.4S, v23.4S, v24.4S // ............................................................................*........................................................................................................................................................................................................... - sqrdmulh v27.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... - sub v23.4S, v14.4S, v18.4S // ...................................................*.................................................................................................................................................................................................................................... - mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... - add v24.4S, v14.4S, v18.4S // ....................................................*................................................................................................................................................................................................................................... 
- sqrdmulh v18.4S, v23.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. - mul v23.4S, v23.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - sqrdmulh v14.4S, v13.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ - mls v12.4S, v27.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... - add v27.4S, v20.4S, v24.4S // .......................................................................................*................................................................................................................................................................................................ 
- sub v24.4S, v20.4S, v24.4S // ......................................................................................*................................................................................................................................................................................................. - mls v23.4S, v18.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ - mls v13.4S, v14.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. - mul v18.4S, v15.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ - sub v14.4S, v16.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... 
- sub v20.4S, v21.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ - add v16.4S, v16.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. - add v23.4S, v21.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... - mul v27.4S, v24.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... - mul v21.4S, v20.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... 
- mls v18.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... - add v15.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. - sub v17.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. - sqrdmulh v23.4S, v24.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. - mls v21.4S, v20.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ - sub v24.4S, v22.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ 
- mls v27.4S, v23.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. - add v20.4S, v19.4S, v21.4S // ....................................................................................................................................*................................................................................................................................................... - sub v23.4S, v9.4S, v15.4S // .............................................................................................................................................*.......................................................................................................................................... - add v9.4S, v9.4S, v15.4S // ..............................................................................................................................................*......................................................................................................................................... - mul v15.4S, v14.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. - sub v21.4S, v19.4S, v21.4S // ...................................................................................................................................*.................................................................................................................................................... 
- sqrdmulh v19.4S, v14.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ - sub v14.4S, v18.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... - add v18.4S, v18.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ - add v22.4S, v22.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... - mul v27.4S, v14.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... 
- mls v15.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... - add v11.4S, v22.4S, v20.4S // ........................................................................................................................................................*............................................................................................................................... - sub v19.4S, v22.4S, v20.4S // .......................................................................................................................................................*................................................................................................................................ - mul v20.4S, v17.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - sqrdmulh v17.4S, v17.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... - mls v27.4S, v14.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
- sub v22.4S, v12.4S, v15.4S // ............................................................................................................................................................*........................................................................................................................... - sqrdmulh v14.4S, v23.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - add v12.4S, v12.4S, v15.4S // .............................................................................................................................................................*.......................................................................................................................... - sub v15.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... - mls v20.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - mul v17.4S, v23.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
- add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... - sqrdmulh v23.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - mul v18.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + sub v27.4S, v8.4S, v22.4S // ........................................................................................................................................*............................................................................................................................................... + sub v17.4S, v9.4S, v24.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ add v9.4S, v9.4S, v24.4S // ..............................................................................................................................................*......................................................................................................................................... + add v8.4S, v8.4S, v22.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v16.4S, v14.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v22.4S, v23.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + mul v23.4S, v23.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... + sqrdmulh v24.4S, v17.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ 
+ mul v17.4S, v17.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... + mul v14.4S, v12.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + mls v23.4S, v22.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sqrdmulh v22.4S, v12.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... + mls v17.4S, v24.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + add v24.4S, v18.4S, v16.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v16.4S, v18.4S, v16.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v12.4S, v15.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... + mls v14.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v22.4S, v15.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... + mul v15.4S, v20.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... + sqrdmulh v18.4S, v20.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... 
+ mul v20.4S, v21.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + mls v12.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v22.4S, v10.4S, v24.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v15.4S, v18.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v18.4S, v27.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. 
+ mul v27.4S, v27.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + add v10.4S, v10.4S, v24.4S // ...................................................................................................................................................*.................................................................................................................................... + mls v20.4S, v21.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sqrdmulh v21.4S, v28.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. + sub v24.4S, v19.4S, v15.4S // ...................................................................................................................................*.................................................................................................................................................... + add v15.4S, v19.4S, v15.4S // ....................................................................................................................................*................................................................................................................................................... 
+ mul v19.4S, v28.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ sub count, count, #1 layer1234_start: - sqrdmulh v24.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - sqrdmulh v23.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - mul v22.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
- mul v14.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - sqrdmulh v28.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - mls v15.4S, v24.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - mls v22.4S, v23.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - mul v23.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - sqrdmulh v24.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
- sub v21.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... - add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. - mls v14.4S, v28.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - cmge v28.4S, v31.4S, v22.4S // ................................................................................................................................................................................................*....................................................................................... - cmge v16.4S, v22.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v23.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
- sqrdmulh v24.4S, v21.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - sub v28.4S, v28.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... - mul v16.4S, v21.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - add v21.4S, v14.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ - mls v22.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
- sub v27.4S, v14.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. - cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... - str q22, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - sub v22.4S, v14.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. 
- cmge v14.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - add v28.4S, v15.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... - sub v15.4S, v15.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ - mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - sub v24.4S, v24.4S, v14.4S // ..........................................................................................................................................................................................*............................................................................................. - sub v14.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... 
- add v23.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - cmge v22.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mul v13.4S, v14.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. 
- mul v17.4S, v23.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - sqrdmulh v23.4S, v23.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - sqrdmulh v14.4S, v14.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - sqrdmulh v24.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mul v27.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
- mls v17.4S, v23.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - mls v13.4S, v14.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - mls v27.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. 
- sub v12.4S, v22.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... - mul v22.4S, v21.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - sqrdmulh v18.4S, v21.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - sqrdmulh v23.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - mul v21.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - cmge v24.4S, v27.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
- cmge v14.4S, v31.4S, v27.4S // ........................................................................................................................................................................................................*............................................................................... - mls v16.4S, v12.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - mul v12.4S, v28.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sub v15.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................................*............................................................................. - mls v21.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - mul v24.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
- sqrdmulh v28.4S, v28.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - sqrdmulh v16.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - sqrdmulh v19.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v27.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
- mls v12.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - mul v9.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................................*................... - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... 
- cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - mls v15.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - mls v9.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - cmge v14.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - sub v19.4S, v28.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. - cmge v28.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................*................................................................................... 
- cmge v23.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... - sub v14.4S, v28.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. - str q27, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - cmge v27.4S, v21.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - cmge v11.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
- cmge v28.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... - mls v13.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - mls v17.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - cmge v14.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - sub v19.4S, v28.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
- str q13, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - str q17, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. - cmge v28.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... - cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
- cmge v8.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - cmge v13.4S, v31.4S, v21.4S // ............................................................................................................................................................................................................*........................................................................... - sub v17.4S, v23.4S, v17.4S // ..............................................................................................................................................................................................................................................................*......................... - sub v8.4S, v11.4S, v8.4S // ..................................................................................................................................................................................................................................................*..................................... - cmge v11.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... 
- mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - cmge v23.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mls v16.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - sub v8.4S, v11.4S, v14.4S // ......................................................................................................................................................................................................................................................*................................. - sub v17.4S, v13.4S, v27.4S // ..............................................................................................................................................................................................................*......................................................................... - cmge v27.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... 
- ldr q14, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - ldr q11, [x1, #0] // e....................................................................................................................................................................................................................................................................................... - cmge v13.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - str q15, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
- cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - sub v27.4S, v27.4S, v16.4S // ..................................................................................................................................................................................................................................................................*..................... - sqrdmulh v16.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - sub v10.4S, v28.4S, v23.4S // ..............................................................................................................................................................................................*......................................................................................... - ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - mls v12.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
- ldr q19, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ - mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - mls v24.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - add v10.4S, v11.4S, v14.4S // .................e...................................................................................................................................................................................................................................................................... - mls v18.4S, v16.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - sub v11.4S, v11.4S, v14.4S // ................e....................................................................................................................................................................................................................................................................... 
- ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - mls v21.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - ldr q17, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... - sub v13.4S, v13.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. - sqrdmulh v23.4S, v11.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
- str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... - ldr q24, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. - mul v16.4S, v11.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... - sub v27.4S, v17.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. - add v15.4S, v24.4S, v19.4S // ................................e....................................................................................................................................................................................................................................................... - ldr q11, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... 
- ldr q20, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... - mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - add v12.4S, v17.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ - mul v13.4S, v27.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v27.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
- add v17.4S, v12.4S, v15.4S // ...................................................................e.................................................................................................................................................................................................................... - mls v9.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - str q21, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ - add v27.4S, v11.4S, v20.4S // ......................e................................................................................................................................................................................................................................................................. - sub v24.4S, v24.4S, v19.4S // ...............................e........................................................................................................................................................................................................................................................ - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
- ldr q19, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ - mls v16.4S, v23.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... - sub v8.4S, v11.4S, v20.4S // .....................e.................................................................................................................................................................................................................................................................. - mls v13.4S, v14.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - sub v14.4S, v12.4S, v15.4S // ..................................................................e..................................................................................................................................................................................................................... - ldr q15, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... 
- sqrdmulh v22.4S, v8.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - mul v21.4S, v8.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - ldr q11, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... - add v23.4S, v10.4S, v27.4S // .........................................................e.............................................................................................................................................................................................................................. - sqrdmulh v12.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - sub v10.4S, v10.4S, v27.4S // ........................................................e............................................................................................................................................................................................................................... 
- cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - ldr q8, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. - mul v24.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... - str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... - cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - sub v9.4S, v28.4S, v19.4S // .........................................e.............................................................................................................................................................................................................................................. 
- mls v21.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. - sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................................................................................*............................. - add v27.4S, v28.4S, v19.4S // ..........................................e............................................................................................................................................................................................................................................. - mul v19.4S, v9.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - mls v24.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
- sub v12.4S, v16.4S, v21.4S // .............................................................e.......................................................................................................................................................................................................................... - add v21.4S, v16.4S, v21.4S // ..............................................................e......................................................................................................................................................................................................................... - add v16.4S, v15.4S, v8.4S // .....................................e.................................................................................................................................................................................................................................................. - sub v22.4S, v13.4S, v24.4S // .......................................................................e................................................................................................................................................................................................................ - add v20.4S, v13.4S, v24.4S // ........................................................................e............................................................................................................................................................................................................... - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
- add v18.4S, v16.4S, v27.4S // .............................................................................e.......................................................................................................................................................................................................... - sqrdmulh v28.4S, v9.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... - sub v13.4S, v21.4S, v20.4S // .....................................................................................................e.................................................................................................................................................................................. - add v9.4S, v21.4S, v20.4S // ......................................................................................................e................................................................................................................................................................................. - mul v24.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ - mul v21.4S, v22.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. 
- mls v19.4S, v28.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - sqrdmulh v20.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... - ldr q12, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... - sqrdmulh v28.4S, v13.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - sub v27.4S, v16.4S, v27.4S // ............................................................................e........................................................................................................................................................................................................... 
- mls v13.4S, v28.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - sqrdmulh v16.4S, v14.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - mul v28.4S, v14.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... - mul v14.4S, v10.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. - sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ - mls v24.4S, v20.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... 
- sub v15.4S, v15.4S, v8.4S // ....................................e................................................................................................................................................................................................................................................... - sqrdmulh v8.4S, v22.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. - mul v22.4S, v27.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - mls v28.4S, v16.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. - sqrdmulh v20.4S, v27.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ - mls v14.4S, v10.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
- mul v27.4S, v15.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - add v16.4S, v11.4S, v12.4S // ...............................................e........................................................................................................................................................................................................................................ - mls v21.4S, v8.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ - sqrdmulh v8.4S, v15.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ - sub v15.4S, v11.4S, v12.4S // ..............................................e......................................................................................................................................................................................................................................... - mls v22.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
- add v10.4S, v14.4S, v28.4S // ...........................................................................................................e............................................................................................................................................................................ - add v11.4S, v24.4S, v21.4S // ................................................................................................................e....................................................................................................................................................................... - sub v28.4S, v14.4S, v28.4S // ..........................................................................................................e............................................................................................................................................................................. - mls v27.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - ldr q14, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... - sub v24.4S, v24.4S, v21.4S // ...............................................................................................................e........................................................................................................................................................................ 
- ldr q20, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - mul v21.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... - sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... - mls v21.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - sub v8.4S, v27.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... - add v19.4S, v27.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... 
- add v12.4S, v14.4S, v20.4S // ....................................................e................................................................................................................................................................................................................................... - sqrdmulh v27.4S, v8.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - mul v15.4S, v8.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... - sub v20.4S, v14.4S, v20.4S // ...................................................e.................................................................................................................................................................................................................................... - sub v14.4S, v23.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... - add v8.4S, v23.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... 
- sub v17.4S, v16.4S, v12.4S // ......................................................................................e................................................................................................................................................................................................. - add v12.4S, v16.4S, v12.4S // .......................................................................................e................................................................................................................................................................................................ - mls v15.4S, v27.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. - mul v23.4S, v17.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. - mul v27.4S, v20.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
- add v16.4S, v18.4S, v12.4S // .....................................................................................................................e.................................................................................................................................................................. - sub v12.4S, v18.4S, v12.4S // ....................................................................................................................e................................................................................................................................................................... - sqrdmulh v18.4S, v20.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - mls v23.4S, v17.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. - sqrdmulh v17.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... - mul v20.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... 
- sqrdmulh v14.4S, v12.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - mls v27.4S, v18.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ - mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. - add v18.4S, v22.4S, v23.4S // ...............................................................................................................................e........................................................................................................................................................ - sub v22.4S, v22.4S, v23.4S // ..............................................................................................................................e......................................................................................................................................................... - mls v20.4S, v17.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
- sub v23.4S, v10.4S, v18.4S // ..................................................................................................................................................e..................................................................................................................................... - add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................e.................................................................................................................................... - sub v17.4S, v21.4S, v27.4S // ...........................................................................................e............................................................................................................................................................................................ - add v27.4S, v21.4S, v27.4S // ............................................................................................e........................................................................................................................................................................................... - sqrdmulh v21.4S, v22.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... - mls v12.4S, v14.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
- sqrdmulh v14.4S, v17.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... - mul v18.4S, v17.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... - add v17.4S, v19.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. - sub v19.4S, v19.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. - mul v27.4S, v22.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... - sub v22.4S, v20.4S, v12.4S // ............................................................................................................................................................e........................................................................................................................... 
- add v12.4S, v20.4S, v12.4S // .............................................................................................................................................................e.......................................................................................................................... - mls v18.4S, v14.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - sub v14.4S, v9.4S, v17.4S // .............................................................................................................................................e.......................................................................................................................................... - add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................e......................................................................................................................................... - sqrdmulh v17.4S, v19.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - mul v20.4S, v19.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ 
- mls v27.4S, v21.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... - sub v21.4S, v15.4S, v18.4S // ...................................................................................................................................e.................................................................................................................................................... - add v15.4S, v15.4S, v18.4S // ....................................................................................................................................e................................................................................................................................................... - mul v18.4S, v23.4S, v0.S[0] // ....................................................................................................................................................e................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v0.S[1] // .....................................................................................................................................................e.................................................................................................................................. - sub v19.4S, v11.4S, v15.4S // .......................................................................................................................................................e................................................................................................................................ 
- add v11.4S, v11.4S, v15.4S // ........................................................................................................................................................e............................................................................................................................... - mls v20.4S, v17.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... - mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... - mul v17.4S, v14.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... 
+ // Instructions: 280 + // Expected cycles: 70 + // Expected IPC: 4.00 + + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + cmge v28.4S, v17.4S, v30.4S // ..*..................................................................................................................................................................................................................................................................................... + mls v19.4S, v21.4S, v29.4S // .*...................................................................................................................................................................................................................................................................................... + cmge v21.4S, v31.4S, v17.4S // ...*.................................................................................................................................................................................................................................................................................... + mls v27.4S, v18.4S, v29.4S // *....................................................................................................................................................................................................................................................................................... 
+ mul v18.4S, v22.4S, v0.S[0] // ................................................*....................................................................................................................................................................................................................................... + sub v21.4S, v21.4S, v28.4S // .....*.................................................................................................................................................................................................................................................................................. + sqrdmulh v28.4S, v24.4S, v1.S[1] // ....*................................................................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // .........................................*.............................................................................................................................................................................................................................................. + mul v24.4S, v24.4S, v1.S[0] // ......*................................................................................................................................................................................................................................................................................. + mls v17.4S, v21.4S, v29.4S // .........*.............................................................................................................................................................................................................................................................................. 
+ sub v21.4S, v13.4S, v23.4S // ...............*........................................................................................................................................................................................................................................................................ + add v13.4S, v13.4S, v23.4S // ................*....................................................................................................................................................................................................................................................................... + cmge v23.4S, v27.4S, v30.4S // ...................................*.................................................................................................................................................................................................................................................... + mls v24.4S, v28.4S, v29.4S // ............*........................................................................................................................................................................................................................................................................... + mul v28.4S, v21.4S, v0.S[0] // ........................................*............................................................................................................................................................................................................................................... + mls v18.4S, v22.4S, v29.4S // ......................................................*................................................................................................................................................................................................................................. 
+ cmge v22.4S, v31.4S, v27.4S // ..................................*..................................................................................................................................................................................................................................................... + str q17, [x1, #576] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v21.4S, v21.4S, v0.S[1] // ...............................................*........................................................................................................................................................................................................................................ + sub v17.4S, v11.4S, v15.4S // ..........*............................................................................................................................................................................................................................................................................. + sub v22.4S, v22.4S, v23.4S // .......................................*................................................................................................................................................................................................................................................ + sub v23.4S, v12.4S, v24.4S // .................*...................................................................................................................................................................................................................................................................... 
+ add v11.4S, v11.4S, v15.4S // ...........*............................................................................................................................................................................................................................................................................ + add v15.4S, v12.4S, v24.4S // ....................*................................................................................................................................................................................................................................................................... + mls v27.4S, v22.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v24.4S, v23.4S, v0.S[1] // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v17.4S, v0.S[1] // .............*.......................................................................................................................................................................................................................................................................... + mul v23.4S, v23.4S, v0.S[0] // ......................*................................................................................................................................................................................................................................................................. 
+ str q27, [x1, #512] // .................................................*...................................................................................................................................................................................................................................... + mls v23.4S, v24.4S, v29.4S // ...............................*........................................................................................................................................................................................................................................................ + sub v12.4S, v20.4S, v19.4S // .........................*.............................................................................................................................................................................................................................................................. + mls v28.4S, v21.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + add v24.4S, v20.4S, v19.4S // ........................*............................................................................................................................................................................................................................................................... + mul v20.4S, v12.4S, v0.S[0] // ..............................*......................................................................................................................................................................................................................................................... 
+ sqrdmulh v27.4S, v12.4S, v0.S[1] // .............................*.......................................................................................................................................................................................................................................................... + cmge v12.4S, v31.4S, v23.4S // .................................................................................................*...................................................................................................................................................................................... + cmge v19.4S, v23.4S, v30.4S // ...................................................................................................*.................................................................................................................................................................................... + mul v21.4S, v16.4S, v1.S[0] // ....................................*................................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v16.4S, v1.S[1] // .................................*...................................................................................................................................................................................................................................................... + sub v19.4S, v12.4S, v19.4S // ......................................................................................................*................................................................................................................................................................................. 
+ mls v20.4S, v27.4S, v29.4S // .....................................*.................................................................................................................................................................................................................................................. + mul v12.4S, v24.4S, v25.4S // ...................................................*.................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v24.4S, v26.4S // ..........................................*............................................................................................................................................................................................................................................. + mls v21.4S, v16.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // ..............*......................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v8.4S, v26.4S // ...............................................................*........................................................................................................................................................................................................................ 
+ mls v23.4S, v19.4S, v29.4S // .................................................................................................................*...................................................................................................................................................................... + mls v12.4S, v27.4S, v29.4S // ..........................................................*............................................................................................................................................................................................................................. + mul v27.4S, v8.4S, v25.4S // ..................................................*..................................................................................................................................................................................................................................... + cmge v19.4S, v31.4S, v28.4S // ............................................................*........................................................................................................................................................................................................................... + mls v17.4S, v22.4S, v29.4S // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v10.4S, v26.4S // .......*................................................................................................................................................................................................................................................................................ 
+ cmge v16.4S, v28.4S, v30.4S // .............................................................*.......................................................................................................................................................................................................................... + mul v10.4S, v10.4S, v25.4S // ........*............................................................................................................................................................................................................................................................................... + mls v27.4S, v24.4S, v29.4S // ..........................................................................*............................................................................................................................................................................................................. + add v24.4S, v14.4S, v21.4S // ........................................................*............................................................................................................................................................................................................................... + sub v16.4S, v19.4S, v16.4S // .................................................................*...................................................................................................................................................................................................................... + sub v21.4S, v14.4S, v21.4S // .....................................................*.................................................................................................................................................................................................................................. 
+ mul v14.4S, v24.4S, v25.4S // ...................................................................*.................................................................................................................................................................................................................... + str q23, [x1, #960] // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v19.4S, v11.4S, v26.4S // ...........................................................................*............................................................................................................................................................................................................ + mls v28.4S, v16.4S, v29.4S // ....................................................................*................................................................................................................................................................................................................... + mul v11.4S, v11.4S, v25.4S // ...................................................................................*.................................................................................................................................................................................................... + cmge v23.4S, v31.4S, v27.4S // ..................................................................................*..................................................................................................................................................................................................... 
+ sqrdmulh v22.4S, v21.4S, v0.S[1] // .....................................................................*.................................................................................................................................................................................................................. + sqrdmulh v16.4S, v24.4S, v26.4S // ................................................................................*....................................................................................................................................................................................................... + mul v21.4S, v21.4S, v0.S[0] // .......................................................................*................................................................................................................................................................................................................ + mls v11.4S, v19.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + mul v24.4S, v13.4S, v25.4S // .............................................*.......................................................................................................................................................................................................................................... + str q28, [x1, #832] // .............................................................................*.......................................................................................................................................................................................................... 
+ cmge v28.4S, v27.4S, v30.4S // ...............................................................................................................*........................................................................................................................................................................ + sqrdmulh v19.4S, v9.4S, v26.4S // ..........................................................................................*............................................................................................................................................................................................. + mul v9.4S, v9.4S, v25.4S // ...........................................................................................*............................................................................................................................................................................................ + mls v14.4S, v16.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v16.4S, v23.4S, v28.4S // ..........................................................................................................................*............................................................................................................................................................. + mls v21.4S, v22.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ cmge v23.4S, v20.4S, v30.4S // .........................................................*.............................................................................................................................................................................................................................. + mls v9.4S, v19.4S, v29.4S // ................................................................................................*....................................................................................................................................................................................... + sqrdmulh v19.4S, v13.4S, v26.4S // ............................................*........................................................................................................................................................................................................................................... + mls v10.4S, v8.4S, v29.4S // ..................*..................................................................................................................................................................................................................................................................... + mls v27.4S, v16.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... + cmge v8.4S, v17.4S, v30.4S // ..........................*............................................................................................................................................................................................................................................................. 
+ cmge v22.4S, v31.4S, v17.4S // ...........................*............................................................................................................................................................................................................................................................ + cmge v16.4S, v31.4S, v21.4S // ..............................................................................................*......................................................................................................................................................................................... + sqrdmulh v13.4S, v15.4S, v26.4S // ........................................................................*............................................................................................................................................................................................................... + mul v15.4S, v15.4S, v25.4S // .........................................................................*.............................................................................................................................................................................................................. + str q27, [x1], #(16) // .....................................................................................................................................*.................................................................................................................................................. + cmge v28.4S, v12.4S, v30.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ cmge v27.4S, v31.4S, v12.4S // ..............................................................................*......................................................................................................................................................................................................... + sub v8.4S, v22.4S, v8.4S // ............................*........................................................................................................................................................................................................................................................... + cmge v22.4S, v21.4S, v30.4S // .............................................................................................*.......................................................................................................................................................................................... + mls v15.4S, v13.4S, v29.4S // ...............................................................................*........................................................................................................................................................................................................ + sub v27.4S, v27.4S, v28.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v13.4S, v16.4S, v22.4S // ..................................................................................................*..................................................................................................................................................................................... 
+ cmge v22.4S, v31.4S, v20.4S // ...........................................................*............................................................................................................................................................................................................................ + mls v12.4S, v27.4S, v29.4S // .............................................................................................................*.......................................................................................................................................................................... + cmge v28.4S, v31.4S, v11.4S // .......................................................................................................................*................................................................................................................................................................ + cmge v27.4S, v15.4S, v30.4S // .........................................................................................*.............................................................................................................................................................................................. + cmge v16.4S, v31.4S, v15.4S // .......................................................................................*................................................................................................................................................................................................ + mls v21.4S, v13.4S, v29.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ sub v22.4S, v22.4S, v23.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v23.4S, v16.4S, v27.4S // ............................................................................................*........................................................................................................................................................................................... + str q12, [x1, #240] // ..................................................................................................................*..................................................................................................................................................................... + cmge v13.4S, v11.4S, v30.4S // ................................................................................................................*....................................................................................................................................................................... + cmge v27.4S, v31.4S, v10.4S // ............................................................................................................*........................................................................................................................................................................... + cmge v12.4S, v10.4S, v30.4S // ........................................................................................................*............................................................................................................................................................................... 
+ str q21, [x1, #880] // ....................................................................................................................*................................................................................................................................................................... + sub v21.4S, v28.4S, v13.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v17.4S, v8.4S, v29.4S // ................................*....................................................................................................................................................................................................................................................... + ldr q13, [x1, #256] // ............................................................................................................................................*........................................................................................................................................... + sub v8.4S, v27.4S, v12.4S // ................................................................................................................................*....................................................................................................................................................... + mls v20.4S, v22.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... 
+ cmge v12.4S, v18.4S, v30.4S // ...............................................................................................................................*........................................................................................................................................................ + ldr q28, [x1, #704] // ..............................................................................................................................................*......................................................................................................................................... + ldr q27, [x1, #640] // ...............................................................................................................................................*........................................................................................................................................ + cmge v22.4S, v31.4S, v18.4S // ......................................................................................................................*................................................................................................................................................................. + mls v24.4S, v19.4S, v29.4S // ....................................................*................................................................................................................................................................................................................................... + ldr q19, [x1, #576] // .......................................................................................................................................................................*................................................................................................................ 
+ mls v11.4S, v21.4S, v29.4S // ....................................................................................................................................*................................................................................................................................................... + ldr q16, [x1, #512] // ..................................................................................................................................................................*..................................................................................................................... + sub v22.4S, v22.4S, v12.4S // ..................................................................................................................................*..................................................................................................................................................... + mls v10.4S, v8.4S, v29.4S // ......................................................................................................................................*................................................................................................................................................. + ldr q21, [x1, #320] // .............................................................................................................................................*.......................................................................................................................................... + str q17, [x1, #688] // ......................................*................................................................................................................................................................................................................................................. 
+ cmge v12.4S, v14.4S, v30.4S // ....................................................................................................*................................................................................................................................................................................... + str q20, [x1, #752] // ......................................................................................*................................................................................................................................................................................................. + add v20.4S, v27.4S, v28.4S // .......................................................................................................................................................*................................................................................................................................ + str q11, [x1, #176] // .........................................................................................................................................*.............................................................................................................................................. + cmge v8.4S, v24.4S, v30.4S // ..................................................................*..................................................................................................................................................................................................................... + add v11.4S, v16.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ cmge v17.4S, v31.4S, v24.4S // ................................................................*....................................................................................................................................................................................................................... + str q10, [x1, #112] // ..........................................................................................................................................*............................................................................................................................................. + sub v27.4S, v27.4S, v28.4S // ........................................................................................................................................................*............................................................................................................................... + sub v28.4S, v16.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... + add v16.4S, v11.4S, v20.4S // .................................................................................................................................................................................*...................................................................................................... + sub v10.4S, v17.4S, v8.4S // ......................................................................*................................................................................................................................................................................................................. 
+ mls v18.4S, v22.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + sqrdmulh v19.4S, v28.4S, v5.S[3] // ....................................................................................................................................................................................*................................................................................................... + cmge v8.4S, v31.4S, v14.4S // .......................................................................................................*................................................................................................................................................................................ + mls v24.4S, v10.4S, v29.4S // ............................................................................*........................................................................................................................................................................................................... + str q24, [x1, #304] // ........................................................................................*............................................................................................................................................................................................... + ldr q24, [x1, #128] // .........................................................................................................................................................*.............................................................................................................................. 
+ mls v15.4S, v23.4S, v29.4S // ............................................................................................................................*........................................................................................................................................................... + mul v28.4S, v28.4S, v5.S[2] // .....................................................................................................................................................................................*.................................................................................................. + sub v22.4S, v8.4S, v12.4S // ...........................................................................................................*............................................................................................................................................................................ + ldr q8, [x1, #384] // .................................................................................................................................................*...................................................................................................................................... + ldr q10, [x1, #448] // ................................................................................................................................................*....................................................................................................................................... + sub v23.4S, v11.4S, v20.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ sub v20.4S, v13.4S, v21.4S // ......................................................................................................................................................*................................................................................................................................. + add v11.4S, v13.4S, v21.4S // .....................................................................................................................................................*.................................................................................................................................. + str q18, [x1, #624] // ...........................................................................................................................................*............................................................................................................................................ + str q15, [x1, #432] // ...................................................................................................................................*.................................................................................................................................................... + cmge v17.4S, v31.4S, v9.4S // .....................................................................................................................*.................................................................................................................................................................. + mls v14.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ ldr q15, [x1, #960] // ....................................................................................................................................................*................................................................................................................................... + cmge v12.4S, v9.4S, v30.4S // ...................................................................................................................*.................................................................................................................................................................... + mul v21.4S, v27.4S, v6.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sub v22.4S, v8.4S, v10.4S // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v18.4S, v27.4S, v6.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v27.4S, v17.4S, v12.4S // ...........................................................................................................................*............................................................................................................................................................ 
+ ldr q17, [x1, #192] // ...............................................................................................................................................................*........................................................................................................................ + mul v13.4S, v20.4S, v4.S[2] // .............................................................................................................................................................*.......................................................................................................................... + str q14, [x1, #368] // ........................................................................................................................*............................................................................................................................................................... + add v12.4S, v8.4S, v10.4S // ...........................................................................................................................................................*............................................................................................................................ + ldr q14, [x1, #896] // ..................................................................................................................................................*..................................................................................................................................... + mul v10.4S, v22.4S, v5.S[0] // .............................................................................................................................................................................................*.......................................................................................... 
+ mls v9.4S, v27.4S, v29.4S // .................................................................................................................................*...................................................................................................................................................... + mls v21.4S, v18.4S, v29.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v20.4S, v20.4S, v4.S[3] // ..............................................................................................................................................................*......................................................................................................................... + sub v18.4S, v24.4S, v17.4S // ........................................................................................................................................................................*............................................................................................................... + ldr q8, [x1, #64] // ...................................................................................................................................................*.................................................................................................................................... + ldr q27, [x1, #0] // ............................................................................................................................................................*........................................................................................................................... 
+ add v24.4S, v24.4S, v17.4S // ..........................................................................................................................................................................*............................................................................................................. + mls v28.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + str q9, [x1, #48] // .......................................................................................................................................*................................................................................................................................................ + sqrdmulh v19.4S, v18.4S, v4.S[1] // ............................................................................................................................................................................*........................................................................................................... + mls v13.4S, v20.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + sqrdmulh v17.4S, v22.4S, v5.S[1] // ...............................................................................................................................................................................................*........................................................................................ 
+ mul v20.4S, v18.4S, v4.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v22.4S, v27.4S, v8.4S // ......................................................................................................................................................................*................................................................................................................. + mul v18.4S, v23.4S, v2.S[2] // ..................................................................................................................................................................................................*..................................................................................... + add v9.4S, v27.4S, v8.4S // ..............................................................................................................................................................................*......................................................................................................... + mul v8.4S, v22.4S, v3.S[2] // .........................................................................................................................................................................*.............................................................................................................. + sqrdmulh v22.4S, v22.4S, v3.S[3] // ........................................................................................................................................................................................*............................................................................................... 
+ mls v10.4S, v17.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + mls v20.4S, v19.4S, v29.4S // ..................................................................................................................................................................................*..................................................................................................... + sub v19.4S, v28.4S, v21.4S // ...................................................................................................................................................................................................*.................................................................................... + sub v27.4S, v14.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + add v17.4S, v28.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + mls v8.4S, v22.4S, v29.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ sub v28.4S, v11.4S, v12.4S // .......................................................................................................................................................................................*................................................................................................ + add v22.4S, v14.4S, v15.4S // ................................................................................................................................................................*....................................................................................................................... + add v21.4S, v9.4S, v24.4S // ...................................................................................................................................................................................*.................................................................................................... + sqrdmulh v14.4S, v23.4S, v2.S[3] // .................................................................................................................................................................................................*...................................................................................... + mul v23.4S, v28.4S, v2.S[0] // .............................................................................................................................................................................................................................*.......................................................... + add v11.4S, v11.4S, v12.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ mls v18.4S, v14.4S, v29.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v14.4S, v28.4S, v2.S[1] // ................................................................................................................................................................................................................................*....................................................... + sub v15.4S, v13.4S, v10.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v12.4S, v27.4S, v7.S[1] // .................................................................................................................................................................................................................*...................................................................... + sub v24.4S, v9.4S, v24.4S // ......................................................................................................................................................................................*................................................................................................. + mls v23.4S, v14.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
+ sub v9.4S, v8.4S, v20.4S // .....................................................................................................................................................................................................*.................................................................................. + add v14.4S, v13.4S, v10.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v10.4S, v27.4S, v7.S[0] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v13.4S, v9.4S, v1.S[3] // .........................................................................................................................................................................................................*.............................................................................. + mul v27.4S, v9.4S, v1.S[2] // ........................................................................................................................................................................................................*............................................................................... + sqrdmulh v28.4S, v15.4S, v2.S[1] // ....................................................................................................................................................................................................................*................................................................... 
+ mul v9.4S, v15.4S, v2.S[0] // ...................................................................................................................................................................................................................*.................................................................... + mul v15.4S, v24.4S, v1.S[2] // ............................................................................................................................................................................................*........................................................................................... + mls v10.4S, v12.4S, v29.4S // ........................................................................................................................................................................................................................*............................................................... + ldr q12, [x1, #832] // ............................................................................................................................................................................................................*........................................................................... + mls v27.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sqrdmulh v13.4S, v24.4S, v1.S[3] // ......................................................................................................................................................................................................*................................................................................. 
+ ldr q24, [x1, #768] // .............................................................................................................................................................................................................*.......................................................................... + mls v9.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + add v28.4S, v8.4S, v20.4S // .......................................................................................................................................................................................................*................................................................................ + sub v20.4S, v21.4S, v11.4S // ..................................................................................................................................................................................................................................*..................................................... + add v8.4S, v21.4S, v11.4S // .................................................................................................................................................................................................................................*...................................................... + mls v15.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................*....................................................................... 
+ sub v13.4S, v28.4S, v14.4S // ............................................................................................................................................................................................................................*........................................................... + add v11.4S, v27.4S, v9.4S // .......................................................................................................................................................................................................................................*................................................ + sub v21.4S, v24.4S, v12.4S // ......................................................................................................................................................................................................................*................................................................. + add v24.4S, v24.4S, v12.4S // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v12.4S, v19.4S, v2.S[3] // ..........................................................................................................................................................................................................................................*............................................. + mul v19.4S, v19.4S, v2.S[2] // ...........................................................................................................................................................................................................................................*............................................ 
+ sub v9.4S, v27.4S, v9.4S // ............................................................................................................................................................................................................................................*........................................... + add v14.4S, v28.4S, v14.4S // .......................................................................................................................................................................................................................*................................................................ + mul v27.4S, v21.4S, v6.S[2] // .........................................................................................................................................................................................................................*.............................................................. + sqrdmulh v21.4S, v21.4S, v6.S[3] // ...............................................................................................................................................................................................................................*........................................................ + mls v19.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................................*................................. + add v28.4S, v24.4S, v22.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ sub v12.4S, v24.4S, v22.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v22.4S, v13.4S, v0.S[3] // ........................................................................................................................................................................................................................................*............................................... + mul v13.4S, v13.4S, v0.S[2] // .........................................................................................................................................................................................................................................*.............................................. + add v24.4S, v16.4S, v28.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v28.4S, v16.4S, v28.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v27.4S, v21.4S, v29.4S // ....................................................................................................................................................................................................................................*................................................... 
+ mul v16.4S, v12.4S, v3.S[0] // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v12.4S, v12.4S, v3.S[1] // .................................................................................................................................................................................................................................................*...................................... + mls v13.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v22.4S, v15.4S, v23.4S // ...................................................................................................................................................................................................................................................*.................................... + add v21.4S, v27.4S, v10.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v27.4S, v27.4S, v10.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ add v10.4S, v15.4S, v23.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v16.4S, v12.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + add v15.4S, v17.4S, v21.4S // ....................................................................................................................................................................................................................................................*................................... + sub v23.4S, v17.4S, v21.4S // .....................................................................................................................................................................................................................................................*.................................. + mul v12.4S, v9.4S, v0.S[2] // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v17.4S, v9.4S, v0.S[3] // ........................................................................................................................................................................................................................................................................*............... 
+ add v21.4S, v18.4S, v16.4S // ....................................................................................................................................................................................................................................................................*................... + sub v16.4S, v18.4S, v16.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v18.4S, v8.4S, v24.4S // .......................................................................................................................................................................................................................................................*................................ + add v8.4S, v8.4S, v24.4S // ..........................................................................................................................................................................................................................................................*............................. + sub v24.4S, v14.4S, v15.4S // ........................................................................................................................................................................................................................................................*............................... + add v9.4S, v14.4S, v15.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ mul v15.4S, v27.4S, v3.S[0] // .........................................................................................................................................................................................................................................................................*.............. + sqrdmulh v27.4S, v27.4S, v3.S[1] // ..........................................................................................................................................................................................................................................................................*............. + mul v14.4S, v22.4S, v0.S[2] // ................................................................................................................................................................................................................................................................*....................... + mls v12.4S, v17.4S, v29.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v17.4S, v23.4S, v1.S[1] // ............................................................................................................................................................................................................................................................*........................... + mul v23.4S, v23.4S, v1.S[0] // .............................................................................................................................................................................................................................................................*.......................... 
+ mls v15.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + sqrdmulh v27.4S, v22.4S, v0.S[3] // ..................................................................................................................................................................................................................................................................*..................... + sub v22.4S, v10.4S, v21.4S // ..............................................................................................................................................................................................................................................................................*......... + add v10.4S, v10.4S, v21.4S // ..................................................................................................................................................................................................................................................................................*..... + mls v23.4S, v17.4S, v29.4S // .................................................................................................................................................................................................................................................................*...................... + mul v17.4S, v24.4S, v0.S[0] // ...............................................................................................................................................................................................................................................................*........................ 
+ mls v14.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sqrdmulh v27.4S, v24.4S, v0.S[1] // ..............................................................................................................................................................................................................................................................*......................... + sub v24.4S, v19.4S, v15.4S // .....................................................................................................................................................................................................................................................................................*.. + add v15.4S, v19.4S, v15.4S // ......................................................................................................................................................................................................................................................................................*. + sqrdmulh v19.4S, v20.4S, v0.S[3] // ............................................................................................................................................................................................................................................................................*........... + mul v20.4S, v20.4S, v0.S[2] // ...........................................................................................................................................................................................................................................................................*............ 
+ mls v17.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mul v27.4S, v18.4S, v0.S[0] // .................................................................................................................................................................................................................................................................................*...... + sqrdmulh v21.4S, v28.4S, v1.S[1] // ....................................................................................................................................................................................................................................................................................*... + sqrdmulh v18.4S, v18.4S, v0.S[1] // ................................................................................................................................................................................................................................................................................*....... + mls v20.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................................................................*.... 
+ mul v19.4S, v28.4S, v1.S[0] // .......................................................................................................................................................................................................................................................................................* - // original source code - // ldr q8, [x1, #0] // ...e...................................................................................................................................................................|...................................................................................................................e.................................................................... - // ldr q9, [x1, #(1*(512/8))] // e......................................................................................................................................................................|................................................................................................................e....................................................................... - // ldr q10, [x1, #(2*(512/8))] // .............................e.........................................................................................................................................|.............................................................................................................................................e.......................................... - // ldr q11, [x1, #(3*(512/8))] // ..............................e........................................................................................................................................|..............................................................................................................................................e......................................... 
- // ldr q12, [x1, #(4*(512/8))] // .....................e.................................................................................................................................................|.....................................................................................................................................e.................................................. - // ldr q13, [x1, #(5*(512/8))] // ..................e....................................................................................................................................................|..................................................................................................................................e..................................................... - // ldr q14, [x1, #(6*(512/8))] // .........................e.............................................................................................................................................|.........................................................................................................................................e.............................................. - // ldr q15, [x1, #(7*(512/8))] // ............e..........................................................................................................................................................|............................................................................................................................e........................................................... - // ldr q16, [x1, #(8*(512/8))] // ...............................................e.......................................................................................................................|...............................................................................................................................................................e........................ 
- // ldr q17, [x1, #(9*(512/8))] // .......................................................e...............................................................................................................|.......................................................................................................................................................................e................ - // ldr q18, [x1, #(10*(512/8))] // ..........e............................................................................................................................................................|..........................................................................................................................e............................................................. - // ldr q19, [x1, #(11*(512/8))] // ..........................................e............................................................................................................................|..........................................................................................................................................................e............................. - // ldr q20, [x1, #(12*(512/8))] // ..................................................e....................................................................................................................|..................................................................................................................................................................e..................... - // ldr q21, [x1, #(13*(512/8))] // ................................................................................e......................................................................................|........................................................................................................................................................................................ 
- // ldr q22, [x1, #(14*(512/8))] // ..........................................................................................................e............................................................|........................................................................................................................................................................................ - // ldr q23, [x1, #(15*(512/8))] // ............................................................................................................e..........................................................|........................................................................................................................................................................................ - // sub v24.4s, v8.4s, v9.4s // .................e.....................................................................................................................................................|.................................................................................................................................e...................................................... - // add v8.4s, v8.4s, v9.4s // ...............e.......................................................................................................................................................|...............................................................................................................................e........................................................ - // mul v9.4s, v24.4s, v3.s[2] // ..........................e............................................................................................................................................|..........................................................................................................................................e............................................. 
- // sqrdmulh v24.4s, v24.4s, v3.s[3] // .......................e...............................................................................................................................................|.......................................................................................................................................e................................................ - // mls v9.4s, v24.4s, v29.4s // ...........................................e...........................................................................................................................|...........................................................................................................................................................e............................ - // sub v24.4s, v10.4s, v11.4s // ............................................e..........................................................................................................................|............................................................................................................................................................e........................... - // add v10.4s, v10.4s, v11.4s // .......................................e...............................................................................................................................|.......................................................................................................................................................e................................ - // mul v11.4s, v24.4s, v4.s[0] // .................................................e.....................................................................................................................|.................................................................................................................................................................e...................... 
- // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................e......................................................................................................................|................................................................................................................................................................e....................... - // mls v11.4s, v24.4s, v29.4s // ............................................................e..........................................................................................................|............................................................................................................................................................................e........... - // sub v24.4s, v12.4s, v13.4s // ...........................e...........................................................................................................................................|...........................................................................................................................................e............................................ - // add v12.4s, v12.4s, v13.4s // .................................e.....................................................................................................................................|.................................................................................................................................................e...................................... - // mul v13.4s, v24.4s, v4.s[2] // ..................................e....................................................................................................................................|..................................................................................................................................................e..................................... 
- // sqrdmulh v24.4s, v24.4s, v4.s[3] // ...................................e...................................................................................................................................|...................................................................................................................................................e.................................... - // mls v13.4s, v24.4s, v29.4s // .............................................e.........................................................................................................................|.............................................................................................................................................................e.......................... - // sub v24.4s, v14.4s, v15.4s // ........................................e..............................................................................................................................|........................................................................................................................................................e............................... - // add v14.4s, v14.4s, v15.4s // ............................e..........................................................................................................................................|............................................................................................................................................e........................................... - // mul v15.4s, v24.4s, v5.s[0] // ........................................................e..............................................................................................................|........................................................................................................................................................................e............... 
- // sqrdmulh v24.4s, v24.4s, v5.s[1] // ....................................................e..................................................................................................................|....................................................................................................................................................................e................... - // mls v15.4s, v24.4s, v29.4s // ................................................................e......................................................................................................|................................................................................................................................................................................e....... - // sub v24.4s, v16.4s, v17.4s // ..........................................................................................e............................................................................|........................................................................................................................................................................................ - // add v16.4s, v16.4s, v17.4s // ....................................................................e..................................................................................................|....................................................................................................................................................................................e... - // mul v17.4s, v24.4s, v5.s[2] // ................................................................................................e......................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...................................................................................................e...................................................................|........................................................................................................................................................................................ - // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e.............................................................|........................................................................................................................................................................................ - // sub v24.4s, v18.4s, v19.4s // ...........................................................e...........................................................................................................|...........................................................................................................................................................................e............ - // add v18.4s, v18.4s, v19.4s // ..............................................................e........................................................................................................|..............................................................................................................................................................................e......... - // mul v19.4s, v24.4s, v6.s[0] // ...............................................................e.......................................................................................................|...............................................................................................................................................................................e........ 
- // sqrdmulh v24.4s, v24.4s, v6.s[1] // .........................................................................e.............................................................................................|........................................................................................................................................................................................ - // mls v19.4s, v24.4s, v29.4s // ..............................................................................e........................................................................................|........................................................................................................................................................................................ - // sub v24.4s, v20.4s, v21.4s // ....................................................................................................e..................................................................|........................................................................................................................................................................................ - // add v20.4s, v20.4s, v21.4s // .................................................................................................e.....................................................................|........................................................................................................................................................................................ - // mul v21.4s, v24.4s, v6.s[2] // .............................................................................................................e.........................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v6.s[3] // ..............................................................................................................e........................................................|........................................................................................................................................................................................ - // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................e.......................................................|........................................................................................................................................................................................ - // sub v24.4s, v22.4s, v23.4s // .....................................................................................................................e.................................................|........................................................................................................................................................................................ - // add v22.4s, v22.4s, v23.4s // ..................................................................................................................e....................................................|........................................................................................................................................................................................ - // mul v23.4s, v24.4s, v7.s[0] // .............................................................................................................................e.........................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................e......................................|........................................................................................................................................................................................ - // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................e.................................|........................................................................................................................................................................................ - // sub v24.4s, v8.4s, v10.4s // .....................................................e.................................................................................................................|.....................................................................................................................................................................e.................. - // add v8.4s, v8.4s, v10.4s // ...................................................e...................................................................................................................|...................................................................................................................................................................e.................... - // mul v10.4s, v24.4s, v1.s[2] // .......................................................................................e...............................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................e..............................................................................|........................................................................................................................................................................................ - // mls v10.4s, v24.4s, v29.4s // ...............................................................................................e.......................................................................|........................................................................................................................................................................................ - // sub v24.4s, v9.4s, v11.4s // ..................................................................e....................................................................................................|..................................................................................................................................................................................e..... - // add v9.4s, v9.4s, v11.4s // ...................................................................e...................................................................................................|...................................................................................................................................................................................e.... - // mul v11.4s, v24.4s, v1.s[2] // ............................................................................e..........................................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................e.......................................................................................|........................................................................................................................................................................................ - // mls v11.4s, v24.4s, v29.4s // .........................................................................................e.............................................................................|........................................................................................................................................................................................ - // sub v24.4s, v12.4s, v14.4s // ..............................................e........................................................................................................................|..............................................................................................................................................................e......................... - // add v12.4s, v12.4s, v14.4s // ....................................e..................................................................................................................................|....................................................................................................................................................e................................... - // mul v14.4s, v24.4s, v2.s[0] // ......................................................................................e................................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................................e.................................................................................|........................................................................................................................................................................................ - // mls v14.4s, v24.4s, v29.4s // .............................................................................................e.........................................................................|........................................................................................................................................................................................ - // sub v24.4s, v13.4s, v15.4s // .....................................................................e.................................................................................................|.....................................................................................................................................................................................e.. - // add v13.4s, v13.4s, v15.4s // ......................................................................e................................................................................................|......................................................................................................................................................................................e. - // mul v15.4s, v24.4s, v2.s[0] // .............................................................................e.........................................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................e...........................................................................|........................................................................................................................................................................................ - // mls v15.4s, v24.4s, v29.4s // ..................................................................................................e....................................................................|........................................................................................................................................................................................ - // sub v24.4s, v16.4s, v18.4s // ...................................................................................e...................................................................................|........................................................................................................................................................................................ - // add v16.4s, v16.4s, v18.4s // ........................................................................e..............................................................................................|........................................................................................................................................................................................ - // mul v18.4s, v24.4s, v2.s[2] // ............................................................................................e..........................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................e........................................................................|........................................................................................................................................................................................ - // mls v18.4s, v24.4s, v29.4s // .....................................................................................................e.................................................................|........................................................................................................................................................................................ - // sub v24.4s, v17.4s, v19.4s // ................................................................................................................e......................................................|........................................................................................................................................................................................ - // add v17.4s, v17.4s, v19.4s // .................................................................................................................e.....................................................|........................................................................................................................................................................................ - // mul v19.4s, v24.4s, v2.s[2] // ....................................................................................................................e..................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e...................................................|........................................................................................................................................................................................ - // mls v19.4s, v24.4s, v29.4s // ..........................................................................................................................e............................................|........................................................................................................................................................................................ - // sub v24.4s, v20.4s, v22.4s // ........................................................................................................................e..............................................|........................................................................................................................................................................................ - // add v20.4s, v20.4s, v22.4s // .........................................................................................................................e.............................................|........................................................................................................................................................................................ - // mul v22.4s, v24.4s, v3.s[0] // ...........................................................................................................................e...........................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................................e..........................................|........................................................................................................................................................................................ - // mls v22.4s, v24.4s, v29.4s // .................................................................................................................................e.....................................|........................................................................................................................................................................................ - // sub v24.4s, v21.4s, v23.4s // ............................................................................................................................................e..........................|........................................................................................................................................................................................ - // add v21.4s, v21.4s, v23.4s // .............................................................................................................................................e.........................|........................................................................................................................................................................................ - // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................e.....................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e......................|........................................................................................................................................................................................ - // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................e...............|........................................................................................................................................................................................ - // sub v24.4s, v8.4s, v12.4s // ......................................................................................................................e................................................|........................................................................................................................................................................................ - // add v8.4s, v8.4s, v12.4s // .......................................................................................................................e...............................................|........................................................................................................................................................................................ - // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................................................e...................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................e....................................|........................................................................................................................................................................................ - // mls v12.4s, v24.4s, v29.4s // .........................................................................................................................................e.............................|........................................................................................................................................................................................ - // sub v24.4s, v9.4s, v13.4s // ..........................................................................e............................................................................................|........................................................................................................................................................................................ - // add v9.4s, v9.4s, v13.4s // ...........................................................................e...........................................................................................|........................................................................................................................................................................................ - // mul v13.4s, v24.4s, v0.s[2] // ..................................................................................e....................................................................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................e.....................................................................................|........................................................................................................................................................................................ - // mls v13.4s, v24.4s, v29.4s // ....................................................................................e..................................................................................|........................................................................................................................................................................................ - // sub v24.4s, v10.4s, v14.4s // ........................................................................................................e..............................................................|........................................................................................................................................................................................ - // add v10.4s, v10.4s, v14.4s // ......................................................................................................e................................................................|........................................................................................................................................................................................ - // mul v14.4s, v24.4s, v0.s[2] // .......................................................................................................................................................................|....*................................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................|.....*.................................................................................................................................................................................. - // mls v14.4s, v24.4s, v29.4s // .......................................................................................................................................................................|............*........................................................................................................................................................................... - // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................e...........................................................|........................................................................................................................................................................................ - // add v11.4s, v11.4s, v15.4s // .......................................................................................................e...............................................................|........................................................................................................................................................................................ - // mul v15.4s, v24.4s, v0.s[2] // ....................................................................................................................................................................e..|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................*........................................................................................................................................................................................ - // mls v15.4s, v24.4s, v29.4s // .......................................................................................................................................................................|......*................................................................................................................................................................................. - // sub v24.4s, v16.4s, v20.4s // ...............................................................................................................................e.......................................|........................................................................................................................................................................................ - // add v16.4s, v16.4s, v20.4s // ..............................................................................................................................e........................................|........................................................................................................................................................................................ - // mul v20.4s, v24.4s, v1.s[0] // ......................................................................................................................................e................................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e..................................|........................................................................................................................................................................................ - // mls v20.4s, v24.4s, v29.4s // ...............................................................................................................................................e.......................|........................................................................................................................................................................................ - // sub v24.4s, v17.4s, v21.4s // ...................................................................................................................................................e...................|........................................................................................................................................................................................ - // add v17.4s, v17.4s, v21.4s // ..................................................................................................................................................e....................|........................................................................................................................................................................................ - // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................e...........|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................................................e............|........................................................................................................................................................................................ - // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................e...|........................................................................................................................................................................................ - // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................................e..............................|........................................................................................................................................................................................ - // add v18.4s, v18.4s, v22.4s // .......................................................................................................................................e...............................|........................................................................................................................................................................................ - // mul v22.4s, v24.4s, v1.s[0] // ....................................................................................................................................................e..................|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................e........................|........................................................................................................................................................................................ - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................e..........|........................................................................................................................................................................................ - // sub v24.4s, v19.4s, v23.4s // .............................................................................................................................................................e.........|........................................................................................................................................................................................ - // add v19.4s, v19.4s, v23.4s // ..............................................................................................................................................................e........|........................................................................................................................................................................................ - // mul v23.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................|........*............................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................|.........*.............................................................................................................................................................................. - // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...............*........................................................................................................................................................................ - // sub v24.4s, v8.4s, v16.4s // .......................................................................................................................................................................|..........*............................................................................................................................................................................. - // add v8.4s, v8.4s, v16.4s // .......................................................................................................................................................................|...........*............................................................................................................................................................................ - // mul v16.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..................*..................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|................*....................................................................................................................................................................... - // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................*............................................................................................................................................................... - // sub v24.4s, v9.4s, v17.4s // ........................................................................................................................................................e..............|........................................................................................................................................................................................ - // add v9.4s, v9.4s, v17.4s // .........................................................................................................................................................e.............|........................................................................................................................................................................................ - // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................e.|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................e|........................................................................................................................................................................................ - // mls v17.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...*.................................................................................................................................................................................... - // sub v24.4s, v10.4s, v18.4s // ..........................................................................................................................................e............................|........................................................................................................................................................................................ - // add v10.4s, v10.4s, v18.4s // ...........................................................................................................................................e...........................|........................................................................................................................................................................................ - // mul v18.4s, v24.4s, v0.s[0] // ...............................................................................................................................................................e.......|........................................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................................e......|........................................................................................................................................................................................ - // mls v18.4s, v24.4s, v29.4s // .......................................................................................................................................................................|*....................................................................................................................................................................................... - // sub v24.4s, v11.4s, v19.4s // .................................................................................................................................................................e.....|........................................................................................................................................................................................ - // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................e....|........................................................................................................................................................................................ - // mul v19.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|...............................................................*........................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...................................................................*.................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................................................................*............................................................................................................... - // sub v24.4s, v12.4s, v20.4s // .....................................................................................................................................................e.................|........................................................................................................................................................................................ - // add v12.4s, v12.4s, v20.4s // ......................................................................................................................................................e................|........................................................................................................................................................................................ - // mul v20.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..*..................................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.*...................................................................................................................................................................................... - // mls v20.4s, v24.4s, v29.4s // .......................................................................................................................................................................|.......*................................................................................................................................................................................ - // sub v24.4s, v13.4s, v21.4s // .......................................................................................................................................................................|.................................*...................................................................................................................................................... - // add v13.4s, v13.4s, v21.4s // .......................................................................................................................................................................|..................................*..................................................................................................................................................... - // mul v21.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|......................................*................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...........................................*............................................................................................................................................ - // mls v21.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..................................................*..................................................................................................................................... - // sub v24.4s, v14.4s, v22.4s // .......................................................................................................................................................................|......................*................................................................................................................................................................. - // add v14.4s, v14.4s, v22.4s // .......................................................................................................................................................................|...................*.................................................................................................................................................................... - // mul v22.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|.............................................*.......................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|............................................*........................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...................................................*.................................................................................................................................... - // sub v24.4s, v15.4s, v23.4s // .......................................................................................................................................................................|..............................*......................................................................................................................................................... - // add v15.4s, v15.4s, v23.4s // .......................................................................................................................................................................|.............................*.......................................................................................................................................................... - // mul v23.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|........................................................*............................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.......................................................*................................................................................................................................ - // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..............................................................*......................................................................................................................... - // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|....................................*................................................................................................................................................... - // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|...............................................*........................................................................................................................................ - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|....................................................*................................................................................................................................... 
- // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...........................................................*............................................................................................................................ - // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.......................*................................................................................................................................................................ - // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|.....................*.................................................................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...........................*............................................................................................................................................................ - // mls v17.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...............................*........................................................................................................................................................ 
- // cmge v27.4s, v31.4s, v18.4s // .......................................................................................................................................................................|.........................*.............................................................................................................................................................. - // cmge v28.4s, v18.4s, v30.4s // .......................................................................................................................................................................|............................*........................................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................*....................................................................................................................................................... - // mls v18.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................*............................................................................................................................................. - // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................................*..................................................................................... 
- // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................................*............................................................................ - // sub v28.4s, v27.4s, v28.4s // .........*.............................................................................................................................................................|.........................................................................................................................*.............................................................. - // mls v19.4s, v28.4s, v29.4s // ..............*........................................................................................................................................................|..............................................................................................................................*......................................................... - // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|.............*.......................................................................................................................................................................... - // cmge v28.4s, v20.4s, v30.4s // .......................................................................................................................................................................|..............*......................................................................................................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.................*...................................................................................................................................................................... - // mls v20.4s, v28.4s, v29.4s // .......................................................................................................................................................................|....................*................................................................................................................................................................... - // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|.................................................................................*...................................................................................................... - // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|...............................................................................*........................................................................................................ - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...................................................................................*.................................................................................................... 
- // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.........................................................................................*.............................................................................................. - // cmge v27.4s, v31.4s, v22.4s // .......................................................................................................................................................................|..........................................................*............................................................................................................................. - // cmge v28.4s, v22.4s, v30.4s // .......................................................................................................................................................................|.........................................................*.............................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................*.......................................................................................................................... - // mls v22.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.....................................................................*.................................................................................................................. 
- // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|......................................................................................................*................................................................................. - // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|.....................................................................................*.................................................................................................. - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|..............................................................................................................*......................................................................... - // mls v23.4s, v28.4s, v29.4s // ....................*..................................................................................................................................................|....................................................................................................................................*................................................... - // str q16, [x1, #(8*(512/8))] // .......................................................................................................................................................................|.................................................................*...................................................................................................................... 
- // str q17, [x1, #(9*(512/8))] // .......................................................................................................................................................................|.....................................*.................................................................................................................................................. - // str q18, [x1, #(10*(512/8))] // .......................................................................................................................................................................|................................................*....................................................................................................................................... - // str q19, [x1, #(11*(512/8))] // ........................*..............................................................................................................................................|........................................................................................................................................*............................................... - // str q20, [x1, #(12*(512/8))] // .......................................................................................................................................................................|..........................*............................................................................................................................................................. - // str q21, [x1, #(13*(512/8))] // .......................................................................................................................................................................|..............................................................................................*......................................................................................... 
- // str q22, [x1, #(14*(512/8))] // .......................................................................................................................................................................|....................................................................................*................................................................................................... - // str q23, [x1, #(15*(512/8))] // ......................................*................................................................................................................................|......................................................................................................................................................*................................. - // mul v16.4s, v8.4s, v25.4s // .......................................................................................................................................................................|...........................................................................*............................................................................................................ - // sqrdmulh v8.4s, v8.4s, v26.4s // .......................................................................................................................................................................|.......................................................................................*................................................................................................ - // mls v16.4s, v8.4s, v29.4s // .......................................................................................................................................................................|.............................................................................................*.......................................................................................... 
- // mul v17.4s, v9.4s, v25.4s // .......................................................................................................................................................................|.........................................................................*.............................................................................................................. - // sqrdmulh v9.4s, v9.4s, v26.4s // .......................................................................................................................................................................|..................................................................*..................................................................................................................... - // mls v17.4s, v9.4s, v29.4s // .......................................................................................................................................................................|..............................................................................*......................................................................................................... - // mul v18.4s, v10.4s, v25.4s // .......................................................................................................................................................................|................................................................................................*....................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ........*..............................................................................................................................................................|........................................................................................................................*............................................................... 
- // mls v18.4s, v10.4s, v29.4s // ................*......................................................................................................................................................|................................................................................................................................*....................................................... - // mul v19.4s, v11.4s, v25.4s // .......................................................................................................................................................................|.......................................................................*................................................................................................................ - // sqrdmulh v11.4s, v11.4s, v26.4s // .......................................................................................................................................................................|....................................................................*................................................................................................................... - // mls v19.4s, v11.4s, v29.4s // .......................................................................................................................................................................|.............................................................................*.......................................................................................................... - // mul v20.4s, v12.4s, v25.4s // .......................................................................................................................................................................|...................................*.................................................................................................................................................... 
- // sqrdmulh v12.4s, v12.4s, v26.4s // .......................................................................................................................................................................|.......................................*................................................................................................................................................ - // mls v20.4s, v12.4s, v29.4s // .......................................................................................................................................................................|.................................................*...................................................................................................................................... - // mul v21.4s, v13.4s, v25.4s // .......................................................................................................................................................................|........................................*............................................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v26.4s // .......................................................................................................................................................................|.........................................*.............................................................................................................................................. - // mls v21.4s, v13.4s, v29.4s // .......................................................................................................................................................................|..............................................*......................................................................................................................................... 
- // mul v22.4s, v14.4s, v25.4s // .......................................................................................................................................................................|.....................................................*.................................................................................................................................. - // sqrdmulh v14.4s, v14.4s, v26.4s // .......................................................................................................................................................................|......................................................*................................................................................................................................. - // mls v22.4s, v14.4s, v29.4s // .......................................................................................................................................................................|...............................................................................................*........................................................................................ - // mul v23.4s, v15.4s, v25.4s // .......................................................................................................................................................................|............................................................*........................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v26.4s // .......................................................................................................................................................................|................................................................*....................................................................................................................... 
- // mls v23.4s, v15.4s, v29.4s // .......................................................................................................................................................................|......................................................................*................................................................................................................. - // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|.....................................................................................................*.................................................................................. - // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|....................................................................................................*................................................................................... - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|........................................................................................................*............................................................................... - // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|............................................................................................................*........................................................................... 
- // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.........................................................................................................*.............................................................................. - // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................*............................................................................................ - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................................................................*.......................................................................... - // mls v17.4s, v28.4s, v29.4s // .....................................*.................................................................................................................................|.....................................................................................................................................................*.................................. - // cmge v27.4s, v31.4s, v18.4s // ..........................................................*............................................................................................................|..........................................................................................................................................................................*............. 
- // cmge v28.4s, v18.4s, v30.4s // ......................................................*................................................................................................................|......................................................................................................................................................................*................. - // sub v28.4s, v27.4s, v28.4s // .............................................................*.........................................................................................................|.............................................................................................................................................................................*.......... - // mls v18.4s, v28.4s, v29.4s // .................................................................*.....................................................................................................|.................................................................................................................................................................................*...... - // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................*..................................................................................................... - // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...................................................................................................*.................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.......................................................................................................*................................................................................ - // mls v19.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................................*............................................................................. - // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|...............................................................................................................*........................................................................ - // cmge v28.4s, v20.4s, v30.4s // ..*....................................................................................................................................................................|..................................................................................................................*..................................................................... - // sub v28.4s, v27.4s, v28.4s // .......*...............................................................................................................................................................|.......................................................................................................................*................................................................ 
- // mls v20.4s, v28.4s, v29.4s // .............*.........................................................................................................................................................|.............................................................................................................................*.......................................................... - // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|..........................................................................*............................................................................................................. - // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|............................................................................*........................................................................................................... - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................................................................*....................................................................................................... - // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................*............................................................................................. 
- // cmge v27.4s, v31.4s, v22.4s // ....*..................................................................................................................................................................|....................................................................................................................*................................................................... - // cmge v28.4s, v22.4s, v30.4s // ......*................................................................................................................................................................|......................................................................................................................*................................................................. - // sub v28.4s, v27.4s, v28.4s // ......................*................................................................................................................................................|......................................................................................................................................*................................................. - // mls v22.4s, v28.4s, v29.4s // ...............................*.......................................................................................................................................|...............................................................................................................................................*........................................ - // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|........................................................................................*............................................................................................... 
- // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|......................................................................................*................................................................................................. - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|............................................................................................*........................................................................................... - // mls v23.4s, v28.4s, v29.4s // ...........*...........................................................................................................................................................|...........................................................................................................................*............................................................ - // str q16, [x1], #(16) // .*.....................................................................................................................................................................|.................................................................................................................*...................................................................... - // str q17, [x1, #(-16 + 1*(512/8))] // .........................................................*.............................................................................................................|.........................................................................................................................................................................*.............. 
- // str q18, [x1, #(-16 + 2*(512/8))] // .......................................................................*...............................................................................................|.......................................................................................................................................................................................* - // str q19, [x1, #(-16 + 3*(512/8))] // .....*.................................................................................................................................................................|.....................................................................................................................*.................................................................. - // str q20, [x1, #(-16 + 4*(512/8))] // ...................*...................................................................................................................................................|...................................................................................................................................*.................................................... - // str q21, [x1, #(-16 + 5*(512/8))] // .......................................................................................................................................................................|.................................................................................................*...................................................................................... - // str q22, [x1, #(-16 + 6*(512/8))] // .........................................*.............................................................................................................................|.........................................................................................................................................................*.............................. 
- // str q23, [x1, #(-16 + 7*(512/8))] // ................................*......................................................................................................................................|................................................................................................................................................*....................................... + // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // mls v27.4S, v18.4S, v29.4S // ...*.................................................................................................................................................................................................................................................................................... + // mls v19.4S, v21.4S, v29.4S // .*...................................................................................................................................................................................................................................................................................... + // cmge v28.4S, v17.4S, v30.4S // *....................................................................................................................................................................................................................................................................................... 
+ // cmge v21.4S, v31.4S, v17.4S // ..*..................................................................................................................................................................................................................................................................................... + // sqrdmulh v18.4S, v24.4S, v1.S[1] // ......*................................................................................................................................................................................................................................................................................. + // sub v21.4S, v21.4S, v28.4S // .....*.................................................................................................................................................................................................................................................................................. + // mul v24.4S, v24.4S, v1.S[0] // ........*............................................................................................................................................................................................................................................................................... + // sqrdmulh v28.4S, v10.4S, v26.4S // ...................................................*.................................................................................................................................................................................................................................... + // mul v10.4S, v10.4S, v25.4S // .....................................................*.................................................................................................................................................................................................................................. 
+ // mls v17.4S, v21.4S, v29.4S // .........*.............................................................................................................................................................................................................................................................................. + // sub v21.4S, v11.4S, v15.4S // ...................*.................................................................................................................................................................................................................................................................... + // add v11.4S, v11.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. + // mls v24.4S, v18.4S, v29.4S // .............*.......................................................................................................................................................................................................................................................................... + // sqrdmulh v15.4S, v21.4S, v0.S[1] // ..........................*............................................................................................................................................................................................................................................................. + // mul v21.4S, v21.4S, v0.S[0] // ............................................*........................................................................................................................................................................................................................................... 
+ // sub v18.4S, v13.4S, v23.4S // ..........*............................................................................................................................................................................................................................................................................. + // add v13.4S, v13.4S, v23.4S // ...........*............................................................................................................................................................................................................................................................................ + // sub v23.4S, v12.4S, v24.4S // .....................*.................................................................................................................................................................................................................................................................. + // mls v10.4S, v28.4S, v29.4S // ...............................................................................*........................................................................................................................................................................................................ + // mls v21.4S, v15.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + // add v15.4S, v12.4S, v24.4S // .......................*................................................................................................................................................................................................................................................................ 
+ // sqrdmulh v28.4S, v23.4S, v0.S[1] // .........................*.............................................................................................................................................................................................................................................................. + // mul v23.4S, v23.4S, v0.S[0] // ...........................*............................................................................................................................................................................................................................................................ + // str q17, [x1, #576] // .................*...................................................................................................................................................................................................................................................................... + // add v12.4S, v20.4S, v19.4S // ................................*....................................................................................................................................................................................................................................................... + // sub v20.4S, v20.4S, v19.4S // ..............................*......................................................................................................................................................................................................................................................... + // cmge v24.4S, v21.4S, v30.4S // .................................................................................*...................................................................................................................................................................................................... 
+ // cmge v19.4S, v31.4S, v21.4S // ..................................................................................*..................................................................................................................................................................................................... + // sub v24.4S, v19.4S, v24.4S // .........................................................................................*.............................................................................................................................................................................................. + // sqrdmulh v19.4S, v20.4S, v0.S[1] // ..................................*..................................................................................................................................................................................................................................................... + // mul v20.4S, v20.4S, v0.S[0] // .................................*...................................................................................................................................................................................................................................................... + // mls v23.4S, v28.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + // mls v21.4S, v24.4S, v29.4S // ............................................................................................................*........................................................................................................................................................................... 
+ // sqrdmulh v24.4S, v16.4S, v1.S[1] // ......................................*................................................................................................................................................................................................................................................. + // cmge v28.4S, v31.4S, v27.4S // ................*....................................................................................................................................................................................................................................................................... + // cmge v17.4S, v27.4S, v30.4S // ............*........................................................................................................................................................................................................................................................................... + // mul v16.4S, v16.4S, v1.S[0] // .....................................*.................................................................................................................................................................................................................................................. + // mls v20.4S, v19.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + // str q21, [x1, #704] // ...........................................................................................................................*............................................................................................................................................................ 
+ // sub v21.4S, v28.4S, v17.4S // ....................*................................................................................................................................................................................................................................................................... + // mul v28.4S, v18.4S, v0.S[0] // ..............*......................................................................................................................................................................................................................................................................... + // sqrdmulh v19.4S, v22.4S, v0.S[1] // .......*................................................................................................................................................................................................................................................................................ + // sqrdmulh v17.4S, v12.4S, v26.4S // ..........................................*............................................................................................................................................................................................................................................. + // mls v27.4S, v21.4S, v29.4S // ........................*............................................................................................................................................................................................................................................................... + // sqrdmulh v21.4S, v13.4S, v26.4S // ..............................................................................*......................................................................................................................................................................................................... 
+ // mul v13.4S, v13.4S, v25.4S // ....................................................................*................................................................................................................................................................................................................... + // mls v16.4S, v24.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ + // sqrdmulh v24.4S, v18.4S, v0.S[1] // ..................*..................................................................................................................................................................................................................................................................... + // mul v18.4S, v22.4S, v0.S[0] // ....*................................................................................................................................................................................................................................................................................... + // str q27, [x1, #512] // ............................*........................................................................................................................................................................................................................................................... + // mul v27.4S, v8.4S, v25.4S // ................................................*....................................................................................................................................................................................................................................... 
+ // mul v12.4S, v12.4S, v25.4S // .........................................*.............................................................................................................................................................................................................................................. + // mls v13.4S, v21.4S, v29.4S // ....................................................................................................................*................................................................................................................................................................... + // sub v21.4S, v14.4S, v16.4S // .........................................................*.............................................................................................................................................................................................................................. + // mls v18.4S, v19.4S, v29.4S // ...............*........................................................................................................................................................................................................................................................................ + // mls v28.4S, v24.4S, v29.4S // ...............................*........................................................................................................................................................................................................................................................ + // add v19.4S, v14.4S, v16.4S // .......................................................*................................................................................................................................................................................................................................ 
+ // cmge v14.4S, v20.4S, v30.4S // ............................................................................*........................................................................................................................................................................................................... + // mls v12.4S, v17.4S, v29.4S // ...............................................*........................................................................................................................................................................................................................................ + // cmge v17.4S, v31.4S, v20.4S // ..............................................................................................*......................................................................................................................................................................................... + // cmge v22.4S, v31.4S, v28.4S // .................................................*...................................................................................................................................................................................................................................... + // cmge v24.4S, v28.4S, v30.4S // ....................................................*................................................................................................................................................................................................................................... + // sub v16.4S, v17.4S, v14.4S // ....................................................................................................*................................................................................................................................................................................... 
+ // sqrdmulh v8.4S, v8.4S, v26.4S // .............................................*.......................................................................................................................................................................................................................................... + // cmge v17.4S, v31.4S, v13.4S // ..................................................................................................................................*..................................................................................................................................................... + // sub v24.4S, v22.4S, v24.4S // ........................................................*............................................................................................................................................................................................................................... + // cmge v22.4S, v13.4S, v30.4S // ................................................................................................................................*....................................................................................................................................................... + // mul v14.4S, v19.4S, v25.4S // ..........................................................*............................................................................................................................................................................................................................. + // mls v28.4S, v24.4S, v29.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ // sqrdmulh v24.4S, v21.4S, v0.S[1] // ................................................................*....................................................................................................................................................................................................................... + // sub v17.4S, v17.4S, v22.4S // .......................................................................................................................................*................................................................................................................................................ + // mul v21.4S, v21.4S, v0.S[0] // ..................................................................*..................................................................................................................................................................................................................... + // sqrdmulh v22.4S, v15.4S, v26.4S // ....................................................................................*................................................................................................................................................................................................... + // mul v15.4S, v15.4S, v25.4S // .....................................................................................*.................................................................................................................................................................................................. + // mls v27.4S, v8.4S, v29.4S // ......................................................*................................................................................................................................................................................................................................. 
+ // sqrdmulh v8.4S, v11.4S, v26.4S // ............................................................*........................................................................................................................................................................................................................... + // mls v13.4S, v17.4S, v29.4S // ...........................................................................................................................................*............................................................................................................................................ + // str q28, [x1, #832] // .....................................................................*.................................................................................................................................................................................................................. + // cmge v28.4S, v31.4S, v12.4S // ........................................................................................*............................................................................................................................................................................................... + // mls v15.4S, v22.4S, v29.4S // ...........................................................................................*............................................................................................................................................................................................ + // sqrdmulh v17.4S, v19.4S, v26.4S // .................................................................*...................................................................................................................................................................................................................... 
+ // mls v20.4S, v16.4S, v29.4S // ...............................................................................................................*........................................................................................................................................................................ + // cmge v16.4S, v31.4S, v27.4S // ...............................................................*........................................................................................................................................................................................................................ + // mul v11.4S, v11.4S, v25.4S // ..............................................................*......................................................................................................................................................................................................................... + // mls v11.4S, v8.4S, v29.4S // ...................................................................*.................................................................................................................................................................................................................... + // mls v21.4S, v24.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + // str q20, [x1, #768] // .............................................................................................................................*.......................................................................................................................................................... 
+ // cmge v8.4S, v31.4S, v15.4S // ..................................................................................................*..................................................................................................................................................................................... + // str q13, [x1, #320] // ............................................................................................................................................*........................................................................................................................................... + // cmge v22.4S, v15.4S, v30.4S // .................................................................................................*...................................................................................................................................................................................... + // sqrdmulh v24.4S, v9.4S, v26.4S // .......................................................................*................................................................................................................................................................................................................ + // mul v9.4S, v9.4S, v25.4S // ........................................................................*............................................................................................................................................................................................................... + // sub v22.4S, v8.4S, v22.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ // cmge v8.4S, v21.4S, v30.4S // ..........................................................................................*............................................................................................................................................................................................. + // cmge v20.4S, v31.4S, v21.4S // ...................................................................................*.................................................................................................................................................................................................... + // mls v14.4S, v17.4S, v29.4S // .........................................................................*.............................................................................................................................................................................................................. + // mls v9.4S, v24.4S, v29.4S // .............................................................................*.......................................................................................................................................................................................................... + // cmge v24.4S, v31.4S, v23.4S // ...................................*.................................................................................................................................................................................................................................................... + // sub v19.4S, v20.4S, v8.4S // .............................................................................................*.......................................................................................................................................................................................... 
+ // cmge v20.4S, v23.4S, v30.4S // ....................................*................................................................................................................................................................................................................................................... + // cmge v13.4S, v14.4S, v30.4S // ............................................................................................................................*........................................................................................................................................................... + // cmge v8.4S, v12.4S, v30.4S // .......................................................................................*................................................................................................................................................................................................ + // sub v17.4S, v24.4S, v20.4S // .......................................*................................................................................................................................................................................................................................................ + // cmge v24.4S, v31.4S, v14.4S // ..........................................................................................................................................*............................................................................................................................................. + // cmge v20.4S, v10.4S, v30.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ // sub v8.4S, v28.4S, v8.4S // ............................................................................................*........................................................................................................................................................................................... + // mls v21.4S, v19.4S, v29.4S // ...................................................................................................*.................................................................................................................................................................................... + // sub v28.4S, v24.4S, v13.4S // ................................................................................................................................................*....................................................................................................................................... + // cmge v13.4S, v31.4S, v10.4S // ........................................................................................................*............................................................................................................................................................................... + // mls v12.4S, v8.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + // mls v14.4S, v28.4S, v29.4S // .........................................................................................................................................................*.............................................................................................................................. 
+ // cmge v28.4S, v27.4S, v30.4S // ......................................................................*................................................................................................................................................................................................................. + // cmge v19.4S, v11.4S, v30.4S // .......................................................................................................*................................................................................................................................................................................ + // mls v23.4S, v17.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + // str q12, [x1, #256] // ......................................................................................................*................................................................................................................................................................................. + // cmge v12.4S, v9.4S, v30.4S // ...........................................................................................................................................................*............................................................................................................................ + // str q21, [x1, #896] // ..........................................................................................................*............................................................................................................................................................................. 
+ // cmge v21.4S, v31.4S, v9.4S // ........................................................................................................................................................*............................................................................................................................... + // cmge v24.4S, v31.4S, v18.4S // ...................................................................................................................*.................................................................................................................................................................... + // cmge v8.4S, v31.4S, v11.4S // ................................................................................................*....................................................................................................................................................................................... + // str q14, [x1, #384] // ..................................................................................................................................................................*..................................................................................................................... + // str q23, [x1, #960] // ...........................................................*............................................................................................................................................................................................................................ + // sub v16.4S, v16.4S, v28.4S // ..........................................................................*............................................................................................................................................................................................................. 
+ // sub v14.4S, v21.4S, v12.4S // ...............................................................................................................................................................*........................................................................................................................ + // mls v15.4S, v22.4S, v29.4S // ..............................................................................................................................................*......................................................................................................................................... + // sub v21.4S, v8.4S, v19.4S // ...........................................................................................................*............................................................................................................................................................................ + // mls v27.4S, v16.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + // cmge v22.4S, v18.4S, v30.4S // ................................................................................................................*....................................................................................................................................................................... + // sub v23.4S, v13.4S, v20.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ // mls v9.4S, v14.4S, v29.4S // ......................................................................................................................................................................*................................................................................................................. + // sub v17.4S, v24.4S, v22.4S // ........................................................................................................................*............................................................................................................................................................... + // str q15, [x1, #448] // .......................................................................................................................................................*................................................................................................................................ + // mls v11.4S, v21.4S, v29.4S // ......................................................................................................................*................................................................................................................................................................. + // str q27, [x1], #(16) // ......................................................................................*................................................................................................................................................................................................. + // mls v10.4S, v23.4S, v29.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ // str q9, [x1, #48] // ..............................................................................................................................................................................*......................................................................................................... + // mls v18.4S, v17.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + // str q11, [x1, #176] // ...............................................................................................................................*........................................................................................................................................................ + // str q10, [x1, #112] // ...................................................................................................................................*.................................................................................................................................................... + // str q18, [x1, #624] // ......................................................................................................................................................*................................................................................................................................. + // ldr q13, [x1, #256] // .............................................................................................................*.......................................................................................................................................................................... 
+ // ldr q18, [x1, #320] // ..........................................................................................................................*............................................................................................................................................................. + // ldr q14, [x1, #704] // .................................................................................................................*...................................................................................................................................................................... + // ldr q16, [x1, #640] // ..................................................................................................................*..................................................................................................................................................................... + // ldr q27, [x1, #448] // ..................................................................................................................................................*..................................................................................................................................... + // ldr q19, [x1, #384] // .................................................................................................................................................*...................................................................................................................................... + // ldr q15, [x1, #896] // ....................................................................................................................................................................*................................................................................................................... 
+ // ldr q12, [x1, #64] // ..........................................................................................................................................................................*............................................................................................................. + // ldr q23, [x1, #960] // ..........................................................................................................................................................*............................................................................................................................. + // add v28.4S, v13.4S, v18.4S // .....................................................................................................................................................*.................................................................................................................................. + // sub v18.4S, v13.4S, v18.4S // ....................................................................................................................................................*................................................................................................................................... + // add v21.4S, v16.4S, v14.4S // ..............................................................................................................................*......................................................................................................................................................... + // sub v14.4S, v16.4S, v14.4S // ....................................................................................................................................*................................................................................................................................................... 
+ // ldr q16, [x1, #128] // .............................................................................................................................................*.......................................................................................................................................... + // sub v20.4S, v19.4S, v27.4S // .............................................................................................................................................................*.......................................................................................................................... + // add v8.4S, v19.4S, v27.4S // ...................................................................................................................................................................*.................................................................................................................... + // ldr q11, [x1, #0] // ...........................................................................................................................................................................*............................................................................................................ + // mul v13.4S, v18.4S, v4.S[2] // .................................................................................................................................................................*...................................................................................................................... + // sqrdmulh v18.4S, v18.4S, v4.S[3] // ........................................................................................................................................................................*............................................................................................................... 
+ // ldr q10, [x1, #192] // ................................................................................................................................................................*....................................................................................................................... + // add v22.4S, v15.4S, v23.4S // ...............................................................................................................................................................................................*........................................................................................ + // sub v23.4S, v15.4S, v23.4S // ...........................................................................................................................................................................................*............................................................................................ + // ldr q9, [x1, #512] // .......................................................................................................................*................................................................................................................................................................ + // mul v19.4S, v14.4S, v6.S[0] // ............................................................................................................................................................*........................................................................................................................... + // sqrdmulh v17.4S, v14.4S, v6.S[1] // ..............................................................................................................................................................*......................................................................................................................... 
+ // mls v13.4S, v18.4S, v29.4S // ................................................................................................................................................................................*....................................................................................................... + // sub v24.4S, v11.4S, v12.4S // ...................................................................................................................................................................................*.................................................................................................... + // ldr q18, [x1, #576] // .....................................................................................................................*.................................................................................................................................................................. + // sub v14.4S, v16.4S, v10.4S // .........................................................................................................................................................................*.............................................................................................................. + // mul v27.4S, v24.4S, v3.S[2] // ......................................................................................................................................................................................*................................................................................................. + // add v10.4S, v16.4S, v10.4S // ............................................................................................................................................................................*........................................................................................................... 
+ // add v16.4S, v9.4S, v18.4S // .................................................................................................................................*...................................................................................................................................................... + // sqrdmulh v15.4S, v14.4S, v4.S[1] // ...............................................................................................................................................................................*........................................................................................................ + // mul v14.4S, v14.4S, v4.S[0] // ..................................................................................................................................................................................*..................................................................................................... + // add v11.4S, v11.4S, v12.4S // .....................................................................................................................................................................................*.................................................................................................. + // sub v12.4S, v16.4S, v21.4S // ...................................................................................................................................................*.................................................................................................................................... + // sub v9.4S, v9.4S, v18.4S // .....................................................................................................................................*.................................................................................................................................................. 
+ // add v16.4S, v16.4S, v21.4S // ......................................................................................................................................*................................................................................................................................................. + // mls v14.4S, v15.4S, v29.4S // .........................................................................................................................................................................................*.............................................................................................. + // add v21.4S, v11.4S, v10.4S // ................................................................................................................................................................................................*....................................................................................... + // sqrdmulh v18.4S, v9.4S, v5.S[3] // .........................................................................................................................................*.............................................................................................................................................. + // mul v9.4S, v9.4S, v5.S[2] // ...............................................................................................................................................*........................................................................................................................................ + // sub v15.4S, v11.4S, v10.4S // ........................................................................................................................................................................................................*............................................................................... 
+ // sub v11.4S, v28.4S, v8.4S // ..............................................................................................................................................................................................*......................................................................................... + // sqrdmulh v10.4S, v24.4S, v3.S[3] // .......................................................................................................................................................................................*................................................................................................ + // add v28.4S, v28.4S, v8.4S // ...................................................................................................................................................................................................*.................................................................................... + // mls v19.4S, v17.4S, v29.4S // .......................................................................................................................................................................*................................................................................................................ + // mls v9.4S, v18.4S, v29.4S // .............................................................................................................................................................................*.......................................................................................................... + // mul v24.4S, v15.4S, v1.S[2] // .................................................................................................................................................................................................................*...................................................................... 
+ // mul v8.4S, v20.4S, v5.S[0] // .....................................................................................................................................................................*.................................................................................................................. + // mls v27.4S, v10.4S, v29.4S // .............................................................................................................................................................................................*.......................................................................................... + // sqrdmulh v10.4S, v20.4S, v5.S[1] // .................................................................................................................................................................................*...................................................................................................... + // add v17.4S, v9.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + // sqrdmulh v20.4S, v12.4S, v2.S[3] // .................................................................................................................................................................................................*...................................................................................... + // mul v18.4S, v12.4S, v2.S[2] // ....................................................................................................................................................................................*................................................................................................... 
+ // sub v19.4S, v9.4S, v19.4S // ..........................................................................................................................................................................................*............................................................................................. + // mls v8.4S, v10.4S, v29.4S // ........................................................................................................................................................................................*............................................................................................... + // sub v9.4S, v27.4S, v14.4S // ..........................................................................................................................................................................................................*............................................................................. + // sqrdmulh v10.4S, v15.4S, v1.S[3] // .....................................................................................................................................................................................................................*.................................................................. + // add v12.4S, v27.4S, v14.4S // ........................................................................................................................................................................................................................*............................................................... + // mul v15.4S, v9.4S, v1.S[2] // ..............................................................................................................................................................................................................*......................................................................... 
+ // sqrdmulh v14.4S, v9.4S, v1.S[3] // .............................................................................................................................................................................................................*.......................................................................... + // sub v27.4S, v13.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. + // add v13.4S, v13.4S, v8.4S // ...........................................................................................................................................................................................................*............................................................................ + // ldr q9, [x1, #832] // ...................................................................................................................................................................................................................*.................................................................... + // ldr q8, [x1, #768] // ......................................................................................................................................................................................................................*................................................................. + // mls v18.4S, v20.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... 
+ // mls v15.4S, v14.4S, v29.4S // ....................................................................................................................................................................................................................*................................................................... + // mls v24.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................*............................................................ + // sqrdmulh v20.4S, v23.4S, v7.S[1] // .......................................................................................................................................................................................................*................................................................................ + // mul v10.4S, v23.4S, v7.S[0] // ............................................................................................................................................................................................................*........................................................................... + // mul v23.4S, v27.4S, v2.S[0] // ................................................................................................................................................................................................................*....................................................................... + // sqrdmulh v14.4S, v27.4S, v2.S[1] // ...............................................................................................................................................................................................................*........................................................................ 
+ // add v27.4S, v8.4S, v9.4S // ...............................................................................................................................................................................................................................*........................................................ + // sub v8.4S, v8.4S, v9.4S // ..............................................................................................................................................................................................................................*......................................................... + // add v9.4S, v12.4S, v13.4S // ...................................................................................................................................................................................................................................*.................................................... + // mls v10.4S, v20.4S, v29.4S // ..................................................................................................................................................................................................................*..................................................................... + // mul v20.4S, v8.4S, v6.S[2] // ....................................................................................................................................................................................................................................*................................................... + // mls v23.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................*................................................................ 
+ // sub v14.4S, v27.4S, v22.4S // ........................................................................................................................................................................................................................................*............................................... + // sub v13.4S, v12.4S, v13.4S // ............................................................................................................................................................................................................................*........................................................... + // mul v12.4S, v11.4S, v2.S[0] // ..................................................................................................................................................................................................*..................................................................................... + // add v27.4S, v27.4S, v22.4S // .......................................................................................................................................................................................................................................*................................................ + // sqrdmulh v22.4S, v8.4S, v6.S[3] // .....................................................................................................................................................................................................................................*.................................................. + // sqrdmulh v11.4S, v11.4S, v2.S[1] // .....................................................................................................................................................................................................*.................................................................................. 
+ // add v8.4S, v21.4S, v28.4S // ..........................................................................................................................................................................................................................*............................................................. + // sub v21.4S, v21.4S, v28.4S // .........................................................................................................................................................................................................................*.............................................................. + // sub v28.4S, v16.4S, v27.4S // ............................................................................................................................................................................................................................................*........................................... + // mls v20.4S, v22.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... + // add v22.4S, v16.4S, v27.4S // ...........................................................................................................................................................................................................................................*............................................ + // mls v12.4S, v11.4S, v29.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ // add v11.4S, v15.4S, v23.4S // .............................................................................................................................................................................................................................*.......................................................... + // sqrdmulh v16.4S, v13.4S, v0.S[3] // .........................................................................................................................................................................................................................................*.............................................. + // mul v13.4S, v13.4S, v0.S[2] // ..........................................................................................................................................................................................................................................*............................................. + // sqrdmulh v27.4S, v19.4S, v2.S[3] // ................................................................................................................................................................................................................................*....................................................... + // mul v19.4S, v19.4S, v2.S[2] // .................................................................................................................................................................................................................................*...................................................... + // sub v15.4S, v15.4S, v23.4S // ..................................................................................................................................................................................................................................*..................................................... 
+ // add v23.4S, v20.4S, v10.4S // ..................................................................................................................................................................................................................................................*..................................... + // sub v20.4S, v20.4S, v10.4S // ...................................................................................................................................................................................................................................................*.................................... + // mls v13.4S, v16.4S, v29.4S // ................................................................................................................................................................................................................................................*....................................... + // mul v16.4S, v14.4S, v3.S[0] // ..............................................................................................................................................................................................................................................*......................................... + // sqrdmulh v14.4S, v14.4S, v3.S[1] // ...............................................................................................................................................................................................................................................*........................................ + // add v10.4S, v24.4S, v12.4S // ....................................................................................................................................................................................................................................................*................................... 
+ // sub v12.4S, v24.4S, v12.4S // .................................................................................................................................................................................................................................................*...................................... + // add v24.4S, v17.4S, v23.4S // ......................................................................................................................................................................................................................................................*................................. + // sub v23.4S, v17.4S, v23.4S // .......................................................................................................................................................................................................................................................*................................ + // mls v19.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + // sub v27.4S, v8.4S, v22.4S // ............................................................................................................................................................................................................................................................*........................... + // sub v17.4S, v9.4S, v24.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ // add v9.4S, v9.4S, v24.4S // ...............................................................................................................................................................................................................................................................*........................ + // add v8.4S, v8.4S, v22.4S // .............................................................................................................................................................................................................................................................*.......................... + // mls v16.4S, v14.4S, v29.4S // .....................................................................................................................................................................................................................................................*.................................. + // sqrdmulh v22.4S, v23.4S, v1.S[1] // ....................................................................................................................................................................................................................................................................*................... + // mul v23.4S, v23.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................*.................. + // sqrdmulh v24.4S, v17.4S, v0.S[1] // .............................................................................................................................................................................................................................................................................*.......... 
+ // mul v17.4S, v17.4S, v0.S[0] // ...........................................................................................................................................................................................................................................................................*............ + // mul v14.4S, v12.4S, v0.S[2] // ..................................................................................................................................................................................................................................................................*..................... + // mls v23.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................................................................*............. + // sqrdmulh v22.4S, v12.4S, v0.S[3] // .......................................................................................................................................................................................................................................................................*................ + // mls v17.4S, v24.4S, v29.4S // ..................................................................................................................................................................................................................................................................................*..... + // add v24.4S, v18.4S, v16.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ // sub v16.4S, v18.4S, v16.4S // ...........................................................................................................................................................................................................................................................*............................ + // mul v12.4S, v15.4S, v0.S[2] // ........................................................................................................................................................................................................................................................*............................... + // mls v14.4S, v22.4S, v29.4S // ............................................................................................................................................................................................................................................................................*........... + // sqrdmulh v22.4S, v15.4S, v0.S[3] // .........................................................................................................................................................................................................................................................*.............................. + // mul v15.4S, v20.4S, v3.S[0] // ................................................................................................................................................................................................................................................................*....................... + // sqrdmulh v18.4S, v20.4S, v3.S[1] // .................................................................................................................................................................................................................................................................*...................... 
+ // mul v20.4S, v21.4S, v0.S[2] // .................................................................................................................................................................................................................................................................................*...... + // sqrdmulh v21.4S, v21.4S, v0.S[3] // ................................................................................................................................................................................................................................................................................*....... + // mls v12.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + // sub v22.4S, v10.4S, v24.4S // ........................................................................................................................................................................................................................................................................*............... + // mls v15.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................................................*................. + // sqrdmulh v18.4S, v27.4S, v0.S[1] // .....................................................................................................................................................................................................................................................................................*.. 
+ // mul v27.4S, v27.4S, v0.S[0] // ...................................................................................................................................................................................................................................................................................*.... + // add v10.4S, v10.4S, v24.4S // .........................................................................................................................................................................................................................................................................*.............. + // mls v20.4S, v21.4S, v29.4S // ......................................................................................................................................................................................................................................................................................*. + // sqrdmulh v21.4S, v28.4S, v1.S[1] // ....................................................................................................................................................................................................................................................................................*... + // sub v24.4S, v19.4S, v15.4S // ..............................................................................................................................................................................................................................................................................*......... + // add v15.4S, v19.4S, v15.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ // mul v19.4S, v28.4S, v1.S[0] // .......................................................................................................................................................................................................................................................................................* sub count, count, #1 cbnz count, layer1234_start - mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - sqrdmulh v14.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - mul v24.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - sqrdmulh v22.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
- sub v23.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... - mls v15.4S, v14.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - sqrdmulh v14.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mls v24.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - mul v20.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. 
- mul v21.4S, v23.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - sqrdmulh v22.4S, v23.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - sqrdmulh v23.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - mul v28.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - mls v20.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
- mls v21.4S, v22.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v22.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - mls v28.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - cmge v23.4S, v31.4S, v24.4S // ................................................................................................................................................................................................*....................................................................................... - sub v22.4S, v22.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. - sub v14.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... 
- add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. - cmge v16.4S, v24.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - add v22.4S, v28.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ - sub v23.4S, v23.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + mls v27.4S, v18.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ mls v19.4S, v21.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v21.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + sqrdmulh v18.4S, v24.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. + sub v21.4S, v21.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. + mul v24.4S, v24.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. 
+ sqrdmulh v28.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v10.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v17.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v21.4S, v11.4S, v15.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v15.4S // ........................................................................................................................................................*............................................................................................................................... + mls v24.4S, v18.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
+ sqrdmulh v15.4S, v21.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. + mul v21.4S, v21.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + sub v18.4S, v13.4S, v23.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v23.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v23.4S, v12.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v10.4S, v28.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ mls v21.4S, v15.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + add v15.4S, v12.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v28.4S, v23.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mul v16.4S, v14.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
- sub v28.4S, v28.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. - mls v24.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - sqrdmulh v23.4S, v22.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - mul v22.4S, v22.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - sqrdmulh v17.4S, v14.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - cmge v14.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
- sub v27.4S, v15.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ - str q24, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - cmge v24.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - add v15.4S, v15.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... - sqrdmulh v20.4S, v28.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mls v16.4S, v17.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
- mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - sub v14.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................*............................................................................................. - mul v17.4S, v28.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - sqrdmulh v24.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - mls v18.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... 
- cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - mls v17.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v20.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... - sub v28.4S, v28.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... - mul v23.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... 
- sqrdmulh v27.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mul v19.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sub v14.4S, v20.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. - str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - mls v16.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - mul v28.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
- sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - sqrdmulh v13.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
- sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mls v28.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... - cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... - cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
- str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - cmge v21.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................................................................................*................... - cmge v27.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - sub v14.4S, v14.4S, v18.4S // ..............................................................................................................................................................................................*......................................................................................... - sub v18.4S, v20.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... - sub v27.4S, v21.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. 
- cmge v20.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................*............................................................................... - mls v23.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - mls v28.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - mls v19.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - sub v20.4S, v20.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. 
- sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - mls v17.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + add v12.4S, v20.4S, v19.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v20.4S, v20.4S, v19.4S // ............................................................................................................................................................*........................................................................................................................... + cmge v24.4S, v21.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v19.4S, v31.4S, v21.4S // ............................................................................................................................................................................................*........................................................................................... 
+ sub v24.4S, v19.4S, v24.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v19.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mul v20.4S, v20.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mls v21.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v24.4S, v16.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... 
+ cmge v28.4S, v31.4S, v27.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v17.4S, v27.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mul v16.4S, v16.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + mls v20.4S, v19.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + str q21, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v21.4S, v28.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ mul v28.4S, v18.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v19.4S, v22.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v17.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + mls v27.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sqrdmulh v21.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... 
+ mls v16.4S, v24.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + sqrdmulh v24.4S, v18.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... + mul v18.4S, v22.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + str q27, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v27.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ mls v13.4S, v21.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v21.4S, v14.4S, v16.4S // ......................................................................................................................................................................*................................................................................................................. + mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v28.4S, v24.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + add v19.4S, v14.4S, v16.4S // .......................................................................................................................................................................*................................................................................................................ + cmge v14.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
+ mls v12.4S, v17.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v22.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v24.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v16.4S, v17.4S, v14.4S // ..................................................................................................................................................................................................*..................................................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... 
+ cmge v17.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... + sub v24.4S, v22.4S, v24.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v22.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mul v14.4S, v19.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ + mls v28.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v24.4S, v21.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... 
+ sub v17.4S, v17.4S, v22.4S // ......................................................................................................................................................................................................................................................................*................. + mul v21.4S, v21.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + sqrdmulh v22.4S, v15.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v15.4S, v15.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v27.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v8.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... 
+ mls v13.4S, v17.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + str q28, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v28.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sqrdmulh v17.4S, v19.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v20.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ cmge v16.4S, v31.4S, v27.4S // ................................................................................................................................................................................................................................................*....................................... + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v11.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v21.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + cmge v8.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... 
+ str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + cmge v22.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v24.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ + mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... + sub v22.4S, v8.4S, v22.4S // ..............................................................................................................................................................................................................................................................................*......... + cmge v8.4S, v21.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ cmge v20.4S, v31.4S, v21.4S // ........................................................................................................................................................................................................*............................................................................... + mls v14.4S, v17.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v9.4S, v24.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + sub v19.4S, v20.4S, v8.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v20.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ cmge v13.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v8.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sub v17.4S, v24.4S, v20.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v24.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v20.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v8.4S, v28.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ mls v21.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v28.4S, v24.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. + cmge v13.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + mls v12.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v14.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + cmge v28.4S, v27.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
+ cmge v19.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q12, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + cmge v12.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + str q21, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + cmge v21.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... 
+ cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v8.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + str q14, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
- sqrdmulh v14.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - mul v12.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - str q28, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - str q17, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ 
- sqrdmulh v21.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - mls v18.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
- mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - cmge v23.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - mls v15.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - cmge v11.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - cmge v8.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
- mls v12.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - sub v9.4S, v11.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. - cmge v19.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - cmge v28.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - cmge v21.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... - sub v14.4S, v21.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... 
- cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - mls v18.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - mls v15.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - cmge v13.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - sub v24.4S, v27.4S, v11.4S // ......................................................................................................................................................................................................................................................*................................. 
- cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - str q18, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... - cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - sub v28.4S, v28.4S, v27.4S // ..........................................................................................................................................................................................................................................................................*............. - str q15, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... 
- cmge v21.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - sub v15.4S, v23.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... - cmge v27.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... - mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - sub v13.4S, v13.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... 
- str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... - mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - sub v14.4S, v27.4S, v21.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v16.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - mls v12.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. 
- str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sub v16.4S, v16.4S, v28.4S // ..................................................................................................................................................................................................................................................*..................................... + sub v14.4S, v21.4S, v12.4S // ......................................................................................................................................................................................................................................................*................................. + mls v15.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ sub v21.4S, v8.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v27.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + sub v23.4S, v13.4S, v20.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v9.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v17.4S, v24.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ str q15, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + mls v11.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q27, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + mls v10.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + mls v18.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ str q11, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + str q18, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s index 8ae4f0b4..d4152d89 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. 
-.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro 
load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // 
slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,574 +321,622 @@ _intt_dilithium_1234_5678_opt_a72: mov count, #16 .p2align 2 - ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *................................................. - ldr q18, [x3, #48] // .*................................................ - ldr q22, [x3, #64] // ..*............................................... - ldr q26, [x3, #32] // ....*............................................. - ldr q30, [x3], #(6*16) // ...*.............................................. - // gap // .................................................. - ldr q27, [x3, #-16] // .....*............................................ - ldr q15, [x3, #-80] // ........*......................................... - // gap // .................................................. - ldr q7, [x4], #8 // ...................................*.............. - // gap // .................................................. - // gap // .................................................. - add v24.4S, v10.4S, v11.4S // ......*........................................... - sub v10.4S, v10.4S, v11.4S // .......*.......................................... - // gap // .................................................. - add v8.4S, v12.4S, v13.4S // .........*........................................ - sub v13.4S, v12.4S, v13.4S // ..........*....................................... - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - sqrdmulh v18.4S, v10.4S, v18.4S // ...........*...................................... - // gap // .................................................. - // gap // .................................................. - sub v6.4S, v24.4S, v8.4S // ............*..................................... - // gap // .................................................. - // gap // .................................................. - mul v11.4S, v13.4S, v22.4S // .................*................................ - add v24.4S, v24.4S, v8.4S // ..............*................................... - ldr q22, [x4], #16 // ....................................*............. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v27.4S, v13.4S, v27.4S // ...............*.................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v10.4S, v10.4S, v26.4S // .............*.................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v10.4S, v18.4S, v29.4S // ................*................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v11.4S, v27.4S, v29.4S // ..................*............................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v18.4S, v6.4S, v15.4S // ......................*........................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v26.4S, v6.4S, v30.4S // ...................*.............................. - // gap // .................................................. - // gap // .................................................. - sub v27.4S, v10.4S, v11.4S // ....................*............................. - // gap // .................................................. - // gap // .................................................. - add v10.4S, v10.4S, v11.4S // .....................*............................ - // gap // .................................................. - // gap // .................................................. - mls v26.4S, v18.4S, v29.4S // ...........................*...................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - sqrdmulh v18.4S, v27.4S, v15.4S // .......................*.......................... - trn1 v13.4S, v24.4S, v10.4S // ........................*......................... - // gap // .................................................. - trn2 v10.4S, v24.4S, v10.4S // ..........................*....................... - // gap // .................................................. - // gap // .................................................. - mul v30.4S, v27.4S, v30.4S // .........................*........................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v30.4S, v18.4S, v29.4S // ............................*..................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - trn1 v18.4S, v26.4S, v30.4S // .............................*.................... - trn2 v26.4S, v26.4S, v30.4S // ..............................*................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn2 v27.2D, v13.2D, v18.2D // ................................*................. - trn2 v30.2D, v10.2D, v26.2D // ...............................*.................. - // gap // .................................................. - trn1 v18.2D, v13.2D, v18.2D // .................................*................ - trn1 v10.2D, v10.2D, v26.2D // ..................................*............... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v26.4S, v27.4S, v30.4S // ......................................*........... - sub v2.4S, v27.4S, v30.4S // ...............................................*.. - // gap // .................................................. - add v30.4S, v18.4S, v10.4S // .....................................*............ - sub v10.4S, v18.4S, v10.4S // .......................................*.......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v18.4S, v30.4S, v26.4S // ........................................*......... - sub v26.4S, v30.4S, v26.4S // .........................................*........ - // gap // .................................................. - sqrdmulh v23.4S, v10.4S, v22.S[1] // ..........................................*....... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - srshr v30.4S, v18.4S, #23 // ...........................................*...... - mul v24.4S, v26.4S, v7.S[0] // ............................................*..... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v18.4S, v30.4S, v29.4S // .............................................*.... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v16.4S, v26.4S, v7.S[1] // ..............................................*... - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v25.4S, v10.4S, v22.S[0] // ................................................*. - // gap // .................................................. - // gap // .................................................. - str q18, [x0], #(16*4) // .................................................* - // gap // .................................................. - // gap // .................................................. + // Instructions: 46 + // Expected cycles: 58 + // Expected IPC: 0.79 + // + // Wall time: 1.01s + // User time: 1.01s + // + // ------------- original position -------------> + // 0 25 + // |------------------------|-------------------- + ldr q13, [x3, #16] // *............................................. + ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x0] // .*............................................ + ldr q30, [x3], #(6*16) // ..*........................................... + ldr q24, [x3, #-48] // ......*....................................... + ldr q14, [x3, #-16] // ...*.......................................... + // gap // .............................................. + ldr q26, [x3, #-32] // ........*..................................... + ldr q0, [x3, #-64] // ...........*.................................. + // gap // .............................................. + ldr q25, [x4], #8 // .......................................*...... + ldr q31, [x4], #16 // .........................................*.... + // gap // .............................................. + add v12.4S, v17.4S, v18.4S // .........*.................................... + sub v17.4S, v17.4S, v18.4S // .......*...................................... + // gap // .............................................. 
+ add v7.4S, v19.4S, v20.4S // ....*......................................... + // gap // .............................................. + // gap // .............................................. + sub v19.4S, v19.4S, v20.4S // .....*........................................ + // gap // .............................................. + // gap // .............................................. + sqrdmulh v24.4S, v17.4S, v24.4S // ............*................................. + // gap // .............................................. + // gap // .............................................. + sub v20.4S, v12.4S, v7.4S // .............*................................ + // gap // .............................................. + // gap // .............................................. + sqrdmulh v14.4S, v19.4S, v14.4S // ..........*................................... + add v12.4S, v12.4S, v7.4S // ...............*.............................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v19.4S, v19.4S, v26.4S // ................*............................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v17.4S, v17.4S, v0.4S // ..............*............................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ mls v17.4S, v24.4S, v29.4S // .................*............................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v19.4S, v14.4S, v29.4S // ..................*........................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v24.4S, v20.4S, v30.4S // ...................*.......................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqrdmulh v14.4S, v20.4S, v13.4S // ......................*....................... + // gap // .............................................. + // gap // .............................................. + sub v26.4S, v17.4S, v19.4S // ....................*......................... + // gap // .............................................. + // gap // .............................................. + add v17.4S, v17.4S, v19.4S // .....................*........................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqrdmulh v13.4S, v26.4S, v13.4S // .......................*...................... 
+ // gap // .............................................. + // gap // .............................................. + trn2 v19.4S, v12.4S, v17.4S // ........................*..................... + // gap // .............................................. + // gap // .............................................. + mul v30.4S, v26.4S, v30.4S // .........................*.................... + trn1 v17.4S, v12.4S, v17.4S // ............................*................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v24.4S, v14.4S, v29.4S // ..........................*................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v30.4S, v13.4S, v29.4S // ...........................*.................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ trn1 v13.4S, v24.4S, v30.4S // .............................*................ + trn2 v30.4S, v24.4S, v30.4S // ..............................*............... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + trn2 v24.2D, v17.2D, v13.2D // ...............................*.............. + trn1 v15.2D, v17.2D, v13.2D // ................................*............. + // gap // .............................................. + trn2 v17.2D, v19.2D, v30.2D // ..................................*........... + trn1 v13.2D, v19.2D, v30.2D // .................................*............ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + add v19.4S, v15.4S, v13.4S // ....................................*......... + add v30.4S, v24.4S, v17.4S // ...................................*.......... + // gap // .............................................. + sub v17.4S, v24.4S, v17.4S // ......................................*....... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ add v24.4S, v19.4S, v30.4S // .....................................*........ + sub v20.4S, v19.4S, v30.4S // ........................................*..... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + srshr v30.4S, v24.4S, #23 // ..........................................*... + sqrdmulh v19.4S, v20.4S, v25.S[1] // ............................................*. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v24.4S, v30.4S, v29.4S // ...........................................*.. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ // gap // .............................................. + str q24, [x0], #(16*4) // .............................................* + // gap // .............................................. + // gap // .............................................. - // original source code - // ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // *................................................. - // ldr q20, [x3, #48] // .*................................................ - // ldr q18, [x3, #64] // ..*............................................... - // ldr q6, [x3], #(6*16) // ....*............................................. - // ldr q0, [x3, #-64] // ...*.............................................. - // ldr q4, [x3, #-16] // .....*............................................ - // add v31.4S, v11.4S, v12.4S // ........*......................................... - // sub v12.4S, v11.4S, v12.4S // .........*........................................ - // ldr q5, [x3, #-80] // ......*........................................... - // add v17.4S, v13.4S, v14.4S // ..........*....................................... - // sub v11.4S, v13.4S, v14.4S // ...........*...................................... - // sqrdmulh v3.4S, v12.4S, v20.4S // ............*..................................... - // sub v30.4S, v31.4S, v17.4S // .............*.................................... - // mul v14.4S, v12.4S, v0.4S // ..................*............................... - // add v28.4S, v31.4S, v17.4S // ...............*.................................. - // sqrdmulh v19.4S, v11.4S, v4.4S // .................*................................ - // mls v14.4S, v3.4S, v29.4S // ...................*.............................. - // mul v13.4S, v11.4S, v18.4S // ..............*................................... - // mls v13.4S, v19.4S, v29.4S // ....................*............................. - // mul v27.4S, v30.4S, v6.4S // ......................*........................... 
- // sub v3.4S, v14.4S, v13.4S // .......................*.......................... - // add v2.4S, v14.4S, v13.4S // ........................*......................... - // sqrdmulh v0.4S, v30.4S, v5.4S // .....................*............................ - // sqrdmulh v10.4S, v3.4S, v5.4S // ..........................*....................... - // trn1 v30.4S, v28.4S, v2.4S // ...........................*...................... - // mul v18.4S, v3.4S, v6.4S // .............................*.................... - // trn2 v2.4S, v28.4S, v2.4S // ............................*..................... - // mls v27.4S, v0.4S, v29.4S // .........................*........................ - // mls v18.4S, v10.4S, v29.4S // ..............................*................... - // trn1 v31.4S, v27.4S, v18.4S // ...............................*.................. - // trn2 v3.4S, v27.4S, v18.4S // ................................*................. - // trn2 v21.2D, v2.2D, v3.2D // ..................................*............... - // trn2 v0.2D, v30.2D, v31.2D // .................................*................ - // trn1 v27.2D, v30.2D, v31.2D // ...................................*.............. - // trn1 v19.2D, v2.2D, v3.2D // ....................................*............. - // ldr q7, [x4], #8 // .......*.......................................... - // ldr q22, [x4], #16 // ................*................................. - // add v2.4S, v27.4S, v19.4S // .......................................*.......... - // add v4.4S, v0.4S, v21.4S // .....................................*............ - // sub v3.4S, v27.4S, v19.4S // ........................................*......... - // add v18.4S, v2.4S, v4.4S // .........................................*........ - // sub v2.4S, v2.4S, v4.4S // ..........................................*....... - // sqrdmulh v23.4S, v3.4S, v22.S[1] // ...........................................*...... 
- // srshr v10.4S, v18.4S, #23 // ............................................*..... - // mul v24.4S, v2.4S, v7.S[0] // .............................................*.... - // mls v18.4S, v10.4S, v29.4S // ..............................................*... - // sqrdmulh v16.4S, v2.4S, v7.S[1] // ...............................................*.. - // sub v2.4S, v0.4S, v21.4S // ......................................*........... - // mul v25.4S, v3.4S, v22.S[0] // ................................................*. - // str q18, [x0], #(16*4) // .................................................* + // --------------- new position ----------------> + // 0 25 + // |------------------------|-------------------- + // ldr q16, [x3, #16] // *............................................. + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // .*............................................ + // ldr q1, [x3], #(6*16) // ..*........................................... + // ldr q17, [x3, #-16] // ....*......................................... + // add v27.4S, v10.4S, v11.4S // ...........*.................................. + // sub v10.4S, v10.4S, v11.4S // ............*................................. + // ldr q15, [x3, #-48] // ...*.......................................... + // sub v0.4S, v8.4S, v9.4S // ..........*................................... + // ldr q22, [x3, #-32] // .....*........................................ + // add v23.4S, v8.4S, v9.4S // .........*.................................... + // sqrdmulh v17.4S, v10.4S, v17.4S // ...............*.............................. + // ldr q8, [x3, #-64] // ......*....................................... + // sqrdmulh v18.4S, v0.4S, v15.4S // .............*................................ + // sub v13.4S, v23.4S, v27.4S // ..............*............................... + // mul v8.4S, v0.4S, v8.4S // ..................*........................... + // add v26.4S, v23.4S, v27.4S // ................*............................. 
+ // mul v14.4S, v10.4S, v22.4S // .................*............................ + // mls v8.4S, v18.4S, v29.4S // ...................*.......................... + // mls v14.4S, v17.4S, v29.4S // ....................*......................... + // mul v18.4S, v13.4S, v1.4S // .....................*........................ + // sub v4.4S, v8.4S, v14.4S // .......................*...................... + // add v2.4S, v8.4S, v14.4S // ........................*..................... + // sqrdmulh v8.4S, v13.4S, v16.4S // ......................*....................... + // sqrdmulh v22.4S, v4.4S, v16.4S // .........................*.................... + // trn2 v24.4S, v26.4S, v2.4S // ..........................*................... + // mul v21.4S, v4.4S, v1.4S // ...........................*.................. + // mls v18.4S, v8.4S, v29.4S // .............................*................ + // mls v21.4S, v22.4S, v29.4S // ..............................*............... + // trn1 v31.4S, v26.4S, v2.4S // ............................*................. + // trn1 v28.4S, v18.4S, v21.4S // ...............................*.............. + // trn2 v26.4S, v18.4S, v21.4S // ................................*............. + // trn2 v1.2D, v31.2D, v28.2D // .................................*............ + // trn1 v15.2D, v31.2D, v28.2D // ..................................*........... + // trn1 v13.2D, v24.2D, v26.2D // ....................................*......... + // trn2 v28.2D, v24.2D, v26.2D // ...................................*.......... + // add v4.4S, v1.4S, v28.4S // ......................................*....... + // add v21.4S, v15.4S, v13.4S // .....................................*........ + // add v14.4S, v21.4S, v4.4S // ........................................*..... + // sub v17.4S, v1.4S, v28.4S // .......................................*...... + // ldr q25, [x4], #8 // .......*...................................... 
+ // sub v20.4S, v21.4S, v4.4S // .........................................*.... + // ldr q31, [x4], #16 // ........*..................................... + // srshr v19.4S, v14.4S, #23 // ..........................................*... + // mls v14.4S, v19.4S, v29.4S // ............................................*. + // sqrdmulh v19.4S, v20.4S, v25.S[1] // ...........................................*.. + // str q14, [x0], #(16*4) // .............................................* sub count, count, #1 layer5678_start: - ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // e................................................................ - mul v10.4S, v2.4S, v22.S[2] // ............................................*.................... - ldr q20, [x3, #48] // ....e............................................................ - ldr q18, [x3, #64] // .....e........................................................... - ldr q6, [x3], #(6*16) // .e............................................................... - // gap // ................................................................. - mls v25.4S, v23.4S, v29.4S // .........................................*....................... - ldr q0, [x3, #-64] // ...e............................................................. - // gap // ................................................................. - ldr q4, [x3, #-16] // ......e.......................................................... - // gap // ................................................................. - // gap // ................................................................. - add v31.4S, v11.4S, v12.4S // ........e........................................................ - sub v12.4S, v11.4S, v12.4S // .......e......................................................... - ldr q5, [x3, #-80] // ..e.............................................................. - sqrdmulh v21.4S, v2.4S, v22.S[3] // .............................................*................... 
- add v17.4S, v13.4S, v14.4S // .............e................................................... - // gap // ................................................................. - sub v11.4S, v13.4S, v14.4S // ............e.................................................... - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v3.4S, v12.4S, v20.4S // ..........e...................................................... - // gap // ................................................................. - // gap // ................................................................. - sub v30.4S, v31.4S, v17.4S // .................e............................................... - // gap // ................................................................. - // gap // ................................................................. - mul v14.4S, v12.4S, v0.4S // .........e....................................................... - add v28.4S, v31.4S, v17.4S // ..................e.............................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v19.4S, v11.4S, v4.4S // ...............e................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- mls v14.4S, v3.4S, v29.4S // ...........e..................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mul v13.4S, v11.4S, v18.4S // ..............e.................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v13.4S, v19.4S, v29.4S // ................e................................................ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v10.4S, v21.4S, v29.4S // ..............................................*.................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- mul v27.4S, v30.4S, v6.4S // ...................e............................................. - // gap // ................................................................. - // gap // ................................................................. - sub v3.4S, v14.4S, v13.4S // ......................e.......................................... - // gap // ................................................................. - // gap // ................................................................. - add v2.4S, v14.4S, v13.4S // .......................e......................................... - sqrdmulh v0.4S, v30.4S, v5.4S // ....................e............................................ - // gap // ................................................................. - sub v8.4S, v25.4S, v10.4S // ....................................................*............ - // gap // ................................................................. - // gap // ................................................................. - add v13.4S, v25.4S, v10.4S // .....................................................*........... - sqrdmulh v10.4S, v3.4S, v5.4S // .........................e....................................... - // gap // ................................................................. - trn1 v30.4S, v28.4S, v2.4S // ...........................e..................................... - // gap // ................................................................. - // gap // ................................................................. - mul v18.4S, v3.4S, v6.4S // ........................e........................................ - trn2 v2.4S, v28.4S, v2.4S // ............................e.................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - mls v27.4S, v0.4S, v29.4S // .....................e........................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v18.4S, v10.4S, v29.4S // ..........................e...................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v14.4S, v8.4S, v7.S[1] // .......................................................*......... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn1 v31.4S, v27.4S, v18.4S // .............................e................................... - trn2 v3.4S, v27.4S, v18.4S // ..............................e.................................. 
- // gap // ................................................................. - mls v24.4S, v16.4S, v29.4S // ...................................................*............. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - srshr v25.4S, v13.4S, #23 // ...........................................................*..... - trn2 v21.2D, v2.2D, v3.2D // ................................e................................ - trn2 v0.2D, v30.2D, v31.2D // ...............................e................................. - // gap // ................................................................. - trn1 v27.2D, v30.2D, v31.2D // .................................e............................... - trn1 v19.2D, v2.2D, v3.2D // ..................................e.............................. - // gap // ................................................................. - mul v26.4S, v8.4S, v7.S[0] // ......................................................*.......... - ldr q7, [x4], #8 // ...................................e............................. - ldr q22, [x4], #16 // ....................................e............................ - str q24, [x0, #-32] // ...............................................................*. - // gap // ................................................................. - // gap // ................................................................. - add v2.4S, v27.4S, v19.4S // ......................................e.......................... - add v4.4S, v0.4S, v21.4S // ...........................................e..................... - // gap // ................................................................. 
- sub v3.4S, v27.4S, v19.4S // .....................................e........................... - mls v26.4S, v14.4S, v29.4S // ........................................................*........ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v13.4S, v25.4S, v29.4S // ............................................................*.... - add v18.4S, v2.4S, v4.4S // ................................................e................ - // gap // ................................................................. - sub v2.4S, v2.4S, v4.4S // ...............................................e................. - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v23.4S, v3.4S, v22.S[1] // ........................................e........................ - // gap // ................................................................. - // gap // ................................................................. - str q26, [x0, #-16] // ................................................................* - srshr v10.4S, v18.4S, #23 // .........................................................e....... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mul v24.4S, v2.4S, v7.S[0] // .................................................e............... - str q13, [x0, #-48] // ..............................................................*.. - // gap // ................................................................. 
- // gap // ................................................................. - mls v18.4S, v10.4S, v29.4S // ..........................................................e...... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v16.4S, v2.4S, v7.S[1] // ..................................................e.............. - sub v2.4S, v0.4S, v21.4S // ..........................................e...................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mul v25.4S, v3.4S, v22.S[0] // .......................................e......................... - // gap // ................................................................. - // gap // ................................................................. - str q18, [x0], #(16*4) // .............................................................e... - // gap // ................................................................. - // gap // ................................................................. + // Instructions: 65 + // Expected cycles: 58 + // Expected IPC: 1.12 + // + // Wall time: 15.06s + // User time: 15.06s + // + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q16, [x3, #16] // ..e.............................................................. 
+ sqrdmulh v22.4S, v17.4S, v31.S[3] // ............................................*.................... + ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................ + ldr q1, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v30.4S, v17.4S, v31.S[2] // .............................................*................... + ldr q17, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v27.4S, v10.4S, v11.4S // .............e................................................... + sub v10.4S, v10.4S, v11.4S // ............e.................................................... + // gap // ................................................................. + sub v28.4S, v15.4S, v13.4S // .....................................*........................... + mls v30.4S, v22.4S, v29.4S // ..............................................*.................. + ldr q15, [x3, #-48] // ....e............................................................ + sub v0.4S, v8.4S, v9.4S // .......e......................................................... + ldr q22, [x3, #-32] // .....e........................................................... + // gap // ................................................................. + add v23.4S, v8.4S, v9.4S // ........e........................................................ + sqrdmulh v17.4S, v10.4S, v17.4S // ..............e.................................................. 
+ ldr q8, [x3, #-64] // ...e............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v18.4S, v0.4S, v15.4S // .........e....................................................... + // gap // ................................................................. + // gap // ................................................................. + sub v13.4S, v23.4S, v27.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + mul v8.4S, v0.4S, v8.4S // ..........e...................................................... + add v26.4S, v23.4S, v27.4S // ..................e.............................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v14.4S, v10.4S, v22.4S // ...............e................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v8.4S, v18.4S, v29.4S // ...........e..................................................... 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v17.4S, v29.4S // ................e................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v10.4S, v28.4S, v31.S[1] // .......................................*......................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v13.4S, v1.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + sub v4.4S, v8.4S, v14.4S // ......................e.......................................... + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v8.4S, v14.4S // .......................e......................................... 
+ sqrdmulh v8.4S, v13.4S, v16.4S // ...................e............................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v22.4S, v4.4S, v16.4S // ........................e........................................ + // gap // ................................................................. + // gap // ................................................................. + trn2 v24.4S, v26.4S, v2.4S // ............................e.................................... + // gap // ................................................................. + // gap // ................................................................. + mul v21.4S, v4.4S, v1.4S // .........................e....................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v8.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v21.4S, v22.4S, v29.4S // ..........................e...................................... 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v27.4S, v28.4S, v31.S[0] // ........................................*........................ + trn1 v31.4S, v26.4S, v2.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v27.4S, v10.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + // gap // ................................................................. + trn1 v28.4S, v18.4S, v21.4S // .............................e................................... + // gap // ................................................................. + // gap // ................................................................. + trn2 v26.4S, v18.4S, v21.4S // ..............................e.................................. + mul v17.4S, v20.4S, v25.S[0] // ..................................................*.............. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ trn2 v1.2D, v31.2D, v28.2D // ...............................e................................. + trn1 v15.2D, v31.2D, v28.2D // .................................e............................... + // gap // ................................................................. + trn1 v13.2D, v24.2D, v26.2D // ..................................e.............................. + trn2 v28.2D, v24.2D, v26.2D // ................................e................................ + // gap // ................................................................. + mls v17.4S, v19.4S, v29.4S // ...................................................*............. + sub v26.4S, v27.4S, v30.4S // ....................................................*............ + // gap // ................................................................. + add v2.4S, v27.4S, v30.4S // .....................................................*........... + // gap // ................................................................. + // gap // ................................................................. + add v4.4S, v1.4S, v28.4S // ...........................................e..................... + // gap // ................................................................. + add v21.4S, v15.4S, v13.4S // ......................................e.......................... + sqrdmulh v22.4S, v26.4S, v25.S[1] // ......................................................*.......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + srshr v8.4S, v2.4S, #23 // ...........................................................*..... + str q17, [x0, #-32] // ...............................................................*. 
+ mul v24.4S, v26.4S, v25.S[0] // .......................................................*......... + add v14.4S, v21.4S, v4.4S // ................................................e................ + sub v17.4S, v1.4S, v28.4S // ..........................................e...................... + ldr q25, [x4], #8 // ...................................e............................. + // gap // ................................................................. + mls v2.4S, v8.4S, v29.4S // ............................................................*.... + sub v20.4S, v21.4S, v4.4S // ...............................................e................. + ldr q31, [x4], #16 // ....................................e............................ + srshr v19.4S, v14.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + mls v24.4S, v22.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v19.4S, v29.4S // ..........................................................e...... + str q2, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q24, [x0, #-16] // ................................................................* + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v19.4S, v20.4S, v25.S[1] // .................................................e............... + str q14, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................e........................................................... - // ldr q0, [x3], #(6*16) // ....e............................................................|...e....................................................... - // ldr q4, [x3, #(-6*16 + 1*16)] // ..........e......................................................|.........e................................................. - // ldr q1, [x3, #(-6*16 + 2*16)] // ......e..........................................................|.....e..................................................... - // ldr q5, [x3, #(-6*16 + 3*16)] // ..e..............................................................|.e......................................................... 
- // ldr q2, [x3, #(-6*16 + 4*16)] // ...e.............................................................|..e........................................................ - // ldr q6, [x3, #(-6*16 + 5*16)] // .......e.........................................................|......e.................................................... - // sub v24.4s, v8.4s, v9.4s // .........e.......................................................|........e.................................................. - // add v8.4s, v8.4s, v9.4s // ........e........................................................|.......e................................................... - // mul v9.4s, v24.4s, v1.4s // ................e................................................|...............e........................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............e..................................................|.............e............................................. - // mls v9.4s, v24.4s, v29.4s // ...................e.............................................|..................e........................................ - // sub v24.4s, v10.4s, v11.4s // .............e...................................................|............e.............................................. - // add v10.4s, v10.4s, v11.4s // ............e....................................................|...........e............................................... - // mul v11.4s, v24.4s, v2.4s // ....................e............................................|...................e....................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e..............................................|.................e......................................... - // mls v11.4s, v24.4s, v29.4s // .....................e...........................................|....................e...................................... 
- // sub v24.4s, v8.4s, v10.4s // ...............e.................................................|..............e............................................ - // add v8.4s, v8.4s, v10.4s // .................e...............................................|................e.......................................... - // mul v10.4s, v24.4s, v0.4s // .......................e.........................................|......................e.................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e......................................|.........................e................................. - // mls v10.4s, v24.4s, v29.4s // .................................e...............................|................................e.......................... - // sub v24.4s, v9.4s, v11.4s // ........................e........................................|.......................e................................... - // add v9.4s, v9.4s, v11.4s // .........................e.......................................|........................e.................................. - // mul v11.4s, v24.4s, v0.4s // ...............................e.................................|..............................e............................ - // sqrdmulh v24.4s, v24.4s, v4.4s // .............................e...................................|............................e.............................. - // mls v11.4s, v24.4s, v29.4s // ..................................e..............................|.................................e......................... - // trn1 v25.4s, v8.4s, v9.4s // ..............................e..................................|.............................e............................. - // trn2 v26.4s, v8.4s, v9.4s // ................................e................................|...............................e........................... 
- // trn1 v27.4s, v10.4s, v11.4s // ....................................e............................|...................................e....................... - // trn2 v28.4s, v10.4s, v11.4s // .....................................e...........................|....................................e...................... - // trn2 v10.2d, v25.2d, v27.2d // .........................................e.......................|........................................e.................. - // trn2 v11.2d, v26.2d, v28.2d // ........................................e........................|.......................................e................... - // trn1 v8.2d, v25.2d, v27.2d // ..........................................e......................|.........................................e................. - // trn1 v9.2d, v26.2d, v28.2d // ...........................................e.....................|..........................................e................ - // ldr q1, [x4], #8 // .............................................e...................|............................................e.............. - // ldr q0, [x4], #16 // ..............................................e..................|.............................................e............. - // sub v24.4s, v8.4s, v9.4s // ..................................................e..............|.................................................e......... - // add v8.4s, v8.4s, v9.4s // ................................................e................|...............................................e........... - // mul v9.4s, v24.4s, v0.s[0] // ...............................................................e.|........................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................e.........|......................................................e.... 
- // mls v9.4s, v24.4s, v29.4s // .....*...........................................................|....*...................................................... - // sub v24.4s, v10.4s, v11.4s // ..............................................................e..|........................................................... - // add v10.4s, v10.4s, v11.4s // .................................................e...............|................................................e.......... - // mul v11.4s, v24.4s, v0.s[2] // .*...............................................................|*.......................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........*.....................................................|..........*................................................ - // mls v11.4s, v24.4s, v29.4s // ......................*..........................................|.....................*..................................... - // sub v24.4s, v8.4s, v10.4s // ......................................................e..........|.....................................................e..... - // add v8.4s, v8.4s, v10.4s // .....................................................e...........|....................................................e...... - // mul v10.4s, v24.4s, v1.s[0] // ..........................................................e......|.........................................................e. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................e...|........................................................... - // mls v10.4s, v24.4s, v29.4s // ......................................*..........................|.....................................*..................... - // sub v24.4s, v9.4s, v11.4s // ...........................*.....................................|..........................*................................ 
- // add v9.4s, v9.4s, v11.4s // ............................*....................................|...........................*............................... - // mul v11.4s, v24.4s, v1.s[0] // ............................................*....................|...........................................*............... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................*.............................|..................................*........................ - // mls v11.4s, v24.4s, v29.4s // ...................................................*.............|..................................................*........ - // srshr v24.4S, v8.4S, #23 // .........................................................e.......|........................................................e.. - // mls v8.4s, v24.4s, v29.4s // ............................................................e....|........................................................... - // srshr v24.4S, v9.4S, #23 // .......................................*.........................|......................................*.................... - // mls v9.4s, v24.4s, v29.4s // ....................................................*............|...................................................*....... - // str q8, [x0], #(16*4) // ................................................................e|........................................................... - // str q9, [x0, #(-16*4 + 1*16)] // ...........................................................*.....|..........................................................* - // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*.................|..............................................*............ - // str q11, [x0, #(-16*4 + 3*16)] // ........................................................*........|.......................................................*... 
+ // -------------------------------------------------------- new position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // ..e..............................................................'.~............................................................ + // ldr q0, [x3], #(6*16) // ...e.............................................................'..~........................................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // e................................................................~.............................................................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ...............e.................................................'..............~............................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ..........e......................................................'.........~.................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ............e....................................................'...........~.................................................. + // ldr q6, [x3, #(-6*16 + 5*16)] // .....e...........................................................'....~......................................................... + // sub v24.4s, v8.4s, v9.4s // ...........e.....................................................'..........~................................................... + // add v8.4s, v8.4s, v9.4s // .............e...................................................'............~................................................. + // sqrdmulh v27.4s, v24.4s, v5.4s // ................e................................................'...............~.............................................. 
+ // mul v9.4s, v24.4s, v1.4s // ..................e..............................................'.................~............................................ + // mls v9.4s, v27.4s, v29.4s // .....................e...........................................'....................~......................................... + // sub v24.4s, v10.4s, v11.4s // .......e.........................................................'......~....................................................... + // add v10.4s, v10.4s, v11.4s // ......e..........................................................'.....~........................................................ + // sqrdmulh v27.4s, v24.4s, v6.4s // ..............e..................................................'.............~................................................ + // mul v11.4s, v24.4s, v2.4s // ....................e............................................'...................~.......................................... + // mls v11.4s, v27.4s, v29.4s // ......................e..........................................'.....................~........................................ + // sub v24.4s, v8.4s, v10.4s // .................e...............................................'................~............................................. + // add v8.4s, v8.4s, v10.4s // ...................e.............................................'..................~........................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...........................e.....................................'..........................~................................... + // mul v10.4s, v24.4s, v0.4s // ........................e........................................'.......................~...................................... + // mls v10.4s, v27.4s, v29.4s // ...............................e.................................'..............................~............................... 
+ // sub v24.4s, v9.4s, v11.4s // .........................e.......................................'........................~..................................... + // add v9.4s, v9.4s, v11.4s // ..........................e......................................'.........................~.................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ............................e....................................'...........................~.................................. + // mul v11.4s, v24.4s, v0.4s // ..............................e..................................'.............................~................................ + // mls v11.4s, v27.4s, v29.4s // ................................e................................'...............................~.............................. + // trn1 v25.4s, v8.4s, v9.4s // ..................................e..............................'.................................~............................ + // trn2 v26.4s, v8.4s, v9.4s // .............................e...................................'............................~................................. + // trn1 v27.4s, v10.4s, v11.4s // ....................................e............................'...................................~.......................... + // trn2 v28.4s, v10.4s, v11.4s // .....................................e...........................'....................................~......................... + // trn2 v10.2d, v25.2d, v27.2d // .......................................e.........................'......................................~....................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e......................'.........................................~.................... + // trn1 v8.2d, v25.2d, v27.2d // ........................................e........................'.......................................~...................... 
+ // trn1 v9.2d, v26.2d, v28.2d // .........................................e.......................'........................................~..................... + // ldr q1, [x4], #8 // ......................................................e..........'.....................................................~........ + // ldr q0, [x4], #16 // .........................................................e.......'........................................................~..... + // sub v24.4s, v8.4s, v9.4s // ........~........................................................'.......*...................................................... + // add v8.4s, v8.4s, v9.4s // ...............................................e.................'..............................................~............... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .......................~.........................................'......................*....................................... + // mul v9.4s, v24.4s, v0.s[0] // .................................~...............................'................................*............................. + // mls v9.4s, v27.4s, v29.4s // ...................................~.............................'..................................*........................... + // sub v24.4s, v10.4s, v11.4s // .....................................................e...........'....................................................~......... + // add v10.4s, v10.4s, v11.4s // ..............................................e..................'.............................................~................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .~...............................................................'*............................................................. + // mul v11.4s, v24.4s, v0.s[2] // ....~............................................................'...*.......................................................... 
+ // mls v11.4s, v27.4s, v29.4s // .........~.......................................................'........*..................................................... + // sub v24.4s, v8.4s, v10.4s // ........................................................e........'.......................................................~...... + // add v8.4s, v8.4s, v10.4s // ....................................................e............'...................................................~.......... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ...............................................................e.'.............................................................. + // mul v10.4s, v24.4s, v1.s[0] // ......................................~..........................'.....................................*........................ + // mls v10.4s, v27.4s, v29.4s // ...........................................~.....................'..........................................*................... + // sub v24.4s, v9.4s, v11.4s // ............................................~....................'...........................................*.................. + // add v9.4s, v9.4s, v11.4s // .............................................~...................'............................................*................. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................................................~................'...............................................*.............. + // mul v11.4s, v24.4s, v1.s[0] // ...................................................~.............'..................................................*........... + // mls v11.4s, v27.4s, v29.4s // ...........................................................~.....'..........................................................*... + // srshr v24.4S, v8.4S, #23 // ..........................................................e......'.........................................................~.... 
+ // mls v8.4s, v24.4s, v29.4s // ............................................................e....'...........................................................~.. + // srshr v24.4S, v9.4S, #23 // .................................................~...............'................................................*............. + // mls v9.4s, v24.4s, v29.4s // .......................................................~.........'......................................................*....... + // str q8, [x0], #(16*4) // ................................................................e'.............................................................. + // str q9, [x0, #(-16*4 + 1*16)] // .............................................................~...'............................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // ..................................................~..............'.................................................*............ + // str q11, [x0, #(-16*4 + 3*16)] // ..............................................................~..'.............................................................* sub count, count, #1 cbnz count, layer5678_start - sqrdmulh v20.4S, v2.4S, v22.S[3] // ..*............ - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mul v4.4S, v2.4S, v22.S[2] // *.............. - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mls v25.4S, v23.4S, v29.4S // .*............. - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mls v4.4S, v20.4S, v29.4S // ...*........... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... 
- mls v24.4S, v16.4S, v29.4S // .......*....... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - sub v30.4S, v25.4S, v4.4S // ....*.......... - add v10.4S, v25.4S, v4.4S // .....*......... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - str q24, [x0, #-32] // ..........*.... - // gap // ............... - // gap // ............... - // gap // ............... - sqrdmulh v28.4S, v30.4S, v7.S[1] // ......*........ - srshr v4.4S, v10.4S, #23 // ........*...... - // gap // ............... - // gap // ............... - // gap // ............... - mul v23.4S, v30.4S, v7.S[0] // .........*..... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mls v10.4S, v4.4S, v29.4S // ............*.. - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - mls v23.4S, v28.4S, v29.4S // ...........*... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - str q10, [x0, #-48] // ..............* - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - // gap // ............... - str q23, [x0, #-16] // .............*. - // gap // ............... - // gap // ............... + // Instructions: 19 + // Expected cycles: 30 + // Expected IPC: 0.63 + // + // Wall time: 0.13s + // User time: 0.13s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v23.4S, v15.4S, v13.4S // ..*........................... 
+ sqrdmulh v26.4S, v17.4S, v31.S[3] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v2.4S, v17.4S, v31.S[2] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v27.4S, v23.4S, v31.S[1] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v23.4S, v23.4S, v31.S[0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v2.4S, v26.4S, v29.4S // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v23.4S, v27.4S, v29.4S // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v27.4S, v20.4S, v25.S[0] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v27.4S, v19.4S, v29.4S // ........*..................... 
+ // gap // .............................. + // gap // .............................. + sub v9.4S, v23.4S, v2.4S // .........*.................... + // gap // .............................. + // gap // .............................. + add v22.4S, v23.4S, v2.4S // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v1.4S, v9.4S, v25.S[1] // ...........*.................. + // gap // .............................. + // gap // .............................. + str q27, [x0, #-32] // .............*................ + srshr v18.4S, v22.4S, #23 // ............*................. + // gap // .............................. + mul v27.4S, v9.4S, v25.S[0] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v22.4S, v18.4S, v29.4S // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v27.4S, v1.4S, v29.4S // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q22, [x0, #-48] // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + str q27, [x0, #-16] // ..................*........... + // gap // .............................. + // gap // .............................. - // original source code - // mul v10.4S, v2.4S, v22.S[2] // .*............. - // mls v25.4S, v23.4S, v29.4S // ..*............ - // sqrdmulh v21.4S, v2.4S, v22.S[3] // *.............. - // mls v10.4S, v21.4S, v29.4S // ...*........... - // sub v8.4S, v25.4S, v10.4S // .....*......... - // add v13.4S, v25.4S, v10.4S // ......*........ - // sqrdmulh v14.4S, v8.4S, v7.S[1] // ........*...... - // mls v24.4S, v16.4S, v29.4S // ....*.......... - // srshr v25.4S, v13.4S, #23 // .........*..... - // mul v26.4S, v8.4S, v7.S[0] // ..........*.... - // str q24, [x0, #-32] // .......*....... - // mls v26.4S, v14.4S, v29.4S // ............*.. - // mls v13.4S, v25.4S, v29.4S // ...........*... - // str q26, [x0, #-16] // ..............* - // str q13, [x0, #-48] // .............*. + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v22.4S, v17.4S, v31.S[3] // .*............................. + // mul v30.4S, v17.4S, v31.S[2] // ..*............................ + // sub v28.4S, v15.4S, v13.4S // *.............................. + // mls v30.4S, v22.4S, v29.4S // .....*......................... + // sqrdmulh v10.4S, v28.4S, v31.S[1] // ...*........................... + // mul v27.4S, v28.4S, v31.S[0] // ....*.......................... + // mls v27.4S, v10.4S, v29.4S // ......*........................ + // mul v17.4S, v20.4S, v25.S[0] // .......*....................... + // mls v17.4S, v19.4S, v29.4S // ........*...................... + // sub v26.4S, v27.4S, v30.4S // .........*..................... + // add v2.4S, v27.4S, v30.4S // ..........*.................... + // sqrdmulh v22.4S, v26.4S, v25.S[1] // ...........*................... + // srshr v8.4S, v2.4S, #23 // .............*................. 
+ // str q17, [x0, #-32] // ............*.................. + // mul v24.4S, v26.4S, v25.S[0] // ..............*................ + // mls v2.4S, v8.4S, v29.4S // ...............*............... + // mls v24.4S, v22.4S, v29.4S // ................*.............. + // str q2, [x0, #-48] // .................*............. + // str q24, [x0, #-16] // ..................*............ .unreq root0_tw @@ -942,853 +978,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q10, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ - ldr q18, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q17, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q19, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + ldr q9, [x1] // *....................................................................................................................................................................................................................................................................................... 
ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... - ldr q27, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - ldr q13, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... - ldr q15, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. - ldr q24, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... - ldr q8, [x1, #0] // *....................................................................................................................................................................................................................................................................................... 
- sub v11.4S, v18.4S, v10.4S // ...............................*........................................................................................................................................................................................................................................................ - add v10.4S, v18.4S, v10.4S // ................................*....................................................................................................................................................................................................................................................... - ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. - ldr q23, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... - add v16.4S, v22.4S, v15.4S // .....................................*.................................................................................................................................................................................................................................................. - sub v9.4S, v22.4S, v15.4S // ....................................*................................................................................................................................................................................................................................................... 
- ldr q20, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - sub v12.4S, v13.4S, v27.4S // .....................*.................................................................................................................................................................................................................................................................. - ldr q22, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - sqrdmulh v17.4S, v11.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - add v27.4S, v13.4S, v27.4S // ......................*................................................................................................................................................................................................................................................................. - mul v15.4S, v11.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
- add v13.4S, v8.4S, v24.4S // .................*...................................................................................................................................................................................................................................................................... - sub v24.4S, v8.4S, v24.4S // ................*....................................................................................................................................................................................................................................................................... - add v19.4S, v23.4S, v20.4S // ...........................*............................................................................................................................................................................................................................................................ - sqrdmulh v21.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... - sub v11.4S, v13.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... - mul v28.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ 
- add v8.4S, v13.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. - add v12.4S, v19.4S, v10.4S // ...................................................................*.................................................................................................................................................................................................................... - mls v28.4S, v21.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - sub v21.4S, v8.4S, v12.4S // ................................................................................................*....................................................................................................................................................................................... - add v8.4S, v8.4S, v12.4S // .................................................................................................*...................................................................................................................................................................................... - sub v10.4S, v19.4S, v10.4S // ..................................................................*..................................................................................................................................................................................................................... 
- mul v13.4S, v11.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - sub v14.4S, v18.4S, v22.4S // .........................................*.............................................................................................................................................................................................................................................. - add v27.4S, v18.4S, v22.4S // ..........................................*............................................................................................................................................................................................................................................. - sqrdmulh v22.4S, v11.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - sub v23.4S, v23.4S, v20.4S // ..........................*............................................................................................................................................................................................................................................................. - mls v15.4S, v17.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
- sqrdmulh v17.4S, v24.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... - sqrdmulh v18.4S, v9.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ - mul v20.4S, v24.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... - mls v20.4S, v17.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v10.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - mul v17.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... 
- sub v19.4S, v20.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... - add v12.4S, v20.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... - mul v28.4S, v10.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... - sqrdmulh v10.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - mls v28.4S, v24.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - sqrdmulh v24.4S, v19.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... 
- mls v17.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... - ldr q10, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ - mul v23.4S, v19.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ - mls v23.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... - ldr q24, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... - add v19.4S, v17.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... 
- mls v13.4S, v22.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - sub v15.4S, v17.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ - ldr q22, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... - add v20.4S, v24.4S, v10.4S // ....................................................*................................................................................................................................................................................................................................... - sub v10.4S, v24.4S, v10.4S // ...................................................*.................................................................................................................................................................................................................................... - mul v17.4S, v9.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
- sub v9.4S, v16.4S, v27.4S // ............................................................................*........................................................................................................................................................................................................... - add v11.4S, v16.4S, v27.4S // .............................................................................*.......................................................................................................................................................................................................... - ldr q27, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. - mul v16.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - mls v17.4S, v18.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... 
- sub v18.4S, v27.4S, v22.4S // ..............................................*......................................................................................................................................................................................................................................... - add v27.4S, v27.4S, v22.4S // ...............................................*........................................................................................................................................................................................................................................ - mls v16.4S, v24.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ - mul v15.4S, v10.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - sqrdmulh v10.4S, v10.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. - sqrdmulh v24.4S, v18.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... 
- sqrdmulh v22.4S, v9.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + ldr q28, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q13, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + sub v11.4S, v17.4S, v19.4S // .........................................*.............................................................................................................................................................................................................................................. + add v18.4S, v17.4S, v19.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ ldr q24, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + sqrdmulh v10.4S, v11.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + sub v15.4S, v9.4S, v13.4S // ................*....................................................................................................................................................................................................................................................................... + sub v17.4S, v22.4S, v24.4S // ....................................*................................................................................................................................................................................................................................................... + add v16.4S, v22.4S, v24.4S // .....................................*.................................................................................................................................................................................................................................................. + mul v19.4S, v11.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... 
+ ldr q24, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + mul v23.4S, v17.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ + sqrdmulh v12.4S, v17.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. + ldr q17, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + mul v22.4S, v15.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... + mls v19.4S, v10.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... 
+ sub v27.4S, v24.4S, v17.4S // ..........................*............................................................................................................................................................................................................................................................. + add v20.4S, v24.4S, v17.4S // ...........................*............................................................................................................................................................................................................................................................ + ldr q17, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + mls v23.4S, v12.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sqrdmulh v12.4S, v27.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... + sub v21.4S, v8.4S, v17.4S // ...............................*........................................................................................................................................................................................................................................................ 
+ mul v24.4S, v27.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... + add v17.4S, v8.4S, v17.4S // ................................*....................................................................................................................................................................................................................................................... + mls v24.4S, v12.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q14, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + add v13.4S, v9.4S, v13.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ ldr q11, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + sub v27.4S, v16.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... + add v16.4S, v16.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v18.4S, v20.4S, v17.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v9.4S, v21.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... 
+ add v17.4S, v20.4S, v17.4S // ...................................................................*.................................................................................................................................................................................................................... + mul v10.4S, v21.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... + add v21.4S, v14.4S, v28.4S // ......................*................................................................................................................................................................................................................................................................. + sub v14.4S, v14.4S, v28.4S // .....................*.................................................................................................................................................................................................................................................................. + mls v22.4S, v15.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + add v20.4S, v11.4S, v12.4S // ....................................................*................................................................................................................................................................................................................................... 
+ sub v15.4S, v13.4S, v21.4S // ........................................................*............................................................................................................................................................................................................................... + add v13.4S, v13.4S, v21.4S // .........................................................*.............................................................................................................................................................................................................................. + mls v10.4S, v9.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sub v11.4S, v11.4S, v12.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v28.4S, v15.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ + add v8.4S, v13.4S, v17.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v17.4S, v13.4S, v17.4S // ................................................................................................*....................................................................................................................................................................................... + sqrdmulh v9.4S, v18.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... + sub v13.4S, v24.4S, v10.4S // .......................................................................*................................................................................................................................................................................................................ + add v24.4S, v24.4S, v10.4S // ........................................................................*............................................................................................................................................................................................................... + mul v21.4S, v18.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. + mul v12.4S, v17.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... 
+ add v18.4S, v23.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v23.4S, v23.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + sqrdmulh v19.4S, v17.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v10.4S, v14.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + mul v14.4S, v14.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... + mul v17.4S, v13.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. 
+ sqrdmulh v13.4S, v13.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. + mls v14.4S, v10.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sqrdmulh v10.4S, v15.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. + ldr q15, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v21.4S, v9.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v9.4S, v22.4S, v14.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ sub v22.4S, v22.4S, v14.4S // .............................................................*.......................................................................................................................................................................................................................... + mls v17.4S, v13.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + ldr q14, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mls v28.4S, v10.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + add v13.4S, v15.4S, v14.4S // ...............................................*........................................................................................................................................................................................................................................ + sqrdmulh v10.4S, v11.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. 
+ sub v14.4S, v15.4S, v14.4S // ..............................................*......................................................................................................................................................................................................................................... + mul v15.4S, v11.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + sub v11.4S, v28.4S, v21.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v12.4S, v19.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... mls v15.4S, v10.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ - mul v10.4S, v18.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... 
- mls v10.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - sub v24.4S, v12.4S, v19.4S // .....................................................................................................*.................................................................................................................................................................................. - mul v18.4S, v9.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - add v9.4S, v12.4S, v19.4S // ......................................................................................................*................................................................................................................................................................................. - sub v12.4S, v27.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. - mls v18.4S, v22.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
- sub v22.4S, v10.4S, v15.4S // ...........................................................................................*............................................................................................................................................................................................ - add v27.4S, v27.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ - sqrdmulh v19.4S, v12.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. - add v15.4S, v10.4S, v15.4S // ............................................................................................*........................................................................................................................................................................................... - mul v10.4S, v12.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... - add v20.4S, v11.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. 
- sub v27.4S, v11.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... - sqrdmulh v12.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - add v11.4S, v23.4S, v16.4S // ................................................................................................................*....................................................................................................................................................................... - mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ - sub v16.4S, v23.4S, v16.4S // ...............................................................................................................*........................................................................................................................................................................ - mls v10.4S, v19.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. 
- sub v19.4S, v8.4S, v20.4S // ........................................................................................................................................*............................................................................................................................................... - add v8.4S, v8.4S, v20.4S // .........................................................................................................................................*.............................................................................................................................................. - sub v20.4S, v13.4S, v28.4S // ..........................................................................................................*............................................................................................................................................................................. - sqrdmulh v23.4S, v14.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... - add v13.4S, v13.4S, v28.4S // ...........................................................................................................*............................................................................................................................................................................ - mul v14.4S, v14.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ 
- sub v28.4S, v18.4S, v10.4S // ..............................................................................................................................*......................................................................................................................................................... - add v18.4S, v18.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ - sqrdmulh v10.4S, v22.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... - mls v14.4S, v23.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - mul v23.4S, v22.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - mls v23.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ 
- add v22.4S, v17.4S, v14.4S // ..................................................................................*..................................................................................................................................................................................................... - sub v14.4S, v17.4S, v14.4S // .................................................................................*...................................................................................................................................................................................................... - mls v24.4S, v12.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. - add v12.4S, v22.4S, v15.4S // ..........................................................................................................................*............................................................................................................................................................. - sqrdmulh v17.4S, v16.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - sub v15.4S, v22.4S, v15.4S // .........................................................................................................................*.............................................................................................................................................................. 
- sqrdmulh v22.4S, v27.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ - sub v10.4S, v9.4S, v12.4S // .............................................................................................................................................*.......................................................................................................................................... - mul v27.4S, v27.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. - add v9.4S, v9.4S, v12.4S // ..............................................................................................................................................*......................................................................................................................................... - sqrdmulh v12.4S, v10.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - mul v16.4S, v16.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
- mls v16.4S, v17.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - mul v17.4S, v10.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ - sqrdmulh v10.4S, v19.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - mul v19.4S, v19.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - mls v17.4S, v12.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - mul v12.4S, v21.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... 
- sqrdmulh v21.4S, v21.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... - mls v27.4S, v22.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... - mls v19.4S, v10.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - mls v12.4S, v21.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... - sqrdmulh v10.4S, v14.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... 
- sub v21.4S, v12.4S, v27.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v12.4S, v27.4S // .............................................................................................................................................................*.......................................................................................................................... - mul v27.4S, v14.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... - mls v27.4S, v10.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - add v10.4S, v13.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... - sub v18.4S, v13.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... 
- mul v13.4S, v15.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - sqrdmulh v15.4S, v15.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... - sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + add v10.4S, v28.4S, v21.4S // ...........................................................................................................*............................................................................................................................................................................ + add v28.4S, v13.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v19.4S, v27.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... 
+ sub v21.4S, v9.4S, v24.4S // .....................................................................................................*.................................................................................................................................................................................. + sub v13.4S, v13.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. + mul v20.4S, v27.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + add v9.4S, v9.4S, v24.4S // ......................................................................................................*................................................................................................................................................................................. + sqrdmulh v24.4S, v14.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... + mls v20.4S, v19.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
+ sqrdmulh v19.4S, v11.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... + mul v27.4S, v14.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v22.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + mul v22.4S, v22.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + mls v27.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mls v22.4S, v14.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ mul v14.4S, v11.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + add v24.4S, v27.4S, v15.4S // ............................................................................................*........................................................................................................................................................................................... + mls v14.4S, v19.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sub v11.4S, v27.4S, v15.4S // ...........................................................................................*............................................................................................................................................................................................ + mul v27.4S, v23.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + sub v19.4S, v18.4S, v24.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ add v24.4S, v18.4S, v24.4S // ..........................................................................................................................*............................................................................................................................................................. + sqrdmulh v15.4S, v23.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... + mul v23.4S, v13.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v18.4S, v11.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... + mls v27.4S, v15.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ mul v15.4S, v11.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... + add v11.4S, v16.4S, v28.4S // .....................................................................................................................*.................................................................................................................................................................. + mls v23.4S, v13.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v13.4S, v16.4S, v28.4S // ....................................................................................................................*................................................................................................................................................................... + sub v28.4S, v8.4S, v11.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v11.4S // .........................................................................................................................................*.............................................................................................................................................. 
+ mul v16.4S, v19.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + mls v15.4S, v18.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v18.4S, v13.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. + mul v11.4S, v13.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ + sub v13.4S, v27.4S, v15.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ add v15.4S, v27.4S, v15.4S // ....................................................................................................................................*................................................................................................................................................... + mls v16.4S, v19.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sqrdmulh v27.4S, v13.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. + mul v19.4S, v13.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. + sub v13.4S, v22.4S, v17.4S // ...............................................................................................................*........................................................................................................................................................................ + add v17.4S, v22.4S, v17.4S // ................................................................................................................*....................................................................................................................................................................... 
+ mls v11.4S, v18.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + sqrdmulh v18.4S, v21.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sub v27.4S, v17.4S, v15.4S // .......................................................................................................................................................*................................................................................................................................ + sub v22.4S, v12.4S, v11.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v11.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ mul v21.4S, v21.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... + add v11.4S, v17.4S, v15.4S // ........................................................................................................................................................*............................................................................................................................... + mls v21.4S, v18.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + add v17.4S, v20.4S, v23.4S // ...............................................................................................................................*........................................................................................................................................................ + sub v23.4S, v20.4S, v23.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v15.4S, v13.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... 
+ sub v18.4S, v10.4S, v17.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v17.4S // ...................................................................................................................................................*.................................................................................................................................... + sqrdmulh v17.4S, v13.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... sub count, count, #1 layer1234_start: - sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... - cmge v27.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... 
- mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ - add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... - sqrdmulh v23.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... - cmge v28.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... 
- mul v20.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - sub v28.4S, v27.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... - sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mls v20.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - mul v23.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
- mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sub v28.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... - sqrdmulh v27.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - sub v24.4S, v20.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. - add v14.4S, v20.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ 
- mul v22.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... - str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - sub v19.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ - cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - sqrdmulh v16.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
- mul v20.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - mls v22.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - mul v27.4S, v19.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sub v18.4S, v23.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. - sqrdmulh v19.4S, v19.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
- mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - sqrdmulh v18.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - mul v21.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mls v20.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - cmge v16.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
- sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - cmge v9.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... - mls v21.4S, v18.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - sub v16.4S, v9.4S, v16.4S // ..............................................................................................................................................................................................*......................................................................................... - sqrdmulh v18.4S, v24.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. 
- mul v24.4S, v24.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - mls v27.4S, v19.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - cmge v19.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... - mls v24.4S, v18.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v18.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - sqrdmulh v9.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
- sub v19.4S, v19.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. - sqrdmulh v18.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v20.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - mls v22.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
- mul v11.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... - ldr q22, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ - mls v11.4S, v9.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v9.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
- cmge v20.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... - mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - cmge v18.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - sub v9.4S, v16.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. - mls v17.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - cmge v23.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
- sub v18.4S, v20.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... - mls v24.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - cmge v9.4S, v11.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v16.4S, v31.4S, v11.4S // ....................................................................................................................................................................................................*................................................................................... - mls v27.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - cmge v18.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... 
- cmge v20.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - sub v9.4S, v16.4S, v9.4S // ......................................................................................................................................................................................................*................................................................................. - sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - sub v18.4S, v18.4S, v23.4S // ..................................................................................................................................................................................................*..................................................................................... - str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... 
- cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - str q27, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - mls v21.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - sub v18.4S, v24.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. - ldr q24, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. 
- cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - mls v11.4S, v9.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - sub v23.4S, v8.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... - mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - ldr q8, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... - str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... 
- sub v27.4S, v24.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ - sqrdmulh v9.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - str q11, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - mul v13.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
- cmge v21.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - str q19, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
- mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sqrdmulh v17.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - sub v15.4S, v11.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... - mul v12.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - sqrdmulh v11.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - add v10.4S, v24.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... 
- mls v13.4S, v9.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - cmge v22.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... - ldr q19, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. - cmge v18.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - mls v12.4S, v11.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
- sqrdmulh v21.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - sub v28.4S, v22.4S, v18.4S // ......................................................................................................................................................................................................................................................................*................. - cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - cmge v24.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - cmge v11.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................*............................... 
- mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v18.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - sub v24.4S, v24.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... - mls v22.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - sub v18.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................*............................. - ldr q17, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. 
- mls v20.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - add v24.4S, v8.4S, v19.4S // ...........................e............................................................................................................................................................................................................................................................ - cmge v9.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - mls v12.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - cmge v11.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... 
- mls v13.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - cmge v18.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... - sub v28.4S, v21.4S, v9.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v16.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - sub v20.4S, v8.4S, v19.4S // ..........................e............................................................................................................................................................................................................................................................. 
- ldr q21, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... - sqrdmulh v8.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - ldr q14, [x1, #720] // ...........e............................................................................................................................................................................................................................................................................ - ldr q9, [x1, #16] // e....................................................................................................................................................................................................................................................................................... - sub v11.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................................*............. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
- ldr q15, [x1, #208] // ...e.................................................................................................................................................................................................................................................................................... - ldr q28, [x1, #144] // ..e..................................................................................................................................................................................................................................................................................... - str q12, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... - str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. - add v13.4S, v24.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... - sub v12.4S, v24.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... 
- mul v19.4S, v20.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - ldr q24, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... - add v10.4S, v9.4S, v21.4S // .................e...................................................................................................................................................................................................................................................................... - add v18.4S, v28.4S, v15.4S // ......................e................................................................................................................................................................................................................................................................. - sqrdmulh v20.4S, v20.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... - sub v28.4S, v28.4S, v15.4S // .....................e.................................................................................................................................................................................................................................................................. 
- str q23, [x1, #448] // .......................................................................................................................................................................................................................................................................................* - sub v21.4S, v9.4S, v21.4S // ................e....................................................................................................................................................................................................................................................................... - mls v22.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - add v9.4S, v10.4S, v18.4S // .........................................................e.............................................................................................................................................................................................................................. - sqrdmulh v11.4S, v28.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- ldr q16, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - sqrdmulh v15.4S, v21.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... - mul v23.4S, v28.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - mul v22.4S, v21.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... - add v21.4S, v24.4S, v17.4S // .....................................e.................................................................................................................................................................................................................................................. 
- sub v24.4S, v24.4S, v17.4S // ....................................e................................................................................................................................................................................................................................................... - ldr q17, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... - ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - mls v22.4S, v15.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... - sub v15.4S, v10.4S, v18.4S // ........................................................e............................................................................................................................................................................................................................... - add v10.4S, v16.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. 
- sub v16.4S, v16.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. - mul v14.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... - sub v27.4S, v9.4S, v13.4S // ................................................................................................e....................................................................................................................................................................................... - mls v23.4S, v11.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. - add v18.4S, v21.4S, v10.4S // .............................................................................e.......................................................................................................................................................................................................... - sub v21.4S, v21.4S, v10.4S // ............................................................................e........................................................................................................................................................................................................... 
- mls v14.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - add v8.4S, v9.4S, v13.4S // .................................................................................................e...................................................................................................................................................................................... - add v11.4S, v17.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... - mls v19.4S, v20.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - sub v10.4S, v17.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... - add v9.4S, v22.4S, v23.4S // ..............................................................e......................................................................................................................................................................................................................... 
- sqrdmulh v13.4S, v12.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - mul v12.4S, v12.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... - add v28.4S, v19.4S, v14.4S // ........................................................................e............................................................................................................................................................................................................... - sqrdmulh v20.4S, v24.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ - mul v17.4S, v24.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - sub v24.4S, v19.4S, v14.4S // .......................................................................e................................................................................................................................................................................................................ 
- sub v14.4S, v22.4S, v23.4S // .............................................................e.......................................................................................................................................................................................................................... - mls v12.4S, v13.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. - sub v23.4S, v9.4S, v28.4S // .....................................................................................................e.................................................................................................................................................................................. - add v9.4S, v9.4S, v28.4S // ......................................................................................................e................................................................................................................................................................................. - mul v19.4S, v21.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - sqrdmulh v21.4S, v21.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ 
- sqrdmulh v28.4S, v14.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... - mul v22.4S, v14.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ - sqrdmulh v13.4S, v15.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ - mul v15.4S, v15.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. - mls v19.4S, v21.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... - mul v21.4S, v27.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... 
- sqrdmulh v27.4S, v27.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... - mul v14.4S, v16.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - sqrdmulh v16.4S, v16.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... - mls v22.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... - ldr q28, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... - mls v15.4S, v13.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
- ldr q13, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... - mls v17.4S, v20.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - add v16.4S, v28.4S, v13.4S // ...............................................e........................................................................................................................................................................................................................................ - mls v21.4S, v27.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... - sub v27.4S, v16.4S, v11.4S // ......................................................................................e................................................................................................................................................................................................. 
- add v16.4S, v16.4S, v11.4S // .......................................................................................e................................................................................................................................................................................................ - sqrdmulh v20.4S, v10.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - sqrdmulh v11.4S, v27.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. - sub v28.4S, v28.4S, v13.4S // ..............................................e......................................................................................................................................................................................................................................... - mul v13.4S, v10.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. - sqrdmulh v10.4S, v28.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... 
- mls v13.4S, v20.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ - sub v20.4S, v15.4S, v12.4S // ..........................................................................................................e............................................................................................................................................................................. - add v12.4S, v15.4S, v12.4S // ...........................................................................................................e............................................................................................................................................................................ - mul v15.4S, v28.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... - add v28.4S, v18.4S, v16.4S // .....................................................................................................................e.................................................................................................................................................................. - sub v18.4S, v18.4S, v16.4S // ....................................................................................................................e................................................................................................................................................................... 
- mls v15.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - sub v16.4S, v17.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... - add v14.4S, v17.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... - sqrdmulh v10.4S, v18.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - sub v17.4S, v8.4S, v28.4S // ........................................................................................................................................e............................................................................................................................................... - add v8.4S, v8.4S, v28.4S // .........................................................................................................................................e.............................................................................................................................................. 
- mul v28.4S, v27.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... - add v27.4S, v15.4S, v13.4S // ............................................................................................e........................................................................................................................................................................................... - sub v15.4S, v15.4S, v13.4S // ...........................................................................................e............................................................................................................................................................................................ - mul v13.4S, v18.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. - mls v13.4S, v10.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... - add v10.4S, v14.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. 
- sub v14.4S, v14.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. - mls v28.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. - sub v18.4S, v9.4S, v10.4S // .............................................................................................................................................e.......................................................................................................................................... - add v9.4S, v9.4S, v10.4S // ..............................................................................................................................................e......................................................................................................................................... - mul v11.4S, v24.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. - sqrdmulh v27.4S, v24.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
- add v10.4S, v19.4S, v28.4S // ...............................................................................................................................e........................................................................................................................................................ - sub v28.4S, v19.4S, v28.4S // ..............................................................................................................................e......................................................................................................................................................... - sqrdmulh v19.4S, v23.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - mul v24.4S, v23.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - mls v24.4S, v19.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - mul v19.4S, v17.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. 
- sqrdmulh v23.4S, v17.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ - mls v11.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ - mul v17.4S, v18.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ - sqrdmulh v27.4S, v18.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... - mls v19.4S, v23.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... - sub v23.4S, v22.4S, v11.4S // ...............................................................................................................e........................................................................................................................................................................ 
- add v11.4S, v22.4S, v11.4S // ................................................................................................................e....................................................................................................................................................................... - sqrdmulh v22.4S, v16.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - mls v17.4S, v27.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... - mul v27.4S, v16.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... - mls v27.4S, v22.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. - sqrdmulh v18.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... 
- sqrdmulh v22.4S, v23.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... - mul v16.4S, v23.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... - mul v23.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... - mls v23.4S, v18.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - sub v18.4S, v12.4S, v10.4S // ..................................................................................................................................................e..................................................................................................................................... - add v10.4S, v12.4S, v10.4S // ...................................................................................................................................................e.................................................................................................................................... 
- add v12.4S, v21.4S, v13.4S // .............................................................................................................................................................e.......................................................................................................................... - sqrdmulh v15.4S, v14.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - sub v21.4S, v21.4S, v13.4S // ............................................................................................................................................................e........................................................................................................................... - mul v13.4S, v14.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ - mls v16.4S, v22.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... - mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... 
- sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................e.................................................................................................................................................... + // Instructions: 280 + // Expected cycles: 93 + // Expected IPC: 3.01 - // original source code - // ldr q8, [x1, #0] // ....................................................................................e.......................................................................................................................................|...............................................................................................................................................e........................ - // ldr q9, [x1, #(1*(512/8))] // .................................................................................e..........................................................................................................................................|............................................................................................................................................e........................... - // ldr q10, [x1, #(2*(512/8))] // ........................................................................................e...................................................................................................................................|...................................................................................................................................................e.................... 
- // ldr q11, [x1, #(3*(512/8))] // .......................................................................................e....................................................................................................................................|..................................................................................................................................................e..................... - // ldr q12, [x1, #(4*(512/8))] // ...............................e............................................................................................................................................................................................|..........................................................................................e............................................................................. - // ldr q13, [x1, #(5*(512/8))] // ......................................................e.....................................................................................................................................................................|.................................................................................................................e...................................................... - // ldr q14, [x1, #(6*(512/8))] // ..........................e.................................................................................................................................................................................................|.....................................................................................e.................................................................................. 
- // ldr q15, [x1, #(7*(512/8))] // e...........................................................................................................................................................................................................................|...........................................................e............................................................................................................ - // ldr q16, [x1, #(8*(512/8))] // ..............................................................................................e.............................................................................................................................|.........................................................................................................................................................e.............. - // ldr q17, [x1, #(9*(512/8))] // ....................................................................e.......................................................................................................................................................|...............................................................................................................................e........................................ - // ldr q18, [x1, #(10*(512/8))] // .........................................................................................................e..................................................................................................................|....................................................................................................................................................................e... 
- // ldr q19, [x1, #(11*(512/8))] // ...................................................................................e........................................................................................................................................|..............................................................................................................................................e......................... - // ldr q20, [x1, #(12*(512/8))] // .......................................................................................................................................................e....................................................................|........................................................................................................................................................................ - // ldr q21, [x1, #(13*(512/8))] // .........................................................................................................................................................e..................................................................|........................................................................................................................................................................ - // ldr q22, [x1, #(14*(512/8))] // ................................................................................................................e...........................................................................................................|........................................................................................................................................................................ 
- // ldr q23, [x1, #(15*(512/8))] // .................................................................................................................e..........................................................................................................|........................................................................................................................................................................ - // sub v24.4s, v8.4s, v9.4s // ....................................................................................................e.......................................................................................................................|...............................................................................................................................................................e........ - // add v8.4s, v8.4s, v9.4s // ...............................................................................................e............................................................................................................................|..........................................................................................................................................................e............. - // mul v9.4s, v24.4s, v3.s[2] // .............................................................................................................e..............................................................................................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................................................................................e.................................................................................................................|.....................................................................................................................................................................e.. - // mls v9.4s, v24.4s, v29.4s // ..................................................................................................................e.........................................................................................................|........................................................................................................................................................................ - // sub v24.4s, v10.4s, v11.4s // ..................................................................................................e.........................................................................................................................|.............................................................................................................................................................e.......... - // add v10.4s, v10.4s, v11.4s // ................................................................................................e...........................................................................................................................|...........................................................................................................................................................e............ 
- // mul v11.4s, v24.4s, v4.s[0] // ...........................................................................................................e................................................................................................................|......................................................................................................................................................................e. - // sqrdmulh v24.4s, v24.4s, v4.s[1] // .......................................................................................................e....................................................................................................................|..................................................................................................................................................................e..... - // mls v11.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................................................|........................................................................................................................................................................ - // sub v24.4s, v12.4s, v13.4s // ................................................................................e...........................................................................................................................................|...........................................................................................................................................e............................ 
- // add v12.4s, v12.4s, v13.4s // ......................................................................e.....................................................................................................................................................|.................................................................................................................................e...................................... - // mul v13.4s, v24.4s, v4.s[2] // .............................................................................................e..............................................................................................................................|........................................................................................................................................................e............... - // sqrdmulh v24.4s, v24.4s, v4.s[3] // .................................................................................................e..........................................................................................................................|............................................................................................................................................................e........... - // mls v13.4s, v24.4s, v29.4s // ..............................................................................................................................e.............................................................................................|........................................................................................................................................................................ 
- // sub v24.4s, v14.4s, v15.4s // .................................e..........................................................................................................................................................................................|............................................................................................e........................................................................... - // add v14.4s, v14.4s, v15.4s // ..................................................e.........................................................................................................................................................................|.............................................................................................................e.......................................................... - // mul v15.4s, v24.4s, v5.s[0] // ......................................................................................................................e.....................................................................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v5.s[1] // ..................................................................................e.........................................................................................................................................|.............................................................................................................................................e.......................... 
- // mls v15.4s, v24.4s, v29.4s // ...........................................................................................................................e................................................................................................|........................................................................................................................................................................ - // sub v24.4s, v16.4s, v17.4s // ...............................................................................................................e............................................................................................................|........................................................................................................................................................................ - // add v16.4s, v16.4s, v17.4s // ..............................................................................................................e.............................................................................................................|........................................................................................................................................................................ - // mul v17.4s, v24.4s, v5.s[2] // .....................................................................................................................................e......................................................................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v5.s[3] // ....................................................................................................................................e.......................................................................................|........................................................................................................................................................................ - // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................e.................................................................|........................................................................................................................................................................ - // sub v24.4s, v18.4s, v19.4s // .....................................................................................................................e......................................................................................................|........................................................................................................................................................................ - // add v18.4s, v18.4s, v19.4s // ....................................................................................................................e.......................................................................................................|........................................................................................................................................................................ 
- // mul v19.4s, v24.4s, v6.s[0] // ....................................................................................................................................................e.......................................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v6.s[1] // .....................................................................................................................................................e......................................................................|........................................................................................................................................................................ - // mls v19.4s, v24.4s, v29.4s // ...........................................................................................................................................................e................................................................|........................................................................................................................................................................ - // sub v24.4s, v20.4s, v21.4s // ..................................................................................................................................................................e.........................................................|........................................................................................................................................................................ 
- // add v20.4s, v20.4s, v21.4s // ............................................................................................................................................................e...............................................................|........................................................................................................................................................................ - // mul v21.4s, v24.4s, v6.s[2] // ........................................................................................................................................................................e...................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................................................................................................e.......................................................|........................................................................................................................................................................ - // mls v21.4s, v24.4s, v29.4s // ...........................................................................................................................................................................e................................................|........................................................................................................................................................................ 
- // sub v24.4s, v22.4s, v23.4s // ...............................................................................................................................e............................................................................................|........................................................................................................................................................................ - // add v22.4s, v22.4s, v23.4s // .............................................................................................................................e..............................................................................................|........................................................................................................................................................................ - // mul v23.4s, v24.4s, v7.s[0] // ...................................................................................................................................................................e........................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................................................e...........................................................|........................................................................................................................................................................ 
- // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................................................e......................................................|........................................................................................................................................................................ - // sub v24.4s, v8.4s, v10.4s // ...................................................................................................................e........................................................................................................|........................................................................................................................................................................ - // add v8.4s, v8.4s, v10.4s // ......................................................................................................e.....................................................................................................................|.................................................................................................................................................................e...... - // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................................................................e...........................................................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................................................................................e............................................................................|........................................................................................................................................................................ - // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................................................|........................................................................................................................................................................ - // sub v24.4s, v9.4s, v11.4s // .......................................................................................................................................e....................................................................................|........................................................................................................................................................................ - // add v9.4s, v9.4s, v11.4s // ................................................................................................................................e...........................................................................................|........................................................................................................................................................................ 
- // mul v11.4s, v24.4s, v1.s[2] // ..............................................................................................................................................e.............................................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................................................................e..............................................................................|........................................................................................................................................................................ - // mls v11.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................................................|........................................................................................................................................................................ - // sub v24.4s, v12.4s, v14.4s // ............................................................................................e...............................................................................................................................|.......................................................................................................................................................e................ 
- // add v12.4s, v12.4s, v14.4s // ...........................................................................................e................................................................................................................................|......................................................................................................................................................e................. - // mul v14.4s, v24.4s, v2.s[0] // ..................................................................................................................................e.........................................................................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................................................................................................................e..........................................................................................|........................................................................................................................................................................ - // mls v14.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................................................................|........................................................................................................................................................................ 
- // sub v24.4s, v13.4s, v15.4s // ......................................................................................................................................e.....................................................................................|........................................................................................................................................................................ - // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e........................................................................................|........................................................................................................................................................................ - // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................................................................................................e................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................................................................................................e...............................|........................................................................................................................................................................ 
- // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................................................................................................e.......................|........................................................................................................................................................................ - // sub v24.4s, v16.4s, v18.4s // ..........................................................................................................................e.................................................................................................|........................................................................................................................................................................ - // add v16.4s, v16.4s, v18.4s // .........................................................................................................................e..................................................................................................|........................................................................................................................................................................ - // mul v18.4s, v24.4s, v2.s[2] // ...........................................................................................................................................e................................................................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................................................................e...............................................................................|........................................................................................................................................................................ - // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................................................................|........................................................................................................................................................................ - // sub v24.4s, v17.4s, v19.4s // ............................................................................................................................................................................e...............................................|........................................................................................................................................................................ - // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................................................e..............................................|........................................................................................................................................................................ 
- // mul v19.4s, v24.4s, v2.s[2] // ............................................................................................................................................................................................................e...............|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................................................................................................................e.................|........................................................................................................................................................................ - // mls v19.4s, v24.4s, v29.4s // .............................................................................................................................................................................................................e..............|........................................................................................................................................................................ - // sub v24.4s, v20.4s, v22.4s // ..............................................................................................................................................................e.............................................................|........................................................................................................................................................................ 
- // add v20.4s, v20.4s, v22.4s // ...............................................................................................................................................................e............................................................|........................................................................................................................................................................ - // mul v22.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................e..........................................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................e..........................................................|........................................................................................................................................................................ - // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e...................................|........................................................................................................................................................................ 
- // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................................................e........................................|........................................................................................................................................................................ - // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................................................e.........................................|........................................................................................................................................................................ - // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................................................e..........|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................................................................................................................................e.............|........................................................................................................................................................................ 
- // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................................e.........|........................................................................................................................................................................ - // sub v24.4s, v8.4s, v12.4s // .......................................................................................................................e....................................................................................................|........................................................................................................................................................................ - // add v8.4s, v8.4s, v12.4s // ............................................................................................................................e...............................................................................................|........................................................................................................................................................................ - // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................e.........................................................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................e........................................................................|........................................................................................................................................................................ - // mls v12.4s, v24.4s, v29.4s // .............................................................................................................................................................e..............................................................|........................................................................................................................................................................ - // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................e..................................................................................|........................................................................................................................................................................ - // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................................e.................................................................................|........................................................................................................................................................................ 
- // mul v13.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................e...........................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................e............................|........................................................................................................................................................................ - // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................................................e..........................|........................................................................................................................................................................ - // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................................e.....................................................|........................................................................................................................................................................ 
- // add v10.4s, v10.4s, v14.4s // .......................................................................................................................................................................e....................................................|........................................................................................................................................................................ - // mul v14.4s, v24.4s, v0.s[2] // ............................................................................................................................................................................................................................|........*............................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................................................................................................................................|.....*.................................................................................................................................................................. - // mls v14.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........*............................................................................................................................................................ 
- // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................................................................e...................|........................................................................................................................................................................ - // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................................................................................e..................|........................................................................................................................................................................ - // mul v15.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................................e...........|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................................e............|........................................................................................................................................................................ 
- // mls v15.4s, v24.4s, v29.4s // .........................................................................................................................................................................................................................e..|........................................................................................................................................................................ - // sub v24.4s, v16.4s, v20.4s // ..........................................................................................................................................................................e.................................................|........................................................................................................................................................................ - // add v16.4s, v16.4s, v20.4s // .........................................................................................................................................................................e..................................................|........................................................................................................................................................................ - // mul v20.4s, v24.4s, v1.s[0] // ....................................................................................................................................................................................e.......................................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................................e.............................................|........................................................................................................................................................................ - // mls v20.4s, v24.4s, v29.4s // .....................................................................................................................................................................................e......................................|........................................................................................................................................................................ - // sub v24.4s, v17.4s, v21.4s // .......................................................................................................................................................................................e....................................|........................................................................................................................................................................ - // add v17.4s, v17.4s, v21.4s // ......................................................................................................................................................................................e.....................................|........................................................................................................................................................................ 
- // mul v21.4s, v24.4s, v1.s[0] // ........................................................................................................................................................................................................................e...|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................................................................e.....|........................................................................................................................................................................ - // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|..*..................................................................................................................................................................... - // sub v24.4s, v18.4s, v22.4s // ..............................................................................................................................................................................................e.............................|........................................................................................................................................................................ 
- // add v18.4s, v18.4s, v22.4s // .............................................................................................................................................................................................e..............................|........................................................................................................................................................................ - // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................................................e.|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................*........................................................................................................................................................................ - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|......*................................................................................................................................................................. 
- // sub v24.4s, v19.4s, v23.4s // ...........................................................................................................................................................................................................................e|........................................................................................................................................................................ - // add v19.4s, v19.4s, v23.4s // ............................................................................................................................................................................................................................|*....................................................................................................................................................................... - // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................................................|............*........................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................|..........*............................................................................................................................................................. 
- // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............*.......................................................................................................................................................... - // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................................................................................e............................................|........................................................................................................................................................................ - // add v8.4s, v8.4s, v16.4s // ................................................................................................................................................................................e...........................................|........................................................................................................................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................e.........................|........................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................................................e........................|........................................................................................................................................................................ - // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................................................e....................|........................................................................................................................................................................ - // sub v24.4s, v9.4s, v17.4s // .........................................................................................................................................................................................e..................................|........................................................................................................................................................................ - // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................................................e.................................|........................................................................................................................................................................ 
- // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................................................e......................|........................................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................................................e.....................|........................................................................................................................................................................ - // mls v17.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................................e................|........................................................................................................................................................................ - // sub v24.4s, v10.4s, v18.4s // ...................................................................................................................................................................................................................e........|........................................................................................................................................................................ 
- // add v10.4s, v10.4s, v18.4s // ....................................................................................................................................................................................................................e.......|........................................................................................................................................................................ - // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..........................*............................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.........................*.............................................................................................................................................. - // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|....................................*................................................................................................................................... 
- // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|...*.................................................................................................................................................................... - // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|....*................................................................................................................................................................... - // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|....................*................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................*...................................................................................................................................................... 
- // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........................*............................................................................................................................................ - // sub v24.4s, v12.4s, v20.4s // .......................................................................................................................................................................................................................e....|........................................................................................................................................................................ - // add v12.4s, v12.4s, v20.4s // .....................................................................................................................................................................................................................e......|........................................................................................................................................................................ - // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..................................*..................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................*...................................................................................................................................... - // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.........................................*.............................................................................................................................. - // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|...............*........................................................................................................................................................ - // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|................*....................................................................................................................................................... 
- // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|........................................................*............................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................................*...................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // .*..........................................................................................................................................................................................................................|............................................................*........................................................................................................... - // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|..................*..................................................................................................................................................... 
- // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|...................*.................................................................................................................................................... - // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|............................................*........................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...........................................*............................................................................................................................ - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...............................................*........................................................................................................................ 
- // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.......................*................................................................................................................................................ - // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.....................*.................................................................................................................................................. - // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|.............................*.......................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...............................*........................................................................................................................................ 
- // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............................................*.......................................................................................................................... - // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................................................|.*...................................................................................................................................................................... - // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................................................|.......*................................................................................................................................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|.........*.............................................................................................................................................................. 
- // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|..............*......................................................................................................................................................... - // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................................................|........................*............................................................................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................................................|............................*........................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..............................*......................................................................................................................................... 
- // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|................................*....................................................................................................................................... - // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................................................|..............................................*......................................................................................................................... - // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................................................|................................................*....................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..................................................*..................................................................................................................... 
- // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|....................................................*................................................................................................................... - // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................................................|........................................*............................................................................................................................... - // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................................................|.....................................*.................................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..........................................*............................................................................................................................. 
- // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|.....................................................*.................................................................................................................. - // cmge v27.4s, v31.4s, v20.4s // ..............*.............................................................................................................................................................................................................|.........................................................................*.............................................................................................. - // cmge v28.4s, v20.4s, v30.4s // ........*...................................................................................................................................................................................................................|...................................................................*.................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ..................*.........................................................................................................................................................................................................|.............................................................................*.......................................................................................... 
- // mls v20.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................................................................|...................................................................................*.................................................................................... - // cmge v27.4s, v31.4s, v21.4s // ............*...............................................................................................................................................................................................................|.......................................................................*................................................................................................ - // cmge v28.4s, v21.4s, v30.4s // ...........*................................................................................................................................................................................................................|......................................................................*................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ................*...........................................................................................................................................................................................................|...........................................................................*............................................................................................ 
- // mls v21.4s, v28.4s, v29.4s // ............................*...............................................................................................................................................................................................|.......................................................................................*................................................................................ - // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................................................|..........................................................*............................................................................................................. - // cmge v28.4s, v22.4s, v30.4s // ..*.........................................................................................................................................................................................................................|.............................................................*.......................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ......*.....................................................................................................................................................................................................................|.................................................................*...................................................................................................... 
- // mls v22.4s, v28.4s, v29.4s // ..........*.................................................................................................................................................................................................................|.....................................................................*.................................................................................................. - // cmge v27.4s, v31.4s, v23.4s // ...*........................................................................................................................................................................................................................|..............................................................*......................................................................................................... - // cmge v28.4s, v23.4s, v30.4s // .....*......................................................................................................................................................................................................................|................................................................*....................................................................................................... - // sub v28.4s, v27.4s, v28.4s // .........*..................................................................................................................................................................................................................|....................................................................*................................................................................................... 
- // mls v23.4s, v28.4s, v29.4s // .............*..............................................................................................................................................................................................................|........................................................................*............................................................................................... - // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................................................|......................*................................................................................................................................................. - // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................................................|...................................*.................................................................................................................................... - // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................................................................|.......................................................*................................................................................................................ 
- // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................................................................|.........................................................*.............................................................................................................. - // str q20, [x1, #(12*(512/8))] // ................................*...........................................................................................................................................................................................|...........................................................................................*............................................................................ - // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................................................................|..............................................................................................*......................................................................... - // str q22, [x1, #(14*(512/8))] // ...................*........................................................................................................................................................................................................|..............................................................................*......................................................................................... 
- // str q23, [x1, #(15*(512/8))] // ......................*.....................................................................................................................................................................................................|.................................................................................*...................................................................................... - // mul v16.4s, v8.4s, v25.4s // ....................*.......................................................................................................................................................................................................|...............................................................................*........................................................................................ - // sqrdmulh v8.4s, v8.4s, v26.4s // .................*..........................................................................................................................................................................................................|............................................................................*........................................................................................... - // mls v16.4s, v8.4s, v29.4s // ..............................*.............................................................................................................................................................................................|.........................................................................................*.............................................................................. 
- // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................................................|.......................................*................................................................................................................................ - // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................................................|......................................*................................................................................................................................. - // mls v17.4s, v9.4s, v29.4s // .......*....................................................................................................................................................................................................................|..................................................................*..................................................................................................... - // mul v18.4s, v10.4s, v25.4s // ................................................*...........................................................................................................................................................................|...........................................................................................................*............................................................ 
- // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................*..........................................................................................................................................................................|............................................................................................................*........................................................... - // mls v18.4s, v10.4s, v29.4s // ........................................................*...................................................................................................................................................................|...................................................................................................................*.................................................... - // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................................................................|......................................................*................................................................................................................. - // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................................................................|...................................................*.................................................................................................................... 
- // mls v19.4s, v11.4s, v29.4s // ....*.......................................................................................................................................................................................................................|...............................................................*........................................................................................................ - // mul v20.4s, v12.4s, v25.4s // ...........................................*................................................................................................................................................................................|......................................................................................................*................................................................. - // sqrdmulh v12.4s, v12.4s, v26.4s // .........................................*..................................................................................................................................................................................|....................................................................................................*................................................................... - // mls v20.4s, v12.4s, v29.4s // ....................................................*.......................................................................................................................................................................|...............................................................................................................*........................................................ 
- // mul v21.4s, v13.4s, v25.4s // ....................................*.......................................................................................................................................................................................|...............................................................................................*........................................................................ - // sqrdmulh v13.4s, v13.4s, v26.4s // ..................................*.........................................................................................................................................................................................|.............................................................................................*.......................................................................... - // mls v21.4s, v13.4s, v29.4s // ...................................................*........................................................................................................................................................................|..............................................................................................................*......................................................... - // mul v22.4s, v14.4s, v25.4s // ............................................................*...............................................................................................................................................................|.......................................................................................................................*................................................ 
- // sqrdmulh v14.4s, v14.4s, v26.4s // .........................................................*..................................................................................................................................................................|....................................................................................................................*................................................... - // mls v22.4s, v14.4s, v29.4s // ..................................................................*.........................................................................................................................................................|.............................................................................................................................*.......................................... - // mul v23.4s, v15.4s, v25.4s // .............................................*..............................................................................................................................................................................|........................................................................................................*............................................................... - // sqrdmulh v15.4s, v15.4s, v26.4s // ..............................................*.............................................................................................................................................................................|.........................................................................................................*.............................................................. 
- // mls v23.4s, v15.4s, v29.4s // ...............................................................*............................................................................................................................................................|..........................................................................................................................*............................................. - // cmge v27.4s, v31.4s, v16.4s // .....................................*......................................................................................................................................................................................|................................................................................................*....................................................................... - // cmge v28.4s, v16.4s, v30.4s // .......................................*....................................................................................................................................................................................|..................................................................................................*..................................................................... - // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................................................................|..........................................................................................................*............................................................. 
- // mls v16.4s, v28.4s, v29.4s // ...............................................................................*............................................................................................................................................|..........................................................................................................................................*............................. - // cmge v27.4s, v31.4s, v17.4s // .......................*....................................................................................................................................................................................................|..................................................................................*..................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ...............*............................................................................................................................................................................................................|..........................................................................*............................................................................................. - // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................................................................|....................................................................................*................................................................................... 
- // mls v17.4s, v28.4s, v29.4s // ........................................*...................................................................................................................................................................................|...................................................................................................*.................................................................... - // cmge v27.4s, v31.4s, v18.4s // ..............................................................*.............................................................................................................................................................|.........................................................................................................................*.............................................. - // cmge v28.4s, v18.4s, v30.4s // ................................................................*...........................................................................................................................................................|...........................................................................................................................*............................................ - // sub v28.4s, v27.4s, v28.4s // ...................................................................*........................................................................................................................................................|..............................................................................................................................*......................................... 
- // mls v18.4s, v28.4s, v29.4s // ........................................................................*...................................................................................................................................................|...................................................................................................................................*.................................... - // cmge v27.4s, v31.4s, v19.4s // .....................*......................................................................................................................................................................................................|................................................................................*....................................................................................... - // cmge v28.4s, v19.4s, v30.4s // ...........................*................................................................................................................................................................................................|......................................................................................*................................................................................. - // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................................................|........................................................................................*............................................................................... 
- // mls v19.4s, v28.4s, v29.4s // ......................................*.....................................................................................................................................................................................|.................................................................................................*...................................................................... - // cmge v27.4s, v31.4s, v20.4s // .............................................................*..............................................................................................................................................................|........................................................................................................................*............................................... - // cmge v28.4s, v20.4s, v30.4s // ...........................................................*................................................................................................................................................................|......................................................................................................................*................................................. - // sub v28.4s, v27.4s, v28.4s // .................................................................*..........................................................................................................................................................|............................................................................................................................*........................................... 
- // mls v20.4s, v28.4s, v29.4s // .....................................................................*......................................................................................................................................................|................................................................................................................................*....................................... - // cmge v27.4s, v31.4s, v21.4s // .....................................................*......................................................................................................................................................................|................................................................................................................*....................................................... - // cmge v28.4s, v21.4s, v30.4s // .......................................................*....................................................................................................................................................................|..................................................................................................................*..................................................... - // sub v28.4s, v27.4s, v28.4s // ..........................................................*.................................................................................................................................................................|.....................................................................................................................*.................................................. 
- // mls v21.4s, v28.4s, v29.4s // ...........................................................................*................................................................................................................................................|......................................................................................................................................*................................. - // cmge v27.4s, v31.4s, v22.4s // ..........................................................................*.................................................................................................................................................|.....................................................................................................................................*.................................. - // cmge v28.4s, v22.4s, v30.4s // ............................................................................*...............................................................................................................................................|.......................................................................................................................................*................................ - // sub v28.4s, v27.4s, v28.4s // .....................................................................................*......................................................................................................................................|................................................................................................................................................*....................... 
- // mls v22.4s, v28.4s, v29.4s // .....................................................................................................*......................................................................................................................|................................................................................................................................................................*....... - // cmge v27.4s, v31.4s, v23.4s // .........................................................................*..................................................................................................................................................|....................................................................................................................................*................................... - // cmge v28.4s, v23.4s, v30.4s // .......................................................................*....................................................................................................................................................|..................................................................................................................................*..................................... - // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.............................................................................................................................................|.........................................................................................................................................*.............................. 
- // mls v23.4s, v28.4s, v29.4s // ......................................................................................*.....................................................................................................................................|.................................................................................................................................................*...................... - // str q16, [x1], #(16) // ........................................................................................................*...................................................................................................................|...................................................................................................................................................................*.... - // str q17, [x1, #(-16 + 1*(512/8))] // ............................................*...............................................................................................................................................................................|.......................................................................................................*................................................................ - // str q18, [x1, #(-16 + 2*(512/8))] // .........................................................................................*..................................................................................................................................|....................................................................................................................................................*................... 
- // str q19, [x1, #(-16 + 3*(512/8))] // ..........................................*.................................................................................................................................................................................|.....................................................................................................*.................................................................. - // str q20, [x1, #(-16 + 4*(512/8))] // .............................................................................*..............................................................................................................................................|........................................................................................................................................*............................... - // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................................*.................................................................................................................................|.....................................................................................................................................................*.................. 
- // str q22, [x1, #(-16 + 6*(512/8))] // ............................................................................................................*...............................................................................................................|.......................................................................................................................................................................* - // str q23, [x1, #(-16 + 7*(512/8))] // ...................................................................................................*........................................................................................................................|..............................................................................................................................................................*......... + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + sqrdmulh v20.4S, v8.4S, v26.4S // .........................................*.............................................................................................................................................................................................................................................. 
+ mul v8.4S, v8.4S, v25.4S // ...........................................*............................................................................................................................................................................................................................................ + mls v15.4S, v17.4S, v29.4S // *....................................................................................................................................................................................................................................................................................... + sqrdmulh v13.4S, v23.4S, v1.S[1] // .................*...................................................................................................................................................................................................................................................................... + mul v17.4S, v23.4S, v1.S[0] // ....................*................................................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v18.4S, v0.S[1] // .............*.......................................................................................................................................................................................................................................................................... + mls v17.4S, v13.4S, v29.4S // .....................*.................................................................................................................................................................................................................................................................. 
+ mul v13.4S, v18.4S, v0.S[0] // ..................*..................................................................................................................................................................................................................................................................... + mls v13.4S, v23.4S, v29.4S // .......................*................................................................................................................................................................................................................................................................ + sub v23.4S, v14.4S, v17.4S // ..........................*............................................................................................................................................................................................................................................................. + add v14.4S, v14.4S, v17.4S // ...........................*............................................................................................................................................................................................................................................................ + sqrdmulh v18.4S, v28.4S, v0.S[1] // ........*............................................................................................................................................................................................................................................................................... + sub v17.4S, v15.4S, v19.4S // ...............*........................................................................................................................................................................................................................................................................ 
+ add v15.4S, v15.4S, v19.4S // ................*....................................................................................................................................................................................................................................................................... + mul v28.4S, v28.4S, v0.S[0] // .......*................................................................................................................................................................................................................................................................................ + cmge v19.4S, v13.4S, v30.4S // ..........................................*............................................................................................................................................................................................................................................. + mls v8.4S, v20.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + cmge v20.4S, v31.4S, v13.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v28.4S, v18.4S, v29.4S // ............*........................................................................................................................................................................................................................................................................... 
+ sub v20.4S, v20.4S, v19.4S // ...............................................*........................................................................................................................................................................................................................................ + sqrdmulh v19.4S, v17.4S, v0.S[1] // ...................................*.................................................................................................................................................................................................................................................... + mls v13.4S, v20.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + cmge v20.4S, v31.4S, v28.4S // ................................*....................................................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // .................................*...................................................................................................................................................................................................................................................... + cmge v18.4S, v28.4S, v30.4S // ..............................*......................................................................................................................................................................................................................................................... 
+ mls v17.4S, v19.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + cmge v19.4S, v8.4S, v30.4S // .............................................................*.......................................................................................................................................................................................................................... + str q13, [x1, #640] // .....................................................*.................................................................................................................................................................................................................................. + sub v20.4S, v20.4S, v18.4S // ..................................*..................................................................................................................................................................................................................................................... + sqrdmulh v13.4S, v22.4S, v0.S[1] // ......*................................................................................................................................................................................................................................................................................. + mul v18.4S, v22.4S, v0.S[0] // ...*.................................................................................................................................................................................................................................................................................... 
+ mls v28.4S, v20.4S, v29.4S // .......................................*................................................................................................................................................................................................................................................ + cmge v20.4S, v31.4S, v8.4S // ...........................................................*............................................................................................................................................................................................................................ + mls v18.4S, v13.4S, v29.4S // ..........*............................................................................................................................................................................................................................................................................. + cmge v13.4S, v17.4S, v30.4S // .....................................................................*.................................................................................................................................................................................................................. + sub v19.4S, v20.4S, v19.4S // ................................................................*....................................................................................................................................................................................................................... + cmge v20.4S, v31.4S, v17.4S // ......................................................................*................................................................................................................................................................................................................. 
+ sqrdmulh v22.4S, v23.4S, v0.S[1] // ................................................*....................................................................................................................................................................................................................................... + str q28, [x1, #512] // ............................................*........................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v11.4S, v26.4S // ...............................................................*........................................................................................................................................................................................................................ + sub v13.4S, v20.4S, v13.4S // ..........................................................................*............................................................................................................................................................................................................. + mul v11.4S, v11.4S, v25.4S // .................................................................*...................................................................................................................................................................................................................... + sub v20.4S, v21.4S, v16.4S // .*...................................................................................................................................................................................................................................................................................... 
+ mls v17.4S, v13.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + mls v8.4S, v19.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sqrdmulh v13.4S, v20.4S, v0.S[1] // .........................*.............................................................................................................................................................................................................................................................. + str q17, [x1, #960] // ............................................................................................*........................................................................................................................................................................................... + sub v17.4S, v9.4S, v24.4S // ....*................................................................................................................................................................................................................................................................................... + add v9.4S, v9.4S, v24.4S // .....*.................................................................................................................................................................................................................................................................................. 
+ mul v20.4S, v20.4S, v0.S[0] // ............................*........................................................................................................................................................................................................................................................... + str q8, [x1], #(16) // .....................................................................................*.................................................................................................................................................................................................. + mul v24.4S, v23.4S, v0.S[0] // .................................................*...................................................................................................................................................................................................................................... + cmge v23.4S, v31.4S, v18.4S // ....................................*................................................................................................................................................................................................................................................... + mls v20.4S, v13.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + add v13.4S, v21.4S, v16.4S // ..*..................................................................................................................................................................................................................................................................................... 
+ mls v24.4S, v22.4S, v29.4S // .........................................................*.............................................................................................................................................................................................................................. + cmge v16.4S, v18.4S, v30.4S // ......................................*................................................................................................................................................................................................................................................. + sqrdmulh v19.4S, v12.4S, v26.4S // .....................................................................................................*.................................................................................................................................................................................. + ldr q8, [x1, #64] // ..................................................................................................................................................*..................................................................................................................................... + mul v12.4S, v12.4S, v25.4S // ........................................................................................................*............................................................................................................................................................................... + sub v22.4S, v23.4S, v16.4S // ........................................*............................................................................................................................................................................................................................................... 
+ mls v11.4S, v28.4S, v29.4S // ..............................................................................................*......................................................................................................................................................................................... + cmge v28.4S, v24.4S, v30.4S // ..................................................................*..................................................................................................................................................................................................................... + mls v18.4S, v22.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + cmge v22.4S, v31.4S, v24.4S // ...................................................................*.................................................................................................................................................................................................................... + sqrdmulh v23.4S, v27.4S, v0.S[1] // ...............................................................................*........................................................................................................................................................................................................ + sub v16.4S, v22.4S, v28.4S // .......................................................................*................................................................................................................................................................................................................ 
+ mul v21.4S, v27.4S, v0.S[0] // ..................................................................................*..................................................................................................................................................................................................... + str q18, [x1, #752] // ........................................................*............................................................................................................................................................................................................................... + mul v18.4S, v10.4S, v25.4S // ....................................................*................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v10.4S, v26.4S // ...................................................*.................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v9.4S, v26.4S // .........................................................................................*.............................................................................................................................................................................................. + mul v9.4S, v9.4S, v25.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ mul v10.4S, v17.4S, v0.S[0] // .........*.............................................................................................................................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v0.S[1] // ...........*............................................................................................................................................................................................................................................................................ + mls v18.4S, v27.4S, v29.4S // ..............................................................................*......................................................................................................................................................................................................... + mls v24.4S, v16.4S, v29.4S // ..................................................................................................*..................................................................................................................................................................................... + mls v10.4S, v17.4S, v29.4S // ..............*......................................................................................................................................................................................................................................................................... + cmge v27.4S, v31.4S, v18.4S // .................................................................................*...................................................................................................................................................................................................... 
+ mls v21.4S, v23.4S, v29.4S // ...................................................................................*.................................................................................................................................................................................................... + cmge v23.4S, v18.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + str q24, [x1, #880] // .........................................................................................................*.............................................................................................................................................................................. + mls v12.4S, v19.4S, v29.4S // ...........................................................................................................*............................................................................................................................................................................ + cmge v19.4S, v31.4S, v10.4S // ...................*.................................................................................................................................................................................................................................................................... + cmge v17.4S, v31.4S, v21.4S // .......................................................................................*................................................................................................................................................................................................ 
+ mls v9.4S, v22.4S, v29.4S // ..........................................................................................................*............................................................................................................................................................................. + cmge v24.4S, v21.4S, v30.4S // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v22.4S, v15.4S, v26.4S // ............................................................................*........................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v20.4S // ......................................................*................................................................................................................................................................................................................................. + cmge v28.4S, v20.4S, v30.4S // ..........................................................*............................................................................................................................................................................................................................. + sub v24.4S, v17.4S, v24.4S // .............................................................................................*.......................................................................................................................................................................................... 
+ mul v15.4S, v15.4S, v25.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v27.4S, v27.4S, v23.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v16.4S, v16.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... + sqrdmulh v23.4S, v14.4S, v26.4S // ........................................................................*............................................................................................................................................................................................................... + cmge v28.4S, v10.4S, v30.4S // ......................*................................................................................................................................................................................................................................................................. + mul v14.4S, v14.4S, v25.4S // .........................................................................*.............................................................................................................................................................................................................. 
+ cmge v17.4S, v11.4S, v30.4S // ...................................................................................................*.................................................................................................................................................................................... + mls v21.4S, v24.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v19.4S, v19.4S, v28.4S // ........................*............................................................................................................................................................................................................................................................... + cmge v28.4S, v12.4S, v30.4S // ..................................................................................................................*..................................................................................................................................................................... + mls v15.4S, v22.4S, v29.4S // .......................................................................................................................*................................................................................................................................................................ + cmge v22.4S, v31.4S, v12.4S // .................................................................................................................*...................................................................................................................................................................... 
+ sqrdmulh v24.4S, v13.4S, v26.4S // ............................................................................................................*........................................................................................................................................................................... + str q21, [x1, #688] // ....................................................................................................*................................................................................................................................................................................... + sub v28.4S, v22.4S, v28.4S // ........................................................................................................................*............................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + cmge v23.4S, v31.4S, v11.4S // .................................................................................................*...................................................................................................................................................................................... + mul v13.4S, v13.4S, v25.4S // ................................................................................................................*....................................................................................................................................................................... 
+ sub v22.4S, v23.4S, v17.4S // .......................................................................................................*................................................................................................................................................................................ + mls v13.4S, v24.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + cmge v24.4S, v31.4S, v14.4S // ....................................................................................................................*................................................................................................................................................................... + ldr q21, [x1, #576] // .....................................................................................................................................................*.................................................................................................................................. + mls v10.4S, v19.4S, v29.4S // ...............................*........................................................................................................................................................................................................................................................ + cmge v17.4S, v31.4S, v15.4S // ..............................................................................................................................*......................................................................................................................................................... 
+ cmge v19.4S, v31.4S, v13.4S // ...........................................................................................................................*............................................................................................................................................................ + mls v12.4S, v28.4S, v29.4S // ............................................................................................................................*........................................................................................................................................................... + cmge v28.4S, v13.4S, v30.4S // .............................................................................................................................*.......................................................................................................................................................... + cmge v23.4S, v15.4S, v30.4S // ................................................................................................................................*....................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // .........................................................................................................................*.............................................................................................................................................................. + cmge v22.4S, v14.4S, v30.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ sub v19.4S, v19.4S, v28.4S // ..................................................................................................................................*..................................................................................................................................................... + ldr q28, [x1, #704] // .............................................................................................................................................*.......................................................................................................................................... + str q10, [x1, #560] // .....................................*.................................................................................................................................................................................................................................................. + sub v17.4S, v17.4S, v23.4S // .....................................................................................................................................*.................................................................................................................................................. + ldr q23, [x1, #640] // ............................................................................................................................................*........................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // ................................................................................................*....................................................................................................................................................................................... 
+ sub v24.4S, v24.4S, v22.4S // ..........................................................................................................................*............................................................................................................................................................. + ldr q22, [x1] // ..............................................................................................................................................*......................................................................................................................................... + str q12, [x1, #240] // ....................................................................................................................................*................................................................................................................................................... + mls v13.4S, v19.4S, v29.4S // ......................................................................................................................................*................................................................................................................................................. + ldr q19, [x1, #512] // ...............................................................................................................................................*........................................................................................................................................ + cmge v27.4S, v9.4S, v30.4S // .............................................................................................................*.......................................................................................................................................................................... 
+ str q11, [x1, #176] // .................................................................................................................................*...................................................................................................................................................... + cmge v12.4S, v31.4S, v9.4S // ...............................................................................................................*........................................................................................................................................................................ + mls v14.4S, v24.4S, v29.4S // ...............................................................................................................................*........................................................................................................................................................ + add v10.4S, v23.4S, v28.4S // ....................................................................................................................................................*................................................................................................................................... + str q18, [x1, #112] // ......................................................................................................*................................................................................................................................................................................. + ldr q18, [x1, #192] // ................................................................................................................................................*....................................................................................................................................... 
+ sub v12.4S, v12.4S, v27.4S // ......................................................................................................................*................................................................................................................................................................. + mls v20.4S, v16.4S, v29.4S // ....................................................................*................................................................................................................................................................................................................... + str q13, [x1, #304] // ..........................................................................................................................................*............................................................................................................................................. + add v13.4S, v22.4S, v8.4S // ...........................................................................................................................................................................*............................................................................................................ + sub v24.4S, v22.4S, v8.4S // .......................................................................................................................................................*................................................................................................................................ + str q14, [x1, #368] // .......................................................................................................................................*................................................................................................................................................ 
+ ldr q14, [x1, #448] // ...................................................................................................................................................................*.................................................................................................................... + mls v15.4S, v17.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + add v17.4S, v19.4S, v21.4S // .........................................................................................................................................................*.............................................................................................................................. + ldr q27, [x1, #320] // ..............................................................................................................................................................*......................................................................................................................... + sub v28.4S, v23.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... + ldr q23, [x1, #256] // ...........................................................................................................................................................*............................................................................................................................ 
+ ldr q11, [x1, #128] // ..........................................................................................................................................................................*............................................................................................................. + sub v22.4S, v19.4S, v21.4S // ........................................................................................................................................................*............................................................................................................................... + mls v9.4S, v12.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + ldr q19, [x1, #384] // .................................................................................................................................................*...................................................................................................................................... + str q20, [x1, #816] // ...........................................................................*............................................................................................................................................................................................................ + sub v12.4S, v17.4S, v10.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ add v16.4S, v17.4S, v10.4S // ................................................................................................................................................................................*....................................................................................................... + sqrdmulh v8.4S, v28.4S, v6.S[1] // ......................................................................................................................................................*................................................................................................................................. + str q15, [x1, #432] // ...........................................................................................................................................*............................................................................................................................................ + add v15.4S, v11.4S, v18.4S // .....................................................................................................................................................................................*.................................................................................................. + sub v17.4S, v11.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v21.4S, v12.4S, v2.S[3] // ..............................................................................................................................................................................................................................*......................................................... 
+ str q9, [x1, #48] // .........................................................................................................................................*.............................................................................................................................................. + add v20.4S, v19.4S, v14.4S // ........................................................................................................................................................................*............................................................................................................... + sub v9.4S, v13.4S, v15.4S // .........................................................................................................................................................................................*.............................................................................................. + sqrdmulh v11.4S, v24.4S, v3.S[3] // ..............................................................................................................................................................................*......................................................................................................... + add v15.4S, v13.4S, v15.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v13.4S, v24.4S, v3.S[2] // ...............................................................................................................................................................*........................................................................................................................ 
+ sub v24.4S, v19.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + ldr q19, [x1, #832] // ...................................................................................................................................................................................................................*.................................................................... + add v14.4S, v23.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v13.4S, v11.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v11.4S, v23.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... + ldr q23, [x1, #768] // ..............................................................................................................................................................................................................*......................................................................... 
+ mul v28.4S, v28.4S, v6.S[0] // ..........................................................................................................................................................*............................................................................................................................. + add v18.4S, v14.4S, v20.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v14.4S, v14.4S, v20.4S // .................................................................................................................................................................................*...................................................................................................... + mul v20.4S, v12.4S, v2.S[2] // .................................................................................................................................................................................................................................*...................................................... + sub v12.4S, v15.4S, v18.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v28.4S, v8.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ add v8.4S, v15.4S, v18.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v20.4S, v21.4S, v29.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v15.4S, v9.4S, v1.S[3] // .............................................................................................................................................................................................................*.......................................................................... + mul v9.4S, v9.4S, v1.S[2] // .............................................................................................................................................................................................*.......................................................................................... + sqrdmulh v18.4S, v22.4S, v5.S[3] // .............................................................................................................................................................*.......................................................................................................................... + mul v10.4S, v22.4S, v5.S[2] // ............................................................................................................................................................*........................................................................................................................... 
+ sqrdmulh v21.4S, v17.4S, v4.S[1] // ........................................................................................................................................................................................................*............................................................................... + mul v17.4S, v17.4S, v4.S[0] // .........................................................................................................................................................................................................*.............................................................................. + mul v27.4S, v24.4S, v5.S[0] // ....................................................................................................................................................................................*................................................................................................... + sqrdmulh v24.4S, v24.4S, v5.S[1] // ..................................................................................................................................................................................*..................................................................................................... + mls v17.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................*........................................................................... + sqrdmulh v22.4S, v11.4S, v4.S[3] // .....................................................................................................................................................................*.................................................................................................................. 
+ mul v21.4S, v11.4S, v4.S[2] // .......................................................................................................................................................................*................................................................................................................ + sub v11.4S, v13.4S, v17.4S // .................................................................................................................................................................................................................*...................................................................... + add v17.4S, v13.4S, v17.4S // ................................................................................................................................................................................................................*....................................................................... + mls v10.4S, v18.4S, v29.4S // ....................................................................................................................................................................*................................................................................................................... + mls v21.4S, v22.4S, v29.4S // .........................................................................................................................................................................*.............................................................................................................. + mls v9.4S, v15.4S, v29.4S // ....................................................................................................................................................................................................................*................................................................... 
+ sub v15.4S, v10.4S, v28.4S // ......................................................................................................................................................................................................*................................................................................. + add v18.4S, v10.4S, v28.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v27.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + sqrdmulh v22.4S, v15.4S, v2.S[3] // ..................................................................................................................................................................................................................................................*..................................... + mul v15.4S, v15.4S, v2.S[2] // ...............................................................................................................................................................................................................................................*........................................ + sub v24.4S, v21.4S, v27.4S // .................................................................................................................................................................................................*...................................................................................... 
+ add v13.4S, v21.4S, v27.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v27.4S, v14.4S, v2.S[0] // ...................................................................................................................................................................................................*.................................................................................... + sqrdmulh v21.4S, v14.4S, v2.S[1] // ................................................................................................................................................................................................*....................................................................................... + mul v28.4S, v24.4S, v2.S[0] // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v14.4S, v24.4S, v2.S[1] // ...........................................................................................................................................................................................................*............................................................................ + sub v24.4S, v23.4S, v19.4S // .......................................................................................................................................................................................................................*................................................................ 
+ add v19.4S, v23.4S, v19.4S // .....................................................................................................................................................................................................................*.................................................................. + mls v27.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mul v21.4S, v11.4S, v1.S[2] // ........................................................................................................................................................................................................................................*............................................... + mls v28.4S, v14.4S, v29.4S // ..................................................................................................................................................................................................................*..................................................................... + sub v23.4S, v9.4S, v27.4S // .........................................................................................................................................................................................................................*.............................................................. + add v10.4S, v9.4S, v27.4S // ............................................................................................................................................................................................................................*........................................................... 
+ sqrdmulh v9.4S, v11.4S, v1.S[3] // .......................................................................................................................................................................................................................................*................................................ + mul v14.4S, v23.4S, v0.S[2] // ...........................................................................................................................................................................................................................................*............................................ + sqrdmulh v23.4S, v23.4S, v0.S[3] // .....................................................................................................................................................................................................................................*.................................................. + mls v21.4S, v9.4S, v29.4S // ..........................................................................................................................................................................................................................................*............................................. + add v9.4S, v17.4S, v13.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v15.4S, v22.4S, v29.4S // ......................................................................................................................................................................................................................................................*................................. 
+ sub v22.4S, v17.4S, v13.4S // ...............................................................................................................................................................................................................................*........................................................ + sqrdmulh v11.4S, v12.4S, v0.S[3] // .......................................................................................................................................................................................................*................................................................................ + sub v13.4S, v21.4S, v28.4S // .......................................................................................................................................................................................................................................................................*................ + mul v12.4S, v12.4S, v0.S[2] // ....................................................................................................................................................................................................*................................................................................... + mul v27.4S, v24.4S, v6.S[2] // ......................................................................................................................................................................................................................................*................................................. + sqrdmulh v24.4S, v24.4S, v6.S[3] // ...................................................................................................................................................................................................................................*.................................................... 
+ mls v12.4S, v11.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + add v11.4S, v21.4S, v28.4S // ........................................................................................................................................................................................................................................................................*............... + ldr q28, [x1, #896] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v17.4S, v22.4S, v0.S[3] // ..........................................................................................................................................................................................................................................................................*............. + mls v27.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + ldr q24, [x1, #960] // ............................................................................................................................................................................*........................................................................................................... 
+ mls v14.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v21.4S, v22.4S, v0.S[2] // ...............................................................................................................................................................................................................................................................................*........ + add v23.4S, v28.4S, v24.4S // ........................................................................................................................................................................................*............................................................................................... + sub v22.4S, v28.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + mls v21.4S, v17.4S, v29.4S // .................................................................................................................................................................................................................................................................................*...... + sub v24.4S, v19.4S, v23.4S // ................................................................................................................................................................................................................................*....................................................... 
+ add v19.4S, v19.4S, v23.4S // .............................................................................................................................................................................................................................*.......................................................... + sqrdmulh v28.4S, v22.4S, v7.S[1] // ......................................................................................................................................................................................................................*................................................................. + mul v17.4S, v22.4S, v7.S[0] // ........................................................................................................................................................................................................................*............................................................... + add v23.4S, v16.4S, v19.4S // ........................................................................................................................................................................................................................................................*............................... + sub v22.4S, v16.4S, v19.4S // ..........................................................................................................................................................................................................................................................*............................. + sqrdmulh v19.4S, v24.4S, v3.S[1] // ....................................................................................................................................................................................................................................................*................................... 
+ mls v17.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v28.4S, v8.4S, v23.4S // ...........................................................................................................................................................................................................................................................*............................ + add v8.4S, v8.4S, v23.4S // ............................................................................................................................................................................................................................................................*........................... + sqrdmulh v16.4S, v22.4S, v1.S[1] // ................................................................................................................................................................................................................................................................*....................... + mul v23.4S, v22.4S, v1.S[0] // .................................................................................................................................................................................................................................................................*...................... + sub v22.4S, v27.4S, v17.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ mul v24.4S, v24.4S, v3.S[0] // ...................................................................................................................................................................................................................................................*.................................... + add v17.4S, v27.4S, v17.4S // ............................................................................................................................................................................................................................................*........................................... + mls v23.4S, v16.4S, v29.4S // .........................................................................................................................................................................................................................................................................*.............. + sqrdmulh v16.4S, v22.4S, v3.S[1] // .....................................................................................................................................................................................................................................................*.................................. + mls v24.4S, v19.4S, v29.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v19.4S, v18.4S, v17.4S // ................................................................................................................................................................................................................................................*....................................... 
+ mul v27.4S, v22.4S, v3.S[0] // .......................................................................................................................................................................................................................................................*................................ + sub v22.4S, v12.4S, v23.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v27.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + add v12.4S, v12.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... + add v16.4S, v20.4S, v24.4S // ..................................................................................................................................................................................................................................................................................*..... + sub v23.4S, v20.4S, v24.4S // ...................................................................................................................................................................................................................................................................................*.... 
+ sqrdmulh v20.4S, v19.4S, v1.S[1] // ..............................................................................................................................................................................................................................................................*......................... + add v24.4S, v18.4S, v17.4S // .................................................................................................................................................................................................................................................*...................................... + sqrdmulh v17.4S, v13.4S, v0.S[3] // .......................................................................................................................................................................................................................................................................................* + sub v18.4S, v10.4S, v16.4S // .....................................................................................................................................................................................................................................................................................*.. + add v10.4S, v10.4S, v16.4S // ......................................................................................................................................................................................................................................................................................*. + mul v16.4S, v19.4S, v1.S[0] // .............................................................................................................................................................................................................................................................*.......................... 
+ sub v19.4S, v15.4S, v27.4S // ..................................................................................................................................................................................................................................................................*..................... + add v15.4S, v15.4S, v27.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v16.4S, v20.4S, v29.4S // ....................................................................................................................................................................................................................................................................*................... + sqrdmulh v20.4S, v19.4S, v1.S[1] // .....................................................................................................................................................................................................................................................................*.................. + sub v27.4S, v11.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + add v11.4S, v11.4S, v15.4S // ................................................................................................................................................................................................................................................................................*....... 
+ mul v15.4S, v13.4S, v0.S[2] // ....................................................................................................................................................................................................................................................................................*... + mul v19.4S, v19.4S, v1.S[0] // ......................................................................................................................................................................................................................................................................*................. + mls v19.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + + // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // mls v15.4S, v17.4S, v29.4S // ..*..................................................................................................................................................................................................................................................................................... 
+ // sub v20.4S, v21.4S, v16.4S // ..........................................*............................................................................................................................................................................................................................................. + // add v13.4S, v21.4S, v16.4S // ......................................................*................................................................................................................................................................................................................................. + // mul v17.4S, v22.4S, v0.S[0] // ..............................*......................................................................................................................................................................................................................................................... + // sub v21.4S, v9.4S, v24.4S // ...............................................*........................................................................................................................................................................................................................................ + // add v9.4S, v9.4S, v24.4S // ................................................*....................................................................................................................................................................................................................................... + // sqrdmulh v24.4S, v22.4S, v0.S[1] // .............................*.......................................................................................................................................................................................................................................................... 
+ // mul v22.4S, v28.4S, v0.S[0] // ..............*......................................................................................................................................................................................................................................................................... + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ...........*............................................................................................................................................................................................................................................................................ + // mul v16.4S, v21.4S, v0.S[0] // .........................................................................*.............................................................................................................................................................................................................. + // mls v17.4S, v24.4S, v29.4S // .................................*...................................................................................................................................................................................................................................................... + // sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........................................................................*............................................................................................................................................................................................................. + // mls v22.4S, v28.4S, v29.4S // ..................*..................................................................................................................................................................................................................................................................... 
+ // sqrdmulh v28.4S, v18.4S, v0.S[1] // .....*.................................................................................................................................................................................................................................................................................. + // mls v16.4S, v21.4S, v29.4S // .............................................................................*.......................................................................................................................................................................................................... + // sub v21.4S, v15.4S, v19.4S // ............*........................................................................................................................................................................................................................................................................... + // add v15.4S, v15.4S, v19.4S // .............*.......................................................................................................................................................................................................................................................................... + // sqrdmulh v19.4S, v23.4S, v1.S[1] // ...*.................................................................................................................................................................................................................................................................................... + // mul v18.4S, v18.4S, v0.S[0] // .......*................................................................................................................................................................................................................................................................................ 
+ // cmge v24.4S, v31.4S, v16.4S // ...................................................................................*.................................................................................................................................................................................................... + // mul v23.4S, v23.4S, v1.S[0] // ....*................................................................................................................................................................................................................................................................................... + // mls v23.4S, v19.4S, v29.4S // ......*................................................................................................................................................................................................................................................................................. + // cmge v19.4S, v16.4S, v30.4S // ...............................................................................................*........................................................................................................................................................................................ + // mls v18.4S, v28.4S, v29.4S // ........*............................................................................................................................................................................................................................................................................... + // sub v24.4S, v24.4S, v19.4S // ...................................................................................................*.................................................................................................................................................................................... 
+ // sqrdmulh v28.4S, v20.4S, v0.S[1] // .............................................*.......................................................................................................................................................................................................................................... + // sub v19.4S, v14.4S, v23.4S // .........*.............................................................................................................................................................................................................................................................................. + // add v14.4S, v14.4S, v23.4S // ..........*............................................................................................................................................................................................................................................................................. + // mul v20.4S, v20.4S, v0.S[0] // .................................................*...................................................................................................................................................................................................................................... + // mls v20.4S, v28.4S, v29.4S // .....................................................*.................................................................................................................................................................................................................................. + // cmge v28.4S, v22.4S, v30.4S // ........................*............................................................................................................................................................................................................................................................... 
+ // mls v16.4S, v24.4S, v29.4S // .................................................................................................................*...................................................................................................................................................................... + // cmge v24.4S, v31.4S, v22.4S // ......................*................................................................................................................................................................................................................................................................. + // mul v23.4S, v21.4S, v0.S[0] // .......................*................................................................................................................................................................................................................................................................ + // sub v28.4S, v24.4S, v28.4S // ............................*........................................................................................................................................................................................................................................................... + // sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................*................................................................................................................................................................................................................................................................... + // cmge v24.4S, v31.4S, v17.4S // ....................................................*................................................................................................................................................................................................................................... 
+ // str q16, [x1, #576] // ...........................................................................................................................*............................................................................................................................................................ + // cmge v16.4S, v17.4S, v30.4S // ........................................................*............................................................................................................................................................................................................................... + // mls v22.4S, v28.4S, v29.4S // ...............................*........................................................................................................................................................................................................................................................ + // sub v28.4S, v24.4S, v16.4S // ............................................................*........................................................................................................................................................................................................................... + // sqrdmulh v24.4S, v8.4S, v26.4S // *....................................................................................................................................................................................................................................................................................... + // cmge v16.4S, v18.4S, v30.4S // ...............*........................................................................................................................................................................................................................................................................ 
+ // mul v8.4S, v8.4S, v25.4S // .*...................................................................................................................................................................................................................................................................................... + // str q22, [x1, #512] // ......................................*................................................................................................................................................................................................................................................. + // cmge v22.4S, v31.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + // mls v17.4S, v28.4S, v29.4S // ...............................................................*........................................................................................................................................................................................................................ + // sub v28.4S, v22.4S, v16.4S // ...................*.................................................................................................................................................................................................................................................................... + // sqrdmulh v16.4S, v19.4S, v0.S[1] // .....................................*.................................................................................................................................................................................................................................................. 
+ // mul v22.4S, v19.4S, v0.S[0] // ...................................................*.................................................................................................................................................................................................................................... + // mls v18.4S, v28.4S, v29.4S // .....................*.................................................................................................................................................................................................................................................................. + // sqrdmulh v19.4S, v10.4S, v26.4S // ......................................................................*................................................................................................................................................................................................................. + // mul v10.4S, v10.4S, v25.4S // .....................................................................*.................................................................................................................................................................................................................. + // str q18, [x1, #640] // ...........................*............................................................................................................................................................................................................................................................ + // cmge v18.4S, v31.4S, v20.4S // ........................................................................................*............................................................................................................................................................................................... 
+ // mls v8.4S, v24.4S, v29.4S // ................*....................................................................................................................................................................................................................................................................... + // str q17, [x1, #768] // ....................................................................*................................................................................................................................................................................................................... + // mls v22.4S, v16.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + // cmge v17.4S, v20.4S, v30.4S // .........................................................................................*.............................................................................................................................................................................................. + // cmge v24.4S, v31.4S, v8.4S // ................................*....................................................................................................................................................................................................................................................... + // mls v23.4S, v21.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ // cmge v16.4S, v8.4S, v30.4S // ..........................*............................................................................................................................................................................................................................................................. + // sub v17.4S, v18.4S, v17.4S // .............................................................................................*.......................................................................................................................................................................................... + // sqrdmulh v28.4S, v11.4S, v26.4S // .......................................*................................................................................................................................................................................................................................................ + // sub v24.4S, v24.4S, v16.4S // ...................................*.................................................................................................................................................................................................................................................... + // mul v11.4S, v11.4S, v25.4S // .........................................*.............................................................................................................................................................................................................................................. + // cmge v18.4S, v22.4S, v30.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ // cmge v21.4S, v31.4S, v22.4S // ................................................................*....................................................................................................................................................................................................................... + // mls v20.4S, v17.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + // cmge v17.4S, v23.4S, v30.4S // ..................................*..................................................................................................................................................................................................................................................... + // cmge v16.4S, v31.4S, v23.4S // ....................................*................................................................................................................................................................................................................................................... + // sub v21.4S, v21.4S, v18.4S // ..................................................................*..................................................................................................................................................................................................................... + // sqrdmulh v18.4S, v14.4S, v26.4S // ..............................................................................................*......................................................................................................................................................................................... 
+ // mul v14.4S, v14.4S, v25.4S // ................................................................................................*....................................................................................................................................................................................... + // sub v17.4S, v16.4S, v17.4S // ........................................*............................................................................................................................................................................................................................................... + // str q20, [x1, #832] // ...........................................................................................................................................................*............................................................................................................................ + // sqrdmulh v16.4S, v15.4S, v26.4S // .......................................................................................*................................................................................................................................................................................................ + // mul v15.4S, v15.4S, v25.4S // ...........................................................................................*............................................................................................................................................................................................ + // mls v10.4S, v19.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ 
+ // sqrdmulh v19.4S, v27.4S, v0.S[1] // .................................................................*...................................................................................................................................................................................................................... + // mls v8.4S, v24.4S, v29.4S // ............................................*........................................................................................................................................................................................................................................... + // cmge v20.4S, v31.4S, v10.4S // ..............................................................................*......................................................................................................................................................................................................... + // mul v27.4S, v27.4S, v0.S[0] // ...................................................................*.................................................................................................................................................................................................................... + // mls v27.4S, v19.4S, v29.4S // ...............................................................................*........................................................................................................................................................................................................ + // mls v23.4S, v17.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ 
+ // str q8, [x1], #(16) // ..................................................*..................................................................................................................................................................................................................................... + // cmge v17.4S, v10.4S, v30.4S // ................................................................................*....................................................................................................................................................................................................... + // cmge v19.4S, v31.4S, v27.4S // ....................................................................................*................................................................................................................................................................................................... + // cmge v24.4S, v27.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + // sqrdmulh v8.4S, v9.4S, v26.4S // .......................................................................*................................................................................................................................................................................................................ + // sub v17.4S, v20.4S, v17.4S // ............................................................................................*........................................................................................................................................................................................... 
+ // mul v9.4S, v9.4S, v25.4S // ........................................................................*............................................................................................................................................................................................................... + // str q23, [x1, #944] // ..............................................*......................................................................................................................................................................................................................................... + // sub v19.4S, v19.4S, v24.4S // ..........................................................................................*............................................................................................................................................................................................. + // mls v11.4S, v28.4S, v29.4S // .............................................................*.......................................................................................................................................................................................................................... + // mls v27.4S, v19.4S, v29.4S // ..................................................................................................*..................................................................................................................................................................................... + // mls v10.4S, v17.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... 
+ // cmge v17.4S, v31.4S, v11.4S // ...........................................................................................................*............................................................................................................................................................................ + // mls v22.4S, v21.4S, v29.4S // ............................................................................*........................................................................................................................................................................................................... + // cmge v19.4S, v11.4S, v30.4S // .................................................................................................*...................................................................................................................................................................................... + // str q27, [x1, #688] // ........................................................................................................*............................................................................................................................................................................... + // sqrdmulh v24.4S, v12.4S, v26.4S // .........................................................*.............................................................................................................................................................................................................................. + // str q10, [x1, #112] // .........................................................................................................................................*.............................................................................................................................................. 
+ // sub v17.4S, v17.4S, v19.4S // .............................................................................................................*.......................................................................................................................................................................... + // mul v12.4S, v12.4S, v25.4S // ...........................................................*............................................................................................................................................................................................................................ + // str q22, [x1, #880] // .................................................................................*...................................................................................................................................................................................................... + // mls v9.4S, v8.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + // mls v12.4S, v24.4S, v29.4S // ..................................................................................*..................................................................................................................................................................................................... + // sqrdmulh v19.4S, v13.4S, v26.4S // .......................................................................................................*................................................................................................................................................................................ 
+ // cmge v24.4S, v9.4S, v30.4S // ....................................................................................................................................*................................................................................................................................................... + // mls v14.4S, v18.4S, v29.4S // ..........................................................................................................*............................................................................................................................................................................. + // cmge v20.4S, v31.4S, v9.4S // ......................................................................................................................................*................................................................................................................................................. + // mul v13.4S, v13.4S, v25.4S // ............................................................................................................*........................................................................................................................................................................... + // cmge v28.4S, v31.4S, v12.4S // ......................................................................................................*................................................................................................................................................................................. + // cmge v18.4S, v12.4S, v30.4S // ....................................................................................................*................................................................................................................................................................................... 
+ // mls v13.4S, v19.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + // cmge v19.4S, v31.4S, v14.4S // ...............................................................................................................*........................................................................................................................................................................ + // cmge v10.4S, v14.4S, v30.4S // ........................................................................................................................*............................................................................................................................................................... + // sub v24.4S, v20.4S, v24.4S // ...........................................................................................................................................*............................................................................................................................................ + // mls v15.4S, v16.4S, v29.4S // .....................................................................................................*.................................................................................................................................................................................. + // sub v20.4S, v28.4S, v18.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ // mls v11.4S, v17.4S, v29.4S // .......................................................................................................................*................................................................................................................................................................ + // sub v17.4S, v19.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ + // cmge v19.4S, v31.4S, v13.4S // ...................................................................................................................*.................................................................................................................................................................... + // mls v12.4S, v20.4S, v29.4S // ....................................................................................................................*................................................................................................................................................................... + // cmge v20.4S, v13.4S, v30.4S // .....................................................................................................................*.................................................................................................................................................................. + // cmge v16.4S, v31.4S, v15.4S // ..................................................................................................................*..................................................................................................................................................................... 
+ // mls v14.4S, v17.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + // cmge v17.4S, v15.4S, v30.4S // ......................................................................................................................*................................................................................................................................................................. + // str q11, [x1, #176] // .....................................................................................................................................*.................................................................................................................................................. + // sub v19.4S, v19.4S, v20.4S // .........................................................................................................................*.............................................................................................................................................................. + // mls v9.4S, v24.4S, v29.4S // .........................................................................................................................................................*.............................................................................................................................. + // str q12, [x1, #240] // .................................................................................................................................*...................................................................................................................................................... 
+ // sub v17.4S, v16.4S, v17.4S // ............................................................................................................................*........................................................................................................................................................... + // mls v13.4S, v19.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + // str q14, [x1, #368] // ................................................................................................................................................*....................................................................................................................................... + // mls v15.4S, v17.4S, v29.4S // ..................................................................................................................................................*..................................................................................................................................... + // str q9, [x1, #48] // ...................................................................................................................................................................*.................................................................................................................... + // str q13, [x1, #304] // .............................................................................................................................................*.......................................................................................................................................... 
+ // str q15, [x1, #432] // ...............................................................................................................................................................*........................................................................................................................ + // ldr q17, [x1, #640] // .............................................................................................................................*.......................................................................................................................................................... + // ldr q19, [x1, #704] // ..........................................................................................................................*............................................................................................................................................................. + // ldr q9, [x1] // ................................................................................................................................*....................................................................................................................................................... + // ldr q22, [x1, #512] // ...................................................................................................................................*.................................................................................................................................................... + // ldr q28, [x1, #192] // ..........................................................................................................................................*............................................................................................................................................. 
+ // ldr q8, [x1, #384] // ..........................................................................................................................................................*............................................................................................................................. + // ldr q13, [x1, #64] // ..........................................................*............................................................................................................................................................................................................................. + // sub v11.4S, v17.4S, v19.4S // .....................................................................................................................................................*.................................................................................................................................. + // add v18.4S, v17.4S, v19.4S // ........................................................................................................................................*............................................................................................................................................... + // ldr q24, [x1, #576] // ................................................................................................................*....................................................................................................................................................................... + // sqrdmulh v10.4S, v11.4S, v6.S[1] // ..............................................................................................................................................................*......................................................................................................................... 
+ // sub v15.4S, v9.4S, v13.4S // ...............................................................................................................................................*........................................................................................................................................ + // sub v17.4S, v22.4S, v24.4S // ........................................................................................................................................................*............................................................................................................................... + // add v16.4S, v22.4S, v24.4S // ...................................................................................................................................................*.................................................................................................................................... + // mul v19.4S, v11.4S, v6.S[0] // ...............................................................................................................................................................................*........................................................................................................ + // ldr q24, [x1, #256] // ......................................................................................................................................................*................................................................................................................................. + // mul v23.4S, v17.4S, v5.S[2] // ..........................................................................................................................................................................................*............................................................................................. 
+ // sqrdmulh v12.4S, v17.4S, v5.S[3] // .........................................................................................................................................................................................*.............................................................................................. + // ldr q17, [x1, #320] // ....................................................................................................................................................*................................................................................................................................... + // mul v22.4S, v15.4S, v3.S[2] // ........................................................................................................................................................................*............................................................................................................... + // mls v19.4S, v10.4S, v29.4S // ....................................................................................................................................................................................*................................................................................................... + // sub v27.4S, v24.4S, v17.4S // .............................................................................................................................................................................*.......................................................................................................... + // add v20.4S, v24.4S, v17.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ // ldr q17, [x1, #448] // .................................................................................................................................................*...................................................................................................................................... + // mls v23.4S, v12.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + // sqrdmulh v12.4S, v27.4S, v4.S[3] // ................................................................................................................................................................................................*....................................................................................... + // sub v21.4S, v8.4S, v17.4S // .........................................................................................................................................................................*.............................................................................................................. + // mul v24.4S, v27.4S, v4.S[2] // .................................................................................................................................................................................................*...................................................................................... + // add v17.4S, v8.4S, v17.4S // ....................................................................................................................................................................*................................................................................................................... 
+ // mls v24.4S, v12.4S, v29.4S // .....................................................................................................................................................................................................*.................................................................................. + // ldr q14, [x1, #128] // .......................................................................................................................................................*................................................................................................................................ + // add v13.4S, v9.4S, v13.4S // ..............................................................................................................................................*......................................................................................................................................... + // ldr q12, [x1, #960] // ..........................................................................................................................................................................................................................................*............................................. + // ldr q11, [x1, #896] // .......................................................................................................................................................................................................................................*................................................ + // sqrdmulh v15.4S, v15.4S, v3.S[3] // ......................................................................................................................................................................*................................................................................................................. 
+ // sub v27.4S, v16.4S, v18.4S // ............................................................................................................................................................*........................................................................................................................... + // add v16.4S, v16.4S, v18.4S // .............................................................................................................................................................*.......................................................................................................................... + // sub v18.4S, v20.4S, v17.4S // .................................................................................................................................................................................*...................................................................................................... + // sqrdmulh v9.4S, v21.4S, v5.S[1] // ..............................................................................................................................................................................................*......................................................................................... + // add v17.4S, v20.4S, v17.4S // ................................................................................................................................................................................*....................................................................................................... + // mul v10.4S, v21.4S, v5.S[0] // .............................................................................................................................................................................................*.......................................................................................... 
+ // add v21.4S, v14.4S, v28.4S // ................................................................................................................................................................*....................................................................................................................... + // sub v14.4S, v14.4S, v28.4S // .................................................................................................................................................................*...................................................................................................................... + // mls v22.4S, v15.4S, v29.4S // ............................................................................................................................................................................*........................................................................................................... + // add v20.4S, v11.4S, v12.4S // .............................................................................................................................................................................................................................................*.......................................... + // sub v15.4S, v13.4S, v21.4S // .....................................................................................................................................................................*.................................................................................................................. + // add v13.4S, v13.4S, v21.4S // .......................................................................................................................................................................*................................................................................................................ 
+ // mls v10.4S, v9.4S, v29.4S // .........................................................................................................................................................................................................*.............................................................................. + // sub v11.4S, v11.4S, v12.4S // ..............................................................................................................................................................................................................................................*......................................... + // mul v28.4S, v15.4S, v1.S[2] // ........................................................................................................................................................................................*............................................................................................... + // add v8.4S, v13.4S, v17.4S // .....................................................................................................................................................................................*.................................................................................................. + // sub v17.4S, v13.4S, v17.4S // ...................................................................................................................................................................................*.................................................................................................... + // sqrdmulh v9.4S, v18.4S, v2.S[1] // ...............................................................................................................................................................................................................*........................................................................ 
+ // sub v13.4S, v24.4S, v10.4S // ............................................................................................................................................................................................................*........................................................................... + // add v24.4S, v24.4S, v10.4S // .............................................................................................................................................................................................................*.......................................................................... + // mul v21.4S, v18.4S, v2.S[0] // ..............................................................................................................................................................................................................*......................................................................... + // mul v12.4S, v17.4S, v0.S[2] // ..................................................................................................................................................................................................................................*..................................................... + // add v18.4S, v23.4S, v19.4S // ........................................................................................................................................................................................................*............................................................................... + // sub v23.4S, v23.4S, v19.4S // .......................................................................................................................................................................................................*................................................................................ 
+ // sqrdmulh v19.4S, v17.4S, v0.S[3] // ................................................................................................................................................................................................................................*....................................................... + // sqrdmulh v10.4S, v14.4S, v4.S[1] // ...........................................................................................................................................................................................*............................................................................................ + // mul v14.4S, v14.4S, v4.S[0] // ............................................................................................................................................................................................*........................................................................................... + // mul v17.4S, v13.4S, v2.S[0] // ................................................................................................................................................................................................................*....................................................................... + // sqrdmulh v13.4S, v13.4S, v2.S[1] // .................................................................................................................................................................................................................*...................................................................... + // mls v14.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ // sqrdmulh v10.4S, v15.4S, v1.S[3] // .......................................................................................................................................................................................*................................................................................................ + // ldr q15, [x1, #768] // ..............................................................................................................................................................................*......................................................................................................... + // mls v21.4S, v9.4S, v29.4S // ....................................................................................................................................................................................................................*................................................................... + // add v9.4S, v22.4S, v14.4S // ...................................................................................................................................................................................................*.................................................................................... + // sub v22.4S, v22.4S, v14.4S // ..................................................................................................................................................................................................*..................................................................................... + // mls v17.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................*................................................................. 
+ // ldr q14, [x1, #832] // ..........................................................................................................................................................................*............................................................................................................. + // mls v28.4S, v10.4S, v29.4S // ......................................................................................................................................................................................................*................................................................................. + // add v13.4S, v15.4S, v14.4S // ...................................................................................................................................................................................................................*.................................................................... + // sqrdmulh v10.4S, v11.4S, v7.S[1] // ..................................................................................................................................................................................................................................................*..................................... + // sub v14.4S, v15.4S, v14.4S // ..................................................................................................................................................................................................................*..................................................................... + // mul v15.4S, v11.4S, v7.S[0] // ...................................................................................................................................................................................................................................................*.................................... 
+ // sub v11.4S, v28.4S, v21.4S // .......................................................................................................................................................................................................................*................................................................ + // mls v12.4S, v19.4S, v29.4S // .....................................................................................................................................................................................................................................*.................................................. + // mls v15.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + // add v10.4S, v28.4S, v21.4S // ........................................................................................................................................................................................................................*............................................................... + // add v28.4S, v13.4S, v20.4S // .................................................................................................................................................................................................................................................*...................................... + // sqrdmulh v19.4S, v27.4S, v2.S[3] // ..................................................................................................................................................................*..................................................................................................................... 
+ // sub v21.4S, v9.4S, v24.4S // ...............................................................................................................................................................................................................................*........................................................ + // sub v13.4S, v13.4S, v20.4S // ................................................................................................................................................................................................................................................*....................................... + // mul v20.4S, v27.4S, v2.S[2] // ..................................................................................................................................................................................*..................................................................................................... + // add v9.4S, v9.4S, v24.4S // .............................................................................................................................................................................................................................*.......................................................... + // sqrdmulh v24.4S, v14.4S, v6.S[3] // ....................................................................................................................................................................................................................................*................................................... + // mls v20.4S, v19.4S, v29.4S // ......................................................................................................................................................................................*................................................................................................. 
+ // sqrdmulh v19.4S, v11.4S, v0.S[3] // ...........................................................................................................................................................................................................................*............................................................ + // mul v27.4S, v14.4S, v6.S[2] // ...................................................................................................................................................................................................................................*.................................................... + // sqrdmulh v14.4S, v22.4S, v1.S[3] // .........................................................................................................................................................................................................................*.............................................................. + // mul v22.4S, v22.4S, v1.S[2] // .....................................................................................................................................................................................................................*.................................................................. + // mls v27.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + // mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................*........................................................... 
+ // mul v14.4S, v11.4S, v0.S[2] // ..........................................................................................................................................................................................................................*............................................................. + // add v24.4S, v27.4S, v15.4S // ..............................................................................................................................................................................................................................................................*......................... + // mls v14.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................................................................*............................................ + // sub v11.4S, v27.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... + // mul v27.4S, v23.4S, v2.S[2] // ...........................................................................................................................................................................................................*............................................................................ + // sub v19.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ // add v24.4S, v18.4S, v24.4S // ..........................................................................................................................................................................................................................................................................*............. + // sqrdmulh v15.4S, v23.4S, v2.S[3] // ..........................................................................................................................................................................................................*............................................................................. + // mul v23.4S, v13.4S, v3.S[0] // .............................................................................................................................................................................................................................................................*.......................... + // sqrdmulh v13.4S, v13.4S, v3.S[1] // ......................................................................................................................................................................................................................................................*................................. + // sqrdmulh v18.4S, v11.4S, v3.S[1] // ................................................................................................................................................................................................................................................................*....................... + // mls v27.4S, v15.4S, v29.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ // mul v15.4S, v11.4S, v3.S[0] // ...................................................................................................................................................................................................................................................................*.................... + // add v11.4S, v16.4S, v28.4S // ....................................................................................................................................................................................................................................................*................................... + // mls v23.4S, v13.4S, v29.4S // .................................................................................................................................................................................................................................................................*...................... + // sub v13.4S, v16.4S, v28.4S // .....................................................................................................................................................................................................................................................*.................................. + // sub v28.4S, v8.4S, v11.4S // ........................................................................................................................................................................................................................................................*............................... + // add v8.4S, v8.4S, v11.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ // mul v16.4S, v19.4S, v1.S[0] // ..............................................................................................................................................................................................................................................................................*......... + // sqrdmulh v19.4S, v19.4S, v1.S[1] // .........................................................................................................................................................................................................................................................................*.............. + // mls v15.4S, v18.4S, v29.4S // .....................................................................................................................................................................................................................................................................*.................. + // sqrdmulh v18.4S, v13.4S, v1.S[1] // ..........................................................................................................................................................................................................................................................*............................. + // mul v11.4S, v13.4S, v1.S[0] // ...........................................................................................................................................................................................................................................................*............................ + // sub v13.4S, v27.4S, v15.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ // add v15.4S, v27.4S, v15.4S // ................................................................................................................................................................................................................................................................................*....... + // mls v16.4S, v19.4S, v29.4S // .................................................................................................................................................................................................................................................................................*...... + // sqrdmulh v27.4S, v13.4S, v1.S[1] // ..................................................................................................................................................................................................................................................................................*..... + // mul v19.4S, v13.4S, v1.S[0] // ......................................................................................................................................................................................................................................................................................*. + // sub v13.4S, v22.4S, v17.4S // .................................................................................................................................................................................................................................*...................................................... + // add v17.4S, v22.4S, v17.4S // ......................................................................................................................................................................................................................................*................................................. 
+ // mls v11.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + // sqrdmulh v18.4S, v21.4S, v0.S[3] // ........................................................................................................................................................................................................................................*............................................... + // mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................................* + // sub v27.4S, v17.4S, v15.4S // ...................................................................................................................................................................................................................................................................................*.... + // sub v22.4S, v12.4S, v11.4S // ....................................................................................................................................................................................................................................................................*................... + // add v12.4S, v12.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. 
+ // mul v21.4S, v21.4S, v0.S[2] // ............................................................................................................................................................................................................................................*........................................... + // add v11.4S, v17.4S, v15.4S // ....................................................................................................................................................................................................................................................................................*... + // mls v21.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + // add v17.4S, v20.4S, v23.4S // .......................................................................................................................................................................................................................................................................*................ + // sub v23.4S, v20.4S, v23.4S // ........................................................................................................................................................................................................................................................................*............... + // mul v15.4S, v13.4S, v0.S[2] // .....................................................................................................................................................................................................................................................................................*.. 
+ // sub v18.4S, v10.4S, v17.4S // ............................................................................................................................................................................................................................................................................*........... + // add v10.4S, v10.4S, v17.4S // .............................................................................................................................................................................................................................................................................*.......... + // sqrdmulh v17.4S, v13.4S, v0.S[3] // ...........................................................................................................................................................................................................................................................................*............ sub count, count, #1 cbnz count, layer1234_start - sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... - sqrdmulh v27.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
- mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ - add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... - mul v23.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - cmge v20.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
- cmge v28.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... - mls v23.4S, v27.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - sub v28.4S, v28.4S, v20.4S // ..................................................................................................................................................................................*..................................................................................................... - sqrdmulh v20.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
- mul v28.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - mul v19.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - sqrdmulh v15.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
- cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - mul v18.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - sub v14.4S, v27.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. - mls v18.4S, v15.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - add v15.4S, v16.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... - mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
- sub v16.4S, v16.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - mls v28.4S, v20.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - sub v27.4S, v23.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. - add v14.4S, v23.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ - cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
- sqrdmulh v20.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - cmge v23.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... - mul v19.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - sub v17.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... - sub v22.4S, v23.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
- sqrdmulh v24.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - cmge v21.4S, v28.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - cmge v16.4S, v31.4S, v28.4S // ............................................................................................................................................................................................*........................................................................................... - mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - mul v22.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
- sqrdmulh v27.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v15.4S, v17.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v20.4S, v21.4S, v16.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v21.4S, v16.4S // ..................................................................................................................................................................*..................................................................................................................... + mul v17.4S, v22.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ + sub v21.4S, v9.4S, v24.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ add v9.4S, v9.4S, v24.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v24.4S, v22.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mul v22.4S, v28.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. + mul v16.4S, v21.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... + mls v17.4S, v24.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ + mls v22.4S, v28.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sqrdmulh v28.4S, v18.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... + mls v16.4S, v21.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sub v21.4S, v15.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + add v15.4S, v15.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... 
+ sqrdmulh v19.4S, v23.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... + mul v18.4S, v18.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + cmge v24.4S, v31.4S, v16.4S // ....................................................................................................................................................................................*................................................................................................... + mul v23.4S, v23.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + mls v23.4S, v19.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + cmge v19.4S, v16.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
+ mls v18.4S, v28.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sub v24.4S, v24.4S, v19.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v28.4S, v20.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... + sub v19.4S, v14.4S, v23.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v14.4S, v23.4S // .......................................................................................................................................................................*................................................................................................................ + mul v20.4S, v20.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... 
+ mls v20.4S, v28.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v28.4S, v22.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v16.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + cmge v24.4S, v31.4S, v22.4S // ................................................................................................................................................................................*....................................................................................................... + mul v23.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... + sub v28.4S, v24.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + cmge v24.4S, v31.4S, v17.4S // ................................................................................................................................................................................................*....................................................................................... + str q16, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + cmge v16.4S, v17.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v22.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v28.4S, v24.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ sqrdmulh v24.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v16.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mul v8.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + str q22, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + cmge v22.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + mls v17.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sub v28.4S, v22.4S, v16.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v16.4S, v19.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... + mul v22.4S, v19.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + mls v18.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + sqrdmulh v19.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v10.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ 
str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - sqrdmulh v18.4S, v17.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - mls v19.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - mul v20.4S, v17.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - mls v20.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v18.4S, v31.4S, v19.4S // ................................................................................................................................................................................................*....................................................................................... 
- mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - cmge v24.4S, v19.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v22.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - sub v27.4S, v16.4S, v21.4S // ..............................................................................................................................................................................................*......................................................................................... - sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... - mls v28.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
- cmge v27.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................*................................................................................... - cmge v24.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... 
- str q28, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - sub v27.4S, v27.4S, v24.4S // ......................................................................................................................................................................................................*................................................................................. - mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... - sub v24.4S, v21.4S, v16.4S // ..............................................................................................................................................................................................................*......................................................................... - mls v20.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
- cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - str q19, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - sub v18.4S, v18.4S, v27.4S // ..........................................................................................................................................................................................................*............................................................................. - sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v18.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................*................................................................................... 
+ mls v8.4S, v24.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + str q17, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mls v22.4S, v16.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v17.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v24.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... + mls v23.4S, v21.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ cmge v16.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v17.4S, v18.4S, v17.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v28.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... + sub v24.4S, v24.4S, v16.4S // ..................................................................................................................................................................................................................................................*..................................... + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v18.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ cmge v21.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mls v20.4S, v17.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v17.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v16.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + sub v21.4S, v21.4S, v18.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v18.4S, v14.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. 
+ mul v14.4S, v14.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v17.4S, v16.4S, v17.4S // ..............................................................................................................................................................................................................*......................................................................... str q20, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - mls v22.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
- mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
- mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - sqrdmulh v24.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
- sub v27.4S, v8.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... - mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - cmge v10.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
- sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - sub v10.4S, v10.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. - cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - sub v11.4S, v11.4S, v23.4S // ..................................................................................................................................................................................................................................................*..................................... 
- sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - cmge v23.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - sub v8.4S, v8.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. - mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - mls v22.4S, v24.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
- mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - cmge v10.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - cmge v15.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... 
- mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - sub v10.4S, v24.4S, v10.4S // ..........................................................................................................................................................................................................................................................................*............. - mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
- sub v13.4S, v15.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... - mls v18.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - cmge v15.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v8.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - mls v20.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
- str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - sub v15.4S, v24.4S, v15.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - sub v18.4S, v8.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. - mls v23.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
- str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - mls v21.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
- str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + sqrdmulh v16.4S, v15.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v15.4S, v15.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v10.4S, v19.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v19.4S, v27.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. + mls v8.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ cmge v20.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + mul v27.4S, v27.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + mls v27.4S, v19.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v17.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ cmge v19.4S, v31.4S, v27.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v24.4S, v27.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + sqrdmulh v8.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v17.4S, v20.4S, v17.4S // ..........................................................................................................................................................................................................................................................*............................. + mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... + str q23, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ 
+ sub v19.4S, v19.4S, v24.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v11.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v27.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v10.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v17.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + mls v22.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
+ cmge v19.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q27, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + sqrdmulh v24.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v17.4S, v17.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ str q22, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. + mls v9.4S, v8.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mls v12.4S, v24.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sqrdmulh v19.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v24.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v14.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ cmge v20.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... + cmge v28.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v18.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + mls v13.4S, v19.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v19.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... 
+ cmge v10.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + sub v24.4S, v20.4S, v24.4S // ......................................................................................................................................................................................................................................................*................................. + mls v15.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v20.4S, v28.4S, v18.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v11.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sub v17.4S, v19.4S, v10.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v19.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... + mls v12.4S, v20.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v20.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v16.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + mls v14.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... 
+ str q11, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v19.4S, v19.4S, v20.4S // ......................................................................................................................................................................................................................................................................*................. + mls v9.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v17.4S, v16.4S, v17.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v13.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ str q14, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q13, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s index ff52d0f4..0379aa05 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro 
load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm 
-.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,359 +321,379 @@ _intt_dilithium_1234_5678_opt_m1_firestorm: mov count, #16 .p2align 2 - ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *.......................................... - ldr q18, [x3, #32] // .*......................................... - ldr q22, [x3, #48] // ..*........................................ - ldr q26, [x3, #64] // ...*....................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - ldr q7, [x3, #80] // ....*...................................... 
- ldr q27, [x3, #16] // .........*................................. - ldr q15, [x3], #(6*16) // .................*......................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - ldr q19, [x4], #8 // .......................................*... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - sub v30.4S, v10.4S, v11.4S // .....*..................................... - add v10.4S, v10.4S, v11.4S // .......*................................... - sub v24.4S, v12.4S, v13.4S // ......*.................................... - add v13.4S, v12.4S, v13.4S // ........*.................................. - ldr q11, [x4], #16 // .........................................*. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mul v18.4S, v30.4S, v18.4S // ..........*................................ - sqrdmulh v22.4S, v30.4S, v22.4S // ...........*............................... - mul v26.4S, v24.4S, v26.4S // ............*.............................. - sqrdmulh v7.4S, v24.4S, v7.4S // .............*............................. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - sub v30.4S, v10.4S, v13.4S // ..............*............................ - add v10.4S, v10.4S, v13.4S // ...................*....................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v18.4S, v22.4S, v29.4S // ...............*........................... - mls v26.4S, v7.4S, v29.4S // ................*.......................... - sqrdmulh v22.4S, v30.4S, v27.4S // ..................*........................ - mul v7.4S, v30.4S, v15.4S // ......................*.................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - sub v13.4S, v18.4S, v26.4S // ....................*...................... - add v18.4S, v18.4S, v26.4S // ........................*.................. - mls v7.4S, v22.4S, v29.4S // ..........................*................ - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - trn2 v30.4S, v10.4S, v18.4S // ...........................*............... - sqrdmulh v22.4S, v13.4S, v27.4S // .....................*..................... - mul v26.4S, v13.4S, v15.4S // .......................*................... - trn1 v10.4S, v10.4S, v18.4S // ............................*.............. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v26.4S, v22.4S, v29.4S // .........................*................. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - trn1 v22.4S, v7.4S, v26.4S // .............................*............. 
- trn2 v18.4S, v7.4S, v26.4S // ..............................*............ - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - trn2 v28.2D, v30.2D, v18.2D // .................................*......... - trn1 v9.2D, v30.2D, v18.2D // ..................................*........ - trn1 v17.2D, v10.2D, v22.2D // ...............................*........... - trn2 v13.2D, v10.2D, v22.2D // ................................*.......... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - add v3.4S, v17.4S, v9.4S // ...................................*....... - add v2.4S, v13.4S, v28.4S // ....................................*...... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - add v10.4S, v3.4S, v2.4S // .....................................*..... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - srshr v18.4S, v10.4S, #23 // ......................................*.... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v10.4S, v18.4S, v29.4S // ........................................*.. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - str q10, [x0], #(16*4) // ..........................................* - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... + // Instructions: 41 + // Expected cycles: 37 + // Expected IPC: 1.11 + // + // Wall time: 0.95s + // User time: 0.95s + // + // ---------- original position -----------> + // 0 25 + // |------------------------|--------------- + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x0] // *........................................ + ldr q1, [x3, #32] // .*....................................... + ldr q18, [x3, #64] // ..*...................................... + ldr q8, [x3, #48] // ...*..................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q9, [x3, #80] // ....*.................................... + ldr q2, [x3, #16] // ...........*............................. + ldr q3, [x3], #(6*16) // .............*........................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v26.4S, v21.4S, v22.4S // .....*................................... 
+ add v21.4S, v21.4S, v22.4S // ............*............................ + sub v22.4S, v23.4S, v24.4S // ......*.................................. + add v14.4S, v23.4S, v24.4S // ...............*......................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v1.4S, v26.4S, v1.4S // ........*................................ + mul v18.4S, v22.4S, v18.4S // .........*............................... + sqrdmulh v8.4S, v26.4S, v8.4S // .......*................................. + sqrdmulh v22.4S, v22.4S, v9.4S // ..........*.............................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v9.4S, v21.4S, v14.4S // .................*....................... + add v21.4S, v21.4S, v14.4S // ..................*...................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v1.4S, v8.4S, v29.4S // ..............*.......................... + mls v18.4S, v22.4S, v29.4S // ................*........................ + sqrdmulh v22.4S, v9.4S, v2.4S // ....................*.................... + mul v8.4S, v9.4S, v3.4S // .....................*................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v9.4S, v1.4S, v18.4S // ...................*..................... + add v1.4S, v1.4S, v18.4S // ........................*................ + mls v8.4S, v22.4S, v29.4S // ..........................*.............. + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sqrdmulh v22.4S, v9.4S, v2.4S // .......................*................. + mul v18.4S, v9.4S, v3.4S // ......................*.................. + trn1 v9.4S, v21.4S, v1.4S // ...........................*............. + trn2 v21.4S, v21.4S, v1.4S // ............................*............ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + mls v18.4S, v22.4S, v29.4S // .........................*............... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v22.4S, v8.4S, v18.4S // ..............................*.......... + trn1 v1.4S, v8.4S, v18.4S // .............................*........... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v5.2D, v9.2D, v1.2D // ...............................*......... + trn1 v4.2D, v9.2D, v1.2D // ................................*........ + trn1 v19.2D, v21.2D, v22.2D // .................................*....... + trn2 v6.2D, v21.2D, v22.2D // ..................................*...... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v0.4S, v4.4S, v19.4S // ...................................*..... + add v23.4S, v5.4S, v6.4S // ....................................*.... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v21.4S, v0.4S, v23.4S // .....................................*... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + srshr v1.4S, v21.4S, #23 // ......................................*.. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v21.4S, v1.4S, v29.4S // .......................................*. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + str q21, [x0], #(16*4) // ........................................* + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... - // original source code - // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // *.......................................... - // ldr q18, [x3, #32] // .*......................................... - // ldr q23, [x3, #48] // ..*........................................ - // ldr q12, [x3, #64] // ...*....................................... - // ldr q24, [x3, #80] // ....*...................................... - // sub v5.4S, v14.4S, v15.4S // ........*.................................. - // sub v6.4S, v16.4S, v17.4S // ..........*................................ - // add v21.4S, v14.4S, v15.4S // .........*................................. - // add v30.4S, v16.4S, v17.4S // ...........*............................... - // ldr q14, [x3, #16] // .....*..................................... - // mul v1.4S, v5.4S, v18.4S // .............*............................. - // sqrdmulh v0.4S, v5.4S, v23.4S // ..............*............................ - // mul v16.4S, v6.4S, v12.4S // ...............*........................... - // sqrdmulh v9.4S, v6.4S, v24.4S // ................*.......................... - // sub v6.4S, v21.4S, v30.4S // .................*......................... - // mls v1.4S, v0.4S, v29.4S // ...................*....................... - // mls v16.4S, v9.4S, v29.4S // ....................*...................... - // ldr q12, [x3], #(6*16) // ......*.................................... - // sqrdmulh v18.4S, v6.4S, v14.4S // .....................*..................... - // add v20.4S, v21.4S, v30.4S // ..................*........................ - // sub v31.4S, v1.4S, v16.4S // .......................*................... - // sqrdmulh v22.4S, v31.4S, v14.4S // ...........................*............... 
- // mul v15.4S, v6.4S, v12.4S // ......................*.................... - // mul v14.4S, v31.4S, v12.4S // ............................*.............. - // add v2.4S, v1.4S, v16.4S // ........................*.................. - // mls v14.4S, v22.4S, v29.4S // ..............................*............ - // mls v15.4S, v18.4S, v29.4S // .........................*................. - // trn2 v9.4S, v20.4S, v2.4S // ..........................*................ - // trn1 v2.4S, v20.4S, v2.4S // .............................*............. - // trn1 v18.4S, v15.4S, v14.4S // ...............................*........... - // trn2 v10.4S, v15.4S, v14.4S // ................................*.......... - // trn1 v17.2D, v2.2D, v18.2D // ...................................*....... - // trn2 v13.2D, v2.2D, v18.2D // ....................................*...... - // trn2 v28.2D, v9.2D, v10.2D // .................................*......... - // trn1 v9.2D, v9.2D, v10.2D // ..................................*........ - // add v3.4S, v17.4S, v9.4S // .....................................*..... - // add v2.4S, v13.4S, v28.4S // ......................................*.... - // add v21.4S, v3.4S, v2.4S // .......................................*... - // srshr v12.4S, v21.4S, #23 // ........................................*.. - // ldr q19, [x4], #8 // .......*................................... - // mls v21.4S, v12.4S, v29.4S // .........................................*. - // ldr q11, [x4], #16 // ............*.............................. - // str q21, [x0], #(16*4) // ..........................................* + // ------------- new position -------------> + // 0 25 + // |------------------------|--------------- + // ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x0] // *........................................ + // ldr q6, [x3, #32] // .*....................................... + // ldr q12, [x3, #64] // ..*...................................... 
+ // ldr q20, [x3, #48] // ...*..................................... + // ldr q3, [x3, #80] // ....*.................................... + // sub v9.4S, v24.4S, v25.4S // .......*................................. + // sub v11.4S, v26.4S, v27.4S // .........*............................... + // sqrdmulh v16.4S, v9.4S, v20.4S // .............*........................... + // mul v28.4S, v9.4S, v6.4S // ...........*............................. + // mul v7.4S, v11.4S, v12.4S // ............*............................ + // sqrdmulh v17.4S, v11.4S, v3.4S // ..............*.......................... + // ldr q20, [x3, #16] // .....*................................... + // add v0.4S, v24.4S, v25.4S // ........*................................ + // ldr q4, [x3], #(6*16) // ......*.................................. + // mls v28.4S, v16.4S, v29.4S // .................*....................... + // add v16.4S, v26.4S, v27.4S // ..........*.............................. + // mls v7.4S, v17.4S, v29.4S // ..................*...................... + // sub v25.4S, v0.4S, v16.4S // ...............*......................... + // add v21.4S, v0.4S, v16.4S // ................*........................ + // sub v31.4S, v28.4S, v7.4S // .....................*................... + // sqrdmulh v6.4S, v25.4S, v20.4S // ...................*..................... + // mul v18.4S, v25.4S, v4.4S // ....................*.................... + // mul v30.4S, v31.4S, v4.4S // .........................*............... + // sqrdmulh v26.4S, v31.4S, v20.4S // ........................*................ + // add v1.4S, v28.4S, v7.4S // ......................*.................. + // mls v30.4S, v26.4S, v29.4S // ............................*............ + // mls v18.4S, v6.4S, v29.4S // .......................*................. + // trn1 v7.4S, v21.4S, v1.4S // ..........................*.............. + // trn2 v6.4S, v21.4S, v1.4S // ...........................*............. 
+ // trn1 v10.4S, v18.4S, v30.4S // ..............................*.......... + // trn2 v1.4S, v18.4S, v30.4S // .............................*........... + // trn2 v5.2D, v7.2D, v10.2D // ...............................*......... + // trn1 v4.2D, v7.2D, v10.2D // ................................*........ + // trn1 v19.2D, v6.2D, v1.2D // .................................*....... + // trn2 v6.2D, v6.2D, v1.2D // ..................................*...... + // add v0.4S, v4.4S, v19.4S // ...................................*..... + // add v23.4S, v5.4S, v6.4S // ....................................*.... + // add v22.4S, v0.4S, v23.4S // .....................................*... + // srshr v9.4S, v22.4S, #23 // ......................................*.. + // mls v22.4S, v9.4S, v29.4S // .......................................*. + // str q22, [x0], #(16*4) // ........................................* sub count, count, #1 layer5678_start: - sub v26.4S, v17.4S, v9.4S // .....................................*........................... + // Instructions: 65 + // Expected cycles: 37 + // Expected IPC: 1.76 + // + // Wall time: 8.89s + // User time: 8.89s + // + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- // gap // ................................................................. + ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x0] // e................................................................ // gap // ................................................................. - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // e................................................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. + sub v15.4S, v5.4S, v6.4S // ..........................................*...................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. @@ -693,241 +701,239 @@ layer5678_start: // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - mul v4.4S, v26.4S, v11.S[0] // .......................................*......................... - sqrdmulh v8.4S, v26.4S, v11.S[1] // ........................................*........................ - ldr q18, [x3, #32] // ...e............................................................. + ldr q6, [x3, #32] // ...e............................................................. + ldr q12, [x3, #64] // .....e........................................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - ldr q23, [x3, #48] // ....e............................................................ - ldr q12, [x3, #64] // .....e........................................................... // gap // ................................................................. + ldr q20, [x3, #48] // ....e............................................................ 
// gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - ldr q24, [x3, #80] // ......e.......................................................... // gap // ................................................................. + ldr q3, [x3, #80] // ......e.......................................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - sub v2.4S, v3.4S, v2.4S // ...............................................*................. - mls v4.4S, v8.4S, v29.4S // .........................................*....................... - sub v8.4S, v13.4S, v28.4S // ..........................................*...................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - sub v5.4S, v14.4S, v15.4S // .......e......................................................... 
- sub v6.4S, v16.4S, v17.4S // ............e.................................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - sqrdmulh v28.4S, v8.4S, v11.S[3] // .............................................*................... - add v21.4S, v14.4S, v15.4S // ........e........................................................ - add v30.4S, v16.4S, v17.4S // .............e................................................... - ldr q14, [x3, #16] // ..e.............................................................. + sub v9.4S, v24.4S, v25.4S // .......e......................................................... // gap // ................................................................. // gap // ................................................................. + sub v11.4S, v26.4S, v27.4S // ............e.................................................... // gap // ................................................................. // gap // ................................................................. - mul v1.4S, v5.4S, v18.4S // .........e....................................................... - sqrdmulh v0.4S, v5.4S, v23.4S // ..........e...................................................... - mul v16.4S, v6.4S, v12.4S // ..............e.................................................. - sqrdmulh v9.4S, v6.4S, v24.4S // ...............e................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. - mul v23.4S, v8.4S, v11.S[2] // ............................................*.................... - sqrdmulh v17.4S, v2.4S, v19.S[1] // ..................................................*.............. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + sqrdmulh v16.4S, v9.4S, v20.4S // .........e....................................................... + mul v28.4S, v9.4S, v6.4S // ..........e...................................................... + mul v7.4S, v11.4S, v12.4S // ...............e................................................. + sqrdmulh v17.4S, v11.4S, v3.4S // ..............e.................................................. + ldr q20, [x3, #16] // ..e.............................................................. + ldr q12, [x4], #8 // ...................................*............................. // gap // ................................................................. - sub v6.4S, v21.4S, v30.4S // .................e............................................... // gap // ................................................................. + sub v9.4S, v0.4S, v23.4S // ...............................................*................. + add v0.4S, v24.4S, v25.4S // ........e........................................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. + sub v24.4S, v4.4S, v19.4S // .....................................*........................... + ldr q4, [x3], #(6*16) // .e............................................................... // gap // ................................................................. // gap // ................................................................. - mls v1.4S, v0.4S, v29.4S // ...........e..................................................... - mls v16.4S, v9.4S, v29.4S // ................e................................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - mls v23.4S, v28.4S, v29.4S // ..............................................*.................. // gap // ................................................................. + mls v28.4S, v16.4S, v29.4S // ...........e..................................................... + add v16.4S, v26.4S, v27.4S // .............e................................................... + mls v7.4S, v17.4S, v29.4S // ................e................................................ + ldr q27, [x4], #16 // ....................................*............................ // gap // ................................................................. - ldr q12, [x3], #(6*16) // .e............................................................... // gap // ................................................................. // gap // ................................................................. + sqrdmulh v19.4S, v9.4S, v12.S[1] // .................................................*............... 
// gap // ................................................................. // gap // ................................................................. - sqrdmulh v18.4S, v6.4S, v14.4S // ....................e............................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + sub v25.4S, v0.4S, v16.4S // .................e............................................... + add v21.4S, v0.4S, v16.4S // ..................e.............................................. // gap // ................................................................. // gap // ................................................................. - add v20.4S, v21.4S, v30.4S // ..................e.............................................. - sub v31.4S, v1.4S, v16.4S // ......................e.......................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - add v28.4S, v4.4S, v23.4S // .....................................................*........... + sub v31.4S, v28.4S, v7.4S // ......................e.......................................... // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + sqrdmulh v6.4S, v25.4S, v20.4S // ...................e............................................. + mul v18.4S, v25.4S, v4.4S // ....................e............................................ + sqrdmulh v10.4S, v24.4S, v27.S[1] // .......................................*......................... // gap // ................................................................. // gap // ................................................................. - sqrdmulh v22.4S, v31.4S, v14.4S // .........................e....................................... - mul v15.4S, v6.4S, v12.4S // ...................e............................................. - mul v14.4S, v31.4S, v12.4S // ........................e........................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mul v30.4S, v31.4S, v4.4S // .........................e....................................... + sqrdmulh v26.4S, v31.4S, v20.4S // ........................e........................................ + sqrdmulh v14.4S, v15.4S, v27.S[3] // ............................................*.................... // gap // ................................................................. // gap // ................................................................. - sub v25.4S, v4.4S, v23.4S // ....................................................*............ - mul v12.4S, v2.4S, v19.S[0] // .................................................*............... - srshr v6.4S, v28.4S, #23 // ...........................................................*..... 
// gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mul v13.4S, v15.4S, v27.S[2] // .............................................*................... + mul v4.4S, v9.4S, v12.S[0] // ..................................................*.............. + mul v20.4S, v24.4S, v27.S[0] // ........................................*........................ // gap // ................................................................. // gap // ................................................................. - add v2.4S, v1.4S, v16.4S // .......................e......................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + add v1.4S, v28.4S, v7.4S // .......................e......................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - mls v14.4S, v22.4S, v29.4S // ..........................e...................................... - mls v15.4S, v18.4S, v29.4S // .....................e........................................... - sqrdmulh v31.4S, v25.4S, v19.S[1] // .......................................................*......... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. 
+ mls v30.4S, v26.4S, v29.4S // ..........................e...................................... + mls v18.4S, v6.4S, v29.4S // .....................e........................................... // gap // ................................................................. // gap // ................................................................. - mls v12.4S, v17.4S, v29.4S // ...................................................*............. - mls v28.4S, v6.4S, v29.4S // ............................................................*.... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mls v4.4S, v19.4S, v29.4S // ...................................................*............. + mls v13.4S, v14.4S, v29.4S // ..............................................*.................. + trn1 v7.4S, v21.4S, v1.4S // ...........................e..................................... // gap // ................................................................. // gap // ................................................................. - trn2 v9.4S, v20.4S, v2.4S // ............................e.................................... - trn1 v2.4S, v20.4S, v2.4S // ...........................e..................................... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mls v20.4S, v10.4S, v29.4S // .........................................*....................... + trn2 v6.4S, v21.4S, v1.4S // ............................e.................................... // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. - trn1 v18.4S, v15.4S, v14.4S // .............................e................................... - trn2 v10.4S, v15.4S, v14.4S // ..............................e.................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + trn1 v10.4S, v18.4S, v30.4S // .............................e................................... + trn2 v1.4S, v18.4S, v30.4S // ..............................e.................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - str q12, [x0, #-32] // ...............................................................*. - mul v12.4S, v25.4S, v19.S[0] // ......................................................*.......... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + str q4, [x0, #-32] // ...............................................................*. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - trn1 v17.2D, v2.2D, v18.2D // .................................e............................... - trn2 v13.2D, v2.2D, v18.2D // ...............................e................................. 
- str q28, [x0, #-48] // ..............................................................*.. - trn2 v28.2D, v9.2D, v10.2D // ................................e................................ - trn1 v9.2D, v9.2D, v10.2D // ..................................e.............................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + trn2 v5.2D, v7.2D, v10.2D // ...............................e................................. + trn1 v4.2D, v7.2D, v10.2D // .................................e............................... + trn1 v19.2D, v6.2D, v1.2D // ..................................e.............................. + trn2 v6.2D, v6.2D, v1.2D // ................................e................................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + add v11.4S, v20.4S, v13.4S // .....................................................*........... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - add v3.4S, v17.4S, v9.4S // ......................................e.......................... - add v2.4S, v13.4S, v28.4S // ...........................................e..................... // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. + sub v2.4S, v20.4S, v13.4S // ....................................................*............ + add v0.4S, v4.4S, v19.4S // ......................................e.......................... + add v23.4S, v5.4S, v6.4S // ...........................................e..................... // gap // ................................................................. // gap // ................................................................. - mls v12.4S, v31.4S, v29.4S // ........................................................*........ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + srshr v24.4S, v11.4S, #23 // ...........................................................*..... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - add v21.4S, v3.4S, v2.4S // ................................................e................ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + sqrdmulh v8.4S, v2.4S, v12.S[1] // ......................................................*.......... + add v22.4S, v0.4S, v23.4S // ................................................e................ // gap // ................................................................. // gap // ................................................................. 
// gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mul v13.4S, v2.4S, v12.S[0] // .......................................................*......... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - str q12, [x0, #-16] // ................................................................* - srshr v12.4S, v21.4S, #23 // .........................................................e....... // gap // ................................................................. + srshr v9.4S, v22.4S, #23 // .........................................................e....... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - ldr q19, [x4], #8 // ...................................e............................. // gap // ................................................................. // gap // ................................................................. + mls v11.4S, v24.4S, v29.4S // ............................................................*.... 
// gap // ................................................................. // gap // ................................................................. // gap // ................................................................. @@ -935,22 +941,23 @@ layer5678_start: // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + mls v13.4S, v8.4S, v29.4S // ........................................................*........ // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - mls v21.4S, v12.4S, v29.4S // ..........................................................e...... // gap // ................................................................. + mls v22.4S, v9.4S, v29.4S // ..........................................................e...... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - ldr q11, [x4], #16 // ....................................e............................ // gap // ................................................................. 
+ str q11, [x0, #-48] // ..............................................................*.. // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. @@ -958,6 +965,7 @@ layer5678_start: // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. + str q13, [x0, #-16] // ................................................................* // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. @@ -965,7 +973,7 @@ layer5678_start: // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. - str q21, [x0], #(16*4) // .............................................................e... + str q22, [x0], #(16*4) // .............................................................e... // gap // ................................................................. // gap // ................................................................. // gap // ................................................................. @@ -974,235 +982,267 @@ layer5678_start: // gap // ................................................................. // gap // ................................................................. - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.......................................................... 
- // ldr q0, [x3], #(6*16) // ..........................e.....................................|..........................e................................ - // ldr q4, [x3, #(-6*16 + 1*16)] // ...............e................................................|...............e........................................... - // ldr q1, [x3, #(-6*16 + 2*16)] // ...e............................................................|...e....................................................... - // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e...................................................... - // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e..................................................... - // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e.................................................... - // sub v24.4s, v8.4s, v9.4s // ..........e.....................................................|..........e................................................ - // add v8.4s, v8.4s, v9.4s // .............e..................................................|.............e............................................. - // mul v9.4s, v24.4s, v1.4s // ................e...............................................|................e.......................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .................e..............................................|.................e......................................... - // mls v9.4s, v24.4s, v29.4s // .......................e........................................|.......................e................................... - // sub v24.4s, v10.4s, v11.4s // ...........e....................................................|...........e............................................... 
- // add v10.4s, v10.4s, v11.4s // ..............e.................................................|..............e............................................ - // mul v11.4s, v24.4s, v2.4s // ..................e.............................................|..................e........................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................e............................................|...................e....................................... - // mls v11.4s, v24.4s, v29.4s // ........................e.......................................|........................e.................................. - // sub v24.4s, v8.4s, v10.4s // ......................e.........................................|......................e.................................... - // add v8.4s, v8.4s, v10.4s // ............................e...................................|............................e.............................. - // mul v10.4s, v24.4s, v0.4s // ................................e...............................|................................e.......................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................e....................................|...........................e............................... - // mls v10.4s, v24.4s, v29.4s // .......................................e........................|.......................................e................... - // sub v24.4s, v9.4s, v11.4s // .............................e..................................|.............................e............................. - // add v9.4s, v9.4s, v11.4s // .....................................e..........................|.....................................e..................... - // mul v11.4s, v24.4s, v0.4s // .................................e..............................|.................................e......................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................e................................|...............................e........................... - // mls v11.4s, v24.4s, v29.4s // ......................................e.........................|......................................e.................... - // trn1 v25.4s, v8.4s, v9.4s // ............................................e...................|............................................e.............. - // trn2 v26.4s, v8.4s, v9.4s // ...........................................e....................|...........................................e............... - // trn1 v27.4s, v10.4s, v11.4s // .............................................e..................|.............................................e............. - // trn2 v28.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e............ - // trn2 v10.2d, v25.2d, v27.2d // ..................................................e.............|..................................................e........ - // trn2 v11.2d, v26.2d, v28.2d // ....................................................e...........|....................................................e...... - // trn1 v8.2d, v25.2d, v27.2d // .................................................e..............|.................................................e......... - // trn1 v9.2d, v26.2d, v28.2d // .....................................................e..........|.....................................................e..... - // ldr q1, [x4], #8 // ............................................................e...|........................................................... - // ldr q0, [x4], #16 // ..............................................................e.|........................................................... 
- // sub v24.4s, v8.4s, v9.4s // ................................................................*........................................................... - // add v8.4s, v8.4s, v9.4s // ......................................................e.........|......................................................e.... - // mul v9.4s, v24.4s, v0.s[0] // .*..............................................................|.*......................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..*.............................................................|..*........................................................ - // mls v9.4s, v24.4s, v29.4s // ........*.......................................................|........*.................................................. - // sub v24.4s, v10.4s, v11.4s // .........*......................................................|.........*................................................. - // add v10.4s, v10.4s, v11.4s // .......................................................e........|.......................................................e... - // mul v11.4s, v24.4s, v0.s[2] // ....................*...........................................|....................*...................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............*...................................................|............*.............................................. - // mls v11.4s, v24.4s, v29.4s // .........................*......................................|.........................*................................. - // sub v24.4s, v8.4s, v10.4s // .......*........................................................|.......*................................................... - // add v8.4s, v8.4s, v10.4s // .........................................................e......|.........................................................e. 
- // mul v10.4s, v24.4s, v1.s[0] // ...................................*............................|...................................*....................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................*..........................................|.....................*..................................... - // mls v10.4s, v24.4s, v29.4s // .........................................*......................|.........................................*................. - // sub v24.4s, v9.4s, v11.4s // ..................................*.............................|..................................*........................ - // add v9.4s, v9.4s, v11.4s // ..............................*.................................|..............................*............................ - // mul v11.4s, v24.4s, v1.s[0] // ................................................*...............|................................................*.......... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................*.......................|........................................*.................. - // mls v11.4s, v24.4s, v29.4s // ........................................................*.......|........................................................*.. - // srshr v24.4S, v8.4S, #23 // ...........................................................e....|........................................................... - // mls v8.4s, v24.4s, v29.4s // .............................................................e..|........................................................... - // srshr v24.4S, v9.4S, #23 // ....................................*...........................|....................................*...................... - // mls v9.4s, v24.4s, v29.4s // ..........................................*.....................|..........................................*................ 
- // str q8, [x0], #(16*4) // ...............................................................e|........................................................... - // str q9, [x0, #(-16*4 + 1*16)] // ...................................................*............|...................................................*....... - // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*................|...............................................*........... - // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................*.....|..........................................................* + // --------------------------------------------------------- new position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................~............................................................... + // ldr q0, [x3], #(6*16) // .................e...............................................'................~.............................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // ............e....................................................'...........~................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ..e..............................................................'.~............................................................. + // ldr q5, [x3, #(-6*16 + 3*16)] // ....e............................................................'...~........................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...e.............................................................'..~............................................................ 
+ // ldr q6, [x3, #(-6*16 + 5*16)] // .....e...........................................................'....~.......................................................... + // sub v24.4s, v8.4s, v9.4s // ......e..........................................................'.....~......................................................... + // add v8.4s, v8.4s, v9.4s // ...............e.................................................'..............~................................................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ........e........................................................'.......~....................................................... + // mul v9.4s, v24.4s, v1.4s // .........e.......................................................'........~...................................................... + // mls v9.4s, v27.4s, v29.4s // ..................e..............................................'.................~............................................. + // sub v24.4s, v10.4s, v11.4s // .......e.........................................................'......~........................................................ + // add v10.4s, v10.4s, v11.4s // ...................e.............................................'..................~............................................ + // sqrdmulh v27.4s, v24.4s, v6.4s // ...........e.....................................................'..........~.................................................... + // mul v11.4s, v24.4s, v2.4s // ..........e......................................................'.........~..................................................... + // mls v11.4s, v27.4s, v29.4s // ....................e............................................'...................~........................................... + // sub v24.4s, v8.4s, v10.4s // .......................e.........................................'......................~........................................ 
+ // add v8.4s, v8.4s, v10.4s // ........................e........................................'.......................~....................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ..........................e......................................'.........................~..................................... + // mul v10.4s, v24.4s, v0.4s // ...........................e.....................................'..........................~.................................... + // mls v10.4s, v27.4s, v29.4s // .....................................e...........................'....................................~.......................... + // sub v24.4s, v9.4s, v11.4s // .........................e.......................................'........................~...................................... + // add v9.4s, v9.4s, v11.4s // ...................................e.............................'..................................~............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ..............................e..................................'.............................~................................. + // mul v11.4s, v24.4s, v0.4s // .............................e...................................'............................~.................................. + // mls v11.4s, v27.4s, v29.4s // ....................................e............................'...................................~........................... + // trn1 v25.4s, v8.4s, v9.4s // ........................................e........................'.......................................~....................... + // trn2 v26.4s, v8.4s, v9.4s // ..........................................e......................'.........................................~..................... + // trn1 v27.4s, v10.4s, v11.4s // ...........................................e.....................'..........................................~.................... 
+ // trn2 v28.4s, v10.4s, v11.4s // ............................................e....................'...........................................~................... + // trn2 v10.2d, v25.2d, v27.2d // ..............................................e..................'.............................................~................. + // trn2 v11.2d, v26.2d, v28.2d // .................................................e...............'................................................~.............. + // trn1 v8.2d, v25.2d, v27.2d // ...............................................e.................'..............................................~................ + // trn1 v9.2d, v26.2d, v28.2d // ................................................e................'...............................................~............... + // ldr q1, [x4], #8 // .............~...................................................'............*.................................................. + // ldr q0, [x4], #16 // .....................~...........................................'....................*.......................................... + // sub v24.4s, v8.4s, v9.4s // ................~................................................'...............*............................................... + // add v8.4s, v8.4s, v9.4s // ....................................................e............'...................................................~........... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............................~....................................'...........................*................................... + // mul v9.4s, v24.4s, v0.s[0] // ..................................~..............................'.................................*............................. + // mls v9.4s, v27.4s, v29.4s // .........................................~.......................'........................................*...................... 
+ // sub v24.4s, v10.4s, v11.4s // .~...............................................................'*.............................................................. + // add v10.4s, v10.4s, v11.4s // .....................................................e...........'....................................................~.......... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...............................~.................................'..............................*................................ + // mul v11.4s, v24.4s, v0.s[2] // ................................~................................'...............................*............................... + // mls v11.4s, v27.4s, v29.4s // .......................................~.........................'......................................*........................ + // sub v24.4s, v8.4s, v10.4s // ..............~..................................................'.............*................................................. + // add v8.4s, v8.4s, v10.4s // ........................................................e........'.......................................................~....... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ......................~..........................................'.....................*......................................... + // mul v10.4s, v24.4s, v1.s[0] // .................................~...............................'................................*.............................. + // mls v10.4s, v27.4s, v29.4s // ......................................~..........................'.....................................*......................... + // sub v24.4s, v9.4s, v11.4s // ...................................................~.............'..................................................*............ + // add v9.4s, v9.4s, v11.4s // ..................................................~..............'.................................................*............. 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .......................................................~.........'......................................................*........ + // mul v11.4s, v24.4s, v1.s[0] // .........................................................~.......'........................................................*...... + // mls v11.4s, v27.4s, v29.4s // ............................................................~....'...........................................................*... + // srshr v24.4S, v8.4S, #23 // ..........................................................e......'.........................................................~..... + // mls v8.4s, v24.4s, v29.4s // .............................................................e...'............................................................~.. + // srshr v24.4S, v9.4S, #23 // ......................................................~..........'.....................................................*......... + // mls v9.4s, v24.4s, v29.4s // ...........................................................~.....'..........................................................*.... + // str q8, [x0], #(16*4) // ................................................................e'............................................................... + // str q9, [x0, #(-16*4 + 1*16)] // ..............................................................~..'.............................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // .............................................~...................'............................................*.................. + // str q11, [x0, #(-16*4 + 3*16)] // ...............................................................~.'..............................................................* sub count, count, #1 cbnz count, layer5678_start - sub v10.4S, v17.4S, v9.4S // *..................... - sub v18.4S, v3.4S, v2.4S // ...*.................. 
- sub v22.4S, v13.4S, v28.4S // .....*................ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mul v26.4S, v10.4S, v11.S[0] // .*.................... - sqrdmulh v10.4S, v10.4S, v11.S[1] // ..*................... - sqrdmulh v7.4S, v22.4S, v11.S[3] // ......*............... - mul v22.4S, v22.4S, v11.S[2] // .......*.............. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v30.4S, v18.4S, v19.S[1] // ........*............. - mul v18.4S, v18.4S, v19.S[0] // ............*......... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v26.4S, v10.4S, v29.4S // ....*................. - mls v22.4S, v7.4S, v29.4S // .........*............ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v18.4S, v30.4S, v29.4S // ...............*...... - // gap // ...................... - // gap // ...................... - // gap // ...................... 
- // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sub v10.4S, v26.4S, v22.4S // ...........*.......... - add v22.4S, v26.4S, v22.4S // ..........*........... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - str q18, [x0, #-32] // .................*.... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v18.4S, v10.4S, v19.S[1] // ..............*....... - mul v10.4S, v10.4S, v19.S[0] // ..................*... - srshr v26.4S, v22.4S, #23 // .............*........ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v22.4S, v26.4S, v29.4S // ................*..... 
- mls v10.4S, v18.4S, v29.4S // ....................*. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - str q22, [x0, #-48] // ...................*.. - str q10, [x0, #-16] // .....................* - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... + // Instructions: 24 + // Expected cycles: 19 + // Expected IPC: 1.26 + // + // Wall time: 0.28s + // User time: 0.28s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v21.4S, v5.4S, v6.4S // *............................. + sub v1.4S, v0.4S, v23.4S // ..*........................... + sub v22.4S, v4.4S, v19.4S // ...*.......................... + ldr q18, [x4], #8 // .*............................ + ldr q8, [x4], #16 // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v9.4S, v21.4S, v8.S[3] // .......*...................... + mul v21.4S, v21.4S, v8.S[2] // ........*..................... + sqrdmulh v2.4S, v22.4S, v8.S[1] // ......*....................... + mul v22.4S, v22.4S, v8.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v8.4S, v1.4S, v18.S[1] // .....*........................ + mul v1.4S, v1.4S, v18.S[0] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mls v21.4S, v9.4S, v29.4S // ............*................. + mls v22.4S, v2.4S, v29.4S // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v1.4S, v8.4S, v29.4S // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v8.4S, v22.4S, v21.4S // ................*............. + add v21.4S, v22.4S, v21.4S // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q1, [x0, #-32] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v1.4S, v8.4S, v18.S[1] // ..................*........... + mul v22.4S, v8.4S, v18.S[0] // ...................*.......... 
+ srshr v18.4S, v21.4S, #23 // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v21.4S, v18.4S, v29.4S // ....................*......... + mls v22.4S, v1.4S, v29.4S // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + str q21, [x0, #-48] // ......................*....... + str q22, [x0, #-16] // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. - // original source code - // sub v26.4S, v17.4S, v9.4S // *..................... - // mul v4.4S, v26.4S, v11.S[0] // ...*.................. - // sqrdmulh v8.4S, v26.4S, v11.S[1] // ....*................. - // sub v2.4S, v3.4S, v2.4S // .*.................... - // mls v4.4S, v8.4S, v29.4S // .........*............ - // sub v8.4S, v13.4S, v28.4S // ..*................... - // sqrdmulh v28.4S, v8.4S, v11.S[3] // .....*................ - // mul v23.4S, v8.4S, v11.S[2] // ......*............... - // sqrdmulh v17.4S, v2.4S, v19.S[1] // .......*.............. - // mls v23.4S, v28.4S, v29.4S // ..........*........... - // add v28.4S, v4.4S, v23.4S // .............*........ - // sub v25.4S, v4.4S, v23.4S // ............*......... - // mul v12.4S, v2.4S, v19.S[0] // ........*............. - // srshr v6.4S, v28.4S, #23 // .................*.... - // sqrdmulh v31.4S, v25.4S, v19.S[1] // ...............*...... - // mls v12.4S, v17.4S, v29.4S // ...........*.......... - // mls v28.4S, v6.4S, v29.4S // ..................*... - // str q12, [x0, #-32] // ..............*....... - // mul v12.4S, v25.4S, v19.S[0] // ................*..... - // str q28, [x0, #-48] // ....................*. - // mls v12.4S, v31.4S, v29.4S // ...................*.. - // str q12, [x0, #-16] // .....................* + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v15.4S, v5.4S, v6.4S // *.............................. + // ldr q12, [x4], #8 // ...*........................... + // sub v9.4S, v0.4S, v23.4S // .*............................. 
+ // sub v24.4S, v4.4S, v19.4S // ..*............................ + // ldr q27, [x4], #16 // ....*.......................... + // sqrdmulh v19.4S, v9.4S, v12.S[1] // .........*..................... + // sqrdmulh v10.4S, v24.4S, v27.S[1] // .......*....................... + // sqrdmulh v14.4S, v15.4S, v27.S[3] // .....*......................... + // mul v13.4S, v15.4S, v27.S[2] // ......*........................ + // mul v4.4S, v9.4S, v12.S[0] // ..........*.................... + // mul v20.4S, v24.4S, v27.S[0] // ........*...................... + // mls v4.4S, v19.4S, v29.4S // .............*................. + // mls v13.4S, v14.4S, v29.4S // ...........*................... + // mls v20.4S, v10.4S, v29.4S // ............*.................. + // str q4, [x0, #-32] // ................*.............. + // add v11.4S, v20.4S, v13.4S // ...............*............... + // sub v2.4S, v20.4S, v13.4S // ..............*................ + // srshr v24.4S, v11.4S, #23 // ...................*........... + // sqrdmulh v8.4S, v2.4S, v12.S[1] // .................*............. + // mul v13.4S, v2.4S, v12.S[0] // ..................*............ + // mls v11.4S, v24.4S, v29.4S // ....................*.......... + // mls v13.4S, v8.4S, v29.4S // .....................*......... + // str q11, [x0, #-48] // ......................*........ + // str q13, [x0, #-16] // .......................*....... .unreq root0_tw @@ -1244,853 +1284,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q11, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... 
- ldr q9, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... - ldr q22, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... - ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... - ldr q27, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... 
- ldr q24, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. ldr q21, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - ldr q15, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ - ldr q14, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. - sub v10.4S, v12.4S, v9.4S // ..........................*............................................................................................................................................................................................................................................................. - ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. 
- sub v16.4S, v11.4S, v22.4S // .....................*.................................................................................................................................................................................................................................................................. - add v8.4S, v11.4S, v22.4S // ......................*................................................................................................................................................................................................................................................................. - sub v22.4S, v20.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... - sub v11.4S, v27.4S, v24.4S // ....................................*................................................................................................................................................................................................................................................... - add v17.4S, v27.4S, v24.4S // .....................................*.................................................................................................................................................................................................................................................. - mul v13.4S, v10.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... 
- sqrdmulh v10.4S, v10.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - add v24.4S, v12.4S, v9.4S // ...........................*............................................................................................................................................................................................................................................................ - add v18.4S, v20.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... - sub v28.4S, v14.4S, v21.4S // .........................................*.............................................................................................................................................................................................................................................. - sqrdmulh v23.4S, v16.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... - sqrdmulh v9.4S, v11.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ 
- mul v20.4S, v11.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. - mul v11.4S, v16.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ - sub v12.4S, v19.4S, v15.4S // ...............................*........................................................................................................................................................................................................................................................ - add v14.4S, v14.4S, v21.4S // ..........................................*............................................................................................................................................................................................................................................. - mul v21.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ - sqrdmulh v28.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... 
- mls v13.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... - ldr q10, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... - ldr q16, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... - sqrdmulh v27.4S, v12.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - add v19.4S, v19.4S, v15.4S // ................................*....................................................................................................................................................................................................................................................... - mul v15.4S, v12.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
- ldr q12, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... - mls v11.4S, v23.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - ldr q23, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ - mls v20.4S, v9.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... - sub v9.4S, v18.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... - add v8.4S, v18.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. 
- mul v18.4S, v22.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... - mls v21.4S, v28.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - add v28.4S, v24.4S, v19.4S // ...................................................................*.................................................................................................................................................................................................................... - sub v24.4S, v24.4S, v19.4S // ..................................................................*..................................................................................................................................................................................................................... - sqrdmulh v19.4S, v22.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... - mls v15.4S, v27.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
- sub v27.4S, v16.4S, v10.4S // ..............................................*......................................................................................................................................................................................................................................... - add v22.4S, v16.4S, v10.4S // ...............................................*........................................................................................................................................................................................................................................ - sqrdmulh v16.4S, v9.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - mul v10.4S, v9.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - add v9.4S, v12.4S, v23.4S // ....................................................*................................................................................................................................................................................................................................... - sub v12.4S, v12.4S, v23.4S // ...................................................*.................................................................................................................................................................................................................................... 
- sub v23.4S, v20.4S, v21.4S // .................................................................................*...................................................................................................................................................................................................... - mls v18.4S, v19.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... - add v21.4S, v20.4S, v21.4S // ..................................................................................*..................................................................................................................................................................................................... - mls v10.4S, v16.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - sqrdmulh v19.4S, v24.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - mul v16.4S, v24.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... 
- mul v24.4S, v12.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - sqrdmulh v20.4S, v12.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. - add v12.4S, v22.4S, v9.4S // .......................................................................................*................................................................................................................................................................................................ - sub v22.4S, v22.4S, v9.4S // ......................................................................................*................................................................................................................................................................................................. - add v9.4S, v18.4S, v11.4S // ..............................................................*......................................................................................................................................................................................................................... - sub v11.4S, v18.4S, v11.4S // .............................................................*.......................................................................................................................................................................................................................... 
- add v18.4S, v13.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... - mls v16.4S, v19.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - mls v24.4S, v20.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ - sub v15.4S, v13.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ - add v19.4S, v17.4S, v14.4S // .............................................................................*.......................................................................................................................................................................................................... - sqrdmulh v20.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... 
- sub v17.4S, v17.4S, v14.4S // ............................................................................*........................................................................................................................................................................................................... - mul v14.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ - mul v13.4S, v27.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... - sqrdmulh v11.4S, v27.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... - mul v27.4S, v22.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... - sqrdmulh v22.4S, v22.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
- mls v14.4S, v20.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... - sqrdmulh v20.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. - mul v15.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - mls v27.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. - add v22.4S, v19.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. - sub v19.4S, v19.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... 
- sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... - add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... - sub v28.4S, v10.4S, v16.4S // ..........................................................................................................*............................................................................................................................................................................. - add v10.4S, v10.4S, v16.4S // ...........................................................................................................*............................................................................................................................................................................ - sqrdmulh v16.4S, v23.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - mul v23.4S, v23.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... 
- mls v13.4S, v11.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - sub v11.4S, v8.4S, v22.4S // ........................................................................................................................................*............................................................................................................................................... - add v8.4S, v8.4S, v22.4S // .........................................................................................................................................*.............................................................................................................................................. - mul v22.4S, v17.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ - mls v15.4S, v20.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ 
- mul v20.4S, v19.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. - sqrdmulh v19.4S, v19.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ - mls v23.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - add v16.4S, v13.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... - mls v22.4S, v17.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... - sub v24.4S, v13.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ 
- sqrdmulh v13.4S, v11.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - add v17.4S, v21.4S, v16.4S // ..........................................................................................................................*............................................................................................................................................................. - sub v21.4S, v21.4S, v16.4S // .........................................................................................................................*.............................................................................................................................................................. - mul v16.4S, v11.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - sub v11.4S, v9.4S, v18.4S // .....................................................................................................*.................................................................................................................................................................................. - mls v20.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... 
- add v9.4S, v9.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. - add v18.4S, v22.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ - sub v22.4S, v22.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... - sub v27.4S, v14.4S, v15.4S // ...............................................................................................................*........................................................................................................................................................................ - add v14.4S, v14.4S, v15.4S // ................................................................................................................*....................................................................................................................................................................... - sqrdmulh v19.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... 
- mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... - sub v15.4S, v9.4S, v17.4S // .............................................................................................................................................*.......................................................................................................................................... - add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................*......................................................................................................................................... - sqrdmulh v17.4S, v24.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... - mul v24.4S, v24.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - mls v16.4S, v13.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
- sqrdmulh v13.4S, v22.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - mls v12.4S, v19.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... - sub v19.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... - add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... - mul v18.4S, v22.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... - mls v24.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ 
- mul v17.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... - sqrdmulh v22.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - mls v18.4S, v13.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... - mul v13.4S, v11.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ - sqrdmulh v11.4S, v11.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - mls v17.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
- sqrdmulh v22.4S, v27.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - sub v28.4S, v23.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... - add v23.4S, v23.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... - mul v24.4S, v27.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... - mul v27.4S, v21.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - mls v13.4S, v11.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. 
- sqrdmulh v21.4S, v21.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q8, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q9, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q14, [x1] // *....................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ ldr q24, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q15, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q16, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q12, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q20, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... 
+ sub v19.4S, v22.4S, v21.4S // .........................................*.............................................................................................................................................................................................................................................. + add v21.4S, v22.4S, v21.4S // ..........................................*............................................................................................................................................................................................................................................. + ldr q22, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q11, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + sub v18.4S, v14.4S, v27.4S // ................*....................................................................................................................................................................................................................................................................... + add v23.4S, v9.4S, v8.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ sub v13.4S, v9.4S, v8.4S // ..........................*............................................................................................................................................................................................................................................................. + add v8.4S, v14.4S, v27.4S // .................*...................................................................................................................................................................................................................................................................... + sub v14.4S, v15.4S, v24.4S // ....................................*................................................................................................................................................................................................................................................... + add v24.4S, v15.4S, v24.4S // .....................................*.................................................................................................................................................................................................................................................. + sqrdmulh v27.4S, v19.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + mul v19.4S, v19.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... 
+ add v10.4S, v28.4S, v16.4S // ......................*................................................................................................................................................................................................................................................................. + sub v15.4S, v28.4S, v16.4S // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v17.4S, v18.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + mul v9.4S, v18.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... + sub v28.4S, v20.4S, v11.4S // ..............................................*......................................................................................................................................................................................................................................... + add v16.4S, v20.4S, v11.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ add v18.4S, v12.4S, v22.4S // ................................*....................................................................................................................................................................................................................................................... + sub v20.4S, v12.4S, v22.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q22, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + ldr q12, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + sqrdmulh v11.4S, v14.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. + mls v9.4S, v17.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ mls v19.4S, v27.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mul v17.4S, v14.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ + sub v14.4S, v23.4S, v18.4S // ..................................................................*..................................................................................................................................................................................................................... + mul v27.4S, v20.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... + add v23.4S, v23.4S, v18.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ sub v18.4S, v8.4S, v10.4S // ........................................................*............................................................................................................................................................................................................................... + add v8.4S, v8.4S, v10.4S // .........................................................*.............................................................................................................................................................................................................................. + add v10.4S, v22.4S, v12.4S // ....................................................*................................................................................................................................................................................................................................... + sub v12.4S, v22.4S, v12.4S // ...................................................*.................................................................................................................................................................................................................................... + mls v17.4S, v11.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + add v22.4S, v24.4S, v21.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ sub v24.4S, v24.4S, v21.4S // ............................................................................*........................................................................................................................................................................................................... + mls v27.4S, v20.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v20.4S, v15.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + mul v21.4S, v15.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... + mul v11.4S, v28.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + sub v15.4S, v16.4S, v10.4S // ......................................................................................*................................................................................................................................................................................................. 
+ add v16.4S, v16.4S, v10.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v10.4S, v28.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... + mls v21.4S, v20.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mls v11.4S, v10.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v10.4S, v13.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... 
+ add v20.4S, v17.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v17.4S, v17.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + mul v19.4S, v14.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. + sqrdmulh v14.4S, v14.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... + sub v28.4S, v22.4S, v16.4S // ....................................................................................................................*................................................................................................................................................................... + add v16.4S, v22.4S, v16.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ sub v22.4S, v8.4S, v23.4S // ................................................................................................*....................................................................................................................................................................................... + add v8.4S, v8.4S, v23.4S // .................................................................................................*...................................................................................................................................................................................... + mls v10.4S, v13.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sqrdmulh v13.4S, v17.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... + mul v23.4S, v17.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + sub v17.4S, v9.4S, v21.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ add v9.4S, v9.4S, v21.4S // ..............................................................*......................................................................................................................................................................................................................... + mul v21.4S, v12.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v12.4S, v12.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. + mls v19.4S, v14.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + sqrdmulh v14.4S, v15.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... + mul v15.4S, v15.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. 
+ mls v23.4S, v13.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v13.4S, v10.4S, v27.4S // ........................................................................*............................................................................................................................................................................................................... + sub v27.4S, v10.4S, v27.4S // .......................................................................*................................................................................................................................................................................................................ + sqrdmulh v10.4S, v18.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. + mls v21.4S, v12.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mul v12.4S, v18.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ 
+ mls v12.4S, v10.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v10.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + add v18.4S, v11.4S, v21.4S // ............................................................................................*........................................................................................................................................................................................... + mls v15.4S, v14.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v16.4S, v11.4S, v21.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ sub v14.4S, v9.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v21.4S, v24.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + add v11.4S, v20.4S, v18.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v20.4S, v20.4S, v18.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... + add v9.4S, v9.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. 
+ sub v13.4S, v12.4S, v19.4S // ..........................................................................................................*............................................................................................................................................................................. + add v12.4S, v12.4S, v19.4S // ...........................................................................................................*............................................................................................................................................................................ + sqrdmulh v18.4S, v17.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + mul v19.4S, v17.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + mul v17.4S, v28.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. 
+ mls v21.4S, v24.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + mul v24.4S, v27.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. + mls v17.4S, v28.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + mul v28.4S, v13.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... 
+ mls v19.4S, v18.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + add v18.4S, v21.4S, v15.4S // ...............................................................................................................................*........................................................................................................................................................ + sub v21.4S, v21.4S, v15.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v15.4S, v14.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... + mls v28.4S, v13.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v13.4S, v16.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... 
+ sqrdmulh v16.4S, v16.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... + mls v24.4S, v27.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sub v27.4S, v9.4S, v11.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v9.4S, v11.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v11.4S, v14.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v14.4S, v10.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. 
+ mls v13.4S, v16.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v16.4S, v10.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + add v10.4S, v12.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + sub v18.4S, v12.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v15.4S, v11.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + add v11.4S, v19.4S, v24.4S // ................................................................................................................*....................................................................................................................................................................... 
+ sub v19.4S, v19.4S, v24.4S // ...............................................................................................................*........................................................................................................................................................................ + sqrdmulh v24.4S, v20.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + mul v20.4S, v20.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... + mul v12.4S, v22.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + mls v16.4S, v14.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ add v14.4S, v23.4S, v13.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v23.4S, v13.4S // ...................................................................................................................................*.................................................................................................................................................... + mul v13.4S, v21.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... + mls v20.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sqrdmulh v24.4S, v18.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... 
+ mul v18.4S, v18.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + mls v12.4S, v22.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... sub count, count, #1 layer1234_start: - mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
- add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... - sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ - add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ - sub v18.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. - sub v17.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... - mul v22.4S, v18.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
- sqrdmulh v21.4S, v18.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sqrdmulh v17.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
- mls v22.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - mul v21.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - sqrdmulh v28.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mls v18.4S, v17.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
- sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - sub v16.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... - sub v27.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... 
- mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - mls v21.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - cmge v15.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... - cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... 
- sub v15.4S, v15.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. - mul v20.4S, v27.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - sqrdmulh v27.4S, v27.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - sub v23.4S, v23.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... - sub v28.4S, v24.4S, v21.4S // ...........................................................................................................................................................................*............................................................................................................ - mls v18.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
- add v15.4S, v24.4S, v21.4S // ............................................................................................................................................................................*........................................................................................................... - cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... - cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - sqrdmulh v27.4S, v28.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
- mul v23.4S, v28.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sub v28.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - mul v24.4S, v16.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... 
- sqrdmulh v13.4S, v16.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... 
- mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - mls v21.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
- cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. - mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - mls v24.4S, v13.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... - cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
- sub v8.4S, v27.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. - cmge v11.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... - cmge v18.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - sub v27.4S, v13.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... - cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
- sub v11.4S, v11.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... - mls v17.4S, v8.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - cmge v14.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - cmge v10.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... 
- mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - mls v23.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - sub v11.4S, v10.4S, v14.4S // ..............................................................................................................................................................................................................................................................*......................... - sub v10.4S, v28.4S, v27.4S // ..................................................................................................................................................................................................................................................*..................................... - str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... 
- mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - sqrdmulh v10.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
- mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
- cmge v12.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - mls v17.4S, v10.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - cmge v10.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v16.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. 
- sub v27.4S, v13.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - ldr q28, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... - mls v18.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - sub v13.4S, v12.4S, v16.4S // ..........................................................................................................................................................................................................................................................................*............. - ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... 
- sub v15.4S, v12.4S, v28.4S // .....................e.................................................................................................................................................................................................................................................................. - cmge v9.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v16.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
- cmge v14.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... - mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - ldr q13, [x1, #0] // e....................................................................................................................................................................................................................................................................................... - add v28.4S, v12.4S, v28.4S // ......................e................................................................................................................................................................................................................................................................. - sub v12.4S, v9.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... - cmge v9.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... 
- ldr q11, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... - sub v8.4S, v16.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - mls v21.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - sub v27.4S, v14.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. - cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
- ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. - ldr q19, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... - mls v23.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - sub v12.4S, v9.4S, v16.4S // ......................................................................................................................................................................................................................................................*................................. - mls v20.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - mls v24.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
- str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - sub v16.4S, v13.4S, v11.4S // ................e....................................................................................................................................................................................................................................................................... - str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - cmge v21.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - add v9.4S, v19.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ - add v8.4S, v13.4S, v11.4S // .................e...................................................................................................................................................................................................................................................................... 
- sqrdmulh v10.4S, v15.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - str q24, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - ldr q20, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. - cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... 
- str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - sub v13.4S, v19.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. - mul v11.4S, v15.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - sqrdmulh v19.4S, v16.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... - ldr q15, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - mul v16.4S, v16.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
- ldr q23, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ - ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ - sub v24.4S, v27.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. - mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - add v21.4S, v8.4S, v28.4S // .........................................................e.............................................................................................................................................................................................................................. - ldr q14, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... 
- sub v12.4S, v8.4S, v28.4S // ........................................................e............................................................................................................................................................................................................................... - ldr q28, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. - sqrdmulh v27.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... - mul v13.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - mls v16.4S, v19.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... - mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
- sqrdmulh v19.4S, v12.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ - mls v11.4S, v10.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... - mls v13.4S, v27.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - add v8.4S, v20.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... 
- sub v17.4S, v20.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ - mul v10.4S, v12.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. - add v24.4S, v14.4S, v28.4S // .....................................e.................................................................................................................................................................................................................................................. - sub v22.4S, v15.4S, v23.4S // .........................................e.............................................................................................................................................................................................................................................. - add v23.4S, v15.4S, v23.4S // ..........................................e............................................................................................................................................................................................................................................. - sub v28.4S, v14.4S, v28.4S // ....................................e................................................................................................................................................................................................................................................... 
- add v14.4S, v9.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... - sub v27.4S, v9.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... - mul v20.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... - sqrdmulh v12.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - mul v8.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ 
- sub v17.4S, v16.4S, v11.4S // .............................................................e.......................................................................................................................................................................................................................... - mls v10.4S, v19.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... - ldr q19, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... - ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - add v18.4S, v16.4S, v11.4S // ..............................................................e......................................................................................................................................................................................................................... - mul v9.4S, v27.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... 
- add v16.4S, v21.4S, v14.4S // .................................................................................................e...................................................................................................................................................................................... - sqrdmulh v27.4S, v27.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - sub v11.4S, v24.4S, v23.4S // ............................................................................e........................................................................................................................................................................................................... - add v24.4S, v24.4S, v23.4S // .............................................................................e.......................................................................................................................................................................................................... - mul v23.4S, v22.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - sqrdmulh v22.4S, v22.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... 
- mls v20.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - mls v8.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - mls v9.4S, v27.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. - ldr q27, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... - sub v12.4S, v21.4S, v14.4S // ................................................................................................e....................................................................................................................................................................................... - ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
- sub v14.4S, v19.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... - mls v23.4S, v22.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - add v28.4S, v19.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... - add v19.4S, v13.4S, v20.4S // ........................................................................e............................................................................................................................................................................................................... - sub v13.4S, v13.4S, v20.4S // .......................................................................e................................................................................................................................................................................................................ - mul v21.4S, v14.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
- sqrdmulh v22.4S, v14.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - mul v14.4S, v11.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - sqrdmulh v20.4S, v11.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ - add v11.4S, v8.4S, v23.4S // ..................................................................................e..................................................................................................................................................................................................... - sub v23.4S, v8.4S, v23.4S // .................................................................................e...................................................................................................................................................................................................... - add v8.4S, v15.4S, v27.4S // ...............................................e........................................................................................................................................................................................................................................ 
- sub v27.4S, v15.4S, v27.4S // ..............................................e......................................................................................................................................................................................................................................... - mls v21.4S, v22.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ - mul v15.4S, v13.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. - sqrdmulh v22.4S, v13.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. - mls v14.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... - add v13.4S, v8.4S, v28.4S // .......................................................................................e................................................................................................................................................................................................ 
- sub v8.4S, v8.4S, v28.4S // ......................................................................................e................................................................................................................................................................................................. - sqrdmulh v28.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... - mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... - sub v20.4S, v24.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... - add v13.4S, v24.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. - mls v15.4S, v22.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ 
- mls v27.4S, v28.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - sqrdmulh v22.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - mul v20.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. - sqrdmulh v24.4S, v23.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - mul v23.4S, v23.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... - add v28.4S, v18.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. 
- sub v19.4S, v18.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. - mul v18.4S, v8.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... - mls v20.4S, v22.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... - sub v22.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ - add v27.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... - sqrdmulh v21.4S, v8.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
- mls v23.4S, v24.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. - mul v24.4S, v17.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ - sqrdmulh v8.4S, v17.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... - sub v17.4S, v10.4S, v9.4S // ..........................................................................................................e............................................................................................................................................................................. - add v10.4S, v10.4S, v9.4S // ...........................................................................................................e............................................................................................................................................................................ - sqrdmulh v9.4S, v22.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... 
- mul v22.4S, v22.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... - mls v18.4S, v21.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. - sqrdmulh v21.4S, v19.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - mls v24.4S, v8.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... - add v8.4S, v16.4S, v13.4S // .........................................................................................................................................e.............................................................................................................................................. - sub v16.4S, v16.4S, v13.4S // ........................................................................................................................................e............................................................................................................................................... 
- mul v13.4S, v19.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - sqrdmulh v19.4S, v17.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... - mul v17.4S, v17.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... - mls v13.4S, v21.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - mls v22.4S, v9.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - sqrdmulh v9.4S, v16.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ 
- mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. - mls v17.4S, v19.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... - add v21.4S, v14.4S, v18.4S // ...............................................................................................................................e........................................................................................................................................................ - sub v18.4S, v14.4S, v18.4S // ..............................................................................................................................e......................................................................................................................................................... - add v14.4S, v24.4S, v15.4S // ................................................................................................................e....................................................................................................................................................................... - sub v19.4S, v24.4S, v15.4S // ...............................................................................................................e........................................................................................................................................................................ 
- add v15.4S, v11.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. - sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... - mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... - mls v16.4S, v9.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... - sub v11.4S, v11.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. - sqrdmulh v27.4S, v18.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... 
- mul v18.4S, v18.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... - add v9.4S, v28.4S, v15.4S // ..............................................................................................................................................e......................................................................................................................................... - sub v15.4S, v28.4S, v15.4S // .............................................................................................................................................e.......................................................................................................................................... - sub v28.4S, v23.4S, v22.4S // ...................................................................................................................................e.................................................................................................................................................... - add v23.4S, v23.4S, v22.4S // ....................................................................................................................................e................................................................................................................................................... - mls v12.4S, v24.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
- mul v24.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... - sqrdmulh v22.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... - sub v19.4S, v10.4S, v21.4S // ..................................................................................................................................................e..................................................................................................................................... - add v10.4S, v10.4S, v21.4S // ...................................................................................................................................................e.................................................................................................................................... - sqrdmulh v21.4S, v11.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - mls v18.4S, v27.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... 
- mul v27.4S, v11.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + // Instructions: 280 + // Expected cycles: 35 + // Expected IPC: 8.00 + + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + sqrdmulh v22.4S, v19.4S, v0.S[3] // *....................................................................................................................................................................................................................................................................................... + mul v19.4S, v19.4S, v0.S[2] // ...*.................................................................................................................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // .*...................................................................................................................................................................................................................................................................................... 
+ sub v24.4S, v11.4S, v14.4S // ........*............................................................................................................................................................................................................................................................................... + add v11.4S, v11.4S, v14.4S // ..........*............................................................................................................................................................................................................................................................................. + mls v13.4S, v21.4S, v29.4S // ..*..................................................................................................................................................................................................................................................................................... + cmge v14.4S, v16.4S, v30.4S // .....*.................................................................................................................................................................................................................................................................................. + cmge v21.4S, v31.4S, v16.4S // ....*................................................................................................................................................................................................................................................................................... + mls v19.4S, v22.4S, v29.4S // ......*................................................................................................................................................................................................................................................................................. 
+ sub v14.4S, v21.4S, v14.4S // .......*................................................................................................................................................................................................................................................................................ + mul v21.4S, v23.4S, v1.S[0] // ...............*........................................................................................................................................................................................................................................................................ + sub v22.4S, v28.4S, v13.4S // ............*........................................................................................................................................................................................................................................................................... + add v28.4S, v28.4S, v13.4S // ...........*............................................................................................................................................................................................................................................................................ + sqrdmulh v13.4S, v9.4S, v26.4S // ...............................................................*........................................................................................................................................................................................................................ + mul v9.4S, v9.4S, v25.4S // ................................................................*....................................................................................................................................................................................................................... 
+ mls v16.4S, v14.4S, v29.4S // .................*...................................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v23.4S, v1.S[1] // ..............*......................................................................................................................................................................................................................................................................... + sub v23.4S, v15.4S, v20.4S // .........*.............................................................................................................................................................................................................................................................................. + mls v9.4S, v13.4S, v29.4S // .........................................................................*.............................................................................................................................................................................................................. + add v13.4S, v15.4S, v20.4S // .............*.......................................................................................................................................................................................................................................................................... + sub v15.4S, v12.4S, v17.4S // ................*....................................................................................................................................................................................................................................................................... 
+ cmge v20.4S, v18.4S, v30.4S // ..........................*............................................................................................................................................................................................................................................................. + add v12.4S, v12.4S, v17.4S // ..................*..................................................................................................................................................................................................................................................................... + str q16, [x1, #512] // ........................*............................................................................................................................................................................................................................................................... + mul v17.4S, v27.4S, v0.S[0] // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ....................*................................................................................................................................................................................................................................................................... + cmge v27.4S, v31.4S, v18.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ mls v21.4S, v14.4S, v29.4S // .......................*................................................................................................................................................................................................................................................................ + mls v17.4S, v16.4S, v29.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v16.4S, v23.4S, v0.S[0] // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v0.S[1] // ......................*................................................................................................................................................................................................................................................................. + sub v20.4S, v27.4S, v20.4S // ...............................*........................................................................................................................................................................................................................................................ + mul v27.4S, v15.4S, v0.S[0] // ............................*........................................................................................................................................................................................................................................................... 
+ sqrdmulh v14.4S, v15.4S, v0.S[1] // .............................*.......................................................................................................................................................................................................................................................... + add v15.4S, v19.4S, v21.4S // ...................................*.................................................................................................................................................................................................................................................... + mls v18.4S, v20.4S, v29.4S // ..................................*..................................................................................................................................................................................................................................................... + sub v20.4S, v19.4S, v21.4S // .....................................*.................................................................................................................................................................................................................................................. + mul v19.4S, v24.4S, v0.S[0] // .............................................*.......................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // ...............................................*........................................................................................................................................................................................................................................ 
+ mls v16.4S, v23.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + cmge v23.4S, v17.4S, v30.4S // .........................................*.............................................................................................................................................................................................................................................. + cmge v21.4S, v31.4S, v17.4S // ..........................................*............................................................................................................................................................................................................................................. + mls v27.4S, v14.4S, v29.4S // ....................................*................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v22.4S, v0.S[1] // .................................*...................................................................................................................................................................................................................................................... + mul v22.4S, v22.4S, v0.S[0] // ................................*....................................................................................................................................................................................................................................................... 
+ str q18, [x1, #640] // ...........................................*............................................................................................................................................................................................................................................ + mls v19.4S, v24.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sub v23.4S, v21.4S, v23.4S // ..............................................*......................................................................................................................................................................................................................................... + sqrdmulh v21.4S, v13.4S, v26.4S // ........................................................*............................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v25.4S // ..................................................................*..................................................................................................................................................................................................................... + cmge v24.4S, v31.4S, v16.4S // ......................................*................................................................................................................................................................................................................................................. 
+ cmge v18.4S, v16.4S, v30.4S // .......................................*................................................................................................................................................................................................................................................ + mls v17.4S, v23.4S, v29.4S // ...................................................*.................................................................................................................................................................................................................................... + mls v22.4S, v14.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v28.4S, v26.4S // ..........................................................*............................................................................................................................................................................................................................. + mul v14.4S, v28.4S, v25.4S // ...........................................................*............................................................................................................................................................................................................................ + sub v28.4S, v24.4S, v18.4S // ............................................*........................................................................................................................................................................................................................................... 
+ cmge v18.4S, v27.4S, v30.4S // ................................................*....................................................................................................................................................................................................................................... + str q17, [x1, #576] // ............................................................*........................................................................................................................................................................................................................... + cmge v17.4S, v31.4S, v27.4S // .................................................*...................................................................................................................................................................................................................................... + cmge v24.4S, v19.4S, v30.4S // ..............................................................*......................................................................................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + mls v13.4S, v21.4S, v29.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ cmge v21.4S, v22.4S, v30.4S // ......................................................*................................................................................................................................................................................................................................. + cmge v23.4S, v31.4S, v22.4S // .....................................................*.................................................................................................................................................................................................................................. + mls v16.4S, v28.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v18.4S // ....................................................*................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v20.4S, v0.S[1] // ..........................................................................*............................................................................................................................................................................................................. + sqrdmulh v18.4S, v11.4S, v26.4S // .......................................................................*................................................................................................................................................................................................................ 
+ sub v23.4S, v23.4S, v21.4S // .............................................................*.......................................................................................................................................................................................................................... + mul v11.4S, v11.4S, v25.4S // ..............................................................................*......................................................................................................................................................................................................... + cmge v21.4S, v31.4S, v19.4S // ............................................................................*........................................................................................................................................................................................................... + mls v27.4S, v17.4S, v29.4S // .....................................................................*.................................................................................................................................................................................................................. + sqrdmulh v17.4S, v12.4S, v26.4S // ....................................................................................*................................................................................................................................................................................................... + mul v12.4S, v12.4S, v25.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ str q16, [x1, #832] // .........................................................*.............................................................................................................................................................................................................................. + sub v24.4S, v21.4S, v24.4S // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v21.4S, v10.4S, v26.4S // ...................................................................*.................................................................................................................................................................................................................... + sqrdmulh v16.4S, v15.4S, v26.4S // ........................................................................*............................................................................................................................................................................................................... + mul v10.4S, v10.4S, v25.4S // ....................................................................*................................................................................................................................................................................................................... + mls v22.4S, v23.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ mul v23.4S, v20.4S, v0.S[0] // ...........................................................................*............................................................................................................................................................................................................ + mls v11.4S, v18.4S, v29.4S // ............................................................................................*........................................................................................................................................................................................... + mul v15.4S, v15.4S, v25.4S // .................................................................................*...................................................................................................................................................................................................... + mls v15.4S, v16.4S, v29.4S // ................................................................................................*....................................................................................................................................................................................... + mls v12.4S, v17.4S, v29.4S // .................................................................................................*...................................................................................................................................................................................... + str q22, [x1, #896] // ................................................................................*....................................................................................................................................................................................................... 
+ str q27, [x1, #768] // ..................................................................................*..................................................................................................................................................................................................... + sqrdmulh v20.4S, v8.4S, v26.4S // ........................................................................................*............................................................................................................................................................................................... + mul v8.4S, v8.4S, v25.4S // .........................................................................................*.............................................................................................................................................................................................. + cmge v16.4S, v14.4S, v30.4S // ............................................................................................................*........................................................................................................................................................................... + cmge v22.4S, v13.4S, v30.4S // ..........................................................................................*............................................................................................................................................................................................. + cmge v27.4S, v11.4S, v30.4S // ....................................................................................................................*................................................................................................................................................................... 
+ cmge v18.4S, v31.4S, v11.4S // .......................................................................................................................*................................................................................................................................................................ + mls v10.4S, v21.4S, v29.4S // ...............................................................................*........................................................................................................................................................................................................ + mls v23.4S, v28.4S, v29.4S // ......................................................................................*................................................................................................................................................................................................. + cmge v17.4S, v31.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + cmge v28.4S, v31.4S, v13.4S // ...........................................................................................*............................................................................................................................................................................................ + sub v27.4S, v18.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ mls v8.4S, v20.4S, v29.4S // .....................................................................................................*.................................................................................................................................................................................. + cmge v20.4S, v31.4S, v12.4S // ...............................................................................................................*........................................................................................................................................................................ + cmge v18.4S, v12.4S, v30.4S // .................................................................................................................*...................................................................................................................................................................... + sub v22.4S, v28.4S, v22.4S // ...................................................................................................*.................................................................................................................................................................................... + sub v21.4S, v17.4S, v16.4S // ......................................................................................................................*................................................................................................................................................................. + cmge v28.4S, v31.4S, v9.4S // .............................................................................................*.......................................................................................................................................................................................... 
+ mls v19.4S, v24.4S, v29.4S // .......................................................................................*................................................................................................................................................................................................ + sub v24.4S, v20.4S, v18.4S // ..........................................................................................................................*............................................................................................................................................................. + cmge v18.4S, v9.4S, v30.4S // ..............................................................................................*......................................................................................................................................................................................... + cmge v20.4S, v23.4S, v30.4S // ........................................................................................................*............................................................................................................................................................................... + cmge v16.4S, v31.4S, v23.4S // .......................................................................................................*................................................................................................................................................................................ + mls v13.4S, v22.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ cmge v22.4S, v8.4S, v30.4S // ..................................................................................................................*..................................................................................................................................................................... + str q13, [x1, #320] // .....................................................................................................................*.................................................................................................................................................................. + cmge v13.4S, v31.4S, v15.4S // ........................................................................................................................*............................................................................................................................................................... + mls v12.4S, v24.4S, v29.4S // .................................................................................................................................*...................................................................................................................................................... + cmge v24.4S, v15.4S, v30.4S // .........................................................................................................................*.............................................................................................................................................................. + str q19, [x1, #704] // ..................................................................................................*..................................................................................................................................................................................... 
+ cmge v19.4S, v31.4S, v8.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v11.4S, v27.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + cmge v17.4S, v10.4S, v30.4S // ....................................................................................................*................................................................................................................................................................................... + sub v27.4S, v16.4S, v20.4S // ..............................................................................................................*......................................................................................................................................................................... + cmge v16.4S, v31.4S, v10.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v18.4S, v28.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. 
+ sub v19.4S, v19.4S, v22.4S // ............................................................................................................................*........................................................................................................................................................... + ldr q28, [x1, #720] // ............................................................................................................................................*........................................................................................................................................... + ldr q20, [x1, #656] // .............................................................................................................................................*.......................................................................................................................................... + sub v22.4S, v13.4S, v24.4S // ................................................................................................................................*....................................................................................................................................................... + ldr q24, [x1] // ................................................................................................................................................*....................................................................................................................................... + str q12, [x1, #256] // ........................................................................................................................................*............................................................................................................................................... 
+ sub v16.4S, v16.4S, v17.4S // ..........................................................................................................*............................................................................................................................................................................. + ldr q17, [x1, #80] // .................................................................................................................................................*...................................................................................................................................... + mls v23.4S, v27.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v14.4S, v21.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... + str q11, [x1, #192] // ..........................................................................................................................................*............................................................................................................................................. + mls v15.4S, v22.4S, v29.4S // ....................................................................................................................................*................................................................................................................................................... 
+ mls v9.4S, v18.4S, v29.4S // .............................................................................................................*.......................................................................................................................................................................... + mls v8.4S, v19.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mls v10.4S, v16.4S, v29.4S // ...........................................................................................................................*............................................................................................................................................................ + ldr q13, [x1, #272] // ...............................................................................................................................................*........................................................................................................................................ + ldr q22, [x1, #208] // ....................................................................................................................................................*................................................................................................................................... + str q15, [x1, #448] // ...........................................................................................................................................*............................................................................................................................................ 
+ ldr q12, [x1, #592] // ..................................................................................................................................................*..................................................................................................................................... + sub v19.4S, v20.4S, v28.4S // ........................................................................................................................................................*............................................................................................................................... + ldr q15, [x1, #336] // ..............................................................................................................................................*......................................................................................................................................... + add v16.4S, v24.4S, v17.4S // ...............................................................................................................................................................*........................................................................................................................ + add v27.4S, v20.4S, v28.4S // .........................................................................................................................................................*.............................................................................................................................. + sub v11.4S, v24.4S, v17.4S // ............................................................................................................................................................*........................................................................................................................... 
+ ldr q28, [x1, #528] // ...................................................................................................................................................*.................................................................................................................................... + ldr q18, [x1, #144] // .....................................................................................................................................................*.................................................................................................................................. + str q14, [x1, #384] // .......................................................................................................................................*................................................................................................................................................ + ldr q14, [x1, #400] // ......................................................................................................................................................*................................................................................................................................. + ldr q24, [x1, #464] // ..........................................................................................................................................................*............................................................................................................................. + str q9, [x1, #64] // ................................................................................................................*....................................................................................................................................................................... 
+ str q10, [x1, #128] // .....................................................................................................................................*.................................................................................................................................................. + ldr q10, [x1, #976] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v9.4S, v19.4S, v6.S[1] // ..................................................................................................................................................................*..................................................................................................................... + mul v19.4S, v19.4S, v6.S[0] // ...................................................................................................................................................................*.................................................................................................................... + str q23, [x1, #960] // ......................................................................................................................................*................................................................................................................................................. + mul v21.4S, v11.4S, v3.S[2] // .......................................................................................................................................................................*................................................................................................................ 
+ ldr q23, [x1, #912] // ............................................................................................................................................................................*........................................................................................................... + sub v20.4S, v28.4S, v12.4S // ................................................................................................................................................................*....................................................................................................................... + add v17.4S, v13.4S, v15.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v15.4S, v13.4S, v15.4S // ..............................................................................................................................................................*......................................................................................................................... + add v13.4S, v28.4S, v12.4S // .................................................................................................................................................................*...................................................................................................................... + sub v12.4S, v14.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ mls v19.4S, v9.4S, v29.4S // ................................................................................................................................................................................*....................................................................................................... + add v9.4S, v18.4S, v22.4S // ....................................................................................................................................................................*................................................................................................................... + str q8, [x1], #(16) // .........................................................................................................................................*.............................................................................................................................................. + sqrdmulh v11.4S, v11.4S, v3.S[3] // ......................................................................................................................................................................*................................................................................................................. + ldr q28, [x1, #768] // .......................................................................................................................................................*................................................................................................................................ + sub v8.4S, v23.4S, v10.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ add v10.4S, v23.4S, v10.4S // ........................................................................................................................................................................................*............................................................................................... + ldr q23, [x1, #832] // ...........................................................................................................................................................*............................................................................................................................ + add v14.4S, v14.4S, v24.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v24.4S, v13.4S, v27.4S // ............................................................................................................................................................................................*........................................................................................... + add v27.4S, v13.4S, v27.4S // ...........................................................................................................................................................................................*............................................................................................ + sub v13.4S, v18.4S, v22.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ mul v22.4S, v8.4S, v7.S[0] // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v18.4S, v20.4S, v5.S[3] // ..............................................................................................................................................................................*......................................................................................................... + mul v20.4S, v20.4S, v5.S[2] // .................................................................................................................................................................................*...................................................................................................... + mls v21.4S, v11.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sqrdmulh v11.4S, v8.4S, v7.S[1] // ......................................................................................................................................................................................................................*................................................................. + add v8.4S, v28.4S, v23.4S // .........................................................................................................................................................................*.............................................................................................................. 
+ sub v28.4S, v28.4S, v23.4S // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v23.4S, v15.4S, v4.S[3] // .......................................................................................................................................................................................................*................................................................................ + mul v15.4S, v15.4S, v4.S[2] // ......................................................................................................................................................................................................*................................................................................. + mls v20.4S, v18.4S, v29.4S // ..........................................................................................................................................................................................*............................................................................................. + add v18.4S, v17.4S, v14.4S // .....................................................................................................................................................................................*.................................................................................................. + sub v17.4S, v17.4S, v14.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ mls v22.4S, v11.4S, v29.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v11.4S, v28.4S, v6.S[2] // ................................................................................................................................................................................................*....................................................................................... + sqrdmulh v14.4S, v28.4S, v6.S[3] // ...................................................................................................................................................................................................*.................................................................................... + add v28.4S, v8.4S, v10.4S // ..................................................................................................................................................................................................*..................................................................................... + sub v8.4S, v8.4S, v10.4S // .................................................................................................................................................................................................*...................................................................................... + mls v15.4S, v23.4S, v29.4S // ................................................................................................................................................................................................................*....................................................................... 
+ sqrdmulh v23.4S, v17.4S, v2.S[1] // ...........................................................................................................................................................................................................*............................................................................ + sub v10.4S, v16.4S, v9.4S // ......................................................................................................................................................................................*................................................................................................. + add v16.4S, v16.4S, v9.4S // .......................................................................................................................................................................................*................................................................................................ + mul v9.4S, v13.4S, v4.S[0] // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v13.4S, v13.4S, v4.S[1] // ..............................................................................................................................................................................................*......................................................................................... + mls v11.4S, v14.4S, v29.4S // .....................................................................................................................................................................................................*.................................................................................. 
+ add v14.4S, v27.4S, v28.4S // .............................................................................................................................................................................................................*.......................................................................... + sub v28.4S, v27.4S, v28.4S // ............................................................................................................................................................................................................*........................................................................... + mul v27.4S, v17.4S, v2.S[0] // ..........................................................................................................................................................................................................*............................................................................. + add v17.4S, v20.4S, v19.4S // ........................................................................................................................................................................................................*............................................................................... + sub v19.4S, v20.4S, v19.4S // .........................................................................................................................................................................................................*.............................................................................. + sqrdmulh v20.4S, v12.4S, v5.S[1] // ....................................................................................................................................................................................*................................................................................................... 
+ mul v12.4S, v12.4S, v5.S[0] // ...................................................................................................................................................................................*.................................................................................................... + mls v9.4S, v13.4S, v29.4S // ....................................................................................................................................................................................................*................................................................................... + sqrdmulh v13.4S, v24.4S, v2.S[3] // ..........................................................................................................................................................................................................................................*............................................. + mul v24.4S, v24.4S, v2.S[2] // .......................................................................................................................................................................................................................................*................................................ + mls v27.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................................*................................................................ + mul v23.4S, v19.4S, v2.S[2] // ..................................................................................................................................................................................................................*..................................................................... 
+ sqrdmulh v19.4S, v19.4S, v2.S[3] // .................................................................................................................................................................................................................*...................................................................... + mls v12.4S, v20.4S, v29.4S // .............................................................................................................................................................................................*.......................................................................................... + add v20.4S, v11.4S, v22.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v24.4S, v13.4S, v29.4S // ..................................................................................................................................................................................................................................................*..................................... + sub v13.4S, v11.4S, v22.4S // .....................................................................................................................................................................................................................................*.................................................. + add v22.4S, v16.4S, v18.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ mul v11.4S, v8.4S, v3.S[0] // .........................................................................................................................................................................................................................*.............................................................. + mls v23.4S, v19.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v19.4S, v16.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... + mul v16.4S, v10.4S, v1.S[2] // ...............................................................................................................................................................................................................................*........................................................ + sqrdmulh v10.4S, v10.4S, v1.S[3] // .............................................................................................................................................................................................................................*.......................................................... + mls v16.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ sqrdmulh v18.4S, v8.4S, v3.S[1] // ........................................................................................................................................................................................................................*............................................................... + sub v10.4S, v21.4S, v9.4S // ...................................................................................................................................................................................................................*.................................................................... + add v9.4S, v21.4S, v9.4S // ....................................................................................................................................................................................................................*................................................................... + add v21.4S, v15.4S, v12.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v15.4S, v15.4S, v12.4S // ............................................................................................................................................................................................................................*........................................................... + sqrdmulh v8.4S, v13.4S, v3.S[1] // ..............................................................................................................................................................................................................................................................*......................... 
+ mul v13.4S, v13.4S, v3.S[0] // .............................................................................................................................................................................................................................................................*.......................... + sub v12.4S, v16.4S, v27.4S // ............................................................................................................................................................................................................................................*........................................... + mls v11.4S, v18.4S, v29.4S // ....................................................................................................................................................................................................................................*................................................... + add v18.4S, v16.4S, v27.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v16.4S, v22.4S, v14.4S // .................................................................................................................................................................................................................................*...................................................... + mls v13.4S, v8.4S, v29.4S // ....................................................................................................................................................................................................................................................................*................... 
+ add v8.4S, v22.4S, v14.4S // ..................................................................................................................................................................................................................................*..................................................... + sub v14.4S, v9.4S, v21.4S // ......................................................................................................................................................................................................................................*................................................. + add v27.4S, v9.4S, v21.4S // ...........................................................................................................................................................................................................................................*............................................ + sqrdmulh v9.4S, v15.4S, v2.S[1] // ....................................................................................................................................................................................................................................................*................................... + add v22.4S, v24.4S, v11.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v21.4S, v24.4S, v11.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ mul v11.4S, v15.4S, v2.S[0] // ...................................................................................................................................................................................................................................................*.................................... + mul v24.4S, v10.4S, v1.S[2] // ...............................................................................................................................................................................................................................................*........................................ + sqrdmulh v15.4S, v10.4S, v1.S[3] // ..............................................................................................................................................................................................................................................*......................................... + sub v10.4S, v17.4S, v20.4S // .........................................................................................................................................................................................................................................*.............................................. + add v20.4S, v17.4S, v20.4S // ........................................................................................................................................................................................................................................*............................................... + mul v17.4S, v28.4S, v1.S[0] // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................................................................................................................................*...................................... + mls v11.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sqrdmulh v9.4S, v16.4S, v0.S[1] // ...................................................................................................................................................................................................................................................................*.................... + mls v24.4S, v15.4S, v29.4S // ........................................................................................................................................................................................................................................................*............................... + mul v16.4S, v16.4S, v0.S[0] // .....................................................................................................................................................................................................................................................................*.................. + mls v17.4S, v28.4S, v29.4S // .....................................................................................................................................................................................................................................................*.................................. 
+ mul v15.4S, v14.4S, v0.S[2] // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v14.4S, v14.4S, v0.S[3] // ..................................................................................................................................................................................................................................................................*..................... + mul v28.4S, v12.4S, v0.S[2] // ......................................................................................................................................................................................................................................................*................................. + sqrdmulh v12.4S, v12.4S, v0.S[3] // .......................................................................................................................................................................................................................................................*................................ + mls v16.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + add v9.4S, v27.4S, v20.4S // .................................................................................................................................................................................................................................................................*...................... 
+ sub v27.4S, v27.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mul v20.4S, v10.4S, v1.S[0] // ............................................................................................................................................................................................................................................................................*........... + mls v15.4S, v14.4S, v29.4S // ........................................................................................................................................................................................................................................................................*............... + sqrdmulh v14.4S, v10.4S, v1.S[1] // ...........................................................................................................................................................................................................................................................................*............ + add v10.4S, v18.4S, v22.4S // ......................................................................................................................................................................................................................................................................*................. + sub v18.4S, v18.4S, v22.4S // .......................................................................................................................................................................................................................................................................*................ 
+ mls v28.4S, v12.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... + mul v12.4S, v19.4S, v0.S[2] // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v22.4S, v19.4S, v0.S[3] // ..............................................................................................................................................................................................................................................................................*......... + sub v19.4S, v24.4S, v11.4S // ..........................................................................................................................................................................................................................................................................*............. + add v11.4S, v24.4S, v11.4S // .........................................................................................................................................................................................................................................................................*.............. + sqrdmulh v24.4S, v18.4S, v0.S[1] // .....................................................................................................................................................................................................................................................................................*.. 
+ mul v18.4S, v18.4S, v0.S[0] // ......................................................................................................................................................................................................................................................................................*. + mls v20.4S, v14.4S, v29.4S // ....................................................................................................................................................................................................................................................................................*... + add v14.4S, v23.4S, v13.4S // ................................................................................................................................................................................................................................................................................*....... + sub v23.4S, v23.4S, v13.4S // .................................................................................................................................................................................................................................................................................*...... + mul v13.4S, v21.4S, v1.S[0] // ..................................................................................................................................................................................................................................................................................*..... + sqrdmulh v21.4S, v21.4S, v1.S[1] // ...................................................................................................................................................................................................................................................................................*.... 
+ mls v12.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................................................* - // original source code - // ldr q8, [x1, #0] // ............e...............................................................................................................................................................|.......................................................................................................................e................................................. - // ldr q9, [x1, #(1*(512/8))] // ................e...........................................................................................................................................................|...........................................................................................................................e............................................. - // ldr q10, [x1, #(2*(512/8))] // ...e........................................................................................................................................................................|..............................................................................................................e.......................................................... - // ldr q11, [x1, #(3*(512/8))] // e...........................................................................................................................................................................|...........................................................................................................e............................................................. 
- // ldr q12, [x1, #(4*(512/8))] // .......................e....................................................................................................................................................|..................................................................................................................................e...................................... - // ldr q13, [x1, #(5*(512/8))] // ......................e.....................................................................................................................................................|.................................................................................................................................e....................................... - // ldr q14, [x1, #(6*(512/8))] // .....................................e......................................................................................................................................|................................................................................................................................................e........................ - // ldr q15, [x1, #(7*(512/8))] // ..............................................e.............................................................................................................................|.........................................................................................................................................................e............... - // ldr q16, [x1, #(8*(512/8))] // ..................................................e.........................................................................................................................|.............................................................................................................................................................e........... 
- // ldr q17, [x1, #(9*(512/8))] // ....................................................e.......................................................................................................................|...............................................................................................................................................................e......... - // ldr q18, [x1, #(10*(512/8))] // ...........................................e................................................................................................................................|......................................................................................................................................................e.................. - // ldr q19, [x1, #(11*(512/8))] // .............................................e..............................................................................................................................|........................................................................................................................................................e................ - // ldr q20, [x1, #(12*(512/8))] // ............................................................................................e...............................................................................|......................................................................................................................................................................... - // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................e.................................................................................|......................................................................................................................................................................... 
- // ldr q22, [x1, #(14*(512/8))] // .............................................................................e..............................................................................................|......................................................................................................................................................................... - // ldr q23, [x1, #(15*(512/8))] // ..............................................................................e.............................................................................................|......................................................................................................................................................................... - // sub v24.4s, v8.4s, v9.4s // .............................e..............................................................................................................................................|........................................................................................................................................e................................ - // add v8.4s, v8.4s, v9.4s // .................................e..........................................................................................................................................|............................................................................................................................................e............................ - // mul v9.4s, v24.4s, v3.s[2] // ............................................e...............................................................................................................................|.......................................................................................................................................................e................. 
- // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................e.................................................................................................................................|.....................................................................................................................................................e................... - // mls v9.4s, v24.4s, v29.4s // .......................................................e....................................................................................................................|..................................................................................................................................................................e...... - // sub v24.4s, v10.4s, v11.4s // ....e.......................................................................................................................................................................|...............................................................................................................e......................................................... - // add v10.4s, v10.4s, v11.4s // .............e..............................................................................................................................................................|........................................................................................................................e................................................ - // mul v11.4s, v24.4s, v4.s[0] // .........................................e..................................................................................................................................|....................................................................................................................................................e.................... 
- // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................e.........................................................................................................................................|.............................................................................................................................................e........................... - // mls v11.4s, v24.4s, v29.4s // ..........................................................e.................................................................................................................|.....................................................................................................................................................................e... - // sub v24.4s, v12.4s, v13.4s // ........................................e...................................................................................................................................|...................................................................................................................................................e..................... - // add v12.4s, v12.4s, v13.4s // ................................e...........................................................................................................................................|...........................................................................................................................................e............................. - // mul v13.4s, v24.4s, v4.s[2] // ......................................................e.....................................................................................................................|.................................................................................................................................................................e....... 
- // sqrdmulh v24.4s, v24.4s, v4.s[3] // .....................................................e......................................................................................................................|................................................................................................................................................................e........ - // mls v13.4s, v24.4s, v29.4s // ............................................................e...............................................................................................................|.......................................................................................................................................................................e. - // sub v24.4s, v14.4s, v15.4s // ...............................................................e............................................................................................................|......................................................................................................................................................................... - // add v14.4s, v14.4s, v15.4s // ..............................................................e.............................................................................................................|......................................................................................................................................................................... - // mul v15.4s, v24.4s, v5.s[0] // .......................................................................e....................................................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................................e...................................................................................................|......................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // .......................................................................................e....................................................................................|......................................................................................................................................................................... - // sub v24.4s, v16.4s, v17.4s // ....................................................................e.......................................................................................................|......................................................................................................................................................................... - // add v16.4s, v16.4s, v17.4s // .................................................................e..........................................................................................................|......................................................................................................................................................................... - // mul v17.4s, v24.4s, v5.s[2] // .........................................................................e..................................................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v5.s[3] // ..........................................................................e.................................................................................................|......................................................................................................................................................................... - // mls v17.4s, v24.4s, v29.4s // ........................................................................................e...................................................................................|......................................................................................................................................................................... - // sub v24.4s, v18.4s, v19.4s // ..................................................................e.........................................................................................................|......................................................................................................................................................................... - // add v18.4s, v18.4s, v19.4s // ...................................................................e........................................................................................................|......................................................................................................................................................................... - // mul v19.4s, v24.4s, v6.s[0] // .....................................................................................e......................................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v6.s[1] // ......................................................................................e.....................................................................................|......................................................................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // ..............................................................................................e.............................................................................|......................................................................................................................................................................... - // sub v24.4s, v20.4s, v21.4s // .........................................................................................................e..................................................................|......................................................................................................................................................................... - // add v20.4s, v20.4s, v21.4s // ........................................................................................................e...................................................................|......................................................................................................................................................................... - // mul v21.4s, v24.4s, v6.s[2] // .................................................................................................................e..........................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v6.s[3] // ................................................................................................................e...........................................................|......................................................................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // .....................................................................................................................e......................................................|......................................................................................................................................................................... - // sub v24.4s, v22.4s, v23.4s // .............................................................................................e..............................................................................|......................................................................................................................................................................... - // add v22.4s, v22.4s, v23.4s // ...............................................................................................e............................................................................|......................................................................................................................................................................... - // mul v23.4s, v24.4s, v7.s[0] // ..................................................................................................e.........................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v7.s[1] // ...................................................................................................e........................................................................|......................................................................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // ..........................................................................................................e.................................................................|......................................................................................................................................................................... - // sub v24.4s, v8.4s, v10.4s // ...................................................e........................................................................................................................|..............................................................................................................................................................e.......... - // add v8.4s, v8.4s, v10.4s // .................................................e..........................................................................................................................|............................................................................................................................................................e............ - // mul v10.4s, v24.4s, v1.s[2] // ................................................................e...........................................................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................e..................................................................................................................|....................................................................................................................................................................e.... - // mls v10.4s, v24.4s, v29.4s // ............................................................................e...............................................................................................|......................................................................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ...........................................................................e................................................................................................|......................................................................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ...............................................................................e............................................................................................|......................................................................................................................................................................... - // mul v11.4s, v24.4s, v1.s[2] // ..................................................................................................................................e.........................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................................e........................................|......................................................................................................................................................................... - // mls v11.4s, v24.4s, v29.4s // ..........................................................................................................................................e.................................|......................................................................................................................................................................... - // sub v24.4s, v12.4s, v14.4s // ......................................................................e.....................................................................................................|......................................................................................................................................................................... - // add v12.4s, v12.4s, v14.4s // .....................................................................e......................................................................................................|......................................................................................................................................................................... - // mul v14.4s, v24.4s, v2.s[0] // ................................................................................e...........................................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..................................................................................e.........................................................................................|......................................................................................................................................................................... - // mls v14.4s, v24.4s, v29.4s // .........................................................................................e..................................................................................|......................................................................................................................................................................... - // sub v24.4s, v13.4s, v15.4s // .................................................................................................e..........................................................................|......................................................................................................................................................................... - // add v13.4s, v13.4s, v15.4s // ................................................................................................e...........................................................................|......................................................................................................................................................................... - // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................e................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e...............................................................|......................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................|......................................................................................................................................................................... - // sub v24.4s, v16.4s, v18.4s // ...................................................................................e........................................................................................|......................................................................................................................................................................... - // add v16.4s, v16.4s, v18.4s // ....................................................................................e.......................................................................................|......................................................................................................................................................................... - // mul v18.4s, v24.4s, v2.s[2] // ....................................................................................................e.......................................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................................e......................................................................|......................................................................................................................................................................... - // mls v18.4s, v24.4s, v29.4s // .............................................................................................................e..............................................................|......................................................................................................................................................................... - // sub v24.4s, v17.4s, v19.4s // .......................................................................................................e....................................................................|......................................................................................................................................................................... - // add v17.4s, v17.4s, v19.4s // ......................................................................................................e.....................................................................|......................................................................................................................................................................... - // mul v19.4s, v24.4s, v2.s[2] // .........................................................................................................................e..................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................................................e...................................................|......................................................................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................|......................................................................................................................................................................... - // sub v24.4s, v20.4s, v22.4s // ...............................................................................................................e............................................................|......................................................................................................................................................................... - // add v20.4s, v20.4s, v22.4s // ..............................................................................................................e.............................................................|......................................................................................................................................................................... - // mul v22.4s, v24.4s, v3.s[0] // ............................................................................................................................e...............................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................e...........................................|......................................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................|......................................................................................................................................................................... - // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................................e.............................................|......................................................................................................................................................................... - // add v21.4s, v21.4s, v23.4s // ...............................................................................................................................e............................................|......................................................................................................................................................................... - // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e....................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................................e.....................................|......................................................................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................|......................................................................................................................................................................... - // sub v24.4s, v8.4s, v12.4s // ...........................................................................................e................................................................................|......................................................................................................................................................................... - // add v8.4s, v8.4s, v12.4s // .................................................................................e..........................................................................................|......................................................................................................................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................................................e................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................|......................................................................................................................................................................... - // mls v12.4s, v24.4s, v29.4s // ....................................................................................................................................................................e.......|......................................................................................................................................................................... - // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................e................................................|......................................................................................................................................................................... - // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................e.................................................|......................................................................................................................................................................... - // mul v13.4s, v24.4s, v0.s[2] // .............................................................................................................................................e..............................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................e..................................|......................................................................................................................................................................... - // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................e...........................|......................................................................................................................................................................... - // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................................e.......................................|......................................................................................................................................................................... - // add v10.4s, v10.4s, v14.4s // .....................................................................................................................................e......................................|......................................................................................................................................................................... - // mul v14.4s, v24.4s, v0.s[2] // ...............................................................................................................................................e............................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................................................e.............................|......................................................................................................................................................................... - // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................|......................................................................................................................................................................... - // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................e...................|......................................................................................................................................................................... - // add v11.4s, v11.4s, v15.4s // .......................................................................................................................................................e....................|......................................................................................................................................................................... - // mul v15.4s, v24.4s, v0.s[2] // .....................................................................................................................................................................e......|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ......................................................................................................................................................................e.....|......................................................................................................................................................................... - // mls v15.4s, v24.4s, v29.4s // ............................................................................................................................................................................*......................................................................................................................................................................... - // sub v24.4s, v16.4s, v20.4s // ..................................................................................................................e.........................................................|......................................................................................................................................................................... - // add v16.4s, v16.4s, v20.4s // ...................................................................................................................e........................................................|......................................................................................................................................................................... - // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................e....................................................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................e.....................................................|......................................................................................................................................................................... - // mls v20.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................|......................................................................................................................................................................... - // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................................e..............|......................................................................................................................................................................... - // add v17.4s, v17.4s, v21.4s // .........................................................................................................................................................e..................|......................................................................................................................................................................... - // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................e|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................e..|......................................................................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.*....................................................................................................................................................................... - // sub v24.4s, v18.4s, v22.4s // ......................................................................................................................................................e.....................|......................................................................................................................................................................... - // add v18.4s, v18.4s, v22.4s // .....................................................................................................................................................e......................|......................................................................................................................................................................... - // mul v22.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e............|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e.............|......................................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.|......................................................................................................................................................................... - // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e.........|......................................................................................................................................................................... - // add v19.4s, v19.4s, v23.4s // ...................................................................................................................................................................e........|......................................................................................................................................................................... - // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................|................*........................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................|.................*....................................................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|............................*............................................................................................................................................ - // sub v24.4s, v8.4s, v16.4s // ............................................................................................................................................e...............................|......................................................................................................................................................................... - // add v8.4s, v8.4s, v16.4s // ...........................................................................................................................................e................................|......................................................................................................................................................................... - // mul v16.4s, v24.4s, v0.s[0] // ...................................................................................................................................................e........................|......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................e.........................|......................................................................................................................................................................... - // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................e...............|......................................................................................................................................................................... - // sub v24.4s, v9.4s, v17.4s // .................................................................................................................................................................e..........|......................................................................................................................................................................... - // add v9.4s, v9.4s, v17.4s // ................................................................................................................................................................e...........|......................................................................................................................................................................... - // mul v17.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|....................*.................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.....................*................................................................................................................................................... - // mls v17.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................*............................................................................................................................................. - // sub v24.4s, v10.4s, v18.4s // .......................................................................................................................................................................e....|......................................................................................................................................................................... - // add v10.4s, v10.4s, v18.4s // ........................................................................................................................................................................e...|......................................................................................................................................................................... - // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..........*.............................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............*............................................................................................................................................................ - // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................|..................*...................................................................................................................................................... - // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................|....*.................................................................................................................................................................... - // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................|...*..................................................................................................................................................................... - // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............*........................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|..............*.......................................................................................................................................................... - // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...................*..................................................................................................................................................... - // sub v24.4s, v12.4s, v20.4s // ............................................................................................................................................................................|.........................*............................................................................................................................................... - // add v12.4s, v12.4s, v20.4s // ............................................................................................................................................................................|..........................*.............................................................................................................................................. - // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..................................*...................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................*..................................................................................................................................... - // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................................*............................................................................................................................. - // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................|.......................*................................................................................................................................................. - // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................|........................*................................................................................................................................................ - // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|................................................*........................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................................*..................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.................................................................*....................................................................................................... - // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................|......*.................................................................................................................................................................. - // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................|.....*................................................................................................................................................................... - // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|........*................................................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.........*............................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...............*......................................................................................................................................................... - // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.....................................*................................................................................................................................... - // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.......................................*................................................................................................................................. - // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............................................*........................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............................................*............................................................................................................................ - // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.........................................................*............................................................................................................... - // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|..*...................................................................................................................................................................... - // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|*........................................................................................................................................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.......*................................................................................................................................................................. 
- // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........*............................................................................................................................................................. - // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................|.............................................................*........................................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................|...............................................................*......................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................*................................................................................................... - // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................|............................................................................*............................................................................................ 
- // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................|.............................*........................................................................................................................................... - // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................|..............................*.......................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.................................*....................................................................................................................................... - // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................*.................................................................................................................................. - // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................*........................................................................................................................................ 
- // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................*......................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................*.................................................................................................................................... - // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................|..........................................*.............................................................................................................................. - // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................|..................................................................*...................................................................................................... - // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................|...................................................................*..................................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|........................................................................*................................................................................................ - // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................|.................................................................................*....................................................................................... - // cmge v27.4s, v31.4s, v21.4s // ..........*.................................................................................................................................................................|.....................................................................................................................*................................................... - // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|.......................................................................................................*................................................................. - // sub v28.4s, v27.4s, v28.4s // ....................*.......................................................................................................................................................|...............................................................................................................................*......................................... 
- // mls v21.4s, v28.4s, v29.4s // ...........................*................................................................................................................................................|......................................................................................................................................*.................................. - // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|........................................*................................................................................................................................ - // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|.........................................*............................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|..............................................*.......................................................................................................................... - // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................................*.................................................................................................................. 
- // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................|......................................................................*.................................................................................................. - // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................|.......................................................................*................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|...........................................................................*............................................................................................. - // mls v23.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...................................................................................*..................................................................................... - // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................|......................*.................................................................................................................................................. 
- // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................|......................................................................................*.................................................................................. - // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................|.................................................*....................................................................................................................... - // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................|...........................................................*............................................................................................................. - // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................|........................................................................................*................................................................................ - // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................|..............................................................................................................................................*.......................... 
- // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................|..........................................................*.............................................................................................................. - // str q23, [x1, #(15*(512/8))] // ............................................................................................................................................................................|...............................................................................................*......................................................................... - // mul v16.4s, v8.4s, v25.4s // ............................................................................................................................................................................|....................................................*.................................................................................................................... - // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................|.....................................................*................................................................................................................... - // mls v16.4s, v8.4s, v29.4s // ............................................................................................................................................................................|.......................................................*................................................................................................................. 
- // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................|..........................................................................................*.............................................................................. - // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................|............................................................................................*............................................................................ - // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................|.....................................................................................................*................................................................... - // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................|.............................................................................*........................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................|..............................................................................*.......................................................................................... 
- // mls v18.4s, v10.4s, v29.4s // .*..........................................................................................................................................................................|............................................................................................................*............................................................ - // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................|............................................................*............................................................................................................ - // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................|........................................................*................................................................................................................ - // mls v19.4s, v11.4s, v29.4s // ............................................................................................................................................................................|....................................................................*.................................................................................................... - // mul v20.4s, v12.4s, v25.4s // ............................................................................................................................................................................|.............................................................................................*........................................................................... 
- // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................|.........................................................................................*............................................................................... - // mls v20.4s, v12.4s, v29.4s // ............................................................................................................................................................................|....................................................................................................*.................................................................... - // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................|...............................................*......................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................|..................................................*...................................................................................................................... - // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................|..............................................................*.......................................................................................................... 
- // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................|................................................................*........................................................................................................ - // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................|..........................................................................*.............................................................................................. - // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................|.......................................................................................*................................................................................. - // mul v23.4s, v15.4s, v25.4s // ............................................................................................................................................................................|................................................................................................*........................................................................ - // sqrdmulh v15.4s, v15.4s, v26.4s // ............................................................................................................................................................................|.................................................................................................*....................................................................... 
- // mls v23.4s, v15.4s, v29.4s // ............................................................................................................................................................................|..........................................................................................................*.............................................................. - // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|.........................................................................*............................................................................................... - // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|..................................................................................*...................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................................*................................................................................... - // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........................................................................................*............................................................................. 
- // cmge v27.4s, v31.4s, v17.4s // ...............*............................................................................................................................................................|..........................................................................................................................*.............................................. - // cmge v28.4s, v17.4s, v30.4s // .....................*......................................................................................................................................................|................................................................................................................................*........................................ - // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................|....................................................................................................................................*.................................... - // mls v17.4s, v28.4s, v29.4s // ................................................*...........................................................................................................................|...........................................................................................................................................................*............. - // cmge v27.4s, v31.4s, v18.4s // ......................................*.....................................................................................................................................|.................................................................................................................................................*....................... 
- // cmge v28.4s, v18.4s, v30.4s // ...............................*............................................................................................................................................|..........................................................................................................................................*.............................. - // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................|..........................................................................................................................................................*.............. - // mls v18.4s, v28.4s, v29.4s // ........................................................*...................................................................................................................|...................................................................................................................................................................*..... - // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................................................................*........................................................................................ - // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................................................................*......................................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................................................................*.................................................................................... - // mls v19.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................|.................................................................................................................*....................................................... - // cmge v27.4s, v31.4s, v20.4s // ........*...................................................................................................................................................................|...................................................................................................................*..................................................... - // cmge v28.4s, v20.4s, v30.4s // .........*..................................................................................................................................................................|....................................................................................................................*.................................................... - // sub v28.4s, v27.4s, v28.4s // .................*..........................................................................................................................................................|............................................................................................................................*............................................ 
- // mls v20.4s, v28.4s, v29.4s // ..........................*.................................................................................................................................................|.....................................................................................................................................*................................... - // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................|..............................................................................................*.......................................................................... - // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|..................................................................................................*...................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.........................................................................................................*............................................................... - // mls v21.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................|..............................................................................................................................*.......................................... 
- // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|...................................................................................................*..................................................................... - // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|........................................................................................................*................................................................ - // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................|.............................................................................................................*........................................................... - // mls v22.4s, v28.4s, v29.4s // ...........*................................................................................................................................................................|......................................................................................................................*.................................................. - // cmge v27.4s, v31.4s, v23.4s // .....*......................................................................................................................................................................|................................................................................................................*........................................................ 
- // cmge v28.4s, v23.4s, v30.4s // .......*....................................................................................................................................................................|..................................................................................................................*...................................................... - // sub v28.4s, v27.4s, v28.4s // ..............*.............................................................................................................................................................|.........................................................................................................................*............................................... - // mls v23.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................|...................................................................................................................................*..................................... - // str q16, [x1], #(16) // ............................................................................................................................................................................|......................................................................................................*.................................................................. - // str q17, [x1, #(-16 + 1*(512/8))] // ...........................................................*................................................................................................................|......................................................................................................................................................................*.. 
- // str q18, [x1, #(-16 + 2*(512/8))] // .............................................................*..............................................................................................................|........................................................................................................................................................................* - // str q19, [x1, #(-16 + 3*(512/8))] // ..................*.........................................................................................................................................................|.............................................................................................................................*........................................... - // str q20, [x1, #(-16 + 4*(512/8))] // ....................................*.......................................................................................................................................|...............................................................................................................................................*......................... - // str q21, [x1, #(-16 + 5*(512/8))] // ..............................*.............................................................................................................................................|.........................................................................................................................................*............................... - // str q22, [x1, #(-16 + 6*(512/8))] // ............................*...............................................................................................................................................|.......................................................................................................................................*................................. 
- // str q23, [x1, #(-16 + 7*(512/8))] // .......................................*....................................................................................................................................|..................................................................................................................................................*...................... + // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // sqrdmulh v22.4S, v19.4S, v0.S[3] // *....................................................................................................................................................................................................................................................................................... + // mls v18.4S, v24.4S, v29.4S // ..*..................................................................................................................................................................................................................................................................................... + // mls v13.4S, v21.4S, v29.4S // .....*.................................................................................................................................................................................................................................................................................. 
+ // mul v19.4S, v19.4S, v0.S[2] // .*...................................................................................................................................................................................................................................................................................... + // cmge v21.4S, v31.4S, v16.4S // .......*................................................................................................................................................................................................................................................................................ + // cmge v24.4S, v16.4S, v30.4S // ......*................................................................................................................................................................................................................................................................................. + // mls v19.4S, v22.4S, v29.4S // ........*............................................................................................................................................................................................................................................................................... + // sub v21.4S, v21.4S, v24.4S // .........*.............................................................................................................................................................................................................................................................................. + // sub v24.4S, v11.4S, v14.4S // ...*.................................................................................................................................................................................................................................................................................... 
+ // sub v22.4S, v15.4S, v20.4S // .................*...................................................................................................................................................................................................................................................................... + // add v11.4S, v11.4S, v14.4S // ....*................................................................................................................................................................................................................................................................................... + // add v14.4S, v28.4S, v13.4S // ............*........................................................................................................................................................................................................................................................................... + // sub v28.4S, v28.4S, v13.4S // ...........*............................................................................................................................................................................................................................................................................ + // add v13.4S, v15.4S, v20.4S // ...................*.................................................................................................................................................................................................................................................................... + // sqrdmulh v15.4S, v23.4S, v1.S[1] // ................*....................................................................................................................................................................................................................................................................... 
+ // mul v23.4S, v23.4S, v1.S[0] // ..........*............................................................................................................................................................................................................................................................................. + // sub v20.4S, v12.4S, v17.4S // ....................*................................................................................................................................................................................................................................................................... + // mls v16.4S, v21.4S, v29.4S // ...............*........................................................................................................................................................................................................................................................................ + // add v12.4S, v12.4S, v17.4S // ......................*................................................................................................................................................................................................................................................................. + // mul v17.4S, v27.4S, v0.S[0] // ........................*............................................................................................................................................................................................................................................................... + // sqrdmulh v27.4S, v27.4S, v0.S[1] // .........................*.............................................................................................................................................................................................................................................................. 
+ // mul v21.4S, v22.4S, v0.S[0] // .............................*.......................................................................................................................................................................................................................................................... + // sqrdmulh v22.4S, v22.4S, v0.S[1] // ..............................*......................................................................................................................................................................................................................................................... + // mls v23.4S, v15.4S, v29.4S // ...........................*............................................................................................................................................................................................................................................................ + // str q16, [x1, #512] // .......................*................................................................................................................................................................................................................................................................ + // cmge v16.4S, v31.4S, v18.4S // ..........................*............................................................................................................................................................................................................................................................. + // cmge v15.4S, v18.4S, v30.4S // .....................*.................................................................................................................................................................................................................................................................. 
+ // mls v17.4S, v27.4S, v29.4S // ............................*........................................................................................................................................................................................................................................................... + // mul v27.4S, v20.4S, v0.S[0] // ................................*....................................................................................................................................................................................................................................................... + // sqrdmulh v20.4S, v20.4S, v0.S[1] // .................................*...................................................................................................................................................................................................................................................... + // mls v21.4S, v22.4S, v29.4S // .......................................*................................................................................................................................................................................................................................................ + // sub v15.4S, v16.4S, v15.4S // ...............................*........................................................................................................................................................................................................................................................ + // mul v22.4S, v28.4S, v0.S[0] // ............................................*........................................................................................................................................................................................................................................... 
+ // sqrdmulh v16.4S, v28.4S, v0.S[1] // ...........................................*............................................................................................................................................................................................................................................ + // mls v18.4S, v15.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + // add v15.4S, v19.4S, v23.4S // ..................................*..................................................................................................................................................................................................................................................... + // mls v27.4S, v20.4S, v29.4S // ..........................................*............................................................................................................................................................................................................................................. + // sub v20.4S, v19.4S, v23.4S // ....................................*................................................................................................................................................................................................................................................... + // cmge v19.4S, v31.4S, v21.4S // ..................................................*..................................................................................................................................................................................................................................... 
+ // cmge v28.4S, v21.4S, v30.4S // ...................................................*.................................................................................................................................................................................................................................... + // mls v22.4S, v16.4S, v29.4S // .....................................................*.................................................................................................................................................................................................................................. + // cmge v16.4S, v17.4S, v30.4S // ........................................*............................................................................................................................................................................................................................................... + // cmge v23.4S, v31.4S, v17.4S // .........................................*.............................................................................................................................................................................................................................................. + // str q18, [x1, #640] // .............................................*.......................................................................................................................................................................................................................................... + // sub v28.4S, v19.4S, v28.4S // ........................................................*............................................................................................................................................................................................................................... 
+ // mul v19.4S, v24.4S, v0.S[0] // .....................................*.................................................................................................................................................................................................................................................. + // sub v18.4S, v23.4S, v16.4S // ...............................................*........................................................................................................................................................................................................................................ + // sqrdmulh v23.4S, v24.4S, v0.S[1] // ......................................*................................................................................................................................................................................................................................................. + // cmge v24.4S, v27.4S, v30.4S // .........................................................*.............................................................................................................................................................................................................................. + // cmge v16.4S, v31.4S, v27.4S // ...........................................................*............................................................................................................................................................................................................................ + // mls v21.4S, v28.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ // mls v17.4S, v18.4S, v29.4S // ....................................................*................................................................................................................................................................................................................................... + // sub v28.4S, v16.4S, v24.4S // ..................................................................*..................................................................................................................................................................................................................... + // cmge v18.4S, v31.4S, v22.4S // ................................................................*....................................................................................................................................................................................................................... + // cmge v16.4S, v22.4S, v30.4S // ...............................................................*........................................................................................................................................................................................................................ + // mls v19.4S, v23.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + // sqrdmulh v24.4S, v13.4S, v26.4S // ................................................*....................................................................................................................................................................................................................................... 
+ // str q21, [x1, #832] // ...........................................................................*............................................................................................................................................................................................................ + // sqrdmulh v21.4S, v14.4S, v26.4S // ......................................................*................................................................................................................................................................................................................................. + // mul v14.4S, v14.4S, v25.4S // .......................................................*................................................................................................................................................................................................................................ + // str q17, [x1, #576] // ..........................................................*............................................................................................................................................................................................................................. + // sub v17.4S, v18.4S, v16.4S // .....................................................................*.................................................................................................................................................................................................................. + // cmge v18.4S, v19.4S, v30.4S // ............................................................*........................................................................................................................................................................................................................... 
+ // sqrdmulh v23.4S, v9.4S, v26.4S // .............*.......................................................................................................................................................................................................................................................................... + // mul v9.4S, v9.4S, v25.4S // ..............*......................................................................................................................................................................................................................................................................... + // mls v22.4S, v17.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + // mul v13.4S, v13.4S, v25.4S // .................................................*...................................................................................................................................................................................................................................... + // sqrdmulh v17.4S, v10.4S, v26.4S // .............................................................................*.......................................................................................................................................................................................................... + // mul v10.4S, v10.4S, v25.4S // ...............................................................................*........................................................................................................................................................................................................ 
+ // mls v27.4S, v28.4S, v29.4S // ........................................................................*............................................................................................................................................................................................................... + // mls v14.4S, v21.4S, v29.4S // .............................................................*.......................................................................................................................................................................................................................... + // sqrdmulh v21.4S, v11.4S, v26.4S // ....................................................................*................................................................................................................................................................................................................... + // sqrdmulh v16.4S, v15.4S, v26.4S // ..............................................................................*......................................................................................................................................................................................................... + // mls v9.4S, v23.4S, v29.4S // ..................*..................................................................................................................................................................................................................................................................... + // sqrdmulh v28.4S, v20.4S, v0.S[1] // ...................................................................*.................................................................................................................................................................................................................... 
+ // mul v23.4S, v20.4S, v0.S[0] // .................................................................................*...................................................................................................................................................................................................... + // cmge v20.4S, v31.4S, v19.4S // .......................................................................*................................................................................................................................................................................................................ + // mls v13.4S, v24.4S, v29.4S // ..............................................................*......................................................................................................................................................................................................................... + // mul v11.4S, v11.4S, v25.4S // ......................................................................*................................................................................................................................................................................................................. + // mls v10.4S, v17.4S, v29.4S // ..............................................................................................*......................................................................................................................................................................................... + // str q22, [x1, #896] // ......................................................................................*................................................................................................................................................................................................. 
+ // mul v15.4S, v15.4S, v25.4S // ...................................................................................*.................................................................................................................................................................................................... + // str q27, [x1, #768] // .......................................................................................*................................................................................................................................................................................................ + // sub v24.4S, v20.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... + // sqrdmulh v22.4S, v12.4S, v26.4S // .........................................................................*.............................................................................................................................................................................................................. + // mul v12.4S, v12.4S, v25.4S // ..........................................................................*............................................................................................................................................................................................................. + // mls v23.4S, v28.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ 
+ // mls v19.4S, v24.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + // sqrdmulh v18.4S, v8.4S, v26.4S // ........................................................................................*............................................................................................................................................................................................... + // mul v8.4S, v8.4S, v25.4S // .........................................................................................*.............................................................................................................................................................................................. + // cmge v27.4S, v13.4S, v30.4S // ...........................................................................................*............................................................................................................................................................................................ + // cmge v24.4S, v31.4S, v13.4S // .................................................................................................*...................................................................................................................................................................................... + // mls v11.4S, v21.4S, v29.4S // ..................................................................................*..................................................................................................................................................................................................... 
+ // cmge v21.4S, v31.4S, v9.4S // ........................................................................................................*............................................................................................................................................................................... + // cmge v20.4S, v9.4S, v30.4S // ...........................................................................................................*............................................................................................................................................................................ + // cmge v28.4S, v31.4S, v10.4S // .........................................................................................................................*.............................................................................................................................................................. + // mls v15.4S, v16.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + // mls v12.4S, v22.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + // str q19, [x1, #704] // ....................................................................................................................*................................................................................................................................................................... 
+ // sub v22.4S, v24.4S, v27.4S // ......................................................................................................*................................................................................................................................................................................. + // cmge v27.4S, v10.4S, v30.4S // .......................................................................................................................*................................................................................................................................................................ + // mls v8.4S, v18.4S, v29.4S // ...................................................................................................*.................................................................................................................................................................................... + // sub v21.4S, v21.4S, v20.4S // ..........................................................................................................................*............................................................................................................................................................. + // cmge v18.4S, v31.4S, v23.4S // .............................................................................................................*.......................................................................................................................................................................... + // cmge v24.4S, v23.4S, v30.4S // ............................................................................................................*........................................................................................................................................................................... 
+ // mls v13.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + // sub v22.4S, v28.4S, v27.4S // .................................................................................................................................*...................................................................................................................................................... + // cmge v27.4S, v31.4S, v14.4S // ................................................................................................*....................................................................................................................................................................................... + // cmge v19.4S, v14.4S, v30.4S // ..........................................................................................*............................................................................................................................................................................................. + // mls v9.4S, v21.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + // sub v21.4S, v18.4S, v24.4S // ........................................................................................................................*............................................................................................................................................................... 
+ // cmge v18.4S, v31.4S, v12.4S // ....................................................................................................*................................................................................................................................................................................... + // str q9, [x1, #64] // ........................................................................................................................................................*............................................................................................................................... + // cmge v17.4S, v12.4S, v30.4S // .....................................................................................................*.................................................................................................................................................................................. + // cmge v9.4S, v8.4S, v30.4S // ...............................................................................................................*........................................................................................................................................................................ + // cmge v24.4S, v31.4S, v8.4S // .....................................................................................................................*.................................................................................................................................................................. + // cmge v16.4S, v11.4S, v30.4S // ............................................................................................*........................................................................................................................................................................................... 
+ // str q13, [x1, #320] // ................................................................................................................*....................................................................................................................................................................... + // sub v28.4S, v27.4S, v19.4S // .......................................................................................................*................................................................................................................................................................................ + // cmge v19.4S, v31.4S, v11.4S // .............................................................................................*.......................................................................................................................................................................................... + // cmge v13.4S, v31.4S, v15.4S // .................................................................................................................*...................................................................................................................................................................... + // cmge v27.4S, v15.4S, v30.4S // ...................................................................................................................*.................................................................................................................................................................... + // sub v20.4S, v18.4S, v17.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ // mls v10.4S, v22.4S, v29.4S // .........................................................................................................................................*.............................................................................................................................................. + // sub v18.4S, v24.4S, v9.4S // ...........................................................................................................................*............................................................................................................................................................ + // mls v23.4S, v21.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + // mls v14.4S, v28.4S, v29.4S // ....................................................................................................................................*................................................................................................................................................... + // sub v17.4S, v19.4S, v16.4S // ..................................................................................................*..................................................................................................................................................................................... + // sub v13.4S, v13.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... 
+ // mls v12.4S, v20.4S, v29.4S // ..................................................................................................................*..................................................................................................................................................................... + // mls v8.4S, v18.4S, v29.4S // ........................................................................................................................................*............................................................................................................................................... + // mls v11.4S, v17.4S, v29.4S // ......................................................................................................................*................................................................................................................................................................. + // mls v15.4S, v13.4S, v29.4S // ......................................................................................................................................*................................................................................................................................................. + // str q10, [x1, #128] // .........................................................................................................................................................*.............................................................................................................................. + // str q23, [x1, #960] // .............................................................................................................................................................*.......................................................................................................................... 
+ // str q14, [x1, #384] // .....................................................................................................................................................*.................................................................................................................................. + // str q12, [x1, #256] // ................................................................................................................................*....................................................................................................................................................... + // str q8, [x1], #(16) // .......................................................................................................................................................................*................................................................................................................ + // str q11, [x1, #176] // .....................................................................................................................................*.................................................................................................................................................. + // str q15, [x1, #432] // ............................................................................................................................................*........................................................................................................................................... + // ldr q21, [x1, #704] // ............................................................................................................................*........................................................................................................................................................... 
+ // ldr q22, [x1, #640] // .............................................................................................................................*.......................................................................................................................................................... + // ldr q8, [x1, #320] // ...............................................................................................................................................*........................................................................................................................................ + // ldr q9, [x1, #256] // ..........................................................................................................................................*............................................................................................................................................. + // ldr q14, [x1] // ...............................................................................................................................*........................................................................................................................................................ + // ldr q27, [x1, #64] // ..................................................................................................................................*..................................................................................................................................................... + // ldr q24, [x1, #576] // .............................................................................................................................................*.......................................................................................................................................... 
+ // ldr q15, [x1, #512] // ...................................................................................................................................................*.................................................................................................................................... + // ldr q16, [x1, #192] // ...........................................................................................................................................*............................................................................................................................................ + // ldr q28, [x1, #128] // ....................................................................................................................................................*................................................................................................................................... + // ldr q12, [x1, #384] // ......................................................................................................................................................*................................................................................................................................. + // ldr q20, [x1, #768] // .........................................................................................................................................................................*.............................................................................................................. + // sub v19.4S, v22.4S, v21.4S // ..............................................................................................................................................*......................................................................................................................................... 
+ // add v21.4S, v22.4S, v21.4S // .................................................................................................................................................*...................................................................................................................................... + // ldr q22, [x1, #448] // .......................................................................................................................................................*................................................................................................................................ + // ldr q11, [x1, #832] // ............................................................................................................................................................................*........................................................................................................... + // sub v18.4S, v14.4S, v27.4S // ..................................................................................................................................................*..................................................................................................................................... + // add v23.4S, v9.4S, v8.4S // .................................................................................................................................................................*...................................................................................................................... + // sub v13.4S, v9.4S, v8.4S // ..................................................................................................................................................................*..................................................................................................................... 
+ // add v8.4S, v14.4S, v27.4S // ................................................................................................................................................*....................................................................................................................................... + // sub v14.4S, v15.4S, v24.4S // ................................................................................................................................................................*....................................................................................................................... + // add v24.4S, v15.4S, v24.4S // ...................................................................................................................................................................*.................................................................................................................... + // sqrdmulh v27.4S, v19.4S, v6.S[1] // ...........................................................................................................................................................*............................................................................................................................ + // mul v19.4S, v19.4S, v6.S[0] // ............................................................................................................................................................*........................................................................................................................... + // add v10.4S, v28.4S, v16.4S // ......................................................................................................................................................................*................................................................................................................. 
+ // sub v15.4S, v28.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + // sqrdmulh v17.4S, v18.4S, v3.S[3] // ........................................................................................................................................................................*............................................................................................................... + // mul v9.4S, v18.4S, v3.S[2] // ..............................................................................................................................................................*......................................................................................................................... + // sub v28.4S, v20.4S, v11.4S // .......................................................................................................................................................................................*................................................................................................ + // add v16.4S, v20.4S, v11.4S // ......................................................................................................................................................................................*................................................................................................. + // add v18.4S, v12.4S, v22.4S // .............................................................................................................................................................................*.......................................................................................................... 
+ // sub v20.4S, v12.4S, v22.4S // ....................................................................................................................................................................*................................................................................................................... + // ldr q22, [x1, #896] // ...............................................................................................................................................................*........................................................................................................................ + // ldr q12, [x1, #960] // ..........................................................................................................................................................*............................................................................................................................. + // sqrdmulh v11.4S, v14.4S, v5.S[3] // ..................................................................................................................................................................................*..................................................................................................... + // mls v9.4S, v17.4S, v29.4S // ....................................................................................................................................................................................*................................................................................................... + // mls v19.4S, v27.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ // mul v17.4S, v14.4S, v5.S[2] // ...................................................................................................................................................................................*.................................................................................................... + // sub v14.4S, v23.4S, v18.4S // ............................................................................................................................................................................................*........................................................................................... + // mul v27.4S, v20.4S, v5.S[0] // ...............................................................................................................................................................................................................*........................................................................ + // sqrdmulh v20.4S, v20.4S, v5.S[1] // ..............................................................................................................................................................................................................*......................................................................... + // add v23.4S, v23.4S, v18.4S // ...........................................................................................................................................................................................*............................................................................................ + // sub v18.4S, v8.4S, v10.4S // ....................................................................................................................................................................................................*................................................................................... 
+ // add v8.4S, v8.4S, v10.4S // .....................................................................................................................................................................................................*.................................................................................. + // add v10.4S, v22.4S, v12.4S // ...........................................................................................................................................................................*............................................................................................................ + // sub v12.4S, v22.4S, v12.4S // ..........................................................................................................................................................................*............................................................................................................. + // mls v17.4S, v11.4S, v29.4S // ..........................................................................................................................................................................................*............................................................................................. + // add v22.4S, v24.4S, v21.4S // ...............................................................................................................................................................................*........................................................................................................ + // sub v24.4S, v24.4S, v21.4S // ..............................................................................................................................................................................*......................................................................................................... 
+ // mls v27.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................*................................................................. + // sqrdmulh v20.4S, v15.4S, v4.S[1] // .......................................................................................................................................................................................................*................................................................................ + // mul v21.4S, v15.4S, v4.S[0] // ......................................................................................................................................................................................................*................................................................................. + // mul v11.4S, v28.4S, v6.S[2] // ..............................................................................................................................................................................................*......................................................................................... + // sub v15.4S, v16.4S, v10.4S // .................................................................................................................................................................................................*...................................................................................... + // add v16.4S, v16.4S, v10.4S // ................................................................................................................................................................................................*....................................................................................... 
+ // sqrdmulh v10.4S, v28.4S, v6.S[3] // ...............................................................................................................................................................................................*........................................................................................ + // mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................................................................*....................................................................... + // mls v11.4S, v10.4S, v29.4S // ........................................................................................................................................................................................................*............................................................................... + // mul v10.4S, v13.4S, v4.S[2] // .........................................................................................................................................................................................*.............................................................................................. + // sqrdmulh v13.4S, v13.4S, v4.S[3] // ........................................................................................................................................................................................*............................................................................................... + // add v20.4S, v17.4S, v19.4S // ............................................................................................................................................................................................................*........................................................................... 
+ // sub v17.4S, v17.4S, v19.4S // .............................................................................................................................................................................................................*.......................................................................... + // mul v19.4S, v14.4S, v2.S[0] // ...........................................................................................................................................................................................................*............................................................................ + // sqrdmulh v14.4S, v14.4S, v2.S[1] // ...................................................................................................................................................................................................*.................................................................................... + // sub v28.4S, v22.4S, v16.4S // ..........................................................................................................................................................................................................*............................................................................. + // add v16.4S, v22.4S, v16.4S // .........................................................................................................................................................................................................*.............................................................................. + // sub v22.4S, v8.4S, v23.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ // add v8.4S, v8.4S, v23.4S // ..........................................................................................................................................................................................................................*............................................................. + // mls v10.4S, v13.4S, v29.4S // ..................................................................................................................................................................................................*..................................................................................... + // sqrdmulh v13.4S, v17.4S, v2.S[3] // .....................................................................................................................................................................................................................*.................................................................. + // mul v23.4S, v17.4S, v2.S[2] // ....................................................................................................................................................................................................................*................................................................... + // sub v17.4S, v9.4S, v21.4S // ..................................................................................................................................................................................................................................*..................................................... + // add v9.4S, v9.4S, v21.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ // mul v21.4S, v12.4S, v7.S[0] // .................................................................................................................................................................................*...................................................................................................... + // sqrdmulh v12.4S, v12.4S, v7.S[1] // .....................................................................................................................................................................................*.................................................................................................. + // mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................*.................................................................... + // sqrdmulh v14.4S, v15.4S, v3.S[1] // .................................................................................................................................................................................................................................*...................................................... + // mul v15.4S, v15.4S, v3.S[0] // ...........................................................................................................................................................................................................................*............................................................ + // mls v23.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................*........................................................... 
+ // add v13.4S, v10.4S, v27.4S // ....................................................................................................................................................................................................................................*................................................... + // sub v27.4S, v10.4S, v27.4S // .....................................................................................................................................................................................................................................*.................................................. + // sqrdmulh v10.4S, v18.4S, v1.S[3] // ...............................................................................................................................................................................................................................*........................................................ + // mls v21.4S, v12.4S, v29.4S // .............................................................................................................................................................................................*.......................................................................................... + // mul v12.4S, v18.4S, v1.S[2] // ..............................................................................................................................................................................................................................*......................................................... + // mls v12.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ // sub v10.4S, v8.4S, v16.4S // ...........................................................................................................................................................................................................................................*............................................ + // add v8.4S, v8.4S, v16.4S // .............................................................................................................................................................................................................................................*.......................................... + // add v18.4S, v11.4S, v21.4S // .......................................................................................................................................................................................................................*................................................................ + // mls v15.4S, v14.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + // sub v16.4S, v11.4S, v21.4S // .........................................................................................................................................................................................................................*.............................................................. + // sub v14.4S, v9.4S, v13.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ // mul v21.4S, v24.4S, v2.S[2] // ..................................................................................................................................................................................................................*..................................................................... + // add v11.4S, v20.4S, v18.4S // .......................................................................................................................................................................................................................................................*................................ + // sub v20.4S, v20.4S, v18.4S // ......................................................................................................................................................................................................................................................*................................. + // sqrdmulh v24.4S, v24.4S, v2.S[3] // .................................................................................................................................................................................................................*...................................................................... + // add v9.4S, v9.4S, v13.4S // ...............................................................................................................................................................................................................................................*........................................ + // sub v13.4S, v12.4S, v19.4S // ........................................................................................................................................................................................................................................*............................................... 
+ // add v12.4S, v12.4S, v19.4S // ..........................................................................................................................................................................................................................................*............................................. + // sqrdmulh v18.4S, v17.4S, v1.S[3] // .....................................................................................................................................................................................................................................................*.................................. + // mul v19.4S, v17.4S, v1.S[2] // ....................................................................................................................................................................................................................................................*................................... + // mul v17.4S, v28.4S, v1.S[0] // ........................................................................................................................................................................................................................................................*............................... + // sqrdmulh v28.4S, v28.4S, v1.S[1] // .........................................................................................................................................................................................................................................................*.............................. + // mls v21.4S, v24.4S, v29.4S // ........................................................................................................................................................................................................................*............................................................... 
+ // mul v24.4S, v27.4S, v2.S[0] // ...................................................................................................................................................................................................................................................*.................................... + // sqrdmulh v27.4S, v27.4S, v2.S[1] // ................................................................................................................................................................................................................................................*....................................... + // mls v17.4S, v28.4S, v29.4S // ..............................................................................................................................................................................................................................................................*......................... + // mul v28.4S, v13.4S, v0.S[2] // .................................................................................................................................................................................................................................................................*...................... + // sqrdmulh v13.4S, v13.4S, v0.S[3] // ..................................................................................................................................................................................................................................................................*..................... + // mls v19.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... 
+ // add v18.4S, v21.4S, v15.4S // .................................................................................................................................................................................................................................................*...................................... + // sub v21.4S, v21.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + // mul v15.4S, v14.4S, v0.S[2] // ...............................................................................................................................................................................................................................................................*........................ + // mls v28.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + // mul v13.4S, v16.4S, v3.S[0] // .......................................................................................................................................................................................................................................*................................................ + // sqrdmulh v16.4S, v16.4S, v3.S[1] // ......................................................................................................................................................................................................................................*................................................. 
+ // mls v24.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................................................*............................. + // sub v27.4S, v9.4S, v11.4S // .....................................................................................................................................................................................................................................................................*.................. + // add v9.4S, v9.4S, v11.4S // ....................................................................................................................................................................................................................................................................*................... + // sqrdmulh v11.4S, v14.4S, v0.S[3] // ................................................................................................................................................................................................................................................................*....................... + // sqrdmulh v14.4S, v10.4S, v0.S[1] // ...........................................................................................................................................................................................................................................................*............................ + // mls v13.4S, v16.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ // mul v16.4S, v10.4S, v0.S[0] // .............................................................................................................................................................................................................................................................*.......................... + // add v10.4S, v12.4S, v18.4S // .........................................................................................................................................................................................................................................................................*.............. + // sub v18.4S, v12.4S, v18.4S // ..........................................................................................................................................................................................................................................................................*............. + // mls v15.4S, v11.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + // add v11.4S, v19.4S, v24.4S // ...............................................................................................................................................................................................................................................................................*........ + // sub v19.4S, v19.4S, v24.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ // sqrdmulh v24.4S, v20.4S, v1.S[1] // ........................................................................................................................................................................................................................................................................*............... + // mul v20.4S, v20.4S, v1.S[0] // ......................................................................................................................................................................................................................................................................*................. + // mul v12.4S, v22.4S, v0.S[2] // ............................................................................................................................................................................................................................................................................*........... + // sqrdmulh v22.4S, v22.4S, v0.S[3] // .............................................................................................................................................................................................................................................................................*.......... + // mls v16.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + // add v14.4S, v23.4S, v13.4S // ...................................................................................................................................................................................................................................................................................*.... 
+ // sub v23.4S, v23.4S, v13.4S // ....................................................................................................................................................................................................................................................................................*... + // mul v13.4S, v21.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................................*.. + // sqrdmulh v21.4S, v21.4S, v1.S[1] // ......................................................................................................................................................................................................................................................................................*. + // mls v20.4S, v24.4S, v29.4S // ..................................................................................................................................................................................................................................................................................*..... + // sqrdmulh v24.4S, v18.4S, v0.S[1] // ................................................................................................................................................................................................................................................................................*....... + // mul v18.4S, v18.4S, v0.S[0] // .................................................................................................................................................................................................................................................................................*...... 
+ // mls v12.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................................................* sub count, count, #1 cbnz count, layer1234_start - mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + sqrdmulh v22.4S, v19.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ mls v13.4S, v21.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mul v19.4S, v19.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... - sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ - add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ 
- sub v17.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. - mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - sub v21.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... - sub v22.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... 
- sqrdmulh v20.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mul v28.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v21.4S, v21.4S, v24.4S // ..................................................................................................................................................................................*..................................................................................................... + sub v24.4S, v11.4S, v14.4S // .......................................................................................................................................................*................................................................................................................................ 
+ sub v22.4S, v15.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... + add v11.4S, v11.4S, v14.4S // ........................................................................................................................................................*............................................................................................................................... + add v14.4S, v28.4S, v13.4S // .......................................................................................................................................................................*................................................................................................................ + sub v28.4S, v28.4S, v13.4S // ......................................................................................................................................................................*................................................................................................................. + add v13.4S, v15.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v15.4S, v23.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. 
+ mul v23.4S, v23.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. + sub v20.4S, v12.4S, v17.4S // ............................................................................................................................................................*........................................................................................................................... mls v16.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - sub v21.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... - add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... - mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
- mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - mls v28.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mul v20.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - sqrdmulh v27.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ - mul v22.4S, v17.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
+ add v12.4S, v12.4S, v17.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v17.4S, v27.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ + mul v21.4S, v22.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... + mls v23.4S, v15.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - sqrdmulh v16.4S, v17.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + cmge v16.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v15.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v17.4S, v27.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ mul v27.4S, v20.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ + sqrdmulh v20.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mls v21.4S, v22.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + sub v15.4S, v16.4S, v15.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v22.4S, v28.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + sqrdmulh v16.4S, v28.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... 
+ mls v18.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + add v15.4S, v19.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + mls v27.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v20.4S, v19.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v19.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v28.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. 
mls v22.4S, v16.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v16.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... - mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... 
- add v15.4S, v24.4S, v28.4S // ............................................................................................................................................................................*........................................................................................................... - sub v27.4S, v27.4S, v16.4S // ..........................................................................................................................................................................................*............................................................................................. - sub v16.4S, v24.4S, v28.4S // ...........................................................................................................................................................................*............................................................................................................ - cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... - mls v18.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
- sqrdmulh v27.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... - sub v23.4S, v24.4S, v28.4S // ..........................................................................................................................................................................................................*............................................................................. - cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - mul v24.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - cmge v21.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... 
- mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sqrdmulh v16.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - sub v28.4S, v21.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... - mls v24.4S, v27.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - cmge v21.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... 
- cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... - str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - cmge v18.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ - sub v18.4S, v21.4S, v18.4S // ..................................................................................................................................................................................................*..................................................................................... 
- str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - cmge v19.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v28.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
- mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - sqrdmulh v14.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - mls v20.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - sub v18.4S, v28.4S, v19.4S // ......................................................................................................................................................................................................*................................................................................. - sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
- sub v27.4S, v27.4S, v16.4S // ......................................................................................................................................................................................*................................................................................................. - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - cmge v19.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... - cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - mls v24.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
- sub v8.4S, v19.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... - str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v17.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
- mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - mls v23.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ - cmge v8.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
- str q24, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - sqrdmulh v12.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v28.4S, v19.4S, v28.4S // ......................................................................................................................................................................................................*................................................................................. 
+ mul v19.4S, v24.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + sub v18.4S, v23.4S, v16.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v23.4S, v24.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. + cmge v24.4S, v27.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v16.4S, v31.4S, v27.4S // ................................................................................................................................................................................................*....................................................................................... + mls v21.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
+ mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v28.4S, v16.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v16.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v24.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ 
+ str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v21.4S, v14.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. + mul v14.4S, v14.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ 
- sqrdmulh v28.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - cmge v11.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v17.4S, v18.4S, v16.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ + mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... + mls v22.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v17.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ mul v10.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v27.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v14.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sqrdmulh v21.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v16.4S, v15.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... + mls v9.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ sqrdmulh v28.4S, v20.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + mul v23.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... + cmge v20.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + mls v13.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v10.4S, v17.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v15.4S, v15.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + str q27, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v24.4S, v20.4S, v18.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v22.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mls v19.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v18.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... + mul v8.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v27.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v24.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... 
+ mls v11.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v21.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v20.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v28.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + mls v15.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mls v12.4S, v22.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
+ str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v22.4S, v24.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v27.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v8.4S, v18.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v21.4S, v21.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v18.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... 
+ cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v13.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v22.4S, v28.4S, v27.4S // ..........................................................................................................................................................................................................................................................*............................. + cmge v27.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v19.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v9.4S, v21.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ sub v21.4S, v18.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v18.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + str q9, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + cmge v17.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v9.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v24.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... 
+ cmge v16.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + sub v28.4S, v27.4S, v19.4S // ..........................................................................................................................................................................................................................................................................*............. + cmge v19.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v13.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v27.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... 
+ sub v20.4S, v18.4S, v17.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v10.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v18.4S, v24.4S, v9.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v23.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v14.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v17.4S, v19.4S, v16.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ sub v13.4S, v13.4S, v27.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v12.4S, v20.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v8.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + mls v11.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mls v15.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q10, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... 
str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - sqrdmulh v27.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - mls v18.4S, v12.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - sub v10.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. - cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
- mls v17.4S, v28.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - sub v13.4S, v9.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. - cmge v15.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - sub v11.4S, v8.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... - mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v27.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
- cmge v9.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - sub v10.4S, v9.4S, v27.4S // ..................................................................................................................................................................................................................................................................*..................... - cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
- cmge v13.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... - cmge v8.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - mls v20.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - cmge v10.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. - cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... 
- cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... - sub v27.4S, v15.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... - str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. - str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. - sub v22.4S, v24.4S, v13.4S // ..........................................................................................................................................................................................................................................................*............................. - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- sub v10.4S, v8.4S, v10.4S // ......................................................................................................................................................................................................................................................*................................. - sub v13.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... - mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
- mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q14, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. 
+ str q12, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + str q11, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s index d449be9f..a0e52425 100644 --- a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s @@ -35,18 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. 
-.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vsub d,a,b sub \d\().4s, \a\().4s, \b\().4s .endm @@ -85,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm .macro barrett_reduce_single a @@ -114,12 +102,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -138,31 +120,31 @@ .endm .macro load_roots_1234 r_ptr - ldr_vi root0, \r_ptr, (8*16) - ldr_vo root1, \r_ptr, (-8*16 + 1*16) - ldr_vo root2, \r_ptr, (-8*16 + 2*16) - ldr_vo root3, \r_ptr, (-8*16 + 3*16) - ldr_vo root4, \r_ptr, (-8*16 + 4*16) - ldr_vo root5, \r_ptr, (-8*16 + 5*16) - ldr_vo root6, \r_ptr, (-8*16 + 6*16) - ldr_vo root7, \r_ptr, (-8*16 + 7*16) + ldr qform_root0, [\r_ptr], #(8*16) + ldr qform_root1, [\r_ptr, #(-8*16 + 1*16)] + ldr qform_root2, [\r_ptr, #(-8*16 + 2*16)] + ldr qform_root3, [\r_ptr, #(-8*16 + 3*16)] + ldr qform_root4, [\r_ptr, #(-8*16 + 4*16)] + ldr qform_root5, [\r_ptr, #(-8*16 + 5*16)] + ldr qform_root6, [\r_ptr, #(-8*16 + 6*16)] + ldr qform_root7, [\r_ptr, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro 
load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [\r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [\r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [\r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -177,7 +159,7 @@ trn1_d \data\()1, t1, t3 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -188,7 +170,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -198,7 +180,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -206,7 +188,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -217,24 +199,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // 
slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -333,16 +321,26 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: mov count, #16 .p2align 2 - ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x0] // *........................................ - ldr q5, [x3, #32] // ..*...................................... - ldr q8, [x3], #(6*16) // .*....................................... - // gap // ......................................... + // Instructions: 41 + // Expected cycles: 39 + // Expected IPC: 1.05 + // + // Wall time: 0.63s + // User time: 0.63s + // + // ---------- original position -----------> + // 0 25 + // |------------------------|--------------- + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x0] // .*....................................... + ldr q1, [x3, #64] // *........................................ + ldr q18, [x3], #(6*16) // ..*...................................... + // gap // ......................................... + ldr q8, [x3, #-64] // ...*..................................... ldr q9, [x3, #-48] // ....*.................................... - ldr q0, [x3, #-80] // ...*..................................... // gap // ......................................... // gap // ......................................... - ldr q2, [x3, #-32] // .....*................................... - ldr q12, [x3, #-16] // ......*.................................. + ldr q2, [x3, #-16] // .....*................................... + ldr q3, [x3, #-80] // ............*............................ // gap // ......................................... 
// gap // ......................................... // gap // ......................................... @@ -357,35 +355,35 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - sub v15.4S, v19.4S, v20.4S // .......*................................. - add v19.4S, v19.4S, v20.4S // ........*................................ + sub v26.4S, v21.4S, v22.4S // ......*.................................. + add v21.4S, v21.4S, v22.4S // .............*........................... // gap // ......................................... // gap // ......................................... - sub v13.4S, v21.4S, v22.4S // .........*............................... - add v18.4S, v21.4S, v22.4S // ..............*.......................... + sub v22.4S, v23.4S, v24.4S // .......*................................. + add v14.4S, v23.4S, v24.4S // ..............*.......................... // gap // ......................................... // gap // ......................................... - mul v5.4S, v15.4S, v5.4S // ..........*.............................. - sqrdmulh v9.4S, v15.4S, v9.4S // ...........*............................. + mul v8.4S, v26.4S, v8.4S // ........*................................ + sqrdmulh v9.4S, v26.4S, v9.4S // .........*............................... // gap // ......................................... // gap // ......................................... - mul v2.4S, v13.4S, v2.4S // ............*............................ - sqrdmulh v12.4S, v13.4S, v12.4S // .............*........................... + mul v1.4S, v22.4S, v1.4S // ..........*.............................. + sqrdmulh v22.4S, v22.4S, v2.4S // ...........*............................. // gap // ......................................... // gap // ......................................... 
- sub v15.4S, v19.4S, v18.4S // .................*....................... - add v19.4S, v19.4S, v18.4S // ..................*...................... + sub v2.4S, v21.4S, v14.4S // .................*....................... + add v21.4S, v21.4S, v14.4S // ..................*...................... // gap // ......................................... // gap // ......................................... - mls v5.4S, v9.4S, v29.4S // ...............*......................... + mls v8.4S, v9.4S, v29.4S // ...............*......................... // gap // ......................................... // gap // ......................................... // gap // ......................................... - mls v2.4S, v12.4S, v29.4S // ................*........................ - mul v9.4S, v15.4S, v8.4S // .....................*................... + mls v1.4S, v22.4S, v29.4S // ................*........................ + sqrdmulh v22.4S, v2.4S, v3.4S // ....................*.................... // gap // ......................................... // gap // ......................................... - sqrdmulh v12.4S, v15.4S, v0.4S // ......................*.................. + mul v9.4S, v2.4S, v18.4S // .....................*................... // gap // ......................................... // gap // ......................................... // gap // ......................................... @@ -393,27 +391,27 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - sub v15.4S, v5.4S, v2.4S // ...................*..................... - add v5.4S, v5.4S, v2.4S // ....................*.................... + sub v2.4S, v8.4S, v1.4S // ...................*..................... + add v1.4S, v8.4S, v1.4S // ........................*................ // gap // ......................................... 
// gap // ......................................... - mls v9.4S, v12.4S, v29.4S // ...........................*............. + mls v9.4S, v22.4S, v29.4S // .........................*............... // gap // ......................................... // gap // ......................................... // gap // ......................................... - mul v8.4S, v15.4S, v8.4S // .......................*................. - sqrdmulh v0.4S, v15.4S, v0.4S // ........................*................ + mul v22.4S, v2.4S, v18.4S // ......................*.................. + sqrdmulh v18.4S, v2.4S, v3.4S // .......................*................. // gap // ......................................... // gap // ......................................... - trn1 v2.4S, v19.4S, v5.4S // .........................*............... - trn2 v19.4S, v19.4S, v5.4S // ............................*............ + trn2 v8.4S, v21.4S, v1.4S // ...........................*............. + trn1 v21.4S, v21.4S, v1.4S // ............................*............ // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... - mls v8.4S, v0.4S, v29.4S // ..........................*.............. + mls v22.4S, v18.4S, v29.4S // ..........................*.............. // gap // ......................................... // gap // ......................................... // gap // ......................................... @@ -425,27 +423,27 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - trn1 v5.4S, v9.4S, v8.4S // .............................*........... 
- trn2 v8.4S, v9.4S, v8.4S // ..............................*.......... + trn2 v1.4S, v9.4S, v22.4S // .............................*........... + trn1 v22.4S, v9.4S, v22.4S // ..............................*.......... // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... // gap // ......................................... - trn2 v7.2D, v2.2D, v5.2D // ...............................*......... - trn2 v12.2D, v19.2D, v8.2D // ................................*........ + trn2 v3.2D, v21.2D, v22.2D // ...............................*......... + trn2 v17.2D, v8.2D, v1.2D // ................................*........ // gap // ......................................... // gap // ......................................... - trn1 v2.2D, v2.2D, v5.2D // .................................*....... - trn1 v0.2D, v19.2D, v8.2D // ..................................*...... + trn1 v12.2D, v8.2D, v1.2D // .................................*....... + trn1 v6.2D, v21.2D, v22.2D // ..................................*...... // gap // ......................................... // gap // ......................................... - add v8.4S, v7.4S, v12.4S // ...................................*..... + add v21.4S, v3.4S, v17.4S // ...................................*..... // gap // ......................................... // gap // ......................................... // gap // ......................................... - add v9.4S, v2.4S, v0.4S // ....................................*.... + add v4.4S, v6.4S, v12.4S // ....................................*.... // gap // ......................................... // gap // ......................................... // gap // ......................................... 
@@ -453,7 +451,7 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - add v19.4S, v9.4S, v8.4S // .....................................*... + add v1.4S, v4.4S, v21.4S // .....................................*... // gap // ......................................... // gap // ......................................... // gap // ......................................... @@ -461,7 +459,7 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - srshr v5.4S, v19.4S, #23 // ......................................*.. + srshr v22.4S, v1.4S, #23 // ......................................*.. // gap // ......................................... // gap // ......................................... // gap // ......................................... @@ -473,7 +471,7 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - mls v19.4S, v5.4S, v29.4S // .......................................*. + mls v1.4S, v22.4S, v29.4S // .......................................*. // gap // ......................................... // gap // ......................................... // gap // ......................................... @@ -485,392 +483,418 @@ _intt_dilithium_1234_5678_opt_m1_icestorm: // gap // ......................................... // gap // ......................................... // gap // ......................................... - str q19, [x0], #(16*4) // ........................................* + str q1, [x0], #(16*4) // ........................................* // gap // ......................................... 
// gap // ......................................... // gap // ......................................... - // original source code - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // *........................................ - // ldr q26, [x3], #(6*16) // ..*...................................... - // ldr q27, [x3, #-64] // .*....................................... - // ldr q3, [x3, #-80] // ....*.................................... - // ldr q17, [x3, #-48] // ...*..................................... - // ldr q18, [x3, #-32] // .....*................................... - // ldr q20, [x3, #-16] // ......*.................................. - // sub v22.4S, v13.4S, v14.4S // .......*................................. - // add v5.4S, v13.4S, v14.4S // ........*................................ - // sub v9.4S, v15.4S, v16.4S // .........*............................... - // mul v11.4S, v22.4S, v27.4S // ...........*............................. - // sqrdmulh v24.4S, v22.4S, v17.4S // ............*............................ - // mul v28.4S, v9.4S, v18.4S // .............*........................... - // sqrdmulh v21.4S, v9.4S, v20.4S // ..............*.......................... - // add v14.4S, v15.4S, v16.4S // ..........*.............................. - // mls v11.4S, v24.4S, v29.4S // .................*....................... - // mls v28.4S, v21.4S, v29.4S // ..................*...................... - // sub v6.4S, v5.4S, v14.4S // ...............*......................... - // add v14.4S, v5.4S, v14.4S // ................*........................ - // sub v7.4S, v11.4S, v28.4S // .....................*................... - // add v13.4S, v11.4S, v28.4S // ......................*.................. - // mul v0.4S, v6.4S, v26.4S // ...................*..................... - // sqrdmulh v23.4S, v6.4S, v3.4S // ....................*.................... - // mul v18.4S, v7.4S, v26.4S // ........................*................ 
- // sqrdmulh v20.4S, v7.4S, v3.4S // .........................*............... - // trn1 v26.4S, v14.4S, v13.4S // ..........................*.............. - // mls v18.4S, v20.4S, v29.4S // ............................*............ - // mls v0.4S, v23.4S, v29.4S // .......................*................. - // trn2 v25.4S, v14.4S, v13.4S // ...........................*............. - // trn1 v6.4S, v0.4S, v18.4S // .............................*........... - // trn2 v27.4S, v0.4S, v18.4S // ..............................*.......... - // trn2 v7.2D, v26.2D, v6.2D // ...............................*......... - // trn2 v12.2D, v25.2D, v27.2D // ................................*........ - // trn1 v2.2D, v26.2D, v6.2D // .................................*....... - // trn1 v0.2D, v25.2D, v27.2D // ..................................*...... - // add v8.4S, v7.4S, v12.4S // ...................................*..... - // add v9.4S, v2.4S, v0.4S // ....................................*.... - // add v22.4S, v9.4S, v8.4S // .....................................*... - // srshr v13.4S, v22.4S, #23 // ......................................*.. - // mls v22.4S, v13.4S, v29.4S // .......................................*. - // str q22, [x0], #(16*4) // ........................................* + // ------------- new position -------------> + // 0 25 + // |------------------------|--------------- + // ldr q30, [x3, #64] // .*....................................... + // ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x0] // *........................................ + // ldr q17, [x3], #(6*16) // ..*...................................... + // ldr q9, [x3, #-64] // ...*..................................... + // ldr q11, [x3, #-48] // ....*.................................... + // ldr q15, [x3, #-16] // .....*................................... + // sub v31.4S, v0.4S, v1.4S // .......*................................. + // sub v8.4S, v2.4S, v3.4S // .........*............................... 
+ // mul v27.4S, v31.4S, v9.4S // ...........*............................. + // sqrdmulh v25.4S, v31.4S, v11.4S // ............*............................ + // mul v31.4S, v8.4S, v30.4S // .............*........................... + // sqrdmulh v4.4S, v8.4S, v15.4S // ..............*.......................... + // ldr q30, [x3, #-80] // ......*.................................. + // add v11.4S, v0.4S, v1.4S // ........*................................ + // add v12.4S, v2.4S, v3.4S // ..........*.............................. + // mls v27.4S, v25.4S, v29.4S // .................*....................... + // mls v31.4S, v4.4S, v29.4S // ..................*...................... + // sub v9.4S, v11.4S, v12.4S // ...............*......................... + // add v25.4S, v11.4S, v12.4S // ................*........................ + // sub v12.4S, v27.4S, v31.4S // .....................*................... + // sqrdmulh v11.4S, v9.4S, v30.4S // ...................*..................... + // mul v8.4S, v9.4S, v17.4S // ....................*.................... + // mul v3.4S, v12.4S, v17.4S // ........................*................ + // sqrdmulh v15.4S, v12.4S, v30.4S // .........................*............... + // add v2.4S, v27.4S, v31.4S // ......................*.................. + // mls v8.4S, v11.4S, v29.4S // .......................*................. + // mls v3.4S, v15.4S, v29.4S // ............................*............ + // trn2 v22.4S, v25.4S, v2.4S // ..........................*.............. + // trn1 v2.4S, v25.4S, v2.4S // ...........................*............. + // trn2 v24.4S, v8.4S, v3.4S // .............................*........... + // trn1 v16.4S, v8.4S, v3.4S // ..............................*.......... + // trn2 v3.2D, v2.2D, v16.2D // ...............................*......... + // trn2 v17.2D, v22.2D, v24.2D // ................................*........ + // trn1 v12.2D, v22.2D, v24.2D // .................................*....... 
+ // trn1 v6.2D, v2.2D, v16.2D // ..................................*...... + // add v21.4S, v3.4S, v17.4S // ...................................*..... + // add v4.4S, v6.4S, v12.4S // ....................................*.... + // add v22.4S, v4.4S, v21.4S // .....................................*... + // srshr v20.4S, v22.4S, #23 // ......................................*.. + // mls v22.4S, v20.4S, v29.4S // .......................................*. + // str q22, [x0], #(16*4) // ........................................* sub count, count, #1 layer5678_start: - sub v19.4S, v9.4S, v8.4S // ...............................................*................. - ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // e................................................................ - ldr q26, [x3], #(6*16) // .e............................................................... - ldr q27, [x3, #-64] // ...e............................................................. - ldr q3, [x3, #-80] // ..e.............................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ldr q17, [x3, #-48] // ....e............................................................ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ldr q18, [x3, #-32] // .....e........................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- ldr q20, [x3, #-16] // ......e.......................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v30.4S, v2.4S, v0.4S // .....................................*........................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v22.4S, v13.4S, v14.4S // .......e......................................................... - add v5.4S, v13.4S, v14.4S // ........e........................................................ - // gap // ................................................................. - // gap // ................................................................. - sub v9.4S, v15.4S, v16.4S // ............e.................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mul v11.4S, v22.4S, v27.4S // .........e....................................................... - sqrdmulh v24.4S, v22.4S, v17.4S // ..........e...................................................... - // gap // ................................................................. - // gap // ................................................................. - mul v28.4S, v9.4S, v18.4S // ..............e.................................................. - sqrdmulh v21.4S, v9.4S, v20.4S // ...............e................................................. - // gap // ................................................................. 
- // gap // ................................................................. - add v14.4S, v15.4S, v16.4S // .............e................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v11.4S, v24.4S, v29.4S // ...........e..................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v28.4S, v21.4S, v29.4S // ................e................................................ - sub v6.4S, v5.4S, v14.4S // .................e............................................... - // gap // ................................................................. - // gap // ................................................................. - add v14.4S, v5.4S, v14.4S // ..................e.............................................. - ldr q5, [x4], #8 // ...................................*............................. - ldr q10, [x4], #16 // ....................................*............................ - // gap // ................................................................. - sub v1.4S, v7.4S, v12.4S // ..........................................*...................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v7.4S, v11.4S, v28.4S // ......................e.......................................... - add v13.4S, v11.4S, v28.4S // .......................e......................................... 
- // gap // ................................................................. - // gap // ................................................................. - mul v0.4S, v6.4S, v26.4S // ...................e............................................. - sqrdmulh v23.4S, v6.4S, v3.4S // ....................e............................................ - // gap // ................................................................. - // gap // ................................................................. - mul v18.4S, v7.4S, v26.4S // ........................e........................................ - sqrdmulh v20.4S, v7.4S, v3.4S // .........................e....................................... - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v24.4S, v30.4S, v10.S[1] // ........................................*........................ - trn1 v26.4S, v14.4S, v13.4S // ...........................e..................................... - // gap // ................................................................. - // gap // ................................................................. - mul v30.4S, v30.4S, v10.S[0] // .......................................*......................... - mul v31.4S, v1.4S, v10.S[2] // ............................................*.................... - // gap // ................................................................. - // gap // ................................................................. - mls v18.4S, v20.4S, v29.4S // ..........................e...................................... - mls v0.4S, v23.4S, v29.4S // .....................e........................................... - // gap // ................................................................. - // gap // ................................................................. 
- trn2 v25.4S, v14.4S, v13.4S // ............................e.................................... - sqrdmulh v16.4S, v1.4S, v10.S[3] // .............................................*................... - // gap // ................................................................. - // gap // ................................................................. - mls v30.4S, v24.4S, v29.4S // .........................................*....................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn1 v6.4S, v0.4S, v18.4S // .............................e................................... - trn2 v27.4S, v0.4S, v18.4S // ..............................e.................................. - // gap // ................................................................. - // gap // ................................................................. - mls v31.4S, v16.4S, v29.4S // ..............................................*.................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn2 v7.2D, v26.2D, v6.2D // ...............................e................................. - trn2 v12.2D, v25.2D, v27.2D // ................................e................................ - // gap // ................................................................. - // gap // ................................................................. - trn1 v2.2D, v26.2D, v6.2D // .................................e............................... - trn1 v0.2D, v25.2D, v27.2D // ..................................e.............................. - // gap // ................................................................. 
- // gap // ................................................................. - sub v20.4S, v30.4S, v31.4S // ....................................................*............ - add v8.4S, v7.4S, v12.4S // ...........................................e..................... - // gap // ................................................................. - // gap // ................................................................. - add v14.4S, v30.4S, v31.4S // .....................................................*........... - add v9.4S, v2.4S, v0.4S // ......................................e.......................... - // gap // ................................................................. - // gap // ................................................................. - mul v18.4S, v19.4S, v5.S[0] // .................................................*............... - sqrdmulh v19.4S, v19.4S, v5.S[1] // ..................................................*.............. - // gap // ................................................................. - // gap // ................................................................. - srshr v30.4S, v14.4S, #23 // ...........................................................*..... - add v22.4S, v9.4S, v8.4S // ................................................e................ - // gap // ................................................................. - // gap // ................................................................. - mul v3.4S, v20.4S, v5.S[0] // ......................................................*.......... - sqrdmulh v26.4S, v20.4S, v5.S[1] // .......................................................*......... - // gap // ................................................................. - // gap // ................................................................. - mls v18.4S, v19.4S, v29.4S // ...................................................*............. 
- srshr v13.4S, v22.4S, #23 // .........................................................e....... - // gap // ................................................................. - // gap // ................................................................. - mls v14.4S, v30.4S, v29.4S // ............................................................*.... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v3.4S, v26.4S, v29.4S // ........................................................*........ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v22.4S, v13.4S, v29.4S // ..........................................................e...... - str q18, [x0, #-32] // ...............................................................*. - // gap // ................................................................. - // gap // ................................................................. - str q14, [x0, #-48] // ..............................................................*.. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - str q3, [x0, #-16] // ................................................................* - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- str q22, [x0], #(16*4) // .............................................................e... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. + // Instructions: 65 + // Expected cycles: 39 + // Expected IPC: 1.67 + // + // Wall time: 11.61s + // User time: 11.61s + // + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q18, [x4], #8 // ...................................*............................. + ldr q30, [x3, #64] // .....e........................................................... + sub v16.4S, v3.4S, v17.4S // ..........................................*...................... + ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x0] // e................................................................ + ldr q26, [x4], #16 // ....................................*............................ + sub v23.4S, v4.4S, v21.4S // ...............................................*................. + // gap // ................................................................. + ldr q17, [x3], #(6*16) // .e............................................................... + ldr q9, [x3, #-64] // ...e............................................................. + sub v21.4S, v6.4S, v12.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + ldr q11, [x3, #-48] // ....e............................................................ + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + sqrdmulh v20.4S, v23.4S, v18.S[1] // .................................................*............... + ldr q15, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v28.4S, v16.4S, v26.S[2] // .............................................*................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v31.4S, v0.4S, v1.4S // .......e......................................................... + sqrdmulh v13.4S, v21.4S, v26.S[1] // .......................................*......................... + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v10.4S, v16.4S, v26.S[3] // ............................................*.................... + sub v8.4S, v2.4S, v3.4S // ............e.................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v27.4S, v31.4S, v9.4S // ..........e...................................................... + sqrdmulh v25.4S, v31.4S, v11.4S // .........e....................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v31.4S, v8.4S, v30.4S // ...............e................................................. 
+ sqrdmulh v4.4S, v8.4S, v15.4S // ..............e.................................................. + ldr q30, [x3, #-80] // ..e.............................................................. + // gap // ................................................................. + add v11.4S, v0.4S, v1.4S // ........e........................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v12.4S, v2.4S, v3.4S // .............e................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v27.4S, v25.4S, v29.4S // ...........e..................................................... + mls v31.4S, v4.4S, v29.4S // ................e................................................ + // gap // ................................................................. + // gap // ................................................................. + sub v9.4S, v11.4S, v12.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v7.4S, v21.4S, v26.S[0] // ........................................*........................ + add v25.4S, v11.4S, v12.4S // ..................e.............................................. + // gap // ................................................................. + // gap // ................................................................. 
+ sub v12.4S, v27.4S, v31.4S // ......................e.......................................... + sqrdmulh v11.4S, v9.4S, v30.4S // ...................e............................................. + // gap // ................................................................. + // gap // ................................................................. + mls v28.4S, v10.4S, v29.4S // ..............................................*.................. + mul v8.4S, v9.4S, v17.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + mul v3.4S, v12.4S, v17.4S // .........................e....................................... + sqrdmulh v15.4S, v12.4S, v30.4S // ........................e........................................ + // gap // ................................................................. + // gap // ................................................................. + mls v7.4S, v13.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v27.4S, v31.4S // .......................e......................................... + mls v8.4S, v11.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + mls v3.4S, v15.4S, v29.4S // ..........................e...................................... + mul v9.4S, v23.4S, v18.S[0] // ..................................................*.............. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v22.4S, v25.4S, v2.4S // ............................e.................................... + trn1 v2.4S, v25.4S, v2.4S // ...........................e..................................... + sub v14.4S, v7.4S, v28.4S // ....................................................*............ + // gap // ................................................................. + // gap // ................................................................. + trn2 v24.4S, v8.4S, v3.4S // ..............................e.................................. + trn1 v16.4S, v8.4S, v3.4S // .............................e................................... + // gap // ................................................................. + // gap // ................................................................. + mls v9.4S, v20.4S, v29.4S // ...................................................*............. + sqrdmulh v13.4S, v14.4S, v18.S[1] // ......................................................*.......... + // gap // ................................................................. + // gap // ................................................................. + trn2 v3.2D, v2.2D, v16.2D // ...............................e................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v17.2D, v22.2D, v24.2D // ................................e................................ + trn1 v12.2D, v22.2D, v24.2D // ..................................e.............................. 
+ trn1 v6.2D, v2.2D, v16.2D // .................................e............................... + // gap // ................................................................. + // gap // ................................................................. + str q9, [x0, #-32] // ...............................................................*. + add v21.4S, v3.4S, v17.4S // ...........................................e..................... + // gap // ................................................................. + // gap // ................................................................. + add v5.4S, v7.4S, v28.4S // .....................................................*........... + add v4.4S, v6.4S, v12.4S // ......................................e.......................... + // gap // ................................................................. + // gap // ................................................................. + mul v19.4S, v14.4S, v18.S[0] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v22.4S, v4.4S, v21.4S // ................................................e................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + srshr v7.4S, v5.4S, #23 // ...........................................................*..... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ srshr v20.4S, v22.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v19.4S, v13.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v5.4S, v7.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v22.4S, v20.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q19, [x0, #-16] // ................................................................* + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q5, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ str q22, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.............................................................. - // ldr q0, [x3], #(6*16) // .e..............................................................|.e............................................................. - // ldr q4, [x3, #(-6*16 + 1*16)] // ...e............................................................|...e........................................................... - // ldr q1, [x3, #(-6*16 + 2*16)] // ..e.............................................................|..e............................................................ - // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e.......................................................... - // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e......................................................... - // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e........................................................ - // sub v24.4s, v8.4s, v9.4s // ........e.......................................................|........e...................................................... - // add v8.4s, v8.4s, v9.4s // .........e......................................................|.........e..................................................... 
- // mul v9.4s, v24.4s, v1.4s // ...........e....................................................|...........e................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ............e...................................................|............e.................................................. - // mls v9.4s, v24.4s, v29.4s // ................e...............................................|................e.............................................. - // sub v24.4s, v10.4s, v11.4s // ..........e.....................................................|..........e.................................................... - // add v10.4s, v10.4s, v11.4s // ...............e................................................|...............e............................................... - // mul v11.4s, v24.4s, v2.4s // .............e..................................................|.............e................................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............e.................................................|..............e................................................ - // mls v11.4s, v24.4s, v29.4s // .................e..............................................|.................e............................................. - // sub v24.4s, v8.4s, v10.4s // ..................e.............................................|..................e............................................ - // add v8.4s, v8.4s, v10.4s // ...................e............................................|...................e........................................... - // mul v10.4s, v24.4s, v0.4s // .........................e......................................|.........................e..................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e.....................................|..........................e.................................... 
- // mls v10.4s, v24.4s, v29.4s // ..................................e.............................|..................................e............................ - // sub v24.4s, v9.4s, v11.4s // .......................e........................................|.......................e....................................... - // add v9.4s, v9.4s, v11.4s // ........................e.......................................|........................e...................................... - // mul v11.4s, v24.4s, v0.4s // ...........................e....................................|...........................e................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e...................................|............................e.................................. - // mls v11.4s, v24.4s, v29.4s // .................................e..............................|.................................e............................. - // trn1 v25.4s, v8.4s, v9.4s // ..............................e.................................|..............................e................................ - // trn2 v26.4s, v8.4s, v9.4s // ...................................e............................|...................................e........................... - // trn1 v27.4s, v10.4s, v11.4s // ......................................e.........................|......................................e........................ - // trn2 v28.4s, v10.4s, v11.4s // .......................................e........................|.......................................e....................... - // trn2 v10.2d, v25.2d, v27.2d // .........................................e......................|.........................................e..................... - // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.....................|..........................................e.................... 
- // trn1 v8.2d, v25.2d, v27.2d // ...........................................e....................|...........................................e................... - // trn1 v9.2d, v26.2d, v28.2d // ............................................e...................|............................................e.................. - // ldr q1, [x4], #8 // ....................*...........................................|....................*.......................................... - // ldr q0, [x4], #16 // .....................*..........................................|.....................*......................................... - // sub v24.4s, v8.4s, v9.4s // .......*........................................................|.......*....................................................... - // add v8.4s, v8.4s, v9.4s // ................................................e...............|................................................e.............. - // mul v9.4s, v24.4s, v0.s[0] // ...............................*................................|...............................*............................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................|.............................*................................. - // mls v9.4s, v24.4s, v29.4s // .....................................*..........................|.....................................*......................... - // sub v24.4s, v10.4s, v11.4s // ......................*.........................................|......................*........................................ - // add v10.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e................ - // mul v11.4s, v24.4s, v0.s[2] // ................................*...............................|................................*.............................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................*...........................|....................................*.......................... - // mls v11.4s, v24.4s, v29.4s // ........................................*.......................|........................................*...................... - // sub v24.4s, v8.4s, v10.4s // ................................................................*............................................................... - // add v8.4s, v8.4s, v10.4s // ....................................................e...........|....................................................e.......... - // mul v10.4s, v24.4s, v1.s[0] // .................................................*..............|.................................................*............. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................*.............|..................................................*............ - // mls v10.4s, v24.4s, v29.4s // .......................................................*........|.......................................................*....... - // sub v24.4s, v9.4s, v11.4s // .............................................*..................|.............................................*................. - // add v9.4s, v9.4s, v11.4s // ...............................................*................|...............................................*............... - // mul v11.4s, v24.4s, v1.s[0] // .....................................................*..........|.....................................................*......... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................*.........|......................................................*........ - // mls v11.4s, v24.4s, v29.4s // ..........................................................*.....|..........................................................*.... 
- // srshr v24.4S, v8.4S, #23 // ........................................................e.......|........................................................e...... - // mls v8.4s, v24.4s, v29.4s // ...........................................................e....|...........................................................e... - // srshr v24.4S, v9.4S, #23 // ...................................................*............|...................................................*........... - // mls v9.4s, v24.4s, v29.4s // .........................................................*......|.........................................................*..... - // str q8, [x0], #(16*4) // ...............................................................e|............................................................... - // str q9, [x0, #(-16*4 + 1*16)] // .............................................................*..|.............................................................*. - // str q10, [x0, #(-16*4 + 2*16)] // ............................................................*...|............................................................*.. - // str q11, [x0, #(-16*4 + 3*16)] // ..............................................................*.|..............................................................* + // -------------------------------------------------------- new position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // ..e.............................................................'..~............................................................ + // ldr q0, [x3], #(6*16) // .....e..........................................................'.....~......................................................... 
+ // ldr q4, [x3, #(-6*16 + 1*16)] // ....................e...........................................'....................~.......................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ......e.........................................................'......~........................................................ + // ldr q5, [x3, #(-6*16 + 3*16)] // ........e.......................................................'........~...................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // e...............................................................'~.............................................................. + // ldr q6, [x3, #(-6*16 + 5*16)] // ..........e.....................................................'..........~.................................................... + // sub v24.4s, v8.4s, v9.4s // ............e...................................................'............~.................................................. + // add v8.4s, v8.4s, v9.4s // .....................e..........................................'.....................~......................................... + // sqrdmulh v27.4s, v24.4s, v5.4s // .................e..............................................'.................~............................................. + // mul v9.4s, v24.4s, v1.4s // ................e...............................................'................~.............................................. + // mls v9.4s, v27.4s, v29.4s // .......................e........................................'.......................~....................................... + // sub v24.4s, v10.4s, v11.4s // ...............e................................................'...............~............................................... + // add v10.4s, v10.4s, v11.4s // ......................e.........................................'......................~........................................ 
+ // sqrdmulh v27.4s, v24.4s, v6.4s // ...................e............................................'...................~........................................... + // mul v11.4s, v24.4s, v2.4s // ..................e.............................................'..................~............................................ + // mls v11.4s, v27.4s, v29.4s // ........................e.......................................'........................~...................................... + // sub v24.4s, v8.4s, v10.4s // .........................e......................................'.........................~..................................... + // add v8.4s, v8.4s, v10.4s // ...........................e....................................'...........................~................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // .............................e..................................'.............................~................................. + // mul v10.4s, v24.4s, v0.4s // ...............................e................................'...............................~............................... + // mls v10.4s, v27.4s, v29.4s // ....................................e...........................'....................................~.......................... + // sub v24.4s, v9.4s, v11.4s // ............................e...................................'............................~.................................. + // add v9.4s, v9.4s, v11.4s // ...................................e............................'...................................~........................... + // sqrdmulh v27.4s, v24.4s, v4.4s // .................................e..............................'.................................~............................. + // mul v11.4s, v24.4s, v0.4s // ................................e...............................'................................~.............................. 
+ // mls v11.4s, v27.4s, v29.4s // .....................................e..........................'.....................................~......................... + // trn1 v25.4s, v8.4s, v9.4s // ........................................e.......................'........................................~...................... + // trn2 v26.4s, v8.4s, v9.4s // .......................................e........................'.......................................~....................... + // trn1 v27.4s, v10.4s, v11.4s // ...........................................e....................'...........................................~................... + // trn2 v28.4s, v10.4s, v11.4s // ..........................................e.....................'..........................................~.................... + // trn2 v10.2d, v25.2d, v27.2d // ..............................................e.................'..............................................~................ + // trn2 v11.2d, v26.2d, v28.2d // ...............................................e................'...............................................~............... + // trn1 v8.2d, v25.2d, v27.2d // .................................................e..............'.................................................~............. + // trn1 v9.2d, v26.2d, v28.2d // ................................................e...............'................................................~.............. + // ldr q1, [x4], #8 // ................................................................*............................................................... + // ldr q0, [x4], #16 // ...~............................................................'...*........................................................... + // sub v24.4s, v8.4s, v9.4s // .......~........................................................'.......*....................................................... 
+ // add v8.4s, v8.4s, v9.4s // .....................................................e..........'.....................................................~......... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .............~..................................................'.............*................................................. + // mul v9.4s, v24.4s, v0.s[0] // ..........................~.....................................'..........................*.................................... + // mls v9.4s, v27.4s, v29.4s // ..................................~.............................'..................................*............................ + // sub v24.4s, v10.4s, v11.4s // .~..............................................................'.*............................................................. + // add v10.4s, v10.4s, v11.4s // ...................................................e............'...................................................~........... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..............~.................................................'..............*................................................ + // mul v11.4s, v24.4s, v0.s[2] // ...........~....................................................'...........*................................................... + // mls v11.4s, v27.4s, v29.4s // ..............................~.................................'..............................*................................ + // sub v24.4s, v8.4s, v10.4s // ....~...........................................................'....*.......................................................... + // add v8.4s, v8.4s, v10.4s // .......................................................e........'.......................................................~....... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .........~......................................................'.........*..................................................... 
+ // mul v10.4s, v24.4s, v1.s[0] // ......................................~.........................'......................................*........................ + // mls v10.4s, v27.4s, v29.4s // ............................................~...................'............................................*.................. + // sub v24.4s, v9.4s, v11.4s // .........................................~......................'.........................................*..................... + // add v9.4s, v9.4s, v11.4s // ....................................................~...........'....................................................*.......... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .............................................~..................'.............................................*................. + // mul v11.4s, v24.4s, v1.s[0] // ......................................................~.........'......................................................*........ + // mls v11.4s, v27.4s, v29.4s // ..........................................................~.....'..........................................................*.... + // srshr v24.4S, v8.4S, #23 // .........................................................e......'.........................................................~..... + // mls v8.4s, v24.4s, v29.4s // ............................................................e...'............................................................~.. + // srshr v24.4S, v9.4S, #23 // ........................................................~.......'........................................................*...... + // mls v9.4s, v24.4s, v29.4s // ...........................................................~....'...........................................................*... + // str q8, [x0], #(16*4) // ...............................................................e'............................................................... 
+ // str q9, [x0, #(-16*4 + 1*16)] // ..............................................................~.'..............................................................* + // str q10, [x0, #(-16*4 + 2*16)] // ..................................................~.............'..................................................*............ + // str q11, [x0, #(-16*4 + 3*16)] // .............................................................~..'.............................................................*. sub count, count, #1 cbnz count, layer5678_start - sub v13.4S, v9.4S, v8.4S // *....................... - sub v8.4S, v2.4S, v0.4S // .*...................... - ldr q2, [x4], #8 // ..*..................... - ldr q5, [x4], #16 // ...*.................... - sub v19.4S, v7.4S, v12.4S // ....*................... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v0.4S, v8.4S, v5.S[1] // .....*.................. - mul v9.4S, v8.4S, v5.S[0] // ......*................. - // gap // ........................ - // gap // ........................ - mul v8.4S, v19.4S, v5.S[2] // .......*................ - sqrdmulh v5.4S, v19.4S, v5.S[3] // ........*............... - // gap // ........................ - // gap // ........................ - mul v16.4S, v13.4S, v2.S[0] // .............*.......... - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v9.4S, v0.4S, v29.4S // .........*.............. - sqrdmulh v19.4S, v13.4S, v2.S[1] // ..............*......... - // gap // ........................ - // gap // ........................ 
- mls v8.4S, v5.4S, v29.4S // ..........*............. - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v16.4S, v19.4S, v29.4S // ..................*..... - // gap // ........................ - // gap // ........................ - // gap // ........................ - sub v5.4S, v9.4S, v8.4S // ...........*............ - // gap // ........................ - // gap // ........................ - // gap // ........................ - add v9.4S, v9.4S, v8.4S // ............*........... - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v19.4S, v5.4S, v2.S[1] // .................*...... - mul v5.4S, v5.4S, v2.S[0] // ................*....... - str q16, [x0, #-32] // .....................*.. - // gap // ........................ - srshr v8.4S, v9.4S, #23 // ...............*........ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v5.4S, v19.4S, v29.4S // ....................*... - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v9.4S, v8.4S, v29.4S // ...................*.... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - str q5, [x0, #-16] // .......................* - // gap // ........................ - // gap // ........................ - // gap // ........................ 
- str q9, [x0, #-48] // ......................*. - // gap // ........................ - // gap // ........................ - // gap // ........................ + // Instructions: 24 + // Expected cycles: 21 + // Expected IPC: 1.14 + // + // Wall time: 0.20s + // User time: 0.20s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v16.4S, v3.4S, v17.4S // .*............................ + ldr q2, [x4], #8 // *............................. + ldr q1, [x4], #16 // ..*........................... + // gap // .............................. + sub v30.4S, v4.4S, v21.4S // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v21.4S, v6.4S, v12.4S // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v8.4S, v16.4S, v1.S[2] // ......*....................... + sqrdmulh v22.4S, v16.4S, v1.S[3] // ........*..................... + // gap // .............................. + // gap // .............................. + sqrdmulh v18.4S, v21.4S, v1.S[1] // .......*...................... + mul v1.4S, v21.4S, v1.S[0] // .........*.................... + // gap // .............................. + // gap // .............................. + sqrdmulh v9.4S, v30.4S, v2.S[1] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v8.4S, v22.4S, v29.4S // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mls v1.4S, v18.4S, v29.4S // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v21.4S, v1.4S, v8.4S // .............*................ + add v22.4S, v1.4S, v8.4S // .................*............ + // gap // .............................. + // gap // .............................. + mul v8.4S, v30.4S, v2.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.4S, v21.4S, v2.S[1] // ...............*.............. + mul v1.4S, v21.4S, v2.S[0] // ..................*........... + // gap // .............................. + // gap // .............................. + srshr v13.4S, v22.4S, #23 // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v8.4S, v9.4S, v29.4S // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v1.4S, v18.4S, v29.4S // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v22.4S, v13.4S, v29.4S // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q8, [x0, #-32] // ................*............. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + str q1, [x0, #-16] // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q22, [x0, #-48] // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. - // original source code - // sub v19.4S, v9.4S, v8.4S // *....................... - // sub v30.4S, v2.4S, v0.4S // .*...................... - // ldr q5, [x4], #8 // ..*..................... - // ldr q10, [x4], #16 // ...*.................... - // sub v1.4S, v7.4S, v12.4S // ....*................... - // sqrdmulh v24.4S, v30.4S, v10.S[1] // .....*.................. - // mul v30.4S, v30.4S, v10.S[0] // ......*................. - // mul v31.4S, v1.4S, v10.S[2] // .......*................ - // sqrdmulh v16.4S, v1.4S, v10.S[3] // ........*............... - // mls v30.4S, v24.4S, v29.4S // ..........*............. - // mls v31.4S, v16.4S, v29.4S // ............*........... - // sub v20.4S, v30.4S, v31.4S // ..............*......... - // add v14.4S, v30.4S, v31.4S // ...............*........ - // mul v18.4S, v19.4S, v5.S[0] // .........*.............. - // sqrdmulh v19.4S, v19.4S, v5.S[1] // ...........*............ - // srshr v30.4S, v14.4S, #23 // ...................*.... - // mul v3.4S, v20.4S, v5.S[0] // .................*...... - // sqrdmulh v26.4S, v20.4S, v5.S[1] // ................*....... - // mls v18.4S, v19.4S, v29.4S // .............*.......... - // mls v14.4S, v30.4S, v29.4S // .....................*.. - // mls v3.4S, v26.4S, v29.4S // ....................*... - // str q18, [x0, #-32] // ..................*..... - // str q14, [x0, #-48] // .......................* - // str q3, [x0, #-16] // ......................*. 
+ // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q18, [x4], #8 // .*............................. + // sub v16.4S, v3.4S, v17.4S // *.............................. + // ldr q26, [x4], #16 // ..*............................ + // sub v23.4S, v4.4S, v21.4S // ...*........................... + // sub v21.4S, v6.4S, v12.4S // ....*.......................... + // sqrdmulh v20.4S, v23.4S, v18.S[1] // .........*..................... + // mul v28.4S, v16.4S, v26.S[2] // .....*......................... + // sqrdmulh v13.4S, v21.4S, v26.S[1] // .......*....................... + // sqrdmulh v10.4S, v16.4S, v26.S[3] // ......*........................ + // mul v7.4S, v21.4S, v26.S[0] // ........*...................... + // mls v28.4S, v10.4S, v29.4S // ..........*.................... + // mls v7.4S, v13.4S, v29.4S // ...........*................... + // mul v9.4S, v23.4S, v18.S[0] // ..............*................ + // sub v14.4S, v7.4S, v28.4S // ............*.................. + // mls v9.4S, v20.4S, v29.4S // ..................*............ + // sqrdmulh v13.4S, v14.4S, v18.S[1] // ...............*............... + // str q9, [x0, #-32] // .....................*......... + // add v5.4S, v7.4S, v28.4S // .............*................. + // mul v19.4S, v14.4S, v18.S[0] // ................*.............. + // srshr v7.4S, v5.4S, #23 // .................*............. + // mls v19.4S, v13.4S, v29.4S // ...................*........... + // mls v5.4S, v7.4S, v29.4S // ....................*.......... + // str q19, [x0, #-16] // ......................*........ + // str q5, [x0, #-48] // .......................*....... 
.unreq root0_tw @@ -912,853 +936,862 @@ layer5678_start: load_roots_1234 r_ptr1 .p2align 2 - ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. - ldr q28, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... - ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... - ldr q19, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. - ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. - ldr q12, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
- add v17.4S, v28.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ - sub v9.4S, v28.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. - ldr q23, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... - sub v8.4S, v22.4S, v19.4S // ....................................*................................................................................................................................................................................................................................................... - ldr q15, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... - add v21.4S, v22.4S, v19.4S // .....................................*.................................................................................................................................................................................................................................................. 
- ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... - mul v24.4S, v9.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v9.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... - mul v9.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. - sqrdmulh v28.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ ldr q12, [x1] // *....................................................................................................................................................................................................................................................................................... + ldr q17, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q8, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q27, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... ldr q10, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... - sub v16.4S, v23.4S, v12.4S // ...................................................*.................................................................................................................................................................................................................................... 
- add v22.4S, v23.4S, v12.4S // ....................................................*................................................................................................................................................................................................................................... - ldr q19, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... - mls v24.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... - ldr q14, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ - sub v11.4S, v20.4S, v15.4S // ................*....................................................................................................................................................................................................................................................................... - sqrdmulh v8.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. 
- mul v23.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. - add v15.4S, v20.4S, v15.4S // .................*...................................................................................................................................................................................................................................................................... - ldr q12, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ - add v13.4S, v27.4S, v14.4S // ................................*....................................................................................................................................................................................................................................................... - sub v14.4S, v27.4S, v14.4S // ...............................*........................................................................................................................................................................................................................................................ - sqrdmulh v20.4S, v11.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... 
- mul v11.4S, v11.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... - mul v16.4S, v14.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... - sub v18.4S, v17.4S, v13.4S // ..................................................................*..................................................................................................................................................................................................................... - add v17.4S, v17.4S, v13.4S // ...................................................................*.................................................................................................................................................................................................................... - mls v11.4S, v20.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
- add v13.4S, v19.4S, v10.4S // ......................*................................................................................................................................................................................................................................................................. - mls v16.4S, v14.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... - sqrdmulh v14.4S, v18.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. - sub v20.4S, v15.4S, v13.4S // ........................................................*............................................................................................................................................................................................................................... - mul v18.4S, v18.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... - mls v23.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ 
- add v27.4S, v15.4S, v13.4S // .........................................................*.............................................................................................................................................................................................................................. - ldr q13, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. - mul v8.4S, v20.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. - sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ - mls v18.4S, v14.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. - add v15.4S, v24.4S, v16.4S // ........................................................................*............................................................................................................................................................................................................... 
- mls v9.4S, v28.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... - sub v19.4S, v19.4S, v10.4S // .....................*.................................................................................................................................................................................................................................................................. - mls v8.4S, v20.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... - sub v10.4S, v13.4S, v12.4S // .........................................*.............................................................................................................................................................................................................................................. - mul v28.4S, v19.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ - sqrdmulh v19.4S, v19.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... 
- add v12.4S, v13.4S, v12.4S // ..........................................*............................................................................................................................................................................................................................................. - sqrdmulh v13.4S, v10.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... - sub v20.4S, v8.4S, v18.4S // ..........................................................................................................*............................................................................................................................................................................. - mls v28.4S, v19.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. - sub v24.4S, v24.4S, v16.4S // .......................................................................*................................................................................................................................................................................................................ - mul v14.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... 
- mul v19.4S, v10.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ - sqrdmulh v16.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... - ldr q20, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... - add v10.4S, v8.4S, v18.4S // ...........................................................................................................*............................................................................................................................................................................ - sub v18.4S, v21.4S, v12.4S // ............................................................................*........................................................................................................................................................................................................... - sub v8.4S, v11.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... 
- add v28.4S, v11.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... - mls v19.4S, v13.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... - ldr q13, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... - mls v14.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... - add v16.4S, v21.4S, v12.4S // .............................................................................*.......................................................................................................................................................................................................... - mul v11.4S, v8.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
- sqrdmulh v8.4S, v8.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... - sqrdmulh v21.4S, v18.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ - mul v18.4S, v18.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... - add v12.4S, v13.4S, v20.4S // ...............................................*........................................................................................................................................................................................................................................ - sub v20.4S, v13.4S, v20.4S // ..............................................*......................................................................................................................................................................................................................................... - mls v11.4S, v8.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
- add v8.4S, v27.4S, v17.4S // .................................................................................................*...................................................................................................................................................................................... - mls v18.4S, v21.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... - sub v21.4S, v12.4S, v22.4S // ......................................................................................*................................................................................................................................................................................................. - sub v17.4S, v27.4S, v17.4S // ................................................................................................*....................................................................................................................................................................................... - add v12.4S, v12.4S, v22.4S // .......................................................................................*................................................................................................................................................................................................ - sqrdmulh v22.4S, v24.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. 
- mul v13.4S, v24.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. - sub v24.4S, v28.4S, v15.4S // .....................................................................................................*.................................................................................................................................................................................. - add v27.4S, v16.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. - add v15.4S, v28.4S, v15.4S // ......................................................................................................*................................................................................................................................................................................. - sub v28.4S, v9.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... - add v9.4S, v9.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... 
- mls v13.4S, v22.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ - mul v19.4S, v21.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... - sub v12.4S, v16.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... - sqrdmulh v22.4S, v21.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. - sqrdmulh v16.4S, v20.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... - add v21.4S, v11.4S, v13.4S // ................................................................................................................*....................................................................................................................................................................... 
- sub v13.4S, v11.4S, v13.4S // ...............................................................................................................*........................................................................................................................................................................ - mul v11.4S, v20.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... - mul v20.4S, v28.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... - mls v19.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. - sqrdmulh v28.4S, v28.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... - sub v22.4S, v8.4S, v27.4S // ........................................................................................................................................*............................................................................................................................................... 
- add v8.4S, v8.4S, v27.4S // .........................................................................................................................................*.............................................................................................................................................. - mls v11.4S, v16.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... - mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... - mls v20.4S, v28.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. - sqrdmulh v28.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... - sqrdmulh v13.4S, v12.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ 
- mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. - add v27.4S, v11.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... - sub v23.4S, v11.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ - mls v12.4S, v13.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... - add v13.4S, v9.4S, v27.4S // ..........................................................................................................................*............................................................................................................................................................. - sub v27.4S, v9.4S, v27.4S // .........................................................................................................................*.............................................................................................................................................................. 
- mul v11.4S, v23.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... - add v9.4S, v15.4S, v13.4S // ..............................................................................................................................................*......................................................................................................................................... - sub v13.4S, v15.4S, v13.4S // .............................................................................................................................................*.......................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... - mul v15.4S, v27.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ - sqrdmulh v27.4S, v27.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... 
- mls v16.4S, v28.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... - mul v28.4S, v17.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... - sqrdmulh v17.4S, v17.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... - mls v11.4S, v23.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ - sqrdmulh v23.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... - mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ 
- mls v15.4S, v27.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... - mls v28.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... - mul v27.4S, v13.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ - add v17.4S, v20.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... - sqrdmulh v13.4S, v13.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... - sub v11.4S, v20.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... 
- mls v24.4S, v23.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. - sub v23.4S, v28.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... - add v12.4S, v28.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... - sqrdmulh v28.4S, v11.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. - mul v20.4S, v11.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. - mls v27.4S, v13.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
- sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + add v14.4S, v12.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q21, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sub v11.4S, v12.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... + sub v23.4S, v17.4S, v8.4S // ...............................*........................................................................................................................................................................................................................................................ + add v19.4S, v17.4S, v8.4S // ................................*....................................................................................................................................................................................................................................................... 
+ sqrdmulh v22.4S, v11.4S, v3.S[3] // ..................*..................................................................................................................................................................................................................................................................... + mul v12.4S, v11.4S, v3.S[2] // ...................*.................................................................................................................................................................................................................................................................... + ldr q18, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + sub v17.4S, v27.4S, v10.4S // .....................*.................................................................................................................................................................................................................................................................. + add v11.4S, v27.4S, v10.4S // ......................*................................................................................................................................................................................................................................................................. + ldr q10, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ mul v15.4S, v23.4S, v5.S[0] // ..................................*..................................................................................................................................................................................................................................................... + add v20.4S, v14.4S, v11.4S // .........................................................*.............................................................................................................................................................................................................................. + mls v12.4S, v22.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + ldr q16, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v17.4S, v4.S[1] // .......................*................................................................................................................................................................................................................................................................ + sub v22.4S, v14.4S, v11.4S // ........................................................*............................................................................................................................................................................................................................... 
+ add v11.4S, v10.4S, v21.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v27.4S, v17.4S, v4.S[0] // ........................*............................................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v5.S[1] // .................................*...................................................................................................................................................................................................................................................... + sub v28.4S, v10.4S, v21.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q17, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q24, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ sub v9.4S, v16.4S, v18.4S // ....................................*................................................................................................................................................................................................................................................... + sub v21.4S, v11.4S, v19.4S // ..................................................................*..................................................................................................................................................................................................................... + ldr q10, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + mul v13.4S, v28.4S, v4.S[2] // .............................*.......................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v28.4S, v4.S[3] // ............................*........................................................................................................................................................................................................................................................... + add v16.4S, v16.4S, v18.4S // .....................................*.................................................................................................................................................................................................................................................. 
+ sqrdmulh v18.4S, v9.4S, v5.S[3] // ......................................*................................................................................................................................................................................................................................................. + add v28.4S, v11.4S, v19.4S // ...................................................................*.................................................................................................................................................................................................................... + mul v11.4S, v9.4S, v5.S[2] // .......................................*................................................................................................................................................................................................................................................ + add v19.4S, v17.4S, v10.4S // ..........................................*............................................................................................................................................................................................................................................. + mls v27.4S, v8.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + add v8.4S, v20.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v9.4S, v20.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + mls v11.4S, v18.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v18.4S, v16.4S, v19.4S // ............................................................................*........................................................................................................................................................................................................... + ldr q20, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sub v10.4S, v17.4S, v10.4S // .........................................*.............................................................................................................................................................................................................................................. + mls v15.4S, v23.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
+ sqrdmulh v23.4S, v18.4S, v2.S[3] // ..............................................................................*......................................................................................................................................................................................................... + mul v18.4S, v18.4S, v2.S[2] // ...............................................................................*........................................................................................................................................................................................................ + mul v28.4S, v10.4S, v6.S[0] // ............................................*........................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v10.4S, v6.S[1] // ...........................................*............................................................................................................................................................................................................................................ + mls v13.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q10, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ mul v14.4S, v21.4S, v2.S[0] // .....................................................................*.................................................................................................................................................................................................................. + mls v18.4S, v23.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v2.S[1] // ....................................................................*................................................................................................................................................................................................................... + add v19.4S, v16.4S, v19.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v16.4S, v12.4S, v27.4S // .............................................................*.......................................................................................................................................................................................................................... + add v12.4S, v12.4S, v27.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mul v27.4S, v22.4S, v1.S[2] // ...........................................................*............................................................................................................................................................................................................................ + sqrdmulh v23.4S, v22.4S, v1.S[3] // ..........................................................*............................................................................................................................................................................................................................. + mul v22.4S, v16.4S, v1.S[2] // ................................................................*....................................................................................................................................................................................................................... + mls v14.4S, v21.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v21.4S, v20.4S, v10.4S // ...............................................*........................................................................................................................................................................................................................................ + mls v28.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... 
+ sqrdmulh v16.4S, v16.4S, v1.S[3] // ...............................................................*........................................................................................................................................................................................................................ + sub v17.4S, v20.4S, v10.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v27.4S, v23.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + ldr q20, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + sub v10.4S, v13.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ + add v13.4S, v13.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... 
+ mls v22.4S, v16.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + sqrdmulh v16.4S, v17.4S, v6.S[3] // ................................................*....................................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v6.S[2] // .................................................*...................................................................................................................................................................................................................................... + sub v15.4S, v12.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + sub v23.4S, v20.4S, v24.4S // ...................................................*.................................................................................................................................................................................................................................... + add v20.4S, v20.4S, v24.4S // ....................................................*................................................................................................................................................................................................................................... 
+ add v13.4S, v12.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. + add v12.4S, v11.4S, v28.4S // ..................................................................................*..................................................................................................................................................................................................... + mul v24.4S, v23.4S, v7.S[0] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v7.S[1] // .....................................................*.................................................................................................................................................................................................................................. + mls v17.4S, v16.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v16.4S, v11.4S, v28.4S // .................................................................................*...................................................................................................................................................................................................... 
+ sqrdmulh v28.4S, v10.4S, v2.S[1] // .........................................................................*.............................................................................................................................................................................................................. + mls v24.4S, v23.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sub v23.4S, v27.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. + mul v11.4S, v10.4S, v2.S[0] // ..........................................................................*............................................................................................................................................................................................................. + add v10.4S, v27.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v14.4S, v21.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. 
+ add v20.4S, v21.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + add v27.4S, v17.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... + sub v24.4S, v17.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ + mls v11.4S, v28.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v17.4S, v16.4S, v2.S[2] // ....................................................................................*................................................................................................................................................................................................... + sqrdmulh v28.4S, v16.4S, v2.S[3] // ...................................................................................*.................................................................................................................................................................................................... 
+ add v21.4S, v19.4S, v20.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v20.4S, v19.4S, v20.4S // ....................................................................................................................*................................................................................................................................................................... + add v16.4S, v12.4S, v27.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v19.4S, v22.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ + add v11.4S, v22.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... + mls v17.4S, v28.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ sub v28.4S, v12.4S, v27.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v22.4S, v24.4S, v3.S[1] // .............................................................................................*.......................................................................................................................................................................................... + mul v24.4S, v24.4S, v3.S[0] // ..............................................................................................*......................................................................................................................................................................................... + mul v12.4S, v9.4S, v0.S[2] // ...................................................................................................*.................................................................................................................................................................................... + sqrdmulh v9.4S, v9.4S, v0.S[3] // ..................................................................................................*..................................................................................................................................................................................... + sub v27.4S, v8.4S, v21.4S // ........................................................................................................................................*............................................................................................................................................... 
+ add v8.4S, v8.4S, v21.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v24.4S, v22.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v21.4S, v20.4S, v1.S[0] // .......................................................................................................................*................................................................................................................................................................ + mls v12.4S, v9.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v9.4S, v13.4S, v16.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v1.S[1] // ......................................................................................................................*................................................................................................................................................................. 
+ sub v22.4S, v13.4S, v16.4S // .............................................................................................................................................*.......................................................................................................................................... + sqrdmulh v13.4S, v15.4S, v0.S[3] // .......................................................................................................*................................................................................................................................................................................ + mul v16.4S, v15.4S, v0.S[2] // ........................................................................................................*............................................................................................................................................................................... + mul v15.4S, v22.4S, v0.S[0] // ................................................................................................................................................*....................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...............................................................................................................................................*........................................................................................................................................ + mls v21.4S, v20.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... 
+ sqrdmulh v20.4S, v28.4S, v1.S[1] // ...........................................................................................................................*............................................................................................................................................................ + mls v16.4S, v13.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + mul v13.4S, v28.4S, v1.S[0] // ............................................................................................................................*........................................................................................................................................................... + mls v15.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v22.4S, v14.4S, v3.S[1] // ........................................................................................*............................................................................................................................................................................................... + mul v14.4S, v14.4S, v3.S[0] // .........................................................................................*.............................................................................................................................................................................................. 
+ sub v28.4S, v12.4S, v21.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v21.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v21.4S, v17.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... + mls v13.4S, v20.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + cmge v20.4S, v15.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + add v24.4S, v17.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... 
+ cmge v17.4S, v31.4S, v15.4S // ....................................................................................................................................................................................*................................................................................................... + mls v14.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + mul v22.4S, v19.4S, v0.S[2] // ..................................................................................................................*..................................................................................................................................................................... + sub v20.4S, v17.4S, v20.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v19.4S, v19.4S, v0.S[3] // .................................................................................................................*...................................................................................................................................................................... + mul v17.4S, v21.4S, v1.S[0] // ......................................................................................................................................*................................................................................................................................................. 
+ sqrdmulh v21.4S, v21.4S, v1.S[1] // .....................................................................................................................................*.................................................................................................................................................. + mls v15.4S, v20.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v20.4S, v28.4S, v0.S[0] // ...............................................................................................................................................................*........................................................................................................................ sub count, count, #1 layer1234_start: - mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... 
- sub v28.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ - sub v17.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... - sub v21.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... - add v19.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ - mls v23.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - sqrdmulh v18.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
- mul v21.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... - add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... - sub v20.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ - cmge v24.4S, v23.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v21.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
- cmge v18.4S, v31.4S, v23.4S // ................................................................................................................................................................................................*....................................................................................... - mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... - sqrdmulh v24.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - mul v22.4S, v28.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
- mls v23.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - mul v18.4S, v17.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... - mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - cmge v24.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... - mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ - cmge v28.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
- str q23, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... - sub v24.4S, v24.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. - sqrdmulh v17.4S, v17.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - cmge v28.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v23.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... - mls v27.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
- cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - sub v23.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................*................................................................................. - mls v18.4S, v17.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mls v21.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
- cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - sub v23.4S, v14.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. - add v27.4S, v14.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ - mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - sub v14.4S, v24.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... 
- sqrdmulh v28.4S, v23.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - sub v24.4S, v10.4S, v19.4S // ..................................................................................................................................................*..................................................................................................................................... - add v10.4S, v10.4S, v19.4S // ...................................................................................................................................................*.................................................................................................................................... - mls v16.4S, v14.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - mul v9.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... - sqrdmulh v18.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
- mul v24.4S, v23.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - mul v19.4S, v20.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - cmge v14.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... - sqrdmulh v23.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. 
- mls v24.4S, v28.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - mls v9.4S, v18.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - sqrdmulh v20.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. - cmge v13.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
- cmge v18.4S, v9.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. - cmge v16.4S, v31.4S, v9.4S // ........................................................................................................................................................................................*............................................................................................... - mls v21.4S, v20.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - cmge v20.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - sub v18.4S, v16.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. - mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
- sub v23.4S, v20.4S, v13.4S // ......................................................................................................................................................................................................................................................*................................. - cmge v20.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... - cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... - mls v17.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - cmge v23.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
- sub v13.4S, v13.4S, v16.4S // ......................................................................................................................................................................................................................................................................*................. - sqrdmulh v16.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - sub v20.4S, v20.4S, v23.4S // ..........................................................................................................................................................................................................*............................................................................. - cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................*........................................................................... - mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
- mul v17.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - mls v24.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. - mls v17.4S, v16.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - cmge v13.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
- str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... - sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - sub v13.4S, v14.4S, v13.4S // ..............................................................................................................................................................................................*......................................................................................... - mul v14.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - ldr q10, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. 
- mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - sub v23.4S, v23.4S, v20.4S // ..............................................................................................................................................................................................................*......................................................................... - mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... - mls v14.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
- mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - sqrdmulh v28.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v8.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - str q19, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
- cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - mls v22.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - cmge v15.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... - mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
- mls v9.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - ldr q13, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... - cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - str q9, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... 
- str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - cmge v24.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................................................................................*............... - sub v15.4S, v15.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... - ldr q23, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. - sub v9.4S, v11.4S, v28.4S // ..................................................................................................................................................................................................................................................*..................................... - ldr q28, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ 
- mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. - sub v11.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. - ldr q27, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... - mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - add v9.4S, v13.4S, v10.4S // ...........................e............................................................................................................................................................................................................................................................ - mls v17.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
- sub v22.4S, v13.4S, v10.4S // ..........................e............................................................................................................................................................................................................................................................. - ldr q10, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. - cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... - cmge v21.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - add v11.4S, v23.4S, v28.4S // ................................e....................................................................................................................................................................................................................................................... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
- sub v24.4S, v23.4S, v28.4S // ...............................e........................................................................................................................................................................................................................................................ - sub v28.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... - str q17, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - add v17.4S, v9.4S, v11.4S // ...................................................................e.................................................................................................................................................................................................................... - add v21.4S, v27.4S, v10.4S // .....................................e.................................................................................................................................................................................................................................................. - sqrdmulh v23.4S, v22.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
- mul v13.4S, v22.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... - ldr q16, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... - ldr q22, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ - mls v8.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - sub v15.4S, v12.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... - ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... 
- mul v18.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... - sqrdmulh v24.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... - mls v13.4S, v23.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... - mls v19.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - sub v23.4S, v16.4S, v22.4S // ...................................................e.................................................................................................................................................................................................................................... - add v22.4S, v16.4S, v22.4S // ....................................................e................................................................................................................................................................................................................................... 
- str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - ldr q16, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... - mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - mls v18.4S, v24.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... - ldr q15, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... - sub v28.4S, v9.4S, v11.4S // ..................................................................e..................................................................................................................................................................................................................... 
- sub v11.4S, v27.4S, v10.4S // ....................................e................................................................................................................................................................................................................................................... - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - ldr q9, [x1, #0] // e....................................................................................................................................................................................................................................................................................... - cmge v8.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................*............................... - cmge v24.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - add v19.4S, v13.4S, v18.4S // ........................................................................e............................................................................................................................................................................................................... 
- sub v10.4S, v13.4S, v18.4S // .......................................................................e................................................................................................................................................................................................................ - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - sub v18.4S, v8.4S, v24.4S // ..........................................................................................................................................................................................................................................................*............................. - add v13.4S, v12.4S, v16.4S // ......................e................................................................................................................................................................................................................................................................. - ldr q20, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. - ldr q8, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ 
- sub v24.4S, v9.4S, v15.4S // ................e....................................................................................................................................................................................................................................................................... - sqrdmulh v27.4S, v10.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. - mls v14.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - add v15.4S, v9.4S, v15.4S // .................e...................................................................................................................................................................................................................................................................... - sqrdmulh v9.4S, v24.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... - mul v18.4S, v24.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
- mul v24.4S, v10.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. - str q14, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... - sub v10.4S, v20.4S, v8.4S // .........................................e.............................................................................................................................................................................................................................................. - sqrdmulh v14.4S, v28.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. - add v8.4S, v20.4S, v8.4S // ..........................................e............................................................................................................................................................................................................................................. - sub v20.4S, v15.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... 
- mls v24.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ - add v27.4S, v15.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. - mls v18.4S, v9.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... - sub v16.4S, v12.4S, v16.4S // .....................e.................................................................................................................................................................................................................................................................. - mul v13.4S, v28.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... - sqrdmulh v15.4S, v10.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... 
- mul v28.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ - sqrdmulh v16.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... - mul v9.4S, v10.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ - sqrdmulh v12.4S, v23.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. - mul v23.4S, v23.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. - mls v13.4S, v14.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. 
- mls v28.4S, v16.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. - sub v16.4S, v18.4S, v28.4S // .............................................................e.......................................................................................................................................................................................................................... - add v28.4S, v18.4S, v28.4S // ..............................................................e......................................................................................................................................................................................................................... - ldr q18, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... - sqrdmulh v10.4S, v11.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ - mls v23.4S, v12.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ 
- mul v14.4S, v11.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. - sub v12.4S, v28.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. - mls v9.4S, v15.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... - ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... - add v11.4S, v28.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. - mul v19.4S, v20.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. 
- sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ - mls v14.4S, v10.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... - sqrdmulh v28.4S, v16.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... - mul v16.4S, v16.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ - sub v10.4S, v15.4S, v18.4S // ..............................................e......................................................................................................................................................................................................................................... - mls v19.4S, v20.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
- sub v20.4S, v14.4S, v9.4S // .................................................................................e...................................................................................................................................................................................................... - add v9.4S, v14.4S, v9.4S // ..................................................................................e..................................................................................................................................................................................................... - mul v14.4S, v10.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... - sqrdmulh v10.4S, v10.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... - mls v16.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... - add v28.4S, v27.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... 
- add v15.4S, v15.4S, v18.4S // ...............................................e........................................................................................................................................................................................................................................ - sub v17.4S, v27.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... - mls v14.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... - add v10.4S, v19.4S, v13.4S // ...........................................................................................................e............................................................................................................................................................................ - sub v18.4S, v19.4S, v13.4S // ..........................................................................................................e............................................................................................................................................................................. - add v13.4S, v14.4S, v23.4S // ............................................................................................e........................................................................................................................................................................................... 
- sub v27.4S, v14.4S, v23.4S // ...........................................................................................e............................................................................................................................................................................................ - sub v23.4S, v21.4S, v8.4S // ............................................................................e........................................................................................................................................................................................................... - add v8.4S, v21.4S, v8.4S // .............................................................................e.......................................................................................................................................................................................................... - sub v19.4S, v16.4S, v24.4S // ...............................................................................................................e........................................................................................................................................................................ - add v21.4S, v16.4S, v24.4S // ................................................................................................................e....................................................................................................................................................................... - mul v14.4S, v18.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... 
- sqrdmulh v24.4S, v18.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... - add v18.4S, v9.4S, v13.4S // ..........................................................................................................................e............................................................................................................................................................. - sub v9.4S, v9.4S, v13.4S // .........................................................................................................................e.............................................................................................................................................................. - mul v13.4S, v20.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... - sqrdmulh v16.4S, v20.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... - sub v20.4S, v15.4S, v22.4S // ......................................................................................e................................................................................................................................................................................................. 
- add v22.4S, v15.4S, v22.4S // .......................................................................................e................................................................................................................................................................................................ - mul v15.4S, v9.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ - sqrdmulh v9.4S, v9.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... - mls v14.4S, v24.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... - mul v24.4S, v12.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ - mls v13.4S, v16.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. 
- mul v16.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... - mls v15.4S, v9.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... - sqrdmulh v12.4S, v12.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... - sqrdmulh v19.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... - add v9.4S, v11.4S, v18.4S // ..............................................................................................................................................e......................................................................................................................................... - sub v11.4S, v11.4S, v18.4S // .............................................................................................................................................e.......................................................................................................................................... 
- mul v18.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ - mls v16.4S, v19.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... - mls v24.4S, v12.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. - mls v18.4S, v23.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... - mul v12.4S, v17.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... 
- sub v19.4S, v8.4S, v22.4S // ....................................................................................................................e................................................................................................................................................................... - add v8.4S, v8.4S, v22.4S // .....................................................................................................................e.................................................................................................................................................................. - sqrdmulh v22.4S, v17.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... - sqrdmulh v23.4S, v19.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ - mul v19.4S, v19.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. - mul v17.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... 
- sqrdmulh v27.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... - mls v12.4S, v22.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... - sub v22.4S, v28.4S, v8.4S // ........................................................................................................................................e............................................................................................................................................... - mls v19.4S, v23.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... - add v8.4S, v28.4S, v8.4S // .........................................................................................................................................e.............................................................................................................................................. - sqrdmulh v28.4S, v11.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... 
- mls v17.4S, v27.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ - mul v27.4S, v11.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ - sqrdmulh v11.4S, v20.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. - sub v23.4S, v12.4S, v19.4S // ............................................................................................................................................................e........................................................................................................................... - add v12.4S, v12.4S, v19.4S // .............................................................................................................................................................e.......................................................................................................................... - mul v19.4S, v20.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... 
- sub v20.4S, v13.4S, v17.4S // ...................................................................................................................................e.................................................................................................................................................... - mls v27.4S, v28.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... - add v17.4S, v13.4S, v17.4S // ....................................................................................................................................e................................................................................................................................................... - sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................e........................................................................................................................ - sqrdmulh v28.4S, v20.4S, v1.S[1] // ......................................................................................................................................e................................................................................................................................................. - mls v19.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. 
- mul v20.4S, v20.4S, v1.S[0] // .....................................................................................................................................e.................................................................................................................................................. + // Instructions: 280 + // Expected cycles: 70 + // Expected IPC: 4.00 + + // ---------------------------------------------------------------------------------------------------------------------------------- original position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + mls v22.4S, v19.4S, v29.4S // ......*................................................................................................................................................................................................................................................................................. + mls v17.4S, v21.4S, v29.4S // ...*.................................................................................................................................................................................................................................................................................... + str q15, [x1, #576] // *....................................................................................................................................................................................................................................................................................... 
+ add v19.4S, v11.4S, v24.4S // .....*.................................................................................................................................................................................................................................................................................. + sub v11.4S, v11.4S, v24.4S // ....*................................................................................................................................................................................................................................................................................... + mul v15.4S, v23.4S, v0.S[2] // .........*.............................................................................................................................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v0.S[3] // ..........*............................................................................................................................................................................................................................................................................. + add v21.4S, v22.4S, v17.4S // ....................*................................................................................................................................................................................................................................................................... + sub v23.4S, v22.4S, v17.4S // ...............*........................................................................................................................................................................................................................................................................ 
+ mul v17.4S, v11.4S, v0.S[0] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ........................*............................................................................................................................................................................................................................................................... + sub v22.4S, v18.4S, v14.4S // ........*............................................................................................................................................................................................................................................................................... + add v14.4S, v18.4S, v14.4S // .*...................................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[1] // ..*..................................................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v9.4S, v26.4S // ..............................*......................................................................................................................................................................................................................................................... 
+ mls v17.4S, v11.4S, v29.4S // .............................*.......................................................................................................................................................................................................................................................... + mul v9.4S, v9.4S, v25.4S // ...............................*........................................................................................................................................................................................................................................................ + mul v11.4S, v22.4S, v1.S[0] // ..............*......................................................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v1.S[1] // .............*.......................................................................................................................................................................................................................................................................... + mls v20.4S, v28.4S, v29.4S // .......*................................................................................................................................................................................................................................................................................ + mls v15.4S, v24.4S, v29.4S // .................*...................................................................................................................................................................................................................................................................... 
+ mls v9.4S, v18.4S, v29.4S // .....................................*.................................................................................................................................................................................................................................................. + cmge v28.4S, v31.4S, v17.4S // ....................................*................................................................................................................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // ...................*.................................................................................................................................................................................................................................................................... + cmge v22.4S, v17.4S, v30.4S // ..................................*..................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v19.4S, v26.4S // .......................................................................................*................................................................................................................................................................................................ + sub v18.4S, v10.4S, v14.4S // ...........*............................................................................................................................................................................................................................................................................ 
+ add v10.4S, v10.4S, v14.4S // ............*........................................................................................................................................................................................................................................................................... + sub v28.4S, v28.4S, v22.4S // .......................................*................................................................................................................................................................................................................................................ + add v22.4S, v16.4S, v13.4S // .................................*...................................................................................................................................................................................................................................................... + add v14.4S, v15.4S, v11.4S // .........................*.............................................................................................................................................................................................................................................................. + sub v15.4S, v15.4S, v11.4S // ..........................*............................................................................................................................................................................................................................................................. + mls v17.4S, v28.4S, v29.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ sqrdmulh v28.4S, v22.4S, v26.4S // .........................................*.............................................................................................................................................................................................................................................. + mul v11.4S, v19.4S, v25.4S // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v19.4S, v14.4S, v26.4S // ..............................................................................*......................................................................................................................................................................................................... + mul v14.4S, v14.4S, v25.4S // ...............................................................................*........................................................................................................................................................................................................ + sub v13.4S, v16.4S, v13.4S // ...........................*............................................................................................................................................................................................................................................................ + str q17, [x1, #704] // .....................................................*.................................................................................................................................................................................................................................. 
+ sqrdmulh v17.4S, v18.4S, v0.S[1] // ............................................................*........................................................................................................................................................................................................................... + mul v18.4S, v18.4S, v0.S[0] // .............................................................*.......................................................................................................................................................................................................................... + mul v16.4S, v27.4S, v0.S[0] // ..................................................*..................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[1] // ......................*................................................................................................................................................................................................................................................................. + mls v14.4S, v19.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + mul v19.4S, v22.4S, v25.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ mls v11.4S, v24.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mls v18.4S, v17.4S, v29.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v16.4S, v27.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... + cmge v17.4S, v31.4S, v20.4S // ..................*..................................................................................................................................................................................................................................................................... + mls v19.4S, v28.4S, v29.4S // ................................................*....................................................................................................................................................................................................................................... + cmge v22.4S, v20.4S, v30.4S // ................*....................................................................................................................................................................................................................................................................... 
+ mul v27.4S, v13.4S, v0.S[0] // .........................................................*.............................................................................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v0.S[1] // .............................................*.......................................................................................................................................................................................................................................... + sub v22.4S, v17.4S, v22.4S // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v28.4S, v12.4S, v26.4S // ........................................*............................................................................................................................................................................................................................................... + mul v12.4S, v12.4S, v25.4S // ............................................*........................................................................................................................................................................................................................................... + mul v23.4S, v23.4S, v0.S[0] // ..............................................*......................................................................................................................................................................................................................................... 
+ mls v12.4S, v28.4S, v29.4S // .................................................*...................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v10.4S, v26.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v10.4S, v10.4S, v25.4S // ............................................................................*........................................................................................................................................................................................................... + mls v20.4S, v22.4S, v29.4S // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v13.4S, v0.S[1] // ..........................................................*............................................................................................................................................................................................................................. + cmge v22.4S, v31.4S, v12.4S // ............................................................................................*........................................................................................................................................................................................... 
+ cmge v13.4S, v12.4S, v30.4S // ......................................................................*................................................................................................................................................................................................................. + mls v10.4S, v28.4S, v29.4S // ...................................................................................*.................................................................................................................................................................................................... + mls v23.4S, v24.4S, v29.4S // ....................................................*................................................................................................................................................................................................................................... + sub v28.4S, v22.4S, v13.4S // ................................................................................................*....................................................................................................................................................................................... + str q20, [x1, #768] // ...................................*.................................................................................................................................................................................................................................................... + cmge v20.4S, v9.4S, v30.4S // .....................................................................................*.................................................................................................................................................................................................. 
+ mls v27.4S, v17.4S, v29.4S // ................................................................*....................................................................................................................................................................................................................... + cmge v17.4S, v31.4S, v9.4S // ..........................................................................*............................................................................................................................................................................................................. + mls v12.4S, v28.4S, v29.4S // ........................................................................................................*............................................................................................................................................................................... + cmge v24.4S, v23.4S, v30.4S // .................................................................................................................*...................................................................................................................................................................... + sub v22.4S, v17.4S, v20.4S // ........................................................................................*............................................................................................................................................................................................... + cmge v13.4S, v31.4S, v11.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ mul v20.4S, v15.4S, v0.S[0] // .......................................................*................................................................................................................................................................................................................................ + cmge v17.4S, v11.4S, v30.4S // ......................................................................................................*................................................................................................................................................................................. + str q12, [x1, #256] // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v28.4S, v21.4S, v26.4S // ................................*....................................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // ...................................................*.................................................................................................................................................................................................................................... + sub v12.4S, v13.4S, v17.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ mls v9.4S, v22.4S, v29.4S // .............................................................................................*.......................................................................................................................................................................................... + cmge v17.4S, v14.4S, v30.4S // ...........................................................................................*............................................................................................................................................................................................ + cmge v22.4S, v31.4S, v23.4S // ..................................................................................................................*..................................................................................................................................................................... + mls v20.4S, v15.4S, v29.4S // ........................................................*............................................................................................................................................................................................................................... + str q9, [x1, #64] // ..................................................................................................*..................................................................................................................................................................................... + cmge v9.4S, v10.4S, v30.4S // ............................................................................................................*........................................................................................................................................................................... 
+ sub v22.4S, v22.4S, v24.4S // ......................................................................................................................*................................................................................................................................................................. + mls v11.4S, v12.4S, v29.4S // .............................................................................................................*.......................................................................................................................................................................... + mul v15.4S, v21.4S, v25.4S // ......................................*................................................................................................................................................................................................................................................. + cmge v13.4S, v31.4S, v10.4S // ..........................................................................................................................*............................................................................................................................................................. + cmge v24.4S, v20.4S, v30.4S // .................................................................*...................................................................................................................................................................................................................... + cmge v12.4S, v31.4S, v20.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mls v23.4S, v22.4S, v29.4S // ............................................................................................................................*........................................................................................................................................................... + sub v9.4S, v13.4S, v9.4S // ..............................................................................................................................*......................................................................................................................................................... + str q11, [x1, #192] // ................................................................................................................*....................................................................................................................................................................... + ldr q13, [x1] // .............................................................................................................................................*.......................................................................................................................................... + sub v22.4S, v12.4S, v24.4S // .....................................................................*.................................................................................................................................................................................................................. + ldr q12, [x1, #144] // ................................................................................................................................................*....................................................................................................................................... 
+ cmge v11.4S, v27.4S, v30.4S // ....................................................................................................................*................................................................................................................................................................... + ldr q24, [x1, #80] // ............................................................................................................................................*........................................................................................................................................... + mls v10.4S, v9.4S, v29.4S // ...................................................................................................................................*.................................................................................................................................................... + mls v15.4S, v28.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v21.4S, v8.4S, v26.4S // .......................................................................*................................................................................................................................................................................................................ + str q23, [x1, #960] // ..................................................................................................................................*..................................................................................................................................................... 
+ mls v20.4S, v22.4S, v29.4S // ........................................................................*............................................................................................................................................................................................................... + mul v8.4S, v8.4S, v25.4S // ....................................................................*................................................................................................................................................................................................................... + cmge v28.4S, v31.4S, v16.4S // ....................................................................................................*................................................................................................................................................................................... + str q10, [x1, #128] // ........................................................................................................................................*............................................................................................................................................... + cmge v9.4S, v18.4S, v30.4S // .........................................................................................................................*.............................................................................................................................................................. + add v22.4S, v13.4S, v24.4S // ..................................................................................................................................................*..................................................................................................................................... 
+ str q20, [x1, #896] // ................................................................................*....................................................................................................................................................................................................... + cmge v10.4S, v31.4S, v15.4S // ................................................................................................................................*....................................................................................................................................................... + cmge v23.4S, v16.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + ldr q20, [x1, #464] // ...............................................................................................................................................*........................................................................................................................................ + sub v13.4S, v13.4S, v24.4S // ....................................................................................................................................................*................................................................................................................................... + cmge v24.4S, v31.4S, v14.4S // ..........................................................................................*............................................................................................................................................................................................. 
+ sub v28.4S, v28.4S, v23.4S // .......................................................................................................*................................................................................................................................................................................ + cmge v23.4S, v15.4S, v30.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v8.4S, v21.4S, v29.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v21.4S, v24.4S, v17.4S // ..............................................................................................*......................................................................................................................................................................................... + ldr q17, [x1, #208] // .................................................................................................................................................*...................................................................................................................................... + sub v10.4S, v10.4S, v23.4S // .....................................................................................................................................*.................................................................................................................................................. 
+ cmge v23.4S, v31.4S, v27.4S // ........................................................................................................................*............................................................................................................................................................... + cmge v24.4S, v19.4S, v30.4S // ...........................................................*............................................................................................................................................................................................................................ + mls v14.4S, v21.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v15.4S, v10.4S, v29.4S // .........................................................................................................................................*.............................................................................................................................................. + cmge v21.4S, v8.4S, v30.4S // .................................................................................................*...................................................................................................................................................................................... + sub v10.4S, v23.4S, v11.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ cmge v23.4S, v31.4S, v18.4S // ...............................................................................................................................*........................................................................................................................................................ + str q14, [x1, #384] // ...........................................................................................................................*............................................................................................................................................................ + sub v14.4S, v12.4S, v17.4S // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v11.4S, v13.4S, v3.S[3] // .......................................................................................................................................................*................................................................................................................................ + mls v27.4S, v10.4S, v29.4S // .................................................................................................................................*...................................................................................................................................................... + mul v13.4S, v13.4S, v3.S[2] // ........................................................................................................................................................*............................................................................................................................... 
+ str q15, [x1, #448] // ...........................................................................................................................................*............................................................................................................................................ + ldr q15, [x1, #400] // ..............................................................................................................................................*......................................................................................................................................... + sub v9.4S, v23.4S, v9.4S // ....................................................................................................................................*................................................................................................................................................... + cmge v10.4S, v31.4S, v19.4S // ......................................................*................................................................................................................................................................................................................................. + mls v16.4S, v28.4S, v29.4S // ...........................................................................................................*............................................................................................................................................................................ + ldr q28, [x1, #272] // ............................................................................................................................................................*........................................................................................................................... 
+ add v12.4S, v12.4S, v17.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v23.4S, v31.4S, v8.4S // ...................................................................................................*.................................................................................................................................................................................... + ldr q17, [x1, #336] // ...................................................................................................................................................*.................................................................................................................................... + str q16, [x1, #512] // .......................................................................................................................*................................................................................................................................................................ + mls v13.4S, v11.4S, v29.4S // ...............................................................................................................................................................*........................................................................................................................ + sub v16.4S, v15.4S, v20.4S // .....................................................................................................................................................*.................................................................................................................................. 
+ sub v23.4S, v23.4S, v21.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v18.4S, v9.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v11.4S, v16.4S, v5.S[0] // .............................................................................................................................................................*.......................................................................................................................... + sub v21.4S, v10.4S, v24.4S // ...............................................................*........................................................................................................................................................................................................................ + ldr q24, [x1, #528] // ................................................................................................................................................................*....................................................................................................................... + ldr q10, [x1, #592] // .........................................................................................................................................................*.............................................................................................................................. 
+ str q27, [x1, #832] // ......................................................................................................................................*................................................................................................................................................. + add v9.4S, v15.4S, v20.4S // ......................................................................................................................................................*................................................................................................................................. + sub v20.4S, v28.4S, v17.4S // ......................................................................................................................................................................*................................................................................................................. + str q18, [x1, #640] // ..........................................................................................................................................*............................................................................................................................................. + mul v18.4S, v14.4S, v4.S[0] // ....................................................................................................................................................................*................................................................................................................... + mls v8.4S, v23.4S, v29.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ mul v15.4S, v20.4S, v4.S[2] // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v27.4S, v14.4S, v4.S[1] // .................................................................................................................................................................*...................................................................................................................... + mls v19.4S, v21.4S, v29.4S // ..................................................................*..................................................................................................................................................................................................................... + sub v23.4S, v24.4S, v10.4S // .........................................................................................................................................................................*.............................................................................................................. + add v14.4S, v22.4S, v12.4S // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v21.4S, v16.4S, v5.S[1] // .....................................................................................................................................................................*.................................................................................................................. 
+ str q8, [x1], #(16) // .....................................................................................................................*.................................................................................................................................................................. + sqrdmulh v8.4S, v23.4S, v5.S[3] // ...............................................................................................................................................................................*........................................................................................................ + mls v18.4S, v27.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sqrdmulh v20.4S, v20.4S, v4.S[3] // .............................................................................................................................................................................*.......................................................................................................... + sub v16.4S, v22.4S, v12.4S // ..................................................................................................................................................................*..................................................................................................................... + ldr q22, [x1, #832] // ................................................................................................................................................................................................*....................................................................................... 
+ ldr q12, [x1, #768] // ........................................................................................................................................................................................*............................................................................................... + mul v23.4S, v23.4S, v5.S[2] // .................................................................................................................................................................................*...................................................................................................... + add v17.4S, v28.4S, v17.4S // ...................................................................................................................................................................*.................................................................................................................... + str q19, [x1, #304] // .........................................................................*.............................................................................................................................................................................................................. + mul v27.4S, v16.4S, v1.S[2] // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v19.4S, v16.4S, v1.S[3] // ........................................................................................................................................................................................................*............................................................................... 
+ mls v15.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v11.4S, v21.4S, v29.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v16.4S, v12.4S, v22.4S // ..............................................................................................................................................................................................................*......................................................................... + add v21.4S, v12.4S, v22.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v27.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + add v19.4S, v17.4S, v9.4S // ................................................................................................................................................................................*....................................................................................................... 
+ add v12.4S, v13.4S, v18.4S // ......................................................................................................................................................................................................*................................................................................. + add v22.4S, v15.4S, v11.4S // ..................................................................................................................................................................................................................*..................................................................... + sub v20.4S, v14.4S, v19.4S // .....................................................................................................................................................................................*.................................................................................................. + mls v23.4S, v8.4S, v29.4S // ......................................................................................................................................................................................*................................................................................................. + add v8.4S, v14.4S, v19.4S // ....................................................................................................................................................................................*................................................................................................... + add v14.4S, v12.4S, v22.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ sub v15.4S, v15.4S, v11.4S // .................................................................................................................................................................................................................*...................................................................... + ldr q11, [x1, #640] // .......................................................................................................................................................................*................................................................................................................ + sub v19.4S, v13.4S, v18.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v18.4S, v17.4S, v9.4S // ..........................................................................................................................................................................*............................................................................................................. + ldr q9, [x1, #704] // ...........................................................................................................................................................................*............................................................................................................ + mul v13.4S, v16.4S, v6.S[2] // .....................................................................................................................................................................................................................*.................................................................. 
+ sqrdmulh v28.4S, v15.4S, v2.S[1] // ...............................................................................................................................................................................................................................*........................................................ + sub v17.4S, v11.4S, v9.4S // .........................................................................................................................................................................................*.............................................................................................. + sub v12.4S, v12.4S, v22.4S // ......................................................................................................................................................................................................................*................................................................. + sqrdmulh v22.4S, v18.4S, v2.S[1] // ...................................................................................................................................................................................................*.................................................................................... + mul v18.4S, v18.4S, v2.S[0] // .................................................................................................................................................................................................*...................................................................................... + add v11.4S, v11.4S, v9.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ sqrdmulh v9.4S, v16.4S, v6.S[3] // ....................................................................................................................................................................................................................*................................................................... + add v24.4S, v24.4S, v10.4S // ..............................................................................................................................................................................*......................................................................................................... + sqrdmulh v10.4S, v19.4S, v1.S[3] // .............................................................................................................................................................................................................*.......................................................................... + mls v18.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................*............................................................................. + ldr q16, [x1, #896] // ................................................................................................................................................................................................................*....................................................................... + mul v22.4S, v19.4S, v1.S[2] // .........................................................................................................................................................................................................*.............................................................................. 
+ ldr q19, [x1, #960] // ........................................................................................................................................................................*............................................................................................................... + mls v13.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + add v9.4S, v24.4S, v11.4S // ....................................................................................................................................................................................................*................................................................................... + sub v24.4S, v24.4S, v11.4S // .......................................................................................................................................................................................*................................................................................................ + sqrdmulh v11.4S, v17.4S, v6.S[1] // ..............................................................................................................................................................................................*......................................................................................... + mls v22.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................*.................................................................... 
+ mul v10.4S, v17.4S, v6.S[0] // .............................................................................................................................................................................................*.......................................................................................... + mul v15.4S, v15.4S, v2.S[0] // ..................................................................................................................................................................................................................................*..................................................... + sub v17.4S, v16.4S, v19.4S // .......................................................................................................................................................................................................................*................................................................ + add v19.4S, v16.4S, v19.4S // ........................................................................................................................................................................................................................*............................................................... + mul v16.4S, v12.4S, v0.S[2] // ...............................................................................................................................................................................................................................................................*........................ + sqrdmulh v12.4S, v12.4S, v0.S[3] // ..............................................................................................................................................................................................................................................................*......................... 
+ mls v10.4S, v11.4S, v29.4S // ............................................................................................................................................................................................................*........................................................................... + mls v15.4S, v28.4S, v29.4S // ........................................................................................................................................................................................................................................*............................................... + mul v28.4S, v20.4S, v0.S[2] // ....................................................................................................................................................................................................................................................*................................... + add v11.4S, v21.4S, v19.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v21.4S, v21.4S, v19.4S // ....................................................................................................................................................................................................................................*................................................... + mls v16.4S, v12.4S, v29.4S // ....................................................................................................................................................................................................................................................................*................... 
+ sub v19.4S, v23.4S, v10.4S // ..............................................................................................................................................................................................................................*......................................................... + add v12.4S, v23.4S, v10.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v23.4S, v27.4S, v18.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v20.4S, v20.4S, v0.S[3] // .....................................................................................................................................................................................................................................................*.................................. + add v10.4S, v27.4S, v18.4S // ...................................................................................................................................................................................................................................*.................................................... + sqrdmulh v18.4S, v17.4S, v7.S[1] // ............................................................................................................................................................................................................................*........................................................... 
+ mul v27.4S, v17.4S, v7.S[0] // ...........................................................................................................................................................................................................................*............................................................ + sub v17.4S, v9.4S, v11.4S // ............................................................................................................................................................................................................................................*........................................... + add v9.4S, v9.4S, v11.4S // ...........................................................................................................................................................................................................................................*............................................ + mls v28.4S, v20.4S, v29.4S // ..........................................................................................................................................................................................................................................................*............................. + add v11.4S, v22.4S, v15.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v22.4S, v22.4S, v15.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ mls v27.4S, v18.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v20.4S, v19.4S, v2.S[2] // .........................................................................................................................................................................................................................................*.............................................. + mul v18.4S, v24.4S, v2.S[2] // ............................................................................................................................................................................................*........................................................................................... + sqrdmulh v15.4S, v24.4S, v2.S[3] // ...........................................................................................................................................................................................*............................................................................................ + sqrdmulh v24.4S, v19.4S, v2.S[3] // ..........................................................................................................................................................................................................................................*............................................. + sub v19.4S, v13.4S, v27.4S // .......................................................................................................................................................................................................................................*................................................ 
+ add v13.4S, v13.4S, v27.4S // ......................................................................................................................................................................................................................................*................................................. + sub v27.4S, v8.4S, v9.4S // ......................................................................................................................................................................................................................................................*................................. + add v8.4S, v8.4S, v9.4S // .......................................................................................................................................................................................................................................................*................................ + add v9.4S, v12.4S, v13.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v13.4S, v12.4S, v13.4S // .................................................................................................................................................................................................................................................*...................................... + mls v20.4S, v24.4S, v29.4S // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v24.4S, v21.4S, v3.S[1] // .......................................................................................................................................................................................................................................................................*................ + sub v12.4S, v14.4S, v9.4S // .............................................................................................................................................................................................................................................................*.......................... + add v9.4S, v14.4S, v9.4S // ...........................................................................................................................................................................................................................................................*............................ + mls v18.4S, v15.4S, v29.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v15.4S, v12.4S, v0.S[0] // ................................................................................................................................................................................................................................................................*....................... + sqrdmulh v12.4S, v12.4S, v0.S[1] // .................................................................................................................................................................................................................................................................*...................... 
+ mul v14.4S, v21.4S, v3.S[0] // ........................................................................................................................................................................................................................................................................*............... + mul v21.4S, v17.4S, v1.S[0] // .........................................................................................................................................................................................................................................................*.............................. + sqrdmulh v17.4S, v17.4S, v1.S[1] // ............................................................................................................................................................................................................................................................*........................... + mls v15.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v12.4S, v19.4S, v3.S[1] // ..................................................................................................................................................................................................................................................*..................................... + mls v21.4S, v17.4S, v29.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ mul v17.4S, v19.4S, v3.S[0] // ...................................................................................................................................................................................................................................................*.................................... + sqrdmulh v19.4S, v13.4S, v1.S[1] // ...................................................................................................................................................................................................................................................................*.................... + mul v13.4S, v13.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................*.................. + mls v14.4S, v24.4S, v29.4S // ................................................................................................................................................................................................................................................................................*....... + cmge v24.4S, v31.4S, v15.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v12.4S, v29.4S // ........................................................................................................................................................................................................................................................*............................... 
+ cmge v12.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v13.4S, v19.4S, v29.4S // ............................................................................................................................................................................................................................................................................*........... + sqrdmulh v19.4S, v22.4S, v0.S[3] // ...................................................................................................................................................................................................................................................................................*.... + sub v24.4S, v24.4S, v12.4S // ..................................................................................................................................................................................................................................................................................*..... + add v12.4S, v28.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. + sub v28.4S, v28.4S, v21.4S // .........................................................................................................................................................................................................................................................................*.............. 
+ sub v21.4S, v20.4S, v17.4S // ...........................................................................................................................................................................................................................................................................*............ + mul v22.4S, v22.4S, v0.S[2] // .................................................................................................................................................................................................................................................................................*...... + mls v15.4S, v24.4S, v29.4S // ......................................................................................................................................................................................................................................................................................*. + add v24.4S, v20.4S, v17.4S // ..............................................................................................................................................................................................................................................................................*......... + mul v17.4S, v21.4S, v1.S[0] // ....................................................................................................................................................................................................................................................................................*... + sqrdmulh v21.4S, v21.4S, v1.S[1] // .....................................................................................................................................................................................................................................................................................*.. 
+ mul v20.4S, v28.4S, v0.S[0] // .......................................................................................................................................................................................................................................................................................* - // original source code - // ldr q8, [x1, #0] // .....................................................................e......................................................................................................................|................................................................................................................................................................e................. - // ldr q9, [x1, #(1*(512/8))] // .................................................................e..........................................................................................................................|............................................................................................................................................................e..................... - // ldr q10, [x1, #(2*(512/8))] // ......................................................e.....................................................................................................................................|.................................................................................................................................................e................................ - // ldr q11, [x1, #(3*(512/8))] // ..............................................................e.............................................................................................................................|.........................................................................................................................................................e........................ 
- // ldr q12, [x1, #(4*(512/8))] // ....................e.......................................................................................................................................................................|...............................................................................................................e.................................................................. - // ldr q13, [x1, #(5*(512/8))] // e...........................................................................................................................................................................................|...........................................................................................e...................................................................................... - // ldr q14, [x1, #(6*(512/8))] // ............................e...............................................................................................................................................................|.......................................................................................................................e.......................................................... - // ldr q15, [x1, #(7*(512/8))] // ..............................e.............................................................................................................................................................|.........................................................................................................................e........................................................ 
- // ldr q16, [x1, #(8*(512/8))] // .................................e..........................................................................................................................................................|............................................................................................................................e..................................................... - // ldr q17, [x1, #(9*(512/8))] // ......................................e.....................................................................................................................................................|.................................................................................................................................e................................................ - // ldr q18, [x1, #(10*(512/8))] // .............................................................................e..............................................................................................................|........................................................................................................................................................................e......... - // ldr q19, [x1, #(11*(512/8))] // ..............................................................................e.............................................................................................................|.........................................................................................................................................................................e........ 
- // ldr q20, [x1, #(12*(512/8))] // ................................................................................................................e...........................................................................|.................................................................................................................................................................................. - // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................................e.................................................................................|.................................................................................................................................................................................. - // ldr q22, [x1, #(14*(512/8))] // ..................................................e.........................................................................................................................................|.............................................................................................................................................e.................................... - // ldr q23, [x1, #(15*(512/8))] // ...................................................e........................................................................................................................................|..............................................................................................................................................e................................... 
- // sub v24.4s, v8.4s, v9.4s // ...............................................................................e............................................................................................................|..........................................................................................................................................................................e....... - // add v8.4s, v8.4s, v9.4s // ..................................................................................e.........................................................................................................|.............................................................................................................................................................................e.... - // mul v9.4s, v24.4s, v3.s[2] // ....................................................................................e.......................................................................................................|...............................................................................................................................................................................e.. - // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...................................................................................e........................................................................................................|..............................................................................................................................................................................e... 
- // mls v9.4s, v24.4s, v29.4s // .............................................................................................e..............................................................................................|.................................................................................................................................................................................. - // sub v24.4s, v10.4s, v11.4s // ..............................................................................................e.............................................................................................|.................................................................................................................................................................................. - // add v10.4s, v10.4s, v11.4s // ............................................................................e...............................................................................................................|.......................................................................................................................................................................e.......... - // mul v11.4s, v24.4s, v4.s[0] // .................................................................................................e..........................................................................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................................................................................e.........................................................................................|.................................................................................................................................................................................. - // mls v11.4s, v24.4s, v29.4s // .......................................................................................................e....................................................................................|.................................................................................................................................................................................. - // sub v24.4s, v12.4s, v13.4s // .....................................e......................................................................................................................................................|................................................................................................................................e................................................. - // add v12.4s, v12.4s, v13.4s // ...................................e........................................................................................................................................................|..............................................................................................................................e................................................... 
- // mul v13.4s, v24.4s, v4.s[2] // .................................................e..........................................................................................................................................|............................................................................................................................................e..................................... - // sqrdmulh v24.4s, v24.4s, v4.s[3] // ................................................e...........................................................................................................................................|...........................................................................................................................................e...................................... - // mls v13.4s, v24.4s, v29.4s // .........................................................e..................................................................................................................................|....................................................................................................................................................e............................. - // sub v24.4s, v14.4s, v15.4s // ...........................................e................................................................................................................................................|......................................................................................................................................e........................................... 
- // add v14.4s, v14.4s, v15.4s // .........................................e..................................................................................................................................................|....................................................................................................................................e............................................. - // mul v15.4s, v24.4s, v5.s[0] // .......................................................e....................................................................................................................................|..................................................................................................................................................e............................... - // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................e...................................................................................................................................|...................................................................................................................................................e.............................. - // mls v15.4s, v24.4s, v29.4s // ................................................................e...........................................................................................................................|...........................................................................................................................................................e...................... 
- // sub v24.4s, v16.4s, v17.4s // ...................................................................e........................................................................................................................|..............................................................................................................................................................e................... - // add v16.4s, v16.4s, v17.4s // ...............................................e............................................................................................................................................|..........................................................................................................................................e....................................... - // mul v17.4s, v24.4s, v5.s[2] // .............................................................................................................e..............................................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................................................................................e................................................................................|.................................................................................................................................................................................. 
- // mls v17.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................................|.................................................................................................................................................................................. - // sub v24.4s, v18.4s, v19.4s // .......................................................................................e....................................................................................................|.................................................................................................................................................................................. - // add v18.4s, v18.4s, v19.4s // .........................................................................................e..................................................................................................|.................................................................................................................................................................................. - // mul v19.4s, v24.4s, v6.s[0] // ...................................................................................................e........................................................................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v6.s[1] // ................................................................................................e...........................................................................................|.................................................................................................................................................................................. - // mls v19.4s, v24.4s, v29.4s // ...............................................................................................................e............................................................................|.................................................................................................................................................................................. - // sub v24.4s, v20.4s, v21.4s // .......................................................................................................................e....................................................................|.................................................................................................................................................................................. - // add v20.4s, v20.4s, v21.4s // ...............................................................................................................................e............................................................|.................................................................................................................................................................................. 
- // mul v21.4s, v24.4s, v6.s[2] // ...........................................................................................................................e................................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.s[3] // ............................................................................................................................e...............................................................|.................................................................................................................................................................................. - // mls v21.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................................|.................................................................................................................................................................................. - // sub v24.4s, v22.4s, v23.4s // ...........................................................e................................................................................................................................|......................................................................................................................................................e........................... 
- // add v22.4s, v22.4s, v23.4s // ............................................................e...............................................................................................................................|.......................................................................................................................................................e.......................... - // mul v23.4s, v24.4s, v7.s[0] // .....................................................................................................e......................................................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v7.s[1] // ....................................................................................................e.......................................................................................|.................................................................................................................................................................................. - // mls v23.4s, v24.4s, v29.4s // ............................................................................................................e...............................................................................|.................................................................................................................................................................................. 
- // sub v24.4s, v8.4s, v10.4s // ..........................................................................................e.................................................................................................|.................................................................................................................................................................................. - // add v8.4s, v8.4s, v10.4s // ............................................................................................e...............................................................................................|.................................................................................................................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................................................e.........................................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................e........................................................................|.................................................................................................................................................................................. 
- // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................|.................................................................................................................................................................................. - // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e...................................................................................|.................................................................................................................................................................................. - // add v9.4s, v9.4s, v11.4s // .........................................................................................................e..................................................................................|.................................................................................................................................................................................. - // mul v11.4s, v24.4s, v1.s[2] // ......................................................................................................................e.....................................................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................................................e......................................................................|.................................................................................................................................................................................. - // mls v11.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................................|.................................................................................................................................................................................. - // sub v24.4s, v12.4s, v14.4s // ..................................................................e.........................................................................................................................|.............................................................................................................................................................e.................... - // add v12.4s, v12.4s, v14.4s // ..............................................e.............................................................................................................................................|.........................................................................................................................................e........................................ 
- // mul v14.4s, v24.4s, v2.s[0] // ...............................................................................................e............................................................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ........................................................................................e...................................................................................................|.................................................................................................................................................................................. - // mls v14.4s, v24.4s, v29.4s // ......................................................................................................e.....................................................................................|.................................................................................................................................................................................. - // sub v24.4s, v13.4s, v15.4s // .........................................................................e..................................................................................................................|....................................................................................................................................................................e............. 
- // add v13.4s, v13.4s, v15.4s // ........................................................................e...................................................................................................................|...................................................................................................................................................................e.............. - // mul v15.4s, v24.4s, v2.s[0] // .....................................................................................e......................................................................................................|................................................................................................................................................................................e. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e...........................................................................................................|...........................................................................................................................................................................e...... - // mls v15.4s, v24.4s, v29.4s // ...........................................................................................e................................................................................................|.................................................................................................................................................................................. 
- // sub v24.4s, v16.4s, v18.4s // ......................................................................................................................................e.....................................................|.................................................................................................................................................................................. - // add v16.4s, v16.4s, v18.4s // .......................................................................................................................................e....................................................|.................................................................................................................................................................................. - // mul v18.4s, v24.4s, v2.s[2] // .............................................................................................................................................................e..............................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................................................e.............................|.................................................................................................................................................................................. 
- // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................................e..........................|.................................................................................................................................................................................. - // sub v24.4s, v17.4s, v19.4s // .........................................................................................................................e..................................................................|.................................................................................................................................................................................. - // add v17.4s, v17.4s, v19.4s // ..........................................................................................................................e.................................................................|.................................................................................................................................................................................. - // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................e.............................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................................................................................................................e............................................|.................................................................................................................................................................................. - // mls v19.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................|.................................................................................................................................................................................. - // sub v24.4s, v20.4s, v22.4s // ................................................................................................................................................e...........................................|.................................................................................................................................................................................. - // add v20.4s, v20.4s, v22.4s // .................................................................................................................................................e..........................................|.................................................................................................................................................................................. 
- // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................................................e.......|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................................e..........|.................................................................................................................................................................................. - // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................................e.|.................................................................................................................................................................................. - // sub v24.4s, v21.4s, v23.4s // .....................................................................................................................................e......................................................|.................................................................................................................................................................................. 
- // add v21.4s, v21.4s, v23.4s // ....................................................................................................................................e.......................................................|.................................................................................................................................................................................. - // mul v23.4s, v24.4s, v3.s[0] // ........................................................................................................................................................................e...................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................................e..................|.................................................................................................................................................................................. - // mls v23.4s, v24.4s, v29.4s // ...............................................................................................................................................................................e............|.................................................................................................................................................................................. 
- // sub v24.4s, v8.4s, v12.4s // ................................................................................................................................e...........................................................|.................................................................................................................................................................................. - // add v8.4s, v8.4s, v12.4s // ..............................................................................................................................e.............................................................|.................................................................................................................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................e.........................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................................e......................|.................................................................................................................................................................................. 
- // mls v12.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.................|.................................................................................................................................................................................. - // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................e.............................................................................|.................................................................................................................................................................................. - // add v9.4s, v9.4s, v13.4s // .................................................................................................................e..........................................................................|.................................................................................................................................................................................. - // mul v13.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e......................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................................e..................................|.................................................................................................................................................................................. - // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................................e...........................|.................................................................................................................................................................................. - // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................e........................................................|.................................................................................................................................................................................. - // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................e.........................................................|.................................................................................................................................................................................. 
- // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................................e.................................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................e................................................|.................................................................................................................................................................................. - // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................................|.................................................................................................................................................................................. - // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................e...................................................|.................................................................................................................................................................................. 
- // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..................................................|.................................................................................................................................................................................. - // mul v15.4s, v24.4s, v0.s[2] // .......................................................................................................................................................e....................................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................................|.................................................................................................................................................................................. - // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................e............................|.................................................................................................................................................................................. 
- // sub v24.4s, v16.4s, v20.4s // ...................................................................................................................................................................e........................|.................................................................................................................................................................................. - // add v16.4s, v16.4s, v20.4s // ....................................................................................................................................................................e.......................|.................................................................................................................................................................................. - // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................e....................|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................e.....................|.................................................................................................................................................................................. 
- // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................e...............|.................................................................................................................................................................................. - // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................e..............................................|.................................................................................................................................................................................. - // add v17.4s, v17.4s, v21.4s // ............................................................................................................................................e...............................................|.................................................................................................................................................................................. - // mul v21.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e.........................................|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e........................................|.................................................................................................................................................................................. - // mls v21.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................|.................................................................................................................................................................................. - // sub v24.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|...*.............................................................................................................................................................................. - // add v18.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|.....*............................................................................................................................................................................ 
- // mul v22.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................|.....................*............................................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................|............................*..................................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|..................................*............................................................................................................................................... - // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e......|.................................................................................................................................................................................. 
- // add v19.4s, v19.4s, v23.4s // .......................................................................................................................................................................................e....|.................................................................................................................................................................................. - // mul v23.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................................e|.................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................................e..|.................................................................................................................................................................................. - // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................*.................................................................................................................................................................................. 
- // sub v24.4s, v8.4s, v16.4s // ...........................................................................................................................................................................e................|.................................................................................................................................................................................. - // add v8.4s, v8.4s, v16.4s // .............................................................................................................................................................................e..............|.................................................................................................................................................................................. - // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...............*.................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................*................................................................................................................................................................ 
- // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......................*........................................................................................................................................................... - // sub v24.4s, v9.4s, v17.4s // ............................................................................................................................................................e...............................|.................................................................................................................................................................................. - // add v9.4s, v9.4s, v17.4s // ...........................................................................................................................................................e................................|.................................................................................................................................................................................. - // mul v17.4s, v24.4s, v0.s[0] // ................................................................................................................................................................................e...........|.................................................................................................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................................................................e.............|.................................................................................................................................................................................. - // mls v17.4s, v24.4s, v29.4s // ......................................................................................................................................................................................e.....|.................................................................................................................................................................................. - // sub v24.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|.............................................*.................................................................................................................................... - // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|..............................................*................................................................................................................................... 
- // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|................................................*................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................................................*................................................................................................................................ - // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.........................................................*........................................................................................................................ - // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|..*............................................................................................................................................................................... 
- // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|.*................................................................................................................................................................................ - // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................*............................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|...................*.............................................................................................................................................................. - // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................*......................................................................................................................................................... 
- // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................e.........|.................................................................................................................................................................................. - // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................................e........|.................................................................................................................................................................................. - // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|*................................................................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................................e...|.................................................................................................................................................................................. 
- // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......*........................................................................................................................................................................... - // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|....*............................................................................................................................................................................. - // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|.........*........................................................................................................................................................................ - // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|........*......................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.......*.......................................................................................................................................................................... - // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.............*.................................................................................................................................................................... - // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|........................................*......................................................................................................................................... - // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|.........................................*........................................................................................................................................ 
- // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................................................*............................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|............................................*..................................................................................................................................... - // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................................................*......................................................................................................................... - // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|...........*...................................................................................................................................................................... 
- // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|..........*....................................................................................................................................................................... - // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...................................................*.............................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|......................................................*........................................................................................................................... - // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|...................................................................*.............................................................................................................. 
- // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................|................................*................................................................................................................................................. - // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................|......................................*........................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................................*...................................................................................................................................... - // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................................*.................................................................................................................................. 
- // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.......................*.......................................................................................................................................................... - // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.........................*........................................................................................................................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................*...................................................................................................................................................... - // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................*.................................................................................................................................................. 
- // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................|...............................................................*.................................................................................................................. - // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................|..............................................................*................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..................................................................*............................................................................................................... - // mls v18.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................................|..............................................................................................................*................................................................... 
- // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................|.....................................................*............................................................................................................................ - // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................|.....................................................................................*............................................................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.........................................................................................*........................................................................................ - // mls v19.4s, v28.4s, v29.4s // ................*...........................................................................................................................................................................|...........................................................................................................*...................................................................... 
- // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................................|..............*................................................................................................................................................................... - // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................................|............*..................................................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|................*................................................................................................................................................................. - // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|....................*............................................................................................................................................................. 
- // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|..............................*................................................................................................................................................... - // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|.............................*.................................................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.................................*................................................................................................................................................ - // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.....................................*............................................................................................................................................ 
- // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................|.....................................................................*............................................................................................................ - // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................................|.........................................................................*........................................................................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|............................................................................*..................................................................................................... - // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.................................................................................*................................................................................................ 
- // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................................|.............................................................................*.................................................................................................... - // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................................|.......................................................................................*.......................................................................................... - // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................................|.............................................................................................*.................................................................................... - // mls v23.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................................|.................................................................................................*................................................................................ 
- // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................|....................................................*............................................................................................................................. - // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................|.......................................*.......................................................................................................................................... - // str q18, [x1, #(10*(512/8))] // .......................*....................................................................................................................................................................|..................................................................................................................*............................................................... - // str q19, [x1, #(11*(512/8))] // .........................*..................................................................................................................................................................|....................................................................................................................*............................................................. 
- // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................................|..........................*....................................................................................................................................................... - // str q21, [x1, #(13*(512/8))] // ............................................................................................................................................................................................|.......................................................*.......................................................................................................................... - // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................................|......................................................................................*........................................................................................... - // str q23, [x1, #(15*(512/8))] // ..........*.................................................................................................................................................................................|.....................................................................................................*............................................................................ 
- // mul v16.4s, v8.4s, v25.4s // .*..........................................................................................................................................................................................|............................................................................................*..................................................................................... - // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................................|............................................................*..................................................................................................................... - // mls v16.4s, v8.4s, v29.4s // .......*....................................................................................................................................................................................|..................................................................................................*............................................................................... - // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................|...................................*.............................................................................................................................................. 
- // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................|....................................*............................................................................................................................................. - // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................................|..........................................*....................................................................................................................................... - // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................................|..........................................................................................*....................................................................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................................|........................................................................................*......................................................................................... 
- // mls v18.4s, v10.4s, v29.4s // .....*......................................................................................................................................................................................|................................................................................................*................................................................................. - // mul v19.4s, v11.4s, v25.4s // ............*...............................................................................................................................................................................|.......................................................................................................*.......................................................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ........*...................................................................................................................................................................................|...................................................................................................*.............................................................................. - // mls v19.4s, v11.4s, v29.4s // ..................*.........................................................................................................................................................................|.............................................................................................................*.................................................................... 
- // mul v20.4s, v12.4s, v25.4s // ...........*................................................................................................................................................................................|......................................................................................................*........................................................................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................................|..................................................................................*............................................................................................... - // mls v20.4s, v12.4s, v29.4s // ...............................*............................................................................................................................................................|..........................................................................................................................*....................................................... - // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................................|..........................................................*....................................................................................................................... 
- // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................................|...........................................................*...................................................................................................................... - // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................................|................................................................*................................................................................................................. - // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................................|................................................................................*................................................................................................. - // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................................|...........................................................................*...................................................................................................... 
- // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................................|....................................................................................*............................................................................................. - // mul v23.4s, v15.4s, v25.4s // ...*........................................................................................................................................................................................|..............................................................................................*................................................................................... - // sqrdmulh v15.4s, v15.4s, v26.4s // ....*.......................................................................................................................................................................................|...............................................................................................*.................................................................................. - // mls v23.4s, v15.4s, v29.4s // .........*..................................................................................................................................................................................|....................................................................................................*............................................................................. 
- // cmge v27.4s, v31.4s, v16.4s // .............*..............................................................................................................................................................................|........................................................................................................*......................................................................... - // cmge v28.4s, v16.4s, v30.4s // .....................*......................................................................................................................................................................|................................................................................................................*................................................................. - // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................|........................................................................................................................*......................................................... - // mls v16.4s, v28.4s, v29.4s // ..................................*.........................................................................................................................................................|.............................................................................................................................*.................................................... 
- // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.................................................................*................................................................................................................ - // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.............................................................*.................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|....................................................................*............................................................................................................. - // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|........................................................................*......................................................................................................... 
- // cmge v27.4s, v31.4s, v18.4s // ......................................................................*.....................................................................................................................|.................................................................................................................................................................*................ - // cmge v28.4s, v18.4s, v30.4s // .......................................................................*....................................................................................................................|..................................................................................................................................................................*............... - // sub v28.4s, v27.4s, v28.4s // ...........................................................................*................................................................................................................|......................................................................................................................................................................*........... - // mls v18.4s, v28.4s, v29.4s // .................................................................................*..........................................................................................................|............................................................................................................................................................................*..... 
- // cmge v27.4s, v31.4s, v19.4s // ........................*...................................................................................................................................................................|...................................................................................................................*.............................................................. - // cmge v28.4s, v19.4s, v30.4s // ......................*.....................................................................................................................................................................|.................................................................................................................*................................................................ - // sub v28.4s, v27.4s, v28.4s // .....................................................*......................................................................................................................................|................................................................................................................................................*................................. - // mls v19.4s, v28.4s, v29.4s // ..........................................................*.................................................................................................................................|.....................................................................................................................................................*............................ 
- // cmge v27.4s, v31.4s, v20.4s // .......................................*....................................................................................................................................................|..................................................................................................................................*............................................... - // cmge v28.4s, v20.4s, v30.4s // ........................................*...................................................................................................................................................|...................................................................................................................................*.............................................. - // sub v28.4s, v27.4s, v28.4s // ............................................*...............................................................................................................................................|.......................................................................................................................................*.......................................... - // mls v20.4s, v28.4s, v29.4s // ...............................................................*............................................................................................................................|..........................................................................................................................................................*....................... 
- // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|.......................................................................*.......................................................................................................... - // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|......................................................................*........................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..........................................................................*....................................................................................................... - // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|..............................................................................*................................................................................................... 
- // cmge v27.4s, v31.4s, v22.4s // ..........................*.................................................................................................................................................................|.....................................................................................................................*............................................................ - // cmge v28.4s, v22.4s, v30.4s // ..............*.............................................................................................................................................................................|.........................................................................................................*........................................................................ - // sub v28.4s, v27.4s, v28.4s // ................................*...........................................................................................................................................................|...........................................................................................................................*...................................................... - // mls v22.4s, v28.4s, v29.4s // ....................................*.......................................................................................................................................................|...............................................................................................................................*.................................................. 
- // cmge v27.4s, v31.4s, v23.4s // .................*..........................................................................................................................................................................|............................................................................................................*..................................................................... - // cmge v28.4s, v23.4s, v30.4s // ...............*............................................................................................................................................................................|..........................................................................................................*....................................................................... - // sub v28.4s, v27.4s, v28.4s // ...........................*................................................................................................................................................................|......................................................................................................................*........................................................... - // mls v23.4s, v28.4s, v29.4s // ....................................................*.......................................................................................................................................|...............................................................................................................................................*.................................. 
- // str q16, [x1], #(16) // ..........................................*.................................................................................................................................................|.....................................................................................................................................*............................................ - // str q17, [x1, #(-16 + 1*(512/8))] // ............................................................................................................................................................................................|...............................................................................*.................................................................................................. - // str q18, [x1, #(-16 + 2*(512/8))] // ......................................................................................*.....................................................................................................|.................................................................................................................................................................................* - // str q19, [x1, #(-16 + 3*(512/8))] // ....................................................................*.......................................................................................................................|...............................................................................................................................................................*.................. 
- // str q20, [x1, #(-16 + 4*(512/8))] // ..........................................................................*.................................................................................................................|.....................................................................................................................................................................*............ - // str q21, [x1, #(-16 + 5*(512/8))] // ............................................................................................................................................................................................|...................................................................................*.............................................................................................. - // str q22, [x1, #(-16 + 6*(512/8))] // .............................................*..............................................................................................................................................|........................................................................................................................................*......................................... - // str q23, [x1, #(-16 + 7*(512/8))] // .............................................................*..............................................................................................................................|........................................................................................................................................................*......................... 
+ // ------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // str q15, [x1, #576] // ..*..................................................................................................................................................................................................................................................................................... + // add v15.4S, v18.4S, v14.4S // ............*........................................................................................................................................................................................................................................................................... + // sqrdmulh v28.4S, v28.4S, v0.S[1] // .............*.......................................................................................................................................................................................................................................................................... + // mls v17.4S, v21.4S, v29.4S // .*...................................................................................................................................................................................................................................................................................... 
+ // sub v21.4S, v11.4S, v24.4S // ....*................................................................................................................................................................................................................................................................................... + // add v11.4S, v11.4S, v24.4S // ...*.................................................................................................................................................................................................................................................................................... + // mls v22.4S, v19.4S, v29.4S // *....................................................................................................................................................................................................................................................................................... + // mls v20.4S, v28.4S, v29.4S // ...................*.................................................................................................................................................................................................................................................................... + // sub v28.4S, v18.4S, v14.4S // ...........*............................................................................................................................................................................................................................................................................ + // mul v24.4S, v23.4S, v0.S[2] // .....*.................................................................................................................................................................................................................................................................................. 
+ // sqrdmulh v14.4S, v23.4S, v0.S[3] // ......*................................................................................................................................................................................................................................................................................. + // sub v18.4S, v10.4S, v15.4S // ..........................*............................................................................................................................................................................................................................................................. + // add v10.4S, v10.4S, v15.4S // ...........................*............................................................................................................................................................................................................................................................ + // sqrdmulh v15.4S, v28.4S, v1.S[1] // ..................*..................................................................................................................................................................................................................................................................... + // mul v28.4S, v28.4S, v1.S[0] // .................*...................................................................................................................................................................................................................................................................... + // sub v23.4S, v22.4S, v17.4S // ........*............................................................................................................................................................................................................................................................................... 
+ // cmge v19.4S, v20.4S, v30.4S // ..................................................*..................................................................................................................................................................................................................................... + // mls v24.4S, v14.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + // cmge v14.4S, v31.4S, v20.4S // ................................................*....................................................................................................................................................................................................................................... + // mls v28.4S, v15.4S, v29.4S // .......................*................................................................................................................................................................................................................................................................ + // add v15.4S, v22.4S, v17.4S // .......*................................................................................................................................................................................................................................................................................ + // sub v19.4S, v14.4S, v19.4S // .....................................................*.................................................................................................................................................................................................................................. 
+ // sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........................................*............................................................................................................................................................................................................................................. + // mul v17.4S, v21.4S, v0.S[0] // .........*.............................................................................................................................................................................................................................................................................. + // sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........*............................................................................................................................................................................................................................................................................. + // add v14.4S, v24.4S, v28.4S // ..............................*......................................................................................................................................................................................................................................................... + // sub v28.4S, v24.4S, v28.4S // ...............................*........................................................................................................................................................................................................................................................ + // sub v24.4S, v16.4S, v13.4S // .....................................*.................................................................................................................................................................................................................................................. 
+ // mls v20.4S, v19.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + // mls v17.4S, v21.4S, v29.4S // ...............*........................................................................................................................................................................................................................................................................ + // sqrdmulh v21.4S, v9.4S, v26.4S // ..............*......................................................................................................................................................................................................................................................................... + // mul v9.4S, v9.4S, v25.4S // ................*....................................................................................................................................................................................................................................................................... + // sqrdmulh v19.4S, v15.4S, v26.4S // ..............................................................................*......................................................................................................................................................................................................... + // add v13.4S, v16.4S, v13.4S // .............................*.......................................................................................................................................................................................................................................................... 
+ // cmge v16.4S, v17.4S, v30.4S // ........................*............................................................................................................................................................................................................................................................... + // str q20, [x1, #768] // ...................................................................*.................................................................................................................................................................................................................... + // cmge v20.4S, v31.4S, v17.4S // ......................*................................................................................................................................................................................................................................................................. + // mls v9.4S, v21.4S, v29.4S // .....................*.................................................................................................................................................................................................................................................................. + // mul v15.4S, v15.4S, v25.4S // .........................................................................................*.............................................................................................................................................................................................. + // sub v16.4S, v20.4S, v16.4S // ............................*........................................................................................................................................................................................................................................................... 
+ // sqrdmulh v20.4S, v12.4S, v26.4S // ......................................................*................................................................................................................................................................................................................................. + // sqrdmulh v21.4S, v13.4S, v26.4S // .................................*...................................................................................................................................................................................................................................................... + // mul v13.4S, v13.4S, v25.4S // ............................................*........................................................................................................................................................................................................................................... + // mls v15.4S, v19.4S, v29.4S // ......................................................................................................*................................................................................................................................................................................. + // mul v12.4S, v12.4S, v25.4S // .......................................................*................................................................................................................................................................................................................................ + // sqrdmulh v19.4S, v23.4S, v0.S[1] // ....................................................*................................................................................................................................................................................................................................... 
+ // mul v23.4S, v23.4S, v0.S[0] // ........................................................*............................................................................................................................................................................................................................... + // mls v17.4S, v16.4S, v29.4S // ................................*....................................................................................................................................................................................................................................................... + // mls v13.4S, v21.4S, v29.4S // .................................................*...................................................................................................................................................................................................................................... + // mls v12.4S, v20.4S, v29.4S // .........................................................*.............................................................................................................................................................................................................................. + // mul v16.4S, v27.4S, v0.S[0] // .........................................*.............................................................................................................................................................................................................................................. + // sqrdmulh v27.4S, v28.4S, v0.S[1] // ...............................................................................*........................................................................................................................................................................................................ 
+ // mls v23.4S, v19.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + // str q17, [x1, #704] // ......................................*................................................................................................................................................................................................................................................. + // cmge v20.4S, v31.4S, v13.4S // ..........................................................................................................................................*............................................................................................................................................. + // mul v28.4S, v28.4S, v0.S[0] // ...........................................................................*............................................................................................................................................................................................................ + // mls v28.4S, v27.4S, v29.4S // ....................................................................................*................................................................................................................................................................................................... + // mul v27.4S, v24.4S, v0.S[0] // ...................................................*.................................................................................................................................................................................................................................... 
+ // sqrdmulh v24.4S, v24.4S, v0.S[1] // .............................................................*.......................................................................................................................................................................................................................... + // cmge v19.4S, v13.4S, v30.4S // ............................................................................................................................*........................................................................................................................................................... + // sqrdmulh v21.4S, v18.4S, v0.S[1] // .......................................*................................................................................................................................................................................................................................................ + // mul v18.4S, v18.4S, v0.S[0] // ........................................*............................................................................................................................................................................................................................................... + // cmge v17.4S, v31.4S, v28.4S // ............................................................................................*........................................................................................................................................................................................... + // sub v20.4S, v20.4S, v19.4S // ......................................................................................................................................................*................................................................................................................................. 
+ // mls v27.4S, v24.4S, v29.4S // .....................................................................*.................................................................................................................................................................................................................. + // cmge v24.4S, v28.4S, v30.4S // ...........................................................................................*............................................................................................................................................................................................ + // mls v13.4S, v20.4S, v29.4S // .................................................................................................................................................................*...................................................................................................................... + // mls v18.4S, v21.4S, v29.4S // ..............................................*......................................................................................................................................................................................................................................... + // mul v20.4S, v8.4S, v25.4S // ..........................................................................................................*............................................................................................................................................................................. + // sub v24.4S, v17.4S, v24.4S // .................................................................................................*...................................................................................................................................................................................... 
+ // cmge v21.4S, v12.4S, v30.4S // ...............................................................*........................................................................................................................................................................................................................ + // sqrdmulh v19.4S, v8.4S, v26.4S // .......................................................................................................*................................................................................................................................................................................ + // mls v28.4S, v24.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + // str q13, [x1, #320] // ..............................................................................................................................................................................*......................................................................................................... + // cmge v13.4S, v31.4S, v9.4S // ......................................................................*................................................................................................................................................................................................................. + // sqrdmulh v17.4S, v10.4S, v26.4S // ..........................................................*............................................................................................................................................................................................................................. 
+ // mul v10.4S, v10.4S, v25.4S // ...........................................................*............................................................................................................................................................................................................................ + // mls v20.4S, v19.4S, v29.4S // .......................................................................................................................*................................................................................................................................................................ + // sqrdmulh v8.4S, v14.4S, v26.4S // ...................................*.................................................................................................................................................................................................................................................... + // mul v14.4S, v14.4S, v25.4S // ....................................*................................................................................................................................................................................................................................................... + // str q28, [x1, #896] // ...............................................................................................................*........................................................................................................................................................................ + // mls v16.4S, v22.4S, v29.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ // cmge v24.4S, v15.4S, v30.4S // ......................................................................................................................*................................................................................................................................................................. + // mls v10.4S, v17.4S, v29.4S // ................................................................*....................................................................................................................................................................................................................... + // mls v14.4S, v8.4S, v29.4S // ...........................................*............................................................................................................................................................................................................................................ + // cmge v8.4S, v9.4S, v30.4S // ....................................................................*................................................................................................................................................................................................................... + // cmge v17.4S, v16.4S, v30.4S // .................................................................................................................*...................................................................................................................................................................... + // sqrdmulh v19.4S, v11.4S, v26.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ // sub v22.4S, v13.4S, v8.4S // .........................................................................*.............................................................................................................................................................................................................. + // mul v11.4S, v11.4S, v25.4S // ..................................*..................................................................................................................................................................................................................................................... + // cmge v28.4S, v31.4S, v14.4S // ....................................................................................................................*................................................................................................................................................................... + // cmge v13.4S, v14.4S, v30.4S // ..................................................................................*..................................................................................................................................................................................................... + // cmge v8.4S, v31.4S, v12.4S // ..............................................................*......................................................................................................................................................................................................................... + // mls v9.4S, v22.4S, v29.4S // .................................................................................*...................................................................................................................................................................................................... 
+ // sub v13.4S, v28.4S, v13.4S // ........................................................................................................................*............................................................................................................................................................... + // mls v11.4S, v19.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + // sub v21.4S, v8.4S, v21.4S // ..................................................................*..................................................................................................................................................................................................................... + // cmge v22.4S, v20.4S, v30.4S // ...............................................................................................................................*........................................................................................................................................................ + // str q9, [x1, #64] // .....................................................................................*.................................................................................................................................................................................................. + // cmge v8.4S, v31.4S, v20.4S // ..............................................................................................................................................*......................................................................................................................................... 
+ // cmge v9.4S, v31.4S, v16.4S // ...........................................................................................................*............................................................................................................................................................................ + // cmge v19.4S, v31.4S, v11.4S // ..........................................................................*............................................................................................................................................................................................................. + // cmge v28.4S, v11.4S, v30.4S // ............................................................................*........................................................................................................................................................................................................... + // sub v17.4S, v9.4S, v17.4S // .....................................................................................................................*.................................................................................................................................................................. + // mls v12.4S, v21.4S, v29.4S // .......................................................................*................................................................................................................................................................................................................ + // sub v28.4S, v19.4S, v28.4S // ................................................................................*....................................................................................................................................................................................................... 
+ // sub v21.4S, v8.4S, v22.4S // ...................................................................................................................................................*.................................................................................................................................... + // mls v16.4S, v17.4S, v29.4S // ...........................................................................................................................................*............................................................................................................................................ + // cmge v19.4S, v10.4S, v30.4S // ......................................................................................*................................................................................................................................................................................................. + // mls v11.4S, v28.4S, v29.4S // ........................................................................................*............................................................................................................................................................................................... + // str q12, [x1, #256] // .............................................................................*.......................................................................................................................................................................................................... + // mls v20.4S, v21.4S, v29.4S // ..............................................................................................................................................................*......................................................................................................................... 
+ // str q11, [x1, #192] // ...............................................................................................*........................................................................................................................................................................................ + // cmge v21.4S, v23.4S, v30.4S // ........................................................................*............................................................................................................................................................................................................... + // cmge v11.4S, v31.4S, v23.4S // ...................................................................................*.................................................................................................................................................................................................... + // mls v14.4S, v13.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + // cmge v22.4S, v27.4S, v30.4S // ...................................................................................................*.................................................................................................................................................................................... + // str q20, [x1], #(16) // .....................................................................................................................................................................*.................................................................................................................. 
+ // sub v21.4S, v11.4S, v21.4S // .......................................................................................*................................................................................................................................................................................................ + // str q16, [x1, #496] // ................................................................................................................................................*....................................................................................................................................... + // cmge v17.4S, v31.4S, v27.4S // ...........................................................................................................................*............................................................................................................................................................ + // cmge v20.4S, v18.4S, v30.4S // .............................................................................................................*.......................................................................................................................................................................... + // cmge v13.4S, v31.4S, v10.4S // ..........................................................................................*............................................................................................................................................................................................. + // str q14, [x1, #368] // ..................................................................................................................................*..................................................................................................................................................... 
+ // mls v23.4S, v21.4S, v29.4S // .............................................................................................*.......................................................................................................................................................................................... + // sub v8.4S, v17.4S, v22.4S // ................................................................................................................................*....................................................................................................................................................... + // sub v22.4S, v13.4S, v19.4S // ..............................................................................................*......................................................................................................................................................................................... + // cmge v21.4S, v31.4S, v18.4S // .................................................................................................................................*...................................................................................................................................................... + // cmge v11.4S, v31.4S, v15.4S // ................................................................................................................*....................................................................................................................................................................... + // mls v27.4S, v8.4S, v29.4S // .....................................................................................................................................*.................................................................................................................................................. 
+ // str q23, [x1, #944] // ........................................................................................................*............................................................................................................................................................................... + // mls v10.4S, v22.4S, v29.4S // .....................................................................................................*.................................................................................................................................................................................. + // sub v21.4S, v21.4S, v20.4S // .........................................................................................................................................*.............................................................................................................................................. + // sub v22.4S, v11.4S, v24.4S // ..........................................................................................................................*............................................................................................................................................................. + // str q27, [x1, #816] // .........................................................................................................................................................*.............................................................................................................................. + // mls v18.4S, v21.4S, v29.4S // ....................................................................................................................................................*................................................................................................................................... 
+ // str q10, [x1, #112] // ............................................................................................................*........................................................................................................................................................................... + // mls v15.4S, v22.4S, v29.4S // ..............................................................................................................................*......................................................................................................................................................... + // str q18, [x1, #624] // ............................................................................................................................................................*........................................................................................................................... + // str q15, [x1, #432] // .......................................................................................................................................*................................................................................................................................................ + // ldr q18, [x1, #64] // ....................................................................................................*................................................................................................................................................................................... + // ldr q12, [x1] // ................................................................................................*....................................................................................................................................................................................... 
+ // ldr q17, [x1, #384] // ........................................................................................................................................*............................................................................................................................................... + // ldr q8, [x1, #448] // ..................................................................................................................*..................................................................................................................................................................... + // ldr q27, [x1, #128] // ..................................................................................................*..................................................................................................................................................................................... + // ldr q10, [x1, #192] // .........................................................................................................................*.............................................................................................................................................................. + // add v14.4S, v12.4S, v18.4S // ..............................................................................................................*......................................................................................................................................................................... + // ldr q21, [x1, #320] // ...............................................................................................................................................*........................................................................................................................................ 
+ // sub v11.4S, v12.4S, v18.4S // ...................................................................................................................*.................................................................................................................................................................... + // sub v23.4S, v17.4S, v8.4S // ..................................................................................................................................................*..................................................................................................................................... + // add v19.4S, v17.4S, v8.4S // ..........................................................................................................................................................*............................................................................................................................. + // sqrdmulh v22.4S, v11.4S, v3.S[3] // ....................................................................................................................................*................................................................................................................................................... + // mul v12.4S, v11.4S, v3.S[2] // ......................................................................................................................................*................................................................................................................................................. + // ldr q18, [x1, #576] // ........................................................................................................................................................*............................................................................................................................... 
+ // sub v17.4S, v27.4S, v10.4S // ...................................................................................................................................*.................................................................................................................................................... + // add v11.4S, v27.4S, v10.4S // .............................................................................................................................................*.......................................................................................................................................... + // ldr q10, [x1, #256] // ............................................................................................................................................*........................................................................................................................................... + // mul v15.4S, v23.4S, v5.S[0] // .....................................................................................................................................................*.................................................................................................................................. + // add v20.4S, v14.4S, v11.4S // ...................................................................................................................................................................*.................................................................................................................... + // mls v12.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ // ldr q16, [x1, #512] // .......................................................................................................................................................*................................................................................................................................ + // sqrdmulh v8.4S, v17.4S, v4.S[1] // ................................................................................................................................................................*....................................................................................................................... + // sub v22.4S, v14.4S, v11.4S // .........................................................................................................................................................................*.............................................................................................................. + // add v11.4S, v10.4S, v21.4S // .............................................................................................................................................................................*.......................................................................................................... + // mul v27.4S, v17.4S, v4.S[0] // .............................................................................................................................................................*.......................................................................................................................... + // sqrdmulh v23.4S, v23.4S, v5.S[1] // ....................................................................................................................................................................*................................................................................................................... 
+ // sub v28.4S, v10.4S, v21.4S // ...........................................................................................................................................................*............................................................................................................................ + // ldr q17, [x1, #640] // ..............................................................................................................................................................................................*......................................................................................... + // ldr q24, [x1, #960] // ...............................................................................................................................................................................................................*........................................................................ + // sub v9.4S, v16.4S, v18.4S // ..................................................................................................................................................................*..................................................................................................................... + // sub v21.4S, v11.4S, v19.4S // ................................................................................................................................................................................................*....................................................................................... + // ldr q10, [x1, #704] // .................................................................................................................................................................................................*...................................................................................... 
+ // mul v13.4S, v28.4S, v4.S[2] // ...............................................................................................................................................................*........................................................................................................................ + // sqrdmulh v14.4S, v28.4S, v4.S[3] // ........................................................................................................................................................................*............................................................................................................... + // add v16.4S, v16.4S, v18.4S // ..........................................................................................................................................................................................................*............................................................................. + // sqrdmulh v18.4S, v9.4S, v5.S[3] // ......................................................................................................................................................................*................................................................................................................. + // add v28.4S, v11.4S, v19.4S // ......................................................................................................................................................................................*................................................................................................. + // mul v11.4S, v9.4S, v5.S[2] // ............................................................................................................................................................................*........................................................................................................... 
+ // add v19.4S, v17.4S, v10.4S // ........................................................................................................................................................................................................*............................................................................... + // mls v27.4S, v8.4S, v29.4S // .......................................................................................................................................................................*................................................................................................................ + // add v8.4S, v20.4S, v28.4S // ...........................................................................................................................................................................................*............................................................................................ + // sub v9.4S, v20.4S, v28.4S // .........................................................................................................................................................................................*.............................................................................................. + // mls v11.4S, v18.4S, v29.4S // ..........................................................................................................................................................................................*............................................................................................. + // sub v18.4S, v16.4S, v19.4S // ..................................................................................................................................................................................................................*..................................................................... 
+ // ldr q20, [x1, #768] // ...........................................................................................................................................................................*............................................................................................................ + // sub v10.4S, v17.4S, v10.4S // ....................................................................................................................................................................................................*................................................................................... + // mls v15.4S, v23.4S, v29.4S // ..................................................................................................................................................................................*..................................................................................................... + // sqrdmulh v23.4S, v18.4S, v2.S[3] // ................................................................................................................................................................................................................................................*....................................... + // mul v18.4S, v18.4S, v2.S[2] // ...............................................................................................................................................................................................................................................*........................................ + // mul v28.4S, v10.4S, v6.S[0] // .....................................................................................................................................................................................................................*.................................................................. 
+ // sqrdmulh v17.4S, v10.4S, v6.S[1] // ...................................................................................................................................................................................................................*.................................................................... + // mls v13.4S, v14.4S, v29.4S // .................................................................................................................................................................................*...................................................................................................... + // ldr q10, [x1, #832] // ..........................................................................................................................................................................*............................................................................................................. + // mul v14.4S, v21.4S, v2.S[0] // .......................................................................................................................................................................................................*................................................................................ + // mls v18.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................................*........................... + // sqrdmulh v21.4S, v21.4S, v2.S[1] // ......................................................................................................................................................................................................*................................................................................. 
+ // add v19.4S, v16.4S, v19.4S // .................................................................................................................................................................................................................*...................................................................... + // sub v16.4S, v12.4S, v27.4S // ...............................................................................................................................................................................................*........................................................................................ + // add v12.4S, v12.4S, v27.4S // .......................................................................................................................................................................................*................................................................................................ + // mul v27.4S, v22.4S, v1.S[2] // ...............................................................................................................................................................................*........................................................................................................ + // sqrdmulh v23.4S, v22.4S, v1.S[3] // ................................................................................................................................................................................*....................................................................................................... + // mul v22.4S, v16.4S, v1.S[2] // ..............................................................................................................................................................................................................*......................................................................... 
+ // mls v14.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................*........................................................................... + // add v21.4S, v20.4S, v10.4S // ....................................................................................................................................................................................*................................................................................................... + // mls v28.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................................*............................................................ + // sqrdmulh v16.4S, v16.4S, v1.S[3] // ...........................................................................................................................................................................................................*............................................................................ + // sub v17.4S, v20.4S, v10.4S // ...................................................................................................................................................................................*.................................................................................................... + // mls v27.4S, v23.4S, v29.4S // .....................................................................................................................................................................................*.................................................................................................. 
+ // ldr q20, [x1, #896] // .............................................................................................................................................................................................................*.......................................................................... + // sub v10.4S, v13.4S, v15.4S // .............................................................................................................................................................................................*.......................................................................................... + // add v13.4S, v13.4S, v15.4S // ........................................................................................................................................................................................*............................................................................................... + // mls v22.4S, v16.4S, v29.4S // ....................................................................................................................................................................................................................*................................................................... + // sqrdmulh v16.4S, v17.4S, v6.S[3] // .........................................................................................................................................................................................................*.............................................................................. + // mul v17.4S, v17.4S, v6.S[2] // ..................................................................................................................................................................................................*..................................................................................... 
+ // sub v15.4S, v12.4S, v13.4S // .....................................................................................................................................................................................................*.................................................................................. + // sub v23.4S, v20.4S, v24.4S // .......................................................................................................................................................................................................................*................................................................ + // add v20.4S, v20.4S, v24.4S // ........................................................................................................................................................................................................................*............................................................... + // add v13.4S, v12.4S, v13.4S // ............................................................................................................................................................................................*........................................................................................... + // add v12.4S, v11.4S, v28.4S // ..................................................................................................................................................................................................................................*..................................................... + // mul v24.4S, v23.4S, v7.S[0] // .......................................................................................................................................................................................................................................*................................................ 
+ // sqrdmulh v23.4S, v23.4S, v7.S[1] // ......................................................................................................................................................................................................................................*................................................. + // mls v17.4S, v16.4S, v29.4S // ................................................................................................................................................................................................................*....................................................................... + // sub v16.4S, v11.4S, v28.4S // .................................................................................................................................................................................................................................*...................................................... + // sqrdmulh v28.4S, v10.4S, v2.S[1] // ...................................................................................................................................................................................................*.................................................................................... + // mls v24.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................................*.......................................... + // sub v23.4S, v27.4S, v14.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ // mul v11.4S, v10.4S, v2.S[0] // ......................................................................................................................................................................................................................*................................................................. + // add v10.4S, v27.4S, v14.4S // .....................................................................................................................................................................................................................................*.................................................. + // sub v14.4S, v21.4S, v20.4S // ...............................................................................................................................................................................................................................*........................................................ + // add v20.4S, v21.4S, v20.4S // ..............................................................................................................................................................................................................................*......................................................... + // add v27.4S, v17.4S, v24.4S // ...................................................................................................................................................................................................................................................*.................................... + // sub v24.4S, v17.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... 
+ // mls v11.4S, v28.4S, v29.4S // ............................................................................................................................................................................................................................*........................................................... + // mul v17.4S, v16.4S, v2.S[2] // ..............................................................................................................................................................................................................................................*......................................... + // sqrdmulh v28.4S, v16.4S, v2.S[3] // .................................................................................................................................................................................................................................................*...................................... + // add v21.4S, v19.4S, v20.4S // .........................................................................................................................................................................................................................................*.............................................. + // sub v20.4S, v19.4S, v20.4S // ........................................................................................................................................................................................................................................*............................................... + // add v16.4S, v12.4S, v27.4S // ......................................................................................................................................................................................................................................................*................................. 
+ // sub v19.4S, v22.4S, v11.4S // ............................................................................................................................................................................................................................................*........................................... + // add v11.4S, v22.4S, v11.4S // ...........................................................................................................................................................................................................................................*............................................ + // mls v17.4S, v28.4S, v29.4S // ........................................................................................................................................................................................................................................................*............................... + // sub v28.4S, v12.4S, v27.4S // .......................................................................................................................................................................................................................................................*................................ + // sqrdmulh v22.4S, v24.4S, v3.S[1] // ...................................................................................................................................................................................................................................................................*.................... + // mul v24.4S, v24.4S, v3.S[0] // .....................................................................................................................................................................................................................................................................*.................. 
+ // mul v12.4S, v9.4S, v0.S[2] // .............................................................................................................................................................................................................................*.......................................................... + // sqrdmulh v9.4S, v9.4S, v0.S[3] // ....................................................................................................................................................................................................................................*................................................... + // sub v27.4S, v8.4S, v21.4S // ....................................................................................................................................................................................................................................................*................................... + // add v8.4S, v8.4S, v21.4S // .....................................................................................................................................................................................................................................................*.................................. + // mls v24.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................................................................*............. + // mul v21.4S, v20.4S, v1.S[0] // ................................................................................................................................................................................................................................................................*....................... 
+ // mls v12.4S, v9.4S, v29.4S // ..........................................................................................................................................................................................................................................*............................................. + // add v9.4S, v13.4S, v16.4S // ...........................................................................................................................................................................................................................................................*............................ + // sqrdmulh v20.4S, v20.4S, v1.S[1] // .................................................................................................................................................................................................................................................................*...................... + // sub v22.4S, v13.4S, v16.4S // ..........................................................................................................................................................................................................................................................*............................. + // sqrdmulh v13.4S, v15.4S, v0.S[3] // ..........................................................................................................................................................................................................................*............................................................. + // mul v16.4S, v15.4S, v0.S[2] // .........................................................................................................................................................................................................................*.............................................................. 
+ // mul v15.4S, v22.4S, v0.S[0] // .............................................................................................................................................................................................................................................................*.......................... + // sqrdmulh v22.4S, v22.4S, v0.S[1] // ..............................................................................................................................................................................................................................................................*......................... + // mls v21.4S, v20.4S, v29.4S // ....................................................................................................................................................................................................................................................................*................... + // sqrdmulh v20.4S, v28.4S, v1.S[1] // ......................................................................................................................................................................................................................................................................*................. + // mls v16.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + // mul v13.4S, v28.4S, v1.S[0] // .......................................................................................................................................................................................................................................................................*................ 
+ // mls v15.4S, v22.4S, v29.4S // ..................................................................................................................................................................................................................................................................*..................... + // sqrdmulh v22.4S, v14.4S, v3.S[1] // .........................................................................................................................................................................................................................................................*.............................. + // mul v14.4S, v14.4S, v3.S[0] // ...............................................................................................................................................................................................................................................................*........................ + // sub v28.4S, v12.4S, v21.4S // ................................................................................................................................................................................................................................................................................*....... + // add v12.4S, v12.4S, v21.4S // ...............................................................................................................................................................................................................................................................................*........ + // sub v21.4S, v17.4S, v24.4S // .................................................................................................................................................................................................................................................................................*...... 
+ // mls v13.4S, v20.4S, v29.4S // ............................................................................................................................................................................................................................................................................*........... + // cmge v20.4S, v15.4S, v30.4S // ...........................................................................................................................................................................................................................................................................*............ + // add v24.4S, v17.4S, v24.4S // ....................................................................................................................................................................................................................................................................................*... + // cmge v17.4S, v31.4S, v15.4S // .........................................................................................................................................................................................................................................................................*.............. + // mls v14.4S, v22.4S, v29.4S // ........................................................................................................................................................................................................................................................................*............... + // mul v22.4S, v19.4S, v0.S[2] // ..................................................................................................................................................................................................................................................................................*..... 
+ // sub v20.4S, v17.4S, v20.4S // ..............................................................................................................................................................................................................................................................................*......... + // sqrdmulh v19.4S, v19.4S, v0.S[3] // .............................................................................................................................................................................................................................................................................*.......... + // mul v17.4S, v21.4S, v1.S[0] // .....................................................................................................................................................................................................................................................................................*.. + // sqrdmulh v21.4S, v21.4S, v1.S[1] // ......................................................................................................................................................................................................................................................................................*. + // mls v15.4S, v20.4S, v29.4S // ...................................................................................................................................................................................................................................................................................*.... 
+ // mul v20.4S, v28.4S, v0.S[0] // .......................................................................................................................................................................................................................................................................................* sub count, count, #1 cbnz count, layer1234_start - mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ - mul v11.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... - add v28.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ - sub v23.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... - cmge v19.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
- cmge v18.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... - mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... - sub v13.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... - add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... - sub v28.4S, v18.4S, v19.4S // ......................................................................................................................................................................................*................................................................................................. - mul v19.4S, v13.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
- sqrdmulh v13.4S, v13.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. - sqrdmulh v18.4S, v23.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... - mul v23.4S, v23.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... - mls v27.4S, v28.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ - sub v28.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... - mls v19.4S, v13.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
- add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... - cmge v24.4S, v31.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... - mls v23.4S, v18.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... - sub v18.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ - add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... - cmge v20.4S, v19.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
- str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... - cmge v27.4S, v31.4S, v19.4S // ........................................................................................................................................................................................*............................................................................................... - mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. - sqrdmulh v22.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ - sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + str q15, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... 
+ add v15.4S, v18.4S, v14.4S // ...............................................................................................................................*........................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // ..............................................................................................................................................................*......................................................................................................................... + mls v17.4S, v21.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sub v21.4S, v11.4S, v24.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v24.4S // ........................................................................................................................................................*............................................................................................................................... + mls v22.4S, v19.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ mls v20.4S, v28.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v28.4S, v18.4S, v14.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v24.4S, v23.4S, v0.S[2] // .............................................................................................................*.......................................................................................................................................................................... + sqrdmulh v14.4S, v23.4S, v0.S[3] // ............................................................................................................*........................................................................................................................................................................... + sub v18.4S, v10.4S, v15.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v15.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ sqrdmulh v15.4S, v28.4S, v1.S[1] // ................................................................................................................................*....................................................................................................................................................... + mul v28.4S, v28.4S, v1.S[0] // .................................................................................................................................*...................................................................................................................................................... + sub v23.4S, v22.4S, v17.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v19.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v24.4S, v14.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + cmge v14.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v28.4S, v15.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + add v15.4S, v22.4S, v17.4S // ............................................................................................................................................................................*........................................................................................................... + sub v19.4S, v14.4S, v19.4S // ..................................................................................................................................................................................................*..................................................................................... + sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........................................................................................................................................*............................................................................................................................................. + mul v17.4S, v21.4S, v0.S[0] // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v21.4S, v21.4S, v0.S[1] // .........................................................................................................................................................*.............................................................................................................................. 
+ add v14.4S, v24.4S, v28.4S // .......................................................................................................................................................................*................................................................................................................ + sub v28.4S, v24.4S, v28.4S // ......................................................................................................................................................................*................................................................................................................. + sub v24.4S, v16.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... + mls v20.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v17.4S, v21.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v21.4S, v9.4S, v26.4S // ...........................................................................................................................................................................................................................*............................................................ 
+ mul v9.4S, v9.4S, v25.4S // ............................................................................................................................................................................................................................*........................................................... + sqrdmulh v19.4S, v15.4S, v26.4S // .............................................................................................................................................................................................................................................*.......................................... + add v13.4S, v16.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... + cmge v16.4S, v17.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + cmge v20.4S, v31.4S, v17.4S // ............................................................................................................................................................................................*........................................................................................... 
+ mls v9.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mul v15.4S, v15.4S, v25.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v16.4S, v20.4S, v16.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v20.4S, v12.4S, v26.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v21.4S, v13.4S, v26.4S // .......................................................................................................................................................................................................................................*................................................ + mul v13.4S, v13.4S, v25.4S // ........................................................................................................................................................................................................................................*............................................... 
+ mls v15.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mul v12.4S, v12.4S, v25.4S // .....................................................................................................................................................................................................................................*.................................................. + sqrdmulh v19.4S, v23.4S, v0.S[1] // .............................................................................................................................................................................*.......................................................................................................... + mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................................*......................................................................................................... + mls v17.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v13.4S, v21.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
+ mls v12.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mul v16.4S, v27.4S, v0.S[0] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v27.4S, v28.4S, v0.S[1] // ........................................................................................................................................................................*............................................................................................................... + mls v23.4S, v19.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + str q17, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v20.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... 
+ mul v28.4S, v28.4S, v0.S[0] // .........................................................................................................................................................................*.............................................................................................................. + mls v28.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mul v27.4S, v24.4S, v0.S[0] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // ...................................................................................................................................................................*.................................................................................................................... + cmge v19.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sqrdmulh v21.4S, v18.4S, v0.S[1] // ....................................................................................................................................................*................................................................................................................................... 
+ mul v18.4S, v18.4S, v0.S[0] // .....................................................................................................................................................*.................................................................................................................................. + cmge v17.4S, v31.4S, v28.4S // ........................................................................................................................................................................................................*............................................................................... + sub v20.4S, v20.4S, v19.4S // ......................................................................................................................................................................................................................................................................*................. + mls v27.4S, v24.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v24.4S, v28.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v13.4S, v20.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ mls v18.4S, v21.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v20.4S, v8.4S, v25.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v24.4S, v17.4S, v24.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v21.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sqrdmulh v19.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................*............................................................... + mls v28.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
+ str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + cmge v13.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + sqrdmulh v17.4S, v10.4S, v26.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v10.4S, v10.4S, v25.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v20.4S, v19.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v8.4S, v14.4S, v26.4S // ..........................................................................................................................................................................................................................................*............................................. 
+ mul v14.4S, v14.4S, v25.4S // ...........................................................................................................................................................................................................................................*............................................ + str q28, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... - cmge v22.4S, v11.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... - mls v19.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ - add v27.4S, v14.4S, v23.4S // .......................................................................................................................................................................*................................................................................................................ 
- sub v20.4S, v14.4S, v23.4S // ......................................................................................................................................................................*................................................................................................................. - sub v23.4S, v24.4S, v22.4S // ..................................................................................................................................................................................................*..................................................................................... - cmge v14.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... - cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... - str q19, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... - sub v22.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ 
- mls v11.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... - sub v24.4S, v24.4S, v14.4S // ..................................................................................................................................................................................*..................................................................................................... - sqrdmulh v14.4S, v20.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. - mul v19.4S, v22.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. - sqrdmulh v23.4S, v22.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. - str q11, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... 
- add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... - mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... - mul v21.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... - mul v24.4S, v20.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... - mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... - mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
- str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... - mul v23.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... - sqrdmulh v22.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... - sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. - mls v24.4S, v14.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. - cmge v17.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... 
- sqrdmulh v16.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... - cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... - cmge v14.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. - cmge v12.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... - sub v28.4S, v17.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... - mls v23.4S, v22.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
- sub v17.4S, v12.4S, v14.4S // ..........................................................................................................................................................................................................*............................................................................. - mls v21.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. - mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ - sqrdmulh v14.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ - cmge v28.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... - cmge v12.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
- cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. - cmge v22.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... - mls v24.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ - mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ - str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... - sub v28.4S, v28.4S, v12.4S // ..............................................................................................................................................................................................................*......................................................................... 
- sub v12.4S, v22.4S, v16.4S // ......................................................................................................................................................................................................*................................................................................. - sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... - mul v22.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. - mls v21.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ - str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. - mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
- mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... - sqrdmulh v11.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... - mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... - mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... - str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. - cmge v12.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
- cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... - cmge v9.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... - cmge v24.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. - str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ - sub v12.4S, v27.4S, v12.4S // ......................................................................................................................................................................................................................................................*................................. - sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
- sub v24.4S, v9.4S, v24.4S // ..........................................................................................................................................................................................................................................................................*............. - mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... - mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ - mls v19.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... - mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... - sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... 
- mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. - mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ - str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... - sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... - mul v11.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... - mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
- mls v8.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ - cmge v9.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... - cmge v12.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... - mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. - sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ - sub v9.4S, v12.4S, v9.4S // ..................................................................................................................................................................................................................................................*..................................... 
- cmge v12.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... - cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... - mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... - mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... - cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... - cmge v14.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... 
- cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... - mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ - sub v28.4S, v12.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... - str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... - sub v15.4S, v14.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... - cmge v12.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
- mls v8.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ - cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. - str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. - cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... - cmge v28.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. - sub v24.4S, v12.4S, v14.4S // ......................................................................................................................................................................................................................................................................*................. 
- cmge v9.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................................................................*............................... - mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... - sub v13.4S, v23.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... - str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* - mls v21.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ - sub v22.4S, v9.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. 
- mls v19.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ - mls v11.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ - str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... - str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. - str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... - str q11, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
+ cmge v24.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v10.4S, v17.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mls v14.4S, v8.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v8.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v17.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + sqrdmulh v19.4S, v11.4S, v26.4S // .................................................................................................................................................................................................................................*...................................................... 
+ sub v22.4S, v13.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. + mul v11.4S, v11.4S, v25.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v28.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v13.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v8.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + mls v9.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ sub v13.4S, v28.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v11.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v21.4S, v8.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... + cmge v22.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + str q9, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + cmge v8.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................*....................................... 
+ cmge v9.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v19.4S, v31.4S, v11.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v28.4S, v11.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + sub v17.4S, v9.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... + mls v12.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v28.4S, v19.4S, v28.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ sub v21.4S, v8.4S, v22.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + cmge v19.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v11.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q12, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + mls v20.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ str q11, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + cmge v21.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v11.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + mls v14.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + cmge v22.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + str q20, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
+ sub v21.4S, v11.4S, v21.4S // ..............................................................................................................................................................................................................*......................................................................... + str q16, [x1, #496] // ................................................................................................................................................................................................................*....................................................................... + cmge v17.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v13.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + str q14, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ mls v23.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sub v8.4S, v17.4S, v22.4S // ......................................................................................................................................................................................................*................................................................................. + sub v22.4S, v13.4S, v19.4S // ..........................................................................................................................................................................................................................................................*............................. + cmge v21.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v11.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + mls v27.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
+ str q23, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + mls v10.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v21.4S, v21.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v22.4S, v11.4S, v24.4S // ..............................................................................................................................................................................................................................................................................*......... + str q27, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + mls v18.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + mls v15.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q18, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s index 12c85522..664ff2b7 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, 
\b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr 
qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -187,7 +173,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -198,7 +184,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] 
@@ -208,7 +194,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -216,7 +202,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -227,24 +213,30 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,509 +379,531 @@ _intt_dilithium_123_45678_manual_ld4_opt_a55: qform_root3_tw .req q7 .p2align 2 - ldr q3, [x4, #48] // ........................* - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q21, [x5, #16] // .*....................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q28, [x5, #32] // ..*...................... - // gap // ......................... - // gap // ......................... - // gap // ......................... 
- ldr q30, [x1, #0] // ...*..................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q12, [x2, #16] // .........*............... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q17, [x2, #32] // ..........*.............. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q7, [x1, #32] // .....*................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q2, [x5, #96] // ...............*......... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q14, [x5, #144] // ..................*...... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q5, [x5, #160] // ...................*..... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q29, [x5, #176] // ....................*.... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q23, [x4], #64 // .....................*... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q31, [x4, #-32] // .......................*. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q25, [x5, #128] // .................*....... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q1, [x1, #48] // ......*.................. - // gap // ......................... - // gap // ......................... - // gap // ......................... 
- ldr q11, [x5, #64] // .............*........... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q9, [x4, #-48] // ......................*.. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q19, [x5, #80] // ..............*.......... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q24, [x1, #16] // ....*.................... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q15, [x5], #(12*16) // *........................ - // gap // ......................... - // gap // ......................... - // gap // ......................... - trn1 v6.4S, v7.4S, v1.4S // ........*................ - // gap // ......................... - ldr q18, [x5, #-144] // ............*............ - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q13, [x2, #0] // .......*................. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q27, [x2, #48] // ...........*............. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr q4, [x5, #-80] // ................*........ - // gap // ......................... - - // original source code - // ldr q15, [x5], #(12*16) // ...................*..... - // ldr q21, [x5, #-176] // .*....................... - // ldr q28, [x5, #-160] // ..*...................... - // ldr q30, [x1, #0] // ...*..................... - // ldr q24, [x1, #16] // ..................*...... - // ldr q7, [x1, #32] // ......*.................. - // ldr q1, [x1, #48] // ..............*.......... - // ldr q13, [x2, #0] // ......................*.. 
- // trn1 v6.4S, v7.4S, v1.4S // ....................*.... - // ldr q12, [x2, #16] // ....*.................... - // ldr q17, [x2, #32] // .....*................... - // ldr q27, [x2, #48] // .......................*. - // ldr q18, [x5, #-144] // .....................*... - // ldr q11, [x5, #-128] // ...............*......... - // ldr q19, [x5, #-112] // .................*....... - // ldr q2, [x5, #-96] // .......*................. - // ldr q4, [x5, #-80] // ........................* - // ldr q25, [x5, #-64] // .............*........... - // ldr q14, [x5, #-48] // ........*................ - // ldr q5, [x5, #-32] // .........*............... - // ldr q29, [x5, #-16] // ..........*.............. - // ldr q23, [x4], #64 // ...........*............. - // ldr q9, [x4, #-48] // ................*........ - // ldr q31, [x4, #-32] // ............*............ - // ldr q3, [x4, #-16] // *........................ + // Instructions: 25 + // Expected cycles: 48 + // Expected IPC: 0.52 + // + // Wall time: 0.37s + // User time: 0.37s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x1, #48] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q27, [x1, #32] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q23, [x5], #(12*16) // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q0, [x5, #-176] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x5, #-160] // ..*........................... + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + ldr q10, [x1, #0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x1, #16] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x2, #0] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + trn1 v2.4S, v27.4S, v6.4S // ........*..................... + // gap // .............................. + ldr q1, [x2, #16] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q24, [x2, #32] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q18, [x2, #48] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q31, [x5, #-144] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x5, #-128] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x5, #-112] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x5, #-96] // ...............*.............. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + ldr q5, [x5, #-80] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q17, [x5, #-64] // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x5, #-48] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q3, [x5, #-32] // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q26, [x5, #-16] // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x4], #64 // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q9, [x4, #-48] // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q28, [x4, #-16] // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q30, [x4, #-32] // .......................*...... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q23, [x5], #(12*16) // ..*............................ + // ldr q0, [x5, #-176] // ...*........................... + // ldr q16, [x5, #-160] // ....*.......................... + // ldr q10, [x1, #0] // .....*......................... 
+ // ldr q12, [x1, #16] // ......*........................ + // ldr q27, [x1, #32] // .*............................. + // ldr q6, [x1, #48] // *.............................. + // ldr q4, [x2, #0] // .......*....................... + // trn1 v2.4S, v27.4S, v6.4S // ........*...................... + // ldr q1, [x2, #16] // .........*..................... + // ldr q24, [x2, #32] // ..........*.................... + // ldr q18, [x2, #48] // ...........*................... + // ldr q31, [x5, #-144] // ............*.................. + // ldr q25, [x5, #-128] // .............*................. + // ldr q20, [x5, #-112] // ..............*................ + // ldr q13, [x5, #-96] // ...............*............... + // ldr q5, [x5, #-80] // ................*.............. + // ldr q17, [x5, #-64] // .................*............. + // ldr q15, [x5, #-48] // ..................*............ + // ldr q3, [x5, #-32] // ...................*........... + // ldr q26, [x5, #-16] // ....................*.......... + // ldr q7, [x4], #64 // .....................*......... + // ldr q9, [x4, #-48] // ......................*........ + // ldr q30, [x4, #-32] // ........................*...... + // ldr q28, [x4, #-16] // .......................*....... sub count, count, #1 layer45678_start: - trn1 v0.4S, v30.4S, v24.4S // ....*......................................................................................................................................................................... 
+ // Instructions: 174 + // Expected cycles: 196 + // Expected IPC: 0.89 + // + // Wall time: 68.38s + // User time: 68.38s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + trn1 v22.4S, v10.4S, v12.4S // ....*......................................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v10.4S, v30.4S, v24.4S // .....*........................................................................................................................................................................ + trn2 v11.4S, v10.4S, v12.4S // .....*........................................................................................................................................................................ // gap // .............................................................................................................................................................................. - trn2 v30.4S, v7.4S, v1.4S // .......*...................................................................................................................................................................... + trn2 v10.4S, v27.4S, v6.4S // .......*...................................................................................................................................................................... 
// gap // .............................................................................................................................................................................. - trn2 v24.2D, v0.2D, v6.2D // ........*..................................................................................................................................................................... + trn2 v12.2D, v22.2D, v2.2D // ........*..................................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v0.2D, v0.2D, v6.2D // ..........*................................................................................................................................................................... + trn1 v22.2D, v22.2D, v2.2D // ..........*................................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v7.2D, v10.2D, v30.2D // .........*.................................................................................................................................................................... + trn2 v6.2D, v11.2D, v10.2D // .........*.................................................................................................................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v10.2D, v10.2D, v30.2D // ...........*.................................................................................................................................................................. + trn1 v11.2D, v11.2D, v10.2D // ...........*.................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v30.4S, v24.4S, v7.4S // ...................................*.......................................................................................................................................... + sub v10.4S, v12.4S, v6.4S // ...................................*.......................................................................................................................................... // gap // .............................................................................................................................................................................. - add v24.4S, v24.4S, v7.4S // ....................................*......................................................................................................................................... + add v12.4S, v12.4S, v6.4S // ....................................*......................................................................................................................................... // gap // .............................................................................................................................................................................. - sub v7.4S, v0.4S, v10.4S // ..............................*............................................................................................................................................... 
+ sub v6.4S, v22.4S, v11.4S // ..............................*............................................................................................................................................... // gap // .............................................................................................................................................................................. - add v0.4S, v0.4S, v10.4S // ...............................*.............................................................................................................................................. + add v22.4S, v22.4S, v11.4S // ...............................*.............................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v10.4S, v13.4S, v12.4S // ................*............................................................................................................................................................. + trn1 v11.4S, v4.4S, v1.4S // ................*............................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v1.4S, v13.4S, v12.4S // .................*............................................................................................................................................................ + trn2 v27.4S, v4.4S, v1.4S // .................*............................................................................................................................................................ 
// gap // .............................................................................................................................................................................. - trn1 v13.4S, v17.4S, v27.4S // ..................*........................................................................................................................................................... + trn1 v4.4S, v24.4S, v18.4S // ..................*........................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v6.4S, v17.4S, v27.4S // ...................*.......................................................................................................................................................... + trn2 v2.4S, v24.4S, v18.4S // ...................*.......................................................................................................................................................... // gap // .............................................................................................................................................................................. - mul v12.4S, v30.4S, v11.4S // .....................................*........................................................................................................................................ + sqrdmulh v1.4S, v10.4S, v20.4S // .....................................*........................................................................................................................................ // gap // .............................................................................................................................................................................. 
- sqrdmulh v30.4S, v30.4S, v19.4S // ......................................*....................................................................................................................................... + mul v10.4S, v10.4S, v25.4S // ......................................*....................................................................................................................................... // gap // .............................................................................................................................................................................. - sub v17.4S, v0.4S, v24.4S // ........................................*..................................................................................................................................... + sub v24.4S, v22.4S, v12.4S // ........................................*..................................................................................................................................... // gap // .............................................................................................................................................................................. - add v0.4S, v0.4S, v24.4S // .........................................*.................................................................................................................................... + add v22.4S, v22.4S, v12.4S // .........................................*.................................................................................................................................... // gap // .............................................................................................................................................................................. 
- mul v24.4S, v7.4S, v28.4S // ................................*............................................................................................................................................. + sqrdmulh v12.4S, v6.4S, v31.4S // ................................*............................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v7.4S, v7.4S, v18.4S // .................................*............................................................................................................................................ + mul v6.4S, v6.4S, v16.4S // .................................*............................................................................................................................................ // gap // .............................................................................................................................................................................. - trn2 v28.2D, v10.2D, v13.2D // ....................*......................................................................................................................................................... + trn2 v16.2D, v11.2D, v4.2D // ....................*......................................................................................................................................................... // gap // .............................................................................................................................................................................. 
- trn2 v27.2D, v1.2D, v6.2D // .....................*........................................................................................................................................................ + trn2 v18.2D, v27.2D, v2.2D // .....................*........................................................................................................................................................ // gap // .............................................................................................................................................................................. - trn1 v10.2D, v10.2D, v13.2D // ......................*....................................................................................................................................................... + trn1 v11.2D, v11.2D, v4.2D // ......................*....................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v1.2D, v1.2D, v6.2D // .......................*...................................................................................................................................................... + trn1 v27.2D, v27.2D, v2.2D // .......................*...................................................................................................................................................... // gap // .............................................................................................................................................................................. 
- mls v24.4S, v7.4S, v8.S[0] // ..................................*........................................................................................................................................... + mls v6.4S, v12.4S, v8.S[0] // ..................................*........................................................................................................................................... // gap // .............................................................................................................................................................................. - mls v12.4S, v30.4S, v8.S[0] // .......................................*...................................................................................................................................... + mls v10.4S, v1.4S, v8.S[0] // .......................................*...................................................................................................................................... // gap // .............................................................................................................................................................................. - mul v30.4S, v17.4S, v15.4S // ..........................................*................................................................................................................................... + sqrdmulh v12.4S, v24.4S, v0.4S // ..........................................*................................................................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v7.4S, v17.4S, v21.4S // ...........................................*.................................................................................................................................. + mul v4.4S, v24.4S, v23.4S // ...........................................*.................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v13.4S, v10.4S, v1.4S // ........................................................*..................................................................................................................... + sub v2.4S, v11.4S, v27.4S // ........................................................*..................................................................................................................... // gap // .............................................................................................................................................................................. - sub v6.4S, v24.4S, v12.4S // .............................................*................................................................................................................................ + sub v1.4S, v6.4S, v10.4S // .............................................*................................................................................................................................ // gap // .............................................................................................................................................................................. 
- add v24.4S, v24.4S, v12.4S // ..............................................*............................................................................................................................... + add v10.4S, v6.4S, v10.4S // ..............................................*............................................................................................................................... // gap // .............................................................................................................................................................................. - mls v30.4S, v7.4S, v8.S[0] // ............................................*................................................................................................................................. + mls v4.4S, v12.4S, v8.S[0] // ............................................*................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v7.4S, v6.4S, v15.4S // ...............................................*.............................................................................................................................. + sqrdmulh v12.4S, v1.4S, v0.4S // ...............................................*.............................................................................................................................. // gap // .............................................................................................................................................................................. 
- sqrdmulh v6.4S, v6.4S, v21.4S // ................................................*............................................................................................................................. + mul v6.4S, v1.4S, v23.4S // ................................................*............................................................................................................................. // gap // .............................................................................................................................................................................. - add v10.4S, v10.4S, v1.4S // .........................................................*.................................................................................................................... + add v11.4S, v11.4S, v27.4S // .........................................................*.................................................................................................................... // gap // .............................................................................................................................................................................. - mul v1.4S, v13.4S, v25.4S // ..........................................................*................................................................................................................... + sqrdmulh v27.4S, v2.4S, v15.4S // ..........................................................*................................................................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v12.4S, v0.4S, v24.4S // ............................................................................*................................................................................................. + trn1 v1.4S, v22.4S, v10.4S // ............................................................................*................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v0.4S, v0.4S, v24.4S // .............................................................................*................................................................................................ + trn2 v22.4S, v22.4S, v10.4S // .............................................................................*................................................................................................ // gap // .............................................................................................................................................................................. - mls v7.4S, v6.4S, v8.S[0] // .................................................*............................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // .................................................*............................................................................................................................ // gap // .............................................................................................................................................................................. 
- sqrdmulh v24.4S, v13.4S, v14.4S // ...........................................................*.................................................................................................................. + mul v10.4S, v2.4S, v17.4S // ...........................................................*.................................................................................................................. // gap // .............................................................................................................................................................................. - sub v13.4S, v28.4S, v27.4S // .............................................................*................................................................................................................ + sub v12.4S, v16.4S, v18.4S // .............................................................*................................................................................................................ // gap // .............................................................................................................................................................................. - add v6.4S, v28.4S, v27.4S // ..............................................................*............................................................................................................... + add v2.4S, v16.4S, v18.4S // ..............................................................*............................................................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v17.4S, v30.4S, v7.4S // ..............................................................................*............................................................................................... + trn1 v24.4S, v4.4S, v6.4S // ..............................................................................*............................................................................................... // gap // .............................................................................................................................................................................. - mls v1.4S, v24.4S, v8.S[0] // ............................................................*................................................................................................................. + mls v10.4S, v27.4S, v8.S[0] // ............................................................*................................................................................................................. // gap // .............................................................................................................................................................................. - mul v24.4S, v13.4S, v5.4S // ...............................................................*.............................................................................................................. + sqrdmulh v27.4S, v12.4S, v26.4S // ...............................................................*.............................................................................................................. // gap // .............................................................................................................................................................................. 
- sqrdmulh v13.4S, v13.4S, v29.4S // ................................................................*............................................................................................................. + mul v12.4S, v12.4S, v3.4S // ................................................................*............................................................................................................. // gap // .............................................................................................................................................................................. - sub v28.4S, v10.4S, v6.4S // ..................................................................*........................................................................................................... + sub v16.4S, v11.4S, v2.4S // ..................................................................*........................................................................................................... // gap // .............................................................................................................................................................................. - add v10.4S, v10.4S, v6.4S // ...................................................................*.......................................................................................................... + add v11.4S, v11.4S, v2.4S // ...................................................................*.......................................................................................................... // gap // .............................................................................................................................................................................. 
- trn2 v30.4S, v30.4S, v7.4S // ...............................................................................*.............................................................................................. + trn2 v6.4S, v4.4S, v6.4S // ...............................................................................*.............................................................................................. // gap // .............................................................................................................................................................................. - mls v24.4S, v13.4S, v8.S[0] // .................................................................*............................................................................................................ + mls v12.4S, v27.4S, v8.S[0] // .................................................................*............................................................................................................ // gap // .............................................................................................................................................................................. - mul v7.4S, v28.4S, v2.4S // ....................................................................*......................................................................................................... + sqrdmulh v27.4S, v16.4S, v5.4S // ....................................................................*......................................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v13.4S, v28.4S, v4.4S // .....................................................................*........................................................................................................ + mul v4.4S, v16.4S, v13.4S // .....................................................................*........................................................................................................ // gap // .............................................................................................................................................................................. - trn2 v6.2D, v12.2D, v17.2D // ................................................................................*............................................................................................. + trn2 v2.2D, v1.2D, v24.2D // ................................................................................*............................................................................................. // gap // .............................................................................................................................................................................. - sub v28.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... + sub v16.4S, v10.4S, v12.4S // .......................................................................*...................................................................................................... // gap // .............................................................................................................................................................................. 
- add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... + add v10.4S, v10.4S, v12.4S // ........................................................................*..................................................................................................... // gap // .............................................................................................................................................................................. - mls v7.4S, v13.4S, v8.S[0] // ......................................................................*....................................................................................................... + mls v4.4S, v27.4S, v8.S[0] // ......................................................................*....................................................................................................... // gap // .............................................................................................................................................................................. - mul v1.4S, v28.4S, v2.4S // .........................................................................*.................................................................................................... + sqrdmulh v12.4S, v16.4S, v5.4S // .........................................................................*.................................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v13.4S, v28.4S, v4.4S // ..........................................................................*................................................................................................... + mul v27.4S, v16.4S, v13.4S // ..........................................................................*................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v28.2D, v0.2D, v30.2D // .................................................................................*............................................................................................ + trn2 v16.2D, v22.2D, v6.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. - trn1 v12.2D, v12.2D, v17.2D // ..................................................................................*........................................................................................... + trn1 v1.2D, v1.2D, v24.2D // ..................................................................................*........................................................................................... // gap // .............................................................................................................................................................................. 
- trn1 v0.2D, v0.2D, v30.2D // ...................................................................................*.......................................................................................... + trn1 v22.2D, v22.2D, v6.2D // ...................................................................................*.......................................................................................... // gap // .............................................................................................................................................................................. - mls v1.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. + mls v27.4S, v12.4S, v8.S[0] // ...........................................................................*.................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v30.4S, v10.4S, v24.4S // ....................................................................................*......................................................................................... + trn1 v12.4S, v11.4S, v10.4S // ....................................................................................*......................................................................................... // gap // .............................................................................................................................................................................. 
- trn2 v10.4S, v10.4S, v24.4S // .....................................................................................*........................................................................................ + trn2 v11.4S, v11.4S, v10.4S // .....................................................................................*........................................................................................ // gap // .............................................................................................................................................................................. - sub v24.4S, v12.4S, v0.4S // ................................................................................................*............................................................................. + sub v10.4S, v1.4S, v22.4S // ................................................................................................*............................................................................. // gap // .............................................................................................................................................................................. - trn1 v13.4S, v7.4S, v1.4S // ......................................................................................*....................................................................................... + trn1 v6.4S, v4.4S, v27.4S // ......................................................................................*....................................................................................... // gap // .............................................................................................................................................................................. - trn2 v7.4S, v7.4S, v1.4S // .......................................................................................*...................................................................................... 
+ trn2 v27.4S, v4.4S, v27.4S // .......................................................................................*...................................................................................... // gap // .............................................................................................................................................................................. - add v0.4S, v12.4S, v0.4S // .................................................................................................*............................................................................ + add v22.4S, v1.4S, v22.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. - trn2 v1.2D, v30.2D, v13.2D // ........................................................................................*..................................................................................... + trn2 v4.2D, v12.2D, v6.2D // ........................................................................................*..................................................................................... // gap // .............................................................................................................................................................................. - trn2 v12.2D, v10.2D, v7.2D // .........................................................................................*.................................................................................... + trn2 v1.2D, v11.2D, v27.2D // .........................................................................................*.................................................................................... 
// gap // .............................................................................................................................................................................. - trn1 v30.2D, v30.2D, v13.2D // ..........................................................................................*................................................................................... + trn1 v12.2D, v12.2D, v6.2D // ..........................................................................................*................................................................................... // gap // .............................................................................................................................................................................. - trn1 v10.2D, v10.2D, v7.2D // ...........................................................................................*.................................................................................. + trn1 v11.2D, v11.2D, v27.2D // ...........................................................................................*.................................................................................. // gap // .............................................................................................................................................................................. - mul v7.4S, v24.4S, v9.S[2] // ..................................................................................................*........................................................................... + sqrdmulh v6.4S, v10.4S, v9.S[3] // ..................................................................................................*........................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v24.4S, v24.4S, v9.S[3] // ...................................................................................................*.......................................................................... + mul v10.4S, v10.4S, v9.S[2] // ...................................................................................................*.......................................................................... // gap // .............................................................................................................................................................................. - sub v13.4S, v6.4S, v28.4S // .....................................................................................................*........................................................................ + sub v27.4S, v2.4S, v16.4S // .....................................................................................................*........................................................................ // gap // .............................................................................................................................................................................. - add v6.4S, v6.4S, v28.4S // ......................................................................................................*....................................................................... + add v2.4S, v2.4S, v16.4S // ......................................................................................................*....................................................................... // gap // .............................................................................................................................................................................. 
- sub v17.4S, v30.4S, v10.4S // ..........................................................................................................*................................................................... + sub v24.4S, v12.4S, v11.4S // ..........................................................................................................*................................................................... // gap // .............................................................................................................................................................................. - mls v7.4S, v24.4S, v8.S[0] // ....................................................................................................*......................................................................... + mls v10.4S, v6.4S, v8.S[0] // ....................................................................................................*......................................................................... // gap // .............................................................................................................................................................................. - mul v24.4S, v13.4S, v31.S[0] // .......................................................................................................*...................................................................... + sqrdmulh v6.4S, v27.4S, v30.S[1] // .......................................................................................................*...................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v13.4S, v13.4S, v31.S[1] // ........................................................................................................*..................................................................... + mul v27.4S, v27.4S, v30.S[0] // ........................................................................................................*..................................................................... // gap // .............................................................................................................................................................................. - add v10.4S, v30.4S, v10.4S // ...........................................................................................................*.................................................................. + add v11.4S, v12.4S, v11.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. - mul v30.4S, v17.4S, v31.S[2] // ............................................................................................................*................................................................. + sqrdmulh v12.4S, v24.4S, v30.S[3] // ............................................................................................................*................................................................. // gap // .............................................................................................................................................................................. 
- sqrdmulh v17.4S, v17.4S, v31.S[3] // .............................................................................................................*................................................................ + mul v24.4S, v24.4S, v30.S[2] // .............................................................................................................*................................................................ // gap // .............................................................................................................................................................................. - mls v24.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + mls v27.4S, v6.4S, v8.S[0] // .........................................................................................................*.................................................................... // gap // .............................................................................................................................................................................. - sub v13.4S, v1.4S, v12.4S // ...............................................................................................................*.............................................................. + sub v6.4S, v4.4S, v1.4S // ...............................................................................................................*.............................................................. // gap // .............................................................................................................................................................................. 
- add v1.4S, v1.4S, v12.4S // ................................................................................................................*............................................................. + add v4.4S, v4.4S, v1.4S // ................................................................................................................*............................................................. // gap // .............................................................................................................................................................................. - mls v30.4S, v17.4S, v8.S[0] // ..............................................................................................................*............................................................... + mls v24.4S, v12.4S, v8.S[0] // ..............................................................................................................*............................................................... // gap // .............................................................................................................................................................................. - mul v12.4S, v13.4S, v3.S[0] // .................................................................................................................*............................................................ + sqrdmulh v12.4S, v6.4S, v28.S[1] // .................................................................................................................*............................................................ // gap // .............................................................................................................................................................................. 
- sqrdmulh v13.4S, v13.4S, v3.S[1] // ..................................................................................................................*........................................................... + mul v6.4S, v6.4S, v28.S[0] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. - sub v17.4S, v0.4S, v6.4S // ....................................................................................................................*......................................................... + sub v1.4S, v22.4S, v2.4S // ....................................................................................................................*......................................................... // gap // .............................................................................................................................................................................. - add v0.4S, v0.4S, v6.4S // .....................................................................................................................*........................................................ + add v22.4S, v22.4S, v2.4S // .....................................................................................................................*........................................................ // gap // .............................................................................................................................................................................. - sub v6.4S, v7.4S, v24.4S // .........................................................................................................................*.................................................... 
+ sub v2.4S, v10.4S, v27.4S // .........................................................................................................................*.................................................... // gap // .............................................................................................................................................................................. - mls v12.4S, v13.4S, v8.S[0] // ...................................................................................................................*.......................................................... + mls v6.4S, v12.4S, v8.S[0] // ...................................................................................................................*.......................................................... // gap // .............................................................................................................................................................................. - mul v13.4S, v17.4S, v23.S[2] // ......................................................................................................................*....................................................... + sqrdmulh v12.4S, v1.4S, v7.S[3] // ......................................................................................................................*....................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v17.4S, v17.4S, v23.S[3] // .......................................................................................................................*...................................................... 
+ mul v1.4S, v1.4S, v7.S[2] // .......................................................................................................................*...................................................... // gap // .............................................................................................................................................................................. - add v24.4S, v7.4S, v24.4S // ..........................................................................................................................*................................................... + add v10.4S, v10.4S, v27.4S // ..........................................................................................................................*................................................... // gap // .............................................................................................................................................................................. - mul v7.4S, v6.4S, v23.S[2] // ...........................................................................................................................*.................................................. + sqrdmulh v27.4S, v2.4S, v7.S[3] // ...........................................................................................................................*.................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v6.4S, v6.4S, v23.S[3] // ............................................................................................................................*................................................. 
+ mul v2.4S, v2.4S, v7.S[2] // ............................................................................................................................*................................................. // gap // .............................................................................................................................................................................. - mls v13.4S, v17.4S, v8.S[0] // ........................................................................................................................*..................................................... + mls v1.4S, v12.4S, v8.S[0] // ........................................................................................................................*..................................................... // gap // .............................................................................................................................................................................. - sub v17.4S, v10.4S, v1.4S // ..............................................................................................................................*............................................... + sub v12.4S, v11.4S, v4.4S // ..............................................................................................................................*............................................... // gap // .............................................................................................................................................................................. - add v10.4S, v10.4S, v1.4S // ...............................................................................................................................*.............................................. + add v11.4S, v11.4S, v4.4S // ...............................................................................................................................*.............................................. 
// gap // .............................................................................................................................................................................. - mls v7.4S, v6.4S, v8.S[0] // .............................................................................................................................*................................................ + mls v2.4S, v27.4S, v8.S[0] // .............................................................................................................................*................................................ // gap // .............................................................................................................................................................................. - mul v1.4S, v17.4S, v9.S[0] // ................................................................................................................................*............................................. + sqrdmulh v27.4S, v12.4S, v9.S[1] // ................................................................................................................................*............................................. // gap // .............................................................................................................................................................................. - sqrdmulh v6.4S, v17.4S, v9.S[1] // .................................................................................................................................*............................................ + mul v12.4S, v12.4S, v9.S[0] // .................................................................................................................................*............................................ // gap // .............................................................................................................................................................................. 
- sub v17.4S, v30.4S, v12.4S // ...................................................................................................................................*.......................................... + sub v4.4S, v24.4S, v6.4S // ...................................................................................................................................*.......................................... // gap // .............................................................................................................................................................................. - add v30.4S, v30.4S, v12.4S // ....................................................................................................................................*......................................... + add v6.4S, v24.4S, v6.4S // ....................................................................................................................................*......................................... // gap // .............................................................................................................................................................................. - srshr v12.4S, v0.4S, #23 // ........................................................................................................................................*..................................... + srshr v24.4S, v22.4S, #23 // ........................................................................................................................................*..................................... // gap // .............................................................................................................................................................................. - mls v1.4S, v6.4S, v8.S[0] // ..................................................................................................................................*........................................... 
+ mls v12.4S, v27.4S, v8.S[0] // ..................................................................................................................................*........................................... // gap // .............................................................................................................................................................................. - mul v6.4S, v17.4S, v9.S[0] // .....................................................................................................................................*........................................ + sqrdmulh v27.4S, v4.4S, v9.S[1] // .....................................................................................................................................*........................................ // gap // .............................................................................................................................................................................. - sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................................................*....................................... + mul v4.4S, v4.4S, v9.S[0] // ......................................................................................................................................*....................................... // gap // .............................................................................................................................................................................. - mls v0.4S, v12.4S, v8.4S // .........................................................................................................................................*.................................... 
+ mls v22.4S, v24.4S, v8.4S // .........................................................................................................................................*.................................... // gap // .............................................................................................................................................................................. - srshr v12.4S, v24.4S, #23 // ..........................................................................................................................................*................................... + srshr v24.4S, v10.4S, #23 // ..........................................................................................................................................*................................... // gap // .............................................................................................................................................................................. - srshr v28.4S, v10.4S, #23 // ............................................................................................................................................*................................. + srshr v16.4S, v11.4S, #23 // ............................................................................................................................................*................................. // gap // .............................................................................................................................................................................. - mls v6.4S, v17.4S, v8.S[0] // .......................................................................................................................................*...................................... + mls v4.4S, v27.4S, v8.S[0] // .......................................................................................................................................*...................................... 
// gap // .............................................................................................................................................................................. - mls v24.4S, v12.4S, v8.4S // ...........................................................................................................................................*.................................. + mls v10.4S, v24.4S, v8.4S // ...........................................................................................................................................*.................................. // gap // .............................................................................................................................................................................. - mls v10.4S, v28.4S, v8.4S // .............................................................................................................................................*................................ + mls v11.4S, v16.4S, v8.4S // .............................................................................................................................................*................................ // gap // .............................................................................................................................................................................. - srshr v12.4S, v30.4S, #23 // ..............................................................................................................................................*............................... + srshr v27.4S, v6.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. 
- sub v17.4S, v13.4S, v1.4S // ..........................................................................................................................................................*................... + sub v24.4S, v1.4S, v12.4S // ..........................................................................................................................................................*................... // gap // .............................................................................................................................................................................. - add v1.4S, v13.4S, v1.4S // ...........................................................................................................................................................*.................. + add v12.4S, v1.4S, v12.4S // ...........................................................................................................................................................*.................. // gap // .............................................................................................................................................................................. - mls v30.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + mls v6.4S, v27.4S, v8.4S // ...............................................................................................................................................*.............................. // gap // .............................................................................................................................................................................. - sub v13.4S, v0.4S, v10.4S // ................................................................................................................................................*............................. 
+ sub v27.4S, v22.4S, v11.4S // ................................................................................................................................................*............................. // gap // .............................................................................................................................................................................. - add v0.4S, v0.4S, v10.4S // .................................................................................................................................................*............................ + add v22.4S, v22.4S, v11.4S // .................................................................................................................................................*............................ // gap // .............................................................................................................................................................................. - mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................................*................. + sqrdmulh v11.4S, v24.4S, v7.S[1] // ............................................................................................................................................................*................. // gap // .............................................................................................................................................................................. - mul v12.4S, v13.4S, v23.S[0] // ..................................................................................................................................................*........................... 
+ sqrdmulh v1.4S, v27.4S, v7.S[1] // ..................................................................................................................................................*........................... // gap // .............................................................................................................................................................................. - sqrdmulh v13.4S, v13.4S, v23.S[1] // ...................................................................................................................................................*.......................... + mul v27.4S, v27.4S, v7.S[0] // ...................................................................................................................................................*.......................... // gap // .............................................................................................................................................................................. - sub v28.4S, v24.4S, v30.4S // .....................................................................................................................................................*........................ + sub v16.4S, v10.4S, v6.4S // .....................................................................................................................................................*........................ // gap // .............................................................................................................................................................................. - add v30.4S, v24.4S, v30.4S // ......................................................................................................................................................*....................... 
+ add v10.4S, v10.4S, v6.4S // ......................................................................................................................................................*....................... // gap // .............................................................................................................................................................................. - sqrdmulh v24.4S, v17.4S, v23.S[1] // .............................................................................................................................................................*................ + mul v6.4S, v24.4S, v7.S[0] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. - mls v12.4S, v13.4S, v8.S[0] // ....................................................................................................................................................*......................... + mls v27.4S, v1.4S, v8.S[0] // ....................................................................................................................................................*......................... // gap // .............................................................................................................................................................................. - mul v13.4S, v28.4S, v23.S[0] // .......................................................................................................................................................*...................... 
+ sqrdmulh v1.4S, v16.4S, v7.S[1] // .......................................................................................................................................................*...................... // gap // .............................................................................................................................................................................. - sqrdmulh v17.4S, v28.4S, v23.S[1] // ........................................................................................................................................................*..................... + mul v24.4S, v16.4S, v7.S[0] // ........................................................................................................................................................*..................... // gap // .............................................................................................................................................................................. - mls v10.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... + mls v6.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*............... // gap // .............................................................................................................................................................................. - sub v24.4S, v7.4S, v6.4S // ...............................................................................................................................................................*.............. 
+ sub v11.4S, v2.4S, v4.4S // ...............................................................................................................................................................*.............. // gap // .............................................................................................................................................................................. - add v7.4S, v7.4S, v6.4S // ................................................................................................................................................................*............. + add v4.4S, v2.4S, v4.4S // ................................................................................................................................................................*............. // gap // .............................................................................................................................................................................. - mls v13.4S, v17.4S, v8.S[0] // .........................................................................................................................................................*.................... + mls v24.4S, v1.4S, v8.S[0] // .........................................................................................................................................................*.................... // gap // .............................................................................................................................................................................. - mul v6.4S, v24.4S, v23.S[0] // .................................................................................................................................................................*............ 
+ sqrdmulh v2.4S, v11.4S, v7.S[1] // .................................................................................................................................................................*............ // gap // .............................................................................................................................................................................. - sqrdmulh v24.4S, v24.4S, v23.S[1] // ..................................................................................................................................................................*........... + mul v11.4S, v11.4S, v7.S[0] // ..................................................................................................................................................................*........... // gap // .............................................................................................................................................................................. - str q0, [x1], #(16*4) // ....................................................................................................................................................................*......... + str q22, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. - ldr q15, [x5], #(12*16) // ........................e..................................................................................................................................................... 
+ ldr q23, [x5], #(12*16) // ........................e..................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v6.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... + mls v11.4S, v2.4S, v8.S[0] // ...................................................................................................................................................................*.......... // gap // .............................................................................................................................................................................. - str q30, [x1, #-48] // .....................................................................................................................................................................*........ + str q10, [x1, #-48] // .....................................................................................................................................................................*........ // gap // .............................................................................................................................................................................. 
- ldr q21, [x5, #-176] // .........................e.................................................................................................................................................... + ldr q0, [x5, #-176] // .........................e.................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q1, [x1, #-32] // ......................................................................................................................................................................*....... + str q12, [x1, #-32] // ......................................................................................................................................................................*....... // gap // .............................................................................................................................................................................. - ldr q28, [x5, #-160] // ..........................e................................................................................................................................................... + ldr q16, [x5, #-160] // ..........................e................................................................................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q7, [x1, #-16] // .......................................................................................................................................................................*...... + str q4, [x1, #-16] // .......................................................................................................................................................................*...... add x1, x1, #64 // ............................................................................................................................................................................*. - ldr q30, [x1, #0] // e............................................................................................................................................................................. + ldr q10, [x1, #0] // e............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - str q12, [x2], #(16*4) // ........................................................................................................................................................................*..... + str q27, [x2], #(16*4) // ........................................................................................................................................................................*..... // gap // .............................................................................................................................................................................. - ldr q24, [x1, #16] // .e............................................................................................................................................................................ + ldr q12, [x1, #16] // .e............................................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q13, [x2, #-48] // .........................................................................................................................................................................*.... 
+ str q24, [x2, #-48] // .........................................................................................................................................................................*.... // gap // .............................................................................................................................................................................. - ldr q7, [x1, #32] // ..e........................................................................................................................................................................... + ldr q27, [x1, #32] // ..e........................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q10, [x2, #-32] // ..........................................................................................................................................................................*... + str q6, [x2, #-32] // ..........................................................................................................................................................................*... // gap // .............................................................................................................................................................................. 
- ldr q1, [x1, #48] // ...e.......................................................................................................................................................................... + ldr q6, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q6, [x2, #-16] // ...........................................................................................................................................................................*.. + str q11, [x2, #-16] // ...........................................................................................................................................................................*.. add x2, x2, #64 // .............................................................................................................................................................................* - ldr q13, [x2, #0] // ............e................................................................................................................................................................. + ldr q4, [x2, #0] // ............e................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v6.4S, v7.4S, v1.4S // ......e....................................................................................................................................................................... + trn1 v2.4S, v27.4S, v6.4S // ......e....................................................................................................................................................................... // gap // .............................................................................................................................................................................. - ldr q12, [x2, #16] // .............e................................................................................................................................................................ + ldr q1, [x2, #16] // .............e................................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q17, [x2, #32] // ..............e............................................................................................................................................................... + ldr q24, [x2, #32] // ..............e............................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q27, [x2, #48] // ...............e.............................................................................................................................................................. + ldr q18, [x2, #48] // ...............e.............................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q18, [x5, #-144] // ...........................e.................................................................................................................................................. + ldr q31, [x5, #-144] // ...........................e.................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q11, [x5, #-128] // ............................e................................................................................................................................................. + ldr q25, [x5, #-128] // ............................e................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q19, [x5, #-112] // .............................e................................................................................................................................................ + ldr q20, [x5, #-112] // .............................e................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q2, [x5, #-96] // ..................................................e........................................................................................................................... + ldr q13, [x5, #-96] // ..................................................e........................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q4, [x5, #-80] // ...................................................e.......................................................................................................................... + ldr q5, [x5, #-80] // ...................................................e.......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q25, [x5, #-64] // ....................................................e......................................................................................................................... + ldr q17, [x5, #-64] // ....................................................e......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q14, [x5, #-48] // .....................................................e........................................................................................................................ + ldr q15, [x5, #-48] // .....................................................e........................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q5, [x5, #-32] // ......................................................e....................................................................................................................... + ldr q3, [x5, #-32] // ......................................................e....................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - ldr q29, [x5, #-16] // .......................................................e...................................................................................................................... + ldr q26, [x5, #-16] // .......................................................e...................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q23, [x4], #64 // ............................................................................................e................................................................................. + ldr q7, [x4], #64 // ............................................................................................e................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. @@ -897,641 +911,655 @@ layer45678_start: // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q31, [x4, #-32] // ..............................................................................................e............................................................................... - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - ldr q3, [x4, #-16] // ...............................................................................................e.............................................................................. - // gap // .............................................................................................................................................................................. 
- // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - - // original source code - // ldr q9, [x1, #0] // ........e..........................|..................................................................................................................................................e........ - // ldr q10, [x1, #16] // ..........e........................|....................................................................................................................................................e...... - // ldr q11, [x1, #32] // ............e......................|......................................................................................................................................................e.... - // ldr q12, [x1, #48] // ..............e....................|........................................................................................................................................................e.. - // trn1 v25.4s, v9.4s, v10.4s // ...................................*........................................................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ...................................|*.......................................................................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..................e................|........................................................................................................................................................... 
- // trn2 v28.4s, v11.4s, v12.4s // ...................................|.*......................................................................................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // ...................................|..*........................................................................................................................................................ - // trn2 v12.2d, v26.2d, v28.2d // ...................................|....*...................................................................................................................................................... - // trn1 v9.2d, v25.2d, v27.2d // ...................................|...*....................................................................................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ...................................|.....*..................................................................................................................................................... - // ldr q13, [x2, #0] // .................e.................|........................................................................................................................................................... - // ldr q14, [x2, #16] // ...................e...............|........................................................................................................................................................... - // ldr q15, [x2, #32] // ....................e..............|........................................................................................................................................................... 
- // ldr q16, [x2, #48] // .....................e.............|........................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ...................................|..........*................................................................................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ...................................|...........*............................................................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...................................|............*.............................................................................................................................................. - // trn2 v28.4s, v15.4s, v16.4s // ...................................|.............*............................................................................................................................................. - // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................*...................................................................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................*..................................................................................................................................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................*.................................................................................................................................... 
- // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................*................................................................................................................................... - // ldr q0, [x5], #(12*16) // e..................................|..........................................................................................................................................e................ - // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................|.............................................................................................................................................e............. - // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................|...............................................................................................................................................e........... - // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............|........................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........|........................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........|........................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ...................................|........*.................................................................................................................................................. 
- // add v9.4s, v9.4s, v10.4s // ...................................|.........*................................................................................................................................................. - // mul v10.4s, v24.4s, v1.4s // ...................................|..................*........................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|...................*....................................................................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ...................................|........................*.................................................................................................................................. - // sub v24.4s, v11.4s, v12.4s // ...................................|......*.................................................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ...................................|.......*................................................................................................................................................... - // mul v12.4s, v24.4s, v2.4s // ...................................|..............*............................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|...............*........................................................................................................................................... 
- // mls v12.4s, v24.4s, v8.s[0] // ...................................|.........................*................................................................................................................................. - // sub v24.4s, v9.4s, v11.4s // ...................................|................*.......................................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ...................................|.................*......................................................................................................................................... - // mul v11.4s, v24.4s, v0.4s // ...................................|..........................*................................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...........................*............................................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...................................|...............................*........................................................................................................................... - // sub v24.4s, v10.4s, v12.4s // ...................................|.............................*............................................................................................................................. - // add v10.4s, v10.4s, v12.4s // ...................................|..............................*............................................................................................................................ 
- // mul v12.4s, v24.4s, v0.4s // ...................................|................................*.......................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.................................*......................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................|......................................*.................................................................................................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........|........................................................................................................................................................... - // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........|........................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......|........................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......|........................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....|........................................................................................................................................................... 
- // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....|........................................................................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ...................................|............................*.............................................................................................................................. - // add v13.4s, v13.4s, v14.4s // ...................................|..................................*........................................................................................................................ - // mul v14.4s, v24.4s, v1.4s // ...................................|...................................*....................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|.......................................*................................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................................|...........................................*............................................................................................................... - // sub v24.4s, v15.4s, v16.4s // ...................................|........................................*.................................................................................................................. - // add v15.4s, v15.4s, v16.4s // ...................................|.........................................*................................................................................................................. 
- // mul v16.4s, v24.4s, v2.4s // ...................................|............................................*.............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|.............................................*............................................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................*......................................................................................................... - // sub v24.4s, v13.4s, v15.4s // ...................................|..............................................*............................................................................................................ - // add v13.4s, v13.4s, v15.4s // ...................................|...............................................*........................................................................................................... - // mul v15.4s, v24.4s, v0.4s // ...................................|..................................................*........................................................................................................ - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...................................................*....................................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...................................|.......................................................*................................................................................................... 
- // sub v24.4s, v14.4s, v16.4s // ...................................|.....................................................*..................................................................................................... - // add v14.4s, v14.4s, v16.4s // ...................................|......................................................*.................................................................................................... - // mul v16.4s, v24.4s, v0.4s // ...................................|........................................................*.................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.........................................................*................................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ...................................|.............................................................*............................................................................................. - // trn1 v25.4s, v9.4s, v10.4s // ...................................|....................................*...................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ...................................|.....................................*..................................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ...................................|..........................................*................................................................................................................ 
- // trn2 v28.4s, v11.4s, v12.4s // ...................................|................................................*.......................................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // ...................................|....................................................*...................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ...................................|..........................................................*................................................................................................ - // trn1 v9.2d, v25.2d, v27.2d // ...................................|...........................................................*............................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ...................................|............................................................*.............................................................................................. - // trn1 v25.4s, v13.4s, v14.4s // ...................................|..............................................................*............................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ...................................|...............................................................*........................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...................................|.................................................................*......................................................................................... 
- // trn2 v28.4s, v15.4s, v16.4s // ...................................|..................................................................*........................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................................................................*...................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................................................................*..................................................................................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................................................................*.................................................................................... - // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................................................................*................................................................................... - // ldr q0, [x4], #64 // ...............................e...|........................................................................................................................................................... - // ldr q1, [x4, #(-64 + 16)] // ................................e..|........................................................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // .................................e.|........................................................................................................................................................... 
- // ldr q3, [x4, #(-64 + 48)] // ..................................e|........................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ...................................|................................................................*.......................................................................................... - // add v9.4s, v9.4s, v10.4s // ...................................|...................................................................*....................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ...................................|........................................................................*.................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................|.........................................................................*................................................................................. - // mls v10.4s, v24.4s, v8.s[0] // ...................................|.............................................................................*............................................................................. - // sub v24.4s, v11.4s, v12.4s // ...................................|..........................................................................*................................................................................ - // add v11.4s, v11.4s, v12.4s // ...................................|...........................................................................*............................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // ...................................|..............................................................................*............................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................|...............................................................................*........................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................|...................................................................................*....................................................................... - // sub v24.4s, v13.4s, v14.4s // ...................................|............................................................................*.............................................................................. - // add v13.4s, v13.4s, v14.4s // ...................................|................................................................................*.......................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ...................................|.................................................................................*......................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................|..................................................................................*........................................................................ - // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................*.................................................................... 
- // sub v24.4s, v15.4s, v16.4s // ...................................|....................................................................................*...................................................................... - // add v15.4s, v15.4s, v16.4s // ...................................|.....................................................................................*..................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ...................................|.......................................................................................*................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................|........................................................................................*.................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ...................................|............................................................................................*.............................................................. - // sub v24.4s, v9.4s, v11.4s // ...................................|.........................................................................................*................................................................. - // add v9.4s, v9.4s, v11.4s // ...................................|..........................................................................................*................................................................ - // mul v11.4s, v24.4s, v0.s[2] // ...................................|.............................................................................................*............................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|..............................................................................................*............................................................ - // mls v11.4s, v24.4s, v8.s[0] // ...................................|..................................................................................................*........................................................ - // sub v24.4s, v10.4s, v12.4s // ...................................|...........................................................................................*............................................................... - // add v10.4s, v10.4s, v12.4s // ...................................|...............................................................................................*........................................................... - // mul v12.4s, v24.4s, v0.s[2] // ...................................|................................................................................................*.......................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|.................................................................................................*......................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................................|.....................................................................................................*..................................................... - // sub v24.4s, v13.4s, v15.4s // ...................................|...................................................................................................*....................................................... 
- // add v13.4s, v13.4s, v15.4s // ...................................|....................................................................................................*...................................................... - // mul v15.4s, v24.4s, v1.s[0] // ...................................|......................................................................................................*.................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.......................................................................................................*................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...................................|...........................................................................................................*............................................... - // sub v24.4s, v14.4s, v16.4s // ...................................|........................................................................................................*.................................................. - // add v14.4s, v14.4s, v16.4s // ...................................|.........................................................................................................*................................................. - // mul v16.4s, v24.4s, v1.s[0] // ...................................|............................................................................................................*.............................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.............................................................................................................*............................................. 
- // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................................................................................*......................................... - // srshr v24.4S, v9.4S, #23 // ...................................|..........................................................................................................*................................................ - // mls v9.4s, v24.4s, v8.4s // ...................................|..............................................................................................................*............................................ - // srshr v24.4S, v10.4S, #23 // ...................................|...............................................................................................................*........................................... - // mls v10.4s, v24.4s, v8.4s // ...................................|..................................................................................................................*........................................ - // srshr v24.4S, v13.4S, #23 // ...................................|................................................................................................................*.......................................... - // mls v13.4s, v24.4s, v8.4s // ...................................|...................................................................................................................*....................................... - // srshr v24.4S, v14.4S, #23 // ...................................|....................................................................................................................*...................................... 
- // mls v14.4s, v24.4s, v8.4s // ...................................|.......................................................................................................................*................................... - // sub v24.4s, v9.4s, v13.4s // ...................................|........................................................................................................................*.................................. - // add v9.4s, v9.4s, v13.4s // ...................................|.........................................................................................................................*................................. - // mul v13.4s, v24.4s, v0.s[0] // ...................................|...........................................................................................................................*............................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|............................................................................................................................*.............................. - // mls v13.4s, v24.4s, v8.s[0] // ...................................|................................................................................................................................*.......................... - // sub v24.4s, v10.4s, v14.4s // ...................................|.............................................................................................................................*............................. - // add v10.4s, v10.4s, v14.4s // ...................................|..............................................................................................................................*............................ 
- // mul v14.4s, v24.4s, v0.s[0] // ...................................|.................................................................................................................................*......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|..................................................................................................................................*........................ - // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................................................................*.................... - // sub v24.4s, v11.4s, v15.4s // ...................................|.....................................................................................................................*..................................... - // add v11.4s, v11.4s, v15.4s // ...................................|......................................................................................................................*.................................... - // mul v15.4s, v24.4s, v0.s[0] // ...................................|..........................................................................................................................*................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|...............................................................................................................................*........................... - // mls v15.4s, v24.4s, v8.s[0] // ...................................|...................................................................................................................................*....................... 
- // sub v24.4s, v12.4s, v16.4s // ...................................|....................................................................................................................................*...................... - // add v12.4s, v12.4s, v16.4s // ...................................|.....................................................................................................................................*..................... - // mul v16.4s, v24.4s, v0.s[0] // ...................................|.......................................................................................................................................*................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|........................................................................................................................................*.................. - // mls v16.4s, v24.4s, v8.s[0] // .*.................................|...........................................................................................................................................*............... - // str q9, [x1], #(16*4) // ...................................|.........................................................................................................................................*................. - // str q10, [x1, #(-16*4 + 1*16)] // ..*................................|............................................................................................................................................*.............. - // str q11, [x1, #(-16*4 + 2*16)] // ....*..............................|..............................................................................................................................................*............ 
- // str q12, [x1, #(-16*4 + 3*16)] // ......*............................|................................................................................................................................................*.......... - // str q13, [x2], #(16*4) // .........*.........................|...................................................................................................................................................*....... - // str q14, [x2, #(-16*4 + 1*16)] // ...........*.......................|.....................................................................................................................................................*..... - // str q15, [x2, #(-16*4 + 2*16)] // .............*.....................|.......................................................................................................................................................*... - // str q16, [x2, #(-16*4 + 3*16)] // ...............*...................|.........................................................................................................................................................*. - // add x1, x1, #64 // .......*...........................|.................................................................................................................................................*......... - // add x2, x2, #64 // ................*..................|..........................................................................................................................................................* + ldr q30, [x4, #-32] // ..............................................................................................e............................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q28, [x4, #-16] // ...............................................................................................e.............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + + // ---------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q9, [x1, #0] // ........e..........................'..................................................................................................................................................~........ + // ldr q10, [x1, #16] // ..........e........................'....................................................................................................................................................~...... 
+ // ldr q11, [x1, #32] // ............e......................'......................................................................................................................................................~.... + // ldr q12, [x1, #48] // ..............e....................'........................................................................................................................................................~.. + // trn1 v25.4s, v9.4s, v10.4s // ...................................*........................................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................'*.......................................................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..................e................'........................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................'.*......................................................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ...................................'..*........................................................................................................................................................ + // trn2 v12.2d, v26.2d, v28.2d // ...................................'....*...................................................................................................................................................... 
+ // trn1 v9.2d, v25.2d, v27.2d // ...................................'...*....................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................'.....*..................................................................................................................................................... + // ldr q13, [x2, #0] // .................e.................'........................................................................................................................................................... + // ldr q14, [x2, #16] // ...................e...............'........................................................................................................................................................... + // ldr q15, [x2, #32] // ....................e..............'........................................................................................................................................................... + // ldr q16, [x2, #48] // .....................e.............'........................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...................................'..........*................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................'...........*............................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ...................................'............*.............................................................................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ...................................'.............*............................................................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ...................................'....................*...................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................'.....................*..................................................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................'......................*.................................................................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................'.......................*................................................................................................................................... + // ldr q0, [x5], #(12*16) // e..................................'..........................................................................................................................................~................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................'.............................................................................................................................................~............. 
+ // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................'...............................................................................................................................................~........... + // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............'........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........'........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........'........................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................'........*.................................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ...................................'.........*................................................................................................................................................. + // sqrdmulh v27.4s, v24.4s, v5.4s // ...................................'..................*........................................................................................................................................ + // mul v10.4s, v24.4s, v1.4s // ...................................'...................*....................................................................................................................................... 
+ // mls v10.4s, v27.4s, v8.s[0] // ...................................'........................*.................................................................................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................'......*.................................................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................'.......*................................................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ...................................'..............*............................................................................................................................................ + // mul v12.4s, v24.4s, v2.4s // ...................................'...............*........................................................................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................................'.........................*................................................................................................................................. + // sub v24.4s, v9.4s, v11.4s // ...................................'................*.......................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...................................'.................*......................................................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................'..........................*................................................................................................................................ + // mul v11.4s, v24.4s, v0.4s // ...................................'...........................*............................................................................................................................... + // mls v11.4s, v27.4s, v8.s[0] // ...................................'...............................*........................................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ...................................'.............................*............................................................................................................................. + // add v10.4s, v10.4s, v12.4s // ...................................'..............................*............................................................................................................................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................'................................*.......................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // ...................................'.................................*......................................................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................................'......................................*.................................................................................................................... 
+ // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........'........................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........'........................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......'........................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......'........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....'........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....'........................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ...................................'............................*.............................................................................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................'..................................*........................................................................................................................ 
+ // sqrdmulh v27.4s, v24.4s, v5.4s // ...................................'...................................*....................................................................................................................... + // mul v14.4s, v24.4s, v1.4s // ...................................'.......................................*................................................................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ...................................'...........................................*............................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................'........................................*.................................................................................................................. + // add v15.4s, v15.4s, v16.4s // ...................................'.........................................*................................................................................................................. + // sqrdmulh v27.4s, v24.4s, v6.4s // ...................................'............................................*.............................................................................................................. + // mul v16.4s, v24.4s, v2.4s // ...................................'.............................................*............................................................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ...................................'.................................................*......................................................................................................... 
+ // sub v24.4s, v13.4s, v15.4s // ...................................'..............................................*............................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...................................'...............................................*........................................................................................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................'..................................................*........................................................................................................ + // mul v15.4s, v24.4s, v0.4s // ...................................'...................................................*....................................................................................................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................'.......................................................*................................................................................................... + // sub v24.4s, v14.4s, v16.4s // ...................................'.....................................................*..................................................................................................... + // add v14.4s, v14.4s, v16.4s // ...................................'......................................................*.................................................................................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................'........................................................*.................................................................................................. 
+ // mul v16.4s, v24.4s, v0.4s // ...................................'.........................................................*................................................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ...................................'.............................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ...................................'....................................*...................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................'.....................................*..................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ...................................'..........................................*................................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ...................................'................................................*.......................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ...................................'....................................................*...................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ...................................'..........................................................*................................................................................................ 
+ // trn1 v9.2d, v25.2d, v27.2d // ...................................'...........................................................*............................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................'............................................................*.............................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ...................................'..............................................................*............................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................'...............................................................*........................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................'.................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...................................'..................................................................*........................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................................'....................................................................*...................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................'.....................................................................*..................................................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ...................................'......................................................................*.................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................'.......................................................................*................................................................................... + // ldr q0, [x4], #64 // ...............................e...'........................................................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // ................................e..'........................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .................................e.'........................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ..................................e'........................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................'................................................................*.......................................................................................... + // add v9.4s, v9.4s, v10.4s // ...................................'...................................................................*....................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[3] // ...................................'........................................................................*.................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ...................................'.........................................................................*................................................................................. + // mls v10.4s, v27.4s, v8.s[0] // ...................................'.............................................................................*............................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................'..........................................................................*................................................................................ + // add v11.4s, v11.4s, v12.4s // ...................................'...........................................................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ...................................'..............................................................................*............................................................................ + // mul v12.4s, v24.4s, v2.s[0] // ...................................'...............................................................................*........................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................................'...................................................................................*....................................................................... 
+ // sub v24.4s, v13.4s, v14.4s // ...................................'............................................................................*.............................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................'................................................................................*.......................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...................................'.................................................................................*......................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ...................................'..................................................................................*........................................................................ + // mls v14.4s, v27.4s, v8.s[0] // ...................................'......................................................................................*.................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................'....................................................................................*...................................................................... + // add v15.4s, v15.4s, v16.4s // ...................................'.....................................................................................*..................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ...................................'.......................................................................................*................................................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ...................................'........................................................................................*.................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ...................................'............................................................................................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // ...................................'.........................................................................................*................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................'..........................................................................................*................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...................................'.............................................................................................*............................................................. + // mul v11.4s, v24.4s, v0.s[2] // ...................................'..............................................................................................*............................................................ + // mls v11.4s, v27.4s, v8.s[0] // ...................................'..................................................................................................*........................................................ + // sub v24.4s, v10.4s, v12.4s // ...................................'...........................................................................................*............................................................... 
+ // add v10.4s, v10.4s, v12.4s // ...................................'...............................................................................................*........................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...................................'................................................................................................*.......................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................'.................................................................................................*......................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................................'.....................................................................................................*..................................................... + // sub v24.4s, v13.4s, v15.4s // ...................................'...................................................................................................*....................................................... + // add v13.4s, v13.4s, v15.4s // ...................................'....................................................................................................*...................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ...................................'......................................................................................................*.................................................... + // mul v15.4s, v24.4s, v1.s[0] // ...................................'.......................................................................................................*................................................... 
+ // mls v15.4s, v27.4s, v8.s[0] // ...................................'...........................................................................................................*............................................... + // sub v24.4s, v14.4s, v16.4s // ...................................'........................................................................................................*.................................................. + // add v14.4s, v14.4s, v16.4s // ...................................'.........................................................................................................*................................................. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ...................................'............................................................................................................*.............................................. + // mul v16.4s, v24.4s, v1.s[0] // ...................................'.............................................................................................................*............................................. + // mls v16.4s, v27.4s, v8.s[0] // ...................................'.................................................................................................................*......................................... + // srshr v24.4S, v9.4S, #23 // ...................................'..........................................................................................................*................................................ + // mls v9.4s, v24.4s, v8.4s // ...................................'..............................................................................................................*............................................ 
+ // srshr v24.4S, v10.4S, #23 // ...................................'...............................................................................................................*........................................... + // mls v10.4s, v24.4s, v8.4s // ...................................'..................................................................................................................*........................................ + // srshr v24.4S, v13.4S, #23 // ...................................'................................................................................................................*.......................................... + // mls v13.4s, v24.4s, v8.4s // ...................................'...................................................................................................................*....................................... + // srshr v24.4S, v14.4S, #23 // ...................................'....................................................................................................................*...................................... + // mls v14.4s, v24.4s, v8.4s // ...................................'.......................................................................................................................*................................... + // sub v24.4s, v9.4s, v13.4s // ...................................'........................................................................................................................*.................................. + // add v9.4s, v9.4s, v13.4s // ...................................'.........................................................................................................................*................................. 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................'...........................................................................................................................*............................... + // mul v13.4s, v24.4s, v0.s[0] // ...................................'............................................................................................................................*.............................. + // mls v13.4s, v27.4s, v8.s[0] // ...................................'................................................................................................................................*.......................... + // sub v24.4s, v10.4s, v14.4s // ...................................'.............................................................................................................................*............................. + // add v10.4s, v10.4s, v14.4s // ...................................'..............................................................................................................................*............................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................'.................................................................................................................................*......................... + // mul v14.4s, v24.4s, v0.s[0] // ...................................'..................................................................................................................................*........................ + // mls v14.4s, v27.4s, v8.s[0] // ...................................'......................................................................................................................................*.................... 
+ // sub v24.4s, v11.4s, v15.4s // ...................................'.....................................................................................................................*..................................... + // add v11.4s, v11.4s, v15.4s // ...................................'......................................................................................................................*.................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................'..........................................................................................................................*................................ + // mul v15.4s, v24.4s, v0.s[0] // ...................................'...............................................................................................................................*........................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................'...................................................................................................................................*....................... + // sub v24.4s, v12.4s, v16.4s // ...................................'....................................................................................................................................*...................... + // add v12.4s, v12.4s, v16.4s // ...................................'.....................................................................................................................................*..................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................'.......................................................................................................................................*................... 
+ // mul v16.4s, v24.4s, v0.s[0] // ...................................'........................................................................................................................................*.................. + // mls v16.4s, v27.4s, v8.s[0] // .~.................................'...........................................................................................................................................*............... + // str q9, [x1], #(16*4) // ...................................'.........................................................................................................................................*................. + // str q10, [x1, #(-16*4 + 1*16)] // ..~................................'............................................................................................................................................*.............. + // str q11, [x1, #(-16*4 + 2*16)] // ....~..............................'..............................................................................................................................................*............ + // str q12, [x1, #(-16*4 + 3*16)] // ......~............................'................................................................................................................................................*.......... + // str q13, [x2], #(16*4) // .........~.........................'...................................................................................................................................................*....... + // str q14, [x2, #(-16*4 + 1*16)] // ...........~.......................'.....................................................................................................................................................*..... 
+ // str q15, [x2, #(-16*4 + 2*16)] // .............~.....................'.......................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...............~...................'.........................................................................................................................................................*. + // add x1, x1, #64 // .......~...........................'.................................................................................................................................................*......... + // add x2, x2, #64 // ................~..................'..........................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... + // Instructions: 149 + // Expected cycles: 149 + // Expected IPC: 1.00 + // + // Wall time: 20.31s + // User time: 20.31s + // + // ---------------------------------------------------------------- original position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + trn1 v22.4S, v10.4S, v12.4S // *.................................................................................................................................................... // gap // ..................................................................................................................................................... 
- trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... + trn2 v11.4S, v10.4S, v12.4S // .*................................................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v7.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. + trn2 v6.4S, v27.4S, v6.4S // ..*.................................................................................................................................................. // gap // ..................................................................................................................................................... - trn2 v20.2D, v0.2D, v6.2D // ...*................................................................................................................................................. + trn2 v14.2D, v22.2D, v2.2D // ...*................................................................................................................................................. // gap // ..................................................................................................................................................... - trn1 v22.2D, v0.2D, v6.2D // ....*................................................................................................................................................ + trn1 v19.2D, v22.2D, v2.2D // ....*................................................................................................................................................ 
// gap // ..................................................................................................................................................... - trn1 v24.2D, v10.2D, v7.2D // ......*.............................................................................................................................................. + trn2 v10.2D, v11.2D, v6.2D // .....*............................................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v10.2D, v10.2D, v7.2D // .....*............................................................................................................................................... + trn1 v21.2D, v11.2D, v6.2D // ......*.............................................................................................................................................. // gap // ..................................................................................................................................................... - sub v1.4S, v22.4S, v24.4S // .........*........................................................................................................................................... + sub v12.4S, v14.4S, v10.4S // .......*............................................................................................................................................. // gap // ..................................................................................................................................................... - sub v30.4S, v20.4S, v10.4S // .......*............................................................................................................................................. 
+ add v29.4S, v14.4S, v10.4S // ........*............................................................................................................................................ // gap // ..................................................................................................................................................... - add v7.4S, v20.4S, v10.4S // ........*............................................................................................................................................ + sub v14.4S, v19.4S, v21.4S // .........*........................................................................................................................................... // gap // ..................................................................................................................................................... - sqrdmulh v10.4S, v1.4S, v18.4S // ....................*................................................................................................................................ + add v21.4S, v19.4S, v21.4S // ..........*.......................................................................................................................................... // gap // ..................................................................................................................................................... - mul v26.4S, v30.4S, v11.4S // ...............*..................................................................................................................................... + trn1 v6.4S, v4.4S, v1.4S // ...........*......................................................................................................................................... // gap // ..................................................................................................................................................... 
- sqrdmulh v6.4S, v30.4S, v19.4S // ................*.................................................................................................................................... + trn2 v27.4S, v4.4S, v1.4S // ............*........................................................................................................................................ // gap // ..................................................................................................................................................... - mul v18.4S, v1.4S, v28.4S // ...................*................................................................................................................................. + trn1 v4.4S, v24.4S, v18.4S // .............*....................................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v16.4S, v13.4S, v12.4S // ............*........................................................................................................................................ + trn2 v2.4S, v24.4S, v18.4S // ..............*...................................................................................................................................... // gap // ..................................................................................................................................................... - add v0.4S, v22.4S, v24.4S // ..........*.......................................................................................................................................... + sqrdmulh v19.4S, v12.4S, v20.4S // ...............*..................................................................................................................................... 
// gap // ..................................................................................................................................................... - mls v26.4S, v6.4S, v8.S[0] // ..........................*.......................................................................................................................... + mul v25.4S, v12.4S, v25.4S // ................*.................................................................................................................................... // gap // ..................................................................................................................................................... - mls v18.4S, v10.4S, v8.S[0] // .........................*........................................................................................................................... + sub v12.4S, v21.4S, v29.4S // .................*................................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v28.4S, v17.4S, v27.4S // ..............*...................................................................................................................................... + add v1.4S, v21.4S, v29.4S // ..................*.................................................................................................................................. // gap // ..................................................................................................................................................... - sub v11.4S, v0.4S, v7.4S // .................*................................................................................................................................... 
+ sqrdmulh v21.4S, v14.4S, v31.4S // ...................*................................................................................................................................. // gap // ..................................................................................................................................................... - trn1 v30.4S, v17.4S, v27.4S // .............*....................................................................................................................................... + mul v29.4S, v14.4S, v16.4S // ....................*................................................................................................................................ // gap // ..................................................................................................................................................... - sub v1.4S, v18.4S, v26.4S // ..............................*...................................................................................................................... + trn2 v24.2D, v6.2D, v4.2D // .....................*............................................................................................................................... // gap // ..................................................................................................................................................... - sqrdmulh v19.4S, v11.4S, v21.4S // ............................*........................................................................................................................ + trn2 v16.2D, v27.2D, v2.2D // ......................*.............................................................................................................................. // gap // ..................................................................................................................................................... 
- mul v24.4S, v11.4S, v15.4S // ...........................*......................................................................................................................... + trn1 v18.2D, v6.2D, v4.2D // .......................*............................................................................................................................. // gap // ..................................................................................................................................................... - sqrdmulh v17.4S, v1.4S, v21.4S // ..................................*.................................................................................................................. + trn1 v20.2D, v27.2D, v2.2D // ........................*............................................................................................................................ // gap // ..................................................................................................................................................... - mul v6.4S, v1.4S, v15.4S // .................................*................................................................................................................... + mls v29.4S, v21.4S, v8.S[0] // .........................*........................................................................................................................... // gap // ..................................................................................................................................................... - add v1.4S, v18.4S, v26.4S // ...............................*..................................................................................................................... + mls v25.4S, v19.4S, v8.S[0] // ..........................*.......................................................................................................................... 
// gap // ..................................................................................................................................................... - add v27.4S, v0.4S, v7.4S // ..................*.................................................................................................................................. + sqrdmulh v21.4S, v12.4S, v0.4S // ...........................*......................................................................................................................... // gap // ..................................................................................................................................................... - mls v24.4S, v19.4S, v8.S[0] // ................................*.................................................................................................................... + mul v31.4S, v12.4S, v23.4S // ............................*........................................................................................................................ // gap // ..................................................................................................................................................... - mls v6.4S, v17.4S, v8.S[0] // .......................................*............................................................................................................. + sub v19.4S, v18.4S, v20.4S // .............................*....................................................................................................................... // gap // ..................................................................................................................................................... - trn1 v15.4S, v27.4S, v1.4S // .....................................*............................................................................................................... 
+ sub v14.4S, v29.4S, v25.4S // ..............................*...................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v11.2D, v16.2D, v28.2D // ......................*.............................................................................................................................. + add v29.4S, v29.4S, v25.4S // ...............................*..................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v7.4S, v27.4S, v1.4S // ......................................*.............................................................................................................. + mls v31.4S, v21.4S, v8.S[0] // ................................*.................................................................................................................... // gap // ..................................................................................................................................................... - trn2 v10.4S, v24.4S, v6.4S // .................................................*................................................................................................... + sqrdmulh v21.4S, v14.4S, v0.4S // .................................*................................................................................................................... // gap // ..................................................................................................................................................... 
- trn1 v24.4S, v24.4S, v6.4S // ...........................................*......................................................................................................... + mul v25.4S, v14.4S, v23.4S // ..................................*.................................................................................................................. // gap // ..................................................................................................................................................... - trn1 v1.4S, v13.4S, v12.4S // ...........*......................................................................................................................................... + add v20.4S, v18.4S, v20.4S // ...................................*................................................................................................................. // gap // ..................................................................................................................................................... - trn1 v0.2D, v7.2D, v10.2D // .............................................................*....................................................................................... + sqrdmulh v14.4S, v19.4S, v15.4S // ....................................*................................................................................................................ // gap // ..................................................................................................................................................... - trn2 v22.2D, v7.2D, v10.2D // ...........................................................*......................................................................................... + trn1 v23.4S, v1.4S, v29.4S // .....................................*............................................................................................................... 
// gap // ..................................................................................................................................................... - trn2 v27.2D, v15.2D, v24.2D // .....................................................*............................................................................................... + trn2 v0.4S, v1.4S, v29.4S // ......................................*.............................................................................................................. // gap // ..................................................................................................................................................... - trn1 v7.2D, v15.2D, v24.2D // ............................................................*........................................................................................ + mls v25.4S, v21.4S, v8.S[0] // .......................................*............................................................................................................. // gap // ..................................................................................................................................................... - add v6.4S, v27.4S, v22.4S // ............................................................................*........................................................................ + mul v19.4S, v19.4S, v17.4S // ........................................*............................................................................................................ // gap // ..................................................................................................................................................... - add v18.4S, v7.4S, v0.4S // ....................................................................*................................................................................ 
+ sub v21.4S, v24.4S, v16.4S // .........................................*........................................................................................................... // gap // ..................................................................................................................................................... - trn1 v28.2D, v16.2D, v28.2D // ........................*............................................................................................................................ + add v27.4S, v24.4S, v16.4S // ..........................................*.......................................................................................................... // gap // ..................................................................................................................................................... - trn1 v15.2D, v1.2D, v30.2D // .......................*............................................................................................................................. + trn1 v15.4S, v31.4S, v25.4S // ...........................................*......................................................................................................... // gap // ..................................................................................................................................................... - add v17.4S, v18.4S, v6.4S // ...........................................................................................*......................................................... + mls v19.4S, v14.4S, v8.S[0] // ............................................*........................................................................................................ // gap // ..................................................................................................................................................... 
- sub v10.4S, v15.4S, v28.4S // .............................*....................................................................................................................... + sqrdmulh v14.4S, v21.4S, v26.4S // .............................................*....................................................................................................... // gap // ..................................................................................................................................................... - trn2 v13.2D, v1.2D, v30.2D // .....................*............................................................................................................................... + mul v29.4S, v21.4S, v3.4S // ..............................................*...................................................................................................... // gap // ..................................................................................................................................................... - srshr v26.4S, v17.4S, #23 // ...........................................................................................................*......................................... + sub v21.4S, v20.4S, v27.4S // ...............................................*..................................................................................................... // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v10.4S, v14.4S // ........................................*............................................................................................................ + add v20.4S, v20.4S, v27.4S // ................................................*.................................................................................................... 
// gap // ..................................................................................................................................................... - mul v21.4S, v10.4S, v25.4S // ....................................*................................................................................................................ + trn2 v26.4S, v31.4S, v25.4S // .................................................*................................................................................................... // gap // ..................................................................................................................................................... - mls v17.4S, v26.4S, v8.4S // ...............................................................................................................*..................................... + mls v29.4S, v14.4S, v8.S[0] // ..................................................*.................................................................................................. // gap // ..................................................................................................................................................... - sub v10.4S, v13.4S, v11.4S // .........................................*........................................................................................................... + sqrdmulh v14.4S, v21.4S, v5.4S // ...................................................*................................................................................................. // gap // ..................................................................................................................................................... - sub v1.4S, v7.4S, v0.4S // .................................................................*................................................................................... 
+ mul v17.4S, v21.4S, v13.4S // ....................................................*................................................................................................ // gap // ..................................................................................................................................................... - mls v21.4S, v30.4S, v8.S[0] // ............................................*........................................................................................................ + trn2 v31.2D, v23.2D, v15.2D // .....................................................*............................................................................................... // gap // ..................................................................................................................................................... - mul v7.4S, v10.4S, v5.4S // .............................................*....................................................................................................... + sub v21.4S, v19.4S, v29.4S // ......................................................*.............................................................................................. // gap // ..................................................................................................................................................... - sqrdmulh v24.4S, v10.4S, v29.4S // ..............................................*...................................................................................................... + add v19.4S, v19.4S, v29.4S // .......................................................*............................................................................................. // gap // ..................................................................................................................................................... 
- sqrdmulh v10.4S, v1.4S, v9.S[3] // ..........................................................................*.......................................................................... + mls v17.4S, v14.4S, v8.S[0] // ........................................................*............................................................................................ // gap // ..................................................................................................................................................... - add v30.4S, v13.4S, v11.4S // ..........................................*.......................................................................................................... + sqrdmulh v29.4S, v21.4S, v5.4S // .........................................................*........................................................................................... // gap // ..................................................................................................................................................... - add v13.4S, v15.4S, v28.4S // ...................................*................................................................................................................. + mul v21.4S, v21.4S, v13.4S // ..........................................................*.......................................................................................... // gap // ..................................................................................................................................................... - mls v7.4S, v24.4S, v8.S[0] // ..................................................*.................................................................................................. + trn2 v25.2D, v0.2D, v26.2D // ...........................................................*......................................................................................... 
// gap // ..................................................................................................................................................... - sub v0.4S, v27.4S, v22.4S // ...........................................................................*......................................................................... + trn1 v15.2D, v23.2D, v15.2D // ............................................................*........................................................................................ // gap // ..................................................................................................................................................... - sub v24.4S, v13.4S, v30.4S // ...............................................*..................................................................................................... + trn1 v14.2D, v0.2D, v26.2D // .............................................................*....................................................................................... // gap // ..................................................................................................................................................... - add v15.4S, v13.4S, v30.4S // ................................................*.................................................................................................... + mls v21.4S, v29.4S, v8.S[0] // ..............................................................*...................................................................................... // gap // ..................................................................................................................................................... - sub v12.4S, v21.4S, v7.4S // ......................................................*.............................................................................................. 
+ trn1 v3.4S, v20.4S, v19.4S // ...............................................................*..................................................................................... // gap // ..................................................................................................................................................... - sqrdmulh v13.4S, v24.4S, v4.4S // ....................................................*................................................................................................ + trn2 v19.4S, v20.4S, v19.4S // ................................................................*.................................................................................... // gap // ..................................................................................................................................................... - mul v27.4S, v24.4S, v2.4S // ...................................................*................................................................................................. + sub v26.4S, v15.4S, v14.4S // .................................................................*................................................................................... // gap // ..................................................................................................................................................... - sqrdmulh v14.4S, v12.4S, v4.4S // ..........................................................*.......................................................................................... + trn1 v29.4S, v17.4S, v21.4S // ..................................................................*.................................................................................. // gap // ..................................................................................................................................................... 
- mul v28.4S, v12.4S, v2.4S // .........................................................*........................................................................................... + trn2 v21.4S, v17.4S, v21.4S // ...................................................................*................................................................................. // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v0.4S, v31.S[1] // ................................................................................*.................................................................... + add v20.4S, v15.4S, v14.4S // ....................................................................*................................................................................ // gap // ..................................................................................................................................................... - add v24.4S, v21.4S, v7.4S // .......................................................*............................................................................................. + trn2 v13.2D, v3.2D, v29.2D // .....................................................................*............................................................................... // gap // ..................................................................................................................................................... - mls v27.4S, v13.4S, v8.S[0] // ........................................................*............................................................................................ + trn2 v17.2D, v19.2D, v21.2D // ......................................................................*.............................................................................. 
// gap // ..................................................................................................................................................... - mls v28.4S, v14.4S, v8.S[0] // ..............................................................*...................................................................................... + trn1 v3.2D, v3.2D, v29.2D // .......................................................................*............................................................................. // gap // ..................................................................................................................................................... - trn1 v13.4S, v15.4S, v24.4S // ...............................................................*..................................................................................... + trn1 v19.2D, v19.2D, v21.2D // ........................................................................*............................................................................ // gap // ..................................................................................................................................................... - mul v7.4S, v0.4S, v31.S[0] // ...............................................................................*..................................................................... + sqrdmulh v14.4S, v26.4S, v9.S[3] // .........................................................................*........................................................................... // gap // ..................................................................................................................................................... - trn2 v24.4S, v15.4S, v24.4S // ................................................................*.................................................................................... 
+ mul v5.4S, v26.4S, v9.S[2] // ..........................................................................*.......................................................................... // gap // ..................................................................................................................................................... - trn2 v0.4S, v27.4S, v28.4S // ...................................................................*................................................................................. + sub v21.4S, v31.4S, v25.4S // ...........................................................................*......................................................................... // gap // ..................................................................................................................................................... - trn1 v15.4S, v27.4S, v28.4S // ..................................................................*.................................................................................. + add v26.4S, v31.4S, v25.4S // ............................................................................*........................................................................ // gap // ..................................................................................................................................................... - mls v7.4S, v30.4S, v8.S[0] // ....................................................................................*................................................................ + sub v29.4S, v3.4S, v19.4S // .............................................................................*....................................................................... // gap // ..................................................................................................................................................... 
- trn1 v12.2D, v24.2D, v0.2D // ........................................................................*............................................................................ + mls v5.4S, v14.4S, v8.S[0] // ..............................................................................*...................................................................... // gap // ..................................................................................................................................................... - trn2 v28.2D, v24.2D, v0.2D // ......................................................................*.............................................................................. + sqrdmulh v14.4S, v21.4S, v30.S[1] // ...............................................................................*..................................................................... // gap // ..................................................................................................................................................... - trn2 v21.2D, v13.2D, v15.2D // .....................................................................*............................................................................... + mul v15.4S, v21.4S, v30.S[0] // ................................................................................*.................................................................... // gap // ..................................................................................................................................................... - trn1 v11.2D, v13.2D, v15.2D // .......................................................................*............................................................................. + add v2.4S, v3.4S, v19.4S // .................................................................................*................................................................... 
// gap // ..................................................................................................................................................... - sub v30.4S, v21.4S, v28.4S // .....................................................................................*............................................................... + sqrdmulh v21.4S, v29.4S, v30.S[3] // ..................................................................................*.................................................................. // gap // ..................................................................................................................................................... - sub v24.4S, v11.4S, v12.4S // .............................................................................*....................................................................... + mul v3.4S, v29.4S, v30.S[2] // ...................................................................................*................................................................. // gap // ..................................................................................................................................................... - mul v15.4S, v1.4S, v9.S[2] // .........................................................................*........................................................................... + mls v15.4S, v14.4S, v8.S[0] // ....................................................................................*................................................................ // gap // ..................................................................................................................................................... - mul v13.4S, v30.4S, v3.S[0] // ........................................................................................*............................................................ 
+ sub v14.4S, v13.4S, v17.4S // .....................................................................................*............................................................... // gap // ..................................................................................................................................................... - sqrdmulh v0.4S, v24.4S, v31.S[3] // ...................................................................................*................................................................. + add v30.4S, v13.4S, v17.4S // ......................................................................................*.............................................................. // gap // ..................................................................................................................................................... - mul v1.4S, v24.4S, v31.S[2] // ..................................................................................*.................................................................. + mls v3.4S, v21.4S, v8.S[0] // .......................................................................................*............................................................. // gap // ..................................................................................................................................................... - mls v15.4S, v10.4S, v8.S[0] // ..............................................................................*...................................................................... + sqrdmulh v21.4S, v14.4S, v28.S[1] // ........................................................................................*............................................................ // gap // ..................................................................................................................................................... 
- add v27.4S, v21.4S, v28.4S // ......................................................................................*.............................................................. + mul v19.4S, v14.4S, v28.S[0] // .........................................................................................*........................................................... // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v30.4S, v3.S[1] // .........................................................................................*........................................................... + sub v14.4S, v20.4S, v26.4S // ..........................................................................................*.......................................................... // gap // ..................................................................................................................................................... - mls v1.4S, v0.4S, v8.S[0] // .......................................................................................*............................................................. + add v13.4S, v20.4S, v26.4S // ...........................................................................................*......................................................... // gap // ..................................................................................................................................................... - sub v10.4S, v15.4S, v7.4S // ............................................................................................*........................................................ + sub v29.4S, v5.4S, v15.4S // ............................................................................................*........................................................ 
// gap // ..................................................................................................................................................... - sub v0.4S, v18.4S, v6.4S // ..........................................................................................*.......................................................... + mls v19.4S, v21.4S, v8.S[0] // .............................................................................................*....................................................... // gap // ..................................................................................................................................................... - add v28.4S, v11.4S, v12.4S // .................................................................................*................................................................... + sqrdmulh v21.4S, v14.4S, v7.S[3] // ..............................................................................................*...................................................... // gap // ..................................................................................................................................................... - mls v13.4S, v30.4S, v8.S[0] // .............................................................................................*....................................................... + mul v26.4S, v14.4S, v7.S[2] // ...............................................................................................*..................................................... // gap // ..................................................................................................................................................... - mul v6.4S, v0.4S, v23.S[2] // ..............................................................................................*...................................................... 
+ add v5.4S, v5.4S, v15.4S // ................................................................................................*.................................................... // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v0.4S, v23.S[3] // ...............................................................................................*..................................................... + sqrdmulh v14.4S, v29.4S, v7.S[3] // .................................................................................................*................................................... // gap // ..................................................................................................................................................... - sqrdmulh v24.4S, v10.4S, v23.S[3] // ..................................................................................................*.................................................. + mul v17.4S, v29.4S, v7.S[2] // ..................................................................................................*.................................................. // gap // ..................................................................................................................................................... - sub v0.4S, v1.4S, v13.4S // .........................................................................................................*........................................... + mls v26.4S, v21.4S, v8.S[0] // ...................................................................................................*................................................. // gap // ..................................................................................................................................................... 
- mul v12.4S, v10.4S, v23.S[2] // .................................................................................................*................................................... + sub v21.4S, v2.4S, v30.4S // ....................................................................................................*................................................ // gap // ..................................................................................................................................................... - add v7.4S, v15.4S, v7.4S // ................................................................................................*.................................................... + add v30.4S, v2.4S, v30.4S // .....................................................................................................*............................................... // gap // ..................................................................................................................................................... - sqrdmulh v31.4S, v0.4S, v9.S[1] // ..............................................................................................................*...................................... + mls v17.4S, v14.4S, v8.S[0] // ......................................................................................................*.............................................. // gap // ..................................................................................................................................................... - mls v6.4S, v30.4S, v8.S[0] // ...................................................................................................*................................................. + sqrdmulh v29.4S, v21.4S, v9.S[1] // .......................................................................................................*............................................. 
// gap // ..................................................................................................................................................... - mls v12.4S, v24.4S, v8.S[0] // ......................................................................................................*.............................................. + mul v28.4S, v21.4S, v9.S[0] // ........................................................................................................*............................................ // gap // ..................................................................................................................................................... - mul v3.4S, v0.4S, v9.S[0] // .............................................................................................................*....................................... + sub v14.4S, v3.4S, v19.4S // .........................................................................................................*........................................... // gap // ..................................................................................................................................................... - add v30.4S, v1.4S, v13.4S // ..........................................................................................................*.......................................... + add v19.4S, v3.4S, v19.4S // ..........................................................................................................*.......................................... // gap // ..................................................................................................................................................... - srshr v13.4S, v7.4S, #23 // ................................................................................................................*.................................... 
+ srshr v21.4S, v13.4S, #23 // ...........................................................................................................*......................................... // gap // ..................................................................................................................................................... - add v2.4S, v28.4S, v27.4S // .....................................................................................................*............................................... + mls v28.4S, v29.4S, v8.S[0] // ............................................................................................................*........................................ // gap // ..................................................................................................................................................... - mls v3.4S, v31.4S, v8.S[0] // ..................................................................................................................*.................................. + sqrdmulh v29.4S, v14.4S, v9.S[1] // .............................................................................................................*....................................... // gap // ..................................................................................................................................................... - mls v7.4S, v13.4S, v8.4S // ...................................................................................................................*................................. + mul v15.4S, v14.4S, v9.S[0] // ..............................................................................................................*...................................... // gap // ..................................................................................................................................................... 
- srshr v10.4S, v30.4S, #23 // .....................................................................................................................*............................... + mls v13.4S, v21.4S, v8.4S // ...............................................................................................................*..................................... // gap // ..................................................................................................................................................... - srshr v24.4S, v2.4S, #23 // .................................................................................................................*................................... + srshr v14.4S, v5.4S, #23 // ................................................................................................................*.................................... // gap // ..................................................................................................................................................... - add v0.4S, v12.4S, v3.4S // ......................................................................................................................................*.............. + srshr v21.4S, v30.4S, #23 // .................................................................................................................*................................... // gap // ..................................................................................................................................................... - mls v30.4S, v10.4S, v8.4S // ........................................................................................................................*............................ + mls v15.4S, v29.4S, v8.S[0] // ..................................................................................................................*.................................. 
// gap // ..................................................................................................................................................... - mls v2.4S, v24.4S, v8.4S // ....................................................................................................................*................................ + mls v5.4S, v14.4S, v8.4S // ...................................................................................................................*................................. // gap // ..................................................................................................................................................... - str q0, [x1, #48] // ..............................................................................................................................................*...... + mls v30.4S, v21.4S, v8.4S // ....................................................................................................................*................................ // gap // ..................................................................................................................................................... - sub v19.4S, v28.4S, v27.4S // ....................................................................................................*................................................ + srshr v21.4S, v19.4S, #23 // .....................................................................................................................*............................... // gap // ..................................................................................................................................................... - sub v0.4S, v7.4S, v30.4S // ..............................................................................................................................*...................... 
+ sub v29.4S, v26.4S, v28.4S // ......................................................................................................................*.............................. // gap // ..................................................................................................................................................... - add v7.4S, v7.4S, v30.4S // ...............................................................................................................................*..................... + add v3.4S, v26.4S, v28.4S // .......................................................................................................................*............................. // gap // ..................................................................................................................................................... - sub v24.4S, v17.4S, v2.4S // .........................................................................................................................*........................... + mls v19.4S, v21.4S, v8.4S // ........................................................................................................................*............................ // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v0.4S, v23.S[1] // ...................................................................................................................................*................. + sub v21.4S, v13.4S, v30.4S // .........................................................................................................................*........................... // gap // ..................................................................................................................................................... 
- mul v0.4S, v0.4S, v23.S[0] // ..................................................................................................................................*.................. + add v26.4S, v13.4S, v30.4S // ..........................................................................................................................*.......................... // gap // ..................................................................................................................................................... - sqrdmulh v10.4S, v24.4S, v23.S[1] // .............................................................................................................................*....................... + sqrdmulh v9.4S, v29.4S, v7.S[1] // ...........................................................................................................................*......................... // gap // ..................................................................................................................................................... - str q7, [x1, #16] // ............................................................................................................................................*........ + sqrdmulh v14.4S, v21.4S, v7.S[1] // ............................................................................................................................*........................ // gap // ..................................................................................................................................................... - mul v7.4S, v24.4S, v23.S[0] // ............................................................................................................................*........................ + mul v30.4S, v21.4S, v7.S[0] // .............................................................................................................................*....................... 
// gap // ..................................................................................................................................................... - mls v0.4S, v30.4S, v8.S[0] // .......................................................................................................................................*............. + sub v21.4S, v5.4S, v19.4S // ..............................................................................................................................*...................... // gap // ..................................................................................................................................................... - mul v13.4S, v19.4S, v9.S[0] // .......................................................................................................*............................................. + add v28.4S, v5.4S, v19.4S // ...............................................................................................................................*..................... // gap // ..................................................................................................................................................... - sqrdmulh v1.4S, v19.4S, v9.S[1] // ........................................................................................................*............................................ + mul v19.4S, v29.4S, v7.S[0] // ................................................................................................................................*.................... // gap // ..................................................................................................................................................... - mls v7.4S, v10.4S, v8.S[0] // .................................................................................................................................*................... 
+ mls v30.4S, v14.4S, v8.S[0] // .................................................................................................................................*................... // gap // ..................................................................................................................................................... - str q0, [x2, #16] // .................................................................................................................................................*... + sqrdmulh v14.4S, v21.4S, v7.S[1] // ..................................................................................................................................*.................. // gap // ..................................................................................................................................................... - sub v0.4S, v12.4S, v3.4S // .....................................................................................................................................*............... + mul v22.4S, v21.4S, v7.S[0] // ...................................................................................................................................*................. // gap // ..................................................................................................................................................... - mls v13.4S, v1.4S, v8.S[0] // ............................................................................................................*........................................ + sub v21.4S, v17.4S, v15.4S // .....................................................................................................................................*............... // gap // ..................................................................................................................................................... 
- add v24.4S, v17.4S, v2.4S // ..........................................................................................................................*.......................... + add v29.4S, v17.4S, v15.4S // ......................................................................................................................................*.............. // gap // ..................................................................................................................................................... - str q7, [x2], #(16*4) // ................................................................................................................................................*.... + str q26, [x1], #(16*4) // ..........................................................................................................................................*.......... // gap // ..................................................................................................................................................... - sqrdmulh v30.4S, v0.4S, v23.S[1] // .........................................................................................................................................*........... + mls v22.4S, v14.4S, v8.S[0] // .......................................................................................................................................*............. // gap // ..................................................................................................................................................... - sub v7.4S, v6.4S, v13.4S // ......................................................................................................................*.............................. + str q28, [x1, #-48] // ............................................................................................................................................*........ 
// gap // ..................................................................................................................................................... - mul v0.4S, v0.4S, v23.S[0] // ........................................................................................................................................*............ + sqrdmulh v14.4S, v21.4S, v7.S[1] // ........................................................................................................................................*............ // gap // ..................................................................................................................................................... - add v1.4S, v6.4S, v13.4S // .......................................................................................................................*............................. + str q3, [x1, #-32] // .............................................................................................................................................*....... // gap // ..................................................................................................................................................... - sqrdmulh v10.4S, v7.4S, v23.S[1] // ................................................................................................................................*.................... + mul v21.4S, v21.4S, v7.S[0] // .........................................................................................................................................*........... // gap // ..................................................................................................................................................... - mul v7.4S, v7.4S, v23.S[0] // ...........................................................................................................................*......................... 
+ str q29, [x1, #-16] // ..............................................................................................................................................*...... // gap // ..................................................................................................................................................... - str q1, [x1, #32] // .............................................................................................................................................*....... + mls v19.4S, v9.4S, v8.S[0] // ....................................................................................................................................*................ // gap // ..................................................................................................................................................... - mls v0.4S, v30.4S, v8.S[0] // ...........................................................................................................................................*......... + str q30, [x2], #(16*4) // ................................................................................................................................................*.... // gap // ..................................................................................................................................................... - str q24, [x1], #(16*4) // ..........................................................................................................................................*.......... - add x1, x1, #64 // ...............................................................................................................................................*..... - mls v7.4S, v10.4S, v8.S[0] // ....................................................................................................................................*................ 
+ mls v21.4S, v14.4S, v8.S[0] // ...........................................................................................................................................*......... // gap // ..................................................................................................................................................... + str q22, [x2, #-48] // .................................................................................................................................................*... // gap // ..................................................................................................................................................... // gap // ..................................................................................................................................................... - str q0, [x2, #-16] // ...................................................................................................................................................*. // gap // ..................................................................................................................................................... + str q19, [x2, #-32] // ..................................................................................................................................................*.. // gap // ..................................................................................................................................................... // gap // ..................................................................................................................................................... - str q7, [x2, #-32] // ..................................................................................................................................................*.. 
+ add x1, x1, #64 // ...............................................................................................................................................*..... + str q21, [x2, #-16] // ...................................................................................................................................................*. add x2, x2, #64 // ....................................................................................................................................................* - // original source code - // trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... - // trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... - // trn2 v30.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. - // trn2 v24.2D, v0.2D, v6.2D // ...*................................................................................................................................................. - // trn1 v0.2D, v0.2D, v6.2D // ....*................................................................................................................................................ - // trn2 v7.2D, v10.2D, v30.2D // ......*.............................................................................................................................................. - // trn1 v10.2D, v10.2D, v30.2D // .....*............................................................................................................................................... 
- // sub v30.4S, v24.4S, v7.4S // ........*............................................................................................................................................ - // add v24.4S, v24.4S, v7.4S // .........*........................................................................................................................................... - // sub v7.4S, v0.4S, v10.4S // .......*............................................................................................................................................. - // add v0.4S, v0.4S, v10.4S // ...............*..................................................................................................................................... - // trn1 v10.4S, v13.4S, v12.4S // ...................................*................................................................................................................. - // trn2 v1.4S, v13.4S, v12.4S // ..............*...................................................................................................................................... - // trn1 v13.4S, v17.4S, v27.4S // ....................*................................................................................................................................ - // trn2 v6.4S, v17.4S, v27.4S // ..................*.................................................................................................................................. - // mul v12.4S, v30.4S, v11.4S // ...........*......................................................................................................................................... - // sqrdmulh v30.4S, v30.4S, v19.4S // ............*........................................................................................................................................ 
- // sub v17.4S, v0.4S, v24.4S // ...................*................................................................................................................................. - // add v0.4S, v0.4S, v24.4S // ...........................*......................................................................................................................... - // mul v24.4S, v7.4S, v28.4S // .............*....................................................................................................................................... - // sqrdmulh v7.4S, v7.4S, v18.4S // ..........*.......................................................................................................................................... - // trn2 v28.2D, v10.2D, v13.2D // ..............................................*...................................................................................................... - // trn2 v27.2D, v1.2D, v6.2D // ...............................*..................................................................................................................... - // trn1 v10.2D, v10.2D, v13.2D // ...........................................*......................................................................................................... - // trn1 v1.2D, v1.2D, v6.2D // ..........................................*.......................................................................................................... - // mls v24.4S, v7.4S, v8.S[0] // .................*................................................................................................................................... - // mls v12.4S, v30.4S, v8.S[0] // ................*.................................................................................................................................... 
- // mul v30.4S, v17.4S, v15.4S // .......................*............................................................................................................................. - // sqrdmulh v7.4S, v17.4S, v21.4S // ......................*.............................................................................................................................. - // sub v13.4S, v10.4S, v1.4S // .............................................*....................................................................................................... - // sub v6.4S, v24.4S, v12.4S // .....................*............................................................................................................................... - // add v24.4S, v24.4S, v12.4S // ..........................*.......................................................................................................................... - // mls v30.4S, v7.4S, v8.S[0] // ............................*........................................................................................................................ - // mul v7.4S, v6.4S, v15.4S // .........................*........................................................................................................................... - // sqrdmulh v6.4S, v6.4S, v21.4S // ........................*............................................................................................................................ - // add v10.4S, v10.4S, v1.4S // ..........................................................*.......................................................................................... - // mul v1.4S, v13.4S, v25.4S // .................................................*................................................................................................... 
- // trn1 v12.4S, v0.4S, v24.4S // ..............................*...................................................................................................................... - // trn2 v0.4S, v0.4S, v24.4S // ................................*.................................................................................................................... - // mls v7.4S, v6.4S, v8.S[0] // .............................*....................................................................................................................... - // sqrdmulh v24.4S, v13.4S, v14.4S // ................................................*.................................................................................................... - // sub v13.4S, v28.4S, v27.4S // ...................................................*................................................................................................. - // add v6.4S, v28.4S, v27.4S // .........................................................*........................................................................................... - // trn1 v17.4S, v30.4S, v7.4S // ..................................*.................................................................................................................. - // mls v1.4S, v24.4S, v8.S[0] // .....................................................*............................................................................................... - // mul v24.4S, v13.4S, v5.4S // ......................................................*.............................................................................................. - // sqrdmulh v13.4S, v13.4S, v29.4S // .......................................................*............................................................................................. 
- // sub v28.4S, v10.4S, v6.4S // .............................................................*....................................................................................... - // add v10.4S, v10.4S, v6.4S // ..............................................................*...................................................................................... - // trn2 v30.4S, v30.4S, v7.4S // .................................*................................................................................................................... - // mls v24.4S, v13.4S, v8.S[0] // ...........................................................*......................................................................................... - // mul v7.4S, v28.4S, v2.4S // .................................................................*................................................................................... - // sqrdmulh v13.4S, v28.4S, v4.4S // ................................................................*.................................................................................... - // trn2 v6.2D, v12.2D, v17.2D // ......................................*.............................................................................................................. - // sub v28.4S, v1.4S, v24.4S // ...............................................................*..................................................................................... - // add v24.4S, v1.4S, v24.4S // .....................................................................*............................................................................... - // mls v7.4S, v13.4S, v8.S[0] // ......................................................................*.............................................................................. 
- // mul v1.4S, v28.4S, v2.4S // ...................................................................*................................................................................. - // sqrdmulh v13.4S, v28.4S, v4.4S // ..................................................................*.................................................................................. - // trn2 v28.2D, v0.2D, v30.2D // .....................................*............................................................................................................... - // trn1 v12.2D, v12.2D, v17.2D // .......................................*............................................................................................................. - // trn1 v0.2D, v0.2D, v30.2D // ....................................*................................................................................................................ - // mls v1.4S, v13.4S, v8.S[0] // .......................................................................*............................................................................. - // trn1 v30.4S, v10.4S, v24.4S // ........................................................................*............................................................................ - // trn2 v10.4S, v10.4S, v24.4S // ..........................................................................*.......................................................................... - // sub v24.4S, v12.4S, v0.4S // ....................................................*................................................................................................ - // trn1 v13.4S, v7.4S, v1.4S // ............................................................................*........................................................................ 
- // trn2 v7.4S, v7.4S, v1.4S // ...........................................................................*......................................................................... - // add v0.4S, v12.4S, v0.4S // .........................................*........................................................................................................... - // trn2 v1.2D, v30.2D, v13.2D // ................................................................................*.................................................................... - // trn2 v12.2D, v10.2D, v7.2D // ...............................................................................*..................................................................... - // trn1 v30.2D, v30.2D, v13.2D // .................................................................................*................................................................... - // trn1 v10.2D, v10.2D, v7.2D // ..............................................................................*...................................................................... - // mul v7.4S, v24.4S, v9.S[2] // ....................................................................................*................................................................ - // sqrdmulh v24.4S, v24.4S, v9.S[3] // ........................................................*............................................................................................ - // sub v13.4S, v6.4S, v28.4S // ............................................................*........................................................................................ - // add v6.4S, v6.4S, v28.4S // ........................................*............................................................................................................ 
- // sub v17.4S, v30.4S, v10.4S // ...................................................................................*................................................................. - // mls v7.4S, v24.4S, v8.S[0] // ........................................................................................*............................................................ - // mul v24.4S, v13.4S, v31.S[0] // .........................................................................*........................................................................... - // sqrdmulh v13.4S, v13.4S, v31.S[1] // ....................................................................*................................................................................ - // add v10.4S, v30.4S, v10.4S // ..............................................................................................*...................................................... - // mul v30.4S, v17.4S, v31.S[2] // .......................................................................................*............................................................. - // sqrdmulh v17.4S, v17.4S, v31.S[3] // ......................................................................................*.............................................................. - // mls v24.4S, v13.4S, v8.S[0] // .............................................................................*....................................................................... - // sub v13.4S, v1.4S, v12.4S // ..................................................................................*.................................................................. - // add v1.4S, v1.4S, v12.4S // .........................................................................................*........................................................... 
- // mls v30.4S, v17.4S, v8.S[0] // ...........................................................................................*......................................................... - // mul v12.4S, v13.4S, v3.S[0] // .....................................................................................*............................................................... - // sqrdmulh v13.4S, v13.4S, v3.S[1] // ..........................................................................................*.......................................................... - // sub v17.4S, v0.4S, v6.4S // .............................................................................................*....................................................... - // add v0.4S, v0.4S, v6.4S // ............................................*........................................................................................................ - // sub v6.4S, v7.4S, v24.4S // ............................................................................................*........................................................ - // mls v12.4S, v13.4S, v8.S[0] // ...............................................................................................*..................................................... - // mul v13.4S, v17.4S, v23.S[2] // ................................................................................................*.................................................... - // sqrdmulh v17.4S, v17.4S, v23.S[3] // .................................................................................................*................................................... - // add v24.4S, v7.4S, v24.4S // .....................................................................................................*............................................... 
- // mul v7.4S, v6.4S, v23.S[2] // ....................................................................................................*................................................ - // sqrdmulh v6.4S, v6.4S, v23.S[3] // ..................................................................................................*.................................................. - // mls v13.4S, v17.4S, v8.S[0] // .......................................................................................................*............................................. - // sub v17.4S, v10.4S, v1.4S // .....................................................................................................................*............................... - // add v10.4S, v10.4S, v1.4S // ............................................................................................................*........................................ - // mls v7.4S, v6.4S, v8.S[0] // ........................................................................................................*............................................ - // mul v1.4S, v17.4S, v9.S[0] // ...............................................................................................................................*..................... - // sqrdmulh v6.4S, v17.4S, v9.S[1] // ................................................................................................................................*.................... - // sub v17.4S, v30.4S, v12.4S // ...................................................................................................*................................................. - // add v30.4S, v30.4S, v12.4S // ..........................................................................................................*.......................................... 
- // srshr v12.4S, v0.4S, #23 // ...............................................*..................................................................................................... - // mls v1.4S, v6.4S, v8.S[0] // ....................................................................................................................................*................ - // mul v6.4S, v17.4S, v9.S[0] // .........................................................................................................*........................................... - // sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................*.............................................. - // mls v0.4S, v12.4S, v8.4S // ..................................................*.................................................................................................. - // srshr v12.4S, v24.4S, #23 // ...........................................................................................................*......................................... - // srshr v28.4S, v10.4S, #23 // ................................................................................................................*.................................... - // mls v6.4S, v17.4S, v8.S[0] // .............................................................................................................*....................................... - // mls v24.4S, v12.4S, v8.4S // ..............................................................................................................*...................................... - // mls v10.4S, v28.4S, v8.4S // ...................................................................................................................*................................. 
- // srshr v12.4S, v30.4S, #23 // ...............................................................................................................*..................................... - // sub v17.4S, v13.4S, v1.4S // ........................................................................................................................................*............ - // add v1.4S, v13.4S, v1.4S // ..........................................................................................................................................*.......... - // mls v30.4S, v12.4S, v8.4S // ..................................................................................................................*.................................. - // sub v13.4S, v0.4S, v10.4S // ........................................................................................................................*............................ - // add v0.4S, v0.4S, v10.4S // .....................................................................................................................................*............... - // mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................*........ - // mul v12.4S, v13.4S, v23.S[0] // .............................................................................................................................*....................... - // sqrdmulh v13.4S, v13.4S, v23.S[1] // ...........................................................................................................................*......................... - // sub v28.4S, v24.4S, v30.4S // ......................................................................................................................*.............................. 
- // add v30.4S, v24.4S, v30.4S // .......................................................................................................................*............................. - // sqrdmulh v24.4S, v17.4S, v23.S[1] // ...........................................................................................................................................*......... - // mls v12.4S, v13.4S, v8.S[0] // .................................................................................................................................*................... - // mul v13.4S, v28.4S, v23.S[0] // ..........................................................................................................................*.......................... - // sqrdmulh v17.4S, v28.4S, v23.S[1] // .........................................................................................................................*........................... - // mls v10.4S, v24.4S, v8.S[0] // .................................................................................................................................................*... - // sub v24.4S, v7.4S, v6.4S // ...................................................................................................................................*................. - // add v7.4S, v7.4S, v6.4S // .................................................................................................................*................................... - // mls v13.4S, v17.4S, v8.S[0] // ..............................................................................................................................*...................... - // mul v6.4S, v24.4S, v23.S[0] // .........................................................................................................................................*........... 
- // sqrdmulh v24.4S, v24.4S, v23.S[1] // .......................................................................................................................................*............. - // str q0, [x1], #(16*4) // ...............................................................................................................................................*..... - // mls v6.4S, v24.4S, v8.S[0] // ..............................................................................................................................................*...... - // str q30, [x1, #-48] // ............................................................................................................................*........................ - // str q1, [x1, #-32] // .............................................................................................................................................*....... - // str q7, [x1, #-16] // ....................................................................................................................*................................ - // add x1, x1, #64 // ................................................................................................................................................*.... - // str q12, [x2], #(16*4) // ......................................................................................................................................*.............. - // str q13, [x2, #-48] // ..................................................................................................................................*.................. - // str q10, [x2, #-32] // ...................................................................................................................................................*. - // str q6, [x2, #-16] // ..................................................................................................................................................*.. 
+ // ------------------------------------------------------------------- new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // trn1 v22.4S, v10.4S, v12.4S // *.................................................................................................................................................... + // trn2 v11.4S, v10.4S, v12.4S // .*................................................................................................................................................... + // trn2 v10.4S, v27.4S, v6.4S // ..*.................................................................................................................................................. + // trn2 v12.2D, v22.2D, v2.2D // ...*................................................................................................................................................. + // trn1 v22.2D, v22.2D, v2.2D // ....*................................................................................................................................................ + // trn2 v6.2D, v11.2D, v10.2D // .....*............................................................................................................................................... + // trn1 v11.2D, v11.2D, v10.2D // ......*.............................................................................................................................................. + // sub v10.4S, v12.4S, v6.4S // .......*............................................................................................................................................. + // add v12.4S, v12.4S, v6.4S // ........*............................................................................................................................................ 
+ // sub v6.4S, v22.4S, v11.4S // .........*........................................................................................................................................... + // add v22.4S, v22.4S, v11.4S // ..........*.......................................................................................................................................... + // trn1 v11.4S, v4.4S, v1.4S // ...........*......................................................................................................................................... + // trn2 v27.4S, v4.4S, v1.4S // ............*........................................................................................................................................ + // trn1 v4.4S, v24.4S, v18.4S // .............*....................................................................................................................................... + // trn2 v2.4S, v24.4S, v18.4S // ..............*...................................................................................................................................... + // sqrdmulh v1.4S, v10.4S, v20.4S // ...............*..................................................................................................................................... + // mul v10.4S, v10.4S, v25.4S // ................*.................................................................................................................................... + // sub v24.4S, v22.4S, v12.4S // .................*................................................................................................................................... + // add v22.4S, v22.4S, v12.4S // ..................*.................................................................................................................................. 
+ // sqrdmulh v12.4S, v6.4S, v31.4S // ...................*................................................................................................................................. + // mul v6.4S, v6.4S, v16.4S // ....................*................................................................................................................................ + // trn2 v16.2D, v11.2D, v4.2D // .....................*............................................................................................................................... + // trn2 v18.2D, v27.2D, v2.2D // ......................*.............................................................................................................................. + // trn1 v11.2D, v11.2D, v4.2D // .......................*............................................................................................................................. + // trn1 v27.2D, v27.2D, v2.2D // ........................*............................................................................................................................ + // mls v6.4S, v12.4S, v8.S[0] // .........................*........................................................................................................................... + // mls v10.4S, v1.4S, v8.S[0] // ..........................*.......................................................................................................................... + // sqrdmulh v12.4S, v24.4S, v0.4S // ...........................*......................................................................................................................... + // mul v4.4S, v24.4S, v23.4S // ............................*........................................................................................................................ 
+ // sub v2.4S, v11.4S, v27.4S // .............................*....................................................................................................................... + // sub v1.4S, v6.4S, v10.4S // ..............................*...................................................................................................................... + // add v10.4S, v6.4S, v10.4S // ...............................*..................................................................................................................... + // mls v4.4S, v12.4S, v8.S[0] // ................................*.................................................................................................................... + // sqrdmulh v12.4S, v1.4S, v0.4S // .................................*................................................................................................................... + // mul v6.4S, v1.4S, v23.4S // ..................................*.................................................................................................................. + // add v11.4S, v11.4S, v27.4S // ...................................*................................................................................................................. + // sqrdmulh v27.4S, v2.4S, v15.4S // ....................................*................................................................................................................ + // trn1 v1.4S, v22.4S, v10.4S // .....................................*............................................................................................................... + // trn2 v22.4S, v22.4S, v10.4S // ......................................*.............................................................................................................. 
+ // mls v6.4S, v12.4S, v8.S[0] // .......................................*............................................................................................................. + // mul v10.4S, v2.4S, v17.4S // ........................................*............................................................................................................ + // sub v12.4S, v16.4S, v18.4S // .........................................*........................................................................................................... + // add v2.4S, v16.4S, v18.4S // ..........................................*.......................................................................................................... + // trn1 v24.4S, v4.4S, v6.4S // ...........................................*......................................................................................................... + // mls v10.4S, v27.4S, v8.S[0] // ............................................*........................................................................................................ + // sqrdmulh v27.4S, v12.4S, v26.4S // .............................................*....................................................................................................... + // mul v12.4S, v12.4S, v3.4S // ..............................................*...................................................................................................... + // sub v16.4S, v11.4S, v2.4S // ...............................................*..................................................................................................... + // add v11.4S, v11.4S, v2.4S // ................................................*.................................................................................................... 
+ // trn2 v6.4S, v4.4S, v6.4S // .................................................*................................................................................................... + // mls v12.4S, v27.4S, v8.S[0] // ..................................................*.................................................................................................. + // sqrdmulh v27.4S, v16.4S, v5.4S // ...................................................*................................................................................................. + // mul v4.4S, v16.4S, v13.4S // ....................................................*................................................................................................ + // trn2 v2.2D, v1.2D, v24.2D // .....................................................*............................................................................................... + // sub v16.4S, v10.4S, v12.4S // ......................................................*.............................................................................................. + // add v10.4S, v10.4S, v12.4S // .......................................................*............................................................................................. + // mls v4.4S, v27.4S, v8.S[0] // ........................................................*............................................................................................ + // sqrdmulh v12.4S, v16.4S, v5.4S // .........................................................*........................................................................................... + // mul v27.4S, v16.4S, v13.4S // ..........................................................*.......................................................................................... 
+ // trn2 v16.2D, v22.2D, v6.2D // ...........................................................*......................................................................................... + // trn1 v1.2D, v1.2D, v24.2D // ............................................................*........................................................................................ + // trn1 v22.2D, v22.2D, v6.2D // .............................................................*....................................................................................... + // mls v27.4S, v12.4S, v8.S[0] // ..............................................................*...................................................................................... + // trn1 v12.4S, v11.4S, v10.4S // ...............................................................*..................................................................................... + // trn2 v11.4S, v11.4S, v10.4S // ................................................................*.................................................................................... + // sub v10.4S, v1.4S, v22.4S // .................................................................*................................................................................... + // trn1 v6.4S, v4.4S, v27.4S // ..................................................................*.................................................................................. + // trn2 v27.4S, v4.4S, v27.4S // ...................................................................*................................................................................. + // add v22.4S, v1.4S, v22.4S // ....................................................................*................................................................................ 
+ // trn2 v4.2D, v12.2D, v6.2D // .....................................................................*............................................................................... + // trn2 v1.2D, v11.2D, v27.2D // ......................................................................*.............................................................................. + // trn1 v12.2D, v12.2D, v6.2D // .......................................................................*............................................................................. + // trn1 v11.2D, v11.2D, v27.2D // ........................................................................*............................................................................ + // sqrdmulh v6.4S, v10.4S, v9.S[3] // .........................................................................*........................................................................... + // mul v10.4S, v10.4S, v9.S[2] // ..........................................................................*.......................................................................... + // sub v27.4S, v2.4S, v16.4S // ...........................................................................*......................................................................... + // add v2.4S, v2.4S, v16.4S // ............................................................................*........................................................................ + // sub v24.4S, v12.4S, v11.4S // .............................................................................*....................................................................... + // mls v10.4S, v6.4S, v8.S[0] // ..............................................................................*...................................................................... 
+ // sqrdmulh v6.4S, v27.4S, v30.S[1] // ...............................................................................*..................................................................... + // mul v27.4S, v27.4S, v30.S[0] // ................................................................................*.................................................................... + // add v11.4S, v12.4S, v11.4S // .................................................................................*................................................................... + // sqrdmulh v12.4S, v24.4S, v30.S[3] // ..................................................................................*.................................................................. + // mul v24.4S, v24.4S, v30.S[2] // ...................................................................................*................................................................. + // mls v27.4S, v6.4S, v8.S[0] // ....................................................................................*................................................................ + // sub v6.4S, v4.4S, v1.4S // .....................................................................................*............................................................... + // add v4.4S, v4.4S, v1.4S // ......................................................................................*.............................................................. + // mls v24.4S, v12.4S, v8.S[0] // .......................................................................................*............................................................. + // sqrdmulh v12.4S, v6.4S, v28.S[1] // ........................................................................................*............................................................ 
+ // mul v6.4S, v6.4S, v28.S[0] // .........................................................................................*........................................................... + // sub v1.4S, v22.4S, v2.4S // ..........................................................................................*.......................................................... + // add v22.4S, v22.4S, v2.4S // ...........................................................................................*......................................................... + // sub v2.4S, v10.4S, v27.4S // ............................................................................................*........................................................ + // mls v6.4S, v12.4S, v8.S[0] // .............................................................................................*....................................................... + // sqrdmulh v12.4S, v1.4S, v7.S[3] // ..............................................................................................*...................................................... + // mul v1.4S, v1.4S, v7.S[2] // ...............................................................................................*..................................................... + // add v10.4S, v10.4S, v27.4S // ................................................................................................*.................................................... + // sqrdmulh v27.4S, v2.4S, v7.S[3] // .................................................................................................*................................................... + // mul v2.4S, v2.4S, v7.S[2] // ..................................................................................................*.................................................. 
+ // mls v1.4S, v12.4S, v8.S[0] // ...................................................................................................*................................................. + // sub v12.4S, v11.4S, v4.4S // ....................................................................................................*................................................ + // add v11.4S, v11.4S, v4.4S // .....................................................................................................*............................................... + // mls v2.4S, v27.4S, v8.S[0] // ......................................................................................................*.............................................. + // sqrdmulh v27.4S, v12.4S, v9.S[1] // .......................................................................................................*............................................. + // mul v12.4S, v12.4S, v9.S[0] // ........................................................................................................*............................................ + // sub v4.4S, v24.4S, v6.4S // .........................................................................................................*........................................... + // add v6.4S, v24.4S, v6.4S // ..........................................................................................................*.......................................... + // srshr v24.4S, v22.4S, #23 // ...........................................................................................................*......................................... + // mls v12.4S, v27.4S, v8.S[0] // ............................................................................................................*........................................ 
+ // sqrdmulh v27.4S, v4.4S, v9.S[1] // .............................................................................................................*....................................... + // mul v4.4S, v4.4S, v9.S[0] // ..............................................................................................................*...................................... + // mls v22.4S, v24.4S, v8.4S // ...............................................................................................................*..................................... + // srshr v24.4S, v10.4S, #23 // ................................................................................................................*.................................... + // srshr v16.4S, v11.4S, #23 // .................................................................................................................*................................... + // mls v4.4S, v27.4S, v8.S[0] // ..................................................................................................................*.................................. + // mls v10.4S, v24.4S, v8.4S // ...................................................................................................................*................................. + // mls v11.4S, v16.4S, v8.4S // ....................................................................................................................*................................ + // srshr v27.4S, v6.4S, #23 // .....................................................................................................................*............................... + // sub v24.4S, v1.4S, v12.4S // ......................................................................................................................*.............................. 
+ // add v12.4S, v1.4S, v12.4S // .......................................................................................................................*............................. + // mls v6.4S, v27.4S, v8.4S // ........................................................................................................................*............................ + // sub v27.4S, v22.4S, v11.4S // .........................................................................................................................*........................... + // add v22.4S, v22.4S, v11.4S // ..........................................................................................................................*.......................... + // sqrdmulh v11.4S, v24.4S, v7.S[1] // ...........................................................................................................................*......................... + // sqrdmulh v1.4S, v27.4S, v7.S[1] // ............................................................................................................................*........................ + // mul v27.4S, v27.4S, v7.S[0] // .............................................................................................................................*....................... + // sub v16.4S, v10.4S, v6.4S // ..............................................................................................................................*...................... + // add v10.4S, v10.4S, v6.4S // ...............................................................................................................................*..................... + // mul v6.4S, v24.4S, v7.S[0] // ................................................................................................................................*.................... 
+ // mls v27.4S, v1.4S, v8.S[0] // .................................................................................................................................*................... + // sqrdmulh v1.4S, v16.4S, v7.S[1] // ..................................................................................................................................*.................. + // mul v24.4S, v16.4S, v7.S[0] // ...................................................................................................................................*................. + // mls v6.4S, v11.4S, v8.S[0] // .............................................................................................................................................*....... + // sub v11.4S, v2.4S, v4.4S // ....................................................................................................................................*................ + // add v4.4S, v2.4S, v4.4S // .....................................................................................................................................*............... + // mls v24.4S, v1.4S, v8.S[0] // .......................................................................................................................................*............. + // sqrdmulh v2.4S, v11.4S, v7.S[1] // .........................................................................................................................................*........... + // mul v11.4S, v11.4S, v7.S[0] // ...........................................................................................................................................*......... + // str q22, [x1], #(16*4) // ......................................................................................................................................*.............. 
+ // mls v11.4S, v2.4S, v8.S[0] // ...............................................................................................................................................*..... + // str q10, [x1, #-48] // ........................................................................................................................................*............ + // str q12, [x1, #-32] // ..........................................................................................................................................*.......... + // str q4, [x1, #-16] // ............................................................................................................................................*........ + // add x1, x1, #64 // ..................................................................................................................................................*.. + // str q27, [x2], #(16*4) // ..............................................................................................................................................*...... + // str q24, [x2, #-48] // ................................................................................................................................................*.... + // str q6, [x2, #-32] // .................................................................................................................................................*... + // str q11, [x2, #-16] // ...................................................................................................................................................*. // add x2, x2, #64 // ....................................................................................................................................................* @@ -1555,772 +1583,808 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q12, [x0, #256] // ..*......... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q21, [x0, #384] // ........*... 
- // gap // ............ - // gap // ............ - // gap // ............ - ldr q15, [x0, #896] // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q17, [x0, #512] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - ldr q27, [x0, #768] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q28, [x0, #640] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - add v19.4S, v12.4S, v21.4S // ..........*. - // gap // ............ - add v11.4S, v27.4S, v15.4S // .........*.. - // gap // ............ - add v18.4S, v17.4S, v28.4S // ......*..... - // gap // ............ - ldr q7, [x0, #0] // *........... - // gap // ............ - // gap // ............ - // gap // ............ - add v4.4S, v18.4S, v11.4S // ...........* - // gap // ............ - ldr q6, [x0, #128] // .*.......... - // gap // ............ - - // original source code - // ldr q7, [x0, #0] // .........*.. - // ldr q6, [x0, #128] // ...........* - // ldr q12, [x0, #256] // *........... - // ldr q17, [x0, #512] // ...*........ - // ldr q28, [x0, #640] // .....*...... - // ldr q27, [x0, #768] // ....*....... - // add v18.4S, v17.4S, v28.4S // ........*... - // ldr q15, [x0, #896] // ..*......... - // ldr q21, [x0, #384] // .*.......... - // add v11.4S, v27.4S, v15.4S // .......*.... - // add v19.4S, v12.4S, v21.4S // ......*..... - // add v4.4S, v18.4S, v11.4S // ..........*. + // Instructions: 12 + // Expected cycles: 19 + // Expected IPC: 0.63 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q10, [x0, #256] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q6, [x0, #384] // ........*..................... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q18, [x0, #896] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x0, #768] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q24, [x0, #640] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v27.4S, v10.4S, v6.4S // ..........*................... + // gap // .............................. + add v13.4S, v16.4S, v18.4S // .........*.................... + // gap // .............................. + add v20.4S, v4.4S, v24.4S // ......*....................... + // gap // .............................. + ldr q11, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v23.4S, v20.4S, v13.4S // ...........*.................. + // gap // .............................. + ldr q12, [x0, #128] // .*............................ + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q11, [x0, #0] // .........*..................... + // ldr q12, [x0, #128] // ...........*................... + // ldr q10, [x0, #256] // *.............................. + // ldr q4, [x0, #512] // ...*........................... + // ldr q24, [x0, #640] // .....*......................... 
+ // ldr q16, [x0, #768] // ....*.......................... + // add v20.4S, v4.4S, v24.4S // ........*...................... + // ldr q18, [x0, #896] // ..*............................ + // ldr q6, [x0, #384] // .*............................. + // add v13.4S, v16.4S, v18.4S // .......*....................... + // add v27.4S, v10.4S, v6.4S // ......*........................ + // add v23.4S, v20.4S, v13.4S // ..........*.................... sub count, count, #1 layer123_start: - sub v10.4S, v7.4S, v6.4S // ........*............................................................................................................... + // Instructions: 120 + // Expected cycles: 128 + // Expected IPC: 0.94 + // + // Wall time: 10.62s + // User time: 10.62s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + sub v22.4S, v11.4S, v12.4S // ........*............................................................................................................... // gap // ........................................................................................................................ - add v24.4S, v7.4S, v6.4S // .........*.............................................................................................................. + add v11.4S, v11.4S, v12.4S // .........*.............................................................................................................. // gap // ........................................................................................................................ - sub v7.4S, v12.4S, v21.4S // .............*.......................................................................................................... 
+ sub v10.4S, v10.4S, v6.4S // .............*.......................................................................................................... // gap // ........................................................................................................................ - mul v13.4S, v10.4S, v1.S[2] // ..........*............................................................................................................. + sqrdmulh v12.4S, v22.4S, v1.S[3] // ..........*............................................................................................................. // gap // ........................................................................................................................ - sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........*............................................................................................................ + mul v22.4S, v22.4S, v1.S[2] // ...........*............................................................................................................ // gap // ........................................................................................................................ - sub v6.4S, v24.4S, v19.4S // ............................*........................................................................................... + sub v6.4S, v11.4S, v27.4S // ............................*........................................................................................... // gap // ........................................................................................................................ - add v24.4S, v24.4S, v19.4S // .............................*.......................................................................................... + add v11.4S, v11.4S, v27.4S // .............................*.......................................................................................... 
// gap // ........................................................................................................................ - mul v12.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + sqrdmulh v27.4S, v10.4S, v2.S[1] // ...............*........................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v7.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + mul v10.4S, v10.4S, v2.S[0] // ................*....................................................................................................... // gap // ........................................................................................................................ - mls v13.4S, v10.4S, v8.S[0] // ............*........................................................................................................... + mls v22.4S, v12.4S, v8.S[0] // ............*........................................................................................................... // gap // ........................................................................................................................ - sub v10.4S, v17.4S, v28.4S // ..................*..................................................................................................... + sub v12.4S, v4.4S, v24.4S // ..................*..................................................................................................... // gap // ........................................................................................................................ 
- mul v17.4S, v6.4S, v0.S[2] // ..............................*......................................................................................... + sqrdmulh v4.4S, v6.4S, v0.S[3] // ..............................*......................................................................................... // gap // ........................................................................................................................ - sqrdmulh v6.4S, v6.4S, v0.S[3] // ...............................*........................................................................................ + mul v6.4S, v6.4S, v0.S[2] // ...............................*........................................................................................ // gap // ........................................................................................................................ - sub v28.4S, v24.4S, v4.4S // ................................................*....................................................................... + sub v24.4S, v11.4S, v23.4S // ................................................*....................................................................... // gap // ........................................................................................................................ - add v24.4S, v24.4S, v4.4S // .................................................*...................................................................... + add v11.4S, v11.4S, v23.4S // .................................................*...................................................................... // gap // ........................................................................................................................ - mls v12.4S, v7.4S, v8.S[0] // .................*...................................................................................................... 
+ mls v10.4S, v27.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - mul v7.4S, v10.4S, v2.S[2] // ....................*................................................................................................... + sqrdmulh v27.4S, v12.4S, v2.S[3] // ....................*................................................................................................... // gap // ........................................................................................................................ - sqrdmulh v10.4S, v10.4S, v2.S[3] // .....................*.................................................................................................. + mul v12.4S, v12.4S, v2.S[2] // .....................*.................................................................................................. // gap // ........................................................................................................................ - sub v27.4S, v27.4S, v15.4S // .......................*................................................................................................ + sub v16.4S, v16.4S, v18.4S // .......................*................................................................................................ // gap // ........................................................................................................................ - sub v15.4S, v13.4S, v12.4S // .................................*...................................................................................... + sub v18.4S, v22.4S, v10.4S // .................................*...................................................................................... 
// gap // ........................................................................................................................ - add v13.4S, v13.4S, v12.4S // ..................................*..................................................................................... + add v22.4S, v22.4S, v10.4S // ..................................*..................................................................................... // gap // ........................................................................................................................ - mls v7.4S, v10.4S, v8.S[0] // ......................*................................................................................................. + mls v12.4S, v27.4S, v8.S[0] // ......................*................................................................................................. // gap // ........................................................................................................................ - mul v10.4S, v27.4S, v3.S[0] // .........................*.............................................................................................. + sqrdmulh v10.4S, v16.4S, v3.S[1] // .........................*.............................................................................................. // gap // ........................................................................................................................ - mls v17.4S, v6.4S, v8.S[0] // ................................*....................................................................................... + mls v6.4S, v4.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ 
- sqrdmulh v6.4S, v27.4S, v3.S[1] // ..........................*............................................................................................. + mul v27.4S, v16.4S, v3.S[0] // ..........................*............................................................................................. // gap // ........................................................................................................................ - mul v12.4S, v15.4S, v0.S[2] // ...................................*.................................................................................... + sqrdmulh v4.4S, v18.4S, v0.S[3] // ...................................*.................................................................................... // gap // ........................................................................................................................ - sqrdmulh v27.4S, v15.4S, v0.S[3] // ....................................*................................................................................... + mul v16.4S, v18.4S, v0.S[2] // ....................................*................................................................................... // gap // ........................................................................................................................ - mul v15.4S, v28.4S, v0.S[0] // ..................................................*..................................................................... + sqrdmulh v18.4S, v24.4S, v0.S[1] // ..................................................*..................................................................... // gap // ........................................................................................................................ - sqrdmulh v28.4S, v28.4S, v0.S[1] // ...................................................*.................................................................... 
+ mul v24.4S, v24.4S, v0.S[0] // ...................................................*.................................................................... // gap // ........................................................................................................................ - mul v21.4S, v24.4S, v25.4S // ........................................................................................*............................... + sqrdmulh v23.4S, v11.4S, v26.4S // ........................................................................................*............................... // gap // ........................................................................................................................ - sqrdmulh v24.4S, v24.4S, v26.4S // .........................................................................................*.............................. + mul v11.4S, v11.4S, v25.4S // .........................................................................................*.............................. // gap // ........................................................................................................................ - mls v10.4S, v6.4S, v8.S[0] // ...........................*............................................................................................ + mls v27.4S, v10.4S, v8.S[0] // ...........................*............................................................................................ // gap // ........................................................................................................................ - mls v12.4S, v27.4S, v8.S[0] // .....................................*.................................................................................. + mls v16.4S, v4.4S, v8.S[0] // .....................................*.................................................................................. 
// gap // ........................................................................................................................ - sub v6.4S, v18.4S, v11.4S // ......................................*................................................................................. + sub v10.4S, v20.4S, v13.4S // ......................................*................................................................................. // gap // ........................................................................................................................ - mls v15.4S, v28.4S, v8.S[0] // ....................................................*................................................................... + mls v24.4S, v18.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ - sub v28.4S, v7.4S, v10.4S // ...........................................*............................................................................ + sub v4.4S, v12.4S, v27.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ - mul v27.4S, v6.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v18.4S, v10.4S, v1.S[1] // ........................................*............................................................................... // gap // ........................................................................................................................ 
- sqrdmulh v6.4S, v6.4S, v1.S[1] // .........................................*.............................................................................. + mul v10.4S, v10.4S, v1.S[0] // .........................................*.............................................................................. // gap // ........................................................................................................................ - add v10.4S, v7.4S, v10.4S // ............................................*........................................................................... + add v12.4S, v12.4S, v27.4S // ............................................*........................................................................... // gap // ........................................................................................................................ - mul v7.4S, v28.4S, v1.S[0] // .............................................*.......................................................................... + sqrdmulh v27.4S, v4.4S, v1.S[1] // .............................................*.......................................................................... // gap // ........................................................................................................................ - sqrdmulh v28.4S, v28.4S, v1.S[1] // ..............................................*......................................................................... + mul v4.4S, v4.4S, v1.S[0] // ..............................................*......................................................................... // gap // ........................................................................................................................ - sub v18.4S, v13.4S, v10.4S // .....................................................*.................................................................. 
+ sub v20.4S, v22.4S, v12.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ - add v10.4S, v13.4S, v10.4S // ......................................................*................................................................. + add v22.4S, v22.4S, v12.4S // ......................................................*................................................................. // gap // ........................................................................................................................ - mls v27.4S, v6.4S, v8.S[0] // ..........................................*............................................................................. + mls v10.4S, v18.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ - mls v7.4S, v28.4S, v8.S[0] // ...............................................*........................................................................ + mls v4.4S, v27.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ - mul v13.4S, v18.4S, v0.S[0] // .......................................................*................................................................ + sqrdmulh v12.4S, v20.4S, v0.S[1] // .......................................................*................................................................ 
// gap // ........................................................................................................................ - sqrdmulh v6.4S, v18.4S, v0.S[1] // ........................................................*............................................................... + mul v27.4S, v20.4S, v0.S[0] // ........................................................*............................................................... // gap // ........................................................................................................................ - sub v28.4S, v17.4S, v27.4S // ..........................................................*............................................................. + sub v18.4S, v6.4S, v10.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ - add v17.4S, v17.4S, v27.4S // ...........................................................*............................................................ + add v10.4S, v6.4S, v10.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ - sub v27.4S, v12.4S, v7.4S // ...............................................................*........................................................ + sub v6.4S, v16.4S, v4.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ 
- mls v13.4S, v6.4S, v8.S[0] // .........................................................*.............................................................. + mls v27.4S, v12.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ - mul v6.4S, v28.4S, v0.S[0] // ............................................................*........................................................... + sqrdmulh v12.4S, v18.4S, v0.S[1] // ............................................................*........................................................... // gap // ........................................................................................................................ - sqrdmulh v28.4S, v28.4S, v0.S[1] // .............................................................*.......................................................... + mul v18.4S, v18.4S, v0.S[0] // .............................................................*.......................................................... // gap // ........................................................................................................................ - add v7.4S, v12.4S, v7.4S // ................................................................*....................................................... + add v4.4S, v16.4S, v4.4S // ................................................................*....................................................... // gap // ........................................................................................................................ - mul v12.4S, v27.4S, v0.S[0] // .................................................................*...................................................... 
+ sqrdmulh v16.4S, v6.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ - sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*..................................................... + mul v6.4S, v6.4S, v0.S[0] // ..................................................................*..................................................... // gap // ........................................................................................................................ - mls v6.4S, v28.4S, v8.S[0] // ..............................................................*......................................................... + mls v18.4S, v12.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ - cmge v28.4S, v31.4S, v15.4S // ....................................................................*................................................... + cmge v12.4S, v31.4S, v24.4S // ....................................................................*................................................... // gap // ........................................................................................................................ - cmge v18.4S, v15.4S, v30.4S // .....................................................................*.................................................. + cmge v20.4S, v24.4S, v30.4S // .....................................................................*.................................................. 
// gap // ........................................................................................................................ - mls v21.4S, v24.4S, v8.S[0] // ..........................................................................................*............................. + mls v11.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - mls v12.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... + mls v6.4S, v16.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ - sub v24.4S, v28.4S, v18.4S // ......................................................................*................................................. + sub v12.4S, v12.4S, v20.4S // ......................................................................*................................................. // gap // ........................................................................................................................ - cmge v28.4S, v31.4S, v13.4S // ........................................................................*............................................... + cmge v16.4S, v31.4S, v27.4S // ........................................................................*............................................... // gap // ........................................................................................................................ 
- cmge v27.4S, v13.4S, v30.4S // .........................................................................*.............................................. + cmge v23.4S, v27.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ - mls v15.4S, v24.4S, v8.4S // .......................................................................*................................................ + mls v24.4S, v12.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - sub v24.4S, v28.4S, v27.4S // ..........................................................................*............................................. + sub v12.4S, v16.4S, v23.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ - cmge v28.4S, v31.4S, v6.4S // ............................................................................*........................................... + cmge v16.4S, v31.4S, v18.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - cmge v27.4S, v6.4S, v30.4S // .............................................................................*.......................................... 
+ cmge v23.4S, v18.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ - mls v13.4S, v24.4S, v8.4S // ...........................................................................*............................................ + mls v27.4S, v12.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ - sub v24.4S, v28.4S, v27.4S // ..............................................................................*......................................... + sub v12.4S, v16.4S, v23.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ - cmge v28.4S, v31.4S, v12.4S // ................................................................................*....................................... + cmge v16.4S, v31.4S, v6.4S // ................................................................................*....................................... // gap // ........................................................................................................................ - cmge v27.4S, v12.4S, v30.4S // .................................................................................*...................................... + cmge v23.4S, v6.4S, v30.4S // .................................................................................*...................................... 
// gap // ........................................................................................................................ - mls v6.4S, v24.4S, v8.4S // ...............................................................................*........................................ + mls v18.4S, v12.4S, v8.4S // ...............................................................................*........................................ // gap // ........................................................................................................................ - sub v24.4S, v28.4S, v27.4S // ..................................................................................*..................................... + sub v12.4S, v16.4S, v23.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ - str q15, [x0, #512] // ....................................................................................*................................... + str q24, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ - mul v28.4S, v10.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v24.4S, v22.4S, v26.4S // ...........................................................................................*............................ // gap // ........................................................................................................................ 
- mls v12.4S, v24.4S, v8.4S // ...................................................................................*.................................... + mls v6.4S, v12.4S, v8.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ - str q13, [x0, #640] // .....................................................................................*.................................. + str q27, [x0, #640] // .....................................................................................*.................................. // gap // ........................................................................................................................ - sqrdmulh v10.4S, v10.4S, v26.4S // ............................................................................................*........................... + mul v22.4S, v22.4S, v25.4S // ............................................................................................*........................... // gap // ........................................................................................................................ - str q6, [x0, #768] // ......................................................................................*................................. + str q18, [x0, #768] // ......................................................................................*................................. // gap // ........................................................................................................................ - mul v24.4S, v17.4S, v25.4S // ..............................................................................................*......................... 
+ sqrdmulh v12.4S, v10.4S, v26.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ - str q12, [x0, #896] // .......................................................................................*................................ + str q6, [x0, #896] // .......................................................................................*................................ // gap // ........................................................................................................................ - mls v28.4S, v10.4S, v8.S[0] // .............................................................................................*.......................... + mls v22.4S, v24.4S, v8.S[0] // .............................................................................................*.......................... // gap // ........................................................................................................................ - sqrdmulh v10.4S, v17.4S, v26.4S // ...............................................................................................*........................ + mul v10.4S, v10.4S, v25.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ - mul v13.4S, v7.4S, v25.4S // .................................................................................................*...................... + sqrdmulh v6.4S, v4.4S, v26.4S // .................................................................................................*...................... 
// gap // ........................................................................................................................ - sqrdmulh v7.4S, v7.4S, v26.4S // ..................................................................................................*..................... + mul v27.4S, v4.4S, v25.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ - cmge v6.4S, v31.4S, v21.4S // ....................................................................................................*................... + cmge v4.4S, v31.4S, v11.4S // ....................................................................................................*................... // gap // ........................................................................................................................ - mls v24.4S, v10.4S, v8.S[0] // ................................................................................................*....................... + mls v10.4S, v12.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ - cmge v10.4S, v21.4S, v30.4S // .....................................................................................................*.................. + cmge v12.4S, v11.4S, v30.4S // .....................................................................................................*.................. // gap // ........................................................................................................................ 
- mls v13.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... + mls v27.4S, v6.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ - sub v10.4S, v6.4S, v10.4S // ......................................................................................................*................. + sub v12.4S, v4.4S, v12.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - cmge v7.4S, v31.4S, v28.4S // ........................................................................................................*............... + cmge v6.4S, v31.4S, v22.4S // ........................................................................................................*............... // gap // ........................................................................................................................ - cmge v6.4S, v28.4S, v30.4S // .........................................................................................................*.............. + cmge v4.4S, v22.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ - mls v21.4S, v10.4S, v8.4S // .......................................................................................................*................ 
+ mls v11.4S, v12.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ - sub v10.4S, v7.4S, v6.4S // ..........................................................................................................*............. + sub v12.4S, v6.4S, v4.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ - cmge v7.4S, v31.4S, v24.4S // ............................................................................................................*........... + cmge v6.4S, v31.4S, v10.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - cmge v6.4S, v24.4S, v30.4S // .............................................................................................................*.......... + cmge v4.4S, v10.4S, v30.4S // .............................................................................................................*.......... // gap // ........................................................................................................................ - mls v28.4S, v10.4S, v8.4S // ...........................................................................................................*............ + mls v22.4S, v12.4S, v8.4S // ...........................................................................................................*............ 
// gap // ........................................................................................................................ - sub v10.4S, v7.4S, v6.4S // ..............................................................................................................*......... + sub v12.4S, v6.4S, v4.4S // ..............................................................................................................*......... // gap // ........................................................................................................................ - cmge v7.4S, v31.4S, v13.4S // ................................................................................................................*....... + cmge v6.4S, v31.4S, v27.4S // ................................................................................................................*....... // gap // ........................................................................................................................ - cmge v6.4S, v13.4S, v30.4S // .................................................................................................................*...... + cmge v4.4S, v27.4S, v30.4S // .................................................................................................................*...... // gap // ........................................................................................................................ - mls v24.4S, v10.4S, v8.4S // ...............................................................................................................*........ + mls v10.4S, v12.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ 
- sub v10.4S, v7.4S, v6.4S // ..................................................................................................................*..... + sub v12.4S, v6.4S, v4.4S // ..................................................................................................................*..... // gap // ........................................................................................................................ - str q21, [x0], #(16) // ....................................................................................................................*... + str q11, [x0], #(16) // ....................................................................................................................*... // gap // ........................................................................................................................ - ldr q7, [x0, #0] // e....................................................................................................................... + ldr q11, [x0, #0] // e....................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v13.4S, v10.4S, v8.4S // ...................................................................................................................*.... + mls v27.4S, v12.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ 
- str q28, [x0, #112] // .....................................................................................................................*.. + str q22, [x0, #112] // .....................................................................................................................*.. // gap // ........................................................................................................................ - ldr q6, [x0, #128] // .e...................................................................................................................... + ldr q12, [x0, #128] // .e...................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q24, [x0, #240] // ......................................................................................................................*. + str q10, [x0, #240] // ......................................................................................................................*. // gap // ........................................................................................................................ - ldr q12, [x0, #256] // ..e..................................................................................................................... + ldr q10, [x0, #256] // ..e..................................................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q17, [x0, #512] // ....e................................................................................................................... + ldr q4, [x0, #512] // ....e................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q28, [x0, #640] // .....e.................................................................................................................. + ldr q24, [x0, #640] // .....e.................................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q27, [x0, #768] // ......e................................................................................................................. + ldr q16, [x0, #768] // ......e................................................................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v18.4S, v17.4S, v28.4S // ...................e.................................................................................................... + add v20.4S, v4.4S, v24.4S // ...................e.................................................................................................... // gap // ........................................................................................................................ - ldr q15, [x0, #896] // .......e................................................................................................................ + ldr q18, [x0, #896] // .......e................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q21, [x0, #384] // ...e.................................................................................................................... + ldr q6, [x0, #384] // ...e.................................................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v11.4S, v27.4S, v15.4S // ........................e............................................................................................... + add v13.4S, v16.4S, v18.4S // ........................e............................................................................................... // gap // ........................................................................................................................ - str q13, [x0, #368] // .......................................................................................................................* + str q27, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ - add v19.4S, v12.4S, v21.4S // ..............e......................................................................................................... + add v27.4S, v10.4S, v6.4S // ..............e......................................................................................................... // gap // ........................................................................................................................ - add v4.4S, v18.4S, v11.4S // .......................................e................................................................................ + add v23.4S, v20.4S, v13.4S // .......................................e................................................................................ // gap // ........................................................................................................................ 
- // original source code - // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. - // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... - // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ - // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. - // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... - // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... - // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... - // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... + // ----------------------------------------------------------- new position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------- + // ldr q9, [x0, #0] // e...............'.......................................................................................................~............. 
+ // ldr q10, [x0, #(1*(1024/8))] // ...e............'..........................................................................................................~.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........'............................................................................................................~........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....'..................................................................................................................~.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........'.............................................................................................................~....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........'..............................................................................................................~...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......'...............................................................................................................~..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....'.................................................................................................................~... // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... - // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. 
- // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ - // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. - // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... - // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... - // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ 
- // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... - // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. - // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... - // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ - // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. - // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. 
- // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... - // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... - // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... - // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ - // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... - // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. - // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... 
- // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... - // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ - // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... - // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ - // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... - // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... - // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. - // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. - // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... - // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. 
- // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... - // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ - // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... - // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ - // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... - // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... - // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... - // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... - // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. - // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... - // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. 
- // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ - // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. - // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... - // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. - // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ - // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... - // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... - // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ - // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... - // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... - // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ 
- // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... - // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... - // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... - // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... - // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. - // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. - // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. - // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ - // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ 
- // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... - // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. - // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... - // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ - // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... - // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... - // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... - // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... - // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... - // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... - // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... 
- // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ - // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. - // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. - // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... - // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ - // str q13, [x0], #(16) // ................|......................................................................................................*.............. - // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... - // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* + // add v9.4s, v9.4s, v10.4s // ................'*.................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................'..*.................................................................................................................. 
+ // mul v10.4s, v24.4s, v1.s[2] // ................'...*................................................................................................................. + // mls v10.4s, v27.4s, v8.s[0] // ................'........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................'.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.'..................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ................'......*.............................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ................'.......*............................................................................................................. + // mls v12.4s, v27.4s, v8.s[0] // ................'..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................'.........*........................................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......'................................................................................................................~.... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ................'...............*..................................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................'................*.................................................................................................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ................'....................*................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................'.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...'...................................................................................................................~. + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ................'.....................*............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ................'.......................*............................................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................'....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................'.....*............................................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................'..........*.......................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................'...........*......................................................................................................... + // mls v11.4s, v27.4s, v8.s[0] // ................'......................*.............................................................................................. 
+ // sub v24.4s, v10.4s, v12.4s // ................'..................*.................................................................................................. + // add v10.4s, v10.4s, v12.4s // ................'...................*................................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................'........................*............................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ................'.........................*........................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ................'...............................*..................................................................................... + // sub v24.4s, v13.4s, v15.4s // ................'................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e'..................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................'...................................*................................................................................. + // mul v15.4s, v24.4s, v1.s[0] // ................'....................................*................................................................................ + // mls v15.4s, v27.4s, v8.s[0] // ................'..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................'..................................*.................................................................................. 
+ // add v14.4s, v14.4s, v16.4s // ................'.....................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................'......................................*.............................................................................. + // mul v16.4s, v24.4s, v1.s[0] // ................'.......................................*............................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'...........................................*......................................................................... + // sub v24.4s, v9.4s, v13.4s // ................'............*........................................................................................................ + // add v9.4s, v9.4s, v13.4s // ................'.............*....................................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'..........................*.......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................'...........................*......................................................................................... + // mls v13.4s, v27.4s, v8.s[0] // ................'.................................*................................................................................... + // sub v24.4s, v10.4s, v14.4s // ................'........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................'.........................................*........................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'............................................*........................................................................ + // mul v14.4s, v24.4s, v0.s[0] // ................'.............................................*....................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ................'.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................'..............................................*...................................................................... + // add v11.4s, v11.4s, v15.4s // ................'...............................................*..................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'..................................................*.................................................................. + // mul v15.4s, v24.4s, v0.s[0] // ................'...................................................*................................................................. + // mls v15.4s, v27.4s, v8.s[0] // ................'.......................................................*............................................................. + // sub v24.4s, v12.4s, v16.4s // ................'................................................*.................................................................... + // add v12.4s, v12.4s, v16.4s // ................'....................................................*................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'.....................................................*............................................................... 
+ // mul v16.4s, v24.4s, v0.s[0] // ................'......................................................*.............................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................'........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................'.........................................................*........................................................... + // sub v28.4s, v27.4s, v28.4s // ................'............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................'...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................'.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................'..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................'................................................................*.................................................... + // mls v14.4s, v28.4s, v8.4s // ................'...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................'.................................................................*................................................... 
+ // cmge v28.4s, v15.4s, v30.4s // ................'..................................................................*.................................................. + // sub v28.4s, v27.4s, v28.4s // ................'....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................'.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................'.....................................................................*............................................... + // cmge v28.4s, v16.4s, v30.4s // ................'......................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ................'........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................'...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................'.........................................................................*........................................... + // str q14, [x0, #(5*(1024/8))] // ................'............................................................................*........................................ + // str q15, [x0, #(6*(1024/8))] // ................'..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................'................................................................................*.................................... 
+ // sqrdmulh v27.4s, v9.4s, v26.4s // ................'............................*........................................................................................ + // mul v9.4s, v9.4s, v25.4s // ................'.............................*....................................................................................... + // mls v9.4s, v27.4s, v8.s[0] // ................'..........................................................*.......................................................... + // sqrdmulh v27.4s, v10.4s, v26.4s // ................'..........................................................................*.......................................... + // mul v10.4s, v10.4s, v25.4s // ................'.............................................................................*....................................... + // mls v10.4s, v27.4s, v8.s[0] // ................'.................................................................................*................................... + // sqrdmulh v27.4s, v11.4s, v26.4s // ................'...............................................................................*..................................... + // mul v11.4s, v11.4s, v25.4s // ................'..................................................................................*.................................. + // mls v11.4s, v27.4s, v8.s[0] // ................'......................................................................................*.............................. + // sqrdmulh v27.4s, v12.4s, v26.4s // ................'...................................................................................*................................. + // mul v12.4s, v12.4s, v25.4s // ................'....................................................................................*................................ 
+ // mls v12.4s, v27.4s, v8.s[0] // ................'........................................................................................*............................ + // cmge v27.4s, v31.4s, v9.4s // ................'.....................................................................................*............................... + // cmge v28.4s, v9.4s, v30.4s // ................'.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................'.........................................................................................*........................... + // mls v9.4s, v28.4s, v8.4s // ................'............................................................................................*........................ + // cmge v27.4s, v31.4s, v10.4s // ................'..........................................................................................*.......................... + // cmge v28.4s, v10.4s, v30.4s // ................'...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................'.............................................................................................*....................... + // mls v10.4s, v28.4s, v8.4s // ................'................................................................................................*.................... + // cmge v27.4s, v31.4s, v11.4s // ................'..............................................................................................*...................... + // cmge v28.4s, v11.4s, v30.4s // ................'...............................................................................................*..................... 
+ // sub v28.4s, v27.4s, v28.4s // ................'.................................................................................................*................... + // mls v11.4s, v28.4s, v8.4s // ................'....................................................................................................*................ + // cmge v27.4s, v31.4s, v12.4s // ................'..................................................................................................*.................. + // cmge v28.4s, v12.4s, v30.4s // ................'...................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // ................'.....................................................................................................*............... + // mls v12.4s, v28.4s, v8.4s // .~..............'........................................................................................................*............ + // str q9, [x0], #(16) // ................'......................................................................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // ..~.............'.........................................................................................................*........... + // str q11, [x0, #(-16 + 2*(1024/8))] // ....~...........'...........................................................................................................*......... + // str q12, [x0, #(-16 + 3*(1024/8))] // .............~..'....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v5.4S, v7.4S, v6.4S // *........................................................................................................... 
+ // Instructions: 108 + // Expected cycles: 110 + // Expected IPC: 0.98 + // + // Wall time: 14.56s + // User time: 14.56s + // + // -------------------------------------------- original position --------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------- + sub v19.4S, v11.4S, v12.4S // *........................................................................................................... // gap // ............................................................................................................ - sub v27.4S, v27.4S, v15.4S // ..................*......................................................................................... + sub v14.4S, v10.4S, v6.4S // ..*......................................................................................................... // gap // ............................................................................................................ - sub v10.4S, v17.4S, v28.4S // ..........*................................................................................................. + sub v15.4S, v4.4S, v24.4S // ..........*................................................................................................. // gap // ............................................................................................................ - sqrdmulh v14.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + sqrdmulh v29.4S, v19.4S, v1.S[3] // ...*........................................................................................................ // gap // ............................................................................................................ - mul v13.4S, v27.4S, v3.S[0] // ......................*..................................................................................... 
+ sqrdmulh v28.4S, v14.4S, v2.S[1] // .......*.................................................................................................... // gap // ............................................................................................................ - sub v12.4S, v12.4S, v21.4S // ..*......................................................................................................... + mul v4.4S, v14.4S, v2.S[0] // ........*................................................................................................... // gap // ............................................................................................................ - mul v15.4S, v10.4S, v2.S[2] // ................*........................................................................................... + mul v5.4S, v19.4S, v1.S[2] // ....*....................................................................................................... // gap // ............................................................................................................ - sqrdmulh v20.4S, v10.4S, v2.S[3] // .................*.......................................................................................... + add v21.4S, v11.4S, v12.4S // .*.......................................................................................................... // gap // ............................................................................................................ - sqrdmulh v10.4S, v27.4S, v3.S[1] // ........................*................................................................................... + sub v19.4S, v16.4S, v18.4S // ..................*......................................................................................... // gap // ............................................................................................................ 
- sqrdmulh v27.4S, v12.4S, v2.S[1] // ........*................................................................................................... + mls v4.4S, v28.4S, v8.S[0] // ...............*............................................................................................ // gap // ............................................................................................................ - mul v17.4S, v12.4S, v2.S[0] // .......*.................................................................................................... + mls v5.4S, v29.4S, v8.S[0] // .........*.................................................................................................. // gap // ............................................................................................................ - mul v29.4S, v5.4S, v1.S[2] // ...*........................................................................................................ + add v6.4S, v21.4S, v27.4S // ......*..................................................................................................... // gap // ............................................................................................................ - mls v13.4S, v10.4S, v8.S[0] // ...............................*............................................................................ + sub v28.4S, v21.4S, v27.4S // .....*...................................................................................................... // gap // ............................................................................................................ - mls v15.4S, v20.4S, v8.S[0] // .....................*...................................................................................... + sqrdmulh v29.4S, v15.4S, v2.S[3] // ................*........................................................................................... 
// gap // ............................................................................................................ - mls v17.4S, v27.4S, v8.S[0] // ...............*............................................................................................ + sub v9.4S, v5.4S, v4.4S // ...................*........................................................................................ // gap // ............................................................................................................ - mls v29.4S, v14.4S, v8.S[0] // .........*.................................................................................................. + mul v21.4S, v19.4S, v3.S[0] // ........................*................................................................................... // gap // ............................................................................................................ - add v5.4S, v7.4S, v6.4S // .*.......................................................................................................... + sqrdmulh v14.4S, v19.4S, v3.S[1] // ......................*..................................................................................... // gap // ............................................................................................................ - sub v10.4S, v15.4S, v13.4S // ...................................*........................................................................ + sqrdmulh v7.4S, v28.4S, v0.S[3] // ...........*................................................................................................ // gap // ............................................................................................................ - add v28.4S, v15.4S, v13.4S // ......................................*..................................................................... 
+ mul v10.4S, v28.4S, v0.S[2] // ............*............................................................................................... // gap // ............................................................................................................ - sub v7.4S, v29.4S, v17.4S // ...................*........................................................................................ + mul v28.4S, v15.4S, v2.S[2] // .................*.......................................................................................... // gap // ............................................................................................................ - sqrdmulh v24.4S, v10.4S, v1.S[1] // ........................................*................................................................... + mls v21.4S, v14.4S, v8.S[0] // ...............................*............................................................................ // gap // ............................................................................................................ - mul v21.4S, v10.4S, v1.S[0] // .......................................*.................................................................... + sub v17.4S, v6.4S, v23.4S // .............*.............................................................................................. // gap // ............................................................................................................ - sqrdmulh v13.4S, v7.4S, v0.S[3] // ..........................*................................................................................. + sub v14.4S, v20.4S, v13.4S // .................................*.......................................................................... // gap // ............................................................................................................ 
- mul v22.4S, v7.4S, v0.S[2] // .........................*.................................................................................. + mls v28.4S, v29.4S, v8.S[0] // .....................*...................................................................................... // gap // ............................................................................................................ - add v16.4S, v5.4S, v19.4S // ......*..................................................................................................... + add v12.4S, v5.4S, v4.4S // ....................*....................................................................................... // gap // ............................................................................................................ - mls v21.4S, v24.4S, v8.S[0] // ............................................*............................................................... + mul v18.4S, v14.4S, v1.S[0] // .....................................*...................................................................... // gap // ............................................................................................................ - add v14.4S, v29.4S, v17.4S // ....................*....................................................................................... + sqrdmulh v29.4S, v9.4S, v0.S[3] // .........................*.................................................................................. // gap // ............................................................................................................ - mls v22.4S, v13.4S, v8.S[0] // ................................*........................................................................... + sub v19.4S, v28.4S, v21.4S // ...................................*........................................................................ 
// gap // ............................................................................................................ - add v15.4S, v16.4S, v4.4S // ..............*............................................................................................. + add v15.4S, v28.4S, v21.4S // ......................................*..................................................................... // gap // ............................................................................................................ - add v24.4S, v14.4S, v28.4S // ..........................................*................................................................. + mul v5.4S, v9.4S, v0.S[2] // ..........................*................................................................................. // gap // ............................................................................................................ - sub v18.4S, v18.4S, v11.4S // .................................*.......................................................................... + sqrdmulh v14.4S, v14.4S, v1.S[1] // ....................................*....................................................................... // gap // ............................................................................................................ - add v10.4S, v22.4S, v21.4S // .....................................................*...................................................... + mul v16.4S, v19.4S, v1.S[0] // ........................................*................................................................... // gap // ............................................................................................................ - sqrdmulh v12.4S, v24.4S, v26.4S // ..............................................................................*............................. 
+ add v9.4S, v6.4S, v23.4S // ..............*............................................................................................. // gap // ............................................................................................................ - mul v6.4S, v24.4S, v25.4S // ...........................................................................*................................ + sqrdmulh v21.4S, v19.4S, v1.S[1] // .......................................*.................................................................... // gap // ............................................................................................................ - sqrdmulh v13.4S, v10.4S, v26.4S // .....................................................................................*...................... + sqrdmulh v19.4S, v17.4S, v0.S[1] // ...........................*................................................................................ // gap // ............................................................................................................ - mul v7.4S, v10.4S, v25.4S // ....................................................................................*....................... + mls v5.4S, v29.4S, v8.S[0] // ................................*........................................................................... // gap // ............................................................................................................ - mul v27.4S, v15.4S, v25.4S // .............................*.............................................................................. + mls v18.4S, v14.4S, v8.S[0] // ...........................................*................................................................ // gap // ............................................................................................................ 
- sqrdmulh v10.4S, v15.4S, v26.4S // ..............................*............................................................................. + mls v16.4S, v21.4S, v8.S[0] // ............................................*............................................................... // gap // ............................................................................................................ - sub v20.4S, v5.4S, v19.4S // .....*...................................................................................................... + sub v29.4S, v12.4S, v15.4S // .........................................*.................................................................. // gap // ............................................................................................................ - mls v7.4S, v13.4S, v8.S[0] // .........................................................................................*.................. + mls v10.4S, v7.4S, v8.S[0] // .......................*.................................................................................... // gap // ............................................................................................................ - sqrdmulh v17.4S, v18.4S, v1.S[1] // .....................................*...................................................................... + mul v6.4S, v17.4S, v0.S[0] // ............................*............................................................................... // gap // ............................................................................................................ - mls v27.4S, v10.4S, v8.S[0] // ...........................................................*................................................ + sqrdmulh v28.4S, v29.4S, v0.S[1] // .............................................*.............................................................. 
// gap // ............................................................................................................ - sqrdmulh v11.4S, v20.4S, v0.S[3] // ............*............................................................................................... + sub v17.4S, v5.4S, v16.4S // .................................................*.......................................................... // gap // ............................................................................................................ - cmge v10.4S, v7.4S, v30.4S // ....................................................................................................*....... + mul v24.4S, v29.4S, v0.S[0] // ..............................................*............................................................. // gap // ............................................................................................................ - cmge v24.4S, v31.4S, v7.4S // ...................................................................................................*........ + mls v6.4S, v19.4S, v8.S[0] // ..................................*......................................................................... // gap // ............................................................................................................ - cmge v23.4S, v31.4S, v27.4S // ......................................................................................*..................... + sub v14.4S, v10.4S, v18.4S // ...............................................*............................................................ // gap // ............................................................................................................ - sub v10.4S, v24.4S, v10.4S // ......................................................................................................*..... 
+ mul v4.4S, v9.4S, v25.4S // ..............................*............................................................................. // gap // ............................................................................................................ - mul v13.4S, v18.4S, v1.S[0] // ....................................*....................................................................... + sqrdmulh v20.4S, v9.4S, v26.4S // .............................*.............................................................................. // gap // ............................................................................................................ - cmge v15.4S, v27.4S, v30.4S // ........................................................................................*................... + sqrdmulh v9.4S, v14.4S, v0.S[1] // ...................................................*........................................................ // gap // ............................................................................................................ - mls v7.4S, v10.4S, v8.4S // ........................................................................................................*... + cmge v21.4S, v31.4S, v6.4S // .........................................................*.................................................. // gap // ............................................................................................................ - sub v10.4S, v23.4S, v15.4S // ..........................................................................................*................. + mls v24.4S, v28.4S, v8.S[0] // ..................................................*......................................................... // gap // ............................................................................................................ 
- mls v13.4S, v17.4S, v8.S[0] // ...........................................*................................................................ + mul v23.4S, v14.4S, v0.S[0] // ....................................................*....................................................... // gap // ............................................................................................................ - mul v5.4S, v20.4S, v0.S[2] // ...........*................................................................................................ + sqrdmulh v13.4S, v17.4S, v0.S[1] // ......................................................*..................................................... // gap // ............................................................................................................ - str q7, [x0, #384] // ...........................................................................................................* + mul v17.4S, v17.4S, v0.S[0] // .......................................................*.................................................... // gap // ............................................................................................................ - mls v27.4S, v10.4S, v8.4S // .............................................................................................*.............. + cmge v14.4S, v6.4S, v30.4S // ..........................................................*................................................. // gap // ............................................................................................................ - mls v6.4S, v12.4S, v8.S[0] // ..................................................................................*......................... + mls v23.4S, v9.4S, v8.S[0] // ........................................................*................................................... 
// gap // ............................................................................................................ - mls v5.4S, v11.4S, v8.S[0] // .......................*.................................................................................... + sub v29.4S, v21.4S, v14.4S // .............................................................*.............................................. // gap // ............................................................................................................ - sub v17.4S, v22.4S, v21.4S // .................................................*.......................................................... + mls v4.4S, v20.4S, v8.S[0] // ...........................................................*................................................ // gap // ............................................................................................................ - str q27, [x0], #(16) // .......................................................................................................*.... + cmge v7.4S, v31.4S, v24.4S // ..............................................................*............................................. // gap // ............................................................................................................ - cmge v24.4S, v31.4S, v6.4S // ...........................................................................................*................ + cmge v21.4S, v23.4S, v30.4S // ...................................................................*........................................ // gap // ............................................................................................................ - add v7.4S, v5.4S, v13.4S // ................................................*........................................................... + add v20.4S, v5.4S, v16.4S // .....................................................*...................................................... 
// gap // ............................................................................................................ - sub v27.4S, v5.4S, v13.4S // ...............................................*............................................................ + mls v17.4S, v13.4S, v8.S[0] // ............................................................*............................................... // gap // ............................................................................................................ - cmge v10.4S, v6.4S, v30.4S // ............................................................................................*............... + cmge v19.4S, v24.4S, v30.4S // ...............................................................*............................................ // gap // ............................................................................................................ - mul v13.4S, v7.4S, v25.4S // ................................................................................*........................... + cmge v14.4S, v31.4S, v23.4S // ..................................................................*......................................... // gap // ............................................................................................................ - sqrdmulh v7.4S, v7.4S, v26.4S // ...................................................................................*........................ + mls v6.4S, v29.4S, v8.4S // ................................................................*........................................... // gap // ............................................................................................................ - sub v10.4S, v24.4S, v10.4S // ..............................................................................................*............. 
+ sub v14.4S, v14.4S, v21.4S // .....................................................................*...................................... // gap // ............................................................................................................ - sqrdmulh v12.4S, v17.4S, v0.S[1] // .......................................................*.................................................... + add v28.4S, v12.4S, v15.4S // ..........................................*................................................................. // gap // ............................................................................................................ - mul v15.4S, v17.4S, v0.S[0] // ......................................................*..................................................... + sub v9.4S, v7.4S, v19.4S // .................................................................*.......................................... // gap // ............................................................................................................ - mls v13.4S, v7.4S, v8.S[0] // .......................................................................................*.................... + cmge v21.4S, v17.4S, v30.4S // .......................................................................*.................................... // gap // ............................................................................................................ - mls v6.4S, v10.4S, v8.4S // .................................................................................................*.......... + cmge v29.4S, v31.4S, v17.4S // ......................................................................*..................................... // gap // ............................................................................................................ 
- sqrdmulh v11.4S, v27.4S, v0.S[1] // ....................................................*....................................................... + mls v23.4S, v14.4S, v8.4S // ........................................................................*................................... // gap // ............................................................................................................ - mls v15.4S, v12.4S, v8.S[0] // ............................................................*............................................... + add v14.4S, v10.4S, v18.4S // ................................................*........................................................... // gap // ............................................................................................................ - cmge v24.4S, v31.4S, v13.4S // ...............................................................................................*............ + sub v21.4S, v29.4S, v21.4S // .........................................................................*.................................. // gap // ............................................................................................................ - cmge v10.4S, v13.4S, v30.4S // ................................................................................................*........... + mls v24.4S, v9.4S, v8.4S // ....................................................................*....................................... // gap // ............................................................................................................ - mul v17.4S, v27.4S, v0.S[0] // ...................................................*........................................................ + str q6, [x0, #512] // ..........................................................................*................................. 
// gap // ............................................................................................................ - sub v10.4S, v24.4S, v10.4S // ..................................................................................................*......... + mls v17.4S, v21.4S, v8.4S // ............................................................................*............................... // gap // ............................................................................................................ - sub v12.4S, v14.4S, v28.4S // .........................................*.................................................................. + mul v5.4S, v28.4S, v25.4S // ..............................................................................*............................. // gap // ............................................................................................................ - cmge v7.4S, v31.4S, v15.4S // ......................................................................*..................................... + mul v15.4S, v14.4S, v25.4S // ...................................................................................*........................ // gap // ............................................................................................................ - cmge v24.4S, v15.4S, v30.4S // .......................................................................*.................................... + sqrdmulh v9.4S, v28.4S, v26.4S // ...........................................................................*................................ // gap // ............................................................................................................ - mls v17.4S, v11.4S, v8.S[0] // ........................................................*................................................... 
+ str q24, [x0, #640] // .............................................................................*.............................. // gap // ............................................................................................................ - sub v24.4S, v7.4S, v24.4S // .........................................................................*.................................. + sqrdmulh v19.4S, v14.4S, v26.4S // ................................................................................*........................... // gap // ............................................................................................................ - sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................*............................................................. + cmge v29.4S, v31.4S, v4.4S // ......................................................................................*..................... // gap // ............................................................................................................ - mul v27.4S, v12.4S, v0.S[0] // .............................................*.............................................................. + mls v5.4S, v9.4S, v8.S[0] // ..................................................................................*......................... // gap // ............................................................................................................ - cmge v12.4S, v31.4S, v17.4S // ..................................................................*......................................... + sqrdmulh v14.4S, v20.4S, v26.4S // ....................................................................................*....................... // gap // ............................................................................................................ 
- cmge v7.4S, v17.4S, v30.4S // ...................................................................*........................................ + mul v7.4S, v20.4S, v25.4S // .....................................................................................*...................... // gap // ............................................................................................................ - sub v19.4S, v16.4S, v4.4S // .............*.............................................................................................. + mls v15.4S, v19.4S, v8.S[0] // .......................................................................................*.................... // gap // ............................................................................................................ - sub v7.4S, v12.4S, v7.4S // .....................................................................*...................................... + cmge v21.4S, v4.4S, v30.4S // ........................................................................................*................... // gap // ............................................................................................................ - mls v27.4S, v28.4S, v8.S[0] // ..................................................*......................................................... + str q17, [x0, #896] // .................................................................................*.......................... // gap // ............................................................................................................ - sqrdmulh v11.4S, v19.4S, v0.S[1] // ............................*............................................................................... + sub v28.4S, v29.4S, v21.4S // ..........................................................................................*................. 
// gap // ............................................................................................................ - mls v17.4S, v7.4S, v8.4S // ........................................................................*................................... + cmge v29.4S, v31.4S, v5.4S // ...........................................................................................*................ // gap // ............................................................................................................ - mul v28.4S, v19.4S, v0.S[0] // ...........................*................................................................................ + mls v7.4S, v14.4S, v8.S[0] // .........................................................................................*.................. // gap // ............................................................................................................ - cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................................ + cmge v14.4S, v5.4S, v30.4S // ............................................................................................*............... // gap // ............................................................................................................ - cmge v7.4S, v31.4S, v27.4S // ..............................................................*............................................. + cmge v21.4S, v31.4S, v15.4S // ...............................................................................................*............ // gap // ............................................................................................................ - str q17, [x0, #752] // ...............................................................................*............................ + cmge v19.4S, v15.4S, v30.4S // ................................................................................................*........... 
// gap // ............................................................................................................ - sub v7.4S, v7.4S, v12.4S // .................................................................*.......................................... + mls v4.4S, v28.4S, v8.4S // .............................................................................................*.............. // gap // ............................................................................................................ - mls v28.4S, v11.4S, v8.S[0] // ..................................*......................................................................... + sub v19.4S, v21.4S, v19.4S // ..................................................................................................*......... // gap // ............................................................................................................ - str q6, [x0, #112] // .........................................................................................................*.. + cmge v21.4S, v7.4S, v30.4S // ....................................................................................................*....... // gap // ............................................................................................................ - mls v27.4S, v7.4S, v8.4S // ....................................................................*....................................... + sub v9.4S, v29.4S, v14.4S // ..............................................................................................*............. // gap // ............................................................................................................ - mls v15.4S, v24.4S, v8.4S // ............................................................................*............................... + cmge v14.4S, v31.4S, v7.4S // ...................................................................................................*........ 
// gap // ............................................................................................................ - cmge v6.4S, v31.4S, v28.4S // .........................................................*.................................................. + str q23, [x0, #768] // ...............................................................................*............................ // gap // ............................................................................................................ - cmge v24.4S, v28.4S, v30.4S // ..........................................................*................................................. + sub v29.4S, v14.4S, v21.4S // ......................................................................................................*..... // gap // ............................................................................................................ - str q27, [x0, #624] // .............................................................................*.............................. + mls v5.4S, v9.4S, v8.4S // .................................................................................................*.......... // gap // ............................................................................................................ - sub v24.4S, v6.4S, v24.4S // .............................................................*.............................................. + str q4, [x0], #(16) // .......................................................................................................*.... // gap // ............................................................................................................ - mls v13.4S, v10.4S, v8.4S // .....................................................................................................*...... + mls v15.4S, v19.4S, v8.4S // .....................................................................................................*...... 
// gap // ............................................................................................................ - str q15, [x0, #880] // .................................................................................*.......................... + mls v7.4S, v29.4S, v8.4S // ........................................................................................................*... // gap // ............................................................................................................ - mls v28.4S, v24.4S, v8.4S // ................................................................*........................................... + str q5, [x0, #112] // .........................................................................................................*.. // gap // ............................................................................................................ // gap // ............................................................................................................ // gap // ............................................................................................................ - str q13, [x0, #240] // ..........................................................................................................*. + str q15, [x0, #240] // ..........................................................................................................*. // gap // ............................................................................................................ // gap // ............................................................................................................ // gap // ............................................................................................................ - str q28, [x0, #496] // ..........................................................................*................................. 
+ str q7, [x0, #368] // ...........................................................................................................* // gap // ............................................................................................................ - // original source code - // sub v10.4S, v7.4S, v6.4S // *........................................................................................................... - // add v24.4S, v7.4S, v6.4S // ................*........................................................................................... - // sub v7.4S, v12.4S, v21.4S // .....*...................................................................................................... - // mul v13.4S, v10.4S, v1.S[2] // ...........*................................................................................................ - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ...*........................................................................................................ - // sub v6.4S, v24.4S, v19.4S // ......................................*..................................................................... - // add v24.4S, v24.4S, v19.4S // ........................*................................................................................... - // mul v12.4S, v7.4S, v2.S[0] // ..........*................................................................................................. - // sqrdmulh v7.4S, v7.4S, v2.S[1] // .........*.................................................................................................. - // mls v13.4S, v10.4S, v8.S[0] // ...............*............................................................................................ - // sub v10.4S, v17.4S, v28.4S // ..*......................................................................................................... 
- // mul v17.4S, v6.4S, v0.S[2] // ....................................................*....................................................... - // sqrdmulh v6.4S, v6.4S, v0.S[3] // ..........................................*................................................................. - // sub v28.4S, v24.4S, v4.4S // .....................................................................................*...................... - // add v24.4S, v24.4S, v4.4S // ............................*............................................................................... - // mls v12.4S, v7.4S, v8.S[0] // ..............*............................................................................................. - // mul v7.4S, v10.4S, v2.S[2] // ......*..................................................................................................... - // sqrdmulh v10.4S, v10.4S, v2.S[3] // .......*.................................................................................................... - // sub v27.4S, v27.4S, v15.4S // .*.......................................................................................................... - // sub v15.4S, v13.4S, v12.4S // ...................*........................................................................................ - // add v13.4S, v13.4S, v12.4S // ..........................*................................................................................. - // mls v7.4S, v10.4S, v8.S[0] // .............*.............................................................................................. - // mul v10.4S, v27.4S, v3.S[0] // ....*....................................................................................................... - // mls v17.4S, v6.4S, v8.S[0] // ........................................................*................................................... 
- // sqrdmulh v6.4S, v27.4S, v3.S[1] // ........*................................................................................................... - // mul v12.4S, v15.4S, v0.S[2] // .......................*.................................................................................... - // sqrdmulh v27.4S, v15.4S, v0.S[3] // ......................*..................................................................................... - // mul v15.4S, v28.4S, v0.S[0] // ..........................................................................................*................. - // sqrdmulh v28.4S, v28.4S, v0.S[1] // ........................................................................................*................... - // mul v21.4S, v24.4S, v25.4S // ....................................*....................................................................... - // sqrdmulh v24.4S, v24.4S, v26.4S // .....................................*...................................................................... - // mls v10.4S, v6.4S, v8.S[0] // ............*............................................................................................... - // mls v12.4S, v27.4S, v8.S[0] // ...........................*................................................................................ - // sub v6.4S, v18.4S, v11.4S // ..............................*............................................................................. - // mls v15.4S, v28.4S, v8.S[0] // ...............................................................................................*............ - // sub v28.4S, v7.4S, v10.4S // .................*.......................................................................................... - // mul v27.4S, v6.4S, v1.S[0] // ...............................................*............................................................ 
- // sqrdmulh v6.4S, v6.4S, v1.S[1] // ........................................*................................................................... - // add v10.4S, v7.4S, v10.4S // ..................*......................................................................................... - // mul v7.4S, v28.4S, v1.S[0] // .....................*...................................................................................... - // sqrdmulh v28.4S, v28.4S, v1.S[1] // ....................*....................................................................................... - // sub v18.4S, v13.4S, v10.4S // ............................................................................*............................... - // add v10.4S, v13.4S, v10.4S // .............................*.............................................................................. - // mls v27.4S, v6.4S, v8.S[0] // ...................................................*........................................................ - // mls v7.4S, v28.4S, v8.S[0] // .........................*.................................................................................. - // mul v13.4S, v18.4S, v0.S[0] // ..................................................................................*......................... - // sqrdmulh v6.4S, v18.4S, v0.S[1] // .................................................................................*.......................... - // sub v28.4S, v17.4S, v27.4S // .............................................................*.............................................. - // add v17.4S, v17.4S, v27.4S // ............................................................*............................................... - // sub v27.4S, v12.4S, v7.4S // .........................................................*.................................................. 
- // mls v13.4S, v6.4S, v8.S[0] // .......................................................................................*.................... - // mul v6.4S, v28.4S, v0.S[0] // ..........................................................................*................................. - // sqrdmulh v28.4S, v28.4S, v0.S[1] // ......................................................................*..................................... - // add v7.4S, v12.4S, v7.4S // ...............................*............................................................................ - // mul v12.4S, v27.4S, v0.S[0] // ...................................................................*........................................ - // sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*......................................... - // mls v6.4S, v28.4S, v8.S[0] // ...............................................................................*............................ - // cmge v28.4S, v31.4S, v15.4S // ...................................................................................................*........ - // cmge v18.4S, v15.4S, v30.4S // ....................................................................................................*....... - // mls v21.4S, v24.4S, v8.S[0] // .........................................*.................................................................. - // mls v12.4S, v27.4S, v8.S[0] // .......................................................................*.................................... - // sub v24.4S, v28.4S, v18.4S // ......................................................................................................*..... - // cmge v28.4S, v31.4S, v13.4S // ............................................................................................*............... 
- // cmge v27.4S, v13.4S, v30.4S // ...........................................................................................*................ - // mls v15.4S, v24.4S, v8.4S // .........................................................................................................*.. - // sub v24.4S, v28.4S, v27.4S // ..............................................................................................*............. - // cmge v28.4S, v31.4S, v6.4S // ...................................................................................*........................ - // cmge v27.4S, v6.4S, v30.4S // ....................................................................................*....................... - // mls v13.4S, v24.4S, v8.4S // .................................................................................................*.......... - // sub v24.4S, v28.4S, v27.4S // ......................................................................................*..................... - // cmge v28.4S, v31.4S, v12.4S // .............................................................................*.............................. - // cmge v27.4S, v12.4S, v30.4S // ..............................................................................*............................. - // mls v6.4S, v24.4S, v8.4S // .........................................................................................*.................. - // sub v24.4S, v28.4S, v27.4S // ................................................................................*........................... - // str q15, [x0, #512] // ...........................................................................................................* - // mul v28.4S, v10.4S, v25.4S // .................................*.......................................................................... 
- // mls v12.4S, v24.4S, v8.4S // ..................................................................................................*......... - // str q13, [x0, #640] // .....................................................................................................*...... - // sqrdmulh v10.4S, v10.4S, v26.4S // ................................*........................................................................... - // str q6, [x0, #768] // .............................................................................................*.............. - // mul v24.4S, v17.4S, v25.4S // ...............................................................*............................................ - // str q12, [x0, #896] // ........................................................................................................*... - // mls v28.4S, v10.4S, v8.S[0] // .......................................................*.................................................... - // sqrdmulh v10.4S, v17.4S, v26.4S // ................................................................*........................................... - // mul v13.4S, v7.4S, v25.4S // ...................................*........................................................................ - // sqrdmulh v7.4S, v7.4S, v26.4S // ..................................*......................................................................... - // cmge v6.4S, v31.4S, v21.4S // .............................................*.............................................................. - // mls v24.4S, v10.4S, v8.S[0] // ....................................................................*....................................... - // cmge v10.4S, v21.4S, v30.4S // ................................................*........................................................... 
- // mls v13.4S, v7.4S, v8.S[0] // .......................................*.................................................................... - // sub v10.4S, v6.4S, v10.4S // ..................................................*......................................................... - // cmge v7.4S, v31.4S, v28.4S // ...........................................................*................................................ - // cmge v6.4S, v28.4S, v30.4S // ..............................................................*............................................. - // mls v21.4S, v10.4S, v8.4S // ......................................................*..................................................... - // sub v10.4S, v7.4S, v6.4S // .................................................................*.......................................... - // cmge v7.4S, v31.4S, v24.4S // ........................................................................*................................... - // cmge v6.4S, v24.4S, v30.4S // .........................................................................*.................................. - // mls v28.4S, v10.4S, v8.4S // .....................................................................*...................................... - // sub v10.4S, v7.4S, v6.4S // ...........................................................................*................................ - // cmge v7.4S, v31.4S, v13.4S // ............................................*............................................................... - // cmge v6.4S, v13.4S, v30.4S // ...........................................*................................................................ - // mls v24.4S, v10.4S, v8.4S // .......................................................................................................*.... 
- // sub v10.4S, v7.4S, v6.4S // ..............................................*............................................................. - // str q21, [x0], #(16) // ..........................................................*................................................. - // mls v13.4S, v10.4S, v8.4S // .................................................*.......................................................... - // str q28, [x0, #112] // ................................................................................................*........... - // str q24, [x0, #240] // ..........................................................................................................*. - // str q13, [x0, #368] // .....................................................*...................................................... + // ---------------------------------------------- new position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------- + // sub v22.4S, v11.4S, v12.4S // *........................................................................................................... + // add v11.4S, v11.4S, v12.4S // .......*.................................................................................................... + // sub v10.4S, v10.4S, v6.4S // .*.......................................................................................................... + // sqrdmulh v12.4S, v22.4S, v1.S[3] // ...*........................................................................................................ + // mul v22.4S, v22.4S, v1.S[2] // ......*..................................................................................................... + // sub v6.4S, v11.4S, v27.4S // ............*............................................................................................... 
+ // add v11.4S, v11.4S, v27.4S // ...........*................................................................................................ + // sqrdmulh v27.4S, v10.4S, v2.S[1] // ....*....................................................................................................... + // mul v10.4S, v10.4S, v2.S[0] // .....*...................................................................................................... + // mls v22.4S, v12.4S, v8.S[0] // ..........*................................................................................................. + // sub v12.4S, v4.4S, v24.4S // ..*......................................................................................................... + // sqrdmulh v4.4S, v6.4S, v0.S[3] // .................*.......................................................................................... + // mul v6.4S, v6.4S, v0.S[2] // ..................*......................................................................................... + // sub v24.4S, v11.4S, v23.4S // .....................*...................................................................................... + // add v11.4S, v11.4S, v23.4S // ................................*........................................................................... + // mls v10.4S, v27.4S, v8.S[0] // .........*.................................................................................................. + // sqrdmulh v27.4S, v12.4S, v2.S[3] // .............*.............................................................................................. + // mul v12.4S, v12.4S, v2.S[2] // ...................*........................................................................................ + // sub v16.4S, v16.4S, v18.4S // ........*................................................................................................... 
+ // sub v18.4S, v22.4S, v10.4S // ..............*............................................................................................. + // add v22.4S, v22.4S, v10.4S // ........................*................................................................................... + // mls v12.4S, v27.4S, v8.S[0] // .......................*.................................................................................... + // sqrdmulh v10.4S, v16.4S, v3.S[1] // ................*........................................................................................... + // mls v6.4S, v4.4S, v8.S[0] // .......................................*.................................................................... + // mul v27.4S, v16.4S, v3.S[0] // ...............*............................................................................................ + // sqrdmulh v4.4S, v18.4S, v0.S[3] // ..........................*................................................................................. + // mul v16.4S, v18.4S, v0.S[2] // .............................*.............................................................................. + // sqrdmulh v18.4S, v24.4S, v0.S[1] // ..................................*......................................................................... + // mul v24.4S, v24.4S, v0.S[0] // ........................................*................................................................... + // sqrdmulh v23.4S, v11.4S, v26.4S // ...............................................*............................................................ + // mul v11.4S, v11.4S, v25.4S // ..............................................*............................................................. + // mls v27.4S, v10.4S, v8.S[0] // ....................*....................................................................................... 
+ // mls v16.4S, v4.4S, v8.S[0] // ...................................*........................................................................ + // sub v10.4S, v20.4S, v13.4S // ......................*..................................................................................... + // mls v24.4S, v18.4S, v8.S[0] // ............................................*............................................................... + // sub v4.4S, v12.4S, v27.4S // ...........................*................................................................................ + // sqrdmulh v18.4S, v10.4S, v1.S[1] // ..............................*............................................................................. + // mul v10.4S, v10.4S, v1.S[0] // .........................*.................................................................................. + // add v12.4S, v12.4S, v27.4S // ............................*............................................................................... + // sqrdmulh v27.4S, v4.4S, v1.S[1] // .................................*.......................................................................... + // mul v4.4S, v4.4S, v1.S[0] // ...............................*............................................................................ + // sub v20.4S, v22.4S, v12.4S // ......................................*..................................................................... + // add v22.4S, v22.4S, v12.4S // ..................................................................*......................................... + // mls v10.4S, v18.4S, v8.S[0] // ....................................*....................................................................... + // mls v4.4S, v27.4S, v8.S[0] // .....................................*...................................................................... 
+ // sqrdmulh v12.4S, v20.4S, v0.S[1] // .........................................*.................................................................. + // mul v27.4S, v20.4S, v0.S[0] // ...........................................*................................................................ + // sub v18.4S, v6.4S, v10.4S // .............................................*.............................................................. + // add v10.4S, v6.4S, v10.4S // .......................................................................*.................................... + // sub v6.4S, v16.4S, v4.4S // ..........................................*................................................................. + // mls v27.4S, v12.4S, v8.S[0] // ..................................................*......................................................... + // sqrdmulh v12.4S, v18.4S, v0.S[1] // ................................................*........................................................... + // mul v18.4S, v18.4S, v0.S[0] // ...................................................*........................................................ + // add v4.4S, v16.4S, v4.4S // ............................................................*............................................... + // sqrdmulh v16.4S, v6.4S, v0.S[1] // ....................................................*....................................................... + // mul v6.4S, v6.4S, v0.S[0] // .....................................................*...................................................... + // mls v18.4S, v12.4S, v8.S[0] // .......................................................*.................................................... + // cmge v12.4S, v31.4S, v24.4S // .................................................*.......................................................... 
+ // cmge v20.4S, v24.4S, v30.4S // ......................................................*..................................................... + // mls v11.4S, v23.4S, v8.S[0] // .........................................................*.................................................. + // mls v6.4S, v16.4S, v8.S[0] // .............................................................*.............................................. + // sub v12.4S, v12.4S, v20.4S // ........................................................*................................................... + // cmge v16.4S, v31.4S, v27.4S // ..........................................................*................................................. + // cmge v23.4S, v27.4S, v30.4S // ..............................................................*............................................. + // mls v24.4S, v12.4S, v8.4S // ................................................................*........................................... + // sub v12.4S, v16.4S, v23.4S // ...................................................................*........................................ + // cmge v16.4S, v31.4S, v18.4S // ...............................................................*............................................ + // cmge v23.4S, v18.4S, v30.4S // ...........................................................*................................................ + // mls v27.4S, v12.4S, v8.4S // .........................................................................*.................................. + // sub v12.4S, v16.4S, v23.4S // .................................................................*.......................................... + // cmge v16.4S, v31.4S, v6.4S // .....................................................................*...................................... 
+ // cmge v23.4S, v6.4S, v30.4S // ....................................................................*....................................... + // mls v18.4S, v12.4S, v8.4S // ......................................................................*..................................... + // sub v12.4S, v16.4S, v23.4S // ........................................................................*................................... + // str q24, [x0, #512] // ..........................................................................*................................. + // sqrdmulh v24.4S, v22.4S, v26.4S // ..............................................................................*............................. + // mls v6.4S, v12.4S, v8.4S // ...........................................................................*................................ + // str q27, [x0, #640] // ...............................................................................*............................ + // mul v22.4S, v22.4S, v25.4S // ............................................................................*............................... + // str q18, [x0, #768] // ...................................................................................................*........ + // sqrdmulh v12.4S, v10.4S, v26.4S // ................................................................................*........................... + // str q6, [x0, #896] // .......................................................................................*.................... + // mls v22.4S, v24.4S, v8.S[0] // ..................................................................................*......................... + // mul v10.4S, v10.4S, v25.4S // .............................................................................*.............................. + // sqrdmulh v6.4S, v4.4S, v26.4S // ...................................................................................*........................ 
+ // mul v27.4S, v4.4S, v25.4S // ....................................................................................*....................... + // cmge v4.4S, v31.4S, v11.4S // .................................................................................*.......................... + // mls v10.4S, v12.4S, v8.S[0] // .....................................................................................*...................... + // cmge v12.4S, v11.4S, v30.4S // ......................................................................................*..................... + // mls v27.4S, v6.4S, v8.S[0] // ..........................................................................................*................. + // sub v12.4S, v4.4S, v12.4S // ........................................................................................*................... + // cmge v6.4S, v31.4S, v22.4S // .........................................................................................*.................. + // cmge v4.4S, v22.4S, v30.4S // ...........................................................................................*................ + // mls v11.4S, v12.4S, v8.4S // ..............................................................................................*............. + // sub v12.4S, v6.4S, v4.4S // .................................................................................................*.......... + // cmge v6.4S, v31.4S, v10.4S // ............................................................................................*............... + // cmge v4.4S, v10.4S, v30.4S // .............................................................................................*.............. + // mls v22.4S, v12.4S, v8.4S // .....................................................................................................*...... 
+ // sub v12.4S, v6.4S, v4.4S // ...............................................................................................*............ + // cmge v6.4S, v31.4S, v27.4S // ..................................................................................................*......... + // cmge v4.4S, v27.4S, v30.4S // ................................................................................................*........... + // mls v10.4S, v12.4S, v8.4S // .......................................................................................................*.... + // sub v12.4S, v6.4S, v4.4S // ....................................................................................................*....... + // str q11, [x0], #(16) // ......................................................................................................*..... + // mls v27.4S, v12.4S, v8.4S // ........................................................................................................*... + // str q22, [x0, #112] // .........................................................................................................*.. + // str q10, [x0, #240] // ..........................................................................................................*. 
+ // str q27, [x0, #368] // ...........................................................................................................* pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s index e58113ab..099a4bc9 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + 
ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 
10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -245,6 +231,12 @@ xtmp1 .req x11 restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,1230 +379,1254 @@ _intt_dilithium_123_45678_manual_ld4_opt_a72: qform_root3_tw .req q7 .p2align 2 - // Instructions: 30 - // Expected cycles: 23 - // Expected IPC: 1.30 - // - // Wall time: 0.38s - // User time: 0.38s - // - // ----- original position -----> - // 0 25 - // |------------------------|---- - ldr q23, [x1, #0] // .....*........................ - ldr q2, [x1, #16] // ......*....................... - // gap // .............................. - ldr q11, [x1, #32] // .......*...................... - ldr q27, [x1, #48] // ........*..................... - // gap // .............................. - ldr q25, [x5, #32] // .*............................ - // gap // .............................. - // gap // .............................. - ldr q12, [x5, #48] // ..*........................... - // gap // .............................. - // gap // .............................. - trn2 v10.4S, v23.4S, v2.4S // ..........*................... - trn1 v23.4S, v23.4S, v2.4S // .........*.................... - ldr q30, [x5, #16] // ....*......................... - trn2 v18.4S, v11.4S, v27.4S // ............*................. 
- trn1 v11.4S, v11.4S, v27.4S // ...........*.................. - ldr q0, [x2, #48] // ..........................*... - ldr q17, [x2, #16] // ............................*. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q13, [x5, #80] // *............................. - trn2 v28.2D, v10.2D, v18.2D // ..............*............... - trn2 v7.2D, v23.2D, v11.2D // .............*................ - // gap // .............................. - trn1 v11.2D, v23.2D, v11.2D // ...............*.............. - // gap // .............................. - // gap // .............................. - trn1 v27.2D, v10.2D, v18.2D // ................*............. - ldr q18, [x2, #32] // .........................*.... - // gap // .............................. - // gap // .............................. - // gap // .............................. - sub v10.4S, v7.4S, v28.4S // .................*............ - add v24.4S, v7.4S, v28.4S // ..................*........... - // gap // .............................. - // gap // .............................. - add v1.4S, v11.4S, v27.4S // .....................*........ - sub v20.4S, v11.4S, v27.4S // ...................*.......... - // gap // .............................. - sqrdmulh v27.4S, v10.4S, v13.4S // ....................*......... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q7, [x5, #64] // ...*.......................... - mul v25.4S, v20.4S, v25.4S // ......................*....... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - sqrdmulh v9.4S, v20.4S, v12.4S // ........................*..... 
- // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - mul v14.4S, v10.4S, v7.4S // ...........................*.. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - mls v14.4S, v27.4S, v8.S[0] // .............................* - // gap // .............................. - sub v16.4S, v1.4S, v24.4S // .......................*...... - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // ldr q7, [x5, #80] // .............*................. - // ldr q21, [x5, #32] // ....*.......................... - // ldr q14, [x5, #48] // .....*......................... - // ldr q29, [x5, #64] // ........................*...... - // ldr q30, [x5, #16] // ........*...................... - // ldr q25, [x1, #0] // *.............................. - // ldr q1, [x1, #16] // .*............................. - // ldr q20, [x1, #32] // ..*............................ - // ldr q15, [x1, #48] // ...*........................... - // trn1 v26.4S, v25.4S, v1.4S // .......*....................... - // trn2 v25.4S, v25.4S, v1.4S // ......*........................ - // trn1 v1.4S, v20.4S, v15.4S // ..........*.................... - // trn2 v20.4S, v20.4S, v15.4S // .........*..................... - // trn2 v27.2D, v26.2D, v1.2D // ...............*............... - // trn2 v23.2D, v25.2D, v20.2D // ..............*................ - // trn1 v1.2D, v26.2D, v1.2D // ................*.............. - // trn1 v11.2D, v25.2D, v20.2D // .................*............. - // sub v13.4S, v27.4S, v23.4S // ...................*........... - // add v24.4S, v27.4S, v23.4S // ....................*.......... 
- // sub v27.4S, v1.4S, v11.4S // ......................*........ - // sqrdmulh v7.4S, v13.4S, v7.4S // .......................*....... - // add v1.4S, v1.4S, v11.4S // .....................*......... - // mul v25.4S, v27.4S, v21.4S // .........................*..... - // sub v16.4S, v1.4S, v24.4S // .............................*. - // sqrdmulh v9.4S, v27.4S, v14.4S // ..........................*.... - // ldr q18, [x2, #32] // ..................*............ - // ldr q0, [x2, #48] // ...........*................... - // mul v14.4S, v13.4S, v29.4S // ...........................*... - // ldr q17, [x2, #16] // ............*.................. - // mls v14.4S, v7.4S, v8.S[0] // ............................*.. + // Instructions: 168 + // Expected cycles: 139 + // Expected IPC: 1.21 + // + // Wall time: 369.55s + // User time: 369.55s + // + // -------------------------------------------------------------------------- original position --------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + ldr q27, [x2, #48] // .......................*................................................................................................................................................ + ldr q18, [x2, #32] // ..........................*............................................................................................................................................. + // gap // ........................................................................................................................................................................ + ldr q21, [x2, #16] // .................*...................................................................................................................................................... 
+ ldr q14, [x2, #0] // .....................*.................................................................................................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + ldr q31, [x1, #48] // ...*.................................................................................................................................................................... + ldr q2, [x1, #32] // ..*..................................................................................................................................................................... + ldr q3, [x1, #0] // .*...................................................................................................................................................................... + ldr q23, [x1, #16] // *....................................................................................................................................................................... + // gap // ........................................................................................................................................................................ + trn2 v5.4S, v18.4S, v27.4S // ..................................*..................................................................................................................................... + trn1 v16.4S, v18.4S, v27.4S // .................................*...................................................................................................................................... 
+ ldr q4, [x5, #16] // .....*.................................................................................................................................................................. + trn2 v17.4S, v14.4S, v21.4S // ............................*........................................................................................................................................... + trn1 v28.4S, v14.4S, v21.4S // ................................*....................................................................................................................................... + ldr q14, [x5, #112] // ..............................*......................................................................................................................................... + trn2 v29.4S, v2.4S, v31.4S // ...........*............................................................................................................................................................ + trn1 v0.4S, v2.4S, v31.4S // ............*........................................................................................................................................................... + ldr q18, [x5, #144] // ............................................*........................................................................................................................... + trn2 v2.4S, v3.4S, v23.4S // .........*.............................................................................................................................................................. + ldr q1, [x5, #176] // ....*................................................................................................................................................................... + ldr q31, [x5, #64] // ..................*..................................................................................................................................................... 
+ trn1 v25.2D, v28.2D, v16.2D // .......................................*................................................................................................................................ + ldr q6, [x5, #80] // .............*.......................................................................................................................................................... + trn1 v7.2D, v17.2D, v5.2D // ...........................................*............................................................................................................................ + trn2 v24.2D, v17.2D, v5.2D // ........................................*............................................................................................................................... + trn2 v12.2D, v28.2D, v16.2D // ..........................................*............................................................................................................................. + ldr q15, [x5, #160] // .................................................*...................................................................................................................... + ldr q28, [x4, #32] // ......*................................................................................................................................................................. + trn1 v5.4S, v3.4S, v23.4S // ..........*............................................................................................................................................................. + trn2 v23.2D, v2.2D, v29.2D // ................*....................................................................................................................................................... + trn1 v10.2D, v2.2D, v29.2D // ...................*.................................................................................................................................................... 
+ ldr q29, [x5, #48] // .......*................................................................................................................................................................ + sub v21.4S, v25.4S, v7.4S // ..................................................*..................................................................................................................... + ldr q2, [x5, #128] // ....................................*................................................................................................................................... + add v30.4S, v25.4S, v7.4S // .....................................................*.................................................................................................................. + add v27.4S, v12.4S, v24.4S // ...............................................*........................................................................................................................ + ldr q11, [x4, #16] // .................................................................*...................................................................................................... + sub v9.4S, v12.4S, v24.4S // ..............................................*......................................................................................................................... + trn2 v17.2D, v5.2D, v0.2D // ...............*........................................................................................................................................................ + sqrdmulh v7.4S, v21.4S, v18.4S // ..................................................................*..................................................................................................... + ldr q24, [x5, #96] // ...................................................*.................................................................................................................... 
+ // gap // ........................................................................................................................................................................ + trn1 v13.2D, v5.2D, v0.2D // ....................*................................................................................................................................................... + ldr q16, [x5], #(12*16) // ..............*......................................................................................................................................................... + // gap // ........................................................................................................................................................................ + sqrdmulh v20.4S, v9.4S, v1.4S // ........................................................*............................................................................................................... + // gap // ........................................................................................................................................................................ + add v26.4S, v17.4S, v23.4S // ........................*............................................................................................................................................... + sub v22.4S, v17.4S, v23.4S // ......................*................................................................................................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mul v1.4S, v21.4S, v2.4S // .........................................................................*.............................................................................................. + add v5.4S, v13.4S, v10.4S // .........................*.............................................................................................................................................. + // gap // ........................................................................................................................................................................ + sub v13.4S, v13.4S, v10.4S // .............................*.......................................................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v1.4S, v7.4S, v8.S[0] // .............................................................................*.......................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v18.4S, v9.4S, v15.4S // ...................................................................*.................................................................................................... + add v0.4S, v30.4S, v27.4S // .........................................................*.............................................................................................................. + // gap // ........................................................................................................................................................................ + sub v15.4S, v30.4S, v27.4S // ....................................................................*................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v18.4S, v20.4S, v8.S[0] // .......................................................................*................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v2.4S, v5.4S, v26.4S // .....................................*.................................................................................................................................. + sqrdmulh v30.4S, v13.4S, v29.4S // ...................................*.................................................................................................................................... + add v29.4S, v5.4S, v26.4S // .......................................................*................................................................................................................ + ldr q26, [x5, #-160] // ........*............................................................................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v12.4S, v15.4S, v14.4S // .................................................................................*...................................................................................... + sub v7.4S, v1.4S, v18.4S // .....................................................................................*.................................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v23.4S, v1.4S, v18.4S // ......................................................................................*................................................................................. + mul v5.4S, v15.4S, v24.4S // ...................................................................................*.................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v1.4S, v7.4S, v24.4S // ..............................................................................................*......................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v15.4S, v7.4S, v14.4S // ............................................................................................*........................................................................... + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v20.4S, v22.4S, v31.4S // ...........................*............................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mls v5.4S, v12.4S, v8.S[0] // .........................................................................................*.............................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v1.4S, v15.4S, v8.S[0] // ................................................................................................*....................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + trn2 v18.4S, v0.4S, v23.4S // ...........................................................................................*............................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v14.4S, v22.4S, v6.4S // ...............................*........................................................................................................................................ + trn1 v10.4S, v0.4S, v23.4S // ..........................................................................................*............................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + ldr q6, [x4, #48] // ...............................................................................*........................................................................................ + // gap // ........................................................................................................................................................................ 
+ sqrdmulh v12.4S, v2.4S, v4.4S // ................................................*....................................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + trn1 v0.4S, v5.4S, v1.4S // ....................................................................................................*................................................................... + mul v17.4S, v13.4S, v26.4S // ......................................*................................................................................................................................. + trn2 v7.4S, v5.4S, v1.4S // .....................................................................................................*.................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ trn2 v3.2D, v10.2D, v0.2D // ........................................................................................................*............................................................... + mls v20.4S, v14.4S, v8.S[0] // .........................................*.............................................................................................................................. + // gap // ........................................................................................................................................................................ + trn2 v21.2D, v18.2D, v7.2D // .........................................................................................................*.............................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v17.4S, v30.4S, v8.S[0] // .............................................*.......................................................................................................................... + trn1 v5.2D, v18.2D, v7.2D // ............................................................................................................*........................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v23.4S, v2.4S, v16.4S // ....................................................*................................................................................................................... + // gap // ........................................................................................................................................................................ + sub v15.4S, v3.4S, v21.4S // ..............................................................................................................*......................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v23.4S, v12.4S, v8.S[0] // ...........................................................*............................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + sub v19.4S, v17.4S, v20.4S // ......................................................*................................................................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v1.4S, v17.4S, v20.4S // ............................................................*........................................................................................................... + ldr q30, [x4], #64 // .....................................................................*.................................................................................................. + mul v26.4S, v15.4S, v6.S[0] // ...................................................................................................................*.................................................... + trn1 v2.2D, v10.2D, v0.2D // ..........................................................................................................*............................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ sqrdmulh v20.4S, v19.4S, v4.4S // ..........................................................*............................................................................................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + trn2 v12.4S, v29.4S, v1.4S // ...............................................................*........................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v10.4S, v15.4S, v6.S[1] // ......................................................................................................................*................................................. + trn1 v7.4S, v29.4S, v1.4S // ................................................................*....................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + add v27.4S, v2.4S, v5.4S // ..................................................................................................................*..................................................... + mul v14.4S, v19.4S, v16.4S // .............................................................*.......................................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v29.4S, v2.4S, v5.4S // .................................................................................................................*...................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v14.4S, v20.4S, v8.S[0] // ..............................................................*......................................................................................................... + add v20.4S, v3.4S, v21.4S // ...............................................................................................................*........................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v17.4S, v29.4S, v28.S[3] // .........................................................................................................................*.............................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v25.4S, v27.4S, v20.4S // ..............................................................................................................................*......................................... 
+ mul v19.4S, v29.4S, v28.S[2] // ............................................................................................................................*........................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + trn1 v3.4S, v23.4S, v14.4S // ......................................................................*................................................................................................. + trn2 v29.4S, v23.4S, v14.4S // ........................................................................*............................................................................................... + mls v26.4S, v10.4S, v8.S[0] // ...............................................................................................................................*........................................ + // gap // ........................................................................................................................................................................ + add v14.4S, v27.4S, v20.4S // .......................................................................................................................*................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v19.4S, v17.4S, v8.S[0] // .............................................................................................................................*.......................................... + trn2 v15.2D, v7.2D, v3.2D // ..........................................................................*............................................................................................. + // gap // ........................................................................................................................................................................ + trn2 v17.2D, v12.2D, v29.2D // ...........................................................................*............................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + trn1 v10.2D, v7.2D, v3.2D // ............................................................................*........................................................................................... + // gap // ........................................................................................................................................................................ 
+ sqrdmulh v6.4S, v25.4S, v11.S[1] // ..................................................................................................................................*..................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + srshr v20.4S, v14.4S, #23 // ................................................................................................................................*....................................... + mul v0.4S, v25.4S, v11.S[0] // ...........................................................................................................................................*............................ + sub v9.4S, v15.4S, v17.4S // ..................................................................................*..................................................................................... + // gap // ........................................................................................................................................................................ + add v3.4S, v15.4S, v17.4S // ................................................................................*....................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mls v14.4S, v20.4S, v8.4S // .......................................................................................................................................*................................ + // gap // ........................................................................................................................................................................ + sub v22.4S, v19.4S, v26.4S // .....................................................................................................................................*.................................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v7.4S, v9.4S, v28.S[1] // .......................................................................................*................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v25.4S, v9.4S, v28.S[0] // ...................................................................................................*.................................................................... + trn1 v9.2D, v12.2D, v29.2D // ..............................................................................*......................................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v0.4S, v6.4S, v8.S[0] // .............................................................................................................................................*.......................... + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + add v18.4S, v10.4S, v9.4S // ........................................................................................*............................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v31.4S, v10.4S, v9.4S // ....................................................................................*................................................................................... + mls v25.4S, v7.4S, v8.S[0] // .......................................................................................................*................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mul v13.4S, v22.4S, v11.S[0] // ..........................................................................................................................................*............................. + // gap // ........................................................................................................................................................................ + add v16.4S, v18.4S, v3.4S // .............................................................................................*.......................................................................... + sub v29.4S, v18.4S, v3.4S // ..................................................................................................*..................................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v3.4S, v31.4S, v11.S[3] // .................................................................................................*...................................................................... + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + srshr v15.4S, v16.4S, #23 // ...................................................................................................................................*.................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v12.4S, v29.4S, v30.S[2] // .............................................................................................................*.......................................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v16.4S, v15.4S, v8.4S // ..............................................................................................................................................*......................... + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v9.4S, v31.4S, v11.S[2] // ...............................................................................................*........................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mls v9.4S, v3.4S, v8.S[0] // ...........................................................................................................*............................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v2.4S, v16.4S, v14.4S // .......................................................................................................................................................*................ + sub v20.4S, v16.4S, v14.4S // ......................................................................................................................................................*................. + sqrdmulh v14.4S, v22.4S, v11.S[1] // ........................................................................................................................................*............................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v7.4S, v29.4S, v30.S[3] // ......................................................................................................*................................................................. + str q2, [x1], #(16*4) // .............................................................................................................................................................*.......... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v29.4S, v9.4S, v25.4S // ....................................................................................................................*................................................... + sub v10.4S, v9.4S, v25.4S // .....................................................................................................................*.................................................. + // gap // ........................................................................................................................................................................ 
+ mul v15.4S, v20.4S, v30.S[0] // ...................................................................................................................................................................*.... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v28.4S, v19.4S, v26.4S // ......................................................................................................................................*................................. + mls v13.4S, v14.4S, v8.S[0] // ............................................................................................................................................*........................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mls v12.4S, v7.4S, v8.S[0] // ................................................................................................................*....................................................... + // gap // ........................................................................................................................................................................ + srshr v7.4S, v29.4S, #23 // ........................................................................................................................*............................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + srshr v16.4S, v28.4S, #23 // .........................................................................................................................................*.............................. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v14.4S, v10.4S, v30.S[3] // ..........................................................................................................................*............................................. + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v18.4S, v10.4S, v30.S[2] // ...........................................................................................................................*............................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v31.4S, v12.4S, v0.4S // ...................................................................................................................................................*.................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + add v26.4S, v12.4S, v0.4S // ..................................................................................................................................................*..................... + mls v29.4S, v7.4S, v8.4S // ....................................................................................................................................*................................... 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v28.4S, v16.4S, v8.4S // .................................................................................................................................................*...................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + str q26, [x1, #-32] // .........................................................................................................................................................*.............. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ mls v18.4S, v14.4S, v8.S[0] // .................................................................................................................................*...................................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v16.4S, v20.4S, v30.S[1] // ....................................................................................................................................................................*... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + add v11.4S, v29.4S, v28.4S // ...........................................................................................................................................................*............ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v5.4S, v31.4S, v30.S[1] // ..................................................................................................................................................................*..... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v23.4S, v18.4S, v13.4S // ...............................................................................................................................................*........................ + str q11, [x1, #-48] // ...............................................................................................................................................................*........ + mul v11.4S, v31.4S, v30.S[0] // ............................................................................................................................................................*........... 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v31.4S, v23.4S, v30.S[0] // ........................................................................................................................................................*............... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ sqrdmulh v23.4S, v23.4S, v30.S[1] // ....................................................................................................................................................*................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sub v21.4S, v29.4S, v28.4S // ..........................................................................................................................................................*............. + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v11.4S, v5.4S, v8.S[0] // .......................................................................................................................................................................* + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + sqrdmulh v26.4S, v21.4S, v30.S[1] // ..............................................................................................................................................................*......... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mls v31.4S, v23.4S, v8.S[0] // ......................................................................................................................................................................*. + // gap // ........................................................................................................................................................................ 
+ // gap // ........................................................................................................................................................................ + add v24.4S, v18.4S, v13.4S // ................................................................................................................................................*....................... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + mul v23.4S, v21.4S, v30.S[0] // .................................................................................................................................................................*...... + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ + // gap // ........................................................................................................................................................................ 
+ str q24, [x1, #-16] // .....................................................................................................................................................*.................. + add x1, x1, #64 // ................................................................................................................................................................*....... + mls v23.4S, v26.4S, v8.S[0] // .....................................................................................................................................................................*.. + + // ---------------------------------------------------------------------------- new position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q4, [x1, #16] // .......*................................................................................................................................................................ + // ldr q18, [x1, #0] // ......*................................................................................................................................................................. + // ldr q2, [x1, #32] // .....*.................................................................................................................................................................. + // ldr q13, [x1, #48] // ....*................................................................................................................................................................... + // ldr q3, [x5, #176] // ..................*..................................................................................................................................................... 
+ // ldr q14, [x5, #16] // ..........*............................................................................................................................................................. + // ldr q0, [x4, #32] // ..........................*............................................................................................................................................. + // ldr q12, [x5, #48] // ..............................*......................................................................................................................................... + // ldr q7, [x5, #32] // ........................................................*............................................................................................................... + // trn2 v29.4S, v18.4S, v4.4S // .................*...................................................................................................................................................... + // trn1 v25.4S, v18.4S, v4.4S // ...........................*............................................................................................................................................ + // trn2 v19.4S, v2.4S, v13.4S // ..............*......................................................................................................................................................... + // trn1 v22.4S, v2.4S, v13.4S // ...............*........................................................................................................................................................ + // ldr q16, [x5, #80] // .....................*.................................................................................................................................................. + // ldr q10, [x5], #(12*16) // .........................................*.............................................................................................................................. 
+ // trn2 v6.2D, v25.2D, v22.2D // .....................................*.................................................................................................................................. + // trn2 v5.2D, v29.2D, v19.2D // ............................*........................................................................................................................................... + // ldr q20, [x2, #16] // ..*..................................................................................................................................................................... + // ldr q9, [x5, #-128] // ...................*.................................................................................................................................................... + // trn1 v17.2D, v29.2D, v19.2D // .............................*.......................................................................................................................................... + // trn1 v26.2D, v25.2D, v22.2D // ........................................*............................................................................................................................... + // ldr q27, [x2, #0] // ...*.................................................................................................................................................................... + // sub v30.4S, v6.4S, v5.4S // ............................................*........................................................................................................................... + // ldr q15, [x2, #48] // *....................................................................................................................................................................... 
+ // add v1.4S, v6.4S, v5.4S // ...........................................*............................................................................................................................ + // add v2.4S, v26.4S, v17.4S // ..............................................*......................................................................................................................... + // ldr q21, [x2, #32] // .*...................................................................................................................................................................... + // mul v22.4S, v30.4S, v9.4S // ...............................................................*........................................................................................................ + // trn2 v25.4S, v27.4S, v20.4S // ...........*............................................................................................................................................................ + // sub v13.4S, v26.4S, v17.4S // ...............................................*........................................................................................................................ + // ldr q26, [x5, #-80] // .............*.......................................................................................................................................................... + // sqrdmulh v9.4S, v30.4S, v16.4S // ...................................................................*.................................................................................................... + // trn1 v4.4S, v27.4S, v20.4S // ............*........................................................................................................................................................... 
+ // trn1 v18.4S, v21.4S, v15.4S // .........*.............................................................................................................................................................. + // trn2 v21.4S, v21.4S, v15.4S // ........*............................................................................................................................................................... + // sqrdmulh v19.4S, v13.4S, v12.4S // ......................................................*................................................................................................................. + // ldr q24, [x5, #-64] // ................................*....................................................................................................................................... + // sub v23.4S, v2.4S, v1.4S // .....................................................*.................................................................................................................. + // mul v13.4S, v13.4S, v7.4S // ........................................................................*............................................................................................... + // trn1 v27.2D, v4.2D, v18.2D // ....................*................................................................................................................................................... + // trn2 v17.2D, v25.2D, v21.2D // .......................*................................................................................................................................................ + // mls v22.4S, v9.4S, v8.S[0] // ...........................................................................*............................................................................................ 
+ // trn2 v31.2D, v4.2D, v18.2D // ........................*............................................................................................................................................... + // trn1 v29.2D, v25.2D, v21.2D // ......................*................................................................................................................................................. + // ldr q18, [x5, #-48] // ................*....................................................................................................................................................... + // mls v13.4S, v19.4S, v8.S[0] // .............................................................................*.......................................................................................... + // sub v11.4S, v31.4S, v17.4S // ....................................*................................................................................................................................... + // add v19.4S, v31.4S, v17.4S // ..................................*..................................................................................................................................... + // sqrdmulh v12.4S, v23.4S, v14.4S // ......................................................................*................................................................................................. + // ldr q31, [x5, #-32] // .........................*.............................................................................................................................................. + // sub v28.4S, v27.4S, v29.4S // ...............................*........................................................................................................................................ 
+ // ldr q7, [x5, #-96] // .......................................*................................................................................................................................ + // mul v5.4S, v23.4S, v10.4S // ...............................................................................*........................................................................................ + // add v4.4S, v27.4S, v29.4S // .................................*...................................................................................................................................... + // sub v6.4S, v13.4S, v22.4S // ..................................................................................*..................................................................................... + // add v27.4S, v2.4S, v1.4S // .......................................................*................................................................................................................ + // sqrdmulh v20.4S, v11.4S, v3.4S // ..........................................*............................................................................................................................. + // add v9.4S, v4.4S, v19.4S // ..................................................*..................................................................................................................... + // sqrdmulh v17.4S, v6.4S, v14.4S // .......................................................................................*................................................................................ + // mls v5.4S, v12.4S, v8.S[0] // .................................................................................*...................................................................................... 
+ // add v1.4S, v13.4S, v22.4S // ...................................................................................*.................................................................................... + // mul v12.4S, v6.4S, v10.4S // ............................................................................................*........................................................................... + // mls v12.4S, v17.4S, v8.S[0] // ..............................................................................................*......................................................................... + // trn2 v15.4S, v27.4S, v1.4S // ........................................................................................*............................................................................... + // trn1 v3.4S, v27.4S, v1.4S // ..........................................................................................*............................................................................. + // ldr q23, [x4, #16] // ...................................*.................................................................................................................................... + // sqrdmulh v27.4S, v28.4S, v18.4S // ......................................*................................................................................................................................. + // mul v10.4S, v11.4S, v31.4S // .................................................*...................................................................................................................... + // sub v11.4S, v4.4S, v19.4S // ...................................................*.................................................................................................................... 
+ // ldr q21, [x4], #64 // ....................................................................................*................................................................................... + // trn1 v13.4S, v5.4S, v12.4S // ...................................................................................................*.................................................................... + // mls v10.4S, v20.4S, v8.S[0] // ....................................................*................................................................................................................... + // trn2 v31.4S, v5.4S, v12.4S // ....................................................................................................*................................................................... + // mul v24.4S, v28.4S, v24.4S // .............................................*.......................................................................................................................... + // trn2 v6.2D, v3.2D, v13.2D // ........................................................................................................*............................................................... + // trn2 v29.2D, v15.2D, v31.2D // .........................................................................................................*.............................................................. + // trn1 v17.2D, v3.2D, v13.2D // ..........................................................................................................*............................................................. + // mls v24.4S, v27.4S, v8.S[0] // ................................................*....................................................................................................................... 
+ // trn1 v30.2D, v15.2D, v31.2D // ....................................................................................................................*................................................... + // ldr q15, [x4, #-16] // .....................................................................*.................................................................................................. + // add v13.4S, v6.4S, v29.4S // ...............................................................................................................*........................................................ + // sqrdmulh v25.4S, v11.4S, v26.4S // .........................................................*.............................................................................................................. + // sub v20.4S, v6.4S, v29.4S // ..............................................................................................................*......................................................... + // mul v18.4S, v11.4S, v7.4S // ............................................................*........................................................................................................... + // sub v4.4S, v17.4S, v30.4S // .......................................................................................................................*................................................ + // sub v19.4S, v24.4S, v10.4S // ..........................................................*............................................................................................................. + // add v22.4S, v24.4S, v10.4S // ...........................................................*............................................................................................................ 
+ // sqrdmulh v31.4S, v20.4S, v0.S[1] // ..................................................................................................................*..................................................... + // add v27.4S, v17.4S, v30.4S // ......................................................................................................................*................................................. + // mls v18.4S, v25.4S, v8.S[0] // ................................................................*....................................................................................................... + // trn1 v1.4S, v9.4S, v22.4S // ....................................................................*................................................................................................... + // trn2 v24.4S, v9.4S, v22.4S // ..................................................................*..................................................................................................... + // sqrdmulh v11.4S, v19.4S, v26.4S // ..............................................................*......................................................................................................... + // add v25.4S, v27.4S, v13.4S // ..........................................................................................................................*............................................. + // mul v6.4S, v19.4S, v7.4S // .............................................................*.......................................................................................................... + // mul v12.4S, v4.4S, v23.S[2] // ................................................................................................................................*....................................... 
+ // mls v6.4S, v11.4S, v8.S[0] // .................................................................*...................................................................................................... + // sqrdmulh v19.4S, v4.4S, v23.S[3] // ............................................................................................................................*........................................... + // sub v4.4S, v27.4S, v13.4S // ...........................................................................................................................*............................................ + // mul v29.4S, v20.4S, v0.S[0] // ...................................................................................................................*.................................................... + // trn1 v30.4S, v18.4S, v6.4S // .......................................................................*................................................................................................ + // trn2 v2.4S, v18.4S, v6.4S // .........................................................................*.............................................................................................. + // sqrdmulh v9.4S, v4.4S, v21.S[3] // .....................................................................................................................................*.................................. + // mls v29.4S, v31.4S, v8.S[0] // ........................................................................................................................*............................................... + // trn2 v3.2D, v1.2D, v30.2D // ..........................................................................*............................................................................................. 
+ // trn2 v7.2D, v24.2D, v2.2D // ............................................................................*........................................................................................... + // trn1 v28.2D, v1.2D, v30.2D // ......................................................................................*................................................................................. + // mls v12.4S, v19.4S, v8.S[0] // .................................................................................................................................*...................................... + // trn1 v20.2D, v24.2D, v2.2D // ..............................................................................*......................................................................................... + // mul v6.4S, v4.4S, v21.S[2] // ..............................................................................................................................*......................................... + // sub v11.4S, v3.4S, v7.4S // ................................................................................*....................................................................................... + // add v5.4S, v3.4S, v7.4S // ...............................................................................................*........................................................................ + // mls v6.4S, v9.4S, v8.S[0] // ............................................................................................................................................*........................... + // sub v19.4S, v28.4S, v20.4S // .............................................................................................*.......................................................................... 
+ // add v20.4S, v28.4S, v20.4S // ...........................................................................................*............................................................................ + // mul v27.4S, v11.4S, v15.S[0] // .....................................................................................*.................................................................................. + // add v17.4S, v12.4S, v29.4S // .......................................................................................................................................*................................ + // sub v9.4S, v12.4S, v29.4S // ........................................................................................................................................*............................... + // sqrdmulh v15.4S, v11.4S, v15.S[1] // .........................................................................................*.............................................................................. + // add v2.4S, v20.4S, v5.4S // ......................................................................................................*................................................................. + // srshr v29.4S, v17.4S, #23 // .............................................................................................................................................*.......................... + // sqrdmulh v18.4S, v19.4S, v0.S[3] // ................................................................................................*....................................................................... + // sqrdmulh v13.4S, v9.4S, v21.S[3] // ...............................................................................................................................................*........................ 
+ // mul v4.4S, v9.4S, v21.S[2] // ................................................................................................................................................*....................... + // mul v3.4S, v19.4S, v0.S[2] // ..................................................................................................*..................................................................... + // mls v3.4S, v18.4S, v8.S[0] // .......................................................................................................*................................................................ + // sub v12.4S, v20.4S, v5.4S // .................................................................................................*...................................................................... + // mls v27.4S, v15.4S, v8.S[0] // .....................................................................................................*.................................................................. + // srshr v5.4S, v2.4S, #23 // ............................................................................................................*........................................................... + // mls v4.4S, v13.4S, v8.S[0] // ......................................................................................................................................................*................. + // sqrdmulh v13.4S, v12.4S, v23.S[1] // ...........................................................................................................*............................................................ + // srshr v14.4S, v25.4S, #23 // .............................................................................................................................*.......................................... 
+ // mls v17.4S, v29.4S, v8.4S // ...................................................................................................................................................*.................... + // sub v15.4S, v3.4S, v27.4S // .................................................................................................................*...................................................... + // add v27.4S, v3.4S, v27.4S // ..........................................................................................................................................*............................. + // mls v2.4S, v5.4S, v8.4S // ................................................................................................................*....................................................... + // sqrdmulh v5.4S, v15.4S, v23.S[1] // ....................................................................................................................................*................................... + // srshr v19.4S, v27.4S, #23 // ..............................................................................................................................................*......................... + // mul v7.4S, v15.4S, v23.S[0] // .........................................................................................................................*.............................................. + // mul v15.4S, v12.4S, v23.S[0] // .............................................................................................................*.......................................................... + // mls v7.4S, v5.4S, v8.S[0] // ...........................................................................................................................................*............................ 
+ // mls v15.4S, v13.4S, v8.S[0] // .....................................................................................................................*.................................................. + // mls v25.4S, v14.4S, v8.4S // ...............................................................................................................................*........................................ + // sub v30.4S, v4.4S, v7.4S // ..........................................................................................................................................................*............. + // add v29.4S, v4.4S, v7.4S // ...................................................................................................................................................................*.... + // mls v27.4S, v19.4S, v8.4S // ....................................................................................................................................................*................... + // add v23.4S, v6.4S, v15.4S // ..................................................................................................................................................*..................... + // sub v15.4S, v6.4S, v15.4S // .................................................................................................................................................*...................... + // sqrdmulh v7.4S, v30.4S, v21.S[1] // ..............................................................................................................................................................*......... + // str q29, [x1, #48] // .....................................................................................................................................................................*.. 
+ // sub v16.4S, v25.4S, v2.4S // ...................................................................................................................................*.................................... + // add v4.4S, v25.4S, v2.4S // ..................................................................................................................................*..................................... + // mul v31.4S, v30.4S, v21.S[0] // .............................................................................................................................................................*.......... + // str q23, [x1, #32] // .....................................................................................................................................................*.................. + // sub v1.4S, v17.4S, v27.4S // ...............................................................................................................................................................*........ + // add v20.4S, v17.4S, v27.4S // ........................................................................................................................................................*............... + // mul v11.4S, v15.4S, v21.S[0] // ............................................................................................................................................................*........... + // str q4, [x1], #(16*4) // ......................................................................................................................................*................................. + // sqrdmulh v9.4S, v1.4S, v21.S[1] // .................................................................................................................................................................*...... 
+ // str q20, [x1, #-48] // ...........................................................................................................................................................*............ + // add x1, x1, #64 // ......................................................................................................................................................................*. + // mul v23.4S, v1.4S, v21.S[0] // ....................................................................................................................................................................*... + // sqrdmulh v1.4S, v15.4S, v21.S[1] // .........................................................................................................................................................*.............. + // mul v15.4S, v16.4S, v21.S[0] // .........................................................................................................................................*.............................. + // sqrdmulh v16.4S, v16.4S, v21.S[1] // .......................................................................................................................................................*................ + // mls v23.4S, v9.4S, v8.S[0] // .......................................................................................................................................................................* + // mls v31.4S, v7.4S, v8.S[0] // ..................................................................................................................................................................*..... + // mls v11.4S, v1.4S, v8.S[0] // ................................................................................................................................................................*....... 
sub count, count, #1 layer45678_start: // Instructions: 174 - // Expected cycles: 129 - // Expected IPC: 1.35 + // Expected cycles: 139 + // Expected IPC: 1.25 // - // Wall time: 2804.18s - // User time: 2804.18s + // Wall time: 1451.71s + // User time: 1451.71s // // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> // 0 25 50 75 100 125 150 // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - ldr q27, [x2, #0] // ............*................................................................................................................................................................. - trn1 v11.4S, v18.4S, v0.4S // ..................*........................................................................................................................................................... - ldr q7, [x5, #272] // .............................e................................................................................................................................................ - ldr q23, [x5], #(12*16) // ........................*..................................................................................................................................................... - mls v25.4S, v9.4S, v8.S[0] // ..................................*........................................................................................................................................... - add v24.4S, v1.4S, v24.4S // .........................................*.................................................................................................................................... 
- ldr q9, [x5, #-96] // ..................................................*........................................................................................................................... - ldr q13, [x5, #-80] // ...................................................*.......................................................................................................................... + ldr q4, [x1, #16] // .e............................................................................................................................................................................ + ldr q18, [x1, #0] // e............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q2, [x1, #32] // ..e........................................................................................................................................................................... + mls v15.4S, v16.4S, v8.S[0] // ....................................................................................................................................................*......................... + ldr q13, [x1, #48] // ...e.......................................................................................................................................................................... + ldr q3, [x5, #176] // .......................................................e...................................................................................................................... + ldr q14, [x5, #16] // .........................e.................................................................................................................................................... 
+ str q23, [x2, #16] // .........................................................................................................................................................................*.... + ldr q0, [x4, #32] // ..............................................................................................e............................................................................... + ldr q12, [x5, #48] // ...........................e.................................................................................................................................................. + str q31, [x2, #48] // ...........................................................................................................................................................................*.. + ldr q7, [x5, #32] // ..........................e................................................................................................................................................... + trn2 v29.4S, v18.4S, v4.4S // .....e........................................................................................................................................................................ + trn1 v25.4S, v18.4S, v4.4S // ....e......................................................................................................................................................................... + trn2 v19.4S, v2.4S, v13.4S // .......e...................................................................................................................................................................... + trn1 v22.4S, v2.4S, v13.4S // ......e....................................................................................................................................................................... 
+ str q11, [x2, #32] // ..........................................................................................................................................................................*... + ldr q16, [x5, #80] // .............................e................................................................................................................................................ + str q15, [x2], #(16*4) // ........................................................................................................................................................................*..... + add x2, x2, #64 // .............................................................................................................................................................................* + ldr q10, [x5], #(12*16) // ........................e..................................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v1.4S, v16.4S, v30.4S // ...........................................*.................................................................................................................................. - ldr q3, [x5, #-64] // ....................................................*......................................................................................................................... - ldr q20, [x5, #-48] // .....................................................*........................................................................................................................ 
- trn1 v21.4S, v27.4S, v17.4S // ................*............................................................................................................................................................. - ldr q15, [x5, #-32] // ......................................................*....................................................................................................................... - ldr q26, [x5, #-16] // .......................................................*...................................................................................................................... - trn2 v27.4S, v27.4S, v17.4S // .................*............................................................................................................................................................ - trn2 v18.4S, v18.4S, v0.4S // ...................*.......................................................................................................................................................... - ldr q0, [x4], #64 // ............................................................................................*................................................................................. - mul v16.4S, v16.4S, v23.4S // ..........................................*................................................................................................................................... - sub v17.4S, v25.4S, v14.4S // .............................................*................................................................................................................................ - ldr q2, [x4, #-48] // .............................................................................................*................................................................................ 
- ldr q4, [x4, #-32] // ..............................................................................................*............................................................................... - trn2 v5.2D, v21.2D, v11.2D // ....................*......................................................................................................................................................... // gap // .............................................................................................................................................................................. - mls v16.4S, v1.4S, v8.S[0] // ............................................*................................................................................................................................. - trn2 v1.2D, v27.2D, v18.2D // .....................*........................................................................................................................................................ - ldr q22, [x4, #-16] // ...............................................................................................*.............................................................................. - trn1 v11.2D, v21.2D, v11.2D // ......................*....................................................................................................................................................... - ldr q21, [x5, #32] // ..........................e................................................................................................................................................... + trn2 v6.2D, v25.2D, v22.2D // ........e..................................................................................................................................................................... 
+ trn2 v5.2D, v29.2D, v19.2D // .........e.................................................................................................................................................................... + ldr q20, [x2, #16] // .............e................................................................................................................................................................ + ldr q9, [x5, #-128] // ............................e................................................................................................................................................. + trn1 v17.2D, v29.2D, v19.2D // ...........e.................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v23.4S, v17.4S, v23.4S // ...............................................*.............................................................................................................................. - add v25.4S, v25.4S, v14.4S // ..............................................*............................................................................................................................... - ldr q14, [x5, #48] // ...........................e.................................................................................................................................................. - sub v19.4S, v5.4S, v1.4S // .............................................................*................................................................................................................ 
- ldr q29, [x5, #64] // ............................e................................................................................................................................................. + trn1 v26.2D, v25.2D, v22.2D // ..........e................................................................................................................................................................... + ldr q27, [x2, #0] // ............e................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v27.2D, v27.2D, v18.2D // .......................*...................................................................................................................................................... - sqrdmulh v18.4S, v17.4S, v30.4S // ................................................*............................................................................................................................. - ldr q30, [x5, #16] // .........................e.................................................................................................................................................... - trn1 v17.4S, v24.4S, v25.4S // ............................................................................*................................................................................................. + sub v30.4S, v6.4S, v5.4S // ...................................e.......................................................................................................................................... 
+ ldr q15, [x2, #48] // ...............e.............................................................................................................................................................. + add v1.4S, v6.4S, v5.4S // ....................................e......................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v25.4S, v24.4S, v25.4S // .............................................................................*................................................................................................ - mul v24.4S, v19.4S, v15.4S // ...............................................................*.............................................................................................................. // gap // .............................................................................................................................................................................. - sub v15.4S, v11.4S, v27.4S // ........................................................*..................................................................................................................... + add v2.4S, v26.4S, v17.4S // ...............................e.............................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ ldr q21, [x2, #32] // ..............e............................................................................................................................................................... + mul v22.4S, v30.4S, v9.4S // ......................................e....................................................................................................................................... + trn2 v25.4S, v27.4S, v20.4S // .................e............................................................................................................................................................ // gap // .............................................................................................................................................................................. - sqrdmulh v26.4S, v19.4S, v26.4S // ................................................................*............................................................................................................. - add v27.4S, v11.4S, v27.4S // .........................................................*.................................................................................................................... + sub v13.4S, v26.4S, v17.4S // ..............................e............................................................................................................................................... + ldr q26, [x5, #-80] // ...................................................e.......................................................................................................................... // gap // .............................................................................................................................................................................. 
- add v11.4S, v5.4S, v1.4S // ..............................................................*............................................................................................................... + sqrdmulh v9.4S, v30.4S, v16.4S // .....................................e........................................................................................................................................ + trn1 v4.4S, v27.4S, v20.4S // ................e............................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v18.4S, v21.4S, v15.4S // ..................e........................................................................................................................................................... // gap // .............................................................................................................................................................................. - mls v23.4S, v18.4S, v8.S[0] // .................................................*............................................................................................................................ // gap // .............................................................................................................................................................................. + trn2 v21.4S, v21.4S, v15.4S // ...................e.......................................................................................................................................................... 
+ sqrdmulh v19.4S, v13.4S, v12.4S // ................................e............................................................................................................................................. + ldr q24, [x5, #-64] // ....................................................e......................................................................................................................... + sub v23.4S, v2.4S, v1.4S // ........................................e..................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v13.4S, v13.4S, v7.4S // .................................e............................................................................................................................................ + trn1 v27.2D, v4.2D, v18.2D // ......................e....................................................................................................................................................... // gap // .............................................................................................................................................................................. + trn2 v17.2D, v25.2D, v21.2D // .....................e........................................................................................................................................................ // gap // .............................................................................................................................................................................. 
- mul v1.4S, v15.4S, v3.4S // ..........................................................*................................................................................................................... - sub v3.4S, v27.4S, v11.4S // ..................................................................*........................................................................................................... // gap // .............................................................................................................................................................................. - add v27.4S, v27.4S, v11.4S // ...................................................................*.......................................................................................................... + mls v22.4S, v9.4S, v8.S[0] // .......................................e...................................................................................................................................... + trn2 v31.2D, v4.2D, v18.2D // ....................e......................................................................................................................................................... // gap // .............................................................................................................................................................................. + trn1 v29.2D, v25.2D, v21.2D // .......................e...................................................................................................................................................... + ldr q18, [x5, #-48] // .....................................................e........................................................................................................................ // gap // .............................................................................................................................................................................. 
- sqrdmulh v11.4S, v15.4S, v20.4S // ...........................................................*.................................................................................................................. + mls v13.4S, v19.4S, v8.S[0] // ..................................e........................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v20.4S, v16.4S, v23.4S // ..............................................................................*............................................................................................... + sub v11.4S, v31.4S, v17.4S // .............................................................e................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v23.4S, v16.4S, v23.4S // ...............................................................................*.............................................................................................. - mls v24.4S, v26.4S, v8.S[0] // .................................................................*............................................................................................................ 
+ add v19.4S, v31.4S, v17.4S // ..............................................................e............................................................................................................... + sqrdmulh v12.4S, v23.4S, v14.4S // ..........................................e................................................................................................................................... + ldr q31, [x5, #-32] // ......................................................e....................................................................................................................... + sub v28.4S, v27.4S, v29.4S // ........................................................e..................................................................................................................... + ldr q7, [x5, #-96] // ..................................................e........................................................................................................................... // gap // .............................................................................................................................................................................. + mul v5.4S, v23.4S, v10.4S // ...........................................e.................................................................................................................................. + add v4.4S, v27.4S, v29.4S // .........................................................e.................................................................................................................... // gap // .............................................................................................................................................................................. 
+ sub v6.4S, v13.4S, v22.4S // .............................................e................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v15.4S, v3.4S, v13.4S // .....................................................................*........................................................................................................ - trn2 v26.2D, v17.2D, v20.2D // ................................................................................*............................................................................................. + add v27.4S, v2.4S, v1.4S // .........................................e.................................................................................................................................... + sqrdmulh v20.4S, v11.4S, v3.4S // ...............................................................e.............................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v20.2D, v17.2D, v20.2D // ..................................................................................*........................................................................................... 
+ add v9.4S, v4.4S, v19.4S // ...................................................................e.......................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v1.4S, v11.4S, v8.S[0] // ............................................................*................................................................................................................. - trn1 v11.2D, v25.2D, v23.2D // ...................................................................................*.......................................................................................... + sqrdmulh v17.4S, v6.4S, v14.4S // ...............................................e.............................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v23.2D, v25.2D, v23.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mul v25.4S, v3.4S, v9.4S // ....................................................................*......................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v3.4S, v20.4S, v11.4S // ................................................................................................*............................................................................. + mls v5.4S, v12.4S, v8.S[0] // ............................................e................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v25.4S, v15.4S, v8.S[0] // ......................................................................*....................................................................................................... - add v11.4S, v20.4S, v11.4S // .................................................................................................*............................................................................ + add v1.4S, v13.4S, v22.4S // ..............................................e............................................................................................................................... 
// gap // .............................................................................................................................................................................. - sub v20.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... // gap // .............................................................................................................................................................................. + mul v12.4S, v6.4S, v10.4S // ................................................e............................................................................................................................. // gap // .............................................................................................................................................................................. - add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... - mul v1.4S, v3.4S, v2.S[2] // ..................................................................................................*........................................................................... // gap // .............................................................................................................................................................................. - sub v15.4S, v26.4S, v23.4S // .....................................................................................................*........................................................................ // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mul v9.4S, v20.4S, v9.4S // .........................................................................*.................................................................................................... - add v23.4S, v26.4S, v23.4S // ......................................................................................................*....................................................................... // gap // .............................................................................................................................................................................. - trn1 v26.4S, v27.4S, v24.4S // ....................................................................................*......................................................................................... + mls v12.4S, v17.4S, v8.S[0] // .................................................e............................................................................................................................ + trn2 v15.4S, v27.4S, v1.4S // .............................................................................e................................................................................................ // gap // .............................................................................................................................................................................. + trn1 v3.4S, v27.4S, v1.4S // ............................................................................e................................................................................................. + ldr q23, [x4, #16] // .............................................................................................e................................................................................ 
// gap // .............................................................................................................................................................................. - sqrdmulh v13.4S, v20.4S, v13.4S // ..........................................................................*................................................................................................... - trn2 v27.4S, v27.4S, v24.4S // .....................................................................................*........................................................................................ + sqrdmulh v27.4S, v28.4S, v18.4S // ..........................................................e................................................................................................................... // gap // .............................................................................................................................................................................. - sub v24.4S, v11.4S, v23.4S // ....................................................................................................................*......................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v3.4S, v3.4S, v2.S[3] // ...................................................................................................*.......................................................................... - add v11.4S, v11.4S, v23.4S // .....................................................................................................................*........................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v10.4S, v11.4S, v31.4S // ................................................................e............................................................................................................. + sub v11.4S, v4.4S, v19.4S // ..................................................................e........................................................................................................... + ldr q21, [x4], #64 // ............................................................................................e................................................................................. + trn1 v13.4S, v5.4S, v12.4S // ..............................................................................e............................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v23.4S, v15.4S, v4.S[0] // .......................................................................................................*...................................................................... + mls v10.4S, v20.4S, v8.S[0] // .................................................................e............................................................................................................ 
+ trn2 v31.4S, v5.4S, v12.4S // ...............................................................................e.............................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - srshr v20.4S, v11.4S, #23 // ........................................................................................................................................*..................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v9.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. + mul v24.4S, v28.4S, v24.4S // ...........................................................e.................................................................................................................. + trn2 v6.2D, v3.2D, v13.2D // ................................................................................e............................................................................................. // gap // .............................................................................................................................................................................. 
+ trn2 v29.2D, v15.2D, v31.2D // .................................................................................e............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v17.2D, v3.2D, v13.2D // ..................................................................................e........................................................................................... + mls v24.4S, v27.4S, v8.S[0] // ............................................................e................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v30.2D, v15.2D, v31.2D // ...................................................................................e.......................................................................................... + ldr q15, [x4, #-16] // ...............................................................................................e.............................................................................. // gap // .............................................................................................................................................................................. - mls v1.4S, v3.4S, v8.S[0] // ....................................................................................................*......................................................................... 
+ add v13.4S, v6.4S, v29.4S // ......................................................................................................e....................................................................... + sqrdmulh v25.4S, v11.4S, v26.4S // ....................................................................e......................................................................................................... // gap // .............................................................................................................................................................................. + sub v20.4S, v6.4S, v29.4S // .....................................................................................................e........................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v18.4S, v11.4S, v7.4S // .....................................................................e........................................................................................................ + sub v4.4S, v17.4S, v30.4S // ................................................................................................e............................................................................. // gap // .............................................................................................................................................................................. + sub v19.4S, v24.4S, v10.4S // .......................................................................e...................................................................................................... 
// gap // .............................................................................................................................................................................. - sqrdmulh v13.4S, v15.4S, v4.S[1] // ........................................................................................................*..................................................................... // gap // .............................................................................................................................................................................. + add v22.4S, v24.4S, v10.4S // ........................................................................e..................................................................................................... + sqrdmulh v31.4S, v20.4S, v0.S[1] // .......................................................................................................e...................................................................... // gap // .............................................................................................................................................................................. - trn1 v3.4S, v25.4S, v9.4S // ......................................................................................*....................................................................................... + add v27.4S, v17.4S, v30.4S // .................................................................................................e............................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn2 v25.4S, v25.4S, v9.4S // .......................................................................................*...................................................................................... - mul v9.4S, v24.4S, v0.S[2] // ......................................................................................................................*....................................................... + mls v18.4S, v25.4S, v8.S[0] // ......................................................................e....................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v1.4S, v9.4S, v22.4S // ....................................................................................e......................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v24.4S, v24.4S, v0.S[3] // .......................................................................................................................*...................................................... - trn2 v15.2D, v26.2D, v3.2D // ........................................................................................*..................................................................................... 
+ trn2 v24.4S, v9.4S, v22.4S // .....................................................................................e........................................................................................ + sqrdmulh v11.4S, v19.4S, v26.4S // .........................................................................e.................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v3.2D, v26.2D, v3.2D // ..........................................................................................*................................................................................... + add v25.4S, v27.4S, v13.4S // .....................................................................................................................e........................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v23.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... - trn1 v13.2D, v27.2D, v25.2D // ...........................................................................................*.................................................................................. 
+ mul v6.4S, v19.4S, v7.4S // ..........................................................................e................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v27.2D, v27.2D, v25.2D // .........................................................................................*.................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v11.4S, v20.4S, v8.4S // .........................................................................................................................................*.................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v25.4S, v3.4S, v13.4S // ..........................................................................................................*................................................................... + mul v12.4S, v4.4S, v23.S[2] // ...................................................................................................e.......................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v9.4S, v24.4S, v8.S[0] // ........................................................................................................................*..................................................... - add v24.4S, v3.4S, v13.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. - sub v13.4S, v15.4S, v27.4S // ...............................................................................................................*.............................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v27.4S, v15.4S, v27.4S // ................................................................................................................*............................................................. - mul v3.4S, v25.4S, v4.S[2] // ............................................................................................................*................................................................. 
+ mls v6.4S, v11.4S, v8.S[0] // ...........................................................................e.................................................................................................. // gap // .............................................................................................................................................................................. - sub v20.4S, v1.4S, v23.4S // .........................................................................................................................*.................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v25.4S, v25.4S, v4.S[3] // .............................................................................................................*................................................................ - add v23.4S, v1.4S, v23.4S // ..........................................................................................................................*................................................... // gap // .............................................................................................................................................................................. - sub v1.4S, v24.4S, v27.4S // ..............................................................................................................................*............................................... // gap // .............................................................................................................................................................................. 
+ sqrdmulh v19.4S, v4.4S, v23.S[3] // ..................................................................................................e........................................................................... + sub v4.4S, v27.4S, v13.4S // ....................................................................................................................e......................................................... // gap // .............................................................................................................................................................................. - add v27.4S, v24.4S, v27.4S // ...............................................................................................................................*.............................................. - sqrdmulh v24.4S, v13.4S, v22.S[1] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. - srshr v15.4S, v23.4S, #23 // ..........................................................................................................................................*................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v13.4S, v13.4S, v22.S[0] // .................................................................................................................*............................................................ 
+ mul v29.4S, v20.4S, v0.S[0] // ........................................................................................................e..................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - srshr v26.4S, v27.4S, #23 // ............................................................................................................................................*................................. + trn1 v30.4S, v18.4S, v6.4S // ......................................................................................e....................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v3.4S, v25.4S, v8.S[0] // ..............................................................................................................*............................................................... + trn2 v2.4S, v18.4S, v6.4S // .......................................................................................e...................................................................................... + sqrdmulh v9.4S, v4.4S, v21.S[3] // ......................................................................................................................e....................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v29.4S, v31.4S, v8.S[0] // .........................................................................................................e.................................................................... + trn2 v3.2D, v1.2D, v30.2D // ........................................................................................e..................................................................................... // gap // .............................................................................................................................................................................. - mls v13.4S, v24.4S, v8.S[0] // ...................................................................................................................*.......................................................... + trn2 v7.2D, v24.2D, v2.2D // .........................................................................................e.................................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. + trn1 v28.2D, v1.2D, v30.2D // ..........................................................................................e................................................................................... + mls v12.4S, v19.4S, v8.S[0] // ....................................................................................................e......................................................................... // gap // .............................................................................................................................................................................. + trn1 v20.2D, v24.2D, v2.2D // ...........................................................................................e.................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v25.4S, v20.4S, v0.S[2] // ...........................................................................................................................*.................................................. + mul v6.4S, v4.4S, v21.S[2] // .......................................................................................................................e...................................................... + sub v11.4S, v3.4S, v7.4S // ...............................................................................................................e.............................................................. 
// gap // .............................................................................................................................................................................. + add v5.4S, v3.4S, v7.4S // ................................................................................................................e............................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v6.4S, v9.4S, v8.S[0] // ........................................................................................................................e..................................................... + sub v19.4S, v28.4S, v20.4S // ..........................................................................................................e................................................................... // gap // .............................................................................................................................................................................. + add v20.4S, v28.4S, v20.4S // ...........................................................................................................e.................................................................. // gap // .............................................................................................................................................................................. - mul v24.4S, v1.4S, v2.S[0] // ................................................................................................................................*............................................. 
// gap // .............................................................................................................................................................................. + mul v27.4S, v11.4S, v15.S[0] // ..................................................................................................................e........................................................... + add v17.4S, v12.4S, v29.4S // ..........................................................................................................................e................................................... // gap // .............................................................................................................................................................................. - sub v18.4S, v3.4S, v13.4S // ...................................................................................................................................*.......................................... + sub v9.4S, v12.4S, v29.4S // .........................................................................................................................e.................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v1.4S, v1.4S, v2.S[1] // .................................................................................................................................*............................................ - add v13.4S, v3.4S, v13.4S // ....................................................................................................................................*......................................... 
+ sqrdmulh v15.4S, v11.4S, v15.S[1] // .................................................................................................................e............................................................ + add v2.4S, v20.4S, v5.4S // ...............................................................................................................................e.............................................. // gap // .............................................................................................................................................................................. + srshr v29.4S, v17.4S, #23 // ..........................................................................................................................................e................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v18.4S, v19.4S, v0.S[3] // ............................................................................................................e................................................................. // gap // .............................................................................................................................................................................. - mul v3.4S, v18.4S, v2.S[0] // .....................................................................................................................................*........................................ // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - srshr v16.4S, v13.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v18.4S, v18.4S, v2.S[1] // ......................................................................................................................................*....................................... + sqrdmulh v13.4S, v9.4S, v21.S[3] // ...........................................................................................................................e.................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - sqrdmulh v20.4S, v20.4S, v0.S[3] // ............................................................................................................................*................................................. + mul v4.4S, v9.4S, v21.S[2] // ............................................................................................................................e................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v24.4S, v1.4S, v8.S[0] // ..................................................................................................................................*........................................... + mul v3.4S, v19.4S, v0.S[2] // .............................................................................................................e................................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v3.4S, v18.4S, v8.S[0] // .......................................................................................................................................*...................................... + mls v3.4S, v18.4S, v8.S[0] // ..............................................................................................................e............................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v25.4S, v20.4S, v8.S[0] // .............................................................................................................................*................................................ + sub v12.4S, v20.4S, v5.4S // ..............................................................................................................................e............................................... + mls v27.4S, v15.4S, v8.S[0] // ...................................................................................................................e.......................................................... // gap // .............................................................................................................................................................................. + srshr v5.4S, v2.4S, #23 // ............................................................................................................................................e................................. // gap // .............................................................................................................................................................................. - sub v1.4S, v9.4S, v24.4S // ..........................................................................................................................................................*................... // gap // .............................................................................................................................................................................. 
+ mls v4.4S, v13.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. - add v24.4S, v9.4S, v24.4S // ...........................................................................................................................................................*.................. - mls v23.4S, v15.4S, v8.4S // ...........................................................................................................................................*.................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v27.4S, v26.4S, v8.4S // .............................................................................................................................................*................................ + sqrdmulh v13.4S, v12.4S, v23.S[1] // ................................................................................................................................e............................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q24, [x1, #32] // ......................................................................................................................................................................*....... - sub v24.4S, v25.4S, v3.4S // ...............................................................................................................................................................*.............. + srshr v14.4S, v25.4S, #23 // ........................................................................................................................................e..................................... // gap // .............................................................................................................................................................................. - mls v13.4S, v16.4S, v8.4S // ...............................................................................................................................................*.............................. - add v25.4S, v25.4S, v3.4S // ................................................................................................................................................................*............. // gap // .............................................................................................................................................................................. + mls v17.4S, v29.4S, v8.4S // ...........................................................................................................................................e.................................. 
+ sub v15.4S, v3.4S, v27.4S // ...................................................................................................................................e.......................................... // gap // .............................................................................................................................................................................. + add v27.4S, v3.4S, v27.4S // ....................................................................................................................................e......................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v9.4S, v1.4S, v0.S[0] // ............................................................................................................................................................*................. + mls v2.4S, v5.4S, v8.4S // .............................................................................................................................................e................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q25, [x1, #48] // .......................................................................................................................................................................*...... 
- sub v25.4S, v11.4S, v27.4S // ................................................................................................................................................*............................. // gap // .............................................................................................................................................................................. - add v27.4S, v11.4S, v27.4S // .................................................................................................................................................*............................ - sqrdmulh v11.4S, v1.4S, v0.S[1] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. - add v1.4S, v23.4S, v13.4S // ......................................................................................................................................................*....................... // gap // .............................................................................................................................................................................. + sqrdmulh v5.4S, v15.4S, v23.S[1] // .....................................................................................................................................e........................................ + srshr v19.4S, v27.4S, #23 // ..............................................................................................................................................e............................... // gap // .............................................................................................................................................................................. 
- sub v23.4S, v23.4S, v13.4S // .....................................................................................................................................................*........................ - mul v13.4S, v25.4S, v0.S[0] // ..................................................................................................................................................*........................... // gap // .............................................................................................................................................................................. - str q27, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v27.4S, v25.4S, v0.S[1] // ...................................................................................................................................................*.......................... - str q1, [x1, #-48] // .....................................................................................................................................................................*........ - add x1, x1, #64 // ............................................................................................................................................................................*. + mul v7.4S, v15.4S, v23.S[0] // ......................................................................................................................................e....................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q25, [x1, #0] // e............................................................................................................................................................................. - ldr q1, [x1, #16] // .e............................................................................................................................................................................ - mul v3.4S, v23.4S, v0.S[0] // .......................................................................................................................................................*...................... - ldr q20, [x1, #32] // ..e........................................................................................................................................................................... - ldr q15, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v23.4S, v23.4S, v0.S[1] // ........................................................................................................................................................*..................... 
// gap // .............................................................................................................................................................................. + mul v15.4S, v12.4S, v23.S[0] // .................................................................................................................................e............................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v26.4S, v25.4S, v1.4S // ....e......................................................................................................................................................................... - mul v18.4S, v24.4S, v0.S[0] // .................................................................................................................................................................*............ // gap // .............................................................................................................................................................................. - trn2 v25.4S, v25.4S, v1.4S // .....e........................................................................................................................................................................ 
+ mls v7.4S, v5.4S, v8.S[0] // .......................................................................................................................................e...................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................................................*........... - trn1 v1.4S, v20.4S, v15.4S // ......e....................................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn2 v20.4S, v20.4S, v15.4S // .......e...................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v13.4S, v27.4S, v8.S[0] // ....................................................................................................................................................*......................... 
+ mls v15.4S, v13.4S, v8.S[0] // ..................................................................................................................................e........................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v27.2D, v26.2D, v1.2D // ........e..................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v3.4S, v23.4S, v8.S[0] // .........................................................................................................................................................*.................... - trn2 v23.2D, v25.2D, v20.2D // .........e.................................................................................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v1.2D, v26.2D, v1.2D // ..........e................................................................................................................................................................... 
+ mls v25.4S, v14.4S, v8.4S // .........................................................................................................................................e.................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v9.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*............... - trn1 v11.2D, v25.2D, v20.2D // ...........e.................................................................................................................................................................. + sub v30.4S, v4.4S, v7.4S // ...............................................................................................................................................................e.............. // gap // .............................................................................................................................................................................. - str q13, [x2], #(16*4) // ........................................................................................................................................................................*..... - sub v13.4S, v27.4S, v23.4S // ...................................e.......................................................................................................................................... // gap // .............................................................................................................................................................................. 
- mls v18.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... - add v24.4S, v27.4S, v23.4S // ....................................e......................................................................................................................................... + add v29.4S, v4.4S, v7.4S // ................................................................................................................................................................e............. + mls v27.4S, v19.4S, v8.4S // ...............................................................................................................................................e.............................. // gap // .............................................................................................................................................................................. - str q3, [x2, #-48] // .........................................................................................................................................................................*.... - sub v27.4S, v1.4S, v11.4S // ..............................e............................................................................................................................................... + add v23.4S, v6.4S, v15.4S // ...........................................................................................................................................................e.................. // gap // .............................................................................................................................................................................. 
- sqrdmulh v7.4S, v13.4S, v7.4S // ......................................e....................................................................................................................................... - add v1.4S, v1.4S, v11.4S // ...............................e.............................................................................................................................................. // gap // .............................................................................................................................................................................. - str q9, [x2, #-32] // ..........................................................................................................................................................................*... + sub v15.4S, v6.4S, v15.4S // ..........................................................................................................................................................e................... + sqrdmulh v7.4S, v30.4S, v21.S[1] // .................................................................................................................................................................e............ // gap // .............................................................................................................................................................................. + str q29, [x1, #48] // .......................................................................................................................................................................e...... + sub v16.4S, v25.4S, v2.4S // ................................................................................................................................................e............................. // gap // .............................................................................................................................................................................. 
- mul v25.4S, v27.4S, v21.4S // ................................e............................................................................................................................................. + add v4.4S, v25.4S, v2.4S // .................................................................................................................................................e............................ + mul v31.4S, v30.4S, v21.S[0] // ..................................................................................................................................................................e........... + str q23, [x1, #32] // ......................................................................................................................................................................e....... + sub v1.4S, v17.4S, v27.4S // .....................................................................................................................................................e........................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q18, [x2, #-16] // ...........................................................................................................................................................................*.. - sub v16.4S, v1.4S, v24.4S // ........................................e..................................................................................................................................... 
- add x2, x2, #64 // .............................................................................................................................................................................* - sqrdmulh v9.4S, v27.4S, v14.4S // .................................e............................................................................................................................................ - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - ldr q18, [x2, #32] // ..............e............................................................................................................................................................... - ldr q0, [x2, #48] // ...............e.............................................................................................................................................................. - // gap // .............................................................................................................................................................................. - mul v14.4S, v13.4S, v29.4S // .....................................e........................................................................................................................................ - ldr q17, [x2, #16] // .............e................................................................................................................................................................ - // gap // .............................................................................................................................................................................. 
- // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - mls v14.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. 
- - // ------------------------------------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- - // ldr q9, [x1, #0] // ......................................................................................................................................e.....................................'.......................................................................................................................................~............................... - // ldr q10, [x1, #16] // .......................................................................................................................................e....................................'........................................................................................................................................~.............................. - // ldr q11, [x1, #32] // .........................................................................................................................................e..................................'..........................................................................................................................................~............................ 
- // ldr q12, [x1, #48] // ..........................................................................................................................................e.................................'...........................................................................................................................................~........................... - // trn1 v25.4s, v9.4s, v10.4s // ............................................................................................................................................e...............................'.............................................................................................................................................~......................... - // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................................e.............................'...............................................................................................................................................~....................... - // trn1 v27.4s, v11.4s, v12.4s // ................................................................................................................................................e...........................'.................................................................................................................................................~..................... - // trn2 v28.4s, v11.4s, v12.4s // .................................................................................................................................................e..........................'..................................................................................................................................................~.................... 
- // trn2 v11.2d, v25.2d, v27.2d // ...................................................................................................................................................e........................'....................................................................................................................................................~.................. - // trn2 v12.2d, v26.2d, v28.2d // .....................................................................................................................................................e......................'......................................................................................................................................................~................ - // trn1 v9.2d, v25.2d, v27.2d // ......................................................................................................................................................e.....................'.......................................................................................................................................................~............... - // trn1 v10.2d, v26.2d, v28.2d // ........................................................................................................................................................e...................'.........................................................................................................................................................~............. - // ldr q13, [x2, #0] // ............................................................................................................................................................................*....................................................................................................................................................................... 
- // ldr q14, [x2, #16] // ..........................................................................................................................................................................e.'....................................................................................................................................................................... - // ldr q15, [x2, #32] // .......................................................................................................................................................................e....'....................................................................................................................................................................... - // ldr q16, [x2, #48] // ........................................................................................................................................................................e...'....................................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // .........~..................................................................................................................................................................'..........*............................................................................................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ............~...............................................................................................................................................................'.............*......................................................................................................................................................... 
- // trn1 v27.4s, v15.4s, v16.4s // ............................................................................................................................................................................'*...................................................................................................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // .............~..............................................................................................................................................................'..............*........................................................................................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ...................~........................................................................................................................................................'....................*.................................................................................................................................................. - // trn2 v16.2d, v26.2d, v28.2d // .....................~......................................................................................................................................................'......................*................................................................................................................................................ - // trn1 v13.2d, v25.2d, v27.2d // .......................~....................................................................................................................................................'........................*.............................................................................................................................................. 
- // trn1 v14.2d, v26.2d, v28.2d // ..............................~.............................................................................................................................................'...............................*....................................................................................................................................... - // ldr q0, [x5], #(12*16) // .~..........................................................................................................................................................................'..*.................................................................................................................................................................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ................................e...........................................................................................................................................'.................................~..................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 2*16)] // ........................e...................................................................................................................................................'.........................~............................................................................................................................................. - // ldr q5, [x5, #(-12*16 + 3*16)] // ...........................e................................................................................................................................................'............................~.......................................................................................................................................... 
- // ldr q2, [x5, #(-12*16 + 4*16)] // .............................e..............................................................................................................................................'..............................~........................................................................................................................................ - // ldr q6, [x5, #(-12*16 + 5*16)] // e...........................................................................................................................................................................'.~..................................................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................................................e.............'...............................................................................................................................................................~....... - // add v9.4s, v9.4s, v10.4s // ................................................................................................................................................................e...........'.................................................................................................................................................................~..... - // mul v10.4s, v24.4s, v1.4s // ..................................................................................................................................................................e.........'...................................................................................................................................................................~... 
- // sqrdmulh v24.4s, v24.4s, v5.4s // ......................................................................................................................................................................e.....'....................................................................................................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..~.........................................................................................................................................................................'...*................................................................................................................................................................... - // sub v24.4s, v11.4s, v12.4s // ..........................................................................................................................................................e.................'...........................................................................................................................................................~........... - // add v11.4s, v11.4s, v12.4s // ............................................................................................................................................................e...............'.............................................................................................................................................................~......... - // mul v12.4s, v24.4s, v2.4s // .........................................................................................................................................................................e..'....................................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v6.4s // ...............................................................................................................................................................e............'................................................................................................................................................................~...... - // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................................e'....................................................................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ....................................................................................................................................................................e.......'.....................................................................................................................................................................~. - // add v9.4s, v9.4s, v11.4s // ...~........................................................................................................................................................................'....*.................................................................................................................................................................. - // mul v11.4s, v24.4s, v0.4s // ...............~............................................................................................................................................................'................*...................................................................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.....................................................................................................................................................................'.......*............................................................................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ....................~.......................................................................................................................................................'.....................*................................................................................................................................................. - // sub v24.4s, v10.4s, v12.4s // ................~...........................................................................................................................................................'.................*..................................................................................................................................................... - // add v10.4s, v10.4s, v12.4s // ..........................~.................................................................................................................................................'...........................*........................................................................................................................................... - // mul v12.4s, v24.4s, v0.4s // .........................~..................................................................................................................................................'..........................*............................................................................................................................................ 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................~............................................................................................................................................'................................*...................................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ........................................~...................................................................................................................................'.........................................*............................................................................................................................. - // ldr q0, [x5, #(-12*16 + 6*16)] // ....~.......................................................................................................................................................................'.....*................................................................................................................................................................. - // ldr q4, [x5, #(-12*16 + 7*16)] // .....~......................................................................................................................................................................'......*................................................................................................................................................................ - // ldr q1, [x5, #(-12*16 + 8*16)] // .......~....................................................................................................................................................................'........*.............................................................................................................................................................. 
- // ldr q5, [x5, #(-12*16 + 9*16)] // ........~...................................................................................................................................................................'.........*............................................................................................................................................................. - // ldr q2, [x5, #(-12*16 + 10*16)] // ..........~.................................................................................................................................................................'...........*........................................................................................................................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ...........~................................................................................................................................................................'............*.......................................................................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ....................................~.......................................................................................................................................'.....................................*................................................................................................................................. - // add v13.4s, v13.4s, v14.4s // ......................................~.....................................................................................................................................'.......................................*............................................................................................................................... 
- // mul v14.4s, v24.4s, v1.4s // .........................................~..................................................................................................................................'..........................................*............................................................................................................................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................~...............................................................................................................................'.............................................*......................................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................................................~........................................................................................................................'....................................................*.................................................................................................................. - // sub v24.4s, v15.4s, v16.4s // ............................~...............................................................................................................................................'.............................*......................................................................................................................................... - // add v15.4s, v15.4s, v16.4s // .......................................~....................................................................................................................................'........................................*.............................................................................................................................. 
- // mul v16.4s, v24.4s, v2.4s // ...................................~........................................................................................................................................'....................................*.................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................~......................................................................................................................................'......................................*................................................................................................................................ - // mls v16.4s, v24.4s, v8.s[0] // ...............................................~............................................................................................................................'................................................*...................................................................................................................... - // sub v24.4s, v13.4s, v15.4s // ..........................................~.................................................................................................................................'...........................................*........................................................................................................................... - // add v13.4s, v13.4s, v15.4s // ...........................................~................................................................................................................................'............................................*.......................................................................................................................... 
- // mul v15.4s, v24.4s, v0.4s // ......................................................~.....................................................................................................................'.......................................................*............................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................~...........................................................................................................................'.................................................*..................................................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ........................................................~...................................................................................................................'.........................................................*............................................................................................................. - // sub v24.4s, v14.4s, v16.4s // ..........................................................~.................................................................................................................'...........................................................*........................................................................................................... - // add v14.4s, v14.4s, v16.4s // ...........................................................~................................................................................................................'............................................................*.......................................................................................................... 
- // mul v16.4s, v24.4s, v0.4s // ..............................................................~.............................................................................................................'...............................................................*....................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .................................................................~..........................................................................................................'..................................................................*.................................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ........................................................................~...................................................................................................'.........................................................................*............................................................................................. - // trn1 v25.4s, v9.4s, v10.4s // .................................~..........................................................................................................................................'..................................*.................................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ..................................~.........................................................................................................................................'...................................*................................................................................................................................... 
- // trn1 v27.4s, v11.4s, v12.4s // .............................................~..............................................................................................................................'..............................................*........................................................................................................................ - // trn2 v28.4s, v11.4s, v12.4s // ..............................................~.............................................................................................................................'...............................................*....................................................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // .................................................~..........................................................................................................................'..................................................*.................................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // .....................................................~......................................................................................................................'......................................................*................................................................................................................ - // trn1 v9.2d, v25.2d, v27.2d // ..................................................~.........................................................................................................................'...................................................*................................................................................................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ....................................................~.......................................................................................................................'.....................................................*................................................................................................................. - // trn1 v25.4s, v13.4s, v14.4s // ................................................................~...........................................................................................................'.................................................................*..................................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..................................................................~.........................................................................................................'...................................................................*................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...........................................................................~................................................................................................'............................................................................*.......................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ............................................................................~...............................................................................................'.............................................................................*......................................................................................... 
- // trn2 v15.2d, v25.2d, v27.2d // ...............................................................................~............................................................................................'................................................................................*...................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ...................................................................................~........................................................................................'....................................................................................*.................................................................................. - // trn1 v13.2d, v25.2d, v27.2d // ................................................................................~...........................................................................................'.................................................................................*..................................................................................... - // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................~.........................................................................................'...................................................................................*................................................................................... - // ldr q0, [x4], #64 // ..............~.............................................................................................................................................................'...............*....................................................................................................................................................... 
- // ldr q1, [x4, #(-64 + 16)] // .................~..........................................................................................................................................................'..................*.................................................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // ..................~.........................................................................................................................................................'...................*................................................................................................................................................... - // ldr q3, [x4, #(-64 + 48)] // ......................~.....................................................................................................................................................'.......................*............................................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // .......................................................~....................................................................................................................'........................................................*.............................................................................................................. - // add v9.4s, v9.4s, v10.4s // .........................................................~..................................................................................................................'..........................................................*............................................................................................................ 
- // mul v10.4s, v24.4s, v1.s[2] // ............................................................~...............................................................................................................'.............................................................*......................................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................~.......................................................................................................'.....................................................................*................................................................................................. - // mls v10.4s, v24.4s, v8.s[0] // .........................................................................~..................................................................................................'..........................................................................*............................................................................................ - // sub v24.4s, v11.4s, v12.4s // .............................................................~..............................................................................................................'..............................................................*........................................................................................................ - // add v11.4s, v11.4s, v12.4s // ...............................................................~............................................................................................................'................................................................*...................................................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // ......................................................................~.....................................................................................................'.......................................................................*............................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................~.................................................................................................'...........................................................................*........................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................................~..........................................................................................'..................................................................................*.................................................................................... - // sub v24.4s, v13.4s, v14.4s // .....................................................................................~......................................................................................'......................................................................................*................................................................................ - // add v13.4s, v13.4s, v14.4s // .......................................................................................~....................................................................................'........................................................................................*.............................................................................. 
- // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................~.................................................................................'...........................................................................................*........................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................~...............................................................................'.............................................................................................*......................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................~.......................................................................'.....................................................................................................*................................................................. - // sub v24.4s, v15.4s, v16.4s // ........................................................................................~...................................................................................'.........................................................................................*............................................................................. - // add v15.4s, v15.4s, v16.4s // .........................................................................................~..................................................................................'..........................................................................................*............................................................................ 
- // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................~.........................................................................'...................................................................................................*................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................~...........................................................................'.................................................................................................*..................................................................... - // mls v16.4s, v24.4s, v8.s[0] // .....................................................................................................~......................................................................'......................................................................................................*................................................................ - // sub v24.4s, v9.4s, v11.4s // ...................................................................~........................................................................................................'....................................................................*.................................................................................................. - // add v9.4s, v9.4s, v11.4s // .....................................................................~......................................................................................................'......................................................................*................................................................................................ 
- // mul v11.4s, v24.4s, v0.s[2] // .............................................................................~..............................................................................................'..............................................................................*........................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................~.............................................................................................'...............................................................................*....................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ......................................................................................~.....................................................................................'.......................................................................................*............................................................................... - // sub v24.4s, v10.4s, v12.4s // ...........................................................................................~................................................................................'............................................................................................*.......................................................................... - // add v10.4s, v10.4s, v12.4s // .............................................................................................~..............................................................................'..............................................................................................*........................................................................ 
- // mul v12.4s, v24.4s, v0.s[2] // ......................................................................................................~.....................................................................'.......................................................................................................*............................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................~.............................................................'...............................................................................................................*....................................................... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................~..........................................................'..................................................................................................................*.................................................... - // sub v24.4s, v13.4s, v15.4s // ..............................................................................................~.............................................................................'...............................................................................................*....................................................................... - // add v13.4s, v13.4s, v15.4s // ...............................................................................................~............................................................................'................................................................................................*...................................................................... 
- // mul v15.4s, v24.4s, v1.s[0] // .......................................................................................................~....................................................................'........................................................................................................*.............................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................~..................................................................'..........................................................................................................*............................................................ - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................~............................................................'................................................................................................................*...................................................... - // sub v24.4s, v14.4s, v16.4s // ........................................................................................................~...................................................................'.........................................................................................................*............................................................. - // add v14.4s, v14.4s, v16.4s // ..........................................................................................................~.................................................................'...........................................................................................................*........................................................... 
- // mul v16.4s, v24.4s, v1.s[0] // ...........................................................................................................~................................................................'............................................................................................................*.......................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................~..............................................................'..............................................................................................................*........................................................ - // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................................~...........................................................'.................................................................................................................*..................................................... - // srshr v24.4S, v9.4S, #23 // .......................................................................~....................................................................................................'........................................................................*.............................................................................................. - // mls v9.4s, v24.4s, v8.4s // ....................................................................................~.......................................................................................'.....................................................................................*................................................................................. 
- // srshr v24.4S, v10.4S, #23 // .................................................................................................~..........................................................................'..................................................................................................*.................................................................... - // mls v10.4s, v24.4s, v8.4s // ....................................................................................................................~.......................................................'.....................................................................................................................*................................................. - // srshr v24.4S, v13.4S, #23 // ...................................................................................................~........................................................................'....................................................................................................*.................................................................. - // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................~......................................................'......................................................................................................................*................................................ - // srshr v24.4S, v14.4S, #23 // ............................................................................................................~...............................................................'.............................................................................................................*......................................................... 
- // mls v14.4s, v24.4s, v8.4s // ........................................................................................................................~...................................................'.........................................................................................................................*............................................. - // sub v24.4s, v9.4s, v13.4s // ............................................................................................................................~...............................................'.............................................................................................................................*......................................... - // add v9.4s, v9.4s, v13.4s // .............................................................................................................................~..............................................'..............................................................................................................................*........................................ - // mul v13.4s, v24.4s, v0.s[0] // .................................................................................................................................~..........................................'..................................................................................................................................*.................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................~........................................'....................................................................................................................................*.................................. 
- // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................................................................................~.........................'...................................................................................................................................................*................... - // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................~...........................................'.................................................................................................................................*..................................... - // add v10.4s, v10.4s, v14.4s // ...............................................................................................................................~............................................'................................................................................................................................*...................................... - // mul v14.4s, v24.4s, v0.s[0] // ........................................................................................................................................~...................................'.........................................................................................................................................*............................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................................~................................'............................................................................................................................................*.......................... 
- // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................................~.......................'.....................................................................................................................................................*................. - // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................~.........................................................'...................................................................................................................*................................................... - // add v11.4s, v11.4s, v15.4s // ...................................................................................................................~........................................................'....................................................................................................................*.................................................. - // mul v15.4s, v24.4s, v0.s[0] // ..........................................................................................................................~.................................................'...........................................................................................................................*........................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................~.............................................'...............................................................................................................................*....................................... 
- // mls v15.4s, v24.4s, v8.s[0] // .......................................................................................................................................................~....................'........................................................................................................................................................*.............. - // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................~....................................................'........................................................................................................................*.............................................. - // add v12.4s, v12.4s, v16.4s // .........................................................................................................................~..................................................'..........................................................................................................................*............................................ - // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................................................................~..............................'..............................................................................................................................................*........................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................~............................'................................................................................................................................................*...................... 
- // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................~................'............................................................................................................................................................*.......... - // str q9, [x1], #(16*4) // ..................................................................................................................................~.........................................'...................................................................................................................................*................................... - // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................~.......................................'.....................................................................................................................................*................................. - // str q11, [x1, #(-16*4 + 2*16)] // ......................................................................................................................~.....................................................'.......................................................................................................................*............................................... - // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................~................................................'............................................................................................................................*.......................................... 
- // str q13, [x2], #(16*4) // .........................................................................................................................................................~..................'..........................................................................................................................................................*............ - // str q14, [x2, #(-16*4 + 1*16)] // .............................................................................................................................................................~..............'..............................................................................................................................................................*........ - // str q15, [x2, #(-16*4 + 2*16)] // .................................................................................................................................................................~..........'..................................................................................................................................................................*.... - // str q16, [x2, #(-16*4 + 3*16)] // ...................................................................................................................................................................~........'....................................................................................................................................................................*.. - // add x1, x1, #64 // .....................................................................................................................................~......................................'......................................................................................................................................*................................ 
- // add x2, x2, #64 // .....................................................................................................................................................................~......'......................................................................................................................................................................* + add v20.4S, v17.4S, v27.4S // ......................................................................................................................................................e....................... + mul v11.4S, v15.4S, v21.S[0] // .............................................................................................................................................................e................ + // gap // .............................................................................................................................................................................. + str q4, [x1], #(16*4) // ....................................................................................................................................................................e......... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v9.4S, v1.4S, v21.S[1] // .......................................................................................................................................................e...................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + str q20, [x1, #-48] // .....................................................................................................................................................................e........ + add x1, x1, #64 // ............................................................................................................................................................................e. + // gap // .............................................................................................................................................................................. + mul v23.4S, v1.4S, v21.S[0] // ........................................................................................................................................................e..................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v1.4S, v15.4S, v21.S[1] // ............................................................................................................................................................e................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v15.4S, v16.4S, v21.S[0] // ...................................................................................................................................................e.......................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v16.4S, v16.4S, v21.S[1] // ..................................................................................................................................................e........................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v23.4S, v9.4S, v8.S[0] // .........................................................................................................................................................e.................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v31.4S, v7.4S, v8.S[0] // ...................................................................................................................................................................e.......... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v11.4S, v1.4S, v8.S[0] // ..............................................................................................................................................................e............... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + + // ----------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q9, [x1, #0] // .e............................................................................................................................................................................'~.................. + // ldr q10, [x1, #16] // e.............................................................................................................................................................................~................... + // ldr q11, [x1, #32] // ..e...........................................................................................................................................................................'.~................. + // ldr q12, [x1, #48] // ....e.........................................................................................................................................................................'...~............... 
+ // trn1 v25.4s, v9.4s, v10.4s // .............e................................................................................................................................................................'............~...... + // trn2 v26.4s, v9.4s, v10.4s // ............e.................................................................................................................................................................'...........~....... + // trn1 v27.4s, v11.4s, v12.4s // ...............e..............................................................................................................................................................'..............~.... + // trn2 v28.4s, v11.4s, v12.4s // ..............e...............................................................................................................................................................'.............~..... + // trn2 v11.2d, v25.2d, v27.2d // .....................e........................................................................................................................................................'................... + // trn2 v12.2d, v26.2d, v28.2d // ......................e.......................................................................................................................................................'................... + // trn1 v9.2d, v25.2d, v27.2d // ..........................e...................................................................................................................................................'................... + // trn1 v10.2d, v26.2d, v28.2d // .........................e....................................................................................................................................................'................... 
+ // ldr q13, [x2, #0] // ...........................e..................................................................................................................................................'................... + // ldr q14, [x2, #16] // .......................e......................................................................................................................................................'................... + // ldr q15, [x2, #32] // ................................e.............................................................................................................................................'................... + // ldr q16, [x2, #48] // .............................e................................................................................................................................................'................... + // trn1 v25.4s, v13.4s, v14.4s // ......................................e.......................................................................................................................................'................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................e...........................................................................................................................................'................... + // trn1 v27.4s, v15.4s, v16.4s // .......................................e......................................................................................................................................'................... + // trn2 v28.4s, v15.4s, v16.4s // ........................................e.....................................................................................................................................'................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ................................................e.............................................................................................................................'................... + // trn2 v16.2d, v26.2d, v28.2d // ..............................................e...............................................................................................................................'................... + // trn1 v13.2d, v25.2d, v27.2d // .............................................e................................................................................................................................'................... + // trn1 v14.2d, v26.2d, v28.2d // .................................................e............................................................................................................................'................... + // ldr q0, [x5], #(12*16) // ....................e.........................................................................................................................................................'................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ......e.......................................................................................................................................................................'.....~............. + // ldr q1, [x5, #(-12*16 + 2*16)] // ...........e..................................................................................................................................................................'..........~........ + // ldr q5, [x5, #(-12*16 + 3*16)] // .........e....................................................................................................................................................................'........~.......... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // ........................e.....................................................................................................................................................'................... + // ldr q6, [x5, #(-12*16 + 5*16)] // .................e............................................................................................................................................................'................~.. + // sub v24.4s, v9.4s, v10.4s // ...................................e..........................................................................................................................................'................... + // add v9.4s, v9.4s, v10.4s // ...............................e..............................................................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v5.4s // .........................................e....................................................................................................................................'................... + // mul v10.4s, v24.4s, v1.4s // ............................................e.................................................................................................................................'................... + // mls v10.4s, v27.4s, v8.s[0] // ...................................................e..........................................................................................................................'................... + // sub v24.4s, v11.4s, v12.4s // ............................e.................................................................................................................................................'................... 
+ // add v11.4s, v11.4s, v12.4s // ..............................e...............................................................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v6.4s // .....................................e........................................................................................................................................'................... + // mul v12.4s, v24.4s, v2.4s // .................................e............................................................................................................................................'................... + // mls v12.4s, v27.4s, v8.s[0] // ...............................................e..............................................................................................................................'................... + // sub v24.4s, v9.4s, v11.4s // ...........................................e..................................................................................................................................'................... + // add v9.4s, v9.4s, v11.4s // .............................................................e................................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ......................................................e.......................................................................................................................'................... + // mul v11.4s, v24.4s, v0.4s // ..........................................................e...................................................................................................................'................... 
+ // mls v11.4s, v27.4s, v8.s[0] // .................................................................e............................................................................................................'................... + // sub v24.4s, v10.4s, v12.4s // ............................................................e.................................................................................................................'................... + // add v10.4s, v10.4s, v12.4s // ..................................................................e...........................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ................................................................e.............................................................................................................'................... + // mul v12.4s, v24.4s, v0.4s // ...................................................................e..........................................................................................................'................... + // mls v12.4s, v27.4s, v8.s[0] // ....................................................................e.........................................................................................................'................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .........................................................e....................................................................................................................'................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ....................................e.........................................................................................................................................'................... 
+ // ldr q1, [x5, #(-12*16 + 8*16)] // ..........................................e...................................................................................................................................'................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ..................................................e...........................................................................................................................'................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .......................................................e......................................................................................................................'................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .....e........................................................................................................................................................................'....~.............. + // sub v24.4s, v13.4s, v14.4s // ........................................................e.....................................................................................................................'................... + // add v13.4s, v13.4s, v14.4s // ...........................................................e..................................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v5.4s // ........................................................................e.....................................................................................................'................... + // mul v14.4s, v24.4s, v1.4s // ...............................................................................e..............................................................................................'................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ...................................................................................e..........................................................................................'................... + // sub v24.4s, v15.4s, v16.4s // ....................................................e.........................................................................................................................'................... + // add v15.4s, v15.4s, v16.4s // .....................................................e........................................................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ..............................................................e...............................................................................................................'................... + // mul v16.4s, v24.4s, v2.4s // .........................................................................e....................................................................................................'................... + // mls v16.4s, v27.4s, v8.s[0] // .............................................................................e................................................................................................'................... + // sub v24.4s, v13.4s, v15.4s // ..........................................................................e...................................................................................................'................... + // add v13.4s, v13.4s, v15.4s // ...............................................................e..............................................................................................................'................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // .......................................................................................e......................................................................................'................... + // mul v15.4s, v24.4s, v0.4s // .........................................................................................e....................................................................................'................... + // mls v15.4s, v27.4s, v8.s[0] // ...............................................................................................e..............................................................................'................... + // sub v24.4s, v14.4s, v16.4s // ...........................................................................................e..................................................................................'................... + // add v14.4s, v14.4s, v16.4s // ............................................................................................e.................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ..................................................................................................e...........................................................................'................... + // mul v16.4s, v24.4s, v0.4s // ....................................................................................................e.........................................................................'................... + // mls v16.4s, v27.4s, v8.s[0] // ......................................................................................................e.......................................................................'................... 
+ // trn1 v25.4s, v9.4s, v10.4s // ......................................................................e.......................................................................................................'................... + // trn2 v26.4s, v9.4s, v10.4s // .....................................................................e........................................................................................................'................... + // trn1 v27.4s, v11.4s, v12.4s // ............................................................................e.................................................................................................'................... + // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................e...............................................................................................'................... + // trn2 v11.2d, v25.2d, v27.2d // ................................................................................e.............................................................................................'................... + // trn2 v12.2d, v26.2d, v28.2d // .................................................................................e............................................................................................'................... + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................e...........................................................................................'................... + // trn1 v10.2d, v26.2d, v28.2d // ....................................................................................e.........................................................................................'................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ................................................................................................e.............................................................................'................... + // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................e............................................................................'................... + // trn1 v27.4s, v15.4s, v16.4s // ..........................................................................................................e...................................................................'................... + // trn2 v28.4s, v15.4s, v16.4s // ...........................................................................................................e..................................................................'................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................e...............................................................'................... + // trn2 v16.2d, v26.2d, v28.2d // ...............................................................................................................e..............................................................'................... + // trn1 v13.2d, v25.2d, v27.2d // ................................................................................................................e.............................................................'................... + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................................e...........................................................'................... 
+ // ldr q0, [x4], #64 // ...........................................................................e..................................................................................................'................... + // ldr q1, [x4, #(-64 + 16)] // .......................................................................e......................................................................................................'................... + // ldr q2, [x4, #(-64 + 32)] // ........e.....................................................................................................................................................................'.......~........... + // ldr q3, [x4, #(-64 + 48)] // .....................................................................................e........................................................................................'................... + // sub v24.4s, v9.4s, v10.4s // ..........................................................................................e...................................................................................'................... + // add v9.4s, v9.4s, v10.4s // ..............................................................................................e...............................................................................'................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // .......................................................................................................e......................................................................'................... + // mul v10.4s, v24.4s, v1.s[2] // .....................................................................................................e........................................................................'................... 
+ // mls v10.4s, v27.4s, v8.s[0] // .................................................................................................................e............................................................'................... + // sub v24.4s, v11.4s, v12.4s // ........................................................................................e.....................................................................................'................... + // add v11.4s, v11.4s, v12.4s // ......................................................................................e.......................................................................................'................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // .............................................................................................e................................................................................'................... + // mul v12.4s, v24.4s, v2.s[0] // .........................................................................................................e....................................................................'................... + // mls v12.4s, v27.4s, v8.s[0] // .............................................................................................................e................................................................'................... + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................................e......................................................'................... + // add v13.4s, v13.4s, v14.4s // ........................................................................................................................e.....................................................'................... 
+ // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...............................................................................................................................e..............................................'................... + // mul v14.4s, v24.4s, v2.s[2] // ..................................................................................................................................e...........................................'................... + // mls v14.4s, v27.4s, v8.s[0] // ...................................................................................................................................e..........................................'................... + // sub v24.4s, v15.4s, v16.4s // ....................................................................................................................e.........................................................'................... + // add v15.4s, v15.4s, v16.4s // .....................................................................................................................e........................................................'................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ............................................................................................................................e.................................................'................... + // mul v16.4s, v24.4s, v3.s[0] // .........................................................................................................................e....................................................'................... + // mls v16.4s, v27.4s, v8.s[0] // .....................................................................................................................................e........................................'................... 
+ // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e.....................................................................'................... + // add v9.4s, v9.4s, v11.4s // ...................................................................................................e..........................................................................'................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ............................................................................................................e.................................................................'................... + // mul v11.4s, v24.4s, v0.s[2] // ...................................................................................................................e..........................................................'................... + // mls v11.4s, v27.4s, v8.s[0] // ......................................................................................................................e.......................................................'................... + // sub v24.4s, v10.4s, v12.4s // ...........................................................................................................................e..................................................'................... + // add v10.4s, v10.4s, v12.4s // ..........................................................................................................................e...................................................'................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................................................................................................................................e.............................................'................... 
+ // mul v12.4s, v24.4s, v0.s[2] // .................................................................................................................................e............................................'................... + // mls v12.4s, v27.4s, v8.s[0] // .......................................................................................................................................e......................................'................... + // sub v24.4s, v13.4s, v15.4s // ....................................................................................................................................e.........................................'................... + // add v13.4s, v13.4s, v15.4s // .............................................................................................................................e................................................'................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ........................................................................................................................................e.....................................'................... + // mul v15.4s, v24.4s, v1.s[0] // .................................................................................................................................................e............................'................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................................................................................................................................e..........................'................... + // sub v24.4s, v14.4s, v16.4s // ...........................................................................................................................................e..................................'................... 
+ // add v14.4s, v14.4s, v16.4s // ............................................................................................................................................e.................................'................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..............................................................................................................................................e...............................'................... + // mul v16.4s, v24.4s, v1.s[0] // ................................................................................................................................................e.............................'................... + // mls v16.4s, v27.4s, v8.s[0] // ..................................................................................................................................................e...........................'................... + // srshr v24.4S, v9.4S, #23 // .........................................................................................................................................e....................................'................... + // mls v9.4s, v24.4s, v8.4s // ....................................................................................................................................................e.........................'................... + // srshr v24.4S, v10.4S, #23 // ..............................................................................................................................e...............................................'................... + // mls v10.4s, v24.4s, v8.4s // ..........................................................................................................................................e...................................'................... 
+ // srshr v24.4S, v13.4S, #23 // ......................................................................................................................................e.......................................'................... + // mls v13.4s, v24.4s, v8.4s // .............................................................................................................................................e................................'................... + // srshr v24.4S, v14.4S, #23 // ...............................................................................................................................................e..............................'................... + // mls v14.4s, v24.4s, v8.4s // .......................................................................................................................................................e......................'................... + // sub v24.4s, v9.4s, v13.4s // ............................................................................................................................................................e.................'................... + // add v9.4s, v9.4s, v13.4s // .............................................................................................................................................................e................'................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................................e...'................... + // mul v13.4s, v24.4s, v0.s[0] // .........................................................................................................................................................................e....'................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ...~..........................................................................................................................................................................'..*................ + // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................................................e.............'................... + // add v10.4s, v10.4s, v14.4s // .................................................................................................................................................................e............'................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....................................................................................................................................................................e.........'................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................e......'................... + // mls v14.4s, v27.4s, v8.s[0] // ...........................................................................................................................................................................e..'................... + // sub v24.4s, v11.4s, v15.4s // .........................................................................................................................................................e....................'................... + // add v11.4s, v11.4s, v15.4s // ........................................................................................................................................................e.....................'................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................e.....'................... + // mul v15.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................e...........'................... + // mls v15.4s, v27.4s, v8.s[0] // .............................................................................................................................................................................e'................... + // sub v24.4s, v12.4s, v16.4s // .....................................................................................................................................................e........................'................... + // add v12.4s, v12.4s, v16.4s // ......................................................................................................................................................e.......................'................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................e...................'................... + // mul v16.4s, v24.4s, v0.s[0] // ..............................................................................................................................................................e...............'................... + // mls v16.4s, v27.4s, v8.s[0] // ............................................................................................................................................................................e.'................... 
+ // str q9, [x1], #(16*4) // ...................................................................................................................................................................e..........'................... + // str q10, [x1, #(-16*4 + 1*16)] // .....................................................................................................................................................................e........'................... + // str q11, [x1, #(-16*4 + 2*16)] // ...............................................................................................................................................................e..............'................... + // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................................................e..................'................... + // str q13, [x2], #(16*4) // ..................~...........................................................................................................................................................'.................*. + // str q14, [x2, #(-16*4 + 1*16)] // .......~......................................................................................................................................................................'......*............ + // str q15, [x2, #(-16*4 + 2*16)] // ................~.............................................................................................................................................................'...............*... + // str q16, [x2, #(-16*4 + 3*16)] // ..........~...................................................................................................................................................................'.........*......... 
+ // add x1, x1, #64 // ......................................................................................................................................................................e.......'................... + // add x2, x2, #64 // ...................~..........................................................................................................................................................'..................* sub count, count, #1 cbnz count, layer45678_start - // Instructions: 144 - // Expected cycles: 124 - // Expected IPC: 1.16 - // - // Wall time: 86.82s - // User time: 86.82s - // - // -------------------------------------------------------------- original position --------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ - trn1 v11.4S, v18.4S, v0.4S // .*.............................................................................................................................................. - mls v25.4S, v9.4S, v8.S[0] // ...*............................................................................................................................................ - ldr q27, [x2, #0] // *............................................................................................................................................... - ldr q5, [x4, #16] // ..................*............................................................................................................................. - add v21.4S, v1.4S, v24.4S // ....*........................................................................................................................................... - ldr q13, [x5], #(12*16) // ..*............................................................................................................................................. 
- trn2 v29.4S, v18.4S, v0.4S // ..............*................................................................................................................................. - ldr q20, [x5, #-96] // .....*.......................................................................................................................................... - ldr q4, [x4], #64 // ...............*................................................................................................................................ - sqrdmulh v7.4S, v16.4S, v30.4S // .......*........................................................................................................................................ - ldr q10, [x4, #-16] // .......................*........................................................................................................................ - ldr q26, [x4, #-32] // ...................*............................................................................................................................ - ldr q2, [x5, #-64] // ........*....................................................................................................................................... - trn1 v18.4S, v27.4S, v17.4S // ..........*..................................................................................................................................... - // gap // ................................................................................................................................................ - ldr q15, [x5, #-48] // .........*...................................................................................................................................... - mul v23.4S, v16.4S, v13.4S // ................*............................................................................................................................... 
- sub v28.4S, v25.4S, v14.4S // .................*.............................................................................................................................. - ldr q19, [x5, #-32] // ...........*.................................................................................................................................... - trn2 v24.4S, v27.4S, v17.4S // .............*.................................................................................................................................. - // gap // ................................................................................................................................................ - trn1 v12.2D, v18.2D, v11.2D // ........................*....................................................................................................................... - trn2 v11.2D, v18.2D, v11.2D // ....................*........................................................................................................................... - ldr q31, [x5, #-16] // ............*................................................................................................................................... - sqrdmulh v27.4S, v28.4S, v30.4S // .............................*.................................................................................................................. - ldr q1, [x5, #-80] // ......*......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v9.4S, v28.4S, v13.4S // .........................*...................................................................................................................... - // gap // ................................................................................................................................................ - trn2 v0.2D, v24.2D, v29.2D // ......................*......................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v23.4S, v7.4S, v8.S[0] // .....................*.......................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v18.4S, v11.4S, v0.4S // ...........................*.................................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v9.4S, v27.4S, v8.S[0] // .....................................*.......................................................................................................... - trn1 v27.2D, v24.2D, v29.2D // ............................*................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v18.4S, v31.4S // ..................................*............................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v31.4S, v12.4S, v27.4S // .................................*.............................................................................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v3.4S, v12.4S, v27.4S // ...................................*............................................................................................................ - mul v18.4S, v18.4S, v19.4S // ................................*............................................................................................................... - // gap // ................................................................................................................................................ - add v12.4S, v25.4S, v14.4S // ..........................*..................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v31.4S, v15.4S // .........................................*...................................................................................................... - add v27.4S, v11.4S, v0.4S // ....................................*........................................................................................................... - // gap // ................................................................................................................................................ 
- trn1 v29.4S, v23.4S, v9.4S // ..........................................*..................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v11.4S, v23.4S, v9.4S // ...........................................*.................................................................................................... - mul v14.4S, v31.4S, v2.4S // ......................................*......................................................................................................... - // gap // ................................................................................................................................................ - sub v0.4S, v3.4S, v27.4S // .......................................*........................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v18.4S, v16.4S, v8.S[0] // ............................................*................................................................................................... - trn2 v13.4S, v21.4S, v12.4S // ...............................*................................................................................................................ 
- // gap // ................................................................................................................................................ - trn1 v22.4S, v21.4S, v12.4S // ..............................*................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v12.4S, v0.4S, v1.4S // .............................................*.................................................................................................. - add v30.4S, v3.4S, v27.4S // ........................................*....................................................................................................... - // gap // ................................................................................................................................................ - trn1 v31.2D, v13.2D, v11.2D // .................................................*.............................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn1 v2.2D, v22.2D, v29.2D // ...............................................*................................................................................................ 
- mls v14.4S, v28.4S, v8.S[0] // ................................................*............................................................................................... - // gap // ................................................................................................................................................ - trn2 v7.2D, v22.2D, v29.2D // ..............................................*................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v25.2D, v13.2D, v11.2D // ..................................................*............................................................................................. - mul v9.4S, v0.4S, v20.4S // ...................................................*............................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v11.4S, v2.4S, v31.4S // ....................................................*........................................................................................... 
- mls v9.4S, v12.4S, v8.S[0] // .....................................................*.......................................................................................... - add v23.4S, v2.4S, v31.4S // ......................................................*......................................................................................... - // gap // ................................................................................................................................................ - add v28.4S, v14.4S, v18.4S // ........................................................*....................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v2.4S, v11.4S, v5.S[2] // .........................................................*...................................................................................... - add v0.4S, v7.4S, v25.4S // ............................................................*................................................................................... - // gap // ................................................................................................................................................ - sub v21.4S, v7.4S, v25.4S // ..........................................................*..................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- trn1 v29.4S, v30.4S, v28.4S // .............................................................*.................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v12.4S, v11.4S, v5.S[3] // .................................................................*.............................................................................. - sub v3.4S, v23.4S, v0.4S // ................................................................*............................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v17.4S, v23.4S, v0.4S // ..................................................................*............................................................................. - mul v22.4S, v21.4S, v26.S[0] // ...................................................................*............................................................................ - // gap // ................................................................................................................................................ - trn2 v11.4S, v30.4S, v28.4S // ...............................................................*................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sub v15.4S, v14.4S, v18.4S // .......................................................*........................................................................................ - mul v6.4S, v3.4S, v4.S[2] // ..........................................................................*..................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - srshr v27.4S, v17.4S, #23 // ....................................................................*........................................................................... - mls v2.4S, v12.4S, v8.S[0] // ......................................................................*......................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sqrdmulh v24.4S, v21.4S, v26.S[1] // .......................................................................*........................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v15.4S, v1.4S // ..............................................................*................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mul v30.4S, v15.4S, v20.4S // ...........................................................*.................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v22.4S, v24.4S, v8.S[0] // ..............................................................................*................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - mls v30.4S, v16.4S, v8.S[0] // .....................................................................*.......................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v21.4S, v3.4S, v4.S[3] // ...........................................................................*.................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v18.4S, v2.4S, v22.4S // ........................................................................................*....................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - add v16.4S, v2.4S, v22.4S // ..........................................................................................*..................................................... - mls v17.4S, v27.4S, v8.4S // .................................................................................*.............................................................. - // gap // ................................................................................................................................................ - trn1 v31.4S, v9.4S, v30.4S // ........................................................................*....................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v13.4S, v18.4S, v4.S[3] // ...........................................................................................................*.................................... - trn2 v30.4S, v9.4S, v30.4S // .........................................................................*...................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- srshr v27.4S, v16.4S, #23 // ..............................................................................................*................................................. - mls v6.4S, v21.4S, v8.S[0] // ...................................................................................*............................................................ - trn2 v23.2D, v29.2D, v31.2D // ............................................................................*................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - trn2 v24.2D, v11.2D, v30.2D // ................................................................................*............................................................... - trn1 v21.2D, v29.2D, v31.2D // .............................................................................*.................................................................. - // gap // ................................................................................................................................................ - mul v3.4S, v18.4S, v4.S[2] // ...................................................................................................*............................................ - trn1 v15.2D, v11.2D, v30.2D // ...............................................................................*................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v22.4S, v23.4S, v24.4S // ......................................................................................*......................................................... - mls v16.4S, v27.4S, v8.4S // .................................................................................................................*.............................. - sub v28.4S, v23.4S, v24.4S // .....................................................................................*.......................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v3.4S, v13.4S, v8.S[0] // ..............................................................................................................*................................. - add v13.4S, v21.4S, v15.4S // ....................................................................................*........................................................... - // gap // ................................................................................................................................................ 
- sub v20.4S, v21.4S, v15.4S // ..................................................................................*............................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v30.4S, v28.4S, v10.S[1] // .............................................................................................*.................................................. - sub v14.4S, v13.4S, v22.4S // ...........................................................................................*.................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v1.4S, v28.4S, v10.S[0] // ...............................................................................................*................................................ - add v29.4S, v13.4S, v22.4S // ............................................................................................*................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v22.4S, v20.4S, v26.S[3] // .........................................................................................*...................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v2.4S, v20.4S, v26.S[2] // .......................................................................................*........................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v1.4S, v30.4S, v8.S[0] // ..................................................................................................*............................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v2.4S, v22.4S, v8.S[0] // .................................................................................................*.............................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - srshr v21.4S, v29.4S, #23 // ................................................................................................*............................................... - sqrdmulh v31.4S, v14.4S, v5.S[1] // ......................................................................................................*......................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v29.4S, v21.4S, v8.4S // ..................................................................................................................*............................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- sub v12.4S, v2.4S, v1.4S // .....................................................................................................*.......................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v28.4S, v14.4S, v5.S[0] // ....................................................................................................*........................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v11.4S, v12.4S, v5.S[1] // ..........................................................................................................*..................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v27.4S, v17.4S, v29.4S // ..........................................................................................................................*..................... - mul v10.4S, v12.4S, v5.S[0] // ........................................................................................................*....................................... - add v12.4S, v2.4S, v1.4S // .......................................................................................................*........................................ - // gap // ................................................................................................................................................ - sub v19.4S, v17.4S, v29.4S // .........................................................................................................................*...................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v31.4S, v8.S[0] // ............................................................................................................*................................... - str q27, [x1], #(16*4) // ...............................................................................................................................*................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - srshr v1.4S, v12.4S, #23 // .........................................................................................................*...................................... - mls v10.4S, v11.4S, v8.S[0] // .............................................................................................................*.................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v12.4S, v1.4S, v8.4S // .....................................................................................................................*.......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v31.4S, v6.4S, v28.4S // ...............................................................................................................*................................ - add v27.4S, v6.4S, v28.4S // ................................................................................................................*............................... - // gap // ................................................................................................................................................ - mul v21.4S, v19.4S, v4.S[0] // ..............................................................................................................................*................. - sub v25.4S, v3.4S, v10.4S // ....................................................................................................................*........................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v28.4S, v3.4S, v10.4S // ......................................................................................................................*......................... - mul v30.4S, v31.4S, v4.S[0] // .......................................................................................................................*........................ 
- // gap // ................................................................................................................................................ - str q27, [x1, #-32] // ...................................................................................................................*............................ - // gap // ................................................................................................................................................ - add v27.4S, v16.4S, v12.4S // ............................................................................................................................*................... - sub v29.4S, v16.4S, v12.4S // .............................................................................................................................*.................. - mul v0.4S, v25.4S, v4.S[0] // .....................................................................................................................................*.......... - // gap // ................................................................................................................................................ - str q28, [x1, #-16] // ........................................................................................................................*....................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q27, [x1, #-48] // .................................................................................................................................*.............. - add x1, x1, #64 // ..................................................................................................................................*............. 
- sqrdmulh v12.4S, v25.4S, v4.S[1] // ......................................................................................................................................*......... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v20.4S, v29.4S, v4.S[1] // ....................................................................................................................................*........... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v31.4S, v4.S[1] // ...........................................................................................................................*.................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v9.4S, v19.4S, v4.S[1] // ................................................................................................................................*............... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v14.4S, v29.4S, v4.S[0] // ...................................................................................................................................*............ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v14.4S, v20.4S, v8.S[0] // ........................................................................................................................................*....... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v30.4S, v1.4S, v8.S[0] // .........................................................................................................................................*...... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v21.4S, v9.4S, v8.S[0] // .......................................................................................................................................*........ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q14, [x2, #16] // ............................................................................................................................................*... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v0.4S, v12.4S, v8.S[0] // ...........................................................................................................................................*.... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q30, [x2, #32] // .............................................................................................................................................*.. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - str q21, [x2], #(16*4) // ..........................................................................................................................................*..... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - str q0, [x2, #-16] // ..............................................................................................................................................*. - add x2, x2, #64 // ...............................................................................................................................................* - // gap // ................................................................................................................................................ - - // ---------------------------------------------------------------- new position -----------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ - // ldr q27, [x2, #0] // ..*............................................................................................................................................. - // trn1 v11.4S, v18.4S, v0.4S // *............................................................................................................................................... - // ldr q23, [x5], #(12*16) // .....*.......................................................................................................................................... - // mls v25.4S, v9.4S, v8.S[0] // .*.............................................................................................................................................. - // add v24.4S, v1.4S, v24.4S // ....*........................................................................................................................................... 
- // ldr q9, [x5, #-96] // .......*........................................................................................................................................ - // ldr q13, [x5, #-80] // .......................*........................................................................................................................ - // sqrdmulh v1.4S, v16.4S, v30.4S // .........*...................................................................................................................................... - // ldr q3, [x5, #-64] // ............*................................................................................................................................... - // ldr q20, [x5, #-48] // ..............*................................................................................................................................. - // trn1 v21.4S, v27.4S, v17.4S // .............*.................................................................................................................................. - // ldr q15, [x5, #-32] // .................*.............................................................................................................................. - // ldr q26, [x5, #-16] // .....................*.......................................................................................................................... - // trn2 v27.4S, v27.4S, v17.4S // ..................*............................................................................................................................. - // trn2 v18.4S, v18.4S, v0.4S // ......*......................................................................................................................................... - // ldr q0, [x4], #64 // ........*....................................................................................................................................... 
- // mul v16.4S, v16.4S, v23.4S // ...............*................................................................................................................................ - // sub v17.4S, v25.4S, v14.4S // ................*............................................................................................................................... - // ldr q2, [x4, #-48] // ...*............................................................................................................................................ - // ldr q4, [x4, #-32] // ...........*.................................................................................................................................... - // trn2 v5.2D, v21.2D, v11.2D // ....................*........................................................................................................................... - // mls v16.4S, v1.4S, v8.S[0] // ..........................*..................................................................................................................... - // trn2 v1.2D, v27.2D, v18.2D // .........................*...................................................................................................................... - // ldr q22, [x4, #-16] // ..........*..................................................................................................................................... - // trn1 v11.2D, v21.2D, v11.2D // ...................*............................................................................................................................ - // mul v23.4S, v17.4S, v23.4S // ........................*....................................................................................................................... - // add v25.4S, v25.4S, v14.4S // ..................................*............................................................................................................. 
- // sub v19.4S, v5.4S, v1.4S // ...........................*.................................................................................................................... - // trn1 v27.2D, v27.2D, v18.2D // .............................*.................................................................................................................. - // sqrdmulh v18.4S, v17.4S, v30.4S // ......................*......................................................................................................................... - // trn1 v17.4S, v24.4S, v25.4S // ...........................................*.................................................................................................... - // trn2 v25.4S, v24.4S, v25.4S // ..........................................*..................................................................................................... - // mul v24.4S, v19.4S, v15.4S // .................................*.............................................................................................................. - // sub v15.4S, v11.4S, v27.4S // ...............................*................................................................................................................ - // sqrdmulh v26.4S, v19.4S, v26.4S // ..............................*................................................................................................................. - // add v27.4S, v11.4S, v27.4S // ................................*............................................................................................................... - // add v11.4S, v5.4S, v1.4S // ....................................*........................................................................................................... - // mls v23.4S, v18.4S, v8.S[0] // ............................*................................................................................................................... 
- // mul v1.4S, v15.4S, v3.4S // .......................................*........................................................................................................ - // sub v3.4S, v27.4S, v11.4S // ........................................*....................................................................................................... - // add v27.4S, v27.4S, v11.4S // .............................................*.................................................................................................. - // sqrdmulh v11.4S, v15.4S, v20.4S // ...................................*............................................................................................................ - // trn1 v20.4S, v16.4S, v23.4S // .....................................*.......................................................................................................... - // trn2 v23.4S, v16.4S, v23.4S // ......................................*......................................................................................................... - // mls v24.4S, v26.4S, v8.S[0] // .........................................*...................................................................................................... - // sqrdmulh v15.4S, v3.4S, v13.4S // ............................................*................................................................................................... - // trn2 v26.2D, v17.2D, v20.2D // .................................................*.............................................................................................. - // trn1 v20.2D, v17.2D, v20.2D // ...............................................*................................................................................................ - // mls v1.4S, v11.4S, v8.S[0] // ................................................*............................................................................................... 
- // trn1 v11.2D, v25.2D, v23.2D // ..............................................*................................................................................................. - // trn2 v23.2D, v25.2D, v23.2D // ..................................................*............................................................................................. - // mul v25.4S, v3.4S, v9.4S // ...................................................*............................................................................................ - // sub v3.4S, v20.4S, v11.4S // ....................................................*........................................................................................... - // mls v25.4S, v15.4S, v8.S[0] // .....................................................*.......................................................................................... - // add v11.4S, v20.4S, v11.4S // ......................................................*......................................................................................... - // sub v20.4S, v1.4S, v24.4S // .................................................................*.............................................................................. - // add v24.4S, v1.4S, v24.4S // .......................................................*........................................................................................ - // mul v1.4S, v3.4S, v2.S[2] // ........................................................*....................................................................................... - // sub v15.4S, v26.4S, v23.4S // ..........................................................*..................................................................................... - // mul v9.4S, v20.4S, v9.4S // .......................................................................*........................................................................ 
- // add v23.4S, v26.4S, v23.4S // .........................................................*...................................................................................... - // trn1 v26.4S, v27.4S, v24.4S // ...........................................................*.................................................................................... - // sqrdmulh v13.4S, v20.4S, v13.4S // ......................................................................*......................................................................... - // trn2 v27.4S, v27.4S, v24.4S // ................................................................*............................................................................... - // sub v24.4S, v11.4S, v23.4S // .............................................................*.................................................................................. - // sqrdmulh v3.4S, v3.4S, v2.S[3] // ............................................................*................................................................................... - // add v11.4S, v11.4S, v23.4S // ..............................................................*................................................................................. - // mul v23.4S, v15.4S, v4.S[0] // ...............................................................*................................................................................ - // srshr v20.4S, v11.4S, #23 // ...................................................................*............................................................................ - // mls v9.4S, v13.4S, v8.S[0] // .........................................................................*...................................................................... - // mls v1.4S, v3.4S, v8.S[0] // ....................................................................*........................................................................... 
- // sqrdmulh v13.4S, v15.4S, v4.S[1] // .....................................................................*.......................................................................... - // trn1 v3.4S, v25.4S, v9.4S // ..............................................................................*................................................................. - // trn2 v25.4S, v25.4S, v9.4S // ................................................................................*............................................................... - // mul v9.4S, v24.4S, v0.S[2] // ..................................................................*............................................................................. - // sqrdmulh v24.4S, v24.4S, v0.S[3] // ..........................................................................*..................................................................... - // trn2 v15.2D, v26.2D, v3.2D // ...................................................................................*............................................................ - // trn1 v3.2D, v26.2D, v3.2D // .....................................................................................*.......................................................... - // mls v23.4S, v13.4S, v8.S[0] // ........................................................................*....................................................................... - // trn1 v13.2D, v27.2D, v25.2D // .......................................................................................*........................................................ - // trn2 v27.2D, v27.2D, v25.2D // ....................................................................................*........................................................... - // mls v11.4S, v20.4S, v8.4S // .............................................................................*.................................................................. 
- // sub v25.4S, v3.4S, v13.4S // .............................................................................................*.................................................. - // mls v9.4S, v24.4S, v8.S[0] // ..................................................................................*............................................................. - // add v24.4S, v3.4S, v13.4S // ............................................................................................*................................................... - // sub v13.4S, v15.4S, v27.4S // ..........................................................................................*..................................................... - // add v27.4S, v15.4S, v27.4S // ........................................................................................*....................................................... - // mul v3.4S, v25.4S, v4.S[2] // ...................................................................................................*............................................ - // sub v20.4S, v1.4S, v23.4S // ...........................................................................*.................................................................... - // sqrdmulh v25.4S, v25.4S, v4.S[3] // ..................................................................................................*............................................. - // add v23.4S, v1.4S, v23.4S // ............................................................................*................................................................... - // sub v1.4S, v24.4S, v27.4S // ...............................................................................................*................................................ - // add v27.4S, v24.4S, v27.4S // .................................................................................................*.............................................. 
- // sqrdmulh v24.4S, v13.4S, v22.S[1] // ..............................................................................................*................................................. - // srshr v15.4S, v23.4S, #23 // .................................................................................*.............................................................. - // mul v13.4S, v13.4S, v22.S[0] // ................................................................................................*............................................... - // srshr v26.4S, v27.4S, #23 // ......................................................................................................*......................................... - // mls v3.4S, v25.4S, v8.S[0] // .....................................................................................................*.......................................... - // mls v13.4S, v24.4S, v8.S[0] // ....................................................................................................*........................................... - // mul v25.4S, v20.4S, v0.S[2] // ......................................................................................*......................................................... - // mul v24.4S, v1.4S, v2.S[0] // ..........................................................................................................*..................................... - // sub v18.4S, v3.4S, v13.4S // .........................................................................................................*...................................... - // sqrdmulh v1.4S, v1.4S, v2.S[1] // .......................................................................................................*........................................ - // add v13.4S, v3.4S, v13.4S // ..............................................................................................................*................................. 
- // mul v3.4S, v18.4S, v2.S[0] // .............................................................................................................*.................................. - // srshr v16.4S, v13.4S, #23 // ..................................................................................................................*............................. - // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...........................................................................................................*.................................... - // sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................................................................*................................................................ - // mls v24.4S, v1.4S, v8.S[0] // ................................................................................................................*............................... - // mls v3.4S, v18.4S, v8.S[0] // ...................................................................................................................*............................ - // mls v25.4S, v20.4S, v8.S[0] // ...........................................................................................*.................................................... - // sub v1.4S, v9.4S, v24.4S // .....................................................................................................................*.......................... - // add v24.4S, v9.4S, v24.4S // ......................................................................................................................*......................... - // mls v23.4S, v15.4S, v8.4S // .........................................................................................*...................................................... - // mls v27.4S, v26.4S, v8.4S // ........................................................................................................*....................................... 
- // str q24, [x1, #32] // ...........................................................................................................................*.................... - // sub v24.4S, v25.4S, v3.4S // ........................................................................................................................*....................... - // mls v13.4S, v16.4S, v8.4S // ....................................................................................................................*........................... - // add v25.4S, v25.4S, v3.4S // .........................................................................................................................*...................... - // mul v9.4S, v1.4S, v0.S[0] // ..........................................................................................................................*..................... - // str q25, [x1, #48] // ...............................................................................................................................*................ - // sub v25.4S, v11.4S, v27.4S // ...............................................................................................................*................................ - // add v27.4S, v11.4S, v27.4S // ............................................................................................................*................................... - // sqrdmulh v11.4S, v1.4S, v0.S[1] // ....................................................................................................................................*........... - // add v1.4S, v23.4S, v13.4S // ............................................................................................................................*................... - // sub v23.4S, v23.4S, v13.4S // .............................................................................................................................*.................. 
- // mul v13.4S, v25.4S, v0.S[0] // .......................................................................................................................*........................ - // str q27, [x1], #(16*4) // .................................................................................................................*.............................. - // sqrdmulh v27.4S, v25.4S, v0.S[1] // .....................................................................................................................................*.......... - // str q1, [x1, #-48] // ................................................................................................................................*............... - // add x1, x1, #64 // .................................................................................................................................*.............. - // mul v3.4S, v23.4S, v0.S[0] // ......................................................................................................................................*......... - // sqrdmulh v23.4S, v23.4S, v0.S[1] // ...................................................................................................................................*............ - // mul v18.4S, v24.4S, v0.S[0] // ..............................................................................................................................*................. - // sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................*............. - // mls v13.4S, v27.4S, v8.S[0] // .........................................................................................................................................*...... - // mls v3.4S, v23.4S, v8.S[0] // .......................................................................................................................................*........ 
- // mls v9.4S, v11.4S, v8.S[0] // ........................................................................................................................................*....... - // str q13, [x2], #(16*4) // .............................................................................................................................................*.. - // mls v18.4S, v24.4S, v8.S[0] // ...........................................................................................................................................*.... - // str q3, [x2, #-48] // ..........................................................................................................................................*..... - // str q9, [x2, #-32] // ............................................................................................................................................*... - // str q18, [x2, #-16] // ..............................................................................................................................................*. - // add x2, x2, #64 // ...............................................................................................................................................* + // Instructions: 6 + // Expected cycles: 6 + // Expected IPC: 1.00 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mls v15.4S, v16.4S, v8.S[0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + str q23, [x2, #16] // .*............................ + str q31, [x2, #48] // ..*........................... + str q15, [x2], #(16*4) // ....*......................... + str q11, [x2, #-32] // ...*.......................... + add x2, x2, #64 // .....*........................ + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v15.4S, v16.4S, v8.S[0] // *.............................. + // str q23, [x2, #16] // .*............................. + // str q31, [x2, #48] // ..*............................ + // str q11, [x2, #32] // ....*.......................... + // str q15, [x2], #(16*4) // ...*........................... + // add x2, x2, #64 // .....*......................... // ----------------------------------------------------------------------------- @@ -1633,75 +1649,75 @@ layer45678_start: load_roots_123 .p2align 2 - // Instructions: 13 - // Expected cycles: 14 - // Expected IPC: 0.93 - // - // Wall time: 0.04s - // User time: 0.04s - // - // ----- original position -----> - // 0 25 - // |------------------------|---- - ldr q11, [x0, #768] // *............................. - ldr q4, [x0, #896] // .....*........................ - // gap // .............................. - ldr q13, [x0, #256] // .*............................ - // gap // .............................. - // gap // .............................. - ldr q20, [x0, #384] // ..*........................... - // gap // .............................. - // gap // .............................. - ldr q18, [x0, #512] // ...*.......................... - // gap // .............................. - // gap // .............................. - sub v28.4S, v11.4S, v4.4S // ......*....................... - ldr q16, [x0, #640] // ....*......................... - // gap // .............................. - add v14.4S, v11.4S, v4.4S // .......*...................... - // gap // .............................. 
- // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - sqrdmulh v6.4S, v28.4S, v3.S[1] // .........*.................... - // gap // .............................. - // gap // .............................. - sub v5.4S, v18.4S, v16.4S // ........*..................... - // gap // .............................. - // gap // .............................. - mul v19.4S, v28.4S, v3.S[0] // ..........*................... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*.................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - mls v19.4S, v6.4S, v8.S[0] // ............*................. - // gap // .............................. - // gap // .............................. + // Instructions: 13 + // Expected cycles: 14 + // Expected IPC: 0.93 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #896] // *............................. + // gap // .............................. + ldr q20, [x0, #768] // .....*........................ + ldr q6, [x0, #256] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q27, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q12, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. 
+ add v5.4S, v20.4S, v18.4S // .......*...................... + sub v18.4S, v20.4S, v18.4S // ......*....................... + ldr q14, [x0, #640] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v18.4S, v3.S[1] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v11.4S, v12.4S, v14.4S // ........*..................... + mul v9.4S, v18.4S, v3.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v23.4S, v11.4S, v2.S[2] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v9.4S, v20.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. // -------- new position --------> // 0 25 // |------------------------|----- - // ldr q21, [x0, #768] // *.............................. - // ldr q13, [x0, #256] // ..*............................ - // ldr q20, [x0, #384] // ...*........................... - // ldr q18, [x0, #512] // ....*.......................... - // ldr q16, [x0, #640] // ......*........................ - // ldr q17, [x0, #896] // .*............................. - // sub v22.4S, v21.4S, v17.4S // .....*......................... 
- // add v14.4S, v21.4S, v17.4S // .......*....................... - // sub v5.4S, v18.4S, v16.4S // .........*..................... - // sqrdmulh v24.4S, v22.4S, v3.S[1] // ........*...................... - // mul v19.4S, v22.4S, v3.S[0] // ..........*.................... - // sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*................... - // mls v19.4S, v24.4S, v8.S[0] // ............*.................. + // ldr q21, [x0, #896] // *.............................. + // ldr q6, [x0, #256] // ..*............................ + // ldr q27, [x0, #384] // ...*........................... + // ldr q12, [x0, #512] // ....*.......................... + // ldr q14, [x0, #640] // .......*....................... + // ldr q16, [x0, #768] // .*............................. + // sub v28.4S, v16.4S, v21.4S // ......*........................ + // add v5.4S, v16.4S, v21.4S // .....*......................... + // sub v11.4S, v12.4S, v14.4S // .........*..................... + // sqrdmulh v13.4S, v28.4S, v3.S[1] // ........*...................... + // mul v9.4S, v28.4S, v3.S[0] // ..........*.................... + // mul v23.4S, v11.4S, v2.S[2] // ...........*................... + // mls v9.4S, v13.4S, v8.S[0] // ............*.................. sub count, count, #1 layer123_start: @@ -1709,346 +1725,346 @@ layer123_start: // Expected cycles: 112 // Expected IPC: 1.07 // - // Wall time: 13.14s - // User time: 13.14s + // Wall time: 9.59s + // User time: 9.59s // // -------------------------------------------------- original position --------------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|------------------- - ldr q27, [x0, #0] // *....................................................................................................................... 
- ldr q11, [x0, #128] // .*...................................................................................................................... - sub v7.4S, v13.4S, v20.4S // .............*.......................................................................................................... - mul v24.4S, v5.4S, v2.S[2] // ....................*................................................................................................... - add v23.4S, v13.4S, v20.4S // ..............*......................................................................................................... - ldr q21, [x0, #784] // ......e................................................................................................................. - add v9.4S, v18.4S, v16.4S // ...................*.................................................................................................... - ldr q13, [x0, #272] // ..e..................................................................................................................... - ldr q20, [x0, #400] // ...e.................................................................................................................... - mul v15.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ - ldr q18, [x0, #528] // ....e................................................................................................................... - ldr q16, [x0, #656] // .....e.................................................................................................................. - ldr q17, [x0, #912] // .......e................................................................................................................ - sub v5.4S, v27.4S, v11.4S // ........*............................................................................................................... 
+ ldr q20, [x0, #0] // *....................................................................................................................... + ldr q18, [x0, #128] // .*...................................................................................................................... + sub v19.4S, v6.4S, v27.4S // .............*.......................................................................................................... + sqrdmulh v22.4S, v11.4S, v2.S[3] // ....................*................................................................................................... + add v11.4S, v6.4S, v27.4S // ..............*......................................................................................................... + ldr q21, [x0, #912] // .......e................................................................................................................ + add v13.4S, v12.4S, v14.4S // ...................*.................................................................................................... + ldr q6, [x0, #272] // ..e..................................................................................................................... + ldr q27, [x0, #400] // ...e.................................................................................................................... + sqrdmulh v17.4S, v19.4S, v2.S[1] // ...............*........................................................................................................ + ldr q12, [x0, #528] // ....e................................................................................................................... + ldr q14, [x0, #656] // .....e.................................................................................................................. + ldr q16, [x0, #784] // ......e................................................................................................................. 
+ sub v10.4S, v20.4S, v18.4S // ........*............................................................................................................... // gap // ........................................................................................................................ - add v27.4S, v27.4S, v11.4S // .........*.............................................................................................................. - sqrdmulh v11.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + add v20.4S, v20.4S, v18.4S // .........*.............................................................................................................. + mul v18.4S, v19.4S, v2.S[0] // ................*....................................................................................................... // gap // ........................................................................................................................ - sub v7.4S, v9.4S, v14.4S // ......................................*................................................................................. + sub v19.4S, v13.4S, v5.4S // ......................................*................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v24.4S, v4.4S, v8.S[0] // ......................*................................................................................................. - add v9.4S, v9.4S, v14.4S // .......................................*................................................................................ 
+ mls v23.4S, v22.4S, v8.S[0] // ......................*................................................................................................. + add v22.4S, v13.4S, v5.4S // .......................................*................................................................................ // gap // ........................................................................................................................ - sub v4.4S, v27.4S, v23.4S // ............................*........................................................................................... + sub v13.4S, v20.4S, v11.4S // ............................*........................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v27.4S, v27.4S, v23.4S // .............................*.......................................................................................... - mul v23.4S, v5.4S, v1.S[2] // ..........*............................................................................................................. + add v20.4S, v20.4S, v11.4S // .............................*.......................................................................................... + sqrdmulh v11.4S, v10.4S, v1.S[3] // ..........*............................................................................................................. // gap // ........................................................................................................................ - sub v22.4S, v21.4S, v17.4S // .......................e................................................................................................ 
+ sub v28.4S, v16.4S, v21.4S // .......................e................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v14.4S, v21.4S, v17.4S // ........................e............................................................................................... - mls v15.4S, v11.4S, v8.S[0] // .................*...................................................................................................... + add v5.4S, v16.4S, v21.4S // ........................e............................................................................................... + mls v18.4S, v17.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - sub v11.4S, v24.4S, v19.4S // ...........................................*............................................................................ + sub v17.4S, v23.4S, v9.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v21.4S, v5.4S, v1.S[3] // ...........*............................................................................................................ 
- add v24.4S, v24.4S, v19.4S // ............................................*........................................................................... + mul v16.4S, v10.4S, v1.S[2] // ...........*............................................................................................................ + add v10.4S, v23.4S, v9.4S // ............................................*........................................................................... // gap // ........................................................................................................................ - sub v17.4S, v27.4S, v9.4S // ................................................*....................................................................... + sub v23.4S, v20.4S, v22.4S // ................................................*....................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v27.4S, v27.4S, v9.4S // .................................................*...................................................................... - mul v9.4S, v4.4S, v0.S[2] // ..............................*......................................................................................... + add v20.4S, v20.4S, v22.4S // .................................................*...................................................................... + mls v16.4S, v11.4S, v8.S[0] // ............*........................................................................................................... // gap // ........................................................................................................................ 
- sub v5.4S, v18.4S, v16.4S // ..................e..................................................................................................... + sub v11.4S, v12.4S, v14.4S // ..................e..................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v4.4S, v0.S[3] // ...............................*........................................................................................ + sqrdmulh v22.4S, v13.4S, v0.S[3] // ..............................*......................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v19.4S, v7.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v9.4S, v19.4S, v1.S[1] // ........................................*............................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + sub v7.4S, v16.4S, v18.4S // .................................*...................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v19.4S, v19.4S, v1.S[0] // .........................................*.............................................................................. + add v18.4S, v16.4S, v18.4S // ..................................*..................................................................................... // gap // ........................................................................................................................ - sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v13.4S, v13.4S, v0.S[2] // ...............................*........................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mls v23.4S, v21.4S, v8.S[0] // ............*........................................................................................................... + sub v16.4S, v18.4S, v10.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + add v18.4S, v18.4S, v10.4S // ......................................................*................................................................. + mls v13.4S, v22.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v9.4S, v4.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ + sqrdmulh v22.4S, v7.4S, v0.S[3] // ...................................*.................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ + mul v10.4S, v7.4S, v0.S[2] // ....................................*................................................................................... // gap // ........................................................................................................................ - sub v7.4S, v23.4S, v15.4S // .................................*...................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v23.4S, v23.4S, v15.4S // ..................................*..................................................................................... - mul v21.4S, v11.4S, v1.S[0] // .............................................*.......................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // ..........................................*............................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v15.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v4.4S, v23.4S, v24.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ + mls v10.4S, v22.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ - add v23.4S, v23.4S, v24.4S // ......................................................*................................................................. - sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... // gap // ........................................................................................................................ - sub v24.4S, v9.4S, v19.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*......................................................................... - add v9.4S, v9.4S, v19.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ + sqrdmulh v22.4S, v17.4S, v1.S[1] // .............................................*.......................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v9.4S, v13.4S, v19.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ - mul v19.4S, v17.4S, v0.S[0] // ..................................................*..................................................................... // gap // ........................................................................................................................ + mul v17.4S, v17.4S, v1.S[0] // ..............................................*......................................................................... + add v19.4S, v13.4S, v19.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v15.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. + sqrdmulh v13.4S, v23.4S, v0.S[1] // ..................................................*..................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v11.4S, v8.S[0] // ...............................................*........................................................................ + mls v17.4S, v22.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v11.4S, v17.4S, v0.S[1] // ...................................................*.................................................................... + mul v22.4S, v23.4S, v0.S[0] // ...................................................*.................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v7.4S, v4.4S, v0.S[0] // .......................................................*................................................................ + mls v22.4S, v13.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sub v17.4S, v15.4S, v21.4S // ...............................................................*........................................................ + sub v13.4S, v10.4S, v17.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v21.4S, v15.4S, v21.4S // ................................................................*....................................................... - mul v15.4S, v27.4S, v25.4S // ........................................................................................*............................... + add v17.4S, v10.4S, v17.4S // ................................................................*....................................................... + sqrdmulh v10.4S, v20.4S, v26.4S // ........................................................................................*............................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v27.4S, v27.4S, v26.4S // .........................................................................................*.............................. 
+ mul v20.4S, v20.4S, v25.4S // .........................................................................................*.............................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v23.4S, v31.4S, v22.4S // ....................................................................*................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v7.4S, v22.4S, v30.4S // .....................................................................*.................................................. + sqrdmulh v24.4S, v16.4S, v0.S[1] // .......................................................*................................................................ // gap // ........................................................................................................................ - mls v19.4S, v11.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ........................................................*............................................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v11.4S, v4.4S, v0.S[1] // ........................................................*............................................................... + sub v23.4S, v23.4S, v7.4S // ......................................................................*................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sqrdmulh v7.4S, v9.4S, v0.S[1] // ............................................................*........................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v4.4S, v24.4S, v0.S[0] // ............................................................*........................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v29.4S, v31.4S, v19.4S // ....................................................................*................................................... 
+ mls v16.4S, v24.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v24.4S, v24.4S, v0.S[1] // .............................................................*.......................................................... - cmge v6.4S, v19.4S, v30.4S // .....................................................................*.................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v9.4S, v9.4S, v0.S[0] // .............................................................*.......................................................... // gap // ........................................................................................................................ - mls v7.4S, v11.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v11.4S, v29.4S, v6.4S // ......................................................................*................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v29.4S, v17.4S, v0.S[0] // .................................................................*...................................................... + mls v9.4S, v7.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v16.4S // ........................................................................*............................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v24.4S, v16.4S, v30.4S // .........................................................................*.............................................. + sqrdmulh v4.4S, v13.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ - mls v4.4S, v24.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - cmge v24.4S, v31.4S, v7.4S // ........................................................................*............................................... // gap // ........................................................................................................................ + mul v13.4S, v13.4S, v0.S[0] // ..................................................................*..................................................... // gap // ........................................................................................................................ - sqrdmulh v17.4S, v17.4S, v0.S[1] // ..................................................................*..................................................... - cmge v6.4S, v7.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ + sub v7.4S, v7.4S, v24.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v22.4S, v23.4S, v8.4S // .......................................................................*................................................ + cmge v23.4S, v31.4S, v9.4S // ............................................................................*........................................... // gap // ........................................................................................................................ 
- mls v19.4S, v11.4S, v8.4S // .......................................................................*................................................ + cmge v24.4S, v9.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v11.4S, v24.4S, v6.4S // ..........................................................................*............................................. + mls v13.4S, v4.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v15.4S, v27.4S, v8.S[0] // ..........................................................................................*............................. - cmge v27.4S, v31.4S, v4.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - cmge v24.4S, v4.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mls v29.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... + mls v16.4S, v7.4S, v8.4S // ...........................................................................*............................................ + sub v23.4S, v23.4S, v24.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ + str q22, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ - str q19, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ + mls v20.4S, v10.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - mls v7.4S, v11.4S, v8.4S // ...........................................................................*............................................ - sub v27.4S, v27.4S, v24.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ 
- cmge v11.4S, v31.4S, v15.4S // ....................................................................................................*................... + cmge v22.4S, v31.4S, v13.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v24.4S, v15.4S, v30.4S // .....................................................................................................*.................. - mul v17.4S, v23.4S, v25.4S // ...........................................................................................*............................ + mls v9.4S, v23.4S, v8.4S // ...............................................................................*........................................ + cmge v10.4S, v13.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ - cmge v19.4S, v31.4S, v29.4S // ................................................................................*....................................... + str q16, [x0, #640] // .....................................................................................*.................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mls v4.4S, v27.4S, v8.4S // ...............................................................................*........................................ - cmge v27.4S, v29.4S, v30.4S // .................................................................................*...................................... + sqrdmulh v16.4S, v18.4S, v26.4S // ...........................................................................................*............................ // gap // ........................................................................................................................ - str q7, [x0, #640] // .....................................................................................*.................................. - sub v11.4S, v11.4S, v24.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - sqrdmulh v7.4S, v23.4S, v26.4S // ............................................................................................*........................... + sub v22.4S, v22.4S, v10.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v27.4S, v19.4S, v27.4S // ..................................................................................*..................................... + mul v18.4S, v18.4S, v25.4S // ............................................................................................*........................... 
+ cmge v10.4S, v31.4S, v20.4S // ....................................................................................................*................... // gap // ........................................................................................................................ + str q9, [x0, #768] // ......................................................................................*................................. + cmge v23.4S, v20.4S, v30.4S // .....................................................................................................*.................. // gap // ........................................................................................................................ - mul v23.4S, v9.4S, v25.4S // ..............................................................................................*......................... + sqrdmulh v9.4S, v19.4S, v26.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q4, [x0, #768] // ......................................................................................*................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v24.4S, v9.4S, v26.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ 
+ mls v18.4S, v16.4S, v8.S[0] // .............................................................................................*.......................... + sub v16.4S, v10.4S, v23.4S // ......................................................................................................*................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v17.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... + mul v19.4S, v19.4S, v25.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................................*..................... 
+ sqrdmulh v10.4S, v17.4S, v26.4S // .................................................................................................*...................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v23.4S, v31.4S, v18.4S // ........................................................................................................*............... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // ................................................................................................*....................... + cmge v9.4S, v18.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ - mls v23.4S, v24.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v24.4S, v31.4S, v17.4S // ........................................................................................................*............... // gap // ........................................................................................................................ 
+ mul v17.4S, v17.4S, v25.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ - mul v9.4S, v21.4S, v25.4S // .................................................................................................*...................... - cmge v21.4S, v17.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ + sub v23.4S, v23.4S, v9.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v17.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ - mls v9.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ + cmge v10.4S, v31.4S, v19.4S // ............................................................................................................*........... // gap // ........................................................................................................................ 
- sub v7.4S, v24.4S, v21.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ + mls v13.4S, v22.4S, v8.4S // ...................................................................................*.................................... + cmge v22.4S, v19.4S, v30.4S // .............................................................................................................*.......... // gap // ........................................................................................................................ - mls v29.4S, v27.4S, v8.4S // ...................................................................................*.................................... - cmge v27.4S, v31.4S, v23.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - cmge v24.4S, v23.4S, v30.4S // .............................................................................................................*.......... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v15.4S, v11.4S, v8.4S // .......................................................................................................*................ + mls v20.4S, v16.4S, v8.4S // .......................................................................................................*................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v11.4S, v31.4S, v9.4S // ................................................................................................................*....... + cmge v16.4S, v31.4S, v17.4S // ................................................................................................................*....... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v17.4S, v7.4S, v8.4S // ...........................................................................................................*............ - cmge v7.4S, v9.4S, v30.4S // .................................................................................................................*...... + mls v18.4S, v23.4S, v8.4S // ...........................................................................................................*............ + cmge v23.4S, v17.4S, v30.4S // .................................................................................................................*...... // gap // ........................................................................................................................ - str q29, [x0, #896] // .......................................................................................*................................ - sub v27.4S, v27.4S, v24.4S // ..............................................................................................................*......... 
+ sub v22.4S, v10.4S, v22.4S // ..............................................................................................................*......... + str q13, [x0, #896] // .......................................................................................*................................ // gap // ........................................................................................................................ - sqrdmulh v24.4S, v22.4S, v3.S[1] // ..........................e............................................................................................. + sqrdmulh v13.4S, v28.4S, v3.S[1] // .........................e.............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q15, [x0], #(16) // ....................................................................................................................*... - sub v11.4S, v11.4S, v7.4S // ..................................................................................................................*..... + str q20, [x0], #(16) // ....................................................................................................................*... + sub v20.4S, v16.4S, v23.4S // ..................................................................................................................*..... // gap // ........................................................................................................................ - mls v23.4S, v27.4S, v8.4S // ...............................................................................................................*........ 
+ mls v19.4S, v22.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q17, [x0, #112] // .....................................................................................................................*.. + str q18, [x0, #112] // .....................................................................................................................*.. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v9.4S, v11.4S, v8.4S // ...................................................................................................................*.... + mls v17.4S, v20.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mul v19.4S, v22.4S, v3.S[0] // .........................e.............................................................................................. + mul v9.4S, v28.4S, v3.S[0] // ..........................e............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q23, [x0, #240] // ......................................................................................................................*. + str q19, [x0, #240] // ......................................................................................................................*. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v5.4S, v2.S[3] // .....................e.................................................................................................. + mul v23.4S, v11.4S, v2.S[2] // .....................e.................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- str q9, [x0, #368] // .......................................................................................................................* + str q17, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v24.4S, v8.S[0] // ...........................e............................................................................................ + mls v9.4S, v13.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ @@ -2061,568 +2077,571 @@ layer123_start: // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................'.......~.............................................................................................................. // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................'.........~............................................................................................................ // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................'..........~........................................................................................................... 
- // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................'....~................................................................................................................. - // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................'...........~.......................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .......e...........................................................................................................'...........~.......................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // e..................................................................................................................'....~................................................................................................................. // sub v24.4s, v9.4s, v10.4s // ........~..........................................................................................................'............*......................................................................................................... // add v9.4s, v9.4s, v10.4s // .........~.........................................................................................................'.............*........................................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................~..................................................................................................'....................*................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................~.............................................................................................'.........................*............................................................................................ - // mls v10.4s, v24.4s, v8.s[0] // ..............................~....................................................................................'..................................*................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................~..................................................................................................'....................*................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .....................~.............................................................................................'.........................*............................................................................................ + // mls v10.4s, v27.4s, v8.s[0] // .........................~.........................................................................................'.............................*........................................................................................ // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................'.*.................................................................................................................... // add v11.4s, v11.4s, v12.4s // ...................................................................................................................'...*.................................................................................................................. 
- // mul v12.4s, v24.4s, v2.s[0] // ....~..............................................................................................................'........*............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........~........................................................................................................'..............*....................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................~...............................................................................................'.......................*.............................................................................................. + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ....~..............................................................................................................'........*............................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ..........~........................................................................................................'..............*....................................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................~...............................................................................................'.......................*.............................................................................................. // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................'..............................~....................................................................................... 
// add v13.4s, v13.4s, v14.4s // .~.................................................................................................................'.....*................................................................................................................ - // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................'..*................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..'....................................................................................................................~. - // mls v14.4s, v24.4s, v8.s[0] // ............~......................................................................................................'................*..................................................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...................................................................................................................'..*................................................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................e..'....................................................................................................................~. + // mls v14.4s, v27.4s, v8.s[0] // ............~......................................................................................................'................*..................................................................................................... 
// sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................'.....................~................................................................................................ // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................'......................~............................................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ........................................................................................................e..........'............................................................................................................~......... // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....'..................................................................................................................~... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........'............................................................................................................~......... - // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e'...................................................................................................................... + // mls v16.4s, v27.4s, v8.s[0] // ..................................................................................................................e'...................................................................................................................... 
// sub v24.4s, v9.4s, v11.4s // ..............~....................................................................................................'..................*................................................................................................... // add v9.4s, v9.4s, v11.4s // ...............~...................................................................................................'...................*.................................................................................................. - // mul v11.4s, v24.4s, v0.s[2] // .........................~.........................................................................................'.............................*........................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................~.......................................................................................'...............................*...................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...............................~...................................................................................'...................................*.................................................................................. - // sub v24.4s, v10.4s, v12.4s // .................................~.................................................................................'.....................................*................................................................................ - // add v10.4s, v10.4s, v12.4s // ..................................~................................................................................'......................................*............................................................................... 
- // mul v12.4s, v24.4s, v0.s[2] // ....................................~..............................................................................'........................................*............................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................~...........................................................................'...........................................*.......................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ............................................~......................................................................'................................................*..................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...........................~.......................................................................................'...............................*...................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................................~..................................................................................'....................................*................................................................................. + // mls v11.4s, v27.4s, v8.s[0] // ...................................~...............................................................................'.......................................*.............................................................................. + // sub v24.4s, v10.4s, v12.4s // .............................~.....................................................................................'.................................*.................................................................................... 
+ // add v10.4s, v10.4s, v12.4s // ...............................~...................................................................................'...................................*.................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ....................................~..............................................................................'........................................*............................................................................. + // mul v12.4s, v24.4s, v0.s[2] // .....................................~.............................................................................'.........................................*............................................................................ + // mls v12.4s, v27.4s, v8.s[0] // .......................................~...........................................................................'...........................................*.......................................................................... // sub v24.4s, v13.4s, v15.4s // ...........~.......................................................................................................'...............*...................................................................................................... // add v13.4s, v13.4s, v15.4s // .............~.....................................................................................................'.................*.................................................................................................... - // mul v15.4s, v24.4s, v1.s[0] // ............................~......................................................................................'................................*..................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................~.....................................................................................'.................................*.................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ................................~..................................................................................'....................................*................................................................................. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ............................~......................................................................................'................................*..................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..............................~....................................................................................'..................................*................................................................................... + // mls v15.4s, v27.4s, v8.s[0] // ......................................~............................................................................'..........................................*........................................................................... // sub v24.4s, v14.4s, v16.4s // ....................~..............................................................................................'........................*............................................................................................. // add v14.4s, v14.4s, v16.4s // ......................~............................................................................................'..........................*........................................................................................... 
- // mul v16.4s, v24.4s, v1.s[0] // ...................................~...............................................................................'.......................................*.............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................~.........................................................................'.............................................*........................................................................ - // mls v16.4s, v24.4s, v8.s[0] // .............................................~.....................................................................'.................................................*.................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ........................................~..........................................................................'............................................*......................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ..........................................~........................................................................'..............................................*....................................................................... + // mls v16.4s, v27.4s, v8.s[0] // .............................................~.....................................................................'.................................................*.................................................................... // sub v24.4s, v9.4s, v13.4s // .......................~...........................................................................................'...........................*.......................................................................................... 
// add v9.4s, v9.4s, v13.4s // ........................~..........................................................................................'............................*......................................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ...........................................~.......................................................................'...............................................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................~....................................................................'..................................................*................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ....................................................~..............................................................'........................................................*............................................................. - // sub v24.4s, v10.4s, v14.4s // .....................................~.............................................................................'.........................................*............................................................................ - // add v10.4s, v10.4s, v14.4s // ......................................~............................................................................'..........................................*........................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ...............................................~...................................................................'...................................................*.................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................~.............................................................'.........................................................*............................................................ - // mls v14.4s, v24.4s, v8.s[0] // ..........................................................~........................................................'..............................................................*....................................................... - // sub v24.4s, v11.4s, v15.4s // ........................................~..........................................................................'............................................*......................................................................... - // add v11.4s, v11.4s, v15.4s // ..........................................~........................................................................'..............................................*....................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................................~............................................................'..........................................................*........................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................~..........................................................'............................................................*......................................................... - // mls v15.4s, v24.4s, v8.s[0] // .............................................................~.....................................................'.................................................................*.................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............................................~......................................................................'................................................*..................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ..............................................~....................................................................'..................................................*................................................................... + // mls v13.4s, v27.4s, v8.s[0] // ...............................................~...................................................................'...................................................*.................................................................. + // sub v24.4s, v10.4s, v14.4s // .................................~.................................................................................'.....................................*................................................................................ + // add v10.4s, v10.4s, v14.4s // ..................................~................................................................................'......................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ......................................................~............................................................'..........................................................*........................................................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................~...........................................................'...........................................................*.......................................................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ..........................................................~........................................................'..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // .........................................~.........................................................................'.............................................*........................................................................ + // add v11.4s, v11.4s, v15.4s // ...........................................~.......................................................................'...............................................*...................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................................................~.........................................................'.............................................................*........................................................ + // mul v15.4s, v24.4s, v0.s[0] // ...........................................................~.......................................................'...............................................................*...................................................... + // mls v15.4s, v27.4s, v8.s[0] // ............................................................~......................................................'................................................................*..................................................... // sub v24.4s, v12.4s, v16.4s // ................................................~..................................................................'....................................................*................................................................. 
// add v12.4s, v12.4s, v16.4s // .................................................~.................................................................'.....................................................*................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ............................................................~......................................................'................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................~...................................................'...................................................................*.................................................. - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................~............................................'..........................................................................*........................................... - // cmge v27.4s, v31.4s, v13.4s // .......................................................~...........................................................'...........................................................*.......................................................... - // cmge v28.4s, v13.4s, v30.4s // .........................................................~.........................................................'.............................................................*........................................................ - // sub v28.4s, v27.4s, v28.4s // ...........................................................~.......................................................'...............................................................*...................................................... 
- // mls v13.4s, v28.4s, v8.4s // .................................................................~.................................................'.....................................................................*................................................ - // cmge v27.4s, v31.4s, v14.4s // ..............................................................~....................................................'..................................................................*................................................... - // cmge v28.4s, v14.4s, v30.4s // ................................................................~..................................................'....................................................................*................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................~................................................'......................................................................*............................................... - // mls v14.4s, v28.4s, v8.4s // ........................................................................~..........................................'............................................................................*......................................... - // cmge v27.4s, v31.4s, v15.4s // ....................................................................~..............................................'........................................................................*............................................. - // cmge v28.4s, v15.4s, v30.4s // .....................................................................~.............................................'.........................................................................*............................................ 
- // sub v28.4s, v27.4s, v28.4s // .........................................................................~.........................................'.............................................................................*........................................ - // mls v15.4s, v28.4s, v8.4s // ..............................................................................~....................................'..................................................................................*................................... - // cmge v27.4s, v31.4s, v16.4s // .............................................................................~.....................................'.................................................................................*.................................... - // cmge v28.4s, v16.4s, v30.4s // ...............................................................................~...................................'...................................................................................*.................................. - // sub v28.4s, v27.4s, v28.4s // ...................................................................................~...............................'.......................................................................................*.............................. - // mls v16.4s, v28.4s, v8.4s // ...............................................................................................~...................'...................................................................................................*.................. - // str q13, [x0, #(4*(1024/8))] // .......................................................................~...........................................'...........................................................................*.......................................... 
- // str q14, [x0, #(5*(1024/8))] // ................................................................................~..................................'....................................................................................*................................. - // str q15, [x0, #(6*(1024/8))] // .....................................................................................~.............................'.........................................................................................*............................ - // str q16, [x0, #(7*(1024/8))] // ......................................................................................................~............'..........................................................................................................*........... - // mul v13.4s, v9.4s, v25.4s // ..................................................~................................................................'......................................................*............................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................~...............................................................'.......................................................*.............................................................. - // mls v13.4s, v9.4s, v8.s[0] // ...................................................................~...............................................'.......................................................................*.............................................. - // mul v14.4s, v10.4s, v25.4s // ............................................................................~......................................'................................................................................*..................................... 
- // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................~................................'......................................................................................*............................... - // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................~...........................'...........................................................................................*.......................... - // mul v15.4s, v11.4s, v25.4s // ....................................................................................~..............................'........................................................................................*............................. - // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................~............................'..........................................................................................*........................... - // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................~.........................'.............................................................................................*........................ - // mul v16.4s, v12.4s, v25.4s // ...........................................................................................~.......................'...............................................................................................*...................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................~..........................'............................................................................................*......................... 
- // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................~.....................'.................................................................................................*.................... - // cmge v27.4s, v31.4s, v13.4s // ..........................................................................~........................................'..............................................................................*....................................... - // cmge v28.4s, v13.4s, v30.4s // ...........................................................................~.......................................'...............................................................................*...................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................~.................................'.....................................................................................*................................ - // mls v13.4s, v28.4s, v8.4s // ..................................................................................................~................'......................................................................................................*............... - // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................~........................'..............................................................................................*....................... - // cmge v28.4s, v14.4s, v30.4s // ............................................................................................~......................'................................................................................................*..................... 
- // sub v28.4s, v27.4s, v28.4s // ..............................................................................................~....................'..................................................................................................*................... - // mls v14.4s, v28.4s, v8.4s // ....................................................................................................~..............'........................................................................................................*............. - // cmge v27.4s, v31.4s, v15.4s // ................................................................................................~..................'....................................................................................................*................. - // cmge v28.4s, v15.4s, v30.4s // .................................................................................................~.................'.....................................................................................................*................ - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~...........'...........................................................................................................*.......... - // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................~.......'...............................................................................................................*...... - // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................~...............'.......................................................................................................*.............. 
- // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................~.............'.........................................................................................................*............ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...............................................................~...................................................'...................................................................*.................................................. + // mul v16.4s, v24.4s, v0.s[0] // ................................................................~..................................................'....................................................................*................................................. + // mls v16.4s, v27.4s, v8.s[0] // .....................................................................~.............................................'.........................................................................*............................................ + // cmge v27.4s, v31.4s, v13.4s // ....................................................~..............................................................'........................................................*............................................................. + // cmge v28.4s, v13.4s, v30.4s // .....................................................~.............................................................'.........................................................*............................................................ + // sub v28.4s, v27.4s, v28.4s // ........................................................~..........................................................'............................................................*......................................................... 
+ // mls v13.4s, v28.4s, v8.4s // ..................................................................~................................................'......................................................................*............................................... + // cmge v27.4s, v31.4s, v14.4s // .............................................................~.....................................................'.................................................................*.................................................... + // cmge v28.4s, v14.4s, v30.4s // ..............................................................~....................................................'..................................................................*................................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................~.................................................'.....................................................................*................................................ + // mls v14.4s, v28.4s, v8.4s // ......................................................................~............................................'..........................................................................*........................................... + // cmge v27.4s, v31.4s, v15.4s // ...................................................................~...............................................'.......................................................................*.............................................. + // cmge v28.4s, v15.4s, v30.4s // ....................................................................~..............................................'........................................................................*............................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................~...........................................'...........................................................................*.......................................... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................~.......................................'...............................................................................*...................................... + // cmge v27.4s, v31.4s, v16.4s // ..........................................................................~........................................'..............................................................................*....................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................~......................................'................................................................................*..................................... + // sub v28.4s, v27.4s, v28.4s // ...............................................................................~...................................'...................................................................................*.................................. + // mls v16.4s, v28.4s, v8.4s // ................................................................................................~..................'....................................................................................................*................. + // str q13, [x0, #(4*(1024/8))] // ........................................................................~..........................................'............................................................................*......................................... 
+ // str q14, [x0, #(5*(1024/8))] // .............................................................................~.....................................'.................................................................................*.................................... + // str q15, [x0, #(6*(1024/8))] // ..................................................................................~................................'......................................................................................*............................... + // str q16, [x0, #(7*(1024/8))] // .......................................................................................................~...........'...........................................................................................................*.......... + // sqrdmulh v27.4s, v9.4s, v26.4s // ..................................................~................................................................'......................................................*............................................................... + // mul v9.4s, v9.4s, v25.4s // ...................................................~...............................................................'.......................................................*.............................................................. + // mls v9.4s, v27.4s, v8.s[0] // .........................................................................~.........................................'.............................................................................*........................................ + // sqrdmulh v27.4s, v10.4s, v26.4s // ..............................................................................~....................................'..................................................................................*................................... 
+ // mul v10.4s, v10.4s, v25.4s // ................................................................................~..................................'....................................................................................*................................. + // mls v10.4s, v27.4s, v8.s[0] // .....................................................................................~.............................'.........................................................................................*............................ + // sqrdmulh v27.4s, v11.4s, v26.4s // ....................................................................................~..............................'........................................................................................*............................. + // mul v11.4s, v11.4s, v25.4s // .......................................................................................~...........................'...........................................................................................*.......................... + // mls v11.4s, v27.4s, v8.s[0] // ..........................................................................................~........................'..............................................................................................*....................... + // sqrdmulh v27.4s, v12.4s, v26.4s // ........................................................................................~..........................'............................................................................................*......................... + // mul v12.4s, v12.4s, v25.4s // ............................................................................................~......................'................................................................................................*..................... 
+ // mls v12.4s, v27.4s, v8.s[0] // ..............................................................................................~....................'..................................................................................................*................... + // cmge v27.4s, v31.4s, v9.4s // .................................................................................~.................................'.....................................................................................*................................ + // cmge v28.4s, v9.4s, v30.4s // ...................................................................................~...............................'.......................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ......................................................................................~............................'..........................................................................................*........................... + // mls v9.4s, v28.4s, v8.4s // ..................................................................................................~................'......................................................................................................*............... + // cmge v27.4s, v31.4s, v10.4s // .........................................................................................~.........................'.............................................................................................*........................ + // cmge v28.4s, v10.4s, v30.4s // ...........................................................................................~.......................'...............................................................................................*...................... 
+ // sub v28.4s, v27.4s, v28.4s // .............................................................................................~.....................'.................................................................................................*.................... + // mls v10.4s, v28.4s, v8.4s // ....................................................................................................~..............'........................................................................................................*............. + // cmge v27.4s, v31.4s, v11.4s // ...............................................................................................~...................'...................................................................................................*.................. + // cmge v28.4s, v11.4s, v30.4s // .................................................................................................~.................'.....................................................................................................*................ + // sub v28.4s, v27.4s, v28.4s // ......................................................................................................~............'..........................................................................................................*........... + // mls v11.4s, v28.4s, v8.4s // ...........................................................................................................~.......'...............................................................................................................*...... + // cmge v27.4s, v31.4s, v12.4s // ...................................................................................................~...............'.......................................................................................................*.............. 
+ // cmge v28.4s, v12.4s, v30.4s // .....................................................................................................~.............'.........................................................................................................*............ // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................~........'..............................................................................................................*....... - // mls v16.4s, v28.4s, v8.4s // .............................................................................................................~.....'.................................................................................................................*.... - // str q13, [x0], #(16) // .........................................................................................................~.........'.............................................................................................................*........ - // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................~......'................................................................................................................*..... - // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................~...'...................................................................................................................*.. 
- // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................~.'.....................................................................................................................* + // mls v12.4s, v28.4s, v8.4s // .............................................................................................................~.....'.................................................................................................................*.... + // str q9, [x0], #(16) // .........................................................................................................~.........'.............................................................................................................*........ + // str q10, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................~......'................................................................................................................*..... + // str q11, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................~...'...................................................................................................................*.. 
+ // str q12, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................~.'.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start // Instructions: 107 - // Expected cycles: 108 - // Expected IPC: 0.99 + // Expected cycles: 109 + // Expected IPC: 0.98 // - // Wall time: 2.05s - // User time: 2.05s + // Wall time: 1.98s + // User time: 1.98s // // ------------------------------------------- original position --------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|------ - mul v27.4S, v5.4S, v2.S[2] // ...*....................................................................................................... - add v11.4S, v18.4S, v16.4S // .....*..................................................................................................... - ldr q7, [x0, #0] // *.......................................................................................................... - ldr q23, [x0, #128] // .*......................................................................................................... - sub v24.4S, v13.4S, v20.4S // ..*........................................................................................................ + sub v18.4S, v6.4S, v27.4S // ..*........................................................................................................ + add v20.4S, v6.4S, v27.4S // ....*...................................................................................................... + ldr q19, [x0, #0] // *.......................................................................................................... 
+ sqrdmulh v11.4S, v11.4S, v2.S[3] // ...*....................................................................................................... + add v22.4S, v12.4S, v14.4S // .....*..................................................................................................... + ldr q13, [x0, #128] // .*......................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v6.4S, v18.4S, v2.S[1] // ......*.................................................................................................... // gap // ........................................................................................................... - add v9.4S, v13.4S, v20.4S // ....*...................................................................................................... - mls v27.4S, v4.4S, v8.S[0] // ...........*............................................................................................... // gap // ........................................................................................................... - sub v13.4S, v11.4S, v14.4S // ..........*................................................................................................ + sub v27.4S, v22.4S, v5.4S // ..........*................................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... 
- add v11.4S, v11.4S, v14.4S // ............*.............................................................................................. - mul v20.4S, v24.4S, v2.S[0] // ......*.................................................................................................... + mul v18.4S, v18.4S, v2.S[0] // .........*................................................................................................. + add v22.4S, v22.4S, v5.4S // ............*.............................................................................................. // gap // ........................................................................................................... - sub v21.4S, v7.4S, v23.4S // .......*................................................................................................... + add v17.4S, v19.4S, v13.4S // ........*.................................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - add v7.4S, v7.4S, v23.4S // ........*.................................................................................................. - sqrdmulh v23.4S, v24.4S, v2.S[1] // .........*................................................................................................. + mls v23.4S, v11.4S, v8.S[0] // ...........*............................................................................................... + sub v19.4S, v19.4S, v13.4S // .......*................................................................................................... // gap // ........................................................................................................... 
- sub v24.4S, v27.4S, v19.4S // .................*......................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - add v27.4S, v27.4S, v19.4S // ...................*....................................................................................... - mul v15.4S, v21.4S, v1.S[2] // ...............*........................................................................................... // gap // ........................................................................................................... - sub v18.4S, v7.4S, v9.4S // .............*............................................................................................. + mls v18.4S, v6.4S, v8.S[0] // ................*.......................................................................................... + sub v11.4S, v17.4S, v20.4S // .............*............................................................................................. // gap // ........................................................................................................... + add v20.4S, v17.4S, v20.4S // ..............*............................................................................................ // gap // ........................................................................................................... - add v7.4S, v7.4S, v9.4S // ..............*............................................................................................ - mul v9.4S, v13.4S, v1.S[0] // ........................*.................................................................................. // gap // ........................................................................................................... 
+ sqrdmulh v13.4S, v19.4S, v1.S[3] // ...............*........................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v6.4S, v23.4S, v9.4S // .................*......................................................................................... // gap // ........................................................................................................... - sqrdmulh v13.4S, v13.4S, v1.S[1] // .........................*................................................................................. // gap // ........................................................................................................... + add v17.4S, v23.4S, v9.4S // ...................*....................................................................................... + sqrdmulh v12.4S, v27.4S, v1.S[1] // ........................*.................................................................................. // gap // ........................................................................................................... - sub v16.4S, v7.4S, v11.4S // ....................*...................................................................................... + sub v14.4S, v20.4S, v22.4S // ....................*...................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - add v11.4S, v7.4S, v11.4S // .....................*..................................................................................... 
- mls v20.4S, v23.4S, v8.S[0] // ................*.......................................................................................... + mul v27.4S, v27.4S, v1.S[0] // ..........................*................................................................................ + add v20.4S, v20.4S, v22.4S // .....................*..................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v7.4S, v21.4S, v1.S[3] // ..................*........................................................................................ + mul v19.4S, v19.4S, v1.S[2] // ..................*........................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v23.4S, v18.4S, v0.S[2] // ......................*.................................................................................... 
+ mls v19.4S, v13.4S, v8.S[0] // ......................*.................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v21.4S, v18.4S, v0.S[3] // .......................*................................................................................... + sqrdmulh v22.4S, v11.4S, v0.S[3] // .......................*................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v18.4S, v24.4S, v1.S[0] // ...............................*........................................................................... + mul v11.4S, v11.4S, v0.S[2] // ............................*.............................................................................. 
// gap // ........................................................................................................... // gap // ........................................................................................................... + sub v13.4S, v19.4S, v18.4S // .........................*................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + add v18.4S, v19.4S, v18.4S // ...........................*............................................................................... + mls v27.4S, v12.4S, v8.S[0] // ..................................*........................................................................ // gap // ........................................................................................................... - sqrdmulh v24.4S, v24.4S, v1.S[1] // .....................................*..................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v11.4S, v22.4S, v8.S[0] // ...............................*........................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- mls v15.4S, v7.4S, v8.S[0] // ..........................*................................................................................ + sub v19.4S, v18.4S, v17.4S // .............................*............................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + add v18.4S, v18.4S, v17.4S // ..............................*............................................................................ + sqrdmulh v22.4S, v13.4S, v0.S[3] // ................................*.......................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v23.4S, v21.4S, v8.S[0] // ...........................*............................................................................... // gap // ........................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .................................*......................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v17.4S, v11.4S, v27.4S // .....................................*..................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v9.4S, v13.4S, v8.S[0] // ............................*.............................................................................. + add v11.4S, v11.4S, v27.4S // .......................................*................................................................... + sqrdmulh v27.4S, v6.4S, v1.S[1] // ....................................*...................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - add v7.4S, v15.4S, v20.4S // ..............................*............................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v13.4S, v15.4S, v20.4S // .............................*............................................................................. - mul v20.4S, v16.4S, v0.S[0] // .......................................*................................................................... + mls v13.4S, v22.4S, v8.S[0] // ...................................*....................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v24.4S, v8.S[0] // .........................................*................................................................. - sub v24.4S, v7.4S, v27.4S // .................................*......................................................................... // gap // ........................................................................................................... - add v27.4S, v7.4S, v27.4S // ..................................*........................................................................ + mul v22.4S, v6.4S, v1.S[0] // ......................................*.................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v7.4S, v13.4S, v0.S[2] // ................................*.......................................................................... - sub v21.4S, v23.4S, v9.4S // ....................................*...................................................................... // gap // ........................................................................................................... - add v23.4S, v23.4S, v9.4S // ......................................*.................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- sqrdmulh v9.4S, v13.4S, v0.S[3] // ...................................*....................................................................... + sqrdmulh v6.4S, v14.4S, v0.S[1] // ........................................*.................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v13.4S, v16.4S, v0.S[1] // ..........................................*................................................................ + mls v22.4S, v27.4S, v8.S[0] // .........................................*................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v15.4S, v24.4S, v0.S[0] // ...........................................*............................................................... 
+ mul v27.4S, v14.4S, v0.S[0] // ..........................................*................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v16.4S, v11.4S, v25.4S // ..............................................*............................................................ + mls v27.4S, v6.4S, v8.S[0] // ...........................................*............................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v6.4S, v13.4S, v22.4S // ............................................*.............................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + add v22.4S, v13.4S, v22.4S // .............................................*............................................................. + sqrdmulh v13.4S, v20.4S, v26.4S // ..............................................*............................................................ 
// gap // ........................................................................................................... - sqrdmulh v11.4S, v11.4S, v26.4S // ...............................................*........................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v20.4S, v20.4S, v25.4S // ...............................................*........................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v7.4S, v9.4S, v8.S[0] // ........................................*.................................................................. + cmge v12.4S, v31.4S, v27.4S // ................................................*.......................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v14.4S, v27.4S, v30.4S // .................................................*......................................................... + sqrdmulh v16.4S, v19.4S, v0.S[1] // ..................................................*........................................................ // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v20.4S, v13.4S, v8.S[0] // ................................................*.......................................................... // gap // ........................................................................................................... + mul v19.4S, v19.4S, v0.S[0] // ...................................................*....................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v12.4S, v12.4S, v14.4S // ....................................................*...................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v24.4S, v24.4S, v0.S[1] // .................................................*......................................................... + sqrdmulh v14.4S, v17.4S, v0.S[1] // .....................................................*..................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v9.4S, v7.4S, v18.4S // ............................................*.............................................................. 
// gap // ........................................................................................................... // gap // ........................................................................................................... - add v7.4S, v7.4S, v18.4S // .............................................*............................................................. - mul v13.4S, v21.4S, v0.S[0] // ..................................................*........................................................ // gap // ........................................................................................................... - cmge v18.4S, v31.4S, v20.4S // ...................................................*....................................................... + mls v19.4S, v16.4S, v8.S[0] // ......................................................*.................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................................................*...................................................... - cmge v17.4S, v20.4S, v30.4S // .....................................................*..................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // .......................................................*................................................... 
// gap // ........................................................................................................... - mls v15.4S, v24.4S, v8.S[0] // ......................................................*.................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v24.4S, v18.4S, v17.4S // .......................................................*................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v18.4S, v9.4S, v0.S[0] // ........................................................*.................................................. + mls v17.4S, v14.4S, v8.S[0] // ........................................................*.................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v14.4S, v31.4S, v19.4S // .........................................................*................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v16.4S, v19.4S, v30.4S // ..........................................................*................................................ 
+ sqrdmulh v10.4S, v6.4S, v0.S[1] // ...........................................................*............................................... // gap // ........................................................................................................... - mls v13.4S, v21.4S, v8.S[0] // .........................................................*................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v21.4S, v31.4S, v15.4S // ..........................................................*................................................ // gap // ........................................................................................................... + mul v6.4S, v6.4S, v0.S[0] // ............................................................*.............................................. // gap // ........................................................................................................... - sqrdmulh v9.4S, v9.4S, v0.S[1] // ...........................................................*............................................... - cmge v17.4S, v15.4S, v30.4S // ............................................................*.............................................. // gap // ........................................................................................................... + sub v14.4S, v14.4S, v16.4S // .............................................................*............................................. // gap // ........................................................................................................... // gap // ........................................................................................................... 
+ mls v27.4S, v12.4S, v8.4S // ..............................................................*............................................ + cmge v12.4S, v31.4S, v17.4S // ...............................................................*........................................... // gap // ........................................................................................................... - mls v20.4S, v24.4S, v8.4S // .............................................................*............................................. + cmge v16.4S, v17.4S, v30.4S // ................................................................*.......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v24.4S, v21.4S, v17.4S // ..............................................................*............................................ + mls v6.4S, v10.4S, v8.S[0] // .................................................................*......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v16.4S, v11.4S, v8.S[0] // ...............................................................*........................................... - cmge v11.4S, v31.4S, v13.4S // ................................................................*.......................................... // gap // ........................................................................................................... - cmge v21.4S, v13.4S, v30.4S // .................................................................*......................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v9.4S, v8.S[0] // ..................................................................*........................................ + mls v19.4S, v14.4S, v8.4S // ..................................................................*........................................ + sub v12.4S, v12.4S, v16.4S // ...................................................................*....................................... // gap // ........................................................................................................... + str q27, [x0, #512] // ....................................................................*...................................... // gap // ........................................................................................................... - str q20, [x0, #512] // ...................................................................*....................................... // gap // ........................................................................................................... + mls v20.4S, v13.4S, v8.S[0] // .....................................................................*..................................... // gap // ........................................................................................................... - mls v15.4S, v24.4S, v8.4S // ....................................................................*...................................... - sub v11.4S, v11.4S, v21.4S // .....................................................................*..................................... // gap // ........................................................................................................... 
- cmge v24.4S, v31.4S, v16.4S // ......................................................................*.................................... + cmge v13.4S, v31.4S, v6.4S // ......................................................................*.................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v9.4S, v16.4S, v30.4S // .......................................................................*................................... - mul v20.4S, v27.4S, v25.4S // ........................................................................*.................................. + cmge v27.4S, v6.4S, v30.4S // ........................................................................*.................................. + sqrdmulh v14.4S, v18.4S, v26.4S // ..........................................................................*................................ // gap // ........................................................................................................... - cmge v21.4S, v31.4S, v18.4S // .........................................................................*................................. + str q19, [x0, #640] // .........................................................................*................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v27.4S, v27.4S, v26.4S // ..............................................................................*............................ - cmge v17.4S, v18.4S, v30.4S // ...........................................................................*............................... 
+ mul v18.4S, v18.4S, v25.4S // ............................................................................*.............................. // gap // ........................................................................................................... - str q15, [x0, #640] // ............................................................................*.............................. - sub v24.4S, v24.4S, v9.4S // .............................................................................*............................. // gap // ........................................................................................................... - mul v9.4S, v23.4S, v25.4S // ................................................................................*.......................... + sub v19.4S, v13.4S, v27.4S // ...........................................................................*............................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v21.4S, v21.4S, v17.4S // ...............................................................................*........................... + cmge v13.4S, v31.4S, v20.4S // .............................................................................*............................. + sqrdmulh v27.4S, v11.4S, v26.4S // ................................................................................*.......................... // gap // ........................................................................................................... + cmge v16.4S, v20.4S, v30.4S // ...............................................................................*........................... // gap // ........................................................................................................... 
- sqrdmulh v23.4S, v23.4S, v26.4S // ..................................................................................*........................ // gap // ........................................................................................................... + mls v18.4S, v14.4S, v8.S[0] // .................................................................................*......................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v20.4S, v27.4S, v8.S[0] // ...................................................................................*....................... // gap // ........................................................................................................... + mul v11.4S, v11.4S, v25.4S // ...................................................................................*....................... + sub v13.4S, v13.4S, v16.4S // ..................................................................................*........................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- sqrdmulh v27.4S, v7.4S, v26.4S // ....................................................................................*...................... + sqrdmulh v14.4S, v22.4S, v26.4S // ....................................................................................*...................... // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v16.4S, v31.4S, v18.4S // .....................................................................................*..................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v11.4S, v27.4S, v8.S[0] // ......................................................................................*.................... + cmge v27.4S, v18.4S, v30.4S // .......................................................................................*................... // gap // ........................................................................................................... - mls v9.4S, v23.4S, v8.S[0] // .....................................................................................*..................... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v23.4S, v31.4S, v20.4S // ......................................................................................*.................... // gap // ........................................................................................................... 
+ mul v22.4S, v22.4S, v25.4S // ........................................................................................*.................. // gap // ........................................................................................................... - mul v7.4S, v7.4S, v25.4S // .......................................................................................*................... - cmge v15.4S, v20.4S, v30.4S // ........................................................................................*.................. // gap // ........................................................................................................... + sub v27.4S, v16.4S, v27.4S // .........................................................................................*................. // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v22.4S, v14.4S, v8.S[0] // ..........................................................................................*................ // gap // ........................................................................................................... - mls v7.4S, v27.4S, v8.S[0] // .........................................................................................*................. // gap // ........................................................................................................... + cmge v14.4S, v31.4S, v11.4S // ...........................................................................................*............... // gap // ........................................................................................................... - sub v27.4S, v23.4S, v15.4S // ..........................................................................................*................ 
// gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.4S // .......................................................................*................................... + cmge v12.4S, v11.4S, v30.4S // .............................................................................................*............. // gap // ........................................................................................................... - mls v13.4S, v11.4S, v8.4S // ..........................................................................*................................ - cmge v11.4S, v31.4S, v9.4S // ............................................................................................*.............. // gap // ........................................................................................................... - cmge v23.4S, v9.4S, v30.4S // .............................................................................................*............. // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v18.4S, v21.4S, v8.4S // ...........................................................................................*............... + mls v6.4S, v19.4S, v8.4S // ............................................................................................*.............. // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v21.4S, v31.4S, v7.4S // ...............................................................................................*........... 
+ cmge v19.4S, v31.4S, v22.4S // ...............................................................................................*........... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v16.4S, v24.4S, v8.4S // ..............................................................................................*............ - cmge v24.4S, v7.4S, v30.4S // .................................................................................................*......... + mls v20.4S, v13.4S, v8.4S // ..............................................................................................*............ + cmge v13.4S, v22.4S, v30.4S // .................................................................................................*......... // gap // ........................................................................................................... - str q13, [x0, #768] // .................................................................................*......................... - sub v11.4S, v11.4S, v23.4S // ...................................................................................................*....... + sub v12.4S, v14.4S, v12.4S // ..................................................................................................*........ + str q17, [x0, #768] // ..............................................................................*............................ // gap // ........................................................................................................... - mls v20.4S, v27.4S, v8.4S // ................................................................................................*.......... + mls v18.4S, v27.4S, v8.4S // ................................................................................................*.......... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - str q18, [x0, #896] // ..................................................................................................*........ - sub v27.4S, v21.4S, v24.4S // .....................................................................................................*..... + str q6, [x0, #896] // ...................................................................................................*....... + sub v19.4S, v19.4S, v13.4S // .....................................................................................................*..... // gap // ........................................................................................................... - mls v9.4S, v11.4S, v8.4S // ......................................................................................................*.... + mls v11.4S, v12.4S, v8.4S // ......................................................................................................*.... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q16, [x0], #(16) // ....................................................................................................*...... + str q20, [x0], #(16) // ....................................................................................................*...... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- mls v7.4S, v27.4S, v8.4S // ........................................................................................................*.. + mls v22.4S, v19.4S, v8.4S // ........................................................................................................*.. // gap // ........................................................................................................... // gap // ........................................................................................................... - str q20, [x0, #112] // .......................................................................................................*... + str q18, [x0, #112] // .......................................................................................................*... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q9, [x0, #240] // .........................................................................................................*. + str q11, [x0, #240] // .........................................................................................................*. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - str q7, [x0, #368] // ..........................................................................................................* + str q22, [x0, #368] // ..........................................................................................................* // gap // ........................................................................................................... // gap // ........................................................................................................... // ---------------------------------------------- new position ----------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|------ - // ldr q27, [x0, #0] // ..*........................................................................................................ - // ldr q11, [x0, #128] // ...*....................................................................................................... - // sub v7.4S, v13.4S, v20.4S // ....*...................................................................................................... - // mul v24.4S, v5.4S, v2.S[2] // *.......................................................................................................... - // add v23.4S, v13.4S, v20.4S // .....*..................................................................................................... - // add v9.4S, v18.4S, v16.4S // .*......................................................................................................... - // mul v15.4S, v7.4S, v2.S[0] // .........*................................................................................................. 
- // sub v5.4S, v27.4S, v11.4S // ..........*................................................................................................ - // add v27.4S, v27.4S, v11.4S // ...........*............................................................................................... - // sqrdmulh v11.4S, v7.4S, v2.S[1] // ............*.............................................................................................. - // sub v7.4S, v9.4S, v14.4S // .......*................................................................................................... - // mls v24.4S, v4.4S, v8.S[0] // ......*.................................................................................................... - // add v9.4S, v9.4S, v14.4S // ........*.................................................................................................. - // sub v4.4S, v27.4S, v23.4S // ................*.......................................................................................... - // add v27.4S, v27.4S, v23.4S // .................*......................................................................................... - // mul v23.4S, v5.4S, v1.S[2] // ...............*........................................................................................... - // mls v15.4S, v11.4S, v8.S[0] // ......................*.................................................................................... - // sub v11.4S, v24.4S, v19.4S // .............*............................................................................................. - // sqrdmulh v21.4S, v5.4S, v1.S[3] // .......................*................................................................................... - // add v24.4S, v24.4S, v19.4S // ..............*............................................................................................ 
- // sub v17.4S, v27.4S, v9.4S // ....................*...................................................................................... - // add v27.4S, v27.4S, v9.4S // .....................*..................................................................................... - // mul v9.4S, v4.4S, v0.S[2] // ........................*.................................................................................. - // sqrdmulh v4.4S, v4.4S, v0.S[3] // .........................*................................................................................. - // mul v19.4S, v7.4S, v1.S[0] // ..................*........................................................................................ - // sqrdmulh v7.4S, v7.4S, v1.S[1] // ...................*....................................................................................... - // mls v23.4S, v21.4S, v8.S[0] // ............................*.............................................................................. - // mls v9.4S, v4.4S, v8.S[0] // .............................*............................................................................. - // mls v19.4S, v7.4S, v8.S[0] // ..............................*............................................................................ - // sub v7.4S, v23.4S, v15.4S // ................................*.......................................................................... - // add v23.4S, v23.4S, v15.4S // ...............................*........................................................................... - // mul v21.4S, v11.4S, v1.S[0] // ..........................*................................................................................ - // mul v15.4S, v7.4S, v0.S[2] // .....................................*..................................................................... 
- // sub v4.4S, v23.4S, v24.4S // ...................................*....................................................................... - // add v23.4S, v23.4S, v24.4S // ....................................*...................................................................... - // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. - // sub v24.4S, v9.4S, v19.4S // ......................................*.................................................................... - // sqrdmulh v11.4S, v11.4S, v1.S[1] // ...........................*............................................................................... - // add v9.4S, v9.4S, v19.4S // .......................................*................................................................... - // mul v19.4S, v17.4S, v0.S[0] // .................................*......................................................................... - // mls v15.4S, v7.4S, v8.S[0] // .............................................*............................................................. - // mls v21.4S, v11.4S, v8.S[0] // ..................................*........................................................................ - // sqrdmulh v11.4S, v17.4S, v0.S[1] // .........................................*................................................................. - // mul v7.4S, v4.4S, v0.S[0] // ..........................................*................................................................ - // sub v17.4S, v15.4S, v21.4S // ................................................*.......................................................... - // add v21.4S, v15.4S, v21.4S // .................................................*......................................................... 
- // mul v15.4S, v27.4S, v25.4S // ...........................................*............................................................... - // sqrdmulh v27.4S, v27.4S, v26.4S // ............................................*.............................................................. - // mls v19.4S, v11.4S, v8.S[0] // ..............................................*............................................................ - // sqrdmulh v11.4S, v4.4S, v0.S[1] // ...............................................*........................................................... - // mul v4.4S, v24.4S, v0.S[0] // ..................................................*........................................................ - // cmge v29.4S, v31.4S, v19.4S // ...................................................*....................................................... - // sqrdmulh v24.4S, v24.4S, v0.S[1] // ....................................................*...................................................... - // cmge v6.4S, v19.4S, v30.4S // .....................................................*..................................................... - // mls v7.4S, v11.4S, v8.S[0] // ......................................................*.................................................... - // sub v11.4S, v29.4S, v6.4S // .......................................................*................................................... - // mul v29.4S, v17.4S, v0.S[0] // ........................................................*.................................................. - // mls v4.4S, v24.4S, v8.S[0] // .........................................................*................................................. - // cmge v24.4S, v31.4S, v7.4S // ..........................................................*................................................ 
- // sqrdmulh v17.4S, v17.4S, v0.S[1] // ...........................................................*............................................... - // cmge v6.4S, v7.4S, v30.4S // ............................................................*.............................................. - // mls v19.4S, v11.4S, v8.4S // .............................................................*............................................. - // sub v11.4S, v24.4S, v6.4S // ..............................................................*............................................ - // mls v15.4S, v27.4S, v8.S[0] // ...............................................................*........................................... - // cmge v27.4S, v31.4S, v4.4S // ................................................................*.......................................... - // cmge v24.4S, v4.4S, v30.4S // .................................................................*......................................... - // mls v29.4S, v17.4S, v8.S[0] // ..................................................................*........................................ - // str q19, [x0, #512] // ...................................................................*....................................... - // mls v7.4S, v11.4S, v8.4S // ....................................................................*...................................... - // sub v27.4S, v27.4S, v24.4S // .....................................................................*..................................... - // cmge v11.4S, v31.4S, v15.4S // ......................................................................*.................................... - // cmge v24.4S, v15.4S, v30.4S // .......................................................................*................................... - // mul v17.4S, v23.4S, v25.4S // ........................................................................*.................................. 
- // cmge v19.4S, v31.4S, v29.4S // .........................................................................*................................. - // mls v4.4S, v27.4S, v8.4S // .........................................................................................*................. - // cmge v27.4S, v29.4S, v30.4S // ...........................................................................*............................... - // str q7, [x0, #640] // ............................................................................*.............................. - // sub v11.4S, v11.4S, v24.4S // .............................................................................*............................. - // sqrdmulh v7.4S, v23.4S, v26.4S // ..........................................................................*................................ - // sub v27.4S, v19.4S, v27.4S // ...............................................................................*........................... - // mul v23.4S, v9.4S, v25.4S // ..............................................................................*............................ - // str q4, [x0, #768] // ................................................................................................*.......... - // sqrdmulh v24.4S, v9.4S, v26.4S // ................................................................................*.......................... - // mls v17.4S, v7.4S, v8.S[0] // .................................................................................*......................... - // sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................*........................ - // mls v23.4S, v24.4S, v8.S[0] // ...................................................................................*....................... - // cmge v24.4S, v31.4S, v17.4S // ....................................................................................*...................... 
- // mul v9.4S, v21.4S, v25.4S // .....................................................................................*..................... - // cmge v21.4S, v17.4S, v30.4S // ......................................................................................*.................... - // mls v9.4S, v7.4S, v8.S[0] // .......................................................................................*................... - // sub v7.4S, v24.4S, v21.4S // ........................................................................................*.................. - // mls v29.4S, v27.4S, v8.4S // ............................................................................................*.............. - // cmge v27.4S, v31.4S, v23.4S // ..........................................................................................*................ - // cmge v24.4S, v23.4S, v30.4S // ...........................................................................................*............... - // mls v15.4S, v11.4S, v8.4S // ..............................................................................................*............ - // cmge v11.4S, v31.4S, v9.4S // .............................................................................................*............. - // mls v17.4S, v7.4S, v8.4S // ..................................................................................................*........ - // cmge v7.4S, v9.4S, v30.4S // ...............................................................................................*........... - // str q29, [x0, #896] // ...................................................................................................*....... - // sub v27.4S, v27.4S, v24.4S // .................................................................................................*......... - // str q15, [x0], #(16) // ......................................................................................................*.... 
- // sub v11.4S, v11.4S, v7.4S // ....................................................................................................*...... - // mls v23.4S, v27.4S, v8.4S // .....................................................................................................*..... - // str q17, [x0, #112] // ........................................................................................................*.. - // mls v9.4S, v11.4S, v8.4S // .......................................................................................................*... - // str q23, [x0, #240] // .........................................................................................................*. - // str q9, [x0, #368] // ..........................................................................................................* + // ldr q20, [x0, #0] // ..*........................................................................................................ + // ldr q18, [x0, #128] // .....*..................................................................................................... + // sub v19.4S, v6.4S, v27.4S // *.......................................................................................................... + // sqrdmulh v22.4S, v11.4S, v2.S[3] // ...*....................................................................................................... + // add v11.4S, v6.4S, v27.4S // .*......................................................................................................... + // add v13.4S, v12.4S, v14.4S // ....*...................................................................................................... + // sqrdmulh v17.4S, v19.4S, v2.S[1] // ......*.................................................................................................... + // sub v10.4S, v20.4S, v18.4S // ............*.............................................................................................. 
+ // add v20.4S, v20.4S, v18.4S // ..........*................................................................................................ + // mul v18.4S, v19.4S, v2.S[0] // ........*.................................................................................................. + // sub v19.4S, v13.4S, v5.4S // .......*................................................................................................... + // mls v23.4S, v22.4S, v8.S[0] // ...........*............................................................................................... + // add v22.4S, v13.4S, v5.4S // .........*................................................................................................. + // sub v13.4S, v20.4S, v11.4S // ..............*............................................................................................ + // add v20.4S, v20.4S, v11.4S // ...............*........................................................................................... + // sqrdmulh v11.4S, v10.4S, v1.S[3] // ................*.......................................................................................... + // mls v18.4S, v17.4S, v8.S[0] // .............*............................................................................................. + // sub v17.4S, v23.4S, v9.4S // .................*......................................................................................... + // mul v16.4S, v10.4S, v1.S[2] // .......................*................................................................................... + // add v10.4S, v23.4S, v9.4S // ..................*........................................................................................ + // sub v23.4S, v20.4S, v22.4S // ....................*...................................................................................... 
+ // add v20.4S, v20.4S, v22.4S // ......................*.................................................................................... + // mls v16.4S, v11.4S, v8.S[0] // ........................*.................................................................................. + // sqrdmulh v22.4S, v13.4S, v0.S[3] // .........................*................................................................................. + // sqrdmulh v9.4S, v19.4S, v1.S[1] // ...................*....................................................................................... + // sub v7.4S, v16.4S, v18.4S // ...........................*............................................................................... + // mul v19.4S, v19.4S, v1.S[0] // .....................*..................................................................................... + // add v18.4S, v16.4S, v18.4S // ............................*.............................................................................. + // mul v13.4S, v13.4S, v0.S[2] // ..........................*................................................................................ + // sub v16.4S, v18.4S, v10.4S // ...............................*........................................................................... + // add v18.4S, v18.4S, v10.4S // ................................*.......................................................................... + // mls v13.4S, v22.4S, v8.S[0] // ..............................*............................................................................ + // sqrdmulh v22.4S, v7.4S, v0.S[3] // .................................*......................................................................... + // mul v10.4S, v7.4S, v0.S[2] // ..................................*........................................................................ 
+ // mls v19.4S, v9.4S, v8.S[0] // .............................*............................................................................. + // mls v10.4S, v22.4S, v8.S[0] // ......................................*.................................................................... + // sqrdmulh v22.4S, v17.4S, v1.S[1] // .....................................*..................................................................... + // sub v9.4S, v13.4S, v19.4S // ...................................*....................................................................... + // mul v17.4S, v17.4S, v1.S[0] // .......................................*................................................................... + // add v19.4S, v13.4S, v19.4S // ....................................*...................................................................... + // sqrdmulh v13.4S, v23.4S, v0.S[1] // ........................................*.................................................................. + // mls v17.4S, v22.4S, v8.S[0] // .........................................*................................................................. + // mul v22.4S, v23.4S, v0.S[0] // ..........................................*................................................................ + // mls v22.4S, v13.4S, v8.S[0] // ...........................................*............................................................... + // sub v13.4S, v10.4S, v17.4S // ............................................*.............................................................. + // add v17.4S, v10.4S, v17.4S // .............................................*............................................................. + // sqrdmulh v10.4S, v20.4S, v26.4S // ..............................................*............................................................ 
+ // mul v20.4S, v20.4S, v25.4S // ...............................................*........................................................... + // cmge v23.4S, v31.4S, v22.4S // ................................................*.......................................................... + // cmge v7.4S, v22.4S, v30.4S // .................................................*......................................................... + // sqrdmulh v24.4S, v16.4S, v0.S[1] // ..................................................*........................................................ + // mul v16.4S, v16.4S, v0.S[0] // ...................................................*....................................................... + // sub v23.4S, v23.4S, v7.4S // ....................................................*...................................................... + // sqrdmulh v7.4S, v9.4S, v0.S[1] // .....................................................*..................................................... + // mls v16.4S, v24.4S, v8.S[0] // ......................................................*.................................................... + // mul v9.4S, v9.4S, v0.S[0] // .......................................................*................................................... + // mls v9.4S, v7.4S, v8.S[0] // ........................................................*.................................................. + // cmge v7.4S, v31.4S, v16.4S // .........................................................*................................................. + // cmge v24.4S, v16.4S, v30.4S // ..........................................................*................................................ + // sqrdmulh v4.4S, v13.4S, v0.S[1] // ...........................................................*............................................... 
+ // mul v13.4S, v13.4S, v0.S[0] // ............................................................*.............................................. + // sub v7.4S, v7.4S, v24.4S // .............................................................*............................................. + // mls v22.4S, v23.4S, v8.4S // ..............................................................*............................................ + // cmge v23.4S, v31.4S, v9.4S // ...............................................................*........................................... + // cmge v24.4S, v9.4S, v30.4S // ................................................................*.......................................... + // mls v13.4S, v4.4S, v8.S[0] // .................................................................*......................................... + // mls v16.4S, v7.4S, v8.4S // ..................................................................*........................................ + // sub v23.4S, v23.4S, v24.4S // ...................................................................*....................................... + // str q22, [x0, #512] // ....................................................................*...................................... + // mls v20.4S, v10.4S, v8.S[0] // .....................................................................*..................................... + // cmge v22.4S, v31.4S, v13.4S // ......................................................................*.................................... + // mls v9.4S, v23.4S, v8.4S // ..........................................................................................*................ + // cmge v10.4S, v13.4S, v30.4S // .......................................................................*................................... + // str q16, [x0, #640] // .........................................................................*................................. 
+ // sqrdmulh v16.4S, v18.4S, v26.4S // ........................................................................*.................................. + // sub v22.4S, v22.4S, v10.4S // ...........................................................................*............................... + // mul v18.4S, v18.4S, v25.4S // ..........................................................................*................................ + // cmge v10.4S, v31.4S, v20.4S // ............................................................................*.............................. + // str q9, [x0, #768] // .................................................................................................*......... + // cmge v23.4S, v20.4S, v30.4S // ..............................................................................*............................ + // sqrdmulh v9.4S, v19.4S, v26.4S // .............................................................................*............................. + // mls v18.4S, v16.4S, v8.S[0] // ...............................................................................*........................... + // sub v16.4S, v10.4S, v23.4S // .................................................................................*......................... + // mul v19.4S, v19.4S, v25.4S // ................................................................................*.......................... + // sqrdmulh v10.4S, v17.4S, v26.4S // ..................................................................................*........................ + // cmge v23.4S, v31.4S, v18.4S // ...................................................................................*....................... + // mls v19.4S, v9.4S, v8.S[0] // ....................................................................................*...................... 
+ // cmge v9.4S, v18.4S, v30.4S // .....................................................................................*..................... + // mul v17.4S, v17.4S, v25.4S // ......................................................................................*.................... + // sub v23.4S, v23.4S, v9.4S // .......................................................................................*................... + // mls v17.4S, v10.4S, v8.S[0] // ........................................................................................*.................. + // cmge v10.4S, v31.4S, v19.4S // .........................................................................................*................. + // mls v13.4S, v22.4S, v8.4S // ............................................................................................*.............. + // cmge v22.4S, v19.4S, v30.4S // ...........................................................................................*............... + // mls v20.4S, v16.4S, v8.4S // ..............................................................................................*............ + // cmge v16.4S, v31.4S, v17.4S // .............................................................................................*............. + // mls v18.4S, v23.4S, v8.4S // ..................................................................................................*........ + // cmge v23.4S, v17.4S, v30.4S // ...............................................................................................*........... + // sub v22.4S, v10.4S, v22.4S // ................................................................................................*.......... + // str q13, [x0, #896] // ...................................................................................................*....... + // str q20, [x0], #(16) // ......................................................................................................*.... 
+ // sub v20.4S, v16.4S, v23.4S // ....................................................................................................*...... + // mls v19.4S, v22.4S, v8.4S // .....................................................................................................*..... + // str q18, [x0, #112] // ........................................................................................................*.. + // mls v17.4S, v20.4S, v8.4S // .......................................................................................................*... + // str q19, [x0, #240] // .........................................................................................................*. + // str q17, [x0, #368] // ..........................................................................................................* pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s index f05c8e90..1ef7b881 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - 
vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, 
r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -245,6 +231,12 @@ xtmp1 .req x11 restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,62 +379,577 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm: qform_root3_tw .req q7 .p2align 2 - // Instructions: 10 - // Expected cycles: 4 - // Expected IPC: 2.50 - // - // Wall time: 0.09s - // User time: 0.09s - // - // ----- original position -----> - // 0 25 - // |------------------------|---- - // gap // .............................. - // gap // .............................. - // gap // .............................. 
- // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q29, [x1, #0] // .....*........................ - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q11, [x5, #96] // .........*.................... - ldr q0, [x5, #32] // *............................. - ldr q24, [x5], #(12*16) // .*............................ - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q19, [x1, #32] // ........*..................... - // gap // .............................. - ldr q2, [x5, #-112] // ...*.......................... - // gap // .............................. - ldr q30, [x5, #-128] // ....*......................... - ldr q7, [x1, #48] // ......*....................... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - ldr q15, [x5, #-144] // ..*........................... - ldr q6, [x1, #16] // .......*...................... - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // ldr q0, [x5, #32] // ..*............................ - // ldr q24, [x5], #(12*16) // ...*........................... - // ldr q15, [x5, #-144] // ........*...................... - // ldr q2, [x5, #-112] // .....*......................... - // ldr q30, [x5, #-128] // ......*........................ - // ldr q29, [x1, #0] // *.............................. - // ldr q7, [x1, #48] // .......*....................... - // ldr q6, [x1, #16] // .........*..................... - // ldr q19, [x1, #32] // ....*.......................... 
- // ldr q11, [x5, #-96] // .*............................. + // Instructions: 165 + // Expected cycles: 49 + // Expected IPC: 3.37 + // + // Wall time: 163.43s + // User time: 163.43s + // + // ------------------------------------------------------------------------ original position -------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + ldr q6, [x1, #0] // *.................................................................................................................................................................... + ldr q19, [x1, #16] // .*................................................................................................................................................................... + ldr q26, [x2, #0] // .....................*............................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + ldr q28, [x1, #32] // ..*.................................................................................................................................................................. + ldr q22, [x1, #48] // ...*................................................................................................................................................................. + ldr q29, [x5, #32] // ....*................................................................................................................................................................ + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + ldr q4, [x2, #16] // ....................*................................................................................................................................................ 
+ ldr q13, [x2, #32] // ......................*.............................................................................................................................................. + ldr q0, [x2, #48] // ...........................*......................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + ldr q14, [x5, #128] // ..............*...................................................................................................................................................... + ldr q15, [x5, #16] // .....*............................................................................................................................................................... + ldr q10, [x4, #32] // ............................*........................................................................................................................................ 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn2 v11.4S, v6.4S, v19.4S // ............*........................................................................................................................................................ + trn1 v6.4S, v6.4S, v19.4S // .............*....................................................................................................................................................... + ldr q19, [x5, #64] // ......*.............................................................................................................................................................. + ldr q21, [x5, #144] // ...............*..................................................................................................................................................... + ldr q30, [x4, #16] // .............................*....................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn2 v12.4S, v28.4S, v22.4S // .................*................................................................................................................................................... + trn1 v28.4S, v28.4S, v22.4S // ................*.................................................................................................................................................... + ldr q22, [x5, #80] // .......*............................................................................................................................................................. + ldr q3, [x5, #160] // ..................*.................................................................................................................................................. + ldr q7, [x4, #48] // ..............................*...................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + trn1 v9.4S, v26.4S, v4.4S // ....................................*................................................................................................................................ + trn2 v26.4S, v26.4S, v4.4S // .....................................*............................................................................................................................... + trn1 v4.4S, v13.4S, v0.4S // ........................................*............................................................................................................................ + trn2 v13.4S, v13.4S, v0.4S // .........................................*........................................................................................................................... + ldr q0, [x5, #176] // ...................*................................................................................................................................................. + ldr q17, [x5], #(12*16) // ........*............................................................................................................................................................ + ldr q31, [x4], #64 // ...............................*..................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn2 v23.2D, v11.2D, v12.2D // .......................*............................................................................................................................................. 
+ trn1 v11.2D, v11.2D, v12.2D // ........................*............................................................................................................................................ + trn2 v12.2D, v6.2D, v28.2D // .........................*........................................................................................................................................... + trn1 v6.2D, v6.2D, v28.2D // ..........................*.......................................................................................................................................... + ldr q28, [x5, #-144] // .........*........................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn1 v24.2D, v9.2D, v4.2D // ..............................................*...................................................................................................................... + trn2 v4.2D, v9.2D, v4.2D // ...............................................*..................................................................................................................... + trn1 v9.2D, v26.2D, v13.2D // ................................................*.................................................................................................................... 
+ trn2 v26.2D, v26.2D, v13.2D // .................................................*................................................................................................................... + ldr q13, [x5, #-96] // ..........*.......................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v18.4S, v12.4S, v23.4S // ................................*.................................................................................................................................... + add v12.4S, v12.4S, v23.4S // .................................*................................................................................................................................... + sub v23.4S, v6.4S, v11.4S // ..................................*.................................................................................................................................. + add v6.4S, v6.4S, v11.4S // ...................................*................................................................................................................................. + ldr q11, [x5, #-80] // ...........*......................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v25.4S, v24.4S, v9.4S // ......................................................*.............................................................................................................. + add v9.4S, v24.4S, v9.4S // .......................................................*............................................................................................................. + sub v24.4S, v4.4S, v26.4S // ........................................................*............................................................................................................ + add v26.4S, v4.4S, v26.4S // .........................................................*........................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + mul v29.4S, v23.4S, v29.4S // ......................................*.............................................................................................................................. + mul v19.4S, v18.4S, v19.4S // ..........................................*.......................................................................................................................... + sqrdmulh v22.4S, v18.4S, v22.4S // ...........................................*......................................................................................................................... + sqrdmulh v28.4S, v23.4S, v28.4S // .......................................*............................................................................................................................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mul v4.4S, v25.4S, v14.4S // ..........................................................*.......................................................................................................... 
+ sqrdmulh v14.4S, v25.4S, v21.4S // ...........................................................*......................................................................................................... + mul v21.4S, v24.4S, v3.4S // ............................................................*........................................................................................................ + sqrdmulh v0.4S, v24.4S, v0.4S // .............................................................*....................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v3.4S, v9.4S, v26.4S // .................................................................*................................................................................................... + add v26.4S, v9.4S, v26.4S // ..................................................................*.................................................................................................. + sub v9.4S, v6.4S, v12.4S // ............................................*........................................................................................................................ 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + add v6.4S, v6.4S, v12.4S // .............................................*....................................................................................................................... + mls v29.4S, v28.4S, v8.S[0] // ..................................................*.................................................................................................................. + mls v19.4S, v22.4S, v8.S[0] // .....................................................*............................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v4.4S, v14.4S, v8.S[0] // .....................................................................*............................................................................................... + mls v21.4S, v0.4S, v8.S[0] // ......................................................................*.............................................................................................. + sqrdmulh v28.4S, v9.4S, v15.4S // ...................................................*................................................................................................................. + mul v22.4S, v9.4S, v17.4S // ....................................................*................................................................................................................ + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + sqrdmulh v0.4S, v3.4S, v11.4S // .........................................................................*........................................................................................... + mul v14.4S, v3.4S, v13.4S // ..........................................................................*.......................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v12.4S, v29.4S, v19.4S // ..............................................................*...................................................................................................... 
+ add v19.4S, v29.4S, v19.4S // ...............................................................*..................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v29.4S, v4.4S, v21.4S // ............................................................................*........................................................................................ + add v4.4S, v4.4S, v21.4S // .............................................................................*....................................................................................... + mls v22.4S, v28.4S, v8.S[0] // ................................................................*.................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sqrdmulh v28.4S, v12.4S, v15.4S // ...................................................................*................................................................................................. + mul v15.4S, v12.4S, v17.4S // ....................................................................*................................................................................................ + trn1 v21.4S, v6.4S, v19.4S // .......................................................................*............................................................................................. + trn2 v6.4S, v6.4S, v19.4S // ........................................................................*............................................................................................ + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mul v19.4S, v29.4S, v13.4S // ...............................................................................*..................................................................................... + sqrdmulh v29.4S, v29.4S, v11.4S // ................................................................................*.................................................................................... + mls v14.4S, v0.4S, v8.S[0] // ..............................................................................*...................................................................................... + trn1 v13.4S, v26.4S, v4.4S // .................................................................................*................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + trn2 v26.4S, v26.4S, v4.4S // ..................................................................................*.................................................................................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v15.4S, v28.4S, v8.S[0] // ...........................................................................*......................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v19.4S, v29.4S, v8.S[0] // .......................................................................................*............................................................................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ trn1 v28.4S, v22.4S, v15.4S // ...................................................................................*................................................................................. + trn2 v22.4S, v22.4S, v15.4S // ....................................................................................*................................................................................ + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn1 v29.4S, v14.4S, v19.4S // ............................................................................................*........................................................................ + trn2 v19.4S, v14.4S, v19.4S // .............................................................................................*....................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn2 v4.2D, v21.2D, v28.2D // .....................................................................................*............................................................................... + trn1 v28.2D, v21.2D, v28.2D // ......................................................................................*.............................................................................. + trn2 v0.2D, v6.2D, v22.2D // ........................................................................................*............................................................................ + trn1 v6.2D, v6.2D, v22.2D // .........................................................................................*........................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + trn1 v22.2D, v13.2D, v29.2D // ..................................................................................................*.................................................................. + trn2 v29.2D, v13.2D, v29.2D // ...................................................................................................*................................................................. + trn1 v13.2D, v26.2D, v19.2D // ....................................................................................................*................................................................ + trn2 v19.2D, v26.2D, v19.2D // .....................................................................................................*............................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v26.4S, v4.4S, v0.4S // ..........................................................................................*.......................................................................... + add v4.4S, v4.4S, v0.4S // ...........................................................................................*......................................................................... + sub v0.4S, v28.4S, v6.4S // ...............................................................................................*..................................................................... + add v6.4S, v28.4S, v6.4S // ..............................................................................................*...................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ sub v28.4S, v22.4S, v13.4S // ...........................................................................................................*......................................................... + add v22.4S, v22.4S, v13.4S // ............................................................................................................*........................................................ + sub v13.4S, v29.4S, v19.4S // .............................................................................................................*....................................................... + add v19.4S, v29.4S, v19.4S // ..............................................................................................................*...................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sqrdmulh v29.4S, v26.4S, v10.S[1] // ................................................................................................*.................................................................... + mul v26.4S, v26.4S, v10.S[0] // .................................................................................................*................................................................... 
+ sqrdmulh v14.4S, v0.4S, v30.S[3] // ........................................................................................................*............................................................ + mul v0.4S, v0.4S, v30.S[2] // .........................................................................................................*........................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sqrdmulh v15.4S, v28.4S, v10.S[3] // ..................................................................................................................*.................................................. + mul v28.4S, v28.4S, v10.S[2] // ...................................................................................................................*................................................. + sqrdmulh v10.4S, v13.4S, v7.S[1] // ....................................................................................................................*................................................ + mul v13.4S, v13.4S, v7.S[0] // .....................................................................................................................*............................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v11.4S, v22.4S, v19.4S // .......................................................................................................................*............................................. + add v19.4S, v22.4S, v19.4S // ........................................................................................................................*............................................ + sub v22.4S, v6.4S, v4.4S // ......................................................................................................*.............................................................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + add v6.4S, v6.4S, v4.4S // .......................................................................................................*............................................................. + mls v26.4S, v29.4S, v8.S[0] // ..........................................................................................................*.......................................................... + mls v0.4S, v14.4S, v8.S[0] // ......................................................................................................................*.............................................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ mls v28.4S, v15.4S, v8.S[0] // ...........................................................................................................................*......................................... + mls v13.4S, v10.4S, v8.S[0] // ............................................................................................................................*........................................ + sqrdmulh v29.4S, v22.4S, v31.S[3] // ...............................................................................................................*..................................................... + mul v22.4S, v22.4S, v31.S[2] // ................................................................................................................*.................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sqrdmulh v4.4S, v11.4S, v30.S[1] // .............................................................................................................................*....................................... + mul v14.4S, v11.4S, v30.S[0] // ..............................................................................................................................*...................................... 
+ srshr v15.4S, v19.4S, #23 // .................................................................................................................................*................................... + srshr v10.4S, v6.4S, #23 // .................................................................................................................*................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v11.4S, v0.4S, v26.4S // ...............................................................................................................................*..................................... + add v0.4S, v0.4S, v26.4S // ................................................................................................................................*.................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v26.4S, v28.4S, v13.4S // ..................................................................................................................................*.................................. + add v28.4S, v28.4S, v13.4S // ...................................................................................................................................*................................. + mls v22.4S, v29.4S, v8.S[0] // ..........................................................................................................................*.......................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v19.4S, v15.4S, v8.4S // ......................................................................................................................................*.............................. + mls v6.4S, v10.4S, v8.4S // .........................................................................................................................*........................................... + sqrdmulh v29.4S, v11.4S, v31.S[3] // ....................................................................................................................................*................................ + mul v15.4S, v11.4S, v31.S[2] // .....................................................................................................................................*............................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ sqrdmulh v13.4S, v26.4S, v30.S[1] // .........................................................................................................................................*........................... + mul v10.4S, v26.4S, v30.S[0] // ..........................................................................................................................................*.......................... + srshr v26.4S, v0.4S, #23 // ........................................................................................................................................*............................ + srshr v11.4S, v28.4S, #23 // ...........................................................................................................................................*......................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v14.4S, v4.4S, v8.S[0] // .......................................................................................................................................*............................. + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + add v4.4S, v6.4S, v19.4S // .............................................................................................................................................*....................... + sub v6.4S, v6.4S, v19.4S // ..............................................................................................................................................*...................... + mls v15.4S, v29.4S, v8.S[0] // ............................................................................................................................................*........................ + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + mls v0.4S, v26.4S, v8.4S // ...............................................................................................................................................*..................... + mls v28.4S, v11.4S, v8.4S // ..................................................................................................................................................*.................. + mls v10.4S, v13.4S, v8.S[0] // ...................................................................................................................................................*................. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + add v19.4S, v22.4S, v14.4S // .................................................................................................................................................*................... + sub v22.4S, v22.4S, v14.4S // ................................................................................................................................................*.................... + str q4, [x1], #(16*4) // ....................................................................................................................................................*................ + sqrdmulh v29.4S, v6.4S, v31.S[1] // .....................................................................................................................................................*............... + mul v6.4S, v6.4S, v31.S[0] // ......................................................................................................................................................*.............. + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sub v13.4S, v15.4S, v10.4S // ..........................................................................................................................................................*.......... + sub v26.4S, v0.4S, v28.4S // ............................................................................................................................................................*........ 
+ add v14.4S, v15.4S, v10.4S // ...........................................................................................................................................................*......... + add v28.4S, v0.4S, v28.4S // .............................................................................................................................................................*....... + str q19, [x1, #-32] // .........................................................................................................................................................*........... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + sqrdmulh v4.4S, v22.4S, v31.S[1] // .......................................................................................................................................................*............. + mul v22.4S, v22.4S, v31.S[0] // ........................................................................................................................................................*............ + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... 
+ // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + // gap // ..................................................................................................................................................................... + str q14, [x1, #-16] // ..................................................................................................................................................................*.. + str q28, [x1, #-48] // ...................................................................................................................................................................*. + add x1, x1, #64 // ....................................................................................................................................................................* + sqrdmulh v15.4S, v13.4S, v31.S[1] // ..............................................................................................................................................................*...... + mul v14.4S, v13.4S, v31.S[0] // ...............................................................................................................................................................*..... + sqrdmulh v10.4S, v26.4S, v31.S[1] // ................................................................................................................................................................*.... 
+ mul v0.4S, v26.4S, v31.S[0] // .................................................................................................................................................................*... + // gap // ..................................................................................................................................................................... + + // --------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q19, [x1, #0] // *.................................................................................................................................................................... + // ldr q26, [x1, #16] // .*................................................................................................................................................................... + // ldr q28, [x1, #32] // ...*................................................................................................................................................................. + // ldr q29, [x1, #48] // ....*................................................................................................................................................................ + // ldr q4, [x5, #32] // .....*............................................................................................................................................................... + // ldr q13, [x5, #16] // ..........*.......................................................................................................................................................... 
+ // ldr q15, [x5, #64] // ..............*...................................................................................................................................................... + // ldr q10, [x5, #80] // ...................*................................................................................................................................................. + // ldr q11, [x5], #(12*16) // ...........................*......................................................................................................................................... + // ldr q6, [x5, #-144] // .................................*................................................................................................................................... + // ldr q21, [x5, #-96] // ......................................*.............................................................................................................................. + // ldr q30, [x5, #-80] // ...........................................*......................................................................................................................... + // trn2 v9.4S, v19.4S, v26.4S // ............*........................................................................................................................................................ + // trn1 v19.4S, v19.4S, v26.4S // .............*....................................................................................................................................................... + // ldr q26, [x5, #-64] // .........*........................................................................................................................................................... + // ldr q22, [x5, #-48] // ...............*..................................................................................................................................................... 
+ // trn1 v7.4S, v28.4S, v29.4S // ..................*.................................................................................................................................................. + // trn2 v28.4S, v28.4S, v29.4S // .................*................................................................................................................................................... + // ldr q29, [x5, #-32] // ....................*................................................................................................................................................ + // ldr q0, [x5, #-16] // ..........................*.......................................................................................................................................... + // ldr q3, [x2, #16] // ......*.............................................................................................................................................................. + // ldr q14, [x2, #0] // ..*.................................................................................................................................................................. + // ldr q17, [x2, #32] // .......*............................................................................................................................................................. + // trn2 v12.2D, v9.2D, v28.2D // .............................*....................................................................................................................................... + // trn1 v28.2D, v9.2D, v28.2D // ..............................*...................................................................................................................................... + // trn2 v9.2D, v19.2D, v7.2D // ...............................*..................................................................................................................................... 
+ // trn1 v19.2D, v19.2D, v7.2D // ................................*.................................................................................................................................... + // ldr q7, [x2, #48] // ........*............................................................................................................................................................ + // ldr q31, [x4, #32] // ...........*......................................................................................................................................................... + // ldr q23, [x4, #16] // ................*.................................................................................................................................................... + // ldr q24, [x4, #48] // .....................*............................................................................................................................................... + // ldr q18, [x4], #64 // ............................*........................................................................................................................................ + // sub v25.4S, v9.4S, v12.4S // .......................................*............................................................................................................................. + // add v12.4S, v9.4S, v12.4S // ........................................*............................................................................................................................ + // sub v9.4S, v19.4S, v28.4S // .........................................*........................................................................................................................... + // add v19.4S, v19.4S, v28.4S // ..........................................*.......................................................................................................................... 
+ // trn1 v28.4S, v14.4S, v3.4S // ......................*.............................................................................................................................................. + // trn2 v14.4S, v14.4S, v3.4S // .......................*............................................................................................................................................. + // mul v4.4S, v9.4S, v4.4S // ................................................*.................................................................................................................... + // sqrdmulh v6.4S, v9.4S, v6.4S // ...................................................*................................................................................................................. + // trn1 v3.4S, v17.4S, v7.4S // ........................*............................................................................................................................................ + // trn2 v7.4S, v17.4S, v7.4S // .........................*........................................................................................................................................... + // mul v15.4S, v25.4S, v15.4S // .................................................*................................................................................................................... + // sqrdmulh v10.4S, v25.4S, v10.4S // ..................................................*.................................................................................................................. + // sub v9.4S, v19.4S, v12.4S // ..........................................................*.......................................................................................................... 
+ // add v19.4S, v19.4S, v12.4S // ...........................................................*......................................................................................................... + // trn1 v12.2D, v28.2D, v3.2D // ..................................*.................................................................................................................................. + // trn2 v28.2D, v28.2D, v3.2D // ...................................*................................................................................................................................. + // trn1 v3.2D, v14.2D, v7.2D // ....................................*................................................................................................................................ + // trn2 v14.2D, v14.2D, v7.2D // .....................................*............................................................................................................................... + // mls v4.4S, v6.4S, v8.S[0] // ............................................................*........................................................................................................ + // sqrdmulh v6.4S, v9.4S, v13.4S // ................................................................*.................................................................................................... + // mul v7.4S, v9.4S, v11.4S // .................................................................*................................................................................................... + // mls v15.4S, v10.4S, v8.S[0] // .............................................................*....................................................................................................... 
+ // sub v10.4S, v12.4S, v3.4S // ............................................*........................................................................................................................ + // add v12.4S, v12.4S, v3.4S // .............................................*....................................................................................................................... + // sub v3.4S, v28.4S, v14.4S // ..............................................*...................................................................................................................... + // add v28.4S, v28.4S, v14.4S // ...............................................*..................................................................................................................... + // mul v26.4S, v10.4S, v26.4S // ....................................................*................................................................................................................ + // sqrdmulh v22.4S, v10.4S, v22.4S // .....................................................*............................................................................................................... + // mul v29.4S, v3.4S, v29.4S // ......................................................*.............................................................................................................. + // sqrdmulh v0.4S, v3.4S, v0.4S // .......................................................*............................................................................................................. + // sub v14.4S, v4.4S, v15.4S // ....................................................................*................................................................................................ 
+ // add v4.4S, v4.4S, v15.4S // .....................................................................*............................................................................................... + // mls v7.4S, v6.4S, v8.S[0] // ........................................................................*............................................................................................ + // sub v6.4S, v12.4S, v28.4S // ........................................................*............................................................................................................ + // add v28.4S, v12.4S, v28.4S // .........................................................*........................................................................................................... + // sqrdmulh v13.4S, v14.4S, v13.4S // .........................................................................*........................................................................................... + // mul v14.4S, v14.4S, v11.4S // ..........................................................................*.......................................................................................... + // mls v26.4S, v22.4S, v8.S[0] // ..............................................................*...................................................................................................... + // mls v29.4S, v0.4S, v8.S[0] // ...............................................................*..................................................................................................... + // trn1 v22.4S, v19.4S, v4.4S // ...........................................................................*......................................................................................... 
+ // trn2 v19.4S, v19.4S, v4.4S // ............................................................................*........................................................................................ + // sqrdmulh v4.4S, v6.4S, v30.4S // ..................................................................*.................................................................................................. + // mul v6.4S, v6.4S, v21.4S // ...................................................................*................................................................................................. + // mls v14.4S, v13.4S, v8.S[0] // ..................................................................................*.................................................................................. + // sub v13.4S, v26.4S, v29.4S // ......................................................................*.............................................................................................. + // add v26.4S, v26.4S, v29.4S // .......................................................................*............................................................................................. + // mls v6.4S, v4.4S, v8.S[0] // ...............................................................................*..................................................................................... + // mul v29.4S, v13.4S, v21.4S // .............................................................................*....................................................................................... + // sqrdmulh v4.4S, v13.4S, v30.4S // ..............................................................................*...................................................................................... 
+ // trn1 v13.4S, v28.4S, v26.4S // ................................................................................*.................................................................................... + // trn2 v26.4S, v28.4S, v26.4S // .................................................................................*................................................................................... + // trn1 v28.4S, v7.4S, v14.4S // ....................................................................................*................................................................................ + // trn2 v0.4S, v7.4S, v14.4S // .....................................................................................*............................................................................... + // trn2 v14.2D, v22.2D, v28.2D // ........................................................................................*............................................................................ + // trn1 v28.2D, v22.2D, v28.2D // .........................................................................................*........................................................................... + // mls v29.4S, v4.4S, v8.S[0] // ...................................................................................*................................................................................. + // trn2 v22.2D, v19.2D, v0.2D // ..........................................................................................*.......................................................................... + // trn1 v19.2D, v19.2D, v0.2D // ...........................................................................................*......................................................................... 
+ // sub v4.4S, v14.4S, v22.4S // ................................................................................................*.................................................................... + // add v22.4S, v14.4S, v22.4S // .................................................................................................*................................................................... + // trn1 v0.4S, v6.4S, v29.4S // ......................................................................................*.............................................................................. + // trn2 v6.4S, v6.4S, v29.4S // .......................................................................................*............................................................................. + // add v29.4S, v28.4S, v19.4S // ...................................................................................................*................................................................. + // sub v19.4S, v28.4S, v19.4S // ..................................................................................................*.................................................................. + // sqrdmulh v28.4S, v4.4S, v31.S[1] // ........................................................................................................*............................................................ + // mul v4.4S, v4.4S, v31.S[0] // .........................................................................................................*........................................................... + // trn1 v14.2D, v13.2D, v0.2D // ............................................................................................*........................................................................ 
+ // trn2 v13.2D, v13.2D, v0.2D // .............................................................................................*....................................................................... + // trn1 v0.2D, v26.2D, v6.2D // ..............................................................................................*...................................................................... + // trn2 v6.2D, v26.2D, v6.2D // ...............................................................................................*..................................................................... + // sub v26.4S, v29.4S, v22.4S // ..................................................................................................................*.................................................. + // add v22.4S, v29.4S, v22.4S // ...................................................................................................................*................................................. + // sqrdmulh v29.4S, v19.4S, v23.S[3] // ..........................................................................................................*.......................................................... + // mul v19.4S, v19.4S, v23.S[2] // ...........................................................................................................*......................................................... + // mls v4.4S, v28.4S, v8.S[0] // ....................................................................................................................*................................................ + // sub v28.4S, v14.4S, v0.4S // ....................................................................................................*................................................................ 
+ // add v0.4S, v14.4S, v0.4S // .....................................................................................................*............................................................... + // sub v14.4S, v13.4S, v6.4S // ......................................................................................................*.............................................................. + // add v6.4S, v13.4S, v6.4S // .......................................................................................................*............................................................. + // sqrdmulh v13.4S, v26.4S, v18.S[3] // ........................................................................................................................*............................................ + // mul v26.4S, v26.4S, v18.S[2] // .........................................................................................................................*........................................... + // srshr v15.4S, v22.4S, #23 // .............................................................................................................................*....................................... + // sqrdmulh v10.4S, v28.4S, v31.S[3] // ............................................................................................................*........................................................ + // mul v28.4S, v28.4S, v31.S[2] // .............................................................................................................*....................................................... + // sqrdmulh v11.4S, v14.4S, v24.S[1] // ..............................................................................................................*...................................................... 
+ // mul v14.4S, v14.4S, v24.S[0] // ...............................................................................................................*..................................................... + // mls v19.4S, v29.4S, v8.S[0] // .....................................................................................................................*............................................... + // sub v29.4S, v0.4S, v6.4S // ................................................................................................................*.................................................... + // add v6.4S, v0.4S, v6.4S // .................................................................................................................*................................................... + // mls v22.4S, v15.4S, v8.4S // ....................................................................................................................................*................................ + // mls v26.4S, v13.4S, v8.S[0] // ..................................................................................................................................*.................................. + // mls v28.4S, v10.4S, v8.S[0] // ......................................................................................................................*.............................................. + // mls v14.4S, v11.4S, v8.S[0] // .......................................................................................................................*............................................. + // sqrdmulh v13.4S, v29.4S, v23.S[1] // ..........................................................................................................................*.......................................... 
+ // mul v29.4S, v29.4S, v23.S[0] // ...........................................................................................................................*......................................... + // sub v0.4S, v19.4S, v4.4S // ..............................................................................................................................*...................................... + // add v19.4S, v19.4S, v4.4S // ...............................................................................................................................*..................................... + // srshr v4.4S, v6.4S, #23 // ............................................................................................................................*........................................ + // sub v15.4S, v28.4S, v14.4S // ................................................................................................................................*.................................... + // add v28.4S, v28.4S, v14.4S // .................................................................................................................................*................................... + // sqrdmulh v14.4S, v0.4S, v18.S[3] // .....................................................................................................................................*............................... + // mul v0.4S, v0.4S, v18.S[2] // ......................................................................................................................................*.............................. + // mls v6.4S, v4.4S, v8.4S // ...................................................................................................................................*................................. 
+ // mls v29.4S, v13.4S, v8.S[0] // ...........................................................................................................................................*......................... + // srshr v4.4S, v19.4S, #23 // .........................................................................................................................................*........................... + // sqrdmulh v13.4S, v15.4S, v23.S[1] // .......................................................................................................................................*............................. + // mul v15.4S, v15.4S, v23.S[0] // ........................................................................................................................................*............................ + // srshr v10.4S, v28.4S, #23 // ..........................................................................................................................................*.......................... + // mls v0.4S, v14.4S, v8.S[0] // ..............................................................................................................................................*...................... + // add v14.4S, v22.4S, v6.4S // ............................................................................................................................................*........................ + // sub v6.4S, v22.4S, v6.4S // .............................................................................................................................................*....................... + // mls v19.4S, v4.4S, v8.4S // ...............................................................................................................................................*..................... 
+ // sub v22.4S, v26.4S, v29.4S // ...................................................................................................................................................*................. + // add v26.4S, v26.4S, v29.4S // ..................................................................................................................................................*.................. + // mls v28.4S, v10.4S, v8.4S // ................................................................................................................................................*.................... + // mls v15.4S, v13.4S, v8.S[0] // .................................................................................................................................................*................... + // str q14, [x1], #(16*4) // ....................................................................................................................................................*................ + // sqrdmulh v29.4S, v6.4S, v18.S[1] // .....................................................................................................................................................*............... + // mul v6.4S, v6.4S, v18.S[0] // ......................................................................................................................................................*.............. + // sqrdmulh v4.4S, v22.4S, v18.S[1] // ............................................................................................................................................................*........ + // mul v22.4S, v22.4S, v18.S[0] // .............................................................................................................................................................*....... 
+ // str q26, [x1, #-32] // ...........................................................................................................................................................*......... + // sub v26.4S, v0.4S, v15.4S // .......................................................................................................................................................*............. + // add v13.4S, v0.4S, v15.4S // .........................................................................................................................................................*........... + // sub v0.4S, v19.4S, v28.4S // ........................................................................................................................................................*............ + // add v19.4S, v19.4S, v28.4S // ..........................................................................................................................................................*.......... + // sqrdmulh v15.4S, v26.4S, v18.S[1] // .................................................................................................................................................................*... + // mul v14.4S, v26.4S, v18.S[0] // ..................................................................................................................................................................*.. + // sqrdmulh v10.4S, v0.4S, v18.S[1] // ...................................................................................................................................................................*. 
+ // mul v0.4S, v0.4S, v18.S[0] // ....................................................................................................................................................................* + // str q13, [x1, #-16] // ..............................................................................................................................................................*...... + // str q19, [x1, #-48] // ...............................................................................................................................................................*..... + // add x1, x1, #64 // ................................................................................................................................................................*.... sub count, count, #1 layer45678_start: @@ -450,177 +957,184 @@ layer45678_start: // Expected cycles: 54 // Expected IPC: 3.22 // - // Wall time: 209.71s - // User time: 209.71s + // Wall time: 691.58s + // User time: 691.58s // // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> // 0 25 50 75 100 125 150 // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - ldr q10, [x2, #32] // ..............*............................................................................................................................................................... - trn2 v25.4S, v29.4S, v6.4S // .....*........................................................................................................................................................................ 
- trn2 v22.4S, v19.4S, v7.4S // .......*...................................................................................................................................................................... - ldr q14, [x2, #16] // .............*................................................................................................................................................................ - ldr q20, [x2, #0] // ............*................................................................................................................................................................. - trn1 v23.4S, v29.4S, v6.4S // ....*......................................................................................................................................................................... - trn1 v4.4S, v19.4S, v7.4S // ......*....................................................................................................................................................................... + mls v6.4S, v29.4S, v8.S[0] // ....................................................................................................................................................*......................... + ldr q19, [x1, #0] // e............................................................................................................................................................................. + ldr q26, [x1, #16] // .e............................................................................................................................................................................ + ldr q28, [x1, #32] // ..e........................................................................................................................................................................... 
// gap // .............................................................................................................................................................................. - ldr q19, [x2, #48] // ...............*.............................................................................................................................................................. - ldr q26, [x5, #-64] // ....................................................*......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v22.4S, v4.4S, v8.S[0] // ..............................................................................................................................................................*............... + ldr q29, [x1, #48] // ...e.......................................................................................................................................................................... + ldr q4, [x5, #32] // ..........................e................................................................................................................................................... + ldr q13, [x5, #16] // .........................e.................................................................................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v17.2D, v23.2D, v4.2D // ..........*................................................................................................................................................................... - trn2 v18.2D, v23.2D, v4.2D // ........*..................................................................................................................................................................... - trn1 v5.2D, v25.2D, v22.2D // ...........*.................................................................................................................................................................. - trn2 v7.2D, v25.2D, v22.2D // .........*.................................................................................................................................................................... - ldr q31, [x5, #-16] // .......................................................*...................................................................................................................... // gap // .............................................................................................................................................................................. + mls v0.4S, v10.4S, v8.S[0] // .........................................................................................................................................................*.................... 
+ mls v14.4S, v15.4S, v8.S[0] // ...................................................................................................................................................................*.......... + ldr q15, [x5, #64] // ............................e................................................................................................................................................. + ldr q10, [x5, #80] // .............................e................................................................................................................................................ + ldr q11, [x5], #(12*16) // ........................e..................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + str q6, [x2], #(16*4) // ........................................................................................................................................................................*..... + ldr q6, [x5, #-144] // ...........................e.................................................................................................................................................. + ldr q21, [x5, #-96] // ..................................................e........................................................................................................................... 
+ ldr q30, [x5, #-80] // ...................................................e.......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v9.4S, v19.4S, v26.4S // .....e........................................................................................................................................................................ + trn1 v19.4S, v19.4S, v26.4S // ....e......................................................................................................................................................................... + str q22, [x2, #-32] // ..........................................................................................................................................................................*... + ldr q26, [x5, #-64] // ....................................................e......................................................................................................................... + ldr q22, [x5, #-48] // .....................................................e........................................................................................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v28.4S, v20.4S, v14.4S // .................*............................................................................................................................................................ - sub v22.4S, v18.4S, v7.4S // ...................................*.......................................................................................................................................... - sub v9.4S, v17.4S, v5.4S // ..............................*............................................................................................................................................... - add v16.4S, v17.4S, v5.4S // ...............................*.............................................................................................................................................. + trn1 v7.4S, v28.4S, v29.4S // ......e....................................................................................................................................................................... + trn2 v28.4S, v28.4S, v29.4S // .......e...................................................................................................................................................................... + str q0, [x2, #-48] // .........................................................................................................................................................................*.... 
+ str q14, [x2, #-16] // ...........................................................................................................................................................................*.. + ldr q29, [x5, #-32] // ......................................................e....................................................................................................................... + ldr q0, [x5, #-16] // .......................................................e...................................................................................................................... + add x2, x2, #64 // .............................................................................................................................................................................* // gap // .............................................................................................................................................................................. + ldr q3, [x2, #16] // .............e................................................................................................................................................................ + ldr q14, [x2, #0] // ............e................................................................................................................................................................. + ldr q17, [x2, #32] // ..............e............................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - trn1 v21.4S, v10.4S, v19.4S // ..................*........................................................................................................................................................... - trn2 v6.4S, v10.4S, v19.4S // ...................*.......................................................................................................................................................... - trn1 v19.4S, v20.4S, v14.4S // ................*............................................................................................................................................................. - add v4.4S, v18.4S, v7.4S // ....................................*......................................................................................................................................... - ldr q14, [x5, #-80] // ...................................................*.......................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v12.2D, v9.2D, v28.2D // .........e.................................................................................................................................................................... + trn1 v28.2D, v9.2D, v28.2D // ...........e.................................................................................................................................................................. 
+ trn2 v9.2D, v19.2D, v7.2D // ........e..................................................................................................................................................................... + trn1 v19.2D, v19.2D, v7.2D // ..........e................................................................................................................................................................... + ldr q7, [x2, #48] // ...............e.............................................................................................................................................................. + ldr q31, [x4, #32] // ..............................................................................................e............................................................................... + ldr q23, [x4, #16] // .............................................................................................e................................................................................ // gap // .............................................................................................................................................................................. - mul v0.4S, v9.4S, v0.4S // ................................*............................................................................................................................................. - sqrdmulh v29.4S, v9.4S, v15.4S // .................................*............................................................................................................................................ - sqrdmulh v17.4S, v22.4S, v2.4S // ......................................*....................................................................................................................................... 
- mul v22.4S, v22.4S, v30.4S // .....................................*........................................................................................................................................ - ldr q30, [x5, #-48] // .....................................................*........................................................................................................................ + ldr q24, [x4, #48] // ...............................................................................................e.............................................................................. + ldr q18, [x4], #64 // ............................................................................................e................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v20.2D, v28.2D, v6.2D // .....................*........................................................................................................................................................ - trn1 v28.2D, v28.2D, v6.2D // .......................*...................................................................................................................................................... - trn2 v10.2D, v19.2D, v21.2D // ....................*......................................................................................................................................................... 
- trn1 v21.2D, v19.2D, v21.2D // ......................*....................................................................................................................................................... - ldr q6, [x5, #-32] // ......................................................*....................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q18, [x5, #-176] // .........................*.................................................................................................................................................... - sub v12.4S, v16.4S, v4.4S // ........................................*..................................................................................................................................... + sub v25.4S, v9.4S, v12.4S // ...................................e.......................................................................................................................................... + add v12.4S, v9.4S, v12.4S // ....................................e......................................................................................................................................... + sub v9.4S, v19.4S, v28.4S // ..............................e............................................................................................................................................... 
+ add v19.4S, v19.4S, v28.4S // ...............................e.............................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v28.4S, v14.4S, v3.4S // ................e............................................................................................................................................................. + trn2 v14.4S, v14.4S, v3.4S // .................e............................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v22.4S, v17.4S, v8.S[0] // .......................................*...................................................................................................................................... 
- ldr q17, [x4, #16] // .............................................................................................*................................................................................ - sub v13.4S, v21.4S, v28.4S // ........................................................*..................................................................................................................... - sub v27.4S, v10.4S, v20.4S // .............................................................*................................................................................................................ - mls v0.4S, v29.4S, v8.S[0] // ..................................*........................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v2.4S, v21.4S, v28.4S // .........................................................*.................................................................................................................... - add v28.4S, v10.4S, v20.4S // ..............................................................*............................................................................................................... // gap // .............................................................................................................................................................................. 
+ mul v4.4S, v9.4S, v4.4S // .................................e............................................................................................................................................ + sqrdmulh v6.4S, v9.4S, v6.4S // ................................e............................................................................................................................................. + trn1 v3.4S, v17.4S, v7.4S // ..................e........................................................................................................................................................... + trn2 v7.4S, v17.4S, v7.4S // ...................e.......................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v15.4S, v25.4S, v15.4S // ......................................e....................................................................................................................................... + sqrdmulh v10.4S, v25.4S, v10.4S // .....................................e........................................................................................................................................ 
+ sub v9.4S, v19.4S, v12.4S // ........................................e..................................................................................................................................... + add v19.4S, v19.4S, v12.4S // .........................................e.................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v19.4S, v13.4S, v30.4S // ...........................................................*.................................................................................................................. - mul v29.4S, v13.4S, v26.4S // ..........................................................*................................................................................................................... - mul v30.4S, v27.4S, v6.4S // ...............................................................*.............................................................................................................. - sqrdmulh v6.4S, v27.4S, v31.4S // ................................................................*............................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ trn1 v12.2D, v28.2D, v3.2D // ......................e....................................................................................................................................................... + trn2 v28.2D, v28.2D, v3.2D // ....................e......................................................................................................................................................... + trn1 v3.2D, v14.2D, v7.2D // .......................e...................................................................................................................................................... + trn2 v14.2D, v14.2D, v7.2D // .....................e........................................................................................................................................................ // gap // .............................................................................................................................................................................. - add v21.4S, v0.4S, v22.4S // ..............................................*............................................................................................................................... - sub v31.4S, v0.4S, v22.4S // .............................................*................................................................................................................................ - ldr q0, [x5, #32] // ..........................e................................................................................................................................................... - sub v10.4S, v2.4S, v28.4S // ..................................................................*........................................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v4.4S, v6.4S, v8.S[0] // ..................................e........................................................................................................................................... + sqrdmulh v6.4S, v9.4S, v13.4S // ..........................................e................................................................................................................................... + mul v7.4S, v9.4S, v11.4S // ...........................................e.................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v27.4S, v12.4S, v24.4S // ..........................................*................................................................................................................................... - sqrdmulh v12.4S, v12.4S, v18.4S // ...........................................*.................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v15.4S, v10.4S, v8.S[0] // .......................................e...................................................................................................................................... + sub v10.4S, v12.4S, v3.4S // ........................................................e..................................................................................................................... + add v12.4S, v12.4S, v3.4S // .........................................................e.................................................................................................................... + sub v3.4S, v28.4S, v14.4S // .............................................................e................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v30.4S, v6.4S, v8.S[0] // .................................................................*............................................................................................................ 
- ldr q6, [x4, #32] // ..............................................................................................*............................................................................... - sqrdmulh v3.4S, v31.4S, v18.4S // ................................................*............................................................................................................................. - mls v29.4S, v19.4S, v8.S[0] // ............................................................*................................................................................................................. - mul v19.4S, v31.4S, v24.4S // ...............................................*.............................................................................................................................. - ldr q24, [x5], #(12*16) // ........................e..................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v31.4S, v16.4S, v4.4S // .........................................*.................................................................................................................................... - ldr q16, [x4], #64 // ............................................................................................*................................................................................. 
- mul v22.4S, v10.4S, v11.4S // ....................................................................*......................................................................................................... - sqrdmulh v10.4S, v10.4S, v14.4S // .....................................................................*........................................................................................................ + add v28.4S, v28.4S, v14.4S // ..............................................................e............................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v27.4S, v12.4S, v8.S[0] // ............................................*................................................................................................................................. - add v12.4S, v2.4S, v28.4S // ...................................................................*.......................................................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v26.4S, v10.4S, v26.4S // ...........................................................e.................................................................................................................. + sqrdmulh v22.4S, v10.4S, v22.4S // ..........................................................e................................................................................................................... + mul v29.4S, v3.4S, v29.4S // ................................................................e............................................................................................................. + sqrdmulh v0.4S, v3.4S, v0.4S // ...............................................................e.............................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v19.4S, v3.4S, v8.S[0] // .................................................*............................................................................................................................ 
- sub v1.4S, v29.4S, v30.4S // .......................................................................*...................................................................................................... - add v28.4S, v29.4S, v30.4S // ........................................................................*..................................................................................................... // gap // .............................................................................................................................................................................. + sub v14.4S, v4.4S, v15.4S // .............................................e................................................................................................................................ + add v4.4S, v4.4S, v15.4S // ..............................................e............................................................................................................................... + mls v7.4S, v6.4S, v8.S[0] // ............................................e................................................................................................................................. + sub v6.4S, v12.4S, v28.4S // ..................................................................e........................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v22.4S, v10.4S, v8.S[0] // ......................................................................*....................................................................................................... - trn2 v30.4S, v31.4S, v21.4S // .............................................................................*................................................................................................ + add v28.4S, v12.4S, v28.4S // ...................................................................e.......................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn1 v25.4S, v12.4S, v28.4S // ....................................................................................*......................................................................................... - trn2 v28.4S, v12.4S, v28.4S // .....................................................................................*........................................................................................ - sqrdmulh v10.4S, v1.4S, v14.4S // ..........................................................................*................................................................................................... - mul v5.4S, v1.4S, v11.4S // .........................................................................*.................................................................................................... // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v14.4S, v13.4S // ...............................................e.............................................................................................................................. + mul v14.4S, v14.4S, v11.4S // ................................................e............................................................................................................................. + mls v26.4S, v22.4S, v8.S[0] // ............................................................e................................................................................................................. + mls v29.4S, v0.4S, v8.S[0] // .................................................................e............................................................................................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v26.4S, v27.4S, v19.4S // ...............................................................................*.............................................................................................. - trn1 v27.4S, v27.4S, v19.4S // ..............................................................................*............................................................................................... - trn1 v19.4S, v31.4S, v21.4S // ............................................................................*................................................................................................. - ldr q31, [x4, #-16] // ...............................................................................................*.............................................................................. // gap // .............................................................................................................................................................................. + trn1 v22.4S, v19.4S, v4.4S // ............................................................................e................................................................................................. + trn2 v19.4S, v19.4S, v4.4S // .............................................................................e................................................................................................ 
+ sqrdmulh v4.4S, v6.4S, v30.4S // ....................................................................e......................................................................................................... + mul v6.4S, v6.4S, v21.4S // .....................................................................e........................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -632,238 +1146,224 @@ layer45678_start: // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v5.4S, v10.4S, v8.S[0] // ...........................................................................*.................................................................................................. - trn2 v10.2D, v19.2D, v27.2D // ................................................................................*............................................................................................. 
- trn1 v19.2D, v19.2D, v27.2D // ..................................................................................*........................................................................................... - trn2 v29.2D, v30.2D, v26.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. + mls v14.4S, v13.4S, v8.S[0] // .................................................e............................................................................................................................ + sub v13.4S, v26.4S, v29.4S // .......................................................................e...................................................................................................... + add v26.4S, v26.4S, v29.4S // ........................................................................e..................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v4.2D, v30.2D, v26.2D // ...................................................................................*.......................................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v6.4S, v4.4S, v8.S[0] // ......................................................................e....................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v12.4S, v10.4S, v29.4S // .....................................................................................................*........................................................................ - add v23.4S, v10.4S, v29.4S // ......................................................................................................*....................................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v29.4S, v13.4S, v21.4S // ..........................................................................e................................................................................................... + sqrdmulh v4.4S, v13.4S, v30.4S // .........................................................................e.................................................................................................... + trn1 v13.4S, v28.4S, v26.4S // ....................................................................................e......................................................................................... + trn2 v26.4S, v28.4S, v26.4S // .....................................................................................e........................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn1 v30.4S, v22.4S, v5.4S // ......................................................................................*....................................................................................... - trn2 v9.4S, v22.4S, v5.4S // .......................................................................................*...................................................................................... + trn1 v28.4S, v7.4S, v14.4S // ..............................................................................e............................................................................................... + trn2 v0.4S, v7.4S, v14.4S // ...............................................................................e.............................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- mul v14.4S, v12.4S, v6.S[0] // .......................................................................................................*...................................................................... - add v18.4S, v19.4S, v4.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*..................................................................................... - trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*.................................................................................. 
- trn2 v7.2D, v28.2D, v9.2D // .........................................................................................*.................................................................................... - trn1 v11.2D, v25.2D, v30.2D // ..........................................................................................*................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn2 v14.2D, v22.2D, v28.2D // ................................................................................e............................................................................................. + trn1 v28.2D, v22.2D, v28.2D // ..................................................................................e........................................................................................... + mls v29.4S, v4.4S, v8.S[0] // ...........................................................................e.................................................................................................. + trn2 v22.2D, v19.2D, v0.2D // .................................................................................e............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- sub v28.4S, v19.4S, v4.4S // ................................................................................................*............................................................................. - sqrdmulh v5.4S, v12.4S, v6.S[1] // ........................................................................................................*..................................................................... - add v1.4S, v18.4S, v23.4S // .....................................................................................................................*........................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + trn1 v19.2D, v19.2D, v0.2D // ...................................................................................e.......................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v29.4S, v3.4S, v7.4S // ...............................................................................................................*.............................................................. 
- add v27.4S, v3.4S, v7.4S // ................................................................................................................*............................................................. - sub v12.4S, v11.4S, v10.4S // ..........................................................................................................*................................................................... - add v7.4S, v11.4S, v10.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v22.4S, v28.4S, v17.S[2] // ..................................................................................................*........................................................................... - sqrdmulh v19.4S, v28.4S, v17.S[3] // ...................................................................................................*.......................................................................... - srshr v10.4S, v1.4S, #23 // ........................................................................................................................................*..................................... 
+ sub v4.4S, v14.4S, v22.4S // .....................................................................................................e........................................................................ + add v22.4S, v14.4S, v22.4S // ......................................................................................................e....................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v30.4S, v12.4S, v6.S[2] // ............................................................................................................*................................................................. - sqrdmulh v6.4S, v12.4S, v6.S[3] // .............................................................................................................*................................................................ - mul v20.4S, v29.4S, v31.S[0] // .................................................................................................................*............................................................ 
- sqrdmulh v15.4S, v29.4S, v31.S[1] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. + trn1 v0.4S, v6.4S, v29.4S // ......................................................................................e....................................................................................... + trn2 v6.4S, v6.4S, v29.4S // .......................................................................................e...................................................................................... + add v29.4S, v28.4S, v19.4S // .................................................................................................e............................................................................ + sub v19.4S, v28.4S, v19.4S // ................................................................................................e............................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v14.4S, v5.4S, v8.S[0] // .........................................................................................................*.................................................................... 
- sub v2.4S, v7.4S, v27.4S // ..............................................................................................................................*............................................... // gap // .............................................................................................................................................................................. + sqrdmulh v28.4S, v4.4S, v31.S[1] // .......................................................................................................e...................................................................... + mul v4.4S, v4.4S, v31.S[0] // ........................................................................................................e..................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v1.4S, v10.4S, v8.4S // .........................................................................................................................................*.................................... 
- mls v22.4S, v19.4S, v8.S[0] // ....................................................................................................*......................................................................... - add v19.4S, v7.4S, v27.4S // ...............................................................................................................................*.............................................. // gap // .............................................................................................................................................................................. + trn1 v14.2D, v13.2D, v0.2D // ..........................................................................................e................................................................................... + trn2 v13.2D, v13.2D, v0.2D // ........................................................................................e..................................................................................... + trn1 v0.2D, v26.2D, v6.2D // ...........................................................................................e.................................................................................. + trn2 v6.2D, v26.2D, v6.2D // .........................................................................................e.................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - mls v20.4S, v15.4S, v8.S[0] // ...................................................................................................................*.......................................................... - mls v30.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................................................... - sub v12.4S, v18.4S, v23.4S // ....................................................................................................................*......................................................... - sqrdmulh v11.4S, v2.4S, v17.S[1] // .................................................................................................................................*............................................ - ldr q15, [x5, #-144] // ...........................e.................................................................................................................................................. + sub v26.4S, v29.4S, v22.4S // ....................................................................................................................e......................................................... + add v22.4S, v29.4S, v22.4S // .....................................................................................................................e........................................................ + sqrdmulh v29.4S, v19.4S, v23.S[3] // ..................................................................................................e........................................................................... 
+ mul v19.4S, v19.4S, v23.S[2] // ...................................................................................................e.......................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - srshr v10.4S, v19.4S, #23 // ............................................................................................................................................*................................. - mul v29.4S, v2.4S, v17.S[0] // ................................................................................................................................*............................................. // gap // .............................................................................................................................................................................. + mls v4.4S, v28.4S, v8.S[0] // .........................................................................................................e.................................................................... + sub v28.4S, v14.4S, v0.4S // ..........................................................................................................e................................................................... + add v0.4S, v14.4S, v0.4S // ...........................................................................................................e.................................................................. 
+ sub v14.4S, v13.4S, v6.4S // ...............................................................................................................e.............................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v6.4S, v13.4S, v6.4S // ................................................................................................................e............................................................. + sqrdmulh v13.4S, v26.4S, v18.S[3] // ......................................................................................................................e....................................................... + mul v26.4S, v26.4S, v18.S[2] // .......................................................................................................................e...................................................... + srshr v15.4S, v22.4S, #23 // ........................................................................................................................................e..................................... // gap // .............................................................................................................................................................................. 
- sub v28.4S, v22.4S, v14.4S // .........................................................................................................................*.................................................... - add v4.4S, v22.4S, v14.4S // ..........................................................................................................................*................................................... - mul v5.4S, v12.4S, v16.S[2] // ......................................................................................................................*....................................................... - sqrdmulh v27.4S, v12.4S, v16.S[3] // .......................................................................................................................*...................................................... - ldr q2, [x5, #-112] // .............................e................................................................................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v31.4S, v30.4S, v20.4S // ...................................................................................................................................*.......................................... 
- add v18.4S, v30.4S, v20.4S // ....................................................................................................................................*......................................... - ldr q30, [x5, #-128] // ............................e................................................................................................................................................. + sqrdmulh v10.4S, v28.4S, v31.S[3] // ............................................................................................................e................................................................. + mul v28.4S, v28.4S, v31.S[2] // .............................................................................................................e................................................................ + sqrdmulh v11.4S, v14.4S, v24.S[1] // .................................................................................................................e............................................................ + mul v14.4S, v14.4S, v24.S[0] // ..................................................................................................................e........................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
+ mls v19.4S, v29.4S, v8.S[0] // ....................................................................................................e......................................................................... + sub v29.4S, v0.4S, v6.4S // ..............................................................................................................................e............................................... + add v6.4S, v0.4S, v6.4S // ...............................................................................................................................e.............................................. // gap // .............................................................................................................................................................................. - mls v19.4S, v10.4S, v8.4S // .............................................................................................................................................*................................ - mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*.................................................. - sqrdmulh v14.4S, v28.4S, v16.S[3] // ............................................................................................................................*................................................. - srshr v28.4S, v4.4S, #23 // ..........................................................................................................................................*................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v6.4S, v31.4S, v17.S[0] // .....................................................................................................................................*........................................ - sqrdmulh v31.4S, v31.4S, v17.S[1] // ......................................................................................................................................*....................................... - mls v5.4S, v27.4S, v8.S[0] // ........................................................................................................................*..................................................... - srshr v12.4S, v18.4S, #23 // ..............................................................................................................................................*............................... + mls v22.4S, v15.4S, v8.4S // .........................................................................................................................................e.................................... + mls v26.4S, v13.4S, v8.S[0] // ........................................................................................................................e..................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v29.4S, v11.4S, v8.S[0] // ..................................................................................................................................*........................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mls v28.4S, v10.4S, v8.S[0] // ..............................................................................................................e............................................................... + mls v14.4S, v11.4S, v8.S[0] // ...................................................................................................................e.......................................................... + sqrdmulh v13.4S, v29.4S, v23.S[1] // ................................................................................................................................e............................................. + mul v29.4S, v29.4S, v23.S[0] // .................................................................................................................................e............................................ 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v0.4S, v19.4S, v4.4S // .........................................................................................................................e.................................................... + add v19.4S, v19.4S, v4.4S // ..........................................................................................................................e................................................... + srshr v4.4S, v6.4S, #23 // ............................................................................................................................................e................................. // gap // .............................................................................................................................................................................. - sub v11.4S, v1.4S, v19.4S // ................................................................................................................................................*............................. - add v10.4S, v1.4S, v19.4S // .................................................................................................................................................*............................ 
- mls v22.4S, v14.4S, v8.S[0] // .............................................................................................................................*................................................ - mls v4.4S, v28.4S, v8.4S // ...........................................................................................................................................*.................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v18.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. - mls v6.4S, v31.4S, v8.S[0] // .......................................................................................................................................*...................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v7.4S, v11.4S, v16.S[0] // ..................................................................................................................................................*........................... - sqrdmulh v11.4S, v11.4S, v16.S[1] // ...................................................................................................................................................*.......................... - str q10, [x1], #(16*4) // ....................................................................................................................................................................*......... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v15.4S, v28.4S, v14.4S // ...................................................................................................................................e.......................................... 
+ add v28.4S, v28.4S, v14.4S // ....................................................................................................................................e......................................... + sqrdmulh v14.4S, v0.4S, v18.S[3] // ...........................................................................................................................e.................................................. + mul v0.4S, v0.4S, v18.S[2] // ............................................................................................................................e................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v23.4S, v5.4S, v29.4S // ..........................................................................................................................................................*................... - add v31.4S, v5.4S, v29.4S // ...........................................................................................................................................................*.................. // gap // .............................................................................................................................................................................. + mls v6.4S, v4.4S, v8.4S // .............................................................................................................................................e................................ 
+ mls v29.4S, v13.4S, v8.S[0] // ..................................................................................................................................e........................................... + srshr v4.4S, v19.4S, #23 // ..........................................................................................................................................e................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v12.4S, v4.4S, v18.4S // .....................................................................................................................................................*........................ - sub v27.4S, v22.4S, v6.4S // ...............................................................................................................................................................*.............. - add v13.4S, v4.4S, v18.4S // ......................................................................................................................................................*....................... 
- add v10.4S, v22.4S, v6.4S // ................................................................................................................................................................*............. + sqrdmulh v13.4S, v15.4S, v23.S[1] // .....................................................................................................................................e........................................ + mul v15.4S, v15.4S, v23.S[0] // ......................................................................................................................................e....................................... + srshr v10.4S, v28.4S, #23 // ..............................................................................................................................................e............................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*......................... - str q31, [x1, #-32] // ......................................................................................................................................................................*....... 
- mul v31.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. - sqrdmulh v3.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ // gap // .............................................................................................................................................................................. + mls v0.4S, v14.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v5.4S, v12.4S, v16.S[0] // .......................................................................................................................................................*...................... - sqrdmulh v12.4S, v12.4S, v16.S[1] // ........................................................................................................................................................*..................... - mul v28.4S, v27.4S, v16.S[0] // .................................................................................................................................................................*............ 
- sqrdmulh v27.4S, v27.4S, v16.S[1] // ..................................................................................................................................................................*........... - str q13, [x1, #-48] // .....................................................................................................................................................................*........ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - str q10, [x1, #-16] // .......................................................................................................................................................................*...... - add x1, x1, #64 // ............................................................................................................................................................................*. // gap // .............................................................................................................................................................................. + add v14.4S, v22.4S, v6.4S // .................................................................................................................................................e............................ + sub v6.4S, v22.4S, v6.4S // ................................................................................................................................................e............................. 
+ mls v19.4S, v4.4S, v8.4S // ...........................................................................................................................................e.................................. + sub v22.4S, v26.4S, v29.4S // ..........................................................................................................................................................e................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + add v26.4S, v26.4S, v29.4S // ...........................................................................................................................................................e.................. + mls v28.4S, v10.4S, v8.4S // ...............................................................................................................................................e.............................. + mls v15.4S, v13.4S, v8.S[0] // .......................................................................................................................................e...................................... // gap // .............................................................................................................................................................................. 
- str q7, [x2], #(16*4) // ........................................................................................................................................................................*..... - mls v31.4S, v3.4S, v8.S[0] // ..............................................................................................................................................................*............... - ldr q29, [x1, #0] // e............................................................................................................................................................................. - ldr q7, [x1, #48] // ...e.......................................................................................................................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v5.4S, v12.4S, v8.S[0] // .........................................................................................................................................................*.................... - mls v28.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... 
- ldr q6, [x1, #16] // .e............................................................................................................................................................................ - ldr q19, [x1, #32] // ..e........................................................................................................................................................................... - ldr q11, [x5, #-96] // ..................................................e........................................................................................................................... + str q14, [x1], #(16*4) // ....................................................................................................................................................................e......... + sqrdmulh v29.4S, v6.4S, v18.S[1] // ..................................................................................................................................................e........................... + mul v6.4S, v6.4S, v18.S[0] // ...................................................................................................................................................e.......................... + sqrdmulh v4.4S, v22.4S, v18.S[1] // ............................................................................................................................................................e................. + mul v22.4S, v22.4S, v18.S[0] // .............................................................................................................................................................e................ // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + str q26, [x1, #-32] // ......................................................................................................................................................................e....... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -871,8 +1371,11 @@ layer45678_start: // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sub v26.4S, v0.4S, v15.4S // ...............................................................................................................................................................e.............. 
+ add v13.4S, v0.4S, v15.4S // ................................................................................................................................................................e............. + sub v0.4S, v19.4S, v28.4S // .....................................................................................................................................................e........................ + add v19.4S, v19.4S, v28.4S // ......................................................................................................................................................e....................... // gap // .............................................................................................................................................................................. - str q31, [x2, #-32] // ..........................................................................................................................................................................*... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. @@ -880,805 +1383,262 @@ layer45678_start: // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. - str q5, [x2, #-48] // .........................................................................................................................................................................*.... - str q28, [x2, #-16] // ...........................................................................................................................................................................*.. - add x2, x2, #64 // .............................................................................................................................................................................* // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v15.4S, v26.4S, v18.S[1] // .................................................................................................................................................................e............ + mul v14.4S, v26.4S, v18.S[0] // ..................................................................................................................................................................e........... 
+ sqrdmulh v10.4S, v0.4S, v18.S[1] // .......................................................................................................................................................e...................... + mul v0.4S, v0.4S, v18.S[0] // ........................................................................................................................................................e..................... + str q13, [x1, #-16] // .......................................................................................................................................................................e...... + str q19, [x1, #-48] // .....................................................................................................................................................................e........ + add x1, x1, #64 // ............................................................................................................................................................................e. // gap // .............................................................................................................................................................................. 
- // ---------------------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 250 275 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ - // ldr q9, [x1, #0] // ...................................................................................................................e..........'..................................................................................................................................................................~.......... - // ldr q10, [x1, #16] // .......................................................................................................................e......'......................................................................................................................................................................~...... - // ldr q11, [x1, #32] // ........................................................................................................................e.....'.......................................................................................................................................................................~..... - // ldr q12, [x1, #48] // ....................................................................................................................e.........'...................................................................................................................................................................~......... 
- // trn1 v25.4s, v9.4s, v10.4s // ..............................................................................................................................'....*........................................................................................................................................................................ - // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................'*............................................................................................................................................................................ - // trn1 v27.4s, v11.4s, v12.4s // ..............................................................................................................................'.....*....................................................................................................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................................................................'.*........................................................................................................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // ..............................................................................................................................'.........*................................................................................................................................................................... 
- // trn2 v12.2d, v26.2d, v28.2d // ..............................................................................................................................'...........*................................................................................................................................................................. - // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................................................................'........*.................................................................................................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................................................................'..........*.................................................................................................................................................................. - // ldr q13, [x2, #0] // ..............................................................................................................................'...*......................................................................................................................................................................... - // ldr q14, [x2, #16] // ..............................................................................................................................'..*.......................................................................................................................................................................... - // ldr q15, [x2, #32] // ..............................................................................................................................*............................................................................................................................................................................. 
- // ldr q16, [x2, #48] // ..............................................................................................................................'......*...................................................................................................................................................................... - // trn1 v25.4s, v13.4s, v14.4s // ..............................................................................................................................'...................*......................................................................................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // ..............................................................................................................................'.............*............................................................................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ..............................................................................................................................'.................*........................................................................................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................'..................*.......................................................................................................................................................... 
- // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................................'.............................*............................................................................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ..............................................................................................................................'...........................*................................................................................................................................................. - // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................................................'..............................*.............................................................................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ..............................................................................................................................'............................*................................................................................................................................................ - // ldr q0, [x5], #(12*16) // .........e....................................................................................................................'........................................................~.................................................................................................................... 
- // ldr q4, [x5, #(-12*16 + 1*16)] // ..............................................................................................................................'................................*............................................................................................................................................ - // ldr q1, [x5, #(-12*16 + 2*16)] // e.............................................................................................................................'...............................................~............................................................................................................................. - // ldr q5, [x5, #(-12*16 + 3*16)] // ...................................................................e..........................................................'..................................................................................................................~.......................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // .............................................................................e................................................'............................................................................................................................~................................................ - // ldr q6, [x5, #(-12*16 + 5*16)] // ..........................................................................e...................................................'.........................................................................................................................~................................................... 
- // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................'...............*............................................................................................................................................................. - // add v9.4s, v9.4s, v10.4s // ..............................................................................................................................'................*............................................................................................................................................................ - // mul v10.4s, v24.4s, v1.4s // ..............................................................................................................................'......................*...................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.......................*..................................................................................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..............................................................................................................................'......................................*...................................................................................................................................... 
- // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................'..............*.............................................................................................................................................................. - // add v11.4s, v11.4s, v12.4s // ..............................................................................................................................'....................*........................................................................................................................................................ - // mul v12.4s, v24.4s, v2.4s // ..............................................................................................................................'.........................*................................................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'........................*.................................................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................................................'..................................*.......................................................................................................................................... 
- // sub v24.4s, v9.4s, v11.4s // ..............................................................................................................................'.................................*........................................................................................................................................... - // add v9.4s, v9.4s, v11.4s // ..........~...................................................................................................................'.........................................................*................................................................................................................... - // mul v11.4s, v24.4s, v0.4s // ..~...........................................................................................................................'.................................................*........................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...~..........................................................................................................................'..................................................*.......................................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ..............~...............................................................................................................'.............................................................*............................................................................................................... 
- // sub v24.4s, v10.4s, v12.4s // ..............................................................................................................................'..............................................*.............................................................................................................................. - // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................'.............................................*............................................................................................................................... - // mul v12.4s, v24.4s, v0.4s // ........~.....................................................................................................................'.......................................................*..................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.......................................................................................................................'.....................................................*....................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................~.............................................................................................................'...............................................................*............................................................................................................. 
- // ldr q0, [x5, #(-12*16 + 6*16)] // .........................................................................................................................e....'........................................................................................................................................................................~.... - // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................................................................................................'.....................*....................................................................................................................................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ..............................................................................................................................'.......*..................................................................................................................................................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................................................................................................'..........................*.................................................................................................................................................. - // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................................................................................................'...............................*............................................................................................................................................. 
- // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................................................................................................'............*................................................................................................................................................................ - // sub v24.4s, v13.4s, v14.4s // ..............................................................................................................................'....................................*........................................................................................................................................ - // add v13.4s, v13.4s, v14.4s // ..............................................................................................................................'.......................................*..................................................................................................................................... - // mul v14.4s, v24.4s, v1.4s // ..............................................................................................................................'..........................................*.................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.........................................*................................................................................................................................... 
- // mls v14.4s, v24.4s, v8.s[0] // .......~......................................................................................................................'......................................................*...................................................................................................................... - // sub v24.4s, v15.4s, v16.4s // ..............................................................................................................................'.....................................*....................................................................................................................................... - // add v15.4s, v15.4s, v16.4s // ..............................................................................................................................'........................................*.................................................................................................................................... - // mul v16.4s, v24.4s, v2.4s // ..............................................................................................................................'...........................................*................................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'............................................*................................................................................................................................ 
- // mls v16.4s, v24.4s, v8.s[0] // ....~.........................................................................................................................'...................................................*......................................................................................................................... - // sub v24.4s, v13.4s, v15.4s // .~............................................................................................................................'................................................*............................................................................................................................ - // add v13.4s, v13.4s, v15.4s // ...............~..............................................................................................................'..............................................................*.............................................................................................................. - // mul v15.4s, v24.4s, v0.4s // ............~.................................................................................................................'...........................................................*................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v4.4s // .............~................................................................................................................'............................................................*................................................................................................................ 
- // mls v15.4s, v24.4s, v8.s[0] // ...................~..........................................................................................................'..................................................................*.......................................................................................................... - // sub v24.4s, v14.4s, v16.4s // .................~............................................................................................................'................................................................*............................................................................................................ - // add v14.4s, v14.4s, v16.4s // ..................~...........................................................................................................'.................................................................*........................................................................................................... - // mul v16.4s, v24.4s, v0.4s // ........................~.....................................................................................................'.......................................................................*..................................................................................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .......................~......................................................................................................'......................................................................*...................................................................................................... 
- // mls v16.4s, v24.4s, v8.s[0] // .............................~................................................................................................'............................................................................*................................................................................................ - // trn1 v25.4s, v9.4s, v10.4s // ...........................~..................................................................................................'..........................................................................*.................................................................................................. - // trn2 v26.4s, v9.4s, v10.4s // ....................~.........................................................................................................'...................................................................*......................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..........................~...................................................................................................'.........................................................................*................................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // .........................~....................................................................................................'........................................................................*.................................................................................................... 
- // trn2 v11.2d, v25.2d, v27.2d // ..............................~...............................................................................................'.............................................................................*............................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ................................~.............................................................................................'...............................................................................*............................................................................................. - // trn1 v9.2d, v25.2d, v27.2d // ...............................~..............................................................................................'..............................................................................*.............................................................................................. - // trn1 v10.2d, v26.2d, v28.2d // .................................~............................................................................................'................................................................................*............................................................................................ - // trn1 v25.4s, v13.4s, v14.4s // .....................~........................................................................................................'....................................................................*........................................................................................................ 
- // trn2 v26.4s, v13.4s, v14.4s // ......................~.......................................................................................................'.....................................................................*....................................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ....................................~.........................................................................................'...................................................................................*......................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // .....................................~........................................................................................'....................................................................................*........................................................................................ - // trn2 v15.2d, v25.2d, v27.2d // ........................................~.....................................................................................'.......................................................................................*..................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ..........................................~...................................................................................'.........................................................................................*................................................................................... 
- // trn1 v13.2d, v25.2d, v27.2d // ...........................................~..................................................................................'..........................................................................................*.................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // .........................................~....................................................................................'........................................................................................*.................................................................................... - // ldr q0, [x4], #64 // ...........~..................................................................................................................'..........................................................*.................................................................................................................. - // ldr q1, [x4, #(-64 + 16)] // ..............................................................................................................................'...................................*......................................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // .....~........................................................................................................................'....................................................*........................................................................................................................ 
- // ldr q3, [x4, #(-64 + 48)] // ............................~.................................................................................................'...........................................................................*................................................................................................. - // sub v24.4s, v9.4s, v10.4s // ............................................~.................................................................................'...........................................................................................*................................................................................. - // add v9.4s, v9.4s, v10.4s // .......................................~......................................................................................'......................................................................................*...................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ...................................................~..........................................................................'..................................................................................................*.......................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................~.........................................................................'...................................................................................................*......................................................................... 
- // mls v10.4s, v24.4s, v8.s[0] // .............................................................~................................................................'............................................................................................................*................................................................ - // sub v24.4s, v11.4s, v12.4s // ..................................~...........................................................................................'.................................................................................*........................................................................................... - // add v11.4s, v11.4s, v12.4s // ...................................~..........................................................................................'..................................................................................*.......................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ......................................~.......................................................................................'.....................................................................................*....................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .............................................~................................................................................'............................................................................................*................................................................................ 
- // mls v12.4s, v24.4s, v8.s[0] // ..........................................................~...................................................................'.........................................................................................................*................................................................... - // sub v24.4s, v13.4s, v14.4s // .................................................~............................................................................'................................................................................................*............................................................................ - // add v13.4s, v13.4s, v14.4s // ..................................................~...........................................................................'.................................................................................................*........................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ......................................................~.......................................................................'.....................................................................................................*....................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // .......................................................~......................................................................'......................................................................................................*...................................................................... 
- // mls v14.4s, v24.4s, v8.s[0] // ................................................................~.............................................................'...............................................................................................................*............................................................. - // sub v24.4s, v15.4s, v16.4s // ...............................................~..............................................................................'..............................................................................................*.............................................................................. - // add v15.4s, v15.4s, v16.4s // ................................................~.............................................................................'...............................................................................................*............................................................................. - // mul v16.4s, v24.4s, v3.s[0] // ........................................................~.....................................................................'.......................................................................................................*..................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................~....................................................................'........................................................................................................*.................................................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ...............................................................~..............................................................'..............................................................................................................*.............................................................. - // sub v24.4s, v9.4s, v11.4s // .................................................................~............................................................'................................................................................................................*............................................................ - // add v9.4s, v9.4s, v11.4s // ..............................................~...............................................................................'.............................................................................................*............................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ........................................................................~.....................................................'.......................................................................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................~....................................................'........................................................................................................................*.................................................... 
- // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................~.........................................'...................................................................................................................................*......................................... - // sub v24.4s, v10.4s, v12.4s // ......................................................................~.......................................................'.....................................................................................................................*....................................................... - // add v10.4s, v10.4s, v12.4s // .......................................................................~......................................................'......................................................................................................................*...................................................... - // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................~..............................................'..............................................................................................................................*.............................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................~.............................................'...............................................................................................................................*............................................. 
- // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................~....................................'........................................................................................................................................*.................................... - // sub v24.4s, v13.4s, v15.4s // ...........................................................~..................................................................'..........................................................................................................*.................................................................. - // add v13.4s, v13.4s, v15.4s // ..............................................................~...............................................................'.............................................................................................................*............................................................... - // mul v15.4s, v24.4s, v1.s[0] // .....................................................................~........................................................'....................................................................................................................*........................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................~...........................................................'.................................................................................................................*........................................................... 
- // mls v15.4s, v24.4s, v8.s[0] // ......................................................................................~.......................................'.....................................................................................................................................*....................................... - // sub v24.4s, v14.4s, v16.4s // ...........................................................................~..................................................'..........................................................................................................................*.................................................. - // add v14.4s, v14.4s, v16.4s // ............................................................................~.................................................'...........................................................................................................................*................................................. - // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................~...........................................'.................................................................................................................................*........................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................~..........................................'..................................................................................................................................*.......................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................~.................................'...........................................................................................................................................*................................. - // srshr v24.4S, v9.4S, #23 // .....................................................~........................................................................'....................................................................................................*........................................................................ - // mls v9.4s, v24.4s, v8.4s // ............................................................~.................................................................'...........................................................................................................*................................................................. - // srshr v24.4S, v10.4S, #23 // .................................................................................~............................................'................................................................................................................................*............................................ - // mls v10.4s, v24.4s, v8.4s // ..........................................................................................~...................................'.........................................................................................................................................*................................... 
- // srshr v24.4S, v13.4S, #23 // ....................................................................~.........................................................'...................................................................................................................*......................................................... - // mls v13.4s, v24.4s, v8.4s // ..............................................................................~...............................................'.............................................................................................................................*............................................... - // srshr v24.4S, v14.4S, #23 // .....................................................................................~........................................'....................................................................................................................................*........................................ - // mls v14.4s, v24.4s, v8.4s // ...........................................................................................~..................................'..........................................................................................................................................*.................................. - // sub v24.4s, v9.4s, v13.4s // .......................................................................................~......................................'......................................................................................................................................*...................................... 
- // add v9.4s, v9.4s, v13.4s // ........................................................................................~.....................................'.......................................................................................................................................*..................................... - // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................~................................'............................................................................................................................................*................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................~...............................'.............................................................................................................................................*............................... - // mls v13.4s, v24.4s, v8.s[0] // ......................................................................................................~.......................'.....................................................................................................................................................*....................... - // sub v24.4s, v10.4s, v14.4s // ..................................................................................................~...........................'.................................................................................................................................................*........................... 
- // add v10.4s, v10.4s, v14.4s // ....................................................................................................~.........................'...................................................................................................................................................*......................... - // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................................~...................'.........................................................................................................................................................*................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................~..................'..........................................................................................................................................................*.................. - // mls v14.4s, v24.4s, v8.s[0] // .....................................................................................................................~........'....................................................................................................................................................................*........ - // sub v24.4s, v11.4s, v15.4s // ................................................................................................~.............................'...............................................................................................................................................*............................. 
- // add v11.4s, v11.4s, v15.4s // .................................................................................................~............................'................................................................................................................................................*............................ - // mul v15.4s, v24.4s, v0.s[0] // ........................................................................................................~.....................'.......................................................................................................................................................*..................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................~....................'........................................................................................................................................................*.................... - // mls v15.4s, v24.4s, v8.s[0] // ..................................................................................................................~...........'.................................................................................................................................................................*........... - // sub v24.4s, v12.4s, v16.4s // ...................................................................................................~..........................'..................................................................................................................................................*.......................... 
- // add v12.4s, v12.4s, v16.4s // .....................................................................................................~........................'....................................................................................................................................................*........................ - // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................~.................'...........................................................................................................................................................*................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................~................'............................................................................................................................................................*................ - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................~.......'.....................................................................................................................................................................*....... - // str q9, [x1], #(16*4) // ...............................................................................................~..............................'..............................................................................................................................................*.............................. 
- // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................~...............'.............................................................................................................................................................*............... - // str q11, [x1, #(-16*4 + 2*16)] // .......................................................................................................~......................'......................................................................................................................................................*...................... - // str q12, [x1, #(-16*4 + 3*16)] // ...............................................................................................................~..............'..............................................................................................................................................................*.............. - // str q13, [x2], #(16*4) // .................................................................................................................~............'................................................................................................................................................................*............ - // str q14, [x2, #(-16*4 + 1*16)] // ...........................................................................................................................~..'..........................................................................................................................................................................*.. 
- // str q15, [x2, #(-16*4 + 2*16)] // ..........................................................................................................................~...'.........................................................................................................................................................................*... - // str q16, [x2, #(-16*4 + 3*16)] // ............................................................................................................................~.'...........................................................................................................................................................................*. - // add x1, x1, #64 // ................................................................................................................~.............'...............................................................................................................................................................*............. 
- // add x2, x2, #64 // .............................................................................................................................~'............................................................................................................................................................................* + // --------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q9, [x1, #0] // e............................................................................................................................................................................'~........................... + // ldr q10, [x1, #16] // .e...........................................................................................................................................................................'.~.......................... + // ldr q11, [x1, #32] // ..e..........................................................................................................................................................................'..~......................... + // ldr q12, [x1, #48] // ....e........................................................................................................................................................................'....~....................... + // trn1 v25.4s, v9.4s, v10.4s // .................e...........................................................................................................................................................'.................~.......... 
+ // trn2 v26.4s, v9.4s, v10.4s // ................e............................................................................................................................................................'................~........... + // trn1 v27.4s, v11.4s, v12.4s // .....................e.......................................................................................................................................................'.....................~...... + // trn2 v28.4s, v11.4s, v12.4s // ......................e......................................................................................................................................................'......................~..... + // trn2 v11.2d, v25.2d, v27.2d // .................................e...........................................................................................................................................'............................ + // trn2 v12.2d, v26.2d, v28.2d // ...............................e.............................................................................................................................................'............................ + // trn1 v9.2d, v25.2d, v27.2d // ..................................e..........................................................................................................................................'............................ + // trn1 v10.2d, v26.2d, v28.2d // ................................e............................................................................................................................................'............................ + // ldr q13, [x2, #0] // .............................e...............................................................................................................................................'............................ 
+ // ldr q14, [x2, #16] // ............................e................................................................................................................................................'............................ + // ldr q15, [x2, #32] // ..............................e..............................................................................................................................................'............................ + // ldr q16, [x2, #48] // ...................................e.........................................................................................................................................'............................ + // trn1 v25.4s, v13.4s, v14.4s // ............................................e................................................................................................................................'............................ + // trn2 v26.4s, v13.4s, v14.4s // .............................................e...............................................................................................................................'............................ + // trn1 v27.4s, v15.4s, v16.4s // ................................................e............................................................................................................................'............................ + // trn2 v28.4s, v15.4s, v16.4s // .................................................e...........................................................................................................................'............................ + // trn2 v15.2d, v25.2d, v27.2d // .......................................................e.....................................................................................................................'............................ 
+ // trn2 v16.2d, v26.2d, v28.2d // .........................................................e...................................................................................................................'............................ + // trn1 v13.2d, v25.2d, v27.2d // ......................................................e......................................................................................................................'............................ + // trn1 v14.2d, v26.2d, v28.2d // ........................................................e....................................................................................................................'............................ + // ldr q0, [x5], #(12*16) // ...........e.................................................................................................................................................................'...........~................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ......e......................................................................................................................................................................'......~..................... + // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.......................................................................................................................................................................'.....~...................... + // ldr q5, [x5, #(-12*16 + 3*16)] // .............e...............................................................................................................................................................'.............~.............. + // ldr q2, [x5, #(-12*16 + 4*16)] // .........e...................................................................................................................................................................'.........~.................. 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ..........e..................................................................................................................................................................'..........~................. + // sub v24.4s, v9.4s, v10.4s // ..........................................e..................................................................................................................................'............................ + // add v9.4s, v9.4s, v10.4s // ...........................................e.................................................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ...............................................e.............................................................................................................................'............................ + // mul v10.4s, v24.4s, v1.4s // ..............................................e..............................................................................................................................'............................ + // mls v10.4s, v27.4s, v8.s[0] // ..........................................................e..................................................................................................................'............................ + // sub v24.4s, v11.4s, v12.4s // ........................................e....................................................................................................................................'............................ + // add v11.4s, v11.4s, v12.4s // .........................................e...................................................................................................................................'............................ 
+ // sqrdmulh v27.4s, v24.4s, v6.4s // ...................................................e.........................................................................................................................'............................ + // mul v12.4s, v24.4s, v2.4s // ..................................................e..........................................................................................................................'............................ + // mls v12.4s, v27.4s, v8.s[0] // .............................................................e...............................................................................................................'............................ + // sub v24.4s, v9.4s, v11.4s // ....................................................e........................................................................................................................'............................ + // add v9.4s, v9.4s, v11.4s // .....................................................e.......................................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ...........................................................e.................................................................................................................'............................ + // mul v11.4s, v24.4s, v0.4s // ............................................................e................................................................................................................'............................ + // mls v11.4s, v27.4s, v8.s[0] // ........................................................................e....................................................................................................'............................ 
+ // sub v24.4s, v10.4s, v12.4s // ......................................................................e......................................................................................................'............................ + // add v10.4s, v10.4s, v12.4s // .......................................................................e.....................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ...........................................................................e.................................................................................................'............................ + // mul v12.4s, v24.4s, v0.4s // ............................................................................e................................................................................................'............................ + // mls v12.4s, v27.4s, v8.s[0] // ...................................................................................e.........................................................................................'............................ + // ldr q0, [x5, #(-12*16 + 6*16)] // ..............e..............................................................................................................................................................'..............~............. + // ldr q4, [x5, #(-12*16 + 7*16)] // ...............e.............................................................................................................................................................'...............~............ + // ldr q1, [x5, #(-12*16 + 8*16)] // ...................e.........................................................................................................................................................'...................~........ 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ....................e........................................................................................................................................................'....................~....... + // ldr q2, [x5, #(-12*16 + 10*16)] // .........................e...................................................................................................................................................'.........................~.. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..........................e..................................................................................................................................................'..........................~. + // sub v24.4s, v13.4s, v14.4s // ..............................................................e..............................................................................................................'............................ + // add v13.4s, v13.4s, v14.4s // ...............................................................e.............................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ...................................................................e.........................................................................................................'............................ + // mul v14.4s, v24.4s, v1.4s // ..................................................................e..........................................................................................................'............................ + // mls v14.4s, v27.4s, v8.s[0] // .............................................................................e...............................................................................................'............................ 
+ // sub v24.4s, v15.4s, v16.4s // ................................................................e............................................................................................................'............................ + // add v15.4s, v15.4s, v16.4s // .................................................................e...........................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v6.4s // .....................................................................e.......................................................................................................'............................ + // mul v16.4s, v24.4s, v2.4s // ....................................................................e........................................................................................................'............................ + // mls v16.4s, v27.4s, v8.s[0] // ..............................................................................e..............................................................................................'............................ + // sub v24.4s, v13.4s, v15.4s // .........................................................................e...................................................................................................'............................ + // add v13.4s, v13.4s, v15.4s // ..........................................................................e..................................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // .................................................................................e...........................................................................................'............................ 
+ // mul v15.4s, v24.4s, v0.4s // ..................................................................................e..........................................................................................'............................ + // mls v15.4s, v27.4s, v8.s[0] // ......................................................................................e......................................................................................'............................ + // sub v24.4s, v14.4s, v16.4s // ....................................................................................e........................................................................................'............................ + // add v14.4s, v14.4s, v16.4s // .....................................................................................e.......................................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ........................................................................................e....................................................................................'............................ + // mul v16.4s, v24.4s, v0.4s // .......................................................................................e.....................................................................................'............................ + // mls v16.4s, v27.4s, v8.s[0] // ...............................................................................................e.............................................................................'............................ + // trn1 v25.4s, v9.4s, v10.4s // ...............................................................................e.............................................................................................'............................ 
+ // trn2 v26.4s, v9.4s, v10.4s // ................................................................................e............................................................................................'............................ + // trn1 v27.4s, v11.4s, v12.4s // ...........................................................................................e.................................................................................'............................ + // trn2 v28.4s, v11.4s, v12.4s // ............................................................................................e................................................................................'............................ + // trn2 v11.2d, v25.2d, v27.2d // .............................................................................................e...............................................................................'............................ + // trn2 v12.2d, v26.2d, v28.2d // ................................................................................................e............................................................................'............................ + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................................e..............................................................................'............................ + // trn1 v10.2d, v26.2d, v28.2d // .................................................................................................e...........................................................................'............................ + // trn1 v25.4s, v13.4s, v14.4s // .........................................................................................e...................................................................................'............................ 
+ // trn2 v26.4s, v13.4s, v14.4s // ..........................................................................................e..................................................................................'............................ + // trn1 v27.4s, v15.4s, v16.4s // ....................................................................................................e........................................................................'............................ + // trn2 v28.4s, v15.4s, v16.4s // .....................................................................................................e.......................................................................'............................ + // trn2 v15.2d, v25.2d, v27.2d // ...........................................................................................................e.................................................................'............................ + // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................................e...............................................................'............................ + // trn1 v13.2d, v25.2d, v27.2d // ..........................................................................................................e..................................................................'............................ + // trn1 v14.2d, v26.2d, v28.2d // ............................................................................................................e................................................................'............................ + // ldr q0, [x4], #64 // .......................................e.....................................................................................................................................'............................ 
+ // ldr q1, [x4, #(-64 + 16)] // .....................................e.......................................................................................................................................'............................ + // ldr q2, [x4, #(-64 + 32)] // ....................................e........................................................................................................................................'............................ + // ldr q3, [x4, #(-64 + 48)] // ......................................e......................................................................................................................................'............................ + // sub v24.4s, v9.4s, v10.4s // .......................................................................................................e.....................................................................'............................ + // add v9.4s, v9.4s, v10.4s // ......................................................................................................e......................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................................................................................................................e............................................................'............................ + // mul v10.4s, v24.4s, v1.s[2] // .................................................................................................................e...........................................................'............................ + // mls v10.4s, v27.4s, v8.s[0] // ..............................................................................................................................e..............................................'............................ 
+ // sub v24.4s, v11.4s, v12.4s // ..................................................................................................e..........................................................................'............................ + // add v11.4s, v11.4s, v12.4s // ...................................................................................................e.........................................................................'............................ + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ........................................................................................................e....................................................................'............................ + // mul v12.4s, v24.4s, v2.s[0] // .........................................................................................................e...................................................................'............................ + // mls v12.4s, v27.4s, v8.s[0] // ..................................................................................................................e..........................................................'............................ + // sub v24.4s, v13.4s, v14.4s // ...................................................................................................................e.........................................................'............................ + // add v13.4s, v13.4s, v14.4s // ....................................................................................................................e........................................................'............................ + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ..........................................................................................................................e..................................................'............................ 
+ // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................................e.................................................'............................ + // mls v14.4s, v27.4s, v8.s[0] // ...................................................................................................................................e.........................................'............................ + // sub v24.4s, v15.4s, v16.4s // .....................................................................................................................e.......................................................'............................ + // add v15.4s, v15.4s, v16.4s // ......................................................................................................................e......................................................'............................ + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ............................................................................................................................e................................................'............................ + // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................................................e...............................................'............................ + // mls v16.4s, v27.4s, v8.s[0] // ....................................................................................................................................e........................................'............................ + // sub v24.4s, v9.4s, v11.4s // ..............................................................................................................e..............................................................'............................ 
+ // add v9.4s, v9.4s, v11.4s // ...............................................................................................................e.............................................................'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .......................................................................................................................e.....................................................'............................ + // mul v11.4s, v24.4s, v0.s[2] // ........................................................................................................................e....................................................'............................ + // mls v11.4s, v27.4s, v8.s[0] // ..................................................................................................................................e..........................................'............................ + // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................................e.....................................'............................ + // add v10.4s, v10.4s, v12.4s // ........................................................................................................................................e....................................'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ............................................................................................................................................e................................'............................ + // mul v12.4s, v24.4s, v0.s[2] // .............................................................................................................................................e...............................'............................ 
+ // mls v12.4s, v27.4s, v8.s[0] // ....................................................................................................................................................e........................'............................ + // sub v24.4s, v13.4s, v15.4s // ...............................................................................................................................e.............................................'............................ + // add v13.4s, v13.4s, v15.4s // ................................................................................................................................e............................................'............................ + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .....................................................................................................................................e.......................................'............................ + // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................................e......................................'............................ + // mls v15.4s, v27.4s, v8.s[0] // ...............................................................................................................................................e.............................'............................ + // sub v24.4s, v14.4s, v16.4s // ..........................................................................................................................................e..................................'............................ + // add v14.4s, v14.4s, v16.4s // ...........................................................................................................................................e.................................'............................ 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .................................................................................................................................................e...........................'............................ + // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e..........................'............................ + // mls v16.4s, v27.4s, v8.s[0] // ...........................................................................................................................................................e.................'............................ + // srshr v24.4S, v9.4S, #23 // .........................................................................................................................e...................................................'............................ + // mls v9.4s, v24.4s, v8.4s // .................................................................................................................................e...........................................'............................ + // srshr v24.4S, v10.4S, #23 // ................................................................................................................................................e............................'............................ + // mls v10.4s, v24.4s, v8.4s // .......................................................................................................................................................e.....................'............................ + // srshr v24.4S, v13.4S, #23 // .........................................................................................................................................e...................................'............................ 
+ // mls v13.4s, v24.4s, v8.4s // ..............................................................................................................................................e..............................'............................ + // srshr v24.4S, v14.4S, #23 // ...................................................................................................................................................e.........................'............................ + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................................................................e..................'............................ + // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................................................e......................'............................ + // add v9.4s, v9.4s, v13.4s // .....................................................................................................................................................e.......................'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .............................................................................................................................................................e...............'............................ + // mul v13.4s, v24.4s, v0.s[0] // ..............................................................................................................................................................e..............'............................ + // mls v13.4s, v27.4s, v8.s[0] // .............................................................................................................................................................................*............................ 
+ // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................................................................e........'............................ + // add v10.4s, v10.4s, v14.4s // .....................................................................................................................................................................e.......'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................e....'............................ + // mul v14.4s, v24.4s, v0.s[0] // .........................................................................................................................................................................e...'............................ + // mls v14.4s, v27.4s, v8.s[0] // .......~.....................................................................................................................................................................'.......*.................... + // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................e....................'............................ + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................................e...................'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...............................................................................................................................................................e.............'............................ 
+ // mul v15.4s, v24.4s, v0.s[0] // ................................................................................................................................................................e............'............................ + // mls v15.4s, v27.4s, v8.s[0] // ...~.........................................................................................................................................................................'...*........................ + // sub v24.4s, v12.4s, v16.4s // ..................................................................................................................................................................e..........'............................ + // add v12.4s, v12.4s, v16.4s // ...................................................................................................................................................................e.........'............................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................e......'............................ + // mul v16.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................e.....'............................ + // mls v16.4s, v27.4s, v8.s[0] // ........~....................................................................................................................................................................'........*................... + // str q9, [x1], #(16*4) // ............................................................................................................................................................e................'............................ 
+ // str q10, [x1, #(-16*4 + 1*16)] // ...........................................................................................................................................................................e.'............................ + // str q11, [x1, #(-16*4 + 2*16)] // .................................................................................................................................................................e...........'............................ + // str q12, [x1, #(-16*4 + 3*16)] // ..........................................................................................................................................................................e..'............................ + // str q13, [x2], #(16*4) // ............~................................................................................................................................................................'............*............... + // str q14, [x2, #(-16*4 + 1*16)] // .......................~.....................................................................................................................................................'.......................*.... + // str q15, [x2, #(-16*4 + 2*16)] // ..................~..........................................................................................................................................................'..................*......... + // str q16, [x2, #(-16*4 + 3*16)] // ........................~....................................................................................................................................................'........................*... + // add x1, x1, #64 // ............................................................................................................................................................................e'............................ 
+ // add x2, x2, #64 // ...........................~.................................................................................................................................................'...........................* sub count, count, #1 cbnz count, layer45678_start - // Instructions: 164 - // Expected cycles: 54 - // Expected IPC: 3.04 - // - // Wall time: 78.88s - // User time: 78.88s - // - // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- - trn2 v27.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. - trn1 v23.4S, v29.4S, v6.4S // .....*.............................................................................................................................................................. - trn2 v25.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. - trn1 v7.4S, v19.4S, v7.4S // ......*............................................................................................................................................................. - ldr q9, [x2, #16] // ...*................................................................................................................................................................ - ldr q13, [x2, #0] // ....*............................................................................................................................................................... 
- ldr q1, [x2, #32] // *................................................................................................................................................................... - // gap // .................................................................................................................................................................... - ldr q3, [x2, #48] // .......*............................................................................................................................................................ - ldr q20, [x5, #-64] // ........*........................................................................................................................................................... - ldr q21, [x5, #-16] // .............*...................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- trn1 v26.2D, v27.2D, v25.2D // ...........*........................................................................................................................................................ - trn2 v27.2D, v27.2D, v25.2D // ............*....................................................................................................................................................... - trn1 v25.2D, v23.2D, v7.2D // .........*.......................................................................................................................................................... - trn2 v7.2D, v23.2D, v7.2D // ..........*......................................................................................................................................................... - ldr q23, [x5, #-48] // ...........................*........................................................................................................................................ - ldr q18, [x5, #-32] // ................................*................................................................................................................................... - ldr q16, [x5, #-80] // ......................*............................................................................................................................................. - // gap // .................................................................................................................................................................... - ldr q17, [x5, #-176] // .................................*.................................................................................................................................. - ldr q5, [x4, #32] // ....................................................*............................................................................................................... 
- ldr q4, [x4, #16] // ....................................*............................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v22.4S, v25.4S, v26.4S // ................*................................................................................................................................................... - add v25.4S, v25.4S, v26.4S // .................*.................................................................................................................................................. - sub v26.4S, v7.4S, v27.4S // ...............*.................................................................................................................................................... - trn2 v14.4S, v13.4S, v9.4S // ..............*..................................................................................................................................................... 
- ldr q19, [x4, #48] // ..........................................................................*......................................................................................... - ldr q29, [x4], #64 // .........................................................*.......................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn1 v9.4S, v13.4S, v9.4S // ....................*............................................................................................................................................... - trn2 v13.4S, v1.4S, v3.4S // ...................*................................................................................................................................................ - trn1 v1.4S, v1.4S, v3.4S // ..................*................................................................................................................................................. - add v27.4S, v7.4S, v27.4S // .....................*.............................................................................................................................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v7.4S, v22.4S, v0.4S // .......................*............................................................................................................................................ - sqrdmulh v3.4S, v22.4S, v15.4S // ........................*........................................................................................................................................... - sqrdmulh v15.4S, v26.4S, v2.4S // .........................*.......................................................................................................................................... - mul v26.4S, v26.4S, v30.4S // ..........................*......................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- trn2 v0.2D, v14.2D, v13.2D // ............................*....................................................................................................................................... - trn1 v13.2D, v14.2D, v13.2D // .............................*...................................................................................................................................... - trn2 v2.2D, v9.2D, v1.2D // ..............................*..................................................................................................................................... - trn1 v9.2D, v9.2D, v1.2D // ...............................*.................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v1.4S, v25.4S, v27.4S // ..................................*................................................................................................................................. - add v27.4S, v25.4S, v27.4S // ........................................................*........................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v7.4S, v3.4S, v8.S[0] // .......................................*............................................................................................................................ - mls v26.4S, v15.4S, v8.S[0] // ...................................*................................................................................................................................ - sub v25.4S, v9.4S, v13.4S // .....................................*.............................................................................................................................. - sub v3.4S, v2.4S, v0.4S // ......................................*............................................................................................................................. 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - add v15.4S, v2.4S, v0.4S // .........................................*.......................................................................................................................... - add v9.4S, v9.4S, v13.4S // ........................................*........................................................................................................................... - mul v13.4S, v1.4S, v24.4S // .................................................*.................................................................................................................. - sqrdmulh v1.4S, v1.4S, v17.4S // ..................................................*................................................................................................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v20.4S, v25.4S, v20.4S // ...........................................*........................................................................................................................ - sqrdmulh v21.4S, v3.4S, v21.4S // .............................................*...................................................................................................................... - sqrdmulh v23.4S, v25.4S, v23.4S // ..........................................*......................................................................................................................... - mul v25.4S, v3.4S, v18.4S // ............................................*....................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- sub v3.4S, v7.4S, v26.4S // ...............................................*.................................................................................................................... - add v7.4S, v7.4S, v26.4S // ..............................................*..................................................................................................................... - sub v26.4S, v9.4S, v15.4S // ................................................*................................................................................................................... - add v9.4S, v9.4S, v15.4S // .............................................................*...................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v13.4S, v1.4S, v8.S[0] // ............................................................*....................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v24.4S, v3.4S, v24.4S // .......................................................*............................................................................................................ - sqrdmulh v1.4S, v3.4S, v17.4S // .....................................................*.............................................................................................................. - mls v20.4S, v23.4S, v8.S[0] // ......................................................*............................................................................................................. - mls v25.4S, v21.4S, v8.S[0] // ...................................................*................................................................................................................ 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn1 v23.4S, v27.4S, v7.4S // .........................................................................*.......................................................................................... - trn2 v27.4S, v27.4S, v7.4S // ..................................................................*................................................................................................. - mul v7.4S, v26.4S, v11.4S // ..........................................................*......................................................................................................... - sqrdmulh v3.4S, v26.4S, v16.4S // ...........................................................*........................................................................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v24.4S, v1.4S, v8.S[0] // ..............................................................*..................................................................................................... 
- sub v1.4S, v20.4S, v25.4S // ...............................................................*.................................................................................................... - add v25.4S, v20.4S, v25.4S // ................................................................*................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v7.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v11.4S, v1.4S, v11.4S // ......................................................................*............................................................................................. - sqrdmulh v1.4S, v1.4S, v16.4S // .....................................................................*.............................................................................................. - trn1 v3.4S, v9.4S, v25.4S // ...................................................................*................................................................................................ - trn2 v25.4S, v9.4S, v25.4S // ....................................................................*............................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn1 v9.4S, v13.4S, v24.4S // ........................................................................*........................................................................................... - trn2 v24.4S, v13.4S, v24.4S // .......................................................................*............................................................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn2 v13.2D, v23.2D, v9.2D // ............................................................................*....................................................................................... - trn1 v23.2D, v23.2D, v9.2D // .............................................................................*...................................................................................... 
- mls v11.4S, v1.4S, v8.S[0] // ...........................................................................*........................................................................................ - trn2 v9.2D, v27.2D, v24.2D // ..............................................................................*..................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn1 v27.2D, v27.2D, v24.2D // ...............................................................................*.................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v24.4S, v13.4S, v9.4S // ................................................................................*................................................................................... - add v9.4S, v13.4S, v9.4S // .................................................................................*.................................................................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - trn1 v13.4S, v7.4S, v11.4S // ..................................................................................*................................................................................. - trn2 v11.4S, v7.4S, v11.4S // ...................................................................................*................................................................................ - add v7.4S, v23.4S, v27.4S // .....................................................................................*.............................................................................. - sub v27.4S, v23.4S, v27.4S // ..........................................................................................*......................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v23.4S, v24.4S, v5.S[0] // ....................................................................................*............................................................................... 
- sqrdmulh v24.4S, v24.4S, v5.S[1] // ...........................................................................................*........................................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn2 v1.2D, v3.2D, v13.2D // ......................................................................................*............................................................................. - trn1 v13.2D, v3.2D, v13.2D // .........................................................................................*.......................................................................... - trn2 v3.2D, v25.2D, v11.2D // ........................................................................................*........................................................................... 
- trn1 v11.2D, v25.2D, v11.2D // .......................................................................................*............................................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v25.4S, v7.4S, v9.4S // ...............................................................................................................*.................................................... - add v7.4S, v7.4S, v9.4S // ............................................................................................*....................................................................... - mul v9.4S, v27.4S, v4.S[2] // .................................................................................................*.................................................................. - sqrdmulh v27.4S, v27.4S, v4.S[3] // ..................................................................................................*................................................................. - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v20.4S, v1.4S, v3.4S // .............................................................................................*...................................................................... - add v1.4S, v1.4S, v3.4S // ..............................................................................................*..................................................................... - sub v3.4S, v13.4S, v11.4S // ...............................................................................................*.................................................................... - add v11.4S, v13.4S, v11.4S // ................................................................................................*................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - mls v23.4S, v24.4S, v8.S[0] // ........................................................................................................*........................................................... - mul v24.4S, v25.4S, v29.S[2] // .....................................................................................................................*.............................................. - sqrdmulh v25.4S, v25.4S, v29.S[3] // ......................................................................................................................*............................................. - srshr v13.4S, v7.4S, #23 // ...................................................................................................*................................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v21.4S, v3.4S, v5.S[2] // ....................................................................................................*............................................................... 
- sqrdmulh v3.4S, v3.4S, v5.S[3] // .....................................................................................................*.............................................................. - mul v15.4S, v20.4S, v19.S[0] // ......................................................................................................*............................................................. - sqrdmulh v20.4S, v20.4S, v19.S[1] // .......................................................................................................*............................................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v9.4S, v27.4S, v8.S[0] // ...........................................................................................................*........................................................ - sub v27.4S, v11.4S, v1.4S // .........................................................................................................*.......................................................... - add v11.4S, v11.4S, v1.4S // ............................................................................................................*....................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v7.4S, v13.4S, v8.4S // ..........................................................................................................*......................................................... - mls v24.4S, v25.4S, v8.S[0] // ...............................................................................................................................*.................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v21.4S, v3.4S, v8.S[0] // ..............................................................................................................*..................................................... - mls v15.4S, v20.4S, v8.S[0] // .............................................................................................................*...................................................... - srshr v25.4S, v11.4S, #23 // .................................................................................................................*.................................................. - sqrdmulh v13.4S, v27.4S, v4.S[1] // ................................................................................................................*................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- sub v1.4S, v9.4S, v23.4S // ...................................................................................................................*................................................ - add v3.4S, v9.4S, v23.4S // ....................................................................................................................*............................................... - mul v27.4S, v27.4S, v4.S[0] // ..................................................................................................................*................................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v11.4S, v25.4S, v8.4S // .........................................................................................................................*.......................................... - sub v23.4S, v21.4S, v15.4S // .......................................................................................................................*............................................ - add v9.4S, v21.4S, v15.4S // ........................................................................................................................*........................................... - mul v21.4S, v1.4S, v29.S[2] // ..........................................................................................................................*......................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v27.4S, v13.4S, v8.S[0] // .................................................................................................................................*.................................. - sqrdmulh v25.4S, v1.4S, v29.S[3] // ...........................................................................................................................*........................................ - srshr v13.4S, v3.4S, #23 // ............................................................................................................................*....................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v20.4S, v23.4S, v4.S[0] // .............................................................................................................................*...................................... - sqrdmulh v23.4S, v23.4S, v4.S[1] // ..............................................................................................................................*..................................... - srshr v1.4S, v9.4S, #23 // ................................................................................................................................*................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- add v15.4S, v7.4S, v11.4S // ...................................................................................................................................*................................ - sub v11.4S, v7.4S, v11.4S // ..................................................................................................................................*................................. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - add v7.4S, v24.4S, v27.4S // ............................................................................................................................................*....................... - sub v27.4S, v24.4S, v27.4S // ...........................................................................................................................................*........................ 
- mls v3.4S, v13.4S, v8.4S // .....................................................................................................................................*.............................. - mls v21.4S, v25.4S, v8.S[0] // ....................................................................................................................................*............................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v9.4S, v1.4S, v8.4S // ......................................................................................................................................*............................. - mls v20.4S, v23.4S, v8.S[0] // .......................................................................................................................................*............................ - str q15, [x1], #(16*4) // ..........................................................................................................................................*......................... - mul v24.4S, v11.4S, v29.S[0] // ........................................................................................................................................*........................... 
- sqrdmulh v11.4S, v11.4S, v29.S[1] // .........................................................................................................................................*.......................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - str q7, [x1, #-32] // ..................................................................................................................................................*................. - mul v7.4S, v27.4S, v29.S[0] // ...................................................................................................................................................*................ - sqrdmulh v27.4S, v27.4S, v29.S[1] // ....................................................................................................................................................*............... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - sub v23.4S, v3.4S, v9.4S // .............................................................................................................................................*...................... 
- add v9.4S, v3.4S, v9.4S // ...............................................................................................................................................*.................... - sub v25.4S, v21.4S, v20.4S // ..............................................................................................................................................*..................... - add v13.4S, v21.4S, v20.4S // ................................................................................................................................................*................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v24.4S, v11.4S, v8.S[0] // .................................................................................................................................................*.................. - mls v7.4S, v27.4S, v8.S[0] // .............................................................................................................................................................*...... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - str q9, [x1, #-48] // .........................................................................................................................................................*.......... - str q13, [x1, #-16] // ..........................................................................................................................................................*......... - add x1, x1, #64 // ...........................................................................................................................................................*........ - mul v27.4S, v23.4S, v29.S[0] // .....................................................................................................................................................*.............. - sqrdmulh v11.4S, v23.4S, v29.S[1] // ......................................................................................................................................................*............. 
- mul v23.4S, v25.4S, v29.S[0] // .......................................................................................................................................................*............ - sqrdmulh v25.4S, v25.4S, v29.S[1] // ........................................................................................................................................................*........... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- str q24, [x2], #(16*4) // ............................................................................................................................................................*....... - str q7, [x2, #-32] // ................................................................................................................................................................*... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v27.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*..... - mls v23.4S, v25.4S, v8.S[0] // ...............................................................................................................................................................*.... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- str q27, [x2, #-48] // .................................................................................................................................................................*.. - str q23, [x2, #-16] // ..................................................................................................................................................................*. - add x2, x2, #64 // ...................................................................................................................................................................* - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- - // -------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- - // ldr q10, [x2, #32] // ......*............................................................................................................................................................. - // trn2 v25.4S, v29.4S, v6.4S // *................................................................................................................................................................... - // trn2 v22.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. - // ldr q14, [x2, #16] // ....*............................................................................................................................................................... - // ldr q20, [x2, #0] // .....*.............................................................................................................................................................. - // trn1 v23.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. - // trn1 v4.4S, v19.4S, v7.4S // ...*................................................................................................................................................................ - // ldr q19, [x2, #48] // .......*............................................................................................................................................................ 
- // ldr q26, [x5, #-64] // ........*........................................................................................................................................................... - // trn1 v17.2D, v23.2D, v4.2D // ............*....................................................................................................................................................... - // trn2 v18.2D, v23.2D, v4.2D // .............*...................................................................................................................................................... - // trn1 v5.2D, v25.2D, v22.2D // ..........*......................................................................................................................................................... - // trn2 v7.2D, v25.2D, v22.2D // ...........*........................................................................................................................................................ - // ldr q31, [x5, #-16] // .........*.......................................................................................................................................................... - // trn2 v28.4S, v20.4S, v14.4S // .......................*............................................................................................................................................ - // sub v22.4S, v18.4S, v7.4S // ......................*............................................................................................................................................. - // sub v9.4S, v17.4S, v5.4S // ....................*............................................................................................................................................... - // add v16.4S, v17.4S, v5.4S // .....................*.............................................................................................................................................. 
- // trn1 v21.4S, v10.4S, v19.4S // ............................*....................................................................................................................................... - // trn2 v6.4S, v10.4S, v19.4S // ...........................*........................................................................................................................................ - // trn1 v19.4S, v20.4S, v14.4S // ..........................*......................................................................................................................................... - // add v4.4S, v18.4S, v7.4S // .............................*...................................................................................................................................... - // ldr q14, [x5, #-80] // ................*................................................................................................................................................... - // mul v0.4S, v9.4S, v0.4S // ..............................*..................................................................................................................................... - // sqrdmulh v29.4S, v9.4S, v15.4S // ...............................*.................................................................................................................................... - // sqrdmulh v17.4S, v22.4S, v2.4S // ................................*................................................................................................................................... - // mul v22.4S, v22.4S, v30.4S // .................................*.................................................................................................................................. - // ldr q30, [x5, #-48] // ..............*..................................................................................................................................................... 
- // trn2 v20.2D, v28.2D, v6.2D // ..................................*................................................................................................................................. - // trn1 v28.2D, v28.2D, v6.2D // ...................................*................................................................................................................................ - // trn2 v10.2D, v19.2D, v21.2D // ....................................*............................................................................................................................... - // trn1 v21.2D, v19.2D, v21.2D // .....................................*.............................................................................................................................. - // ldr q6, [x5, #-32] // ...............*.................................................................................................................................................... - // ldr q18, [x5, #-176] // .................*.................................................................................................................................................. - // sub v12.4S, v16.4S, v4.4S // ......................................*............................................................................................................................. - // mls v22.4S, v17.4S, v8.S[0] // .........................................*.......................................................................................................................... - // ldr q17, [x4, #16] // ...................*................................................................................................................................................ - // sub v13.4S, v21.4S, v28.4S // ..........................................*......................................................................................................................... 
- // sub v27.4S, v10.4S, v20.4S // ...........................................*........................................................................................................................ - // mls v0.4S, v29.4S, v8.S[0] // ........................................*........................................................................................................................... - // add v2.4S, v21.4S, v28.4S // .............................................*...................................................................................................................... - // add v28.4S, v10.4S, v20.4S // ............................................*....................................................................................................................... - // sqrdmulh v19.4S, v13.4S, v30.4S // ..................................................*................................................................................................................. - // mul v29.4S, v13.4S, v26.4S // ................................................*................................................................................................................... - // mul v30.4S, v27.4S, v6.4S // ...................................................*................................................................................................................ - // sqrdmulh v6.4S, v27.4S, v31.4S // .................................................*.................................................................................................................. - // add v21.4S, v0.4S, v22.4S // .....................................................*.............................................................................................................. 
- // sub v31.4S, v0.4S, v22.4S // ....................................................*............................................................................................................... - // sub v10.4S, v2.4S, v28.4S // ......................................................*............................................................................................................. - // mul v27.4S, v12.4S, v24.4S // ..............................................*..................................................................................................................... - // sqrdmulh v12.4S, v12.4S, v18.4S // ...............................................*.................................................................................................................... - // mls v30.4S, v6.4S, v8.S[0] // ............................................................*....................................................................................................... - // ldr q6, [x4, #32] // ..................*................................................................................................................................................. - // sqrdmulh v3.4S, v31.4S, v18.4S // ..........................................................*......................................................................................................... - // mls v29.4S, v19.4S, v8.S[0] // ...........................................................*........................................................................................................ - // mul v19.4S, v31.4S, v24.4S // .........................................................*.......................................................................................................... - // add v31.4S, v16.4S, v4.4S // .......................................*............................................................................................................................ 
- // ldr q16, [x4], #64 // .........................*.......................................................................................................................................... - // mul v22.4S, v10.4S, v11.4S // ...............................................................*.................................................................................................... - // sqrdmulh v10.4S, v10.4S, v14.4S // ................................................................*................................................................................................... - // mls v27.4S, v12.4S, v8.S[0] // ........................................................*........................................................................................................... - // add v12.4S, v2.4S, v28.4S // .......................................................*............................................................................................................ - // mls v19.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. - // sub v1.4S, v29.4S, v30.4S // ..................................................................*................................................................................................. - // add v28.4S, v29.4S, v30.4S // ...................................................................*................................................................................................ - // mls v22.4S, v10.4S, v8.S[0] // ....................................................................*............................................................................................... - // trn2 v30.4S, v31.4S, v21.4S // ..............................................................*..................................................................................................... 
- // trn1 v25.4S, v12.4S, v28.4S // .......................................................................*............................................................................................ - // trn2 v28.4S, v12.4S, v28.4S // ........................................................................*........................................................................................... - // sqrdmulh v10.4S, v1.4S, v14.4S // ......................................................................*............................................................................................. - // mul v5.4S, v1.4S, v11.4S // .....................................................................*.............................................................................................. - // trn2 v26.4S, v27.4S, v19.4S // ..........................................................................*......................................................................................... - // trn1 v27.4S, v27.4S, v19.4S // .........................................................................*.......................................................................................... - // trn1 v19.4S, v31.4S, v21.4S // .............................................................*...................................................................................................... - // ldr q31, [x4, #-16] // ........................*........................................................................................................................................... - // mls v5.4S, v10.4S, v8.S[0] // .............................................................................*...................................................................................... 
- // trn2 v10.2D, v19.2D, v27.2D // ...........................................................................*........................................................................................ - // trn1 v19.2D, v19.2D, v27.2D // ............................................................................*....................................................................................... - // trn2 v29.2D, v30.2D, v26.2D // ..............................................................................*..................................................................................... - // trn1 v4.2D, v30.2D, v26.2D // ...............................................................................*.................................................................................... - // sub v12.4S, v10.4S, v29.4S // ................................................................................*................................................................................... - // add v23.4S, v10.4S, v29.4S // .................................................................................*.................................................................................. - // trn1 v30.4S, v22.4S, v5.4S // ..................................................................................*................................................................................. - // trn2 v9.4S, v22.4S, v5.4S // ...................................................................................*................................................................................ - // mul v14.4S, v12.4S, v6.S[0] // ......................................................................................*............................................................................. 
- // add v18.4S, v19.4S, v4.4S // ....................................................................................*............................................................................... - // trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*........................................................................... - // trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*........................................................................ - // trn2 v7.2D, v28.2D, v9.2D // ..........................................................................................*......................................................................... - // trn1 v11.2D, v25.2D, v30.2D // .........................................................................................*.......................................................................... - // sub v28.4S, v19.4S, v4.4S // .....................................................................................*.............................................................................. - // sqrdmulh v5.4S, v12.4S, v6.S[1] // .......................................................................................*............................................................................ - // add v1.4S, v18.4S, v23.4S // .............................................................................................*...................................................................... - // sub v29.4S, v3.4S, v7.4S // ................................................................................................*................................................................... - // add v27.4S, v3.4S, v7.4S // .................................................................................................*.................................................................. 
- // sub v12.4S, v11.4S, v10.4S // ..................................................................................................*................................................................. - // add v7.4S, v11.4S, v10.4S // ...................................................................................................*................................................................ - // mul v22.4S, v28.4S, v17.S[2] // ..............................................................................................*..................................................................... - // sqrdmulh v19.4S, v28.4S, v17.S[3] // ...............................................................................................*.................................................................... - // srshr v10.4S, v1.4S, #23 // .......................................................................................................*............................................................ - // mul v30.4S, v12.4S, v6.S[2] // ........................................................................................................*........................................................... - // sqrdmulh v6.4S, v12.4S, v6.S[3] // .........................................................................................................*.......................................................... - // mul v20.4S, v29.4S, v31.S[0] // ..........................................................................................................*......................................................... - // sqrdmulh v15.4S, v29.4S, v31.S[1] // ...........................................................................................................*........................................................ 
- // mls v14.4S, v5.4S, v8.S[0] // ....................................................................................................*............................................................... - // sub v2.4S, v7.4S, v27.4S // .............................................................................................................*...................................................... - // mls v1.4S, v10.4S, v8.4S // ...............................................................................................................*.................................................... - // mls v22.4S, v19.4S, v8.S[0] // ............................................................................................................*....................................................... - // add v19.4S, v7.4S, v27.4S // ..............................................................................................................*..................................................... - // mls v20.4S, v15.4S, v8.S[0] // ..................................................................................................................*................................................. - // mls v30.4S, v6.4S, v8.S[0] // .................................................................................................................*.................................................. - // sub v12.4S, v18.4S, v23.4S // ............................................................................................*....................................................................... - // sqrdmulh v11.4S, v2.4S, v17.S[1] // ....................................................................................................................*............................................... 
- // srshr v10.4S, v19.4S, #23 // ...................................................................................................................*................................................ - // mul v29.4S, v2.4S, v17.S[0] // .......................................................................................................................*............................................ - // sub v28.4S, v22.4S, v14.4S // .....................................................................................................................*.............................................. - // add v4.4S, v22.4S, v14.4S // ......................................................................................................................*............................................. - // mul v5.4S, v12.4S, v16.S[2] // .....................................................................................................*.............................................................. - // sqrdmulh v27.4S, v12.4S, v16.S[3] // ......................................................................................................*............................................................. - // sub v31.4S, v30.4S, v20.4S // .........................................................................................................................*.......................................... - // add v18.4S, v30.4S, v20.4S // ..........................................................................................................................*......................................... - // mls v19.4S, v10.4S, v8.4S // ........................................................................................................................*........................................... 
- // mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*........................................ - // sqrdmulh v14.4S, v28.4S, v16.S[3] // .............................................................................................................................*...................................... - // srshr v28.4S, v4.4S, #23 // ..............................................................................................................................*..................................... - // mul v6.4S, v31.4S, v17.S[0] // ...............................................................................................................................*.................................... - // sqrdmulh v31.4S, v31.4S, v17.S[1] // ................................................................................................................................*................................... - // mls v5.4S, v27.4S, v8.S[0] // ................................................................................................................*................................................... - // srshr v12.4S, v18.4S, #23 // .................................................................................................................................*.................................. - // mls v29.4S, v11.4S, v8.S[0] // ............................................................................................................................*....................................... - // sub v11.4S, v1.4S, v19.4S // ...................................................................................................................................*................................ 
- // add v10.4S, v1.4S, v19.4S // ..................................................................................................................................*................................. - // mls v22.4S, v14.4S, v8.S[0] // .......................................................................................................................................*............................ - // mls v4.4S, v28.4S, v8.4S // ......................................................................................................................................*............................. - // mls v18.4S, v12.4S, v8.4S // ........................................................................................................................................*........................... - // mls v6.4S, v31.4S, v8.S[0] // .........................................................................................................................................*.......................... - // mul v7.4S, v11.4S, v16.S[0] // ...........................................................................................................................................*........................ - // sqrdmulh v11.4S, v11.4S, v16.S[1] // ............................................................................................................................................*....................... - // str q10, [x1], #(16*4) // ..........................................................................................................................................*......................... - // sub v23.4S, v5.4S, v29.4S // .....................................................................................................................................*.............................. - // add v31.4S, v5.4S, v29.4S // ....................................................................................................................................*............................... 
- // sub v12.4S, v4.4S, v18.4S // ................................................................................................................................................*................... - // sub v27.4S, v22.4S, v6.4S // ..................................................................................................................................................*................. - // add v13.4S, v4.4S, v18.4S // .................................................................................................................................................*.................. - // add v10.4S, v22.4S, v6.4S // ...................................................................................................................................................*................ - // mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*............... - // str q31, [x1, #-32] // .............................................................................................................................................*...................... - // mul v31.4S, v23.4S, v16.S[0] // ..............................................................................................................................................*..................... - // sqrdmulh v3.4S, v23.4S, v16.S[1] // ...............................................................................................................................................*.................... - // mul v5.4S, v12.4S, v16.S[0] // .........................................................................................................................................................*.......... 
- // sqrdmulh v12.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*......... - // mul v28.4S, v27.4S, v16.S[0] // ...........................................................................................................................................................*........ - // sqrdmulh v27.4S, v27.4S, v16.S[1] // ............................................................................................................................................................*....... - // str q13, [x1, #-48] // ......................................................................................................................................................*............. - // str q10, [x1, #-16] // .......................................................................................................................................................*............ - // add x1, x1, #64 // ........................................................................................................................................................*........... - // str q7, [x2], #(16*4) // .............................................................................................................................................................*...... - // mls v31.4S, v3.4S, v8.S[0] // .....................................................................................................................................................*.............. - // mls v5.4S, v12.4S, v8.S[0] // ...............................................................................................................................................................*.... - // mls v28.4S, v27.4S, v8.S[0] // ................................................................................................................................................................*... 
- // str q31, [x2, #-32] // ..............................................................................................................................................................*..... - // str q5, [x2, #-48] // .................................................................................................................................................................*.. - // str q28, [x2, #-16] // ..................................................................................................................................................................*. - // add x2, x2, #64 // ...................................................................................................................................................................* + // Instructions: 9 + // Expected cycles: 5 + // Expected IPC: 1.80 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mls v14.4S, v15.4S, v8.S[0] // ...*.......................... + mls v6.4S, v29.4S, v8.S[0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v0.4S, v10.4S, v8.S[0] // ..*........................... + mls v22.4S, v4.4S, v8.S[0] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + str q14, [x2, #48] // .......*...................... + str q6, [x2], #(16*4) // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q0, [x2, #-48] // ......*....................... + str q22, [x2, #-32] // .....*........................ + add x2, x2, #64 // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v6.4S, v29.4S, v8.S[0] // .*............................. + // mls v22.4S, v4.4S, v8.S[0] // ...*........................... + // mls v0.4S, v10.4S, v8.S[0] // ..*............................ + // mls v14.4S, v15.4S, v8.S[0] // *.............................. + // str q6, [x2], #(16*4) // .....*......................... + // str q22, [x2, #-32] // .......*....................... + // str q0, [x2, #-48] // ......*........................ + // str q14, [x2, #-16] // ....*.......................... + // add x2, x2, #64 // ........*...................... 
// ----------------------------------------------------------------------------- @@ -1701,830 +1661,862 @@ layer45678_start: load_roots_123 .p2align 2 - // Instructions: 78 - // Expected cycles: 25 - // Expected IPC: 3.12 + // Instructions: 88 + // Expected cycles: 27 + // Expected IPC: 3.26 // - // Wall time: 3.49s - // User time: 3.49s + // Wall time: 5.97s + // User time: 5.97s // - // ----------------------------- original position -----------------------------> + // ---------------------------------- original position ----------------------------------> // 0 25 50 75 - // |------------------------|------------------------|------------------------|-- - ldr q11, [x0, #384] // ..*........................................................................... - ldr q27, [x0, #256] // *............................................................................. - ldr q7, [x0, #896] // .*............................................................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - ldr q23, [x0, #768] // ...*.......................................................................... - ldr q24, [x0, #128] // ....*......................................................................... - ldr q9, [x0, #0] // .....*........................................................................ - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - ldr q13, [x0, #640] // ......*....................................................................... - ldr q20, [x0, #512] // .......*...................................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v21.4S, v27.4S, v11.4S // ........*..................................................................... - add v27.4S, v27.4S, v11.4S // .........*.................................................................... 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v11.4S, v23.4S, v7.4S // .............*................................................................ - add v7.4S, v23.4S, v7.4S // ............*................................................................. - sub v23.4S, v9.4S, v24.4S // ..........*................................................................... - add v24.4S, v9.4S, v24.4S // ...........*.................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v9.4S, v20.4S, v13.4S // .................*............................................................ - add v13.4S, v20.4S, v13.4S // ................*............................................................. - sqrdmulh v20.4S, v21.4S, v2.S[1] // ..............*............................................................... - mul v21.4S, v21.4S, v2.S[0] // ...............*.............................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - mul v15.4S, v11.4S, v3.S[0] // ..................*........................................................... - sqrdmulh v11.4S, v11.4S, v3.S[1] // .....................*........................................................ - add v18.4S, v24.4S, v27.4S // ....................*......................................................... - mul v16.4S, v23.4S, v1.S[2] // ...................*.......................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v23.4S, v23.4S, v1.S[3] // .......................*...................................................... - mul v17.4S, v9.4S, v2.S[2] // ........................*..................................................... - sqrdmulh v9.4S, v9.4S, v2.S[3] // .........................*.................................................... - add v5.4S, v13.4S, v7.4S // ......................*....................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v27.4S, v24.4S, v27.4S // ...........................*.................................................. 
- sub v7.4S, v13.4S, v7.4S // ............................*................................................. - mls v21.4S, v20.4S, v8.S[0] // ..........................*................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v15.4S, v11.4S, v8.S[0] // ..............................*............................................... - sub v11.4S, v18.4S, v5.4S // .............................*................................................ - add v24.4S, v18.4S, v5.4S // ...............................*.............................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v16.4S, v23.4S, v8.S[0] // ..................................*........................................... - mls v17.4S, v9.4S, v8.S[0] // ...................................*.......................................... - mul v9.4S, v27.4S, v0.S[2] // ................................*............................................. - sqrdmulh v27.4S, v27.4S, v0.S[3] // .................................*............................................ 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v13.4S, v11.4S, v0.S[1] // .....................................*........................................ - mul v10.4S, v11.4S, v0.S[0] // .......................................*...................................... - mul v23.4S, v24.4S, v25.4S // ....................................*......................................... - sqrdmulh v11.4S, v24.4S, v26.4S // ......................................*....................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v20.4S, v7.4S, v1.S[0] // ........................................*..................................... - sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- add v24.4S, v16.4S, v21.4S // ...........................................*.................................. - sub v21.4S, v16.4S, v21.4S // ..................................................*........................... - mls v9.4S, v27.4S, v8.S[0] // ............................................*................................. - add v27.4S, v17.4S, v15.4S // ..........................................*................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v18.4S, v17.4S, v15.4S // ........................................................*..................... - mls v10.4S, v13.4S, v8.S[0] // .............................................*................................ - mls v23.4S, v11.4S, v8.S[0] // ..............................................*............................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - add v11.4S, v24.4S, v27.4S // ................................................*............................. - mls v20.4S, v7.4S, v8.S[0] // .................................................*............................ - sub v27.4S, v24.4S, v27.4S // ...............................................*.............................. 
- sqrdmulh v7.4S, v21.4S, v0.S[3] // ..........................................................*................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v15.4S, v21.4S, v0.S[2] // ...........................................................*.................. - mul v4.4S, v18.4S, v1.S[0] // .............................................................*................ - sqrdmulh v13.4S, v18.4S, v1.S[1] // ...............................................................*.............. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v24.4S, v11.4S, v25.4S // ....................................................*......................... - sqrdmulh v11.4S, v11.4S, v26.4S // .......................................................*...................... - sqrdmulh v21.4S, v27.4S, v0.S[1] // .....................................................*........................ - mul v27.4S, v27.4S, v0.S[0] // ......................................................*....................... - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - sub v18.4S, v9.4S, v20.4S // .........................................................*.................... - cmge v16.4S, v23.4S, v30.4S // ...................................................*.......................... - cmge v17.4S, v31.4S, v23.4S // ..................................................................*........... - cmge v5.4S, v31.4S, v10.4S // ....................................................................*......... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - add v9.4S, v9.4S, v20.4S // ......................................................................*....... - mls v15.4S, v7.4S, v8.S[0] // ...................................................................*.......... - mls v4.4S, v13.4S, v8.S[0] // .......................................................................*...... - cmge v7.4S, v10.4S, v30.4S // .....................................................................*........ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v24.4S, v11.4S, v8.S[0] // ................................................................*............. 
- mls v27.4S, v21.4S, v8.S[0] // ............................................................*................. - mul v11.4S, v18.4S, v0.S[0] // ..............................................................*............... - sqrdmulh v13.4S, v18.4S, v0.S[1] // .................................................................*............ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v19.4S, v17.4S, v16.4S // .............................................................................* - sub v6.4S, v5.4S, v7.4S // ...........................................................................*.. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v11.4S, v13.4S, v8.S[0] // ............................................................................*. - cmge v28.4S, v31.4S, v27.4S // ........................................................................*..... - cmge v20.4S, v27.4S, v30.4S // .........................................................................*.... - cmge v22.4S, v24.4S, v30.4S // ..........................................................................*... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - - // ------------------------------- new position --------------------------------> + // |------------------------|------------------------|------------------------|------------ + ldr q6, [x0, #128] // *....................................................................................... + ldr q19, [x0, #0] // ..*..................................................................................... + ldr q28, [x0, #768] // .*...................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q22, [x0, #896] // .....*.................................................................................. + ldr q29, [x0, #512] // ...*.................................................................................... + ldr q4, [x0, #640] // ....*................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q13, [x0, #256] // ......*................................................................................. + ldr q14, [x0, #384] // .......*................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v15.4S, v19.4S, v6.4S // ........*............................................................................... + add v6.4S, v19.4S, v6.4S // .........*.............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v19.4S, v28.4S, v22.4S // .............*.......................................................................... + add v28.4S, v28.4S, v22.4S // ..........*............................................................................. + add v22.4S, v29.4S, v4.4S // ...........*............................................................................ 
+ sub v29.4S, v29.4S, v4.4S // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v4.4S, v13.4S, v14.4S // .................*...................................................................... + sub v13.4S, v13.4S, v14.4S // ................*....................................................................... + sqrdmulh v14.4S, v15.4S, v1.S[3] // ..............*......................................................................... + mul v15.4S, v15.4S, v1.S[2] // ...............*........................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v10.4S, v19.4S, v3.S[1] // ...................*.................................................................... + mul v19.4S, v19.4S, v3.S[0] // ....................*................................................................... + sub v11.4S, v22.4S, v28.4S // .....................*.................................................................. + sqrdmulh v21.4S, v29.4S, v2.S[3] // ..................*..................................................................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v12.4S, v6.4S, v4.4S // .......................*................................................................ + add v6.4S, v6.4S, v4.4S // ........................*............................................................... + add v28.4S, v22.4S, v28.4S // .........................*.............................................................. + mul v22.4S, v29.4S, v2.S[2] // ......................*................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v29.4S, v13.4S, v2.S[1] // ...........................*............................................................ + mul v4.4S, v13.4S, v2.S[0] // ............................*........................................................... + mul v13.4S, v11.4S, v1.S[0] // ..........................*............................................................. + sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................*......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.4S, v10.4S, v8.S[0] // .................................*...................................................... + mul v10.4S, v12.4S, v0.S[2] // .............................*.......................................................... + sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................*........................................................ + add v7.4S, v6.4S, v28.4S // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v15.4S, v14.4S, v8.S[0] // ...................................*.................................................... + mls v22.4S, v21.4S, v8.S[0] // .....................................*.................................................. + sub v6.4S, v6.4S, v28.4S // ..................................*..................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v4.4S, v29.4S, v8.S[0] // ......................................*................................................. + mls v13.4S, v11.4S, v8.S[0] // ..........................................*............................................. + sqrdmulh v28.4S, v7.4S, v26.4S // ....................................*................................................... + mul v14.4S, v7.4S, v25.4S // .......................................*................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v10.4S, v12.4S, v8.S[0] // ........................................*............................................... + mul v11.4S, v6.4S, v0.S[0] // .........................................*.............................................. + sqrdmulh v6.4S, v6.4S, v0.S[1] // ...........................................*............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v29.4S, v22.4S, v19.4S // ............................................*........................................... 
+ sub v19.4S, v22.4S, v19.4S // ................................................*....................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v22.4S, v15.4S, v4.4S // ..............................................*......................................... + sub v4.4S, v15.4S, v4.4S // ...................................................................*.................... + mls v14.4S, v28.4S, v8.S[0] // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v28.4S, v10.4S, v13.4S // ...............................................*........................................ + sub v13.4S, v10.4S, v13.4S // ....................................................*................................... + mls v11.4S, v6.4S, v8.S[0] // .................................................*...................................... 
+ sqrdmulh v15.4S, v19.4S, v1.S[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v10.4S, v22.4S, v29.4S // ..................................................*..................................... + sub v22.4S, v22.4S, v29.4S // ...................................................*.................................... + sqrdmulh v21.4S, v4.4S, v0.S[3] // ........................................................................*............... + mul v12.4S, v4.4S, v0.S[2] // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v7.4S, v28.4S, v26.4S // ......................................................*................................. + mul v4.4S, v28.4S, v25.4S // .......................................................*................................ + cmge v28.4S, v31.4S, v14.4S // .....................................................*.................................. + cmge v9.4S, v14.4S, v30.4S // ........................................................*............................... 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + cmge v17.4S, v31.4S, v11.4S // .........................................................*.............................. + cmge v23.4S, v11.4S, v30.4S // ..........................................................*............................. + sqrdmulh v24.4S, v10.4S, v26.4S // ...........................................................*............................ + mul v6.4S, v22.4S, v0.S[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v29.4S, v10.4S, v25.4S // .................................................................*...................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // ..............................................................*......................... + sub v28.4S, v28.4S, v9.4S // ...............................................................*........................ + sqrdmulh v10.4S, v13.4S, v0.S[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v9.4S, v19.4S, v1.S[0] // ..........................................................................*............. + mul v19.4S, v13.4S, v0.S[0] // ....................................................................*................... + mls v4.4S, v7.4S, v8.S[0] // ................................................................*....................... + sub v13.4S, v17.4S, v23.4S // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v14.4S, v28.4S, v8.4S // ......................................................................*................. + mls v12.4S, v21.4S, v8.S[0] // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v11.4S, v13.4S, v8.4S // ..............................................................................*......... + mls v29.4S, v24.4S, v8.S[0] // ...........................................................................*............ + mls v6.4S, v22.4S, v8.S[0] // .......................................................................*................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v9.4S, v15.4S, v8.S[0] // .................................................................................*...... + mls v19.4S, v10.4S, v8.S[0] // ................................................................................*....... + cmge v28.4S, v31.4S, v4.4S // ............................................................................*........... + cmge v22.4S, v4.4S, v30.4S // .............................................................................*.......... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q14, [x0], #(16) // ...............................................................................*........ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q11, [x0, #496] // ......................................................................................*. + sub v22.4S, v28.4S, v22.4S // ...................................................................................*.... + cmge v28.4S, v29.4S, v30.4S // ....................................................................................*... + cmge v24.4S, v31.4S, v29.4S // .....................................................................................*.. + cmge v13.4S, v6.4S, v30.4S // .......................................................................................* + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + + // ------------------------------------ new position -------------------------------------> // 0 25 50 75 - // |------------------------|------------------------|------------------------|-- - // ldr q16, [x0, #256] // .*............................................................................ - // ldr q12, [x0, #896] // ..*........................................................................... 
- // ldr q18, [x0, #384] // *............................................................................. - // ldr q5, [x0, #768] // ...*.......................................................................... - // ldr q21, [x0, #128] // ....*......................................................................... - // ldr q20, [x0, #0] // .....*........................................................................ - // ldr q29, [x0, #640] // ......*....................................................................... - // ldr q15, [x0, #512] // .......*...................................................................... - // sub v19.4S, v16.4S, v18.4S // ........*..................................................................... - // add v17.4S, v16.4S, v18.4S // .........*.................................................................... - // sub v23.4S, v20.4S, v21.4S // ............*................................................................. - // add v6.4S, v20.4S, v21.4S // .............*................................................................ - // add v28.4S, v5.4S, v12.4S // ...........*.................................................................. - // sub v21.4S, v5.4S, v12.4S // ..........*................................................................... - // sqrdmulh v5.4S, v19.4S, v2.S[1] // ................*............................................................. - // mul v20.4S, v19.4S, v2.S[0] // .................*............................................................ - // add v19.4S, v15.4S, v29.4S // ...............*.............................................................. - // sub v29.4S, v15.4S, v29.4S // ..............*............................................................... - // mul v15.4S, v21.4S, v3.S[0] // ..................*........................................................... 
- // mul v27.4S, v23.4S, v1.S[2] // .....................*........................................................ - // add v18.4S, v6.4S, v17.4S // ....................*......................................................... - // sqrdmulh v12.4S, v21.4S, v3.S[1] // ...................*.......................................................... - // add v10.4S, v19.4S, v28.4S // .........................*.................................................... - // sqrdmulh v23.4S, v23.4S, v1.S[3] // ......................*....................................................... - // mul v21.4S, v29.4S, v2.S[2] // .......................*...................................................... - // sqrdmulh v24.4S, v29.4S, v2.S[3] // ........................*..................................................... - // mls v20.4S, v5.4S, v8.S[0] // ............................*................................................. - // sub v6.4S, v6.4S, v17.4S // ..........................*................................................... - // sub v19.4S, v19.4S, v28.4S // ...........................*.................................................. - // sub v16.4S, v18.4S, v10.4S // ..............................*............................................... - // mls v15.4S, v12.4S, v8.S[0] // .............................*................................................ - // add v10.4S, v18.4S, v10.4S // ...............................*.............................................. - // mul v18.4S, v6.4S, v0.S[2] // ..................................*........................................... - // sqrdmulh v12.4S, v6.4S, v0.S[3] // ...................................*.......................................... - // mls v27.4S, v23.4S, v8.S[0] // ................................*............................................. - // mls v21.4S, v24.4S, v8.S[0] // .................................*............................................ 
- // mul v23.4S, v10.4S, v25.4S // ......................................*....................................... - // sqrdmulh v5.4S, v16.4S, v0.S[1] // ....................................*......................................... - // sqrdmulh v17.4S, v10.4S, v26.4S // .......................................*...................................... - // mul v10.4S, v16.4S, v0.S[0] // .....................................*........................................ - // mul v16.4S, v19.4S, v1.S[0] // ........................................*..................................... - // sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................*.................................... - // add v22.4S, v21.4S, v15.4S // .............................................*................................ - // add v24.4S, v27.4S, v20.4S // ..........................................*................................... - // mls v18.4S, v12.4S, v8.S[0] // ............................................*................................. - // mls v10.4S, v5.4S, v8.S[0] // ...............................................*.............................. - // mls v23.4S, v17.4S, v8.S[0] // ................................................*............................. - // sub v4.4S, v24.4S, v22.4S // ...................................................*.......................... - // add v12.4S, v24.4S, v22.4S // .................................................*............................ - // mls v16.4S, v28.4S, v8.S[0] // ..................................................*........................... - // sub v20.4S, v27.4S, v20.4S // ...........................................*.................................. - // cmge v17.4S, v23.4S, v30.4S // .............................................................*................ - // mul v24.4S, v12.4S, v25.4S // ........................................................*..................... 
- // sqrdmulh v22.4S, v4.4S, v0.S[1] // ..........................................................*................... - // mul v27.4S, v4.4S, v0.S[0] // ...........................................................*.................. - // sqrdmulh v28.4S, v12.4S, v26.4S // .........................................................*.................... - // sub v29.4S, v21.4S, v15.4S // ..............................................*............................... - // sub v21.4S, v18.4S, v16.4S // ............................................................*................. - // sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................................*......................... - // mul v15.4S, v20.4S, v0.S[2] // .....................................................*........................ - // mls v27.4S, v22.4S, v8.S[0] // .....................................................................*........ - // mul v4.4S, v29.4S, v1.S[0] // ......................................................*....................... - // mul v11.4S, v21.4S, v0.S[0] // ......................................................................*....... - // sqrdmulh v12.4S, v29.4S, v1.S[1] // .......................................................*...................... - // mls v24.4S, v28.4S, v8.S[0] // ....................................................................*......... - // sqrdmulh v5.4S, v21.4S, v0.S[1] // .......................................................................*...... - // cmge v19.4S, v31.4S, v23.4S // ..............................................................*............... - // mls v15.4S, v14.4S, v8.S[0] // .................................................................*............ - // cmge v29.4S, v31.4S, v10.4S // ...............................................................*.............. - // cmge v21.4S, v10.4S, v30.4S // ...................................................................*.......... 
- // add v9.4S, v18.4S, v16.4S // ................................................................*............. - // mls v4.4S, v12.4S, v8.S[0] // ..................................................................*........... - // cmge v28.4S, v31.4S, v27.4S // ...........................................................................*.. - // cmge v20.4S, v27.4S, v30.4S // ............................................................................*. - // cmge v22.4S, v24.4S, v30.4S // .............................................................................* - // sub v6.4S, v29.4S, v21.4S // .........................................................................*.... - // mls v11.4S, v5.4S, v8.S[0] // ..........................................................................*... - // sub v19.4S, v19.4S, v17.4S // ........................................................................*..... + // |------------------------|------------------------|------------------------|------------ + // ldr q14, [x0, #128] // *....................................................................................... + // ldr q21, [x0, #768] // ..*..................................................................................... + // ldr q16, [x0, #0] // .*...................................................................................... + // ldr q27, [x0, #512] // ....*................................................................................... + // ldr q20, [x0, #640] // .....*.................................................................................. + // ldr q5, [x0, #896] // ...*.................................................................................... + // ldr q11, [x0, #256] // ......*................................................................................. + // ldr q24, [x0, #384] // .......*................................................................................ 
+ // sub v18.4S, v16.4S, v14.4S // ........*............................................................................... + // add v16.4S, v16.4S, v14.4S // .........*.............................................................................. + // add v4.4S, v21.4S, v5.4S // ...........*............................................................................ + // add v7.4S, v27.4S, v20.4S // ............*........................................................................... + // sub v22.4S, v27.4S, v20.4S // .............*.......................................................................... + // sub v23.4S, v21.4S, v5.4S // ..........*............................................................................. + // sqrdmulh v27.4S, v18.4S, v1.S[3] // ................*....................................................................... + // mul v14.4S, v18.4S, v1.S[2] // .................*...................................................................... + // sub v21.4S, v11.4S, v24.4S // ...............*........................................................................ + // add v20.4S, v11.4S, v24.4S // ..............*......................................................................... + // sqrdmulh v5.4S, v22.4S, v2.S[3] // .....................*.................................................................. + // sqrdmulh v18.4S, v23.4S, v3.S[1] // ..................*..................................................................... + // mul v11.4S, v23.4S, v3.S[0] // ...................*.................................................................... + // sub v15.4S, v7.4S, v4.4S // ....................*................................................................... + // mul v10.4S, v22.4S, v2.S[2] // .........................*.............................................................. + // sub v22.4S, v16.4S, v20.4S // ......................*................................................................. 
+ // add v20.4S, v16.4S, v20.4S // .......................*................................................................ + // add v16.4S, v7.4S, v4.4S // ........................*............................................................... + // mul v7.4S, v15.4S, v1.S[0] // ............................*........................................................... + // sqrdmulh v4.4S, v21.4S, v2.S[1] // ..........................*............................................................. + // mul v19.4S, v21.4S, v2.S[0] // ...........................*............................................................ + // mul v29.4S, v22.4S, v0.S[2] // ...............................*........................................................ + // sqrdmulh v24.4S, v15.4S, v1.S[1] // .............................*.......................................................... + // sqrdmulh v15.4S, v22.4S, v0.S[3] // ................................*....................................................... + // add v22.4S, v20.4S, v16.4S // .................................*...................................................... + // mls v11.4S, v18.4S, v8.S[0] // ..............................*......................................................... + // sub v20.4S, v20.4S, v16.4S // ....................................*................................................... + // mls v14.4S, v27.4S, v8.S[0] // ..................................*..................................................... + // sqrdmulh v21.4S, v22.4S, v26.4S // .......................................*................................................ + // mls v10.4S, v5.4S, v8.S[0] // ...................................*.................................................... + // mls v19.4S, v4.4S, v8.S[0] // .....................................*.................................................. 
+ // mul v22.4S, v22.4S, v25.4S // ........................................*............................................... + // mls v29.4S, v15.4S, v8.S[0] // .........................................*.............................................. + // mul v16.4S, v20.4S, v0.S[0] // ..........................................*............................................. + // mls v7.4S, v24.4S, v8.S[0] // ......................................*................................................. + // sqrdmulh v23.4S, v20.4S, v0.S[1] // ...........................................*............................................ + // add v17.4S, v10.4S, v11.4S // ............................................*........................................... + // mls v22.4S, v21.4S, v8.S[0] // ................................................*....................................... + // add v20.4S, v14.4S, v19.4S // ..............................................*......................................... + // add v12.4S, v29.4S, v7.4S // .................................................*...................................... + // sub v27.4S, v10.4S, v11.4S // .............................................*.......................................... + // mls v16.4S, v23.4S, v8.S[0] // ...................................................*.................................... + // add v21.4S, v20.4S, v17.4S // .....................................................*.................................. + // sub v5.4S, v20.4S, v17.4S // ......................................................*................................. + // sub v20.4S, v29.4S, v7.4S // ..................................................*..................................... + // cmge v23.4S, v31.4S, v22.4S // ...........................................................*............................ + // sqrdmulh v17.4S, v12.4S, v26.4S // .........................................................*.............................. 
+ // mul v4.4S, v12.4S, v25.4S // ..........................................................*............................. + // cmge v10.4S, v22.4S, v30.4S // ............................................................*........................... + // cmge v12.4S, v31.4S, v16.4S // .............................................................*.......................... + // cmge v11.4S, v16.4S, v30.4S // ..............................................................*......................... + // sqrdmulh v18.4S, v21.4S, v26.4S // ...............................................................*........................ + // mul v6.4S, v5.4S, v0.S[0] // ................................................................*....................... + // sqrdmulh v24.4S, v20.4S, v0.S[1] // ....................................................................*................... + // sqrdmulh v7.4S, v5.4S, v0.S[1] // ..................................................................*..................... + // sub v10.4S, v23.4S, v10.4S // ...................................................................*.................... + // mls v4.4S, v17.4S, v8.S[0] // .......................................................................*................ + // mul v29.4S, v21.4S, v25.4S // .................................................................*...................... + // sqrdmulh v21.4S, v27.4S, v1.S[1] // ....................................................*................................... + // sub v14.4S, v14.4S, v19.4S // ...............................................*........................................ + // mul v19.4S, v20.4S, v0.S[0] // ......................................................................*................. + // sub v11.4S, v12.4S, v11.4S // ........................................................................*............... 
+ // mls v22.4S, v10.4S, v8.4S // .........................................................................*.............. + // mls v6.4S, v7.4S, v8.S[0] // .............................................................................*.......... + // sqrdmulh v20.4S, v14.4S, v0.S[3] // .......................................................*................................ + // mul v12.4S, v14.4S, v0.S[2] // ........................................................*............................... + // mul v9.4S, v27.4S, v1.S[0] // .....................................................................*.................. + // mls v29.4S, v18.4S, v8.S[0] // ............................................................................*........... + // cmge v7.4S, v31.4S, v4.4S // ................................................................................*....... + // cmge v15.4S, v4.4S, v30.4S // .................................................................................*...... + // mls v16.4S, v11.4S, v8.4S // ...........................................................................*............ + // str q22, [x0], #(16) // ..................................................................................*..... + // mls v19.4S, v24.4S, v8.S[0] // ...............................................................................*........ + // mls v9.4S, v21.4S, v8.S[0] // ..............................................................................*......... + // mls v12.4S, v20.4S, v8.S[0] // ..........................................................................*............. + // sub v22.4S, v7.4S, v15.4S // ....................................................................................*... + // cmge v28.4S, v29.4S, v30.4S // .....................................................................................*.. + // cmge v24.4S, v31.4S, v29.4S // ......................................................................................*. 
+ // str q16, [x0, #496] // ...................................................................................*.... + // cmge v13.4S, v6.4S, v30.4S // .......................................................................................* sub count, count, #1 layer123_start: // Instructions: 120 - // Expected cycles: 26 - // Expected IPC: 4.62 + // Expected cycles: 28 + // Expected IPC: 4.29 // - // Wall time: 966.20s - // User time: 966.20s + // Wall time: 603.23s + // User time: 603.23s // // -------------------------------------------------- original position --------------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|------------------- - ldr q16, [x0, #272] // ..e..................................................................................................................... - ldr q12, [x0, #912] // .......e................................................................................................................ - ldr q18, [x0, #400] // ...e.................................................................................................................... - cmge v7.4S, v31.4S, v24.4S // ........................................................................................................*............... + mls v4.4S, v22.4S, v8.4S // ...............................................................................................................*........ + ldr q14, [x0, #128] // .e...................................................................................................................... + ldr q21, [x0, #768] // ......e................................................................................................................. + ldr q16, [x0, #0] // e....................................................................................................................... 
+ cmge v11.4S, v31.4S, v6.4S // ........................................................................*............................................... + cmge v23.4S, v31.4S, v19.4S // ............................................................................*........................................... + cmge v15.4S, v19.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + ldr q27, [x0, #512] // ....e................................................................................................................... + ldr q20, [x0, #640] // .....e.................................................................................................................. + ldr q5, [x0, #896] // .......e................................................................................................................ + sub v10.4S, v12.4S, v9.4S // ...............................................................*........................................................ + add v18.4S, v12.4S, v9.4S // ................................................................*....................................................... + sub v28.4S, v24.4S, v28.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v9.4S, v11.4S, v13.4S // ..........................................................................*............................................. 
+ sub v22.4S, v23.4S, v15.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q11, [x0, #256] // ..e..................................................................................................................... + ldr q24, [x0, #384] // ...e.................................................................................................................... + mls v29.4S, v28.4S, v8.4S // ...........................................................................................................*............ + mul v13.4S, v18.4S, v25.4S // ..................................................................................................*..................... + str q4, [x0, #240] // ......................................................................................................................*. + sqrdmulh v17.4S, v10.4S, v0.S[1] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sqrdmulh v12.4S, v18.4S, v26.4S // .................................................................................................*...................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v22.4S, v8.4S // ...............................................................................*........................................ + sub v18.4S, v16.4S, v14.4S // ........e............................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v14.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v0.S[0] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + add v4.4S, v21.4S, v5.4S // ........................e............................................................................................... + add v7.4S, v27.4S, v20.4S // ...................e.................................................................................................... + sub v22.4S, v27.4S, v20.4S // ..................e..................................................................................................... 
+ sub v23.4S, v21.4S, v5.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v17.4S, v28.4S, v20.4S // ..........................................................................*............................................. - sqrdmulh v13.4S, v9.4S, v26.4S // ...............................................................................................*........................ - mul v9.4S, v9.4S, v25.4S // ..............................................................................................*......................... - ldr q5, [x0, #784] // ......e................................................................................................................. - mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - sub v28.4S, v15.4S, v4.4S // ...............................................................*........................................................ - add v6.4S, v15.4S, v4.4S // ................................................................*....................................................... - mls v23.4S, v19.4S, v8.4S // .......................................................................................................*................ - ldr q21, [x0, #144] // .e...................................................................................................................... 
- ldr q20, [x0, #16] // e....................................................................................................................... - mls v27.4S, v17.4S, v8.4S // ...........................................................................*............................................ - sub v17.4S, v7.4S, v22.4S // ..........................................................................................................*............. - cmge v4.4S, v11.4S, v30.4S // .............................................................................*.......................................... - cmge v14.4S, v31.4S, v11.4S // ............................................................................*........................................... + str q29, [x0, #112] // .....................................................................................................................*.. + sqrdmulh v27.4S, v18.4S, v1.S[3] // ..........e............................................................................................................. + mul v14.4S, v18.4S, v1.S[2] // ...........e............................................................................................................ + sub v21.4S, v11.4S, v24.4S // .............e.......................................................................................................... + add v20.4S, v11.4S, v24.4S // ..............e......................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q29, [x0, #656] // .....e.................................................................................................................. 
- ldr q15, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ + sqrdmulh v5.4S, v22.4S, v2.S[3] // ....................e................................................................................................... + sqrdmulh v18.4S, v23.4S, v3.S[1] // .........................e.............................................................................................. + mul v11.4S, v23.4S, v3.S[0] // ..........................e............................................................................................. + str q19, [x0, #752] // ......................................................................................*................................. + sub v15.4S, v7.4S, v4.4S // ......................................e................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v7.4S, v28.4S, v0.S[0] // .................................................................*...................................................... - mls v9.4S, v13.4S, v8.S[0] // ................................................................................................*....................... - sqrdmulh v22.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... 
- mul v13.4S, v6.4S, v25.4S // .................................................................................................*...................... - mls v24.4S, v17.4S, v8.4S // ...........................................................................................................*............ - sub v19.4S, v16.4S, v18.4S // .............e.......................................................................................................... - add v17.4S, v16.4S, v18.4S // ..............e......................................................................................................... // gap // ........................................................................................................................ + mul v10.4S, v22.4S, v2.S[2] // .....................e.................................................................................................. + sub v22.4S, v16.4S, v20.4S // ............................e........................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v16.4S, v6.4S, v26.4S // ..................................................................................................*..................... + add v20.4S, v16.4S, v20.4S // .............................e.......................................................................................... + add v16.4S, v7.4S, v4.4S // .......................................e................................................................................ 
+ mul v7.4S, v15.4S, v1.S[0] // .........................................e.............................................................................. + mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................*.................... + sqrdmulh v4.4S, v21.4S, v2.S[1] // ...............e........................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q23, [x0], #(16) // ....................................................................................................................*... - sub v23.4S, v20.4S, v21.4S // ........e............................................................................................................... - add v6.4S, v20.4S, v21.4S // .........e.............................................................................................................. - add v28.4S, v5.4S, v12.4S // ........................e............................................................................................... - sub v21.4S, v5.4S, v12.4S // .......................e................................................................................................ - sqrdmulh v5.4S, v19.4S, v2.S[1] // ................e....................................................................................................... - mul v20.4S, v19.4S, v2.S[0] // ...............e........................................................................................................ 
- add v19.4S, v15.4S, v29.4S // ...................e.................................................................................................... - str q27, [x0, #624] // .....................................................................................*.................................. // gap // ........................................................................................................................ + mul v19.4S, v21.4S, v2.S[0] // ................e....................................................................................................... + mul v29.4S, v22.4S, v0.S[2] // ...............................e........................................................................................ + sqrdmulh v24.4S, v15.4S, v1.S[1] // ........................................e............................................................................... + sqrdmulh v15.4S, v22.4S, v0.S[3] // ..............................e......................................................................................... // gap // ........................................................................................................................ - sub v29.4S, v15.4S, v29.4S // ..................e..................................................................................................... - str q10, [x0, #496] // ....................................................................................*................................... - str q24, [x0, #112] // .....................................................................................................................*.. - mul v15.4S, v21.4S, v3.S[0] // .........................e.............................................................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v27.4S, v23.4S, v1.S[2] // ..........e............................................................................................................. - add v18.4S, v6.4S, v17.4S // .............................e.......................................................................................... - sqrdmulh v12.4S, v21.4S, v3.S[1] // ..........................e............................................................................................. - add v10.4S, v19.4S, v28.4S // .......................................e................................................................................ - sqrdmulh v23.4S, v23.4S, v1.S[3] // ...........e............................................................................................................ + add v22.4S, v20.4S, v16.4S // .................................................e...................................................................... + mls v11.4S, v18.4S, v8.S[0] // ...........................e............................................................................................ + sub v20.4S, v20.4S, v16.4S // ................................................e....................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mul v21.4S, v29.4S, v2.S[2] // ....................e................................................................................................... - sqrdmulh v24.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + mls v14.4S, v27.4S, v8.S[0] // ............e........................................................................................................... + mls v28.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... + sqrdmulh v21.4S, v22.4S, v26.4S // ........................................................................................e............................... + mls v10.4S, v5.4S, v8.S[0] // ......................e................................................................................................. + mls v19.4S, v4.4S, v8.S[0] // .................e...................................................................................................... // gap // ........................................................................................................................ - mls v20.4S, v5.4S, v8.S[0] // .................e...................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v22.4S, v22.4S, v25.4S // .........................................................................................e.............................. 
// gap // ........................................................................................................................ - mls v13.4S, v16.4S, v8.S[0] // ...................................................................................................*.................... - cmge v29.4S, v9.4S, v30.4S // .............................................................................................................*.......... - sub v6.4S, v6.4S, v17.4S // ............................e........................................................................................... - sub v19.4S, v19.4S, v28.4S // ......................................e................................................................................. + mls v29.4S, v15.4S, v8.S[0] // ................................e....................................................................................... + mul v16.4S, v20.4S, v0.S[0] // ...................................................e.................................................................... + mls v7.4S, v24.4S, v8.S[0] // ..........................................e............................................................................. + sqrdmulh v23.4S, v20.4S, v0.S[1] // ..................................................e..................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sub v16.4S, v18.4S, v10.4S // ................................................e....................................................................... - mls v15.4S, v12.4S, v8.S[0] // ...........................e............................................................................................ - add v10.4S, v18.4S, v10.4S // .................................................e...................................................................... - mul v18.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... - sqrdmulh v12.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ - mls v27.4S, v23.4S, v8.S[0] // ............e........................................................................................................... + cmge v12.4S, v28.4S, v30.4S // .................................................................................*...................................... + cmge v4.4S, v31.4S, v13.4S // ................................................................................................................*....... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v24.4S, v8.S[0] // ......................e................................................................................................. 
- mul v23.4S, v10.4S, v25.4S // ........................................................................................e............................... + cmge v18.4S, v31.4S, v28.4S // ................................................................................*....................................... + mls v6.4S, v9.4S, v8.4S // ...........................................................................*............................................ + add v17.4S, v10.4S, v11.4S // ............................................e........................................................................... + mls v22.4S, v21.4S, v8.S[0] // ..........................................................................................e............................. // gap // ........................................................................................................................ - sqrdmulh v5.4S, v16.4S, v0.S[1] // ...................................................e.................................................................... // gap // ........................................................................................................................ - sqrdmulh v17.4S, v10.4S, v26.4S // .........................................................................................e.............................. // gap // ........................................................................................................................ - mul v10.4S, v16.4S, v0.S[0] // ..................................................e..................................................................... // gap // ........................................................................................................................ - cmge v6.4S, v31.4S, v9.4S // ............................................................................................................*........... 
- mls v7.4S, v22.4S, v8.S[0] // ...................................................................*.................................................... + add v20.4S, v14.4S, v19.4S // ..................................e..................................................................................... + cmge v24.4S, v13.4S, v30.4S // .................................................................................................................*...... + sub v15.4S, v18.4S, v12.4S // ..................................................................................*..................................... + add v12.4S, v29.4S, v7.4S // ...........................................................e............................................................ + sub v27.4S, v10.4S, v11.4S // ...........................................e............................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v16.4S, v19.4S, v1.S[0] // ........................................e............................................................................... - sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................e.............................................................................. - add v22.4S, v21.4S, v15.4S // ............................................e........................................................................... 
+ mls v16.4S, v23.4S, v8.S[0] // ....................................................e................................................................... + sub v9.4S, v4.4S, v24.4S // ..................................................................................................................*..... + str q6, [x0, #624] // .....................................................................................*.................................. + add v21.4S, v20.4S, v17.4S // ......................................................e................................................................. + sub v5.4S, v20.4S, v17.4S // .....................................................e.................................................................. // gap // ........................................................................................................................ - add v24.4S, v27.4S, v20.4S // ..................................e..................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v20.4S, v29.4S, v7.4S // ..........................................................e............................................................. + cmge v23.4S, v31.4S, v22.4S // ....................................................................................................e................... // gap // ........................................................................................................................ - cmge v19.4S, v13.4S, v30.4S // .................................................................................................................*...... 
- mls v18.4S, v12.4S, v8.S[0] // ................................e....................................................................................... - mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... // gap // ........................................................................................................................ + sqrdmulh v17.4S, v12.4S, v26.4S // ..............................................................................................e......................... // gap // ........................................................................................................................ + mul v4.4S, v12.4S, v25.4S // ...............................................................................................e........................ // gap // ........................................................................................................................ + cmge v10.4S, v22.4S, v30.4S // .....................................................................................................e.................. + cmge v12.4S, v31.4S, v16.4S // ....................................................................e................................................... + cmge v11.4S, v16.4S, v30.4S // .....................................................................e.................................................. + sqrdmulh v18.4S, v21.4S, v26.4S // ...........................................................................................e............................ + mul v6.4S, v5.4S, v0.S[0] // ........................................................e............................................................... // gap // ........................................................................................................................ 
- mls v23.4S, v17.4S, v8.S[0] // ..........................................................................................e............................. - sub v29.4S, v6.4S, v29.4S // ..............................................................................................................*......... - sub v17.4S, v14.4S, v4.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ - cmge v6.4S, v7.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ - sub v4.4S, v24.4S, v22.4S // .....................................................e.................................................................. // gap // ........................................................................................................................ + sqrdmulh v24.4S, v20.4S, v0.S[1] // ............................................................e........................................................... + sqrdmulh v7.4S, v5.4S, v0.S[1] // .......................................................e................................................................ + mls v28.4S, v15.4S, v8.4S // ...................................................................................*.................................... + sub v10.4S, v23.4S, v10.4S // ......................................................................................................e................. // gap // ........................................................................................................................ 
- add v12.4S, v24.4S, v22.4S // ......................................................e................................................................. - mls v16.4S, v28.4S, v8.S[0] // ..........................................e............................................................................. - cmge v5.4S, v31.4S, v13.4S // ................................................................................................................*....... - cmge v14.4S, v31.4S, v7.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v4.4S, v17.4S, v8.S[0] // ................................................................................................e....................... + mul v29.4S, v21.4S, v25.4S // ............................................................................................e........................... // gap // ........................................................................................................................ - sub v20.4S, v27.4S, v20.4S // .................................e...................................................................................... - mls v11.4S, v17.4S, v8.4S // ...............................................................................*........................................ - cmge v17.4S, v23.4S, v30.4S // .....................................................................................................e.................. 
- mul v24.4S, v12.4S, v25.4S // ...........................................................................................e............................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sqrdmulh v21.4S, v27.4S, v1.S[1] // .............................................e.......................................................................... + sub v14.4S, v14.4S, v19.4S // .................................e...................................................................................... + mul v19.4S, v20.4S, v0.S[0] // .............................................................e.......................................................... + mls v13.4S, v9.4S, v8.4S // ...................................................................................................................*.... + sub v11.4S, v12.4S, v11.4S // ......................................................................e................................................. // gap // ........................................................................................................................ - sqrdmulh v22.4S, v4.4S, v0.S[1] // ........................................................e............................................................... - mul v27.4S, v4.4S, v0.S[0] // .......................................................e................................................................ - sqrdmulh v28.4S, v12.4S, v26.4S // ............................................................................................e........................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v22.4S, v10.4S, v8.4S // .......................................................................................................e................ + mls v6.4S, v7.4S, v8.S[0] // .........................................................e.............................................................. + str q28, [x0, #880] // .......................................................................................*................................ // gap // ........................................................................................................................ - mls v9.4S, v29.4S, v8.4S // ...............................................................................................................*........ - sub v29.4S, v21.4S, v15.4S // ...........................................e............................................................................ - sub v21.4S, v18.4S, v16.4S // ..........................................................e............................................................. - sub v5.4S, v5.4S, v19.4S // ..................................................................................................................*..... - sub v19.4S, v14.4S, v6.4S // ..................................................................................*..................................... + sqrdmulh v20.4S, v14.4S, v0.S[3] // ...................................e.................................................................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v12.4S, v14.4S, v0.S[2] // ....................................e................................................................................... + mul v9.4S, v27.4S, v1.S[0] // ..............................................e......................................................................... + mls v29.4S, v18.4S, v8.S[0] // .............................................................................................e.......................... + cmge v7.4S, v31.4S, v4.4S // ............................................................................................................e........... + cmge v15.4S, v4.4S, v30.4S // .............................................................................................................e.......... + mls v16.4S, v11.4S, v8.4S // .......................................................................e................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................e................................................................................... - mul v15.4S, v20.4S, v0.S[2] // ...................................e.................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + str q22, [x0], #(16) // ....................................................................................................................e... + str q13, [x0, #352] // .......................................................................................................................* // gap // ........................................................................................................................ - mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. - mul v4.4S, v29.4S, v1.S[0] // .............................................e.......................................................................... - str q11, [x0, #752] // ......................................................................................*................................. - mul v11.4S, v21.4S, v0.S[0] // ............................................................e........................................................... - sqrdmulh v12.4S, v29.4S, v1.S[1] // ..............................................e......................................................................... // gap // ........................................................................................................................ - mls v13.4S, v5.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ - mls v24.4S, v28.4S, v8.S[0] // .............................................................................................e.......................... 
- sqrdmulh v5.4S, v21.4S, v0.S[1] // .............................................................e.......................................................... // gap // ........................................................................................................................ - mls v7.4S, v19.4S, v8.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ - cmge v19.4S, v31.4S, v23.4S // ....................................................................................................e................... + mls v19.4S, v24.4S, v8.S[0] // ..............................................................e......................................................... + mls v9.4S, v21.4S, v8.S[0] // ...............................................e........................................................................ + mls v12.4S, v20.4S, v8.S[0] // .....................................e.................................................................................. + sub v22.4S, v7.4S, v15.4S // ..............................................................................................................e......... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q9, [x0, #240] // ......................................................................................................................*. 
- mls v15.4S, v14.4S, v8.S[0] // .....................................e.................................................................................. - cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... - cmge v21.4S, v10.4S, v30.4S // .....................................................................e.................................................. - add v9.4S, v18.4S, v16.4S // ...........................................................e............................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v4.4S, v12.4S, v8.S[0] // ...............................................e........................................................................ - cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... - cmge v20.4S, v27.4S, v30.4S // .........................................................................e.............................................. - cmge v22.4S, v24.4S, v30.4S // .........................................................................................................e.............. - sub v6.4S, v29.4S, v21.4S // ......................................................................e................................................. 
- mls v11.4S, v5.4S, v8.S[0] // ..............................................................e......................................................... - str q7, [x0, #880] // .......................................................................................*................................ - str q13, [x0, #368] // .......................................................................................................................* - sub v19.4S, v19.4S, v17.4S // ......................................................................................................e................. // gap // ........................................................................................................................ + cmge v28.4S, v29.4S, v30.4S // .........................................................................................................e.............. + cmge v24.4S, v31.4S, v29.4S // ........................................................................................................e............... // gap // ........................................................................................................................ + str q16, [x0, #496] // ....................................................................................e................................... + cmge v13.4S, v6.4S, v30.4S // .........................................................................e.............................................. 
- // ---------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------> + // ------------------------------------------------------------------------------------------------------------ new position ------------------------------------------------------------------------------------------------------------> // 0 25 50 75 100 125 150 175 200 225 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- - // ldr q9, [x0, #0] // .............e..........................................................................................................'............~......................................................................................................... - // ldr q10, [x0, #(1*(1024/8))] // ............e...........................................................................................................'...........~.......................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // e.......................................................................................................................~...................................................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // ..e.....................................................................................................................'.~.................................................................................................................... 
- // ldr q13, [x0, #(4*(1024/8))] // ...................e....................................................................................................'..................~................................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // ..................e.....................................................................................................'.................~.................................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // .......e................................................................................................................'......~............................................................................................................... - // ldr q16, [x0, #(7*(1024/8))] // .e......................................................................................................................'~..................................................................................................................... - // sub v24.4s, v9.4s, v10.4s // .............................e..........................................................................................'............................~......................................................................................... - // add v9.4s, v9.4s, v10.4s // ..............................e.........................................................................................'.............................~........................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // .........................................e..............................................................................'........................................~............................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................e..........................................................................'............................................~......................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..........................................................e.............................................................'.........................................................~............................................................ - // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................'........................~............................................................................................. - // add v11.4s, v11.4s, v12.4s // ..........................e.............................................................................................'.........................~............................................................................................ - // mul v12.4s, v24.4s, v2.s[0] // ..................................e.....................................................................................'.................................~.................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................e......................................................................................'................................~..................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................................................e.......................................................................'...............................................~...................................................................... 
- // sub v24.4s, v13.4s, v14.4s // .....................................e..................................................................................'....................................~................................................................................. - // add v13.4s, v13.4s, v14.4s // ...................................e....................................................................................'..................................~................................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ..............................................e.........................................................................'.............................................~........................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................e........................................................................'..............................................~....................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ...........................................................e............................................................'..........................................................~........................................................... - // sub v24.4s, v15.4s, v16.4s // ................................e.......................................................................................'...............................~...................................................................................... - // add v15.4s, v15.4s, v16.4s // ...............................e........................................................................................'..............................~....................................................................................... 
- // mul v16.4s, v24.4s, v3.s[0] // ........................................e...............................................................................'.......................................~.............................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................e............................................................................'..........................................~........................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ......................................................e.................................................................'.....................................................~................................................................ - // sub v24.4s, v9.4s, v11.4s // ...................................................e....................................................................'..................................................~................................................................... - // add v9.4s, v9.4s, v11.4s // ..........................................e.............................................................................'.........................................~............................................................................ - // mul v11.4s, v24.4s, v0.s[2] // ........................................................e...............................................................'.......................................................~.............................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................e..............................................................'........................................................~............................................................. 
- // mls v11.4s, v24.4s, v8.s[0] // .......................................................................e................................................'......................................................................~............................................... - // sub v24.4s, v10.4s, v12.4s // ..................................................................................e.....................................'.................................................................................~.................................... - // add v10.4s, v10.4s, v12.4s // .....................................................................e..................................................'....................................................................~................................................. - // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................................e........................'..............................................................................................~....................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................e.........................'.............................................................................................~........................ - // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................e............'..........................................................................................................~........... - // sub v24.4s, v13.4s, v15.4s // ....................................................e...................................................................'...................................................~.................................................................. 
- // add v13.4s, v13.4s, v15.4s // ............................................e...........................................................................'...........................................~.......................................................................... - // mul v15.4s, v24.4s, v1.s[0] // ..................................................................e.....................................................'.................................................................~.................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................e....................................................'..................................................................~................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................e........................................'..............................................................................~....................................... - // sub v24.4s, v14.4s, v16.4s // ..........................................................................................e.............................'.........................................................................................~............................ - // add v14.4s, v14.4s, v16.4s // ....................................................................e...................................................'...................................................................~.................................................. - // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................e......................'................................................................................................~..................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................e...................'...................................................................................................~.................. - // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................e........'..............................................................................................................~....... - // sub v24.4s, v9.4s, v13.4s // .....................................................e..................................................................'....................................................~................................................................. - // add v9.4s, v9.4s, v13.4s // .......................................................e................................................................'......................................................~............................................................... - // mul v13.4s, v24.4s, v0.s[0] // ...............................................................e........................................................'..............................................................~....................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................e..........................................................'............................................................~......................................................... - // mls v13.4s, v24.4s, v8.s[0] // ........................................................................e...............................................'.......................................................................~.............................................. 
- // sub v24.4s, v10.4s, v14.4s // .............................................................................e..........................................'............................................................................~......................................... - // add v10.4s, v10.4s, v14.4s // ..............................................................................e.........................................'.............................................................................~........................................ - // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................e................................'......................................................................................~............................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................e.................................'.....................................................................................~................................ - // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................e.......................'...............................................................................................~...................... - // sub v24.4s, v11.4s, v15.4s // ...........................................................................................e............................'..........................................................................................~........................... - // add v11.4s, v11.4s, v15.4s // ..............................................................................................................e.........'.............................................................................................................~........ 
- // mul v15.4s, v24.4s, v0.s[0] // ...................................................................................................e....................'..................................................................................................~................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................e................'......................................................................................................~............... - // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................................e...'...................................................................................................................~.. - // sub v24.4s, v12.4s, v16.4s // .........~..............................................................................................................'........*............................................................................................................. - // add v12.4s, v12.4s, v16.4s // ..........~.............................................................................................................'.........*............................................................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ....................~...................................................................................................'...................*.................................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................~.................................................................................................'.....................*................................................................................................ 
- // mls v16.4s, v24.4s, v8.s[0] // .................................................................~......................................................'................................................................*..................................................... - // cmge v27.4s, v31.4s, v13.4s // ............................................................................................................e...........'...........................................................................................................~.......... - // cmge v28.4s, v13.4s, v30.4s // .............................................................................................................e..........'............................................................................................................~......... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................e....'..................................................................................................................~... - // mls v13.4s, v28.4s, v8.4s // ........~...............................................................................................................'.......*.............................................................................................................. - // cmge v27.4s, v31.4s, v14.4s // ................................................................................................................e.......'...............................................................................................................~...... - // cmge v28.4s, v14.4s, v30.4s // .................................................................................................................e......'................................................................................................................~..... 
- // sub v28.4s, v27.4s, v28.4s // ....~...................................................................................................................'...*.................................................................................................................. - // mls v14.4s, v28.4s, v8.4s // ..............~.........................................................................................................'.............*........................................................................................................ - // cmge v27.4s, v31.4s, v15.4s // .................~......................................................................................................'................*..................................................................................................... - // cmge v28.4s, v15.4s, v30.4s // ................~.......................................................................................................'...............*...................................................................................................... - // sub v28.4s, v27.4s, v28.4s // ...........................................................................~............................................'..........................................................................*........................................... - // mls v15.4s, v28.4s, v8.4s // ...................................................................................~....................................'..................................................................................*................................... - // cmge v27.4s, v31.4s, v16.4s // .................................................................................~......................................'................................................................................*..................................... 
- // cmge v28.4s, v16.4s, v30.4s // ............................................................................~...........................................'...........................................................................*.......................................... - // sub v28.4s, v27.4s, v28.4s // .............................................................................................~..........................'............................................................................................*......................... - // mls v16.4s, v28.4s, v8.4s // ........................................................................................................~...............'.......................................................................................................*.............. - // str q13, [x0, #(4*(1024/8))] // ......................................~.................................................................................'.....................................*................................................................................ - // str q14, [x0, #(5*(1024/8))] // ....................................~...................................................................................'...................................*.................................................................................. - // str q15, [x0, #(6*(1024/8))] // ..................................................................................................~.....................'.................................................................................................*.................... - // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................~..'....................................................................................................................*. 
- // mul v13.4s, v9.4s, v25.4s // ............................................................e...........................................................'...........................................................~.......................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................e.........................................................'.............................................................~........................................................ - // mls v13.4s, v9.4s, v8.s[0] // .........................................................................e..............................................'........................................................................~............................................. - // mul v14.4s, v10.4s, v25.4s // .....................................................................................e..................................'....................................................................................~................................. - // sqrdmulh v10.4s, v10.4s, v26.4s // ........................................................................................e...............................'.......................................................................................~.............................. - // mls v14.4s, v10.4s, v8.s[0] // ......................................................................................................e.................'.....................................................................................................~................ - // mul v15.4s, v11.4s, v25.4s // ......~.................................................................................................................'.....*................................................................................................................ 
- // sqrdmulh v11.4s, v11.4s, v26.4s // .....~..................................................................................................................'....*................................................................................................................. - // mls v15.4s, v11.4s, v8.s[0] // .....................~..................................................................................................'....................*................................................................................................. - // mul v16.4s, v12.4s, v25.4s // .......................~................................................................................................'......................*............................................................................................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ...........................~............................................................................................'..........................*........................................................................................... - // mls v16.4s, v12.4s, v8.s[0] // .................................................~......................................................................'................................................*..................................................................... - // cmge v27.4s, v31.4s, v13.4s // .........................................................................................................e..............'........................................................................................................~............. - // cmge v28.4s, v13.4s, v30.4s // ....................................................................................e...................................'...................................................................................~.................................. 
- // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................e'...................................................................................................................... - // mls v13.4s, v28.4s, v8.4s // ...........~............................................................................................................'..........*........................................................................................................... - // cmge v27.4s, v31.4s, v14.4s // ...~....................................................................................................................'..*................................................................................................................... - // cmge v28.4s, v14.4s, v30.4s // ..................................................................................................................e.....'.................................................................................................................~.... - // sub v28.4s, v27.4s, v28.4s // ...............~........................................................................................................'..............*....................................................................................................... - // mls v14.4s, v28.4s, v8.4s // ........................~...............................................................................................'.......................*.............................................................................................. - // cmge v27.4s, v31.4s, v15.4s // ................................................................~.......................................................'...............................................................*...................................................... 
- // cmge v28.4s, v15.4s, v30.4s // ..................................................~.....................................................................'.................................................*.................................................................... - // sub v28.4s, v27.4s, v28.4s // ..........................................................................~.............................................'.........................................................................*............................................ - // mls v15.4s, v28.4s, v8.4s // .........................................................................................~..............................'........................................................................................*............................. - // cmge v27.4s, v31.4s, v16.4s // ................................................................................~.......................................'...............................................................................*...................................... - // cmge v28.4s, v16.4s, v30.4s // ......................................................................~.................................................'.....................................................................*................................................ - // sub v28.4s, v27.4s, v28.4s // ............................................................................................~...........................'...........................................................................................*.......................... - // mls v16.4s, v28.4s, v8.4s // .....................................................................................................~..................'....................................................................................................*................. 
- // str q13, [x0], #(16) // ............................~...........................................................................................'...........................*.......................................................................................... - // str q14, [x0, #(-16 + 1*(1024/8))] // .......................................~................................................................................'......................................*............................................................................... - // str q15, [x0, #(-16 + 2*(1024/8))] // ..........................................................................................................~.............'.........................................................................................................*............ - // str q16, [x0, #(-16 + 3*(1024/8))] // ......................................................................................................................~.'.....................................................................................................................* + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----- + // ldr q9, [x0, #0] // ..e....................................................................................................................'..~............................................................................................................ + // ldr q10, [x0, #(1*(1024/8))] // e......................................................................................................................'~.............................................................................................................. 
+ // ldr q11, [x0, #(2*(1024/8))] // ..............e........................................................................................................'..............~................................................................................................ + // ldr q12, [x0, #(3*(1024/8))] // ...............e.......................................................................................................'...............~............................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ......e................................................................................................................'......~........................................................................................................ + // ldr q14, [x0, #(5*(1024/8))] // .......e...............................................................................................................'.......~....................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .e.....................................................................................................................'.~............................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // ........e..............................................................................................................'........~...................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ......................e................................................................................................'......................~........................................................................................ 
+ // add v9.4s, v9.4s, v10.4s // .......................e...............................................................................................'.......................~....................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..............................e........................................................................................'..............................~................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ...............................e.......................................................................................'...............................~............................................................................... + // mls v10.4s, v27.4s, v8.s[0] // .....................................................e.................................................................'.....................................................~......................................................... + // sub v24.4s, v11.4s, v12.4s // ................................e......................................................................................'................................~.............................................................................. + // add v11.4s, v11.4s, v12.4s // .................................e.....................................................................................'.................................~............................................................................. + // sqrdmulh v27.4s, v24.4s, v2.s[1] // .............................................e.........................................................................'.............................................~................................................................. 
+ // mul v12.4s, v24.4s, v2.s[0] // ..............................................e........................................................................'..............................................~................................................................ + // mls v12.4s, v27.4s, v8.s[0] // .........................................................e.............................................................'.........................................................~..................................................... + // sub v24.4s, v13.4s, v14.4s // ...........................e...........................................................................................'...........................~................................................................................... + // add v13.4s, v13.4s, v14.4s // ..........................e............................................................................................'..........................~.................................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ..................................e....................................................................................'..................................~............................................................................ + // mul v14.4s, v24.4s, v2.s[2] // .......................................e...............................................................................'.......................................~....................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ........................................................e..............................................................'........................................................~...................................................... 
+ // sub v24.4s, v15.4s, v16.4s // ............................e..........................................................................................'............................~.................................................................................. + // add v15.4s, v15.4s, v16.4s // .........................e.............................................................................................'.........................~..................................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ...................................e...................................................................................'...................................~........................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ....................................e..................................................................................'....................................~.......................................................................... + // mls v16.4s, v27.4s, v8.s[0] // ...................................................e...................................................................'...................................................~........................................................... + // sub v24.4s, v9.4s, v11.4s // ........................................e..............................................................................'........................................~...................................................................... + // add v9.4s, v9.4s, v11.4s // .........................................e.............................................................................'.........................................~..................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // .................................................e.....................................................................'.................................................~............................................................. + // mul v11.4s, v24.4s, v0.s[2] // ...............................................e.......................................................................'...............................................~............................................................... + // mls v11.4s, v27.4s, v8.s[0] // ...........................................................e...........................................................'...........................................................~................................................... + // sub v24.4s, v10.4s, v12.4s // ...............................................................................................e.......................'...............................................................................................~............... + // add v10.4s, v10.4s, v12.4s // .....................................................................e.................................................'.....................................................................~......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ......................................................................................................e................'......................................................................................................~........ + // mul v12.4s, v24.4s, v0.s[2] // .......................................................................................................e...............'.......................................................................................................~....... 
+ // mls v12.4s, v27.4s, v8.s[0] // .................................................................................................................e.....'............................................................................................................... + // sub v24.4s, v13.4s, v15.4s // ......................................e................................................................................'......................................~........................................................................ + // add v13.4s, v13.4s, v15.4s // ..........................................e............................................................................'..........................................~.................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................................................e......................................................................'................................................~.............................................................. + // mul v15.4s, v24.4s, v1.s[0] // ...........................................e...........................................................................'...........................................~................................................................... + // mls v15.4s, v27.4s, v8.s[0] // .............................................................e.........................................................'.............................................................~................................................. + // sub v24.4s, v14.4s, v16.4s // .........................................................................e.............................................'.........................................................................~..................................... 
+ // add v14.4s, v14.4s, v16.4s // ...................................................................e...................................................'...................................................................~........................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..............................................................................................e........................'..............................................................................................~................ + // mul v16.4s, v24.4s, v1.s[0] // ........................................................................................................e..............'........................................................................................................~...... + // mls v16.4s, v27.4s, v8.s[0] // ................................................................................................................e......'............................................................................................................... + // sub v24.4s, v9.4s, v13.4s // ....................................................e..................................................................'....................................................~.......................................................... + // add v9.4s, v9.4s, v13.4s // ..................................................e....................................................................'..................................................~............................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............................................................e........................................................'..............................................................~................................................ 
+ // mul v13.4s, v24.4s, v0.s[0] // ............................................................e..........................................................'............................................................~.................................................. + // mls v13.4s, v27.4s, v8.s[0] // ..........................................................................e............................................'..........................................................................~.................................... + // sub v24.4s, v10.4s, v14.4s // ..............................................................................e........................................'..............................................................................~................................ + // add v10.4s, v10.4s, v14.4s // .............................................................................e.........................................'.............................................................................~................................. + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................................................................................e.............................'.........................................................................................~..................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................e...............................'.......................................................................................~....................... + // mls v14.4s, v27.4s, v8.s[0] // ....................................................................................................e..................'....................................................................................................~.......... 
+ // sub v24.4s, v11.4s, v15.4s // ...............................................................................e.......................................'...............................................................................~............................... + // add v11.4s, v11.4s, v15.4s // ........................................................................e..............................................'........................................................................~...................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ........................................................................................e..............................'........................................................................................~...................... + // mul v15.4s, v24.4s, v0.s[0] // ................................................................................................e......................'................................................................................................~.............. + // mls v15.4s, v27.4s, v8.s[0] // ...............................................................................................................e.......'............................................................................................................... + // sub v24.4s, v12.4s, v16.4s // .........~.............................................................................................................'.........*..................................................................................................... + // add v12.4s, v12.4s, v16.4s // ..........~............................................................................................................'..........*.................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................~...................................................................................................'...................*........................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ........................~..............................................................................................'........................*...................................................................................... + // mls v16.4s, v27.4s, v8.s[0] // ......................................................~................................................................'......................................................*........................................................ + // cmge v27.4s, v31.4s, v13.4s // ....................................................................................e..................................'....................................................................................~.......................... + // cmge v28.4s, v13.4s, v30.4s // .....................................................................................e.................................'.....................................................................................~......................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................e....................'..................................................................................................~............ + // mls v13.4s, v28.4s, v8.4s // ............................................................................................................e..........'............................................................................................................~.. 
+ // cmge v27.4s, v31.4s, v14.4s // ...~...................................................................................................................'...*........................................................................................................... + // cmge v28.4s, v14.4s, v30.4s // ......................................................................................................................e'............................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............~..........................................................................................................'............*.................................................................................................. + // mls v14.4s, v28.4s, v8.4s // ..................................................................~....................................................'..................................................................*............................................ + // cmge v27.4s, v31.4s, v15.4s // ....~..................................................................................................................'....*.......................................................................................................... + // cmge v28.4s, v15.4s, v30.4s // .....~.................................................................................................................'.....*......................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .............~.........................................................................................................'.............*................................................................................................. 
+ // mls v15.4s, v28.4s, v8.4s // .....................~.................................................................................................'.....................*......................................................................................... + // cmge v27.4s, v31.4s, v16.4s // .................................................................~.....................................................'.................................................................*............................................. + // cmge v28.4s, v16.4s, v30.4s // ...............................................................~.......................................................'...............................................................*............................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................~...............................................'.......................................................................*....................................... + // mls v16.4s, v28.4s, v8.4s // ..........................................................................................~............................'..........................................................................................*.................... + // str q13, [x0, #(4*(1024/8))] // .....................................................................................................................e.'............................................................................................................... + // str q14, [x0, #(5*(1024/8))] // ............................................................................~..........................................'............................................................................*.................................. 
+ // str q15, [x0, #(6*(1024/8))] // .....................................~.................................................................................'.....................................*......................................................................... + // str q16, [x0, #(7*(1024/8))] // .....................................................................................................~.................'.....................................................................................................*......... + // sqrdmulh v27.4s, v9.4s, v26.4s // .......................................................e...............................................................'.......................................................~....................................................... + // mul v9.4s, v9.4s, v25.4s // ..........................................................e............................................................'..........................................................~.................................................... + // mls v9.4s, v27.4s, v8.s[0] // ....................................................................e..................................................'....................................................................~.......................................... + // sqrdmulh v27.4s, v10.4s, v26.4s // ......................................................................................e................................'......................................................................................~........................ + // mul v10.4s, v10.4s, v25.4s // .............................................................................................e.........................'.............................................................................................~................. 
+ // mls v10.4s, v27.4s, v8.s[0] // .........................................................................................................e.............'.........................................................................................................~..... + // sqrdmulh v27.4s, v11.4s, v26.4s // .................................................................................e.....................................'.................................................................................~............................. + // mul v11.4s, v11.4s, v25.4s // ..................................................................................e....................................'..................................................................................~............................ + // mls v11.4s, v27.4s, v8.s[0] // ............................................................................................e..........................'............................................................................................~.................. + // sqrdmulh v27.4s, v12.4s, v26.4s // ....................~..................................................................................................'....................*.......................................................................................... + // mul v12.4s, v12.4s, v25.4s // .................~.....................................................................................................'.................*............................................................................................. + // mls v12.4s, v27.4s, v8.s[0] // ............................................~..........................................................................'............................................*.................................................................. 
+ // cmge v27.4s, v31.4s, v9.4s // ................................................................................e......................................'................................................................................~.............................. + // cmge v28.4s, v9.4s, v30.4s // ...................................................................................e...................................'...................................................................................~........................... + // sub v28.4s, v27.4s, v28.4s // ...........................................................................................e...........................'...........................................................................................~................... + // mls v9.4s, v28.4s, v8.4s // ...................................................................................................e...................'...................................................................................................~........... + // cmge v27.4s, v31.4s, v10.4s // ....................................................................................................................e..'............................................................................................................... + // cmge v28.4s, v10.4s, v30.4s // ...................................................................................................................e...'............................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........~...........................................................................................................'...........*................................................................................................... 
+ // mls v10.4s, v28.4s, v8.4s // ................~......................................................................................................'................*.............................................................................................. + // cmge v27.4s, v31.4s, v11.4s // ..........................................................................................................e............'..........................................................................................................~.... + // cmge v28.4s, v11.4s, v30.4s // ...........................................................................................................e...........'...........................................................................................................~... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................e....'............................................................................................................... + // mls v11.4s, v28.4s, v8.4s // .......................................................................................................................*............................................................................................................... + // cmge v27.4s, v31.4s, v12.4s // ................................................................~......................................................'................................................................*.............................................. + // cmge v28.4s, v12.4s, v30.4s // ......................................................................~................................................'......................................................................*........................................ 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................................~...........................................'...........................................................................*................................... + // mls v12.4s, v28.4s, v8.4s // .................................................................................................~.....................'.................................................................................................*............. + // str q9, [x0], #(16) // .............................................................................................................e.........'.............................................................................................................~. + // str q10, [x0, #(-16 + 1*(1024/8))] // .............................~.........................................................................................'.............................*................................................................................. + // str q11, [x0, #(-16 + 2*(1024/8))] // ..................~....................................................................................................'..................*............................................................................................ 
+ // str q12, [x0, #(-16 + 3*(1024/8))] // ..............................................................................................................~........'..............................................................................................................* sub count, count, #1 cbnz count, layer123_start - // Instructions: 42 + // Instructions: 32 // Expected cycles: 16 - // Expected IPC: 2.62 + // Expected IPC: 2.00 // - // Wall time: 0.44s - // User time: 0.44s + // Wall time: 0.22s + // User time: 0.22s // - // ----------- original position -----------> + // ------ original position ------> // 0 25 - // |------------------------|---------------- - add v21.4S, v15.4S, v4.4S // ......*................................... - sqrdmulh v17.4S, v9.4S, v26.4S // ..*....................................... - mul v16.4S, v9.4S, v25.4S // ...*...................................... - sub v15.4S, v15.4S, v4.4S // .....*.................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - cmge v13.4S, v11.4S, v30.4S // ..........*............................... - cmge v7.4S, v31.4S, v11.4S // ...........*.............................. - cmge v5.4S, v31.4S, v24.4S // *......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mul v4.4S, v21.4S, v25.4S // ...............*.......................... - sqrdmulh v9.4S, v21.4S, v26.4S // .................*........................ - sqrdmulh v21.4S, v15.4S, v0.S[1] // ..............*........................... 
- mul v15.4S, v15.4S, v0.S[0] // ............*............................. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v10.4S, v6.4S, v8.4S // ....*..................................... - mls v23.4S, v19.4S, v8.4S // .......*.................................. - mls v16.4S, v17.4S, v8.S[0] // .............*............................ - sub v7.4S, v7.4S, v13.4S // ............................*............. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v18.4S, v5.4S, v22.4S // .........*................................ - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v15.4S, v21.4S, v8.S[0] // .........................*................ - sub v21.4S, v28.4S, v20.4S // .*........................................ - mls v11.4S, v7.4S, v8.4S // ................................*......... - mls v4.4S, v9.4S, v8.S[0] // ......................*................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v24.4S, v18.4S, v8.4S // ................*......................... - str q10, [x0, #512] // ....................*..................... 
- cmge v13.4S, v16.4S, v30.4S // .......................*.................. - cmge v10.4S, v31.4S, v16.4S // ........................*................. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v27.4S, v21.4S, v8.4S // ........*................................. - str q23, [x0], #(16) // ..................*....................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - str q11, [x0, #752] // ....................................*..... - cmge v9.4S, v4.4S, v30.4S // ..........................*............... - cmge v11.4S, v31.4S, v4.4S // ..............................*........... - cmge v7.4S, v15.4S, v30.4S // .............................*............ - cmge v23.4S, v31.4S, v15.4S // ...............................*.......... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - str q24, [x0, #112] // .....................*.................... - sub v24.4S, v10.4S, v13.4S // ...........................*.............. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - str q27, [x0, #624] // ...................*...................... - sub v11.4S, v11.4S, v9.4S // ..................................*....... 
- sub v27.4S, v23.4S, v7.4S // ...................................*...... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v16.4S, v24.4S, v8.4S // .................................*........ - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v15.4S, v27.4S, v8.4S // ......................................*... - mls v4.4S, v11.4S, v8.4S // .....................................*.... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - str q16, [x0, #240] // .......................................*.. - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... 
- // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - str q15, [x0, #880] // ........................................*. - str q4, [x0, #368] // .........................................* - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - - // ------------- new position --------------> + // |------------------------|------ + mls v4.4S, v22.4S, v8.4S // *............................... + sub v22.4S, v12.4S, v9.4S // ....*........................... + sub v10.4S, v24.4S, v28.4S // ......*......................... + add v28.4S, v12.4S, v9.4S // .....*.......................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + cmge v12.4S, v31.4S, v6.4S // .*.............................. + cmge v21.4S, v31.4S, v19.4S // ..*............................. + cmge v11.4S, v19.4S, v30.4S // ...*............................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v14.4S, v22.4S, v0.S[0] // ...............*................ + sqrdmulh v22.4S, v22.4S, v0.S[1] // ............*................... + mul v15.4S, v28.4S, v25.4S // ..........*..................... + sqrdmulh v28.4S, v28.4S, v26.4S // .............*.................. + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + sub v13.4S, v12.4S, v13.4S // .......*........................ + str q4, [x0, #240] // ...........*.................... + sub v4.4S, v21.4S, v11.4S // ........*....................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v29.4S, v10.4S, v8.4S // .........*...................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v6.4S, v13.4S, v8.4S // .......................*........ + mls v19.4S, v4.4S, v8.4S // ..............*................. + mls v14.4S, v22.4S, v8.S[0] // ...................*............ + mls v15.4S, v28.4S, v8.S[0] // ..................*............. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + str q29, [x0, #112] // ................*............... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + str q6, [x0, #624] // ...........................*.... + str q19, [x0, #752] // .................*.............. + cmge v22.4S, v14.4S, v30.4S // ....................*........... + cmge v19.4S, v31.4S, v14.4S // ......................*......... + cmge v28.4S, v31.4S, v15.4S // .....................*.......... + cmge v6.4S, v15.4S, v30.4S // ........................*....... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sub v19.4S, v19.4S, v22.4S // .........................*...... + sub v6.4S, v28.4S, v6.4S // ..........................*..... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v14.4S, v19.4S, v8.4S // ............................*... + mls v15.4S, v6.4S, v8.4S // .............................*.. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + str q14, [x0, #880] // ..............................*. + str q15, [x0, #368] // ...............................* + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + + // -------- new position ---------> // 0 25 - // |------------------------|---------------- - // cmge v7.4S, v31.4S, v24.4S // ......*................................... - // sub v17.4S, v28.4S, v20.4S // .................*........................ - // sqrdmulh v13.4S, v9.4S, v26.4S // .*........................................ - // mul v9.4S, v9.4S, v25.4S // ..*....................................... - // mls v10.4S, v6.4S, v8.4S // ...........*.............................. - // sub v28.4S, v15.4S, v4.4S // ...*...................................... - // add v6.4S, v15.4S, v4.4S // *......................................... - // mls v23.4S, v19.4S, v8.4S // ............*............................. - // mls v27.4S, v17.4S, v8.4S // ........................*................. 
- // sub v17.4S, v7.4S, v22.4S // ...............*.......................... - // cmge v4.4S, v11.4S, v30.4S // ....*..................................... - // cmge v14.4S, v31.4S, v11.4S // .....*.................................... - // mul v7.4S, v28.4S, v0.S[0] // ..........*............................... - // mls v9.4S, v13.4S, v8.S[0] // .............*............................ - // sqrdmulh v22.4S, v28.4S, v0.S[1] // .........*................................ - // mul v13.4S, v6.4S, v25.4S // .......*.................................. - // mls v24.4S, v17.4S, v8.4S // ....................*..................... - // sqrdmulh v16.4S, v6.4S, v26.4S // ........*................................. - // str q23, [x0], #(16) // .........................*................ - // str q27, [x0, #624] // .................................*........ - // str q10, [x0, #496] // .....................*.................... - // str q24, [x0, #112] // ...............................*.......... - // mls v13.4S, v16.4S, v8.S[0] // ...................*...................... - // cmge v29.4S, v9.4S, v30.4S // ......................*................... - // cmge v6.4S, v31.4S, v9.4S // .......................*.................. - // mls v7.4S, v22.4S, v8.S[0] // ................*......................... - // cmge v19.4S, v13.4S, v30.4S // ...........................*.............. - // sub v29.4S, v6.4S, v29.4S // ................................*......... - // sub v17.4S, v14.4S, v4.4S // ..............*........................... - // cmge v6.4S, v7.4S, v30.4S // .............................*............ - // cmge v5.4S, v31.4S, v13.4S // ............................*............. - // cmge v14.4S, v31.4S, v7.4S // ..............................*........... - // mls v11.4S, v17.4S, v8.4S // ..................*....................... - // mls v9.4S, v29.4S, v8.4S // ....................................*..... 
- // sub v5.4S, v5.4S, v19.4S // ..................................*....... - // sub v19.4S, v14.4S, v6.4S // ...................................*...... - // str q11, [x0, #752] // ..........................*............... - // mls v13.4S, v5.4S, v8.4S // ......................................*... - // mls v7.4S, v19.4S, v8.4S // .....................................*.... - // str q9, [x0, #240] // .......................................*.. - // str q7, [x0, #880] // ........................................*. - // str q13, [x0, #368] // .........................................* + // |------------------------|------ + // mls v4.4S, v22.4S, v8.4S // *............................... + // cmge v11.4S, v31.4S, v6.4S // ....*........................... + // cmge v23.4S, v31.4S, v19.4S // .....*.......................... + // cmge v15.4S, v19.4S, v30.4S // ......*......................... + // sub v10.4S, v12.4S, v9.4S // .*.............................. + // add v18.4S, v12.4S, v9.4S // ...*............................ + // sub v28.4S, v24.4S, v28.4S // ..*............................. + // sub v9.4S, v11.4S, v13.4S // ...........*.................... + // sub v22.4S, v23.4S, v15.4S // .............*.................. + // mls v29.4S, v28.4S, v8.4S // ..............*................. + // mul v13.4S, v18.4S, v25.4S // .........*...................... + // str q4, [x0, #240] // ............*................... + // sqrdmulh v17.4S, v10.4S, v0.S[1] // ........*....................... + // sqrdmulh v12.4S, v18.4S, v26.4S // ..........*..................... + // mls v19.4S, v22.4S, v8.4S // ................*............... + // mul v28.4S, v10.4S, v0.S[0] // .......*........................ + // str q29, [x0, #112] // ...................*............ + // str q19, [x0, #752] // .....................*.......... + // mls v13.4S, v12.4S, v8.S[0] // ..................*............. + // mls v28.4S, v17.4S, v8.S[0] // .................*.............. 
+ // cmge v12.4S, v28.4S, v30.4S // ......................*......... + // cmge v4.4S, v31.4S, v13.4S // ........................*....... + // cmge v18.4S, v31.4S, v28.4S // .......................*........ + // mls v6.4S, v9.4S, v8.4S // ...............*................ + // cmge v24.4S, v13.4S, v30.4S // .........................*...... + // sub v15.4S, v18.4S, v12.4S // ..........................*..... + // sub v9.4S, v4.4S, v24.4S // ...........................*.... + // str q6, [x0, #624] // ....................*........... + // mls v28.4S, v15.4S, v8.4S // ............................*... + // mls v13.4S, v9.4S, v8.4S // .............................*.. + // str q28, [x0, #880] // ..............................*. + // str q13, [x0, #368] // ...............................* pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s index d6caacdb..c3efa747 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, 
\src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 
+ 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -245,6 +231,12 @@ xtmp1 .req x11 restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. 
+ .data .p2align 4 roots: @@ -387,1006 +379,1014 @@ _intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: qform_root3_tw .req q7 .p2align 2 - // Instructions: 162 - // Expected cycles: 71 - // Expected IPC: 2.28 - // - // Wall time: 162.16s - // User time: 162.16s - // - // ----------------------------------------------------------------------- original position -----------------------------------------------------------------------> - // 0 25 50 75 100 125 150 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- - ldr q25, [x2, #16] // ......................*........................................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - ldr q22, [x2, #0] // ........................*......................................................................................................................................... - ldr q30, [x2, #32] // .........................*........................................................................................................................................ - ldr q9, [x2, #48] // ...............................*.................................................................................................................................. - // gap // .................................................................................................................................................................. 
- // gap // .................................................................................................................................................................. - ldr q23, [x1, #32] // ..*............................................................................................................................................................... - // gap // .................................................................................................................................................................. - ldr q10, [x1, #48] // ...*.............................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - ldr q24, [x1, #16] // .*................................................................................................................................................................ - ldr q20, [x1, #0] // *................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v3.4S, v22.4S, v25.4S // ......................................*........................................................................................................................... 
- ldr q0, [x4, #16] // ........................................................................*......................................................................................... - trn1 v31.4S, v22.4S, v25.4S // ....................................*............................................................................................................................. - ldr q1, [x5, #144] // ..................*............................................................................................................................................... - ldr q16, [x4], #64 // ............................................................................................*..................................................................... - trn2 v29.4S, v30.4S, v9.4S // ..........................................*....................................................................................................................... - // gap // .................................................................................................................................................................. - trn1 v28.4S, v30.4S, v9.4S // ............................................*..................................................................................................................... - ldr q6, [x4, #-16] // .....................................................................*............................................................................................ - trn2 v25.4S, v23.4S, v10.4S // .......*.......................................................................................................................................................... - // gap // .................................................................................................................................................................. 
- trn1 v21.4S, v23.4S, v10.4S // ........*......................................................................................................................................................... - trn2 v10.2D, v3.2D, v29.2D // ..............................................*................................................................................................................... - ldr q11, [x5, #96] // .........................................*........................................................................................................................ - trn2 v14.4S, v20.4S, v24.4S // ....*............................................................................................................................................................. - ldr q5, [x5, #160] // ...........*...................................................................................................................................................... - trn1 v17.2D, v3.2D, v29.2D // ................................................*................................................................................................................. - ldr q7, [x5, #176] // ................................*................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v26.2D, v31.2D, v28.2D // .................................................*................................................................................................................ - trn1 v12.2D, v14.2D, v25.2D // ................*................................................................................................................................................. 
- trn1 v2.2D, v31.2D, v28.2D // ....................................................*............................................................................................................. - ldr q31, [x5, #128] // .........*........................................................................................................................................................ - // gap // .................................................................................................................................................................. - ldr q23, [x5, #112] // ........................................*......................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sub v13.4S, v26.4S, v10.4S // .....................................................*............................................................................................................ - sub v19.4S, v2.4S, v17.4S // .........................................................*........................................................................................................ - ldr q29, [x5], #(12*16) // ...................................*.............................................................................................................................. - trn1 v28.4S, v20.4S, v24.4S // .....*............................................................................................................................................................ 
- ldr q15, [x5, #-128] // ......*........................................................................................................................................................... - ldr q27, [x5, #-144] // ..........*....................................................................................................................................................... - // gap // .................................................................................................................................................................. - mul v9.4S, v13.4S, v5.4S // ................................................................*................................................................................................. - sqrdmulh v20.4S, v13.4S, v7.4S // ............................................................*..................................................................................................... - sqrdmulh v1.4S, v19.4S, v1.4S // .................................................................*................................................................................................ - mul v24.4S, v19.4S, v31.4S // .............................................................*.................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v13.4S, v2.4S, v17.4S // ..........................................................................*....................................................................................... 
- trn1 v19.2D, v28.2D, v21.2D // .................*................................................................................................................................................ - // gap // .................................................................................................................................................................. - ldr q2, [x5, #-160] // ..............*................................................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v22.4S, v26.4S, v10.4S // ......................................................................*........................................................................................... - mls v9.4S, v20.4S, v8.S[0] // ...........................................................................*...................................................................................... - mls v24.4S, v1.4S, v8.S[0] // .........................................................................*........................................................................................ - sub v17.4S, v19.4S, v12.4S // .....................*............................................................................................................................................ - // gap // .................................................................................................................................................................. 
- // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v26.4S, v13.4S, v22.4S // ..............................................................................*................................................................................... - sub v10.4S, v13.4S, v22.4S // ...............................................................................*.................................................................................. - sqrdmulh v18.4S, v17.4S, v27.4S // ............................*..................................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v31.2D, v14.2D, v25.2D // ............*..................................................................................................................................................... - sub v4.4S, v24.4S, v9.4S // .................................................................................*................................................................................ 
- // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v27.4S, v17.4S, v2.4S // .............................*.................................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v30.4S, v10.4S, v23.4S // ..................................................................................*............................................................................... - mul v20.4S, v10.4S, v11.4S // ...................................................................................*.............................................................................. - mul v10.4S, v4.4S, v11.4S // .....................................................................................*............................................................................ - sqrdmulh v14.4S, v4.4S, v23.4S // ....................................................................................*............................................................................. - // gap // .................................................................................................................................................................. 
- // gap // .................................................................................................................................................................. - mls v27.4S, v18.4S, v8.S[0] // .......................................*.......................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v3.2D, v28.2D, v21.2D // .............*.................................................................................................................................................... - mls v20.4S, v30.4S, v8.S[0] // .........................................................................................*........................................................................ - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v22.4S, v24.4S, v9.4S // ................................................................................*................................................................................. - // gap // .................................................................................................................................................................. 
- ldr q17, [x5, #-112] // ...............*.................................................................................................................................................. - sub v4.4S, v3.4S, v31.4S // ...................*.............................................................................................................................................. - mls v10.4S, v14.4S, v8.S[0] // ...........................................................................................*...................................................................... - trn1 v28.4S, v26.4S, v22.4S // ........................................................................................*......................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v26.4S, v26.4S, v22.4S // ......................................................................................*........................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v25.4S, v4.4S, v15.4S // ..........................*....................................................................................................................................... 
- add v5.4S, v3.4S, v31.4S // ....................*............................................................................................................................................. - trn2 v1.4S, v20.4S, v10.4S // .................................................................................................*................................................................ - trn1 v18.4S, v20.4S, v10.4S // ..................................................................................................*............................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v7.4S, v4.4S, v17.4S // ...........................*...................................................................................................................................... - add v17.4S, v19.4S, v12.4S // .......................*.......................................................................................................................................... - trn2 v19.2D, v28.2D, v18.2D // ......................................................................................................*........................................................... 
- trn2 v14.2D, v26.2D, v1.2D // .....................................................................................................*............................................................ - // gap // .................................................................................................................................................................. - ldr q31, [x5, #-176] // ..............................*................................................................................................................................... - trn1 v30.2D, v26.2D, v1.2D // .......................................................................................................*.......................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sub v10.4S, v17.4S, v5.4S // .................................*................................................................................................................................ - add v22.4S, v19.4S, v14.4S // ...............................................................................................................*.................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. 
- mls v25.4S, v7.4S, v8.S[0] // .....................................*............................................................................................................................ - ldr q26, [x4, #-32] // ....................................................................*............................................................................................. - // gap // .................................................................................................................................................................. - trn1 v12.2D, v28.2D, v18.2D // ........................................................................................................*......................................................... - mul v11.4S, v10.4S, v29.4S // .............................................*.................................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v21.4S, v10.4S, v31.4S // ...........................................*...................................................................................................................... - add v20.4S, v17.4S, v5.4S // ..................................*............................................................................................................................... - // gap // .................................................................................................................................................................. 
- add v10.4S, v12.4S, v30.4S // .................................................................................................................*................................................ - // gap // .................................................................................................................................................................. - sub v4.4S, v27.4S, v25.4S // ...............................................*.................................................................................................................. - sub v2.4S, v12.4S, v30.4S // ...........................................................................................................*...................................................... - // gap // .................................................................................................................................................................. - add v17.4S, v27.4S, v25.4S // .......................................................*.......................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v1.4S, v4.4S, v29.4S // ..................................................*............................................................................................................... 
- sqrdmulh v28.4S, v4.4S, v31.4S // ...................................................*.............................................................................................................. - trn2 v15.4S, v20.4S, v17.4S // ...........................................................*...................................................................................................... - // gap // .................................................................................................................................................................. - sub v12.4S, v10.4S, v22.4S // .....................................................................................................................*............................................ - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v30.4S, v2.4S, v26.S[3] // ...................................................................................................................*.............................................. - // gap // .................................................................................................................................................................. - add v7.4S, v10.4S, v22.4S // ........................................................................................................................*......................................... - // gap // .................................................................................................................................................................. 
- // gap // .................................................................................................................................................................. - mls v1.4S, v28.4S, v8.S[0] // ........................................................*......................................................................................................... - mls v11.4S, v21.4S, v8.S[0] // ......................................................*........................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v5.4S, v12.4S, v0.S[0] // ...........................................................................................................................*...................................... - srshr v31.4S, v7.4S, #23 // ............................................................................................................................*..................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................................................................................................*................................... 
- sub v19.4S, v19.4S, v14.4S // ..........................................................................................................*....................................................... - trn1 v27.4S, v11.4S, v1.4S // ..............................................................*................................................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn1 v12.4S, v20.4S, v17.4S // ..........................................................*....................................................................................................... - mul v13.4S, v2.4S, v26.S[2] // ................................................................................................................*................................................. - trn2 v24.4S, v11.4S, v1.4S // ...............................................................*.................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sqrdmulh v17.4S, v19.4S, v6.S[1] // ..............................................................................................................*................................................... 
- // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - trn2 v4.2D, v12.2D, v27.2D // .......................................................................................*.......................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v18.4S, v19.4S, v6.S[0] // .............................................................................................................*.................................................... - trn2 v2.2D, v15.2D, v24.2D // ............................................................................*..................................................................................... - mls v13.4S, v30.4S, v8.S[0] // .........................................................................................................................*........................................ - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. 
- trn1 v14.2D, v15.2D, v24.2D // ...................................................................*.............................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sub v10.4S, v4.4S, v2.4S // ..........................................................................................*....................................................................... - trn1 v25.2D, v12.2D, v27.2D // ..................................................................*............................................................................................... - mls v18.4S, v17.4S, v8.S[0] // .......................................................................................................................*.......................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mls v5.4S, v28.4S, v8.S[0] // ..............................................................................................................................................*................... - // gap // .................................................................................................................................................................. 
- sub v20.4S, v25.4S, v14.4S // .......................................................................*.......................................................................................... - // gap // .................................................................................................................................................................. - mul v15.4S, v10.4S, v26.S[0] // ...................................................................................................*.............................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v30.4S, v25.4S, v14.4S // .............................................................................*.................................................................................... - sqrdmulh v12.4S, v10.4S, v26.S[1] // ................................................................................................*................................................................. - sqrdmulh v23.4S, v20.4S, v0.S[3] // ..............................................................................................*................................................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. 
- mul v1.4S, v20.4S, v0.S[2] // .............................................................................................*.................................................................... - // gap // .................................................................................................................................................................. - add v17.4S, v4.4S, v2.4S // ...............................................................................................*.................................................................. - // gap // .................................................................................................................................................................. - add v4.4S, v13.4S, v18.4S // ...............................................................................................................................*.................................. - sub v6.4S, v13.4S, v18.4S // .................................................................................................................................*................................ - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mls v7.4S, v31.4S, v8.4S // .....................................................................................................................................*............................ - // gap // .................................................................................................................................................................. 
- // gap // .................................................................................................................................................................. - mls v15.4S, v12.4S, v8.S[0] // ............................................................................................................*..................................................... - mls v1.4S, v23.4S, v8.S[0] // ....................................................................................................*............................................................. - sqrdmulh v24.4S, v6.4S, v0.S[1] // ........................................................................................................................................*......................... - sub v23.4S, v30.4S, v17.4S // ......................................................................................................................*........................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v10.4S, v6.4S, v0.S[0] // .......................................................................................................................................*.......................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. 
- srshr v28.4S, v4.4S, #23 // ...................................................................................................................................*.............................. - sqrdmulh v18.4S, v23.4S, v16.S[3] // ............................................................................................................................................*..................... - add v2.4S, v1.4S, v15.4S // ....................................................................................................................*............................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v21.4S, v23.4S, v16.S[2] // ..........................................................................................................................................*....................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - sub v12.4S, v1.4S, v15.4S // ................................................................................................................................*................................. - add v26.4S, v30.4S, v17.4S // .........................................................................................................*........................................................ 
- srshr v30.4S, v2.4S, #23 // ..........................................................................................................................*....................................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mul v0.4S, v12.4S, v16.S[2] // ......................................................................................................................................*........................... - sqrdmulh v13.4S, v12.4S, v16.S[3] // ....................................................................................................................................*............................. - mls v21.4S, v18.4S, v8.S[0] // .................................................................................................................................................*................ - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. 
- srshr v12.4S, v26.4S, #23 // ..................................................................................................................*............................................... - mls v4.4S, v28.4S, v8.4S // .........................................................................................................................................*........................ - mls v2.4S, v30.4S, v8.4S // ..................................................................................................................................*............................... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - mls v10.4S, v24.4S, v8.S[0] // .............................................................................................................................................*.................... - mls v0.4S, v13.4S, v8.S[0] // ...........................................................................................................................................*...................... - add v18.4S, v21.4S, v5.4S // ........................................................................................................................................................*......... 
- // gap // .................................................................................................................................................................. - mls v26.4S, v12.4S, v8.4S // .............................................................................................................................*.................................... - // gap // .................................................................................................................................................................. - sub v31.4S, v2.4S, v4.4S // ...............................................................................................................................................*.................. - // gap // .................................................................................................................................................................. - add v30.4S, v2.4S, v4.4S // ..................................................................................................................................................*............... - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - // gap // .................................................................................................................................................................. - add v3.4S, v0.4S, v10.4S // ...................................................................................................................................................*.............. 
- sub v12.4S, v0.4S, v10.4S // ....................................................................................................................................................*............. - sqrdmulh v13.4S, v31.4S, v16.S[1] // ......................................................................................................................................................*........... - str q30, [x1, #16] // .........................................................................................................................................................*........ - mul v25.4S, v31.4S, v16.S[0] // .......................................................................................................................................................*.......... - // gap // .................................................................................................................................................................. - add v10.4S, v26.4S, v7.4S // ................................................................................................................................................*................. - // gap // .................................................................................................................................................................. - sub v1.4S, v26.4S, v7.4S // ............................................................................................................................................................*..... - str q18, [x1, #32] // ...............................................................................................................................................................*.. - // gap // .................................................................................................................................................................. 
- sqrdmulh v27.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*....... - mul v11.4S, v12.4S, v16.S[0] // .............................................................................................................................................................*.... - str q3, [x1, #48] // ...........................................................................................................................................................*...... - str q10, [x1], #(16*4) // .....................................................................................................................................................*............ - mls v25.4S, v13.4S, v8.S[0] // ..............................................................................................................................................................*... - sub v23.4S, v21.4S, v5.4S // .................................................................................................................................................................* - add x1, x1, #64 // ................................................................................................................................................................*. - - // ------------------------------------------------------------------------- new position --------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- - // ldr q15, [x1, #0] // .......*.......................................................................................................................................................... 
- // ldr q20, [x1, #16] // ......*........................................................................................................................................................... - // ldr q19, [x1, #32] // ....*............................................................................................................................................................. - // ldr q13, [x1, #48] // .....*............................................................................................................................................................ - // trn2 v25.4S, v15.4S, v20.4S // ....................*............................................................................................................................................. - // trn1 v17.4S, v15.4S, v20.4S // ................................*................................................................................................................................. - // ldr q9, [x5, #64] // .................................*................................................................................................................................ - // trn2 v11.4S, v19.4S, v13.4S // ................*................................................................................................................................................. - // trn1 v31.4S, v19.4S, v13.4S // .................*................................................................................................................................................ - // ldr q21, [x5, #128] // ...........................*...................................................................................................................................... - // ldr q3, [x5, #48] // ..................................*............................................................................................................................... 
- // ldr q13, [x5, #160] // .....................*............................................................................................................................................ - // trn2 v26.2D, v25.2D, v11.2D // .................................................*................................................................................................................ - // trn2 v20.2D, v17.2D, v31.2D // .........................................................*........................................................................................................ - // ldr q24, [x5, #32] // .........................................*........................................................................................................................ - // ldr q23, [x5, #80] // ............................................................*..................................................................................................... - // trn1 v27.2D, v25.2D, v11.2D // .........................*........................................................................................................................................ - // trn1 v11.2D, v17.2D, v31.2D // ........................................*......................................................................................................................... - // ldr q16, [x5, #144] // ...........*...................................................................................................................................................... - // sub v18.4S, v20.4S, v26.4S // .............................................................*.................................................................................................... - // add v25.4S, v20.4S, v26.4S // ..................................................................*............................................................................................... 
- // sub v14.4S, v11.4S, v27.4S // .............................................*.................................................................................................................... - // ldr q26, [x2, #16] // *................................................................................................................................................................. - // add v20.4S, v11.4S, v27.4S // ......................................................................*........................................................................................... - // ldr q0, [x2, #0] // .*................................................................................................................................................................ - // ldr q17, [x2, #32] // ..*............................................................................................................................................................... - // mul v27.4S, v18.4S, v9.4S // .................................................................*................................................................................................ - // sqrdmulh v7.4S, v18.4S, v23.4S // .....................................................................*............................................................................................ - // sqrdmulh v11.4S, v14.4S, v3.4S // ................................................*................................................................................................................. - // mul v10.4S, v14.4S, v24.4S // ...................................................*.............................................................................................................. - // ldr q1, [x5, #16] // .........................................................................*........................................................................................ 
- // ldr q24, [x2, #48] // ...*.............................................................................................................................................................. - // ldr q23, [x5, #176] // .......................*.......................................................................................................................................... - // sub v18.4S, v20.4S, v25.4S // ...........................................................................*...................................................................................... - // add v3.4S, v20.4S, v25.4S // ..................................................................................*............................................................................... - // ldr q20, [x5], #(12*16) // ...............................*.................................................................................................................................. - // trn1 v15.4S, v0.4S, v26.4S // ..........*....................................................................................................................................................... - // mls v27.4S, v7.4S, v8.S[0] // .............................................................................*.................................................................................... - // trn2 v0.4S, v0.4S, v26.4S // ........*......................................................................................................................................................... - // mls v10.4S, v11.4S, v8.S[0] // ........................................................*......................................................................................................... - // ldr q14, [x5, #-80] // ............................*..................................................................................................................................... 
- // ldr q5, [x5, #-96] // ...................*.............................................................................................................................................. - // trn2 v7.4S, v17.4S, v24.4S // .............*.................................................................................................................................................... - // sqrdmulh v26.4S, v18.4S, v1.4S // .................................................................................*................................................................................ - // trn1 v25.4S, v17.4S, v24.4S // ..............*................................................................................................................................................... - // mul v11.4S, v18.4S, v20.4S // ................................................................................*................................................................................. - // trn2 v9.2D, v0.2D, v7.2D // ..................*............................................................................................................................................... - // sub v24.4S, v10.4S, v27.4S // ....................................................................................*............................................................................. - // trn1 v31.2D, v0.2D, v7.2D // ......................*........................................................................................................................................... - // trn2 v18.2D, v15.2D, v25.2D // ........................*......................................................................................................................................... - // mul v7.4S, v24.4S, v20.4S // .......................................................................................*.......................................................................... 
- // sqrdmulh v24.4S, v24.4S, v1.4S // ........................................................................................*......................................................................... - // trn1 v1.2D, v15.2D, v25.2D // ..........................*....................................................................................................................................... - // sub v25.4S, v18.4S, v9.4S // .............................*.................................................................................................................................... - // mls v11.4S, v26.4S, v8.S[0] // ..............................................................................................*................................................................... - // add v20.4S, v10.4S, v27.4S // ......................................................................................*........................................................................... - // mls v7.4S, v24.4S, v8.S[0] // .............................................................................................*.................................................................... - // sub v0.4S, v1.4S, v31.4S // ..............................*................................................................................................................................... - // trn1 v15.4S, v3.4S, v20.4S // ....................................................................................................*............................................................. - // trn2 v24.4S, v3.4S, v20.4S // .........................................................................................*........................................................................ - // sqrdmulh v3.4S, v25.4S, v23.4S // ....................................*............................................................................................................................. 
- // mul v23.4S, v0.4S, v21.4S // ......................................*........................................................................................................................... - // trn1 v20.4S, v11.4S, v7.4S // ...................................................................................................*.............................................................. - // trn2 v7.4S, v11.4S, v7.4S // ......................................................................................................*........................................................... - // mul v10.4S, v25.4S, v13.4S // ...................................*.............................................................................................................................. - // sqrdmulh v26.4S, v0.4S, v16.4S // .....................................*............................................................................................................................ - // trn1 v11.2D, v15.2D, v20.2D // ..............................................................................................................*................................................... - // trn1 v13.2D, v24.2D, v7.2D // ............................................................................................................*..................................................... - // ldr q17, [x4, #32] // ..............................................................................*................................................................................... - // ldr q25, [x4, #48] // ...............*.................................................................................................................................................. - // add v9.4S, v18.4S, v9.4S // ..........................................*....................................................................................................................... 
- // sub v27.4S, v11.4S, v13.4S // .................................................................................................................*................................................ - // ldr q0, [x4, #16] // .........*........................................................................................................................................................ - // mls v23.4S, v26.4S, v8.S[0] // ............................................*..................................................................................................................... - // add v21.4S, v1.4S, v31.4S // .......................................*.......................................................................................................................... - // mls v10.4S, v3.4S, v8.S[0] // ...........................................*...................................................................................................................... - // trn2 v2.2D, v24.2D, v7.2D // ..........................................................................................................*....................................................... - // add v18.4S, v11.4S, v13.4S // ...................................................................................................................*.............................................. - // add v1.4S, v21.4S, v9.4S // ..............................................*................................................................................................................... - // sub v7.4S, v21.4S, v9.4S // ...............................................*.................................................................................................................. - // add v11.4S, v23.4S, v10.4S // ...........................................................*...................................................................................................... 
- // sub v24.4S, v23.4S, v10.4S // ..................................................*............................................................................................................... - // sqrdmulh v13.4S, v7.4S, v14.4S // ....................................................*............................................................................................................. - // mul v23.4S, v7.4S, v5.4S // .....................................................*............................................................................................................ - // sqrdmulh v7.4S, v24.4S, v14.4S // .......................................................*.......................................................................................................... - // mul v24.4S, v24.4S, v5.4S // ......................................................*........................................................................................................... - // trn2 v3.4S, v1.4S, v11.4S // ................................................................*................................................................................................. - // trn2 v9.2D, v15.2D, v20.2D // ........................................................................................................*......................................................... - // trn1 v20.4S, v1.4S, v11.4S // ...............................................................*.................................................................................................. - // mls v23.4S, v13.4S, v8.S[0] // ..........................................................*....................................................................................................... - // sub v11.4S, v9.4S, v2.4S // .............................................................................................................*.................................................... 
- // mls v24.4S, v7.4S, v8.S[0] // ..............................................................*................................................................................................... - // ldr q16, [x4], #64 // ............*..................................................................................................................................................... - // mul v10.4S, v27.4S, v0.S[2] // ......................................................................................................................*........................................... - // sqrdmulh v7.4S, v27.4S, v0.S[3] // .....................................................................................................................*............................................ - // add v9.4S, v9.4S, v2.4S // .......................................................................................................................*.......................................... - // sqrdmulh v27.4S, v11.4S, v17.S[1] // ....................................................................................................................*............................................. - // trn2 v1.4S, v23.4S, v24.4S // ...................................................................*.............................................................................................. - // trn1 v21.4S, v23.4S, v24.4S // ....................................................................*............................................................................................. - // mul v24.4S, v11.4S, v17.S[0] // ..................................................................................................................*............................................... - // mls v10.4S, v7.4S, v8.S[0] // ............................................................................................................................*..................................... 
- // trn2 v7.2D, v3.2D, v1.2D // ........................................................................*......................................................................................... - // trn2 v13.2D, v20.2D, v21.2D // .......................................................................*.......................................................................................... - // trn1 v26.2D, v3.2D, v1.2D // ..........................................................................*....................................................................................... - // trn1 v19.2D, v20.2D, v21.2D // ...............................................................................*.................................................................................. - // add v3.4S, v18.4S, v9.4S // .....................................................................................................................................*............................ - // sub v23.4S, v13.4S, v7.4S // ..................................................................................................*............................................................... - // sub v21.4S, v19.4S, v26.4S // .....................................................................................*............................................................................ - // mls v24.4S, v27.4S, v8.S[0] // ...........................................................................................................................*...................................... - // mul v11.4S, v23.4S, v25.S[0] // .........................................................................................................*........................................................ - // sqrdmulh v27.4S, v23.4S, v25.S[1] // .......................................................................................................*.......................................................... 
- // add v7.4S, v13.4S, v7.4S // ............................................................................*..................................................................................... - // mul v1.4S, v21.4S, v17.S[2] // .....................................................................................................*............................................................ - // add v20.4S, v19.4S, v26.4S // ...................................................................................*.............................................................................. - // srshr v13.4S, v3.4S, #23 // ..........................................................................................................................................*....................... - // sqrdmulh v21.4S, v21.4S, v17.S[3] // ...........................................................................................*...................................................................... - // add v15.4S, v10.4S, v24.4S // ..................................................................................................................................*............................... - // sub v25.4S, v20.4S, v7.4S // ..........................................................................................*....................................................................... - // sub v23.4S, v18.4S, v9.4S // ..............................................................................................................................*................................... - // mls v11.4S, v27.4S, v8.S[0] // ...............................................................................................................*.................................................. - // add v20.4S, v20.4S, v7.4S // ............................................................................................*..................................................................... 
- // mls v1.4S, v21.4S, v8.S[0] // ...........................................................................................................*...................................................... - // srshr v27.4S, v15.4S, #23 // ......................................................................................................................................*........................... - // mul v26.4S, v25.4S, v0.S[0] // ...............................................................................................*.................................................................. - // srshr v7.4S, v20.4S, #23 // ................................................................................................*................................................................. - // mls v3.4S, v13.4S, v8.4S // ................................................................................................................................................*................. - // sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................*................................................................ - // add v9.4S, v1.4S, v11.4S // ........................................................................................................................*......................................... - // sub v24.4S, v10.4S, v24.4S // ....................................................................................................................................*............................. - // sub v1.4S, v1.4S, v11.4S // .........................................................................................................................*........................................ - // mls v15.4S, v27.4S, v8.4S // ............................................................................................................................................*..................... 
- // srshr v27.4S, v9.4S, #23 // ................................................................................................................................*................................. - // sqrdmulh v11.4S, v24.4S, v16.S[3] // ........................................................................................................................................*......................... - // mls v20.4S, v7.4S, v8.4S // ..........................................................................................................................*....................................... - // mul v13.4S, v24.4S, v16.S[2] // .......................................................................................................................................*.......................... - // mul v24.4S, v1.4S, v0.S[0] // ...............................................................................................................................*.................................. - // sqrdmulh v7.4S, v1.4S, v0.S[1] // .............................................................................................................................*.................................... - // mls v9.4S, v27.4S, v8.4S // ...........................................................................................................................................*...................... - // mul v10.4S, v23.4S, v16.S[2] // ...................................................................................................................................*.............................. - // mls v13.4S, v11.4S, v8.S[0] // ..............................................................................................................................................*................... - // sqrdmulh v23.4S, v23.4S, v16.S[3] // .................................................................................................................................*................................ 
- // mls v24.4S, v7.4S, v8.S[0] // .............................................................................................................................................*.................... - // mls v26.4S, v25.4S, v8.S[0] // ................................................................................................................*................................................. - // sub v11.4S, v15.4S, v9.4S // .................................................................................................................................................*................ - // add v27.4S, v3.4S, v20.4S // ........................................................................................................................................................*......... - // mls v10.4S, v23.4S, v8.S[0] // .........................................................................................................................................*........................ - // add v9.4S, v15.4S, v9.4S // ..................................................................................................................................................*............... - // add v23.4S, v13.4S, v24.4S // ...................................................................................................................................................*.............. - // sub v24.4S, v13.4S, v24.4S // ....................................................................................................................................................*............. - // str q27, [x1], #(16*4) // ..............................................................................................................................................................*... - // sqrdmulh v7.4S, v11.4S, v16.S[1] // .....................................................................................................................................................*............ 
- // mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................*.......... - // add v22.4S, v10.4S, v26.4S // ...............................................................................................................................................*.................. - // str q9, [x1, #-48] // ......................................................................................................................................................*........... - // sqrdmulh v27.4S, v24.4S, v16.S[1] // ...........................................................................................................................................................*...... - // str q23, [x1, #-16] // .............................................................................................................................................................*.... - // sub v1.4S, v3.4S, v20.4S // .........................................................................................................................................................*........ - // mul v11.4S, v24.4S, v16.S[0] // ............................................................................................................................................................*..... - // mls v25.4S, v7.4S, v8.S[0] // ...............................................................................................................................................................*.. - // str q22, [x1, #-32] // ..........................................................................................................................................................*....... 
- // add x1, x1, #64 // .................................................................................................................................................................* - // sub v23.4S, v10.4S, v26.4S // ................................................................................................................................................................*. + // Instructions: 41 + // Expected cycles: 15 + // Expected IPC: 2.73 + // + // Wall time: 1.78s + // User time: 1.78s + // + // ---------- original position -----------> + // 0 25 + // |------------------------|--------------- + ldr q0, [x1, #48] // .......*................................. + // gap // ......................................... + // gap // ......................................... + ldr q9, [x1, #32] // ......*.................................. + ldr q10, [x1, #16] // .........*............................... + // gap // ......................................... + // gap // ......................................... + ldr q3, [x1, #0] // ........*................................ + // gap // ......................................... + ldr q16, [x5, #64] // ....*.................................... + ldr q20, [x5, #16] // *........................................ + // gap // ......................................... + ldr q23, [x5, #80] // ...............*......................... + // gap // ......................................... + ldr q13, [x5, #128] // ..........*.............................. + // gap // ......................................... + ldr q18, [x5, #48] // .*....................................... + trn2 v17.4S, v9.4S, v0.4S // .............*........................... + ldr q5, [x5, #176] // ..*...................................... + trn1 v14.4S, v9.4S, v0.4S // ............*............................ + trn1 v31.4S, v3.4S, v10.4S // ..............*.......................... + trn2 v10.4S, v3.4S, v10.4S // ................*........................ 
+ ldr q27, [x5, #32] // ...........*............................. + // gap // ......................................... + // gap // ......................................... + ldr q22, [x2, #32] // ...............................*......... + // gap // ......................................... + ldr q12, [x5, #160] // ....................*.................... + trn2 v29.2D, v31.2D, v14.2D // .................*....................... + trn2 v19.2D, v10.2D, v17.2D // ..................*...................... + // gap // ......................................... + // gap // ......................................... + trn1 v3.2D, v10.2D, v17.2D // ...................*..................... + trn1 v10.2D, v31.2D, v14.2D // .....................*................... + ldr q26, [x5, #112] // .................................*....... + ldr q0, [x2, #48] // ..................................*...... + // gap // ......................................... + sub v9.4S, v29.4S, v19.4S // ......................*.................. + add v17.4S, v29.4S, v19.4S // ...................................*..... + ldr q31, [x2, #0] // ...........................*............. + add v21.4S, v10.4S, v3.4S // .........................*............... + sub v14.4S, v10.4S, v3.4S // ........................*................ + // gap // ......................................... + ldr q3, [x2, #16] // ..........................*.............. + sqrdmulh v23.4S, v9.4S, v23.4S // ............................*............ + // gap // ......................................... + mul v9.4S, v9.4S, v16.4S // .............................*........... + // gap // ......................................... + sqrdmulh v18.4S, v14.4S, v18.4S // ................................*........ + sub v10.4S, v21.4S, v17.4S // ........................................* + ldr q24, [x5, #144] // ...*..................................... + ldr q30, [x5, #96] // .....*................................... 
+ mul v1.4S, v14.4S, v27.4S // ....................................*.... + add v16.4S, v21.4S, v17.4S // .......................................*. + ldr q4, [x4], #64 // .......................*................. + ldr q7, [x5], #(12*16) // ..............................*.......... + // gap // ......................................... + // gap // ......................................... + mls v9.4S, v23.4S, v8.S[0] // .....................................*... + trn2 v29.4S, v31.4S, v3.4S // ......................................*.. + + // ------------- new position -------------> + // 0 25 + // |------------------------|--------------- + // ldr q20, [x5, #16] // .....*................................... + // ldr q28, [x5, #48] // ........*................................ + // ldr q5, [x5, #176] // ..........*.............................. + // ldr q24, [x5, #144] // .................................*....... + // ldr q22, [x5, #64] // ....*.................................... + // ldr q30, [x5, #96] // ..................................*...... + // ldr q2, [x1, #32] // .*....................................... + // ldr q26, [x1, #48] // *........................................ + // ldr q29, [x1, #0] // ...*..................................... + // ldr q11, [x1, #16] // ..*...................................... + // ldr q13, [x5, #128] // .......*................................. + // ldr q16, [x5, #32] // ..............*.......................... + // trn1 v4.4S, v2.4S, v26.4S // ...........*............................. + // trn2 v18.4S, v2.4S, v26.4S // .........*............................... + // trn1 v3.4S, v29.4S, v11.4S // ............*............................ + // ldr q21, [x5, #80] // ......*.................................. + // trn2 v31.4S, v29.4S, v11.4S // .............*........................... + // trn2 v2.2D, v3.2D, v4.2D // .................*....................... 
+ // trn2 v15.2D, v31.2D, v18.2D // ..................*...................... + // trn1 v25.2D, v31.2D, v18.2D // ...................*..................... + // ldr q12, [x5, #160] // ................*........................ + // trn1 v10.2D, v3.2D, v4.2D // ....................*.................... + // sub v19.4S, v2.4S, v15.4S // .......................*................. + // ldr q4, [x4], #64 // .....................................*... + // sub v17.4S, v10.4S, v25.4S // ...........................*............. + // add v27.4S, v10.4S, v25.4S // ..........................*.............. + // ldr q3, [x2, #16] // ............................*............ + // ldr q31, [x2, #0] // .........................*............... + // sqrdmulh v25.4S, v19.4S, v21.4S // .............................*........... + // mul v9.4S, v19.4S, v22.4S // ..............................*.......... + // ldr q7, [x5], #(12*16) // ......................................*.. + // ldr q22, [x2, #32] // ...............*......................... + // sqrdmulh v18.4S, v17.4S, v28.4S // ...............................*......... + // ldr q26, [x5, #-80] // .....................*................... + // ldr q0, [x2, #48] // ......................*.................. + // add v6.4S, v2.4S, v15.4S // ........................*................ + // mul v1.4S, v17.4S, v16.4S // ...................................*..... + // mls v9.4S, v25.4S, v8.S[0] // .......................................*. + // trn2 v29.4S, v31.4S, v3.4S // ........................................* + // add v16.4S, v27.4S, v6.4S // ....................................*.... + // sub v10.4S, v27.4S, v6.4S // ................................*........ 
sub count, count, #1 layer45678_start: // Instructions: 174 - // Expected cycles: 72 - // Expected IPC: 2.42 + // Expected cycles: 74 + // Expected IPC: 2.35 // - // Wall time: 2084.65s - // User time: 2084.65s + // Wall time: 741.15s + // User time: 741.15s // // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> // 0 25 50 75 100 125 150 // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- - sqrdmulh v9.4S, v1.4S, v16.S[1] // ...................................................................................................................................................*.......................... - ldr q15, [x1, #0] // e............................................................................................................................................................................. - mul v1.4S, v1.4S, v16.S[0] // ..................................................................................................................................................*........................... - ldr q20, [x1, #16] // .e............................................................................................................................................................................ - sqrdmulh v24.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ - // gap // .............................................................................................................................................................................. 
- ldr q19, [x1, #32] // ..e........................................................................................................................................................................... - ldr q13, [x1, #48] // ...e.......................................................................................................................................................................... - // gap // .............................................................................................................................................................................. - str q25, [x2, #16] // .........................................................................................................................................................................*.... - mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... - mul v7.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - // gap // .............................................................................................................................................................................. - mls v1.4S, v9.4S, v8.S[0] // ....................................................................................................................................................*......................... 
- trn2 v25.4S, v15.4S, v20.4S // .....e........................................................................................................................................................................ - trn1 v17.4S, v15.4S, v20.4S // ....e......................................................................................................................................................................... - // gap // .............................................................................................................................................................................. - ldr q9, [x5, #64] // ............................e................................................................................................................................................. - str q11, [x2, #48] // ...........................................................................................................................................................................*.. - trn2 v11.4S, v19.4S, v13.4S // .......e...................................................................................................................................................................... - trn1 v31.4S, v19.4S, v13.4S // ......e....................................................................................................................................................................... - ldr q21, [x5, #128] // ....................................................e......................................................................................................................... - mls v7.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... 
- // gap // .............................................................................................................................................................................. - ldr q3, [x5, #48] // ...........................e.................................................................................................................................................. - ldr q13, [x5, #160] // ......................................................e....................................................................................................................... - trn2 v26.2D, v25.2D, v11.2D // .........e.................................................................................................................................................................... - trn2 v20.2D, v17.2D, v31.2D // ........e..................................................................................................................................................................... - ldr q24, [x5, #32] // ..........................e................................................................................................................................................... - ldr q23, [x5, #80] // .............................e................................................................................................................................................ - trn1 v27.2D, v25.2D, v11.2D // ...........e.................................................................................................................................................................. - trn1 v11.2D, v17.2D, v31.2D // ..........e................................................................................................................................................................... 
- str q1, [x2], #(16*4) // ........................................................................................................................................................................*..... - ldr q16, [x5, #144] // .....................................................e........................................................................................................................ - str q7, [x2, #-32] // ..........................................................................................................................................................................*... - add x2, x2, #64 // .............................................................................................................................................................................* - sub v18.4S, v20.4S, v26.4S // ...................................e.......................................................................................................................................... - add v25.4S, v20.4S, v26.4S // ....................................e......................................................................................................................................... - sub v14.4S, v11.4S, v27.4S // ..............................e............................................................................................................................................... - ldr q26, [x2, #16] // .............e................................................................................................................................................................ - add v20.4S, v11.4S, v27.4S // ...............................e.............................................................................................................................................. 
- ldr q0, [x2, #0] // ............e................................................................................................................................................................. - ldr q17, [x2, #32] // ..............e............................................................................................................................................................... - mul v27.4S, v18.4S, v9.4S // .....................................e........................................................................................................................................ - sqrdmulh v7.4S, v18.4S, v23.4S // ......................................e....................................................................................................................................... + mls v1.4S, v18.4S, v8.S[0] // ..................................*........................................................................................................................................... + trn2 v6.4S, v22.4S, v0.4S // ...................*.......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v18.4S, v22.4S, v0.4S // ..................*........................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sqrdmulh v21.4S, v10.4S, v20.4S // ..........................................*................................................................................................................................... + trn1 v31.4S, v31.4S, v3.4S // ................*............................................................................................................................................................. + trn1 v19.2D, v29.2D, v6.2D // .......................*...................................................................................................................................................... + // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v11.4S, v14.4S, v3.4S // .................................e............................................................................................................................................ - mul v10.4S, v14.4S, v24.4S // ................................e............................................................................................................................................. - ldr q1, [x5, #16] // .........................e.................................................................................................................................................... - ldr q24, [x2, #48] // ...............e.............................................................................................................................................................. 
- ldr q23, [x5, #176] // .......................................................e...................................................................................................................... - sub v18.4S, v20.4S, v25.4S // ........................................e..................................................................................................................................... - add v3.4S, v20.4S, v25.4S // .........................................e.................................................................................................................................... - ldr q20, [x5], #(12*16) // ........................e..................................................................................................................................................... - trn1 v15.4S, v0.4S, v26.4S // ................e............................................................................................................................................................. + mul v0.4S, v10.4S, v7.4S // ...........................................*.................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v27.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... 
- trn2 v0.4S, v0.4S, v26.4S // .................e............................................................................................................................................................ - mls v10.4S, v11.4S, v8.S[0] // ..................................e........................................................................................................................................... + sub v23.4S, v1.4S, v9.4S // .............................................*................................................................................................................................ + trn1 v10.2D, v31.2D, v18.2D // ......................*....................................................................................................................................................... // gap // .............................................................................................................................................................................. - ldr q14, [x5, #-80] // ...................................................e.......................................................................................................................... - ldr q5, [x5, #-96] // ..................................................e........................................................................................................................... - trn2 v7.4S, v17.4S, v24.4S // ...................e.......................................................................................................................................................... // gap // .............................................................................................................................................................................. 
- sqrdmulh v26.4S, v18.4S, v1.4S // ...........................................e.................................................................................................................................. - trn1 v25.4S, v17.4S, v24.4S // ..................e........................................................................................................................................................... + add v3.4S, v1.4S, v9.4S // ..............................................*............................................................................................................................... + sqrdmulh v25.4S, v23.4S, v20.4S // ...............................................*.............................................................................................................................. + ldr q20, [x5, #16] // .........................e.................................................................................................................................................... + mul v27.4S, v23.4S, v7.4S // ................................................*............................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v11.4S, v18.4S, v20.4S // ..........................................e................................................................................................................................... 
- trn2 v9.2D, v0.2D, v7.2D // .....................e........................................................................................................................................................ // gap // .............................................................................................................................................................................. + trn2 v31.2D, v31.2D, v18.2D // ....................*......................................................................................................................................................... + trn2 v28.2D, v29.2D, v6.2D // .....................*........................................................................................................................................................ + add v29.4S, v10.4S, v19.4S // .........................................................*.................................................................................................................... + mls v0.4S, v21.4S, v8.S[0] // ............................................*................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v24.4S, v10.4S, v27.4S // .............................................e................................................................................................................................ - trn1 v31.2D, v0.2D, v7.2D // .......................e...................................................................................................................................................... 
- trn2 v18.2D, v15.2D, v25.2D // ....................e......................................................................................................................................................... // gap // .............................................................................................................................................................................. + sub v18.4S, v10.4S, v19.4S // ........................................................*..................................................................................................................... + mls v27.4S, v25.4S, v8.S[0] // .................................................*............................................................................................................................ // gap // .............................................................................................................................................................................. - mul v7.4S, v24.4S, v20.4S // ...............................................e.............................................................................................................................. - sqrdmulh v24.4S, v24.4S, v1.4S // ................................................e............................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn1 v1.2D, v15.2D, v25.2D // ......................e....................................................................................................................................................... + add v10.4S, v31.4S, v28.4S // ..............................................................*............................................................................................................... + sub v14.4S, v31.4S, v28.4S // .............................................................*................................................................................................................ + mul v9.4S, v18.4S, v13.4S // ...........................................................*.................................................................................................................. // gap // .............................................................................................................................................................................. - sub v25.4S, v18.4S, v9.4S // .............................................................e................................................................................................................ // gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v18.4S, v24.4S // ..........................................................*................................................................................................................... + trn1 v25.4S, v16.4S, v3.4S // ............................................................................*................................................................................................. 
+ trn1 v2.4S, v0.4S, v27.4S // ..............................................................................*............................................................................................... // gap // .............................................................................................................................................................................. - mls v11.4S, v26.4S, v8.S[0] // ............................................e................................................................................................................................. - add v20.4S, v10.4S, v27.4S // ..............................................e............................................................................................................................... - mls v7.4S, v24.4S, v8.S[0] // .................................................e............................................................................................................................ - sub v0.4S, v1.4S, v31.4S // ........................................................e..................................................................................................................... + ldr q28, [x5, #48] // ...........................e.................................................................................................................................................. // gap // .............................................................................................................................................................................. + sqrdmulh v19.4S, v14.4S, v5.4S // ...............................................................*.............................................................................................................. 
+ ldr q22, [x4, #-32] // ..............................................................................................*............................................................................... + mul v13.4S, v14.4S, v12.4S // ................................................................*............................................................................................................. + trn1 v31.2D, v25.2D, v2.2D // ..................................................................................*........................................................................................... + mls v9.4S, v7.4S, v8.S[0] // ............................................................*................................................................................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn1 v15.4S, v3.4S, v20.4S // ............................................................................e................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v24.4S, v3.4S, v20.4S // .............................................................................e................................................................................................ + sub v11.4S, v29.4S, v10.4S // ..................................................................*........................................................................................................... 
+ trn2 v7.4S, v16.4S, v3.4S // .............................................................................*................................................................................................ // gap // .............................................................................................................................................................................. + trn2 v6.4S, v0.4S, v27.4S // ...............................................................................*.............................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v3.4S, v25.4S, v23.4S // ................................................................e............................................................................................................. - mul v23.4S, v0.4S, v21.4S // ..........................................................e................................................................................................................... - trn1 v20.4S, v11.4S, v7.4S // ..............................................................................e............................................................................................... - trn2 v7.4S, v11.4S, v7.4S // ...............................................................................e.............................................................................................. // gap // .............................................................................................................................................................................. 
+ mls v13.4S, v19.4S, v8.S[0] // .................................................................*............................................................................................................ + trn2 v5.2D, v25.2D, v2.2D // ................................................................................*............................................................................................. // gap // .............................................................................................................................................................................. - mul v10.4S, v25.4S, v13.4S // ...............................................................e.............................................................................................................. - sqrdmulh v26.4S, v0.4S, v16.4S // ...........................................................e.................................................................................................................. // gap // .............................................................................................................................................................................. + add v1.4S, v29.4S, v10.4S // ...................................................................*.......................................................................................................... // gap // .............................................................................................................................................................................. - trn1 v11.2D, v15.2D, v20.2D // ..................................................................................e........................................................................................... 
- trn1 v13.2D, v24.2D, v7.2D // ...................................................................................e.......................................................................................... // gap // .............................................................................................................................................................................. + trn1 v24.2D, v7.2D, v6.2D // ...................................................................................*.......................................................................................... + mul v27.4S, v11.4S, v30.4S // .....................................................................*........................................................................................................ + trn2 v10.2D, v7.2D, v6.2D // .................................................................................*............................................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - ldr q17, [x4, #32] // ..............................................................................................e............................................................................... - ldr q25, [x4, #48] // ...............................................................................................e.............................................................................. - add v9.4S, v18.4S, v9.4S // ..............................................................e............................................................................................................... 
- sub v27.4S, v11.4S, v13.4S // ................................................................................................e............................................................................. + sub v25.4S, v9.4S, v13.4S // .......................................................................*...................................................................................................... // gap // .............................................................................................................................................................................. - ldr q0, [x4, #16] // .............................................................................................e................................................................................ - mls v23.4S, v26.4S, v8.S[0] // ............................................................e................................................................................................................. - add v21.4S, v1.4S, v31.4S // .........................................................e.................................................................................................................... + sub v6.4S, v31.4S, v24.4S // ................................................................................................*............................................................................. + ldr q29, [x4, #-48] // .............................................................................................*................................................................................ + sqrdmulh v3.4S, v11.4S, v26.4S // ....................................................................*......................................................................................................... 
// gap // .............................................................................................................................................................................. - mls v10.4S, v3.4S, v8.S[0] // .................................................................e............................................................................................................ // gap // .............................................................................................................................................................................. - trn2 v2.2D, v24.2D, v7.2D // .................................................................................e............................................................................................ + mul v0.4S, v25.4S, v30.4S // ..........................................................................*................................................................................................... + sqrdmulh v15.4S, v25.4S, v26.4S // .........................................................................*.................................................................................................... // gap // .............................................................................................................................................................................. + add v26.4S, v9.4S, v13.4S // ........................................................................*..................................................................................................... + add v11.4S, v5.4S, v10.4S // ......................................................................................................*....................................................................... // gap // .............................................................................................................................................................................. 
- add v18.4S, v11.4S, v13.4S // .................................................................................................e............................................................................ - add v1.4S, v21.4S, v9.4S // ...................................................................e.......................................................................................................... - sub v7.4S, v21.4S, v9.4S // ..................................................................e........................................................................................................... + sub v17.4S, v5.4S, v10.4S // .....................................................................................................*........................................................................ + add v2.4S, v31.4S, v24.4S // .................................................................................................*............................................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v11.4S, v23.4S, v10.4S // ........................................................................e..................................................................................................... - sub v24.4S, v23.4S, v10.4S // .......................................................................e...................................................................................................... // gap // .............................................................................................................................................................................. 
// gap // .............................................................................................................................................................................. + mls v0.4S, v15.4S, v8.S[0] // ...........................................................................*.................................................................................................. + mls v27.4S, v3.4S, v8.S[0] // ......................................................................*....................................................................................................... + mul v10.4S, v17.4S, v22.S[0] // ........................................................................................................*..................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v13.4S, v7.4S, v14.4S // .....................................................................e........................................................................................................ - mul v23.4S, v7.4S, v5.4S // ....................................................................e......................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v7.4S, v24.4S, v14.4S // ..........................................................................e................................................................................................... 
- mul v24.4S, v24.4S, v5.4S // .........................................................................e.................................................................................................... + trn1 v31.4S, v1.4S, v26.4S // ....................................................................................*......................................................................................... + add v21.4S, v2.4S, v11.4S // .....................................................................................................................*........................................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - trn2 v3.4S, v1.4S, v11.4S // .....................................................................................e........................................................................................ - trn2 v9.2D, v15.2D, v20.2D // ................................................................................e............................................................................................. + mul v9.4S, v6.4S, v29.S[2] // ...................................................................................................*.......................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- trn1 v20.4S, v1.4S, v11.4S // ....................................................................................e......................................................................................... + trn2 v16.4S, v1.4S, v26.4S // .....................................................................................*........................................................................................ + trn1 v12.4S, v27.4S, v0.4S // ......................................................................................*....................................................................................... + srshr v7.4S, v21.4S, #23 // ........................................................................................................................................*..................................... // gap // .............................................................................................................................................................................. + ldr q26, [x4, #-16] // ...............................................................................................*.............................................................................. + trn2 v24.4S, v27.4S, v0.4S // .......................................................................................*...................................................................................... + sqrdmulh v14.4S, v6.4S, v29.S[3] // ..................................................................................................*........................................................................... + trn2 v13.2D, v31.2D, v12.2D // ........................................................................................*..................................................................................... 
// gap // .............................................................................................................................................................................. - mls v23.4S, v13.4S, v8.S[0] // ......................................................................e....................................................................................................... - sub v11.4S, v9.4S, v2.4S // .....................................................................................................e........................................................................ - mls v24.4S, v7.4S, v8.S[0] // ...........................................................................e.................................................................................................. // gap // .............................................................................................................................................................................. - ldr q16, [x4], #64 // ............................................................................................e................................................................................. - mul v10.4S, v27.4S, v0.S[2] // ..................................................................................................e........................................................................... + ldr q5, [x5, #176] // .......................................................e...................................................................................................................... // gap // .............................................................................................................................................................................. + trn2 v19.2D, v16.2D, v24.2D // .........................................................................................*.................................................................................... 
+ trn1 v6.2D, v16.2D, v24.2D // ...........................................................................................*.................................................................................. // gap // .............................................................................................................................................................................. - sqrdmulh v7.4S, v27.4S, v0.S[3] // ...................................................................................................e.......................................................................... - add v9.4S, v9.4S, v2.4S // ......................................................................................................e....................................................................... + sqrdmulh v23.4S, v17.4S, v22.S[1] // .......................................................................................................*...................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v27.4S, v11.4S, v17.S[1] // ........................................................................................................e..................................................................... + trn1 v25.2D, v31.2D, v12.2D // ..........................................................................................*................................................................................... + mls v9.4S, v14.4S, v8.S[0] // ....................................................................................................*......................................................................... 
// gap // .............................................................................................................................................................................. - trn2 v1.4S, v23.4S, v24.4S // .......................................................................................e...................................................................................... - trn1 v21.4S, v23.4S, v24.4S // ......................................................................................e....................................................................................... // gap // .............................................................................................................................................................................. + sub v15.4S, v13.4S, v19.4S // ...............................................................................................................*.............................................................. + add v18.4S, v25.4S, v6.4S // ...........................................................................................................*.................................................................. // gap // .............................................................................................................................................................................. - mul v24.4S, v11.4S, v17.S[0] // .......................................................................................................e...................................................................... // gap // .............................................................................................................................................................................. + sub v27.4S, v25.4S, v6.4S // ..........................................................................................................*................................................................... 
+ mls v10.4S, v23.4S, v8.S[0] // .........................................................................................................*.................................................................... // gap // .............................................................................................................................................................................. - mls v10.4S, v7.4S, v8.S[0] // ....................................................................................................e......................................................................... - trn2 v7.2D, v3.2D, v1.2D // .........................................................................................e.................................................................................... - trn2 v13.2D, v20.2D, v21.2D // ........................................................................................e..................................................................................... // gap // .............................................................................................................................................................................. + add v19.4S, v13.4S, v19.4S // ................................................................................................................*............................................................. // gap // .............................................................................................................................................................................. - trn1 v26.2D, v3.2D, v1.2D // ...........................................................................................e.................................................................................. 
- trn1 v19.2D, v20.2D, v21.2D // ..........................................................................................e................................................................................... // gap // .............................................................................................................................................................................. + mul v0.4S, v27.4S, v22.S[2] // .............................................................................................................*................................................................ + sqrdmulh v30.4S, v27.4S, v22.S[3] // ............................................................................................................*................................................................. // gap // .............................................................................................................................................................................. - add v3.4S, v18.4S, v9.4S // .....................................................................................................................e........................................................ - sub v23.4S, v13.4S, v7.4S // ...............................................................................................................e.............................................................. // gap // .............................................................................................................................................................................. + add v23.4S, v18.4S, v19.4S // ...............................................................................................................................*.............................................. 
+ sub v1.4S, v2.4S, v11.4S // ....................................................................................................................*......................................................... + sqrdmulh v25.4S, v15.4S, v26.S[1] // .................................................................................................................*............................................................ + mul v26.4S, v15.4S, v26.S[0] // ..................................................................................................................*........................................................... // gap // .............................................................................................................................................................................. - sub v21.4S, v19.4S, v26.4S // ..........................................................................................................e................................................................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v24.4S, v27.4S, v8.S[0] // .........................................................................................................e.................................................................... - mul v11.4S, v23.4S, v25.S[0] // .................................................................................................................e............................................................ // gap // .............................................................................................................................................................................. 
+ srshr v3.4S, v23.4S, #23 // ............................................................................................................................................*................................. + add v27.4S, v9.4S, v10.4S // ..........................................................................................................................*................................................... + mls v21.4S, v7.4S, v8.4S // .........................................................................................................................................*.................................... + mls v0.4S, v30.4S, v8.S[0] // ..............................................................................................................*............................................................... + ldr q24, [x5, #144] // .....................................................e........................................................................................................................ // gap // .............................................................................................................................................................................. - sqrdmulh v27.4S, v23.4S, v25.S[1] // ..................................................................................................................e........................................................... - add v7.4S, v13.4S, v7.4S // ................................................................................................................e............................................................. - mul v1.4S, v21.4S, v17.S[2] // ............................................................................................................e................................................................. 
+ mls v26.4S, v25.4S, v8.S[0] // ...................................................................................................................*.......................................................... + ldr q22, [x5, #64] // ............................e................................................................................................................................................. + sub v11.4S, v9.4S, v10.4S // .........................................................................................................................*.................................................... // gap // .............................................................................................................................................................................. + srshr v14.4S, v27.4S, #23 // ..........................................................................................................................................*................................... + mls v23.4S, v3.4S, v8.4S // .............................................................................................................................................*................................ // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. + mul v13.4S, v11.4S, v4.S[2] // ............................................................................................................................*................................................. // gap // .............................................................................................................................................................................. 
- add v20.4S, v19.4S, v26.4S // ...........................................................................................................e.................................................................. - srshr v13.4S, v3.4S, #23 // ........................................................................................................................................e..................................... - sqrdmulh v21.4S, v21.4S, v17.S[3] // .............................................................................................................e................................................................ // gap // .............................................................................................................................................................................. + sub v6.4S, v18.4S, v19.4S // ..............................................................................................................................*............................................... // gap // .............................................................................................................................................................................. - add v15.4S, v10.4S, v24.4S // ..........................................................................................................................e................................................... - sub v25.4S, v20.4S, v7.4S // ..............................................................................................................................e............................................... - sub v23.4S, v18.4S, v9.4S // ....................................................................................................................e......................................................... 
// gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v11.4S, v4.S[3] // ...........................................................................................................................*.................................................. + add v2.4S, v0.4S, v26.4S // ....................................................................................................................................*......................................... + mls v27.4S, v14.4S, v8.4S // ...........................................................................................................................................*.................................. + sub v3.4S, v21.4S, v23.4S // ................................................................................................................................................*............................. // gap // .............................................................................................................................................................................. - mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................e.......................................................... // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v6.4S, v29.S[1] // ................................................................................................................................*............................................. // gap // .............................................................................................................................................................................. 
- add v20.4S, v20.4S, v7.4S // ...............................................................................................................................e.............................................. - mls v1.4S, v21.4S, v8.S[0] // ..............................................................................................................e............................................................... // gap // .............................................................................................................................................................................. + sub v31.4S, v0.4S, v26.4S // ...................................................................................................................................*.......................................... + mul v9.4S, v3.4S, v4.S[0] // ...................................................................................................................................................*.......................... + ldr q30, [x5, #96] // ..................................................e........................................................................................................................... + sqrdmulh v16.4S, v3.4S, v4.S[1] // ..................................................................................................................................................*........................... // gap // .............................................................................................................................................................................. - srshr v27.4S, v15.4S, #23 // ..........................................................................................................................................e................................... // gap // .............................................................................................................................................................................. 
- mul v26.4S, v25.4S, v0.S[0] // ................................................................................................................................e............................................. - srshr v7.4S, v20.4S, #23 // ............................................................................................................................................e................................. // gap // .............................................................................................................................................................................. - mls v3.4S, v13.4S, v8.4S // .........................................................................................................................................e.................................... - sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................................................e............................................ + mul v19.4S, v6.4S, v29.S[0] // .................................................................................................................................*............................................ + srshr v12.4S, v2.4S, #23 // ..............................................................................................................................................*............................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. 
- add v9.4S, v1.4S, v11.4S // ....................................................................................................................................e......................................... - sub v24.4S, v10.4S, v24.4S // .........................................................................................................................e.................................................... + mul v3.4S, v31.4S, v29.S[0] // ......................................................................................................................................*....................................... + sqrdmulh v15.4S, v31.4S, v29.S[1] // .....................................................................................................................................*........................................ + mls v9.4S, v16.4S, v8.S[0] // ....................................................................................................................................................*......................... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - sub v1.4S, v1.4S, v11.4S // ...................................................................................................................................e.......................................... + mul v14.4S, v1.4S, v4.S[2] // .......................................................................................................................*...................................................... 
// gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mls v15.4S, v27.4S, v8.4S // ...........................................................................................................................................e.................................. + sqrdmulh v18.4S, v1.4S, v4.S[3] // ......................................................................................................................*....................................................... + mls v2.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + add v26.4S, v21.4S, v23.4S // .................................................................................................................................................*............................ + mls v19.4S, v17.4S, v8.S[0] // ..................................................................................................................................*........................................... // gap // .............................................................................................................................................................................. - srshr v27.4S, v9.4S, #23 // ..............................................................................................................................................e............................... 
- sqrdmulh v11.4S, v24.4S, v16.S[3] // ............................................................................................................................e................................................. // gap // .............................................................................................................................................................................. - mls v20.4S, v7.4S, v8.4S // .............................................................................................................................................e................................ - mul v13.4S, v24.4S, v16.S[2] // ...........................................................................................................................e.................................................. // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - mul v24.4S, v1.4S, v0.S[0] // .....................................................................................................................................e........................................ - sqrdmulh v7.4S, v1.4S, v0.S[1] // ......................................................................................................................................e....................................... + mls v3.4S, v15.4S, v8.S[0] // .......................................................................................................................................*...................................... 
+ mls v13.4S, v7.4S, v8.S[0] // .............................................................................................................................*................................................ + str q9, [x2], #(16*4) // ........................................................................................................................................................................*..... // gap // .............................................................................................................................................................................. + mls v14.4S, v18.4S, v8.S[0] // ........................................................................................................................*..................................................... + sub v6.4S, v27.4S, v2.4S // .....................................................................................................................................................*........................ // gap // .............................................................................................................................................................................. - mls v9.4S, v27.4S, v8.4S // ...............................................................................................................................................e.............................. // gap // .............................................................................................................................................................................. + add v11.4S, v27.4S, v2.4S // ......................................................................................................................................................*....................... // gap // .............................................................................................................................................................................. 
- mul v10.4S, v23.4S, v16.S[2] // ......................................................................................................................e....................................................... - mls v13.4S, v11.4S, v8.S[0] // .............................................................................................................................e................................................ // gap // .............................................................................................................................................................................. + add v7.4S, v13.4S, v3.4S // ................................................................................................................................................................*............. + str q26, [x1], #(16*4) // ....................................................................................................................................................................*......... + sub v31.4S, v13.4S, v3.4S // ...............................................................................................................................................................*.............. + add v29.4S, v14.4S, v19.4S // ...........................................................................................................................................................*.................. + str q11, [x1, #-48] // .....................................................................................................................................................................*........ // gap // .............................................................................................................................................................................. 
- sqrdmulh v23.4S, v23.4S, v16.S[3] // .......................................................................................................................e...................................................... - mls v24.4S, v7.4S, v8.S[0] // .......................................................................................................................................e...................................... + sub v26.4S, v14.4S, v19.4S // ..........................................................................................................................................................*................... + sqrdmulh v9.4S, v31.4S, v4.S[1] // .................................................................................................................................................................*............ + mul v10.4S, v31.4S, v4.S[0] // ..................................................................................................................................................................*........... + str q7, [x1, #-16] // .......................................................................................................................................................................*...... // gap // .............................................................................................................................................................................. + str q29, [x1, #-32] // ......................................................................................................................................................................*....... + add x1, x1, #64 // ............................................................................................................................................................................*. 
+ sqrdmulh v14.4S, v26.4S, v4.S[1] // ............................................................................................................................................................*................. // gap // .............................................................................................................................................................................. - mls v26.4S, v25.4S, v8.S[0] // ..................................................................................................................................e........................................... - sub v11.4S, v15.4S, v9.4S // .....................................................................................................................................................e........................ + ldr q2, [x1, #32] // ..e........................................................................................................................................................................... + mul v27.4S, v6.4S, v4.S[0] // ........................................................................................................................................................*..................... + mul v0.4S, v26.4S, v4.S[0] // .............................................................................................................................................................*................ + ldr q26, [x1, #48] // ...e.......................................................................................................................................................................... + mls v10.4S, v9.4S, v8.S[0] // ...................................................................................................................................................................*.......... 
+ ldr q29, [x1, #0] // e............................................................................................................................................................................. + ldr q11, [x1, #16] // .e............................................................................................................................................................................ // gap // .............................................................................................................................................................................. - add v27.4S, v3.4S, v20.4S // .................................................................................................................................................e............................ + sqrdmulh v6.4S, v6.4S, v4.S[1] // .......................................................................................................................................................*...................... + ldr q13, [x5, #128] // ....................................................e......................................................................................................................... + ldr q16, [x5, #32] // ..........................e................................................................................................................................................... // gap // .............................................................................................................................................................................. - mls v10.4S, v23.4S, v8.S[0] // ........................................................................................................................e..................................................... - add v9.4S, v15.4S, v9.4S // ......................................................................................................................................................e....................... 
+ mls v0.4S, v14.4S, v8.S[0] // ..............................................................................................................................................................*............... // gap // .............................................................................................................................................................................. // gap // .............................................................................................................................................................................. - add v23.4S, v13.4S, v24.4S // ................................................................................................................................................................e............. // gap // .............................................................................................................................................................................. + trn1 v4.4S, v2.4S, v26.4S // ......e....................................................................................................................................................................... // gap // .............................................................................................................................................................................. - sub v24.4S, v13.4S, v24.4S // ...............................................................................................................................................................e.............. - str q27, [x1], #(16*4) // ....................................................................................................................................................................e......... - sqrdmulh v7.4S, v11.4S, v16.S[1] // ........................................................................................................................................................e..................... 
// gap // .............................................................................................................................................................................. - mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................e...................... - add v22.4S, v10.4S, v26.4S // ...........................................................................................................................................................e.................. - str q9, [x1, #-48] // .....................................................................................................................................................................e........ + trn2 v18.4S, v2.4S, v26.4S // .......e...................................................................................................................................................................... + trn1 v3.4S, v29.4S, v11.4S // ....e......................................................................................................................................................................... // gap // .............................................................................................................................................................................. - sqrdmulh v27.4S, v24.4S, v16.S[1] // ..................................................................................................................................................................e........... - str q23, [x1, #-16] // .......................................................................................................................................................................e...... 
- sub v1.4S, v3.4S, v20.4S // ................................................................................................................................................e............................. - mul v11.4S, v24.4S, v16.S[0] // .................................................................................................................................................................e............ - // gap // .............................................................................................................................................................................. - mls v25.4S, v7.4S, v8.S[0] // .........................................................................................................................................................e.................... - str q22, [x1, #-32] // ......................................................................................................................................................................e....... - add x1, x1, #64 // ............................................................................................................................................................................e. - sub v23.4S, v10.4S, v26.4S // ..........................................................................................................................................................e................... 
- - // ---------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--- - // ldr q9, [x1, #0] // e............................................................................................................................................................................'~............................. - // ldr q10, [x1, #16] // ..e..........................................................................................................................................................................'..~........................... - // ldr q11, [x1, #32] // ....e........................................................................................................................................................................'....~......................... - // ldr q12, [x1, #48] // .....e.......................................................................................................................................................................'.....~........................ - // trn1 v25.4s, v9.4s, v10.4s // ...........e.................................................................................................................................................................'...........~.................. - // trn2 v26.4s, v9.4s, v10.4s // ..........e..................................................................................................................................................................'..........~................... 
- // trn1 v27.4s, v11.4s, v12.4s // ...............e.............................................................................................................................................................'...............~.............. - // trn2 v28.4s, v11.4s, v12.4s // ..............e..............................................................................................................................................................'..............~............... - // trn2 v11.2d, v25.2d, v27.2d // .....................e.......................................................................................................................................................'.....................~........ - // trn2 v12.2d, v26.2d, v28.2d // ....................e........................................................................................................................................................'....................~......... - // trn1 v9.2d, v25.2d, v27.2d // .........................e...................................................................................................................................................'.........................~.... - // trn1 v10.2d, v26.2d, v28.2d // ........................e....................................................................................................................................................'........................~..... - // ldr q13, [x2, #0] // ...................................e.........................................................................................................................................'.............................. - // ldr q14, [x2, #16] // .................................e...........................................................................................................................................'.............................. 
- // ldr q15, [x2, #32] // ....................................e........................................................................................................................................'.............................. - // ldr q16, [x2, #48] // ..........................................e..................................................................................................................................'.............................. - // trn1 v25.4s, v13.4s, v14.4s // ...............................................e.............................................................................................................................'.............................. - // trn2 v26.4s, v13.4s, v14.4s // .................................................e...........................................................................................................................'.............................. - // trn1 v27.4s, v15.4s, v16.4s // .......................................................e.....................................................................................................................'.............................. - // trn2 v28.4s, v15.4s, v16.4s // .....................................................e.......................................................................................................................'.............................. - // trn2 v15.2d, v25.2d, v27.2d // ............................................................e................................................................................................................'.............................. - // trn2 v16.2d, v26.2d, v28.2d // .........................................................e...................................................................................................................'.............................. 
- // trn1 v13.2d, v25.2d, v27.2d // ...............................................................e.............................................................................................................'.............................. - // trn1 v14.2d, v26.2d, v28.2d // ...........................................................e.................................................................................................................'.............................. - // ldr q0, [x5], #(12*16) // ..............................................e..............................................................................................................................'.............................. - // ldr q4, [x5, #(-12*16 + 1*16)] // .........................................e...................................................................................................................................'.............................. - // ldr q1, [x5, #(-12*16 + 2*16)] // ......................e......................................................................................................................................................'......................~....... - // ldr q5, [x5, #(-12*16 + 3*16)] // ..................e..........................................................................................................................................................'..................~........... - // ldr q2, [x5, #(-12*16 + 4*16)] // ............e................................................................................................................................................................'............~................. - // ldr q6, [x5, #(-12*16 + 5*16)] // .......................e.....................................................................................................................................................'.......................~...... 
- // sub v24.4s, v9.4s, v10.4s // ................................e............................................................................................................................................'.............................. - // add v9.4s, v9.4s, v10.4s // ..................................e..........................................................................................................................................'.............................. - // mul v10.4s, v24.4s, v1.4s // ........................................e....................................................................................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e.....................................................................................................................................'.............................. - // mls v10.4s, v24.4s, v8.s[0] // ..................................................e..........................................................................................................................'.............................. - // sub v24.4s, v11.4s, v12.4s // ..............................e..............................................................................................................................................'.............................. - // add v11.4s, v11.4s, v12.4s // ...............................e.............................................................................................................................................'.............................. - // mul v12.4s, v24.4s, v2.4s // .....................................e.......................................................................................................................................'.............................. 
- // sqrdmulh v24.4s, v24.4s, v6.4s // ......................................e......................................................................................................................................'.............................. - // mls v12.4s, v24.4s, v8.s[0] // ................................................e............................................................................................................................'.............................. - // sub v24.4s, v9.4s, v11.4s // ............................................e................................................................................................................................'.............................. - // add v9.4s, v9.4s, v11.4s // .............................................e...............................................................................................................................'.............................. - // mul v11.4s, v24.4s, v0.4s // ........................................................e....................................................................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................e......................................................................................................................'.............................. - // mls v11.4s, v24.4s, v8.s[0] // .................................................................e...........................................................................................................'.............................. - // sub v24.4s, v10.4s, v12.4s // ..........................................................e..................................................................................................................'.............................. 
- // add v10.4s, v10.4s, v12.4s // ..................................................................e..........................................................................................................'.............................. - // mul v12.4s, v24.4s, v0.4s // .............................................................e...............................................................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................................e..............................................................................................................'.............................. - // mls v12.4s, v24.4s, v8.s[0] // ...................................................................e.........................................................................................................'.............................. - // ldr q0, [x5, #(-12*16 + 6*16)] // ....................................................e........................................................................................................................'.............................. - // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e.........................................................................................................................'.............................. - // ldr q1, [x5, #(-12*16 + 8*16)] // ................e............................................................................................................................................................'................~............. - // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................e.................................................................................................................................................'...........................~.. 
- // ldr q2, [x5, #(-12*16 + 10*16)] // ...................e.........................................................................................................................................................'...................~.......... - // ldr q6, [x5, #(-12*16 + 11*16)] // ...........................................e.................................................................................................................................'.............................. - // sub v24.4s, v13.4s, v14.4s // ....................................................................e........................................................................................................'.............................. - // add v13.4s, v13.4s, v14.4s // .....................................................................................e.......................................................................................'.............................. - // mul v14.4s, v24.4s, v1.4s // ........................................................................e....................................................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................................................e................................................................................................'.............................. - // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................e........................................................................................'.............................. - // sub v24.4s, v15.4s, v16.4s // ................................................................e............................................................................................................'.............................. 
- // add v15.4s, v15.4s, v16.4s // .................................................................................e...........................................................................................'.............................. - // mul v16.4s, v24.4s, v2.4s // ...........................................................................e.................................................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v6.4s // .......................................................................e.....................................................................................................'.............................. - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................e......................................................................................'.............................. - // sub v24.4s, v13.4s, v15.4s // ..........................................................................................e..................................................................................'.............................. - // add v13.4s, v13.4s, v15.4s // .........................................................................................e...................................................................................'.............................. - // mul v15.4s, v24.4s, v0.4s // ..............................................................................................e..............................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................................................e...............................................................................'.............................. 
- // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................e........................................................................'.............................. - // sub v24.4s, v14.4s, v16.4s // ............................................................................................e................................................................................'.............................. - // add v14.4s, v14.4s, v16.4s // ...........................................................................................e.................................................................................'.............................. - // mul v16.4s, v24.4s, v0.4s // ................................................................................................e............................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................................................e.............................................................................'.............................. - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................e......................................................................'.............................. - // trn1 v25.4s, v9.4s, v10.4s // .....................................................................e.......................................................................................................'.............................. - // trn2 v26.4s, v9.4s, v10.4s // ......................................................................e......................................................................................................'.............................. 
- // trn1 v27.4s, v11.4s, v12.4s // .........................................................................e...................................................................................................'.............................. - // trn2 v28.4s, v11.4s, v12.4s // ..........................................................................e..................................................................................................'.............................. - // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................................e..........................................................................'.............................. - // trn2 v12.2d, v26.2d, v28.2d // .......................................................................................e.....................................................................................'.............................. - // trn1 v9.2d, v25.2d, v27.2d // .............................................................................e...............................................................................................'.............................. - // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................e..............................................................................................'.............................. - // trn1 v25.4s, v13.4s, v14.4s // ...................................................................................................e.........................................................................'.............................. - // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................e...........................................................................'.............................. 
- // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................................e...............................................................'.............................. - // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................................e................................................................'.............................. - // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................................e...........................................................'.............................. - // trn2 v16.2d, v26.2d, v28.2d // ................................................................................................................e............................................................'.............................. - // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................................e.........................................................'.............................. - // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................................e..........................................................'.............................. - // ldr q0, [x4], #64 // .......................................................................................................e.....................................................................'.............................. - // ldr q1, [x4, #(-64 + 16)] // ...................................................................................e.........................................................................................'.............................. 
- // ldr q2, [x4, #(-64 + 32)] // ...............................................................................e.............................................................................................'.............................. - // ldr q3, [x4, #(-64 + 48)] // ................................................................................e............................................................................................'.............................. - // sub v24.4s, v9.4s, v10.4s // ..................................................................................e..........................................................................................'.............................. - // add v9.4s, v9.4s, v10.4s // ........................................................................................e....................................................................................'.............................. - // mul v10.4s, v24.4s, v1.s[2] // ........................................................................................................e....................................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................e...................................................................'.............................. - // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................................e.............................................................'.............................. - // sub v24.4s, v11.4s, v12.4s // .....................................................................................................e.......................................................................'.............................. 
- // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e..................................................................'.............................. - // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................................................e..............................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................e.................................................................'.............................. - // mls v12.4s, v24.4s, v8.s[0] // .......................................................................................................................e.....................................................'.............................. - // sub v24.4s, v13.4s, v14.4s // ......................................................................................................................e......................................................'.............................. - // add v13.4s, v13.4s, v14.4s // ............................................................................................................................e................................................'.............................. - // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................................e.................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................e..............................................'.............................. 
- // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................e........................................'.............................. - // sub v24.4s, v15.4s, v16.4s // .....................................................................................................................e.......................................................'.............................. - // add v15.4s, v15.4s, v16.4s // ..........................................................................................................................e..................................................'.............................. - // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................................e....................................................'.............................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................e...................................................'.............................. - // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................................e..........................................'.............................. - // sub v24.4s, v9.4s, v11.4s // .................................................................................................................................e...........................................'.............................. - // add v9.4s, v9.4s, v11.4s // ....................................................................................................................e........................................................'.............................. 
- // mul v11.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e.......................'.............................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................e.....................'.............................. - // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................................e................'.............................. - // sub v24.4s, v10.4s, v12.4s // ...........................................................................................................................................e.................................'.............................. - // add v10.4s, v10.4s, v12.4s // ...............................................................................................................................e.............................................'.............................. - // mul v12.4s, v24.4s, v0.s[2] // .................................................................................................................................................e...........................'.............................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................e.............................'.............................. - // mls v12.4s, v24.4s, v8.s[0] // ......................................................................................................................................................e......................'.............................. 
- // sub v24.4s, v13.4s, v15.4s // ................................................................................................................................e............................................'.............................. - // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e.........................................'.............................. - // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................................e......................................'.............................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................e...................................'.............................. - // mls v15.4s, v24.4s, v8.s[0] // .........................................................................................................................................................e...................'.............................. - // sub v24.4s, v14.4s, v16.4s // ............................................................................................................................................e................................'.............................. - // add v14.4s, v14.4s, v16.4s // ..........................................................................................................................................e..................................'.............................. - // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e..........................'.............................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e.........................'.............................. - // mls v16.4s, v24.4s, v8.s[0] // ........................................................................................................................................................e....................'.............................. - // srshr v24.4S, v9.4S, #23 // .............................................................................................................................e...............................................'.............................. - // mls v9.4s, v24.4s, v8.4s // ........................................................................................................................................e....................................'.............................. - // srshr v24.4S, v10.4S, #23 // .....................................................................................................................................e.......................................'.............................. - // mls v10.4s, v24.4s, v8.4s // .............................................................................................................................................e...............................'.............................. - // srshr v24.4S, v13.4S, #23 // .......................................................................................................................................e.....................................'.............................. - // mls v13.4s, v24.4s, v8.4s // ................................................................................................................................................e............................'.............................. 
- // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e..............................'.............................. - // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................................e........................'.............................. - // sub v24.4s, v9.4s, v13.4s // .......................................................................................................................................................................e.....'.............................. - // add v9.4s, v9.4s, v13.4s // ...........................................................................................................................................................e.................'.............................. - // mul v13.4s, v24.4s, v0.s[0] // .~...........................................................................................................................................................................'.*............................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................................................................................*.............................. - // mls v13.4s, v24.4s, v8.s[0] // .........~...................................................................................................................................................................'.........*.................... - // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................................e..................'.............................. 
- // add v10.4s, v10.4s, v14.4s // .............................................................................................................................................................e...............'.............................. - // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................e..........'.............................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................e...........'.............................. - // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................................................................e...'.............................. - // sub v24.4s, v11.4s, v15.4s // ............................................................................................................................................................................e'.............................. - // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................................................e.........'.............................. - // mul v15.4s, v24.4s, v0.s[0] // ........~....................................................................................................................................................................'........*..................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...~.........................................................................................................................................................................'...*.......................... 
- // mls v15.4s, v24.4s, v8.s[0] // .................~...........................................................................................................................................................'.................*............ - // sub v24.4s, v12.4s, v16.4s // ...............................................................................................................................................................e.............'.............................. - // add v12.4s, v12.4s, v16.4s // ..............................................................................................................................................................e..............'.............................. - // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................................................e....'.............................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................................e.......'.............................. - // mls v16.4s, v24.4s, v8.s[0] // .......~.....................................................................................................................................................................'.......*...................... - // str q9, [x1], #(16*4) // ................................................................................................................................................................e............'.............................. - // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................................................e........'.............................. 
- // str q11, [x1, #(-16*4 + 2*16)] // ..........................................................................................................................................................................e..'.............................. - // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................................e......'.............................. - // str q13, [x2], #(16*4) // ..........................~..................................................................................................................................................'..........................*... - // str q14, [x2, #(-16*4 + 1*16)] // ......~......................................................................................................................................................................'......*....................... - // str q15, [x2, #(-16*4 + 2*16)] // ............................~................................................................................................................................................'............................*. - // str q16, [x2, #(-16*4 + 3*16)] // .............~...............................................................................................................................................................'.............*................ - // add x1, x1, #64 // ...........................................................................................................................................................................e.'.............................. 
- // add x2, x2, #64 // .............................~...............................................................................................................................................'.............................* + ldr q21, [x5, #80] // .............................e................................................................................................................................................ + trn2 v31.4S, v29.4S, v11.4S // .....e........................................................................................................................................................................ + mls v27.4S, v6.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v2.2D, v3.2D, v4.2D // ........e..................................................................................................................................................................... + str q10, [x2, #-16] // ...........................................................................................................................................................................*.. 
+ trn2 v15.2D, v31.2D, v18.2D // .........e.................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v25.2D, v31.2D, v18.2D // ...........e.................................................................................................................................................................. + str q0, [x2, #-32] // ..........................................................................................................................................................................*... + ldr q12, [x5, #160] // ......................................................e....................................................................................................................... + trn1 v10.2D, v3.2D, v4.2D // ..........e................................................................................................................................................................... + sub v19.4S, v2.4S, v15.4S // ...................................e.......................................................................................................................................... + ldr q4, [x4], #64 // ............................................................................................e................................................................................. + str q27, [x2, #-48] // .........................................................................................................................................................................*.... 
+ add x2, x2, #64 // .............................................................................................................................................................................* + sub v17.4S, v10.4S, v25.4S // ..............................e............................................................................................................................................... + add v27.4S, v10.4S, v25.4S // ...............................e.............................................................................................................................................. + ldr q3, [x2, #16] // .............e................................................................................................................................................................ + ldr q31, [x2, #0] // ............e................................................................................................................................................................. + sqrdmulh v25.4S, v19.4S, v21.4S // .....................................e........................................................................................................................................ + mul v9.4S, v19.4S, v22.4S // ......................................e....................................................................................................................................... + ldr q7, [x5], #(12*16) // ........................e..................................................................................................................................................... + ldr q22, [x2, #32] // ..............e............................................................................................................................................................... 
+ sqrdmulh v18.4S, v17.4S, v28.4S // ................................e............................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q26, [x5, #-80] // ...................................................e.......................................................................................................................... + ldr q0, [x2, #48] // ...............e.............................................................................................................................................................. + add v6.4S, v2.4S, v15.4S // ....................................e......................................................................................................................................... + mul v1.4S, v17.4S, v16.4S // .................................e............................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v25.4S, v8.S[0] // .......................................e...................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v29.4S, v31.4S, v3.4S // .................e............................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v16.4S, v27.4S, v6.4S // .........................................e.................................................................................................................................... + sub v10.4S, v27.4S, v6.4S // ........................................e..................................................................................................................................... 
+ + // -------------------------------------------------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x1, #0] // ...........................................................................................................................e.......................................'.....................................................................................................................................~...................... + // ldr q10, [x1, #16] // ............................................................................................................................e......................................'......................................................................................................................................~..................... + // ldr q11, [x1, #32] // ......................................................................................................................e............................................'................................................................................................................................~........................... 
+ // ldr q12, [x1, #48] // .........................................................................................................................e.........................................'...................................................................................................................................~........................ + // trn1 v25.4s, v9.4s, v10.4s // ...................................................................................................................................e...............................'.............................................................................................................................................~.............. + // trn2 v26.4s, v9.4s, v10.4s // .....................................................................................................................................e.............................'...............................................................................................................................................~............ + // trn1 v27.4s, v11.4s, v12.4s // .................................................................................................................................e.................................'...........................................................................................................................................~................ + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................e................................'............................................................................................................................................~............... 
+ // trn2 v11.2d, v25.2d, v27.2d // .......................................................................................................................................e...........................'.................................................................................................................................................~.......... + // trn2 v12.2d, v26.2d, v28.2d // .........................................................................................................................................e.........................'...................................................................................................................................................~........ + // trn1 v9.2d, v25.2d, v27.2d // .............................................................................................................................................e.....................'.......................................................................................................................................................~.... + // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................................................................................e........................'....................................................................................................................................................~....... + // ldr q13, [x2, #0] // .....................................................................................................................................................e.............'............................................................................................................................................................ 
+ // ldr q14, [x2, #16] // ....................................................................................................................................................e..............'............................................................................................................................................................ + // ldr q15, [x2, #32] // .........................................................................................................................................................e.........'............................................................................................................................................................ + // ldr q16, [x2, #48] // ............................................................................................................................................................e......'............................................................................................................................................................ + // trn1 v25.4s, v13.4s, v14.4s // ...................................................................................................................................................................'...*........................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ................................................................................................................................................................e..'............................................................................................................................................................ 
+ // trn1 v27.4s, v15.4s, v16.4s // ...................................................................................................................................................................'.*.......................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...................................................................................................................................................................'*........................................................................................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..~................................................................................................................................................................'............*............................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...~...............................................................................................................................................................'.............*.............................................................................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................................................................................'.......*.................................................................................................................................................... 
+ // trn1 v14.2d, v26.2d, v28.2d // ...................................................................................................................................................................'....*....................................................................................................................................................... + // ldr q0, [x5], #(12*16) // ........................................................................................................................................................e..........'............................................................................................................................................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // e..................................................................................................................................................................'..........~................................................................................................................................................. + // ldr q1, [x5, #(-12*16 + 2*16)] // ...............................................................................................................................e...................................'.........................................................................................................................................~.................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ..............e....................................................................................................................................................'........................~................................................................................................................................... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // ...........................................................................e.......................................................................................'.....................................................................................~...................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ....................................................................................................................................e..............................'..............................................................................................................................................~............. + // sub v24.4s, v9.4s, v10.4s // ..................................................................................................................................................e................'............................................................................................................................................................ + // add v9.4s, v9.4s, v10.4s // ...................................................................................................................................................e...............'............................................................................................................................................................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ..........................................................................................................................................................e........'............................................................................................................................................................ 
+ // mul v10.4s, v24.4s, v1.4s // ..............................................................................................................................................................e....'............................................................................................................................................................ + // mls v10.4s, v27.4s, v8.s[0] // ...................................................................................................................................................................*............................................................................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................................e....................'........................................................................................................................................................~... + // add v11.4s, v11.4s, v12.4s // .............................................................................................................................................................e.....'............................................................................................................................................................ + // sqrdmulh v27.4s, v24.4s, v6.4s // ......................................................................................................................................................e............'............................................................................................................................................................ 
+ // mul v12.4s, v24.4s, v2.4s // .......................................................................................................................................................e...........'............................................................................................................................................................ + // mls v12.4s, v27.4s, v8.s[0] // ...............................................................................................................................................................e...'............................................................................................................................................................ + // sub v24.4s, v9.4s, v11.4s // ..................................................................................................................................................................e'............................................................................................................................................................ + // add v9.4s, v9.4s, v11.4s // .................................................................................................................................................................e.'............................................................................................................................................................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................................................................................................................................................'..*......................................................................................................................................................... 
+ // mul v11.4s, v24.4s, v0.4s // ...................................................................................................................................................................'.....*...................................................................................................................................................... + // mls v11.4s, v27.4s, v8.s[0] // .....~.............................................................................................................................................................'...............*............................................................................................................................................ + // sub v24.4s, v10.4s, v12.4s // ...................................................................................................................................................................'......*..................................................................................................................................................... + // add v10.4s, v10.4s, v12.4s // ...................................................................................................................................................................'........*................................................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................................................................................................................................................'.........*.................................................................................................................................................. 
+ // mul v12.4s, v24.4s, v0.4s // .~.................................................................................................................................................................'...........*................................................................................................................................................ + // mls v12.4s, v27.4s, v8.s[0] // .......~...........................................................................................................................................................'.................*.......................................................................................................................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ........................................................................................e..........................................................................'..................................................................................................~......................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ...........................................................................................................................................................e.......'............................................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 8*16)] // ..............................................................................................................................e....................................'........................................................................................................................................~................... 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // .........................................................................e.........................................................................................'...................................................................................~........................................................................ + // ldr q2, [x5, #(-12*16 + 10*16)] // ............................................................................................................................................e......................'......................................................................................................................................................~..... + // ldr q6, [x5, #(-12*16 + 11*16)] // ....................................................e..............................................................................................................'..............................................................~............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ......~............................................................................................................................................................'................*........................................................................................................................................... + // add v13.4s, v13.4s, v14.4s // ....~..............................................................................................................................................................'..............*............................................................................................................................................. 
+ // sqrdmulh v27.4s, v24.4s, v5.4s // ...........~.......................................................................................................................................................'.....................*...................................................................................................................................... + // mul v14.4s, v24.4s, v1.4s // ..........~........................................................................................................................................................'....................*....................................................................................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ...................~...............................................................................................................................................'.............................*.............................................................................................................................. + // sub v24.4s, v15.4s, v16.4s // .........~.........................................................................................................................................................'...................*........................................................................................................................................ + // add v15.4s, v15.4s, v16.4s // ........~..........................................................................................................................................................'..................*......................................................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v6.4s // ...............~...................................................................................................................................................'.........................*.................................................................................................................................. + // mul v16.4s, v24.4s, v2.4s // .................~.................................................................................................................................................'...........................*................................................................................................................................ + // mls v16.4s, v27.4s, v8.s[0] // .......................~...........................................................................................................................................'.................................*.......................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // ....................~..............................................................................................................................................'..............................*............................................................................................................................. + // add v13.4s, v13.4s, v15.4s // .........................~.........................................................................................................................................'...................................*........................................................................................................................ 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ................................~..................................................................................................................................'..........................................*................................................................................................................. + // mul v15.4s, v24.4s, v0.4s // ...........................~.......................................................................................................................................'.....................................*...................................................................................................................... + // mls v15.4s, v27.4s, v8.s[0] // ........................................~..........................................................................................................................'..................................................*......................................................................................................... + // sub v24.4s, v14.4s, v16.4s // .............................~.....................................................................................................................................'.......................................*.................................................................................................................... + // add v14.4s, v14.4s, v16.4s // ...................................~...............................................................................................................................'.............................................*.............................................................................................................. 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ..................................~................................................................................................................................'............................................*............................................................................................................... + // mul v16.4s, v24.4s, v0.4s // .................................~.................................................................................................................................'...........................................*................................................................................................................ + // mls v16.4s, v27.4s, v8.s[0] // .......................................~...........................................................................................................................'.................................................*.......................................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ............~......................................................................................................................................................'......................*..................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // .....................~.............................................................................................................................................'...............................*............................................................................................................................ 
+ // trn1 v27.4s, v11.4s, v12.4s // .............~.....................................................................................................................................................'.......................*.................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ......................~............................................................................................................................................'................................*........................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ........................~..........................................................................................................................................'..................................*......................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ............................~......................................................................................................................................'......................................*..................................................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ..................~................................................................................................................................................'............................*............................................................................................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ..........................~........................................................................................................................................'....................................*....................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..........................................~........................................................................................................................'....................................................*....................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // .............................................~.....................................................................................................................'.......................................................*.................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ..............................................~....................................................................................................................'........................................................*................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // .................................................~.................................................................................................................'...........................................................*................................................................................................ 
+ // trn2 v15.2d, v25.2d, v27.2d // ...................................................~...............................................................................................................'.............................................................*.............................................................................................. + // trn2 v16.2d, v26.2d, v28.2d // .....................................................~.............................................................................................................'...............................................................*............................................................................................ + // trn1 v13.2d, v25.2d, v27.2d // ........................................................~..........................................................................................................'..................................................................*......................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ......................................................~............................................................................................................'................................................................*........................................................................................... + // ldr q0, [x4], #64 // ...............................................................................................................................................e...................'.........................................................................................................................................................~.. 
+ // ldr q1, [x4, #(-64 + 16)] // ...............................~...................................................................................................................................'.........................................*.................................................................................................................. + // ldr q2, [x4, #(-64 + 32)] // ................~..................................................................................................................................................'..........................*................................................................................................................................. + // ldr q3, [x4, #(-64 + 48)] // ................................................~..................................................................................................................'..........................................................*................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ..............................~....................................................................................................................................'........................................*................................................................................................................... + // add v9.4s, v9.4s, v10.4s // ......................................~............................................................................................................................'................................................*........................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..................................................~................................................................................................................'............................................................*............................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ............................................~......................................................................................................................'......................................................*..................................................................................................... + // mls v10.4s, v27.4s, v8.s[0] // .........................................................~.........................................................................................................'...................................................................*........................................................................................ + // sub v24.4s, v11.4s, v12.4s // .....................................~.............................................................................................................................'...............................................*............................................................................................................ + // add v11.4s, v11.4s, v12.4s // ....................................~..............................................................................................................................'..............................................*............................................................................................................. 
+ // sqrdmulh v27.4s, v24.4s, v2.s[1] // .......................................................~...........................................................................................................'.................................................................*.......................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // .........................................~.........................................................................................................................'...................................................*........................................................................................................ + // mls v12.4s, v27.4s, v8.s[0] // .............................................................~.....................................................................................................'.......................................................................*.................................................................................... + // sub v24.4s, v13.4s, v14.4s // ............................................................~......................................................................................................'......................................................................*..................................................................................... + // add v13.4s, v13.4s, v14.4s // ...........................................................~.......................................................................................................'.....................................................................*...................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v2.s[3] // ................................................................~..................................................................................................'..........................................................................*................................................................................. + // mul v14.4s, v24.4s, v2.s[2] // ...............................................................~...................................................................................................'.........................................................................*.................................................................................. + // mls v14.4s, v27.4s, v8.s[0] // ........................................................................~..........................................................................................'..................................................................................*......................................................................... + // sub v24.4s, v15.4s, v16.4s // ..........................................................~........................................................................................................'....................................................................*....................................................................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................~....................................................................................................'........................................................................*................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v3.s[1] // ...................................................................~...............................................................................................'.............................................................................*.............................................................................. + // mul v16.4s, v24.4s, v3.s[0] // ....................................................................~..............................................................................................'..............................................................................*............................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ..........................................................................~........................................................................................'....................................................................................*....................................................................... + // sub v24.4s, v9.4s, v11.4s // ..................................................................~................................................................................................'............................................................................*............................................................................... + // add v9.4s, v9.4s, v11.4s // ...........................................~.......................................................................................................................'.....................................................*...................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................................................................................................~..................................................................'..........................................................................................................*................................................. + // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................~...................................................................'.........................................................................................................*.................................................. + // mls v11.4s, v27.4s, v8.s[0] // .......................................................................................................~...........................................................'.................................................................................................................*.......................................... + // sub v24.4s, v10.4s, v12.4s // ............................................................................~......................................................................................'......................................................................................*..................................................................... + // add v10.4s, v10.4s, v12.4s // ......................................................................~............................................................................................'................................................................................*........................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // .................................................................................~.................................................................................'...........................................................................................*................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................~...................................................................................'.........................................................................................*.................................................................. + // mls v12.4s, v27.4s, v8.s[0] // .....................................................................................................~.............................................................'...............................................................................................................*............................................ + // sub v24.4s, v13.4s, v15.4s // ................................................................................~..................................................................................'..........................................................................................*................................................................. + // add v13.4s, v13.4s, v15.4s // .................................................................~.................................................................................................'...........................................................................*................................................................................ 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .....................................................................................~.............................................................................'...............................................................................................*............................................................ + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................~........................................................................'....................................................................................................*....................................................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................................................................................~...............................................................'.............................................................................................................*.............................................. + // sub v24.4s, v14.4s, v16.4s // ......................................................................................~............................................................................'................................................................................................*........................................................... + // add v14.4s, v14.4s, v16.4s // ..................................................................................~................................................................................'............................................................................................*............................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .............................................................................................~.....................................................................'.......................................................................................................*.................................................... + // mul v16.4s, v24.4s, v1.s[0] // ............................................................................................~......................................................................'......................................................................................................*..................................................... + // mls v16.4s, v27.4s, v8.s[0] // ....................................................................................................~..............................................................'..............................................................................................................*............................................. + // srshr v24.4S, v9.4S, #23 // ...............................................~...................................................................................................................'.........................................................*.................................................................................................. + // mls v9.4s, v24.4s, v8.4s // .......................................................................~...........................................................................................'.................................................................................*.......................................................................... 
+ // srshr v24.4S, v10.4S, #23 // .............................................................................~.....................................................................................'.......................................................................................*.................................................................... + // mls v10.4s, v24.4s, v8.4s // ...................................................................................~...............................................................................'.............................................................................................*.............................................................. + // srshr v24.4S, v13.4S, #23 // .....................................................................~.............................................................................................'...............................................................................*............................................................................ + // mls v13.4s, v24.4s, v8.4s // ..............................................................................~....................................................................................'........................................................................................*................................................................... + // srshr v24.4S, v14.4S, #23 // ...........................................................................................~.......................................................................'.....................................................................................................*...................................................... 
+ // mls v14.4s, v24.4s, v8.4s // .................................................................................................~.................................................................'...........................................................................................................*................................................ + // sub v24.4s, v9.4s, v13.4s // ....................................................................................~..............................................................................'..............................................................................................*............................................................. + // add v9.4s, v9.4s, v13.4s // ..................................................................................................~................................................................'............................................................................................................*............................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................................................................................~.........................................................................'...................................................................................................*........................................................ + // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................~...........................................................................'.................................................................................................*.......................................................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ..............................................................................................~....................................................................'........................................................................................................*................................................... + // sub v24.4s, v10.4s, v14.4s // ........................................................................................................~..........................................................'..................................................................................................................*......................................... + // add v10.4s, v10.4s, v14.4s // .........................................................................................................~.........................................................'...................................................................................................................*........................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .............................................................................................................................~.....................................'.......................................................................................................................................*.................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................................................~...........................................'.................................................................................................................................*.......................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ......................................................................................................................................~............................'................................................................................................................................................*........... + // sub v24.4s, v11.4s, v15.4s // ...............................................................................................................~...................................................'.........................................................................................................................*.................................. + // add v11.4s, v11.4s, v15.4s // .............................................................................................................~.....................................................'.......................................................................................................................*.................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .....................................................................................................................~.............................................'...............................................................................................................................*............................ + // mul v15.4s, v24.4s, v0.s[0] // ........................................................................................................................~..........................................'..................................................................................................................................*......................... 
+ // mls v15.4s, v27.4s, v8.s[0] // ................................................................................................................................~..................................'..........................................................................................................................................*................. + // sub v24.4s, v12.4s, v16.4s // ............................................................................................................~......................................................'......................................................................................................................*..................................... + // add v12.4s, v12.4s, v16.4s // ..........................................................................................................~........................................................'....................................................................................................................*....................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................................................................................................................~..................................................'..........................................................................................................................*................................. + // mul v16.4s, v24.4s, v0.s[0] // .................................................................................................................~.................................................'...........................................................................................................................*................................ 
+ // mls v16.4s, v27.4s, v8.s[0] // ..........................................................................................................................~........................................'....................................................................................................................................*....................... + // str q9, [x1], #(16*4) // ...........................................................................................................~.......................................................'.....................................................................................................................*...................................... + // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................~....................................................'........................................................................................................................*................................... + // str q11, [x1, #(-16*4 + 2*16)] // ...................................................................................................................~...............................................'.............................................................................................................................*.............................. + // str q12, [x1, #(-16*4 + 3*16)] // ..................................................................................................................~................................................'............................................................................................................................*............................... 
+ // str q13, [x2], #(16*4) // ......................................................................................................~............................................................'................................................................................................................*........................................... + // str q14, [x2, #(-16*4 + 1*16)] // ................................................................................................................................................~..................'..........................................................................................................................................................*. + // str q15, [x2, #(-16*4 + 2*16)] // ...........................................................................................................................................~.......................'.....................................................................................................................................................*...... + // str q16, [x2, #(-16*4 + 3*16)] // ........................................................................................................................................~..........................'..................................................................................................................................................*......... + // add x1, x1, #64 // ....................................................................................................................~..............................................'..............................................................................................................................*............................. 
+ // add x2, x2, #64 // .................................................................................................................................................~.................'...........................................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - // Instructions: 12 - // Expected cycles: 8 - // Expected IPC: 1.50 + // Instructions: 133 + // Expected cycles: 64 + // Expected IPC: 2.08 // - // Wall time: 0.07s - // User time: 0.07s + // Wall time: 724.38s + // User time: 724.38s // - // ----- original position -----> - // 0 25 - // |------------------------|---- - mul v10.4S, v1.4S, v16.S[0] // .*............................ - sqrdmulh v24.4S, v1.4S, v16.S[1] // *............................. - // gap // .............................. - // gap // .............................. - mul v7.4S, v23.4S, v16.S[0] // .....*........................ - sqrdmulh v9.4S, v23.4S, v16.S[1] // ..*........................... - // gap // .............................. - // gap // .............................. - mls v11.4S, v27.4S, v8.S[0] // ....*......................... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - mls v10.4S, v24.4S, v8.S[0] // ......*....................... - mls v7.4S, v9.4S, v8.S[0] // ........*..................... - // gap // .............................. - // gap // .............................. - str q25, [x2, #16] // ...*.......................... - // gap // .............................. - // gap // .............................. - str q11, [x2, #48] // .......*...................... - // gap // .............................. - // gap // .............................. 
- // gap // .............................. - // gap // .............................. - str q10, [x2], #(16*4) // .........*.................... - // gap // .............................. - str q7, [x2, #-32] // ..........*................... - // gap // .............................. - add x2, x2, #64 // ...........*.................. - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // sqrdmulh v9.4S, v1.4S, v16.S[1] // .*............................. - // mul v1.4S, v1.4S, v16.S[0] // *.............................. - // sqrdmulh v24.4S, v23.4S, v16.S[1] // ...*........................... - // str q25, [x2, #16] // .......*....................... - // mls v11.4S, v27.4S, v8.S[0] // ....*.......................... - // mul v7.4S, v23.4S, v16.S[0] // ..*............................ - // mls v1.4S, v9.4S, v8.S[0] // .....*......................... - // str q11, [x2, #48] // ........*...................... - // mls v7.4S, v24.4S, v8.S[0] // ......*........................ - // str q1, [x2], #(16*4) // .........*..................... - // str q7, [x2, #-32] // ..........*.................... - // add x2, x2, #64 // ...........*................... + // -------------------------------------------------------- original position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------- + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v1.4S, v18.4S, v8.S[0] // *.................................................................................................................................... 
+ trn2 v25.4S, v22.4S, v0.4S // .*................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v28.4S, v22.4S, v0.4S // ..*.................................................................................................................................. + sqrdmulh v6.4S, v10.4S, v20.4S // ...*................................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v11.4S, v31.4S, v3.4S // ....*................................................................................................................................ + trn1 v23.2D, v29.2D, v25.2D // .....*............................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v18.4S, v1.4S, v9.4S // .......*............................................................................................................................. 
+ mul v3.4S, v10.4S, v7.4S // ......*.............................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v2.2D, v11.2D, v28.2D // ........*............................................................................................................................ + trn2 v28.2D, v11.2D, v28.2D // ............*........................................................................................................................ + sqrdmulh v31.4S, v18.4S, v20.4S // ..........*.......................................................................................................................... + // gap // ..................................................................................................................................... + mul v22.4S, v18.4S, v7.4S // ...........*......................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v11.4S, v1.4S, v9.4S // .........*........................................................................................................................... 
+ add v14.4S, v2.4S, v23.4S // ..............*...................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v18.2D, v29.2D, v25.2D // .............*....................................................................................................................... + mls v3.4S, v6.4S, v8.S[0] // ...............*..................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v1.4S, v2.4S, v23.4S // ................*.................................................................................................................... + mls v22.4S, v31.4S, v8.S[0] // .................*................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v25.4S, v28.4S, v18.4S // ..................*.................................................................................................................. 
+ sub v7.4S, v28.4S, v18.4S // ...................*................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v17.4S, v1.4S, v13.4S // ....................*................................................................................................................ + sqrdmulh v1.4S, v1.4S, v24.4S // .....................*............................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v28.4S, v16.4S, v11.4S // ......................*.............................................................................................................. + trn1 v20.4S, v3.4S, v22.4S // .......................*............................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v18.4S, v7.4S, v5.4S // ........................*............................................................................................................ 
+ mul v12.4S, v7.4S, v12.4S // ..........................*.......................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v31.2D, v28.2D, v20.2D // ...........................*......................................................................................................... + mls v17.4S, v1.4S, v8.S[0] // ............................*........................................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v7.4S, v14.4S, v25.4S // .............................*....................................................................................................... + trn2 v5.4S, v16.4S, v11.4S // ..............................*...................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v1.4S, v3.4S, v22.4S // ...............................*..................................................................................................... 
+ mls v12.4S, v18.4S, v8.S[0] // ................................*.................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v11.2D, v28.2D, v20.2D // .................................*................................................................................................... + add v24.4S, v14.4S, v25.4S // ..................................*.................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v28.2D, v5.2D, v1.2D // ...................................*................................................................................................. + mul v21.4S, v7.4S, v30.4S // ....................................*................................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v18.2D, v5.2D, v1.2D // .....................................*............................................................................................... 
+ sub v1.4S, v17.4S, v12.4S // ......................................*.............................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v20.4S, v31.4S, v28.4S // .......................................*............................................................................................. + sqrdmulh v7.4S, v7.4S, v26.4S // .........................................*........................................................................................... + // gap // ..................................................................................................................................... + ldr q3, [x4, #-32] // .........................*........................................................................................................... + mul v25.4S, v1.4S, v30.4S // ..........................................*.......................................................................................... + sqrdmulh v5.4S, v1.4S, v26.4S // ...........................................*......................................................................................... + // gap // ..................................................................................................................................... + ldr q23, [x4, #-48] // ........................................*............................................................................................ + add v1.4S, v17.4S, v12.4S // ............................................*........................................................................................ 
+ add v16.4S, v11.4S, v18.4S // .............................................*....................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v6.4S, v11.4S, v18.4S // ..............................................*...................................................................................... + add v26.4S, v31.4S, v28.4S // ...............................................*..................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v25.4S, v5.4S, v8.S[0] // ................................................*.................................................................................... + mls v21.4S, v7.4S, v8.S[0] // .................................................*................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v15.4S, v6.4S, v3.S[0] // ..................................................*.................................................................................. 
+ trn1 v28.4S, v24.4S, v1.4S // ...................................................*................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v29.4S, v26.4S, v16.4S // ....................................................*................................................................................ + mul v14.4S, v20.4S, v23.S[2] // .....................................................*............................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v18.4S, v24.4S, v1.4S // ......................................................*.............................................................................. + trn1 v7.4S, v21.4S, v25.4S // .......................................................*............................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + srshr v31.4S, v29.4S, #23 // ........................................................*............................................................................ 
+ trn2 v5.4S, v21.4S, v25.4S // ..........................................................*.......................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v1.4S, v20.4S, v23.S[3] // ...........................................................*......................................................................... + trn2 v20.2D, v28.2D, v7.2D // ............................................................*........................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v12.2D, v18.2D, v5.2D // .............................................................*....................................................................... + trn1 v11.2D, v18.2D, v5.2D // ..............................................................*...................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v5.4S, v6.4S, v3.S[1] // ...............................................................*..................................................................... 
+ trn1 v25.2D, v28.2D, v7.2D // ................................................................*.................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v14.4S, v1.4S, v8.S[0] // .................................................................*................................................................... + sub v7.4S, v20.4S, v12.4S // ..................................................................*.................................................................. + // gap // ..................................................................................................................................... + ldr q18, [x4, #-16] // .........................................................*........................................................................... + add v2.4S, v25.4S, v11.4S // ...................................................................*................................................................. + sub v1.4S, v25.4S, v11.4S // ....................................................................*................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v15.4S, v5.4S, v8.S[0] // .....................................................................*............................................................... 
+ add v20.4S, v20.4S, v12.4S // ......................................................................*.............................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v6.4S, v1.4S, v3.S[2] // .......................................................................*............................................................. + sqrdmulh v5.4S, v1.4S, v3.S[3] // ........................................................................*............................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v30.4S, v2.4S, v20.4S // .........................................................................*........................................................... + sub v16.4S, v26.4S, v16.4S // ..........................................................................*.......................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v1.4S, v7.4S, v18.S[1] // ...........................................................................*......................................................... 
+ mul v18.4S, v7.4S, v18.S[0] // ............................................................................*........................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + srshr v7.4S, v30.4S, #23 // .............................................................................*....................................................... + add v24.4S, v14.4S, v15.4S // ..............................................................................*...................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v29.4S, v31.4S, v8.4S // ...............................................................................*..................................................... + mls v6.4S, v5.4S, v8.S[0] // ................................................................................*.................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v18.4S, v1.4S, v8.S[0] // .................................................................................*................................................... 
+ sub v11.4S, v14.4S, v15.4S // ..................................................................................*.................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + srshr v25.4S, v24.4S, #23 // ...................................................................................*................................................. + mls v30.4S, v7.4S, v8.4S // ....................................................................................*................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v19.4S, v11.4S, v4.S[2] // .....................................................................................*............................................... + sub v5.4S, v2.4S, v20.4S // ......................................................................................*.............................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v28.4S, v11.4S, v4.S[3] // .......................................................................................*............................................. 
+ add v20.4S, v6.4S, v18.4S // ........................................................................................*............................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v24.4S, v25.4S, v8.4S // .........................................................................................*........................................... + sub v7.4S, v29.4S, v30.4S // ..........................................................................................*.......................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v31.4S, v5.4S, v23.S[1] // ...........................................................................................*......................................... + sub v1.4S, v6.4S, v18.4S // ............................................................................................*........................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v18.4S, v7.4S, v4.S[0] // .............................................................................................*....................................... 
+ sqrdmulh v7.4S, v7.4S, v4.S[1] // ..............................................................................................*...................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v12.4S, v5.4S, v23.S[0] // ...............................................................................................*..................................... + srshr v5.4S, v20.4S, #23 // ................................................................................................*.................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v6.4S, v1.4S, v23.S[0] // .................................................................................................*................................... + sqrdmulh v25.4S, v1.4S, v23.S[1] // ..................................................................................................*.................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v11.4S, v16.4S, v4.S[2] // ....................................................................................................*................................ 
+ sqrdmulh v1.4S, v16.4S, v4.S[3] // .....................................................................................................*............................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v18.4S, v7.4S, v8.S[0] // ...................................................................................................*................................. + mls v20.4S, v5.4S, v8.4S // ......................................................................................................*.............................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v6.4S, v25.4S, v8.S[0] // .........................................................................................................*........................... + mls v19.4S, v28.4S, v8.S[0] // ..........................................................................................................*.......................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v12.4S, v31.4S, v8.S[0] // ........................................................................................................*............................ 
+ mls v11.4S, v1.4S, v8.S[0] // ............................................................................................................*........................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v31.4S, v29.4S, v30.4S // .......................................................................................................*............................. + sub v1.4S, v24.4S, v20.4S // .............................................................................................................*....................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v28.4S, v24.4S, v20.4S // ..............................................................................................................*...................... + sub v5.4S, v19.4S, v6.4S // .................................................................................................................*................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v24.4S, v19.4S, v6.4S // ...............................................................................................................*..................... 
+ sub v2.4S, v11.4S, v12.4S // ....................................................................................................................*................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v7.4S, v5.4S, v4.S[1] // .....................................................................................................................*............... + mul v20.4S, v5.4S, v4.S[0] // ......................................................................................................................*.............. + // gap // ..................................................................................................................................... + str q18, [x2], #(16*4) // ...........................................................................................................*......................... + sqrdmulh v18.4S, v2.4S, v4.S[1] // ..........................................................................................................................*.......... + mul v25.4S, v2.4S, v4.S[0] // ............................................................................................................................*........ + // gap // ..................................................................................................................................... + str q31, [x1], #(16*4) // ................................................................................................................*.................... + sqrdmulh v5.4S, v1.4S, v4.S[1] // ..............................................................................................................................*...... 
+ mul v1.4S, v1.4S, v4.S[0] // ...........................................................................................................................*......... + // gap // ..................................................................................................................................... + add v11.4S, v11.4S, v12.4S // ..................................................................................................................*.................. + str q28, [x1, #-48] // ...................................................................................................................*................. + mls v20.4S, v7.4S, v8.S[0] // .............................................................................................................................*....... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + str q24, [x1, #-16] // .......................................................................................................................*............. + mls v25.4S, v18.4S, v8.S[0] // ...............................................................................................................................*..... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v1.4S, v5.4S, v8.S[0] // ................................................................................................................................*.... 
+ str q11, [x1, #-32] // ........................................................................................................................*............ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + str q20, [x2, #-16] // .................................................................................................................................*... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + str q25, [x2, #-32] // ..................................................................................................................................*.. + // gap // ..................................................................................................................................... + add x1, x1, #64 // .........................................................................................................................*........... + str q1, [x2, #-48] // ...................................................................................................................................*. 
+ add x2, x2, #64 // ....................................................................................................................................* + + // ----------------------------------------------------------- new position -----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------- + // mls v1.4S, v18.4S, v8.S[0] // *.................................................................................................................................... + // trn2 v6.4S, v22.4S, v0.4S // .*................................................................................................................................... + // trn1 v18.4S, v22.4S, v0.4S // ..*.................................................................................................................................. + // sqrdmulh v21.4S, v10.4S, v20.4S // ...*................................................................................................................................. + // trn1 v31.4S, v31.4S, v3.4S // ....*................................................................................................................................ + // trn1 v19.2D, v29.2D, v6.2D // .....*............................................................................................................................... + // mul v0.4S, v10.4S, v7.4S // .......*............................................................................................................................. + // sub v23.4S, v1.4S, v9.4S // ......*.............................................................................................................................. + // trn1 v10.2D, v31.2D, v18.2D // ........*............................................................................................................................ 
+ // add v3.4S, v1.4S, v9.4S // ............*........................................................................................................................ + // sqrdmulh v25.4S, v23.4S, v20.4S // ..........*.......................................................................................................................... + // mul v27.4S, v23.4S, v7.4S // ...........*......................................................................................................................... + // trn2 v31.2D, v31.2D, v18.2D // .........*........................................................................................................................... + // trn2 v28.2D, v29.2D, v6.2D // ..............*...................................................................................................................... + // add v29.4S, v10.4S, v19.4S // .............*....................................................................................................................... + // mls v0.4S, v21.4S, v8.S[0] // ...............*..................................................................................................................... + // sub v18.4S, v10.4S, v19.4S // ................*.................................................................................................................... + // mls v27.4S, v25.4S, v8.S[0] // .................*................................................................................................................... + // add v10.4S, v31.4S, v28.4S // ..................*.................................................................................................................. + // sub v14.4S, v31.4S, v28.4S // ...................*................................................................................................................. 
+ // mul v9.4S, v18.4S, v13.4S // ....................*................................................................................................................ + // sqrdmulh v7.4S, v18.4S, v24.4S // .....................*............................................................................................................... + // trn1 v25.4S, v16.4S, v3.4S // ......................*.............................................................................................................. + // trn1 v2.4S, v0.4S, v27.4S // .......................*............................................................................................................. + // sqrdmulh v19.4S, v14.4S, v5.4S // ........................*............................................................................................................ + // ldr q22, [x4, #-32] // ........................................*............................................................................................ + // mul v13.4S, v14.4S, v12.4S // .........................*........................................................................................................... + // trn1 v31.2D, v25.2D, v2.2D // ..........................*.......................................................................................................... + // mls v9.4S, v7.4S, v8.S[0] // ...........................*......................................................................................................... + // sub v11.4S, v29.4S, v10.4S // ............................*........................................................................................................ + // trn2 v7.4S, v16.4S, v3.4S // .............................*....................................................................................................... 
+ // trn2 v6.4S, v0.4S, v27.4S // ..............................*...................................................................................................... + // mls v13.4S, v19.4S, v8.S[0] // ...............................*..................................................................................................... + // trn2 v5.2D, v25.2D, v2.2D // ................................*.................................................................................................... + // add v1.4S, v29.4S, v10.4S // .................................*................................................................................................... + // trn1 v24.2D, v7.2D, v6.2D // ..................................*.................................................................................................. + // mul v27.4S, v11.4S, v30.4S // ...................................*................................................................................................. + // trn2 v10.2D, v7.2D, v6.2D // ....................................*................................................................................................ + // sub v25.4S, v9.4S, v13.4S // .....................................*............................................................................................... + // sub v6.4S, v31.4S, v24.4S // ......................................*.............................................................................................. + // ldr q29, [x4, #-48] // ...........................................*......................................................................................... + // sqrdmulh v3.4S, v11.4S, v26.4S // .......................................*............................................................................................. 
+ // mul v0.4S, v25.4S, v30.4S // .........................................*........................................................................................... + // sqrdmulh v15.4S, v25.4S, v26.4S // ..........................................*.......................................................................................... + // add v26.4S, v9.4S, v13.4S // ............................................*........................................................................................ + // add v11.4S, v5.4S, v10.4S // .............................................*....................................................................................... + // sub v17.4S, v5.4S, v10.4S // ..............................................*...................................................................................... + // add v2.4S, v31.4S, v24.4S // ...............................................*..................................................................................... + // mls v0.4S, v15.4S, v8.S[0] // ................................................*.................................................................................... + // mls v27.4S, v3.4S, v8.S[0] // .................................................*................................................................................... + // mul v10.4S, v17.4S, v22.S[0] // ..................................................*.................................................................................. + // trn1 v31.4S, v1.4S, v26.4S // ...................................................*................................................................................. + // add v21.4S, v2.4S, v11.4S // ....................................................*................................................................................ 
+ // mul v9.4S, v6.4S, v29.S[2] // .....................................................*............................................................................... + // trn2 v16.4S, v1.4S, v26.4S // ......................................................*.............................................................................. + // trn1 v12.4S, v27.4S, v0.4S // .......................................................*............................................................................. + // srshr v7.4S, v21.4S, #23 // ........................................................*............................................................................ + // ldr q26, [x4, #-16] // ..................................................................*.................................................................. + // trn2 v24.4S, v27.4S, v0.4S // .........................................................*........................................................................... + // sqrdmulh v14.4S, v6.4S, v29.S[3] // ..........................................................*.......................................................................... + // trn2 v13.2D, v31.2D, v12.2D // ...........................................................*......................................................................... + // trn2 v19.2D, v16.2D, v24.2D // ............................................................*........................................................................ + // trn1 v6.2D, v16.2D, v24.2D // .............................................................*....................................................................... + // sqrdmulh v23.4S, v17.4S, v22.S[1] // ..............................................................*...................................................................... 
+ // trn1 v25.2D, v31.2D, v12.2D // ...............................................................*..................................................................... + // mls v9.4S, v14.4S, v8.S[0] // ................................................................*.................................................................... + // sub v15.4S, v13.4S, v19.4S // .................................................................*................................................................... + // add v18.4S, v25.4S, v6.4S // ...................................................................*................................................................. + // sub v27.4S, v25.4S, v6.4S // ....................................................................*................................................................ + // mls v10.4S, v23.4S, v8.S[0] // .....................................................................*............................................................... + // add v19.4S, v13.4S, v19.4S // ......................................................................*.............................................................. + // mul v0.4S, v27.4S, v22.S[2] // .......................................................................*............................................................. + // sqrdmulh v30.4S, v27.4S, v22.S[3] // ........................................................................*............................................................ + // add v23.4S, v18.4S, v19.4S // .........................................................................*........................................................... + // sub v1.4S, v2.4S, v11.4S // ..........................................................................*.......................................................... 
+ // sqrdmulh v25.4S, v15.4S, v26.S[1] // ...........................................................................*......................................................... + // mul v26.4S, v15.4S, v26.S[0] // ............................................................................*........................................................ + // srshr v3.4S, v23.4S, #23 // .............................................................................*....................................................... + // add v27.4S, v9.4S, v10.4S // ..............................................................................*...................................................... + // mls v21.4S, v7.4S, v8.4S // ...............................................................................*..................................................... + // mls v0.4S, v30.4S, v8.S[0] // ................................................................................*.................................................... + // mls v26.4S, v25.4S, v8.S[0] // .................................................................................*................................................... + // sub v11.4S, v9.4S, v10.4S // ..................................................................................*.................................................. + // srshr v14.4S, v27.4S, #23 // ...................................................................................*................................................. + // mls v23.4S, v3.4S, v8.4S // ....................................................................................*................................................ + // mul v13.4S, v11.4S, v4.S[2] // .....................................................................................*............................................... 
+ // sub v6.4S, v18.4S, v19.4S // ......................................................................................*.............................................. + // sqrdmulh v7.4S, v11.4S, v4.S[3] // .......................................................................................*............................................. + // add v2.4S, v0.4S, v26.4S // ........................................................................................*............................................ + // mls v27.4S, v14.4S, v8.4S // .........................................................................................*........................................... + // sub v3.4S, v21.4S, v23.4S // ..........................................................................................*.......................................... + // sqrdmulh v17.4S, v6.4S, v29.S[1] // ...........................................................................................*......................................... + // sub v31.4S, v0.4S, v26.4S // ............................................................................................*........................................ + // mul v9.4S, v3.4S, v4.S[0] // .............................................................................................*....................................... + // sqrdmulh v16.4S, v3.4S, v4.S[1] // ..............................................................................................*...................................... + // mul v19.4S, v6.4S, v29.S[0] // ...............................................................................................*..................................... + // srshr v12.4S, v2.4S, #23 // ................................................................................................*.................................... 
+ // mul v3.4S, v31.4S, v29.S[0] // .................................................................................................*................................... + // sqrdmulh v15.4S, v31.4S, v29.S[1] // ..................................................................................................*.................................. + // mls v9.4S, v16.4S, v8.S[0] // .....................................................................................................*............................... + // mul v14.4S, v1.4S, v4.S[2] // ...................................................................................................*................................. + // sqrdmulh v18.4S, v1.4S, v4.S[3] // ....................................................................................................*................................ + // mls v2.4S, v12.4S, v8.4S // ......................................................................................................*.............................. + // add v26.4S, v21.4S, v23.4S // ...........................................................................................................*......................... + // mls v19.4S, v17.4S, v8.S[0] // .........................................................................................................*........................... + // mls v3.4S, v15.4S, v8.S[0] // .......................................................................................................*............................. + // mls v13.4S, v7.4S, v8.S[0] // ........................................................................................................*............................ + // str q9, [x2], #(16*4) // ...................................................................................................................*................. 
+ // mls v14.4S, v18.4S, v8.S[0] // ..........................................................................................................*.......................... + // sub v6.4S, v27.4S, v2.4S // ............................................................................................................*........................ + // add v11.4S, v27.4S, v2.4S // .............................................................................................................*....................... + // add v7.4S, v13.4S, v3.4S // ...............................................................................................................*..................... + // str q26, [x1], #(16*4) // ......................................................................................................................*.............. + // sub v31.4S, v13.4S, v3.4S // ..............................................................................................................*...................... + // add v29.4S, v14.4S, v19.4S // .........................................................................................................................*........... + // str q11, [x1, #-48] // ..........................................................................................................................*.......... + // sub v26.4S, v14.4S, v19.4S // ................................................................................................................*.................... + // sqrdmulh v9.4S, v31.4S, v4.S[1] // .................................................................................................................*................... + // mul v10.4S, v31.4S, v4.S[0] // ..................................................................................................................*.................. 
+ // str q7, [x1, #-16] // ............................................................................................................................*........ + // str q29, [x1, #-32] // ...............................................................................................................................*..... + // add x1, x1, #64 // ..................................................................................................................................*.. + // sqrdmulh v14.4S, v26.4S, v4.S[1] // ....................................................................................................................*................ + // mul v27.4S, v6.4S, v4.S[0] // ........................................................................................................................*............ + // mul v0.4S, v26.4S, v4.S[0] // .....................................................................................................................*............... + // mls v10.4S, v9.4S, v8.S[0] // ...........................................................................................................................*......... + // sqrdmulh v6.4S, v6.4S, v4.S[1] // .......................................................................................................................*............. + // mls v0.4S, v14.4S, v8.S[0] // .............................................................................................................................*....... + // mls v27.4S, v6.4S, v8.S[0] // ..............................................................................................................................*...... + // str q10, [x2, #-16] // ................................................................................................................................*.... + // str q0, [x2, #-32] // .................................................................................................................................*... 
+ // str q27, [x2, #-48] // ...................................................................................................................................*. + // add x2, x2, #64 // ....................................................................................................................................* // ----------------------------------------------------------------------------- @@ -1413,299 +1413,299 @@ layer45678_start: // Expected cycles: 47 // Expected IPC: 2.02 // - // Wall time: 38.83s - // User time: 38.83s + // Wall time: 36.09s + // User time: 36.09s // // ------------------------------------- original position --------------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|------------------- + ldr q17, [x0, #896] // .....*......................................................................................... + ldr q23, [x0, #768] // ....*.......................................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - ldr q7, [x0, #768] // .*............................................................................................. - ldr q11, [x0, #896] // ....*.......................................................................................... - ldr q23, [x0, #640] // ...*........................................................................................... + ldr q9, [x0, #512] // .......*....................................................................................... + ldr q4, [x0, #640] // ......*........................................................................................ // gap // ............................................................................................... 
// gap // ............................................................................................... - ldr q20, [x0, #512] // ........*...................................................................................... - ldr q21, [x0, #128] // .....*......................................................................................... + ldr q7, [x0, #0] // ...*........................................................................................... + ldr q28, [x0, #384] // ..*............................................................................................ // gap // ............................................................................................... // gap // ............................................................................................... - ldr q9, [x0, #0] // ......*........................................................................................ - ldr q15, [x0, #256] // ..*............................................................................................ + ldr q19, [x0, #256] // *.............................................................................................. + ldr q24, [x0, #128] // .*............................................................................................. // gap // ............................................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - sub v27.4S, v7.4S, v11.4S // ..............................*................................................................ - add v5.4S, v7.4S, v11.4S // .........*..................................................................................... - ldr q28, [x0, #384] // *.............................................................................................. 
// gap // ............................................................................................... - sub v7.4S, v20.4S, v23.4S // ..............*................................................................................ + add v22.4S, v23.4S, v17.4S // ..........*.................................................................................... + sub v17.4S, v23.4S, v17.4S // .............*................................................................................. + sub v13.4S, v9.4S, v4.4S // ..............*................................................................................ // gap // ............................................................................................... // gap // ............................................................................................... - add v17.4S, v20.4S, v23.4S // ...............*............................................................................... + add v20.4S, v9.4S, v4.4S // ...................*........................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - mul v23.4S, v27.4S, v3.S[0] // ....................................*.......................................................... - sqrdmulh v27.4S, v27.4S, v3.S[1] // .....................................*......................................................... - sqrdmulh v11.4S, v7.4S, v2.S[3] // ..........................*.................................................................... - mul v13.4S, v7.4S, v2.S[2] // ............................*.................................................................. + mul v14.4S, v17.4S, v3.S[0] // .............................*................................................................. 
+ sqrdmulh v27.4S, v17.4S, v3.S[1] // ....................*.......................................................................... + sqrdmulh v9.4S, v13.4S, v2.S[3] // ................................*.............................................................. + mul v16.4S, v13.4S, v2.S[2] // .....................*......................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - sub v16.4S, v9.4S, v21.4S // ............*.................................................................................. - add v24.4S, v15.4S, v28.4S // ...........*................................................................................... + sub v21.4S, v7.4S, v24.4S // ............*.................................................................................. + sub v29.4S, v20.4S, v22.4S // ........................*...................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - sub v22.4S, v15.4S, v28.4S // .......*....................................................................................... + sub v12.4S, v19.4S, v28.4S // ...........*................................................................................... + mls v14.4S, v27.4S, v8.S[0] // .........................................*..................................................... // gap // ............................................................................................... // gap // ............................................................................................... 
- add v7.4S, v9.4S, v21.4S // .............*................................................................................. - mls v13.4S, v11.4S, v8.S[0] // .........................................*..................................................... + mls v16.4S, v9.4S, v8.S[0] // ............................................*.................................................. // gap // ............................................................................................... // gap // ............................................................................................... - mls v23.4S, v27.4S, v8.S[0] // ..........................................*.................................................... - mul v20.4S, v16.4S, v1.S[2] // .........................*..................................................................... + sqrdmulh v10.4S, v29.4S, v1.S[1] // ............................*.................................................................. + add v13.4S, v20.4S, v22.4S // ..........................*.................................................................... + sqrdmulh v9.4S, v21.4S, v1.S[3] // .....................................*......................................................... // gap // ............................................................................................... - sub v27.4S, v7.4S, v24.4S // .................*............................................................................. // gap // ............................................................................................... - add v18.4S, v7.4S, v24.4S // ........................*...................................................................... - sub v9.4S, v17.4S, v5.4S // ...................*........................................................................... + add v6.4S, v7.4S, v24.4S // .........*..................................................................................... 
+ add v19.4S, v19.4S, v28.4S // ........*...................................................................................... // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v24.4S, v22.4S, v2.S[1] // .................................*............................................................. + sub v15.4S, v16.4S, v14.4S // .......................................................*....................................... + mul v28.4S, v21.4S, v1.S[2] // ......................................*........................................................ // gap // ............................................................................................... // gap // ............................................................................................... - sub v11.4S, v13.4S, v23.4S // ....................................................*.......................................... - mul v14.4S, v27.4S, v0.S[2] // ....................*.......................................................................... + sqrdmulh v24.4S, v12.4S, v2.S[1] // ................*.............................................................................. + sub v4.4S, v6.4S, v19.4S // .................*............................................................................. // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v15.4S, v27.4S, v0.S[3] // .....................*......................................................................... - sqrdmulh v27.4S, v11.4S, v1.S[1] // ........................................................*...................................... 
- mul v7.4S, v11.4S, v1.S[0] // .......................................................*....................................... // gap // ............................................................................................... + mul v21.4S, v12.4S, v2.S[0] // ...............*............................................................................... + mul v23.4S, v29.4S, v1.S[0] // ...........................*................................................................... // gap // ............................................................................................... - mul v21.4S, v9.4S, v1.S[0] // .......................*....................................................................... - sqrdmulh v11.4S, v9.4S, v1.S[1] // ......................*........................................................................ + mul v22.4S, v4.4S, v0.S[2] // .......................*....................................................................... + mls v28.4S, v9.4S, v8.S[0] // .............................................*................................................. // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v16.4S, v16.4S, v1.S[3] // ................*.............................................................................. - mul v9.4S, v22.4S, v2.S[0] // ..........*.................................................................................... + sqrdmulh v27.4S, v15.4S, v1.S[1] // ..............................................................*................................ + mul v9.4S, v15.4S, v1.S[0] // ...................................................................*........................... // gap // ............................................................................................... 
// gap // ............................................................................................... - mls v7.4S, v27.4S, v8.S[0] // ..............................................................*................................ - add v27.4S, v17.4S, v5.4S // ..................*............................................................................ + add v6.4S, v6.4S, v19.4S // ..................*............................................................................ + mls v21.4S, v24.4S, v8.S[0] // .........................*..................................................................... // gap // ............................................................................................... // gap // ............................................................................................... + mls v23.4S, v10.4S, v8.S[0] // ..................................*............................................................ + sqrdmulh v15.4S, v4.4S, v0.S[3] // ......................*........................................................................ // gap // ............................................................................................... - mls v21.4S, v11.4S, v8.S[0] // .............................*................................................................. - mls v14.4S, v15.4S, v8.S[0] // ...........................*................................................................... // gap // ............................................................................................... - mls v9.4S, v24.4S, v8.S[0] // ........................................*...................................................... - mls v20.4S, v16.4S, v8.S[0] // ................................*.............................................................. + add v10.4S, v6.4S, v13.4S // ..............................*................................................................ 
// gap // ............................................................................................... + mls v9.4S, v27.4S, v8.S[0] // ........................................................................*...................... // gap // ............................................................................................... - sub v4.4S, v18.4S, v27.4S // ...............................*............................................................... - add v17.4S, v18.4S, v27.4S // ..................................*............................................................ + sub v19.4S, v28.4S, v21.4S // ..................................................*............................................ // gap // ............................................................................................... + sub v27.4S, v6.4S, v13.4S // ...............................*............................................................... // gap // ............................................................................................... - add v24.4S, v13.4S, v23.4S // ...............................................*............................................... // gap // ............................................................................................... + sqrdmulh v24.4S, v10.4S, v26.4S // .................................................*............................................. + mls v22.4S, v15.4S, v8.S[0] // .................................*............................................................. // gap // ............................................................................................... - sub v11.4S, v14.4S, v21.4S // ...................................*........................................................... - add v13.4S, v20.4S, v9.4S // ..............................................*................................................ 
- sub v27.4S, v20.4S, v9.4S // .............................................*................................................. + sqrdmulh v7.4S, v19.4S, v0.S[3] // ........................................................*...................................... + mul v17.4S, v19.4S, v0.S[2] // ............................................................*.................................. // gap // ............................................................................................... // gap // ............................................................................................... - mul v22.4S, v11.4S, v0.S[0] // ......................................*........................................................ - sqrdmulh v20.4S, v11.4S, v0.S[1] // .......................................*....................................................... + sqrdmulh v4.4S, v27.4S, v0.S[1] // ....................................*.......................................................... + mul v10.4S, v10.4S, v25.4S // ....................................................*.......................................... // gap // ............................................................................................... // gap // ............................................................................................... - sub v11.4S, v13.4S, v24.4S // ..........................................................*.................................... - mul v10.4S, v4.4S, v0.S[0] // ............................................*.................................................. + mul v20.4S, v27.4S, v0.S[0] // ...................................*........................................................... + sub v13.4S, v22.4S, v23.4S // ................................................................*.............................. // gap // ............................................................................................... 
// gap // ............................................................................................... - sqrdmulh v23.4S, v27.4S, v0.S[3] // .....................................................*......................................... // gap // ............................................................................................... + mls v17.4S, v7.4S, v8.S[0] // .......................................................................*....................... // gap // ............................................................................................... - mul v9.4S, v27.4S, v0.S[2] // ...................................................*........................................... - mls v22.4S, v20.4S, v8.S[0] // ...........................................*................................................... - mul v27.4S, v11.4S, v0.S[0] // ................................................................*.............................. + add v19.4S, v22.4S, v23.4S // .......................................*....................................................... + sqrdmulh v22.4S, v13.4S, v0.S[1] // .........................................................................*..................... + mul v5.4S, v13.4S, v0.S[0] // .....................................................................*......................... // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v15.4S, v11.4S, v0.S[1] // .............................................................*................................. - add v18.4S, v13.4S, v24.4S // ............................................................*.................................. + mls v20.4S, v4.4S, v8.S[0] // ........................................*...................................................... 
+ mls v10.4S, v24.4S, v8.S[0] // ...........................................................*................................... // gap // ............................................................................................... // gap // ............................................................................................... + add v12.4S, v16.4S, v14.4S // ...............................................................*............................... // gap // ............................................................................................... // gap // ............................................................................................... - mls v9.4S, v23.4S, v8.S[0] // ...........................................................*................................... - add v11.4S, v14.4S, v21.4S // .......................................................................*....................... - cmge v20.4S, v31.4S, v22.4S // ..................................................*............................................ - mul v24.4S, v18.4S, v25.4S // ...................................................................*........................... + mul v7.4S, v19.4S, v25.4S // ..........................................*.................................................... + mls v5.4S, v22.4S, v8.S[0] // ..............................................................................*................ + sub v14.4S, v17.4S, v9.4S // ...........................................................................*................... // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v21.4S, v11.4S, v26.4S // ..............................................................................*................ 
+ cmge v15.4S, v20.4S, v30.4S // ..............................................*................................................ // gap // ............................................................................................... - cmge v16.4S, v22.4S, v30.4S // .................................................*............................................. // gap // ............................................................................................... + add v23.4S, v17.4S, v9.4S // ............................................................................*.................. + cmge v9.4S, v10.4S, v30.4S // ....................................................................................*.......... + cmge v27.4S, v31.4S, v10.4S // ......................................................................*........................ // gap // ............................................................................................... // gap // ............................................................................................... - sub v13.4S, v9.4S, v7.4S // ....................................................................*.......................... - add v23.4S, v9.4S, v7.4S // .....................................................................*......................... - mul v9.4S, v11.4S, v25.4S // ...........................................................................*................... - sub v11.4S, v20.4S, v16.4S // ......................................................*........................................ + cmge v22.4S, v31.4S, v20.4S // ................................................*.............................................. + sqrdmulh v29.4S, v19.4S, v26.4S // ...........................................*................................................... // gap // ............................................................................................... 
// gap // ............................................................................................... - mul v7.4S, v13.4S, v0.S[0] // ..........................................................................*.................... - sqrdmulh v20.4S, v13.4S, v0.S[1] // ........................................................................*...................... + sub v16.4S, v27.4S, v9.4S // ...........................................................................................*... + add v28.4S, v28.4S, v21.4S // .................................................................*............................. // gap // ............................................................................................... // gap // ............................................................................................... + sqrdmulh v21.4S, v23.4S, v26.4S // ...............................................................................*............... + sub v9.4S, v22.4S, v15.4S // ...................................................*........................................... // gap // ............................................................................................... // gap // ............................................................................................... - mls v27.4S, v15.4S, v8.S[0] // ......................................................................*........................ - mul v13.4S, v23.4S, v25.4S // .........................................................................*..................... - sqrdmulh v15.4S, v23.4S, v26.4S // .............................................................................*................. - mls v22.4S, v11.4S, v8.4S // .........................................................*..................................... + mls v7.4S, v29.4S, v8.S[0] // ...............................................*............................................... 
// gap // ............................................................................................... // gap // ............................................................................................... + add v22.4S, v28.4S, v12.4S // ..........................................................................*.................... + mls v20.4S, v9.4S, v8.4S // .........................................................*..................................... + mul v23.4S, v23.4S, v25.4S // ................................................................................*.............. // gap // ............................................................................................... // gap // ............................................................................................... - mls v9.4S, v21.4S, v8.S[0] // ...................................................................................*........... - mls v7.4S, v20.4S, v8.S[0] // ................................................................................*.............. - sqrdmulh v11.4S, v4.4S, v0.S[1] // ................................................*.............................................. + cmge v17.4S, v5.4S, v30.4S // .......................................................................................*....... // gap // ............................................................................................... // gap // ............................................................................................... - sqrdmulh v18.4S, v18.4S, v26.4S // .................................................................*............................. - mls v13.4S, v15.4S, v8.S[0] // .................................................................................*............. - str q22, [x0, #768] // ...............................................................*............................... 
+ mul v27.4S, v22.4S, v25.4S // .............................................................................*................. // gap // ............................................................................................... - cmge v28.4S, v31.4S, v27.4S // ............................................................................*.................. - cmge v20.4S, v7.4S, v30.4S // .....................................................................................*......... - cmge v21.4S, v31.4S, v7.4S // ....................................................................................*.......... + cmge v9.4S, v31.4S, v7.4S // .....................................................*......................................... + cmge v15.4S, v7.4S, v30.4S // ......................................................*........................................ // gap // ............................................................................................... + mls v23.4S, v21.4S, v8.S[0] // .....................................................................................*......... + str q20, [x0, #512] // ..................................................................*............................ // gap // ............................................................................................... + sqrdmulh v21.4S, v22.4S, v26.4S // .................................................................................*............. + sub v15.4S, v9.4S, v15.4S // ..........................................................*.................................... // gap // ............................................................................................... // gap // ............................................................................................... - mls v10.4S, v11.4S, v8.S[0] // ..................................................................*............................ 
- cmge v22.4S, v31.4S, v9.4S // ...........................................................................................*... - cmge v11.4S, v13.4S, v30.4S // .......................................................................................*....... + mul v13.4S, v14.4S, v0.S[0] // ..................................................................................*............ + cmge v9.4S, v31.4S, v5.4S // .........................................................................................*..... // gap // ............................................................................................... // gap // ............................................................................................... - cmge v23.4S, v31.4S, v13.4S // ......................................................................................*........ - sub v20.4S, v21.4S, v20.4S // ........................................................................................*...... + sqrdmulh v14.4S, v14.4S, v0.S[1] // ...................................................................................*........... + mls v7.4S, v15.4S, v8.4S // .............................................................*................................. // gap // ............................................................................................... // gap // ............................................................................................... - mls v24.4S, v18.4S, v8.S[0] // ...............................................................................*............... - sub v11.4S, v23.4S, v11.4S // ..........................................................................................*.... - mul v23.4S, v17.4S, v25.4S // ..............................................................................................* + mls v27.4S, v21.4S, v8.S[0] // ......................................................................................*........ 
+ sub v20.4S, v9.4S, v17.4S // ..............................................................................................* // gap // ............................................................................................... // gap // ............................................................................................... // gap // ............................................................................................... + cmge v21.4S, v31.4S, v23.4S // ..........................................................................................*.... + mls v13.4S, v14.4S, v8.S[0] // ........................................................................................*...... // gap // ............................................................................................... - cmge v29.4S, v31.4S, v10.4S // ..................................................................................*............ - mls v7.4S, v20.4S, v8.4S // ............................................................................................*.. - mls v13.4S, v11.4S, v8.4S // .............................................................................................*. // gap // ............................................................................................... // gap // ............................................................................................... - cmge v14.4S, v24.4S, v30.4S // .........................................................................................*..... + cmge v15.4S, v31.4S, v27.4S // ............................................................................................*.. + cmge v14.4S, v27.4S, v30.4S // .............................................................................................*. + str q7, [x0, #256] // ....................................................................*.......................... 
// ---------------------------------------- new position ----------------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|------------------- - // ldr q15, [x0, #384] // .........*..................................................................................... - // ldr q12, [x0, #768] // *.............................................................................................. - // ldr q20, [x0, #256] // ......*........................................................................................ - // ldr q19, [x0, #640] // ..*............................................................................................ - // ldr q4, [x0, #896] // .*............................................................................................. - // ldr q11, [x0, #128] // ....*.......................................................................................... - // ldr q5, [x0, #0] // .....*......................................................................................... - // sub v14.4S, v20.4S, v15.4S // ..................*............................................................................ - // ldr q17, [x0, #512] // ...*........................................................................................... - // add v28.4S, v12.4S, v4.4S // ........*...................................................................................... - // mul v21.4S, v14.4S, v2.S[0] // ...................................*........................................................... - // add v15.4S, v20.4S, v15.4S // .................*............................................................................. - // sub v20.4S, v5.4S, v11.4S // ................*.............................................................................. - // add v5.4S, v5.4S, v11.4S // ...................*........................................................................... 
- // sub v29.4S, v17.4S, v19.4S // ..........*.................................................................................... - // add v11.4S, v17.4S, v19.4S // ...........*................................................................................... - // sqrdmulh v19.4S, v20.4S, v1.S[3] // ..................................*............................................................ - // sub v6.4S, v5.4S, v15.4S // .......................*....................................................................... - // add v7.4S, v11.4S, v28.4S // .....................................*......................................................... - // sub v28.4S, v11.4S, v28.4S // .........................*..................................................................... - // mul v11.4S, v6.4S, v0.S[2] // ............................*.................................................................. - // sqrdmulh v17.4S, v6.4S, v0.S[3] // .............................*................................................................. - // sqrdmulh v6.4S, v28.4S, v1.S[1] // .................................*............................................................. - // mul v18.4S, v28.4S, v1.S[0] // ................................*.............................................................. - // add v27.4S, v5.4S, v15.4S // ........................*...................................................................... - // mul v20.4S, v20.4S, v1.S[2] // ......................*........................................................................ - // sqrdmulh v28.4S, v29.4S, v2.S[3] // ..............*................................................................................ - // mls v11.4S, v17.4S, v8.S[0] // .......................................*....................................................... - // mul v15.4S, v29.4S, v2.S[2] // ...............*............................................................................... 
- // mls v18.4S, v6.4S, v8.S[0] // ......................................*........................................................ - // sub v5.4S, v12.4S, v4.4S // .......*....................................................................................... - // sub v6.4S, v27.4S, v7.4S // ..........................................*.................................................... - // mls v20.4S, v19.4S, v8.S[0] // .........................................*..................................................... - // sqrdmulh v29.4S, v14.4S, v2.S[1] // ..........................*.................................................................... - // add v17.4S, v27.4S, v7.4S // ...........................................*................................................... - // sub v10.4S, v11.4S, v18.4S // .............................................*................................................. - // mul v7.4S, v5.4S, v3.S[0] // ............*.................................................................................. - // sqrdmulh v19.4S, v5.4S, v3.S[1] // .............*................................................................................. - // mul v27.4S, v10.4S, v0.S[0] // ................................................*.............................................. - // sqrdmulh v10.4S, v10.4S, v0.S[1] // .................................................*............................................. - // mls v21.4S, v29.4S, v8.S[0] // ........................................*...................................................... - // mls v15.4S, v28.4S, v8.S[0] // ....................*.......................................................................... - // mls v7.4S, v19.4S, v8.S[0] // .....................*......................................................................... - // mls v27.4S, v10.4S, v8.S[0] // ......................................................*........................................ 
- // mul v10.4S, v6.4S, v0.S[0] // ...................................................*........................................... - // sub v29.4S, v20.4S, v21.4S // ...............................................*............................................... - // add v22.4S, v20.4S, v21.4S // ..............................................*................................................ - // add v4.4S, v15.4S, v7.4S // ............................................*.................................................. - // sqrdmulh v5.4S, v6.4S, v0.S[1] // ............................................................................*.................. - // cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................... - // cmge v19.4S, v31.4S, v27.4S // ............................................................*.................................. - // mul v21.4S, v29.4S, v0.S[2] // .....................................................*......................................... - // sub v28.4S, v15.4S, v7.4S // ...........................*................................................................... - // sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................................*.......................................... - // sub v29.4S, v19.4S, v12.4S // ...................................................................*........................... - // mul v20.4S, v28.4S, v1.S[0] // ...............................*............................................................... - // sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................*................................................................ - // mls v27.4S, v29.4S, v8.4S // .........................................................................*..................... - // sub v12.4S, v22.4S, v4.4S // ..................................................*............................................ 
- // mls v21.4S, v6.4S, v8.S[0] // ..........................................................*.................................... - // add v29.4S, v22.4S, v4.4S // .........................................................*..................................... - // sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................*...................................... - // mls v20.4S, v14.4S, v8.S[0] // ....................................*.......................................................... - // str q27, [x0, #768] // ...............................................................................*............... - // mul v27.4S, v12.4S, v0.S[0] // .......................................................*....................................... - // sqrdmulh v6.4S, v29.4S, v26.4S // .............................................................................*................. - // mls v10.4S, v5.4S, v8.S[0] // ...................................................................................*........... - // mul v24.4S, v29.4S, v25.4S // .............................................................*................................. - // sub v29.4S, v21.4S, v20.4S // ................................................................*.............................. - // add v12.4S, v21.4S, v20.4S // .................................................................*............................. - // mls v27.4S, v22.4S, v8.S[0] // ......................................................................*........................ - // add v14.4S, v11.4S, v18.4S // ...........................................................*................................... - // sqrdmulh v19.4S, v29.4S, v0.S[1] // .....................................................................*......................... - // mul v13.4S, v12.4S, v25.4S // .......................................................................*....................... 
- // mul v7.4S, v29.4S, v0.S[0] // ....................................................................*.......................... - // mul v9.4S, v14.4S, v25.4S // ..................................................................*............................ - // cmge v28.4S, v31.4S, v27.4S // ................................................................................*.............. - // sqrdmulh v12.4S, v12.4S, v26.4S // ........................................................................*...................... - // sqrdmulh v20.4S, v14.4S, v26.4S // ..............................................................*................................ - // mls v24.4S, v6.4S, v8.S[0] // ........................................................................................*...... - // mls v7.4S, v19.4S, v8.S[0] // ...........................................................................*................... - // mls v13.4S, v12.4S, v8.S[0] // ..............................................................................*................ - // cmge v29.4S, v31.4S, v10.4S // ...........................................................................................*... - // mls v9.4S, v20.4S, v8.S[0] // ..........................................................................*.................... - // cmge v15.4S, v31.4S, v7.4S // ..................................................................................*............ - // cmge v21.4S, v7.4S, v30.4S // .................................................................................*............. - // cmge v5.4S, v31.4S, v13.4S // ......................................................................................*........ - // cmge v12.4S, v13.4S, v30.4S // .....................................................................................*......... - // sub v15.4S, v15.4S, v21.4S // .......................................................................................*....... 
- // cmge v14.4S, v24.4S, v30.4S // ..............................................................................................* - // sub v21.4S, v5.4S, v12.4S // .........................................................................................*..... - // cmge v22.4S, v31.4S, v9.4S // ....................................................................................*.......... - // mls v7.4S, v15.4S, v8.4S // ............................................................................................*.. - // mls v13.4S, v21.4S, v8.4S // .............................................................................................*. - // mul v23.4S, v17.4S, v25.4S // ..........................................................................................*.... + // ldr q6, [x0, #256] // ......*........................................................................................ + // ldr q4, [x0, #128] // .......*....................................................................................... + // ldr q29, [x0, #384] // .....*......................................................................................... + // ldr q24, [x0, #0] // ....*.......................................................................................... + // ldr q17, [x0, #768] // .*............................................................................................. + // ldr q7, [x0, #896] // *.............................................................................................. + // ldr q22, [x0, #640] // ...*........................................................................................... + // ldr q19, [x0, #512] // ..*............................................................................................ + // add v14.4S, v6.4S, v29.4S // .........................*..................................................................... 
+ // add v5.4S, v24.4S, v4.4S // ........................*...................................................................... + // add v21.4S, v17.4S, v7.4S // ........*...................................................................................... + // sub v11.4S, v6.4S, v29.4S // ..................*............................................................................ + // sub v28.4S, v24.4S, v4.4S // ................*.............................................................................. + // sub v15.4S, v17.4S, v7.4S // .........*..................................................................................... + // sub v6.4S, v19.4S, v22.4S // ..........*.................................................................................... + // mul v17.4S, v11.4S, v2.S[0] // ..............................*................................................................ + // sqrdmulh v4.4S, v11.4S, v2.S[1] // ............................*.................................................................. + // sub v11.4S, v5.4S, v14.4S // .............................*................................................................. + // add v7.4S, v5.4S, v14.4S // ....................................*.......................................................... + // add v29.4S, v19.4S, v22.4S // ...........*................................................................................... + // sqrdmulh v22.4S, v15.4S, v3.S[1] // .............*................................................................................. + // mul v14.4S, v6.4S, v2.S[2] // ...............*............................................................................... + // sqrdmulh v5.4S, v11.4S, v0.S[3] // .......................................*....................................................... + // mul v19.4S, v11.4S, v0.S[2] // ................................*.............................................................. 
+ // sub v11.4S, v29.4S, v21.4S // .................*............................................................................. + // mls v17.4S, v4.4S, v8.S[0] // .....................................*......................................................... + // add v24.4S, v29.4S, v21.4S // ......................*........................................................................ + // mul v4.4S, v11.4S, v1.S[0] // ...............................*............................................................... + // sqrdmulh v11.4S, v11.4S, v1.S[1] // .....................*......................................................................... + // mul v29.4S, v15.4S, v3.S[0] // ............*.................................................................................. + // add v21.4S, v7.4S, v24.4S // ........................................*...................................................... + // sub v15.4S, v7.4S, v24.4S // ...........................................*................................................... + // sqrdmulh v7.4S, v6.4S, v2.S[3] // ..............*................................................................................ + // mls v19.4S, v5.4S, v8.S[0] // .............................................*................................................. + // mls v4.4S, v11.4S, v8.S[0] // ......................................*........................................................ + // mul v5.4S, v15.4S, v0.S[0] // ..................................................*............................................ + // sqrdmulh v15.4S, v15.4S, v0.S[1] // ................................................*.............................................. + // sqrdmulh v24.4S, v28.4S, v1.S[3] // .......................*....................................................................... + // mul v27.4S, v28.4S, v1.S[2] // ...........................*................................................................... 
+ // add v6.4S, v19.4S, v4.4S // .....................................................*......................................... + // mls v5.4S, v15.4S, v8.S[0] // ........................................................*...................................... + // mls v29.4S, v22.4S, v8.S[0] // ...................*........................................................................... + // mul v22.4S, v6.4S, v25.4S // ...........................................................*................................... + // sqrdmulh v6.4S, v6.4S, v26.4S // ...................................................................*........................... + // mls v14.4S, v7.4S, v8.S[0] // ....................*.......................................................................... + // mls v27.4S, v24.4S, v8.S[0] // .................................*............................................................. + // cmge v11.4S, v5.4S, v30.4S // ..............................................................*................................ + // mls v22.4S, v6.4S, v8.S[0] // ........................................................................*...................... + // cmge v15.4S, v31.4S, v5.4S // ..................................................................*............................ + // sqrdmulh v16.4S, v21.4S, v26.4S // ............................................*.................................................. + // sub v12.4S, v27.4S, v17.4S // ..........................................*.................................................... + // sub v6.4S, v15.4S, v11.4S // .......................................................................*....................... + // mul v10.4S, v21.4S, v25.4S // .................................................*............................................. + // cmge v28.4S, v31.4S, v22.4S // ..............................................................................*................ 
+ // cmge v24.4S, v22.4S, v30.4S // ...............................................................................*............... + // sub v11.4S, v14.4S, v29.4S // ..........................*.................................................................... + // sqrdmulh v7.4S, v12.4S, v0.S[3] // ..............................................*................................................ + // mls v5.4S, v6.4S, v8.4S // ..........................................................................*.................... + // sub v15.4S, v28.4S, v24.4S // ...................................................................................*........... + // mls v10.4S, v16.4S, v8.S[0] // .........................................................*..................................... + // mul v16.4S, v12.4S, v0.S[2] // ...............................................*............................................... + // mls v22.4S, v15.4S, v8.4S // .......................................................................................*....... + // sqrdmulh v21.4S, v11.4S, v1.S[1] // ..................................*............................................................ + // add v12.4S, v14.4S, v29.4S // ..........................................................*.................................... + // sub v24.4S, v19.4S, v4.4S // ...................................................*........................................... + // add v28.4S, v27.4S, v17.4S // .....................................................................*......................... + // str q5, [x0, #512] // .................................................................................*............. + // mul v4.4S, v11.4S, v1.S[0] // ...................................*........................................................... 
+ // str q22, [x0, #256] // ..............................................................................................* + // mul v5.4S, v24.4S, v0.S[0] // .......................................................*....................................... + // cmge v6.4S, v31.4S, v10.4S // .................................................................*............................. + // mls v16.4S, v7.4S, v8.S[0] // ....................................................*.......................................... + // mls v4.4S, v21.4S, v8.S[0] // .........................................*..................................................... + // sqrdmulh v22.4S, v24.4S, v0.S[1] // ......................................................*........................................ + // add v17.4S, v28.4S, v12.4S // .........................................................................*..................... + // sub v18.4S, v16.4S, v4.4S // .............................................................*................................. + // add v14.4S, v16.4S, v4.4S // ...............................................................*............................... + // mul v27.4S, v17.4S, v25.4S // .............................................................................*................. + // mls v5.4S, v22.4S, v8.S[0] // ............................................................*.................................. + // sqrdmulh v22.4S, v14.4S, v26.4S // ......................................................................*........................ + // mul v23.4S, v14.4S, v25.4S // ...........................................................................*................... + // sqrdmulh v20.4S, v17.4S, v26.4S // ..................................................................................*............ + // mul v13.4S, v18.4S, v0.S[0] // ....................................................................................*.......... 
+ // sqrdmulh v7.4S, v18.4S, v0.S[1] // ......................................................................................*........ + // cmge v18.4S, v10.4S, v30.4S // ................................................................*.............................. + // mls v23.4S, v22.4S, v8.S[0] // ................................................................................*.............. + // mls v27.4S, v20.4S, v8.S[0] // ........................................................................................*...... + // cmge v20.4S, v5.4S, v30.4S // ............................................................................*.................. + // mls v13.4S, v7.4S, v8.S[0] // ...........................................................................................*... + // cmge v19.4S, v31.4S, v5.4S // .....................................................................................*......... + // cmge v21.4S, v31.4S, v23.4S // ..........................................................................................*.... + // sub v16.4S, v6.4S, v18.4S // ....................................................................*.......................... + // cmge v15.4S, v31.4S, v27.4S // ............................................................................................*.. + // cmge v14.4S, v27.4S, v30.4S // .............................................................................................*. + // sub v20.4S, v19.4S, v20.4S // .........................................................................................*..... 
sub count, count, #1 layer123_start: @@ -1713,442 +1713,450 @@ layer123_start: // Expected cycles: 52 // Expected IPC: 2.31 // - // Wall time: 122.00s - // User time: 122.00s + // Wall time: 695.43s + // User time: 695.43s // // -------------------------------------------------- original position --------------------------------------------------> // 0 25 50 75 100 // |------------------------|------------------------|------------------------|------------------------|------------------- - sqrdmulh v5.4S, v17.4S, v26.4S // .........................................................................................*.............................. - cmge v18.4S, v10.4S, v30.4S // .....................................................................*.................................................. - ldr q15, [x0, #400] // ...e.................................................................................................................... - ldr q12, [x0, #784] // ......e................................................................................................................. - ldr q20, [x0, #272] // ..e..................................................................................................................... - cmge v16.4S, v9.4S, v30.4S // .............................................................................................................*.......... - cmge v17.4S, v27.4S, v30.4S // .........................................................................*.............................................. - ldr q19, [x0, #656] // .....e.................................................................................................................. - ldr q4, [x0, #912] // .......e................................................................................................................ - sub v6.4S, v29.4S, v18.4S // ......................................................................*................................................. 
- cmge v21.4S, v31.4S, v24.4S // ........................................................................................................*............... + cmge v11.4S, v13.4S, v30.4S // .................................................................................*...................................... + cmge v19.4S, v31.4S, v13.4S // ................................................................................*....................................... + ldr q6, [x0, #272] // ..e..................................................................................................................... + ldr q4, [x0, #144] // .e...................................................................................................................... + mls v5.4S, v20.4S, v8.4S // ...............................................................................*........................................ + cmge v22.4S, v23.4S, v30.4S // .................................................................................................................*...... + ldr q29, [x0, #400] // ...e.................................................................................................................... + ldr q24, [x0, #16] // e....................................................................................................................... + ldr q17, [x0, #784] // ......e................................................................................................................. + ldr q7, [x0, #912] // .......e................................................................................................................ + sub v20.4S, v19.4S, v11.4S // ..................................................................................*..................................... + sub v19.4S, v28.4S, v12.4S // .....................................................*.................................................................. 
+ sub v18.4S, v21.4S, v22.4S // ..................................................................................................................*..... + ldr q22, [x0, #656] // .....e.................................................................................................................. // gap // ........................................................................................................................ - mls v23.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. + sub v15.4S, v15.4S, v14.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ - sub v16.4S, v22.4S, v16.4S // ..............................................................................................................*......... - ldr q11, [x0, #144] // .e...................................................................................................................... - ldr q5, [x0, #16] // e....................................................................................................................... - sub v18.4S, v28.4S, v17.4S // ..........................................................................*............................................. + mul v9.4S, v19.4S, v0.S[0] // ........................................................*............................................................... + sqrdmulh v12.4S, v19.4S, v0.S[1] // .......................................................*................................................................ + ldr q19, [x0, #528] // ....e................................................................................................................... 
+ add v14.4S, v6.4S, v29.4S // ..............e......................................................................................................... + str q5, [x0, #768] // ......................................................................................*................................. + add v5.4S, v24.4S, v4.4S // .........e.............................................................................................................. // gap // ........................................................................................................................ - sub v22.4S, v21.4S, v14.4S // ..........................................................................................................*............. - sub v14.4S, v20.4S, v15.4S // .............e.......................................................................................................... + add v21.4S, v17.4S, v7.4S // ........................e............................................................................................... // gap // ........................................................................................................................ - mls v9.4S, v16.4S, v8.4S // ...............................................................................................................*........ - ldr q17, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ + mls v27.4S, v15.4S, v8.4S // ...........................................................................................................*............ // gap // ........................................................................................................................ 
- cmge v16.4S, v31.4S, v23.4S // ....................................................................................................*................... - add v28.4S, v12.4S, v4.4S // ........................e............................................................................................... - mul v21.4S, v14.4S, v2.S[0] // ...............e........................................................................................................ + sub v11.4S, v6.4S, v29.4S // .............e.......................................................................................................... // gap // ........................................................................................................................ + sub v28.4S, v24.4S, v4.4S // ........e............................................................................................................... + sub v15.4S, v17.4S, v7.4S // .......................e................................................................................................ + sub v6.4S, v19.4S, v22.4S // ..................e..................................................................................................... // gap // ........................................................................................................................ - add v15.4S, v20.4S, v15.4S // ..............e......................................................................................................... - sub v20.4S, v5.4S, v11.4S // ........e............................................................................................................... - add v5.4S, v5.4S, v11.4S // .........e.............................................................................................................. // gap // ........................................................................................................................ 
+ mul v17.4S, v11.4S, v2.S[0] // ................e....................................................................................................... + sqrdmulh v4.4S, v11.4S, v2.S[1] // ...............e........................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v11.4S, v5.4S, v14.4S // ............................e........................................................................................... // gap // ........................................................................................................................ - sub v29.4S, v17.4S, v19.4S // ..................e..................................................................................................... - add v11.4S, v17.4S, v19.4S // ...................e.................................................................................................... // gap // ........................................................................................................................ - mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ + add v7.4S, v5.4S, v14.4S // .............................e.......................................................................................... // gap // ........................................................................................................................ - sqrdmulh v19.4S, v20.4S, v1.S[3] // ...........e............................................................................................................ 
+ add v29.4S, v19.4S, v22.4S // ...................e.................................................................................................... + sqrdmulh v22.4S, v15.4S, v3.S[1] // .........................e.............................................................................................. // gap // ........................................................................................................................ - mls v27.4S, v18.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ - sub v6.4S, v5.4S, v15.4S // ............................e........................................................................................... // gap // ........................................................................................................................ - str q7, [x0, #896] // .......................................................................................*................................ - add v7.4S, v11.4S, v28.4S // .......................................e................................................................................ - sub v28.4S, v11.4S, v28.4S // ......................................e................................................................................. + mul v14.4S, v6.4S, v2.S[2] // .....................e.................................................................................................. + sqrdmulh v5.4S, v11.4S, v0.S[3] // ..............................e......................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mul v11.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... - sqrdmulh v17.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ - sqrdmulh v6.4S, v28.4S, v1.S[1] // .........................................e.............................................................................. + mul v19.4S, v11.4S, v0.S[2] // ...............................e........................................................................................ + sub v11.4S, v29.4S, v21.4S // ......................................e................................................................................. // gap // ........................................................................................................................ - str q27, [x0, #640] // .....................................................................................*.................................. - mul v18.4S, v28.4S, v1.S[0] // ........................................e............................................................................... - add v27.4S, v5.4S, v15.4S // .............................e.......................................................................................... - mul v20.4S, v20.4S, v1.S[2] // ..........e............................................................................................................. // gap // ........................................................................................................................ + mls v17.4S, v4.4S, v8.S[0] // .................e...................................................................................................... 
+ add v24.4S, v29.4S, v21.4S // .......................................e................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v4.4S, v11.4S, v1.S[0] // .........................................e.............................................................................. + sqrdmulh v11.4S, v11.4S, v1.S[1] // ........................................e............................................................................... // gap // ........................................................................................................................ - sqrdmulh v28.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. - mls v11.4S, v17.4S, v8.S[0] // ................................e....................................................................................... // gap // ........................................................................................................................ + mul v29.4S, v15.4S, v3.S[0] // ..........................e............................................................................................. + add v21.4S, v7.4S, v24.4S // .................................................e...................................................................... // gap // ........................................................................................................................ - mul v15.4S, v29.4S, v2.S[2] // ....................e................................................................................................... 
- mls v18.4S, v6.4S, v8.S[0] // ..........................................e............................................................................. - sub v5.4S, v12.4S, v4.4S // .......................e................................................................................................ // gap // ........................................................................................................................ - sub v6.4S, v27.4S, v7.4S // ................................................e....................................................................... + sub v15.4S, v7.4S, v24.4S // ................................................e....................................................................... + sqrdmulh v7.4S, v6.4S, v2.S[3] // ....................e................................................................................................... // gap // ........................................................................................................................ - mls v20.4S, v19.4S, v8.S[0] // ............e........................................................................................................... // gap // ........................................................................................................................ + mls v19.4S, v5.4S, v8.S[0] // ................................e....................................................................................... + mls v4.4S, v11.4S, v8.S[0] // ..........................................e............................................................................. + mul v5.4S, v15.4S, v0.S[0] // ...................................................e.................................................................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // ..................................................e..................................................................... 
// gap // ........................................................................................................................ - sqrdmulh v29.4S, v14.4S, v2.S[1] // ................e....................................................................................................... - add v17.4S, v27.4S, v7.4S // .................................................e...................................................................... // gap // ........................................................................................................................ - str q10, [x0, #512] // ....................................................................................*................................... - sub v10.4S, v11.4S, v18.4S // ..........................................................e............................................................. - mul v7.4S, v5.4S, v3.S[0] // .........................e.............................................................................................. - sqrdmulh v19.4S, v5.4S, v3.S[1] // ..........................e............................................................................................. + mls v9.4S, v12.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sqrdmulh v24.4S, v28.4S, v1.S[3] // ..........e............................................................................................................. // gap // ........................................................................................................................ 
+ str q27, [x0, #128] // .....................................................................................................................*.. + mul v27.4S, v28.4S, v1.S[2] // ...........e............................................................................................................ + add v6.4S, v19.4S, v4.4S // ...........................................................e............................................................ // gap // ........................................................................................................................ - mul v27.4S, v10.4S, v0.S[0] // ............................................................e........................................................... - sqrdmulh v10.4S, v10.4S, v0.S[1] // .............................................................e.......................................................... - mls v21.4S, v29.4S, v8.S[0] // .................e...................................................................................................... // gap // ........................................................................................................................ + mls v5.4S, v15.4S, v8.S[0] // ....................................................e................................................................... + mls v29.4S, v22.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ - mls v15.4S, v28.4S, v8.S[0] // ......................e................................................................................................. // gap // ........................................................................................................................ 
- mls v7.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ + mul v22.4S, v6.4S, v25.4S // ...............................................................................................e........................ + sqrdmulh v6.4S, v6.4S, v26.4S // ..............................................................................................e......................... // gap // ........................................................................................................................ - mls v24.4S, v22.4S, v8.4S // ...........................................................................................................*............ - mls v27.4S, v10.4S, v8.S[0] // ..............................................................e......................................................... - mul v10.4S, v6.4S, v0.S[0] // ..................................................e..................................................................... // gap // ........................................................................................................................ + mls v14.4S, v7.4S, v8.S[0] // ......................e................................................................................................. + mls v27.4S, v24.4S, v8.S[0] // ............e........................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v11.4S, v5.4S, v30.4S // .....................................................................e.................................................. 
+ mls v10.4S, v16.4S, v8.4S // .......................................................................................................*................ + mls v22.4S, v6.4S, v8.S[0] // ................................................................................................e....................... // gap // ........................................................................................................................ - sub v29.4S, v20.4S, v21.4S // .................................e...................................................................................... - add v22.4S, v20.4S, v21.4S // ..................................e..................................................................................... - add v4.4S, v15.4S, v7.4S // ............................................e........................................................................... // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v5.4S // ....................................................................e................................................... + sqrdmulh v16.4S, v21.4S, v26.4S // ........................................................................................e............................... // gap // ........................................................................................................................ - sqrdmulh v5.4S, v6.4S, v0.S[1] // ...................................................e.................................................................... // gap // ........................................................................................................................ + sub v12.4S, v27.4S, v17.4S // .................................e...................................................................................... 
// gap // ........................................................................................................................ - cmge v12.4S, v27.4S, v30.4S // .............................................................................e.......................................... - cmge v19.4S, v31.4S, v27.4S // ............................................................................e........................................... + sub v6.4S, v15.4S, v11.4S // ......................................................................e................................................. + str q10, [x0], #(16) // ....................................................................................................................*... + mul v10.4S, v21.4S, v25.4S // .........................................................................................e.............................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v21.4S, v29.4S, v0.S[2] // ...................................e.................................................................................... - sub v28.4S, v15.4S, v7.4S // ...........................................e............................................................................ - sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................e................................................................................... + cmge v28.4S, v31.4S, v22.4S // ............................................................................................................e........... + cmge v24.4S, v22.4S, v30.4S // .............................................................................................................e.......... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v29.4S, v19.4S, v12.4S // ..............................................................................e......................................... - mul v20.4S, v28.4S, v1.S[0] // .............................................e.......................................................................... + sub v11.4S, v14.4S, v29.4S // ...........................................e............................................................................ + sqrdmulh v7.4S, v12.4S, v0.S[3] // ...................................e.................................................................................... + mls v5.4S, v6.4S, v8.4S // .......................................................................e................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................................e......................................................................... - mls v27.4S, v29.4S, v8.4S // ...............................................................................e........................................ - sub v12.4S, v22.4S, v4.4S // .....................................................e.................................................................. + sub v15.4S, v28.4S, v24.4S // ..............................................................................................................e......... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v10.4S, v16.4S, v8.S[0] // ..........................................................................................e............................. + mul v16.4S, v12.4S, v0.S[2] // ....................................e................................................................................... + mls v22.4S, v15.4S, v8.4S // ...............................................................................................................e........ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v6.4S, v8.S[0] // .....................................e.................................................................................. - add v29.4S, v22.4S, v4.4S // ......................................................e................................................................. - sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................e............................................................... + sqrdmulh v21.4S, v11.4S, v1.S[1] // .............................................e.......................................................................... // gap // ........................................................................................................................ - mls v20.4S, v14.4S, v8.S[0] // ...............................................e........................................................................ 
+ add v12.4S, v14.4S, v29.4S // ............................................e........................................................................... // gap // ........................................................................................................................ - str q27, [x0, #784] // ......................................................................................e................................. - mul v27.4S, v12.4S, v0.S[0] // .......................................................e................................................................ + sub v24.4S, v19.4S, v4.4S // ..........................................................e............................................................. + add v28.4S, v27.4S, v17.4S // ..................................e..................................................................................... // gap // ........................................................................................................................ - sqrdmulh v6.4S, v29.4S, v26.4S // ............................................................................................e........................... - mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... + str q5, [x0, #512] // ....................................................................................e................................... + mul v4.4S, v11.4S, v1.S[0] // ..............................................e......................................................................... + str q22, [x0, #256] // ......................................................................................................................e. // gap // ........................................................................................................................ 
- str q24, [x0, #128] // .....................................................................................................................*.. - mul v24.4S, v29.4S, v25.4S // ...........................................................................................e............................ + mul v5.4S, v24.4S, v0.S[0] // .............................................................e.......................................................... + cmge v6.4S, v31.4S, v10.4S // ....................................................................................................e................... + cmge v15.4S, v9.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ - sub v29.4S, v21.4S, v20.4S // ...............................................................e........................................................ - add v12.4S, v21.4S, v20.4S // ................................................................e....................................................... // gap // ........................................................................................................................ - mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. - str q13, [x0, #384] // .......................................................................................................................* + mls v16.4S, v7.4S, v8.S[0] // .....................................e.................................................................................. + cmge v29.4S, v31.4S, v9.4S // ........................................................................*............................................... 
// gap // ........................................................................................................................ - add v14.4S, v11.4S, v18.4S // ...........................................................e............................................................ // gap // ........................................................................................................................ + mls v4.4S, v21.4S, v8.S[0] // ...............................................e........................................................................ + sqrdmulh v22.4S, v24.4S, v0.S[1] // ............................................................e........................................................... // gap // ........................................................................................................................ - sqrdmulh v19.4S, v29.4S, v0.S[1] // ..................................................................e..................................................... - mul v13.4S, v12.4S, v25.4S // .................................................................................................e...................... - str q9, [x0, #256] // ......................................................................................................................*. - mul v7.4S, v29.4S, v0.S[0] // .................................................................e...................................................... - mul v9.4S, v14.4S, v25.4S // ..............................................................................................e......................... // gap // ........................................................................................................................ + mls v23.4S, v18.4S, v8.4S // ...................................................................................................................*.... 
+ add v17.4S, v28.4S, v12.4S // ......................................................e................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... - sqrdmulh v12.4S, v12.4S, v26.4S // ..................................................................................................e..................... - cmge v21.4S, v23.4S, v30.4S // .....................................................................................................*.................. - sqrdmulh v20.4S, v14.4S, v26.4S // ...............................................................................................e........................ + mls v13.4S, v20.4S, v8.4S // ...................................................................................*.................................... + sub v18.4S, v16.4S, v4.4S // ...............................................................e........................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v24.4S, v6.4S, v8.S[0] // .............................................................................................e.......................... - mls v7.4S, v19.4S, v8.S[0] // ...................................................................e.................................................... 
+ add v14.4S, v16.4S, v4.4S // ................................................................e....................................................... + mul v27.4S, v17.4S, v25.4S // ............................................................................................e........................... // gap // ........................................................................................................................ + mls v5.4S, v22.4S, v8.S[0] // ..............................................................e......................................................... // gap // ........................................................................................................................ - mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................e.................... // gap // ........................................................................................................................ + str q23, [x0, #368] // .......................................................................................................................* + sqrdmulh v22.4S, v14.4S, v26.4S // .................................................................................................e...................... + mul v23.4S, v14.4S, v25.4S // ..................................................................................................e..................... + sqrdmulh v20.4S, v17.4S, v26.4S // ...........................................................................................e............................ // gap // ........................................................................................................................ - cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... 
- sub v18.4S, v16.4S, v21.4S // ......................................................................................................*................. - mls v9.4S, v20.4S, v8.S[0] // ................................................................................................e....................... + str q13, [x0, #880] // .......................................................................................*................................ + mul v13.4S, v18.4S, v0.S[0] // ..................................................................e..................................................... + sqrdmulh v7.4S, v18.4S, v0.S[1] // .................................................................e...................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v14.4S, v29.4S, v15.4S // ..........................................................................*............................................. + cmge v18.4S, v10.4S, v30.4S // .....................................................................................................e.................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v15.4S, v31.4S, v7.4S // ................................................................................e....................................... - cmge v21.4S, v7.4S, v30.4S // .................................................................................e...................................... 
+ mls v23.4S, v22.4S, v8.S[0] // ...................................................................................................e.................... + mls v27.4S, v20.4S, v8.S[0] // .............................................................................................e.......................... // gap // ........................................................................................................................ - cmge v5.4S, v31.4S, v13.4S // ................................................................................................................e....... - cmge v12.4S, v13.4S, v30.4S // .................................................................................................................e...... + mls v9.4S, v14.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ - mls v23.4S, v18.4S, v8.4S // .......................................................................................................*................ + cmge v20.4S, v5.4S, v30.4S // .............................................................................e.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v15.4S, v15.4S, v21.4S // ..................................................................................e..................................... - cmge v14.4S, v24.4S, v30.4S // .........................................................................................................e.............. 
+ mls v13.4S, v7.4S, v8.S[0] // ...................................................................e.................................................... + cmge v19.4S, v31.4S, v5.4S // ............................................................................e........................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v21.4S, v5.4S, v12.4S // ..................................................................................................................e..... - cmge v22.4S, v31.4S, v9.4S // ............................................................................................................e........... + cmge v21.4S, v31.4S, v23.4S // ................................................................................................................e....... + sub v16.4S, v6.4S, v18.4S // ......................................................................................................e................. // gap // ........................................................................................................................ + str q9, [x0, #624] // .....................................................................................*.................................. + cmge v15.4S, v31.4S, v27.4S // ........................................................................................................e............... + cmge v14.4S, v27.4S, v30.4S // .........................................................................................................e.............. + sub v20.4S, v19.4S, v20.4S // ..............................................................................e......................................... 
// gap // ........................................................................................................................ - mls v7.4S, v15.4S, v8.4S // ...................................................................................e.................................... - mls v13.4S, v21.4S, v8.4S // ...................................................................................................................e.... - str q23, [x0], #(16) // ....................................................................................................................*... - mul v23.4S, v17.4S, v25.4S // ........................................................................................e............................... // gap // ........................................................................................................................ - // --------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------> + // -------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------> // 0 25 50 75 100 125 150 175 200 225 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- - // ldr q9, [x0, #0] // ............e.........................................................................................................'.............~........................................................................................................ 
- // ldr q10, [x0, #(1*(1024/8))] // ...........e..........................................................................................................'............~......................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // ..e...................................................................................................................'...~.................................................................................................................. - // ldr q12, [x0, #(3*(1024/8))] // e.....................................................................................................................'.~.................................................................................................................... - // ldr q13, [x0, #(4*(1024/8))] // .................e....................................................................................................'..................~................................................................................................... - // ldr q14, [x0, #(5*(1024/8))] // .....e................................................................................................................'......~............................................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // .e....................................................................................................................'..~................................................................................................................... - // ldr q16, [x0, #(7*(1024/8))] // ......e...............................................................................................................'.......~.............................................................................................................. 
- // sub v24.4s, v9.4s, v10.4s // ......................e...............................................................................................'.......................~.............................................................................................. - // add v9.4s, v9.4s, v10.4s // .......................e..............................................................................................'........................~............................................................................................. - // mul v10.4s, v24.4s, v1.s[2] // .......................................e..............................................................................'........................................~............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...........................e..........................................................................................'............................~......................................................................................... - // mls v10.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................'...............................................~...................................................................... - // sub v24.4s, v11.4s, v12.4s // ...............e......................................................................................................'................~..................................................................................................... - // add v11.4s, v11.4s, v12.4s // .....................e................................................................................................'......................~............................................................................................... 
- // mul v12.4s, v24.4s, v2.s[0] // ....................e.................................................................................................'.....................~................................................................................................ - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................e......................................................................'................................................~..................................................................... - // mls v12.4s, v24.4s, v8.s[0] // .......................................................e..............................................................'........................................................~............................................................. - // sub v24.4s, v13.4s, v14.4s // ........................e.............................................................................................'.........................~............................................................................................ - // add v13.4s, v13.4s, v14.4s // .........................e............................................................................................'..........................~........................................................................................... - // mul v14.4s, v24.4s, v2.s[2] // ..........................................e...........................................................................'...........................................~.......................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................e.............................................................................'.........................................~............................................................................ 
- // mls v14.4s, v24.4s, v8.s[0] // ........................................................e.............................................................'.........................................................~............................................................ - // sub v24.4s, v15.4s, v16.4s // ............................................e.........................................................................'.............................................~........................................................................ - // add v15.4s, v15.4s, v16.4s // ...................e..................................................................................................'....................~................................................................................................. - // mul v16.4s, v24.4s, v3.s[0] // ...................................................e..................................................................'....................................................~................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ....................................................e.................................................................'.....................................................~................................................................ - // mls v16.4s, v24.4s, v8.s[0] // .........................................................e............................................................'..........................................................~........................................................... - // sub v24.4s, v9.4s, v11.4s // .............................e........................................................................................'..............................~....................................................................................... 
- // add v9.4s, v9.4s, v11.4s // ......................................e...............................................................................'.......................................~.............................................................................. - // mul v11.4s, v24.4s, v0.s[2] // .................................e....................................................................................'..................................~................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................e...................................................................................'...................................~.................................................................................. - // mls v11.4s, v24.4s, v8.s[0] // .........................................e............................................................................'..........................................~........................................................................... - // sub v24.4s, v10.4s, v12.4s // .............................................................e........................................................'..............................................................~....................................................... - // add v10.4s, v10.4s, v12.4s // ..............................................................e.......................................................'...............................................................~...................................................... - // mul v12.4s, v24.4s, v0.s[2] // ...................................................................e..................................................'....................................................................~................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................e................................................'......................................................................~............................................... - // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e..........................................'............................................................................~......................................... - // sub v24.4s, v13.4s, v15.4s // ................................e.....................................................................................'.................................~.................................................................................... - // add v13.4s, v13.4s, v15.4s // ...............................e......................................................................................'................................~..................................................................................... - // mul v15.4s, v24.4s, v1.s[0] // .....................................e................................................................................'......................................~............................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................e..................................................................................'....................................~................................................................................. - // mls v15.4s, v24.4s, v8.s[0] // ...........................................e..........................................................................'............................................~......................................................................... 
- // sub v24.4s, v14.4s, v16.4s // ....................................................................e.................................................'.....................................................................~................................................ - // add v14.4s, v14.4s, v16.4s // ...............................................................e......................................................'................................................................~..................................................... - // mul v16.4s, v24.4s, v1.s[0] // .......................................................................e..............................................'........................................................................~............................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................e.............................................'.........................................................................~............................................ - // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................e.......................................'...............................................................................~...................................... - // sub v24.4s, v9.4s, v13.4s // .............................................e........................................................................'..............................................~....................................................................... - // add v9.4s, v9.4s, v13.4s // ................................................e.....................................................................'.................................................~.................................................................... 
- // mul v13.4s, v24.4s, v0.s[0] // ............................................................e.........................................................'.............................................................~........................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................e.....................................................'.................................................................~.................................................... - // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................e...................................'...................................................................................~.................................. - // sub v24.4s, v10.4s, v14.4s // ..........................................................................e...........................................'...........................................................................~.......................................... - // add v10.4s, v10.4s, v14.4s // ............................................................................e.........................................'.............................................................................~........................................ - // mul v14.4s, v24.4s, v0.s[0] // ................................................................................e.....................................'.................................................................................~.................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................e........................................'..............................................................................~....................................... 
- // mls v14.4s, v24.4s, v8.s[0] // .......................................................................................e..............................'........................................................................................~............................. - // sub v24.4s, v11.4s, v15.4s // ..................................................e...................................................................'...................................................~.................................................................. - // add v11.4s, v11.4s, v15.4s // .........................................................................................e............................'..........................................................................................~........................... - // mul v15.4s, v24.4s, v0.s[0] // .....................................................e................................................................'......................................................~............................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................e...............................................................'.......................................................~.............................................................. - // mls v15.4s, v24.4s, v8.s[0] // ...........................................................e..........................................................'............................................................~......................................................... - // sub v24.4s, v12.4s, v16.4s // .....................................................................................e................................'......................................................................................~............................... 
- // add v12.4s, v12.4s, v16.4s // ......................................................................................e...............................'.......................................................................................~.............................. - // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................e........................'..............................................................................................~....................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................e...........................'...........................................................................................~.......................... - // mls v16.4s, v24.4s, v8.s[0] // ....................................................................................................e.................'.....................................................................................................~................ - // cmge v27.4s, v31.4s, v13.4s // ......................................................................................................e...............'.......................................................................................................~.............. - // cmge v28.4s, v13.4s, v30.4s // ......................................................................................................................'*..................................................................................................................... - // sub v28.4s, v27.4s, v28.4s // .......~..............................................................................................................'........*............................................................................................................. 
- // mls v13.4s, v28.4s, v8.4s // ..........................~...........................................................................................'...........................*.......................................................................................... - // cmge v27.4s, v31.4s, v14.4s // ...............................................................................................e......................'................................................................................................~..................... - // cmge v28.4s, v14.4s, v30.4s // ....~.................................................................................................................'.....*................................................................................................................ - // sub v28.4s, v27.4s, v28.4s // .............~........................................................................................................'..............*....................................................................................................... - // mls v14.4s, v28.4s, v8.4s // ............................~.........................................................................................'.............................*........................................................................................ - // cmge v27.4s, v31.4s, v15.4s // ..................................................................e...................................................'...................................................................~.................................................. - // cmge v28.4s, v15.4s, v30.4s // .................................................................e....................................................'..................................................................~................................................... 
- // sub v28.4s, v27.4s, v28.4s // ......................................................................e...............................................'.......................................................................~.............................................. - // mls v15.4s, v28.4s, v8.4s // .........................................................................e............................................'..........................................................................~........................................... - // cmge v27.4s, v31.4s, v16.4s // .........................................................................................................e............'..........................................................................................................~........... - // cmge v28.4s, v16.4s, v30.4s // ..........................................................................................................e...........'...........................................................................................................~.......... - // sub v28.4s, v27.4s, v28.4s // ..............................................................................................................e.......'...............................................................................................................~...... - // mls v16.4s, v28.4s, v8.4s // ..................................................................................................................e...'...................................................................................................................~.. - // str q13, [x0, #(4*(1024/8))] // .................................................~....................................................................'..................................................*................................................................... 
- // str q14, [x0, #(5*(1024/8))] // ....................................~.................................................................................'.....................................*................................................................................ - // str q15, [x0, #(6*(1024/8))] // ...............................................................................e......................................'................................................................................~..................................... - // str q16, [x0, #(7*(1024/8))] // ..............................~.......................................................................................'...............................*...................................................................................... - // mul v13.4s, v9.4s, v25.4s // .....................................................................................................................e'...................................................................................................................... - // sqrdmulh v9.4s, v9.4s, v26.4s // ......................................................................................................................*...................................................................................................................... - // mls v13.4s, v9.4s, v8.s[0] // .........~............................................................................................................'..........*........................................................................................................... - // mul v14.4s, v10.4s, v25.4s // ....................................................................................e.................................'.....................................................................................~................................ 
- // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................................................e....................................'..................................................................................~................................... - // mls v14.4s, v10.4s, v8.s[0] // ...................................................................................................e..................'....................................................................................................~................. - // mul v15.4s, v11.4s, v25.4s // ..............................................................................................e.......................'...............................................................................................~...................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................e...................'...................................................................................................~.................. - // mls v15.4s, v11.4s, v8.s[0] // ........................................................................................................e.............'.........................................................................................................~............ - // mul v16.4s, v12.4s, v25.4s // ...........................................................................................e..........................'............................................................................................~......................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ................................................................................................e.....................'.................................................................................................~.................... 
- // mls v16.4s, v12.4s, v8.s[0] // .....................................................................................................e................'......................................................................................................~............... - // cmge v27.4s, v31.4s, v13.4s // ..................~...................................................................................................'...................*.................................................................................................. - // cmge v28.4s, v13.4s, v30.4s // .................................................................................................~....................'..................................................................................................*................... - // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~..............'........................................................................................................*............. - // mls v13.4s, v28.4s, v8.4s // .............................................................................................................~........'..............................................................................................................*....... - // cmge v27.4s, v31.4s, v14.4s // ........~.............................................................................................................'.........*............................................................................................................ - // cmge v28.4s, v14.4s, v30.4s // ...............................................................................................................e......'................................................................................................................~..... 
- // sub v28.4s, v27.4s, v28.4s // ..............~.......................................................................................................'...............*...................................................................................................... - // mls v14.4s, v28.4s, v8.4s // ..........................................................~...........................................................'...........................................................*.......................................................... - // cmge v27.4s, v31.4s, v15.4s // .................................................................................................................e....'..................................................................................................................~... - // cmge v28.4s, v15.4s, v30.4s // ...~..................................................................................................................'....*................................................................................................................. - // sub v28.4s, v27.4s, v28.4s // ..........~...........................................................................................................'...........*.......................................................................................................... - // mls v15.4s, v28.4s, v8.4s // ................~.....................................................................................................'.................*.................................................................................................... - // cmge v27.4s, v31.4s, v16.4s // ...........................................................................................................e..........'............................................................................................................~......... 
- // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................e.........'.............................................................................................................~........ - // sub v28.4s, v27.4s, v28.4s // ................................................................................................................e.....'.................................................................................................................~.... - // mls v16.4s, v28.4s, v8.4s // ...................................................................................................................e..'....................................................................................................................~. - // str q13, [x0], #(16) // ....................................................................................................................~.'.....................................................................................................................* - // str q14, [x0, #(-16 + 1*(1024/8))] // ...................................................................................~..................................'....................................................................................*................................. - // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................~.........................'.............................................................................................*........................ - // str q16, [x0, #(-16 + 3*(1024/8))] // ........................................................................................~.............................'.........................................................................................*............................ 
+ // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------- + // ldr q9, [x0, #0] // .....e................................................................................................................'......~............................................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // .e....................................................................................................................'..~................................................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // e.....................................................................................................................'.~.................................................................................................................. + // ldr q12, [x0, #(3*(1024/8))] // ....e.................................................................................................................'.....~.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // ...............e......................................................................................................'................~................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ...........e..........................................................................................................'............~....................................................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // ......e...............................................................................................................'.......~............................................................................................................ + // ldr q16, [x0, #(7*(1024/8))] // .......e..............................................................................................................'........~........................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ......................e...............................................................................................'.......................~............................................................................................ + // add v9.4s, v9.4s, v10.4s // ..................e...................................................................................................'...................~................................................................................................ + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................................................e.....................................................................'.................................................~.................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ..................................................e...................................................................'...................................................~................................................................ + // mls v10.4s, v27.4s, v8.s[0] // .........................................................e............................................................'..........................................................~......................................................... 
+ // sub v24.4s, v11.4s, v12.4s // .....................e................................................................................................'......................~............................................................................................. + // add v11.4s, v11.4s, v12.4s // ................e.....................................................................................................'.................~.................................................................................................. + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ..........................e...........................................................................................'...........................~........................................................................................ + // mul v12.4s, v24.4s, v2.s[0] // .........................e............................................................................................'..........................~......................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................................e..................................................................................'....................................~............................................................................... + // sub v24.4s, v13.4s, v14.4s // ........................e.............................................................................................'.........................~.......................................................................................... + // add v13.4s, v13.4s, v14.4s // .............................e........................................................................................'..............................~..................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v2.s[3] // ..........................................e...........................................................................'...........................................~........................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ...............................e......................................................................................'................................~................................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ........................................................e.............................................................'.........................................................~.......................................................... + // sub v24.4s, v15.4s, v16.4s // .......................e..............................................................................................'........................~........................................................................................... + // add v15.4s, v15.4s, v16.4s // ...................e..................................................................................................'....................~............................................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ..............................e.......................................................................................'...............................~.................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // .......................................e..............................................................................'........................................~........................................................................... 
+ // mls v16.4s, v27.4s, v8.s[0] // .....................................................e................................................................'......................................................~............................................................. + // sub v24.4s, v9.4s, v11.4s // ...........................e..........................................................................................'............................~....................................................................................... + // add v9.4s, v9.4s, v11.4s // ............................e.........................................................................................'.............................~...................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................................e.....................................................................................'.................................~.................................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .................................e....................................................................................'..................................~................................................................................. + // mls v11.4s, v27.4s, v8.s[0] // ...........................................e..........................................................................'............................................~....................................................................... + // sub v24.4s, v10.4s, v12.4s // ...............................................................e......................................................'................................................................~................................................... 
+ // add v10.4s, v10.4s, v12.4s // ...............................................................................e......................................'................................................................................~................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ......................................................................e...............................................'.......................................................................~............................................ + // mul v12.4s, v24.4s, v0.s[2] // ..........................................................................e...........................................'...........................................................................~........................................ + // mls v12.4s, v27.4s, v8.s[0] // ......................................................................................e...............................'.......................................................................................~............................ + // sub v24.4s, v13.4s, v15.4s // ..................................e...................................................................................'...................................~................................................................................ + // add v13.4s, v13.4s, v15.4s // ....................................e.................................................................................'.....................................~.............................................................................. + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ......................................e...............................................................................'.......................................~............................................................................ 
+ // mul v15.4s, v24.4s, v1.s[0] // .....................................e................................................................................'......................................~............................................................................. + // mls v15.4s, v27.4s, v8.s[0] // ............................................e.........................................................................'.............................................~...................................................................... + // sub v24.4s, v14.4s, v16.4s // .....................................................................e................................................'......................................................................~............................................. + // add v14.4s, v14.4s, v16.4s // .............................................................................e........................................'..............................................................................~..................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ............................................................................e.........................................'.............................................................................~...................................... + // mul v16.4s, v24.4s, v1.s[0] // .................................................................................e....................................'..................................................................................~................................. + // mls v16.4s, v27.4s, v8.s[0] // ........................................................................................e.............................'.........................................................................................~.......................... 
+ // sub v24.4s, v9.4s, v13.4s // .........................................e............................................................................'..........................................~......................................................................... + // add v9.4s, v9.4s, v13.4s // ........................................e.............................................................................'.........................................~.......................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............................................e.......................................................................'...............................................~.................................................................... + // mul v13.4s, v24.4s, v0.s[0] // .............................................e........................................................................'..............................................~..................................................................... + // mls v13.4s, v27.4s, v8.s[0] // ....................................................e.................................................................'.....................................................~.............................................................. + // sub v24.4s, v10.4s, v14.4s // .........~............................................................................................................'..........*......................................................................................................... + // add v10.4s, v10.4s, v14.4s // ...........................................................................................e..........................'............................................................................................~....................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............~.......................................................................................................'...............*.................................................................................................... + // mul v14.4s, v24.4s, v0.s[0] // .............~........................................................................................................'..............*..................................................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ...............................................~......................................................................'................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ..............................................................................e.......................................'...............................................................................~.................................... + // add v11.4s, v11.4s, v15.4s // ...................................................e..................................................................'....................................................~............................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................................................................................e............................'..........................................................................................~......................... + // mul v15.4s, v24.4s, v0.s[0] // ...................................................................................e..................................'....................................................................................~............................... 
+ // mls v15.4s, v27.4s, v8.s[0] // ................................................................................................e.....................'.................................................................................................~.................. + // sub v24.4s, v12.4s, v16.4s // .............................................................................................e........................'..............................................................................................~..................... + // add v12.4s, v12.4s, v16.4s // ..............................................................................................e.......................'...............................................................................................~.................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .......................................................................................................e..............'........................................................................................................~........... + // mul v16.4s, v24.4s, v0.s[0] // ......................................................................................................e...............'.......................................................................................................~............ + // mls v16.4s, v27.4s, v8.s[0] // ..............................................................................................................e.......'...............................................................................................................~.... + // cmge v27.4s, v31.4s, v13.4s // .............................................................e........................................................'..............................................................~..................................................... 
+ // cmge v28.4s, v13.4s, v30.4s // ..........................................................e...........................................................'...........................................................~........................................................ + // sub v28.4s, v27.4s, v28.4s // ................................................................e.....................................................'.................................................................~.................................................. + // mls v13.4s, v28.4s, v8.4s // .......................................................................e..............................................'........................................................................~........................................... + // cmge v27.4s, v31.4s, v14.4s // .......................................................................................~..............................'........................................................................................*........................... + // cmge v28.4s, v14.4s, v30.4s // .....................................................................................~................................'......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ........................................................................................................~.............'.........................................................................................................*.......... + // mls v14.4s, v28.4s, v8.4s // ............................................................................................................~.........'.............................................................................................................*...... 
+ // cmge v27.4s, v31.4s, v15.4s // ...............................................................................................................e......'................................................................................................................~... + // cmge v28.4s, v15.4s, v30.4s // .............................................................................................................e........'..............................................................................................................~..... + // sub v28.4s, v27.4s, v28.4s // .....................................................................................................................e'.................................................................................................................... + // mls v15.4s, v28.4s, v8.4s // ..~...................................................................................................................'...*................................................................................................................ + // cmge v27.4s, v31.4s, v16.4s // ......................................................................................................................'*................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ......................................................................................................................*.................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ........~.............................................................................................................'.........*.......................................................................................................... 
+ // mls v16.4s, v28.4s, v8.4s // ............................................................................................~.........................'.............................................................................................*...................... + // str q13, [x0, #(4*(1024/8))] // ................................................................................e.....................................'.................................................................................~.................................. + // str q14, [x0, #(5*(1024/8))] // ..................................................................................................................~...'...................................................................................................................* + // str q15, [x0, #(6*(1024/8))] // .................~....................................................................................................'..................*................................................................................................. + // str q16, [x0, #(7*(1024/8))] // .....................................................................................................~................'......................................................................................................*............. + // sqrdmulh v27.4s, v9.4s, v26.4s // ..............................................................e.......................................................'...............................................................~.................................................... + // mul v9.4s, v9.4s, v25.4s // ..................................................................e...................................................'...................................................................~................................................ 
+ // mls v9.4s, v27.4s, v8.s[0] // .........................................................................e............................................'..........................................................................~......................................... + // sqrdmulh v27.4s, v10.4s, v26.4s // ....................................................................................................e.................'.....................................................................................................~.............. + // mul v10.4s, v10.4s, v25.4s // ...............................................................................................e......................'................................................................................................~................... + // mls v10.4s, v27.4s, v8.s[0] // ...........................................................................................................e..........'............................................................................................................~....... + // sqrdmulh v27.4s, v11.4s, v26.4s // .......................................................e..............................................................'........................................................~........................................................... + // mul v11.4s, v11.4s, v25.4s // ......................................................e...............................................................'.......................................................~............................................................ + // mls v11.4s, v27.4s, v8.s[0] // ............................................................e.........................................................'.............................................................~...................................................... 
+ // sqrdmulh v27.4s, v12.4s, v26.4s // ..................................................................................................e...................'...................................................................................................~................ + // mul v12.4s, v12.4s, v25.4s // ...................................................................................................e..................'....................................................................................................~............... + // mls v12.4s, v27.4s, v8.s[0] // ..........................................................................................................e...........'...........................................................................................................~........ + // cmge v27.4s, v31.4s, v9.4s // ....................................................................................e.................................'.....................................................................................~.............................. + // cmge v28.4s, v9.4s, v30.4s // .........................................................................................................e............'..........................................................................................................~......... + // sub v28.4s, v27.4s, v28.4s // .................................................................................................................e....'..................................................................................................................~. + // mls v9.4s, v28.4s, v8.4s // ...........................................................~..........................................................'............................................................*....................................................... 
+ // cmge v27.4s, v31.4s, v10.4s // ...................................................................................................................e..'.................................................................................................................... + // cmge v28.4s, v10.4s, v30.4s // ....................................................................................................................e.'.................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............~.........................................................................................................'.............*...................................................................................................... + // mls v10.4s, v28.4s, v8.4s // ....................~.................................................................................................'.....................*.............................................................................................. + // cmge v27.4s, v31.4s, v11.4s // ...................................................................e..................................................'....................................................................~............................................... + // cmge v28.4s, v11.4s, v30.4s // ....................................................................e.................................................'.....................................................................~.............................................. + // sub v28.4s, v27.4s, v28.4s // ........................................................................e.............................................'.........................................................................~.......................................... 
+ // mls v11.4s, v28.4s, v8.4s // ...........................................................................e..........................................'............................................................................~....................................... + // cmge v27.4s, v31.4s, v12.4s // ................................................................................................................e.....'.................................................................................................................~.. + // cmge v28.4s, v12.4s, v30.4s // ...~..................................................................................................................'....*............................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..........~...........................................................................................................'...........*........................................................................................................ + // mls v12.4s, v28.4s, v8.4s // ..........................................................................................~...........................'...........................................................................................*........................ + // str q9, [x0], #(16) // .................................................................~....................................................'..................................................................*................................................. + // str q10, [x0, #(-16 + 1*(1024/8))] // .................................................~....................................................................'..................................................*................................................................. 
+ // str q11, [x0, #(-16 + 2*(1024/8))] // ..................................................................................e...................................'...................................................................................~................................ + // str q12, [x0, #(-16 + 3*(1024/8))] // .................................................................................................~....................'..................................................................................................*................. sub count, count, #1 cbnz count, layer123_start - // Instructions: 25 - // Expected cycles: 14 - // Expected IPC: 1.79 - // - // Wall time: 0.08s - // User time: 0.08s - // - // ----- original position -----> - // 0 25 - // |------------------------|---- - str q7, [x0, #896] // ..............*............... - sqrdmulh v7.4S, v17.4S, v26.4S // *............................. - cmge v5.4S, v9.4S, v30.4S // ..*........................... - // gap // .............................. - str q13, [x0, #384] // ...................*.......... - cmge v11.4S, v27.4S, v30.4S // ...*.......................... - // gap // .............................. - // gap // .............................. - sub v17.4S, v22.4S, v5.4S // .......*...................... - cmge v22.4S, v31.4S, v24.4S // .....*........................ - // gap // .............................. - // gap // .............................. - mls v23.4S, v7.4S, v8.S[0] // ......*....................... - sub v7.4S, v28.4S, v11.4S // ........*..................... - // gap // .............................. - // gap // .............................. - mls v9.4S, v17.4S, v8.4S // ..........*................... - sub v17.4S, v22.4S, v14.4S // .........*.................... - // gap // .............................. - // gap // .............................. - mls v27.4S, v7.4S, v8.4S // .............*................ 
- cmge v28.4S, v10.4S, v30.4S // .*............................ - // gap // .............................. - // gap // .............................. - cmge v11.4S, v31.4S, v23.4S // ...........*.................. - cmge v21.4S, v23.4S, v30.4S // .....................*........ - // gap // .............................. - // gap // .............................. - sub v28.4S, v29.4S, v28.4S // ....*......................... - mls v24.4S, v17.4S, v8.4S // .................*............ - str q9, [x0, #256] // ....................*......... - // gap // .............................. - str q27, [x0, #640] // ...............*.............. - sub v16.4S, v11.4S, v21.4S // ......................*....... - // gap // .............................. - // gap // .............................. - mls v10.4S, v28.4S, v8.4S // ............*................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - mls v23.4S, v16.4S, v8.4S // .......................*...... - str q24, [x0, #128] // ..................*........... - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - // gap // .............................. - str q10, [x0, #512] // ................*............. - // gap // .............................. - // gap // .............................. - // gap // .............................. - str q23, [x0], #(16) // ........................*..... - // gap // .............................. - // gap // .............................. - // gap // .............................. 
- - // -------- new position --------> + // Instructions: 25 + // Expected cycles: 16 + // Expected IPC: 1.56 + // + // Wall time: 0.13s + // User time: 0.13s + // + // ----- original position -----> // 0 25 - // |------------------------|----- - // sqrdmulh v5.4S, v17.4S, v26.4S // .*............................. - // cmge v18.4S, v10.4S, v30.4S // ............*.................. - // cmge v16.4S, v9.4S, v30.4S // ..*............................ - // cmge v17.4S, v27.4S, v30.4S // ....*.......................... - // sub v6.4S, v29.4S, v18.4S // ...............*............... - // cmge v21.4S, v31.4S, v24.4S // ......*........................ - // mls v23.4S, v5.4S, v8.S[0] // .......*....................... - // sub v16.4S, v22.4S, v16.4S // .....*......................... - // sub v18.4S, v28.4S, v17.4S // ........*...................... - // sub v22.4S, v21.4S, v14.4S // ..........*.................... - // mls v9.4S, v16.4S, v8.4S // .........*..................... - // cmge v16.4S, v31.4S, v23.4S // .............*................. - // mls v10.4S, v6.4S, v8.4S // ....................*.......... - // mls v27.4S, v18.4S, v8.4S // ...........*................... - // str q7, [x0, #896] // *.............................. - // str q27, [x0, #640] // ..................*............ - // str q10, [x0, #512] // .......................*....... - // mls v24.4S, v22.4S, v8.4S // ................*.............. - // str q24, [x0, #128] // ......................*........ - // str q13, [x0, #384] // ...*........................... - // str q9, [x0, #256] // .................*............. - // cmge v21.4S, v23.4S, v30.4S // ..............*................ - // sub v18.4S, v16.4S, v21.4S // ...................*........... - // mls v23.4S, v18.4S, v8.4S // .....................*......... - // str q23, [x0], #(16) // ........................*...... + // |------------------------|---- + sub v11.4S, v28.4S, v12.4S // .....*........................ 
+ cmge v7.4S, v31.4S, v13.4S // .*............................ + // gap // .............................. + // gap // .............................. + cmge v18.4S, v23.4S, v30.4S // ...*.......................... + cmge v6.4S, v13.4S, v30.4S // *............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v4.4S, v11.4S, v0.S[1] // .........*.................... + mul v11.4S, v11.4S, v0.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + sub v19.4S, v21.4S, v18.4S // ......*....................... + sub v21.4S, v7.4S, v6.4S // ....*......................... + // gap // .............................. + // gap // .............................. + mls v5.4S, v20.4S, v8.4S // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v13.4S, v21.4S, v8.4S // ...................*.......... + mls v11.4S, v4.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + mls v23.4S, v19.4S, v8.4S // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v6.4S, v15.4S, v14.4S // .......*...................... + str q5, [x0, #768] // ..........*................... + // gap // .............................. + // gap // .............................. + str q13, [x0, #896] // .....................*........ + cmge v7.4S, v11.4S, v30.4S // ................*............. + cmge v13.4S, v31.4S, v11.4S // .................*............ + // gap // .............................. + mls v27.4S, v6.4S, v8.4S // ...........*.................. + str q23, [x0, #384] // ....................*......... + // gap // .............................. 
+ // gap // .............................. + sub v17.4S, v13.4S, v7.4S // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v10.4S, v16.4S, v8.4S // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v11.4S, v17.4S, v8.4S // .......................*...... + str q27, [x0, #128] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x0], #(16) // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q11, [x0, #624] // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // cmge v11.4S, v13.4S, v30.4S // ...*........................... + // cmge v19.4S, v31.4S, v13.4S // .*............................. + // mls v5.4S, v20.4S, v8.4S // ........*...................... + // cmge v22.4S, v23.4S, v30.4S // ..*............................ + // sub v20.4S, v19.4S, v11.4S // .......*....................... + // sub v19.4S, v28.4S, v12.4S // *.............................. + // sub v18.4S, v21.4S, v22.4S // ......*........................ + // sub v15.4S, v15.4S, v14.4S // ............*.................. + // mul v9.4S, v19.4S, v0.S[0] // .....*......................... + // sqrdmulh v12.4S, v19.4S, v0.S[1] // ....*.......................... + // str q5, [x0, #768] // .............*................. 
+ // mls v27.4S, v15.4S, v8.4S // .................*............. + // mls v9.4S, v12.4S, v8.S[0] // ..........*.................... + // str q27, [x0, #128] // ......................*........ + // mls v10.4S, v16.4S, v8.4S // ....................*.......... + // str q10, [x0], #(16) // .......................*....... + // cmge v15.4S, v9.4S, v30.4S // ...............*............... + // cmge v29.4S, v31.4S, v9.4S // ................*.............. + // mls v23.4S, v18.4S, v8.4S // ...........*................... + // mls v13.4S, v20.4S, v8.4S // .........*..................... + // str q23, [x0, #368] // ..................*............ + // str q13, [x0, #880] // ..............*................ + // sub v14.4S, v29.4S, v15.4S // ...................*........... + // mls v9.4S, v14.4S, v8.4S // .....................*......... + // str q9, [x0, #624] // ........................*...... pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s index efd56dce..7ab93829 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, 
\const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - 
ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -187,7 +173,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -198,7 +184,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -208,7 +194,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -216,7 +202,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp 
d12, d13, [sp, #16*2] @@ -227,24 +213,30 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,1058 +379,1108 @@ _intt_dilithium_123_45678_opt_a55: qform_root3_tw .req q7 .p2align 2 - ldr q0, [x5], #(12*16) // .*........................................................................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - ldr q17, [x5, #-128] // ......*...................................................................................................................................... 
- // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - ldr q31, [x5, #-160] // ..........*.................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // *............................................................................................................................................ - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - ldr q18, [x5, #-144] // ..............*.............................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - sub v6.4S, v21.4S, v22.4S // .........*................................................................................................................................... - // gap // ............................................................................................................................................. - sub v14.4S, v19.4S, v20.4S // .............*............................................................................................................................... - // gap // ............................................................................................................................................. - ldr q11, [x5, #-112] // ...............*............................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - mul v17.4S, v6.4S, v17.4S // ............*................................................................................................................................ - // gap // ............................................................................................................................................. - mul v31.4S, v14.4S, v31.4S // .................*........................................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v18.4S, v14.4S, v18.4S // ................*............................................................................................................................ - // gap // ............................................................................................................................................. - sqrdmulh v6.4S, v6.4S, v11.4S // ..................*.......................................................................................................................... - // gap // ............................................................................................................................................. - add v19.4S, v19.4S, v20.4S // ...*......................................................................................................................................... - // gap // ............................................................................................................................................. - add v14.4S, v21.4S, v22.4S // ..*.......................................................................................................................................... 
- // gap // ............................................................................................................................................. - mls v31.4S, v18.4S, v8.S[0] // ...................*......................................................................................................................... - // gap // ............................................................................................................................................. - mls v17.4S, v6.4S, v8.S[0] // ....................*........................................................................................................................ - // gap // ............................................................................................................................................. - sub v18.4S, v19.4S, v14.4S // .....*....................................................................................................................................... - // gap // ............................................................................................................................................. - ldr q6, [x5, #-176] // ....*........................................................................................................................................ - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - sub v11.4S, v31.4S, v17.4S // .....................*....................................................................................................................... 
- // gap // ............................................................................................................................................. - add v19.4S, v19.4S, v14.4S // .......................................................*..................................................................................... - // gap // ............................................................................................................................................. - mul v14.4S, v18.4S, v0.4S // .......*..................................................................................................................................... - // gap // ............................................................................................................................................. - mul v0.4S, v11.4S, v0.4S // ...................................................*......................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v18.4S, v18.4S, v6.4S // ........*.................................................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v6.4S, v11.4S, v6.4S // .......................*..................................................................................................................... - // gap // ............................................................................................................................................. - add v17.4S, v31.4S, v17.4S // .........................................*................................................................................................... 
- // gap // ............................................................................................................................................. - ldr q31, [x5, #-64] // ......................*...................................................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - mls v14.4S, v18.4S, v8.S[0] // ...........*................................................................................................................................. - // gap // ............................................................................................................................................. - mls v0.4S, v6.4S, v8.S[0] // ........................................................*.................................................................................... - // gap // ............................................................................................................................................. - trn1 v18.4S, v19.4S, v17.4S // ..........................................................*.................................................................................. - // gap // ............................................................................................................................................. - trn2 v17.4S, v19.4S, v17.4S // ...........................................................*................................................................................. 
- // gap // ............................................................................................................................................. - ldr q19, [x5, #-48] // ........................*.................................................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - trn1 v6.4S, v14.4S, v0.4S // ............................................................*................................................................................ - // gap // ............................................................................................................................................. - trn2 v0.4S, v14.4S, v0.4S // .............................................................*............................................................................... - // gap // ............................................................................................................................................. - ldr q14, [x5, #-32] // ..........................*.................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - trn2 v11.2D, v18.2D, v6.2D // ...............................................................*............................................................................. - // gap // ............................................................................................................................................. - trn2 v28.2D, v17.2D, v0.2D // ................................................................*............................................................................ - // gap // ............................................................................................................................................. - trn1 v18.2D, v18.2D, v6.2D // ............................................................................*................................................................ - // gap // ............................................................................................................................................. - sub v6.4S, v11.4S, v28.4S // ..................................................................*.......................................................................... - // gap // ............................................................................................................................................. - trn1 v0.2D, v17.2D, v0.2D // .............................................................................*............................................................... - // gap // ............................................................................................................................................. - add v17.4S, v11.4S, v28.4S // ....................................................................................................*........................................ 
- // gap // ............................................................................................................................................. - sub v11.4S, v18.4S, v0.4S // ................................................................................*............................................................ - // gap // ............................................................................................................................................. - add v0.4S, v18.4S, v0.4S // ...................................................................................................*......................................... - // gap // ............................................................................................................................................. - ldr q18, [x5, #-16] // ............................*................................................................................................................ - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - add v28.4S, v0.4S, v17.4S // .......................................................................................................*..................................... - // gap // ............................................................................................................................................. - sub v0.4S, v0.4S, v17.4S // .......................................................................................................................*..................... 
- // gap // ............................................................................................................................................. - ldr q17, [x5, #-80] // .......................................*..................................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - srshr v26.4S, v28.4S, #23 // ..........................................................................................................*.................................. - // gap // ............................................................................................................................................. - ldr q29, [x5, #-96] // ............................................*................................................................................................ - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - mls v28.4S, v26.4S, v8.4S // .............................................................................................................*............................... 
- // gap // ............................................................................................................................................. - ldr q26, [x4, #16] // .....................................................*....................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - ldr q9, [x4, #32] // ......................................................*...................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - mul v4.4S, v11.4S, v26.S[2] // ...................................................................................*......................................................... - // gap // ............................................................................................................................................. - sqrdmulh v11.4S, v11.4S, v26.S[3] // ....................................................................................*........................................................ 
- // gap // ............................................................................................................................................. - mul v20.4S, v6.4S, v9.S[0] // .....................................................................*....................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v6.4S, v6.4S, v9.S[1] // .................................................................................*........................................................... - // gap // ............................................................................................................................................. - ldr q25, [x4, #48] // .......................................................................*..................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x2] // .........................*................................................................................................................... - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. 
- // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - mls v20.4S, v6.4S, v8.S[0] // .....................................................................................*....................................................... - // gap // ............................................................................................................................................. - mls v4.4S, v11.4S, v8.S[0] // ........................................................................................*.................................................... - // gap // ............................................................................................................................................. - sub v6.4S, v21.4S, v22.4S // .............................*............................................................................................................... - // gap // ............................................................................................................................................. - sub v11.4S, v23.4S, v24.4S // ...........................*................................................................................................................. - // gap // ............................................................................................................................................. - add v13.4S, v21.4S, v22.4S // ..................................*.......................................................................................................... 
- // gap // ............................................................................................................................................. - mul v31.4S, v6.4S, v31.4S // .................................*........................................................................................................... - // gap // ............................................................................................................................................. - sqrdmulh v19.4S, v6.4S, v19.4S // ................................*............................................................................................................ - // gap // ............................................................................................................................................. - mul v6.4S, v11.4S, v14.4S // ..............................*.............................................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v18.4S, v11.4S, v18.4S // ...............................*............................................................................................................. - // gap // ............................................................................................................................................. - add v14.4S, v23.4S, v24.4S // ...................................*......................................................................................................... - // gap // ............................................................................................................................................. - mls v31.4S, v19.4S, v8.S[0] // .....................................*....................................................................................................... 
- // gap // ............................................................................................................................................. - add v27.4S, v4.4S, v20.4S // ............................................................................................*................................................ - // gap // ............................................................................................................................................. - mls v6.4S, v18.4S, v8.S[0] // ....................................*........................................................................................................ - // gap // ............................................................................................................................................. - sub v19.4S, v13.4S, v14.4S // ......................................*...................................................................................................... - // gap // ............................................................................................................................................. - add v18.4S, v13.4S, v14.4S // .............................................*............................................................................................... - // gap // ............................................................................................................................................. - srshr v14.4S, v27.4S, #23 // ...............................................................................................*............................................. - // gap // ............................................................................................................................................. - sub v11.4S, v31.4S, v6.4S // ........................................*.................................................................................................... 
- // gap // ............................................................................................................................................. - sqrdmulh v13.4S, v19.4S, v17.4S // ..........................................*.................................................................................................. - // gap // ............................................................................................................................................. - add v31.4S, v31.4S, v6.4S // ..............................................*.............................................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v17.4S, v11.4S, v17.4S // ...........................................*................................................................................................. - // gap // ............................................................................................................................................. - mul v6.4S, v11.4S, v29.4S // ...............................................*............................................................................................. - // gap // ............................................................................................................................................. - mul v19.4S, v19.4S, v29.4S // ................................................*............................................................................................ - // gap // ............................................................................................................................................. - trn2 v11.4S, v18.4S, v31.4S // .................................................*........................................................................................... 
- // gap // ............................................................................................................................................. - trn1 v31.4S, v18.4S, v31.4S // ..................................................*.......................................................................................... - // gap // ............................................................................................................................................. - mls v6.4S, v17.4S, v8.S[0] // .........................................................*................................................................................... - // gap // ............................................................................................................................................. - mls v19.4S, v13.4S, v8.S[0] // ....................................................*........................................................................................ - // gap // ............................................................................................................................................. - mls v27.4S, v14.4S, v8.4S // ..................................................................................................*.......................................... - // gap // ............................................................................................................................................. - sub v17.4S, v4.4S, v20.4S // ..............................................................................................................*.............................. - // gap // ............................................................................................................................................. - ldr q29, [x4], #64 // ...............................................................................................................*............................. 
- // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - // gap // ............................................................................................................................................. - trn1 v18.4S, v19.4S, v6.4S // ..............................................................*.............................................................................. - // gap // ............................................................................................................................................. - trn2 v19.4S, v19.4S, v6.4S // .................................................................*........................................................................... - // gap // ............................................................................................................................................. - mul v6.4S, v17.4S, v29.S[2] // ..................................................................................................................*.......................... - // gap // ............................................................................................................................................. - trn2 v14.2D, v31.2D, v18.2D // ...................................................................*......................................................................... - // gap // ............................................................................................................................................. - trn2 v4.2D, v11.2D, v19.2D // ....................................................................*........................................................................ 
- // gap // ............................................................................................................................................. - trn1 v19.2D, v11.2D, v19.2D // ........................................................................*.................................................................... - // gap // ............................................................................................................................................. - sub v11.4S, v14.4S, v4.4S // ......................................................................*...................................................................... - // gap // ............................................................................................................................................. - mul v20.4S, v0.4S, v29.S[2] // .............................................................................................................................*............... - // gap // ............................................................................................................................................. - sqrdmulh v0.4S, v0.4S, v29.S[3] // ..............................................................................................................................*.............. - // gap // ............................................................................................................................................. - trn1 v31.2D, v31.2D, v18.2D // .........................................................................*................................................................... - // gap // ............................................................................................................................................. - mul v18.4S, v11.4S, v25.S[0] // ..........................................................................*.................................................................. 
- // gap // ............................................................................................................................................. - sub v13.4S, v31.4S, v19.4S // ...........................................................................*................................................................. - // gap // ............................................................................................................................................. - sqrdmulh v11.4S, v11.4S, v25.S[1] // ..................................................................................*.......................................................... - // gap // ............................................................................................................................................. - add v31.4S, v31.4S, v19.4S // .........................................................................................*................................................... - // gap // ............................................................................................................................................. - mul v19.4S, v13.4S, v9.S[2] // ..............................................................................*.............................................................. - // gap // ............................................................................................................................................. - sqrdmulh v9.4S, v13.4S, v9.S[3] // ...............................................................................*............................................................. - // gap // ............................................................................................................................................. - mls v18.4S, v11.4S, v8.S[0] // .......................................................................................*..................................................... 
- // gap // ............................................................................................................................................. - add v14.4S, v14.4S, v4.4S // ..........................................................................................*.................................................. - // gap // ............................................................................................................................................. - sqrdmulh v17.4S, v17.4S, v29.S[3] // ....................................................................................................................*........................ - // gap // ............................................................................................................................................. - mls v19.4S, v9.4S, v8.S[0] // ......................................................................................*...................................................... - // gap // ............................................................................................................................................. - add v11.4S, v31.4S, v14.4S // ................................................................................................*............................................ - // gap // ............................................................................................................................................. - sub v31.4S, v31.4S, v14.4S // .............................................................................................*............................................... - // gap // ............................................................................................................................................. - mls v6.4S, v17.4S, v8.S[0] // ........................................................................................................................*.................... 
- // gap // ............................................................................................................................................. - srshr v17.4S, v11.4S, #23 // ........................................................................................................*.................................... - // gap // ............................................................................................................................................. - sub v14.4S, v19.4S, v18.4S // .....................................................................................................*....................................... - // gap // ............................................................................................................................................. - sqrdmulh v9.4S, v31.4S, v26.S[1] // ...........................................................................................................................*................. - // gap // ............................................................................................................................................. - mls v11.4S, v17.4S, v8.4S // ............................................................................................................*................................ - // gap // ............................................................................................................................................. - mul v17.4S, v14.4S, v26.S[0] // ...........................................................................................................*................................. - // gap // ............................................................................................................................................. - sqrdmulh v14.4S, v14.4S, v26.S[1] // .................................................................................................................*........................... 
- // gap // ............................................................................................................................................. - mul v31.4S, v31.4S, v26.S[0] // .................................................................................................................................*........... - // gap // ............................................................................................................................................. - add v26.4S, v28.4S, v11.4S // ................................................................................................................*............................ - // gap // ............................................................................................................................................. - sub v11.4S, v28.4S, v11.4S // ......................................................................................................................*...................... - // gap // ............................................................................................................................................. - add v4.4S, v19.4S, v18.4S // ...........................................................................................*................................................. - // gap // ............................................................................................................................................. - str q26, [x1], #(16*4) // ...................................................................................................................*......................... - // gap // ............................................................................................................................................. - mls v17.4S, v14.4S, v8.S[0] // .....................................................................................................................*....................... 
- // gap // ............................................................................................................................................. - srshr v19.4S, v4.4S, #23 // ..............................................................................................*.............................................. - // gap // ............................................................................................................................................. - sqrdmulh v18.4S, v11.4S, v29.S[1] // .........................................................................................................................*................... - // gap // ............................................................................................................................................. - mul v14.4S, v11.4S, v29.S[0] // ..........................................................................................................................*.................. - // gap // ............................................................................................................................................. - sub v11.4S, v6.4S, v17.4S // ............................................................................................................................*................ - // gap // ............................................................................................................................................. - mls v4.4S, v19.4S, v8.4S // .................................................................................................*........................................... - // gap // ............................................................................................................................................. - add v17.4S, v6.4S, v17.4S // .......................................................................................................................................*..... 
- // gap // ............................................................................................................................................. - sqrdmulh v19.4S, v11.4S, v29.S[1] // ................................................................................................................................*............ - // gap // ............................................................................................................................................. - mul v6.4S, v11.4S, v29.S[0] // ...............................................................................................................................*............. - // gap // ............................................................................................................................................. - mls v14.4S, v18.4S, v8.S[0] // ..................................................................................................................................*.......... - // gap // ............................................................................................................................................. - mls v20.4S, v0.4S, v8.S[0] // ...................................................................................................................................*......... - // gap // ............................................................................................................................................. - str q17, [x1, #-16] // ..........................................................................................................................................*.. - // gap // ............................................................................................................................................. - mls v6.4S, v19.4S, v8.S[0] // ....................................................................................................................................*........ 
- // gap // ............................................................................................................................................. - mls v31.4S, v9.4S, v8.S[0] // .....................................................................................................................................*....... - // gap // ............................................................................................................................................. - str q14, [x2], #(16*4) // ......................................................................................................................................*...... - // gap // ............................................................................................................................................. - add v0.4S, v27.4S, v4.4S // ......................................................................................................*...................................... - // gap // ............................................................................................................................................. - str q6, [x2, #-16] // ........................................................................................................................................*.... - // gap // ............................................................................................................................................. - add v17.4S, v20.4S, v31.4S // .........................................................................................................................................*... - // gap // ............................................................................................................................................. - str q0, [x1, #-48] // .........................................................................................................*................................... 
- // gap // ............................................................................................................................................. - sub v10.4S, v20.4S, v31.4S // ...........................................................................................................................................*. - // gap // ............................................................................................................................................. - str q17, [x1, #-32] // ............................................................................................................................................* - // gap // ............................................................................................................................................. - - // original source code - // ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // ...*......................................................................................................................................... - // ldr q2, [x5], #(12*16) // *............................................................................................................................................ - // add v0.4S, v18.4S, v19.4S // .............*............................................................................................................................... - // add v21.4S, v16.4S, v17.4S // ............*................................................................................................................................ - // ldr q27, [x5, #-176] // .................*........................................................................................................................... - // sub v24.4S, v21.4S, v0.4S // ................*............................................................................................................................ 
- // ldr q25, [x5, #-128] // .*........................................................................................................................................... - // mul v28.4S, v24.4S, v2.4S // ....................*........................................................................................................................ - // sqrdmulh v23.4S, v24.4S, v27.4S // ......................*...................................................................................................................... - // sub v12.4S, v18.4S, v19.4S // .....*....................................................................................................................................... - // ldr q3, [x5, #-160] // ..*.......................................................................................................................................... - // mls v28.4S, v23.4S, v8.S[0] // ..........................*.................................................................................................................. - // mul v1.4S, v12.4S, v25.4S // ........*.................................................................................................................................... - // sub v13.4S, v16.4S, v17.4S // ......*...................................................................................................................................... - // ldr q22, [x5, #-144] // ....*........................................................................................................................................ - // ldr q5, [x5, #-112] // .......*..................................................................................................................................... - // sqrdmulh v20.4S, v13.4S, v22.4S // ..........*.................................................................................................................................. 
- // mul v11.4S, v13.4S, v3.4S // .........*................................................................................................................................... - // sqrdmulh v30.4S, v12.4S, v5.4S // ...........*................................................................................................................................. - // mls v11.4S, v20.4S, v8.S[0] // ..............*.............................................................................................................................. - // mls v1.4S, v30.4S, v8.S[0] // ...............*............................................................................................................................. - // sub v31.4S, v11.4S, v1.4S // ..................*.......................................................................................................................... - // ldr q12, [x5, #-64] // .........................*................................................................................................................... - // sqrdmulh v24.4S, v31.4S, v27.4S // .......................*..................................................................................................................... - // ldr q4, [x5, #-48] // ..............................*.............................................................................................................. - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ........................................................*.................................................................................... - // ldr q17, [x5, #-32] // .................................*........................................................................................................... - // sub v20.4S, v15.4S, v16.4S // ............................................................*................................................................................ 
- // ldr q5, [x5, #-16] // ..........................................*.................................................................................................. - // sub v30.4S, v13.4S, v14.4S // ...........................................................*................................................................................. - // mul v3.4S, v20.4S, v17.4S // ................................................................*............................................................................ - // sqrdmulh v27.4S, v20.4S, v5.4S // .................................................................*........................................................................... - // sqrdmulh v29.4S, v30.4S, v4.4S // ...............................................................*............................................................................. - // mul v12.4S, v30.4S, v12.4S // ..............................................................*.............................................................................. - // add v14.4S, v13.4S, v14.4S // .............................................................*............................................................................... - // add v6.4S, v15.4S, v16.4S // ..................................................................*.......................................................................... - // mls v3.4S, v27.4S, v8.S[0] // .....................................................................*....................................................................... - // mls v12.4S, v29.4S, v8.S[0] // ...................................................................*......................................................................... - // sub v20.4S, v14.4S, v6.4S // ......................................................................*...................................................................... 
- // ldr q26, [x5, #-80] // .............................................*............................................................................................... - // sub v13.4S, v12.4S, v3.4S // .........................................................................*................................................................... - // add v22.4S, v11.4S, v1.4S // ........................*.................................................................................................................... - // sqrdmulh v4.4S, v20.4S, v26.4S // ..........................................................................*.................................................................. - // sqrdmulh v7.4S, v13.4S, v26.4S // ............................................................................*................................................................ - // ldr q19, [x5, #-96] // ...............................................*............................................................................................. - // add v25.4S, v14.4S, v6.4S // .......................................................................*..................................................................... - // add v1.4S, v12.4S, v3.4S // ...........................................................................*................................................................. - // mul v5.4S, v13.4S, v19.4S // .............................................................................*............................................................... - // mul v29.4S, v20.4S, v19.4S // ..............................................................................*.............................................................. - // trn2 v18.4S, v25.4S, v1.4S // ...............................................................................*............................................................. 
- // trn1 v12.4S, v25.4S, v1.4S // ................................................................................*............................................................ - // mul v30.4S, v31.4S, v2.4S // .....................*....................................................................................................................... - // mls v29.4S, v4.4S, v8.S[0] // ..................................................................................*.......................................................... - // ldr q10, [x4, #16] // .................................................*........................................................................................... - // ldr q15, [x4, #32] // ..................................................*.......................................................................................... - // add v27.4S, v21.4S, v0.4S // ...................*......................................................................................................................... - // mls v30.4S, v24.4S, v8.S[0] // ...........................*................................................................................................................. - // mls v5.4S, v7.4S, v8.S[0] // .................................................................................*........................................................... - // trn1 v17.4S, v27.4S, v22.4S // ............................*................................................................................................................ - // trn2 v2.4S, v27.4S, v22.4S // .............................*............................................................................................................... - // trn1 v11.4S, v28.4S, v30.4S // ...............................*............................................................................................................. 
- // trn2 v26.4S, v28.4S, v30.4S // ................................*............................................................................................................ - // trn1 v4.4S, v29.4S, v5.4S // ......................................................................................*...................................................... - // trn2 v7.2D, v17.2D, v11.2D // ..................................*.......................................................................................................... - // trn2 v3.2D, v2.2D, v26.2D // ...................................*......................................................................................................... - // trn2 v0.4S, v29.4S, v5.4S // .......................................................................................*..................................................... - // sub v20.4S, v7.4S, v3.4S // .....................................*....................................................................................................... - // trn2 v14.2D, v12.2D, v4.2D // .........................................................................................*................................................... - // trn2 v27.2D, v18.2D, v0.2D // ..........................................................................................*.................................................. - // mul v31.4S, v20.4S, v15.S[0] // .....................................................*....................................................................................... - // sub v1.4S, v14.4S, v27.4S // ............................................................................................*................................................ - // ldr q23, [x4, #48] // .......................................................*..................................................................................... 
- // trn1 v19.2D, v18.2D, v0.2D // ...........................................................................................*................................................. - // trn1 v25.2D, v12.2D, v4.2D // ...............................................................................................*............................................. - // mul v29.4S, v1.4S, v23.S[0] // ................................................................................................*............................................ - // sub v5.4S, v25.4S, v19.4S // .................................................................................................*........................................... - // trn1 v6.2D, v17.2D, v11.2D // ....................................*........................................................................................................ - // trn1 v9.2D, v2.2D, v26.2D // ......................................*...................................................................................................... - // mul v24.4S, v5.4S, v15.S[2] // ....................................................................................................*........................................ - // sqrdmulh v11.4S, v5.4S, v15.S[3] // .....................................................................................................*....................................... - // sub v16.4S, v6.4S, v9.4S // ........................................*.................................................................................................... - // sqrdmulh v13.4S, v20.4S, v15.S[1] // ......................................................*...................................................................................... - // sqrdmulh v30.4S, v1.4S, v23.S[1] // ..................................................................................................*.......................................... 
- // mul v22.4S, v16.4S, v10.S[2] // ...................................................*......................................................................................... - // sqrdmulh v17.4S, v16.4S, v10.S[3] // ....................................................*........................................................................................ - // mls v31.4S, v13.4S, v8.S[0] // .........................................................*................................................................................... - // mls v24.4S, v11.4S, v8.S[0] // .........................................................................................................*................................... - // mls v29.4S, v30.4S, v8.S[0] // ......................................................................................................*...................................... - // mls v22.4S, v17.4S, v8.S[0] // ..........................................................*.................................................................................. - // add v17.4S, v25.4S, v19.4S // ...................................................................................................*......................................... - // add v18.4S, v14.4S, v27.4S // .......................................................................................................*..................................... - // add v4.4S, v24.4S, v29.4S // ......................................................................................................................*...................... - // add v27.4S, v22.4S, v31.4S // ....................................................................*........................................................................ - // sub v5.4S, v17.4S, v18.4S // ...........................................................................................................*................................. 
- // srshr v25.4S, v4.4S, #23 // .........................................................................................................................*................... - // srshr v26.4S, v27.4S, #23 // ........................................................................*.................................................................... - // add v21.4S, v17.4S, v18.4S // ..........................................................................................................*.................................. - // mls v4.4S, v25.4S, v8.4S // .............................................................................................................................*............... - // mls v27.4S, v26.4S, v8.4S // ...................................................................................*......................................................... - // add v28.4S, v6.4S, v9.4S // .........................................*................................................................................................... - // add v23.4S, v7.4S, v3.4S // .......................................*..................................................................................................... - // sub v13.4S, v24.4S, v29.4S // ..............................................................................................................*.............................. - // add v18.4S, v27.4S, v4.4S // .......................................................................................................................................*..... - // add v2.4S, v28.4S, v23.4S // ...........................................*................................................................................................. - // srshr v26.4S, v21.4S, #23 // .............................................................................................................*............................... 
- // str q18, [x1, #16] // ..........................................................................................................................................*.. - // srshr v12.4S, v2.4S, #23 // ..............................................*.............................................................................................. - // mul v19.4S, v13.4S, v10.S[0] // .................................................................................................................*........................... - // mls v21.4S, v26.4S, v8.4S // ................................................................................................................*............................ - // mls v2.4S, v12.4S, v8.4S // ................................................*............................................................................................ - // sub v6.4S, v22.4S, v31.4S // ....................................................................................*........................................................ - // ldr q29, [x4], #64 // .....................................................................................*....................................................... - // add v14.4S, v2.4S, v21.4S // ....................................................................................................................*........................ - // sqrdmulh v11.4S, v13.4S, v10.S[1] // ..................................................................................................................*.......................... - // mul v9.4S, v6.4S, v29.S[2] // ........................................................................................*.................................................... - // str q14, [x1], #(16*4) // .......................................................................................................................*..................... 
- // sqrdmulh v1.4S, v6.4S, v29.S[3] // ........................................................................................................*.................................... - // mls v19.4S, v11.4S, v8.S[0] // ........................................................................................................................*.................... - // sub v7.4S, v2.4S, v21.4S // .....................................................................................................................*....................... - // sub v14.4S, v28.4S, v23.4S // ............................................*................................................................................................ - // mls v9.4S, v1.4S, v8.S[0] // ............................................................................................................*................................ - // sqrdmulh v1.4S, v7.4S, v29.S[1] // ..........................................................................................................................*.................. - // mul v23.4S, v7.4S, v29.S[0] // ...........................................................................................................................*................. - // sqrdmulh v0.4S, v5.4S, v10.S[1] // ...............................................................................................................*............................. - // sub v15.4S, v9.4S, v19.4S // ............................................................................................................................*................ - // mul v12.4S, v14.4S, v29.S[2] // .............................................................................................*............................................... - // sqrdmulh v17.4S, v14.4S, v29.S[3] // ..............................................................................................*.............................................. 
- // mul v30.4S, v15.4S, v29.S[0] // ................................................................................................................................*............ - // sqrdmulh v31.4S, v15.4S, v29.S[1] // ...............................................................................................................................*............. - // mul v16.4S, v5.4S, v10.S[0] // ...................................................................................................................*......................... - // mls v23.4S, v1.4S, v8.S[0] // .................................................................................................................................*........... - // mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................................................*.......... - // mls v30.4S, v31.4S, v8.S[0] // ....................................................................................................................................*........ - // mls v16.4S, v0.4S, v8.S[0] // .....................................................................................................................................*....... - // str q23, [x2], #(16*4) // ......................................................................................................................................*...... - // add v24.4S, v9.4S, v19.4S // ..............................................................................................................................*.............. - // str q30, [x2, #-16] // ........................................................................................................................................*.... - // add v22.4S, v12.4S, v16.4S // .........................................................................................................................................*... 
- // str q24, [x1, #-16] // ...................................................................................................................................*......... - // sub v10.4S, v12.4S, v16.4S // ...........................................................................................................................................*. - // str q22, [x1, #-32] // ............................................................................................................................................* + // Instructions: 136 + // Expected cycles: 168 + // Expected IPC: 0.81 + // + // Wall time: 8.05s + // User time: 8.05s + // + // ---------------------------------------------------------- original position ----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------- + ldr q6, [x5, #112] // *....................................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q26, [x5, #144] // ......................................*................................................................................................. + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q17, [x5, #160] // ........................................*............................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x2] // .....................................*.................................................................................................. + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q5, [x5, #128] // ..........................................*............................................................................................. + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + sub v7.4S, v18.4S, v19.4S // .......................................*................................................................................................ + // gap // ........................................................................................................................................ + sub v14.4S, v20.4S, v21.4S // ............................................*........................................................................................... + // gap // ........................................................................................................................................ + ldr q4, [x5, #176] // ..............................................*......................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + sqrdmulh v26.4S, v7.4S, v26.4S // .........................................*.............................................................................................. + // gap // ........................................................................................................................................ 
+ mul v17.4S, v14.4S, v17.4S // ................................................*....................................................................................... + // gap // ........................................................................................................................................ + mul v5.4S, v7.4S, v5.4S // .............................................*.......................................................................................... + // gap // ........................................................................................................................................ + sqrdmulh v7.4S, v14.4S, v4.4S // .................................................*...................................................................................... + // gap // ........................................................................................................................................ + add v18.4S, v18.4S, v19.4S // ...................................................*.................................................................................... + // gap // ........................................................................................................................................ + add v14.4S, v20.4S, v21.4S // ......................................................*................................................................................. + // gap // ........................................................................................................................................ + mls v5.4S, v26.4S, v8.S[0] // ....................................................*................................................................................... + // gap // ........................................................................................................................................ 
+ mls v17.4S, v7.4S, v8.S[0] // .....................................................*.................................................................................. + // gap // ........................................................................................................................................ + sub v26.4S, v18.4S, v14.4S // .........................................................*.............................................................................. + // gap // ........................................................................................................................................ + ldr q7, [x5, #16] // .*...................................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + sub v4.4S, v5.4S, v17.4S // ..........................................................*............................................................................. + // gap // ........................................................................................................................................ + add v18.4S, v18.4S, v14.4S // .................................................................*...................................................................... + // gap // ........................................................................................................................................ 
+ sqrdmulh v14.4S, v26.4S, v6.4S // ............................................................*........................................................................... + // gap // ........................................................................................................................................ + sqrdmulh v10.4S, v4.4S, v6.4S // .............................................................*.......................................................................... + // gap // ........................................................................................................................................ + add v6.4S, v5.4S, v17.4S // ........................................................*............................................................................... + // gap // ........................................................................................................................................ + ldr q17, [x5, #48] // ..*..................................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + trn2 v5.4S, v18.4S, v6.4S // .....................................................................*.................................................................. + // gap // ........................................................................................................................................ 
+ trn1 v18.4S, v18.4S, v6.4S // ......................................................................*................................................................. + // gap // ........................................................................................................................................ + ldr q1, [x5], #(12*16) // ...*.................................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q31, [x5, #-112] // ....*................................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q27, [x5, #-160] // .....*.................................................................................................................................. + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q11, [x5, #-128] // .......*................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q15, [x4, #48] // .................*...................................................................................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q21, [x5, #-96] // .....................*.................................................................................................................. + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + ldr q6, [x4, #16] // .......................................................*................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + mul v26.4S, v26.4S, v21.4S // ..............................................................*......................................................................... + // gap // ........................................................................................................................................ + mul v21.4S, v4.4S, v21.4S // ...............................................................*........................................................................ + // gap // ........................................................................................................................................ + ldr q2, [x4, #32] // ...................................................................................*.................................................... + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ 
+ mls v26.4S, v14.4S, v8.S[0] // ..................................................................*..................................................................... + // gap // ........................................................................................................................................ + mls v21.4S, v10.4S, v8.S[0] // ...................................................................*.................................................................... + // gap // ........................................................................................................................................ + ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1] // ......*................................................................................................................................. + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ + ldr q4, [x4], #64 // .......................................................................................................*................................ + // gap // ........................................................................................................................................ + // gap // ........................................................................................................................................ 
+ // gap // ........................................................................................................................................ + sub v14.4S, v22.4S, v23.4S // ........*............................................................................................................................... + // gap // ........................................................................................................................................ + add v10.4S, v22.4S, v23.4S // .........*.............................................................................................................................. + // gap // ........................................................................................................................................ + sub v23.4S, v24.4S, v25.4S // ............*........................................................................................................................... + // gap // ........................................................................................................................................ + sqrdmulh v17.4S, v14.4S, v17.4S // ..............*......................................................................................................................... + // gap // ........................................................................................................................................ + mul v14.4S, v14.4S, v27.4S // ...........*............................................................................................................................ + // gap // ........................................................................................................................................ + sqrdmulh v31.4S, v23.4S, v31.4S // ...............*........................................................................................................................ 
+ // gap // ........................................................................................................................................ + mul v27.4S, v23.4S, v11.4S // ................*....................................................................................................................... + // gap // ........................................................................................................................................ + add v11.4S, v24.4S, v25.4S // ..........*............................................................................................................................. + // gap // ........................................................................................................................................ + trn2 v23.4S, v26.4S, v21.4S // ........................................................................*............................................................... + // gap // ........................................................................................................................................ + mls v14.4S, v17.4S, v8.S[0] // ..................*..................................................................................................................... + // gap // ........................................................................................................................................ + mls v27.4S, v31.4S, v8.S[0] // ...................*.................................................................................................................... + // gap // ........................................................................................................................................ + trn2 v17.2D, v5.2D, v23.2D // ...........................................................................*............................................................ 
+ // gap // ........................................................................................................................................ + trn1 v5.2D, v5.2D, v23.2D // ..............................................................................*......................................................... + // gap // ........................................................................................................................................ + sub v31.4S, v10.4S, v11.4S // ....................*................................................................................................................... + // gap // ........................................................................................................................................ + sub v23.4S, v14.4S, v27.4S // ......................*................................................................................................................. + // gap // ........................................................................................................................................ + trn1 v26.4S, v26.4S, v21.4S // .......................................................................*................................................................ + // gap // ........................................................................................................................................ + sqrdmulh v21.4S, v31.4S, v7.4S // .......................*................................................................................................................ + // gap // ........................................................................................................................................ + sqrdmulh v7.4S, v23.4S, v7.4S // ..........................*............................................................................................................. 
+ // gap // ........................................................................................................................................ + trn2 v16.2D, v18.2D, v26.2D // ..........................................................................*............................................................. + // gap // ........................................................................................................................................ + trn1 v26.2D, v18.2D, v26.2D // ............................................................................*........................................................... + // gap // ........................................................................................................................................ + mul v18.4S, v31.4S, v1.4S // ........................*............................................................................................................... + // gap // ........................................................................................................................................ + mul v1.4S, v23.4S, v1.4S // .........................*.............................................................................................................. + // gap // ........................................................................................................................................ + add v10.4S, v10.4S, v11.4S // .............*.......................................................................................................................... + // gap // ........................................................................................................................................ + add v14.4S, v14.4S, v27.4S // ...........................*............................................................................................................ 
+ // gap // ........................................................................................................................................ + mls v18.4S, v21.4S, v8.S[0] // ............................*........................................................................................................... + // gap // ........................................................................................................................................ + mls v1.4S, v7.4S, v8.S[0] // .............................*.......................................................................................................... + // gap // ........................................................................................................................................ + trn1 v7.4S, v10.4S, v14.4S // ..............................*......................................................................................................... + // gap // ........................................................................................................................................ + sub v31.4S, v16.4S, v17.4S // .............................................................................*.......................................................... + // gap // ........................................................................................................................................ + trn2 v14.4S, v10.4S, v14.4S // ...............................*........................................................................................................ + // gap // ........................................................................................................................................ + trn2 v10.4S, v18.4S, v1.4S // ................................*....................................................................................................... 
+ // gap // ........................................................................................................................................ + sqrdmulh v27.4S, v31.4S, v15.S[1] // ................................................................................*....................................................... + // gap // ........................................................................................................................................ + mul v31.4S, v31.4S, v15.S[0] // .................................................................................*...................................................... + // gap // ........................................................................................................................................ + trn1 v18.4S, v18.4S, v1.4S // .................................*...................................................................................................... + // gap // ........................................................................................................................................ + trn2 v1.2D, v14.2D, v10.2D // ..................................*..................................................................................................... + // gap // ........................................................................................................................................ + trn1 v14.2D, v14.2D, v10.2D // ...................................*.................................................................................................... + // gap // ........................................................................................................................................ + trn1 v10.2D, v7.2D, v18.2D // ....................................*................................................................................................... 
+ // gap // ........................................................................................................................................ + trn2 v18.2D, v7.2D, v18.2D // ...............................................*........................................................................................ + // gap // ........................................................................................................................................ + add v7.4S, v10.4S, v14.4S // ...........................................*............................................................................................ + // gap // ........................................................................................................................................ + sub v14.4S, v10.4S, v14.4S // ..................................................*..................................................................................... + // gap // ........................................................................................................................................ + add v10.4S, v18.4S, v1.4S // .........................................................................*.............................................................. + // gap // ........................................................................................................................................ + sub v11.4S, v26.4S, v5.4S // ..................................................................................*..................................................... + // gap // ........................................................................................................................................ + mul v15.4S, v14.4S, v6.S[2] // ...........................................................*............................................................................ 
+ // gap // ........................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v6.S[3] // ................................................................*....................................................................... + // gap // ........................................................................................................................................ + sub v18.4S, v18.4S, v1.4S // ....................................................................................*................................................... + // gap // ........................................................................................................................................ + sqrdmulh v1.4S, v11.4S, v2.S[3] // ......................................................................................*................................................. + // gap // ........................................................................................................................................ + mul v11.4S, v11.4S, v2.S[2] // .........................................................................................*.............................................. + // gap // ........................................................................................................................................ + mul v23.4S, v18.4S, v2.S[0] // .......................................................................................*................................................ + // gap // ........................................................................................................................................ + sqrdmulh v18.4S, v18.4S, v2.S[1] // ........................................................................................*............................................... 
+ // gap // ........................................................................................................................................ + mls v15.4S, v14.4S, v8.S[0] // ....................................................................*................................................................... + // gap // ........................................................................................................................................ + sub v14.4S, v7.4S, v10.4S // ...............................................................................*........................................................ + // gap // ........................................................................................................................................ + add v7.4S, v7.4S, v10.4S // .....................................................................................*.................................................. + // gap // ........................................................................................................................................ + mls v31.4S, v27.4S, v8.S[0] // ...........................................................................................*............................................ + // gap // ........................................................................................................................................ + mls v23.4S, v18.4S, v8.S[0] // ............................................................................................*........................................... + // gap // ........................................................................................................................................ + srshr v18.4S, v7.4S, #23 // ..........................................................................................*............................................. 
+ // gap // ........................................................................................................................................ + mls v11.4S, v1.4S, v8.S[0] // .............................................................................................*.......................................... + // gap // ........................................................................................................................................ + add v17.4S, v16.4S, v17.4S // ..............................................................................................*......................................... + // gap // ........................................................................................................................................ + add v21.4S, v26.4S, v5.4S // ...............................................................................................*........................................ + // gap // ........................................................................................................................................ + add v26.4S, v15.4S, v23.4S // ................................................................................................*....................................... + // gap // ........................................................................................................................................ + sub v5.4S, v11.4S, v31.4S // .................................................................................................*...................................... + // gap // ........................................................................................................................................ + add v10.4S, v11.4S, v31.4S // ..................................................................................................*..................................... 
+ // gap // ........................................................................................................................................ + mls v7.4S, v18.4S, v8.4S // ...................................................................................................*.................................... + // gap // ........................................................................................................................................ + sqrdmulh v18.4S, v5.4S, v6.S[1] // ....................................................................................................*................................... + // gap // ........................................................................................................................................ + mul v5.4S, v5.4S, v6.S[0] // .....................................................................................................*.................................. + // gap // ........................................................................................................................................ + srshr v1.4S, v26.4S, #23 // ......................................................................................................*................................. + // gap // ........................................................................................................................................ + sub v31.4S, v15.4S, v23.4S // ........................................................................................................*............................... + // gap // ........................................................................................................................................ + srshr v27.4S, v10.4S, #23 // .........................................................................................................*.............................. 
+ // gap // ........................................................................................................................................ + sqrdmulh v11.4S, v14.4S, v4.S[3] // ..........................................................................................................*............................. + // gap // ........................................................................................................................................ + sqrdmulh v15.4S, v31.4S, v4.S[3] // ...........................................................................................................*............................ + // gap // ........................................................................................................................................ + mul v31.4S, v31.4S, v4.S[2] // ............................................................................................................*........................... + // gap // ........................................................................................................................................ + add v2.4S, v21.4S, v17.4S // .............................................................................................................*.......................... + // gap // ........................................................................................................................................ + mls v26.4S, v1.4S, v8.4S // ..............................................................................................................*......................... + // gap // ........................................................................................................................................ + mls v10.4S, v27.4S, v8.4S // ...............................................................................................................*........................ 
+ // gap // ........................................................................................................................................ + mls v5.4S, v18.4S, v8.S[0] // ................................................................................................................*....................... + // gap // ........................................................................................................................................ + mls v31.4S, v15.4S, v8.S[0] // .................................................................................................................*...................... + // gap // ........................................................................................................................................ + srshr v18.4S, v2.4S, #23 // ..................................................................................................................*..................... + // gap // ........................................................................................................................................ + sub v1.4S, v26.4S, v10.4S // ...................................................................................................................*.................... + // gap // ........................................................................................................................................ + add v26.4S, v26.4S, v10.4S // ....................................................................................................................*................... + // gap // ........................................................................................................................................ + mls v2.4S, v18.4S, v8.4S // .....................................................................................................................*.................. 
+ // gap // ........................................................................................................................................ + sqrdmulh v18.4S, v1.4S, v4.S[1] // ......................................................................................................................*................. + // gap // ........................................................................................................................................ + mul v10.4S, v1.4S, v4.S[0] // .......................................................................................................................*................ + // gap // ........................................................................................................................................ + mul v27.4S, v14.4S, v4.S[2] // ........................................................................................................................*............... + // gap // ........................................................................................................................................ + sub v14.4S, v7.4S, v2.4S // .........................................................................................................................*.............. + // gap // ........................................................................................................................................ + add v7.4S, v7.4S, v2.4S // ..........................................................................................................................*............. + // gap // ........................................................................................................................................ + mls v10.4S, v18.4S, v8.S[0] // ...........................................................................................................................*............ 
+ // gap // ........................................................................................................................................ + mls v27.4S, v11.4S, v8.S[0] // ............................................................................................................................*........... + // gap // ........................................................................................................................................ + sqrdmulh v18.4S, v14.4S, v4.S[1] // .............................................................................................................................*.......... + // gap // ........................................................................................................................................ + mul v14.4S, v14.4S, v4.S[0] // ..............................................................................................................................*......... + // gap // ........................................................................................................................................ + sub v1.4S, v31.4S, v5.4S // ...............................................................................................................................*........ + // gap // ........................................................................................................................................ + add v31.4S, v31.4S, v5.4S // ................................................................................................................................*....... + // gap // ........................................................................................................................................ + str q10, [x2, #16] // .................................................................................................................................*...... 
+ // gap // ........................................................................................................................................ + mls v14.4S, v18.4S, v8.S[0] // ..................................................................................................................................*..... + // gap // ........................................................................................................................................ + str q26, [x1, #16] // ...................................................................................................................................*.... + // gap // ........................................................................................................................................ + sqrdmulh v12.4S, v1.4S, v4.S[1] // ....................................................................................................................................*... + // gap // ........................................................................................................................................ + str q7, [x1], #(16*4) // .....................................................................................................................................*.. + // gap // ........................................................................................................................................ + mul v23.4S, v1.4S, v4.S[0] // ......................................................................................................................................*. + // gap // ........................................................................................................................................ 
+ str q14, [x2], #(16*4) // .......................................................................................................................................* + // gap // ........................................................................................................................................ + + // ------------------------------------------------------------ new position -------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------- + // ldr q19, [x5, #112] // *....................................................................................................................................... + // ldr q10, [x5, #16] // .................*...................................................................................................................... + // ldr q11, [x5, #48] // .......................*................................................................................................................ + // ldr q24, [x5], #(12*16) // ..........................*............................................................................................................. + // ldr q17, [x5, #-112] // ...........................*............................................................................................................ + // ldr q5, [x5, #-160] // ............................*........................................................................................................... + // ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // ......................................*................................................................................................. + // ldr q31, [x5, #-128] // .............................*.......................................................................................................... 
+ // sub v6.4S, v0.4S, v1.4S // ........................................*............................................................................................... + // add v25.4S, v0.4S, v1.4S // .........................................*.............................................................................................. + // add v13.4S, v2.4S, v3.4S // ...............................................*........................................................................................ + // mul v20.4S, v6.4S, v5.4S // ............................................*........................................................................................... + // sub v2.4S, v2.4S, v3.4S // ..........................................*............................................................................................. + // add v23.4S, v25.4S, v13.4S // ..............................................................*......................................................................... + // sqrdmulh v21.4S, v6.4S, v11.4S // ...........................................*............................................................................................ + // sqrdmulh v15.4S, v2.4S, v17.4S // .............................................*.......................................................................................... + // mul v16.4S, v2.4S, v31.4S // ..............................................*......................................................................................... + // ldr q2, [x4, #48] // ..............................*......................................................................................................... + // mls v20.4S, v21.4S, v8.S[0] // .................................................*...................................................................................... 
+ // mls v16.4S, v15.4S, v8.S[0] // ..................................................*..................................................................................... + // sub v27.4S, v25.4S, v13.4S // .....................................................*.................................................................................. + // ldr q26, [x5, #-96] // ...............................*........................................................................................................ + // sub v30.4S, v20.4S, v16.4S // ......................................................*................................................................................. + // sqrdmulh v15.4S, v27.4S, v10.4S // ........................................................*............................................................................... + // mul v21.4S, v27.4S, v24.4S // ............................................................*........................................................................... + // mul v0.4S, v30.4S, v24.4S // .............................................................*.......................................................................... + // sqrdmulh v7.4S, v30.4S, v10.4S // .........................................................*.............................................................................. + // add v31.4S, v20.4S, v16.4S // ...............................................................*........................................................................ + // mls v21.4S, v15.4S, v8.S[0] // ................................................................*....................................................................... + // mls v0.4S, v7.4S, v8.S[0] // .................................................................*...................................................................... 
+ // trn1 v27.4S, v23.4S, v31.4S // ..................................................................*..................................................................... + // trn2 v23.4S, v23.4S, v31.4S // ....................................................................*................................................................... + // trn2 v18.4S, v21.4S, v0.4S // .....................................................................*.................................................................. + // trn1 v11.4S, v21.4S, v0.4S // ........................................................................*............................................................... + // trn2 v24.2D, v23.2D, v18.2D // .........................................................................*.............................................................. + // trn1 v31.2D, v23.2D, v18.2D // ..........................................................................*............................................................. + // trn1 v21.2D, v27.2D, v11.2D // ...........................................................................*............................................................ + // ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x2] // ...*.................................................................................................................................... + // ldr q23, [x5, #-48] // .*...................................................................................................................................... + // sub v13.4S, v15.4S, v16.4S // .....*.................................................................................................................................. + // ldr q10, [x5, #-32] // ..*..................................................................................................................................... 
+ // sqrdmulh v0.4S, v13.4S, v23.4S // ........*............................................................................................................................... + // ldr q25, [x5, #-64] // ....*................................................................................................................................... + // add v9.4S, v21.4S, v31.4S // .............................................................................*.......................................................... + // sub v30.4S, v17.4S, v18.4S // ......*................................................................................................................................. + // mul v20.4S, v13.4S, v25.4S // ..........*............................................................................................................................. + // ldr q13, [x5, #-16] // .......*................................................................................................................................ + // trn2 v7.2D, v27.2D, v11.2D // ............................................................................*........................................................... + // mul v28.4S, v30.4S, v10.4S // .........*.............................................................................................................................. + // sqrdmulh v1.4S, v30.4S, v13.4S // ...........*............................................................................................................................ + // sub v23.4S, v21.4S, v31.4S // ..............................................................................*......................................................... + // add v16.4S, v15.4S, v16.4S // ............*........................................................................................................................... 
+ // mls v20.4S, v0.4S, v8.S[0] // ..............*......................................................................................................................... + // mls v28.4S, v1.4S, v8.S[0] // ...............*........................................................................................................................ + // add v15.4S, v17.4S, v18.4S // .............*.......................................................................................................................... + // ldr q6, [x4, #16] // ................................*....................................................................................................... + // add v14.4S, v20.4S, v28.4S // ......................*................................................................................................................. + // sub v4.4S, v16.4S, v15.4S // ................*....................................................................................................................... + // sub v25.4S, v20.4S, v28.4S // ..................*..................................................................................................................... + // mul v27.4S, v23.4S, v6.S[2] // .................................................................................*...................................................... + // sqrdmulh v31.4S, v4.4S, v19.4S // ....................*................................................................................................................... + // sqrdmulh v13.4S, v25.4S, v19.4S // .....................*.................................................................................................................. + // mul v11.4S, v4.4S, v26.4S // .................................*...................................................................................................... 
+ // mul v4.4S, v25.4S, v26.4S // ..................................*..................................................................................................... + // sqrdmulh v22.4S, v23.4S, v6.S[3] // ..................................................................................*..................................................... + // add v21.4S, v16.4S, v15.4S // ...................*.................................................................................................................... + // mls v11.4S, v31.4S, v8.S[0] // ....................................*................................................................................................... + // mls v4.4S, v13.4S, v8.S[0] // .....................................*.................................................................................................. + // mls v27.4S, v22.4S, v8.S[0] // ........................................................................................*............................................... + // trn2 v19.4S, v21.4S, v14.4S // ........................*............................................................................................................... + // trn1 v16.4S, v21.4S, v14.4S // .........................*.............................................................................................................. + // trn1 v28.4S, v11.4S, v4.4S // .......................................................*................................................................................ + // trn2 v4.4S, v11.4S, v4.4S // ................................................*....................................................................................... + // add v3.4S, v7.4S, v24.4S // ...............................................................................*........................................................ 
+ // trn2 v29.2D, v16.2D, v28.2D // ..........................................................*............................................................................. + // trn2 v25.2D, v19.2D, v4.2D // ...................................................*.................................................................................... + // trn1 v13.2D, v16.2D, v28.2D // ...........................................................*............................................................................ + // sub v26.4S, v29.4S, v25.4S // ...................................................................*.................................................................... + // trn1 v14.2D, v19.2D, v4.2D // ....................................................*................................................................................... + // sub v0.4S, v9.4S, v3.4S // .........................................................................................*.............................................. + // sqrdmulh v20.4S, v26.4S, v2.S[1] // ......................................................................*................................................................. + // mul v22.4S, v26.4S, v2.S[0] // .......................................................................*................................................................ + // sub v5.4S, v13.4S, v14.4S // ................................................................................*....................................................... + // ldr q1, [x4, #32] // ...................................*.................................................................................................... + // sub v17.4S, v7.4S, v24.4S // ...................................................................................*.................................................... 
+ // add v30.4S, v9.4S, v3.4S // ..........................................................................................*............................................. + // sqrdmulh v23.4S, v5.4S, v1.S[3] // ....................................................................................*................................................... + // mul v11.4S, v17.4S, v1.S[0] // ......................................................................................*................................................. + // sqrdmulh v21.4S, v17.4S, v1.S[1] // .......................................................................................*................................................ + // mul v16.4S, v5.4S, v1.S[2] // .....................................................................................*.................................................. + // srshr v1.4S, v30.4S, #23 // .............................................................................................*.......................................... + // mls v22.4S, v20.4S, v8.S[0] // ...........................................................................................*............................................ + // mls v11.4S, v21.4S, v8.S[0] // ............................................................................................*........................................... + // mls v16.4S, v23.4S, v8.S[0] // ..............................................................................................*......................................... + // add v17.4S, v29.4S, v25.4S // ...............................................................................................*........................................ + // add v21.4S, v13.4S, v14.4S // ................................................................................................*....................................... 
+ // add v13.4S, v27.4S, v11.4S // .................................................................................................*...................................... + // sub v4.4S, v16.4S, v22.4S // ..................................................................................................*..................................... + // add v24.4S, v16.4S, v22.4S // ...................................................................................................*.................................... + // mls v30.4S, v1.4S, v8.4S // ....................................................................................................*................................... + // sqrdmulh v28.4S, v4.4S, v6.S[1] // .....................................................................................................*.................................. + // mul v15.4S, v4.4S, v6.S[0] // ......................................................................................................*................................. + // srshr v12.4S, v13.4S, #23 // .......................................................................................................*................................ + // ldr q4, [x4], #64 // .......................................*................................................................................................ + // sub v20.4S, v27.4S, v11.4S // ........................................................................................................*............................... + // srshr v11.4S, v24.4S, #23 // .........................................................................................................*.............................. + // sqrdmulh v23.4S, v0.4S, v4.S[3] // ..........................................................................................................*............................. 
+ // sqrdmulh v29.4S, v20.4S, v4.S[3] // ...........................................................................................................*............................ + // mul v1.4S, v20.4S, v4.S[2] // ............................................................................................................*........................... + // add v20.4S, v21.4S, v17.4S // .............................................................................................................*.......................... + // mls v13.4S, v12.4S, v8.4S // ..............................................................................................................*......................... + // mls v24.4S, v11.4S, v8.4S // ...............................................................................................................*........................ + // mls v15.4S, v28.4S, v8.S[0] // ................................................................................................................*....................... + // mls v1.4S, v29.4S, v8.S[0] // .................................................................................................................*...................... + // srshr v12.4S, v20.4S, #23 // ..................................................................................................................*..................... + // sub v27.4S, v13.4S, v24.4S // ...................................................................................................................*.................... + // add v18.4S, v13.4S, v24.4S // ....................................................................................................................*................... + // mls v20.4S, v12.4S, v8.4S // .....................................................................................................................*.................. 
+ // sqrdmulh v5.4S, v27.4S, v4.S[1] // ......................................................................................................................*................. + // mul v9.4S, v27.4S, v4.S[0] // .......................................................................................................................*................ + // mul v27.4S, v0.4S, v4.S[2] // ........................................................................................................................*............... + // sub v11.4S, v30.4S, v20.4S // .........................................................................................................................*.............. + // add v2.4S, v30.4S, v20.4S // ..........................................................................................................................*............. + // mls v9.4S, v5.4S, v8.S[0] // ...........................................................................................................................*............ + // mls v27.4S, v23.4S, v8.S[0] // ............................................................................................................................*........... + // sqrdmulh v23.4S, v11.4S, v4.S[1] // .............................................................................................................................*.......... + // mul v16.4S, v11.4S, v4.S[0] // ..............................................................................................................................*......... + // sub v25.4S, v1.4S, v15.4S // ...............................................................................................................................*........ + // add v31.4S, v1.4S, v15.4S // ................................................................................................................................*....... 
+ // str q9, [x2, #16] // .................................................................................................................................*...... + // mls v16.4S, v23.4S, v8.S[0] // ..................................................................................................................................*..... + // str q18, [x1, #16] // ...................................................................................................................................*.... + // sqrdmulh v12.4S, v25.4S, v4.S[1] // ....................................................................................................................................*... + // str q2, [x1], #(16*4) // .....................................................................................................................................*.. + // mul v23.4S, v25.4S, v4.S[0] // ......................................................................................................................................*. + // str q16, [x2], #(16*4) // .......................................................................................................................................* sub count, count, #1 layer45678_start: - sub v6.4S, v27.4S, v4.4S // ...............................................................................................................................*........................ - add x1, x1, #64 // ......................................................................................................................................................*. - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // e....................................................................................................................................................... 
+ // Instructions: 152 + // Expected cycles: 182 + // Expected IPC: 0.84 + // + // Wall time: 1339.03s + // User time: 1339.03s + // + // ------------------------------------------------------------------ original position ------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + ldr q19, [x5, #112] // .............................e.......................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + ldr q10, [x5, #16] // ...e.................................................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + ldr q11, [x5, #48] // .....e.................................................................................................................................................. 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sub v26.4S, v21.4S, v17.4S // ........................................................................................................*............................................... // gap // ........................................................................................................................................................ + ldr q24, [x5], #(12*16) // ..e..................................................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mul v16.4S, v26.4S, v6.S[0] // ...........................................................................................................*............................................ // gap // ........................................................................................................................................................ 
+ sqrdmulh v0.4S, v26.4S, v6.S[1] // ..........................................................................................................*............................................. // gap // ........................................................................................................................................................ + ldr q17, [x5, #-112] // .......e................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - ldr q2, [x5], #(12*16) // ..e..................................................................................................................................................... // gap // ........................................................................................................................................................ + mls v23.4S, v12.4S, v8.S[0] // .............................................................................................................................................*.......... // gap // ........................................................................................................................................................ + mls v16.4S, v0.4S, v8.S[0] // ............................................................................................................*........................................... // gap // ........................................................................................................................................................ 
- add v0.4S, v18.4S, v19.4S // ..............e......................................................................................................................................... + ldr q5, [x5, #-160] // ....e................................................................................................................................................... // gap // ........................................................................................................................................................ - add v21.4S, v16.4S, v17.4S // .........e.............................................................................................................................................. // gap // ........................................................................................................................................................ - ldr q27, [x5, #-176] // ...e.................................................................................................................................................... // gap // ........................................................................................................................................................ + str q23, [x2, #-16] // .....................................................................................................................................................*.. // gap // ........................................................................................................................................................ + add v14.4S, v27.4S, v16.4S // .....................................................................................................................................*.................. // gap // ........................................................................................................................................................ 
- sub v24.4S, v21.4S, v0.4S // ..................e..................................................................................................................................... + str q31, [x1, #-16] // .................................................................................................................................................*...... // gap // ........................................................................................................................................................ - ldr q25, [x5, #-128] // ......e................................................................................................................................................. + sub v9.4S, v27.4S, v16.4S // ....................................................................................................................................*................... // gap // ........................................................................................................................................................ + str q14, [x1, #-32] // ................................................................................................................................................*....... + add x1, x1, #64 // ......................................................................................................................................................*. + ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x1] // e....................................................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- mul v28.4S, v24.4S, v2.4S // ....................e................................................................................................................................... // gap // ........................................................................................................................................................ - sqrdmulh v23.4S, v24.4S, v27.4S // .....................e.................................................................................................................................. // gap // ........................................................................................................................................................ - sub v12.4S, v18.4S, v19.4S // .............e.......................................................................................................................................... // gap // ........................................................................................................................................................ - ldr q3, [x5, #-160] // ....e................................................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mls v28.4S, v23.4S, v8.S[0] // ......................e................................................................................................................................. 
// gap // ........................................................................................................................................................ - mul v1.4S, v12.4S, v25.4S // ...............e........................................................................................................................................ // gap // ........................................................................................................................................................ - sub v13.4S, v16.4S, v17.4S // ........e............................................................................................................................................... // gap // ........................................................................................................................................................ - ldr q22, [x5, #-144] // .....e.................................................................................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - ldr q5, [x5, #-112] // .......e................................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v20.4S, v13.4S, v22.4S // ...........e............................................................................................................................................ + ldr q31, [x5, #-128] // ......e................................................................................................................................................. // gap // ........................................................................................................................................................ - mul v11.4S, v13.4S, v3.4S // ..........e............................................................................................................................................. // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v12.4S, v5.4S // ................e....................................................................................................................................... // gap // ........................................................................................................................................................ - sqrdmulh v12.4S, v6.4S, v29.S[1] // ..................................................................................................................................*..................... + sub v6.4S, v0.4S, v1.4S // ........e............................................................................................................................................... 
// gap // ........................................................................................................................................................ - mul v6.4S, v6.4S, v29.S[0] // .................................................................................................................................*...................... + add v25.4S, v0.4S, v1.4S // .........e.............................................................................................................................................. // gap // ........................................................................................................................................................ - mls v11.4S, v20.4S, v8.S[0] // ............e........................................................................................................................................... + add v13.4S, v2.4S, v3.4S // ..............e......................................................................................................................................... // gap // ........................................................................................................................................................ - mls v1.4S, v30.4S, v8.S[0] // .................e...................................................................................................................................... + mul v20.4S, v6.4S, v5.4S // ...........e............................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v9.4S, v10.4S, v29.S[1] // .......................................................................................................................................*................ 
+ sub v2.4S, v2.4S, v3.4S // .............e.......................................................................................................................................... // gap // ........................................................................................................................................................ - mls v6.4S, v12.4S, v8.S[0] // ...................................................................................................................................*.................... + add v23.4S, v25.4S, v13.4S // ...................e.................................................................................................................................... // gap // ........................................................................................................................................................ - mul v13.4S, v10.4S, v29.S[0] // ......................................................................................................................................*................. + sqrdmulh v21.4S, v6.4S, v11.4S // ..........e............................................................................................................................................. // gap // ........................................................................................................................................................ - sub v31.4S, v11.4S, v1.4S // .......................e................................................................................................................................ + sqrdmulh v15.4S, v2.4S, v17.4S // ...............e........................................................................................................................................ // gap // ........................................................................................................................................................ 
- ldr q12, [x5, #-64] // ..............................e......................................................................................................................... + mul v16.4S, v2.4S, v31.4S // ................e....................................................................................................................................... // gap // ........................................................................................................................................................ + ldr q2, [x4, #48] // .........................................................................e.............................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v24.4S, v31.4S, v27.4S // ..........................e............................................................................................................................. // gap // ........................................................................................................................................................ - mls v13.4S, v9.4S, v8.S[0] // ........................................................................................................................................*............... + mls v20.4S, v21.4S, v8.S[0] // ............e........................................................................................................................................... // gap // ........................................................................................................................................................ 
- str q6, [x2, #-48] // ...................................................................................................................................................*.... + mls v16.4S, v15.4S, v8.S[0] // .................e...................................................................................................................................... // gap // ........................................................................................................................................................ - ldr q4, [x5, #-48] // ...............................e........................................................................................................................ + sub v27.4S, v25.4S, v13.4S // ..................e..................................................................................................................................... // gap // ........................................................................................................................................................ + ldr q26, [x5, #-96] // ............................e........................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - str q13, [x2, #-32] // ....................................................................................................................................................*... 
- add x2, x2, #64 // .......................................................................................................................................................* - ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .e...................................................................................................................................................... // gap // ........................................................................................................................................................ + sub v30.4S, v20.4S, v16.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v15.4S, v27.4S, v10.4S // ....................e................................................................................................................................... // gap // ........................................................................................................................................................ + mul v21.4S, v27.4S, v24.4S // .....................e.................................................................................................................................. // gap // ........................................................................................................................................................ + mul v0.4S, v30.4S, v24.4S // ..........................e............................................................................................................................. // gap // ........................................................................................................................................................ 
+ sqrdmulh v7.4S, v30.4S, v10.4S // .........................e.............................................................................................................................. // gap // ........................................................................................................................................................ + add v31.4S, v20.4S, v16.4S // ........................e............................................................................................................................... // gap // ........................................................................................................................................................ + mls v21.4S, v15.4S, v8.S[0] // ......................e................................................................................................................................. // gap // ........................................................................................................................................................ + mul v30.4S, v9.4S, v4.S[0] // .......................................................................................................................................*................ // gap // ........................................................................................................................................................ + mls v0.4S, v7.4S, v8.S[0] // ...........................e............................................................................................................................ // gap // ........................................................................................................................................................ + trn1 v27.4S, v23.4S, v31.4S // ......................................................e................................................................................................. 
// gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v9.4S, v4.S[1] // ......................................................................................................................................*................. // gap // ........................................................................................................................................................ + trn2 v23.4S, v23.4S, v31.4S // .......................................................e................................................................................................ // gap // ........................................................................................................................................................ + trn2 v18.4S, v21.4S, v0.4S // .........................................................e.............................................................................................. // gap // ........................................................................................................................................................ + trn1 v11.4S, v21.4S, v0.4S // ........................................................e............................................................................................... // gap // ........................................................................................................................................................ + mls v30.4S, v1.4S, v8.S[0] // ........................................................................................................................................*............... // gap // ........................................................................................................................................................ 
+ trn2 v24.2D, v23.2D, v18.2D // ...........................................................e............................................................................................ // gap // ........................................................................................................................................................ + trn1 v31.2D, v23.2D, v18.2D // .............................................................e.......................................................................................... // gap // ........................................................................................................................................................ - ldr q17, [x5, #-32] // ................................e....................................................................................................................... + trn1 v21.2D, v27.2D, v11.2D // ............................................................e........................................................................................... // gap // ........................................................................................................................................................ + str q30, [x2, #-32] // ....................................................................................................................................................*... + add x2, x2, #64 // .......................................................................................................................................................* + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x2] // .e...................................................................................................................................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - sub v20.4S, v15.4S, v16.4S // .......................................e................................................................................................................ // gap // ........................................................................................................................................................ - ldr q5, [x5, #-16] // .................................e...................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v30.4S, v13.4S, v14.4S // ..................................e..................................................................................................................... // gap // ........................................................................................................................................................ - mul v3.4S, v20.4S, v17.4S // .........................................e.............................................................................................................. // gap // ........................................................................................................................................................ 
- sqrdmulh v27.4S, v20.4S, v5.4S // ..........................................e............................................................................................................. // gap // ........................................................................................................................................................ - sqrdmulh v29.4S, v30.4S, v4.4S // .....................................e.................................................................................................................. // gap // ........................................................................................................................................................ - mul v12.4S, v30.4S, v12.4S // ....................................e................................................................................................................... // gap // ........................................................................................................................................................ - add v14.4S, v13.4S, v14.4S // ...................................e.................................................................................................................... // gap // ........................................................................................................................................................ - add v6.4S, v15.4S, v16.4S // ........................................e............................................................................................................... // gap // ........................................................................................................................................................ - mls v3.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ 
// gap // ........................................................................................................................................................ - mls v12.4S, v29.4S, v8.S[0] // ......................................e................................................................................................................. // gap // ........................................................................................................................................................ - sub v20.4S, v14.4S, v6.4S // ............................................e........................................................................................................... // gap // ........................................................................................................................................................ - ldr q26, [x5, #-80] // .............................e.......................................................................................................................... // gap // ........................................................................................................................................................ + ldr q23, [x5, #-48] // ...............................e........................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v13.4S, v12.4S, v3.4S // .................................................e...................................................................................................... 
// gap // ........................................................................................................................................................ - add v22.4S, v11.4S, v1.4S // ........................e............................................................................................................................... + sub v13.4S, v15.4S, v16.4S // ..................................e..................................................................................................................... // gap // ........................................................................................................................................................ - sqrdmulh v4.4S, v20.4S, v26.4S // ...............................................e........................................................................................................ + ldr q10, [x5, #-32] // ................................e....................................................................................................................... // gap // ........................................................................................................................................................ - sqrdmulh v7.4S, v13.4S, v26.4S // ....................................................e................................................................................................... // gap // ........................................................................................................................................................ - ldr q19, [x5, #-96] // ............................e........................................................................................................................... // gap // ........................................................................................................................................................ 
+ sqrdmulh v0.4S, v13.4S, v23.4S // ....................................e................................................................................................................... // gap // ........................................................................................................................................................ + ldr q25, [x5, #-64] // ..............................e......................................................................................................................... // gap // ........................................................................................................................................................ - add v25.4S, v14.4S, v6.4S // .............................................e.......................................................................................................... // gap // ........................................................................................................................................................ - add v1.4S, v12.4S, v3.4S // ..................................................e..................................................................................................... // gap // ........................................................................................................................................................ - mul v5.4S, v13.4S, v19.4S // ...................................................e.................................................................................................... + add v9.4S, v21.4S, v31.4S // ...........................................................................e............................................................................ // gap // ........................................................................................................................................................ 
- mul v29.4S, v20.4S, v19.4S // ..............................................e......................................................................................................... + sub v30.4S, v17.4S, v18.4S // .......................................e................................................................................................................ // gap // ........................................................................................................................................................ - trn2 v18.4S, v25.4S, v1.4S // ...............................................................e........................................................................................ + mul v20.4S, v13.4S, v25.4S // .....................................e.................................................................................................................. // gap // ........................................................................................................................................................ - trn1 v12.4S, v25.4S, v1.4S // ..............................................................e......................................................................................... + ldr q13, [x5, #-16] // .................................e...................................................................................................................... // gap // ........................................................................................................................................................ - mul v30.4S, v31.4S, v2.4S // .........................e.............................................................................................................................. // gap // ........................................................................................................................................................ 
- mls v29.4S, v4.4S, v8.S[0] // ................................................e....................................................................................................... // gap // ........................................................................................................................................................ - ldr q10, [x4, #16] // .......................................................................e................................................................................ + trn2 v7.2D, v27.2D, v11.2D // ..........................................................e............................................................................................. // gap // ........................................................................................................................................................ + mul v28.4S, v30.4S, v10.4S // ..........................................e............................................................................................................. // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v30.4S, v13.4S // .........................................e.............................................................................................................. // gap // ........................................................................................................................................................ - ldr q15, [x4, #32] // ........................................................................e............................................................................... + sub v23.4S, v21.4S, v31.4S // ..........................................................................e............................................................................. 
// gap // ........................................................................................................................................................ + add v16.4S, v15.4S, v16.4S // ...................................e.................................................................................................................... // gap // ........................................................................................................................................................ + mls v20.4S, v0.4S, v8.S[0] // ......................................e................................................................................................................. // gap // ........................................................................................................................................................ - add v27.4S, v21.4S, v0.4S // ...................e.................................................................................................................................... + mls v28.4S, v1.4S, v8.S[0] // ...........................................e............................................................................................................ // gap // ........................................................................................................................................................ - mls v30.4S, v24.4S, v8.S[0] // ...........................e............................................................................................................................ + add v15.4S, v17.4S, v18.4S // ........................................e............................................................................................................... // gap // ........................................................................................................................................................ 
- mls v5.4S, v7.4S, v8.S[0] // .....................................................e.................................................................................................. + ldr q6, [x4, #16] // .......................................................................e................................................................................ // gap // ........................................................................................................................................................ - trn1 v17.4S, v27.4S, v22.4S // ......................................................e................................................................................................. // gap // ........................................................................................................................................................ - trn2 v2.4S, v27.4S, v22.4S // .......................................................e................................................................................................ // gap // ........................................................................................................................................................ - trn1 v11.4S, v28.4S, v30.4S // ........................................................e............................................................................................... + add v14.4S, v20.4S, v28.4S // ..................................................e..................................................................................................... // gap // ........................................................................................................................................................ - trn2 v26.4S, v28.4S, v30.4S // .........................................................e.............................................................................................. 
+ sub v4.4S, v16.4S, v15.4S // ............................................e........................................................................................................... // gap // ........................................................................................................................................................ - trn1 v4.4S, v29.4S, v5.4S // ................................................................e....................................................................................... + sub v25.4S, v20.4S, v28.4S // .................................................e...................................................................................................... // gap // ........................................................................................................................................................ - trn2 v7.2D, v17.2D, v11.2D // ..........................................................e............................................................................................. + mul v27.4S, v23.4S, v6.S[2] // .............................................................................e.......................................................................... // gap // ........................................................................................................................................................ - trn2 v3.2D, v2.2D, v26.2D // ...........................................................e............................................................................................ + sqrdmulh v31.4S, v4.4S, v19.4S // ..............................................e......................................................................................................... // gap // ........................................................................................................................................................ 
- trn2 v0.4S, v29.4S, v5.4S // .................................................................e...................................................................................... + sqrdmulh v13.4S, v25.4S, v19.4S // ...................................................e.................................................................................................... // gap // ........................................................................................................................................................ - sub v20.4S, v7.4S, v3.4S // ...............................................................................e........................................................................ + mul v11.4S, v4.4S, v26.4S // ...............................................e........................................................................................................ // gap // ........................................................................................................................................................ - trn2 v14.2D, v12.2D, v4.2D // ..................................................................e..................................................................................... + mul v4.4S, v25.4S, v26.4S // ....................................................e................................................................................................... // gap // ........................................................................................................................................................ - trn2 v27.2D, v18.2D, v0.2D // ...................................................................e.................................................................................... + sqrdmulh v22.4S, v23.4S, v6.S[3] // ............................................................................e........................................................................... 
// gap // ........................................................................................................................................................ - mul v31.4S, v20.4S, v15.S[0] // .................................................................................e...................................................................... + add v21.4S, v16.4S, v15.4S // .............................................e.......................................................................................................... // gap // ........................................................................................................................................................ - sub v1.4S, v14.4S, v27.4S // .........................................................................................e.............................................................. + mls v11.4S, v31.4S, v8.S[0] // ................................................e....................................................................................................... // gap // ........................................................................................................................................................ - ldr q23, [x4, #48] // .........................................................................e.............................................................................. + mls v4.4S, v13.4S, v8.S[0] // .....................................................e.................................................................................................. // gap // ........................................................................................................................................................ + mls v27.4S, v22.4S, v8.S[0] // ..............................................................................e......................................................................... 
// gap // ........................................................................................................................................................ + trn2 v19.4S, v21.4S, v14.4S // ...............................................................e........................................................................................ // gap // ........................................................................................................................................................ - trn1 v19.2D, v18.2D, v0.2D // .....................................................................e.................................................................................. + trn1 v16.4S, v21.4S, v14.4S // ..............................................................e......................................................................................... // gap // ........................................................................................................................................................ - trn1 v25.2D, v12.2D, v4.2D // ....................................................................e................................................................................... + trn1 v28.4S, v11.4S, v4.4S // ................................................................e....................................................................................... // gap // ........................................................................................................................................................ - mul v29.4S, v1.4S, v23.S[0] // ...........................................................................................e............................................................ + trn2 v4.4S, v11.4S, v4.4S // .................................................................e...................................................................................... 
// gap // ........................................................................................................................................................ - sub v5.4S, v25.4S, v19.4S // ....................................................................................e................................................................... + add v3.4S, v7.4S, v24.4S // ................................................................................e....................................................................... // gap // ........................................................................................................................................................ - trn1 v6.2D, v17.2D, v11.2D // ............................................................e........................................................................................... + trn2 v29.2D, v16.2D, v28.2D // ..................................................................e..................................................................................... // gap // ........................................................................................................................................................ - trn1 v9.2D, v2.2D, v26.2D // .............................................................e.......................................................................................... + trn2 v25.2D, v19.2D, v4.2D // ...................................................................e.................................................................................... // gap // ........................................................................................................................................................ - mul v24.4S, v5.4S, v15.S[2] // ......................................................................................e................................................................. 
+ trn1 v13.2D, v16.2D, v28.2D // ....................................................................e................................................................................... // gap // ........................................................................................................................................................ - sqrdmulh v11.4S, v5.4S, v15.S[3] // .......................................................................................e................................................................ + sub v26.4S, v29.4S, v25.4S // .........................................................................................e.............................................................. // gap // ........................................................................................................................................................ - sub v16.4S, v6.4S, v9.4S // ..........................................................................e............................................................................. + trn1 v14.2D, v19.2D, v4.2D // .....................................................................e.................................................................................. // gap // ........................................................................................................................................................ - sqrdmulh v13.4S, v20.4S, v15.S[1] // ..................................................................................e..................................................................... + sub v0.4S, v9.4S, v3.4S // ..............................................................................................e......................................................... // gap // ........................................................................................................................................................ 
- sqrdmulh v30.4S, v1.4S, v23.S[1] // ............................................................................................e........................................................... + sqrdmulh v20.4S, v26.4S, v2.S[1] // ...........................................................................................e............................................................ // gap // ........................................................................................................................................................ - mul v22.4S, v16.4S, v10.S[2] // ............................................................................e........................................................................... + mul v22.4S, v26.4S, v2.S[0] // ............................................................................................e........................................................... // gap // ........................................................................................................................................................ - sqrdmulh v17.4S, v16.4S, v10.S[3] // .............................................................................e.......................................................................... + sub v5.4S, v13.4S, v14.4S // ....................................................................................e................................................................... // gap // ........................................................................................................................................................ - mls v31.4S, v13.4S, v8.S[0] // ...................................................................................e.................................................................... + ldr q1, [x4, #32] // ........................................................................e............................................................................... 
// gap // ........................................................................................................................................................ - mls v24.4S, v11.4S, v8.S[0] // ........................................................................................e............................................................... // gap // ........................................................................................................................................................ - mls v29.4S, v30.4S, v8.S[0] // .............................................................................................e.......................................................... // gap // ........................................................................................................................................................ - mls v22.4S, v17.4S, v8.S[0] // ..............................................................................e......................................................................... + sub v17.4S, v7.4S, v24.4S // ...............................................................................e........................................................................ // gap // ........................................................................................................................................................ - add v17.4S, v25.4S, v19.4S // .....................................................................................e.................................................................. + add v30.4S, v9.4S, v3.4S // ...............................................................................................e........................................................ // gap // ........................................................................................................................................................ 
- add v18.4S, v14.4S, v27.4S // ..........................................................................................e............................................................. + sqrdmulh v23.4S, v5.4S, v1.S[3] // ......................................................................................e................................................................. // gap // ........................................................................................................................................................ - add v4.4S, v24.4S, v29.4S // ..............................................................................................................e......................................... + mul v11.4S, v17.4S, v1.S[0] // ..................................................................................e..................................................................... // gap // ........................................................................................................................................................ - add v27.4S, v22.4S, v31.4S // ....................................................................................................e................................................... + sqrdmulh v21.4S, v17.4S, v1.S[1] // .................................................................................e...................................................................... // gap // ........................................................................................................................................................ - sub v5.4S, v17.4S, v18.4S // ........................................................................................................e............................................... + mul v16.4S, v5.4S, v1.S[2] // .......................................................................................e................................................................ 
// gap // ........................................................................................................................................................ - srshr v25.4S, v4.4S, #23 // ........................................................................................................................e............................... + srshr v1.4S, v30.4S, #23 // ..................................................................................................................e..................................... // gap // ........................................................................................................................................................ - srshr v26.4S, v27.4S, #23 // ....................................................................................................................e................................... + mls v22.4S, v20.4S, v8.S[0] // .............................................................................................e.......................................................... // gap // ........................................................................................................................................................ - add v21.4S, v17.4S, v18.4S // .........................................................................................................e.............................................. + mls v11.4S, v21.4S, v8.S[0] // ...................................................................................e.................................................................... // gap // ........................................................................................................................................................ - mls v4.4S, v25.4S, v8.4S // .........................................................................................................................e.............................. 
+ mls v16.4S, v23.4S, v8.S[0] // ........................................................................................e............................................................... // gap // ........................................................................................................................................................ - mls v27.4S, v26.4S, v8.4S // .....................................................................................................................e.................................. + add v17.4S, v29.4S, v25.4S // ..........................................................................................e............................................................. // gap // ........................................................................................................................................................ - add v28.4S, v6.4S, v9.4S // ...........................................................................e............................................................................ + add v21.4S, v13.4S, v14.4S // .....................................................................................e.................................................................. // gap // ........................................................................................................................................................ - add v23.4S, v7.4S, v3.4S // ................................................................................e....................................................................... + add v13.4S, v27.4S, v11.4S // ....................................................................................................e................................................... // gap // ........................................................................................................................................................ 
- sub v13.4S, v24.4S, v29.4S // .............................................................................................................e.......................................... + sub v4.4S, v16.4S, v22.4S // .............................................................................................................e.......................................... // gap // ........................................................................................................................................................ - add v18.4S, v27.4S, v4.4S // ................................................................................................................................e....................... + add v24.4S, v16.4S, v22.4S // ..............................................................................................................e......................................... // gap // ........................................................................................................................................................ - add v2.4S, v28.4S, v23.4S // ...............................................................................................e........................................................ + mls v30.4S, v1.4S, v8.4S // ...................................................................................................................e.................................... // gap // ........................................................................................................................................................ - srshr v26.4S, v21.4S, #23 // ......................................................................................................................e................................. + sqrdmulh v28.4S, v4.4S, v6.S[1] // ...............................................................................................................e........................................ 
// gap // ........................................................................................................................................................ - str q18, [x1, #16] // ...............................................................................................................................................e........ + mul v15.4S, v4.4S, v6.S[0] // ................................................................................................................e....................................... // gap // ........................................................................................................................................................ - srshr v12.4S, v2.4S, #23 // ..................................................................................................................e..................................... + srshr v12.4S, v13.4S, #23 // ....................................................................................................................e................................... // gap // ........................................................................................................................................................ - mul v19.4S, v13.4S, v10.S[0] // ...............................................................................................................e........................................ + ldr q4, [x4], #64 // ......................................................................e................................................................................. // gap // ........................................................................................................................................................ - mls v21.4S, v26.4S, v8.4S // .......................................................................................................................e................................ 
// gap // ........................................................................................................................................................ - mls v2.4S, v12.4S, v8.4S // ...................................................................................................................e.................................... // gap // ........................................................................................................................................................ - sub v6.4S, v22.4S, v31.4S // ...................................................................................................e.................................................... + sub v20.4S, v27.4S, v11.4S // ...................................................................................................e.................................................... // gap // ........................................................................................................................................................ - ldr q29, [x4], #64 // ......................................................................e................................................................................. + srshr v11.4S, v24.4S, #23 // ........................................................................................................................e............................... // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v0.4S, v4.S[3] // ................................................................................................e....................................................... // gap // ........................................................................................................................................................ 
+ sqrdmulh v29.4S, v20.4S, v4.S[3] // .....................................................................................................e.................................................. // gap // ........................................................................................................................................................ - add v14.4S, v2.4S, v21.4S // ...........................................................................................................................e............................ + mul v1.4S, v20.4S, v4.S[2] // ......................................................................................................e................................................. // gap // ........................................................................................................................................................ - sqrdmulh v11.4S, v13.4S, v10.S[1] // ................................................................................................................e....................................... + add v20.4S, v21.4S, v17.4S // .........................................................................................................e.............................................. // gap // ........................................................................................................................................................ - mul v9.4S, v6.4S, v29.S[2] // .....................................................................................................e.................................................. + mls v13.4S, v12.4S, v8.4S // .....................................................................................................................e.................................. // gap // ........................................................................................................................................................ 
- str q14, [x1], #(16*4) // ..............................................................................................................................................e......... + mls v24.4S, v11.4S, v8.4S // .........................................................................................................................e.............................. // gap // ........................................................................................................................................................ - sqrdmulh v1.4S, v6.4S, v29.S[3] // ......................................................................................................e................................................. + mls v15.4S, v28.4S, v8.S[0] // .................................................................................................................e...................................... // gap // ........................................................................................................................................................ - mls v19.4S, v11.4S, v8.S[0] // .................................................................................................................e...................................... + mls v1.4S, v29.4S, v8.S[0] // .......................................................................................................e................................................ // gap // ........................................................................................................................................................ - sub v7.4S, v2.4S, v21.4S // ..........................................................................................................................e............................. + srshr v12.4S, v20.4S, #23 // ......................................................................................................................e................................. 
// gap // ........................................................................................................................................................ - sub v14.4S, v28.4S, v23.4S // ..............................................................................................e......................................................... + sub v27.4S, v13.4S, v24.4S // ...............................................................................................................................e........................ // gap // ........................................................................................................................................................ - mls v9.4S, v1.4S, v8.S[0] // .......................................................................................................e................................................ + add v18.4S, v13.4S, v24.4S // ................................................................................................................................e....................... // gap // ........................................................................................................................................................ - sqrdmulh v1.4S, v7.4S, v29.S[1] // .............................................................................................................................e.......................... + mls v20.4S, v12.4S, v8.4S // .......................................................................................................................e................................ // gap // ........................................................................................................................................................ - mul v23.4S, v7.4S, v29.S[0] // ............................................................................................................................e........................... 
+ sqrdmulh v5.4S, v27.4S, v4.S[1] // .................................................................................................................................e...................... // gap // ........................................................................................................................................................ - sqrdmulh v0.4S, v5.4S, v10.S[1] // ...........................................................................................................e............................................ + mul v9.4S, v27.4S, v4.S[0] // ..................................................................................................................................e..................... // gap // ........................................................................................................................................................ - sub v15.4S, v9.4S, v19.4S // .........................................................................................................................................e.............. + mul v27.4S, v0.4S, v4.S[2] // .................................................................................................e...................................................... // gap // ........................................................................................................................................................ - mul v12.4S, v14.4S, v29.S[2] // ................................................................................................e....................................................... + sub v11.4S, v30.4S, v20.4S // ..........................................................................................................................e............................. // gap // ........................................................................................................................................................ 
- sqrdmulh v17.4S, v14.4S, v29.S[3] // .................................................................................................e...................................................... + add v2.4S, v30.4S, v20.4S // ...........................................................................................................................e............................ // gap // ........................................................................................................................................................ - mul v30.4S, v15.4S, v29.S[0] // ...........................................................................................................................................e............ + mls v9.4S, v5.4S, v8.S[0] // ...................................................................................................................................e.................... // gap // ........................................................................................................................................................ - sqrdmulh v31.4S, v15.4S, v29.S[1] // ............................................................................................................................................e........... + mls v27.4S, v23.4S, v8.S[0] // ..................................................................................................e..................................................... // gap // ........................................................................................................................................................ - mul v16.4S, v5.4S, v10.S[0] // ..........................................................................................................e............................................. + sqrdmulh v23.4S, v11.4S, v4.S[1] // ............................................................................................................................e........................... 
// gap // ........................................................................................................................................................ - mls v23.4S, v1.4S, v8.S[0] // ..............................................................................................................................e......................... + mul v16.4S, v11.4S, v4.S[0] // .............................................................................................................................e.......................... // gap // ........................................................................................................................................................ - mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................e..................................................... + sub v25.4S, v1.4S, v15.4S // .........................................................................................................................................e.............. // gap // ........................................................................................................................................................ - mls v30.4S, v31.4S, v8.S[0] // .............................................................................................................................................e.......... + add v31.4S, v1.4S, v15.4S // ..........................................................................................................................................e............. // gap // ........................................................................................................................................................ - mls v16.4S, v0.4S, v8.S[0] // ............................................................................................................e........................................... 
+ str q9, [x2, #16] // ...................................................................................................................................................e.... // gap // ........................................................................................................................................................ - str q23, [x2], #(16*4) // ..................................................................................................................................................e..... - // gap // ........................................................................................................................................................ - add v24.4S, v9.4S, v19.4S // ..........................................................................................................................................e............. - // gap // ........................................................................................................................................................ - str q30, [x2, #-16] // .....................................................................................................................................................e.. - // gap // ........................................................................................................................................................ - add v22.4S, v12.4S, v16.4S // .....................................................................................................................................e.................. - // gap // ........................................................................................................................................................ - str q24, [x1, #-16] // .................................................................................................................................................e...... 
- // gap // ........................................................................................................................................................ - sub v10.4S, v12.4S, v16.4S // ....................................................................................................................................e................... - // gap // ........................................................................................................................................................ - str q22, [x1, #-32] // ................................................................................................................................................e....... - // gap // ........................................................................................................................................................ - - // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e.....................................................................................................................................................|.e................................. - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..................................e...................................................................................................................|................................... - // ldr q0, [x5], #(12*16) // .e....................................................................................................................................................|..e................................ - // ldr q4, [x5, #(-12*16 + 1*16)] // ....e.................................................................................................................................................|.....e............................. 
- // ldr q1, [x5, #(-12*16 + 2*16)] // ..........e...........................................................................................................................................|...........e....................... - // ldr q5, [x5, #(-12*16 + 3*16)] // ..............e.......................................................................................................................................|...............e................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ......e...............................................................................................................................................|.......e........................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ...............e......................................................................................................................................|................e.................. - // sub v24.4s, v9.4s, v10.4s // .............e........................................................................................................................................|..............e.................... - // add v9.4s, v9.4s, v10.4s // ...e..................................................................................................................................................|....e.............................. - // mul v10.4s, v24.4s, v1.4s // .................e....................................................................................................................................|..................e................ - // sqrdmulh v24.4s, v24.4s, v5.4s // ................e.....................................................................................................................................|.................e................. 
- // mls v10.4s, v24.4s, v8.s[0] // .....................e................................................................................................................................|......................e............ - // sub v24.4s, v11.4s, v12.4s // .........e............................................................................................................................................|..........e........................ - // add v11.4s, v11.4s, v12.4s // ..e...................................................................................................................................................|...e............................... - // mul v12.4s, v24.4s, v2.4s // ............e.........................................................................................................................................|.............e..................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e...................................................................................................................................|...................e............... - // mls v12.4s, v24.4s, v8.s[0] // ......................e...............................................................................................................................|.......................e........... - // sub v24.4s, v9.4s, v11.4s // .....e................................................................................................................................................|......e............................ - // add v9.4s, v9.4s, v11.4s // ................................................................e.....................................................................................|................................... - // mul v11.4s, v24.4s, v0.4s // .......e..............................................................................................................................................|........e.......................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ........e.............................................................................................................................................|.........e......................... - // mls v11.4s, v24.4s, v8.s[0] // ...........e..........................................................................................................................................|............e...................... - // sub v24.4s, v10.4s, v12.4s // ..........................e...........................................................................................................................|...........................e....... - // add v10.4s, v10.4s, v12.4s // ..................................................e...................................................................................................|................................... - // mul v12.4s, v24.4s, v0.4s // ............................................................e.........................................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e.........................................................................................................................|.............................e..... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................e....................................................................................|................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // .....................................................e................................................................................................|................................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // ................................................e.....................................................................................................|................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e..........................................................................................................................|............................e...... - // ldr q5, [x5, #(-12*16 + 9*16)] // ...............................e......................................................................................................................|................................e.. - // ldr q2, [x5, #(-12*16 + 10*16)] // ...................................e..................................................................................................................|................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // .....................................e................................................................................................................|................................... - // sub v24.4s, v13.4s, v14.4s // ......................................e...............................................................................................................|................................... - // add v13.4s, v13.4s, v14.4s // ...........................................e..........................................................................................................|................................... - // mul v14.4s, v24.4s, v1.4s // ..........................................e...........................................................................................................|................................... 
- // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e............................................................................................................|................................... - // mls v14.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................................................|................................... - // sub v24.4s, v15.4s, v16.4s // ....................................e.................................................................................................................|................................... - // add v15.4s, v15.4s, v16.4s // ............................................e.........................................................................................................|................................... - // mul v16.4s, v24.4s, v2.4s // .......................................e..............................................................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ........................................e.............................................................................................................|................................... - // mls v16.4s, v24.4s, v8.s[0] // .............................................e........................................................................................................|................................... - // sub v24.4s, v13.4s, v15.4s // ...............................................e......................................................................................................|................................... 
- // add v13.4s, v13.4s, v15.4s // ......................................................e...............................................................................................|................................... - // mul v15.4s, v24.4s, v0.4s // .........................................................e............................................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e..................................................................................................|................................... - // mls v15.4s, v24.4s, v8.s[0] // .............................................................e........................................................................................|................................... - // sub v24.4s, v14.4s, v16.4s // .................................................e....................................................................................................|................................... - // add v14.4s, v14.4s, v16.4s // .......................................................e..............................................................................................|................................... - // mul v16.4s, v24.4s, v0.4s // ........................................................e.............................................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................e.................................................................................................|................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ..................................................................e...................................................................................|................................... - // trn1 v25.4s, v9.4s, v10.4s // ...................................................................e..................................................................................|................................... - // trn2 v26.4s, v9.4s, v10.4s // ....................................................................e.................................................................................|................................... - // trn1 v27.4s, v11.4s, v12.4s // .....................................................................e................................................................................|................................... - // trn2 v28.4s, v11.4s, v12.4s // ......................................................................e...............................................................................|................................... - // trn2 v11.2d, v25.2d, v27.2d // ........................................................................e.............................................................................|................................... - // trn2 v12.2d, v26.2d, v28.2d // .........................................................................e............................................................................|................................... - // trn1 v9.2d, v25.2d, v27.2d // .....................................................................................e................................................................|................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................e...............................................................|................................... - // trn1 v25.4s, v13.4s, v14.4s // ...........................................................e..........................................................................................|................................... - // trn2 v26.4s, v13.4s, v14.4s // ..........................................................e...........................................................................................|................................... - // trn1 v27.4s, v15.4s, v16.4s // .......................................................................e..............................................................................|................................... - // trn2 v28.4s, v15.4s, v16.4s // ..........................................................................e...........................................................................|................................... - // trn2 v15.2d, v25.2d, v27.2d // ............................................................................e.........................................................................|................................... - // trn2 v16.2d, v26.2d, v28.2d // .............................................................................e........................................................................|................................... - // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................e...................................................................|................................... 
- // trn1 v14.2d, v26.2d, v28.2d // .................................................................................e....................................................................|................................... - // ldr q0, [x4], #64 // ........................................................................................................................e.............................|................................... - // ldr q1, [x4, #(-64 + 16)] // ..............................................................e.......................................................................................|................................... - // ldr q2, [x4, #(-64 + 32)] // ...............................................................e......................................................................................|................................... - // ldr q3, [x4, #(-64 + 48)] // ................................................................................e.....................................................................|................................... - // sub v24.4s, v9.4s, v10.4s // .........................................................................................e............................................................|................................... - // add v9.4s, v9.4s, v10.4s // ............................................................................................................e.........................................|................................... - // mul v10.4s, v24.4s, v1.s[2] // ............................................................................................e.........................................................|................................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................e........................................................|................................... 
- // mls v10.4s, v24.4s, v8.s[0] // .................................................................................................e....................................................|................................... - // sub v24.4s, v11.4s, v12.4s // ...........................................................................e..........................................................................|................................... - // add v11.4s, v11.4s, v12.4s // .............................................................................................................e........................................|................................... - // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................e.......................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................e...........................................................|................................... - // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................e.......................................................|................................... - // sub v24.4s, v13.4s, v14.4s // ....................................................................................e.................................................................|................................... - // add v13.4s, v13.4s, v14.4s // ..................................................................................................e...................................................|................................... 
- // mul v14.4s, v24.4s, v2.s[2] // .......................................................................................e..............................................................|................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................e.............................................................|................................... - // mls v14.4s, v24.4s, v8.s[0] // ...............................................................................................e......................................................|................................... - // sub v24.4s, v15.4s, v16.4s // ...............................................................................e......................................................................|................................... - // add v15.4s, v15.4s, v16.4s // ...................................................................................................e..................................................|................................... - // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................e..................................................................|................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................................................................e..........................................................|................................... - // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................e.....................................................|................................... 
- // sub v24.4s, v9.4s, v11.4s // ................................................................................................................................e.....................|................................... - // add v9.4s, v9.4s, v11.4s // ................................................................................................................e.....................................|................................... - // mul v11.4s, v24.4s, v0.s[2] // ......................................................................................................................................e...............|................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................e..............|................................... - // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................e.........|................................... - // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................e..............................|................................... - // add v10.4s, v10.4s, v12.4s // .....................................................................................................e................................................|................................... - // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................e..........................|................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e........................|................................... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................................e....................|................................... - // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e...............................................|................................... - // add v13.4s, v13.4s, v15.4s // .........................................................................................................e............................................|................................... - // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e...........|................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e.................|................................... - // mls v15.4s, v24.4s, v8.s[0] // ..............................................................................................................................................e.......|................................... - // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................e.......................................|................................... 
- // add v14.4s, v14.4s, v16.4s // ....................................................................................................e.................................................|................................... - // mul v16.4s, v24.4s, v1.s[0] // ....................................................................................................................e.................................|................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e...........................|................................... - // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................................................................e.......................|................................... - // srshr v24.4S, v9.4S, #23 // ...................................................................................................................e..................................|................................... - // mls v9.4s, v24.4s, v8.4s // ......................................................................................................................e...............................|................................... - // srshr v24.4S, v10.4S, #23 // ........................................................................................................e.............................................|................................... - // mls v10.4s, v24.4s, v8.4s // ...........................................................................................................e..........................................|................................... - // srshr v24.4S, v13.4S, #23 // .................................................................................................................e....................................|................................... 
- // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................e................................|................................... - // srshr v24.4S, v14.4S, #23 // .......................................................................................................e..............................................|................................... - // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................e...........................................|................................... - // sub v24.4s, v9.4s, v13.4s // ...............................................................................................................................e......................|................................... - // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e............................|................................... - // mul v13.4s, v24.4s, v0.s[0] // ...................................................................................................................................e..................|................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................e...................|................................... - // mls v13.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e..........|................................... - // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................*................................... 
- // add v10.4s, v10.4s, v14.4s // ...............................................................................................................e......................................|................................... - // mul v14.4s, v24.4s, v0.s[0] // ....................*.................................................................................................................................|.....................*............. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................*..................................................................................................................................|....................*.............. - // mls v14.4s, v24.4s, v8.s[0] // ........................*.............................................................................................................................|.........................*......... - // sub v24.4s, v11.4s, v15.4s // ....................................................................................................................................................e.|................................... - // add v11.4s, v11.4s, v15.4s // ..................................................................................................................................................e...|................................... - // mul v15.4s, v24.4s, v0.s[0] // .........................*............................................................................................................................|..........................*........ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................*..............................................................................................................................|........................*.......... 
- // mls v15.4s, v24.4s, v8.s[0] // .............................*........................................................................................................................|..............................*.... - // sub v24.4s, v12.4s, v16.4s // .....................................................................................................................................e................|................................... - // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.....|................................... - // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................e.............|................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................e............|................................... - // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................................................e........|................................... - // str q9, [x1], #(16*4) // ............................................................................................................................e.........................|................................... - // str q10, [x1, #(-16*4 + 1*16)] // ..................................................................................................................e...................................|................................... 
- // str q11, [x1, #(-16*4 + 2*16)] // .....................................................................................................................................................e|................................... - // str q12, [x1, #(-16*4 + 3*16)] // ...................................................................................................................................................e..|................................... - // str q13, [x2], #(16*4) // ...............................................................................................................................................e......|................................... - // str q14, [x2, #(-16*4 + 1*16)] // ..............................*.......................................................................................................................|...............................*... - // str q15, [x2, #(-16*4 + 2*16)] // ................................*.....................................................................................................................|.................................*. - // str q16, [x2, #(-16*4 + 3*16)] // .................................................................................................................................................e....|................................... - // add x1, x1, #64 // ......................................................................................................................................................|*.................................. - // add x2, x2, #64 // .................................*....................................................................................................................|..................................* + mls v16.4S, v23.4S, v8.S[0] // ..............................................................................................................................e......................... 
+ // gap // ........................................................................................................................................................ + str q18, [x1, #16] // ...............................................................................................................................................e........ + // gap // ........................................................................................................................................................ + sqrdmulh v12.4S, v25.4S, v4.S[1] // ...........................................................................................................................................e............ + // gap // ........................................................................................................................................................ + str q2, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + mul v23.4S, v25.4S, v4.S[0] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + str q16, [x2], #(16*4) // ..................................................................................................................................................e..... + // gap // ........................................................................................................................................................ 
+ + // ----------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // .................e......................................................................................................................................'................~................................... + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................................................e..................................................................................................'.................................................... + // ldr q0, [x5], #(12*16) // ....e...................................................................................................................................................'...~................................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // .e......................................................................................................................................................'~................................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // ..........e.............................................................................................................................................'.........~.......................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..e.....................................................................................................................................................'.~.................................................. 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // ..................e.....................................................................................................................................'.................~.................................. + // ldr q6, [x5, #(-12*16 + 5*16)] // .......e................................................................................................................................................'......~............................................. + // sub v24.4s, v9.4s, v10.4s // ...................e....................................................................................................................................'..................~................................. + // add v9.4s, v9.4s, v10.4s // ....................e...................................................................................................................................'...................~................................ + // sqrdmulh v27.4s, v24.4s, v5.4s // .........................e..............................................................................................................................'........................~........................... + // mul v10.4s, v24.4s, v1.4s // ......................e.................................................................................................................................'.....................~.............................. + // mls v10.4s, v27.4s, v8.s[0] // .............................e..........................................................................................................................'............................~....................... + // sub v24.4s, v11.4s, v12.4s // .......................e................................................................................................................................'......................~............................. 
+ // add v11.4s, v11.4s, v12.4s // .....................e..................................................................................................................................'....................~............................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ..........................e.............................................................................................................................'.........................~.......................... + // mul v12.4s, v24.4s, v2.4s // ...........................e............................................................................................................................'..........................~......................... + // mls v12.4s, v27.4s, v8.s[0] // ..............................e.........................................................................................................................'.............................~...................... + // sub v24.4s, v9.4s, v11.4s // ...............................e........................................................................................................................'..............................~..................... + // add v9.4s, v9.4s, v11.4s // ........................e...............................................................................................................................'.......................~............................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ..................................e.....................................................................................................................'.................................~.................. + // mul v11.4s, v24.4s, v0.4s // ...................................e....................................................................................................................'..................................~................. 
+ // mls v11.4s, v27.4s, v8.s[0] // .......................................e................................................................................................................'......................................~............. + // sub v24.4s, v10.4s, v12.4s // .................................e......................................................................................................................'................................~................... + // add v10.4s, v10.4s, v12.4s // ......................................e.................................................................................................................'.....................................~.............. + // sqrdmulh v27.4s, v24.4s, v4.4s // .....................................e..................................................................................................................'....................................~............... + // mul v12.4s, v24.4s, v0.4s // ....................................e...................................................................................................................'...................................~................ + // mls v12.4s, v27.4s, v8.s[0] // .........................................e..............................................................................................................'........................................~........... + // ldr q0, [x5, #(-12*16 + 6*16)] // ................................e.......................................................................................................................'...............................~.................... + // ldr q4, [x5, #(-12*16 + 7*16)] // e.......................................................................................................................................................~.................................................... 
+ // ldr q1, [x5, #(-12*16 + 8*16)] // ..........................................................e.............................................................................................'.................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ......................................................e.................................................................................................'.................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ........................................................e...............................................................................................'.................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................................e.........................................................................................'.................................................... + // sub v24.4s, v13.4s, v14.4s // .......................................................e................................................................................................'.................................................... + // add v13.4s, v13.4s, v14.4s // ...................................................................e....................................................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v5.4s // .........................................................e..............................................................................................'.................................................... + // mul v14.4s, v24.4s, v1.4s // .............................................................e..........................................................................................'.................................................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ....................................................................e...................................................................................'.................................................... + // sub v24.4s, v15.4s, v16.4s // ............................................................e...........................................................................................'.................................................... + // add v15.4s, v15.4s, v16.4s // ......................................................................e.................................................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v6.4s // .................................................................e......................................................................................'.................................................... + // mul v16.4s, v24.4s, v2.4s // ................................................................e.......................................................................................'.................................................... + // mls v16.4s, v27.4s, v8.s[0] // .....................................................................e..................................................................................'.................................................... + // sub v24.4s, v13.4s, v15.4s // .........................................................................e..............................................................................'.................................................... + // add v13.4s, v13.4s, v15.4s // .................................................................................e......................................................................'.................................................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ............................................................................e...........................................................................'.................................................... + // mul v15.4s, v24.4s, v0.4s // ..............................................................................e.........................................................................'.................................................... + // mls v15.4s, v27.4s, v8.s[0] // ..................................................................................e.....................................................................'.................................................... + // sub v24.4s, v14.4s, v16.4s // ..........................................................................e.............................................................................'.................................................... + // add v14.4s, v14.4s, v16.4s // ........................................................................e...............................................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v4.4s // .............................................................................e..........................................................................'.................................................... + // mul v16.4s, v24.4s, v0.4s // ...............................................................................e........................................................................'.................................................... + // mls v16.4s, v27.4s, v8.s[0] // ...................................................................................e....................................................................'.................................................... 
+ // trn1 v25.4s, v9.4s, v10.4s // ..........................................e.............................................................................................................'.........................................~.......... + // trn2 v26.4s, v9.4s, v10.4s // ............................................e...........................................................................................................'...........................................~........ + // trn1 v27.4s, v11.4s, v12.4s // ..............................................e.........................................................................................................'.............................................~...... + // trn2 v28.4s, v11.4s, v12.4s // .............................................e..........................................................................................................'............................................~....... + // trn2 v11.2d, v25.2d, v27.2d // ...............................................................e........................................................................................'.................................................... + // trn2 v12.2d, v26.2d, v28.2d // ................................................e.......................................................................................................'...............................................~.... + // trn1 v9.2d, v25.2d, v27.2d // ..................................................e.....................................................................................................'.................................................~.. + // trn1 v10.2d, v26.2d, v28.2d // .................................................e......................................................................................................'................................................~... 
+ // trn1 v25.4s, v13.4s, v14.4s // ......................................................................................e.................................................................'.................................................... + // trn2 v26.4s, v13.4s, v14.4s // .....................................................................................e..................................................................'.................................................... + // trn1 v27.4s, v15.4s, v16.4s // .......................................................................................e................................................................'.................................................... + // trn2 v28.4s, v15.4s, v16.4s // ........................................................................................e...............................................................'.................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..........................................................................................e.............................................................'.................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...........................................................................................e............................................................'.................................................... + // trn1 v13.2d, v25.2d, v27.2d // ............................................................................................e...........................................................'.................................................... + // trn1 v14.2d, v26.2d, v28.2d // ..............................................................................................e.........................................................'.................................................... 
+ // ldr q0, [x4], #64 // .......................................................................................................................e................................'.................................................... + // ldr q1, [x4, #(-64 + 16)] // .......................................................................e................................................................................'.................................................... + // ldr q2, [x4, #(-64 + 32)] // ...................................................................................................e....................................................'.................................................... + // ldr q3, [x4, #(-64 + 48)] // ............................e...........................................................................................................................'...........................~........................ + // sub v24.4s, v9.4s, v10.4s // ..................................................................e.....................................................................................'.................................................... + // add v9.4s, v9.4s, v10.4s // ...........................................................e............................................................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................................................................................e.......................................................................'.................................................... + // mul v10.4s, v24.4s, v1.s[2] // ...........................................................................e............................................................................'.................................................... 
+ // mls v10.4s, v27.4s, v8.s[0] // ....................................................................................e...................................................................'.................................................... + // sub v24.4s, v11.4s, v12.4s // ....................................................................................................e...................................................'.................................................... + // add v11.4s, v11.4s, v12.4s // .........................................................................................e..............................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ........................................................................................................e...............................................'.................................................... + // mul v12.4s, v24.4s, v2.s[0] // .......................................................................................................e................................................'.................................................... + // mls v12.4s, v27.4s, v8.s[0] // ............................................................................................................e...........................................'.................................................... + // sub v24.4s, v13.4s, v14.4s // ..................................................................................................e.....................................................'.................................................... + // add v13.4s, v13.4s, v14.4s // ...............................................................................................................e........................................'.................................................... 
+ // sqrdmulh v27.4s, v24.4s, v2.s[3] // ......................................................................................................e.................................................'.................................................... + // mul v14.4s, v24.4s, v2.s[2] // .........................................................................................................e..............................................'.................................................... + // mls v14.4s, v27.4s, v8.s[0] // .............................................................................................................e..........................................'.................................................... + // sub v24.4s, v15.4s, v16.4s // .............................................................................................e..........................................................'.................................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................................e.........................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ................................................................................................e.......................................................'.................................................... + // mul v16.4s, v24.4s, v3.s[0] // .................................................................................................e......................................................'.................................................... + // mls v16.4s, v27.4s, v8.s[0] // ...........................................................................................................e............................................'.................................................... 
+ // sub v24.4s, v9.4s, v11.4s // ...............................................................................................e........................................................'.................................................... + // add v9.4s, v9.4s, v11.4s // .....................................................................................................e..................................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..........................................................................................................................e.............................'.................................................... + // mul v11.4s, v24.4s, v0.s[2] // ........................................................................................................................................e...............'.................................................... + // mls v11.4s, v27.4s, v8.s[0] // ............................................................................................................................................e...........'.................................................... + // sub v24.4s, v10.4s, v12.4s // ........................................................................................................................e...............................'.................................................... + // add v10.4s, v10.4s, v12.4s // ................................................................................................................e.......................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...........................................................................................................................e............................'.................................................... 
+ // mul v12.4s, v24.4s, v0.s[2] // ............................................................................................................................e...........................'.................................................... + // mls v12.4s, v27.4s, v8.s[0] // .................................................................................................................................e......................'.................................................... + // sub v24.4s, v13.4s, v15.4s // ...~....................................................................................................................................................'..*................................................. + // add v13.4s, v13.4s, v15.4s // .............................................................................................................................e..........................'.................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ......~.................................................................................................................................................'.....*.............................................. + // mul v15.4s, v24.4s, v1.s[0] // .....~..................................................................................................................................................'....*............................................... + // mls v15.4s, v27.4s, v8.s[0] // .........~..............................................................................................................................................'........*........................................... + // sub v24.4s, v14.4s, v16.4s // .................................................................................................................e......................................'.................................................... 
+ // add v14.4s, v14.4s, v16.4s // ..................................................................................................................e.....................................'.................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ....................................................................................................................e...................................'.................................................... + // mul v16.4s, v24.4s, v1.s[0] // .....................................................................................................................e..................................'.................................................... + // mls v16.4s, v27.4s, v8.s[0] // ................................................................................................................................e.......................'.................................................... + // srshr v24.4S, v9.4S, #23 // ..........................................................................................................e.............................................'.................................................... + // mls v9.4s, v24.4s, v8.4s // ...................................................................................................................e....................................'.................................................... + // srshr v24.4S, v10.4S, #23 // ......................................................................................................................e.................................'.................................................... + // mls v10.4s, v24.4s, v8.4s // ..............................................................................................................................e.........................'.................................................... 
+ // srshr v24.4S, v13.4S, #23 // ..................................................................................................................................e.....................'.................................................... + // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................................e..................'.................................................... + // srshr v24.4S, v14.4S, #23 // .........................................................................................................................e..............................'.................................................... + // mls v14.4s, v24.4s, v8.4s // ...............................................................................................................................e........................'.................................................... + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................e..............'.................................................... + // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................................e.............'.................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .............................................................................................................................................e..........'.................................................... + // mul v13.4s, v24.4s, v0.s[0] // ..............................................................................................................................................e.........'.................................................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ..................................................................................................................................................e.....'.................................................... + // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................e....................'.................................................... + // add v10.4s, v10.4s, v14.4s // ....................................................................................................................................e...................'.................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ......................................................................................................................................e.................'.................................................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................................................................e................'.................................................... + // mls v14.4s, v27.4s, v8.s[0] // ...........................................................................................................................................e............'.................................................... + // sub v24.4s, v11.4s, v15.4s // ..............~.........................................................................................................................................'.............*...................................... + // add v11.4s, v11.4s, v15.4s // ............~...........................................................................................................................................'...........*........................................ 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...........................................~............................................................................................................'..........................................*......... + // mul v15.4s, v24.4s, v0.s[0] // ........................................~...............................................................................................................'.......................................*............ + // mls v15.4s, v27.4s, v8.s[0] // ...............................................~........................................................................................................'..............................................*..... + // sub v24.4s, v12.4s, v16.4s // ...............................................................................................................................................e........'.................................................... + // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.......'.................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....................................................................................................................................................e...'.................................................... + // mul v16.4s, v24.4s, v0.s[0] // ......................................................................................................................................................e.'.................................................... + // mls v16.4s, v27.4s, v8.s[0] // ........~...............................................................................................................................................'.......*............................................ 
+ // str q9, [x1], #(16*4) // .....................................................................................................................................................e..'.................................................... + // str q10, [x1, #(-16*4 + 1*16)] // ...................................................................................................................................................e....'.................................................... + // str q11, [x1, #(-16*4 + 2*16)] // ...............~........................................................................................................................................'..............*..................................... + // str q12, [x1, #(-16*4 + 3*16)] // .............~..........................................................................................................................................'............*....................................... + // str q13, [x2], #(16*4) // .......................................................................................................................................................e'.................................................... + // str q14, [x2, #(-16*4 + 1*16)] // .................................................................................................................................................e......'.................................................... + // str q15, [x2, #(-16*4 + 2*16)] // ...................................................~....................................................................................................'..................................................*. + // str q16, [x2, #(-16*4 + 3*16)] // ...........~............................................................................................................................................'..........*......................................... 
+ // add x1, x1, #64 // ................~.......................................................................................................................................'...............*.................................... + // add x2, x2, #64 // ....................................................~...................................................................................................'...................................................* sub count, count, #1 cbnz count, layer45678_start - sub v4.4S, v27.4S, v4.4S // *.......... - add x1, x1, #64 // .*......... - sqrdmulh v17.4S, v10.4S, v29.S[1] // ....*...... - // gap // ........... - mul v0.4S, v10.4S, v29.S[0] // ......*.... - // gap // ........... - sqrdmulh v31.4S, v4.4S, v29.S[1] // ..*........ - // gap // ........... - mul v29.4S, v4.4S, v29.S[0] // ...*....... - // gap // ........... - // gap // ........... - // gap // ........... - mls v0.4S, v17.4S, v8.S[0] // .......*... - // gap // ........... - // gap // ........... - // gap // ........... - mls v29.4S, v31.4S, v8.S[0] // .....*..... - // gap // ........... - // gap // ........... - // gap // ........... - str q0, [x2, #-32] // .........*. - // gap // ........... - // gap // ........... - // gap // ........... - str q29, [x2, #-48] // ........*.. - add x2, x2, #64 // ..........* - - // original source code - // sub v6.4S, v27.4S, v4.4S // *.......... - // add x1, x1, #64 // .*......... - // sqrdmulh v12.4S, v6.4S, v29.S[1] // ....*...... - // mul v6.4S, v6.4S, v29.S[0] // .....*..... - // sqrdmulh v9.4S, v10.4S, v29.S[1] // ..*........ - // mls v6.4S, v12.4S, v8.S[0] // .......*... - // mul v13.4S, v10.4S, v29.S[0] // ...*....... - // mls v13.4S, v9.4S, v8.S[0] // ......*.... - // str q6, [x2, #-48] // .........*. - // str q13, [x2, #-32] // ........*.. 
- // add x2, x2, #64 // ..........* + // Instructions: 16 + // Expected cycles: 25 + // Expected IPC: 0.64 + // + // Wall time: 0.09s + // User time: 0.09s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v26.4S, v21.4S, v17.4S // *............................. + // gap // .............................. + mls v23.4S, v12.4S, v8.S[0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v17.4S, v26.4S, v6.S[0] // .*............................ + // gap // .............................. + sqrdmulh v6.4S, v26.4S, v6.S[1] // ..*........................... + // gap // .............................. + str q23, [x2, #-16] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q31, [x1, #-16] // .......*...................... + // gap // .............................. + mls v17.4S, v6.4S, v8.S[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v6.4S, v27.4S, v17.4S // ........*..................... + // gap // .............................. + add v17.4S, v27.4S, v17.4S // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v26.4S, v6.4S, v4.S[0] // ...........*.................. + // gap // .............................. + sqrdmulh v6.4S, v6.4S, v4.S[1] // ............*................. + // gap // .............................. + str q17, [x1, #-32] // .........*.................... 
+ add x1, x1, #64 // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v26.4S, v6.4S, v8.S[0] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q26, [x2, #-32] // ..............*............... + add x2, x2, #64 // ...............*.............. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v26.4S, v21.4S, v17.4S // *.............................. + // mul v16.4S, v26.4S, v6.S[0] // ..*............................ + // sqrdmulh v0.4S, v26.4S, v6.S[1] // ...*........................... + // mls v23.4S, v12.4S, v8.S[0] // .*............................. + // mls v16.4S, v0.4S, v8.S[0] // ......*........................ + // str q23, [x2, #-16] // ....*.......................... + // add v14.4S, v27.4S, v16.4S // ........*...................... + // str q31, [x1, #-16] // .....*......................... + // sub v9.4S, v27.4S, v16.4S // .......*....................... + // str q14, [x1, #-32] // ...........*................... + // add x1, x1, #64 // ............*.................. + // mul v30.4S, v9.4S, v4.S[0] // .........*..................... + // sqrdmulh v1.4S, v9.4S, v4.S[1] // ..........*.................... + // mls v30.4S, v1.4S, v8.S[0] // .............*................. + // str q30, [x2, #-32] // ..............*................ + // add x2, x2, #64 // ...............*............... 
// ----------------------------------------------------------------------------- @@ -1461,772 +1503,808 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q11, [x0, #256] // ..*......... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q20, [x0, #384] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q4, [x0, #896] // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q28, [x0, #512] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - ldr q9, [x0, #768] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q29, [x0, #640] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - add v15.4S, v11.4S, v20.4S // ..........*. - // gap // ............ - add v22.4S, v9.4S, v4.4S // .........*.. - // gap // ............ - add v13.4S, v28.4S, v29.4S // ......*..... - // gap // ............ - ldr q18, [x0, #0] // *........... - // gap // ............ - // gap // ............ - // gap // ............ - add v23.4S, v13.4S, v22.4S // ...........* - // gap // ............ - ldr q14, [x0, #128] // .*.......... - // gap // ............ - - // original source code - // ldr q18, [x0, #0] // .........*.. - // ldr q14, [x0, #128] // ...........* - // ldr q11, [x0, #256] // *........... - // ldr q28, [x0, #512] // ...*........ - // ldr q29, [x0, #640] // .....*...... - // ldr q9, [x0, #768] // ....*....... - // add v13.4S, v28.4S, v29.4S // ........*... - // ldr q4, [x0, #896] // ..*......... - // ldr q20, [x0, #384] // .*.......... - // add v22.4S, v9.4S, v4.4S // .......*.... - // add v15.4S, v11.4S, v20.4S // ......*..... - // add v23.4S, v13.4S, v22.4S // ..........*. 
+ // Instructions: 12 + // Expected cycles: 19 + // Expected IPC: 0.63 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #256] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x0, #384] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #896] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q27, [x0, #768] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q10, [x0, #640] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v14.4S, v18.4S, v7.4S // ..........*................... + // gap // .............................. + add v23.4S, v27.4S, v11.4S // .........*.................... + // gap // .............................. + add v21.4S, v4.4S, v10.4S // ......*....................... + // gap // .............................. + ldr q17, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v15.4S, v21.4S, v23.4S // ...........*.................. + // gap // .............................. + ldr q5, [x0, #128] // .*............................ 
+ // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q17, [x0, #0] // .........*..................... + // ldr q5, [x0, #128] // ...........*................... + // ldr q18, [x0, #256] // *.............................. + // ldr q4, [x0, #512] // ...*........................... + // ldr q10, [x0, #640] // .....*......................... + // ldr q27, [x0, #768] // ....*.......................... + // add v21.4S, v4.4S, v10.4S // ........*...................... + // ldr q11, [x0, #896] // ..*............................ + // ldr q7, [x0, #384] // .*............................. + // add v23.4S, v27.4S, v11.4S // .......*....................... + // add v14.4S, v18.4S, v7.4S // ......*........................ + // add v15.4S, v21.4S, v23.4S // ..........*.................... sub count, count, #1 layer123_start: - sub v17.4S, v18.4S, v14.4S // ........*............................................................................................................... + // Instructions: 120 + // Expected cycles: 128 + // Expected IPC: 0.94 + // + // Wall time: 10.36s + // User time: 10.36s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + sub v6.4S, v17.4S, v5.4S // ........*............................................................................................................... // gap // ........................................................................................................................ - add v19.4S, v18.4S, v14.4S // .........*.............................................................................................................. 
+ add v17.4S, v17.4S, v5.4S // .........*.............................................................................................................. // gap // ........................................................................................................................ - sub v18.4S, v11.4S, v20.4S // .............*.......................................................................................................... + sub v18.4S, v18.4S, v7.4S // .............*.......................................................................................................... // gap // ........................................................................................................................ - mul v6.4S, v17.4S, v1.S[2] // ..........*............................................................................................................. + sqrdmulh v5.4S, v6.4S, v1.S[3] // ..........*............................................................................................................. // gap // ........................................................................................................................ - sqrdmulh v17.4S, v17.4S, v1.S[3] // ...........*............................................................................................................ + mul v6.4S, v6.4S, v1.S[2] // ...........*............................................................................................................ // gap // ........................................................................................................................ - sub v14.4S, v19.4S, v15.4S // ............................*........................................................................................... + sub v7.4S, v17.4S, v14.4S // ............................*........................................................................................... 
// gap // ........................................................................................................................ - add v19.4S, v19.4S, v15.4S // .............................*.......................................................................................... + add v17.4S, v17.4S, v14.4S // .............................*.......................................................................................... // gap // ........................................................................................................................ - mul v11.4S, v18.4S, v2.S[0] // ...............*........................................................................................................ + sqrdmulh v14.4S, v18.4S, v2.S[1] // ...............*........................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v18.4S, v18.4S, v2.S[1] // ................*....................................................................................................... + mul v18.4S, v18.4S, v2.S[0] // ................*....................................................................................................... // gap // ........................................................................................................................ - mls v6.4S, v17.4S, v8.S[0] // ............*........................................................................................................... + mls v6.4S, v5.4S, v8.S[0] // ............*........................................................................................................... // gap // ........................................................................................................................ 
- sub v17.4S, v28.4S, v29.4S // ..................*..................................................................................................... + sub v5.4S, v4.4S, v10.4S // ..................*..................................................................................................... // gap // ........................................................................................................................ - mul v28.4S, v14.4S, v0.S[2] // ..............................*......................................................................................... + sqrdmulh v4.4S, v7.4S, v0.S[3] // ..............................*......................................................................................... // gap // ........................................................................................................................ - sqrdmulh v14.4S, v14.4S, v0.S[3] // ...............................*........................................................................................ + mul v7.4S, v7.4S, v0.S[2] // ...............................*........................................................................................ // gap // ........................................................................................................................ - sub v29.4S, v19.4S, v23.4S // ................................................*....................................................................... + sub v10.4S, v17.4S, v15.4S // ................................................*....................................................................... // gap // ........................................................................................................................ - add v19.4S, v19.4S, v23.4S // .................................................*...................................................................... 
+ add v17.4S, v17.4S, v15.4S // .................................................*...................................................................... // gap // ........................................................................................................................ - mls v11.4S, v18.4S, v8.S[0] // .................*...................................................................................................... + mls v18.4S, v14.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - mul v18.4S, v17.4S, v2.S[2] // ....................*................................................................................................... + sqrdmulh v14.4S, v5.4S, v2.S[3] // ....................*................................................................................................... // gap // ........................................................................................................................ - sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. + mul v5.4S, v5.4S, v2.S[2] // .....................*.................................................................................................. // gap // ........................................................................................................................ - sub v9.4S, v9.4S, v4.4S // .......................*................................................................................................ + sub v27.4S, v27.4S, v11.4S // .......................*................................................................................................ 
// gap // ........................................................................................................................ - sub v4.4S, v6.4S, v11.4S // .................................*...................................................................................... + sub v11.4S, v6.4S, v18.4S // .................................*...................................................................................... // gap // ........................................................................................................................ - add v6.4S, v6.4S, v11.4S // ..................................*..................................................................................... + add v6.4S, v6.4S, v18.4S // ..................................*..................................................................................... // gap // ........................................................................................................................ - mls v18.4S, v17.4S, v8.S[0] // ......................*................................................................................................. + mls v5.4S, v14.4S, v8.S[0] // ......................*................................................................................................. // gap // ........................................................................................................................ - mul v17.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. + sqrdmulh v18.4S, v27.4S, v3.S[1] // .........................*.............................................................................................. // gap // ........................................................................................................................ 
- mls v28.4S, v14.4S, v8.S[0] // ................................*....................................................................................... + mls v7.4S, v4.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ - sqrdmulh v14.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + mul v14.4S, v27.4S, v3.S[0] // ..........................*............................................................................................. // gap // ........................................................................................................................ - mul v11.4S, v4.4S, v0.S[2] // ...................................*.................................................................................... + sqrdmulh v4.4S, v11.4S, v0.S[3] // ...................................*.................................................................................... // gap // ........................................................................................................................ - sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................................*................................................................................... + mul v27.4S, v11.4S, v0.S[2] // ....................................*................................................................................... // gap // ........................................................................................................................ - mul v4.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... 
+ sqrdmulh v11.4S, v10.4S, v0.S[1] // ..................................................*..................................................................... // gap // ........................................................................................................................ - sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + mul v10.4S, v10.4S, v0.S[0] // ...................................................*.................................................................... // gap // ........................................................................................................................ - mul v20.4S, v19.4S, v25.4S // ........................................................................................*............................... + sqrdmulh v15.4S, v17.4S, v26.4S // ........................................................................................*............................... // gap // ........................................................................................................................ - sqrdmulh v19.4S, v19.4S, v26.4S // .........................................................................................*.............................. + mul v17.4S, v17.4S, v25.4S // .........................................................................................*.............................. // gap // ........................................................................................................................ - mls v17.4S, v14.4S, v8.S[0] // ...........................*............................................................................................ + mls v14.4S, v18.4S, v8.S[0] // ...........................*............................................................................................ 
// gap // ........................................................................................................................ - mls v11.4S, v9.4S, v8.S[0] // .....................................*.................................................................................. + mls v27.4S, v4.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ - sub v14.4S, v13.4S, v22.4S // ......................................*................................................................................. + sub v18.4S, v21.4S, v23.4S // ......................................*................................................................................. // gap // ........................................................................................................................ - mls v4.4S, v29.4S, v8.S[0] // ....................................................*................................................................... + mls v10.4S, v11.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ - sub v29.4S, v18.4S, v17.4S // ...........................................*............................................................................ + sub v4.4S, v5.4S, v14.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ 
- mul v9.4S, v14.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v11.4S, v18.4S, v1.S[1] // ........................................*............................................................................... // gap // ........................................................................................................................ - sqrdmulh v14.4S, v14.4S, v1.S[1] // .........................................*.............................................................................. + mul v18.4S, v18.4S, v1.S[0] // .........................................*.............................................................................. // gap // ........................................................................................................................ - add v17.4S, v18.4S, v17.4S // ............................................*........................................................................... + add v5.4S, v5.4S, v14.4S // ............................................*........................................................................... // gap // ........................................................................................................................ - mul v18.4S, v29.4S, v1.S[0] // .............................................*.......................................................................... + sqrdmulh v14.4S, v4.4S, v1.S[1] // .............................................*.......................................................................... // gap // ........................................................................................................................ - sqrdmulh v29.4S, v29.4S, v1.S[1] // ..............................................*......................................................................... 
+ mul v4.4S, v4.4S, v1.S[0] // ..............................................*......................................................................... // gap // ........................................................................................................................ - sub v13.4S, v6.4S, v17.4S // .....................................................*.................................................................. + sub v21.4S, v6.4S, v5.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ - add v17.4S, v6.4S, v17.4S // ......................................................*................................................................. + add v6.4S, v6.4S, v5.4S // ......................................................*................................................................. // gap // ........................................................................................................................ - mls v9.4S, v14.4S, v8.S[0] // ..........................................*............................................................................. + mls v18.4S, v11.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ - mls v18.4S, v29.4S, v8.S[0] // ...............................................*........................................................................ + mls v4.4S, v14.4S, v8.S[0] // ...............................................*........................................................................ 
// gap // ........................................................................................................................ - mul v6.4S, v13.4S, v0.S[0] // .......................................................*................................................................ + sqrdmulh v5.4S, v21.4S, v0.S[1] // .......................................................*................................................................ // gap // ........................................................................................................................ - sqrdmulh v14.4S, v13.4S, v0.S[1] // ........................................................*............................................................... + mul v14.4S, v21.4S, v0.S[0] // ........................................................*............................................................... // gap // ........................................................................................................................ - sub v29.4S, v28.4S, v9.4S // ..........................................................*............................................................. + sub v11.4S, v7.4S, v18.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ - add v28.4S, v28.4S, v9.4S // ...........................................................*............................................................ + add v18.4S, v7.4S, v18.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ 
- sub v9.4S, v11.4S, v18.4S // ...............................................................*........................................................ + sub v7.4S, v27.4S, v4.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // .........................................................*.............................................................. + mls v14.4S, v5.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ - mul v14.4S, v29.4S, v0.S[0] // ............................................................*........................................................... + sqrdmulh v5.4S, v11.4S, v0.S[1] // ............................................................*........................................................... // gap // ........................................................................................................................ - sqrdmulh v29.4S, v29.4S, v0.S[1] // .............................................................*.......................................................... + mul v11.4S, v11.4S, v0.S[0] // .............................................................*.......................................................... // gap // ........................................................................................................................ - add v18.4S, v11.4S, v18.4S // ................................................................*....................................................... 
+ add v4.4S, v27.4S, v4.4S // ................................................................*....................................................... // gap // ........................................................................................................................ - mul v11.4S, v9.4S, v0.S[0] // .................................................................*...................................................... + sqrdmulh v27.4S, v7.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ - sqrdmulh v9.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... + mul v7.4S, v7.4S, v0.S[0] // ..................................................................*..................................................... // gap // ........................................................................................................................ - mls v14.4S, v29.4S, v8.S[0] // ..............................................................*......................................................... + mls v11.4S, v5.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ - cmge v29.4S, v31.4S, v4.4S // ....................................................................*................................................... + cmge v5.4S, v31.4S, v10.4S // ....................................................................*................................................... 
// gap // ........................................................................................................................ - cmge v13.4S, v4.4S, v30.4S // .....................................................................*.................................................. + cmge v21.4S, v10.4S, v30.4S // .....................................................................*.................................................. // gap // ........................................................................................................................ - mls v20.4S, v19.4S, v8.S[0] // ..........................................................................................*............................. + mls v17.4S, v15.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - mls v11.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... + mls v7.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ - sub v19.4S, v29.4S, v13.4S // ......................................................................*................................................. + sub v5.4S, v5.4S, v21.4S // ......................................................................*................................................. // gap // ........................................................................................................................ 
- cmge v29.4S, v31.4S, v6.4S // ........................................................................*............................................... + cmge v27.4S, v31.4S, v14.4S // ........................................................................*............................................... // gap // ........................................................................................................................ - cmge v9.4S, v6.4S, v30.4S // .........................................................................*.............................................. + cmge v15.4S, v14.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ - mls v4.4S, v19.4S, v8.4S // .......................................................................*................................................ + mls v10.4S, v5.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - sub v19.4S, v29.4S, v9.4S // ..........................................................................*............................................. + sub v5.4S, v27.4S, v15.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ - cmge v29.4S, v31.4S, v14.4S // ............................................................................*........................................... 
+ cmge v27.4S, v31.4S, v11.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - cmge v9.4S, v14.4S, v30.4S // .............................................................................*.......................................... + cmge v15.4S, v11.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ - mls v6.4S, v19.4S, v8.4S // ...........................................................................*............................................ + mls v14.4S, v5.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ - sub v19.4S, v29.4S, v9.4S // ..............................................................................*......................................... + sub v5.4S, v27.4S, v15.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ - cmge v29.4S, v31.4S, v11.4S // ................................................................................*....................................... + cmge v27.4S, v31.4S, v7.4S // ................................................................................*....................................... 
// gap // ........................................................................................................................ - cmge v9.4S, v11.4S, v30.4S // .................................................................................*...................................... + cmge v15.4S, v7.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ - mls v14.4S, v19.4S, v8.4S // ...............................................................................*........................................ + mls v11.4S, v5.4S, v8.4S // ...............................................................................*........................................ // gap // ........................................................................................................................ - sub v19.4S, v29.4S, v9.4S // ..................................................................................*..................................... + sub v5.4S, v27.4S, v15.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ - str q4, [x0, #512] // ....................................................................................*................................... + str q10, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ 
- mul v29.4S, v17.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v10.4S, v6.4S, v26.4S // ...........................................................................................*............................ // gap // ........................................................................................................................ - mls v11.4S, v19.4S, v8.4S // ...................................................................................*.................................... + mls v7.4S, v5.4S, v8.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ - str q6, [x0, #640] // .....................................................................................*.................................. + str q14, [x0, #640] // .....................................................................................*.................................. // gap // ........................................................................................................................ - sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................................*........................... + mul v6.4S, v6.4S, v25.4S // ............................................................................................*........................... // gap // ........................................................................................................................ - str q14, [x0, #768] // ......................................................................................*................................. 
+ str q11, [x0, #768] // ......................................................................................*................................. // gap // ........................................................................................................................ - mul v19.4S, v28.4S, v25.4S // ..............................................................................................*......................... + sqrdmulh v5.4S, v18.4S, v26.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ - str q11, [x0, #896] // .......................................................................................*................................ + str q7, [x0, #896] // .......................................................................................*................................ // gap // ........................................................................................................................ - mls v29.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... + mls v6.4S, v10.4S, v8.S[0] // .............................................................................................*.......................... // gap // ........................................................................................................................ - sqrdmulh v17.4S, v28.4S, v26.4S // ...............................................................................................*........................ + mul v18.4S, v18.4S, v25.4S // ...............................................................................................*........................ 
// gap // ........................................................................................................................ - mul v6.4S, v18.4S, v25.4S // .................................................................................................*...................... + sqrdmulh v7.4S, v4.4S, v26.4S // .................................................................................................*...................... // gap // ........................................................................................................................ - sqrdmulh v18.4S, v18.4S, v26.4S // ..................................................................................................*..................... + mul v14.4S, v4.4S, v25.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ - cmge v14.4S, v31.4S, v20.4S // ....................................................................................................*................... + cmge v4.4S, v31.4S, v17.4S // ....................................................................................................*................... // gap // ........................................................................................................................ - mls v19.4S, v17.4S, v8.S[0] // ................................................................................................*....................... + mls v18.4S, v5.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ 
- cmge v17.4S, v20.4S, v30.4S // .....................................................................................................*.................. + cmge v5.4S, v17.4S, v30.4S // .....................................................................................................*.................. // gap // ........................................................................................................................ - mls v6.4S, v18.4S, v8.S[0] // ...................................................................................................*.................... + mls v14.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ - sub v17.4S, v14.4S, v17.4S // ......................................................................................................*................. + sub v5.4S, v4.4S, v5.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - cmge v18.4S, v31.4S, v29.4S // ........................................................................................................*............... + cmge v7.4S, v31.4S, v6.4S // ........................................................................................................*............... // gap // ........................................................................................................................ - cmge v14.4S, v29.4S, v30.4S // .........................................................................................................*.............. 
+ cmge v4.4S, v6.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ - mls v20.4S, v17.4S, v8.4S // .......................................................................................................*................ + mls v17.4S, v5.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ - sub v17.4S, v18.4S, v14.4S // ..........................................................................................................*............. + sub v5.4S, v7.4S, v4.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ - cmge v18.4S, v31.4S, v19.4S // ............................................................................................................*........... + cmge v7.4S, v31.4S, v18.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - cmge v14.4S, v19.4S, v30.4S // .............................................................................................................*.......... + cmge v4.4S, v18.4S, v30.4S // .............................................................................................................*.......... 
// gap // ........................................................................................................................ - mls v29.4S, v17.4S, v8.4S // ...........................................................................................................*............ + mls v6.4S, v5.4S, v8.4S // ...........................................................................................................*............ // gap // ........................................................................................................................ - sub v17.4S, v18.4S, v14.4S // ..............................................................................................................*......... + sub v5.4S, v7.4S, v4.4S // ..............................................................................................................*......... // gap // ........................................................................................................................ - cmge v18.4S, v31.4S, v6.4S // ................................................................................................................*....... + cmge v7.4S, v31.4S, v14.4S // ................................................................................................................*....... // gap // ........................................................................................................................ - cmge v14.4S, v6.4S, v30.4S // .................................................................................................................*...... + cmge v4.4S, v14.4S, v30.4S // .................................................................................................................*...... // gap // ........................................................................................................................ 
- mls v19.4S, v17.4S, v8.4S // ...............................................................................................................*........ + mls v18.4S, v5.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ - sub v17.4S, v18.4S, v14.4S // ..................................................................................................................*..... + sub v5.4S, v7.4S, v4.4S // ..................................................................................................................*..... // gap // ........................................................................................................................ - str q20, [x0], #(16) // ....................................................................................................................*... + str q17, [x0], #(16) // ....................................................................................................................*... // gap // ........................................................................................................................ - ldr q18, [x0, #0] // e....................................................................................................................... + ldr q17, [x0, #0] // e....................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mls v6.4S, v17.4S, v8.4S // ...................................................................................................................*.... + mls v14.4S, v5.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ - str q29, [x0, #112] // .....................................................................................................................*.. + str q6, [x0, #112] // .....................................................................................................................*.. // gap // ........................................................................................................................ - ldr q14, [x0, #128] // .e...................................................................................................................... + ldr q5, [x0, #128] // .e...................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q19, [x0, #240] // ......................................................................................................................*. + str q18, [x0, #240] // ......................................................................................................................*. // gap // ........................................................................................................................ 
- ldr q11, [x0, #256] // ..e..................................................................................................................... + ldr q18, [x0, #256] // ..e..................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q28, [x0, #512] // ....e................................................................................................................... + ldr q4, [x0, #512] // ....e................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - ldr q29, [x0, #640] // .....e.................................................................................................................. + ldr q10, [x0, #640] // .....e.................................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - ldr q9, [x0, #768] // ......e................................................................................................................. + ldr q27, [x0, #768] // ......e................................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v13.4S, v28.4S, v29.4S // ...................e.................................................................................................... + add v21.4S, v4.4S, v10.4S // ...................e.................................................................................................... // gap // ........................................................................................................................ - ldr q4, [x0, #896] // .......e................................................................................................................ + ldr q11, [x0, #896] // .......e................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- ldr q20, [x0, #384] // ...e.................................................................................................................... + ldr q7, [x0, #384] // ...e.................................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v22.4S, v9.4S, v4.4S // ........................e............................................................................................... + add v23.4S, v27.4S, v11.4S // ........................e............................................................................................... // gap // ........................................................................................................................ - str q6, [x0, #368] // .......................................................................................................................* + str q14, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ - add v15.4S, v11.4S, v20.4S // ..............e......................................................................................................... + add v14.4S, v18.4S, v7.4S // ..............e......................................................................................................... // gap // ........................................................................................................................ 
- add v23.4S, v13.4S, v22.4S // .......................................e................................................................................ + add v15.4S, v21.4S, v23.4S // .......................................e................................................................................ // gap // ........................................................................................................................ - // original source code - // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. - // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... - // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ - // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. - // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... - // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... - // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... - // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... 
+ // ----------------------------------------------------------- new position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------- + // ldr q9, [x0, #0] // e...............'.......................................................................................................~............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............'..........................................................................................................~.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........'............................................................................................................~........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....'..................................................................................................................~.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........'.............................................................................................................~....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........'..............................................................................................................~...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......'...............................................................................................................~..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....'.................................................................................................................~... // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... 
- // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... - // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. - // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ - // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... - // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. - // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... - // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... - // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... 
- // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ - // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... - // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. - // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... - // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ - // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. - // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. - // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... - // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... - // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... - // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ 
- // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... - // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. - // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... - // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ - // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... 
- // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ - // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... - // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... - // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. - // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. - // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... 
- // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. - // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... - // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ - // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... - // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ - // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... - // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... - // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... - // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... 
- // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. - // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... - // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. - // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ - // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. - // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... - // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. - // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ - // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... - // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... - // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ 
- // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... - // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... - // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ - // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... - // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... - // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... - // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... - // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. - // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. 
- // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. - // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ - // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ - // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... - // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. - // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... - // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ - // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... - // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... - // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... - // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... 
- // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... - // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... - // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... - // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ - // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. - // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. - // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... - // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ - // str q13, [x0], #(16) // ................|......................................................................................................*.............. - // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... - // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... 
- // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* + // add v9.4s, v9.4s, v10.4s // ................'*.................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................'..*.................................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ................'...*................................................................................................................. + // mls v10.4s, v27.4s, v8.s[0] // ................'........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................'.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.'..................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ................'......*.............................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ................'.......*............................................................................................................. + // mls v12.4s, v27.4s, v8.s[0] // ................'..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................'.........*........................................................................................................... 
+ // add v13.4s, v13.4s, v14.4s // .........e......'................................................................................................................~.... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ................'...............*..................................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................'................*.................................................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ................'....................*................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................'.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...'...................................................................................................................~. + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ................'.....................*............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ................'.......................*............................................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................'....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................'.....*............................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................'..........*.......................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................'...........*......................................................................................................... + // mls v11.4s, v27.4s, v8.s[0] // ................'......................*.............................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................'..................*.................................................................................................. + // add v10.4s, v10.4s, v12.4s // ................'...................*................................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ................'........................*............................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ................'.........................*........................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ................'...............................*..................................................................................... + // sub v24.4s, v13.4s, v15.4s // ................'................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e'..................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................'...................................*................................................................................. 
+ // mul v15.4s, v24.4s, v1.s[0] // ................'....................................*................................................................................ + // mls v15.4s, v27.4s, v8.s[0] // ................'..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................'..................................*.................................................................................. + // add v14.4s, v14.4s, v16.4s // ................'.....................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................'......................................*.............................................................................. + // mul v16.4s, v24.4s, v1.s[0] // ................'.......................................*............................................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'...........................................*......................................................................... + // sub v24.4s, v9.4s, v13.4s // ................'............*........................................................................................................ + // add v9.4s, v9.4s, v13.4s // ................'.............*....................................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'..........................*.......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................'...........................*......................................................................................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ................'.................................*................................................................................... + // sub v24.4s, v10.4s, v14.4s // ................'........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................'.........................................*........................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'............................................*........................................................................ + // mul v14.4s, v24.4s, v0.s[0] // ................'.............................................*....................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ................'.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................'..............................................*...................................................................... + // add v11.4s, v11.4s, v15.4s // ................'...............................................*..................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'..................................................*.................................................................. + // mul v15.4s, v24.4s, v0.s[0] // ................'...................................................*................................................................. + // mls v15.4s, v27.4s, v8.s[0] // ................'.......................................................*............................................................. 
+ // sub v24.4s, v12.4s, v16.4s // ................'................................................*.................................................................... + // add v12.4s, v12.4s, v16.4s // ................'....................................................*................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................'.....................................................*............................................................... + // mul v16.4s, v24.4s, v0.s[0] // ................'......................................................*.............................................................. + // mls v16.4s, v27.4s, v8.s[0] // ................'...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................'........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................'.........................................................*........................................................... + // sub v28.4s, v27.4s, v28.4s // ................'............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................'...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................'.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................'..............................................................*...................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................'................................................................*.................................................... + // mls v14.4s, v28.4s, v8.4s // ................'...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................'.................................................................*................................................... + // cmge v28.4s, v15.4s, v30.4s // ................'..................................................................*.................................................. + // sub v28.4s, v27.4s, v28.4s // ................'....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................'.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................'.....................................................................*............................................... + // cmge v28.4s, v16.4s, v30.4s // ................'......................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ................'........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................'...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................'.........................................................................*........................................... 
+ // str q14, [x0, #(5*(1024/8))] // ................'............................................................................*........................................ + // str q15, [x0, #(6*(1024/8))] // ................'..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................'................................................................................*.................................... + // sqrdmulh v27.4s, v9.4s, v26.4s // ................'............................*........................................................................................ + // mul v9.4s, v9.4s, v25.4s // ................'.............................*....................................................................................... + // mls v9.4s, v27.4s, v8.s[0] // ................'..........................................................*.......................................................... + // sqrdmulh v27.4s, v10.4s, v26.4s // ................'..........................................................................*.......................................... + // mul v10.4s, v10.4s, v25.4s // ................'.............................................................................*....................................... + // mls v10.4s, v27.4s, v8.s[0] // ................'.................................................................................*................................... + // sqrdmulh v27.4s, v11.4s, v26.4s // ................'...............................................................................*..................................... + // mul v11.4s, v11.4s, v25.4s // ................'..................................................................................*.................................. 
+ // mls v11.4s, v27.4s, v8.s[0] // ................'......................................................................................*.............................. + // sqrdmulh v27.4s, v12.4s, v26.4s // ................'...................................................................................*................................. + // mul v12.4s, v12.4s, v25.4s // ................'....................................................................................*................................ + // mls v12.4s, v27.4s, v8.s[0] // ................'........................................................................................*............................ + // cmge v27.4s, v31.4s, v9.4s // ................'.....................................................................................*............................... + // cmge v28.4s, v9.4s, v30.4s // ................'.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................'.........................................................................................*........................... + // mls v9.4s, v28.4s, v8.4s // ................'............................................................................................*........................ + // cmge v27.4s, v31.4s, v10.4s // ................'..........................................................................................*.......................... + // cmge v28.4s, v10.4s, v30.4s // ................'...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................'.............................................................................................*....................... 
+ // mls v10.4s, v28.4s, v8.4s // ................'................................................................................................*.................... + // cmge v27.4s, v31.4s, v11.4s // ................'..............................................................................................*...................... + // cmge v28.4s, v11.4s, v30.4s // ................'...............................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ................'.................................................................................................*................... + // mls v11.4s, v28.4s, v8.4s // ................'....................................................................................................*................ + // cmge v27.4s, v31.4s, v12.4s // ................'..................................................................................................*.................. + // cmge v28.4s, v12.4s, v30.4s // ................'...................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // ................'.....................................................................................................*............... + // mls v12.4s, v28.4s, v8.4s // .~..............'........................................................................................................*............ + // str q9, [x0], #(16) // ................'......................................................................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // ..~.............'.........................................................................................................*........... 
+ // str q11, [x0, #(-16 + 2*(1024/8))] // ....~...........'...........................................................................................................*......... + // str q12, [x0, #(-16 + 3*(1024/8))] // .............~..'....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v20.4S, v11.4S, v20.4S // ..*......................................................................................................... + // Instructions: 108 + // Expected cycles: 110 + // Expected IPC: 0.98 + // + // Wall time: 13.27s + // User time: 13.27s + // + // -------------------------------------------- original position --------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------- + sub v6.4S, v17.4S, v5.4S // *........................................................................................................... // gap // ............................................................................................................ - sub v5.4S, v18.4S, v14.4S // *........................................................................................................... + add v20.4S, v17.4S, v5.4S // .*.......................................................................................................... // gap // ............................................................................................................ - sub v11.4S, v28.4S, v29.4S // ..........*................................................................................................. + sub v28.4S, v18.4S, v7.4S // ..*......................................................................................................... // gap // ............................................................................................................ 
- sqrdmulh v29.4S, v20.4S, v2.S[1] // ........*................................................................................................... + mul v9.4S, v6.4S, v1.S[2] // ....*....................................................................................................... // gap // ............................................................................................................ - mul v17.4S, v20.4S, v2.S[0] // .......*.................................................................................................... + sqrdmulh v29.4S, v6.4S, v1.S[3] // ...*........................................................................................................ // gap // ............................................................................................................ - sub v9.4S, v9.4S, v4.4S // ..................*......................................................................................... + sub v24.4S, v20.4S, v14.4S // .....*...................................................................................................... // gap // ............................................................................................................ - sqrdmulh v24.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + add v12.4S, v20.4S, v14.4S // ......*..................................................................................................... // gap // ............................................................................................................ - sqrdmulh v21.4S, v11.4S, v2.S[3] // .................*.......................................................................................... + sqrdmulh v20.4S, v28.4S, v2.S[1] // .......*.................................................................................................... 
// gap // ............................................................................................................ - mul v19.4S, v9.4S, v3.S[0] // ......................*..................................................................................... + mls v9.4S, v29.4S, v8.S[0] // .........*.................................................................................................. // gap // ............................................................................................................ - mul v27.4S, v5.4S, v1.S[2] // ...*........................................................................................................ + mul v28.4S, v28.4S, v2.S[0] // ........*................................................................................................... // gap // ............................................................................................................ - mul v28.4S, v11.4S, v2.S[2] // ................*........................................................................................... + sub v29.4S, v4.4S, v10.4S // ..........*................................................................................................. // gap // ............................................................................................................ - sqrdmulh v6.4S, v9.4S, v3.S[1] // ........................*................................................................................... + sqrdmulh v22.4S, v24.4S, v0.S[3] // ...........*................................................................................................ // gap // ............................................................................................................ - mls v17.4S, v29.4S, v8.S[0] // ...............*............................................................................................ 
+ mul v10.4S, v24.4S, v0.S[2] // ............*............................................................................................... // gap // ............................................................................................................ - mls v27.4S, v24.4S, v8.S[0] // .........*.................................................................................................. + sub v13.4S, v12.4S, v15.4S // .............*.............................................................................................. // gap // ............................................................................................................ - mls v28.4S, v21.4S, v8.S[0] // .....................*...................................................................................... + add v24.4S, v12.4S, v15.4S // ..............*............................................................................................. // gap // ............................................................................................................ - mls v19.4S, v6.4S, v8.S[0] // ...............................*............................................................................ + mls v28.4S, v20.4S, v8.S[0] // ...............*............................................................................................ // gap // ............................................................................................................ - add v16.4S, v18.4S, v14.4S // .*.......................................................................................................... + sqrdmulh v12.4S, v29.4S, v2.S[3] // ................*........................................................................................... // gap // ............................................................................................................ 
- sub v18.4S, v27.4S, v17.4S // ...................*........................................................................................ + mul v19.4S, v29.4S, v2.S[2] // .................*.......................................................................................... // gap // ............................................................................................................ - add v7.4S, v27.4S, v17.4S // ....................*....................................................................................... + sub v29.4S, v27.4S, v11.4S // ..................*......................................................................................... // gap // ............................................................................................................ - sub v17.4S, v28.4S, v19.4S // ...................................*........................................................................ + sub v20.4S, v9.4S, v28.4S // ...................*........................................................................................ // gap // ............................................................................................................ - sqrdmulh v14.4S, v18.4S, v0.S[3] // ..........................*................................................................................. + add v9.4S, v9.4S, v28.4S // ....................*....................................................................................... // gap // ............................................................................................................ - mul v4.4S, v18.4S, v0.S[2] // .........................*.................................................................................. + mls v19.4S, v12.4S, v8.S[0] // .....................*...................................................................................... 
// gap // ............................................................................................................ - sqrdmulh v18.4S, v17.4S, v1.S[1] // ........................................*................................................................... + sqrdmulh v28.4S, v29.4S, v3.S[1] // ......................*..................................................................................... // gap // ............................................................................................................ - mul v11.4S, v17.4S, v1.S[0] // .......................................*.................................................................... + mls v10.4S, v22.4S, v8.S[0] // .......................*.................................................................................... // gap // ............................................................................................................ - add v27.4S, v28.4S, v19.4S // ......................................*..................................................................... + mul v12.4S, v29.4S, v3.S[0] // ........................*................................................................................... // gap // ............................................................................................................ - sub v28.4S, v16.4S, v15.4S // .....*...................................................................................................... + sqrdmulh v29.4S, v20.4S, v0.S[3] // .........................*.................................................................................. // gap // ............................................................................................................ - mls v4.4S, v14.4S, v8.S[0] // ................................*........................................................................... 
+ mul v22.4S, v20.4S, v0.S[2] // ..........................*................................................................................. // gap // ............................................................................................................ - mls v11.4S, v18.4S, v8.S[0] // ............................................*............................................................... + sqrdmulh v20.4S, v13.4S, v0.S[1] // ...........................*................................................................................ // gap // ............................................................................................................ - mul v20.4S, v28.4S, v0.S[2] // ...........*................................................................................................ + mul v27.4S, v13.4S, v0.S[0] // ............................*............................................................................... // gap // ............................................................................................................ - add v19.4S, v7.4S, v27.4S // ..........................................*................................................................. + sqrdmulh v11.4S, v24.4S, v26.4S // .............................*.............................................................................. // gap // ............................................................................................................ - sqrdmulh v21.4S, v28.4S, v0.S[3] // ............*............................................................................................... + mul v15.4S, v24.4S, v25.4S // ..............................*............................................................................. // gap // ............................................................................................................ 
- add v17.4S, v4.4S, v11.4S // .....................................................*...................................................... + mls v12.4S, v28.4S, v8.S[0] // ...............................*............................................................................ // gap // ............................................................................................................ - mul v28.4S, v19.4S, v25.4S // ...........................................................................*................................ + mls v22.4S, v29.4S, v8.S[0] // ................................*........................................................................... // gap // ............................................................................................................ - sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... + sub v29.4S, v21.4S, v23.4S // .................................*.......................................................................... // gap // ............................................................................................................ - mul v9.4S, v17.4S, v25.4S // ....................................................................................*....................... + mls v27.4S, v20.4S, v8.S[0] // ..................................*......................................................................... // gap // ............................................................................................................ - sqrdmulh v17.4S, v17.4S, v26.4S // .....................................................................................*...................... + sub v20.4S, v19.4S, v12.4S // ...................................*........................................................................ 
// gap // ............................................................................................................ - mls v20.4S, v21.4S, v8.S[0] // .......................*.................................................................................... + sqrdmulh v13.4S, v29.4S, v1.S[1] // ....................................*....................................................................... // gap // ............................................................................................................ - sqrdmulh v18.4S, v14.4S, v1.S[1] // .....................................*...................................................................... + mul v24.4S, v29.4S, v1.S[0] // .....................................*...................................................................... // gap // ............................................................................................................ - mul v13.4S, v14.4S, v1.S[0] // ....................................*....................................................................... + add v12.4S, v19.4S, v12.4S // ......................................*..................................................................... // gap // ............................................................................................................ - mls v9.4S, v17.4S, v8.S[0] // .........................................................................................*.................. + sqrdmulh v29.4S, v20.4S, v1.S[1] // .......................................*.................................................................... // gap // ............................................................................................................ - add v21.4S, v16.4S, v15.4S // ......*..................................................................................................... 
+ mul v28.4S, v20.4S, v1.S[0] // ........................................*................................................................... // gap // ............................................................................................................ - sqrdmulh v10.4S, v19.4S, v26.4S // ..............................................................................*............................. + sub v20.4S, v9.4S, v12.4S // .........................................*.................................................................. // gap // ............................................................................................................ - mls v13.4S, v18.4S, v8.S[0] // ...........................................*................................................................ + add v9.4S, v9.4S, v12.4S // ..........................................*................................................................. // gap // ............................................................................................................ - cmge v17.4S, v9.4S, v30.4S // ....................................................................................................*....... + mls v24.4S, v13.4S, v8.S[0] // ...........................................*................................................................ // gap // ............................................................................................................ - cmge v19.4S, v31.4S, v9.4S // ...................................................................................................*........ + mls v28.4S, v29.4S, v8.S[0] // ............................................*............................................................... // gap // ............................................................................................................ 
- add v18.4S, v21.4S, v23.4S // ..............*............................................................................................. + sqrdmulh v29.4S, v20.4S, v0.S[1] // .............................................*.............................................................. // gap // ............................................................................................................ - sub v12.4S, v19.4S, v17.4S // ......................................................................................................*..... + mul v16.4S, v20.4S, v0.S[0] // ..............................................*............................................................. // gap // ............................................................................................................ - add v17.4S, v20.4S, v13.4S // ................................................*........................................................... + sub v12.4S, v10.4S, v24.4S // ...............................................*............................................................ // gap // ............................................................................................................ - sqrdmulh v16.4S, v18.4S, v26.4S // ..............................*............................................................................. + add v24.4S, v10.4S, v24.4S // ................................................*........................................................... // gap // ............................................................................................................ - mls v9.4S, v12.4S, v8.4S // ........................................................................................................*... + sub v20.4S, v22.4S, v28.4S // .................................................*.......................................................... 
// gap // ............................................................................................................ - mul v6.4S, v17.4S, v25.4S // ................................................................................*........................... + mls v16.4S, v29.4S, v8.S[0] // ..................................................*......................................................... // gap // ............................................................................................................ - sqrdmulh v14.4S, v17.4S, v26.4S // ...................................................................................*........................ + sqrdmulh v29.4S, v12.4S, v0.S[1] // ...................................................*........................................................ // gap // ............................................................................................................ - mls v28.4S, v10.4S, v8.S[0] // ..................................................................................*......................... + mul v19.4S, v12.4S, v0.S[0] // ....................................................*....................................................... // gap // ............................................................................................................ - mul v15.4S, v18.4S, v25.4S // .............................*.............................................................................. + add v28.4S, v22.4S, v28.4S // .....................................................*...................................................... // gap // ............................................................................................................ - sub v4.4S, v4.4S, v11.4S // .................................................*.......................................................... 
+ sqrdmulh v12.4S, v20.4S, v0.S[1] // ......................................................*..................................................... // gap // ............................................................................................................ - mls v6.4S, v14.4S, v8.S[0] // .......................................................................................*.................... + mul v22.4S, v20.4S, v0.S[0] // .......................................................*.................................................... // gap // ............................................................................................................ - cmge v11.4S, v31.4S, v28.4S // ...........................................................................................*................ + mls v19.4S, v29.4S, v8.S[0] // ........................................................*................................................... // gap // ............................................................................................................ - cmge v17.4S, v28.4S, v30.4S // ............................................................................................*............... + cmge v20.4S, v31.4S, v27.4S // .........................................................*.................................................. // gap // ............................................................................................................ - mls v15.4S, v16.4S, v8.S[0] // ...........................................................*................................................ + cmge v29.4S, v27.4S, v30.4S // ..........................................................*................................................. // gap // ............................................................................................................ 
- sub v18.4S, v11.4S, v17.4S // ..............................................................................................*............. + mls v15.4S, v11.4S, v8.S[0] // ...........................................................*................................................ // gap // ............................................................................................................ - cmge v17.4S, v6.4S, v30.4S // ................................................................................................*........... + mls v22.4S, v12.4S, v8.S[0] // ............................................................*............................................... // gap // ............................................................................................................ - cmge v19.4S, v31.4S, v6.4S // ...............................................................................................*............ + sub v12.4S, v20.4S, v29.4S // .............................................................*.............................................. // gap // ............................................................................................................ - mls v28.4S, v18.4S, v8.4S // .................................................................................................*.......... + cmge v20.4S, v31.4S, v16.4S // ..............................................................*............................................. // gap // ............................................................................................................ - cmge v18.4S, v31.4S, v15.4S // ......................................................................................*..................... + cmge v29.4S, v16.4S, v30.4S // ...............................................................*............................................ 
// gap // ............................................................................................................ - cmge v29.4S, v15.4S, v30.4S // ........................................................................................*................... + mls v27.4S, v12.4S, v8.4S // ................................................................*........................................... // gap // ............................................................................................................ - sub v14.4S, v20.4S, v13.4S // ...............................................*............................................................ + sub v12.4S, v20.4S, v29.4S // .................................................................*.......................................... // gap // ............................................................................................................ - sub v29.4S, v18.4S, v29.4S // ..........................................................................................*................. + cmge v20.4S, v31.4S, v19.4S // ..................................................................*......................................... // gap // ............................................................................................................ - sqrdmulh v18.4S, v4.4S, v0.S[1] // .......................................................*.................................................... + cmge v29.4S, v19.4S, v30.4S // ...................................................................*........................................ // gap // ............................................................................................................ - mul v11.4S, v4.4S, v0.S[0] // ......................................................*..................................................... 
+ mls v16.4S, v12.4S, v8.4S // ....................................................................*....................................... // gap // ............................................................................................................ - mls v15.4S, v29.4S, v8.4S // .............................................................................................*.............. + sub v12.4S, v20.4S, v29.4S // .....................................................................*...................................... + // gap // ............................................................................................................ + cmge v20.4S, v31.4S, v22.4S // ......................................................................*..................................... + // gap // ............................................................................................................ + cmge v29.4S, v22.4S, v30.4S // .......................................................................*.................................... + // gap // ............................................................................................................ + mls v19.4S, v12.4S, v8.4S // ........................................................................*................................... + // gap // ............................................................................................................ + sub v29.4S, v20.4S, v29.4S // .........................................................................*.................................. + // gap // ............................................................................................................ + str q27, [x0, #512] // ..........................................................................*................................. + // gap // ............................................................................................................ 
+ sqrdmulh v12.4S, v9.4S, v26.4S // ...........................................................................*................................ + // gap // ............................................................................................................ + mls v22.4S, v29.4S, v8.4S // ............................................................................*............................... + // gap // ............................................................................................................ + str q16, [x0, #640] // .............................................................................*.............................. // gap // ............................................................................................................ - sub v4.4S, v7.4S, v27.4S // .........................................*.................................................................. + mul v13.4S, v9.4S, v25.4S // ..............................................................................*............................. // gap // ............................................................................................................ - sqrdmulh v29.4S, v14.4S, v0.S[1] // ....................................................*....................................................... + str q19, [x0, #768] // ...............................................................................*............................ // gap // ............................................................................................................ - mls v11.4S, v18.4S, v8.S[0] // ............................................................*............................................... + sqrdmulh v29.4S, v24.4S, v26.4S // ................................................................................*........................... // gap // ............................................................................................................ 
- sqrdmulh v22.4S, v4.4S, v0.S[1] // ..............................................*............................................................. + str q22, [x0, #896] // .................................................................................*.......................... // gap // ............................................................................................................ - mul v20.4S, v4.4S, v0.S[0] // .............................................*.............................................................. + mls v13.4S, v12.4S, v8.S[0] // ..................................................................................*......................... // gap // ............................................................................................................ - mul v4.4S, v14.4S, v0.S[0] // ...................................................*........................................................ + mul v9.4S, v24.4S, v25.4S // ...................................................................................*........................ // gap // ............................................................................................................ - cmge v18.4S, v31.4S, v11.4S // ......................................................................*..................................... + sqrdmulh v12.4S, v28.4S, v26.4S // ....................................................................................*....................... // gap // ............................................................................................................ - cmge v14.4S, v11.4S, v30.4S // .......................................................................*.................................... + mul v24.4S, v28.4S, v25.4S // .....................................................................................*...................... 
// gap // ............................................................................................................ - mls v20.4S, v22.4S, v8.S[0] // ..................................................*......................................................... + cmge v20.4S, v31.4S, v15.4S // ......................................................................................*..................... // gap // ............................................................................................................ - sub v24.4S, v18.4S, v14.4S // .........................................................................*.................................. + mls v9.4S, v29.4S, v8.S[0] // .......................................................................................*.................... + // gap // ............................................................................................................ + cmge v29.4S, v15.4S, v30.4S // ........................................................................................*................... + // gap // ............................................................................................................ + mls v24.4S, v12.4S, v8.S[0] // .........................................................................................*.................. + // gap // ............................................................................................................ + sub v29.4S, v20.4S, v29.4S // ..........................................................................................*................. + // gap // ............................................................................................................ + cmge v28.4S, v31.4S, v13.4S // ...........................................................................................*................ + // gap // ............................................................................................................ 
+ cmge v12.4S, v13.4S, v30.4S // ............................................................................................*............... + // gap // ............................................................................................................ + mls v15.4S, v29.4S, v8.4S // .............................................................................................*.............. // gap // ............................................................................................................ - mls v4.4S, v29.4S, v8.S[0] // ........................................................*................................................... + cmge v20.4S, v31.4S, v9.4S // ...............................................................................................*............ // gap // ............................................................................................................ - sub v13.4S, v21.4S, v23.4S // .............*.............................................................................................. + cmge v29.4S, v9.4S, v30.4S // ................................................................................................*........... // gap // ............................................................................................................ - mls v11.4S, v24.4S, v8.4S // ............................................................................*............................... + sub v28.4S, v28.4S, v12.4S // ..............................................................................................*............. // gap // ............................................................................................................ - cmge v14.4S, v31.4S, v20.4S // ..............................................................*............................................. 
+ sub v12.4S, v20.4S, v29.4S // ..................................................................................................*......... // gap // ............................................................................................................ - cmge v22.4S, v31.4S, v4.4S // ..................................................................*......................................... + cmge v20.4S, v31.4S, v24.4S // ...................................................................................................*........ // gap // ............................................................................................................ - cmge v18.4S, v20.4S, v30.4S // ...............................................................*............................................ + cmge v29.4S, v24.4S, v30.4S // ....................................................................................................*....... // gap // ............................................................................................................ - str q11, [x0, #896] // .................................................................................*.......................... + mls v13.4S, v28.4S, v8.4S // .................................................................................................*.......... // gap // ............................................................................................................ - sub v11.4S, v14.4S, v18.4S // .................................................................*.......................................... + sub v29.4S, v20.4S, v29.4S // ......................................................................................................*..... // gap // ............................................................................................................ - cmge v18.4S, v4.4S, v30.4S // ...................................................................*........................................ 
+ mls v9.4S, v12.4S, v8.4S // .....................................................................................................*...... // gap // ............................................................................................................ str q15, [x0], #(16) // .......................................................................................................*.... // gap // ............................................................................................................ - mul v14.4S, v13.4S, v0.S[0] // ...........................*................................................................................ - // gap // ............................................................................................................ - sqrdmulh v27.4S, v13.4S, v0.S[1] // ............................*............................................................................... - // gap // ............................................................................................................ - sub v18.4S, v22.4S, v18.4S // .....................................................................*...................................... - // gap // ............................................................................................................ - mls v20.4S, v11.4S, v8.4S // ....................................................................*....................................... - // gap // ............................................................................................................ - str q28, [x0, #112] // .........................................................................................................*.. - // gap // ............................................................................................................ - mls v14.4S, v27.4S, v8.S[0] // ..................................*......................................................................... 
- // gap // ............................................................................................................ - mls v4.4S, v18.4S, v8.4S // ........................................................................*................................... - // gap // ............................................................................................................ - str q20, [x0, #624] // .............................................................................*.............................. - // gap // ............................................................................................................ - sub v17.4S, v19.4S, v17.4S // ..................................................................................................*......... - // gap // ............................................................................................................ - cmge v19.4S, v14.4S, v30.4S // ..........................................................*................................................. - // gap // ............................................................................................................ - cmge v18.4S, v31.4S, v14.4S // .........................................................*.................................................. - // gap // ............................................................................................................ - str q9, [x0, #368] // ...........................................................................................................* - // gap // ............................................................................................................ - sub v19.4S, v18.4S, v19.4S // .............................................................*.............................................. - // gap // ............................................................................................................ 
- mls v6.4S, v17.4S, v8.4S // .....................................................................................................*...... - // gap // ............................................................................................................ - str q4, [x0, #752] // ...............................................................................*............................ - // gap // ............................................................................................................ - mls v14.4S, v19.4S, v8.4S // ................................................................*........................................... - // gap // ............................................................................................................ - // gap // ............................................................................................................ - // gap // ............................................................................................................ - str q6, [x0, #240] // ..........................................................................................................*. - // gap // ............................................................................................................ - // gap // ............................................................................................................ - // gap // ............................................................................................................ - str q14, [x0, #496] // ..........................................................................*................................. - // gap // ............................................................................................................ - - // original source code - // sub v17.4S, v18.4S, v14.4S // .*.......................................................................................................... 
- // add v19.4S, v18.4S, v14.4S // ................*........................................................................................... - // sub v18.4S, v11.4S, v20.4S // *........................................................................................................... - // mul v6.4S, v17.4S, v1.S[2] // .........*.................................................................................................. - // sqrdmulh v17.4S, v17.4S, v1.S[3] // ......*..................................................................................................... - // sub v14.4S, v19.4S, v15.4S // .........................*.................................................................................. - // add v19.4S, v19.4S, v15.4S // ........................................*................................................................... - // mul v11.4S, v18.4S, v2.S[0] // ....*....................................................................................................... - // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...*........................................................................................................ - // mls v6.4S, v17.4S, v8.S[0] // .............*.............................................................................................. - // sub v17.4S, v28.4S, v29.4S // ..*......................................................................................................... - // mul v28.4S, v14.4S, v0.S[2] // ............................*............................................................................... - // sqrdmulh v14.4S, v14.4S, v0.S[3] // ..............................*............................................................................. - // sub v29.4S, v19.4S, v23.4S // .................................................................................*.......................... 
- // add v19.4S, v19.4S, v23.4S // .............................................*.............................................................. - // mls v11.4S, v18.4S, v8.S[0] // ............*............................................................................................... - // mul v18.4S, v17.4S, v2.S[2] // ..........*................................................................................................. - // sqrdmulh v17.4S, v17.4S, v2.S[3] // .......*.................................................................................................... - // sub v9.4S, v9.4S, v4.4S // .....*...................................................................................................... - // sub v4.4S, v6.4S, v11.4S // .................*.......................................................................................... - // add v6.4S, v6.4S, v11.4S // ..................*......................................................................................... - // mls v18.4S, v17.4S, v8.S[0] // ..............*............................................................................................. - // mul v17.4S, v9.4S, v3.S[0] // ........*................................................................................................... - // mls v28.4S, v14.4S, v8.S[0] // ....................................*....................................................................... - // sqrdmulh v14.4S, v9.4S, v3.S[1] // ...........*................................................................................................ - // mul v11.4S, v4.4S, v0.S[2] // .....................*...................................................................................... - // sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................*....................................................................................... 
- // mul v4.4S, v29.4S, v0.S[0] // ..........................................................................................*................. - // sqrdmulh v29.4S, v29.4S, v0.S[1] // ...........................................................................................*................ - // mul v20.4S, v19.4S, v25.4S // .....................................................*...................................................... - // sqrdmulh v19.4S, v19.4S, v26.4S // ................................................*........................................................... - // mls v17.4S, v14.4S, v8.S[0] // ...............*............................................................................................ - // mls v11.4S, v9.4S, v8.S[0] // ..........................*................................................................................. - // sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... - // mls v4.4S, v29.4S, v8.S[0] // ...............................................................................................*............ - // sub v29.4S, v18.4S, v17.4S // ...................*........................................................................................ - // mul v9.4S, v14.4S, v1.S[0] // ......................................*..................................................................... - // sqrdmulh v14.4S, v14.4S, v1.S[1] // .....................................*...................................................................... - // add v17.4S, v18.4S, v17.4S // ........................*................................................................................... - // mul v18.4S, v29.4S, v1.S[0] // .......................*.................................................................................... 
- // sqrdmulh v29.4S, v29.4S, v1.S[1] // ......................*..................................................................................... - // sub v13.4S, v6.4S, v17.4S // ......................................................................*..................................... - // add v17.4S, v6.4S, v17.4S // .............................*.............................................................................. - // mls v9.4S, v14.4S, v8.S[0] // ..........................................*................................................................. - // mls v18.4S, v29.4S, v8.S[0] // ...........................*................................................................................ - // mul v6.4S, v13.4S, v0.S[0] // ..........................................................................*................................. - // sqrdmulh v14.4S, v13.4S, v0.S[1] // .........................................................................*.................................. - // sub v29.4S, v28.4S, v9.4S // .................................................................*.......................................... - // add v28.4S, v28.4S, v9.4S // ...............................................*............................................................ - // sub v9.4S, v11.4S, v18.4S // ......................................................*..................................................... - // mls v6.4S, v14.4S, v8.S[0] // ..............................................................................*............................. - // mul v14.4S, v29.4S, v0.S[0] // ...........................................................................*................................ - // sqrdmulh v29.4S, v29.4S, v0.S[1] // .......................................................................*.................................... 
- // add v18.4S, v11.4S, v18.4S // ...............................*............................................................................ - // mul v11.4S, v9.4S, v0.S[0] // ....................................................................*....................................... - // sqrdmulh v9.4S, v9.4S, v0.S[1] // ...................................................................*........................................ - // mls v14.4S, v29.4S, v8.S[0] // ................................................................................*........................... - // cmge v29.4S, v31.4S, v4.4S // ....................................................................................................*....... - // cmge v13.4S, v4.4S, v30.4S // ...................................................................................................*........ - // mls v20.4S, v19.4S, v8.S[0] // ..........................................................*................................................. - // mls v11.4S, v9.4S, v8.S[0] // ........................................................................*................................... - // sub v19.4S, v29.4S, v13.4S // ......................................................................................................*..... - // cmge v29.4S, v31.4S, v6.4S // ...................................................................................*........................ - // cmge v9.4S, v6.4S, v30.4S // .....................................................................................*...................... - // mls v4.4S, v19.4S, v8.4S // .........................................................................................................*.. - // sub v19.4S, v29.4S, v9.4S // .......................................................................................*.................... 
- // cmge v29.4S, v31.4S, v14.4S // ....................................................................................*....................... - // cmge v9.4S, v14.4S, v30.4S // ........................................................................................*................... - // mls v6.4S, v19.4S, v8.4S // .............................................................................................*.............. - // sub v19.4S, v29.4S, v9.4S // ............................................................................................*............... - // cmge v29.4S, v31.4S, v11.4S // ............................................................................*............................... - // cmge v9.4S, v11.4S, v30.4S // .............................................................................*.............................. - // mls v14.4S, v19.4S, v8.4S // ................................................................................................*........... - // sub v19.4S, v29.4S, v9.4S // ...............................................................................*............................ - // str q4, [x0, #512] // ...........................................................................................................* - // mul v29.4S, v17.4S, v25.4S // ................................*........................................................................... - // mls v11.4S, v19.4S, v8.4S // ..................................................................................*......................... - // str q6, [x0, #640] // .................................................................................................*.......... - // sqrdmulh v17.4S, v17.4S, v26.4S // .........................................*.................................................................. - // str q14, [x0, #768] // ........................................................................................................*... 
- // mul v19.4S, v28.4S, v25.4S // ..................................................*......................................................... - // str q11, [x0, #896] // ......................................................................................*..................... - // mls v29.4S, v17.4S, v8.S[0] // ....................................................*....................................................... - // sqrdmulh v17.4S, v28.4S, v26.4S // ...................................................*........................................................ - // mul v6.4S, v18.4S, v25.4S // ..................................*......................................................................... - // sqrdmulh v18.4S, v18.4S, v26.4S // ...................................*........................................................................ - // cmge v14.4S, v31.4S, v20.4S // ...............................................................*............................................ - // mls v19.4S, v17.4S, v8.S[0] // .......................................................*.................................................... - // cmge v17.4S, v20.4S, v30.4S // ................................................................*........................................... - // mls v6.4S, v18.4S, v8.S[0] // .......................................*.................................................................... - // sub v17.4S, v14.4S, v17.4S // ..................................................................*......................................... - // cmge v18.4S, v31.4S, v29.4S // ........................................................*................................................... - // cmge v14.4S, v29.4S, v30.4S // .........................................................*.................................................. 
- // mls v20.4S, v17.4S, v8.4S // .....................................................................*...................................... - // sub v17.4S, v18.4S, v14.4S // ...........................................................*................................................ - // cmge v18.4S, v31.4S, v19.4S // .............................................................*.............................................. - // cmge v14.4S, v19.4S, v30.4S // ............................................................*............................................... - // mls v29.4S, v17.4S, v8.4S // ..............................................................*............................................. - // sub v17.4S, v18.4S, v14.4S // ..................................................................................................*......... - // cmge v18.4S, v31.4S, v6.4S // ............................................*............................................................... - // cmge v14.4S, v6.4S, v30.4S // ...........................................*................................................................ - // mls v19.4S, v17.4S, v8.4S // .......................................................................................................*.... - // sub v17.4S, v18.4S, v14.4S // ..............................................*............................................................. - // str q20, [x0], #(16) // .........................................................................................*.................. - // mls v6.4S, v17.4S, v8.4S // .................................................*.......................................................... - // str q29, [x0, #112] // ..............................................................................................*............. - // str q19, [x0, #240] // ..........................................................................................................*. 
- // str q6, [x0, #368] // .....................................................................................................*...... + mls v24.4S, v29.4S, v8.4S // ........................................................................................................*... + // gap // ............................................................................................................ + str q13, [x0, #112] // .........................................................................................................*.. + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q9, [x0, #240] // ..........................................................................................................*. + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q24, [x0, #368] // ...........................................................................................................* + // gap // ............................................................................................................ 
+ + // ---------------------------------------------- new position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------- + // sub v6.4S, v17.4S, v5.4S // *........................................................................................................... + // add v17.4S, v17.4S, v5.4S // .*.......................................................................................................... + // sub v18.4S, v18.4S, v7.4S // ..*......................................................................................................... + // sqrdmulh v5.4S, v6.4S, v1.S[3] // ....*....................................................................................................... + // mul v6.4S, v6.4S, v1.S[2] // ...*........................................................................................................ + // sub v7.4S, v17.4S, v14.4S // .....*...................................................................................................... + // add v17.4S, v17.4S, v14.4S // ......*..................................................................................................... + // sqrdmulh v14.4S, v18.4S, v2.S[1] // .......*.................................................................................................... + // mul v18.4S, v18.4S, v2.S[0] // .........*.................................................................................................. + // mls v6.4S, v5.4S, v8.S[0] // ........*................................................................................................... + // sub v5.4S, v4.4S, v10.4S // ..........*................................................................................................. + // sqrdmulh v4.4S, v7.4S, v0.S[3] // ...........*................................................................................................ 
+ // mul v7.4S, v7.4S, v0.S[2] // ............*............................................................................................... + // sub v10.4S, v17.4S, v15.4S // .............*.............................................................................................. + // add v17.4S, v17.4S, v15.4S // ..............*............................................................................................. + // mls v18.4S, v14.4S, v8.S[0] // ...............*............................................................................................ + // sqrdmulh v14.4S, v5.4S, v2.S[3] // ................*........................................................................................... + // mul v5.4S, v5.4S, v2.S[2] // .................*.......................................................................................... + // sub v27.4S, v27.4S, v11.4S // ..................*......................................................................................... + // sub v11.4S, v6.4S, v18.4S // ...................*........................................................................................ + // add v6.4S, v6.4S, v18.4S // ....................*....................................................................................... + // mls v5.4S, v14.4S, v8.S[0] // .....................*...................................................................................... + // sqrdmulh v18.4S, v27.4S, v3.S[1] // ......................*..................................................................................... + // mls v7.4S, v4.4S, v8.S[0] // .......................*.................................................................................... + // mul v14.4S, v27.4S, v3.S[0] // ........................*................................................................................... 
+ // sqrdmulh v4.4S, v11.4S, v0.S[3] // .........................*.................................................................................. + // mul v27.4S, v11.4S, v0.S[2] // ..........................*................................................................................. + // sqrdmulh v11.4S, v10.4S, v0.S[1] // ...........................*................................................................................ + // mul v10.4S, v10.4S, v0.S[0] // ............................*............................................................................... + // sqrdmulh v15.4S, v17.4S, v26.4S // .............................*.............................................................................. + // mul v17.4S, v17.4S, v25.4S // ..............................*............................................................................. + // mls v14.4S, v18.4S, v8.S[0] // ...............................*............................................................................ + // mls v27.4S, v4.4S, v8.S[0] // ................................*........................................................................... + // sub v18.4S, v21.4S, v23.4S // .................................*.......................................................................... + // mls v10.4S, v11.4S, v8.S[0] // ..................................*......................................................................... + // sub v4.4S, v5.4S, v14.4S // ...................................*........................................................................ + // sqrdmulh v11.4S, v18.4S, v1.S[1] // ....................................*....................................................................... + // mul v18.4S, v18.4S, v1.S[0] // .....................................*...................................................................... 
+ // add v5.4S, v5.4S, v14.4S // ......................................*..................................................................... + // sqrdmulh v14.4S, v4.4S, v1.S[1] // .......................................*.................................................................... + // mul v4.4S, v4.4S, v1.S[0] // ........................................*................................................................... + // sub v21.4S, v6.4S, v5.4S // .........................................*.................................................................. + // add v6.4S, v6.4S, v5.4S // ..........................................*................................................................. + // mls v18.4S, v11.4S, v8.S[0] // ...........................................*................................................................ + // mls v4.4S, v14.4S, v8.S[0] // ............................................*............................................................... + // sqrdmulh v5.4S, v21.4S, v0.S[1] // .............................................*.............................................................. + // mul v14.4S, v21.4S, v0.S[0] // ..............................................*............................................................. + // sub v11.4S, v7.4S, v18.4S // ...............................................*............................................................ + // add v18.4S, v7.4S, v18.4S // ................................................*........................................................... + // sub v7.4S, v27.4S, v4.4S // .................................................*.......................................................... + // mls v14.4S, v5.4S, v8.S[0] // ..................................................*......................................................... 
+ // sqrdmulh v5.4S, v11.4S, v0.S[1] // ...................................................*........................................................ + // mul v11.4S, v11.4S, v0.S[0] // ....................................................*....................................................... + // add v4.4S, v27.4S, v4.4S // .....................................................*...................................................... + // sqrdmulh v27.4S, v7.4S, v0.S[1] // ......................................................*..................................................... + // mul v7.4S, v7.4S, v0.S[0] // .......................................................*.................................................... + // mls v11.4S, v5.4S, v8.S[0] // ........................................................*................................................... + // cmge v5.4S, v31.4S, v10.4S // .........................................................*.................................................. + // cmge v21.4S, v10.4S, v30.4S // ..........................................................*................................................. + // mls v17.4S, v15.4S, v8.S[0] // ...........................................................*................................................ + // mls v7.4S, v27.4S, v8.S[0] // ............................................................*............................................... + // sub v5.4S, v5.4S, v21.4S // .............................................................*.............................................. + // cmge v27.4S, v31.4S, v14.4S // ..............................................................*............................................. + // cmge v15.4S, v14.4S, v30.4S // ...............................................................*............................................ 
+ // mls v10.4S, v5.4S, v8.4S // ................................................................*........................................... + // sub v5.4S, v27.4S, v15.4S // .................................................................*.......................................... + // cmge v27.4S, v31.4S, v11.4S // ..................................................................*......................................... + // cmge v15.4S, v11.4S, v30.4S // ...................................................................*........................................ + // mls v14.4S, v5.4S, v8.4S // ....................................................................*....................................... + // sub v5.4S, v27.4S, v15.4S // .....................................................................*...................................... + // cmge v27.4S, v31.4S, v7.4S // ......................................................................*..................................... + // cmge v15.4S, v7.4S, v30.4S // .......................................................................*.................................... + // mls v11.4S, v5.4S, v8.4S // ........................................................................*................................... + // sub v5.4S, v27.4S, v15.4S // .........................................................................*.................................. + // str q10, [x0, #512] // ..........................................................................*................................. + // sqrdmulh v10.4S, v6.4S, v26.4S // ...........................................................................*................................ + // mls v7.4S, v5.4S, v8.4S // ............................................................................*............................... + // str q14, [x0, #640] // .............................................................................*.............................. 
+ // mul v6.4S, v6.4S, v25.4S // ..............................................................................*............................. + // str q11, [x0, #768] // ...............................................................................*............................ + // sqrdmulh v5.4S, v18.4S, v26.4S // ................................................................................*........................... + // str q7, [x0, #896] // .................................................................................*.......................... + // mls v6.4S, v10.4S, v8.S[0] // ..................................................................................*......................... + // mul v18.4S, v18.4S, v25.4S // ...................................................................................*........................ + // sqrdmulh v7.4S, v4.4S, v26.4S // ....................................................................................*....................... + // mul v14.4S, v4.4S, v25.4S // .....................................................................................*...................... + // cmge v4.4S, v31.4S, v17.4S // ......................................................................................*..................... + // mls v18.4S, v5.4S, v8.S[0] // .......................................................................................*.................... + // cmge v5.4S, v17.4S, v30.4S // ........................................................................................*................... + // mls v14.4S, v7.4S, v8.S[0] // .........................................................................................*.................. + // sub v5.4S, v4.4S, v5.4S // ..........................................................................................*................. 
+ // cmge v7.4S, v31.4S, v6.4S // ...........................................................................................*................ + // cmge v4.4S, v6.4S, v30.4S // ............................................................................................*............... + // mls v17.4S, v5.4S, v8.4S // .............................................................................................*.............. + // sub v5.4S, v7.4S, v4.4S // ................................................................................................*........... + // cmge v7.4S, v31.4S, v18.4S // ..............................................................................................*............. + // cmge v4.4S, v18.4S, v30.4S // ...............................................................................................*............ + // mls v6.4S, v5.4S, v8.4S // ....................................................................................................*....... + // sub v5.4S, v7.4S, v4.4S // .................................................................................................*.......... + // cmge v7.4S, v31.4S, v14.4S // ..................................................................................................*......... + // cmge v4.4S, v14.4S, v30.4S // ...................................................................................................*........ + // mls v18.4S, v5.4S, v8.4S // ......................................................................................................*..... + // sub v5.4S, v7.4S, v4.4S // .....................................................................................................*...... + // str q17, [x0], #(16) // .......................................................................................................*.... + // mls v14.4S, v5.4S, v8.4S // ........................................................................................................*... 
+ // str q6, [x0, #112] // .........................................................................................................*.. + // str q18, [x0, #240] // ..........................................................................................................*. + // str q14, [x0, #368] // ...........................................................................................................* pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s index d3b6904f..77190edd 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, 
a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm 
.macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -187,7 +173,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -198,7 +184,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -208,7 +194,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -216,7 +202,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -227,24 +213,30 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold 
+.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,1120 +379,1156 @@ _intt_dilithium_123_45678_opt_a72: qform_root3_tw .req q7 .p2align 2 - ldr q20, [x5, #16] // ..........*................................................................................................................................. - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // ..............................*............................................................................................................. - ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x1] // .......*.................................................................................................................................... - ldr q1, [x5, #96] // ..*......................................................................................................................................... - ldr q4, [x5, #112] // .......................................*.................................................................................................... - // gap // ............................................................................................................................................ - ldr q7, [x5, #48] // ........*................................................................................................................................... - ldr q5, [x5, #64] // .*.......................................................................................................................................... 
- // gap // ............................................................................................................................................ - ldr q6, [x5, #32] // ....*....................................................................................................................................... - ldr q30, [x5, #144] // .....*...................................................................................................................................... - // gap // ............................................................................................................................................ - sub v10.4S, v23.4S, v24.4S // ...........*................................................................................................................................ - sub v2.4S, v14.4S, v15.4S // ..................................*......................................................................................................... - ldr q25, [x5, #128] // ......*..................................................................................................................................... - add v29.4S, v16.4S, v17.4S // ....................................*....................................................................................................... - add v3.4S, v14.4S, v15.4S // .....................................*...................................................................................................... - ldr q0, [x5, #160] // ......................................*..................................................................................................... - sub v13.4S, v16.4S, v17.4S // ..........................................*................................................................................................. 
- add v14.4S, v23.4S, v24.4S // ............*............................................................................................................................... - ldr q18, [x5, #176] // ...*........................................................................................................................................ - sqrdmulh v24.4S, v2.4S, v30.4S // ...........................................*................................................................................................ - add v28.4S, v21.4S, v22.4S // .............*.............................................................................................................................. - ldr q15, [x5], #(12*16) // .....................*...................................................................................................................... - sub v9.4S, v3.4S, v29.4S // ............................................*............................................................................................... - ldr q16, [x5, #-112] // .........*.................................................................................................................................. - // gap // ............................................................................................................................................ - sub v22.4S, v21.4S, v22.4S // ..............*............................................................................................................................. - mul v19.4S, v2.4S, v25.4S // ........................................*................................................................................................... - // gap // ............................................................................................................................................ 
- sub v12.4S, v28.4S, v14.4S // ..................*......................................................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v2.4S, v13.4S, v18.4S // ...............................................*............................................................................................ - add v18.4S, v28.4S, v14.4S // .................*.......................................................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v14.4S, v9.4S, v4.4S // .................................................*.......................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v13.4S, v13.4S, v0.4S // ..............................................................*............................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v19.4S, v24.4S, v8.S[0] // ...................................................*........................................................................................ - add v24.4S, v3.4S, v29.4S // ..............................................*............................................................................................. - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v13.4S, v2.4S, v8.S[0] // ................................................................*........................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v31.4S, v9.4S, v1.4S // ......................................................*..................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v31.4S, v14.4S, v8.S[0] // ..........................................................*................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sub v26.4S, v19.4S, v13.4S // .......................................................................*.................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v3.4S, v22.4S, v7.4S // ...................*........................................................................................................................ - add v11.4S, v19.4S, v13.4S // ...........................................................................*................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v19.4S, v26.4S, v4.4S // ..........................................................................*................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - trn1 v14.4S, v24.4S, v11.4S // ..............................................................................*............................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - trn2 v7.4S, v24.4S, v11.4S // ...................................................................................*........................................................ - sqrdmulh v11.4S, v12.4S, v20.4S // ........................*................................................................................................................... - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v4.4S, v26.4S, v1.4S // ...............................................................................*............................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v4.4S, v19.4S, v8.S[0] // ................................................................................*........................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v29.4S, v10.4S, v5.4S // ...............*............................................................................................................................ - ldr q0, [x4, #32] // .........................................*.................................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v26.4S, v12.4S, v15.4S // ..........................*................................................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v30.4S, v10.4S, v16.4S // ................*........................................................................................................................... - trn1 v17.4S, v31.4S, v4.4S // ......................................................................................*..................................................... - // gap // ............................................................................................................................................ - trn2 v23.4S, v31.4S, v4.4S // ........................................................................................*................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v31.4S, v22.4S, v6.4S // ....................*....................................................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- trn2 v12.2D, v14.2D, v17.2D // ..........................................................................................*................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v31.4S, v3.4S, v8.S[0] // .......................*.................................................................................................................... - trn2 v3.2D, v7.2D, v23.2D // ............................................................................................*............................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v29.4S, v30.4S, v8.S[0] // ......................*..................................................................................................................... - ldr q30, [x4, #48] // *........................................................................................................................................... - // gap // ............................................................................................................................................ 
- sub v25.4S, v12.4S, v3.4S // .................................................................................................*.......................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v26.4S, v11.4S, v8.S[0] // ...............................*............................................................................................................ - trn1 v11.2D, v7.2D, v23.2D // ...............................................................................................*............................................ - // gap // ............................................................................................................................................ - trn1 v23.2D, v14.2D, v17.2D // .............................................................................................*.............................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - add v5.4S, v12.4S, v3.4S // ..................................................................................................*......................................... - sqrdmulh v4.4S, v25.4S, v30.S[1] // ..........................................................................................................*................................. - // gap // ............................................................................................................................................ 
- sub v3.4S, v31.4S, v29.4S // .........................*.................................................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v22.4S, v25.4S, v30.S[0] // ......................................................................................................*..................................... - add v16.4S, v23.4S, v11.4S // .....................................................................................................*...................................... - // gap // ............................................................................................................................................ - add v13.4S, v31.4S, v29.4S // ...........................*................................................................................................................ - ldr q31, [x4, #16] // .....................................................*...................................................................................... - // gap // ............................................................................................................................................ - sub v24.4S, v23.4S, v11.4S // ....................................................................................................*....................................... - mul v2.4S, v3.4S, v15.4S // .................................*.......................................................................................................... - // gap // ............................................................................................................................................ 
- sub v1.4S, v16.4S, v5.4S // .........................................................................................................*.................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - add v16.4S, v16.4S, v5.4S // ........................................................................................................*................................... - sqrdmulh v21.4S, v3.4S, v20.4S // ............................*............................................................................................................... - // gap // ............................................................................................................................................ - trn2 v3.4S, v18.4S, v13.4S // .............................*.............................................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - trn1 v18.4S, v18.4S, v13.4S // ................................*........................................................................................................... - mls v22.4S, v4.4S, v8.S[0] // ...............................................................................................................*............................ - // gap // ............................................................................................................................................ 
- srshr v29.4S, v16.4S, #23 // ...........................................................................................................*................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v25.4S, v24.4S, v0.S[3] // .......................................................................................................*.................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v2.4S, v21.4S, v8.S[0] // ...................................*........................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v14.4S, v24.4S, v0.S[2] // ............................................................................................................*............................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v14.4S, v25.4S, v8.S[0] // .............................................................................................................*.............................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- trn1 v23.4S, v26.4S, v2.4S // .............................................*.............................................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - trn2 v30.4S, v26.4S, v2.4S // ................................................*........................................................................................... - sqrdmulh v12.4S, v1.4S, v31.S[1] // ..............................................................................................................*............................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v9.4S, v1.4S, v31.S[0] // ................................................................................................................*........................... - trn1 v11.2D, v18.2D, v23.2D // ..................................................*......................................................................................... - // gap // ............................................................................................................................................ 
- sub v4.4S, v14.4S, v22.4S // ..................................................................................................................*......................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v16.4S, v29.4S, v8.4S // .................................................................................................................*.......................... - add v22.4S, v14.4S, v22.4S // ....................................................................................................................*....................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v6.4S, v4.4S, v31.S[0] // .....................................................................................................................*...................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- srshr v2.4S, v22.4S, #23 // .........................................................................................................................*.................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v9.4S, v12.4S, v8.S[0] // ...................................................................................................................*........................ - trn1 v29.2D, v3.2D, v30.2D // ....................................................*....................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v12.4S, v4.4S, v31.S[1] // .......................................................................................................................*.................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- sub v19.4S, v11.4S, v29.4S // ........................................................*................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - trn2 v20.2D, v18.2D, v23.2D // .........................................................*.................................................................................. - mls v22.4S, v2.4S, v8.4S // ..............................................................................................................................*............. - // gap // ............................................................................................................................................ - trn2 v23.2D, v3.2D, v30.2D // .......................................................*.................................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v10.4S, v19.4S, v31.S[2] // .....................................................................*...................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- add v2.4S, v11.4S, v29.4S // ...........................................................*................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v19.4S, v19.4S, v31.S[3] // ............................................................*............................................................................... - add v27.4S, v20.4S, v23.4S // .............................................................*.............................................................................. - // gap // ............................................................................................................................................ - sub v15.4S, v20.4S, v23.4S // ...............................................................*............................................................................ - ldr q20, [x4], #64 // ..................................................................*......................................................................... - // gap // ............................................................................................................................................ - mls v6.4S, v12.4S, v8.S[0] // ...............................................................................................................................*............ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- sub v28.4S, v2.4S, v27.4S // .................................................................*.......................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - add v26.4S, v2.4S, v27.4S // ...................................................................*........................................................................ - sqrdmulh v2.4S, v15.4S, v0.S[1] // ....................................................................*....................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v0.4S, v15.4S, v0.S[0] // ........................................................................*................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- srshr v13.4S, v26.4S, #23 // ......................................................................*..................................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v23.4S, v28.4S, v20.S[3] // ............................................................................*............................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v0.4S, v2.4S, v8.S[0] // .........................................................................*.................................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v10.4S, v19.4S, v8.S[0] // .............................................................................*.............................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v26.4S, v13.4S, v8.4S // ..................................................................................*......................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v11.4S, v28.4S, v20.S[2] // .....................................................................................*...................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sub v27.4S, v10.4S, v0.4S // .................................................................................*.......................................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v11.4S, v23.4S, v8.S[0] // .......................................................................................*.................................................... - add v14.4S, v10.4S, v0.4S // ....................................................................................*....................................................... - // gap // ............................................................................................................................................ 
- sub v2.4S, v26.4S, v16.4S // ......................................................................................................................*..................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - add v0.4S, v26.4S, v16.4S // ........................................................................................................................*................... - sqrdmulh v4.4S, v27.4S, v20.S[3] // ...........................................................................................*................................................ - // gap // ............................................................................................................................................ - srshr v23.4S, v14.4S, #23 // .........................................................................................*.................................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mul v7.4S, v27.4S, v20.S[2] // ..............................................................................................*............................................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- str q0, [x1], #(16*4) // ............................................................................................................................*............... - sub v19.4S, v11.4S, v9.4S // ...........................................................................................................................*................ - // gap // ............................................................................................................................................ - add v9.4S, v11.4S, v9.4S // .............................................................................................................................*.............. - mls v14.4S, v23.4S, v8.4S // ................................................................................................*........................................... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v16.4S, v2.4S, v20.S[1] // ..........................................................................................................................*................. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v7.4S, v4.4S, v8.S[0] // ...................................................................................................*........................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sub v15.4S, v14.4S, v22.4S // .................................................................................................................................*.......... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - add v30.4S, v14.4S, v22.4S // ..................................................................................................................................*......... - mul v2.4S, v2.4S, v20.S[0] // ................................................................................................................................*........... - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - mls v2.4S, v16.4S, v8.S[0] // ...................................................................................................................................*........ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - str q30, [x1, #-48] // .......................................................................................................................................*.... - sub v31.4S, v7.4S, v6.4S // ....................................................................................................................................*....... - // gap // ............................................................................................................................................ - add v4.4S, v7.4S, v6.4S // .....................................................................................................................................*...... - mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................*..... - // gap // ............................................................................................................................................ 
- // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - str q4, [x1, #-16] // ..........................................................................................................................................*. - // gap // ............................................................................................................................................ - // gap // ............................................................................................................................................ - str q2, [x2], #(16*4) // .........................................................................................................................................*.. - sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* - // gap // ............................................................................................................................................ 
- - // original source code - // ldr q21, [x4, #48] // .......................................................*.................................................................................... - // ldr q24, [x5, #64] // ......*..................................................................................................................................... - // ldr q3, [x5, #96] // ...*........................................................................................................................................ - // ldr q10, [x5, #176] // .................*.......................................................................................................................... - // ldr q7, [x5, #32] // .......*.................................................................................................................................... - // ldr q12, [x5, #144] // ........*................................................................................................................................... - // ldr q22, [x5, #128] // ...........*................................................................................................................................ - // ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // ..*......................................................................................................................................... - // ldr q29, [x5, #48] // .....*...................................................................................................................................... - // ldr q20, [x5, #80] // ......................*..................................................................................................................... - // ldr q18, [x5, #16] // *........................................................................................................................................... 
- // sub v9.4S, v27.4S, v28.4S // .........*.................................................................................................................................. - // add v27.4S, v27.4S, v28.4S // ................*........................................................................................................................... - // add v19.4S, v25.4S, v26.4S // ...................*........................................................................................................................ - // sub v26.4S, v25.4S, v26.4S // .......................*.................................................................................................................... - // mul v6.4S, v9.4S, v24.4S // ............................................*............................................................................................... - // sqrdmulh v5.4S, v9.4S, v20.4S // ...............................................*............................................................................................ - // add v31.4S, v19.4S, v27.4S // ...........................*................................................................................................................ - // sub v25.4S, v19.4S, v27.4S // .........................*.................................................................................................................. - // sqrdmulh v15.4S, v26.4S, v29.4S // ....................................*....................................................................................................... - // mul v2.4S, v26.4S, v7.4S // ..................................................*......................................................................................... - // ldr q14, [x5], #(12*16) // ....................*....................................................................................................................... 
- // mls v6.4S, v5.4S, v8.S[0] // ......................................................*..................................................................................... - // mls v2.4S, v15.4S, v8.S[0] // ....................................................*....................................................................................... - // sqrdmulh v15.4S, v25.4S, v18.4S // .........................................*.................................................................................................. - // sub v16.4S, v2.4S, v6.4S // ..............................................................*............................................................................. - // mul v13.4S, v25.4S, v14.4S // ..............................................*............................................................................................. - // add v28.4S, v2.4S, v6.4S // .................................................................*.......................................................................... - // sqrdmulh v11.4S, v16.4S, v18.4S // .......................................................................*.................................................................... - // trn2 v23.4S, v31.4S, v28.4S // ........................................................................*................................................................... - // ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .*.......................................................................................................................................... - // mls v13.4S, v15.4S, v8.S[0] // .........................................................*.................................................................................. - // trn1 v17.4S, v31.4S, v28.4S // .........................................................................*.................................................................. 
- // mul v0.4S, v16.4S, v14.4S // ....................................................................*....................................................................... - // sub v26.4S, v4.4S, v5.4S // ..........*................................................................................................................................. - // mls v0.4S, v11.4S, v8.S[0] // .............................................................................*.............................................................. - // add v31.4S, v6.4S, v7.4S // ............*............................................................................................................................... - // add v24.4S, v4.4S, v5.4S // .............*.............................................................................................................................. - // ldr q11, [x5, #-32] // ..............*............................................................................................................................. - // ldr q18, [x5, #-80] // ....*....................................................................................................................................... - // mul v9.4S, v26.4S, v22.4S // ........................*................................................................................................................... - // ldr q5, [x4, #32] // .............................................*.............................................................................................. - // sub v16.4S, v6.4S, v7.4S // ...............*............................................................................................................................ - // sqrdmulh v30.4S, v26.4S, v12.4S // ..................*......................................................................................................................... 
- // sub v15.4S, v24.4S, v31.4S // .....................*...................................................................................................................... - // trn1 v26.4S, v13.4S, v0.4S // ................................................................................*........................................................... - // add v24.4S, v24.4S, v31.4S // ...............................*............................................................................................................ - // sqrdmulh v27.4S, v16.4S, v10.4S // ..........................*................................................................................................................. - // trn2 v6.4S, v13.4S, v0.4S // .................................................................................*.......................................................... - // sqrdmulh v25.4S, v15.4S, v18.4S // ............................*............................................................................................................... - // trn1 v19.2D, v17.2D, v26.2D // ....................................................................................*....................................................... - // mls v9.4S, v30.4S, v8.S[0] // ..............................*............................................................................................................. - // trn1 v28.2D, v23.2D, v6.2D // ...........................................................................................*................................................ - // ldr q7, [x4, #16] // ..................................................................*......................................................................... - // mul v0.4S, v15.4S, v3.4S // .................................*.......................................................................................................... 
- // trn2 v15.2D, v23.2D, v6.2D // ................................................................................................*........................................... - // sub v4.4S, v19.4S, v28.4S // .............................................................................................*.............................................. - // trn2 v23.2D, v17.2D, v26.2D // ..............................................................................................*............................................. - // mls v0.4S, v25.4S, v8.S[0] // ..................................*......................................................................................................... - // add v13.4S, v19.4S, v28.4S // ..................................................................................................*......................................... - // sqrdmulh v26.4S, v4.4S, v7.S[3] // ...................................................................................................*........................................ - // add v1.4S, v23.4S, v15.4S // ....................................................................................................*....................................... - // mul v31.4S, v16.4S, v11.4S // .............................*.............................................................................................................. - // sub v29.4S, v23.4S, v15.4S // .....................................................................................................*...................................... - // mls v31.4S, v27.4S, v8.S[0] // ................................*........................................................................................................... - // sub v27.4S, v13.4S, v1.4S // ........................................................................................................*................................... 
- // ldr q20, [x4], #64 // ......................................................................................................*..................................... - // add v25.4S, v13.4S, v1.4S // .........................................................................................................*.................................. - // sqrdmulh v6.4S, v29.4S, v5.S[1] // ..........................................................................................................*................................. - // mul v28.4S, v4.4S, v7.S[2] // .................................................................................................*.......................................... - // srshr v15.4S, v25.4S, #23 // ............................................................................................................*............................... - // sub v17.4S, v9.4S, v31.4S // ...................................*........................................................................................................ - // mul v13.4S, v29.4S, v5.S[0] // ...........................................................................................................*................................ - // mls v13.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................. - // sqrdmulh v16.4S, v17.4S, v18.4S // ......................................*..................................................................................................... - // add v10.4S, v9.4S, v31.4S // .....................................*...................................................................................................... - // sqrdmulh v23.4S, v27.4S, v20.S[3] // .............................................................................................................*.............................. 
- // mls v28.4S, v26.4S, v8.S[0] // ...............................................................................................................*............................ - // trn1 v30.4S, v24.4S, v10.4S // .......................................*.................................................................................................... - // mul v12.4S, v17.4S, v3.4S // ..........................................*................................................................................................. - // mls v12.4S, v16.4S, v8.S[0] // ...........................................*................................................................................................ - // sub v22.4S, v28.4S, v13.4S // ..................................................................................................................*......................... - // mls v25.4S, v15.4S, v8.4S // ................................................................................................................*........................... - // trn2 v15.4S, v24.4S, v10.4S // ........................................*................................................................................................... - // add v31.4S, v28.4S, v13.4S // ....................................................................................................................*....................... - // mul v1.4S, v27.4S, v20.S[2] // .................................................................................................................*.......................... - // trn1 v2.4S, v0.4S, v12.4S // ................................................*........................................................................................... - // mls v1.4S, v23.4S, v8.S[0] // ...................................................................................................................*........................ 
- // trn2 v10.4S, v0.4S, v12.4S // .................................................*.......................................................................................... - // srshr v28.4S, v31.4S, #23 // ........................................................................................................................*................... - // trn2 v11.2D, v30.2D, v2.2D // ...................................................*........................................................................................ - // sqrdmulh v19.4S, v22.4S, v20.S[3] // .......................................................................................................................*.................... - // trn2 v12.2D, v15.2D, v10.2D // .....................................................*...................................................................................... - // trn1 v14.2D, v30.2D, v2.2D // ...........................................................*................................................................................ - // mul v6.4S, v22.4S, v20.S[2] // .........................................................................................................................*.................. - // trn1 v2.2D, v15.2D, v10.2D // ..........................................................*................................................................................. - // mls v31.4S, v28.4S, v8.4S // .............................................................................................................................*.............. - // sub v13.4S, v11.4S, v12.4S // ........................................................*................................................................................... - // add v23.4S, v11.4S, v12.4S // ............................................................*............................................................................... 
- // mls v6.4S, v19.4S, v8.S[0] // ...............................................................................................................................*............ - // sub v17.4S, v14.4S, v2.4S // ...................................................................*........................................................................ - // add v26.4S, v14.4S, v2.4S // ................................................................*........................................................................... - // mul v24.4S, v13.4S, v21.S[0] // ...............................................................*............................................................................ - // sqrdmulh v19.4S, v17.4S, v5.S[3] // ............................................................................*............................................................... - // add v16.4S, v26.4S, v23.4S // ......................................................................*..................................................................... - // sub v18.4S, v26.4S, v23.4S // .....................................................................*...................................................................... - // sqrdmulh v23.4S, v13.4S, v21.S[1] // .............................................................*.............................................................................. - // srshr v27.4S, v16.4S, #23 // ...........................................................................*................................................................ - // mul v17.4S, v17.4S, v5.S[2] // ..............................................................................*............................................................. - // mls v17.4S, v19.4S, v8.S[0] // ...............................................................................*............................................................ 
- // sqrdmulh v2.4S, v18.4S, v7.S[1] // ..................................................................................*......................................................... - // mls v24.4S, v23.4S, v8.S[0] // ..........................................................................*................................................................. - // mul v10.4S, v18.4S, v7.S[0] // ...................................................................................*........................................................ - // mls v16.4S, v27.4S, v8.4S // ......................................................................................*..................................................... - // sub v19.4S, v17.4S, v24.4S // .....................................................................................*...................................................... - // mls v10.4S, v2.4S, v8.S[0] // ..........................................................................................*................................................. - // add v2.4S, v17.4S, v24.4S // .......................................................................................*.................................................... - // mul v14.4S, v19.4S, v7.S[0] // ........................................................................................*................................................... - // sub v12.4S, v25.4S, v16.4S // .....................................................................................................................*...................... - // sqrdmulh v24.4S, v19.4S, v7.S[1] // ............................................................................................*............................................... - // add v9.4S, v25.4S, v16.4S // ......................................................................................................................*..................... 
- // srshr v16.4S, v2.4S, #23 // .........................................................................................*.................................................. - // sqrdmulh v0.4S, v12.4S, v20.S[1] // ..............................................................................................................................*............. - // sub v19.4S, v1.4S, v10.4S // ...........................................................................................................................*................ - // str q9, [x1], #(16*4) // ..........................................................................................................................*................. - // add v9.4S, v1.4S, v10.4S // ............................................................................................................................*............... - // mls v2.4S, v16.4S, v8.4S // ...............................................................................................*............................................ - // mls v14.4S, v24.4S, v8.S[0] // .......................................................................................................*.................................... - // mul v13.4S, v12.4S, v20.S[0] // ..................................................................................................................................*......... - // sub v15.4S, v31.4S, v2.4S // ................................................................................................................................*........... - // add v2.4S, v31.4S, v2.4S // .................................................................................................................................*.......... - // mls v13.4S, v0.4S, v8.S[0] // ...................................................................................................................................*........ 
- // sub v31.4S, v6.4S, v14.4S // .....................................................................................................................................*...... - // add v27.4S, v6.4S, v14.4S // ......................................................................................................................................*..... - // mul v23.4S, v19.4S, v20.S[0] // .......................................................................................................................................*.... - // str q2, [x1, #-48] // ....................................................................................................................................*....... - // sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... - // str q13, [x2], #(16*4) // ..........................................................................................................................................*. - // str q27, [x1, #-16] // .........................................................................................................................................*.. 
- // sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* + // Instructions: 145 + // Expected cycles: 128 + // Expected IPC: 1.13 + // + // Wall time: 1182.06s + // User time: 1182.06s + // + // -------------------------------------------------------------- original position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // gap // ................................................................................................................................................. + ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x2] // ..................*.............................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x1] // .*............................................................................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + ldr q10, [x5, #160] // ......................................................*.......................................................................................... + add v15.4S, v27.4S, v28.4S // ........................*........................................................................................................................ + sub v27.4S, v27.4S, v28.4S // ................................................*................................................................................................ + // gap // ................................................................................................................................................. + add v23.4S, v3.4S, v4.4S // .........*....................................................................................................................................... + // gap // ................................................................................................................................................. + ldr q29, [x5, #144] // ......................................*.......................................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ sub v7.4S, v25.4S, v26.4S // .....................*........................................................................................................................... + mul v0.4S, v27.4S, v10.4S // ...........................................................*..................................................................................... + add v31.4S, v25.4S, v26.4S // .........................*....................................................................................................................... + ldr q22, [x5, #128] // ........*........................................................................................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v19.4S, v7.4S, v29.4S // ...........................................*..................................................................................................... + ldr q28, [x5, #176] // ................................*................................................................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v30.4S, v7.4S, v22.4S // ..........................*...................................................................................................................... + ldr q5, [x5, #96] // .............................*................................................................................................................... + sub v29.4S, v3.4S, v4.4S // .......*......................................................................................................................................... + // gap // ................................................................................................................................................. + sqrdmulh v13.4S, v27.4S, v28.4S // .....................................................*........................................................................................... + // gap // ................................................................................................................................................. + sub v22.4S, v1.4S, v2.4S // .....*........................................................................................................................................... + ldr q11, [x5, #80] // ....*............................................................................................................................................ 
+ sub v14.4S, v31.4S, v15.4S // ..........................................*...................................................................................................... + // gap // ................................................................................................................................................. + add v24.4S, v31.4S, v15.4S // ............................................*.................................................................................................... + mls v30.4S, v19.4S, v8.S[0] // ..................................................*.............................................................................................. + ldr q4, [x5], #(12*16) // ......*.......................................................................................................................................... + ldr q18, [x5, #-160] // ..*.............................................................................................................................................. + add v3.4S, v1.4S, v2.4S // ..........*...................................................................................................................................... + // gap // ................................................................................................................................................. + ldr q16, [x5, #-144] // ...*............................................................................................................................................. + mul v15.4S, v14.4S, v5.4S // ...............................................*................................................................................................. + // gap // ................................................................................................................................................. 
+ ldr q9, [x5, #-176] // ............*.................................................................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v0.4S, v13.4S, v8.S[0] // .............................................................*................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sub v19.4S, v3.4S, v23.4S // ..............*.................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v27.4S, v22.4S, v16.4S // ...........*..................................................................................................................................... + // gap // ................................................................................................................................................. + ldr q28, [x5, #-80] // .....................................*........................................................................................................... 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v25.4S, v19.4S, v9.4S // ...........................*..................................................................................................................... + sub v13.4S, v30.4S, v0.4S // .................................................................*............................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v20.4S, v30.4S, v0.4S // ..................................................................*.............................................................................. + sqrdmulh v14.4S, v14.4S, v28.4S // .............................................*................................................................................................... + ldr q30, [x5, #-128] // ................*................................................................................................................................ 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v12.4S, v13.4S, v28.4S // .....................................................................*........................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v16.4S, v13.4S, v5.4S // ..........................................................................*...................................................................... 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v15.4S, v14.4S, v8.S[0] // .........................................................*....................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v16.4S, v12.4S, v8.S[0] // ............................................................................*.................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + trn1 v6.4S, v24.4S, v20.4S // ......................................................................................*.......................................................... + // gap // ................................................................................................................................................. + mul v26.4S, v22.4S, v18.4S // .............*................................................................................................................................... + trn2 v0.4S, v24.4S, v20.4S // ........................................................................*........................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v7.4S, v29.4S, v11.4S // .................*............................................................................................................................... 
+ trn2 v13.4S, v15.4S, v16.4S // ................................................................................*................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v20.4S, v29.4S, v30.4S // ....................*............................................................................................................................ + // gap // ................................................................................................................................................. + trn1 v22.4S, v15.4S, v16.4S // .................................................................................*............................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v26.4S, v27.4S, v8.S[0] // ...................*............................................................................................................................. + // gap // ................................................................................................................................................. + trn1 v27.2D, v0.2D, v13.2D // ...........................................................................................*..................................................... 
+ ldr q30, [x4, #32] // ........................................*........................................................................................................ + // gap // ................................................................................................................................................. + trn1 v18.2D, v6.2D, v22.2D // ..........................................................................................*...................................................... + trn2 v10.2D, v0.2D, v13.2D // .....................................................................................*........................................................... + mls v20.4S, v7.4S, v8.S[0] // .......................*......................................................................................................................... + // gap // ................................................................................................................................................. + trn2 v15.2D, v6.2D, v22.2D // .........................................................................................*....................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v16.4S, v19.4S, v4.4S // ..............................*.................................................................................................................. + // gap // ................................................................................................................................................. 
+ add v2.4S, v18.4S, v27.4S // ......................................................................................................*.......................................... + sub v21.4S, v18.4S, v27.4S // ................................................................................................*................................................ + ldr q18, [x4, #16] // ......................*.......................................................................................................................... + // gap // ................................................................................................................................................. + mls v16.4S, v25.4S, v8.S[0] // ...................................*............................................................................................................. + add v17.4S, v15.4S, v10.4S // ...................................................................................................*............................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sub v19.4S, v26.4S, v20.4S // ............................*.................................................................................................................... + add v7.4S, v3.4S, v23.4S // ...............*................................................................................................................................. 
+ mul v0.4S, v21.4S, v30.S[2] // .......................................................................................................*......................................... + // gap // ................................................................................................................................................. + sub v1.4S, v2.4S, v17.4S // ..........................................................................................................*...................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v11.4S, v26.4S, v20.4S // ...............................*................................................................................................................. + sqrdmulh v29.4S, v19.4S, v9.4S // .................................*............................................................................................................... + // gap // ................................................................................................................................................. + ldr q3, [x4, #48] // .............................................................................................*................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ sub v26.4S, v15.4S, v10.4S // ..............................................................................................*.................................................. + // gap // ................................................................................................................................................. + sqrdmulh v20.4S, v1.4S, v18.S[1] // .................................................................................................................*............................... + trn1 v12.4S, v7.4S, v11.4S // ..................................*.............................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + trn2 v27.4S, v7.4S, v11.4S // ....................................*............................................................................................................ + mul v9.4S, v19.4S, v4.4S // .......................................*......................................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ mls v9.4S, v29.4S, v8.S[0] // .........................................*....................................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v22.4S, v21.4S, v30.S[3] // .....................................................................................................*........................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v6.4S, v26.4S, v3.S[1] // ..................................................................................................*.............................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + trn1 v4.4S, v16.4S, v9.4S // ..............................................*.................................................................................................. + add v28.4S, v2.4S, v17.4S // .........................................................................................................*....................................... + mul v29.4S, v26.4S, v3.S[0] // .................................................................................................*............................................... + // gap // ................................................................................................................................................. + trn2 v17.4S, v16.4S, v9.4S // .................................................*............................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ mls v0.4S, v22.4S, v8.S[0] // ...........................................................................................................*..................................... + trn1 v24.2D, v12.2D, v4.2D // ...................................................*............................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + srshr v9.4S, v28.4S, #23 // ............................................................................................................*.................................... + mls v29.4S, v6.4S, v8.S[0] // ........................................................................................................*........................................ + // gap // ................................................................................................................................................. + trn1 v10.2D, v27.2D, v17.2D // ........................................................*........................................................................................ + trn2 v11.2D, v12.2D, v4.2D // ....................................................*............................................................................................ + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + trn2 v12.2D, v27.2D, v17.2D // .......................................................*......................................................................................... + mls v28.4S, v9.4S, v8.4S // ..............................................................................................................*.................................. + // gap // ................................................................................................................................................. + sub v26.4S, v24.4S, v10.4S // ...............................................................*................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v21.4S, v24.4S, v10.4S // ....................................................................*............................................................................ + mul v24.4S, v1.4S, v18.S[0] // .............................................................................................................*................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ add v14.4S, v11.4S, v12.4S // ......................................................................*.......................................................................... + sub v13.4S, v11.4S, v12.4S // ..........................................................*...................................................................................... + // gap // ................................................................................................................................................. + mul v11.4S, v26.4S, v18.S[2] // .......................................................................*......................................................................... + ldr q10, [x4], #64 // *................................................................................................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v5.4S, v26.4S, v18.S[3] // ...................................................................*............................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ sub v15.4S, v21.4S, v14.4S // .........................................................................*....................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v31.4S, v13.4S, v30.S[0] // ..............................................................*.................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sub v1.4S, v0.4S, v29.4S // ...............................................................................................................*................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v25.4S, v15.4S, v10.S[2] // ....................................................................................*............................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + add v3.4S, v0.4S, v29.4S // ................................................................................................................*................................ + // gap // ................................................................................................................................................. + sqrdmulh v29.4S, v1.4S, v18.S[1] // ..................................................................................................................*.............................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v14.4S, v21.4S, v14.4S // ...........................................................................*..................................................................... + sqrdmulh v26.4S, v13.4S, v30.S[1] // ............................................................*.................................................................................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v11.4S, v5.4S, v8.S[0] // ..............................................................................*.................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + srshr v16.4S, v14.4S, #23 // .............................................................................*................................................................... + sqrdmulh v7.4S, v15.4S, v10.S[3] // ..................................................................................*.............................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v14.4S, v16.4S, v8.4S // ...............................................................................*................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v31.4S, v26.4S, v8.S[0] // ................................................................*................................................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + mls v25.4S, v7.4S, v8.S[0] // ........................................................................................*........................................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v26.4S, v1.4S, v18.S[0] // ....................................................................................................................*............................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ sub v19.4S, v11.4S, v31.4S // ...................................................................................*............................................................. + mls v26.4S, v29.4S, v8.S[0] // ........................................................................................................................*........................ + add v29.4S, v11.4S, v31.4S // ....................................................................................................*............................................ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + srshr v30.4S, v3.4S, #23 // ...................................................................................................................*............................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v11.4S, v19.4S, v10.S[3] // .......................................................................................*......................................................... + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + srshr v4.4S, v29.4S, #23 // ..................................................................................................................................*.............. + mls v3.4S, v30.4S, v8.4S // ..........................................................................................................................*...................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v29.4S, v4.4S, v8.4S // .....................................................................................................................................*........... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + add v17.4S, v14.4S, v28.4S // .......................................................................................................................*......................... + mls v24.4S, v20.4S, v8.S[0] // ......................................................................................................................*.......................... + sub v20.4S, v14.4S, v28.4S // .....................................................................................................................*........................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v28.4S, v19.4S, v10.S[2] // ............................................................................................*.................................................... + str q17, [x1], #(16*4) // .........................................................................................................................*....................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mls v28.4S, v11.4S, v8.S[0] // ...............................................................................................*................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v19.4S, v25.4S, v24.4S // ............................................................................................................................*.................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v23.4S, v20.4S, v10.S[0] // ................................................................................................................................*................ + sub v30.4S, v29.4S, v3.4S // ........................................................................................................................................*........ 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v5.4S, v29.4S, v3.4S // .........................................................................................................................................*....... + sqrdmulh v20.4S, v20.4S, v10.S[1] // .............................................................................................................................*................... + str q19, [x1, #-32] // .................................................................................................................................*............... + sub v3.4S, v25.4S, v24.4S // ...........................................................................................................................*..................... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v9.4S, v30.4S, v10.S[0] // ...............................................................................................................................................*. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v13.4S, v3.4S, v10.S[0] // ..........................................................................................................................................*...... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + add v21.4S, v28.4S, v26.4S // ...............................................................................................................................*................. + mls v23.4S, v20.4S, v8.S[0] // ......................................................................................................................................*.......... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sub v27.4S, v28.4S, v26.4S // ..............................................................................................................................*.................. + sqrdmulh v31.4S, v3.4S, v10.S[1] // ...................................................................................................................................*............. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + sqrdmulh v2.4S, v30.4S, v10.S[1] // ............................................................................................................................................*.... + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + str q23, [x2], #(16*4) // ...........................................................................................................................................*..... 
+ // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + mul v3.4S, v27.4S, v10.S[0] // ................................................................................................................................................* + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + str q21, [x1, #-16] // ....................................................................................................................................*............ + // gap // ................................................................................................................................................. + // gap // ................................................................................................................................................. + str q5, [x1, #-48] // .............................................................................................................................................*... + sqrdmulh v27.4S, v27.4S, v10.S[1] // .......................................................................................................................................*......... + add x1, x1, #64 // ..............................................................................................................................................*.. 
+ + // ----------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q12, [x4], #64 // ..............................................................................................*.................................................. + // ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x1] // .*............................................................................................................................................... + // ldr q25, [x5, #32] // .......................*......................................................................................................................... + // ldr q30, [x5, #48] // .........................*....................................................................................................................... + // ldr q27, [x5, #80] // ..................*.............................................................................................................................. + // sub v14.4S, v4.4S, v5.4S // .................*............................................................................................................................... + // ldr q28, [x5], #(12*16) // ......................*.......................................................................................................................... + // sub v26.4S, v6.4S, v7.4S // ...............*................................................................................................................................. + // ldr q17, [x5, #-64] // ..........*...................................................................................................................................... 
+ // add v16.4S, v6.4S, v7.4S // .....*........................................................................................................................................... + // add v15.4S, v4.4S, v5.4S // ........................*........................................................................................................................ + // sqrdmulh v6.4S, v14.4S, v30.4S // ..............................*.................................................................................................................. + // ldr q19, [x5, #-176] // ...........................*..................................................................................................................... + // mul v29.4S, v14.4S, v25.4S // ..........................................*...................................................................................................... + // sub v23.4S, v15.4S, v16.4S // .............................*................................................................................................................... + // add v13.4S, v15.4S, v16.4S // ..............................................................*.................................................................................. + // ldr q31, [x5, #-128] // ....................................*............................................................................................................ + // sqrdmulh v20.4S, v26.4S, v27.4S // ............................................*.................................................................................................... + // ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x2] // *................................................................................................................................................ + // mls v29.4S, v6.4S, v8.S[0] // ................................................*................................................................................................ 
+ // mul v21.4S, v26.4S, v31.4S // ..............................................*.................................................................................................. + // sub v18.4S, v0.4S, v1.4S // .......*......................................................................................................................................... + // ldr q22, [x4, #-48] // ..........................................................*...................................................................................... + // mls v21.4S, v20.4S, v8.S[0] // .....................................................*........................................................................................... + // add v16.4S, v2.4S, v3.4S // ...*............................................................................................................................................. + // add v7.4S, v0.4S, v1.4S // .........*....................................................................................................................................... + // mul v15.4S, v18.4S, v17.4S // .............*................................................................................................................................... + // sqrdmulh v24.4S, v23.4S, v19.4S // ................................*................................................................................................................ + // sub v0.4S, v29.4S, v21.4S // .............................................................*................................................................................... + // ldr q17, [x5, #-96] // ..............*.................................................................................................................................. + // mul v10.4S, v23.4S, v28.4S // .......................................................*......................................................................................... 
+ // add v1.4S, v29.4S, v21.4S // .................................................................*............................................................................... + // ldr q23, [x5, #-16] // ............*.................................................................................................................................... + // sqrdmulh v21.4S, v0.4S, v19.4S // ..................................................................*.............................................................................. + // trn1 v31.4S, v13.4S, v1.4S // ......................................................................*.......................................................................... + // mls v10.4S, v24.4S, v8.S[0] // ...........................................................*..................................................................................... + // trn2 v29.4S, v13.4S, v1.4S // .......................................................................*......................................................................... + // ldr q9, [x5, #-80] // ...............................*................................................................................................................. + // ldr q24, [x5, #-48] // ......*.......................................................................................................................................... + // mul v6.4S, v0.4S, v28.4S // ........................................................................*........................................................................ + // ldr q1, [x4, #-32] // ..................................................*.............................................................................................. + // mls v6.4S, v21.4S, v8.S[0] // .........................................................................*....................................................................... 
+ // sub v20.4S, v7.4S, v16.4S // ...................*............................................................................................................................. + // sqrdmulh v11.4S, v18.4S, v24.4S // ...........*..................................................................................................................................... + // add v18.4S, v7.4S, v16.4S // ....................*............................................................................................................................ + // sqrdmulh v16.4S, v20.4S, v9.4S // ...................................*............................................................................................................. + // trn1 v0.4S, v10.4S, v6.4S // ............................................................................*.................................................................... + // mul v4.4S, v20.4S, v17.4S // ..........................*...................................................................................................................... + // sub v20.4S, v2.4S, v3.4S // ....*............................................................................................................................................ + // trn2 v19.4S, v10.4S, v6.4S // ...............................................................................*................................................................. + // mls v15.4S, v11.4S, v8.S[0] // .....................*........................................................................................................................... + // trn1 v14.2D, v31.2D, v0.2D // .................................................................................*............................................................... + // trn2 v0.2D, v31.2D, v0.2D // .....................................................................................*........................................................... 
+ // sqrdmulh v24.4S, v20.4S, v23.4S // ................*................................................................................................................................ + // ldr q10, [x5, #-32] // ..*.............................................................................................................................................. + // trn2 v13.2D, v29.2D, v19.2D // ......................................................................................*.......................................................... + // trn1 v6.2D, v29.2D, v19.2D // ....................................................................................*............................................................ + // mls v4.4S, v16.4S, v8.S[0] // .......................................*......................................................................................................... + // sub v29.4S, v0.4S, v13.4S // ............................................................................................*.................................................... + // mul v20.4S, v20.4S, v10.4S // ........*........................................................................................................................................ + // sqrdmulh v21.4S, v29.4S, v1.S[1] // .......................................................................................................*......................................... + // mls v20.4S, v24.4S, v8.S[0] // ............................*.................................................................................................................... + // mul v24.4S, v29.4S, v1.S[0] // .................................................................................................*............................................... + // sub v29.4S, v14.4S, v6.4S // ........................................................................................*........................................................ 
+ // mls v24.4S, v21.4S, v8.S[0] // ............................................................................................................*.................................... + // sub v2.4S, v15.4S, v20.4S // .................................*............................................................................................................... + // add v10.4S, v15.4S, v20.4S // ..................................*.............................................................................................................. + // sqrdmulh v3.4S, v29.4S, v22.S[3] // ...............................................................................................*................................................. + // add v19.4S, v14.4S, v6.4S // .........................................................................................*....................................................... + // sqrdmulh v21.4S, v2.4S, v9.4S // .....................................*........................................................................................................... + // add v20.4S, v0.4S, v13.4S // ...........................................................................................*..................................................... + // mul v26.4S, v29.4S, v22.S[2] // .............................................................................................*................................................... + // trn2 v29.4S, v18.4S, v10.4S // ...........................................*..................................................................................................... + // sub v5.4S, v19.4S, v20.4S // ................................................................................................*................................................ + // mul v14.4S, v2.4S, v17.4S // ......................................*.......................................................................................................... 
+ // add v19.4S, v19.4S, v20.4S // ......................................................................................................*.......................................... + // mls v14.4S, v21.4S, v8.S[0] // ........................................*........................................................................................................ + // srshr v20.4S, v19.4S, #23 // .........................................................................................................*....................................... + // mls v26.4S, v3.4S, v8.S[0] // ........................................................................................................*........................................ + // mls v19.4S, v20.4S, v8.4S // ...........................................................................................................*..................................... + // trn2 v3.4S, v4.4S, v14.4S // .............................................*................................................................................................... + // trn1 v25.4S, v4.4S, v14.4S // ...............................................*................................................................................................. + // sqrdmulh v11.4S, v5.4S, v12.S[3] // ..........................................................................................................*...................................... + // sub v16.4S, v26.4S, v24.4S // ...............................................................................................................*................................. + // mul v23.4S, v5.4S, v12.S[2] // ...................................................................................................*............................................. + // trn2 v27.2D, v29.2D, v3.2D // ....................................................*............................................................................................ 
+ // trn1 v4.4S, v18.4S, v10.4S // .........................................*....................................................................................................... + // sqrdmulh v5.4S, v16.4S, v12.S[3] // ...................................................................................................................*............................. + // mls v23.4S, v11.4S, v8.S[0] // .............................................................................................................*................................... + // trn2 v18.2D, v4.2D, v25.2D // ......................................................*.......................................................................................... + // trn1 v15.2D, v4.2D, v25.2D // ...................................................*............................................................................................. + // trn1 v11.2D, v29.2D, v3.2D // .................................................*............................................................................................... + // mul v28.4S, v16.4S, v12.S[2] // ..........................................................................................................................*...................... + // ldr q29, [x4, #-16] // ...................................................................*............................................................................. + // sub v10.4S, v18.4S, v27.4S // ....................................................................*............................................................................ + // mls v28.4S, v5.4S, v8.S[0] // ............................................................................................................................*.................... + // sub v21.4S, v15.4S, v11.4S // .........................................................*....................................................................................... 
+ // mul v6.4S, v10.4S, v29.S[0] // ..............................................................................*.................................................................. + // sqrdmulh v29.4S, v10.4S, v29.S[1] // ...........................................................................*..................................................................... + // add v3.4S, v18.4S, v27.4S // ............................................................*.................................................................................... + // add v18.4S, v26.4S, v24.4S // .................................................................................................................*............................... + // sqrdmulh v0.4S, v21.4S, v1.S[3] // ..........................................................................*...................................................................... + // add v20.4S, v15.4S, v11.4S // ........................................................*........................................................................................ + // mul v30.4S, v21.4S, v1.S[2] // ...............................................................*................................................................................. + // mls v6.4S, v29.4S, v8.S[0] // ...................................................................................*............................................................. + // add v4.4S, v20.4S, v3.4S // .............................................................................*................................................................... + // sub v2.4S, v20.4S, v3.4S // ................................................................*................................................................................ + // mls v30.4S, v0.4S, v8.S[0] // ................................................................................*................................................................ 
+ // srshr v20.4S, v4.4S, #23 // ..................................................................................*.............................................................. + // mul v9.4S, v2.4S, v22.S[0] // ..........................................................................................*...................................................... + // mls v4.4S, v20.4S, v8.4S // .......................................................................................*......................................................... + // sub v0.4S, v30.4S, v6.4S // ..................................................................................................*.............................................. + // add v13.4S, v30.4S, v6.4S // ....................................................................................................*............................................ + // sqrdmulh v20.4S, v2.4S, v22.S[1] // .....................................................................*........................................................................... + // sqrdmulh v31.4S, v0.4S, v22.S[1] // .....................................................................................................*........................................... + // srshr v3.4S, v13.4S, #23 // ..................................................................................................................*.............................. + // mul v26.4S, v0.4S, v22.S[0] // ..............................................................................................................*.................................. + // sub v29.4S, v19.4S, v4.4S // .........................................................................................................................*....................... + // mls v9.4S, v20.4S, v8.S[0] // ........................................................................................................................*........................ 
+ // add v20.4S, v19.4S, v4.4S // .......................................................................................................................*......................... + // mls v26.4S, v31.4S, v8.S[0] // ................................................................................................................*................................ + // str q20, [x1], #(16*4) // ...........................................................................................................................*..................... + // mls v13.4S, v3.4S, v8.4S // .....................................................................................................................*........................... + // sub v19.4S, v23.4S, v9.4S // ...................................................................................................................................*............. + // add v11.4S, v23.4S, v9.4S // .............................................................................................................................*................... + // sqrdmulh v21.4S, v29.4S, v12.S[1] // .................................................................................................................................*............... + // sub v10.4S, v28.4S, v26.4S // ........................................................................................................................................*........ + // add v27.4S, v28.4S, v26.4S // ......................................................................................................................................*.......... + // mul v14.4S, v29.4S, v12.S[0] // ..............................................................................................................................*.................. + // str q11, [x1, #-32] // ..................................................................................................................................*.............. 
+ // srshr v28.4S, v18.4S, #23 // ....................................................................................................................*............................ + // sqrdmulh v31.4S, v19.4S, v12.S[1] // .........................................................................................................................................*....... + // str q27, [x1, #-16] // .............................................................................................................................................*... + // mls v18.4S, v28.4S, v8.4S // ......................................................................................................................*.......................... + // mls v14.4S, v21.4S, v8.S[0] // .......................................................................................................................................*......... + // sqrdmulh v27.4S, v10.4S, v12.S[1] // ...............................................................................................................................................*. + // sub v29.4S, v18.4S, v13.4S // ...............................................................................................................................*................. + // add v20.4S, v18.4S, v13.4S // ................................................................................................................................*................ + // mul v13.4S, v19.4S, v12.S[0] // .....................................................................................................................................*........... + // str q14, [x2], #(16*4) // ...........................................................................................................................................*..... 
+ // sqrdmulh v2.4S, v29.4S, v12.S[1] // ..........................................................................................................................................*...... + // str q20, [x1, #-48] // ..............................................................................................................................................*.. + // add x1, x1, #64 // ................................................................................................................................................* + // mul v9.4S, v29.4S, v12.S[0] // ....................................................................................................................................*............ + // mul v3.4S, v10.4S, v12.S[0] // ............................................................................................................................................*.... sub count, count, #1 layer45678_start: - ldr q21, [x4, #48] // .........................................................................e.............................................................................. - ldr q24, [x5, #64] // ......e................................................................................................................................................. - // gap // ........................................................................................................................................................ - ldr q3, [x5, #96] // ............................e........................................................................................................................... - ldr q10, [x5, #176] // .................................e...................................................................................................................... - mul v30.4S, v31.4S, v20.S[0] // ...........................................................................................................................................*............ 
- ldr q7, [x5, #32] // ....e................................................................................................................................................... - str q9, [x1, #-32] // ................................................................................................................................................*....... - add x1, x1, #64 // ......................................................................................................................................................*. - ldr q12, [x5, #144] // ...............................e........................................................................................................................ - sqrdmulh v0.4S, v19.4S, v20.S[1] // .......................................................................................................................................*................ - // gap // ........................................................................................................................................................ - ldr q22, [x5, #128] // ..............................e......................................................................................................................... - ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // e....................................................................................................................................................... - // gap // ........................................................................................................................................................ - ldr q29, [x5, #48] // .....e.................................................................................................................................................. - mul v1.4S, v15.4S, v20.S[0] // .................................................................................................................................*...................... 
- ldr q20, [x5, #80] // .......e................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - ldr q18, [x5, #16] // ...e.................................................................................................................................................... - mls v30.4S, v4.4S, v8.S[0] // .............................................................................................................................................*.......... - // gap // ........................................................................................................................................................ - sub v9.4S, v27.4S, v28.4S // .............e.......................................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v27.4S, v27.4S, v28.4S // ..............e......................................................................................................................................... 
- mls v1.4S, v11.4S, v8.S[0] // ...................................................................................................................................*.................... - // gap // ........................................................................................................................................................ - add v19.4S, v25.4S, v26.4S // .........e.............................................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v26.4S, v25.4S, v26.4S // ........e............................................................................................................................................... - mul v6.4S, v9.4S, v24.4S // ...............e........................................................................................................................................ - // gap // ........................................................................................................................................................ - str q30, [x2, #-16] // .....................................................................................................................................................*.. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- sqrdmulh v5.4S, v9.4S, v20.4S // ................e....................................................................................................................................... - add v31.4S, v19.4S, v27.4S // ...................e.................................................................................................................................... - // gap // ........................................................................................................................................................ - str q1, [x2, #-48] // ...................................................................................................................................................*.... - sub v25.4S, v19.4S, v27.4S // ..................e..................................................................................................................................... - // gap // ........................................................................................................................................................ - sqrdmulh v15.4S, v26.4S, v29.4S // ...........e............................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - mul v2.4S, v26.4S, v7.4S // ..........e............................................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - ldr q14, [x5], #(12*16) // ..e..................................................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v6.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v2.4S, v15.4S, v8.S[0] // ............e........................................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v23.4S, v0.4S, v8.S[0] // ........................................................................................................................................*............... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v15.4S, v25.4S, v18.4S // .....................e.................................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v16.4S, v2.4S, v6.4S // .......................e................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v13.4S, v25.4S, v14.4S // ....................e................................................................................................................................... - add v28.4S, v2.4S, v6.4S // ........................e............................................................................................................................... 
- // gap // ........................................................................................................................................................ - str q23, [x2, #-32] // ....................................................................................................................................................*... - add x2, x2, #64 // .......................................................................................................................................................* - // gap // ........................................................................................................................................................ - sqrdmulh v11.4S, v16.4S, v18.4S // ..........................e............................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v23.4S, v31.4S, v28.4S // .......................................................e................................................................................................ - ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .e...................................................................................................................................................... - // gap // ........................................................................................................................................................ - mls v13.4S, v15.4S, v8.S[0] // ......................e................................................................................................................................. 
- trn1 v17.4S, v31.4S, v28.4S // ......................................................e................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v0.4S, v16.4S, v14.4S // .........................e.............................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v26.4S, v4.4S, v5.4S // ..................................e..................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- mls v0.4S, v11.4S, v8.S[0] // ...........................e............................................................................................................................ - add v31.4S, v6.4S, v7.4S // ........................................e............................................................................................................... - // gap // ........................................................................................................................................................ - add v24.4S, v4.4S, v5.4S // ...................................e.................................................................................................................... - ldr q11, [x5, #-32] // ................................e....................................................................................................................... - ldr q18, [x5, #-80] // .............................e.......................................................................................................................... - mul v9.4S, v26.4S, v22.4S // ....................................e................................................................................................................... - ldr q5, [x4, #32] // ........................................................................e............................................................................... - // gap // ........................................................................................................................................................ - sub v16.4S, v6.4S, v7.4S // .......................................e................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v26.4S, v12.4S // .....................................e.................................................................................................................. - sub v15.4S, v24.4S, v31.4S // ............................................e........................................................................................................... - // gap // ........................................................................................................................................................ - trn1 v26.4S, v13.4S, v0.4S // ........................................................e............................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v24.4S, v24.4S, v31.4S // .............................................e.......................................................................................................... - sqrdmulh v27.4S, v16.4S, v10.4S // ..........................................e............................................................................................................. - // gap // ........................................................................................................................................................ - trn2 v6.4S, v13.4S, v0.4S // .........................................................e.............................................................................................. 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v25.4S, v15.4S, v18.4S // ...............................................e........................................................................................................ - trn1 v19.2D, v17.2D, v26.2D // ............................................................e........................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v9.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. - trn1 v28.2D, v23.2D, v6.2D // .............................................................e.......................................................................................... - ldr q7, [x4, #16] // .......................................................................e................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v0.4S, v15.4S, v3.4S // ..............................................e......................................................................................................... - trn2 v15.2D, v23.2D, v6.2D // ...........................................................e............................................................................................ - // gap // ........................................................................................................................................................ - sub v4.4S, v19.4S, v28.4S // ..........................................................................e............................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v23.2D, v17.2D, v26.2D // ..........................................................e............................................................................................. - mls v0.4S, v25.4S, v8.S[0] // ................................................e....................................................................................................... 
- // gap // ........................................................................................................................................................ - add v13.4S, v19.4S, v28.4S // ...........................................................................e............................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v26.4S, v4.4S, v7.S[3] // .............................................................................e.......................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v1.4S, v23.4S, v15.4S // ................................................................................e....................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v31.4S, v16.4S, v11.4S // .........................................e.............................................................................................................. 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v29.4S, v23.4S, v15.4S // ...............................................................................e........................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v31.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ - sub v27.4S, v13.4S, v1.4S // ..............................................................................................e......................................................... - ldr q20, [x4], #64 // ......................................................................e................................................................................. - add v25.4S, v13.4S, v1.4S // ...............................................................................................e........................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- sqrdmulh v6.4S, v29.4S, v5.S[1] // ..................................................................................e..................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v28.4S, v4.4S, v7.S[2] // ............................................................................e........................................................................... - srshr v15.4S, v25.4S, #23 // ..................................................................................................................e..................................... - // gap // ........................................................................................................................................................ - sub v17.4S, v9.4S, v31.4S // .................................................e...................................................................................................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - mul v13.4S, v29.4S, v5.S[0] // .................................................................................e...................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v13.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v16.4S, v17.4S, v18.4S // ....................................................e................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v10.4S, v9.4S, v31.4S // ..................................................e..................................................................................................... - sqrdmulh v23.4S, v27.4S, v20.S[3] // .................................................................................................e...................................................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v28.4S, v26.4S, v8.S[0] // ..............................................................................e......................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn1 v30.4S, v24.4S, v10.4S // ..............................................................e......................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v12.4S, v17.4S, v3.4S // ...................................................e.................................................................................................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v12.4S, v16.4S, v8.S[0] // .....................................................e.................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v22.4S, v28.4S, v13.4S // ...................................................................................................e.................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v25.4S, v15.4S, v8.4S // ...................................................................................................................e.................................... 
- trn2 v15.4S, v24.4S, v10.4S // ...............................................................e........................................................................................ - // gap // ........................................................................................................................................................ - add v31.4S, v28.4S, v13.4S // ....................................................................................................e................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v1.4S, v27.4S, v20.S[2] // ................................................................................................e....................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn1 v2.4S, v0.4S, v12.4S // ................................................................e....................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- mls v1.4S, v23.4S, v8.S[0] // ..................................................................................................e..................................................... - trn2 v10.4S, v0.4S, v12.4S // .................................................................e...................................................................................... - // gap // ........................................................................................................................................................ - srshr v28.4S, v31.4S, #23 // ....................................................................................................................e................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v11.2D, v30.2D, v2.2D // ..................................................................e..................................................................................... - sqrdmulh v19.4S, v22.4S, v20.S[3] // ......................................................................................................e................................................. - // gap // ........................................................................................................................................................ - trn2 v12.2D, v15.2D, v10.2D // ...................................................................e.................................................................................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - trn1 v14.2D, v30.2D, v2.2D // ....................................................................e................................................................................... - mul v6.4S, v22.4S, v20.S[2] // .....................................................................................................e.................................................. - // gap // ........................................................................................................................................................ - trn1 v2.2D, v15.2D, v10.2D // .....................................................................e.................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v31.4S, v28.4S, v8.4S // .....................................................................................................................e.................................. - sub v13.4S, v11.4S, v12.4S // .........................................................................................e.............................................................. - // gap // ........................................................................................................................................................ - add v23.4S, v11.4S, v12.4S // ..........................................................................................e............................................................. 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v6.4S, v19.4S, v8.S[0] // .......................................................................................................e................................................ - sub v17.4S, v14.4S, v2.4S // ....................................................................................e................................................................... - // gap // ........................................................................................................................................................ - add v26.4S, v14.4S, v2.4S // .....................................................................................e.................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v24.4S, v13.4S, v21.S[0] // ...........................................................................................e............................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v19.4S, v17.4S, v5.S[3] // .......................................................................................e................................................................ - add v16.4S, v26.4S, v23.4S // .........................................................................................................e.............................................. - // gap // ........................................................................................................................................................ - sub v18.4S, v26.4S, v23.4S // ........................................................................................................e............................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v23.4S, v13.4S, v21.S[1] // ............................................................................................e........................................................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - srshr v27.4S, v16.4S, #23 // ......................................................................................................................e................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v17.4S, v17.4S, v5.S[2] // ......................................................................................e................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v17.4S, v19.4S, v8.S[0] // ........................................................................................e............................................................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v2.4S, v18.4S, v7.S[1] // ...........................................................................................................e............................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- mls v24.4S, v23.4S, v8.S[0] // .............................................................................................e.......................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v10.4S, v18.4S, v7.S[0] // ..........................................................................................................e............................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - mls v16.4S, v27.4S, v8.4S // .......................................................................................................................e................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v19.4S, v17.4S, v24.4S // .............................................................................................................e.......................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v10.4S, v2.4S, v8.S[0] // ............................................................................................................e........................................... - add v2.4S, v17.4S, v24.4S // ..............................................................................................................e......................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v14.4S, v19.4S, v7.S[0] // ...............................................................................................................e........................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v12.4S, v25.4S, v16.4S // ..........................................................................................................................e............................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v24.4S, v19.4S, v7.S[1] // ................................................................................................................e....................................... - add v9.4S, v25.4S, v16.4S // ...........................................................................................................................e............................ - // gap // ........................................................................................................................................................ 
- srshr v16.4S, v2.4S, #23 // ........................................................................................................................e............................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v0.4S, v12.4S, v20.S[1] // .............................................................................................................................e.......................... - sub v19.4S, v1.4S, v10.4S // ....................................................................................................................................e................... - // gap // ........................................................................................................................................................ - str q9, [x1], #(16*4) // ..............................................................................................................................................e......... - add v9.4S, v1.4S, v10.4S // .....................................................................................................................................e.................. - // gap // ........................................................................................................................................................ - mls v2.4S, v16.4S, v8.4S // .........................................................................................................................e.............................. - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v14.4S, v24.4S, v8.S[0] // .................................................................................................................e...................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v13.4S, v12.4S, v20.S[0] // ............................................................................................................................e........................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v15.4S, v31.4S, v2.4S // ...............................................................................................................................e........................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v2.4S, v31.4S, v2.4S // ................................................................................................................................e....................... - mls v13.4S, v0.4S, v8.S[0] // ..............................................................................................................................e......................... - // gap // ........................................................................................................................................................ - sub v31.4S, v6.4S, v14.4S // .........................................................................................................................................e.............. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- add v27.4S, v6.4S, v14.4S // ..........................................................................................................................................e............. - mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................e................. - // gap // ........................................................................................................................................................ - str q2, [x1, #-48] // ...............................................................................................................................................e........ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v11.4S, v15.4S, v20.S[1] // ..................................................................................................................................e..................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - str q13, [x2], #(16*4) // ..................................................................................................................................................e..... - str q27, [x1, #-16] // .................................................................................................................................................e...... 
- // gap // ........................................................................................................................................................ - sqrdmulh v4.4S, v31.4S, v20.S[1] // ............................................................................................................................................e........... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - - // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ...........e............................................................................................................................................|..........e............................ - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..........................................e.............................................................................................................|....................................... - // ldr q0, [x5], #(12*16) // ..............................e.........................................................................................................................|.............................e......... - // ldr q4, [x5, #(-12*16 + 1*16)] // ...............e........................................................................................................................................|..............e........................ - // ldr q1, [x5, #(-12*16 + 2*16)] // .....e..................................................................................................................................................|....e.................................. 
- // ldr q5, [x5, #(-12*16 + 3*16)] // ............e...........................................................................................................................................|...........e........................... - // ldr q2, [x5, #(-12*16 + 4*16)] // .e......................................................................................................................................................|e...................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ..............e.........................................................................................................................................|.............e......................... - // sub v24.4s, v9.4s, v10.4s // .....................e..................................................................................................................................|....................e.................. - // add v9.4s, v9.4s, v10.4s // ....................e...................................................................................................................................|...................e................... - // mul v10.4s, v24.4s, v1.4s // .............................e..........................................................................................................................|............................e.......... - // sqrdmulh v24.4s, v24.4s, v5.4s // ............................e...........................................................................................................................|...........................e........... - // mls v10.4s, v24.4s, v8.s[0] // ................................e.......................................................................................................................|...............................e....... 
- // sub v24.4s, v11.4s, v12.4s // .................e......................................................................................................................................|................e...................... - // add v11.4s, v11.4s, v12.4s // ..................e.....................................................................................................................................|.................e..................... - // mul v12.4s, v24.4s, v2.4s // ......................e.................................................................................................................................|.....................e................. - // sqrdmulh v24.4s, v24.4s, v6.4s // ........................e...............................................................................................................................|.......................e............... - // mls v12.4s, v24.4s, v8.s[0] // ...............................e........................................................................................................................|..............................e........ - // sub v24.4s, v9.4s, v11.4s // ...........................e............................................................................................................................|..........................e............ - // add v9.4s, v9.4s, v11.4s // .........................e..............................................................................................................................|........................e.............. - // mul v11.4s, v24.4s, v0.4s // ....................................e...................................................................................................................|...................................e... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................e.....................................................................................................................|.................................e..... - // mls v11.4s, v24.4s, v8.s[0] // ...........................................e............................................................................................................|....................................... - // sub v24.4s, v10.4s, v12.4s // ...................................e....................................................................................................................|..................................e.... - // add v10.4s, v10.4s, v12.4s // .....................................e..................................................................................................................|....................................e.. - // mul v12.4s, v24.4s, v0.4s // .............................................e..........................................................................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e...............................................................................................................|....................................... - // mls v12.4s, v24.4s, v8.s[0] // ...............................................e........................................................................................................|....................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ..e.....................................................................................................................................................|.e..................................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e....................................................................................................|....................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ..........e.............................................................................................................................................|.........e............................. - // ldr q5, [x5, #(-12*16 + 9*16)] // ........e...............................................................................................................................................|.......e............................... - // ldr q2, [x5, #(-12*16 + 10*16)] // ..................................................e.....................................................................................................|....................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ...e....................................................................................................................................................|..e.................................... - // sub v24.4s, v13.4s, v14.4s // ..............................................e.........................................................................................................|....................................... - // add v13.4s, v13.4s, v14.4s // .................................................e......................................................................................................|....................................... - // mul v14.4s, v24.4s, v1.4s // ....................................................e...................................................................................................|....................................... 
- // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................................e................................................................................................|....................................... - // mls v14.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|....................................... - // sub v24.4s, v15.4s, v16.4s // ......................................................e.................................................................................................|....................................... - // add v15.4s, v15.4s, v16.4s // ................................................e.......................................................................................................|....................................... - // mul v16.4s, v24.4s, v2.4s // ..........................................................................e.............................................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e............................................................................................|....................................... - // mls v16.4s, v24.4s, v8.s[0] // ............................................................................e...........................................................................|....................................... - // sub v24.4s, v13.4s, v15.4s // ........................................................e...............................................................................................|....................................... 
- // add v13.4s, v13.4s, v15.4s // ..........................................................e.............................................................................................|....................................... - // mul v15.4s, v24.4s, v0.4s // ..................................................................e.....................................................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................e..........................................................................................|....................................... - // mls v15.4s, v24.4s, v8.s[0] // ......................................................................e.................................................................................|....................................... - // sub v24.4s, v14.4s, v16.4s // ...................................................................................e....................................................................|....................................... - // add v14.4s, v14.4s, v16.4s // .......................................................................................e................................................................|....................................... - // mul v16.4s, v24.4s, v0.4s // ...........................................................................................e............................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................................................e.................................................................|....................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................e...........................................................|....................................... - // trn1 v25.4s, v9.4s, v10.4s // ............................................e...........................................................................................................|....................................... - // trn2 v26.4s, v9.4s, v10.4s // .........................................e..............................................................................................................|....................................... - // trn1 v27.4s, v11.4s, v12.4s // .........................................................e..............................................................................................|....................................... - // trn2 v28.4s, v11.4s, v12.4s // ............................................................e...........................................................................................|....................................... - // trn2 v11.2d, v25.2d, v27.2d // .....................................................................e..................................................................................|....................................... - // trn2 v12.2d, v26.2d, v28.2d // ...................................................................e....................................................................................|....................................... - // trn1 v9.2d, v25.2d, v27.2d // ..............................................................e.........................................................................................|....................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ................................................................e.......................................................................................|....................................... - // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|....................................... - // trn2 v26.4s, v13.4s, v14.4s // ...............................................................................................e........................................................|....................................... - // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................................e.....................................................|....................................... - // trn2 v28.4s, v15.4s, v16.4s // ....................................................................................................e...................................................|....................................... - // trn2 v15.2d, v25.2d, v27.2d // ......................................................................................................e.................................................|....................................... - // trn2 v16.2d, v26.2d, v28.2d // ........................................................................................................e...............................................|....................................... - // trn1 v13.2d, v25.2d, v27.2d // .........................................................................................................e..............................................|....................................... 
- // trn1 v14.2d, v26.2d, v28.2d // ...........................................................................................................e............................................|....................................... - // ldr q0, [x4], #64 // ..............................................................................e.........................................................................|....................................... - // ldr q1, [x4, #(-64 + 16)] // .................................................................e......................................................................................|....................................... - // ldr q2, [x4, #(-64 + 32)] // .....................................................e..................................................................................................|....................................... - // ldr q3, [x4, #(-64 + 48)] // e.......................................................................................................................................................e....................................... - // sub v24.4s, v9.4s, v10.4s // ....................................................................e...................................................................................|....................................... - // add v9.4s, v9.4s, v10.4s // .......................................................................e................................................................................|....................................... - // mul v10.4s, v24.4s, v1.s[2] // .................................................................................e......................................................................|....................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................e...............................................................................|....................................... - // mls v10.4s, v24.4s, v8.s[0] // .........................................................................................e..............................................................|....................................... - // sub v24.4s, v11.4s, v12.4s // ...........................................................................e............................................................................|....................................... - // add v11.4s, v11.4s, v12.4s // .........................................................................e..............................................................................|....................................... - // mul v12.4s, v24.4s, v2.s[0] // ....................................................................................e...................................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e.......................................................................|....................................... - // mls v12.4s, v24.4s, v8.s[0] // .....................................................................................e..................................................................|....................................... - // sub v24.4s, v13.4s, v14.4s // ................................................................................................................e.......................................|....................................... 
- // add v13.4s, v13.4s, v14.4s // .................................................................................................................e......................................|....................................... - // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................................e...............................|....................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e....................................|....................................... - // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................e..............................|....................................... - // sub v24.4s, v15.4s, v16.4s // .............................................................................................................e..........................................|....................................... - // add v15.4s, v15.4s, v16.4s // ..............................................................................................................e.........................................|....................................... - // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................................e.....................................|....................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................e.................................|....................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................e............................|....................................... - // sub v24.4s, v9.4s, v11.4s // .............................................................................e..........................................................................|....................................... - // add v9.4s, v9.4s, v11.4s // ...............................................................................e........................................................................|....................................... - // mul v11.4s, v24.4s, v0.s[2] // .................................................................................................e......................................................|....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ........................................................................................e...............................................................|....................................... - // mls v11.4s, v24.4s, v8.s[0] // ...................................................................................................e....................................................|....................................... - // sub v24.4s, v10.4s, v12.4s // .............................................................................................e..........................................................|....................................... - // add v10.4s, v10.4s, v12.4s // ................................................................................................e.......................................................|....................................... 
- // mul v12.4s, v24.4s, v0.s[2] // ..........................................................................................................e.............................................|....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................e................................................|....................................... - // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................................................e........................................|....................................... - // sub v24.4s, v13.4s, v15.4s // .....................................................................................................................e..................................|....................................... - // add v13.4s, v13.4s, v15.4s // ....................................................................................................................e...................................|....................................... - // mul v15.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|....................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e.............................|....................................... - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|....................................... 
- // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................................e.........................|....................................... - // add v14.4s, v14.4s, v16.4s // ................................................................................................................................e.......................|....................................... - // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................................................e......................|....................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................e....................|....................................... - // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e............|....................................... - // srshr v24.4S, v9.4S, #23 // ..................................................................................e.....................................................................|....................................... - // mls v9.4s, v24.4s, v8.4s // ..............................................................................................e.........................................................|....................................... - // srshr v24.4S, v10.4S, #23 // .....................................................................................................e..................................................|....................................... 
- // mls v10.4s, v24.4s, v8.4s // ............................................................................................................e...........................................|....................................... - // srshr v24.4S, v13.4S, #23 // .......................................................................................................................e................................|....................................... - // mls v13.4s, v24.4s, v8.4s // .............................................................................................................................e..........................|....................................... - // srshr v24.4S, v14.4S, #23 // .....................................................................................................................................e..................|....................................... - // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................................................e.............|....................................... - // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|....................................... - // add v9.4s, v9.4s, v13.4s // ....................................................................................................................................e...................|....................................... - // mul v13.4s, v24.4s, v0.s[0] // ............................................................................................................................................e...........|....................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e.................|....................................... - // mls v13.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e........|....................................... - // sub v24.4s, v10.4s, v14.4s // .............................................................................................................................................e..........|....................................... - // add v10.4s, v10.4s, v14.4s // ..............................................................................................................................................e.........|....................................... - // mul v14.4s, v24.4s, v0.s[0] // .............*..........................................................................................................................................|............*.......................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................................e...|....................................... - // mls v14.4s, v24.4s, v8.s[0] // ...................*....................................................................................................................................|..................*.................... - // sub v24.4s, v11.4s, v15.4s // .......................................................................................................................................e................|....................................... 
- // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|....................................... - // mul v15.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|....................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........*..............................................................................................................................................|........*.............................. - // mls v15.4s, v24.4s, v8.s[0] // .................................*......................................................................................................................|................................*...... - // sub v24.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|....................................... - // add v12.4s, v12.4s, v16.4s // .................................................................................................................................................e......|....................................... - // mul v16.4s, v24.4s, v0.s[0] // ....*...................................................................................................................................................|...*................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................e|....................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ................*.......................................................................................................................................|...............*....................... - // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|....................................... - // str q10, [x1, #(-16*4 + 1*16)] // ...................................................................................................................................................e....|....................................... - // str q11, [x1, #(-16*4 + 2*16)] // ......*.................................................................................................................................................|.....*................................. - // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................e.|....................................... - // str q13, [x2], #(16*4) // .....................................................................................................................................................e..|....................................... - // str q14, [x2, #(-16*4 + 1*16)] // ..........................*.............................................................................................................................|.........................*............. - // str q15, [x2, #(-16*4 + 2*16)] // ......................................*.................................................................................................................|.....................................*. 
- // str q16, [x2, #(-16*4 + 3*16)] // .......................*................................................................................................................................|......................*................ - // add x1, x1, #64 // .......*................................................................................................................................................|......*................................ - // add x2, x2, #64 // .......................................*................................................................................................................|......................................* + // Instructions: 152 + // Expected cycles: 128 + // Expected IPC: 1.19 + // + // Wall time: 904.77s + // User time: 904.77s + // + // ------------------------------------------------------------------ original position ------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + ldr q12, [x4], #64 // ......................................................................e................................................................................. + ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x1] // e....................................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q25, [x5, #32] // ....e................................................................................................................................................... 
+ mls v13.4S, v31.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ + ldr q30, [x5, #48] // .....e.................................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v3.4S, v27.4S, v8.S[0] // .............................................................................................................................................*.......... + ldr q27, [x5, #80] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v14.4S, v4.4S, v5.4S // ........e............................................................................................................................................... + ldr q28, [x5], #(12*16) // ..e..................................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ mls v9.4S, v2.4S, v8.S[0] // ...................................................................................................................................*.................... + sub v26.4S, v6.4S, v7.4S // .............e.......................................................................................................................................... + ldr q17, [x5, #-64] // ..............................e......................................................................................................................... + add v16.4S, v6.4S, v7.4S // ..............e......................................................................................................................................... + str q13, [x2, #-32] // ....................................................................................................................................................*... + // gap // ........................................................................................................................................................ + add v15.4S, v4.4S, v5.4S // .........e.............................................................................................................................................. + sqrdmulh v6.4S, v14.4S, v30.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ + str q3, [x2, #-16] // .....................................................................................................................................................*.. + ldr q19, [x5, #-176] // ...e.................................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + mul v29.4S, v14.4S, v25.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q9, [x2, #-48] // ...................................................................................................................................................*.... + sub v23.4S, v15.4S, v16.4S // ..................e..................................................................................................................................... + add x2, x2, #64 // .......................................................................................................................................................* + add v13.4S, v15.4S, v16.4S // ...................e.................................................................................................................................... + ldr q31, [x5, #-128] // ......e................................................................................................................................................. + sqrdmulh v20.4S, v26.4S, v27.4S // ...............e........................................................................................................................................ + ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x2] // .e...................................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v29.4S, v6.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v21.4S, v26.4S, v31.4S // ................e....................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v18.4S, v0.4S, v1.4S // ..................................e..................................................................................................................... + ldr q22, [x4, #-48] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + mls v21.4S, v20.4S, v8.S[0] // .................e...................................................................................................................................... + add v16.4S, v2.4S, v3.4S // ........................................e............................................................................................................... + // gap // ........................................................................................................................................................ + add v7.4S, v0.4S, v1.4S // ...................................e.................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v15.4S, v18.4S, v17.4S // .....................................e.................................................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v23.4S, v19.4S // ....................e................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v0.4S, v29.4S, v21.4S // .......................e................................................................................................................................ + ldr q17, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + mul v10.4S, v23.4S, v28.4S // .....................e.................................................................................................................................. 
+ add v1.4S, v29.4S, v21.4S // ........................e............................................................................................................................... + ldr q23, [x5, #-16] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v21.4S, v0.4S, v19.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v31.4S, v13.4S, v1.4S // ......................................................e................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v10.4S, v24.4S, v8.S[0] // ......................e................................................................................................................................. + trn2 v29.4S, v13.4S, v1.4S // .......................................................e................................................................................................ + ldr q9, [x5, #-80] // .............................e.......................................................................................................................... + ldr q24, [x5, #-48] // ...............................e........................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v6.4S, v0.4S, v28.4S // ..........................e............................................................................................................................. + ldr q1, [x4, #-32] // ........................................................................e............................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v6.4S, v21.4S, v8.S[0] // ...........................e............................................................................................................................ + sub v20.4S, v7.4S, v16.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v18.4S, v24.4S // ....................................e................................................................................................................... + add v18.4S, v7.4S, v16.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v16.4S, v20.4S, v9.4S // ..............................................e......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v0.4S, v10.4S, v6.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v4.4S, v20.4S, v17.4S // ...............................................e........................................................................................................ + sub v20.4S, v2.4S, v3.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ 
+ trn2 v19.4S, v10.4S, v6.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v15.4S, v11.4S, v8.S[0] // ......................................e................................................................................................................. + trn1 v14.2D, v31.2D, v0.2D // ............................................................e........................................................................................... + // gap // ........................................................................................................................................................ + trn2 v0.2D, v31.2D, v0.2D // ..........................................................e............................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v20.4S, v23.4S // .........................................e.............................................................................................................. + ldr q10, [x5, #-32] // ................................e....................................................................................................................... 
+ trn2 v13.2D, v29.2D, v19.2D // ...........................................................e............................................................................................ + trn1 v6.2D, v29.2D, v19.2D // .............................................................e.......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v4.4S, v16.4S, v8.S[0] // ................................................e....................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v29.4S, v0.4S, v13.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v20.4S, v20.4S, v10.4S // ..........................................e............................................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v21.4S, v29.4S, v1.S[1] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v20.4S, v24.4S, v8.S[0] // ...........................................e............................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v24.4S, v29.4S, v1.S[0] // ..................................................................................e..................................................................... + sub v29.4S, v14.4S, v6.4S // ..........................................................................e............................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v24.4S, v21.4S, v8.S[0] // ...................................................................................e.................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v2.4S, v15.4S, v20.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v10.4S, v15.4S, v20.4S // ..................................................e..................................................................................................... + sqrdmulh v3.4S, v29.4S, v22.S[3] // ............................................................................e........................................................................... + // gap // ........................................................................................................................................................ + add v19.4S, v14.4S, v6.4S // ...........................................................................e............................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v21.4S, v2.4S, v9.4S // ...................................................e.................................................................................................... + add v20.4S, v0.4S, v13.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v26.4S, v29.4S, v22.S[2] // .............................................................................e.......................................................................... + trn2 v29.4S, v18.4S, v10.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v5.4S, v19.4S, v20.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v14.4S, v2.4S, v17.4S // ....................................................e................................................................................................... + add v19.4S, v19.4S, v20.4S // ...............................................................................................e........................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v14.4S, v21.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + srshr v20.4S, v19.4S, #23 // ..................................................................................................................e..................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v26.4S, v3.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v19.4S, v20.4S, v8.4S // ...................................................................................................................e.................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v3.4S, v4.4S, v14.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v25.4S, v4.4S, v14.4S // ................................................................e....................................................................................... + sqrdmulh v11.4S, v5.4S, v12.S[3] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + sub v16.4S, v26.4S, v24.4S // ...................................................................................................e.................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v23.4S, v5.4S, v12.S[2] // .................................................................................................e...................................................... + trn2 v27.2D, v29.2D, v3.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + trn1 v4.4S, v18.4S, v10.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v5.4S, v16.4S, v12.S[3] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v23.4S, v11.4S, v8.S[0] // ..................................................................................................e..................................................... + trn2 v18.2D, v4.2D, v25.2D // ..................................................................e..................................................................................... + // gap // ........................................................................................................................................................ + trn1 v15.2D, v4.2D, v25.2D // ....................................................................e................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v11.2D, v29.2D, v3.2D // .....................................................................e.................................................................................. + mul v28.4S, v16.4S, v12.S[2] // ......................................................................................................e................................................. + ldr q29, [x4, #-16] // .........................................................................e.............................................................................. + sub v10.4S, v18.4S, v27.4S // .........................................................................................e.............................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v5.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v21.4S, v15.4S, v11.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v6.4S, v10.4S, v29.S[0] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v29.4S, v10.4S, v29.S[1] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v3.4S, v18.4S, v27.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v18.4S, v26.4S, v24.4S // ....................................................................................................e................................................... + sqrdmulh v0.4S, v21.4S, v1.S[3] // ......................................................................................e................................................................. 
+ // gap // ........................................................................................................................................................ + add v20.4S, v15.4S, v11.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v30.4S, v21.4S, v1.S[2] // .......................................................................................e................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v29.4S, v8.S[0] // .............................................................................................e.......................................................... 
+ add v4.4S, v20.4S, v3.4S // .........................................................................................................e.............................................. + // gap // ........................................................................................................................................................ + sub v2.4S, v20.4S, v3.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v30.4S, v0.4S, v8.S[0] // ........................................................................................e............................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v20.4S, v4.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v9.4S, v2.4S, v22.S[0] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v4.4S, v20.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v0.4S, v30.4S, v6.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v13.4S, v30.4S, v6.4S // ..............................................................................................................e......................................... + sqrdmulh v20.4S, v2.4S, v22.S[1] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v31.4S, v0.4S, v22.S[1] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v3.4S, v13.4S, #23 // ........................................................................................................................e............................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v26.4S, v0.4S, v22.S[0] // ................................................................................................................e....................................... + sub v29.4S, v19.4S, v4.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v9.4S, v20.4S, v8.S[0] // ............................................................................................................e........................................... + add v20.4S, v19.4S, v4.4S // ...........................................................................................................................e............................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v26.4S, v31.4S, v8.S[0] // .................................................................................................................e...................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q20, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v13.4S, v3.4S, v8.4S // .........................................................................................................................e.............................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v19.4S, v23.4S, v9.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v11.4S, v23.4S, v9.4S // .....................................................................................................................................e.................. + sqrdmulh v21.4S, v29.4S, v12.S[1] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + sub v10.4S, v28.4S, v26.4S // .........................................................................................................................................e.............. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v28.4S, v26.4S // ..........................................................................................................................................e............. 
+ mul v14.4S, v29.4S, v12.S[0] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + str q11, [x1, #-32] // ................................................................................................................................................e....... + srshr v28.4S, v18.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + sqrdmulh v31.4S, v19.4S, v12.S[1] // ......................................................................................................................................e................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q27, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v18.4S, v28.4S, v8.4S // .....................................................................................................................e.................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v14.4S, v21.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sqrdmulh v27.4S, v10.4S, v12.S[1] // ...........................................................................................................................................e............ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v29.4S, v18.4S, v13.4S // ...............................................................................................................................e........................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v18.4S, v13.4S // ................................................................................................................................e....................... + mul v13.4S, v19.4S, v12.S[0] // .......................................................................................................................................e................ + // gap // ........................................................................................................................................................ + str q14, [x2], #(16*4) // ..................................................................................................................................................e..... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v2.4S, v29.4S, v12.S[1] // .................................................................................................................................e...................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q20, [x1, #-48] // ...............................................................................................................................................e........ + add x1, x1, #64 // ......................................................................................................................................................e. + // gap // ........................................................................................................................................................ + mul v9.4S, v29.4S, v12.S[0] // ..................................................................................................................................e..................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v3.4S, v10.4S, v12.S[0] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + + // ------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // .e......................................................................................................................................................'~.................... + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .........................e..............................................................................................................................'..................... 
+ // ldr q0, [x5], #(12*16) // ........e...............................................................................................................................................'.......~............. + // ldr q4, [x5, #(-12*16 + 1*16)] // .................e......................................................................................................................................'................~.... + // ldr q1, [x5, #(-12*16 + 2*16)] // ..e.....................................................................................................................................................'.~................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ....e...................................................................................................................................................'...~................. + // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e................................................................................................................................'..................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ......e.................................................................................................................................................'.....~............... + // sub v24.4s, v9.4s, v10.4s // .......e................................................................................................................................................'......~.............. + // add v9.4s, v9.4s, v10.4s // ..............e.........................................................................................................................................'.............~....... + // sqrdmulh v27.4s, v24.4s, v5.4s // ...............e........................................................................................................................................'..............~...... 
+ // mul v10.4s, v24.4s, v1.4s // ..................e.....................................................................................................................................'.................~... + // mls v10.4s, v27.4s, v8.s[0] // ..........................e.............................................................................................................................'..................... + // sub v24.4s, v11.4s, v12.4s // ..........e.............................................................................................................................................'.........~........... + // add v11.4s, v11.4s, v12.4s // ............e...........................................................................................................................................'...........~......... + // sqrdmulh v27.4s, v24.4s, v6.4s // ........................e...............................................................................................................................'..................... + // mul v12.4s, v24.4s, v2.4s // ...........................e............................................................................................................................'..................... + // mls v12.4s, v27.4s, v8.s[0] // ..............................e.........................................................................................................................'..................... + // sub v24.4s, v9.4s, v11.4s // ....................e...................................................................................................................................'...................~. + // add v9.4s, v9.4s, v11.4s // ......................e.................................................................................................................................'..................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // ..................................e.....................................................................................................................'..................... + // mul v11.4s, v24.4s, v0.4s // .....................................e..................................................................................................................'..................... + // mls v11.4s, v27.4s, v8.s[0] // ..........................................e.............................................................................................................'..................... + // sub v24.4s, v10.4s, v12.4s // ...................................e....................................................................................................................'..................... + // add v10.4s, v10.4s, v12.4s // ......................................e.................................................................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ........................................e...............................................................................................................'..................... + // mul v12.4s, v24.4s, v0.4s // ..............................................e.........................................................................................................'..................... + // mls v12.4s, v27.4s, v8.s[0] // ................................................e.......................................................................................................'..................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ....................................e...................................................................................................................'..................... 
+ // ldr q4, [x5, #(-12*16 + 7*16)] // ............................................e...........................................................................................................'..................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........e............................................................................................................................................'..........~.......... + // ldr q5, [x5, #(-12*16 + 9*16)] // .............................................e..........................................................................................................'..................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .............................................................e..........................................................................................'..................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .......................................e................................................................................................................'..................... + // sub v24.4s, v13.4s, v14.4s // ............................e...........................................................................................................................'..................... + // add v13.4s, v13.4s, v14.4s // ................................e.......................................................................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v5.4s // ..................................................e.....................................................................................................'..................... + // mul v14.4s, v24.4s, v1.4s // .................................e......................................................................................................................'..................... 
+ // mls v14.4s, v27.4s, v8.s[0] // .........................................................e..............................................................................................'..................... + // sub v24.4s, v15.4s, v16.4s // .......................................................e................................................................................................'..................... + // add v15.4s, v15.4s, v16.4s // ...............................e........................................................................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ............................................................e...........................................................................................'..................... + // mul v16.4s, v24.4s, v2.4s // ..................................................................e.....................................................................................'..................... + // mls v16.4s, v27.4s, v8.s[0] // ....................................................................e...................................................................................'..................... + // sub v24.4s, v13.4s, v15.4s // .................................................e......................................................................................................'..................... + // add v13.4s, v13.4s, v15.4s // ...................................................e....................................................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ....................................................e...................................................................................................'..................... 
+ // mul v15.4s, v24.4s, v0.4s // ......................................................e.................................................................................................'..................... + // mls v15.4s, v27.4s, v8.s[0] // ................................................................e.......................................................................................'..................... + // sub v24.4s, v14.4s, v16.4s // ........................................................................e...............................................................................'..................... + // add v14.4s, v14.4s, v16.4s // .........................................................................e..............................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ............................................................................e...........................................................................'..................... + // mul v16.4s, v24.4s, v0.4s // .................................................................................e......................................................................'..................... + // mls v16.4s, v27.4s, v8.s[0] // ...................................................................................e....................................................................'..................... + // trn1 v25.4s, v9.4s, v10.4s // .........................................e..............................................................................................................'..................... + // trn2 v26.4s, v9.4s, v10.4s // ...........................................e............................................................................................................'..................... 
+ // trn1 v27.4s, v11.4s, v12.4s // .....................................................e..................................................................................................'..................... + // trn2 v28.4s, v11.4s, v12.4s // ........................................................e...............................................................................................'..................... + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................e............................................................................................'..................... + // trn2 v12.2d, v26.2d, v28.2d // ..............................................................e.........................................................................................'..................... + // trn1 v9.2d, v25.2d, v27.2d // ..........................................................e.............................................................................................'..................... + // trn1 v10.2d, v26.2d, v28.2d // ...............................................................e........................................................................................'..................... + // trn1 v25.4s, v13.4s, v14.4s // .............................................................................................e..........................................................'..................... + // trn2 v26.4s, v13.4s, v14.4s // ...............................................................................e........................................................................'..................... + // trn1 v27.4s, v15.4s, v16.4s // ........................................................................................e...............................................................'..................... 
+ // trn2 v28.4s, v15.4s, v16.4s // .......................................................................................e................................................................'..................... + // trn2 v15.2d, v25.2d, v27.2d // ................................................................................................e.......................................................'..................... + // trn2 v16.2d, v26.2d, v28.2d // ............................................................................................e...........................................................'..................... + // trn1 v13.2d, v25.2d, v27.2d // .................................................................................................e......................................................'..................... + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................e.....................................................'..................... + // ldr q0, [x4], #64 // e.......................................................................................................................................................~..................... + // ldr q1, [x4, #(-64 + 16)] // .............................e..........................................................................................................................'..................... + // ldr q2, [x4, #(-64 + 32)] // ...............................................e........................................................................................................'..................... + // ldr q3, [x4, #(-64 + 48)] // ....................................................................................................e...................................................'..................... 
+ // sub v24.4s, v9.4s, v10.4s // ......................................................................e.................................................................................'..................... + // add v9.4s, v9.4s, v10.4s // ...........................................................................e............................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..........................................................................e.............................................................................'..................... + // mul v10.4s, v24.4s, v1.s[2] // ..............................................................................e.........................................................................'..................... + // mls v10.4s, v27.4s, v8.s[0] // .....................................................................................e..................................................................'..................... + // sub v24.4s, v11.4s, v12.4s // .................................................................e......................................................................................'..................... + // add v11.4s, v11.4s, v12.4s // .............................................................................e..........................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ...................................................................e....................................................................................'..................... + // mul v12.4s, v24.4s, v2.s[0] // .....................................................................e..................................................................................'..................... 
+ // mls v12.4s, v27.4s, v8.s[0] // .......................................................................e................................................................................'..................... + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................e................................................'..................... + // add v13.4s, v13.4s, v14.4s // .............................................................................................................e..........................................'..................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ............................................................................................................e...........................................'..................... + // mul v14.4s, v24.4s, v2.s[2] // ..............................................................................................................e.........................................'..................... + // mls v14.4s, v27.4s, v8.s[0] // ..................................................................................................................e.....................................'..................... + // sub v24.4s, v15.4s, v16.4s // .....................................................................................................e..................................................'..................... + // add v15.4s, v15.4s, v16.4s // ..........................................................................................................e.............................................'..................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // .........................................................................................................e..............................................'..................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................e...............................................'..................... + // mls v16.4s, v27.4s, v8.s[0] // ...............................................................................................................e........................................'..................... + // sub v24.4s, v9.4s, v11.4s // ................................................................................e.......................................................................'..................... + // add v9.4s, v9.4s, v11.4s // ..................................................................................e.....................................................................'..................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .........................................................................................e..............................................................'..................... + // mul v11.4s, v24.4s, v0.s[2] // ...........................................................................................e............................................................'..................... + // mls v11.4s, v27.4s, v8.s[0] // ...............................................................................................e........................................................'..................... + // sub v24.4s, v10.4s, v12.4s // ..........................................................................................e.............................................................'..................... + // add v10.4s, v10.4s, v12.4s // ...........................................................................................................e............................................'..................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..............................................................................................e.........................................................'..................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................e....................................................'..................... + // mls v12.4s, v27.4s, v8.s[0] // ......................................................................................................e.................................................'..................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................................................e......................................'..................... + // add v13.4s, v13.4s, v15.4s // ................................................................................................................e.......................................'..................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ........................................................................................................................e...............................'..................... + // mul v15.4s, v24.4s, v1.s[0] // ....................................................................................................................e...................................'..................... + // mls v15.4s, v27.4s, v8.s[0] // .............................................................................................................................e..........................'..................... + // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.................................'..................... 
+ // add v14.4s, v14.4s, v16.4s // .......................................................................................................................e................................'..................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .........................................................................................................................e..............................'..................... + // mul v16.4s, v24.4s, v1.s[0] // ...........................................................................................................................e............................'..................... + // mls v16.4s, v27.4s, v8.s[0] // ...............................................................................................................................e........................'..................... + // srshr v24.4S, v9.4S, #23 // ....................................................................................e...................................................................'..................... + // mls v9.4s, v24.4s, v8.4s // ......................................................................................e.................................................................'..................... + // srshr v24.4S, v10.4S, #23 // .........................................................................................................................................e..............'..................... + // mls v10.4s, v24.4s, v8.4s // ............................................................................................................................................e...........'..................... + // srshr v24.4S, v13.4S, #23 // ...................................................................................................................e....................................'..................... 
+ // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................e..................................'..................... + // srshr v24.4S, v14.4S, #23 // ..........................................................................................................................e.............................'..................... + // mls v14.4s, v24.4s, v8.4s // .................................................................................................................................e......................'..................... + // sub v24.4s, v9.4s, v13.4s // ............................................................................................................................e...........................'..................... + // add v9.4s, v9.4s, v13.4s // ..............................................................................................................................e.........................'..................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....................................................................................................................................e...................'..................... + // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................................................................e................'..................... + // mls v13.4s, v27.4s, v8.s[0] // .............................................................................................................................................e..........'..................... + // sub v24.4s, v10.4s, v14.4s // ...............................................................................................................................................e........'..................... 
+ // add v10.4s, v10.4s, v14.4s // ................................................................................................................................................e.......'..................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e....'..................... + // mul v14.4s, v24.4s, v0.s[0] // ......................................................................................................................................................e.'..................... + // mls v14.4s, v27.4s, v8.s[0] // .........~..............................................................................................................................................'........*............ + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................e.....................'..................... + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................e....................'..................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..........................................................................................................................................e.............'..................... + // mul v15.4s, v24.4s, v0.s[0] // .................................................................................................................................................e......'..................... + // mls v15.4s, v27.4s, v8.s[0] // ...~....................................................................................................................................................'..*.................. 
+ // sub v24.4s, v12.4s, v16.4s // .....................................................................................................................................e..................'..................... + // add v12.4s, v12.4s, v16.4s // ......................................................................................................................................e.................'..................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............................................................................................................................................e.........'..................... + // mul v16.4s, v24.4s, v0.s[0] // .......................................................................................................................................................e'..................... + // mls v16.4s, v27.4s, v8.s[0] // .....~..................................................................................................................................................'....*................ + // str q9, [x1], #(16*4) // ................................................................................................................................e.......................'..................... + // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................................e...'..................... + // str q11, [x1, #(-16*4 + 2*16)] // ........................................................................................................................................e...............'..................... + // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................................e............'..................... 
+ // str q13, [x2], #(16*4) // ..................................................................................................................................................e.....'..................... + // str q14, [x2, #(-16*4 + 1*16)] // ...................~....................................................................................................................................'..................*.. + // str q15, [x2, #(-16*4 + 2*16)] // .............~..........................................................................................................................................'............*........ + // str q16, [x2, #(-16*4 + 3*16)] // ................~.......................................................................................................................................'...............*..... + // add x1, x1, #64 // .....................................................................................................................................................e..'..................... + // add x2, x2, #64 // .....................~..................................................................................................................................'....................* sub count, count, #1 cbnz count, layer45678_start - mul v16.4S, v31.4S, v20.S[0] // *........... - str q9, [x1, #-32] // .*.......... - add x1, x1, #64 // ..*......... - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v2.4S, v19.4S, v20.S[1] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v22.4S, v15.4S, v20.S[0] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v16.4S, v4.4S, v8.S[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ 
- // gap // ............ - mls v22.4S, v11.4S, v8.S[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v23.4S, v2.4S, v8.S[0] // .........*.. - // gap // ............ - // gap // ............ - str q16, [x2, #-16] // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - str q22, [x2, #-48] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - str q23, [x2, #-32] // ..........*. - add x2, x2, #64 // ...........* - // gap // ............ - - // original source code - // mul v30.4S, v31.4S, v20.S[0] // *........... - // str q9, [x1, #-32] // .*.......... - // add x1, x1, #64 // ..*......... - // sqrdmulh v0.4S, v19.4S, v20.S[1] // ...*........ - // mul v1.4S, v15.4S, v20.S[0] // ....*....... - // mls v30.4S, v4.4S, v8.S[0] // .....*...... - // mls v1.4S, v11.4S, v8.S[0] // ......*..... - // str q30, [x2, #-16] // ........*... - // str q1, [x2, #-48] // .........*.. - // mls v23.4S, v0.4S, v8.S[0] // .......*.... - // str q23, [x2, #-32] // ..........*. - // add x2, x2, #64 // ...........* + // Instructions: 7 + // Expected cycles: 10 + // Expected IPC: 0.70 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mls v9.4S, v2.4S, v8.S[0] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v3.4S, v27.4S, v8.S[0] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + mls v13.4S, v31.4S, v8.S[0] // *............................. + // gap // .............................. + // gap // .............................. + str q9, [x2, #-48] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q3, [x2, #-16] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q13, [x2, #-32] // ...*.......................... + // gap // .............................. + add x2, x2, #64 // ......*....................... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v13.4S, v31.4S, v8.S[0] // ..*............................ + // mls v3.4S, v27.4S, v8.S[0] // .*............................. + // mls v9.4S, v2.4S, v8.S[0] // *.............................. + // str q13, [x2, #-32] // .....*......................... + // str q3, [x2, #-16] // ....*.......................... + // str q9, [x2, #-48] // ...*........................... + // add x2, x2, #64 // ......*........................ // ----------------------------------------------------------------------------- @@ -1523,960 +1551,999 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q4, [x0, #768] // *............ - ldr q20, [x0, #896] // .....*....... - // gap // ............. - ldr q12, [x0, #256] // .*........... - // gap // ............. - // gap // ............. - ldr q11, [x0, #384] // ..*.......... - // gap // ............. - // gap // ............. - ldr q28, [x0, #512] // ...*......... - // gap // ............. - // gap // ............. - sub v15.4S, v4.4S, v20.4S // ......*...... 
- ldr q24, [x0, #640] // ....*........ - // gap // ............. - add v18.4S, v4.4S, v20.4S // .......*..... - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v9.4S, v15.4S, v3.S[1] // .........*... - // gap // ............. - // gap // ............. - sub v29.4S, v28.4S, v24.4S // ........*.... - // gap // ............. - // gap // ............. - mul v5.4S, v15.4S, v3.S[0] // ..........*.. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - // gap // ............. - mls v5.4S, v9.4S, v8.S[0] // ............* - // gap // ............. - // gap // ............. - - // original source code - // ldr q13, [x0, #768] // *............ - // ldr q12, [x0, #256] // ..*.......... - // ldr q11, [x0, #384] // ...*......... - // ldr q28, [x0, #512] // ....*........ - // ldr q24, [x0, #640] // ......*...... - // ldr q14, [x0, #896] // .*........... - // sub v6.4S, v13.4S, v14.4S // .....*....... - // add v18.4S, v13.4S, v14.4S // .......*..... - // sub v29.4S, v28.4S, v24.4S // .........*... - // sqrdmulh v19.4S, v6.4S, v3.S[1] // ........*.... - // mul v5.4S, v6.4S, v3.S[0] // ..........*.. - // sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. - // mls v5.4S, v19.4S, v8.S[0] // ............* + // Instructions: 13 + // Expected cycles: 14 + // Expected IPC: 0.93 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #896] // *............................. + // gap // .............................. + ldr q20, [x0, #768] // .....*........................ + ldr q6, [x0, #256] // .*............................ + // gap // .............................. 
+ // gap // .............................. + ldr q27, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q12, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + add v5.4S, v20.4S, v18.4S // .......*...................... + sub v18.4S, v20.4S, v18.4S // ......*....................... + ldr q14, [x0, #640] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v18.4S, v3.S[1] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v11.4S, v12.4S, v14.4S // ........*..................... + mul v9.4S, v18.4S, v3.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v23.4S, v11.4S, v2.S[2] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v9.4S, v20.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q21, [x0, #896] // *.............................. + // ldr q6, [x0, #256] // ..*............................ 
+ // ldr q27, [x0, #384] // ...*........................... + // ldr q12, [x0, #512] // ....*.......................... + // ldr q14, [x0, #640] // .......*....................... + // ldr q16, [x0, #768] // .*............................. + // sub v28.4S, v16.4S, v21.4S // ......*........................ + // add v5.4S, v16.4S, v21.4S // .....*......................... + // sub v11.4S, v12.4S, v14.4S // .........*..................... + // sqrdmulh v13.4S, v28.4S, v3.S[1] // ........*...................... + // mul v9.4S, v28.4S, v3.S[0] // ..........*.................... + // mul v23.4S, v11.4S, v2.S[2] // ...........*................... + // mls v9.4S, v13.4S, v8.S[0] // ............*.................. sub count, count, #1 layer123_start: - ldr q16, [x0, #0] // *....................................................................................................................... - ldr q4, [x0, #128] // .*...................................................................................................................... - sub v7.4S, v12.4S, v11.4S // .............*.......................................................................................................... - mul v19.4S, v29.4S, v2.S[2] // ....................*................................................................................................... - add v22.4S, v12.4S, v11.4S // ..............*......................................................................................................... - ldr q13, [x0, #784] // ......e................................................................................................................. - add v21.4S, v28.4S, v24.4S // ...................*.................................................................................................... - ldr q12, [x0, #272] // ..e..................................................................................................................... 
- ldr q11, [x0, #400] // ...e.................................................................................................................... - mul v17.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ - ldr q28, [x0, #528] // ....e................................................................................................................... - ldr q24, [x0, #656] // .....e.................................................................................................................. - ldr q14, [x0, #912] // .......e................................................................................................................ - sub v29.4S, v16.4S, v4.4S // ........*............................................................................................................... + // Instructions: 120 + // Expected cycles: 112 + // Expected IPC: 1.07 + // + // Wall time: 10.55s + // User time: 10.55s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + ldr q20, [x0, #0] // *....................................................................................................................... + ldr q18, [x0, #128] // .*...................................................................................................................... + sub v19.4S, v6.4S, v27.4S // .............*.......................................................................................................... + sqrdmulh v22.4S, v11.4S, v2.S[3] // ....................*................................................................................................... 
+ add v11.4S, v6.4S, v27.4S // ..............*......................................................................................................... + ldr q21, [x0, #912] // .......e................................................................................................................ + add v13.4S, v12.4S, v14.4S // ...................*.................................................................................................... + ldr q6, [x0, #272] // ..e..................................................................................................................... + ldr q27, [x0, #400] // ...e.................................................................................................................... + sqrdmulh v17.4S, v19.4S, v2.S[1] // ...............*........................................................................................................ + ldr q12, [x0, #528] // ....e................................................................................................................... + ldr q14, [x0, #656] // .....e.................................................................................................................. + ldr q16, [x0, #784] // ......e................................................................................................................. + sub v10.4S, v20.4S, v18.4S // ........*............................................................................................................... // gap // ........................................................................................................................ - add v16.4S, v16.4S, v4.4S // .........*.............................................................................................................. - sqrdmulh v4.4S, v7.4S, v2.S[1] // ................*....................................................................................................... 
+ add v20.4S, v20.4S, v18.4S // .........*.............................................................................................................. + mul v18.4S, v19.4S, v2.S[0] // ................*....................................................................................................... // gap // ........................................................................................................................ - sub v7.4S, v21.4S, v18.4S // ......................................*................................................................................. + sub v19.4S, v13.4S, v5.4S // ......................................*................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v20.4S, v8.S[0] // ......................*................................................................................................. - add v21.4S, v21.4S, v18.4S // .......................................*................................................................................ + mls v23.4S, v22.4S, v8.S[0] // ......................*................................................................................................. + add v22.4S, v13.4S, v5.4S // .......................................*................................................................................ // gap // ........................................................................................................................ - sub v20.4S, v16.4S, v22.4S // ............................*........................................................................................... 
+ sub v13.4S, v20.4S, v11.4S // ............................*........................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v16.4S, v16.4S, v22.4S // .............................*.......................................................................................... - mul v22.4S, v29.4S, v1.S[2] // ..........*............................................................................................................. + add v20.4S, v20.4S, v11.4S // .............................*.......................................................................................... + sqrdmulh v11.4S, v10.4S, v1.S[3] // ..........*............................................................................................................. // gap // ........................................................................................................................ - sub v6.4S, v13.4S, v14.4S // .......................e................................................................................................ + sub v28.4S, v16.4S, v21.4S // .......................e................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v18.4S, v13.4S, v14.4S // ........................e............................................................................................... 
- mls v17.4S, v4.4S, v8.S[0] // .................*...................................................................................................... + add v5.4S, v16.4S, v21.4S // ........................e............................................................................................... + mls v18.4S, v17.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - sub v4.4S, v19.4S, v5.4S // ...........................................*............................................................................ + sub v17.4S, v23.4S, v9.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v13.4S, v29.4S, v1.S[3] // ...........*............................................................................................................ - add v19.4S, v19.4S, v5.4S // ............................................*........................................................................... + mul v16.4S, v10.4S, v1.S[2] // ...........*............................................................................................................ + add v10.4S, v23.4S, v9.4S // ............................................*........................................................................... // gap // ........................................................................................................................ 
- sub v14.4S, v16.4S, v21.4S // ................................................*....................................................................... + sub v23.4S, v20.4S, v22.4S // ................................................*....................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v16.4S, v16.4S, v21.4S // .................................................*...................................................................... - mul v21.4S, v20.4S, v0.S[2] // ..............................*......................................................................................... + add v20.4S, v20.4S, v22.4S // .................................................*...................................................................... + mls v16.4S, v11.4S, v8.S[0] // ............*........................................................................................................... // gap // ........................................................................................................................ - sub v29.4S, v28.4S, v24.4S // ..................e..................................................................................................... + sub v11.4S, v12.4S, v14.4S // ..................e..................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................*........................................................................................ + sqrdmulh v22.4S, v13.4S, v0.S[3] // ..............................*......................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v5.4S, v7.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v9.4S, v19.4S, v1.S[1] // ........................................*............................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v7.4S, v16.4S, v18.4S // .................................*...................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
+ mul v19.4S, v19.4S, v1.S[0] // .........................................*.............................................................................. + add v18.4S, v16.4S, v18.4S // ..................................*..................................................................................... // gap // ........................................................................................................................ - sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v13.4S, v13.4S, v0.S[2] // ...............................*........................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v22.4S, v13.4S, v8.S[0] // ............*........................................................................................................... + sub v16.4S, v18.4S, v10.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
+ add v18.4S, v18.4S, v10.4S // ......................................................*................................................................. + mls v13.4S, v22.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v21.4S, v20.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ + sqrdmulh v22.4S, v7.4S, v0.S[3] // ...................................*.................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v5.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ 
+ mul v10.4S, v7.4S, v0.S[2] // ....................................*................................................................................... // gap // ........................................................................................................................ - sub v7.4S, v22.4S, v17.4S // .................................*...................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v22.4S, v22.4S, v17.4S // ..................................*..................................................................................... - mul v13.4S, v4.4S, v1.S[0] // .............................................*.......................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // ..........................................*............................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v17.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - sub v20.4S, v22.4S, v19.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ + mls v10.4S, v22.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ - add v22.4S, v22.4S, v19.4S // ......................................................*................................................................. - sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... // gap // ........................................................................................................................ - sub v19.4S, v21.4S, v5.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v4.4S, v1.S[1] // ..............................................*......................................................................... - add v21.4S, v21.4S, v5.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ 
+ sqrdmulh v22.4S, v17.4S, v1.S[1] // .............................................*.......................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sub v9.4S, v13.4S, v19.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ - mul v5.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... // gap // ........................................................................................................................ + mul v17.4S, v17.4S, v1.S[0] // ..............................................*......................................................................... + add v19.4S, v13.4S, v19.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v17.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. 
+ sqrdmulh v13.4S, v23.4S, v0.S[1] // ..................................................*..................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v13.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ + mls v17.4S, v22.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... 
+ mul v22.4S, v23.4S, v0.S[0] // ...................................................*.................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v7.4S, v20.4S, v0.S[0] // .......................................................*................................................................ + mls v22.4S, v13.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v14.4S, v17.4S, v13.4S // ...............................................................*........................................................ + sub v13.4S, v10.4S, v17.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- add v13.4S, v17.4S, v13.4S // ................................................................*....................................................... - mul v17.4S, v16.4S, v25.4S // ........................................................................................*............................... + add v17.4S, v10.4S, v17.4S // ................................................................*....................................................... + sqrdmulh v10.4S, v20.4S, v26.4S // ........................................................................................*............................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v16.4S, v16.4S, v26.4S // .........................................................................................*.............................. + mul v20.4S, v20.4S, v25.4S // .........................................................................................*.............................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v23.4S, v31.4S, v22.4S // ....................................................................*................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + cmge v7.4S, v22.4S, v30.4S // .....................................................................*.................................................. + sqrdmulh v24.4S, v16.4S, v0.S[1] // .......................................................*................................................................ // gap // ........................................................................................................................ - mls v5.4S, v4.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ........................................................*............................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v4.4S, v20.4S, v0.S[1] // ........................................................*............................................................... + sub v23.4S, v23.4S, v7.4S // ......................................................................*................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + sqrdmulh v7.4S, v9.4S, v0.S[1] // ............................................................*........................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v20.4S, v19.4S, v0.S[0] // ............................................................*........................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v10.4S, v31.4S, v5.4S // ....................................................................*................................................... + mls v16.4S, v24.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v19.4S, v0.S[1] // .............................................................*.......................................................... - cmge v9.4S, v5.4S, v30.4S // .....................................................................*.................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mul v9.4S, v9.4S, v0.S[0] // .............................................................*.......................................................... // gap // ........................................................................................................................ - mls v7.4S, v4.4S, v8.S[0] // .........................................................*.............................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v4.4S, v10.4S, v9.4S // ......................................................................*................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v10.4S, v14.4S, v0.S[0] // .................................................................*...................................................... + mls v9.4S, v7.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v16.4S // ........................................................................*............................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v24.4S, v16.4S, v30.4S // .........................................................................*.............................................. + sqrdmulh v4.4S, v13.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ - mls v20.4S, v19.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v19.4S, v31.4S, v7.4S // ........................................................................*............................................... // gap // ........................................................................................................................ + mul v13.4S, v13.4S, v0.S[0] // ..................................................................*..................................................... // gap // ........................................................................................................................ 
- sqrdmulh v14.4S, v14.4S, v0.S[1] // ..................................................................*..................................................... - cmge v9.4S, v7.4S, v30.4S // .........................................................................*.............................................. // gap // ........................................................................................................................ + sub v7.4S, v7.4S, v24.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v22.4S, v23.4S, v8.4S // .......................................................................*................................................ + cmge v23.4S, v31.4S, v9.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - mls v5.4S, v4.4S, v8.4S // .......................................................................*................................................ + cmge v24.4S, v9.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- sub v4.4S, v19.4S, v9.4S // ..........................................................................*............................................. + mls v13.4S, v4.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v17.4S, v16.4S, v8.S[0] // ..........................................................................................*............................. - cmge v16.4S, v31.4S, v20.4S // ............................................................................*........................................... // gap // ........................................................................................................................ - cmge v19.4S, v20.4S, v30.4S // .............................................................................*.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v10.4S, v14.4S, v8.S[0] // ...................................................................*.................................................... + mls v16.4S, v7.4S, v8.4S // ...........................................................................*............................................ + sub v23.4S, v23.4S, v24.4S // ..............................................................................*......................................... 
// gap // ........................................................................................................................ + str q22, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ - str q5, [x0, #512] // ....................................................................................*................................... // gap // ........................................................................................................................ + mls v20.4S, v10.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - mls v7.4S, v4.4S, v8.4S // ...........................................................................*............................................ - sub v16.4S, v16.4S, v19.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ - cmge v4.4S, v31.4S, v17.4S // ....................................................................................................*................... + cmge v22.4S, v31.4S, v13.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- cmge v19.4S, v17.4S, v30.4S // .....................................................................................................*.................. - mul v14.4S, v22.4S, v25.4S // ...........................................................................................*............................ + mls v9.4S, v23.4S, v8.4S // ...............................................................................*........................................ + cmge v10.4S, v13.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ - cmge v5.4S, v31.4S, v10.4S // ................................................................................*....................................... + str q16, [x0, #640] // .....................................................................................*.................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v20.4S, v16.4S, v8.4S // ...............................................................................*........................................ - cmge v16.4S, v10.4S, v30.4S // .................................................................................*...................................... + sqrdmulh v16.4S, v18.4S, v26.4S // ...........................................................................................*............................ // gap // ........................................................................................................................ 
- str q7, [x0, #640] // .....................................................................................*.................................. - sub v4.4S, v4.4S, v19.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - sqrdmulh v7.4S, v22.4S, v26.4S // ............................................................................................*........................... + sub v22.4S, v22.4S, v10.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sub v16.4S, v5.4S, v16.4S // ..................................................................................*..................................... + mul v18.4S, v18.4S, v25.4S // ............................................................................................*........................... + cmge v10.4S, v31.4S, v20.4S // ....................................................................................................*................... // gap // ........................................................................................................................ + str q9, [x0, #768] // ......................................................................................*................................. + cmge v23.4S, v20.4S, v30.4S // .....................................................................................................*.................. 
// gap // ........................................................................................................................ - mul v22.4S, v21.4S, v25.4S // ..............................................................................................*......................... + sqrdmulh v9.4S, v19.4S, v26.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q20, [x0, #768] // ......................................................................................*................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v21.4S, v26.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ + mls v18.4S, v16.4S, v8.S[0] // .............................................................................................*.......................... + sub v16.4S, v10.4S, v23.4S // ......................................................................................................*................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v14.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... + mul v19.4S, v19.4S, v25.4S // ...............................................................................................*........................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................................*..................... + sqrdmulh v10.4S, v17.4S, v26.4S // .................................................................................................*...................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ + cmge v23.4S, v31.4S, v18.4S // ........................................................................................................*............... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // ................................................................................................*....................... + cmge v9.4S, v18.4S, v30.4S // .........................................................................................................*.............. // gap // ........................................................................................................................ - mls v22.4S, v19.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v19.4S, v31.4S, v14.4S // ........................................................................................................*............... // gap // ........................................................................................................................ + mul v17.4S, v17.4S, v25.4S // ..................................................................................................*..................... // gap // ........................................................................................................................ - mul v21.4S, v13.4S, v25.4S // .................................................................................................*...................... - cmge v13.4S, v14.4S, v30.4S // .........................................................................................................*.............. 
// gap // ........................................................................................................................ + sub v23.4S, v23.4S, v9.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v17.4S, v10.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ - mls v21.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ + cmge v10.4S, v31.4S, v19.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - sub v7.4S, v19.4S, v13.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ + mls v13.4S, v22.4S, v8.4S // ...................................................................................*.................................... + cmge v22.4S, v19.4S, v30.4S // .............................................................................................................*.......... 
// gap // ........................................................................................................................ - mls v10.4S, v16.4S, v8.4S // ...................................................................................*.................................... - cmge v16.4S, v31.4S, v22.4S // ............................................................................................................*........... // gap // ........................................................................................................................ - cmge v19.4S, v22.4S, v30.4S // .............................................................................................................*.......... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v17.4S, v4.4S, v8.4S // .......................................................................................................*................ + mls v20.4S, v16.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v4.4S, v31.4S, v21.4S // ................................................................................................................*....... + cmge v16.4S, v31.4S, v17.4S // ................................................................................................................*....... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mls v14.4S, v7.4S, v8.4S // ...........................................................................................................*............ - cmge v7.4S, v21.4S, v30.4S // .................................................................................................................*...... + mls v18.4S, v23.4S, v8.4S // ...........................................................................................................*............ + cmge v23.4S, v17.4S, v30.4S // .................................................................................................................*...... // gap // ........................................................................................................................ - str q10, [x0, #896] // .......................................................................................*................................ - sub v16.4S, v16.4S, v19.4S // ..............................................................................................................*......... + sub v22.4S, v10.4S, v22.4S // ..............................................................................................................*......... + str q13, [x0, #896] // .......................................................................................*................................ // gap // ........................................................................................................................ - sqrdmulh v19.4S, v6.4S, v3.S[1] // ..........................e............................................................................................. + sqrdmulh v13.4S, v28.4S, v3.S[1] // .........................e.............................................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q17, [x0], #(16) // ....................................................................................................................*... - sub v4.4S, v4.4S, v7.4S // ..................................................................................................................*..... + str q20, [x0], #(16) // ....................................................................................................................*... + sub v20.4S, v16.4S, v23.4S // ..................................................................................................................*..... // gap // ........................................................................................................................ - mls v22.4S, v16.4S, v8.4S // ...............................................................................................................*........ + mls v19.4S, v22.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q14, [x0, #112] // .....................................................................................................................*.. + str q18, [x0, #112] // .....................................................................................................................*.. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mls v21.4S, v4.4S, v8.4S // ...................................................................................................................*.... + mls v17.4S, v20.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v5.4S, v6.4S, v3.S[0] // .........................e.............................................................................................. + mul v9.4S, v28.4S, v3.S[0] // ..........................e............................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q22, [x0, #240] // ......................................................................................................................*. + str q19, [x0, #240] // ......................................................................................................................*. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v20.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + mul v23.4S, v11.4S, v2.S[2] // .....................e.................................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q21, [x0, #368] // .......................................................................................................................* + str q17, [x0, #368] // .......................................................................................................................* // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v5.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ + mls v9.4S, v13.4S, v8.S[0] // ...........................e............................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- // original source code + // ------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------- // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... - // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*..................................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................|......e............................................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................|.......e.............................................................................................................. - // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................|.........e............................................................................................................ 
- // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................|..........e........................................................................................................... - // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................|....e................................................................................................................. - // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................|...........e.......................................................................................................... - // sub v24.4s, v9.4s, v10.4s // ........*..........................................................................................................|............*......................................................................................................... - // add v9.4s, v9.4s, v10.4s // .........*.........................................................................................................|.............*........................................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ................*..................................................................................................|....................*................................................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.............................................................................................|.........................*............................................................................................ 
- // mls v10.4s, v24.4s, v8.s[0] // ..............................*....................................................................................|..................................*................................................................................... - // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................|.*.................................................................................................................... - // add v11.4s, v11.4s, v12.4s // ...................................................................................................................|...*.................................................................................................................. - // mul v12.4s, v24.4s, v2.s[0] // ....*..............................................................................................................|........*............................................................................................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*........................................................................................................|..............*....................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ...................*...............................................................................................|.......................*.............................................................................................. - // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................|..............................e....................................................................................... 
- // add v13.4s, v13.4s, v14.4s // .*.................................................................................................................|.....*................................................................................................................ - // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................|..*................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..|....................................................................................................................e. - // mls v14.4s, v24.4s, v8.s[0] // ............*......................................................................................................|................*..................................................................................................... - // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................|.....................e................................................................................................ - // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................|......................e............................................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....|..................................................................................................................e... 
- // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........|............................................................................................................e......... - // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e|...................................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ..............*....................................................................................................|..................*................................................................................................... - // add v9.4s, v9.4s, v11.4s // ...............*...................................................................................................|...................*.................................................................................................. - // mul v11.4s, v24.4s, v0.s[2] // .........................*.........................................................................................|.............................*........................................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*.......................................................................................|...............................*...................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*.................................................................................. 
- // sub v24.4s, v10.4s, v12.4s // .................................*.................................................................................|.....................................*................................................................................ - // add v10.4s, v10.4s, v12.4s // ..................................*................................................................................|......................................*............................................................................... - // mul v12.4s, v24.4s, v0.s[2] // ....................................*..............................................................................|........................................*............................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*.......................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ............................................*......................................................................|................................................*..................................................................... - // sub v24.4s, v13.4s, v15.4s // ...........*.......................................................................................................|...............*...................................................................................................... - // add v13.4s, v13.4s, v15.4s // .............*.....................................................................................................|.................*.................................................................................................... 
- // mul v15.4s, v24.4s, v1.s[0] // ............................*......................................................................................|................................*..................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.....................................................................................|.................................*.................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ................................*..................................................................................|....................................*................................................................................. - // sub v24.4s, v14.4s, v16.4s // ....................*..............................................................................................|........................*............................................................................................. - // add v14.4s, v14.4s, v16.4s // ......................*............................................................................................|..........................*........................................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ...................................*...............................................................................|.......................................*.............................................................................. - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.........................................................................|.............................................*........................................................................ 
- // mls v16.4s, v24.4s, v8.s[0] // .............................................*.....................................................................|.................................................*.................................................................... - // sub v24.4s, v9.4s, v13.4s // .......................*...........................................................................................|...........................*.......................................................................................... - // add v9.4s, v9.4s, v13.4s // ........................*..........................................................................................|............................*......................................................................................... - // mul v13.4s, v24.4s, v0.s[0] // ...........................................*.......................................................................|...............................................*...................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*....................................................................|..................................................*................................................................... - // mls v13.4s, v24.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................. - // sub v24.4s, v10.4s, v14.4s // .....................................*.............................................................................|.........................................*............................................................................ 
- // add v10.4s, v10.4s, v14.4s // ......................................*............................................................................|..........................................*........................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...................................................................|...................................................*.................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.............................................................|.........................................................*............................................................ - // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*........................................................|..............................................................*....................................................... - // sub v24.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*......................................................................... - // add v11.4s, v11.4s, v15.4s // ..........................................*........................................................................|..............................................*....................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................................*............................................................|..........................................................*........................................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..........................................................|............................................................*......................................................... - // mls v15.4s, v24.4s, v8.s[0] // .............................................................*.....................................................|.................................................................*.................................................... - // sub v24.4s, v12.4s, v16.4s // ................................................*..................................................................|....................................................*................................................................. - // add v12.4s, v12.4s, v16.4s // .................................................*.................................................................|.....................................................*................................................................ - // mul v16.4s, v24.4s, v0.s[0] // ............................................................*......................................................|................................................................*..................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................*...................................................|...................................................................*.................................................. - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................*............................................|..........................................................................*........................................... 
- // cmge v27.4s, v31.4s, v13.4s // .......................................................*...........................................................|...........................................................*.......................................................... - // cmge v28.4s, v13.4s, v30.4s // .........................................................*.........................................................|.............................................................*........................................................ - // sub v28.4s, v27.4s, v28.4s // ...........................................................*.......................................................|...............................................................*...................................................... - // mls v13.4s, v28.4s, v8.4s // .................................................................*.................................................|.....................................................................*................................................ - // cmge v27.4s, v31.4s, v14.4s // ..............................................................*....................................................|..................................................................*................................................... - // cmge v28.4s, v14.4s, v30.4s // ................................................................*..................................................|....................................................................*................................................. - // sub v28.4s, v27.4s, v28.4s // ..................................................................*................................................|......................................................................*............................................... 
- // mls v14.4s, v28.4s, v8.4s // ........................................................................*..........................................|............................................................................*......................................... - // cmge v27.4s, v31.4s, v15.4s // ....................................................................*..............................................|........................................................................*............................................. - // cmge v28.4s, v15.4s, v30.4s // .....................................................................*.............................................|.........................................................................*............................................ - // sub v28.4s, v27.4s, v28.4s // .........................................................................*.........................................|.............................................................................*........................................ - // mls v15.4s, v28.4s, v8.4s // ..............................................................................*....................................|..................................................................................*................................... - // cmge v27.4s, v31.4s, v16.4s // .............................................................................*.....................................|.................................................................................*.................................... - // cmge v28.4s, v16.4s, v30.4s // ...............................................................................*...................................|...................................................................................*.................................. 
- // sub v28.4s, v27.4s, v28.4s // ...................................................................................*...............................|.......................................................................................*.............................. - // mls v16.4s, v28.4s, v8.4s // ...............................................................................................*...................|...................................................................................................*.................. - // str q13, [x0, #(4*(1024/8))] // .......................................................................*...........................................|...........................................................................*.......................................... - // str q14, [x0, #(5*(1024/8))] // ................................................................................*..................................|....................................................................................*................................. - // str q15, [x0, #(6*(1024/8))] // .....................................................................................*.............................|.........................................................................................*............................ - // str q16, [x0, #(7*(1024/8))] // ......................................................................................................*............|..........................................................................................................*........... - // mul v13.4s, v9.4s, v25.4s // ..................................................*................................................................|......................................................*............................................................... 
- // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*...............................................................|.......................................................*.............................................................. - // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*.............................................. - // mul v14.4s, v10.4s, v25.4s // ............................................................................*......................................|................................................................................*..................................... - // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................*................................|......................................................................................*............................... - // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................*...........................|...........................................................................................*.......................... - // mul v15.4s, v11.4s, v25.4s // ....................................................................................*..............................|........................................................................................*............................. - // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................*............................|..........................................................................................*........................... 
- // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................*.........................|.............................................................................................*........................ - // mul v16.4s, v12.4s, v25.4s // ...........................................................................................*.......................|...............................................................................................*...................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................*..........................|............................................................................................*......................... - // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................*.....................|.................................................................................................*.................... - // cmge v27.4s, v31.4s, v13.4s // ..........................................................................*........................................|..............................................................................*....................................... - // cmge v28.4s, v13.4s, v30.4s // ...........................................................................*.......................................|...............................................................................*...................................... - // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*................................ 
- // mls v13.4s, v28.4s, v8.4s // ..................................................................................................*................|......................................................................................................*............... - // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................*........................|..............................................................................................*....................... - // cmge v28.4s, v14.4s, v30.4s // ............................................................................................*......................|................................................................................................*..................... - // sub v28.4s, v27.4s, v28.4s // ..............................................................................................*....................|..................................................................................................*................... - // mls v14.4s, v28.4s, v8.4s // ....................................................................................................*..............|........................................................................................................*............. - // cmge v27.4s, v31.4s, v15.4s // ................................................................................................*..................|....................................................................................................*................. - // cmge v28.4s, v15.4s, v30.4s // .................................................................................................*.................|.....................................................................................................*................ 
- // sub v28.4s, v27.4s, v28.4s // .......................................................................................................*...........|...........................................................................................................*.......... - // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................*.......|...............................................................................................................*...... - // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................*...............|.......................................................................................................*.............. - // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................*.............|.........................................................................................................*............ - // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*........|..............................................................................................................*....... - // mls v16.4s, v28.4s, v8.4s // .............................................................................................................*.....|.................................................................................................................*.... - // str q13, [x0], #(16) // .........................................................................................................*.........|.............................................................................................................*........ 
- // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................*......|................................................................................................................*..... - // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................*...|...................................................................................................................*.. - // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................*.|.....................................................................................................................* + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................'*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................'......~............................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................'.......~.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................'.........~............................................................................................................ 
+ // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................'..........~........................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .......e...........................................................................................................'...........~.......................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // e..................................................................................................................'....~................................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ........~..........................................................................................................'............*......................................................................................................... + // add v9.4s, v9.4s, v10.4s // .........~.........................................................................................................'.............*........................................................................................................ + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ................~..................................................................................................'....................*................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .....................~.............................................................................................'.........................*............................................................................................ 
+ // mls v10.4s, v27.4s, v8.s[0] // .........................~.........................................................................................'.............................*........................................................................................ + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................'.*.................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................................................................................................'...*.................................................................................................................. + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ....~..............................................................................................................'........*............................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ..........~........................................................................................................'..............*....................................................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ...................~...............................................................................................'.......................*.............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................'..............................~....................................................................................... 
+ // add v13.4s, v13.4s, v14.4s // .~.................................................................................................................'.....*................................................................................................................ + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...................................................................................................................'..*................................................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................e..'....................................................................................................................~. + // mls v14.4s, v27.4s, v8.s[0] // ............~......................................................................................................'................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................'.....................~................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................'......................~............................................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ........................................................................................................e..........'............................................................................................................~......... 
+ // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....'..................................................................................................................~... + // mls v16.4s, v27.4s, v8.s[0] // ..................................................................................................................e'...................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............~....................................................................................................'..................*................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............~...................................................................................................'...................*.................................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ...........................~.......................................................................................'...............................*...................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................................~..................................................................................'....................................*................................................................................. + // mls v11.4s, v27.4s, v8.s[0] // ...................................~...............................................................................'.......................................*.............................................................................. 
+ // sub v24.4s, v10.4s, v12.4s // .............................~.....................................................................................'.................................*.................................................................................... + // add v10.4s, v10.4s, v12.4s // ...............................~...................................................................................'...................................*.................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ....................................~..............................................................................'........................................*............................................................................. + // mul v12.4s, v24.4s, v0.s[2] // .....................................~.............................................................................'.........................................*............................................................................ + // mls v12.4s, v27.4s, v8.s[0] // .......................................~...........................................................................'...........................................*.......................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........~.......................................................................................................'...............*...................................................................................................... + // add v13.4s, v13.4s, v15.4s // .............~.....................................................................................................'.................*.................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // ............................~......................................................................................'................................*..................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..............................~....................................................................................'..................................*................................................................................... + // mls v15.4s, v27.4s, v8.s[0] // ......................................~............................................................................'..........................................*........................................................................... + // sub v24.4s, v14.4s, v16.4s // ....................~..............................................................................................'........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................~............................................................................................'..........................*........................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ........................................~..........................................................................'............................................*......................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ..........................................~........................................................................'..............................................*....................................................................... 
+ // mls v16.4s, v27.4s, v8.s[0] // .............................................~.....................................................................'.................................................*.................................................................... + // sub v24.4s, v9.4s, v13.4s // .......................~...........................................................................................'...........................*.......................................................................................... + // add v9.4s, v9.4s, v13.4s // ........................~..........................................................................................'............................*......................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............................................~......................................................................'................................................*..................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ..............................................~....................................................................'..................................................*................................................................... + // mls v13.4s, v27.4s, v8.s[0] // ...............................................~...................................................................'...................................................*.................................................................. + // sub v24.4s, v10.4s, v14.4s // .................................~.................................................................................'.....................................*................................................................................ 
+ // add v10.4s, v10.4s, v14.4s // ..................................~................................................................................'......................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ......................................................~............................................................'..........................................................*........................................................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................~...........................................................'...........................................................*.......................................................... + // mls v14.4s, v27.4s, v8.s[0] // ..........................................................~........................................................'..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // .........................................~.........................................................................'.............................................*........................................................................ + // add v11.4s, v11.4s, v15.4s // ...........................................~.......................................................................'...............................................*...................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........................................................~.........................................................'.............................................................*........................................................ 
+ // mul v15.4s, v24.4s, v0.s[0] // ...........................................................~.......................................................'...............................................................*...................................................... + // mls v15.4s, v27.4s, v8.s[0] // ............................................................~......................................................'................................................................*..................................................... + // sub v24.4s, v12.4s, v16.4s // ................................................~..................................................................'....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................~.................................................................'.....................................................*................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...............................................................~...................................................'...................................................................*.................................................. + // mul v16.4s, v24.4s, v0.s[0] // ................................................................~..................................................'....................................................................*................................................. + // mls v16.4s, v27.4s, v8.s[0] // .....................................................................~.............................................'.........................................................................*............................................ 
+ // cmge v27.4s, v31.4s, v13.4s // ....................................................~..............................................................'........................................................*............................................................. + // cmge v28.4s, v13.4s, v30.4s // .....................................................~.............................................................'.........................................................*............................................................ + // sub v28.4s, v27.4s, v28.4s // ........................................................~..........................................................'............................................................*......................................................... + // mls v13.4s, v28.4s, v8.4s // ..................................................................~................................................'......................................................................*............................................... + // cmge v27.4s, v31.4s, v14.4s // .............................................................~.....................................................'.................................................................*.................................................... + // cmge v28.4s, v14.4s, v30.4s // ..............................................................~....................................................'..................................................................*................................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................~.................................................'.....................................................................*................................................ 
+ // mls v14.4s, v28.4s, v8.4s // ......................................................................~............................................'..........................................................................*........................................... + // cmge v27.4s, v31.4s, v15.4s // ...................................................................~...............................................'.......................................................................*.............................................. + // cmge v28.4s, v15.4s, v30.4s // ....................................................................~..............................................'........................................................................*............................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................~...........................................'...........................................................................*.......................................... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................~.......................................'...............................................................................*...................................... + // cmge v27.4s, v31.4s, v16.4s // ..........................................................................~........................................'..............................................................................*....................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................~......................................'................................................................................*..................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...............................................................................~...................................'...................................................................................*.................................. + // mls v16.4s, v28.4s, v8.4s // ................................................................................................~..................'....................................................................................................*................. + // str q13, [x0, #(4*(1024/8))] // ........................................................................~..........................................'............................................................................*......................................... + // str q14, [x0, #(5*(1024/8))] // .............................................................................~.....................................'.................................................................................*.................................... + // str q15, [x0, #(6*(1024/8))] // ..................................................................................~................................'......................................................................................*............................... + // str q16, [x0, #(7*(1024/8))] // .......................................................................................................~...........'...........................................................................................................*.......... + // sqrdmulh v27.4s, v9.4s, v26.4s // ..................................................~................................................................'......................................................*............................................................... 
+ // mul v9.4s, v9.4s, v25.4s // ...................................................~...............................................................'.......................................................*.............................................................. + // mls v9.4s, v27.4s, v8.s[0] // .........................................................................~.........................................'.............................................................................*........................................ + // sqrdmulh v27.4s, v10.4s, v26.4s // ..............................................................................~....................................'..................................................................................*................................... + // mul v10.4s, v10.4s, v25.4s // ................................................................................~..................................'....................................................................................*................................. + // mls v10.4s, v27.4s, v8.s[0] // .....................................................................................~.............................'.........................................................................................*............................ + // sqrdmulh v27.4s, v11.4s, v26.4s // ....................................................................................~..............................'........................................................................................*............................. + // mul v11.4s, v11.4s, v25.4s // .......................................................................................~...........................'...........................................................................................*.......................... 
+ // mls v11.4s, v27.4s, v8.s[0] // ..........................................................................................~........................'..............................................................................................*....................... + // sqrdmulh v27.4s, v12.4s, v26.4s // ........................................................................................~..........................'............................................................................................*......................... + // mul v12.4s, v12.4s, v25.4s // ............................................................................................~......................'................................................................................................*..................... + // mls v12.4s, v27.4s, v8.s[0] // ..............................................................................................~....................'..................................................................................................*................... + // cmge v27.4s, v31.4s, v9.4s // .................................................................................~.................................'.....................................................................................*................................ + // cmge v28.4s, v9.4s, v30.4s // ...................................................................................~...............................'.......................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ......................................................................................~............................'..........................................................................................*........................... 
+ // mls v9.4s, v28.4s, v8.4s // ..................................................................................................~................'......................................................................................................*............... + // cmge v27.4s, v31.4s, v10.4s // .........................................................................................~.........................'.............................................................................................*........................ + // cmge v28.4s, v10.4s, v30.4s // ...........................................................................................~.......................'...............................................................................................*...................... + // sub v28.4s, v27.4s, v28.4s // .............................................................................................~.....................'.................................................................................................*.................... + // mls v10.4s, v28.4s, v8.4s // ....................................................................................................~..............'........................................................................................................*............. + // cmge v27.4s, v31.4s, v11.4s // ...............................................................................................~...................'...................................................................................................*.................. + // cmge v28.4s, v11.4s, v30.4s // .................................................................................................~.................'.....................................................................................................*................ 
+ // sub v28.4s, v27.4s, v28.4s // ......................................................................................................~............'..........................................................................................................*........... + // mls v11.4s, v28.4s, v8.4s // ...........................................................................................................~.......'...............................................................................................................*...... + // cmge v27.4s, v31.4s, v12.4s // ...................................................................................................~...............'.......................................................................................................*.............. + // cmge v28.4s, v12.4s, v30.4s // .....................................................................................................~.............'.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................~........'..............................................................................................................*....... + // mls v12.4s, v28.4s, v8.4s // .............................................................................................................~.....'.................................................................................................................*.... + // str q9, [x0], #(16) // .........................................................................................................~.........'.............................................................................................................*........ 
+ // str q10, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................~......'................................................................................................................*..... + // str q11, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................~...'...................................................................................................................*.. + // str q12, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................~.'.....................................................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v22.4S, v12.4S, v11.4S // ..*........................................................................................................ - mul v16.4S, v29.4S, v2.S[2] // ...*....................................................................................................... - ldr q4, [x0, #0] // *.......................................................................................................... - add v7.4S, v12.4S, v11.4S // ....*...................................................................................................... - ldr q19, [x0, #128] // .*......................................................................................................... 
+ // Instructions: 107 + // Expected cycles: 109 + // Expected IPC: 0.98 + // + // Wall time: 2.25s + // User time: 2.25s + // + // ------------------------------------------- original position --------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + sub v20.4S, v6.4S, v27.4S // ..*........................................................................................................ + add v18.4S, v6.4S, v27.4S // ....*...................................................................................................... + ldr q19, [x0, #0] // *.......................................................................................................... + sqrdmulh v11.4S, v11.4S, v2.S[3] // ...*....................................................................................................... + add v22.4S, v12.4S, v14.4S // .....*..................................................................................................... + ldr q13, [x0, #128] // .*......................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v6.4S, v20.4S, v2.S[1] // ......*.................................................................................................... // gap // ........................................................................................................... - add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... 
- mls v16.4S, v20.4S, v8.S[0] // ...........*............................................................................................... // gap // ........................................................................................................... + sub v27.4S, v22.4S, v5.4S // ..........*................................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v20.4S, v20.4S, v2.S[0] // .........*................................................................................................. + add v22.4S, v22.4S, v5.4S // ............*.............................................................................................. // gap // ........................................................................................................... - mul v12.4S, v22.4S, v2.S[0] // ......*.................................................................................................... + add v17.4S, v19.4S, v13.4S // ........*.................................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v11.4S, v21.4S, v18.4S // ..........*................................................................................................ + mls v23.4S, v11.4S, v8.S[0] // ...........*............................................................................................... + sub v19.4S, v19.4S, v13.4S // .......*................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - add v21.4S, v21.4S, v18.4S // ............*.............................................................................................. - sqrdmulh v22.4S, v22.4S, v2.S[1] // .........*................................................................................................. // gap // ........................................................................................................... - sub v13.4S, v16.4S, v5.4S // .................*......................................................................................... // gap // ........................................................................................................... + mls v20.4S, v6.4S, v8.S[0] // ................*.......................................................................................... + sub v11.4S, v17.4S, v18.4S // .............*............................................................................................. // gap // ........................................................................................................... - add v16.4S, v16.4S, v5.4S // ...................*....................................................................................... - mul v17.4S, v11.4S, v1.S[0] // ........................*.................................................................................. + add v18.4S, v17.4S, v18.4S // ..............*............................................................................................ // gap // ........................................................................................................... - add v28.4S, v4.4S, v19.4S // ........*.................................................................................................. 
// gap // ........................................................................................................... + sqrdmulh v13.4S, v19.4S, v1.S[3] // ...............*........................................................................................... // gap // ........................................................................................................... - sub v4.4S, v4.4S, v19.4S // .......*................................................................................................... - sqrdmulh v19.4S, v11.4S, v1.S[1] // .........................*................................................................................. // gap // ........................................................................................................... + sub v6.4S, v23.4S, v9.4S // .................*......................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + add v17.4S, v23.4S, v9.4S // ...................*....................................................................................... + sqrdmulh v12.4S, v27.4S, v1.S[1] // ........................*.................................................................................. // gap // ........................................................................................................... - mls v12.4S, v22.4S, v8.S[0] // ................*.......................................................................................... - sub v22.4S, v28.4S, v7.4S // .............*............................................................................................. + sub v14.4S, v18.4S, v22.4S // ....................*...................................................................................... 
// gap // ........................................................................................................... - add v7.4S, v28.4S, v7.4S // ..............*............................................................................................ // gap // ........................................................................................................... + mul v27.4S, v27.4S, v1.S[0] // ..........................*................................................................................ + add v18.4S, v18.4S, v22.4S // .....................*..................................................................................... // gap // ........................................................................................................... - mul v11.4S, v4.4S, v1.S[2] // ...............*........................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v19.4S, v19.4S, v1.S[2] // ..................*........................................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v4.4S, v4.4S, v1.S[3] // ..................*........................................................................................ - sub v28.4S, v7.4S, v21.4S // ....................*...................................................................................... 
// gap // ........................................................................................................... - add v7.4S, v7.4S, v21.4S // .....................*..................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v21.4S, v22.4S, v0.S[2] // ......................*.................................................................................... + mls v19.4S, v13.4S, v8.S[0] // ......................*.................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v22.4S, v22.4S, v0.S[3] // .......................*................................................................................... + sqrdmulh v22.4S, v11.4S, v0.S[3] // .......................*................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v24.4S, v13.4S, v1.S[0] // ...............................*........................................................................... + mul v11.4S, v11.4S, v0.S[2] // ............................*.............................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v13.4S, v19.4S, v20.4S // .........................*................................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + add v20.4S, v19.4S, v20.4S // ...........................*............................................................................... + mls v27.4S, v12.4S, v8.S[0] // ..................................*........................................................................ // gap // ........................................................................................................... - sqrdmulh v13.4S, v13.4S, v1.S[1] // .....................................*..................................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... + mls v11.4S, v22.4S, v8.S[0] // ...............................*........................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v11.4S, v4.4S, v8.S[0] // ..........................*................................................................................ + sub v19.4S, v20.4S, v17.4S // .............................*............................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + add v20.4S, v20.4S, v17.4S // ..............................*............................................................................ + sqrdmulh v22.4S, v13.4S, v0.S[3] // ................................*.......................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v17.4S, v19.4S, v8.S[0] // ............................*.............................................................................. 
// gap // ........................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .................................*......................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v17.4S, v11.4S, v27.4S // .....................................*..................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v21.4S, v22.4S, v8.S[0] // ...........................*............................................................................... + add v11.4S, v11.4S, v27.4S // .......................................*................................................................... + sqrdmulh v27.4S, v6.4S, v1.S[1] // ....................................*...................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - add v4.4S, v11.4S, v12.4S // ..............................*............................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... 
- sub v22.4S, v11.4S, v12.4S // .............................*............................................................................. - mul v19.4S, v28.4S, v0.S[0] // .......................................*................................................................... + mls v13.4S, v22.4S, v8.S[0] // ...................................*....................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v24.4S, v13.4S, v8.S[0] // .........................................*................................................................. - sub v12.4S, v4.4S, v16.4S // .................................*......................................................................... // gap // ........................................................................................................... - add v16.4S, v4.4S, v16.4S // ..................................*........................................................................ + mul v22.4S, v6.4S, v1.S[0] // ......................................*.................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v4.4S, v22.4S, v0.S[2] // ................................*.......................................................................... 
- sub v11.4S, v21.4S, v17.4S // ....................................*...................................................................... // gap // ........................................................................................................... - add v21.4S, v21.4S, v17.4S // ......................................*.................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v22.4S, v22.4S, v0.S[3] // ...................................*....................................................................... + sqrdmulh v6.4S, v14.4S, v0.S[1] // ........................................*.................................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v13.4S, v28.4S, v0.S[1] // ..........................................*................................................................ + mls v22.4S, v27.4S, v8.S[0] // .........................................*................................................................. // gap // ........................................................................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v17.4S, v12.4S, v0.S[0] // ...........................................*............................................................... + mul v27.4S, v14.4S, v0.S[0] // ..........................................*................................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v4.4S, v22.4S, v8.S[0] // ........................................*.................................................................. + mls v27.4S, v6.4S, v8.S[0] // ...........................................*............................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + sub v6.4S, v13.4S, v22.4S // ............................................*.............................................................. 
// gap // ........................................................................................................... // gap // ........................................................................................................... + add v22.4S, v13.4S, v22.4S // .............................................*............................................................. + sqrdmulh v13.4S, v18.4S, v26.4S // ..............................................*............................................................ // gap // ........................................................................................................... - mul v22.4S, v7.4S, v25.4S // ..............................................*............................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v18.4S, v18.4S, v25.4S // ...............................................*........................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v7.4S, v7.4S, v26.4S // ...............................................*........................................................... + cmge v12.4S, v31.4S, v27.4S // ................................................*.......................................................... // gap // ........................................................................................................... 
// gap // ........................................................................................................... - sub v28.4S, v4.4S, v24.4S // ............................................*.............................................................. + cmge v14.4S, v27.4S, v30.4S // .................................................*......................................................... + sqrdmulh v16.4S, v19.4S, v0.S[1] // ..................................................*........................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v19.4S, v13.4S, v8.S[0] // ................................................*.......................................................... - add v4.4S, v4.4S, v24.4S // .............................................*............................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v19.4S, v19.4S, v0.S[0] // ...................................................*....................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v12.4S, v12.4S, v0.S[1] // .................................................*......................................................... + sub v12.4S, v12.4S, v14.4S // ....................................................*...................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... + sqrdmulh v14.4S, v17.4S, v0.S[1] // .....................................................*..................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v13.4S, v11.4S, v0.S[0] // ..................................................*........................................................ // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v24.4S, v31.4S, v19.4S // ...................................................*....................................................... + mls v19.4S, v16.4S, v8.S[0] // ......................................................*.................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v11.4S, v11.4S, v0.S[1] // ....................................................*...................................................... - cmge v14.4S, v19.4S, v30.4S // .....................................................*..................................................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // .......................................................*................................................... // gap // ........................................................................................................... - mls v17.4S, v12.4S, v8.S[0] // ......................................................*.................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v12.4S, v24.4S, v14.4S // .......................................................*................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mul v24.4S, v28.4S, v0.S[0] // ........................................................*.................................................. + mls v17.4S, v14.4S, v8.S[0] // ........................................................*.................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... 
+ cmge v14.4S, v31.4S, v19.4S // .........................................................*................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v16.4S, v19.4S, v30.4S // ..........................................................*................................................ + sqrdmulh v10.4S, v6.4S, v0.S[1] // ...........................................................*............................................... // gap // ........................................................................................................... - mls v13.4S, v11.4S, v8.S[0] // .........................................................*................................................. // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v11.4S, v31.4S, v17.4S // ..........................................................*................................................ // gap // ........................................................................................................... + mul v6.4S, v6.4S, v0.S[0] // ............................................................*.............................................. // gap // ........................................................................................................... - sqrdmulh v28.4S, v28.4S, v0.S[1] // ...........................................................*............................................... - cmge v14.4S, v17.4S, v30.4S // ............................................................*.............................................. 
// gap // ........................................................................................................... + sub v14.4S, v14.4S, v16.4S // .............................................................*............................................. // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v27.4S, v12.4S, v8.4S // ..............................................................*............................................ + cmge v12.4S, v31.4S, v17.4S // ...............................................................*........................................... // gap // ........................................................................................................... - mls v19.4S, v12.4S, v8.4S // .............................................................*............................................. + cmge v16.4S, v17.4S, v30.4S // ................................................................*.......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v12.4S, v11.4S, v14.4S // ..............................................................*............................................ + mls v6.4S, v10.4S, v8.S[0] // .................................................................*......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- mls v22.4S, v7.4S, v8.S[0] // ...............................................................*........................................... - cmge v7.4S, v31.4S, v13.4S // ................................................................*.......................................... // gap // ........................................................................................................... - cmge v11.4S, v13.4S, v30.4S // .................................................................*......................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v24.4S, v28.4S, v8.S[0] // ..................................................................*........................................ + mls v19.4S, v14.4S, v8.4S // ..................................................................*........................................ + sub v12.4S, v12.4S, v16.4S // ...................................................................*....................................... // gap // ........................................................................................................... + str q27, [x0, #512] // ....................................................................*...................................... // gap // ........................................................................................................... - str q19, [x0, #512] // ...................................................................*....................................... // gap // ........................................................................................................... + mls v18.4S, v13.4S, v8.S[0] // .....................................................................*..................................... 
// gap // ........................................................................................................... - mls v17.4S, v12.4S, v8.4S // ....................................................................*...................................... - sub v7.4S, v7.4S, v11.4S // .....................................................................*..................................... // gap // ........................................................................................................... - cmge v19.4S, v31.4S, v22.4S // ......................................................................*.................................... + cmge v13.4S, v31.4S, v6.4S // ......................................................................*.................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v12.4S, v22.4S, v30.4S // .......................................................................*................................... - mul v11.4S, v16.4S, v25.4S // ........................................................................*.................................. + cmge v27.4S, v6.4S, v30.4S // ........................................................................*.................................. + sqrdmulh v14.4S, v20.4S, v26.4S // ..........................................................................*................................ // gap // ........................................................................................................... - cmge v28.4S, v31.4S, v24.4S // .........................................................................*................................. + str q19, [x0, #640] // .........................................................................*................................. 
// gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v16.4S, v16.4S, v26.4S // ..............................................................................*............................ - cmge v14.4S, v24.4S, v30.4S // ...........................................................................*............................... + mul v20.4S, v20.4S, v25.4S // ............................................................................*.............................. // gap // ........................................................................................................... - str q17, [x0, #640] // ............................................................................*.............................. - sub v19.4S, v19.4S, v12.4S // .............................................................................*............................. // gap // ........................................................................................................... - mul v12.4S, v21.4S, v25.4S // ................................................................................*.......................... + sub v19.4S, v13.4S, v27.4S // ...........................................................................*............................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sub v17.4S, v28.4S, v14.4S // ...............................................................................*........................... + cmge v13.4S, v31.4S, v18.4S // .............................................................................*............................. 
+ sqrdmulh v27.4S, v11.4S, v26.4S // ................................................................................*.......................... // gap // ........................................................................................................... + cmge v16.4S, v18.4S, v30.4S // ...............................................................................*........................... // gap // ........................................................................................................... - sqrdmulh v21.4S, v21.4S, v26.4S // ..................................................................................*........................ // gap // ........................................................................................................... + mls v20.4S, v14.4S, v8.S[0] // .................................................................................*......................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v11.4S, v16.4S, v8.S[0] // ...................................................................................*....................... // gap // ........................................................................................................... + mul v11.4S, v11.4S, v25.4S // ...................................................................................*....................... + sub v13.4S, v13.4S, v16.4S // ..................................................................................*........................ 
// gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - sqrdmulh v16.4S, v4.4S, v26.4S // ....................................................................................*...................... + sqrdmulh v14.4S, v22.4S, v26.4S // ....................................................................................*...................... // gap // ........................................................................................................... // gap // ........................................................................................................... + cmge v16.4S, v31.4S, v20.4S // .....................................................................................*..................... // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v11.4S, v27.4S, v8.S[0] // ......................................................................................*.................... + cmge v27.4S, v20.4S, v30.4S // .......................................................................................*................... // gap // ........................................................................................................... - mls v12.4S, v21.4S, v8.S[0] // .....................................................................................*..................... 
// gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v21.4S, v31.4S, v11.4S // ......................................................................................*.................... // gap // ........................................................................................................... + mul v22.4S, v22.4S, v25.4S // ........................................................................................*.................. // gap // ........................................................................................................... - mul v4.4S, v4.4S, v25.4S // .......................................................................................*................... - cmge v28.4S, v11.4S, v30.4S // ........................................................................................*.................. // gap // ........................................................................................................... + sub v27.4S, v16.4S, v27.4S // .........................................................................................*................. // gap // ........................................................................................................... // gap // ........................................................................................................... + mls v22.4S, v14.4S, v8.S[0] // ..........................................................................................*................ // gap // ........................................................................................................... - mls v4.4S, v16.4S, v8.S[0] // .........................................................................................*................. 
// gap // ........................................................................................................... + cmge v14.4S, v31.4S, v11.4S // ...........................................................................................*............... // gap // ........................................................................................................... - sub v16.4S, v21.4S, v28.4S // ..........................................................................................*................ // gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.4S // .......................................................................*................................... + cmge v12.4S, v11.4S, v30.4S // .............................................................................................*............. // gap // ........................................................................................................... - mls v13.4S, v7.4S, v8.4S // ..........................................................................*................................ - cmge v7.4S, v31.4S, v12.4S // ............................................................................................*.............. // gap // ........................................................................................................... - cmge v21.4S, v12.4S, v30.4S // .............................................................................................*............. // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v24.4S, v17.4S, v8.4S // ...........................................................................................*............... 
+ mls v6.4S, v19.4S, v8.4S // ............................................................................................*.............. // gap // ........................................................................................................... // gap // ........................................................................................................... - cmge v17.4S, v31.4S, v4.4S // ...............................................................................................*........... + cmge v19.4S, v31.4S, v22.4S // ...............................................................................................*........... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v22.4S, v19.4S, v8.4S // ..............................................................................................*............ - cmge v19.4S, v4.4S, v30.4S // .................................................................................................*......... + mls v18.4S, v13.4S, v8.4S // ..............................................................................................*............ + cmge v13.4S, v22.4S, v30.4S // .................................................................................................*......... // gap // ........................................................................................................... - str q13, [x0, #768] // .................................................................................*......................... - sub v7.4S, v7.4S, v21.4S // ...................................................................................................*....... + sub v12.4S, v14.4S, v12.4S // ..................................................................................................*........ 
+ str q17, [x0, #768] // ..............................................................................*............................ // gap // ........................................................................................................... - mls v11.4S, v16.4S, v8.4S // ................................................................................................*.......... + mls v20.4S, v27.4S, v8.4S // ................................................................................................*.......... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q24, [x0, #896] // ..................................................................................................*........ - sub v16.4S, v17.4S, v19.4S // .....................................................................................................*..... + str q6, [x0, #896] // ...................................................................................................*....... + sub v19.4S, v19.4S, v13.4S // .....................................................................................................*..... // gap // ........................................................................................................... - mls v12.4S, v7.4S, v8.4S // ......................................................................................................*.... + mls v11.4S, v12.4S, v8.4S // ......................................................................................................*.... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- str q22, [x0], #(16) // ....................................................................................................*...... + str q18, [x0], #(16) // ....................................................................................................*...... // gap // ........................................................................................................... // gap // ........................................................................................................... - mls v4.4S, v16.4S, v8.4S // ........................................................................................................*.. + mls v22.4S, v19.4S, v8.4S // ........................................................................................................*.. // gap // ........................................................................................................... // gap // ........................................................................................................... - str q11, [x0, #112] // .......................................................................................................*... + str q20, [x0, #112] // .......................................................................................................*... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... 
- str q12, [x0, #240] // .........................................................................................................*. + str q11, [x0, #240] // .........................................................................................................*. // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... // gap // ........................................................................................................... - str q4, [x0, #368] // ..........................................................................................................* + str q22, [x0, #368] // ..........................................................................................................* // gap // ........................................................................................................... // gap // ........................................................................................................... - // original source code - // ldr q16, [x0, #0] // ..*........................................................................................................ - // ldr q4, [x0, #128] // ....*...................................................................................................... - // sub v7.4S, v12.4S, v11.4S // *.......................................................................................................... - // mul v19.4S, v29.4S, v2.S[2] // .*......................................................................................................... 
- // add v22.4S, v12.4S, v11.4S // ...*....................................................................................................... - // add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... - // mul v17.4S, v7.4S, v2.S[0] // .......*................................................................................................... - // sub v29.4S, v16.4S, v4.4S // ...............*........................................................................................... - // add v16.4S, v16.4S, v4.4S // ..............*............................................................................................ - // sqrdmulh v4.4S, v7.4S, v2.S[1] // ..........*................................................................................................ - // sub v7.4S, v21.4S, v18.4S // ........*.................................................................................................. - // mls v19.4S, v20.4S, v8.S[0] // ......*.................................................................................................... - // add v21.4S, v21.4S, v18.4S // .........*................................................................................................. - // sub v20.4S, v16.4S, v22.4S // ..................*........................................................................................ - // add v16.4S, v16.4S, v22.4S // ...................*....................................................................................... - // mul v22.4S, v29.4S, v1.S[2] // ....................*...................................................................................... - // mls v17.4S, v4.4S, v8.S[0] // .................*......................................................................................... 
- // sub v4.4S, v19.4S, v5.4S // ...........*............................................................................................... - // sqrdmulh v13.4S, v29.4S, v1.S[3] // .....................*..................................................................................... - // add v19.4S, v19.4S, v5.4S // ............*.............................................................................................. - // sub v14.4S, v16.4S, v21.4S // ......................*.................................................................................... - // add v16.4S, v16.4S, v21.4S // .......................*................................................................................... - // mul v21.4S, v20.4S, v0.S[2] // ........................*.................................................................................. - // sqrdmulh v20.4S, v20.4S, v0.S[3] // .........................*................................................................................. - // mul v5.4S, v7.4S, v1.S[0] // .............*............................................................................................. - // sqrdmulh v7.4S, v7.4S, v1.S[1] // ................*.......................................................................................... - // mls v22.4S, v13.4S, v8.S[0] // ............................*.............................................................................. - // mls v21.4S, v20.4S, v8.S[0] // ..............................*............................................................................ - // mls v5.4S, v7.4S, v8.S[0] // .............................*............................................................................. - // sub v7.4S, v22.4S, v17.4S // ................................*.......................................................................... 
- // add v22.4S, v22.4S, v17.4S // ...............................*........................................................................... - // mul v13.4S, v4.4S, v1.S[0] // ..........................*................................................................................ - // mul v17.4S, v7.4S, v0.S[2] // .....................................*..................................................................... - // sub v20.4S, v22.4S, v19.4S // ...................................*....................................................................... - // add v22.4S, v22.4S, v19.4S // ....................................*...................................................................... - // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. - // sub v19.4S, v21.4S, v5.4S // ......................................*.................................................................... - // sqrdmulh v4.4S, v4.4S, v1.S[1] // ...........................*............................................................................... - // add v21.4S, v21.4S, v5.4S // .......................................*................................................................... - // mul v5.4S, v14.4S, v0.S[0] // .................................*......................................................................... - // mls v17.4S, v7.4S, v8.S[0] // ...........................................*............................................................... - // mls v13.4S, v4.4S, v8.S[0] // ..................................*........................................................................ - // sqrdmulh v4.4S, v14.4S, v0.S[1] // .........................................*................................................................. 
- // mul v7.4S, v20.4S, v0.S[0] // ..........................................*................................................................ - // sub v14.4S, v17.4S, v13.4S // ..............................................*............................................................ - // add v13.4S, v17.4S, v13.4S // ................................................*.......................................................... - // mul v17.4S, v16.4S, v25.4S // ............................................*.............................................................. - // sqrdmulh v16.4S, v16.4S, v26.4S // .............................................*............................................................. - // mls v5.4S, v4.4S, v8.S[0] // ...............................................*........................................................... - // sqrdmulh v4.4S, v20.4S, v0.S[1] // .................................................*......................................................... - // mul v20.4S, v19.4S, v0.S[0] // ..................................................*........................................................ - // cmge v10.4S, v31.4S, v5.4S // ...................................................*....................................................... - // sqrdmulh v19.4S, v19.4S, v0.S[1] // ....................................................*...................................................... - // cmge v9.4S, v5.4S, v30.4S // .....................................................*..................................................... - // mls v7.4S, v4.4S, v8.S[0] // ......................................................*.................................................... - // sub v4.4S, v10.4S, v9.4S // .......................................................*................................................... 
- // mul v10.4S, v14.4S, v0.S[0] // ........................................................*.................................................. - // mls v20.4S, v19.4S, v8.S[0] // .........................................................*................................................. - // cmge v19.4S, v31.4S, v7.4S // ..........................................................*................................................ - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ...........................................................*............................................... - // cmge v9.4S, v7.4S, v30.4S // ............................................................*.............................................. - // mls v5.4S, v4.4S, v8.4S // .............................................................*............................................. - // sub v4.4S, v19.4S, v9.4S // ..............................................................*............................................ - // mls v17.4S, v16.4S, v8.S[0] // ...............................................................*........................................... - // cmge v16.4S, v31.4S, v20.4S // ................................................................*.......................................... - // cmge v19.4S, v20.4S, v30.4S // .................................................................*......................................... - // mls v10.4S, v14.4S, v8.S[0] // ..................................................................*........................................ - // str q5, [x0, #512] // ...................................................................*....................................... - // mls v7.4S, v4.4S, v8.4S // ....................................................................*...................................... - // sub v16.4S, v16.4S, v19.4S // .....................................................................*..................................... 
- // cmge v4.4S, v31.4S, v17.4S // ......................................................................*.................................... - // cmge v19.4S, v17.4S, v30.4S // .......................................................................*................................... - // mul v14.4S, v22.4S, v25.4S // ........................................................................*.................................. - // cmge v5.4S, v31.4S, v10.4S // .........................................................................*................................. - // mls v20.4S, v16.4S, v8.4S // .........................................................................................*................. - // cmge v16.4S, v10.4S, v30.4S // ...........................................................................*............................... - // str q7, [x0, #640] // ............................................................................*.............................. - // sub v4.4S, v4.4S, v19.4S // .............................................................................*............................. - // sqrdmulh v7.4S, v22.4S, v26.4S // ..........................................................................*................................ - // sub v16.4S, v5.4S, v16.4S // ...............................................................................*........................... - // mul v22.4S, v21.4S, v25.4S // ..............................................................................*............................ - // str q20, [x0, #768] // ................................................................................................*.......... - // sqrdmulh v19.4S, v21.4S, v26.4S // ................................................................................*.......................... - // mls v14.4S, v7.4S, v8.S[0] // .................................................................................*......................... 
- // sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................*........................ - // mls v22.4S, v19.4S, v8.S[0] // ...................................................................................*....................... - // cmge v19.4S, v31.4S, v14.4S // ....................................................................................*...................... - // mul v21.4S, v13.4S, v25.4S // .....................................................................................*..................... - // cmge v13.4S, v14.4S, v30.4S // ......................................................................................*.................... - // mls v21.4S, v7.4S, v8.S[0] // .......................................................................................*................... - // sub v7.4S, v19.4S, v13.4S // ........................................................................................*.................. - // mls v10.4S, v16.4S, v8.4S // ............................................................................................*.............. - // cmge v16.4S, v31.4S, v22.4S // ..........................................................................................*................ - // cmge v19.4S, v22.4S, v30.4S // ...........................................................................................*............... - // mls v17.4S, v4.4S, v8.4S // ..............................................................................................*............ - // cmge v4.4S, v31.4S, v21.4S // .............................................................................................*............. - // mls v14.4S, v7.4S, v8.4S // ..................................................................................................*........ 
- // cmge v7.4S, v21.4S, v30.4S // ...............................................................................................*........... - // str q10, [x0, #896] // ...................................................................................................*....... - // sub v16.4S, v16.4S, v19.4S // .................................................................................................*......... - // str q17, [x0], #(16) // ......................................................................................................*.... - // sub v4.4S, v4.4S, v7.4S // ....................................................................................................*...... - // mls v22.4S, v16.4S, v8.4S // .....................................................................................................*..... - // str q14, [x0, #112] // ........................................................................................................*.. - // mls v21.4S, v4.4S, v8.4S // .......................................................................................................*... - // str q22, [x0, #240] // .........................................................................................................*. - // str q21, [x0, #368] // ..........................................................................................................* + // ---------------------------------------------- new position ----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + // ldr q20, [x0, #0] // ..*........................................................................................................ + // ldr q18, [x0, #128] // .....*..................................................................................................... 
+ // sub v19.4S, v6.4S, v27.4S // *.......................................................................................................... + // sqrdmulh v22.4S, v11.4S, v2.S[3] // ...*....................................................................................................... + // add v11.4S, v6.4S, v27.4S // .*......................................................................................................... + // add v13.4S, v12.4S, v14.4S // ....*...................................................................................................... + // sqrdmulh v17.4S, v19.4S, v2.S[1] // ......*.................................................................................................... + // sub v10.4S, v20.4S, v18.4S // ............*.............................................................................................. + // add v20.4S, v20.4S, v18.4S // ..........*................................................................................................ + // mul v18.4S, v19.4S, v2.S[0] // ........*.................................................................................................. + // sub v19.4S, v13.4S, v5.4S // .......*................................................................................................... + // mls v23.4S, v22.4S, v8.S[0] // ...........*............................................................................................... + // add v22.4S, v13.4S, v5.4S // .........*................................................................................................. + // sub v13.4S, v20.4S, v11.4S // ..............*............................................................................................ + // add v20.4S, v20.4S, v11.4S // ...............*........................................................................................... 
+ // sqrdmulh v11.4S, v10.4S, v1.S[3] // ................*.......................................................................................... + // mls v18.4S, v17.4S, v8.S[0] // .............*............................................................................................. + // sub v17.4S, v23.4S, v9.4S // .................*......................................................................................... + // mul v16.4S, v10.4S, v1.S[2] // .......................*................................................................................... + // add v10.4S, v23.4S, v9.4S // ..................*........................................................................................ + // sub v23.4S, v20.4S, v22.4S // ....................*...................................................................................... + // add v20.4S, v20.4S, v22.4S // ......................*.................................................................................... + // mls v16.4S, v11.4S, v8.S[0] // ........................*.................................................................................. + // sqrdmulh v22.4S, v13.4S, v0.S[3] // .........................*................................................................................. + // sqrdmulh v9.4S, v19.4S, v1.S[1] // ...................*....................................................................................... + // sub v7.4S, v16.4S, v18.4S // ...........................*............................................................................... + // mul v19.4S, v19.4S, v1.S[0] // .....................*..................................................................................... + // add v18.4S, v16.4S, v18.4S // ............................*.............................................................................. 
+ // mul v13.4S, v13.4S, v0.S[2] // ..........................*................................................................................ + // sub v16.4S, v18.4S, v10.4S // ...............................*........................................................................... + // add v18.4S, v18.4S, v10.4S // ................................*.......................................................................... + // mls v13.4S, v22.4S, v8.S[0] // ..............................*............................................................................ + // sqrdmulh v22.4S, v7.4S, v0.S[3] // .................................*......................................................................... + // mul v10.4S, v7.4S, v0.S[2] // ..................................*........................................................................ + // mls v19.4S, v9.4S, v8.S[0] // .............................*............................................................................. + // mls v10.4S, v22.4S, v8.S[0] // ......................................*.................................................................... + // sqrdmulh v22.4S, v17.4S, v1.S[1] // .....................................*..................................................................... + // sub v9.4S, v13.4S, v19.4S // ...................................*....................................................................... + // mul v17.4S, v17.4S, v1.S[0] // .......................................*................................................................... + // add v19.4S, v13.4S, v19.4S // ....................................*...................................................................... + // sqrdmulh v13.4S, v23.4S, v0.S[1] // ........................................*.................................................................. 
+ // mls v17.4S, v22.4S, v8.S[0] // .........................................*................................................................. + // mul v22.4S, v23.4S, v0.S[0] // ..........................................*................................................................ + // mls v22.4S, v13.4S, v8.S[0] // ...........................................*............................................................... + // sub v13.4S, v10.4S, v17.4S // ............................................*.............................................................. + // add v17.4S, v10.4S, v17.4S // .............................................*............................................................. + // sqrdmulh v10.4S, v20.4S, v26.4S // ..............................................*............................................................ + // mul v20.4S, v20.4S, v25.4S // ...............................................*........................................................... + // cmge v23.4S, v31.4S, v22.4S // ................................................*.......................................................... + // cmge v7.4S, v22.4S, v30.4S // .................................................*......................................................... + // sqrdmulh v24.4S, v16.4S, v0.S[1] // ..................................................*........................................................ + // mul v16.4S, v16.4S, v0.S[0] // ...................................................*....................................................... + // sub v23.4S, v23.4S, v7.4S // ....................................................*...................................................... + // sqrdmulh v7.4S, v9.4S, v0.S[1] // .....................................................*..................................................... 
+ // mls v16.4S, v24.4S, v8.S[0] // ......................................................*.................................................... + // mul v9.4S, v9.4S, v0.S[0] // .......................................................*................................................... + // mls v9.4S, v7.4S, v8.S[0] // ........................................................*.................................................. + // cmge v7.4S, v31.4S, v16.4S // .........................................................*................................................. + // cmge v24.4S, v16.4S, v30.4S // ..........................................................*................................................ + // sqrdmulh v4.4S, v13.4S, v0.S[1] // ...........................................................*............................................... + // mul v13.4S, v13.4S, v0.S[0] // ............................................................*.............................................. + // sub v7.4S, v7.4S, v24.4S // .............................................................*............................................. + // mls v22.4S, v23.4S, v8.4S // ..............................................................*............................................ + // cmge v23.4S, v31.4S, v9.4S // ...............................................................*........................................... + // cmge v24.4S, v9.4S, v30.4S // ................................................................*.......................................... + // mls v13.4S, v4.4S, v8.S[0] // .................................................................*......................................... + // mls v16.4S, v7.4S, v8.4S // ..................................................................*........................................ 
+ // sub v23.4S, v23.4S, v24.4S // ...................................................................*....................................... + // str q22, [x0, #512] // ....................................................................*...................................... + // mls v20.4S, v10.4S, v8.S[0] // .....................................................................*..................................... + // cmge v22.4S, v31.4S, v13.4S // ......................................................................*.................................... + // mls v9.4S, v23.4S, v8.4S // ..........................................................................................*................ + // cmge v10.4S, v13.4S, v30.4S // .......................................................................*................................... + // str q16, [x0, #640] // .........................................................................*................................. + // sqrdmulh v16.4S, v18.4S, v26.4S // ........................................................................*.................................. + // sub v22.4S, v22.4S, v10.4S // ...........................................................................*............................... + // mul v18.4S, v18.4S, v25.4S // ..........................................................................*................................ + // cmge v10.4S, v31.4S, v20.4S // ............................................................................*.............................. + // str q9, [x0, #768] // .................................................................................................*......... + // cmge v23.4S, v20.4S, v30.4S // ..............................................................................*............................ + // sqrdmulh v9.4S, v19.4S, v26.4S // .............................................................................*............................. 
+ // mls v18.4S, v16.4S, v8.S[0] // ...............................................................................*........................... + // sub v16.4S, v10.4S, v23.4S // .................................................................................*......................... + // mul v19.4S, v19.4S, v25.4S // ................................................................................*.......................... + // sqrdmulh v10.4S, v17.4S, v26.4S // ..................................................................................*........................ + // cmge v23.4S, v31.4S, v18.4S // ...................................................................................*....................... + // mls v19.4S, v9.4S, v8.S[0] // ....................................................................................*...................... + // cmge v9.4S, v18.4S, v30.4S // .....................................................................................*..................... + // mul v17.4S, v17.4S, v25.4S // ......................................................................................*.................... + // sub v23.4S, v23.4S, v9.4S // .......................................................................................*................... + // mls v17.4S, v10.4S, v8.S[0] // ........................................................................................*.................. + // cmge v10.4S, v31.4S, v19.4S // .........................................................................................*................. + // mls v13.4S, v22.4S, v8.4S // ............................................................................................*.............. + // cmge v22.4S, v19.4S, v30.4S // ...........................................................................................*............... 
+ // mls v20.4S, v16.4S, v8.4S // ..............................................................................................*............ + // cmge v16.4S, v31.4S, v17.4S // .............................................................................................*............. + // mls v18.4S, v23.4S, v8.4S // ..................................................................................................*........ + // cmge v23.4S, v17.4S, v30.4S // ...............................................................................................*........... + // sub v22.4S, v10.4S, v22.4S // ................................................................................................*.......... + // str q13, [x0, #896] // ...................................................................................................*....... + // str q20, [x0], #(16) // ......................................................................................................*.... + // sub v20.4S, v16.4S, v23.4S // ....................................................................................................*...... + // mls v19.4S, v22.4S, v8.4S // .....................................................................................................*..... + // str q18, [x0, #112] // ........................................................................................................*.. + // mls v17.4S, v20.4S, v8.4S // .......................................................................................................*... + // str q19, [x0, #240] // .........................................................................................................*. 
+ // str q17, [x0, #368] // ..........................................................................................................* pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s index 4d589126..7ce8e4c1 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -67,7 +53,7 @@ xtmp1 .req x11 cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s - vmls \a, \tmp2, modulus + vmls \a, \tmp2, consts .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -76,12 +62,6 @@ xtmp1 .req x11 mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.4s, \a\().4s, \b\().4s add \a\().4s, \a\().4s, \b\().4s @@ 
-96,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -143,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 
3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -193,7 +173,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -204,7 +184,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -214,7 +194,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -222,7 +202,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -233,24 +213,30 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc 
// slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -371,8 +357,6 @@ _intt_dilithium_123_45678_opt_m1_firestorm: consts .req v8 qform_consts .req q8 - modulus .req v29 - ASM_LOAD(r_ptr0, roots_l345) ASM_LOAD(r_ptr1, roots_l67) @@ -395,577 +379,609 @@ _intt_dilithium_123_45678_opt_m1_firestorm: qform_root3_tw .req q7 .p2align 2 - ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // ..*........................................................................................................................................... - ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2] // ...........*.................................................................................................................................. - ldr q29, [x5, #32] // .*............................................................................................................................................ - ldr q5, [x5, #80] // ...*.......................................................................................................................................... - ldr q0, [x5], #(12*16) // *............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - ldr q2, [x5, #-128] // ....*......................................................................................................................................... - ldr q15, [x5, #-48] // ......*....................................................................................................................................... - ldr q13, [x4], #64 // .....*........................................................................................................................................ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - ldr q18, [x5, #-16] // .......*...................................................................................................................................... 
- ldr q27, [x5, #-64] // ........*..................................................................................................................................... - ldr q17, [x5, #-176] // .........*.................................................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - ldr q31, [x5, #-144] // ..........*................................................................................................................................... - ldr q25, [x5, #-32] // ....................*......................................................................................................................... - ldr q28, [x5, #-96] // ......................*....................................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - ldr q3, [x5, #-80] // ..............................*............................................................................................................... - ldr q1, [x4, #-16] // ...................................................*.......................................................................................... - ldr q4, [x4, #-48] // ...................................*.......................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - ldr q16, [x4, #-32] // ...................................................................*.......................................................................... 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v6.4S, v19.4S, v20.4S // ..............*............................................................................................................................... - add v19.4S, v19.4S, v20.4S // ............*................................................................................................................................. - sub v20.4S, v21.4S, v22.4S // .............*................................................................................................................................ - add v22.4S, v21.4S, v22.4S // ...............*.............................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v23.4S, v9.4S, v10.4S // ........................*..................................................................................................................... - add v9.4S, v9.4S, v10.4S // .......................*...................................................................................................................... - sub v21.4S, v11.4S, v12.4S // .........................*.................................................................................................................... - add v12.4S, v11.4S, v12.4S // ..........................*................................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mul v29.4S, v6.4S, v29.4S // ................*............................................................................................................................. 
- sqrdmulh v5.4S, v20.4S, v5.4S // ..................*........................................................................................................................... - mul v2.4S, v20.4S, v2.4S // .................*............................................................................................................................ - sqrdmulh v31.4S, v6.4S, v31.4S // ...................*.......................................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sqrdmulh v15.4S, v23.4S, v15.4S // ...............................*.............................................................................................................. - sqrdmulh v18.4S, v21.4S, v18.4S // .................................*............................................................................................................ - mul v27.4S, v23.4S, v27.4S // ..................................*........................................................................................................... - mul v25.4S, v21.4S, v25.4S // ................................*............................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v20.4S, v19.4S, v22.4S // .....................*........................................................................................................................ - add v19.4S, v19.4S, v22.4S // ....................................*......................................................................................................... - sub v22.4S, v9.4S, v12.4S // ......................................*....................................................................................................... - add v9.4S, v9.4S, v12.4S // .................................................*............................................................................................ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- mls v29.4S, v31.4S, v8.S[0] // ...........................*.................................................................................................................. - mls v2.4S, v5.4S, v8.S[0] // ............................*................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v27.4S, v15.4S, v8.S[0] // ..........................................*................................................................................................... - mls v25.4S, v18.4S, v8.S[0] // ...........................................*.................................................................................................. - mul v5.4S, v20.4S, v0.4S // .............................*................................................................................................................ - sqrdmulh v12.4S, v20.4S, v17.4S // .....................................*........................................................................................................ 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sqrdmulh v15.4S, v22.4S, v3.4S // .........................................*.................................................................................................... - mul v18.4S, v22.4S, v28.4S // ............................................*................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- sub v31.4S, v29.4S, v2.4S // ........................................*..................................................................................................... - add v29.4S, v29.4S, v2.4S // .......................................*...................................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v2.4S, v27.4S, v25.4S // ....................................................*......................................................................................... - add v27.4S, v27.4S, v25.4S // ......................................................*....................................................................................... - mls v5.4S, v12.4S, v8.S[0] // ..................................................*........................................................................................... - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mul v0.4S, v31.4S, v0.4S // ...............................................*.............................................................................................. - sqrdmulh v12.4S, v31.4S, v17.4S // ................................................*............................................................................................. - trn1 v17.4S, v19.4S, v29.4S // .............................................*................................................................................................ - trn2 v29.4S, v19.4S, v29.4S // ..............................................*............................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- mul v19.4S, v2.4S, v28.4S // ..........................................................*................................................................................... - sqrdmulh v2.4S, v2.4S, v3.4S // ...........................................................*.................................................................................. - mls v18.4S, v15.4S, v8.S[0] // .....................................................*........................................................................................ - trn1 v15.4S, v9.4S, v27.4S // ........................................................*..................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - trn2 v9.4S, v9.4S, v27.4S // .........................................................*.................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v0.4S, v12.4S, v8.S[0] // .......................................................*...................................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- mls v19.4S, v2.4S, v8.S[0] // ..............................................................*............................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - trn1 v2.4S, v5.4S, v0.4S // ............................................................*................................................................................. - trn2 v5.4S, v5.4S, v0.4S // .............................................................*................................................................................ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- trn1 v0.4S, v18.4S, v19.4S // .....................................................................*........................................................................ - trn2 v19.4S, v18.4S, v19.4S // ......................................................................*....................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - trn1 v12.2D, v17.2D, v2.2D // ...............................................................*.............................................................................. - trn2 v2.2D, v17.2D, v2.2D // .................................................................*............................................................................ - trn1 v18.2D, v29.2D, v5.2D // ................................................................*............................................................................. - trn2 v29.2D, v29.2D, v5.2D // ..................................................................*........................................................................... 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - trn2 v5.2D, v15.2D, v0.2D // ........................................................................*..................................................................... - trn1 v0.2D, v15.2D, v0.2D // .........................................................................*.................................................................... - trn2 v15.2D, v9.2D, v19.2D // ...........................................................................*.................................................................. - trn1 v19.2D, v9.2D, v19.2D // ..........................................................................*................................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- sub v9.4S, v12.4S, v18.4S // ....................................................................*......................................................................... - add v12.4S, v12.4S, v18.4S // .............................................................................*................................................................ - sub v18.4S, v2.4S, v29.4S // .......................................................................*...................................................................... - add v29.4S, v2.4S, v29.4S // ...............................................................................*.............................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v2.4S, v5.4S, v15.4S // .................................................................................*............................................................ - add v5.4S, v5.4S, v15.4S // ....................................................................................*......................................................... - sub v15.4S, v0.4S, v19.4S // ................................................................................*............................................................. 
- add v19.4S, v0.4S, v19.4S // ..................................................................................*........................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sqrdmulh v0.4S, v9.4S, v4.S[3] // ............................................................................*................................................................. - mul v9.4S, v9.4S, v4.S[2] // ...................................................................................*.......................................................... - mul v27.4S, v18.4S, v16.S[0] // ..............................................................................*............................................................... - sqrdmulh v18.4S, v18.4S, v16.S[1] // .....................................................................................*........................................................ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - mul v17.4S, v2.4S, v1.S[0] // ........................................................................................*..................................................... - sqrdmulh v2.4S, v2.4S, v1.S[1] // .........................................................................................*.................................................... - sqrdmulh v31.4S, v15.4S, v16.S[3] // ......................................................................................*....................................................... - mul v15.4S, v15.4S, v16.S[2] // .......................................................................................*...................................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v25.4S, v12.4S, v29.4S // ...............................................................................................*.............................................. - add v29.4S, v12.4S, v29.4S // ..........................................................................................*................................................... 
- sub v12.4S, v19.4S, v5.4S // ............................................................................................*................................................. - add v19.4S, v19.4S, v5.4S // ...........................................................................................*.................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v9.4S, v0.4S, v8.S[0] // .............................................................................................*................................................ - mls v27.4S, v18.4S, v8.S[0] // ..............................................................................................*............................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v17.4S, v2.4S, v8.S[0] // ...................................................................................................*.......................................... - mls v15.4S, v31.4S, v8.S[0] // ..................................................................................................*........................................... - srshr v5.4S, v29.4S, #23 // .................................................................................................*............................................ - srshr v0.4S, v19.4S, #23 // ................................................................................................*............................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mul v2.4S, v25.4S, v13.S[2] // .....................................................................................................*........................................ - sqrdmulh v18.4S, v25.4S, v13.S[3] // .............................................................................................................*................................ 
- sqrdmulh v31.4S, v12.4S, v4.S[1] // ....................................................................................................*......................................... - mul v12.4S, v12.4S, v4.S[0] // ........................................................................................................*..................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v25.4S, v9.4S, v27.4S // ......................................................................................................*....................................... - add v9.4S, v9.4S, v27.4S // .......................................................................................................*...................................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v29.4S, v5.4S, v8.4S // ..........................................................................................................*................................... - mls v19.4S, v0.4S, v8.4S // .........................................................................................................*.................................... - sub v5.4S, v15.4S, v17.4S // ............................................................................................................*................................. - add v0.4S, v15.4S, v17.4S // ...........................................................................................................*.................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v12.4S, v31.4S, v8.S[0] // .....................................................................................................................*........................ - mul v15.4S, v25.4S, v13.S[2] // ..............................................................................................................*............................... 
- sqrdmulh v27.4S, v25.4S, v13.S[3] // ...............................................................................................................*.............................. - srshr v17.4S, v9.4S, #23 // ................................................................................................................*............................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sqrdmulh v31.4S, v5.4S, v4.S[1] // .................................................................................................................*............................ - mul v5.4S, v5.4S, v4.S[0] // ..................................................................................................................*........................... - mls v2.4S, v18.4S, v8.S[0] // ......................................................................................................................*....................... - srshr v18.4S, v0.4S, #23 // ...................................................................................................................*.......................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - add v25.4S, v29.4S, v19.4S // ........................................................................................................................*..................... - sub v29.4S, v29.4S, v19.4S // ....................................................................................................................*......................... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v9.4S, v17.4S, v8.4S // .......................................................................................................................*...................... - mls v15.4S, v27.4S, v8.S[0] // .........................................................................................................................*.................... 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mls v0.4S, v18.4S, v8.4S // ..........................................................................................................................*................... - mls v5.4S, v31.4S, v8.S[0] // ............................................................................................................................*................. - str q25, [x1], #(16*4) // ..............................................................................................................................*............... - mul v19.4S, v29.4S, v13.S[0] // ...........................................................................................................................*.................. - sqrdmulh v29.4S, v29.4S, v13.S[1] // .............................................................................................................................*................ - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v18.4S, v2.4S, v12.4S // ...............................................................................................................................*.............. - add v26.4S, v2.4S, v12.4S // .......................................................................................................................................*...... - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - sub v2.4S, v9.4S, v0.4S // ................................................................................................................................*............. - add v24.4S, v9.4S, v0.4S // ..................................................................................................................................*........... - mls v19.4S, v29.4S, v8.S[0] // ...................................................................................................................................*.......... - sub v29.4S, v15.4S, v5.4S // .................................................................................................................................*............ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - add v5.4S, v15.4S, v5.4S // ......................................................................................................................................*....... - mul v11.4S, v18.4S, v13.S[0] // ....................................................................................................................................*......... - sqrdmulh v10.4S, v18.4S, v13.S[1] // .....................................................................................................................................*........ - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - mul v6.4S, v2.4S, v13.S[0] // ........................................................................................................................................*..... - sqrdmulh v23.4S, v2.4S, v13.S[1] // .........................................................................................................................................*.... 
- mul v12.4S, v29.4S, v13.S[0] // ..........................................................................................................................................*... - sqrdmulh v14.4S, v29.4S, v13.S[1] // ...........................................................................................................................................*.. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - str q19, [x2], #(16*4) // ............................................................................................................................................*. - str q5, [x1, #-16] // .............................................................................................................................................* - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. 
- // gap // .............................................................................................................................................. - // gap // .............................................................................................................................................. - - // original source code - // ldr q9, [x5], #(12*16) // ....*......................................................................................................................................... - // ldr q25, [x5, #-160] // ..*........................................................................................................................................... - // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // *............................................................................................................................................. - // ldr q26, [x5, #-112] // ...*.......................................................................................................................................... - // ldr q0, [x5, #-128] // .....*........................................................................................................................................ - // ldr q31, [x4], #64 // .......*...................................................................................................................................... - // ldr q23, [x5, #-48] // ......*....................................................................................................................................... - // ldr q7, [x5, #-16] // ........*..................................................................................................................................... - // ldr q2, [x5, #-64] // .........*.................................................................................................................................... 
- // ldr q6, [x5, #-176] // ..........*................................................................................................................................... - // ldr q1, [x5, #-144] // ...........*.................................................................................................................................. - // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .*............................................................................................................................................ - // add v4.4S, v18.4S, v19.4S // ...................*.......................................................................................................................... - // sub v24.4S, v20.4S, v21.4S // ....................*......................................................................................................................... - // sub v11.4S, v18.4S, v19.4S // ..................*........................................................................................................................... - // add v12.4S, v20.4S, v21.4S // .....................*........................................................................................................................ - // mul v27.4S, v11.4S, v25.4S // ..........................*................................................................................................................... - // mul v22.4S, v24.4S, v0.4S // ............................*................................................................................................................. - // sqrdmulh v5.4S, v24.4S, v26.4S // ...........................*.................................................................................................................. - // sqrdmulh v13.4S, v11.4S, v1.4S // .............................*................................................................................................................ 
- // ldr q26, [x5, #-32] // ............*................................................................................................................................. - // sub v0.4S, v4.4S, v12.4S // ..................................*........................................................................................................... - // ldr q25, [x5, #-96] // .............*................................................................................................................................ - // add v20.4S, v14.4S, v15.4S // .......................*...................................................................................................................... - // sub v3.4S, v14.4S, v15.4S // ......................*....................................................................................................................... - // sub v10.4S, v16.4S, v17.4S // ........................*..................................................................................................................... - // add v24.4S, v16.4S, v17.4S // .........................*.................................................................................................................... - // mls v27.4S, v13.4S, v8.S[0] // ......................................*....................................................................................................... - // mls v22.4S, v5.4S, v8.S[0] // .......................................*...................................................................................................... - // mul v21.4S, v0.4S, v9.4S // ..........................................*................................................................................................... - // ldr q16, [x5, #-80] // ..............*............................................................................................................................... 
- // sqrdmulh v30.4S, v3.4S, v23.4S // ..............................*............................................................................................................... - // mul v1.4S, v10.4S, v26.4S // .................................*............................................................................................................ - // sqrdmulh v23.4S, v10.4S, v7.4S // ...............................*.............................................................................................................. - // mul v26.4S, v3.4S, v2.4S // ................................*............................................................................................................. - // ldr q3, [x4, #-48] // ................*............................................................................................................................. - // add v2.4S, v4.4S, v12.4S // ...................................*.......................................................................................................... - // sqrdmulh v19.4S, v0.4S, v6.4S // ...........................................*.................................................................................................. - // sub v14.4S, v20.4S, v24.4S // ....................................*......................................................................................................... - // add v10.4S, v27.4S, v22.4S // ...............................................*.............................................................................................. - // sub v7.4S, v27.4S, v22.4S // ..............................................*............................................................................................... - // sqrdmulh v13.4S, v14.4S, v16.4S // ............................................*................................................................................................. 
- // mls v26.4S, v30.4S, v8.S[0] // ........................................*..................................................................................................... - // mls v1.4S, v23.4S, v8.S[0] // .........................................*.................................................................................................... - // mul v5.4S, v14.4S, v25.4S // .............................................*................................................................................................ - // trn1 v11.4S, v2.4S, v10.4S // .....................................................*........................................................................................ - // trn2 v23.4S, v2.4S, v10.4S // ......................................................*....................................................................................... - // mul v29.4S, v7.4S, v9.4S // ...................................................*.......................................................................................... - // sqrdmulh v10.4S, v7.4S, v6.4S // ....................................................*......................................................................................... - // add v30.4S, v20.4S, v24.4S // .....................................*........................................................................................................ - // mls v21.4S, v19.4S, v8.S[0] // ..................................................*........................................................................................... - // ldr q24, [x4, #-16] // ...............*.............................................................................................................................. - // sub v28.4S, v26.4S, v1.4S // ................................................*............................................................................................. 
- // mls v5.4S, v13.4S, v8.S[0] // .........................................................*.................................................................................... - // add v14.4S, v26.4S, v1.4S // .................................................*............................................................................................ - // mls v29.4S, v10.4S, v8.S[0] // ............................................................*................................................................................. - // trn1 v6.4S, v30.4S, v14.4S // ..........................................................*................................................................................... - // trn2 v30.4S, v30.4S, v14.4S // ...........................................................*.................................................................................. - // mul v14.4S, v28.4S, v25.4S // .......................................................*...................................................................................... - // sqrdmulh v1.4S, v28.4S, v16.4S // ........................................................*..................................................................................... - // trn1 v19.4S, v21.4S, v29.4S // ..............................................................*............................................................................... - // trn2 v26.4S, v21.4S, v29.4S // ...............................................................*.............................................................................. - // mls v14.4S, v1.4S, v8.S[0] // .............................................................*................................................................................ - // trn1 v16.2D, v11.2D, v19.2D // ..................................................................*........................................................................... 
- // trn1 v20.2D, v23.2D, v26.2D // ....................................................................*......................................................................... - // trn2 v4.2D, v11.2D, v19.2D // ...................................................................*.......................................................................... - // trn2 v0.2D, v23.2D, v26.2D // .....................................................................*........................................................................ - // ldr q29, [x4, #-32] // .................*............................................................................................................................ - // sub v21.4S, v16.4S, v20.4S // ..........................................................................*................................................................... - // trn1 v10.4S, v5.4S, v14.4S // ................................................................*............................................................................. - // trn2 v14.4S, v5.4S, v14.4S // .................................................................*............................................................................ - // sub v7.4S, v4.4S, v0.4S // ............................................................................*................................................................. - // trn2 v23.2D, v6.2D, v10.2D // ......................................................................*....................................................................... - // trn1 v17.2D, v6.2D, v10.2D // .......................................................................*...................................................................... - // trn1 v10.2D, v30.2D, v14.2D // .........................................................................*.................................................................... 
- // trn2 v11.2D, v30.2D, v14.2D // ........................................................................*..................................................................... - // sqrdmulh v1.4S, v21.4S, v3.S[3] // ..................................................................................*........................................................... - // add v30.4S, v16.4S, v20.4S // ...........................................................................*.................................................................. - // mul v20.4S, v7.4S, v29.S[0] // ....................................................................................*......................................................... - // add v16.4S, v4.4S, v0.4S // .............................................................................*................................................................ - // sub v26.4S, v17.4S, v10.4S // ................................................................................*............................................................. - // sub v19.4S, v23.4S, v11.4S // ..............................................................................*............................................................... - // add v28.4S, v17.4S, v10.4S // .................................................................................*............................................................ - // mul v4.4S, v21.4S, v3.S[2] // ...................................................................................*.......................................................... - // add v14.4S, v23.4S, v11.4S // ...............................................................................*.............................................................. - // sqrdmulh v6.4S, v7.4S, v29.S[1] // .....................................................................................*........................................................ 
- // sqrdmulh v21.4S, v26.4S, v29.S[3] // ........................................................................................*..................................................... - // mul v11.4S, v26.4S, v29.S[2] // .........................................................................................*.................................................... - // mul v26.4S, v19.4S, v24.S[0] // ......................................................................................*....................................................... - // sqrdmulh v24.4S, v19.4S, v24.S[1] // .......................................................................................*...................................................... - // add v7.4S, v30.4S, v16.4S // ...........................................................................................*.................................................. - // add v23.4S, v28.4S, v14.4S // .............................................................................................*................................................ - // sub v10.4S, v28.4S, v14.4S // ............................................................................................*................................................. - // mls v4.4S, v1.4S, v8.S[0] // ..............................................................................................*............................................... - // mls v20.4S, v6.4S, v8.S[0] // ...............................................................................................*.............................................. - // sub v19.4S, v30.4S, v16.4S // ..........................................................................................*................................................... - // srshr v14.4S, v23.4S, #23 // ...................................................................................................*.......................................... 
- // srshr v30.4S, v7.4S, #23 // ..................................................................................................*........................................... - // mls v11.4S, v21.4S, v8.S[0] // .................................................................................................*............................................ - // mls v26.4S, v24.4S, v8.S[0] // ................................................................................................*............................................. - // sqrdmulh v5.4S, v10.4S, v3.S[1] // ......................................................................................................*....................................... - // mul v16.4S, v19.4S, v31.S[2] // ....................................................................................................*......................................... - // sub v9.4S, v4.4S, v20.4S // ........................................................................................................*..................................... - // add v1.4S, v4.4S, v20.4S // .........................................................................................................*.................................... - // mul v20.4S, v10.4S, v3.S[0] // .......................................................................................................*...................................... - // mls v23.4S, v14.4S, v8.4S // ...........................................................................................................*.................................. - // mls v7.4S, v30.4S, v8.4S // ..........................................................................................................*................................... - // add v21.4S, v11.4S, v26.4S // .............................................................................................................*................................ 
- // sub v22.4S, v11.4S, v26.4S // ............................................................................................................*................................. - // sqrdmulh v14.4S, v19.4S, v31.S[3] // .....................................................................................................*........................................ - // mul v4.4S, v9.4S, v31.S[2] // ...............................................................................................................*.............................. - // sqrdmulh v26.4S, v9.4S, v31.S[3] // ................................................................................................................*............................. - // srshr v24.4S, v1.4S, #23 // .................................................................................................................*............................ - // sqrdmulh v30.4S, v22.4S, v3.S[1] // ..................................................................................................................*........................... - // mul v15.4S, v22.4S, v3.S[0] // ...................................................................................................................*.......................... - // srshr v9.4S, v21.4S, #23 // .....................................................................................................................*........................ - // sub v10.4S, v7.4S, v23.4S // .......................................................................................................................*...................... - // mls v20.4S, v5.4S, v8.S[0] // ..............................................................................................................*............................... - // mls v16.4S, v14.4S, v8.S[0] // ....................................................................................................................*......................... 
- // mls v1.4S, v24.4S, v8.4S // ........................................................................................................................*..................... - // add v14.4S, v7.4S, v23.4S // ......................................................................................................................*....................... - // mls v4.4S, v26.4S, v8.S[0] // .........................................................................................................................*.................... - // mls v21.4S, v9.4S, v8.4S // ..........................................................................................................................*................... - // mul v7.4S, v10.4S, v31.S[0] // .............................................................................................................................*................ - // mls v15.4S, v30.4S, v8.S[0] // ...........................................................................................................................*.................. - // sqrdmulh v26.4S, v10.4S, v31.S[1] // ..............................................................................................................................*............... - // str q14, [x1], #(16*4) // ............................................................................................................................*................. - // sub v10.4S, v16.4S, v20.4S // ...............................................................................................................................*.............. - // sub v30.4S, v1.4S, v21.4S // .................................................................................................................................*............ - // sub v14.4S, v4.4S, v15.4S // ....................................................................................................................................*......... 
- // add v24.4S, v1.4S, v21.4S // ..................................................................................................................................*........... - // mls v7.4S, v26.4S, v8.S[0] // ...................................................................................................................................*.......... - // mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................*....... - // sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................*...... - // add v2.4S, v4.4S, v15.4S // .....................................................................................................................................*........ - // add v26.4S, v16.4S, v20.4S // ................................................................................................................................*............. - // mul v6.4S, v30.4S, v31.S[0] // ........................................................................................................................................*..... - // sqrdmulh v23.4S, v30.4S, v31.S[1] // .........................................................................................................................................*.... - // mul v12.4S, v14.4S, v31.S[0] // ..........................................................................................................................................*... - // sqrdmulh v14.4S, v14.4S, v31.S[1] // ...........................................................................................................................................*.. - // str q7, [x2], #(16*4) // ............................................................................................................................................*. 
- // str q2, [x1, #-16] // .............................................................................................................................................* + // Instructions: 146 + // Expected cycles: 47 + // Expected IPC: 3.11 + // + // Wall time: 220.28s + // User time: 220.28s + // + // --------------------------------------------------------------- original position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------------------- + ldr q14, [x5, #128] // .........*........................................................................................................................................ + ldr q17, [x5, #48] // ....*............................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ld4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x2] // ...........*...................................................................................................................................... 
+ ld4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x1] // *................................................................................................................................................. + ldr q9, [x5, #80] // ......*........................................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q18, [x5, #176] // .............*.................................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q20, [x5, #160] // ............*..................................................................................................................................... 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q13, [x5, #144] // ..........*....................................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q5, [x5, #64] // .....*............................................................................................................................................ + ldr q31, [x5, #32] // ...*.............................................................................................................................................. + add v28.4S, v2.4S, v3.4S // ...............................*.................................................................................................................. + sub v22.4S, v2.4S, v3.4S // ..............................*................................................................................................................... + sub v10.4S, v0.4S, v1.4S // ............................*..................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ sub v3.4S, v25.4S, v26.4S // ..................*............................................................................................................................... + add v21.4S, v25.4S, v26.4S // ...................*.............................................................................................................................. + sub v30.4S, v23.4S, v24.4S // ................*................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v27.4S, v10.4S, v14.4S // ....................................*............................................................................................................. + sqrdmulh v2.4S, v22.4S, v18.4S // .......................................*.......................................................................................................... + mul v6.4S, v22.4S, v20.4S // ......................................*........................................................................................................... 
+ sqrdmulh v19.4S, v10.4S, v13.4S // .....................................*............................................................................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sqrdmulh v11.4S, v30.4S, v17.4S // .......................*.......................................................................................................................... + sqrdmulh v22.4S, v3.4S, v9.4S // .........................*........................................................................................................................ + mul v25.4S, v30.4S, v31.4S // ......................*........................................................................................................................... + mul v4.4S, v3.4S, v5.4S // ........................*......................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + ldr q13, [x5, #112] // ........*......................................................................................................................................... + add v7.4S, v0.4S, v1.4S // .............................*.................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q9, [x5, #96] // .......*.......................................................................................................................................... + add v20.4S, v23.4S, v24.4S // .................*................................................................................................................................ + mls v27.4S, v19.4S, v8.S[0] // .............................................*.................................................................................................... 
+ ldr q19, [x5], #(12*16) // .*................................................................................................................................................ + mls v6.4S, v2.4S, v8.S[0] // ..............................................*................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q18, [x5, #-176] // ..*............................................................................................................................................... + sub v26.4S, v7.4S, v28.4S // ........................................*......................................................................................................... + add v1.4S, v7.4S, v28.4S // .........................................*........................................................................................................ + mls v4.4S, v22.4S, v8.S[0] // .................................*................................................................................................................ + mls v25.4S, v11.4S, v8.S[0] // ................................*................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sub v10.4S, v20.4S, v21.4S // ..........................*....................................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v16.4S, v26.4S, v9.4S // ................................................*................................................................................................. 
+ sqrdmulh v26.4S, v26.4S, v13.4S // ...............................................*.................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sub v31.4S, v27.4S, v6.4S // .....................................................*............................................................................................ + add v20.4S, v20.4S, v21.4S // ...........................*...................................................................................................................... + // gap // .................................................................................................................................................. + sub v12.4S, v25.4S, v4.4S // ..........................................*....................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + mul v5.4S, v10.4S, v19.4S // ...................................*.............................................................................................................. + sqrdmulh v15.4S, v10.4S, v18.4S // ..................................*............................................................................................................... + sqrdmulh v14.4S, v31.4S, v13.4S // ..........................................................*....................................................................................... + mul v31.4S, v31.4S, v9.4S // .........................................................*........................................................................................ + ldr q9, [x4, #32] // ....................*............................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ sqrdmulh v18.4S, v12.4S, v18.4S // ..................................................*............................................................................................... + add v17.4S, v25.4S, v4.4S // ...........................................*...................................................................................................... + mls v16.4S, v26.4S, v8.S[0] // .......................................................*.......................................................................................... + mul v25.4S, v12.4S, v19.4S // .................................................*................................................................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mls v5.4S, v15.4S, v8.S[0] // ............................................*..................................................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + add v6.4S, v27.4S, v6.4S // ......................................................*........................................................................................... + mls v31.4S, v14.4S, v8.S[0] // ...............................................................*.................................................................................. + trn2 v10.4S, v20.4S, v17.4S // ....................................................*............................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + mls v25.4S, v18.4S, v8.S[0] // ........................................................*......................................................................................... + trn1 v14.4S, v20.4S, v17.4S // ...................................................*.............................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + trn1 v24.4S, v1.4S, v6.4S // ...........................................................*...................................................................................... + trn2 v1.4S, v1.4S, v6.4S // ............................................................*..................................................................................... + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + trn1 v22.4S, v16.4S, v31.4S // ....................................................................*............................................................................. + trn2 v27.4S, v16.4S, v31.4S // .....................................................................*............................................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + trn2 v15.4S, v5.4S, v25.4S // ..............................................................*................................................................................... + trn1 v0.4S, v5.4S, v25.4S // .............................................................*.................................................................................... + ldr q25, [x4, #48] // .....................*............................................................................................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + trn2 v17.2D, v1.2D, v27.2D // ............................................................................*..................................................................... + trn1 v28.2D, v1.2D, v27.2D // .............................................................................*.................................................................... 
+ trn1 v4.2D, v24.2D, v22.2D // ...........................................................................*...................................................................... + trn2 v27.2D, v24.2D, v22.2D // ..........................................................................*....................................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + trn1 v29.2D, v14.2D, v0.2D // ................................................................*................................................................................. + trn2 v11.2D, v10.2D, v15.2D // ...................................................................*.............................................................................. + trn2 v3.2D, v14.2D, v0.2D // .................................................................*................................................................................ + trn1 v21.2D, v10.2D, v15.2D // ..................................................................*............................................................................... + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + ldr q1, [x4, #16] // ..............*................................................................................................................................... + sub v24.4S, v27.4S, v17.4S // ..................................................................................*............................................................... + add v19.4S, v27.4S, v17.4S // ...................................................................................*.............................................................. + sub v18.4S, v4.4S, v28.4S // ....................................................................................*............................................................. + add v12.4S, v4.4S, v28.4S // .....................................................................................*............................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ sub v7.4S, v3.4S, v11.4S // ........................................................................*......................................................................... + sub v13.4S, v29.4S, v21.4S // ......................................................................*........................................................................... + add v31.4S, v29.4S, v21.4S // .......................................................................*.......................................................................... + add v22.4S, v3.4S, v11.4S // .........................................................................*........................................................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v29.4S, v24.4S, v25.S[0] // ...........................................................................................*...................................................... + sqrdmulh v5.4S, v24.4S, v25.S[1] // ..........................................................................................*....................................................... + mul v30.4S, v18.4S, v9.S[2] // .........................................................................................*........................................................ 
+ sqrdmulh v15.4S, v18.4S, v9.S[3] // ........................................................................................*......................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sqrdmulh v20.4S, v7.4S, v9.S[1] // ................................................................................*................................................................. + mul v28.4S, v7.4S, v9.S[0] // .................................................................................*................................................................ + ldr q25, [x4], #64 // ...............*.................................................................................................................................. + mul v4.4S, v13.4S, v1.S[2] // ...............................................................................*.................................................................. + sqrdmulh v2.4S, v13.4S, v1.S[3] // ..............................................................................*................................................................... + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + add v17.4S, v31.4S, v22.4S // .......................................................................................*.......................................................... + sub v31.4S, v31.4S, v22.4S // ......................................................................................*........................................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + add v22.4S, v12.4S, v19.4S // .................................................................................................*................................................ + sub v3.4S, v12.4S, v19.4S // ...............................................................................................*.................................................. 
+ mls v29.4S, v5.4S, v8.S[0] // ....................................................................................................*............................................. + mls v30.4S, v15.4S, v8.S[0] // ...................................................................................................*.............................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mls v28.4S, v20.4S, v8.S[0] // ................................................................................................*................................................. + mls v4.4S, v2.4S, v8.S[0] // ............................................................................................*..................................................... + srshr v24.4S, v17.4S, #23 // ..................................................................................................*............................................... + srshr v2.4S, v22.4S, #23 // ........................................................................................................*......................................... 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v14.4S, v31.4S, v25.S[2] // ..............................................................................................*................................................... + sqrdmulh v20.4S, v31.4S, v25.S[3] // .............................................................................................*.................................................... + mul v23.4S, v3.4S, v1.S[0] // ......................................................................................................*........................................... + sqrdmulh v5.4S, v3.4S, v1.S[1] // .....................................................................................................*............................................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + add v6.4S, v30.4S, v29.4S // .............................................................................................................*.................................... + sub v29.4S, v30.4S, v29.4S // ............................................................................................................*..................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sub v0.4S, v4.4S, v28.4S // ..........................................................................................................*....................................... + mls v22.4S, v2.4S, v8.4S // ...............................................................................................................*.................................. 
+ mls v17.4S, v24.4S, v8.4S // .........................................................................................................*........................................ + add v7.4S, v4.4S, v28.4S // ...........................................................................................................*...................................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sqrdmulh v28.4S, v29.4S, v1.S[1] // ...................................................................................................................*.............................. + mul v31.4S, v29.4S, v1.S[0] // ....................................................................................................................*............................. + mls v14.4S, v20.4S, v8.S[0] // .......................................................................................................*.......................................... + mls v23.4S, v5.4S, v8.S[0] // ..............................................................................................................*................................... + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v18.4S, v0.4S, v25.S[2] // .................................................................................................................*................................ + sqrdmulh v1.4S, v0.4S, v25.S[3] // ................................................................................................................*................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + srshr v16.4S, v7.4S, #23 // ..................................................................................................................*............................... + sub v12.4S, v17.4S, v22.4S // .........................................................................................................................*........................ 
+ add v29.4S, v17.4S, v22.4S // ........................................................................................................................*......................... + srshr v21.4S, v6.4S, #23 // .....................................................................................................................*............................ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mls v31.4S, v28.4S, v8.S[0] // .............................................................................................................................*.................... + sub v13.4S, v14.4S, v23.4S // .......................................................................................................................*.......................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mls v7.4S, v16.4S, v8.4S // ..........................................................................................................................*....................... + str q29, [x1], #(16*4) // .................................................................................................................................*................ + mul v26.4S, v12.4S, v25.S[0] // ...................................................................................................................................*.............. + sqrdmulh v27.4S, v12.4S, v25.S[1] // ..................................................................................................................................*............... + mls v18.4S, v1.4S, v8.S[0] // ...........................................................................................................................*...................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ mls v6.4S, v21.4S, v8.4S // ............................................................................................................................*..................... + add v23.4S, v14.4S, v23.4S // ......................................................................................................................*........................... + sqrdmulh v5.4S, v13.4S, v25.S[1] // ...............................................................................................................................*.................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v0.4S, v13.4S, v25.S[0] // ................................................................................................................................*................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + sub v9.4S, v18.4S, v31.4S // .......................................................................................................................................*.......... + add v24.4S, v18.4S, v31.4S // ........................................................................................................................................*......... + mls v26.4S, v27.4S, v8.S[0] // .........................................................................................................................................*........ + str q23, [x1, #-32] // ..............................................................................................................................*................... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ sub v21.4S, v7.4S, v6.4S // ....................................................................................................................................*............. + add v1.4S, v7.4S, v6.4S // .....................................................................................................................................*............ + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mls v0.4S, v5.4S, v8.S[0] // ......................................................................................................................................*........... + str q24, [x1, #-16] // ...............................................................................................................................................*.. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. + mul v13.4S, v9.4S, v25.S[0] // .............................................................................................................................................*.... + str q26, [x2], #(16*4) // .................................................................................................................................................* + str q1, [x1, #-48] // ..............................................................................................................................................*... + add x1, x1, #64 // ................................................................................................................................................*. + mul v17.4S, v21.4S, v25.S[0] // ...........................................................................................................................................*...... + sqrdmulh v30.4S, v21.4S, v25.S[1] // ..........................................................................................................................................*....... + // gap // .................................................................................................................................................. + // gap // .................................................................................................................................................. 
+ sqrdmulh v26.4S, v9.4S, v25.S[1] // ............................................................................................................................................*..... + + // ----------------------------------------------------------------- new position ------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------------------- + // ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // ...*.............................................................................................................................................. + // ldr q30, [x5], #(12*16) // .............................*.................................................................................................................... + // ldr q24, [x5, #-176] // ...............................*.................................................................................................................. + // ldr q14, [x5, #-160] // .........*........................................................................................................................................ + // ldr q26, [x5, #-144] // .*................................................................................................................................................ + // ldr q0, [x5, #-128] // ........*......................................................................................................................................... + // ldr q12, [x5, #-112] // ....*............................................................................................................................................. + // ldr q25, [x5, #-96] // ..........................*....................................................................................................................... 
+ // ldr q7, [x5, #-80] // ........................*......................................................................................................................... + // ldr q17, [x5, #-64] // *................................................................................................................................................. + // ldr q13, [x5, #-48] // .......*.......................................................................................................................................... + // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ..*............................................................................................................................................... + // ldr q11, [x5, #-32] // ......*........................................................................................................................................... + // ldr q5, [x5, #-16] // .....*............................................................................................................................................ + // ldr q15, [x4, #16] // ........................................................................*......................................................................... + // ldr q31, [x4], #64 // .......................................................................................*.......................................................... + // sub v16.4S, v19.4S, v20.4S // ...............*.................................................................................................................................. + // add v19.4S, v19.4S, v20.4S // ...........................*...................................................................................................................... + // sub v20.4S, v21.4S, v22.4S // .............*.................................................................................................................................... 
+ // add v9.4S, v21.4S, v22.4S // ..............*................................................................................................................................... + // ldr q28, [x4, #-32] // ..............................................*................................................................................................... + // ldr q6, [x4, #-16] // ...............................................................*.................................................................................. + // mul v14.4S, v16.4S, v14.4S // ......................*........................................................................................................................... + // sqrdmulh v26.4S, v16.4S, v26.4S // ....................*............................................................................................................................. + // mul v0.4S, v20.4S, v0.4S // .......................*.......................................................................................................................... + // sqrdmulh v12.4S, v20.4S, v12.4S // .....................*............................................................................................................................ + // sub v20.4S, v19.4S, v9.4S // ....................................*............................................................................................................. + // add v19.4S, v19.4S, v9.4S // ........................................*......................................................................................................... + // sub v16.4S, v1.4S, v2.4S // ............*..................................................................................................................................... + // add v1.4S, v1.4S, v2.4S // .........................*........................................................................................................................ 
+ // sub v9.4S, v3.4S, v4.4S // ...........*...................................................................................................................................... + // add v4.4S, v3.4S, v4.4S // ..........*....................................................................................................................................... + // mls v14.4S, v26.4S, v8.S[0] // ...................................*.............................................................................................................. + // mls v0.4S, v12.4S, v8.S[0] // ..................................*............................................................................................................... + // sqrdmulh v26.4S, v20.4S, v24.4S // ...........................................*...................................................................................................... + // mul v12.4S, v20.4S, v30.4S // ..........................................*....................................................................................................... + // mul v17.4S, v16.4S, v17.4S // ................*................................................................................................................................. + // sqrdmulh v13.4S, v16.4S, v13.4S // ...................*.............................................................................................................................. + // mul v20.4S, v9.4S, v11.4S // ..................*............................................................................................................................... + // sqrdmulh v11.4S, v9.4S, v5.4S // .................*................................................................................................................................ 
+ // sub v5.4S, v1.4S, v4.4S // ................................*................................................................................................................. + // add v1.4S, v1.4S, v4.4S // .................................*................................................................................................................ + // sub v4.4S, v14.4S, v0.4S // .........................................*........................................................................................................ + // add v14.4S, v14.4S, v0.4S // ................................................*................................................................................................. + // mls v12.4S, v26.4S, v8.S[0] // ...................................................*.............................................................................................. + // mls v17.4S, v13.4S, v8.S[0] // ............................*..................................................................................................................... + // mls v20.4S, v11.4S, v8.S[0] // ..............................*................................................................................................................... + // sqrdmulh v13.4S, v5.4S, v7.4S // ......................................*........................................................................................................... + // mul v26.4S, v5.4S, v25.4S // .....................................*............................................................................................................ + // mul v0.4S, v4.4S, v30.4S // ..................................................*............................................................................................... + // sqrdmulh v30.4S, v4.4S, v24.4S // ...............................................*.................................................................................................. 
+ // trn1 v24.4S, v19.4S, v14.4S // ........................................................*......................................................................................... + // trn2 v19.4S, v19.4S, v14.4S // ......................................................*........................................................................................... + // sub v14.4S, v17.4S, v20.4S // .......................................*.......................................................................................................... + // add v17.4S, v17.4S, v20.4S // ....................................................*............................................................................................. + // mls v26.4S, v13.4S, v8.S[0] // .................................................*................................................................................................ + // mls v0.4S, v30.4S, v8.S[0] // .......................................................*.......................................................................................... + // mul v13.4S, v14.4S, v25.4S // .............................................*.................................................................................................... + // sqrdmulh v14.4S, v14.4S, v7.4S // ............................................*..................................................................................................... + // trn1 v25.4S, v1.4S, v17.4S // .........................................................*........................................................................................ + // trn2 v17.4S, v1.4S, v17.4S // ..........................................................*....................................................................................... 
+ // trn1 v30.4S, v12.4S, v0.4S // ..............................................................*................................................................................... + // trn2 v0.4S, v12.4S, v0.4S // .............................................................*.................................................................................... + // mls v13.4S, v14.4S, v8.S[0] // .....................................................*............................................................................................ + // trn1 v14.2D, v24.2D, v30.2D // ....................................................................*............................................................................. + // trn2 v30.2D, v24.2D, v30.2D // ......................................................................*........................................................................... + // trn1 v24.2D, v19.2D, v0.2D // .......................................................................*.......................................................................... + // trn2 v19.2D, v19.2D, v0.2D // .....................................................................*............................................................................ + // trn1 v0.4S, v26.4S, v13.4S // ...........................................................*...................................................................................... + // trn2 v13.4S, v26.4S, v13.4S // ............................................................*..................................................................................... + // sub v26.4S, v14.4S, v24.4S // ..............................................................................*................................................................... 
+ // add v24.4S, v14.4S, v24.4S // ...............................................................................*.................................................................. + // sub v14.4S, v30.4S, v19.4S // .............................................................................*.................................................................... + // add v30.4S, v30.4S, v19.4S // ................................................................................*................................................................. + // trn2 v19.2D, v25.2D, v0.2D // ...................................................................*.............................................................................. + // trn1 v0.2D, v25.2D, v0.2D // ..................................................................*............................................................................... + // trn2 v12.2D, v17.2D, v13.2D // ................................................................*................................................................................. + // trn1 v17.2D, v17.2D, v13.2D // .................................................................*................................................................................ + // sqrdmulh v13.4S, v26.4S, v15.S[3] // .........................................................................................*........................................................ + // mul v26.4S, v26.4S, v15.S[2] // ........................................................................................*......................................................... + // sqrdmulh v25.4S, v14.4S, v28.S[1] // .....................................................................................*............................................................ 
+ // mul v14.4S, v14.4S, v28.S[0] // ......................................................................................*........................................................... + // sub v7.4S, v19.4S, v12.4S // .........................................................................*........................................................................ + // add v19.4S, v19.4S, v12.4S // ..........................................................................*....................................................................... + // sub v12.4S, v0.4S, v17.4S // ...........................................................................*...................................................................... + // add v17.4S, v0.4S, v17.4S // ............................................................................*..................................................................... + // sub v0.4S, v24.4S, v30.4S // ...........................................................................................*...................................................... + // add v30.4S, v24.4S, v30.4S // ..........................................................................................*....................................................... + // sqrdmulh v24.4S, v12.4S, v28.S[3] // ....................................................................................*............................................................. + // mul v12.4S, v12.4S, v28.S[2] // ...................................................................................*.............................................................. + // sqrdmulh v20.4S, v7.4S, v6.S[1] // ..................................................................................*............................................................... 
+ // mul v7.4S, v7.4S, v6.S[0] // .................................................................................*................................................................ + // mls v26.4S, v13.4S, v8.S[0] // .................................................................................................*................................................ + // sqrdmulh v13.4S, v0.4S, v31.S[3] // .....................................................................................................*............................................ + // mul v0.4S, v0.4S, v31.S[2] // ....................................................................................................*............................................. + // sub v1.4S, v17.4S, v19.4S // .............................................................................................*.................................................... + // mls v14.4S, v25.4S, v8.S[0] // ................................................................................................*................................................. + // add v17.4S, v17.4S, v19.4S // ............................................................................................*..................................................... + // srshr v19.4S, v30.4S, #23 // ..................................................................................................*............................................... + // mls v12.4S, v24.4S, v8.S[0] // ...............................................................................................*.................................................. + // mls v7.4S, v20.4S, v8.S[0] // ..............................................................................................*................................................... 
+ // sqrdmulh v24.4S, v1.4S, v15.S[1] // .......................................................................................................*.......................................... + // mul v25.4S, v1.4S, v15.S[0] // ......................................................................................................*........................................... + // mls v0.4S, v13.4S, v8.S[0] // ................................................................................................................*................................. + // srshr v13.4S, v17.4S, #23 // ...................................................................................................*.............................................. + // mls v30.4S, v19.4S, v8.4S // ............................................................................................................*..................................... + // sub v19.4S, v26.4S, v14.4S // ..........................................................................................................*....................................... + // add v14.4S, v26.4S, v14.4S // .............................................................................................................*.................................... + // sub v26.4S, v12.4S, v7.4S // .........................................................................................................*........................................ + // add v12.4S, v12.4S, v7.4S // ........................................................................................................*......................................... + // mls v25.4S, v24.4S, v8.S[0] // .................................................................................................................*................................ 
+ // mls v17.4S, v13.4S, v8.4S // ...........................................................................................................*...................................... + // sqrdmulh v13.4S, v19.4S, v31.S[3] // ...................................................................................................................*.............................. + // mul v19.4S, v19.4S, v31.S[2] // ..................................................................................................................*............................... + // srshr v24.4S, v14.4S, #23 // ....................................................................................................................*............................. + // sqrdmulh v7.4S, v26.4S, v15.S[1] // ..............................................................................................................*................................... + // mul v26.4S, v26.4S, v15.S[0] // ...............................................................................................................*.................................. + // srshr v20.4S, v12.4S, #23 // .......................................................................................................................*.......................... + // add v1.4S, v0.4S, v25.4S // ................................................................................................................................*................. + // sub v0.4S, v0.4S, v25.4S // .........................................................................................................................*........................ + // add v25.4S, v30.4S, v17.4S // ......................................................................................................................*........................... 
+ // sub v17.4S, v30.4S, v17.4S // .....................................................................................................................*............................ + // mls v14.4S, v24.4S, v8.4S // ..........................................................................................................................*....................... + // mls v19.4S, v13.4S, v8.S[0] // ..............................................................................................................................*................... + // mls v12.4S, v20.4S, v8.4S // ...............................................................................................................................*.................. + // mls v26.4S, v7.4S, v8.S[0] // ........................................................................................................................*......................... + // str q1, [x1, #32] // ......................................................................................................................................*........... + // sqrdmulh v13.4S, v0.4S, v31.S[1] // .................................................................................................................................*................ + // mul v0.4S, v0.4S, v31.S[0] // ..................................................................................................................................*............... + // str q25, [x1], #(16*4) // ...........................................................................................................................*...................... + // sqrdmulh v30.4S, v17.4S, v31.S[1] // .............................................................................................................................*.................... 
+ // mul v24.4S, v17.4S, v31.S[0] // ............................................................................................................................*..................... + // sub v17.4S, v14.4S, v12.4S // .......................................................................................................................................*.......... + // add v14.4S, v14.4S, v12.4S // ........................................................................................................................................*......... + // mls v0.4S, v13.4S, v8.S[0] // .........................................................................................................................................*........ + // sub v13.4S, v19.4S, v26.4S // ...................................................................................................................................*.............. + // add v19.4S, v19.4S, v26.4S // ....................................................................................................................................*............. + // mls v24.4S, v30.4S, v8.S[0] // .....................................................................................................................................*............ + // sqrdmulh v30.4S, v17.4S, v31.S[1] // ................................................................................................................................................*. + // mul v17.4S, v17.4S, v31.S[0] // ...............................................................................................................................................*.. 
+ // sqrdmulh v26.4S, v13.4S, v31.S[1] // .................................................................................................................................................* + // mul v13.4S, v13.4S, v31.S[0] // ...........................................................................................................................................*...... + // str q14, [x1, #-48] // .............................................................................................................................................*.... + // str q19, [x1, #-16] // ..........................................................................................................................................*....... + // add x1, x1, #64 // ..............................................................................................................................................*... + // str q24, [x2], #(16*4) // ............................................................................................................................................*..... sub count, count, #1 layer45678_start: - ldr q9, [x5], #(12*16) // ..e..................................................................................................................................................... - ldr q25, [x5, #-160] // ....e................................................................................................................................................... - mls v11.4S, v10.4S, v8.S[0] // ........................................................................................................................................*............... - str q26, [x1, #-32] // ................................................................................................................................................*....... 
- str q24, [x1, #-48] // ...............................................................................................................................................*........ - add x1, x1, #64 // ......................................................................................................................................................*. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // e....................................................................................................................................................... - ldr q26, [x5, #-112] // .......e................................................................................................................................................ + // Instructions: 152 + // Expected cycles: 51 + // Expected IPC: 2.98 + // + // Wall time: 187.45s + // User time: 187.45s + // + // ------------------------------------------------------------------ original position ------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + mls v17.4S, v30.4S, v8.S[0] // ...................................................................................................................................*.................... + mls v13.4S, v26.4S, v8.S[0] // .............................................................................................................................................*.......... 
+ ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // e....................................................................................................................................................... + ldr q30, [x5], #(12*16) // ..e..................................................................................................................................................... + ldr q24, [x5, #-176] // ...e.................................................................................................................................................... + str q0, [x2, #-32] // ....................................................................................................................................................*... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q14, [x5, #-160] // ....e................................................................................................................................................... + ldr q26, [x5, #-144] // .....e.................................................................................................................................................. ldr q0, [x5, #-128] // ......e................................................................................................................................................. - mls v12.4S, v14.4S, v8.S[0] // .............................................................................................................................................*.......... - mls v6.4S, v23.4S, v8.S[0] // ...................................................................................................................................*.................... 
- ldr q31, [x4], #64 // ......................................................................e................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - ldr q23, [x5, #-48] // ...............................e........................................................................................................................ - ldr q7, [x5, #-16] // .................................e...................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + ldr q12, [x5, #-112] // .......e................................................................................................................................................ + ldr q25, [x5, #-96] // ............................e........................................................................................................................... + ldr q7, [x5, #-80] // .............................e.......................................................................................................................... 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - str q11, [x2, #-32] // ....................................................................................................................................................*... - ldr q2, [x5, #-64] // ..............................e......................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + str q17, [x2, #-48] // ...................................................................................................................................................*.... + str q13, [x2, #-16] // .....................................................................................................................................................*.. + ldr q17, [x5, #-64] // ..............................e......................................................................................................................... + ldr q13, [x5, #-48] // ...............................e........................................................................................................................ 
+ add x2, x2, #64 // .......................................................................................................................................................* // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e...................................................................................................................................................... + ldr q11, [x5, #-32] // ................................e....................................................................................................................... + ldr q5, [x5, #-16] // .................................e...................................................................................................................... // gap // ........................................................................................................................................................ - str q12, [x2, #-16] // .....................................................................................................................................................*.. - str q6, [x2, #-48] // ...................................................................................................................................................*.... - ldr q6, [x5, #-176] // ...e.................................................................................................................................................... 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add x2, x2, #64 // .......................................................................................................................................................* // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - ldr q1, [x5, #-144] // .....e.................................................................................................................................................. + ldr q15, [x4, #16] // .......................................................................e................................................................................ + ldr q31, [x4], #64 // ......................................................................e................................................................................. // gap // ........................................................................................................................................................ - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .e...................................................................................................................................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sub v16.4S, v19.4S, v20.4S // ........e............................................................................................................................................... + add v19.4S, v19.4S, v20.4S // .........e.............................................................................................................................................. + sub v20.4S, v21.4S, v22.4S // .............e.......................................................................................................................................... + add v9.4S, v21.4S, v22.4S // ..............e......................................................................................................................................... + ldr q28, [x4, #-32] // ........................................................................e............................................................................... + ldr q6, [x4, #-16] // .........................................................................e.............................................................................. // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ @@ -974,113 +990,108 @@ layer45678_start: // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v4.4S, v18.4S, v19.4S // .........e.............................................................................................................................................. - sub v24.4S, v20.4S, v21.4S // .............e.......................................................................................................................................... - sub v11.4S, v18.4S, v19.4S // ........e............................................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mul v14.4S, v16.4S, v14.4S // ...........e............................................................................................................................................ 
+ sqrdmulh v26.4S, v16.4S, v26.4S // ..........e............................................................................................................................................. + mul v0.4S, v20.4S, v0.4S // ................e....................................................................................................................................... + sqrdmulh v12.4S, v20.4S, v12.4S // ...............e........................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v12.4S, v20.4S, v21.4S // ..............e......................................................................................................................................... // gap // ........................................................................................................................................................ + sub v20.4S, v19.4S, v9.4S // ..................e..................................................................................................................................... + add v19.4S, v19.4S, v9.4S // ...................e.................................................................................................................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mul v27.4S, v11.4S, v25.4S // ..........e............................................................................................................................................. - mul v22.4S, v24.4S, v0.4S // ...............e........................................................................................................................................ - sqrdmulh v5.4S, v24.4S, v26.4S // ................e....................................................................................................................................... - sqrdmulh v13.4S, v11.4S, v1.4S // ...........e............................................................................................................................................ - ldr q26, [x5, #-32] // ................................e....................................................................................................................... + sub v16.4S, v1.4S, v2.4S // ..................................e..................................................................................................................... 
+ add v1.4S, v1.4S, v2.4S // ...................................e.................................................................................................................... + sub v9.4S, v3.4S, v4.4S // .......................................e................................................................................................................ + add v4.4S, v3.4S, v4.4S // ........................................e............................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v0.4S, v4.4S, v12.4S // ..................e..................................................................................................................................... - ldr q25, [x5, #-96] // ............................e........................................................................................................................... // gap // ........................................................................................................................................................ + mls v14.4S, v26.4S, v8.S[0] // ............e........................................................................................................................................... + mls v0.4S, v12.4S, v8.S[0] // .................e...................................................................................................................................... 
+ sqrdmulh v26.4S, v20.4S, v24.4S // ....................e................................................................................................................................... + mul v12.4S, v20.4S, v30.4S // .....................e.................................................................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mul v17.4S, v16.4S, v17.4S // .....................................e.................................................................................................................. + sqrdmulh v13.4S, v16.4S, v13.4S // ....................................e................................................................................................................... + mul v20.4S, v9.4S, v11.4S // ..........................................e............................................................................................................. + sqrdmulh v11.4S, v9.4S, v5.4S // .........................................e.............................................................................................................. // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - add v20.4S, v14.4S, v15.4S // ...................................e.................................................................................................................... - sub v3.4S, v14.4S, v15.4S // ..................................e..................................................................................................................... // gap // ........................................................................................................................................................ - sub v10.4S, v16.4S, v17.4S // .......................................e................................................................................................................ // gap // ........................................................................................................................................................ + sub v5.4S, v1.4S, v4.4S // ............................................e........................................................................................................... + add v1.4S, v1.4S, v4.4S // .............................................e.......................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v24.4S, v16.4S, v17.4S // ........................................e............................................................................................................... 
- mls v27.4S, v13.4S, v8.S[0] // ............e........................................................................................................................................... - mls v22.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... - mul v21.4S, v0.4S, v9.4S // ....................e................................................................................................................................... - ldr q16, [x5, #-80] // .............................e.......................................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v3.4S, v23.4S // .....................................e.................................................................................................................. - mul v1.4S, v10.4S, v26.4S // .........................................e.............................................................................................................. - sqrdmulh v23.4S, v10.4S, v7.4S // ..........................................e............................................................................................................. // gap // ........................................................................................................................................................ 
- mul v26.4S, v3.4S, v2.4S // ....................................e................................................................................................................... - ldr q3, [x4, #-48] // .......................................................................e................................................................................ + sub v4.4S, v14.4S, v0.4S // .......................e................................................................................................................................ + add v14.4S, v14.4S, v0.4S // ........................e............................................................................................................................... + mls v12.4S, v26.4S, v8.S[0] // ......................e................................................................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v2.4S, v4.4S, v12.4S // ...................e.................................................................................................................................... - sqrdmulh v19.4S, v0.4S, v6.4S // .....................e.................................................................................................................................. - sub v14.4S, v20.4S, v24.4S // ............................................e........................................................................................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v17.4S, v13.4S, v8.S[0] // ......................................e................................................................................................................. + mls v20.4S, v11.4S, v8.S[0] // ...........................................e............................................................................................................ + sqrdmulh v13.4S, v5.4S, v7.4S // ..............................................e......................................................................................................... + mul v26.4S, v5.4S, v25.4S // ...............................................e........................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v10.4S, v27.4S, v22.4S // ........................e............................................................................................................................... - sub v7.4S, v27.4S, v22.4S // .......................e................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ + mul v0.4S, v4.4S, v30.4S // ..........................e............................................................................................................................. + sqrdmulh v30.4S, v4.4S, v24.4S // .........................e.............................................................................................................................. + trn1 v24.4S, v19.4S, v14.4S // ......................................................e................................................................................................. + trn2 v19.4S, v19.4S, v14.4S // .......................................................e................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v13.4S, v14.4S, v16.4S // ...............................................e........................................................................................................ - mls v26.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. 
- mls v1.4S, v23.4S, v8.S[0] // ...........................................e............................................................................................................ - mul v5.4S, v14.4S, v25.4S // ..............................................e......................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - trn1 v11.4S, v2.4S, v10.4S // ......................................................e................................................................................................. - trn2 v23.4S, v2.4S, v10.4S // .......................................................e................................................................................................ - mul v29.4S, v7.4S, v9.4S // .........................e.............................................................................................................................. - sqrdmulh v10.4S, v7.4S, v6.4S // ..........................e............................................................................................................................. // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v30.4S, v20.4S, v24.4S // .............................................e.......................................................................................................... - mls v21.4S, v19.4S, v8.S[0] // ......................e................................................................................................................................. - ldr q24, [x4, #-16] // .........................................................................e.............................................................................. + sub v14.4S, v17.4S, v20.4S // .................................................e...................................................................................................... + add v17.4S, v17.4S, v20.4S // ..................................................e..................................................................................................... + mls v26.4S, v13.4S, v8.S[0] // ................................................e....................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v28.4S, v26.4S, v1.4S // .................................................e...................................................................................................... + mls v0.4S, v30.4S, v8.S[0] // ...........................e............................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mls v5.4S, v13.4S, v8.S[0] // ................................................e....................................................................................................... - add v14.4S, v26.4S, v1.4S // ..................................................e..................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - mls v29.4S, v10.4S, v8.S[0] // ...........................e............................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mul v13.4S, v14.4S, v25.4S // ....................................................e................................................................................................... + sqrdmulh v14.4S, v14.4S, v7.4S // ...................................................e.................................................................................................... + trn1 v25.4S, v1.4S, v17.4S // ..............................................................e......................................................................................... + trn2 v17.4S, v1.4S, v17.4S // ...............................................................e........................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
@@ -1088,238 +1099,237 @@ layer45678_start: // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - trn1 v6.4S, v30.4S, v14.4S // ..............................................................e......................................................................................... - trn2 v30.4S, v30.4S, v14.4S // ...............................................................e........................................................................................ - mul v14.4S, v28.4S, v25.4S // ...................................................e.................................................................................................... - sqrdmulh v1.4S, v28.4S, v16.4S // ....................................................e................................................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ + trn1 v30.4S, v12.4S, v0.4S // ........................................................e............................................................................................... + trn2 v0.4S, v12.4S, v0.4S // .........................................................e.............................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - trn1 v19.4S, v21.4S, v29.4S // ........................................................e............................................................................................... - trn2 v26.4S, v21.4S, v29.4S // .........................................................e.............................................................................................. // gap // ........................................................................................................................................................ 
+ mls v13.4S, v14.4S, v8.S[0] // .....................................................e.................................................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mls v14.4S, v1.4S, v8.S[0] // .....................................................e.................................................................................................. // gap // ........................................................................................................................................................ + trn1 v14.2D, v24.2D, v30.2D // ............................................................e........................................................................................... + trn2 v30.2D, v24.2D, v30.2D // ..........................................................e............................................................................................. 
+ trn1 v24.2D, v19.2D, v0.2D // .............................................................e.......................................................................................... + trn2 v19.2D, v19.2D, v0.2D // ...........................................................e............................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - trn1 v16.2D, v11.2D, v19.2D // ............................................................e........................................................................................... - trn1 v20.2D, v23.2D, v26.2D // .............................................................e.......................................................................................... - trn2 v4.2D, v11.2D, v19.2D // ..........................................................e............................................................................................. - trn2 v0.2D, v23.2D, v26.2D // ...........................................................e............................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - ldr q29, [x4, #-32] // ........................................................................e............................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + trn1 v0.4S, v26.4S, v13.4S // ................................................................e....................................................................................... + trn2 v13.4S, v26.4S, v13.4S // .................................................................e...................................................................................... + sub v26.4S, v14.4S, v24.4S // ..........................................................................e............................................................................. 
+ add v24.4S, v14.4S, v24.4S // ...........................................................................e............................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v21.4S, v16.4S, v20.4S // ..........................................................................e............................................................................. - trn1 v10.4S, v5.4S, v14.4S // ................................................................e....................................................................................... - trn2 v14.4S, v5.4S, v14.4S // .................................................................e...................................................................................... + sub v14.4S, v30.4S, v19.4S // ...............................................................................e........................................................................ + add v30.4S, v30.4S, v19.4S // ................................................................................e....................................................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v7.4S, v4.4S, v0.4S // ...............................................................................e........................................................................ // gap // ........................................................................................................................................................ + trn2 v19.2D, v25.2D, v0.2D // ..................................................................e..................................................................................... + trn1 v0.2D, v25.2D, v0.2D // ....................................................................e................................................................................... + trn2 v12.2D, v17.2D, v13.2D // ...................................................................e.................................................................................... + trn1 v17.2D, v17.2D, v13.2D // .....................................................................e.................................................................................. // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sqrdmulh v13.4S, v26.4S, v15.S[3] // ............................................................................e........................................................................... + mul v26.4S, v26.4S, v15.S[2] // .............................................................................e.......................................................................... + sqrdmulh v25.4S, v14.4S, v28.S[1] // .................................................................................e...................................................................... + mul v14.4S, v14.4S, v28.S[0] // ..................................................................................e..................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - trn2 v23.2D, v6.2D, v10.2D // ..................................................................e..................................................................................... 
- trn1 v17.2D, v6.2D, v10.2D // ....................................................................e................................................................................... - trn1 v10.2D, v30.2D, v14.2D // .....................................................................e.................................................................................. - trn2 v11.2D, v30.2D, v14.2D // ...................................................................e.................................................................................... // gap // ........................................................................................................................................................ + sub v7.4S, v19.4S, v12.4S // .........................................................................................e.............................................................. + add v19.4S, v19.4S, v12.4S // ..........................................................................................e............................................................. + sub v12.4S, v0.4S, v17.4S // ....................................................................................e................................................................... + add v17.4S, v0.4S, v17.4S // .....................................................................................e.................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sqrdmulh v1.4S, v21.4S, v3.S[3] // .............................................................................e.......................................................................... 
- add v30.4S, v16.4S, v20.4S // ...........................................................................e............................................................................ - mul v20.4S, v7.4S, v29.S[0] // .................................................................................e...................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sub v0.4S, v24.4S, v30.4S // ..............................................................................................e......................................................... + add v30.4S, v24.4S, v30.4S // ...............................................................................................e........................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - add v16.4S, v4.4S, v0.4S // ................................................................................e....................................................................... - sub v26.4S, v17.4S, v10.4S // ....................................................................................e................................................................... 
- sub v19.4S, v23.4S, v11.4S // .........................................................................................e.............................................................. - add v28.4S, v17.4S, v10.4S // .....................................................................................e.................................................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v12.4S, v28.S[3] // ......................................................................................e................................................................. + mul v12.4S, v12.4S, v28.S[2] // .......................................................................................e................................................................ + sqrdmulh v20.4S, v7.4S, v6.S[1] // ...........................................................................................e............................................................ + mul v7.4S, v7.4S, v6.S[0] // ............................................................................................e........................................................... // gap // ........................................................................................................................................................ - mul v4.4S, v21.4S, v3.S[2] // ............................................................................e........................................................................... 
- add v14.4S, v23.4S, v11.4S // ..........................................................................................e............................................................. - sqrdmulh v6.4S, v7.4S, v29.S[1] // ..................................................................................e..................................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v26.4S, v13.4S, v8.S[0] // ..............................................................................e......................................................................... + sqrdmulh v13.4S, v0.4S, v31.S[3] // ................................................................................................e....................................................... + mul v0.4S, v0.4S, v31.S[2] // .................................................................................................e...................................................... + sub v1.4S, v17.4S, v19.4S // ........................................................................................................e............................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- sqrdmulh v21.4S, v26.4S, v29.S[3] // .......................................................................................e................................................................ - mul v11.4S, v26.4S, v29.S[2] // ......................................................................................e................................................................. - mul v26.4S, v19.4S, v24.S[0] // ...........................................................................................e............................................................ // gap // ........................................................................................................................................................ - sqrdmulh v24.4S, v19.4S, v24.S[1] // ............................................................................................e........................................................... // gap // ........................................................................................................................................................ + mls v14.4S, v25.4S, v8.S[0] // ...................................................................................e.................................................................... + add v17.4S, v17.4S, v19.4S // .........................................................................................................e.............................................. + srshr v19.4S, v30.4S, #23 // ..................................................................................................................e..................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- add v7.4S, v30.4S, v16.4S // ...............................................................................................e........................................................ - add v23.4S, v28.4S, v14.4S // .........................................................................................................e.............................................. - sub v10.4S, v28.4S, v14.4S // ........................................................................................................e............................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v12.4S, v24.4S, v8.S[0] // ........................................................................................e............................................................... + mls v7.4S, v20.4S, v8.S[0] // .............................................................................................e.......................................................... + sqrdmulh v24.4S, v1.4S, v15.S[1] // ..........................................................................................................e............................................. + mul v25.4S, v1.4S, v15.S[0] // ...........................................................................................................e............................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - mls v4.4S, v1.4S, v8.S[0] // ..............................................................................e......................................................................... - mls v20.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... - sub v19.4S, v30.4S, v16.4S // ..............................................................................................e......................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v0.4S, v13.4S, v8.S[0] // ..................................................................................................e..................................................... + srshr v13.4S, v17.4S, #23 // ......................................................................................................................e................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- srshr v14.4S, v23.4S, #23 // ......................................................................................................................e................................. - srshr v30.4S, v7.4S, #23 // ..................................................................................................................e..................................... - mls v11.4S, v21.4S, v8.S[0] // ........................................................................................e............................................................... - mls v26.4S, v24.4S, v8.S[0] // .............................................................................................e.......................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v30.4S, v19.4S, v8.4S // ...................................................................................................................e.................................... + sub v19.4S, v26.4S, v14.4S // ...................................................................................................e.................................................... + add v14.4S, v26.4S, v14.4S // ....................................................................................................e................................................... // gap // ........................................................................................................................................................ 
- sqrdmulh v5.4S, v10.4S, v3.S[1] // ...........................................................................................................e............................................ - mul v16.4S, v19.4S, v31.S[2] // ................................................................................................e....................................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sub v26.4S, v12.4S, v7.4S // .............................................................................................................e.......................................... + add v12.4S, v12.4S, v7.4S // ..............................................................................................................e......................................... + mls v25.4S, v24.4S, v8.S[0] // ............................................................................................................e........................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- sub v9.4S, v4.4S, v20.4S // ...................................................................................................e.................................................... - add v1.4S, v4.4S, v20.4S // ....................................................................................................e................................................... - mul v20.4S, v10.4S, v3.S[0] // ..........................................................................................................e............................................. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v17.4S, v13.4S, v8.4S // .......................................................................................................................e................................ + sqrdmulh v13.4S, v19.4S, v31.S[3] // .....................................................................................................e.................................................. + mul v19.4S, v19.4S, v31.S[2] // ......................................................................................................e................................................. + srshr v24.4S, v14.4S, #23 // ....................................................................................................................e................................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - mls v23.4S, v14.4S, v8.4S // .......................................................................................................................e................................ - mls v7.4S, v30.4S, v8.4S // ...................................................................................................................e.................................... - add v21.4S, v11.4S, v26.4S // ..............................................................................................................e......................................... - sub v22.4S, v11.4S, v26.4S // .............................................................................................................e.......................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sqrdmulh v7.4S, v26.4S, v15.S[1] // ...............................................................................................................e........................................ + mul v26.4S, v26.4S, v15.S[0] // ................................................................................................................e....................................... + srshr v20.4S, v12.4S, #23 // ........................................................................................................................e............................... // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - sqrdmulh v14.4S, v19.4S, v31.S[3] // .................................................................................................e...................................................... - mul v4.4S, v9.4S, v31.S[2] // .....................................................................................................e.................................................. - sqrdmulh v26.4S, v9.4S, v31.S[3] // ......................................................................................................e................................................. - srshr v24.4S, v1.4S, #23 // ....................................................................................................................e................................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + add v1.4S, v0.4S, v25.4S // .....................................................................................................................................e.................. + sub v0.4S, v0.4S, v25.4S // ....................................................................................................................................e................... // gap // ........................................................................................................................................................ 
- sqrdmulh v30.4S, v22.4S, v3.S[1] // ................................................................................................................e....................................... - mul v15.4S, v22.4S, v3.S[0] // ...............................................................................................................e........................................ // gap // ........................................................................................................................................................ - srshr v9.4S, v21.4S, #23 // ........................................................................................................................e............................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - sub v10.4S, v7.4S, v23.4S // ..........................................................................................................................e............................. - mls v20.4S, v5.4S, v8.S[0] // ............................................................................................................e........................................... + add v25.4S, v30.4S, v17.4S // ...........................................................................................................................e............................ 
+ sub v17.4S, v30.4S, v17.4S // ..........................................................................................................................e............................. + mls v14.4S, v24.4S, v8.4S // .....................................................................................................................e.................................. + mls v19.4S, v13.4S, v8.S[0] // .......................................................................................................e................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + mls v12.4S, v20.4S, v8.4S // .........................................................................................................................e.............................. + mls v26.4S, v7.4S, v8.S[0] // .................................................................................................................e...................................... + str q1, [x1, #32] // ................................................................................................................................................e....... + sqrdmulh v13.4S, v0.4S, v31.S[1] // ......................................................................................................................................e................. 
+ mul v0.4S, v0.4S, v31.S[0] // .......................................................................................................................................e................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mls v16.4S, v14.4S, v8.S[0] // ..................................................................................................e..................................................... - mls v1.4S, v24.4S, v8.4S // .....................................................................................................................e.................................. - add v14.4S, v7.4S, v23.4S // ...........................................................................................................................e............................ - mls v4.4S, v26.4S, v8.S[0] // .......................................................................................................e................................................ // gap // ........................................................................................................................................................ + str q25, [x1], #(16*4) // ..............................................................................................................................................e......... + sqrdmulh v30.4S, v17.4S, v31.S[1] // ............................................................................................................................e........................... + mul v24.4S, v17.4S, v31.S[0] // .............................................................................................................................e.......................... 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mls v21.4S, v9.4S, v8.4S // .........................................................................................................................e.............................. - mul v7.4S, v10.4S, v31.S[0] // ............................................................................................................................e........................... - mls v15.4S, v30.4S, v8.S[0] // .................................................................................................................e...................................... - sqrdmulh v26.4S, v10.4S, v31.S[1] // .............................................................................................................................e.......................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- str q14, [x1], #(16*4) // ..............................................................................................................................................e......... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sub v17.4S, v14.4S, v12.4S // ...............................................................................................................................e........................ + add v14.4S, v14.4S, v12.4S // ................................................................................................................................e....................... + mls v0.4S, v13.4S, v8.S[0] // ........................................................................................................................................e............... + sub v13.4S, v19.4S, v26.4S // .........................................................................................................................................e.............. 
// gap // ........................................................................................................................................................ - sub v10.4S, v16.4S, v20.4S // ....................................................................................................................................e................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + add v19.4S, v19.4S, v26.4S // ..........................................................................................................................................e............. + mls v24.4S, v30.4S, v8.S[0] // ..............................................................................................................................e......................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
- sub v30.4S, v1.4S, v21.4S // ...............................................................................................................................e........................ - sub v14.4S, v4.4S, v15.4S // .........................................................................................................................................e.............. - add v24.4S, v1.4S, v21.4S // ................................................................................................................................e....................... - mls v7.4S, v26.4S, v8.S[0] // ..............................................................................................................................e......................... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v17.4S, v31.S[1] // .................................................................................................................................e...................... + mul v17.4S, v17.4S, v31.S[0] // ..................................................................................................................................e..................... + sqrdmulh v26.4S, v13.4S, v31.S[1] // ...........................................................................................................................................e............ + mul v13.4S, v13.4S, v31.S[0] // ............................................................................................................................................e........... + str q14, [x1, #-48] // ...............................................................................................................................................e........ 
// gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................e................. - sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................e................ - add v2.4S, v4.4S, v15.4S // ..........................................................................................................................................e............. - add v26.4S, v16.4S, v20.4S // .....................................................................................................................................e.................. // gap // ........................................................................................................................................................ + str q19, [x1, #-16] // .................................................................................................................................................e...... + add x1, x1, #64 // ......................................................................................................................................................e. // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ 
// gap // ........................................................................................................................................................ - mul v6.4S, v30.4S, v31.S[0] // .................................................................................................................................e...................... - sqrdmulh v23.4S, v30.4S, v31.S[1] // ..................................................................................................................................e..................... - mul v12.4S, v14.4S, v31.S[0] // ...........................................................................................................................................e............ - sqrdmulh v14.4S, v14.4S, v31.S[1] // ............................................................................................................................................e........... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ + str q24, [x2], #(16*4) // ..................................................................................................................................................e..... // gap // ........................................................................................................................................................ - str q7, [x2], #(16*4) // ..................................................................................................................................................e..... 
- str q2, [x1, #-16] // .................................................................................................................................................e...... // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ @@ -1327,214 +1337,216 @@ layer45678_start: // gap // ........................................................................................................................................................ // gap // ........................................................................................................................................................ - // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ......e.................................................................................................................................................|.....e............. - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................e..................................................................................................................................|................... - // ldr q0, [x5], #(12*16) // e.......................................................................................................................................................e................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ..................e.....................................................................................................................................|.................e. 
- // ldr q1, [x5, #(-12*16 + 2*16)] // .e......................................................................................................................................................|e.................. - // ldr q5, [x5, #(-12*16 + 3*16)] // ....................e...................................................................................................................................|................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ........e...............................................................................................................................................|.......e........... - // ldr q6, [x5, #(-12*16 + 5*16)] // .......e................................................................................................................................................|......e............ - // sub v24.4s, v9.4s, v10.4s // ........................e...............................................................................................................................|................... - // add v9.4s, v9.4s, v10.4s // ......................e.................................................................................................................................|................... - // mul v10.4s, v24.4s, v1.4s // ..........................e.............................................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .............................e..........................................................................................................................|................... - // mls v10.4s, v24.4s, v8.s[0] // .....................................e..................................................................................................................|................... 
- // sub v24.4s, v11.4s, v12.4s // .......................e................................................................................................................................|................... - // add v11.4s, v11.4s, v12.4s // .........................e..............................................................................................................................|................... - // mul v12.4s, v24.4s, v2.4s // ...........................e............................................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ............................e...........................................................................................................................|................... - // mls v12.4s, v24.4s, v8.s[0] // ......................................e.................................................................................................................|................... - // sub v24.4s, v9.4s, v11.4s // ...............................e........................................................................................................................|................... - // add v9.4s, v9.4s, v11.4s // ..............................................e.........................................................................................................|................... - // mul v11.4s, v24.4s, v0.4s // .......................................e................................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e........................................................................................................|................... 
- // mls v11.4s, v24.4s, v8.s[0] // ............................................................e...........................................................................................|................... - // sub v24.4s, v10.4s, v12.4s // ..................................................e.....................................................................................................|................... - // add v10.4s, v10.4s, v12.4s // .................................................e......................................................................................................|................... - // mul v12.4s, v24.4s, v0.4s // .........................................................e..............................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................................e.............................................................................................|................... - // mls v12.4s, v24.4s, v8.s[0] // .................................................................e......................................................................................|................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ................................e.......................................................................................................................|................... - // ldr q4, [x5, #(-12*16 + 7*16)] // ........................................e...............................................................................................................|................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ...............e........................................................................................................................................|..............e.... 
- // ldr q5, [x5, #(-12*16 + 9*16)] // ............e...........................................................................................................................................|...........e....... - // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................e.........................................................................................................................|................... - // ldr q6, [x5, #(-12*16 + 11*16)] // .............e..........................................................................................................................................|............e...... - // sub v24.4s, v13.4s, v14.4s // ..................................e.....................................................................................................................|................... - // add v13.4s, v13.4s, v14.4s // .................................e......................................................................................................................|................... - // mul v14.4s, v24.4s, v1.4s // ............................................e...........................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e..............................................................................................................|................... - // mls v14.4s, v24.4s, v8.s[0] // ....................................................e...................................................................................................|................... - // sub v24.4s, v15.4s, v16.4s // ...................................e....................................................................................................................|................... 
- // add v15.4s, v15.4s, v16.4s // ....................................e...................................................................................................................|................... - // mul v16.4s, v24.4s, v2.4s // ..........................................e.............................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e............................................................................................................|................... - // mls v16.4s, v24.4s, v8.s[0] // .....................................................e..................................................................................................|................... - // sub v24.4s, v13.4s, v15.4s // ................................................e.......................................................................................................|................... - // add v13.4s, v13.4s, v15.4s // ...........................................................e............................................................................................|................... - // mul v15.4s, v24.4s, v0.4s // ......................................................e.................................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e....................................................................................................|................... - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|................... 
- // sub v24.4s, v14.4s, v16.4s // ..............................................................e.........................................................................................|................... - // add v14.4s, v14.4s, v16.4s // ................................................................e.......................................................................................|................... - // mul v16.4s, v24.4s, v0.4s // ....................................................................e...................................................................................|................... - // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................................................e..................................................................................|................... - // mls v16.4s, v24.4s, v8.s[0] // ........................................................................e...............................................................................|................... - // trn1 v25.4s, v9.4s, v10.4s // .......................................................e................................................................................................|................... - // trn2 v26.4s, v9.4s, v10.4s // ........................................................e...............................................................................................|................... - // trn1 v27.4s, v11.4s, v12.4s // ......................................................................e.................................................................................|................... - // trn2 v28.4s, v11.4s, v12.4s // .......................................................................e................................................................................|................... 
- // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................e............................................................................|................... - // trn2 v12.2d, v26.2d, v28.2d // ............................................................................e...........................................................................|................... - // trn1 v9.2d, v25.2d, v27.2d // .........................................................................e..............................................................................|................... - // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................e.............................................................................|................... - // trn1 v25.4s, v13.4s, v14.4s // ..................................................................e.....................................................................................|................... - // trn2 v26.4s, v13.4s, v14.4s // ...................................................................e....................................................................................|................... - // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................e........................................................................|................... - // trn2 v28.4s, v15.4s, v16.4s // ................................................................................e.......................................................................|................... - // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................e.....................................................................|................... 
- // trn2 v16.2d, v26.2d, v28.2d // .....................................................................................e..................................................................|................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................e....................................................................|................... - // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................e...................................................................|................... - // ldr q0, [x4], #64 // ...........e............................................................................................................................................|..........e........ - // ldr q1, [x4, #(-64 + 16)] // .............................................e..........................................................................................................|................... - // ldr q2, [x4, #(-64 + 32)] // .............................................................................e..........................................................................|................... - // ldr q3, [x4, #(-64 + 48)] // .............................................................e..........................................................................................|................... - // sub v24.4s, v9.4s, v10.4s // ..............................................................................e.........................................................................|................... - // add v9.4s, v9.4s, v10.4s // .......................................................................................e................................................................|................... 
- // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................e..........................................................|................... - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ......................................................................................e.................................................................|................... - // mls v10.4s, v24.4s, v8.s[0] // .......................................................................................................e................................................|................... - // sub v24.4s, v11.4s, v12.4s // .................................................................................e......................................................................|................... - // add v11.4s, v11.4s, v12.4s // .........................................................................................e..............................................................|................... - // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................e...............................................................|................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................................................................e........................................................|................... - // mls v12.4s, v24.4s, v8.s[0] // ........................................................................................................e...............................................|................... - // sub v24.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|................... 
- // add v13.4s, v13.4s, v14.4s // ............................................................................................e...........................................................|................... - // mul v14.4s, v24.4s, v2.s[2] // .................................................................................................e......................................................|................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................e.......................................................|................... - // mls v14.4s, v24.4s, v8.s[0] // ............................................................................................................e...........................................|................... - // sub v24.4s, v15.4s, v16.4s // ...........................................................................................e............................................................|................... - // add v15.4s, v15.4s, v16.4s // ..............................................................................................e.........................................................|................... - // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................e.....................................................|................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................................................................................e....................................................|................... - // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................e..........................................|................... 
- // sub v24.4s, v9.4s, v11.4s // .........................................................................................................e..............................................|................... - // add v9.4s, v9.4s, v11.4s // ....................................................................................................e...................................................|................... - // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................................e........................................|................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................e................................|................... - // mls v11.4s, v24.4s, v8.s[0] // ................................................................................................................................e.......................|................... - // sub v24.4s, v10.4s, v12.4s // ................................................................................................................e.......................................|................... - // add v10.4s, v10.4s, v12.4s // .................................................................................................................e......................................|................... - // mul v12.4s, v24.4s, v0.s[2] // ........................................................................................................................e...............................|................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................e..............................|................... 
- // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................................................................e....................|................... - // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e.................................................|................... - // add v13.4s, v13.4s, v15.4s // .....................................................................................................e..................................................|................... - // mul v15.4s, v24.4s, v1.s[0] // ..................................................................................................................e.....................................|................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................e.........................................|................... - // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|................... - // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.................................|................... - // add v14.4s, v14.4s, v16.4s // .....................................................................................................................e..................................|................... - // mul v16.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................e............................|................... - // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................................e.................|................... - // srshr v24.4S, v9.4S, #23 // ...........................................................................................................e............................................|................... - // mls v9.4s, v24.4s, v8.4s // ....................................................................................................................e...................................|................... - // srshr v24.4S, v10.4S, #23 // ..........................................................................................................................e.............................|................... - // mls v10.4s, v24.4s, v8.4s // .................................................................................................................................e......................|................... - // srshr v24.4S, v13.4S, #23 // ..........................................................................................................e.............................................|................... - // mls v13.4s, v24.4s, v8.4s // ...................................................................................................................e....................................|................... - // srshr v24.4S, v14.4S, #23 // .............................................................................................................................e..........................|................... 
- // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................e...................|................... - // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................................e.........................|................... - // add v9.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|................... - // mul v13.4s, v24.4s, v0.s[0] // .....................................................................................................................................e..................|................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................e................|................... - // mls v13.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..........|................... - // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................e.............|................... - // add v10.4s, v10.4s, v14.4s // ............................................................................................................................................e...........|................... - // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e....|................... - // mls v14.4s, v24.4s, v8.s[0] // ..........*.............................................................................................................................................|.........*......... - // sub v24.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|................... - // add v11.4s, v11.4s, v15.4s // .................................................................................................................................................e......|................... - // mul v15.4s, v24.4s, v0.s[0] // ..............................................................................................................................................e.........|................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................e........|................... - // mls v15.4s, v24.4s, v8.s[0] // ..*.....................................................................................................................................................|.*................. - // sub v24.4s, v12.4s, v16.4s // ...........................................................................................................................................e............|................... - // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|................... 
- // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................................................e...|................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................e..|................... - // mls v16.4s, v24.4s, v8.s[0] // .........*..............................................................................................................................................|........*.......... - // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|................... - // str q10, [x1, #(-16*4 + 1*16)] // ....*...................................................................................................................................................|...*............... - // str q11, [x1, #(-16*4 + 2*16)] // ...*....................................................................................................................................................|..*................ - // str q12, [x1, #(-16*4 + 3*16)] // .......................................................................................................................................................e|................... - // str q13, [x2], #(16*4) // ......................................................................................................................................................e.|................... - // str q14, [x2, #(-16*4 + 1*16)] // .................*......................................................................................................................................|................*.. 
- // str q15, [x2, #(-16*4 + 2*16)] // ..............*.........................................................................................................................................|.............*..... - // str q16, [x2, #(-16*4 + 3*16)] // ................*.......................................................................................................................................|...............*... - // add x1, x1, #64 // .....*..................................................................................................................................................|....*.............. - // add x2, x2, #64 // ...................*....................................................................................................................................|..................* + // ---------------------------------------------------------------------------- new position ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e.....................................................................................................................................................'.~.............. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ...............e......................................................................................................................................'................ + // ldr q0, [x5], #(12*16) // .e....................................................................................................................................................'..~............. 
+ // ldr q4, [x5, #(-12*16 + 1*16)] // ..e...................................................................................................................................................'...~............ + // ldr q1, [x5, #(-12*16 + 2*16)] // ....e.................................................................................................................................................'.....~.......... + // ldr q5, [x5, #(-12*16 + 3*16)] // .....e................................................................................................................................................'......~......... + // ldr q2, [x5, #(-12*16 + 4*16)] // ......e...............................................................................................................................................'.......~........ + // ldr q6, [x5, #(-12*16 + 5*16)] // .......e..............................................................................................................................................'........~....... + // sub v24.4s, v9.4s, v10.4s // ....................e.................................................................................................................................'................ + // add v9.4s, v9.4s, v10.4s // .....................e................................................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v5.4s // ...........................e..........................................................................................................................'................ + // mul v10.4s, v24.4s, v1.4s // ..........................e...........................................................................................................................'................ 
+ // mls v10.4s, v27.4s, v8.s[0] // ....................................e.................................................................................................................'................ + // sub v24.4s, v11.4s, v12.4s // ......................e...............................................................................................................................'................ + // add v11.4s, v11.4s, v12.4s // .......................e..............................................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v6.4s // .............................e........................................................................................................................'................ + // mul v12.4s, v24.4s, v2.4s // ............................e.........................................................................................................................'................ + // mls v12.4s, v27.4s, v8.s[0] // .....................................e................................................................................................................'................ + // sub v24.4s, v9.4s, v11.4s // ..............................e.......................................................................................................................'................ + // add v9.4s, v9.4s, v11.4s // ...............................e......................................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ......................................e...............................................................................................................'................ 
+ // mul v11.4s, v24.4s, v0.4s // .......................................e..............................................................................................................'................ + // mls v11.4s, v27.4s, v8.s[0] // ................................................e.....................................................................................................'................ + // sub v24.4s, v10.4s, v12.4s // ..............................................e.......................................................................................................'................ + // add v10.4s, v10.4s, v12.4s // ...............................................e......................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ......................................................e...............................................................................................'................ + // mul v12.4s, v24.4s, v0.4s // .....................................................e................................................................................................'................ + // mls v12.4s, v27.4s, v8.s[0] // ............................................................e.........................................................................................'................ + // ldr q0, [x5, #(-12*16 + 6*16)] // ........e.............................................................................................................................................'.........~...... + // ldr q4, [x5, #(-12*16 + 7*16)] // .........e............................................................................................................................................'..........~..... 
+ // ldr q1, [x5, #(-12*16 + 8*16)] // ............e.........................................................................................................................................'.............~.. + // ldr q5, [x5, #(-12*16 + 9*16)] // .............e........................................................................................................................................'..............~. + // ldr q2, [x5, #(-12*16 + 10*16)] // ................e.....................................................................................................................................'................ + // ldr q6, [x5, #(-12*16 + 11*16)] // .................e....................................................................................................................................'................ + // sub v24.4s, v13.4s, v14.4s // ................................e.....................................................................................................................'................ + // add v13.4s, v13.4s, v14.4s // .................................e....................................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v5.4s // .........................................e............................................................................................................'................ + // mul v14.4s, v24.4s, v1.4s // ........................................e.............................................................................................................'................ + // mls v14.4s, v27.4s, v8.s[0] // .................................................e....................................................................................................'................ 
+ // sub v24.4s, v15.4s, v16.4s // ..................................e...................................................................................................................'................ + // add v15.4s, v15.4s, v16.4s // ...................................e..................................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v6.4s // ...........................................e..........................................................................................................'................ + // mul v16.4s, v24.4s, v2.4s // ..........................................e...........................................................................................................'................ + // mls v16.4s, v27.4s, v8.s[0] // ..................................................e...................................................................................................'................ + // sub v24.4s, v13.4s, v15.4s // ............................................e.........................................................................................................'................ + // add v13.4s, v13.4s, v15.4s // .............................................e........................................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ...................................................e..................................................................................................'................ + // mul v15.4s, v24.4s, v0.4s // ....................................................e.................................................................................................'................ 
+ // mls v15.4s, v27.4s, v8.s[0] // ...........................................................e..........................................................................................'................ + // sub v24.4s, v14.4s, v16.4s // .........................................................e............................................................................................'................ + // add v14.4s, v14.4s, v16.4s // ..........................................................e...........................................................................................'................ + // sqrdmulh v27.4s, v24.4s, v4.4s // ..............................................................e.......................................................................................'................ + // mul v16.4s, v24.4s, v0.4s // .............................................................e........................................................................................'................ + // mls v16.4s, v27.4s, v8.s[0] // ...................................................................e..................................................................................'................ + // trn1 v25.4s, v9.4s, v10.4s // .......................................................e..............................................................................................'................ + // trn2 v26.4s, v9.4s, v10.4s // ........................................................e.............................................................................................'................ + // trn1 v27.4s, v11.4s, v12.4s // .................................................................e....................................................................................'................ 
+ // trn2 v28.4s, v11.4s, v12.4s // ..................................................................e...................................................................................'................ + // trn2 v11.2d, v25.2d, v27.2d // .....................................................................e................................................................................'................ + // trn2 v12.2d, v26.2d, v28.2d // .......................................................................e..............................................................................'................ + // trn1 v9.2d, v25.2d, v27.2d // ....................................................................e.................................................................................'................ + // trn1 v10.2d, v26.2d, v28.2d // ......................................................................e...............................................................................'................ + // trn1 v25.4s, v13.4s, v14.4s // ...............................................................e......................................................................................'................ + // trn2 v26.4s, v13.4s, v14.4s // ................................................................e.....................................................................................'................ + // trn1 v27.4s, v15.4s, v16.4s // ........................................................................e.............................................................................'................ + // trn2 v28.4s, v15.4s, v16.4s // .........................................................................e............................................................................'................ 
+ // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................e.......................................................................'................ + // trn2 v16.2d, v26.2d, v28.2d // ................................................................................e.....................................................................'................ + // trn1 v13.2d, v25.2d, v27.2d // ...............................................................................e......................................................................'................ + // trn1 v14.2d, v26.2d, v28.2d // .................................................................................e....................................................................'................ + // ldr q0, [x4], #64 // ...................e..................................................................................................................................'................ + // ldr q1, [x4, #(-64 + 16)] // ..................e...................................................................................................................................'................ + // ldr q2, [x4, #(-64 + 32)] // ........................e.............................................................................................................................'................ + // ldr q3, [x4, #(-64 + 48)] // .........................e............................................................................................................................'................ + // sub v24.4s, v9.4s, v10.4s // ..........................................................................e...........................................................................'................ 
+ // add v9.4s, v9.4s, v10.4s // ...........................................................................e..........................................................................'................ + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..................................................................................e...................................................................'................ + // mul v10.4s, v24.4s, v1.s[2] // ...................................................................................e..................................................................'................ + // mls v10.4s, v27.4s, v8.s[0] // ................................................................................................e.....................................................'................ + // sub v24.4s, v11.4s, v12.4s // ............................................................................e.........................................................................'................ + // add v11.4s, v11.4s, v12.4s // .............................................................................e........................................................................'................ + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ....................................................................................e.................................................................'................ + // mul v12.4s, v24.4s, v2.s[0] // .....................................................................................e................................................................'................ + // mls v12.4s, v27.4s, v8.s[0] // ....................................................................................................e.................................................'................ 
+ // sub v24.4s, v13.4s, v14.4s // ........................................................................................e.............................................................'................ + // add v13.4s, v13.4s, v14.4s // .........................................................................................e............................................................'................ + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ............................................................................................e.........................................................'................ + // mul v14.4s, v24.4s, v2.s[2] // .............................................................................................e........................................................'................ + // mls v14.4s, v27.4s, v8.s[0] // .......................................................................................................e..............................................'................ + // sub v24.4s, v15.4s, v16.4s // ......................................................................................e...............................................................'................ + // add v15.4s, v15.4s, v16.4s // .......................................................................................e..............................................................'................ + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ..............................................................................................e.......................................................'................ + // mul v16.4s, v24.4s, v3.s[0] // ...............................................................................................e......................................................'................ 
+ // mls v16.4s, v27.4s, v8.s[0] // ........................................................................................................e.............................................'................ + // sub v24.4s, v9.4s, v11.4s // ..........................................................................................e...........................................................'................ + // add v9.4s, v9.4s, v11.4s // ...........................................................................................e..........................................................'................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .................................................................................................e....................................................'................ + // mul v11.4s, v24.4s, v0.s[2] // ..................................................................................................e...................................................'................ + // mls v11.4s, v27.4s, v8.s[0] // ...........................................................................................................e..........................................'................ + // sub v24.4s, v10.4s, v12.4s // ..............................................................................................................e.......................................'................ + // add v10.4s, v10.4s, v12.4s // ...............................................................................................................e......................................'................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ....................................................................................................................e.................................'................ 
+ // mul v12.4s, v24.4s, v0.s[2] // .....................................................................................................................e................................'................ + // mls v12.4s, v27.4s, v8.s[0] // ...............................................................................................................................e......................'................ + // sub v24.4s, v13.4s, v15.4s // ...................................................................................................e..................................................'................ + // add v13.4s, v13.4s, v15.4s // .....................................................................................................e................................................'................ + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .........................................................................................................e............................................'................ + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................e...........................................'................ + // mls v15.4s, v27.4s, v8.s[0] // ..................................................................................................................e...................................'................ + // sub v24.4s, v14.4s, v16.4s // ................................................................................................................e.....................................'................ + // add v14.4s, v14.4s, v16.4s // .................................................................................................................e....................................'................ 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .......................................................................................................................e..............................'................ + // mul v16.4s, v24.4s, v1.s[0] // ........................................................................................................................e.............................'................ + // mls v16.4s, v27.4s, v8.s[0] // .................................................................................................................................e....................'................ + // srshr v24.4S, v9.4S, #23 // ......................................................................................................e...............................................'................ + // mls v9.4s, v24.4s, v8.4s // .............................................................................................................e........................................'................ + // srshr v24.4S, v10.4S, #23 // ......................................................................................................................e...............................'................ + // mls v10.4s, v24.4s, v8.4s // ..............................................................................................................................e.......................'................ + // srshr v24.4S, v13.4S, #23 // ............................................................................................................e.........................................'................ + // mls v13.4s, v24.4s, v8.4s // ...................................................................................................................e..................................'................ 
+ // srshr v24.4S, v14.4S, #23 // .........................................................................................................................e............................'................ + // mls v14.4s, v24.4s, v8.4s // ................................................................................................................................e.....................'................ + // sub v24.4s, v9.4s, v13.4s // .............................................................................................................................e........................'................ + // add v9.4s, v9.4s, v13.4s // ............................................................................................................................e.........................'................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ......................................................................................................................................e...............'................ + // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................................................................e..............'................ + // mls v13.4s, v27.4s, v8.s[0] // .............................................................................................................................................e........'................ + // sub v24.4s, v10.4s, v14.4s // ........................................................................................................................................e.............'................ + // add v10.4s, v10.4s, v14.4s // .........................................................................................................................................e............'................ 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............................................................................................................................................e.......'................ + // mul v14.4s, v24.4s, v0.s[0] // ...............................................................................................................................................e......'................ + // mls v14.4s, v27.4s, v8.s[0] // ......................................................................................................................................................*................ + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................................e..........................'................ + // add v11.4s, v11.4s, v15.4s // ..........................................................................................................................e...........................'................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ...................................................................................................................................e..................'................ + // mul v15.4s, v24.4s, v0.s[0] // ....................................................................................................................................e.................'................ + // mls v15.4s, v27.4s, v8.s[0] // ..........................................................................................................................................e...........'................ + // sub v24.4s, v12.4s, v16.4s // ...........................................................................................................................................e..........'................ 
+ // add v12.4s, v12.4s, v16.4s // ............................................................................................................................................e.........'................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................................................................................................................................................e.....'................ + // mul v16.4s, v24.4s, v0.s[0] // .................................................................................................................................................e....'................ + // mls v16.4s, v27.4s, v8.s[0] // ......................................................................................................................................................'*............... + // str q9, [x1], #(16*4) // .....................................................................................................................................e................'................ + // str q10, [x1, #(-16*4 + 1*16)] // ..................................................................................................................................................e...'................ + // str q11, [x1, #(-16*4 + 2*16)] // ..................................................................................................................................e...................'................ + // str q12, [x1, #(-16*4 + 3*16)] // ...................................................................................................................................................e..'................ + // str q13, [x2], #(16*4) // .....................................................................................................................................................e'................ 
+ // str q14, [x2, #(-16*4 + 1*16)] // ..........~...........................................................................................................................................'...........*.... + // str q15, [x2, #(-16*4 + 2*16)] // ...~..................................................................................................................................................'....*........... + // str q16, [x2, #(-16*4 + 3*16)] // ...........~..........................................................................................................................................'............*... + // add x1, x1, #64 // ....................................................................................................................................................e.'................ + // add x2, x2, #64 // ..............~.......................................................................................................................................'...............* sub count, count, #1 cbnz count, layer45678_start - mls v11.4S, v10.4S, v8.S[0] // *......... - mls v12.4S, v14.4S, v8.S[0] // ....*..... - mls v6.4S, v23.4S, v8.S[0] // .....*.... - str q26, [x1, #-32] // .*........ - str q24, [x1, #-48] // ..*....... - add x1, x1, #64 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - str q11, [x2, #-32] // ......*... - str q12, [x2, #-16] // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - str q6, [x2, #-48] // ........*. 
- add x2, x2, #64 // .........* - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - - // original source code - // mls v11.4S, v10.4S, v8.S[0] // *......... - // str q26, [x1, #-32] // ...*...... - // str q24, [x1, #-48] // ....*..... - // add x1, x1, #64 // .....*.... - // mls v12.4S, v14.4S, v8.S[0] // .*........ - // mls v6.4S, v23.4S, v8.S[0] // ..*....... - // str q11, [x2, #-32] // ......*... - // str q12, [x2, #-16] // .......*.. - // str q6, [x2, #-48] // ........*. - // add x2, x2, #64 // .........* + // Instructions: 6 + // Expected cycles: 4 + // Expected IPC: 1.50 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + str q0, [x2, #-32] // ..*........................... + mls v13.4S, v26.4S, v8.S[0] // .*............................ + mls v17.4S, v30.4S, v8.S[0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q13, [x2, #-16] // ....*......................... 
+ str q17, [x2, #-48] // ...*.......................... + add x2, x2, #64 // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v17.4S, v30.4S, v8.S[0] // ..*............................ + // mls v13.4S, v26.4S, v8.S[0] // .*............................. + // str q0, [x2, #-32] // *.............................. + // str q17, [x2, #-48] // ....*.......................... + // str q13, [x2, #-16] // ...*........................... + // add x2, x2, #64 // .....*......................... // ----------------------------------------------------------------------------- @@ -1549,7 +1561,7 @@ layer45678_start: ASM_LOAD(xtmp, ninv_tw_addr) ld1r {ninv_tw.4s}, [xtmp] - ushr modulus_half.4S, modulus.4S, #1 + ushr modulus_half.4S, consts.4S, #1 neg neg_modulus_half.4S, modulus_half.4S mov count, #8 @@ -1557,778 +1569,846 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q12, [x0, #256] // ..*............................................... - ldr q19, [x0, #128] // *................................................. - ldr q5, [x0, #0] // .*................................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q15, [x0, #512] // ............*..................................... - ldr q9, [x0, #384] // ...*.............................................. - ldr q13, [x0, #896] // ..............*................................... - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q18, [x0, #640] // .............*.................................... - ldr q27, [x0, #768] // ...............*.................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v17.4S, v5.4S, v19.4S // .....*............................................ - add v19.4S, v5.4S, v19.4S // ....*............................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v5.4S, v12.4S, v9.4S // .......*.......................................... - add v9.4S, v12.4S, v9.4S // ......*........................................... 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v12.4S, v15.4S, v18.4S // .................*................................ - add v15.4S, v15.4S, v18.4S // ..................*............................... - add v18.4S, v27.4S, v13.4S // ........................*......................... - sqrdmulh v28.4S, v17.4S, v1.S[3] // ......................*........................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v13.4S, v27.4S, v13.4S // .........................*........................ - mul v27.4S, v17.4S, v1.S[2] // ..........................*....................... - sqrdmulh v17.4S, v5.4S, v2.S[1] // ...................*.............................. - mul v5.4S, v5.4S, v2.S[0] // ....................*............................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v22.4S, v12.4S, v2.S[3] // .....................*............................ - mul v16.4S, v12.4S, v2.S[2] // .......................*.......................... - sub v12.4S, v15.4S, v18.4S // ............................*..................... - add v20.4S, v19.4S, v9.4S // ........*......................................... - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v19.4S, v19.4S, v9.4S // .........*........................................ - add v9.4S, v15.4S, v18.4S // .............................*.................... - mul v15.4S, v13.4S, v3.S[0] // ..............................*................... - sqrdmulh v13.4S, v13.4S, v3.S[1] // ...............................*.................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v27.4S, v28.4S, v8.S[0] // ................................*................. - mls v5.4S, v17.4S, v8.S[0] // ...........................*...................... - mul v4.4S, v12.4S, v1.S[0] // .................................*................ - sqrdmulh v12.4S, v12.4S, v1.S[1] // ...................................*.............. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v16.4S, v22.4S, v8.S[0] // ....................................*............. - mul v17.4S, v19.4S, v0.S[2] // ..........*....................................... - sqrdmulh v19.4S, v19.4S, v0.S[3] // ...........*...................................... - sub v18.4S, v20.4S, v9.4S // ..................................*............... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- mls v15.4S, v13.4S, v8.S[0] // ......................................*........... - add v9.4S, v20.4S, v9.4S // .....................................*............ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v13.4S, v27.4S, v5.4S // .......................................*.......... - mls v4.4S, v12.4S, v8.S[0] // .........................................*........ - sqrdmulh v7.4S, v18.4S, v0.S[1] // ........................................*......... - mul v14.4S, v18.4S, v0.S[0] // ..........................................*....... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v5.4S, v27.4S, v5.4S // .............................................*.... - mls v17.4S, v19.4S, v8.S[0] // ................*................................. - sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................*...... - mul v12.4S, v9.4S, v25.4S // ............................................*..... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v22.4S, v16.4S, v15.4S // .................................................* - sub v15.4S, v16.4S, v15.4S // ................................................*. - mul v28.4S, v13.4S, v0.S[2] // ..............................................*... 
- sqrdmulh v24.4S, v13.4S, v0.S[3] // ...............................................*.. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - - // original source code - // ldr q16, [x0, #128] // .*................................................ - // ldr q20, [x0, #0] // ..*............................................... - // ldr q6, [x0, #256] // *................................................. - // ldr q23, [x0, #384] // ....*............................................. - // add v27.4S, v20.4S, v16.4S // .........*........................................ - // sub v24.4S, v20.4S, v16.4S // ........*......................................... - // add v22.4S, v6.4S, v23.4S // ...........*...................................... - // sub v16.4S, v6.4S, v23.4S // ..........*....................................... - // add v7.4S, v27.4S, v22.4S // .......................*.......................... - // sub v23.4S, v27.4S, v22.4S // ........................*......................... - // mul v17.4S, v23.4S, v0.S[2] // .................................*................ - // sqrdmulh v23.4S, v23.4S, v0.S[3] // ..................................*............... - // ldr q20, [x0, #512] // ...*.............................................. - // ldr q28, [x0, #640] // ......*........................................... - // ldr q22, [x0, #896] // .....*............................................ - // ldr q27, [x0, #768] // .......*.......................................... - // mls v17.4S, v23.4S, v8.S[0] // ...........................................*...... - // sub v14.4S, v20.4S, v28.4S // ............*..................................... - // add v28.4S, v20.4S, v28.4S // .............*.................................... 
- // sqrdmulh v11.4S, v16.4S, v2.S[1] // ..................*............................... - // mul v13.4S, v16.4S, v2.S[0] // ...................*.............................. - // sqrdmulh v16.4S, v14.4S, v2.S[3] // ....................*............................. - // sqrdmulh v21.4S, v24.4S, v1.S[3] // ...............*.................................. - // mul v19.4S, v14.4S, v2.S[2] // .....................*............................ - // add v20.4S, v27.4S, v22.4S // ..............*................................... - // sub v10.4S, v27.4S, v22.4S // ................*................................. - // mul v27.4S, v24.4S, v1.S[2] // .................*................................ - // mls v13.4S, v11.4S, v8.S[0] // .............................*.................... - // sub v23.4S, v28.4S, v20.4S // ......................*........................... - // add v20.4S, v28.4S, v20.4S // .........................*........................ - // mul v22.4S, v10.4S, v3.S[0] // ..........................*....................... - // sqrdmulh v10.4S, v10.4S, v3.S[1] // ...........................*...................... - // mls v27.4S, v21.4S, v8.S[0] // ............................*..................... - // mul v4.4S, v23.4S, v1.S[0] // ..............................*................... - // sub v11.4S, v7.4S, v20.4S // ...................................*.............. - // sqrdmulh v14.4S, v23.4S, v1.S[1] // ...............................*.................. - // mls v19.4S, v16.4S, v8.S[0] // ................................*................. - // add v24.4S, v7.4S, v20.4S // .....................................*............ - // mls v22.4S, v10.4S, v8.S[0] // ....................................*............. - // sub v10.4S, v27.4S, v13.4S // ......................................*........... - // sqrdmulh v7.4S, v11.4S, v0.S[1] // ........................................*......... 
- // mls v4.4S, v14.4S, v8.S[0] // .......................................*.......... - // mul v14.4S, v11.4S, v0.S[0] // .........................................*........ - // sqrdmulh v23.4S, v24.4S, v26.4S // ............................................*..... - // mul v12.4S, v24.4S, v25.4S // .............................................*.... - // add v5.4S, v27.4S, v13.4S // ..........................................*....... - // mul v28.4S, v10.4S, v0.S[2] // ................................................*. - // sqrdmulh v24.4S, v10.4S, v0.S[3] // .................................................* - // sub v15.4S, v19.4S, v22.4S // ...............................................*.. - // add v22.4S, v19.4S, v22.4S // ..............................................*... + // Instructions: 75 + // Expected cycles: 26 + // Expected IPC: 2.88 + // + // Wall time: 3.85s + // User time: 3.85s + // + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q17, [x0, #128] // .*......................................................................... + ldr q13, [x0, #0] // ..*........................................................................ + ldr q19, [x0, #512] // *.......................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + ldr q24, [x0, #640] // ....*...................................................................... 
+ ldr q14, [x0, #768] // ...*....................................................................... + ldr q12, [x0, #896] // .....*..................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + ldr q7, [x0, #256] // ......*.................................................................... + ldr q20, [x0, #384] // .......*................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + sub v11.4S, v13.4S, v17.4S // .........*................................................................. + add v17.4S, v13.4S, v17.4S // ........*.................................................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v13.4S, v19.4S, v24.4S // ............*.............................................................. + sub v19.4S, v19.4S, v24.4S // .............*............................................................. + sub v24.4S, v14.4S, v12.4S // ..........*................................................................ + add v14.4S, v14.4S, v12.4S // ...........*............................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v12.4S, v7.4S, v20.4S // ...................*....................................................... + add v7.4S, v7.4S, v20.4S // ...............*........................................................... + mul v20.4S, v11.4S, v1.S[2] // ..............*............................................................ 
+ sqrdmulh v11.4S, v11.4S, v1.S[3] // ....................*...................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v5.4S, v24.4S, v3.S[1] // ................*.......................................................... + mul v24.4S, v24.4S, v3.S[0] // .................*......................................................... + sub v4.4S, v13.4S, v14.4S // ..................*........................................................ + sqrdmulh v15.4S, v19.4S, v2.S[3] // .....................*..................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v13.4S, v13.4S, v14.4S // .......................*................................................... + mul v19.4S, v19.4S, v2.S[2] // ......................*.................................................... + sub v14.4S, v17.4S, v7.4S // ........................*.................................................. + mul v16.4S, v12.4S, v2.S[0] // ............................*.............................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + sqrdmulh v12.4S, v12.4S, v2.S[1] // .............................*............................................. + mls v20.4S, v11.4S, v8.S[0] // ..............................*............................................ + sqrdmulh v11.4S, v4.4S, v1.S[1] // .........................*................................................. + mul v4.4S, v4.4S, v1.S[0] // ...........................*............................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v17.4S, v17.4S, v7.4S // .................................*......................................... + mls v24.4S, v5.4S, v8.S[0] // ..........................*................................................ + sqrdmulh v7.4S, v14.4S, v0.S[3] // ...............................*........................................... + mul v14.4S, v14.4S, v0.S[2] // ................................*.......................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v19.4S, v15.4S, v8.S[0] // ...................................*....................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v5.4S, v17.4S, v13.4S // ......................................*.................................... + add v15.4S, v17.4S, v13.4S // ..................................................................*........ + mls v16.4S, v12.4S, v8.S[0] // ..................................*........................................ + mls v4.4S, v11.4S, v8.S[0] // ....................................*...................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v14.4S, v7.4S, v8.S[0] // .....................................*..................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ sub v17.4S, v19.4S, v24.4S // ..............................................*............................ + add v13.4S, v19.4S, v24.4S // .........................................*................................. + sqrdmulh v9.4S, v5.4S, v0.S[1] // .............................................................*............. + mul v22.4S, v5.4S, v0.S[0] // ....................................................................*...... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v19.4S, v20.4S, v16.4S // ........................................*.................................. + sub v24.4S, v20.4S, v16.4S // .......................................*................................... + sqrdmulh v27.4S, v15.4S, v26.4S // .......................................................................*... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v12.4S, v14.4S, v4.4S // ..........................................*................................ + sub v14.4S, v14.4S, v4.4S // ...........................................*............................... + mul v11.4S, v17.4S, v1.S[0] // ....................................................*...................... + sqrdmulh v7.4S, v17.4S, v1.S[1] // .........................................................*................. 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v17.4S, v19.4S, v13.4S // .............................................*............................. + add v19.4S, v19.4S, v13.4S // ...............................................*........................... + mul v5.4S, v24.4S, v0.S[2] // ............................................*.............................. + sqrdmulh v24.4S, v24.4S, v0.S[3] // ........................................................*.................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v20.4S, v12.4S, v26.4S // ................................................*.......................... + mul v12.4S, v12.4S, v25.4S // .................................................*......................... + mul v13.4S, v14.4S, v0.S[0] // ..................................................*........................ + sqrdmulh v4.4S, v14.4S, v0.S[1] // ...................................................*....................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ sqrdmulh v16.4S, v17.4S, v0.S[1] // ..........................................................*................ + mul v17.4S, v17.4S, v0.S[0] // .....................................................*..................... + sqrdmulh v28.4S, v19.4S, v26.4S // ......................................................*.................... + mul v14.4S, v19.4S, v25.4S // .......................................................*................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v11.4S, v7.4S, v8.S[0] // .................................................................*......... + mls v5.4S, v24.4S, v8.S[0] // ................................................................*.......... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v12.4S, v20.4S, v8.S[0] // ...........................................................*............... + mls v13.4S, v4.4S, v8.S[0] // ............................................................*.............. + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v17.4S, v16.4S, v8.S[0] // ...............................................................*........... + mls v14.4S, v28.4S, v8.S[0] // ..............................................................*............ + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + cmge v16.4S, v31.4S, v13.4S // ...................................................................*....... 
+ cmge v18.4S, v13.4S, v30.4S // .....................................................................*..... + cmge v23.4S, v31.4S, v12.4S // ......................................................................*.... + cmge v6.4S, v12.4S, v30.4S // .........................................................................*. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + cmge v20.4S, v31.4S, v14.4S // ........................................................................*.. + cmge v29.4S, v14.4S, v30.4S // ..........................................................................* + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q21, [x0, #512] // ..*........................................................................ + // ldr q4, [x0, #128] // *.......................................................................... + // ldr q28, [x0, #0] // .*......................................................................... 
+ // ldr q15, [x0, #768] // ....*...................................................................... + // ldr q20, [x0, #640] // ...*....................................................................... + // ldr q29, [x0, #896] // .....*..................................................................... + // ldr q11, [x0, #256] // ......*.................................................................... + // ldr q9, [x0, #384] // .......*................................................................... + // add v10.4S, v28.4S, v4.4S // .........*................................................................. + // sub v27.4S, v28.4S, v4.4S // ........*.................................................................. + // sub v4.4S, v15.4S, v29.4S // ............*.............................................................. + // add v15.4S, v15.4S, v29.4S // .............*............................................................. + // add v18.4S, v21.4S, v20.4S // ..........*................................................................ + // sub v13.4S, v21.4S, v20.4S // ...........*............................................................... + // mul v20.4S, v27.4S, v1.S[2] // ................*.......................................................... + // add v21.4S, v11.4S, v9.4S // ...............*........................................................... + // sqrdmulh v7.4S, v4.4S, v3.S[1] // ..................*........................................................ + // mul v4.4S, v4.4S, v3.S[0] // ...................*....................................................... + // sub v14.4S, v18.4S, v15.4S // ....................*...................................................... + // sub v11.4S, v11.4S, v9.4S // ..............*............................................................ + // sqrdmulh v27.4S, v27.4S, v1.S[3] // .................*......................................................... 
+ // sqrdmulh v29.4S, v13.4S, v2.S[3] // .....................*..................................................... + // mul v17.4S, v13.4S, v2.S[2] // .......................*................................................... + // add v18.4S, v18.4S, v15.4S // ......................*.................................................... + // sub v15.4S, v10.4S, v21.4S // ........................*.................................................. + // sqrdmulh v13.4S, v14.4S, v1.S[1] // ............................*.............................................. + // mls v4.4S, v7.4S, v8.S[0] // ...............................*........................................... + // mul v5.4S, v14.4S, v1.S[0] // .............................*............................................. + // mul v14.4S, v11.4S, v2.S[0] // .........................*................................................. + // sqrdmulh v11.4S, v11.4S, v2.S[1] // ..........................*................................................ + // mls v20.4S, v27.4S, v8.S[0] // ...........................*............................................... + // sqrdmulh v27.4S, v15.4S, v0.S[3] // ................................*.......................................... + // mul v15.4S, v15.4S, v0.S[2] // .................................*......................................... + // add v10.4S, v10.4S, v21.4S // ..............................*............................................ + // mls v14.4S, v11.4S, v8.S[0] // .....................................*..................................... + // mls v17.4S, v29.4S, v8.S[0] // ..................................*........................................ + // mls v5.4S, v13.4S, v8.S[0] // ......................................*.................................... + // mls v15.4S, v27.4S, v8.S[0] // .......................................*................................... 
+ // sub v23.4S, v10.4S, v18.4S // ...................................*....................................... + // sub v21.4S, v20.4S, v14.4S // .............................................*............................. + // add v14.4S, v20.4S, v14.4S // ............................................*.............................. + // add v13.4S, v17.4S, v4.4S // .........................................*................................. + // add v6.4S, v15.4S, v5.4S // ...............................................*........................... + // sub v16.4S, v15.4S, v5.4S // ................................................*.......................... + // mul v5.4S, v21.4S, v0.S[2] // .....................................................*..................... + // sub v15.4S, v14.4S, v13.4S // ...................................................*....................... + // sub v4.4S, v17.4S, v4.4S // ........................................*.................................. + // add v14.4S, v14.4S, v13.4S // ....................................................*...................... + // sqrdmulh v29.4S, v6.4S, v26.4S // .......................................................*................... + // mul v12.4S, v6.4S, v25.4S // ........................................................*.................. + // mul v13.4S, v16.4S, v0.S[0] // .........................................................*................. + // sqrdmulh v6.4S, v16.4S, v0.S[1] // ..........................................................*................ + // mul v11.4S, v4.4S, v1.S[0] // .................................................*......................... + // mul v17.4S, v15.4S, v0.S[0] // ............................................................*.............. + // sqrdmulh v16.4S, v14.4S, v26.4S // .............................................................*............. 
+ // mul v14.4S, v14.4S, v25.4S // ..............................................................*............ + // sqrdmulh v27.4S, v21.4S, v0.S[3] // ......................................................*.................... + // sqrdmulh v21.4S, v4.4S, v1.S[1] // ..................................................*........................ + // sqrdmulh v4.4S, v15.4S, v0.S[1] // ...........................................................*............... + // mls v12.4S, v29.4S, v8.S[0] // .................................................................*......... + // mls v13.4S, v6.4S, v8.S[0] // ..................................................................*........ + // sqrdmulh v9.4S, v23.4S, v0.S[1] // ..........................................*................................ + // mls v14.4S, v16.4S, v8.S[0] // ....................................................................*...... + // mls v17.4S, v4.4S, v8.S[0] // ...................................................................*....... + // mls v5.4S, v27.4S, v8.S[0] // ................................................................*.......... + // mls v11.4S, v21.4S, v8.S[0] // ...............................................................*........... + // add v15.4S, v10.4S, v18.4S // ....................................*...................................... + // cmge v16.4S, v31.4S, v13.4S // .....................................................................*..... + // mul v22.4S, v23.4S, v0.S[0] // ...........................................*............................... + // cmge v18.4S, v13.4S, v30.4S // ......................................................................*.... + // cmge v23.4S, v31.4S, v12.4S // .......................................................................*... + // sqrdmulh v27.4S, v15.4S, v26.4S // ..............................................*............................ 
+ // cmge v20.4S, v31.4S, v14.4S // .........................................................................*. + // cmge v6.4S, v12.4S, v30.4S // ........................................................................*.. + // cmge v29.4S, v14.4S, v30.4S // ..........................................................................* sub count, count, #1 layer123_start: + // Instructions: 120 + // Expected cycles: 27 + // Expected IPC: 4.44 + // + // Wall time: 1282.62s + // User time: 1282.62s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + // gap // ........................................................................................................................ + ldr q21, [x0, #528] // ....e................................................................................................................... + cmge v10.4S, v31.4S, v17.4S // ........................................................................*............................................... + sub v7.4S, v5.4S, v11.4S // ...............................................................*........................................................ + ldr q4, [x0, #144] // .e...................................................................................................................... + cmge v19.4S, v17.4S, v30.4S // .........................................................................*.............................................. + ldr q28, [x0, #16] // e....................................................................................................................... + mul v24.4S, v15.4S, v25.4S // .........................................................................................*.............................. 
+ ldr q15, [x0, #784] // ......e................................................................................................................. + mls v22.4S, v9.4S, v8.S[0] // ....................................................*................................................................... + sub v18.4S, v16.4S, v18.4S // ..............................................................................*......................................... + sub v9.4S, v20.4S, v29.4S // ..........................................................................................................*............. + ldr q20, [x0, #656] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q29, [x0, #912] // .......e................................................................................................................ + sub v10.4S, v10.4S, v19.4S // ..........................................................................*............................................. + mul v19.4S, v7.4S, v0.S[0] // ..................................................................*..................................................... + add v16.4S, v5.4S, v11.4S // ................................................................*....................................................... + ldr q11, [x0, #272] // ..e..................................................................................................................... + // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ - ldr q16, [x0, #144] // .e...................................................................................................................... - ldr q20, [x0, #16] // e....................................................................................................................... - ldr q6, [x0, #272] // ..e..................................................................................................................... - add v19.4S, v17.4S, v4.4S // ...........................................................*............................................................ - mls v14.4S, v7.4S, v8.S[0] // ....................................................*................................................................... - mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. - ldr q23, [x0, #400] // ...e.................................................................................................................... - sub v27.4S, v17.4S, v4.4S // ..........................................................*............................................................. - mul v4.4S, v15.4S, v1.S[0] // .............................................*.......................................................................... - sqrdmulh v18.4S, v15.4S, v1.S[1] // ..............................................*......................................................................... 
+ mls v14.4S, v9.4S, v8.4S // ...........................................................................................................*............ + mls v24.4S, v27.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - sub v15.4S, v5.4S, v22.4S // .....................................................*.................................................................. // gap // ........................................................................................................................ + mls v13.4S, v18.4S, v8.4S // ...............................................................................*........................................ // gap // ........................................................................................................................ - mls v28.4S, v24.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ - add v11.4S, v5.4S, v22.4S // ......................................................*................................................................. + ldr q9, [x0, #400] // ...e.................................................................................................................... + mls v17.4S, v10.4S, v8.4S // ...........................................................................*............................................ + add v10.4S, v28.4S, v4.4S // .........e.............................................................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v13.4S, v19.4S, v25.4S // ..............................................................................................*......................... - sqrdmulh v10.4S, v19.4S, v26.4S // ...............................................................................................*........................ - mul v5.4S, v27.4S, v0.S[0] // ............................................................*........................................................... + sub v27.4S, v28.4S, v4.4S // ........e............................................................................................................... + sqrdmulh v28.4S, v16.4S, v26.4S // .................................................................................................*...................... // gap // ........................................................................................................................ - sqrdmulh v21.4S, v27.4S, v0.S[1] // .............................................................*.......................................................... + sub v4.4S, v15.4S, v29.4S // .......................e................................................................................................ + add v15.4S, v15.4S, v29.4S // ........................e............................................................................................... + add v18.4S, v21.4S, v20.4S // ...................e.................................................................................................... 
// gap // ........................................................................................................................ + mls v19.4S, v7.4S, v8.S[0] // ...................................................................*.................................................... // gap // ........................................................................................................................ - sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................................................*............................................................... // gap // ........................................................................................................................ - mul v19.4S, v15.4S, v0.S[0] // .......................................................*................................................................ - sqrdmulh v17.4S, v11.4S, v26.4S // ............................................................................................*........................... - mls v4.4S, v18.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ + str q13, [x0, #768] // ......................................................................................*................................. + sub v13.4S, v21.4S, v20.4S // ..................e..................................................................................................... + mul v20.4S, v27.4S, v1.S[2] // ...........e............................................................................................................ + cmge v29.4S, v24.4S, v30.4S // .....................................................................................................*.................. 
+ str q14, [x0, #128] // .....................................................................................................................*.. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v27.4S, v20.4S, v16.4S // .........e.............................................................................................................. + cmge v5.4S, v31.4S, v24.4S // ....................................................................................................*................... + add v21.4S, v11.4S, v9.4S // ..............e......................................................................................................... // gap // ........................................................................................................................ - sub v24.4S, v20.4S, v16.4S // ........e............................................................................................................... - mul v15.4S, v11.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v7.4S, v4.4S, v3.S[1] // .........................e.............................................................................................. + mul v4.4S, v4.4S, v3.S[0] // ..........................e............................................................................................. + sub v14.4S, v18.4S, v15.4S // ......................................e................................................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v22.4S, v6.4S, v23.4S // ..............e......................................................................................................... + sub v11.4S, v11.4S, v9.4S // .............e.......................................................................................................... + sub v9.4S, v5.4S, v29.4S // ......................................................................................................*................. // gap // ........................................................................................................................ - sub v16.4S, v6.4S, v23.4S // .............e.......................................................................................................... - mls v13.4S, v10.4S, v8.S[0] // ................................................................................................*....................... - mls v5.4S, v21.4S, v8.S[0] // ..............................................................*......................................................... - cmge v21.4S, v31.4S, v14.4S // ....................................................................*................................................... - cmge v10.4S, v14.4S, v30.4S // .....................................................................*.................................................. + sqrdmulh v27.4S, v27.4S, v1.S[3] // ..........e............................................................................................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ + str q17, [x0, #640] // .....................................................................................*.................................. + sqrdmulh v29.4S, v13.4S, v2.S[3] // ....................e................................................................................................... + mul v17.4S, v13.4S, v2.S[2] // .....................e.................................................................................................. + add v18.4S, v18.4S, v15.4S // .......................................e................................................................................ + sub v15.4S, v10.4S, v21.4S // ............................e........................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v9.4S, v8.S[0] // .........................................................*.............................................................. + sqrdmulh v13.4S, v14.4S, v1.S[1] // ........................................e............................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
+ mls v4.4S, v7.4S, v8.S[0] // ...........................e............................................................................................ + mul v5.4S, v14.4S, v1.S[0] // .........................................e.............................................................................. // gap // ........................................................................................................................ - add v7.4S, v27.4S, v22.4S // .............................e.......................................................................................... - sub v23.4S, v27.4S, v22.4S // ............................e........................................................................................... - add v27.4S, v28.4S, v4.4S // ................................................................*....................................................... - sub v9.4S, v28.4S, v4.4S // ...............................................................*........................................................ + mul v14.4S, v11.4S, v2.S[0] // ................e....................................................................................................... // gap // ........................................................................................................................ - mls v15.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... // gap // ........................................................................................................................ + sqrdmulh v11.4S, v11.4S, v2.S[1] // ...............e........................................................................................................ + mul v7.4S, v16.4S, v25.4S // ..................................................................................................*..................... 
+ mls v20.4S, v27.4S, v8.S[0] // ............e........................................................................................................... + sqrdmulh v27.4S, v15.4S, v0.S[3] // ..............................e......................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v22.4S, v31.4S, v13.4S // ............................................................................................................*........... - sub v10.4S, v21.4S, v10.4S // ......................................................................*................................................. - cmge v17.4S, v13.4S, v30.4S // .............................................................................................................*.......... - cmge v21.4S, v31.4S, v5.4S // ............................................................................*........................................... // gap // ........................................................................................................................ + mul v15.4S, v15.4S, v0.S[2] // ...............................e........................................................................................ // gap // ........................................................................................................................ - cmge v4.4S, v5.4S, v30.4S // .............................................................................*.......................................... + cmge v16.4S, v31.4S, v19.4S // ................................................................................*....................................... 
// gap // ........................................................................................................................ - cmge v6.4S, v31.4S, v19.4S // ........................................................................*............................................... // gap // ........................................................................................................................ - cmge v18.4S, v19.4S, v30.4S // .........................................................................*.............................................. - sub v22.4S, v22.4S, v17.4S // ..............................................................................................................*......... - mul v17.4S, v23.4S, v0.S[2] // ..............................e......................................................................................... - sqrdmulh v23.4S, v23.4S, v0.S[3] // ...............................e........................................................................................ - ldr q20, [x0, #528] // ....e................................................................................................................... - ldr q28, [x0, #656] // .....e.................................................................................................................. // gap // ........................................................................................................................ - sqrdmulh v11.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... // gap // ........................................................................................................................ + add v10.4S, v10.4S, v21.4S // .............................e.......................................................................................... 
+ sub v21.4S, v23.4S, v6.4S // ..............................................................................................................*......... + cmge v23.4S, v19.4S, v30.4S // .................................................................................*...................................... + mls v24.4S, v9.4S, v8.4S // .......................................................................................................*................ // gap // ........................................................................................................................ - mls v14.4S, v10.4S, v29.4S // .......................................................................*................................................ + mls v14.4S, v11.4S, v8.S[0] // .................e...................................................................................................... + mls v17.4S, v29.4S, v8.S[0] // ......................e................................................................................................. + mls v5.4S, v13.4S, v8.S[0] // ..........................................e............................................................................. // gap // ........................................................................................................................ - sub v10.4S, v6.4S, v18.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ - sub v18.4S, v21.4S, v4.4S // ..............................................................................*......................................... - sqrdmulh v4.4S, v27.4S, v26.4S // ..................................................................................................*..................... 
// gap // ........................................................................................................................ - cmge v6.4S, v31.4S, v15.4S // ........................................................................................................*............... + cmge v6.4S, v22.4S, v30.4S // .....................................................................*.................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v7.4S, v28.4S, v8.S[0] // ...................................................................................................*.................... + cmge v28.4S, v31.4S, v22.4S // ....................................................................*................................................... + mls v15.4S, v27.4S, v8.S[0] // ................................e....................................................................................... // gap // ........................................................................................................................ - cmge v21.4S, v31.4S, v12.4S // ....................................................................................................*................... // gap // ........................................................................................................................ - mul v9.4S, v9.4S, v0.S[0] // .................................................................*...................................................... - mls v13.4S, v22.4S, v29.4S // ...............................................................................................................*........ 
- ldr q22, [x0, #912] // .......e................................................................................................................ + sub v9.4S, v16.4S, v23.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v19.4S, v10.4S, v29.4S // ...........................................................................*............................................ - mls v5.4S, v18.4S, v29.4S // ...............................................................................*........................................ - mul v18.4S, v27.4S, v25.4S // .................................................................................................*...................... - ldr q27, [x0, #784] // ......e................................................................................................................. - cmge v10.4S, v12.4S, v30.4S // .....................................................................................................*.................. + mls v12.4S, v21.4S, v8.4S // ...............................................................................................................*........ + sub v23.4S, v10.4S, v18.4S // ................................................e....................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ 
- mls v17.4S, v23.4S, v8.S[0] // ................................e....................................................................................... // gap // ........................................................................................................................ - cmge v23.4S, v15.4S, v30.4S // .........................................................................................................*.............. - str q14, [x0, #512] // ....................................................................................*................................... - sub v14.4S, v20.4S, v28.4S // ..................e..................................................................................................... - add v28.4S, v20.4S, v28.4S // ...................e.................................................................................................... - mls v9.4S, v11.4S, v8.S[0] // ...................................................................*.................................................... - sqrdmulh v11.4S, v16.4S, v2.S[1] // ................e....................................................................................................... + sub v21.4S, v20.4S, v14.4S // .................................e...................................................................................... + add v14.4S, v20.4S, v14.4S // ..................................e..................................................................................... + add v13.4S, v17.4S, v4.4S // ............................................e........................................................................... + sub v20.4S, v28.4S, v6.4S // ......................................................................*................................................. // gap // ........................................................................................................................ 
- sub v10.4S, v21.4S, v10.4S // ......................................................................................................*................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - str q13, [x0, #256] // ......................................................................................................................*. - mul v13.4S, v16.4S, v2.S[0] // ...............e........................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + add v6.4S, v15.4S, v5.4S // ...........................................................e............................................................ + cmge v28.4S, v7.4S, v30.4S // .................................................................................................................*...... + cmge v27.4S, v31.4S, v7.4S // ................................................................................................................*....... // gap // ........................................................................................................................ - str q19, [x0, #640] // .....................................................................................*.................................. - sqrdmulh v16.4S, v14.4S, v2.S[3] // .....................e.................................................................................................. 
- mls v18.4S, v4.4S, v8.S[0] // ...................................................................................................*.................... - sqrdmulh v21.4S, v24.4S, v1.S[3] // ...........e............................................................................................................ - mul v19.4S, v14.4S, v2.S[2] // ....................e................................................................................................... - str q5, [x0, #768] // ......................................................................................*................................. - add v20.4S, v27.4S, v22.4S // ........................e............................................................................................... // gap // ........................................................................................................................ - mls v12.4S, v10.4S, v29.4S // .......................................................................................................*................ - sub v10.4S, v27.4S, v22.4S // .......................e................................................................................................ // gap // ........................................................................................................................ + sub v16.4S, v15.4S, v5.4S // ..........................................................e............................................................. + str q12, [x0, #256] // ......................................................................................................................*. + mul v5.4S, v21.4S, v0.S[2] // ....................................e................................................................................... + sub v15.4S, v14.4S, v13.4S // .....................................................e.................................................................. 
+ sub v4.4S, v17.4S, v4.4S // ...........................................e............................................................................ + add v14.4S, v14.4S, v13.4S // ......................................................e................................................................. // gap // ........................................................................................................................ - mul v27.4S, v24.4S, v1.S[2] // ..........e............................................................................................................. - sub v5.4S, v6.4S, v23.4S // ..........................................................................................................*............. // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + sqrdmulh v29.4S, v6.4S, v26.4S // ..............................................................................................e......................... // gap // ........................................................................................................................ - mls v13.4S, v11.4S, v8.S[0] // .................e...................................................................................................... - cmge v6.4S, v9.4S, v30.4S // .................................................................................*...................................... - cmge v22.4S, v31.4S, v9.4S // ................................................................................*....................................... 
- cmge v14.4S, v18.4S, v30.4S // .................................................................................................................*...... - cmge v4.4S, v31.4S, v18.4S // ................................................................................................................*....... + mul v12.4S, v6.4S, v25.4S // ...............................................................................................e........................ // gap // ........................................................................................................................ - sub v23.4S, v28.4S, v20.4S // ......................................e................................................................................. // gap // ........................................................................................................................ + mul v13.4S, v16.4S, v0.S[0] // .............................................................e.......................................................... + sqrdmulh v6.4S, v16.4S, v0.S[1] // ............................................................e........................................................... + mul v11.4S, v4.4S, v1.S[0] // ..............................................e......................................................................... + mul v17.4S, v15.4S, v0.S[0] // ........................................................e............................................................... + sqrdmulh v16.4S, v14.4S, v26.4S // ...........................................................................................e............................ // gap // ........................................................................................................................ + mul v14.4S, v14.4S, v25.4S // ............................................................................................e........................... 
// gap // ........................................................................................................................ - add v20.4S, v28.4S, v20.4S // .......................................e................................................................................ - sub v6.4S, v22.4S, v6.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ - str q12, [x0], #(16) // ....................................................................................................................*... - mul v22.4S, v10.4S, v3.S[0] // .........................e.............................................................................................. - sqrdmulh v10.4S, v10.4S, v3.S[1] // ..........................e............................................................................................. // gap // ........................................................................................................................ + sub v28.4S, v27.4S, v28.4S // ..................................................................................................................*..... + sqrdmulh v27.4S, v21.4S, v0.S[3] // ...................................e.................................................................................... + sqrdmulh v21.4S, v4.4S, v1.S[1] // .............................................e.......................................................................... + sqrdmulh v4.4S, v15.4S, v0.S[1] // .......................................................e................................................................ // gap // ........................................................................................................................ 
- mls v27.4S, v21.4S, v8.S[0] // ............e........................................................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.4S // ...................................................................................*.................................... // gap // ........................................................................................................................ - sub v12.4S, v4.4S, v14.4S // ..................................................................................................................*..... - mul v4.4S, v23.4S, v1.S[0] // ........................................e............................................................................... - sub v11.4S, v7.4S, v20.4S // ................................................e....................................................................... - sqrdmulh v14.4S, v23.4S, v1.S[1] // .........................................e.............................................................................. + mls v12.4S, v29.4S, v8.S[0] // ................................................................................................e....................... + mls v13.4S, v6.4S, v8.S[0] // ..............................................................e......................................................... // gap // ........................................................................................................................ 
- mls v15.4S, v5.4S, v29.4S // ...........................................................................................................*............ // gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v22.4S, v20.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ - mls v9.4S, v6.4S, v29.4S // ...................................................................................*.................................... - mls v19.4S, v16.4S, v8.S[0] // ......................e................................................................................................. - add v24.4S, v7.4S, v20.4S // .................................................e...................................................................... + sqrdmulh v9.4S, v23.4S, v0.S[1] // ..................................................e..................................................................... + mls v7.4S, v28.4S, v8.4S // ...................................................................................................................*.... // gap // ........................................................................................................................ - mls v18.4S, v12.4S, v29.4S // ...................................................................................................................*.... - mls v22.4S, v10.4S, v8.S[0] // ...........................e............................................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v14.4S, v16.4S, v8.S[0] // .............................................................................................e.......................... // gap // ........................................................................................................................ - sub v10.4S, v27.4S, v13.4S // .................................e...................................................................................... - sqrdmulh v7.4S, v11.4S, v0.S[1] // ...................................................e.................................................................... + mls v17.4S, v4.4S, v8.S[0] // .........................................................e.............................................................. + mls v5.4S, v27.4S, v8.S[0] // .....................................e.................................................................................. // gap // ........................................................................................................................ - mls v4.4S, v14.4S, v8.S[0] // ..........................................e............................................................................. - mul v14.4S, v11.4S, v0.S[0] // ..................................................e..................................................................... // gap // ........................................................................................................................ + mls v11.4S, v21.4S, v8.S[0] // ...............................................e........................................................................ 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v23.4S, v24.4S, v26.4S // .........................................................................................e.............................. - mul v12.4S, v24.4S, v25.4S // ........................................................................................e............................... + add v15.4S, v10.4S, v18.4S // .................................................e...................................................................... + str q19, [x0, #896] // .......................................................................................*................................ + cmge v16.4S, v31.4S, v13.4S // ............................................................................e........................................... + str q22, [x0, #512] // ....................................................................................*................................... + mul v22.4S, v23.4S, v0.S[0] // ...................................................e.................................................................... + cmge v18.4S, v13.4S, v30.4S // .............................................................................e.......................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v5.4S, v27.4S, v13.4S // ..................................e..................................................................................... 
- str q15, [x0, #112] // .....................................................................................................................*.. + cmge v23.4S, v31.4S, v12.4S // ............................................................................................................e........... + sqrdmulh v27.4S, v15.4S, v26.4S // ........................................................................................e............................... + str q7, [x0, #384] // .......................................................................................................................* + cmge v20.4S, v31.4S, v14.4S // ........................................................................................................e............... + cmge v6.4S, v12.4S, v30.4S // .............................................................................................................e.......... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mul v28.4S, v10.4S, v0.S[2] // ...................................e.................................................................................... - sqrdmulh v24.4S, v10.4S, v0.S[3] // ....................................e................................................................................... - str q18, [x0, #368] // .......................................................................................................................* - // gap // ........................................................................................................................ - str q9, [x0, #880] // .......................................................................................*................................ 
- // gap // ........................................................................................................................ - // gap // ........................................................................................................................ - // gap // ........................................................................................................................ - sub v15.4S, v19.4S, v22.4S // ...........................................e............................................................................ - add v22.4S, v19.4S, v22.4S // ............................................e........................................................................... - - // original source code - // ldr q9, [x0, #0] // .e......................................................................................................................|e.................................................................................................................... - // ldr q10, [x0, #(1*(1024/8))] // e.......................................................................................................................e..................................................................................................................... - // ldr q11, [x0, #(2*(1024/8))] // ..e.....................................................................................................................|.e................................................................................................................... - // ldr q12, [x0, #(3*(1024/8))] // ......e.................................................................................................................|.....e............................................................................................................... 
- // ldr q13, [x0, #(4*(1024/8))] // ..............................................e.........................................................................|.............................................e....................................................................... - // ldr q14, [x0, #(5*(1024/8))] // ...............................................e........................................................................|..............................................e...................................................................... - // ldr q15, [x0, #(6*(1024/8))] // .............................................................e..........................................................|............................................................e........................................................ - // ldr q16, [x0, #(7*(1024/8))] // .........................................................e..............................................................|........................................................e............................................................ - // sub v24.4s, v9.4s, v10.4s // ......................e.................................................................................................|.....................e............................................................................................... - // add v9.4s, v9.4s, v10.4s // .....................e..................................................................................................|....................e................................................................................................ - // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................e.....................................|.................................................................................e................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ............................................................................e...........................................|...........................................................................e......................................... - // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................e........................|..............................................................................................e...................... - // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................|........................e............................................................................................ - // add v11.4s, v11.4s, v12.4s // ........................e...............................................................................................|.......................e............................................................................................. - // mul v12.4s, v24.4s, v2.s[0] // ........................................................................e...............................................|.......................................................................e............................................. - // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................e..................................................|....................................................................e................................................ - // mls v12.4s, v24.4s, v8.s[0] // ....................................................................................e...................................|...................................................................................e................................. 
- // sub v24.4s, v13.4s, v14.4s // ..................................................................e.....................................................|.................................................................e................................................... - // add v13.4s, v13.4s, v14.4s // ...................................................................e....................................................|..................................................................e.................................................. - // mul v14.4s, v24.4s, v2.s[2] // .............................................................................e..........................................|............................................................................e........................................ - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................e.............................................|.........................................................................e........................................... - // mls v14.4s, v24.4s, v8.s[0] // ......................................................................................................e.................|.....................................................................................................e............... - // sub v24.4s, v15.4s, v16.4s // .................................................................................e......................................|................................................................................e.................................... - // add v15.4s, v15.4s, v16.4s // ...............................................................................e........................................|..............................................................................e...................................... 
- // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................e..........................|............................................................................................e........................ - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................e.........................|.............................................................................................e....................... - // mls v16.4s, v24.4s, v8.s[0] // .........................................................................................................e..............|........................................................................................................e............ - // sub v24.4s, v9.4s, v11.4s // ................................e.......................................................................................|...............................e..................................................................................... - // add v9.4s, v9.4s, v11.4s // ...............................e........................................................................................|..............................e...................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ............................................e...........................................................................|...........................................e......................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................e..........................................................................|............................................e........................................................................ 
- // mls v11.4s, v24.4s, v8.s[0] // ...............................................................e........................................................|..............................................................e...................................................... - // sub v24.4s, v10.4s, v12.4s // ..........................................................................................................e.............|.........................................................................................................e........... - // add v10.4s, v10.4s, v12.4s // ................................................................................................................e.......|...............................................................................................................e..... - // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................e.....|.................................................................................................................e... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................e....|..................................................................................................................e.. - // mls v12.4s, v24.4s, v8.s[0] // ...........*............................................................................................................|..........*.......................................................................................................... - // sub v24.4s, v13.4s, v15.4s // .........................................................................................e..............................|........................................................................................e............................ 
- // add v13.4s, v13.4s, v15.4s // ..........................................................................................e.............................|.........................................................................................e........................... - // mul v15.4s, v24.4s, v1.s[0] // .................................................................................................e......................|................................................................................................e.................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................e....................|..................................................................................................e.................. - // mls v15.4s, v24.4s, v8.s[0] // ............................................................................................................e...........|...........................................................................................................e......... - // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.|..................................................................................................................... - // add v14.4s, v14.4s, v16.4s // .......................................................................................................................e|..................................................................................................................... - // mul v16.4s, v24.4s, v1.s[0] // ........*...............................................................................................................|.......*............................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........*..............................................................................................................|........*............................................................................................................ - // mls v16.4s, v24.4s, v8.s[0] // ....................*...................................................................................................|...................*................................................................................................. - // sub v24.4s, v9.4s, v13.4s // ..................................................................................................e.....................|.................................................................................................e................... - // add v9.4s, v9.4s, v13.4s // .......................................................................................................e................|......................................................................................................e.............. - // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................................e..........|............................................................................................................e........ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................e............|..........................................................................................................e.......... - // mls v13.4s, v24.4s, v8.s[0] // ....*...................................................................................................................|...*................................................................................................................. 
- // sub v24.4s, v10.4s, v14.4s // ..........*.............................................................................................................|.........*........................................................................................................... - // add v10.4s, v10.4s, v14.4s // ............*...........................................................................................................|...........*......................................................................................................... - // mul v14.4s, v24.4s, v0.s[0] // ..................*.....................................................................................................|.................*................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*......................................................................................................|................*.................................................................................................... - // mls v14.4s, v24.4s, v8.s[0] // ..............................*.........................................................................................|.............................*....................................................................................... - // sub v24.4s, v11.4s, v15.4s // .......*................................................................................................................|......*.............................................................................................................. - // add v11.4s, v11.4s, v15.4s // ...*....................................................................................................................|..*.................................................................................................................. 
- // mul v15.4s, v24.4s, v0.s[0] // ...............*........................................................................................................|..............*...................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................*.......................................................................................................|...............*..................................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ...........................*............................................................................................|..........................*.......................................................................................... - // sub v24.4s, v12.4s, v16.4s // ..................................*.....................................................................................|.................................*................................................................................... - // add v12.4s, v12.4s, v16.4s // .................................*......................................................................................|................................*.................................................................................... - // mul v16.4s, v24.4s, v0.s[0] // .......................................................*................................................................|......................................................*.............................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................*.......................................................................|...............................................*..................................................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ....................................................................*...................................................|...................................................................*................................................. - // cmge v27.4s, v31.4s, v13.4s // ............................*...........................................................................................|...........................*......................................................................................... - // cmge v28.4s, v13.4s, v30.4s // .............................*..........................................................................................|............................*........................................................................................ - // sub v28.4s, v27.4s, v28.4s // .....................................*..................................................................................|....................................*................................................................................ - // mls v13.4s, v28.4s, v29.4s // .................................................*......................................................................|................................................*.................................................................... - // cmge v27.4s, v31.4s, v14.4s // .........................................*..............................................................................|........................................*............................................................................ - // cmge v28.4s, v14.4s, v30.4s // ..........................................*.............................................................................|.........................................*........................................................................... 
- // sub v28.4s, v27.4s, v28.4s // ..................................................*.....................................................................|.................................................*................................................................... - // mls v14.4s, v28.4s, v29.4s // ..........................................................*.............................................................|.........................................................*........................................................... - // cmge v27.4s, v31.4s, v15.4s // .......................................*................................................................................|......................................*.............................................................................. - // cmge v28.4s, v15.4s, v30.4s // ........................................*...............................................................................|.......................................*............................................................................. - // sub v28.4s, v27.4s, v28.4s // ...................................................*....................................................................|..................................................*.................................................................. - // mls v15.4s, v28.4s, v29.4s // ...........................................................*............................................................|..........................................................*.......................................................... - // cmge v27.4s, v31.4s, v16.4s // ......................................................................................*.................................|.....................................................................................*............................... 
- // cmge v28.4s, v16.4s, v30.4s // .....................................................................................*..................................|....................................................................................*................................ - // sub v28.4s, v27.4s, v28.4s // ...........................................................................................*............................|..........................................................................................*.......................... - // mls v16.4s, v28.4s, v29.4s // .....................................................................................................*..................|....................................................................................................*................ - // str q13, [x0, #(4*(1024/8))] // .................................................................*......................................................|................................................................*.................................................... - // str q14, [x0, #(5*(1024/8))] // .........................................................................*..............................................|........................................................................*............................................ - // str q15, [x0, #(6*(1024/8))] // ..............................................................................*.........................................|.............................................................................*....................................... 
- // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................*..|....................................................................................................................* - // mul v13.4s, v9.4s, v25.4s // ...............................................................................................................e........|..............................................................................................................e...... - // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................................................................e.........|.............................................................................................................e....... - // mls v13.4s, v9.4s, v8.s[0] // .....*..................................................................................................................|....*................................................................................................................ - // mul v14.4s, v10.4s, v25.4s // .......................*................................................................................................|......................*.............................................................................................. - // sqrdmulh v10.4s, v10.4s, v26.4s // ...................*....................................................................................................|..................*.................................................................................................. - // mls v14.4s, v10.4s, v8.s[0] // ...................................*....................................................................................|..................................*.................................................................................. 
- // mul v15.4s, v11.4s, v25.4s // .............*..........................................................................................................|............*........................................................................................................ - // sqrdmulh v11.4s, v11.4s, v26.4s // ..............*.........................................................................................................|.............*....................................................................................................... - // mls v15.4s, v11.4s, v8.s[0] // ..........................*.............................................................................................|.........................*........................................................................................... - // mul v16.4s, v12.4s, v25.4s // ............................................................*...........................................................|...........................................................*......................................................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ....................................................*...................................................................|...................................................*................................................................. - // mls v16.4s, v12.4s, v8.s[0] // ...........................................................................*............................................|..........................................................................*.......................................... - // cmge v27.4s, v31.4s, v13.4s // ......................................................*.................................................................|.....................................................*............................................................... 
- // cmge v28.4s, v13.4s, v30.4s // ..............................................................*.........................................................|.............................................................*....................................................... - // sub v28.4s, v27.4s, v28.4s // ......................................................................*.................................................|.....................................................................*............................................... - // mls v13.4s, v28.4s, v29.4s // ................................................................................*.......................................|...............................................................................*..................................... - // cmge v27.4s, v31.4s, v14.4s // .....................................................*..................................................................|....................................................*................................................................ - // cmge v28.4s, v14.4s, v30.4s // ................................................................*.......................................................|...............................................................*..................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................................................*....................................|..................................................................................*.................................. - // mls v14.4s, v28.4s, v29.4s // ....................................................................................................*...................|...................................................................................................*................. 
- // cmge v27.4s, v31.4s, v15.4s // ....................................*...................................................................................|...................................*................................................................................. - // cmge v28.4s, v15.4s, v30.4s // ......................................*.................................................................................|.....................................*............................................................................... - // sub v28.4s, v27.4s, v28.4s // ...........................................*............................................................................|..........................................*.......................................................................... - // mls v15.4s, v28.4s, v29.4s // ........................................................*...............................................................|.......................................................*............................................................. - // cmge v27.4s, v31.4s, v16.4s // ........................................................................................*...............................|.......................................................................................*............................. - // cmge v28.4s, v16.4s, v30.4s // .......................................................................................*................................|......................................................................................*.............................. - // sub v28.4s, v27.4s, v28.4s // ................................................................................................*.......................|...............................................................................................*..................... 
- // mls v16.4s, v28.4s, v29.4s // ........................................................................................................*...............|.......................................................................................................*............. - // str q13, [x0], #(16) // ............................................................................................*...........................|...........................................................................................*......................... - // str q14, [x0, #(-16 + 1*(1024/8))] // .................................................................................................................*......|................................................................................................................*.... - // str q15, [x0, #(-16 + 2*(1024/8))] // .......................................................................*................................................|......................................................................*.............................................. - // str q16, [x0, #(-16 + 3*(1024/8))] // ....................................................................................................................*...|...................................................................................................................*. + str q24, [x0], #(16) // ....................................................................................................................*... + cmge v29.4S, v14.4S, v30.4S // .........................................................................................................e.............. 
+ + // ---------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q9, [x0, #0] // .....e..................................................................................................................'....~................................................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // ...e....................................................................................................................'..~................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ................e.......................................................................................................'...............~...................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // .....................e..................................................................................................'....................~................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // e.......................................................................................................................~...................................................................................................................... 
+ // ldr q14, [x0, #(5*(1024/8))] // ...........e............................................................................................................'..........~........................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .......e................................................................................................................'......~............................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ............e...........................................................................................................'...........~.......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ........................e...............................................................................................'.......................~.............................................................................................. + // add v9.4s, v9.4s, v10.4s // .......................e................................................................................................'......................~............................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ..........................................e.............................................................................'.........................................~............................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................................e.......................................................................................'...............................~...................................................................................... 
+ // mls v10.4s, v27.4s, v8.s[0] // ......................................................e.................................................................'.....................................................~................................................................ + // sub v24.4s, v11.4s, v12.4s // ........................................e...............................................................................'.......................................~.............................................................................. + // add v11.4s, v11.4s, v12.4s // ....................................e...................................................................................'...................................~.................................................................................. + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ....................................................e...................................................................'...................................................~.................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ...................................................e....................................................................'..................................................~................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ..............................................................e.........................................................'.............................................................~........................................................ + // sub v24.4s, v13.4s, v14.4s // ...............................e........................................................................................'..............................~....................................................................................... 
+ // add v13.4s, v13.4s, v14.4s // ............................e...........................................................................................'...........................~.......................................................................................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ............................................e...........................................................................'...........................................~.......................................................................... + // mul v14.4s, v24.4s, v2.s[2] // .............................................e..........................................................................'............................................~......................................................................... + // mls v14.4s, v27.4s, v8.s[0] // ...............................................................e........................................................'..............................................................~....................................................... + // sub v24.4s, v15.4s, v16.4s // ..........................e.............................................................................................'.........................~............................................................................................ + // add v15.4s, v15.4s, v16.4s // ...........................e............................................................................................'..........................~........................................................................................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // .....................................e..................................................................................'....................................~................................................................................. 
+ // mul v16.4s, v24.4s, v3.s[0] // ......................................e.................................................................................'.....................................~................................................................................ + // mls v16.4s, v27.4s, v8.s[0] // .................................................e......................................................................'................................................~..................................................................... + // sub v24.4s, v9.4s, v11.4s // ...............................................e........................................................................'..............................................~....................................................................... + // add v9.4s, v9.4s, v11.4s // ..........................................................e.............................................................'.........................................................~............................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .......................................................e................................................................'......................................................~............................................................... + // mul v11.4s, v24.4s, v0.s[2] // ........................................................e...............................................................'.......................................................~.............................................................. + // mls v11.4s, v27.4s, v8.s[0] // ....................................................................e...................................................'...................................................................~.................................................. 
+ // sub v24.4s, v10.4s, v12.4s // ........................................................................e...............................................'.......................................................................~.............................................. + // add v10.4s, v10.4s, v12.4s // .........................................................................e..............................................'........................................................................~............................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ..............................................................................................e.........................'.............................................................................................~........................ + // mul v12.4s, v24.4s, v0.s[2] // .................................................................................e......................................'................................................................................~..................................... + // mls v12.4s, v27.4s, v8.s[0] // .........................................................................................................e..............'........................................................................................................~............. + // sub v24.4s, v13.4s, v15.4s // .......................................e................................................................................'......................................~............................................................................... + // add v13.4s, v13.4s, v15.4s // ..............................................e.........................................................................'.............................................~........................................................................ 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // ................................................e.......................................................................'...............................................~...................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..................................................e.....................................................................'.................................................~.................................................................... + // mls v15.4s, v27.4s, v8.s[0] // ................................................................e.......................................................'...............................................................~...................................................... + // sub v24.4s, v14.4s, v16.4s // ...................................................................................e....................................'..................................................................................~................................... + // add v14.4s, v14.4s, v16.4s // ..........................................................................e.............................................'.........................................................................~............................................ + // sqrdmulh v27.4s, v24.4s, v1.s[1] // ...............................................................................................e........................'..............................................................................................~....................... + // mul v16.4s, v24.4s, v1.s[0] // .........................................................................................e..............................'........................................................................................~............................. 
+ // mls v16.4s, v27.4s, v8.s[0] // ..........................................................................................................e.............'.........................................................................................................~............ + // sub v24.4s, v9.4s, v13.4s // .......................................................................e................................................'......................................................................~............................................... + // add v9.4s, v9.4s, v13.4s // ...........................................................................................................e............'..........................................................................................................~........... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .....................................................................................................e..................'....................................................................................................~................. + // mul v13.4s, v24.4s, v0.s[0] // ...............................................................................................................e........'..............................................................................................................~....... + // mls v13.4s, v27.4s, v8.s[0] // ........~...............................................................................................................'.......*.............................................................................................................. + // sub v24.4s, v10.4s, v14.4s // ..................................................................................e.....................................'.................................................................................~.................................... 
+ // add v10.4s, v10.4s, v14.4s // ....................................................................................e...................................'...................................................................................~.................................. + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ................................................................................................e.......................'...............................................................................................~...................... + // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................e.............................'.........................................................................................~............................ + // mls v14.4s, v27.4s, v8.s[0] // ........................................................................................................e...............'.......................................................................................................~.............. + // sub v24.4s, v11.4s, v15.4s // ...............................................................................e........................................'..............................................................................~....................................... + // add v11.4s, v11.4s, v15.4s // ............................................................................e...........................................'...........................................................................~.......................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ........................................................................................e...............................'.......................................................................................~.............................. 
+ // mul v15.4s, v24.4s, v0.s[0] // .......................................................................................e................................'......................................................................................~............................... + // mls v15.4s, v27.4s, v8.s[0] // ...................................................................................................e....................'..................................................................................................~................... + // sub v24.4s, v12.4s, v16.4s // ..~.....................................................................................................................'.*.................................................................................................................... + // add v12.4s, v12.4s, v16.4s // ...............~........................................................................................................'..............*....................................................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .................~......................................................................................................'................*..................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ..............~.........................................................................................................'.............*........................................................................................................ + // mls v16.4s, v27.4s, v8.s[0] // .............................~..........................................................................................'............................*......................................................................................... 
+ // cmge v27.4s, v31.4s, v13.4s // ...................................................................~....................................................'..................................................................*................................................... + // cmge v28.4s, v13.4s, v30.4s // .................................................................~......................................................'................................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................................................~............................................'..........................................................................*........................................... + // mls v13.4s, v28.4s, v8.4s // ....................................................................................................~...................'...................................................................................................*.................. + // cmge v27.4s, v31.4s, v14.4s // .~......................................................................................................................'*..................................................................................................................... + // cmge v28.4s, v14.4s, v30.4s // ....~...................................................................................................................'...*.................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // .............~..........................................................................................................'............*......................................................................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ......................~.................................................................................................'.....................*................................................................................................ + // cmge v27.4s, v31.4s, v15.4s // .............................................................................................................e..........'............................................................................................................~......... + // cmge v28.4s, v15.4s, v30.4s // ................................................................................................................e.......'...............................................................................................................~...... + // sub v28.4s, v27.4s, v28.4s // .........~..............................................................................................................'........*............................................................................................................. + // mls v15.4s, v28.4s, v8.4s // ....................~...................................................................................................'...................*.................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .........................................................~..............................................................'........................................................*............................................................. + // cmge v28.4s, v16.4s, v30.4s // ............................................................~...........................................................'...........................................................*.......................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .....................................................................~..................................................'....................................................................*................................................. + // mls v16.4s, v28.4s, v8.4s // .................................................................................................~......................'................................................................................................*..................... + // str q13, [x0, #(4*(1024/8))] // ..............................................................................................................~.........'.............................................................................................................*........ + // str q14, [x0, #(5*(1024/8))] // ...........................................~............................................................................'..........................................*........................................................................... + // str q15, [x0, #(6*(1024/8))] // ..............................~.........................................................................................'.............................*........................................................................................ + // str q16, [x0, #(7*(1024/8))] // ............................................................................................................~...........'...........................................................................................................*.......... + // sqrdmulh v27.4s, v9.4s, v26.4s // ..................................................................................................................e.....'.................................................................................................................~.... 
+ // mul v9.4s, v9.4s, v25.4s // ......~.................................................................................................................'.....*................................................................................................................ + // mls v9.4s, v27.4s, v8.s[0] // ...................~....................................................................................................'..................*................................................................................................... + // sqrdmulh v27.4s, v10.4s, v26.4s // ...........................................................................................e............................'..........................................................................................~........................... + // mul v10.4s, v10.4s, v25.4s // ............................................................................................e...........................'...........................................................................................~.......................... + // mls v10.4s, v27.4s, v8.s[0] // .......................................................................................................e................'......................................................................................................~............... + // sqrdmulh v27.4s, v11.4s, v26.4s // .....................................................................................e..................................'....................................................................................~................................. + // mul v11.4s, v11.4s, v25.4s // ......................................................................................e.................................'.....................................................................................~................................ 
+ // mls v11.4s, v27.4s, v8.s[0] // ..................................................................................................e.....................'.................................................................................................~.................... + // sqrdmulh v27.4s, v12.4s, v26.4s // .........................~..............................................................................................'........................*............................................................................................. + // mul v12.4s, v12.4s, v25.4s // .....................................................~..................................................................'....................................................*................................................................. + // mls v12.4s, v27.4s, v8.s[0] // ..................................................................~.....................................................'.................................................................*.................................................... + // cmge v27.4s, v31.4s, v9.4s // ...................................~....................................................................................'..................................*................................................................................... + // cmge v28.4s, v9.4s, v30.4s // .................................~......................................................................................'................................*..................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........................................~..............................................................................'........................................*............................................................................. 
+ // mls v9.4s, v28.4s, v8.4s // .............................................................~..........................................................'............................................................*......................................................... + // cmge v27.4s, v31.4s, v10.4s // ....................................................................................................................e...'...................................................................................................................~.. + // cmge v28.4s, v10.4s, v30.4s // .......................................................................................................................e'...................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..........~.............................................................................................................'.........*............................................................................................................ + // mls v10.4s, v28.4s, v8.4s // ..................~.....................................................................................................'.................*.................................................................................................... + // cmge v27.4s, v31.4s, v11.4s // .................................................................................................................e......'................................................................................................................~..... + // cmge v28.4s, v11.4s, v30.4s // .....................................................................................................................e..'....................................................................................................................~. 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................~............................................................'..........................................................*........................................................... + // mls v11.4s, v28.4s, v8.4s // ......................................................................~.................................................'.....................................................................*................................................ + // cmge v27.4s, v31.4s, v12.4s // ..............................................................................~.........................................'.............................................................................*........................................ + // cmge v28.4s, v12.4s, v30.4s // .............................................................................~..........................................'............................................................................*......................................... + // sub v28.4s, v27.4s, v28.4s // .............................................................................................~..........................'............................................................................................*......................... + // mls v12.4s, v28.4s, v8.4s // ......................................................................................................~.................'.....................................................................................................*................ 
+ // str q9, [x0], #(16) // ......................................................................................................................~.'.....................................................................................................................* + // str q10, [x0, #(-16 + 1*(1024/8))] // ..................................~.....................................................................................'.................................*.................................................................................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ................................................................................~.......................................'...............................................................................*...................................... + // str q12, [x0, #(-16 + 3*(1024/8))] // ...................................................................................................................~....'..................................................................................................................*... sub count, count, #1 cbnz count, layer123_start - mls v28.4S, v24.4S, v8.S[0] // .......*.............................................................. - mul v13.4S, v15.4S, v1.S[0] // ....*................................................................. - sqrdmulh v21.4S, v15.4S, v1.S[1] // .....*................................................................ - sub v11.4S, v17.4S, v4.4S // ...*.................................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- add v20.4S, v17.4S, v4.4S // *..................................................................... - add v15.4S, v5.4S, v22.4S // ........*............................................................. - mls v14.4S, v7.4S, v8.S[0] // .*.................................................................... - mls v12.4S, v23.4S, v8.S[0] // ..*................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v9.4S, v5.4S, v22.4S // ......*............................................................... - sqrdmulh v4.4S, v11.4S, v0.S[1] // ............*......................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v23.4S, v20.4S, v26.4S // ..........*........................................................... - mul v20.4S, v20.4S, v25.4S // .........*............................................................ - sqrdmulh v19.4S, v15.4S, v26.4S // ...............*...................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - mls v13.4S, v21.4S, v8.S[0] // ................*..................................................... - cmge v6.4S, v31.4S, v12.4S // ........................................*............................. - mul v18.4S, v9.4S, v0.S[0] // ..............*....................................................... - mul v27.4S, v15.4S, v25.4S // .................*.................................................... - mul v16.4S, v11.4S, v0.S[0] // ...........*.......................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v10.4S, v31.4S, v14.4S // ....................*................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v7.4S, v14.4S, v30.4S // .....................*................................................ - cmge v17.4S, v12.4S, v30.4S // ..............................................*....................... - add v5.4S, v28.4S, v13.4S // .......................*.............................................. - sub v15.4S, v28.4S, v13.4S // ........................*............................................. 
- mls v20.4S, v23.4S, v8.S[0] // ..................*................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v22.4S, v9.4S, v0.S[1] // .............*........................................................ - sub v28.4S, v6.4S, v17.4S // ..................................................*................... - mls v16.4S, v4.4S, v8.S[0] // ...................*.................................................. - mls v27.4S, v19.4S, v8.S[0] // .........................*............................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v21.4S, v5.4S, v25.4S // .............................................*........................ - sqrdmulh v9.4S, v5.4S, v26.4S // ......................................*............................... - sqrdmulh v11.4S, v15.4S, v0.S[1] // ..................................*................................... - mul v17.4S, v15.4S, v0.S[0] // .........................................*............................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - mls v12.4S, v28.4S, v29.4S // .......................................................*.............. - sub v4.4S, v10.4S, v7.4S // ...........................*.......................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v10.4S, v31.4S, v20.4S // ..........................*........................................... - mls v18.4S, v22.4S, v8.S[0] // ......................*............................................... - cmge v19.4S, v20.4S, v30.4S // ............................*......................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v24.4S, v16.4S, v30.4S // ..............................*....................................... - cmge v15.4S, v31.4S, v16.4S // .............................*........................................ - mls v14.4S, v4.4S, v29.4S // ...................................*.................................. - mls v21.4S, v9.4S, v8.S[0] // .....................................................*................ - mls v17.4S, v11.4S, v8.S[0] // .................................................*.................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q12, [x0], #(16) // ..............................................................*....... - sub v5.4S, v10.4S, v19.4S // .................................*.................................... - sub v4.4S, v15.4S, v24.4S // .....................................*................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v15.4S, v18.4S, v30.4S // ................................*..................................... - cmge v23.4S, v27.4S, v30.4S // ...............................................*...................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - cmge v24.4S, v31.4S, v27.4S // .......................................*.............................. - cmge v6.4S, v31.4S, v18.4S // ...............................*...................................... - str q14, [x0, #496] // ................................................*..................... - cmge v13.4S, v17.4S, v30.4S // .........................................................*............ 
- cmge v28.4S, v31.4S, v17.4S // ..........................................................*........... - cmge v14.4S, v21.4S, v30.4S // ...........................................................*.......... - cmge v22.4S, v31.4S, v21.4S // ............................................................*......... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v20.4S, v5.4S, v29.4S // ..........................................*........................... - sub v10.4S, v6.4S, v15.4S // ....................................*................................. - sub v9.4S, v24.4S, v23.4S // ........................................................*............. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v16.4S, v4.4S, v29.4S // ............................................*......................... - sub v12.4S, v28.4S, v13.4S // .............................................................*........ - sub v22.4S, v22.4S, v14.4S // ...............................................................*...... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - mls v27.4S, v9.4S, v29.4S // ................................................................*..... - mls v18.4S, v10.4S, v29.4S // ...........................................*.......................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v21.4S, v22.4S, v29.4S // ..................................................................*... - mls v17.4S, v12.4S, v29.4S // .................................................................*.... - str q20, [x0, #240] // ...................................................*.................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q16, [x0, #752] // ......................................................*............... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q27, [x0, #112] // ...................................................................*.. - str q18, [x0, #624] // ....................................................*................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q21, [x0, #368] // ....................................................................*. - str q17, [x0, #880] // .....................................................................* - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - - // original source code - // add v19.4S, v17.4S, v4.4S // ....*................................................................. - // mls v14.4S, v7.4S, v8.S[0] // ......*............................................................... - // mls v12.4S, v23.4S, v8.S[0] // .......*.............................................................. 
- // sub v27.4S, v17.4S, v4.4S // ...*.................................................................. - // mul v4.4S, v15.4S, v1.S[0] // .*.................................................................... - // sqrdmulh v18.4S, v15.4S, v1.S[1] // ..*................................................................... - // sub v15.4S, v5.4S, v22.4S // ........*............................................................. - // mls v28.4S, v24.4S, v8.S[0] // *..................................................................... - // add v11.4S, v5.4S, v22.4S // .....*................................................................ - // mul v13.4S, v19.4S, v25.4S // ...........*.......................................................... - // sqrdmulh v10.4S, v19.4S, v26.4S // ..........*........................................................... - // mul v5.4S, v27.4S, v0.S[0] // .................*.................................................... - // sqrdmulh v21.4S, v27.4S, v0.S[1] // .........*............................................................ - // sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................*............................................. - // mul v19.4S, v15.4S, v0.S[0] // ...............*...................................................... - // sqrdmulh v17.4S, v11.4S, v26.4S // ............*......................................................... - // mls v4.4S, v18.4S, v8.S[0] // .............*........................................................ - // mul v15.4S, v11.4S, v25.4S // ................*..................................................... - // mls v13.4S, v10.4S, v8.S[0] // .......................*.............................................. - // mls v5.4S, v21.4S, v8.S[0] // ..........................*........................................... - // cmge v21.4S, v31.4S, v14.4S // ..................*................................................... 
- // cmge v10.4S, v14.4S, v30.4S // ...................*.................................................. - // mls v19.4S, v9.4S, v8.S[0] // ...................................*.................................. - // add v27.4S, v28.4S, v4.4S // .....................*................................................ - // sub v9.4S, v28.4S, v4.4S // ......................*............................................... - // mls v15.4S, v17.4S, v8.S[0] // ...........................*.......................................... - // cmge v22.4S, v31.4S, v13.4S // ..................................*................................... - // sub v10.4S, v21.4S, v10.4S // .................................*.................................... - // cmge v17.4S, v13.4S, v30.4S // ....................................*................................. - // cmge v21.4S, v31.4S, v5.4S // ......................................*............................... - // cmge v4.4S, v5.4S, v30.4S // .....................................*................................ - // cmge v6.4S, v31.4S, v19.4S // ................................................*..................... - // cmge v18.4S, v19.4S, v30.4S // .............................................*........................ - // sub v22.4S, v22.4S, v17.4S // ...........................................*.......................... - // sqrdmulh v11.4S, v9.4S, v0.S[1] // ..............................*....................................... - // mls v14.4S, v10.4S, v29.4S // .......................................*.............................. - // sub v10.4S, v6.4S, v18.4S // .......................................................*.............. - // sub v18.4S, v21.4S, v4.4S // ............................................*......................... - // sqrdmulh v4.4S, v27.4S, v26.4S // .............................*........................................ 
- // cmge v6.4S, v31.4S, v15.4S // ...............................................*...................... - // cmge v21.4S, v31.4S, v12.4S // ..............*....................................................... - // mul v9.4S, v9.4S, v0.S[0] // ...............................*...................................... - // mls v13.4S, v22.4S, v29.4S // ......................................................*............... - // mls v19.4S, v10.4S, v29.4S // .............................................................*........ - // mls v5.4S, v18.4S, v29.4S // .........................................................*............ - // mul v18.4S, v27.4S, v25.4S // ............................*......................................... - // cmge v10.4S, v12.4S, v30.4S // ....................*................................................. - // cmge v23.4S, v15.4S, v30.4S // ..............................................*....................... - // str q14, [x0, #512] // .................................................*.................... - // mls v9.4S, v11.4S, v8.S[0] // .........................................*............................ - // sub v10.4S, v21.4S, v10.4S // .........................*............................................ - // str q13, [x0, #256] // ................................................................*..... - // str q19, [x0, #640] // ...................................................................*.. - // mls v18.4S, v4.4S, v8.S[0] // ........................................*............................. - // str q5, [x0, #768] // .................................................................*.... - // mls v12.4S, v10.4S, v29.4S // ................................*..................................... - // sub v5.4S, v6.4S, v23.4S // ........................................................*............. - // cmge v6.4S, v9.4S, v30.4S // ..................................................*................... 
- // cmge v22.4S, v31.4S, v9.4S // ...................................................*.................. - // cmge v14.4S, v18.4S, v30.4S // ....................................................*................. - // cmge v4.4S, v31.4S, v18.4S // .....................................................*................ - // sub v6.4S, v22.4S, v6.4S // ..........................................................*........... - // str q12, [x0], #(16) // ..........................................*........................... - // sub v12.4S, v4.4S, v14.4S // ...........................................................*.......... - // mls v15.4S, v5.4S, v29.4S // ............................................................*......... - // mls v9.4S, v6.4S, v29.4S // ...............................................................*...... - // mls v18.4S, v12.4S, v29.4S // ..............................................................*....... - // str q15, [x0, #112] // ..................................................................*... - // str q18, [x0, #368] // ....................................................................*. - // str q9, [x0, #880] // .....................................................................* + // Instructions: 45 + // Expected cycles: 16 + // Expected IPC: 2.81 + // + // Wall time: 0.48s + // User time: 0.48s + // + // ------------ original position -------------> + // 0 25 + // |------------------------|------------------- + sub v21.4S, v20.4S, v29.4S // ......*...................................... + mul v20.4S, v15.4S, v25.4S // ...*......................................... + add v29.4S, v5.4S, v11.4S // .........*................................... + sub v11.4S, v5.4S, v11.4S // .*........................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ // gap // ............................................. + mls v22.4S, v9.4S, v8.S[0] // ....*........................................ + sub v28.4S, v16.4S, v18.4S // .....*....................................... + cmge v19.4S, v31.4S, v17.4S // *............................................ + cmge v10.4S, v17.4S, v30.4S // ..*.......................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sqrdmulh v4.4S, v29.4S, v26.4S // ...............*............................. + mul v18.4S, v29.4S, v25.4S // .......................*..................... + mul v15.4S, v11.4S, v0.S[0] // ........*.................................... + sqrdmulh v11.4S, v11.4S, v0.S[1] // ..........*.................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v14.4S, v21.4S, v8.4S // ...........*................................. + mls v20.4S, v27.4S, v8.S[0] // ............*................................ + mls v13.4S, v28.4S, v8.4S // .............*............................... + sub v9.4S, v23.4S, v6.4S // .........................*................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sub v10.4S, v19.4S, v10.4S // .......*..................................... + cmge v29.4S, v22.4S, v30.4S // ............................*................ + cmge v27.4S, v31.4S, v22.4S // ..............................*.............. + // gap // ............................................. 
+ // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v18.4S, v4.4S, v8.S[0] // .............................*............... + mls v15.4S, v11.4S, v8.S[0] // ................*............................ + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v17.4S, v10.4S, v8.4S // ..............*.............................. + str q13, [x0, #768] // .................*........................... + sub v6.4S, v27.4S, v29.4S // .................................*........... + cmge v11.4S, v20.4S, v30.4S // ..................*.......................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v12.4S, v9.4S, v8.4S // ................................*............ + str q14, [x0, #128] // ...................*......................... + // gap // ............................................. + // gap // ............................................. + cmge v14.4S, v31.4S, v20.4S // ....................*........................ + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + cmge v28.4S, v15.4S, v30.4S // ..........................*.................. + cmge v13.4S, v18.4S, v30.4S // ..................................*.......... 
+ cmge v29.4S, v31.4S, v15.4S // ........................*.................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + cmge v5.4S, v31.4S, v18.4S // ...................................*......... + // gap // ............................................. + mls v22.4S, v6.4S, v8.4S // .......................................*..... + str q17, [x0, #640] // ......................*...................... + sub v4.4S, v14.4S, v11.4S // .....................*....................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q12, [x0, #256] // ....................................*........ + sub v16.4S, v5.4S, v13.4S // .....................................*....... + sub v5.4S, v29.4S, v28.4S // ...............................*............. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v20.4S, v4.4S, v8.4S // ...........................*................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v18.4S, v16.4S, v8.4S // ........................................*.... 
+ mls v15.4S, v5.4S, v8.4S // ......................................*...... + str q22, [x0, #512] // ..........................................*.. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q20, [x0], #(16) // ............................................* + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q18, [x0, #368] // ...........................................*. + str q15, [x0, #880] // .........................................*... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ + // --------------- new position ---------------> + // 0 25 + // |------------------------|------------------- + // cmge v10.4S, v31.4S, v17.4S // ......*...................................... + // sub v7.4S, v5.4S, v11.4S // ...*......................................... + // cmge v19.4S, v17.4S, v30.4S // .......*..................................... + // mul v24.4S, v15.4S, v25.4S // .*........................................... + // mls v22.4S, v9.4S, v8.S[0] // ....*........................................ + // sub v18.4S, v16.4S, v18.4S // .....*....................................... + // sub v9.4S, v20.4S, v29.4S // *............................................ + // sub v10.4S, v10.4S, v19.4S // ................*............................ + // mul v19.4S, v7.4S, v0.S[0] // ..........*.................................. + // add v16.4S, v5.4S, v11.4S // ..*.......................................... + // sqrdmulh v7.4S, v7.4S, v0.S[1] // ...........*................................. + // mls v14.4S, v9.4S, v8.4S // ............*................................ + // mls v24.4S, v27.4S, v8.S[0] // .............*............................... + // mls v13.4S, v18.4S, v8.4S // ..............*.............................. + // mls v17.4S, v10.4S, v8.4S // .....................*....................... + // sqrdmulh v28.4S, v16.4S, v26.4S // ........*.................................... + // mls v19.4S, v7.4S, v8.S[0] // ....................*........................ + // str q13, [x0, #768] // ......................*...................... + // cmge v29.4S, v24.4S, v30.4S // ........................*.................... + // str q14, [x0, #128] // ..........................*.................. + // cmge v5.4S, v31.4S, v24.4S // ...........................*................. + // sub v9.4S, v5.4S, v29.4S // ..................................*.......... + // str q17, [x0, #640] // .................................*........... 
+ // mul v7.4S, v16.4S, v25.4S // .........*................................... + // cmge v16.4S, v31.4S, v19.4S // ..............................*.............. + // sub v21.4S, v23.4S, v6.4S // ...............*............................. + // cmge v23.4S, v19.4S, v30.4S // ............................*................ + // mls v24.4S, v9.4S, v8.4S // ......................................*...... + // cmge v6.4S, v22.4S, v30.4S // .................*........................... + // mls v7.4S, v28.4S, v8.S[0] // ...................*......................... + // cmge v28.4S, v31.4S, v22.4S // ..................*.......................... + // sub v9.4S, v16.4S, v23.4S // .....................................*....... + // mls v12.4S, v21.4S, v8.4S // .........................*................... + // sub v20.4S, v28.4S, v6.4S // .......................*..................... + // cmge v28.4S, v7.4S, v30.4S // .............................*............... + // cmge v27.4S, v31.4S, v7.4S // ...............................*............. + // str q12, [x0, #256] // ...................................*......... + // sub v28.4S, v27.4S, v28.4S // ....................................*........ + // mls v19.4S, v9.4S, v8.4S // ........................................*.... + // mls v22.4S, v20.4S, v8.4S // ................................*............ + // mls v7.4S, v28.4S, v8.4S // .......................................*..... + // str q19, [x0, #896] // ............................................* + // str q22, [x0, #512] // .........................................*... + // str q7, [x0, #384] // ...........................................*. + // str q24, [x0], #(16) // ..........................................*.. 
pop_stack diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s index eb5264de..6afd8526 100644 --- a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s @@ -13,20 +13,6 @@ xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +33,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro barrett_reduce_single a @@ -90,24 +76,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro 
store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -137,35 +123,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr 
qform_root1, [r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -187,7 +173,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -198,7 +184,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -208,7 +194,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -216,7 +202,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -227,24 +213,30 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. 
The ordering of the coefficients is canonical, also +// matching PQClean. + .data .p2align 4 roots: @@ -387,886 +379,914 @@ _intt_dilithium_123_45678_opt_m1_icestorm: qform_root3_tw .req q7 .p2align 2 - // gap // ........................................................................................................................... - ldr q10, [x5, #144] // ..........*................................................................................................................ - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ........*.................................................................................................................. - ld4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x2] // .................*......................................................................................................... - ldr q9, [x5, #160] // ...........*............................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - ldr q21, [x5, #80] // *.......................................................................................................................... - ldr q1, [x5, #176] // .*......................................................................................................................... 
- // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - ldr q6, [x5, #128] // ...*....................................................................................................................... - ldr q30, [x5, #64] // ....*...................................................................................................................... - ldr q27, [x5, #32] // ......*.................................................................................................................... - // gap // ........................................................................................................................... - ldr q28, [x5, #48] // .....*..................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sub v20.4S, v16.4S, v17.4S // ..............*............................................................................................................ 
- sub v31.4S, v14.4S, v15.4S // .............*............................................................................................................. - // gap // ........................................................................................................................... - sub v13.4S, v23.4S, v24.4S // .............................*............................................................................................. - sub v7.4S, v25.4S, v26.4S // ............................*.............................................................................................. - // gap // ........................................................................................................................... - ldr q5, [x5, #112] // .......*................................................................................................................... - mul v2.4S, v31.4S, v27.4S // ..................*........................................................................................................ - ldr q27, [x5, #16] // .........*................................................................................................................. - sqrdmulh v4.4S, v31.4S, v28.4S // ...................*....................................................................................................... - // gap // ........................................................................................................................... - sqrdmulh v31.4S, v20.4S, v21.4S // ....................*...................................................................................................... - mul v12.4S, v7.4S, v9.4S // .....................................*..................................................................................... - // gap // ........................................................................................................................... 
- // gap // ........................................................................................................................... - ldr q21, [x5], #(12*16) // ..*........................................................................................................................ - sqrdmulh v22.4S, v13.4S, v10.4S // .......................................*................................................................................... - ldr q11, [x5, #-96] // ......................*.................................................................................................... - mul v10.4S, v20.4S, v30.4S // .....................*..................................................................................................... - mls v2.4S, v4.4S, v8.S[0] // .........................*................................................................................................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sqrdmulh v20.4S, v7.4S, v1.4S // .................................*......................................................................................... - add v28.4S, v25.4S, v26.4S // ...............................*........................................................................................... - add v26.4S, v14.4S, v15.4S // ................*.......................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... 
- add v18.4S, v16.4S, v17.4S // ...............*........................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v10.4S, v31.4S, v8.S[0] // ..........................*................................................................................................ - mul v9.4S, v13.4S, v6.4S // ...................................*....................................................................................... - // gap // ........................................................................................................................... - mls v12.4S, v20.4S, v8.S[0] // .............................................*............................................................................. - // gap // ........................................................................................................................... - sub v1.4S, v26.4S, v18.4S // ........................*.................................................................................................. - // gap // ........................................................................................................................... - add v23.4S, v23.4S, v24.4S // ................................*.......................................................................................... - // gap // ........................................................................................................................... - add v20.4S, v2.4S, v10.4S // ...............................................*........................................................................... 
- sub v13.4S, v2.4S, v10.4S // ....................................*...................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sqrdmulh v31.4S, v1.4S, v27.4S // .........................................*................................................................................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v9.4S, v22.4S, v8.S[0] // ..............................................*............................................................................ - mul v22.4S, v13.4S, v21.4S // ...........................................*............................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sub v4.4S, v23.4S, v28.4S // ......................................*.................................................................................... - mul v24.4S, v1.4S, v21.4S // ..................................*........................................................................................ - sqrdmulh v1.4S, v13.4S, v27.4S // ........................................*.................................................................................. 
- // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - add v13.4S, v9.4S, v12.4S // ......................................................*.................................................................... - sub v12.4S, v9.4S, v12.4S // ...................................................*....................................................................... - ldr q30, [x4, #48] // ...........................................................................*............................................... - sqrdmulh v9.4S, v4.4S, v5.4S // ..........................................*................................................................................ - add v26.4S, v26.4S, v18.4S // .......................*................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v24.4S, v31.4S, v8.S[0] // ................................................*.......................................................................... 
- mls v22.4S, v1.4S, v8.S[0] // .................................................*......................................................................... - mul v10.4S, v4.4S, v11.4S // ............................................*.............................................................................. - add v1.4S, v23.4S, v28.4S // ....................................................*...................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - ldr q29, [x4, #16] // ............*.............................................................................................................. - ldr q19, [x4], #64 // ...........................*............................................................................................... - trn2 v16.4S, v26.4S, v20.4S // .....................................................*..................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - trn1 v20.4S, v26.4S, v20.4S // ...........................................................*............................................................... - trn1 v26.4S, v24.4S, v22.4S // ............................................................*.............................................................. 
- trn2 v4.4S, v24.4S, v22.4S // .........................................................*................................................................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v10.4S, v9.4S, v8.S[0] // ..................................................*........................................................................ - // gap // ........................................................................................................................... - ldr q14, [x4, #-32] // ..............................*............................................................................................ - trn2 v23.4S, v1.4S, v13.4S // ..........................................................*................................................................ - sqrdmulh v5.4S, v12.4S, v5.4S // ........................................................*.................................................................. - trn1 v17.2D, v20.2D, v26.2D // ...............................................................*........................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mul v25.4S, v12.4S, v11.4S // .......................................................*................................................................... - // gap // ........................................................................................................................... 
- trn2 v9.2D, v20.2D, v26.2D // .................................................................*......................................................... - trn2 v24.2D, v16.2D, v4.2D // ..............................................................*............................................................ - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - trn1 v1.4S, v1.4S, v13.4S // ..................................................................*........................................................ - trn1 v7.2D, v16.2D, v4.2D // ................................................................*.......................................................... - sub v12.4S, v9.4S, v24.4S // .................................................................................*......................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v25.4S, v5.4S, v8.S[0] // .............................................................*............................................................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... 
- add v20.4S, v9.4S, v24.4S // ......................................................................*.................................................... - add v13.4S, v17.4S, v7.4S // .....................................................................*..................................................... - mul v31.4S, v12.4S, v14.S[0] // .......................................................................................*................................... - sqrdmulh v22.4S, v12.4S, v14.S[1] // ......................................................................................*.................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - trn2 v26.4S, v10.4S, v25.4S // ....................................................................*...................................................... - trn1 v24.4S, v10.4S, v25.4S // ...................................................................*....................................................... - sub v12.4S, v13.4S, v20.4S // ...................................................................................................*....................... - add v10.4S, v13.4S, v20.4S // ...............................................................................*........................................... - // gap // ........................................................................................................................... 
- // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - trn1 v20.2D, v23.2D, v26.2D // ..........................................................................*................................................ - trn1 v13.2D, v1.2D, v24.2D // .........................................................................*................................................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - trn2 v0.2D, v23.2D, v26.2D // ........................................................................*.................................................. - trn2 v16.2D, v1.2D, v24.2D // .......................................................................*................................................... - sub v9.4S, v13.4S, v20.4S // .............................................................................................*............................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... 
- mul v4.4S, v12.4S, v19.S[2] // ..........................................................................................................................* - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - add v26.4S, v16.4S, v0.4S // .............................................................................*............................................. - add v24.4S, v13.4S, v20.4S // ..............................................................................*............................................ - sqrdmulh v20.4S, v9.4S, v14.S[3] // .................................................................................................*......................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mul v11.4S, v9.4S, v14.S[2] // ................................................................................................*.......................... - // gap // ........................................................................................................................... - add v27.4S, v24.4S, v26.4S // ........................................................................................*.................................. - // gap // ........................................................................................................................... - sub v1.4S, v17.4S, v7.4S // ............................................................................*.............................................. 
- // gap // ........................................................................................................................... - srshr v9.4S, v10.4S, #23 // .....................................................................................*..................................... - // gap // ........................................................................................................................... - sub v17.4S, v16.4S, v0.4S // ................................................................................*.......................................... - mul v16.4S, v1.4S, v29.S[2] // ...................................................................................*....................................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - srshr v13.4S, v27.4S, #23 // ............................................................................................*.............................. - mls v31.4S, v22.4S, v8.S[0] // ...............................................................................................*........................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sqrdmulh v1.4S, v1.4S, v29.S[3] // ....................................................................................*...................................... - // gap // ........................................................................................................................... 
- // gap // ........................................................................................................................... - mls v10.4S, v9.4S, v8.4S // ...........................................................................................*............................... - mls v11.4S, v20.4S, v8.S[0] // .......................................................................................................*................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mls v27.4S, v13.4S, v8.4S // ..................................................................................................*........................ - mul v15.4S, v17.4S, v30.S[0] // .........................................................................................*................................. - // gap // ........................................................................................................................... - mls v16.4S, v1.4S, v8.S[0] // ..........................................................................................*................................ - // gap // ........................................................................................................................... - sqrdmulh v7.4S, v17.4S, v30.S[1] // ..............................................................................................*............................ - sqrdmulh v30.4S, v12.4S, v19.S[3] // ......................................................................................................................*.... - // gap // ........................................................................................................................... 
- // gap // ........................................................................................................................... - sub v9.4S, v24.4S, v26.4S // ..................................................................................*........................................ - // gap // ........................................................................................................................... - add v24.4S, v10.4S, v27.4S // ..............................................................................................................*............ - sub v22.4S, v10.4S, v27.4S // ........................................................................................................*.................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sub v10.4S, v16.4S, v31.4S // ......................................................................................................*.................... - // gap // ........................................................................................................................... - mls v15.4S, v7.4S, v8.S[0] // .....................................................................................................*..................... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mul v17.4S, v22.4S, v19.S[0] // .............................................................................................................*............. 
- sqrdmulh v12.4S, v22.4S, v19.S[1] // ............................................................................................................*.............. - str q24, [x1], #(16*4) // .....................................................................................................................*..... - sqrdmulh v22.4S, v10.4S, v19.S[3] // ...........................................................................................................*............... - // gap // ........................................................................................................................... - mul v10.4S, v10.4S, v19.S[2] // ..........................................................................................................*................ - // gap // ........................................................................................................................... - add v26.4S, v16.4S, v31.4S // ....................................................................................................*...................... - sub v13.4S, v11.4S, v15.4S // ...............................................................................................................*........... - // gap // ........................................................................................................................... - add v31.4S, v11.4S, v15.4S // .................................................................................................................*......... - mls v17.4S, v12.4S, v8.S[0] // ..................................................................................................................*........ - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... 
- mls v10.4S, v22.4S, v8.S[0] // ................................................................................................................*.......... - srshr v22.4S, v26.4S, #23 // .........................................................................................................*................. - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - sqrdmulh v20.4S, v13.4S, v29.S[1] // .......................................................................................................................*... - // gap // ........................................................................................................................... - // gap // ........................................................................................................................... - mul v1.4S, v13.4S, v29.S[0] // ...................................................................................................................*....... - str q17, [x2], #(16*4) // ........................................................................................................................*.. - // gap // ........................................................................................................................... - mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. - srshr v12.4S, v31.4S, #23 // ....................................................................................................................*...... - - // original source code - // ldr q17, [x5, #80] // ....*...................................................................................................................... 
- // ldr q16, [x5, #176] // .....*..................................................................................................................... - // ldr q15, [x5], #(12*16) // ....................*...................................................................................................... - // ldr q24, [x5, #-64] // ......*.................................................................................................................... - // ldr q28, [x5, #-128] // .......*................................................................................................................... - // ldr q18, [x5, #-144] // .........*................................................................................................................. - // ldr q25, [x5, #-160] // ........*.................................................................................................................. - // ldr q7, [x5, #-80] // ..............*............................................................................................................ - // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*......................................................................................................................... - // ldr q0, [x5, #-176] // ................*.......................................................................................................... - // ldr q30, [x5, #-48] // *.......................................................................................................................... - // ldr q27, [x5, #-32] // ...*....................................................................................................................... - // ldr q29, [x4, #16] // ...................................................*....................................................................... 
- // sub v22.4S, v3.4S, v4.4S // ...........*............................................................................................................... - // sub v12.4S, v5.4S, v6.4S // ..........*................................................................................................................ - // add v11.4S, v5.4S, v6.4S // ............................*.............................................................................................. - // add v20.4S, v3.4S, v4.4S // ...........................*............................................................................................... - // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ..*........................................................................................................................ - // mul v10.4S, v22.4S, v25.4S // ...............*........................................................................................................... - // sqrdmulh v18.4S, v22.4S, v18.4S // .................*......................................................................................................... - // sqrdmulh v26.4S, v12.4S, v17.4S // ..................*........................................................................................................ - // mul v28.4S, v12.4S, v28.4S // .......................*................................................................................................... - // ldr q31, [x5, #-96] // ......................*.................................................................................................... - // add v5.4S, v20.4S, v11.4S // ..............................................*............................................................................ - // sub v12.4S, v20.4S, v11.4S // ................................*.......................................................................................... 
- // mls v10.4S, v18.4S, v8.S[0] // ........................*.................................................................................................. - // mls v28.4S, v26.4S, v8.S[0] // .............................*............................................................................................. - // ldr q19, [x4], #64 // ....................................................*...................................................................... - // sub v9.4S, v3.4S, v4.4S // .............*............................................................................................................. - // sub v17.4S, v1.4S, v2.4S // ............*.............................................................................................................. - // ldr q26, [x4, #-32] // ..........................................................*................................................................ - // add v11.4S, v3.4S, v4.4S // ..........................*................................................................................................ - // add v22.4S, v1.4S, v2.4S // .................................*......................................................................................... - // sqrdmulh v4.4S, v9.4S, v16.4S // .........................*................................................................................................. - // mul v6.4S, v12.4S, v15.4S // ........................................*.................................................................................. - // mul v13.4S, v17.4S, v24.4S // ..............................*............................................................................................ - // sub v18.4S, v10.4S, v28.4S // ...................................*....................................................................................... 
- // mul v23.4S, v9.4S, v27.4S // ...................*....................................................................................................... - // sub v20.4S, v22.4S, v11.4S // .......................................*................................................................................... - // sqrdmulh v3.4S, v17.4S, v30.4S // .....................*..................................................................................................... - // sqrdmulh v1.4S, v18.4S, v0.4S // .........................................*................................................................................. - // sqrdmulh v17.4S, v12.4S, v0.4S // ....................................*...................................................................................... - // sqrdmulh v12.4S, v20.4S, v7.4S // .............................................*............................................................................. - // mul v24.4S, v18.4S, v15.4S // ......................................*.................................................................................... - // mul v25.4S, v20.4S, v31.4S // .................................................*......................................................................... - // mls v23.4S, v4.4S, v8.S[0] // ...............................*........................................................................................... - // mls v13.4S, v3.4S, v8.S[0] // .....................................*..................................................................................... - // add v14.4S, v10.4S, v28.4S // ..................................*........................................................................................ - // mls v6.4S, v17.4S, v8.S[0] // ...............................................*........................................................................... 
- // mls v24.4S, v1.4S, v8.S[0] // ................................................*.......................................................................... - // mls v25.4S, v12.4S, v8.S[0] // .........................................................*................................................................. - // sub v20.4S, v13.4S, v23.4S // ...........................................*............................................................................... - // add v27.4S, v22.4S, v11.4S // ..................................................*........................................................................ - // trn2 v9.4S, v5.4S, v14.4S // .....................................................*..................................................................... - // add v16.4S, v13.4S, v23.4S // ..........................................*................................................................................ - // mul v31.4S, v20.4S, v31.4S // ..............................................................*............................................................ - // sqrdmulh v30.4S, v20.4S, v7.4S // ............................................................*.............................................................. - // trn2 v12.4S, v6.4S, v24.4S // ........................................................*.................................................................. - // trn2 v13.4S, v27.4S, v16.4S // ...........................................................*............................................................... - // trn1 v4.4S, v5.4S, v14.4S // ......................................................*.................................................................... - // trn1 v20.4S, v6.4S, v24.4S // .......................................................*................................................................... 
- // mls v31.4S, v30.4S, v8.S[0] // ....................................................................*...................................................... - // trn2 v1.2D, v9.2D, v12.2D // ................................................................*.......................................................... - // trn1 v22.2D, v4.2D, v20.2D // .............................................................*............................................................. - // trn1 v17.2D, v9.2D, v12.2D // ..................................................................*........................................................ - // trn2 v4.2D, v4.2D, v20.2D // ...............................................................*........................................................... - // trn1 v9.4S, v27.4S, v16.4S // .................................................................*......................................................... - // trn1 v20.4S, v25.4S, v31.4S // ..........................................................................*................................................ - // trn2 v16.4S, v25.4S, v31.4S // .........................................................................*................................................. - // add v10.4S, v22.4S, v17.4S // ......................................................................*.................................................... - // add v31.4S, v4.4S, v1.4S // .....................................................................*..................................................... - // trn2 v12.2D, v9.2D, v20.2D // ................................................................................*.......................................... - // trn2 v30.2D, v13.2D, v16.2D // ...............................................................................*........................................... 
- // trn1 v23.2D, v9.2D, v20.2D // ..............................................................................*............................................ - // trn1 v28.2D, v13.2D, v16.2D // .............................................................................*............................................. - // ldr q24, [x4, #-16] // ............................................*.............................................................................. - // sub v22.4S, v22.4S, v17.4S // ........................................................................................*.................................. - // add v13.4S, v12.4S, v30.4S // ...................................................................................*....................................... - // add v20.4S, v23.4S, v28.4S // ....................................................................................*...................................... - // add v17.4S, v10.4S, v31.4S // ............................................................................*.............................................. - // sub v12.4S, v12.4S, v30.4S // ..........................................................................................*................................ - // sub v4.4S, v4.4S, v1.4S // ...................................................................*....................................................... - // sub v9.4S, v20.4S, v13.4S // ......................................................................................................*.................... - // mul v16.4S, v22.4S, v29.S[2] // ...........................................................................................*............................... - // sqrdmulh v30.4S, v22.4S, v29.S[3] // ..............................................................................................*............................ 
- // srshr v22.4S, v17.4S, #23 // .........................................................................................*................................. - // sqrdmulh v1.4S, v4.4S, v26.S[1] // ........................................................................*.................................................. - // mul v6.4S, v4.4S, v26.S[0] // .......................................................................*................................................... - // add v4.4S, v20.4S, v13.4S // .......................................................................................*................................... - // mul v25.4S, v12.4S, v24.S[0] // ..................................................................................................*........................ - // mls v16.4S, v30.4S, v8.S[0] // ...................................................................................................*....................... - // mls v17.4S, v22.4S, v8.4S // ...............................................................................................*........................... - // srshr v20.4S, v4.4S, #23 // ............................................................................................*.............................. - // sub v22.4S, v23.4S, v28.4S // .................................................................................*......................................... - // sqrdmulh v12.4S, v12.4S, v24.S[1] // ....................................................................................................*...................... - // mls v6.4S, v1.4S, v8.S[0] // .............................................................................................*............................. - // mul v1.4S, v22.4S, v26.S[2] // ......................................................................................*.................................... 
- // sqrdmulh v22.4S, v22.4S, v26.S[3] // .....................................................................................*..................................... - // mls v4.4S, v20.4S, v8.4S // .................................................................................................*......................... - // sub v24.4S, v10.4S, v31.4S // ...........................................................................*............................................... - // add v26.4S, v16.4S, v6.4S // ................................................................................................................*.......... - // mls v25.4S, v12.4S, v8.S[0] // ..........................................................................................................*................ - // sub v30.4S, v16.4S, v6.4S // .........................................................................................................*................. - // mls v1.4S, v22.4S, v8.S[0] // ................................................................................................*.......................... - // sub v20.4S, v17.4S, v4.4S // ........................................................................................................*.................. - // srshr v22.4S, v26.4S, #23 // .....................................................................................................................*..... - // mul v10.4S, v30.4S, v19.S[2] // ...............................................................................................................*........... - // sqrdmulh v16.4S, v30.4S, v19.S[3] // ..............................................................................................................*............ - // sqrdmulh v12.4S, v20.4S, v19.S[1] // ............................................................................................................*.............. 
- // mul v13.4S, v20.4S, v19.S[0] // ...........................................................................................................*............... - // add v20.4S, v17.4S, v4.4S // .......................................................................................................*................... - // sub v4.4S, v1.4S, v25.4S // .................................................................................................................*......... - // mls v10.4S, v16.4S, v8.S[0] // ....................................................................................................................*...... - // add v31.4S, v1.4S, v25.4S // ..................................................................................................................*........ - // mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................................*....... - // mul v1.4S, v4.4S, v29.S[0] // .......................................................................................................................*... - // srshr v12.4S, v31.4S, #23 // ..........................................................................................................................* - // str q20, [x1], #(16*4) // .............................................................................................................*............. - // sqrdmulh v30.4S, v24.4S, v19.S[3] // .....................................................................................................*..................... - // sqrdmulh v20.4S, v4.4S, v29.S[1] // ......................................................................................................................*.... - // str q13, [x2], #(16*4) // ........................................................................................................................*.. 
- // mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. - // mul v4.4S, v24.4S, v19.S[2] // ..................................................................................*........................................ + // Instructions: 133 + // Expected cycles: 63 + // Expected IPC: 2.11 + // + // Wall time: 191.39s + // User time: 191.39s + // + // -------------------------------------------------------- original position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------- + ldr q13, [x4, #48] // ......................................................................*.............................................................. + ldr q0, [x4, #16] // .................................................................*................................................................... + // gap // ..................................................................................................................................... + ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1] // .*................................................................................................................................... + ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2] // ..........*.......................................................................................................................... + // gap // ..................................................................................................................................... + ldr q30, [x4], #64 // ..............................................................*...................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + ldr q27, [x5, #48] // ...*................................................................................................................................. + // gap // ..................................................................................................................................... + ldr q20, [x5, #144] // ...................*................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + ldr q26, [x5, #80] // ....*................................................................................................................................ + ldr q28, [x5, #64] // .....*............................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + ldr q15, [x5, #128] // ....................*................................................................................................................ + ldr q1, [x5, #176] // .......................*............................................................................................................. 
+ ldr q19, [x5, #32] // ......*.............................................................................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + ldr q7, [x5, #96] // ......................*.............................................................................................................. + sub v21.4S, v22.4S, v23.4S // ........*............................................................................................................................ + sub v5.4S, v24.4S, v25.4S // .......*............................................................................................................................. + ldr q16, [x5, #112] // ............................*........................................................................................................ + ldr q14, [x5, #160] // ..*.................................................................................................................................. + add v17.4S, v24.4S, v25.4S // ...........*......................................................................................................................... + sub v18.4S, v11.4S, v12.4S // ...........................*......................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ ldr q31, [x5, #16] // *.................................................................................................................................... + ldr q2, [x5], #(12*16) // .........*........................................................................................................................... + sqrdmulh v3.4S, v21.4S, v27.4S // ................*.................................................................................................................... + sqrdmulh v6.4S, v5.4S, v26.4S // .............*....................................................................................................................... + mul v29.4S, v21.4S, v19.4S // ...............*..................................................................................................................... + mul v24.4S, v5.4S, v28.4S // ..............*...................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v27.4S, v9.4S, v10.4S // ..............................*...................................................................................................... + add v4.4S, v22.4S, v23.4S // ............*........................................................................................................................ 
+ mul v25.4S, v18.4S, v14.4S // ..................................*.................................................................................................. + add v21.4S, v11.4S, v12.4S // ........................................*............................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v24.4S, v6.4S, v8.S[0] // .....................*............................................................................................................... + mls v29.4S, v3.4S, v8.S[0] // ........................*............................................................................................................ + sub v14.4S, v9.4S, v10.4S // ..........................*.......................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v9.4S, v4.4S, v17.4S // .................*................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v1.4S // .................................*................................................................................................... + add v22.4S, v27.4S, v21.4S // ...........................................................*......................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v3.4S, v9.4S, v31.4S // .............................*....................................................................................................... + sub v26.4S, v29.4S, v24.4S // ...............................*..................................................................................................... + sqrdmulh v1.4S, v14.4S, v20.4S // .....................................*............................................................................................... + mul v19.4S, v9.4S, v2.4S // .........................*........................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v9.4S, v26.4S, v2.4S // ....................................*................................................................................................ + sqrdmulh v2.4S, v26.4S, v31.4S // ...................................*................................................................................................. + mls v25.4S, v18.4S, v8.S[0] // .......................................*............................................................................................. + mul v15.4S, v14.4S, v15.4S // ................................*.................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v5.4S, v4.4S, v17.4S // ..................*.................................................................................................................. + mls v19.4S, v3.4S, v8.S[0] // ......................................*.............................................................................................. 
+ add v18.4S, v29.4S, v24.4S // .........................................*........................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v9.4S, v2.4S, v8.S[0] // ..........................................*.......................................................................................... + mls v15.4S, v1.4S, v8.S[0] // ............................................*........................................................................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v3.4S, v27.4S, v21.4S // ...........................................*......................................................................................... + trn2 v27.4S, v5.4S, v18.4S // ..............................................*...................................................................................... + trn1 v21.4S, v5.4S, v18.4S // .............................................*....................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v4.4S, v19.4S, v9.4S // ................................................*.................................................................................... + trn1 v11.4S, v19.4S, v9.4S // ..................................................*.................................................................................. + add v26.4S, v15.4S, v25.4S // ........................................................*............................................................................ + // gap // ..................................................................................................................................... + sqrdmulh v14.4S, v3.4S, v16.4S // ......................................................*.............................................................................. + ldr q24, [x4, #-32] // .......................................................*............................................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v5.2D, v27.2D, v4.2D // ....................................................*................................................................................ + trn1 v1.2D, v21.2D, v11.2D // ............................................................*........................................................................ 
+ trn2 v9.2D, v21.2D, v11.2D // ..........................................................*.......................................................................... + sub v19.4S, v15.4S, v25.4S // .................................................*................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v21.4S, v1.4S, v5.4S // .........................................................................*........................................................... + mul v28.4S, v3.4S, v7.4S // ...............................................*..................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v3.4S, v19.4S, v16.4S // .........................................................*........................................................................... + mul v25.4S, v19.4S, v7.4S // .....................................................*............................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ sqrdmulh v18.4S, v21.4S, v0.S[3] // .............................................................................................*....................................... + mul v2.4S, v21.4S, v0.S[2] // ..........................................................................................*.......................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v28.4S, v14.4S, v8.S[0] // .............................................................*....................................................................... + trn2 v10.2D, v27.2D, v4.2D // ...................................................*................................................................................. + // gap // ..................................................................................................................................... + trn2 v21.4S, v22.4S, v26.4S // .....................................................................*............................................................... + mls v25.4S, v3.4S, v8.S[0] // ...............................................................*..................................................................... + // gap // ..................................................................................................................................... 
+ add v27.4S, v1.4S, v5.4S // ...................................................................*................................................................. + sub v14.4S, v9.4S, v10.4S // ................................................................*.................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn1 v26.4S, v22.4S, v26.4S // ..................................................................*.................................................................. + // gap // ..................................................................................................................................... + add v3.4S, v9.4S, v10.4S // ........................................................................*............................................................ + mul v16.4S, v14.4S, v24.S[0] // ....................................................................*................................................................ + trn2 v22.4S, v28.4S, v25.4S // ..........................................................................*.......................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + trn1 v1.4S, v28.4S, v25.4S // .......................................................................*............................................................. + // gap // ..................................................................................................................................... + sub v4.4S, v27.4S, v3.4S // ...........................................................................*......................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + trn2 v9.2D, v21.2D, v22.2D // ...............................................................................*..................................................... + trn1 v21.2D, v21.2D, v22.2D // ..................................................................................*.................................................. + add v19.4S, v27.4S, v3.4S // .............................................................................*....................................................... + trn1 v27.2D, v26.2D, v1.2D // .................................................................................*................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ trn2 v3.2D, v26.2D, v1.2D // ............................................................................*........................................................ + sqrdmulh v15.4S, v14.4S, v24.S[1] // ..............................................................................*...................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v25.4S, v4.4S, v30.S[2] // ................................................................................*.................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v1.4S, v27.4S, v21.4S // .....................................................................................*............................................... + add v22.4S, v3.4S, v9.4S // .......................................................................................*............................................. + add v21.4S, v27.4S, v21.4S // ......................................................................................*.............................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ sub v26.4S, v3.4S, v9.4S // ...................................................................................*................................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v10.4S, v1.4S, v24.S[2] // ...........................................................................................*......................................... + sqrdmulh v3.4S, v1.4S, v24.S[3] // .........................................................................................*........................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + add v9.4S, v21.4S, v22.4S // ............................................................................................*........................................ + sub v22.4S, v21.4S, v22.4S // ..............................................................................................*...................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + srshr v1.4S, v19.4S, #23 // ...............................................................................................*..................................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v24.4S, v26.4S, v13.S[0] // ........................................................................................*............................................ + srshr v21.4S, v9.4S, #23 // ................................................................................................*.................................... + sqrdmulh v11.4S, v22.4S, v0.S[1] // ............................................................................................................*........................ + sqrdmulh v13.4S, v26.4S, v13.S[1] // .................................................................................................*................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v10.4S, v3.4S, v8.S[0] // ..................................................................................................*.................................. + mls v2.4S, v18.4S, v8.S[0] // ....................................................................................................*................................ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ // gap // ..................................................................................................................................... + mls v19.4S, v1.4S, v8.4S // ......................................................................................................*.............................. + // gap // ..................................................................................................................................... + mls v9.4S, v21.4S, v8.4S // .....................................................................................................*............................... + mls v24.4S, v13.4S, v8.S[0] // .......................................................................................................*............................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mul v14.4S, v22.4S, v0.S[0] // ........................................................................................................*............................ + sqrdmulh v3.4S, v4.4S, v30.S[3] // ....................................................................................*................................................ + mls v16.4S, v15.4S, v8.S[0] // ...................................................................................................*................................. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... 
+ add v21.4S, v19.4S, v9.4S // .............................................................................................................................*....... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sub v4.4S, v19.4S, v9.4S // ...........................................................................................................*......................... + add v15.4S, v10.4S, v24.4S // .............................................................................................................*....................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v14.4S, v11.4S, v8.S[0] // .........................................................................................................................*........... + // gap // ..................................................................................................................................... + sub v29.4S, v2.4S, v16.4S // ..........................................................................................................*.......................... + // gap // ..................................................................................................................................... + mul v28.4S, v4.4S, v30.S[0] // ...............................................................................................................*..................... 
+ // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v9.4S, v4.4S, v30.S[1] // ................................................................................................................*.................... + sub v22.4S, v10.4S, v24.4S // ..................................................................................................................*.................. + mul v19.4S, v29.4S, v30.S[2] // ....................................................................................................................*................ + add v27.4S, v2.4S, v16.4S // .........................................................................................................*........................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + srshr v26.4S, v15.4S, #23 // .................................................................................................................*................... + sqrdmulh v18.4S, v29.4S, v30.S[3] // ...................................................................................................................*................. 
+ srshr v4.4S, v27.4S, #23 // ..............................................................................................................*...................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + sqrdmulh v1.4S, v22.4S, v0.S[1] // .....................................................................................................................*............... + mul v22.4S, v22.4S, v0.S[0] // .......................................................................................................................*............. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v28.4S, v9.4S, v8.S[0] // ..........................................................................................................................*.......... + mls v15.4S, v26.4S, v8.4S // ........................................................................................................................*............ + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v25.4S, v3.4S, v8.S[0] // ............................................................................................................................*........ 
+ mls v27.4S, v4.4S, v8.4S // ......................................................................................................................*.............. + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v19.4S, v18.4S, v8.S[0] // ...........................................................................................................................*......... + // gap // ..................................................................................................................................... + // gap // ..................................................................................................................................... + mls v22.4S, v1.4S, v8.S[0] // ..............................................................................................................................*...... + str q28, [x2], #(16*4) // .................................................................................................................................*... + sub v29.4S, v25.4S, v14.4S // ....................................................................................................................................* + // gap // ..................................................................................................................................... + add v4.4S, v25.4S, v14.4S // ...................................................................................................................................*. 
+ str q21, [x1], #(16*4) // ..................................................................................................................................*.. + sub v18.4S, v27.4S, v15.4S // ................................................................................................................................*.... + add v5.4S, v27.4S, v15.4S // ...............................................................................................................................*..... + // gap // ..................................................................................................................................... + + // ----------------------------------------------------------- new position -----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------- + // ldr q12, [x5, #16] // ...................*................................................................................................................. + // ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // ..*.................................................................................................................................. + // ldr q13, [x5, #160] // ................*.................................................................................................................... + // ldr q31, [x5, #48] // .....*............................................................................................................................... + // ldr q0, [x5, #80] // .......*............................................................................................................................. + // ldr q7, [x5, #64] // ........*............................................................................................................................ 
+ // ldr q16, [x5, #32] // ...........*......................................................................................................................... + // sub v3.4S, v28.4S, v29.4S // ..............*...................................................................................................................... + // sub v30.4S, v26.4S, v27.4S // .............*....................................................................................................................... + // ldr q19, [x5], #(12*16) // ....................*................................................................................................................ + // ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x2] // ...*................................................................................................................................. + // add v18.4S, v28.4S, v29.4S // .................*................................................................................................................... + // add v15.4S, v26.4S, v27.4S // ..........................*.......................................................................................................... + // sqrdmulh v26.4S, v3.4S, v0.4S // ......................*.............................................................................................................. + // mul v20.4S, v3.4S, v7.4S // ........................*............................................................................................................ + // mul v5.4S, v30.4S, v16.4S // .......................*............................................................................................................. + // sqrdmulh v10.4S, v30.4S, v31.4S // .....................*............................................................................................................... 
+ // sub v11.4S, v15.4S, v18.4S // ................................*.................................................................................................... + // add v21.4S, v15.4S, v18.4S // ...........................................*......................................................................................... + // ldr q9, [x5, #-48] // ......*.............................................................................................................................. + // ldr q18, [x5, #-64] // .........*........................................................................................................................... + // mls v20.4S, v26.4S, v8.S[0] // .............................*....................................................................................................... + // ldr q29, [x5, #-96] // ............*........................................................................................................................ + // ldr q31, [x5, #-16] // ..........*.......................................................................................................................... + // mls v5.4S, v10.4S, v8.S[0] // ..............................*...................................................................................................... + // mul v28.4S, v11.4S, v19.4S // ......................................*.............................................................................................. + // sub v6.4S, v22.4S, v23.4S // ...............................*..................................................................................................... + // sub v27.4S, v24.4S, v25.4S // ..................*.................................................................................................................. + // ldr q4, [x5, #-80] // ...............*..................................................................................................................... 
+ // sqrdmulh v10.4S, v11.4S, v12.4S // ...................................*................................................................................................. + // add v14.4S, v22.4S, v23.4S // .........................*........................................................................................................... + // sub v26.4S, v5.4S, v20.4S // ....................................*................................................................................................ + // mul v3.4S, v6.4S, v18.4S // ..........................................*.......................................................................................... + // sqrdmulh v7.4S, v27.4S, v31.4S // .................................*................................................................................................... + // mul v1.4S, v27.4S, v13.4S // ...........................*......................................................................................................... + // sqrdmulh v30.4S, v26.4S, v12.4S // ........................................*............................................................................................ + // mul v15.4S, v26.4S, v19.4S // .......................................*............................................................................................. + // sqrdmulh v17.4S, v6.4S, v9.4S // .....................................*............................................................................................... + // mls v28.4S, v10.4S, v8.S[0] // ............................................*........................................................................................ + // mls v1.4S, v7.4S, v8.S[0] // .........................................*........................................................................................... 
+ // add v16.4S, v24.4S, v25.4S // ............................*........................................................................................................ + // add v5.4S, v5.4S, v20.4S // .............................................*....................................................................................... + // mls v15.4S, v30.4S, v8.S[0] // ..............................................*...................................................................................... + // sub v11.4S, v14.4S, v16.4S // ................................................*.................................................................................... + // mls v3.4S, v17.4S, v8.S[0] // ...............................................*..................................................................................... + // trn1 v18.4S, v21.4S, v5.4S // ..................................................*.................................................................................. + // trn2 v30.4S, v21.4S, v5.4S // .................................................*................................................................................... + // mul v13.4S, v11.4S, v29.4S // .............................................................*....................................................................... + // trn2 v22.4S, v28.4S, v15.4S // ...................................................*................................................................................. + // sub v0.4S, v3.4S, v1.4S // ...........................................................*......................................................................... + // trn1 v31.4S, v28.4S, v15.4S // ....................................................*................................................................................ 
+ // trn2 v20.2D, v30.2D, v22.2D // ...................................................................*................................................................. + // trn1 v25.2D, v30.2D, v22.2D // ........................................................*............................................................................ + // mul v6.4S, v0.4S, v29.4S // ...............................................................*..................................................................... + // sqrdmulh v24.4S, v11.4S, v4.4S // ......................................................*.............................................................................. + // ldr q12, [x4, #32] // .......................................................*............................................................................. + // add v10.4S, v3.4S, v1.4S // .....................................................*............................................................................... + // sqrdmulh v28.4S, v0.4S, v4.4S // ..............................................................*...................................................................... + // trn2 v19.2D, v18.2D, v31.2D // ..........................................................*.......................................................................... + // add v21.4S, v14.4S, v16.4S // ..................................*.................................................................................................. + // trn1 v0.2D, v18.2D, v31.2D // .........................................................*........................................................................... + // mls v13.4S, v24.4S, v8.S[0] // ..................................................................*.................................................................. 
+ // ldr q30, [x4], #64 // ....*................................................................................................................................ + // mls v6.4S, v28.4S, v8.S[0] // .....................................................................*............................................................... + // sub v29.4S, v19.4S, v20.4S // .......................................................................*............................................................. + // ldr q2, [x4, #-48] // .*................................................................................................................................... + // trn1 v7.4S, v21.4S, v10.4S // ........................................................................*............................................................ + // add v11.4S, v0.4S, v25.4S // ......................................................................*.............................................................. + // mul v23.4S, v29.4S, v12.S[0] // ..........................................................................*.......................................................... + // trn2 v17.4S, v21.4S, v10.4S // ....................................................................*................................................................ + // ldr q16, [x4, #-16] // *.................................................................................................................................... + // trn1 v27.4S, v13.4S, v6.4S // ............................................................................*........................................................ + // add v9.4S, v19.4S, v20.4S // .........................................................................*........................................................... 
+ // sub v14.4S, v0.4S, v25.4S // ............................................................*........................................................................ + // trn2 v19.4S, v13.4S, v6.4S // ...........................................................................*......................................................... + // sub v0.4S, v11.4S, v9.4S // .............................................................................*....................................................... + // trn2 v20.2D, v7.2D, v27.2D // ..................................................................................*.................................................. + // add v21.4S, v11.4S, v9.4S // ................................................................................*.................................................... + // sqrdmulh v6.4S, v29.4S, v12.S[1] // ...................................................................................*................................................. + // trn2 v18.2D, v17.2D, v19.2D // ..............................................................................*...................................................... + // mul v11.4S, v0.4S, v30.S[2] // ....................................................................................*................................................ + // trn1 v29.2D, v7.2D, v27.2D // .................................................................................*................................................... + // trn1 v26.2D, v17.2D, v19.2D // ...............................................................................*..................................................... + // sub v7.4S, v20.4S, v18.4S // ........................................................................................*............................................ 
+ // sqrdmulh v5.4S, v0.4S, v30.S[3] // ........................................................................................................*............................ + // sub v0.4S, v29.4S, v26.4S // .....................................................................................*............................................... + // add v17.4S, v29.4S, v26.4S // .......................................................................................*............................................. + // add v10.4S, v20.4S, v18.4S // ......................................................................................*.............................................. + // mul v25.4S, v7.4S, v16.S[0] // ..............................................................................................*...................................... + // sqrdmulh v22.4S, v0.4S, v12.S[3] // ..........................................................................................*.......................................... + // mul v24.4S, v14.4S, v2.S[2] // .................................................................*................................................................... + // mul v15.4S, v0.4S, v12.S[2] // .........................................................................................*........................................... + // add v3.4S, v17.4S, v10.4S // ...........................................................................................*......................................... + // sqrdmulh v29.4S, v14.4S, v2.S[3] // ................................................................*.................................................................... + // sub v0.4S, v17.4S, v10.4S // ............................................................................................*........................................ 
+ // srshr v31.4S, v21.4S, #23 // .............................................................................................*....................................... + // srshr v20.4S, v3.4S, #23 // ...............................................................................................*..................................... + // sqrdmulh v18.4S, v7.4S, v16.S[1] // .................................................................................................*................................... + // mls v15.4S, v22.4S, v8.S[0] // ..................................................................................................*.................................. + // mls v23.4S, v6.4S, v8.S[0] // .........................................................................................................*........................... + // mls v24.4S, v29.4S, v8.S[0] // ...................................................................................................*................................. + // mls v3.4S, v20.4S, v8.4S // .....................................................................................................*............................... + // mls v21.4S, v31.4S, v8.4S // ....................................................................................................*................................ + // mls v25.4S, v18.4S, v8.S[0] // ......................................................................................................*.............................. + // mul v17.4S, v0.4S, v2.S[0] // .......................................................................................................*............................. + // add v29.4S, v24.4S, v23.4S // ...................................................................................................................*................. 
+ // sub v14.4S, v24.4S, v23.4S // ..............................................................................................................*...................... + // sub v12.4S, v21.4S, v3.4S // ...........................................................................................................*......................... + // sqrdmulh v13.4S, v0.4S, v2.S[1] // ................................................................................................*.................................... + // add v20.4S, v15.4S, v25.4S // ............................................................................................................*........................ + // srshr v7.4S, v29.4S, #23 // ......................................................................................................................*.............. + // mul v28.4S, v12.4S, v30.S[0] // ...............................................................................................................*..................... + // sqrdmulh v6.4S, v12.4S, v30.S[1] // ................................................................................................................*.................... + // srshr v24.4S, v20.4S, #23 // ....................................................................................................................*................ + // sub v4.4S, v15.4S, v25.4S // .................................................................................................................*................... + // sqrdmulh v10.4S, v14.4S, v30.S[3] // .....................................................................................................................*............... + // mul v19.4S, v14.4S, v30.S[2] // ..................................................................................................................*.................. 
+ // sqrdmulh v0.4S, v4.4S, v2.S[1] // .......................................................................................................................*............. + // mls v29.4S, v7.4S, v8.4S // ............................................................................................................................*........ + // mul v22.4S, v4.4S, v2.S[0] // ........................................................................................................................*............ + // mls v20.4S, v24.4S, v8.4S // ..........................................................................................................................*.......... + // mls v17.4S, v13.4S, v8.S[0] // .............................................................................................................*....................... + // mls v28.4S, v6.4S, v8.S[0] // .........................................................................................................................*........... + // mls v19.4S, v10.4S, v8.S[0] // .............................................................................................................................*....... + // mls v11.4S, v5.4S, v8.S[0] // ...........................................................................................................................*......... + // add v27.4S, v21.4S, v3.4S // ..........................................................................................................*.......................... + // mls v22.4S, v0.4S, v8.S[0] // ..............................................................................................................................*...... 
+ // add v5.4S, v29.4S, v20.4S // ....................................................................................................................................* + // sub v18.4S, v29.4S, v20.4S // ...................................................................................................................................*. + // str q28, [x2], #(16*4) // ...............................................................................................................................*..... + // str q27, [x1], #(16*4) // ..................................................................................................................................*.. + // add v4.4S, v11.4S, v17.4S // .................................................................................................................................*... + // sub v29.4S, v11.4S, v17.4S // ................................................................................................................................*.... sub count, count, #1 layer45678_start: - sqrdmulh v13.4S, v9.4S, v29.S[1] // ...........................................................................................................*............................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v26.4S, v22.4S, v8.4S // .....................................................................................................................*.................................. - mls v31.4S, v12.4S, v8.4S // .........................................................................................................................*.............................. 
- ldr q17, [x5, #80] // .......e................................................................................................................................................ - ldr q16, [x5, #176] // .................................e...................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v1.4S, v20.4S, v8.S[0] // .................................................................................................................*...................................... - // gap // ........................................................................................................................................................ - ldr q15, [x5], #(12*16) // ..e..................................................................................................................................................... - mls v7.4S, v13.4S, v8.S[0] // ............................................................................................................*........................................... - mls v4.4S, v30.4S, v8.S[0] // ..................................................................................................*..................................................... 
- sub v30.4S, v26.4S, v31.4S // ...............................................................................................................................*........................ - add v22.4S, v26.4S, v31.4S // ................................................................................................................................*....................... - ldr q24, [x5, #-64] // ..............................e......................................................................................................................... - ldr q28, [x5, #-128] // ......e................................................................................................................................................. - add v12.4S, v10.4S, v1.4S // ..........................................................................................................................................*............. - // gap // ........................................................................................................................................................ - ldr q18, [x5, #-144] // .....e.................................................................................................................................................. - sub v1.4S, v10.4S, v1.4S // .........................................................................................................................................*.............. - ldr q25, [x5, #-160] // ....e................................................................................................................................................... - add v9.4S, v4.4S, v7.4S // .....................................................................................................................................*.................. - sub v4.4S, v4.4S, v7.4S // ....................................................................................................................................*................... 
- str q22, [x1, #-48] // ...............................................................................................................................................*........ - str q12, [x1, #-16] // .................................................................................................................................................*...... - ldr q7, [x5, #-80] // .............................e.......................................................................................................................... - mul v20.4S, v1.4S, v19.S[0] // ...........................................................................................................................................*............ - sqrdmulh v12.4S, v1.4S, v19.S[1] // ............................................................................................................................................*........... - sqrdmulh v31.4S, v4.4S, v19.S[1] // .......................................................................................................................................*................ - mul v13.4S, v4.4S, v19.S[0] // ......................................................................................................................................*................. - str q9, [x1, #-32] // ................................................................................................................................................*....... - add x1, x1, #64 // ......................................................................................................................................................*. - ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................................................................................................... 
- sqrdmulh v22.4S, v30.4S, v19.S[1] // ..................................................................................................................................*..................... - mul v1.4S, v30.4S, v19.S[0] // .................................................................................................................................*...................... - ldr q0, [x5, #-176] // ...e.................................................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - ldr q30, [x5, #-48] // ...............................e........................................................................................................................ - mls v20.4S, v12.4S, v8.S[0] // .............................................................................................................................................*.......... - // gap // ........................................................................................................................................................ - ldr q27, [x5, #-32] // ................................e....................................................................................................................... - // gap // ........................................................................................................................................................ - mls v13.4S, v31.4S, v8.S[0] // ........................................................................................................................................*............... 
- mls v1.4S, v22.4S, v8.S[0] // ...................................................................................................................................*.................... - // gap // ........................................................................................................................................................ - ldr q29, [x4, #16] // .......................................................................e................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - str q20, [x2, #-16] // .....................................................................................................................................................*.. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- str q13, [x2, #-32] // ....................................................................................................................................................*... - sub v22.4S, v3.4S, v4.4S // ........e............................................................................................................................................... - str q1, [x2, #-48] // ...................................................................................................................................................*.... - add x2, x2, #64 // .......................................................................................................................................................* - sub v12.4S, v5.4S, v6.4S // .............e.......................................................................................................................................... - add v11.4S, v5.4S, v6.4S // ..............e......................................................................................................................................... - add v20.4S, v3.4S, v4.4S // .........e.............................................................................................................................................. - // gap // ........................................................................................................................................................ - ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e...................................................................................................................................................... - mul v10.4S, v22.4S, v25.4S // ..........e............................................................................................................................................. - sqrdmulh v18.4S, v22.4S, v18.4S // ...........e............................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v26.4S, v12.4S, v17.4S // ................e....................................................................................................................................... - mul v28.4S, v12.4S, v28.4S // ...............e........................................................................................................................................ - // gap // ........................................................................................................................................................ - ldr q31, [x5, #-96] // ............................e........................................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- add v5.4S, v20.4S, v11.4S // ...................e.................................................................................................................................... - sub v12.4S, v20.4S, v11.4S // ..................e..................................................................................................................................... - // gap // ........................................................................................................................................................ - mls v10.4S, v18.4S, v8.S[0] // ............e........................................................................................................................................... - // gap // ........................................................................................................................................................ - mls v28.4S, v26.4S, v8.S[0] // .................e...................................................................................................................................... - ldr q19, [x4], #64 // ......................................................................e................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v9.4S, v3.4S, v4.4S // .......................................e................................................................................................................ - sub v17.4S, v1.4S, v2.4S // ..................................e..................................................................................................................... 
- // gap // ........................................................................................................................................................ - ldr q26, [x4, #-32] // ........................................................................e............................................................................... - add v11.4S, v3.4S, v4.4S // ........................................e............................................................................................................... - add v22.4S, v1.4S, v2.4S // ...................................e.................................................................................................................... - // gap // ........................................................................................................................................................ - sqrdmulh v4.4S, v9.4S, v16.4S // ..........................................e............................................................................................................. - mul v6.4S, v12.4S, v15.4S // ....................e................................................................................................................................... - // gap // ........................................................................................................................................................ - mul v13.4S, v17.4S, v24.4S // ....................................e................................................................................................................... - sub v18.4S, v10.4S, v28.4S // .......................e................................................................................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - mul v23.4S, v9.4S, v27.4S // .........................................e.............................................................................................................. - // gap // ........................................................................................................................................................ - sub v20.4S, v22.4S, v11.4S // ............................................e........................................................................................................... - // gap // ........................................................................................................................................................ - sqrdmulh v3.4S, v17.4S, v30.4S // .....................................e.................................................................................................................. - sqrdmulh v1.4S, v18.4S, v0.4S // ..........................e............................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v17.4S, v12.4S, v0.4S // .....................e.................................................................................................................................. 
- sqrdmulh v12.4S, v20.4S, v7.4S // ...............................................e........................................................................................................ - // gap // ........................................................................................................................................................ - mul v24.4S, v18.4S, v15.4S // .........................e.............................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v25.4S, v20.4S, v31.4S // ..............................................e......................................................................................................... - mls v23.4S, v4.4S, v8.S[0] // ...........................................e............................................................................................................ - mls v13.4S, v3.4S, v8.S[0] // ......................................e................................................................................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v14.4S, v10.4S, v28.4S // ........................e............................................................................................................................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v6.4S, v17.4S, v8.S[0] // ......................e................................................................................................................................. - mls v24.4S, v1.4S, v8.S[0] // ...........................e............................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v25.4S, v12.4S, v8.S[0] // ................................................e....................................................................................................... - sub v20.4S, v13.4S, v23.4S // .................................................e...................................................................................................... - add v27.4S, v22.4S, v11.4S // .............................................e.......................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ 
- trn2 v9.4S, v5.4S, v14.4S // .......................................................e................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v16.4S, v13.4S, v23.4S // ..................................................e..................................................................................................... - mul v31.4S, v20.4S, v31.4S // ...................................................e.................................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v20.4S, v7.4S // ....................................................e................................................................................................... - // gap // ........................................................................................................................................................ - trn2 v12.4S, v6.4S, v24.4S // .........................................................e.............................................................................................. - // gap // ........................................................................................................................................................ 
- trn2 v13.4S, v27.4S, v16.4S // ...............................................................e........................................................................................ - trn1 v4.4S, v5.4S, v14.4S // ......................................................e................................................................................................. - trn1 v20.4S, v6.4S, v24.4S // ........................................................e............................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v31.4S, v30.4S, v8.S[0] // .....................................................e.................................................................................................. - trn2 v1.2D, v9.2D, v12.2D // ...........................................................e............................................................................................ - trn1 v22.2D, v4.2D, v20.2D // ............................................................e........................................................................................... - trn1 v17.2D, v9.2D, v12.2D // .............................................................e.......................................................................................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v4.2D, v4.2D, v20.2D // ..........................................................e............................................................................................. - trn1 v9.4S, v27.4S, v16.4S // ..............................................................e......................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn1 v20.4S, v25.4S, v31.4S // ................................................................e....................................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v16.4S, v25.4S, v31.4S // .................................................................e...................................................................................... - add v10.4S, v22.4S, v17.4S // ...........................................................................e............................................................................ 
- add v31.4S, v4.4S, v1.4S // ................................................................................e....................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - trn2 v12.2D, v9.2D, v20.2D // ..................................................................e..................................................................................... - trn2 v30.2D, v13.2D, v16.2D // ...................................................................e.................................................................................... - // gap // ........................................................................................................................................................ - trn1 v23.2D, v9.2D, v20.2D // ....................................................................e................................................................................... - trn1 v28.2D, v13.2D, v16.2D // .....................................................................e.................................................................................. - // gap // ........................................................................................................................................................ 
- ldr q24, [x4, #-16] // .........................................................................e.............................................................................. - // gap // ........................................................................................................................................................ - sub v22.4S, v22.4S, v17.4S // ..........................................................................e............................................................................. - add v13.4S, v12.4S, v30.4S // ..........................................................................................e............................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v20.4S, v23.4S, v28.4S // .....................................................................................e.................................................................. - add v17.4S, v10.4S, v31.4S // ...............................................................................................e........................................................ - sub v12.4S, v12.4S, v30.4S // .........................................................................................e.............................................................. - sub v4.4S, v4.4S, v1.4S // ...............................................................................e........................................................................ - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v9.4S, v20.4S, v13.4S // ........................................................................................................e............................................... - mul v16.4S, v22.4S, v29.S[2] // ............................................................................e........................................................................... - // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v22.4S, v29.S[3] // .............................................................................e.......................................................................... - srshr v22.4S, v17.4S, #23 // ..................................................................................................................e..................................... - // gap // ........................................................................................................................................................ - sqrdmulh v1.4S, v4.4S, v26.S[1] // ..................................................................................e..................................................................... - mul v6.4S, v4.4S, v26.S[0] // .................................................................................e...................................................................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v4.4S, v20.4S, v13.4S // .........................................................................................................e.............................................. - mul v25.4S, v12.4S, v24.S[0] // ...........................................................................................e............................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v16.4S, v30.4S, v8.S[0] // ..............................................................................e......................................................................... - // gap // ........................................................................................................................................................ - mls v17.4S, v22.4S, v8.4S // ...................................................................................................................e.................................... - // gap // ........................................................................................................................................................ - srshr v20.4S, v4.4S, #23 // ......................................................................................................................e................................. 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v22.4S, v23.4S, v28.4S // ....................................................................................e................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v12.4S, v12.4S, v24.S[1] // ............................................................................................e........................................................... - mls v6.4S, v1.4S, v8.S[0] // ...................................................................................e.................................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v1.4S, v22.4S, v26.S[2] // ......................................................................................e................................................................. - sqrdmulh v22.4S, v22.4S, v26.S[3] // .......................................................................................e................................................................ 
- mls v4.4S, v20.4S, v8.4S // .......................................................................................................................e................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v24.4S, v10.4S, v31.4S // ..............................................................................................e......................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v26.4S, v16.4S, v6.4S // ....................................................................................................e................................................... - mls v25.4S, v12.4S, v8.S[0] // .............................................................................................e.......................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sub v30.4S, v16.4S, v6.4S // ...................................................................................................e.................................................... 
- mls v1.4S, v22.4S, v8.S[0] // ........................................................................................e............................................................... - sub v20.4S, v17.4S, v4.4S // ..........................................................................................................................e............................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - srshr v22.4S, v26.4S, #23 // ....................................................................................................................e................................... - mul v10.4S, v30.4S, v19.S[2] // .....................................................................................................e.................................................. - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v16.4S, v30.4S, v19.S[3] // ......................................................................................................e................................................. - sqrdmulh v12.4S, v20.4S, v19.S[1] // .............................................................................................................................e.......................... - // gap // ........................................................................................................................................................ 
- // gap // ........................................................................................................................................................ - mul v13.4S, v20.4S, v19.S[0] // ............................................................................................................................e........................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - add v20.4S, v17.4S, v4.4S // ...........................................................................................................................e............................ - sub v4.4S, v1.4S, v25.4S // .............................................................................................................e.......................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mls v10.4S, v16.4S, v8.S[0] // .......................................................................................................e................................................ - add v31.4S, v1.4S, v25.4S // ..............................................................................................................e......................................... - mls v13.4S, v12.4S, v8.S[0] // ..............................................................................................................................e......................... 
- // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - mul v1.4S, v4.4S, v29.S[0] // ...............................................................................................................e........................................ - srshr v12.4S, v31.4S, #23 // ........................................................................................................................e............................... - str q20, [x1], #(16*4) // ..............................................................................................................................................e......... - // gap // ........................................................................................................................................................ - sqrdmulh v30.4S, v24.4S, v19.S[3] // .................................................................................................e...................................................... - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - // gap // ........................................................................................................................................................ - sqrdmulh v20.4S, v4.4S, v29.S[1] // ................................................................................................................e....................................... 
- // gap // ........................................................................................................................................................ - str q13, [x2], #(16*4) // ..................................................................................................................................................e..... - mul v7.4S, v9.4S, v29.S[0] // ..........................................................................................................e............................................. - mul v4.4S, v24.4S, v19.S[2] // ................................................................................................e....................................................... - - // original source code - // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // .........................e...........................................................................................................................|...........................e.............. - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ...........................................e.........................................................................................................|.......................................... - // ldr q0, [x5], #(12*16) // ...e.................................................................................................................................................|.....e.................................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ............................e........................................................................................................................|..............................e........... - // ldr q1, [x5, #(-12*16 + 2*16)] // .............e.......................................................................................................................................|...............e.......................... 
- // ldr q5, [x5, #(-12*16 + 3*16)] // ...........e.........................................................................................................................................|.............e............................ - // ldr q2, [x5, #(-12*16 + 4*16)] // .........e...........................................................................................................................................|...........e.............................. - // ldr q6, [x5, #(-12*16 + 5*16)] // e....................................................................................................................................................|..e....................................... - // sub v24.4s, v9.4s, v10.4s // .....................................e...............................................................................................................|.......................................e.. - // add v9.4s, v9.4s, v10.4s // ..........................................e..........................................................................................................|.......................................... - // mul v10.4s, v24.4s, v1.4s // ............................................e........................................................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e.......................................................................................................|.......................................... - // mls v10.4s, v24.4s, v8.s[0] // ...................................................e.................................................................................................|.......................................... 
- // sub v24.4s, v11.4s, v12.4s // ........................................e............................................................................................................|.......................................... - // add v11.4s, v11.4s, v12.4s // .........................................e...........................................................................................................|.......................................... - // mul v12.4s, v24.4s, v2.4s // ...............................................e.....................................................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................e......................................................................................................|.......................................... - // mls v12.4s, v24.4s, v8.s[0] // ....................................................e................................................................................................|.......................................... - // sub v24.4s, v9.4s, v11.4s // ..................................................e..................................................................................................|.......................................... - // add v9.4s, v9.4s, v11.4s // .................................................e...................................................................................................|.......................................... - // mul v11.4s, v24.4s, v0.4s // ............................................................e........................................................................................|.......................................... 
- // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................................e.................................................................................|.......................................... - // mls v11.4s, v24.4s, v8.s[0] // ..........................................................................e..........................................................................|.......................................... - // sub v24.4s, v10.4s, v12.4s // ..............................................................e......................................................................................|.......................................... - // add v10.4s, v10.4s, v12.4s // .........................................................................e...........................................................................|.......................................... - // mul v12.4s, v24.4s, v0.4s // .....................................................................e...............................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................e..................................................................................|.......................................... - // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e.........................................................................|.......................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ................................................e....................................................................................................|.......................................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // ..................e..................................................................................................................................|....................e..................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ........e............................................................................................................................................|..........e............................... - // ldr q5, [x5, #(-12*16 + 9*16)] // .............................e.......................................................................................................................|...............................e.......... - // ldr q2, [x5, #(-12*16 + 10*16)] // ...............................e.....................................................................................................................|.................................e........ - // ldr q6, [x5, #(-12*16 + 11*16)] // .e...................................................................................................................................................|...e...................................... - // sub v24.4s, v13.4s, v14.4s // .......................................................e.............................................................................................|.......................................... - // add v13.4s, v13.4s, v14.4s // ..........................................................e..........................................................................................|.......................................... - // mul v14.4s, v24.4s, v1.4s // .............................................................e.......................................................................................|.......................................... 
- // sqrdmulh v24.4s, v24.4s, v5.4s // .................................................................e...................................................................................|.......................................... - // mls v14.4s, v24.4s, v8.s[0] // ........................................................................e............................................................................|.......................................... - // sub v24.4s, v15.4s, v16.4s // ......................................................e..............................................................................................|.......................................... - // add v15.4s, v15.4s, v16.4s // .........................................................e...........................................................................................|.......................................... - // mul v16.4s, v24.4s, v2.4s // ...............................................................e.....................................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e.........................................................................................|.......................................... - // mls v16.4s, v24.4s, v8.s[0] // .......................................................................e.............................................................................|.......................................... - // sub v24.4s, v13.4s, v15.4s // ................................................................e....................................................................................|.......................................... 
- // add v13.4s, v13.4s, v15.4s // ..............................................................................e......................................................................|.......................................... - // mul v15.4s, v24.4s, v0.4s // ......................................................................e..............................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e................................................................................|.......................................... - // mls v15.4s, v24.4s, v8.s[0] // ............................................................................e........................................................................|.......................................... - // sub v24.4s, v14.4s, v16.4s // .............................................................................e.......................................................................|.......................................... - // add v14.4s, v14.4s, v16.4s // ................................................................................e....................................................................|.......................................... - // mul v16.4s, v24.4s, v0.4s // .................................................................................e...................................................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................................e..................................................................|.......................................... 
- // mls v16.4s, v24.4s, v8.s[0] // .......................................................................................e.............................................................|.......................................... - // trn1 v25.4s, v9.4s, v10.4s // .....................................................................................e...............................................................|.......................................... - // trn2 v26.4s, v9.4s, v10.4s // ...............................................................................e.....................................................................|.......................................... - // trn1 v27.4s, v11.4s, v12.4s // ......................................................................................e..............................................................|.......................................... - // trn2 v28.4s, v11.4s, v12.4s // ...................................................................................e.................................................................|.......................................... - // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................................e.........................................................|.......................................... - // trn2 v12.2d, v26.2d, v28.2d // ........................................................................................e............................................................|.......................................... - // trn1 v9.2d, v25.2d, v27.2d // .........................................................................................e...........................................................|.......................................... 
- // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................................e..........................................................|.......................................... - // trn1 v25.4s, v13.4s, v14.4s // ............................................................................................e........................................................|.......................................... - // trn2 v26.4s, v13.4s, v14.4s // ....................................................................................e................................................................|.......................................... - // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................e.......................................................|.......................................... - // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................e......................................................|.......................................... - // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................e...................................................|.......................................... - // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................................e..................................................|.......................................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................e.................................................|.......................................... 
- // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................................e................................................|.......................................... - // ldr q0, [x4], #64 // .....................................................e...............................................................................................|.......................................... - // ldr q1, [x4, #(-64 + 16)] // ..................................e..................................................................................................................|....................................e..... - // ldr q2, [x4, #(-64 + 32)] // ........................................................e............................................................................................|.......................................... - // ldr q3, [x4, #(-64 + 48)] // .....................................................................................................e...............................................|.......................................... - // sub v24.4s, v9.4s, v10.4s // ......................................................................................................e..............................................|.......................................... - // add v9.4s, v9.4s, v10.4s // ...............................................................................................e.....................................................|.......................................... - // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................................e.......................................|.......................................... 
- // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................................................................................e......................................|.......................................... - // mls v10.4s, v24.4s, v8.s[0] // ....................................................................................................................e................................|.......................................... - // sub v24.4s, v11.4s, v12.4s // ...........................................................................................................e.........................................|.......................................... - // add v11.4s, v11.4s, v12.4s // ................................................................................................e....................................................|.......................................... - // mul v12.4s, v24.4s, v2.s[0] // .................................................................................................................e...................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................e....................................|.......................................... - // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................e...........................|.......................................... - // sub v24.4s, v13.4s, v14.4s // .......................................................................................................................e.............................|.......................................... 
- // add v13.4s, v13.4s, v14.4s // ........................................................................................................e............................................|.......................................... - // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................................................e..........................|.......................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................e.........................|.......................................... - // mls v14.4s, v24.4s, v8.s[0] // .................................................................................................................................e...................|.......................................... - // sub v24.4s, v15.4s, v16.4s // ..........................................................................................................e..........................................|.......................................... - // add v15.4s, v15.4s, v16.4s // .......................................................................................................e.............................................|.......................................... - // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................................................e.................................|.......................................... - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................e............................|.......................................... 
- // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................................e.....................|.......................................... - // sub v24.4s, v9.4s, v11.4s // .............................................................................................................................e.......................|.......................................... - // add v9.4s, v9.4s, v11.4s // .........................................................................................................e...........................................|.......................................... - // mul v11.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e|.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................................................................e....|.......................................... - // mls v11.4s, v24.4s, v8.s[0] // .....*...............................................................................................................................................|.......*.................................. - // sub v24.4s, v10.4s, v12.4s // ................................................................................................................................e....................|.......................................... - // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................e......................|.......................................... 
- // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e................|.......................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................e...............|.......................................... - // mls v12.4s, v24.4s, v8.s[0] // ..........................................................................................................................................e..........|.......................................... - // sub v24.4s, v13.4s, v15.4s // ............................................................................................................e........................................|.......................................... - // add v13.4s, v13.4s, v15.4s // ..................................................................................................................e..................................|.......................................... - // mul v15.4s, v24.4s, v1.s[0] // ...................................................................................................................................................e.|.......................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................................................................................................................*.......................................... - // mls v15.4s, v24.4s, v8.s[0] // ....*................................................................................................................................................|......*................................... 
- // sub v24.4s, v14.4s, v16.4s // .........................................................................................................................................e...........|.......................................... - // add v14.4s, v14.4s, v16.4s // ...........................................................................................................................................e.........|.......................................... - // mul v16.4s, v24.4s, v1.s[0] // .............................................................................................................................................e.......|.......................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................e...|.......................................... - // mls v16.4s, v24.4s, v8.s[0] // ..*..................................................................................................................................................|....*..................................... - // srshr v24.4S, v9.4S, #23 // ...............................................................................................................e.....................................|.......................................... - // mls v9.4s, v24.4s, v8.4s // .....................................................................................................................e...............................|.......................................... - // srshr v24.4S, v10.4S, #23 // ...................................................................................................................................e.................|.......................................... 
- // mls v10.4s, v24.4s, v8.4s // .....................................................................................................................................................|*......................................... - // srshr v24.4S, v13.4S, #23 // ......................................................................................................................e..............................|.......................................... - // mls v13.4s, v24.4s, v8.4s // ............................................................................................................................e........................|.......................................... - // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e......|.......................................... - // mls v14.4s, v24.4s, v8.4s // .....................................................................................................................................................|.*........................................ - // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e..................|.......................................... - // add v9.4s, v9.4s, v13.4s // ........................................................................................................................................e............|.......................................... - // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................................................................e.............|.......................................... 
- // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e..............|.......................................... - // mls v13.4s, v24.4s, v8.s[0] // ............................................................................................................................................e........|.......................................... - // sub v24.4s, v10.4s, v14.4s // ......*..............................................................................................................................................|........*................................. - // add v10.4s, v10.4s, v14.4s // .......*.............................................................................................................................................|.........*................................ - // mul v14.4s, v24.4s, v0.s[0] // ...........................*.........................................................................................................................|.............................*............ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................*..........................................................................................................................|............................*............. - // mls v14.4s, v24.4s, v8.s[0] // .................................*...................................................................................................................|...................................*...... - // sub v24.4s, v11.4s, v15.4s // ...............*.....................................................................................................................................|.................*........................ 
- // add v11.4s, v11.4s, v15.4s // ..............*......................................................................................................................................|................*......................... - // mul v15.4s, v24.4s, v0.s[0] // ......................*..............................................................................................................................|........................*................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................*...............................................................................................................................|.......................*.................. - // mls v15.4s, v24.4s, v8.s[0] // ................................*....................................................................................................................|..................................*....... - // sub v24.4s, v12.4s, v16.4s // ............*........................................................................................................................................|..............*........................... - // add v12.4s, v12.4s, v16.4s // ..........*..........................................................................................................................................|............*............................. - // mul v16.4s, v24.4s, v0.s[0] // ...................*.................................................................................................................................|.....................*.................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................*................................................................................................................................|......................*................... 
- // mls v16.4s, v24.4s, v8.s[0] // ..............................*......................................................................................................................|................................*......... - // str q9, [x1], #(16*4) // ...............................................................................................................................................e.....|.......................................... - // str q10, [x1, #(-16*4 + 1*16)] // ................*....................................................................................................................................|..................*....................... - // str q11, [x1, #(-16*4 + 2*16)] // .......................*.............................................................................................................................|.........................*................ - // str q12, [x1, #(-16*4 + 3*16)] // .................*...................................................................................................................................|...................*...................... - // str q13, [x2], #(16*4) // ..................................................................................................................................................e..|.......................................... - // str q14, [x2, #(-16*4 + 1*16)] // ......................................*..............................................................................................................|........................................*. - // str q15, [x2, #(-16*4 + 2*16)] // ....................................*................................................................................................................|......................................*... 
- // str q16, [x2, #(-16*4 + 3*16)] // ...................................*.................................................................................................................|.....................................*.... - // add x1, x1, #64 // ........................*............................................................................................................................|..........................*............... - // add x2, x2, #64 // .......................................*.............................................................................................................|.........................................* + // Instructions: 152 + // Expected cycles: 66 + // Expected IPC: 2.30 + // + // Wall time: 1850.37s + // User time: 1850.37s + // + // ------------------------------------------------------------------ original position ------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + sub v0.4S, v19.4S, v22.4S // .........................................................................................................................................*.............. + ldr q12, [x5, #16] // ...e.................................................................................................................................................... + add v13.4S, v19.4S, v22.4S // ..........................................................................................................................................*............. + str q5, [x1, #-48] // ...............................................................................................................................................*........ 
+ str q4, [x1, #-32] // ................................................................................................................................................*....... + // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v18.4S, v30.S[1] // .................................................................................................................................*...................... + mul v21.4S, v18.4S, v30.S[0] // ..................................................................................................................................*..................... + str q13, [x1, #-16] // .................................................................................................................................................*...... + add x1, x1, #64 // ......................................................................................................................................................*. + mul v1.4S, v29.4S, v30.S[0] // .......................................................................................................................................*................ + sqrdmulh v5.4S, v29.4S, v30.S[1] // ......................................................................................................................................*................. + ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1] // e....................................................................................................................................................... + ldr q13, [x5, #160] // ................................e....................................................................................................................... 
+ sqrdmulh v22.4S, v0.4S, v30.S[1] // ...........................................................................................................................................*............ + mul v6.4S, v0.4S, v30.S[0] // ............................................................................................................................................*........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q31, [x5, #48] // .....e.................................................................................................................................................. + mls v21.4S, v23.4S, v8.S[0] // ...................................................................................................................................*.................... + ldr q0, [x5, #80] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v5.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ 
+ ldr q7, [x5, #64] // ......e................................................................................................................................................. + // gap // ........................................................................................................................................................ + mls v6.4S, v22.4S, v8.S[0] // .............................................................................................................................................*.......... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q21, [x2, #-48] // ...................................................................................................................................................*.... + ldr q16, [x5, #32] // ....e................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q1, [x2, #-32] // ....................................................................................................................................................*... 
+ sub v3.4S, v28.4S, v29.4S // .............e.......................................................................................................................................... + sub v30.4S, v26.4S, v27.4S // ........e............................................................................................................................................... + str q6, [x2, #-16] // .....................................................................................................................................................*.. + add x2, x2, #64 // .......................................................................................................................................................* + ldr q19, [x5], #(12*16) // ..e..................................................................................................................................................... + ld4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x2] // .e...................................................................................................................................................... + add v18.4S, v28.4S, v29.4S // ..............e......................................................................................................................................... + add v15.4S, v26.4S, v27.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v26.4S, v3.4S, v0.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v20.4S, v3.4S, v7.4S // ................e....................................................................................................................................... + mul v5.4S, v30.4S, v16.4S // ...........e............................................................................................................................................ + sqrdmulh v10.4S, v30.4S, v31.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v11.4S, v15.4S, v18.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + add v21.4S, v15.4S, v18.4S // ...................e.................................................................................................................................... + ldr q9, [x5, #-48] // ...............................e........................................................................................................................ + ldr q18, [x5, #-64] // ..............................e......................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v20.4S, v26.4S, v8.S[0] // .................e...................................................................................................................................... + ldr q29, [x5, #-96] // ............................e........................................................................................................................... + ldr q31, [x5, #-16] // .................................e...................................................................................................................... + mls v5.4S, v10.4S, v8.S[0] // ............e........................................................................................................................................... + mul v28.4S, v11.4S, v19.4S // .....................e.................................................................................................................................. + sub v6.4S, v22.4S, v23.4S // ..................................e..................................................................................................................... + sub v27.4S, v24.4S, v25.4S // .......................................e................................................................................................................ + ldr q4, [x5, #-80] // .............................e.......................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v10.4S, v11.4S, v12.4S // ....................e................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v14.4S, v22.4S, v23.4S // ...................................e.................................................................................................................... + sub v26.4S, v5.4S, v20.4S // .......................e................................................................................................................................ + mul v3.4S, v6.4S, v18.4S // .....................................e.................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v7.4S, v27.4S, v31.4S // .........................................e.............................................................................................................. + mul v1.4S, v27.4S, v13.4S // ..........................................e............................................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v26.4S, v12.4S // .........................e.............................................................................................................................. + mul v15.4S, v26.4S, v19.4S // ..........................e............................................................................................................................. + sqrdmulh v17.4S, v6.4S, v9.4S // ....................................e................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // ......................e................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v7.4S, v8.S[0] // ...........................................e............................................................................................................ 
+ add v16.4S, v24.4S, v25.4S // ........................................e............................................................................................................... + add v5.4S, v5.4S, v20.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ + mls v15.4S, v30.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + sub v11.4S, v14.4S, v16.4S // ............................................e........................................................................................................... + mls v3.4S, v17.4S, v8.S[0] // ......................................e................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v18.4S, v21.4S, v5.4S // ......................................................e................................................................................................. + trn2 v30.4S, v21.4S, v5.4S // .......................................................e................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v11.4S, v29.4S // ...............................................e........................................................................................................ + trn2 v22.4S, v28.4S, v15.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + sub v0.4S, v3.4S, v1.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + trn1 v31.4S, v28.4S, v15.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn2 v20.2D, v30.2D, v22.2D // ...........................................................e............................................................................................ + trn1 v25.2D, v30.2D, v22.2D // .............................................................e.......................................................................................... + mul v6.4S, v0.4S, v29.4S // ....................................................e................................................................................................... + sqrdmulh v24.4S, v11.4S, v4.4S // ..............................................e......................................................................................................... + // gap // ........................................................................................................................................................ + ldr q12, [x4, #32] // ........................................................................e............................................................................... + add v10.4S, v3.4S, v1.4S // ..................................................e..................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v28.4S, v0.4S, v4.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ 
+ trn2 v19.2D, v18.2D, v31.2D // ..........................................................e............................................................................................. + add v21.4S, v14.4S, v16.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v0.2D, v18.2D, v31.2D // ............................................................e........................................................................................... + // gap // ........................................................................................................................................................ + mls v13.4S, v24.4S, v8.S[0] // ................................................e....................................................................................................... + ldr q30, [x4], #64 // ......................................................................e................................................................................. + mls v6.4S, v28.4S, v8.S[0] // .....................................................e.................................................................................................. + sub v29.4S, v19.4S, v20.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ 
+ ldr q2, [x4, #-48] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + trn1 v7.4S, v21.4S, v10.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + add v11.4S, v0.4S, v25.4S // ...........................................................................e............................................................................ + // gap // ........................................................................................................................................................ + mul v23.4S, v29.4S, v12.S[0] // ..................................................................................e..................................................................... + trn2 v17.4S, v21.4S, v10.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q16, [x4, #-16] // .........................................................................e.............................................................................. 
+ trn1 v27.4S, v13.4S, v6.4S // ................................................................e....................................................................................... + add v9.4S, v19.4S, v20.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v14.4S, v0.4S, v25.4S // ..........................................................................e............................................................................. + trn2 v19.4S, v13.4S, v6.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v0.4S, v11.4S, v9.4S // ..............................................................................................e......................................................... + trn2 v20.2D, v7.2D, v27.2D // ..................................................................e..................................................................................... + add v21.4S, v11.4S, v9.4S // ...............................................................................................e........................................................ 
+ sqrdmulh v6.4S, v29.4S, v12.S[1] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v18.2D, v17.2D, v19.2D // ...................................................................e.................................................................................... + mul v11.4S, v0.4S, v30.S[2] // .................................................................................................e...................................................... + trn1 v29.2D, v7.2D, v27.2D // ....................................................................e................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v26.2D, v17.2D, v19.2D // .....................................................................e.................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v7.4S, v20.4S, v18.4S // .........................................................................................e.............................................................. + sqrdmulh v5.4S, v0.4S, v30.S[3] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + sub v0.4S, v29.4S, v26.4S // ....................................................................................e................................................................... + add v17.4S, v29.4S, v26.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v10.4S, v20.4S, v18.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ 
+ mul v25.4S, v7.4S, v16.S[0] // ............................................................................................e........................................................... + sqrdmulh v22.4S, v0.4S, v12.S[3] // ......................................................................................e................................................................. + mul v24.4S, v14.4S, v2.S[2] // .............................................................................e.......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v15.4S, v0.4S, v12.S[2] // .......................................................................................e................................................................ + add v3.4S, v17.4S, v10.4S // .........................................................................................................e.............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sqrdmulh v29.4S, v14.4S, v2.S[3] // ............................................................................e........................................................................... + sub v0.4S, v17.4S, v10.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v31.4S, v21.4S, #23 // ..................................................................................................................e..................................... + srshr v20.4S, v3.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + sqrdmulh v18.4S, v7.4S, v16.S[1] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + mls v15.4S, v22.4S, v8.S[0] // ........................................................................................e............................................................... + mls v23.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... 
+ mls v24.4S, v29.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v3.4S, v20.4S, v8.4S // .......................................................................................................................e................................ + mls v21.4S, v31.4S, v8.4S // ...................................................................................................................e.................................... + mls v25.4S, v18.4S, v8.S[0] // .............................................................................................e.......................................................... + mul v17.4S, v0.4S, v2.S[0] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ add v29.4S, v24.4S, v23.4S // ....................................................................................................e................................................... + sub v14.4S, v24.4S, v23.4S // ...................................................................................................e.................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v12.4S, v21.4S, v3.4S // ..........................................................................................................................e............................. + sqrdmulh v13.4S, v0.4S, v2.S[1] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v15.4S, v25.4S // ..............................................................................................................e......................................... + srshr v7.4S, v29.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v28.4S, v12.4S, v30.S[0] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + sqrdmulh v6.4S, v12.4S, v30.S[1] // ............................................................................................................................e........................... + srshr v24.4S, v20.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v4.4S, v15.4S, v25.4S // .............................................................................................................e.......................................... + sqrdmulh v10.4S, v14.4S, v30.S[3] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ 
+ mul v19.4S, v14.4S, v30.S[2] // ......................................................................................................e................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v0.4S, v4.4S, v2.S[1] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v29.4S, v7.4S, v8.4S // .....................................................................................................................e.................................. + mul v22.4S, v4.4S, v2.S[0] // ................................................................................................................e....................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v20.4S, v24.4S, v8.4S // .........................................................................................................................e.............................. + mls v17.4S, v13.4S, v8.S[0] // ............................................................................................................e........................................... 
+ // gap // ........................................................................................................................................................ + mls v28.4S, v6.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + mls v19.4S, v10.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v11.4S, v5.4S, v8.S[0] // ..................................................................................................e..................................................... + add v27.4S, v21.4S, v3.4S // ...........................................................................................................................e............................ + mls v22.4S, v0.4S, v8.S[0] // .................................................................................................................e...................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ add v5.4S, v29.4S, v20.4S // ................................................................................................................................e....................... + sub v18.4S, v29.4S, v20.4S // ...............................................................................................................................e........................ + // gap // ........................................................................................................................................................ + str q28, [x2], #(16*4) // ..................................................................................................................................................e..... + str q27, [x1], #(16*4) // ..............................................................................................................................................e......... + add v4.4S, v11.4S, v17.4S // .....................................................................................................................................e.................. + // gap // ........................................................................................................................................................ + sub v29.4S, v11.4S, v17.4S // ....................................................................................................................................e................... 
+ + // ---------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--- + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ..........e............................................................................................................................................'..........~................ + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ............................e..........................................................................................................................'........................... + // ldr q0, [x5], #(12*16) // ...........................e...........................................................................................................................'........................... + // ldr q4, [x5, #(-12*16 + 1*16)] // e......................................................................................................................................................'~.......................... + // ldr q1, [x5, #(-12*16 + 2*16)] // .....................e.................................................................................................................................'.....................~..... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..............e........................................................................................................................................'..............~............ + // ldr q2, [x5, #(-12*16 + 4*16)] // ..................e....................................................................................................................................'..................~........ 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ................e......................................................................................................................................'................~.......... + // sub v24.4s, v9.4s, v10.4s // ........................e..............................................................................................................................'........................~.. + // add v9.4s, v9.4s, v10.4s // ..............................e........................................................................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v5.4s // ..................................e....................................................................................................................'........................... + // mul v10.4s, v24.4s, v1.4s // .................................e.....................................................................................................................'........................... + // mls v10.4s, v27.4s, v8.s[0] // ..........................................e............................................................................................................'........................... + // sub v24.4s, v11.4s, v12.4s // .......................e...............................................................................................................................'.......................~... + // add v11.4s, v11.4s, v12.4s // .............................e.........................................................................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ...............................e.......................................................................................................................'........................... 
+ // mul v12.4s, v24.4s, v2.4s // ................................e......................................................................................................................'........................... + // mls v12.4s, v27.4s, v8.s[0] // .......................................e...............................................................................................................'........................... + // sub v24.4s, v9.4s, v11.4s // ...................................e...................................................................................................................'........................... + // add v9.4s, v9.4s, v11.4s // ....................................e..................................................................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...............................................e.......................................................................................................'........................... + // mul v11.4s, v24.4s, v0.4s // ...........................................e...........................................................................................................'........................... + // mls v11.4s, v27.4s, v8.s[0] // ........................................................e..............................................................................................'........................... + // sub v24.4s, v10.4s, v12.4s // .................................................e.....................................................................................................'........................... + // add v10.4s, v10.4s, v12.4s // ...........................................................e...........................................................................................'........................... 
+ // sqrdmulh v27.4s, v24.4s, v4.4s // .....................................................e.................................................................................................'........................... + // mul v12.4s, v24.4s, v0.4s // ......................................................e................................................................................................'........................... + // mls v12.4s, v27.4s, v8.s[0] // ............................................................e..........................................................................................'........................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ........................................e..............................................................................................................'........................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................e........................................................................................................'........................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ......................................e................................................................................................................'........................... + // ldr q5, [x5, #(-12*16 + 9*16)] // .....................................e.................................................................................................................'........................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ...........e...........................................................................................................................................'...........~............... + // ldr q6, [x5, #(-12*16 + 11*16)] // .........................................e.............................................................................................................'........................... 
+ // sub v24.4s, v13.4s, v14.4s // ............................................e..........................................................................................................'........................... + // add v13.4s, v13.4s, v14.4s // ................................................e......................................................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v5.4s // .......................................................e...............................................................................................'........................... + // mul v14.4s, v24.4s, v1.4s // ..................................................e....................................................................................................'........................... + // mls v14.4s, v27.4s, v8.s[0] // ..............................................................e........................................................................................'........................... + // sub v24.4s, v15.4s, v16.4s // .............................................e.........................................................................................................'........................... + // add v15.4s, v15.4s, v16.4s // ..........................................................e............................................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v6.4s // ...................................................e...................................................................................................'........................... + // mul v16.4s, v24.4s, v2.4s // ....................................................e..................................................................................................'........................... 
+ // mls v16.4s, v27.4s, v8.s[0] // .........................................................e.............................................................................................'........................... + // sub v24.4s, v13.4s, v15.4s // .............................................................e.........................................................................................'........................... + // add v13.4s, v13.4s, v15.4s // .............................................................................e.........................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ........................................................................e..............................................................................'........................... + // mul v15.4s, v24.4s, v0.4s // .................................................................e.....................................................................................'........................... + // mls v15.4s, v27.4s, v8.s[0] // ...............................................................................e.......................................................................'........................... + // sub v24.4s, v14.4s, v16.4s // ...................................................................e...................................................................................'........................... + // add v14.4s, v14.4s, v16.4s // ..........................................................................e............................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v4.4s // ...........................................................................e...........................................................................'........................... 
+ // mul v16.4s, v24.4s, v0.4s // .......................................................................e...............................................................................'........................... + // mls v16.4s, v27.4s, v8.s[0] // .................................................................................e.....................................................................'........................... + // trn1 v25.4s, v9.4s, v10.4s // ...............................................................e.......................................................................................'........................... + // trn2 v26.4s, v9.4s, v10.4s // ................................................................e......................................................................................'........................... + // trn1 v27.4s, v11.4s, v12.4s // ....................................................................e..................................................................................'........................... + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................e....................................................................................'........................... + // trn2 v11.2d, v25.2d, v27.2d // ............................................................................e..........................................................................'........................... + // trn2 v12.2d, v26.2d, v28.2d // .....................................................................e.................................................................................'........................... + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................e........................................................................'........................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ......................................................................e................................................................................'........................... + // trn1 v25.4s, v13.4s, v14.4s // ....................................................................................e..................................................................'........................... + // trn2 v26.4s, v13.4s, v14.4s // .......................................................................................e...............................................................'........................... + // trn1 v27.4s, v15.4s, v16.4s // .........................................................................................e.............................................................'........................... + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................e..........................................................'........................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................e........................................................'........................... + // trn2 v16.2d, v26.2d, v28.2d // .................................................................................................e.....................................................'........................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................e...................................................'........................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................................e..................................................'........................... 
+ // ldr q0, [x4], #64 // ................................................................................e......................................................................'........................... + // ldr q1, [x4, #(-64 + 16)] // ...................................................................................e...................................................................'........................... + // ldr q2, [x4, #(-64 + 32)] // .........................................................................e.............................................................................'........................... + // ldr q3, [x4, #(-64 + 48)] // ........................................................................................e..............................................................'........................... + // sub v24.4s, v9.4s, v10.4s // ...........................................................................................e...........................................................'........................... + // add v9.4s, v9.4s, v10.4s // .....................................................................................e.................................................................'........................... + // sqrdmulh v27.4s, v24.4s, v1.s[3] // ...............................................................................................................e.......................................'........................... + // mul v10.4s, v24.4s, v1.s[2] // ............................................................................................................e..........................................'........................... + // mls v10.4s, v27.4s, v8.s[0] // ......................................................................................................................e................................'........................... 
+ // sub v24.4s, v11.4s, v12.4s // ..................................................................................e....................................................................'........................... + // add v11.4s, v11.4s, v12.4s // ..........................................................................................e............................................................'........................... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // ................................................................................................e......................................................'........................... + // mul v12.4s, v24.4s, v2.s[0] // ......................................................................................e................................................................'........................... + // mls v12.4s, v27.4s, v8.s[0] // .....................................................................................................................e.................................'........................... + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................e...............................................'........................... + // add v13.4s, v13.4s, v14.4s // ........................................................................................................e..............................................'........................... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ...........................................................................................................e...........................................'........................... + // mul v14.4s, v24.4s, v2.s[2] // .............................................................................................................e.........................................'........................... 
+ // mls v14.4s, v27.4s, v8.s[0] // ....................................................................................................................e..................................'........................... + // sub v24.4s, v15.4s, v16.4s // .....................................................................................................e.................................................'........................... + // add v15.4s, v15.4s, v16.4s // .........................................................................................................e.............................................'........................... + // sqrdmulh v27.4s, v24.4s, v3.s[1] // ...................................................................................................................e...................................'........................... + // mul v16.4s, v24.4s, v3.s[0] // ..........................................................................................................e............................................'........................... + // mls v16.4s, v27.4s, v8.s[0] // .........................................................................................................................e.............................'........................... + // sub v24.4s, v9.4s, v11.4s // .............................................................................................e.........................................................'........................... + // add v9.4s, v9.4s, v11.4s // ...............................................................................................e.......................................................'........................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // ......................................................................................................e................................................'........................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ..................................................................................................e....................................................'........................... + // mls v11.4s, v27.4s, v8.s[0] // ..............................................................................................................................................e........'........................... + // sub v24.4s, v10.4s, v12.4s // ............................................................................................................................e..........................'........................... + // add v10.4s, v10.4s, v12.4s // ...........................................................................................................................e...........................'........................... + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .....................................................................................................................................e.................'........................... + // mul v12.4s, v24.4s, v0.s[2] // ......................................................................................................................................e................'........................... + // mls v12.4s, v27.4s, v8.s[0] // .............................................................................................................................................e.........'........................... + // sub v24.4s, v13.4s, v15.4s // ................................................................................................................e......................................'........................... + // add v13.4s, v13.4s, v15.4s // ..............................................................................................................e........................................'........................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // ..............................................................................................................................e........................'........................... + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................e............................'........................... + // mls v15.4s, v27.4s, v8.s[0] // ...........................................................................................................................................e...........'........................... + // sub v24.4s, v14.4s, v16.4s // ....................................................................................................................................e..................'........................... + // add v14.4s, v14.4s, v16.4s // ...............................................................................................................................e.......................'........................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .......................................................................................................................................e...............'........................... + // mul v16.4s, v24.4s, v1.s[0] // .........................................................................................................................................e.............'........................... + // mls v16.4s, v27.4s, v8.s[0] // ................................................................................................................................................e......'........................... + // srshr v24.4S, v9.4S, #23 // .................................................................................................................e.....................................'........................... 
+ // mls v9.4s, v24.4s, v8.4s // ........................................................................................................................e..............................'........................... + // srshr v24.4S, v10.4S, #23 // ................................................................................................................................e......................'........................... + // mls v10.4s, v24.4s, v8.4s // ........................................................................................................................................e..............'........................... + // srshr v24.4S, v13.4S, #23 // ..................................................................................................................e....................................'........................... + // mls v13.4s, v24.4s, v8.4s // .......................................................................................................................e...............................'........................... + // srshr v24.4S, v14.4S, #23 // ...................................................................................................................................e...................'........................... + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................................................e............'........................... + // sub v24.4s, v9.4s, v13.4s // .............................................................................................................................e.........................'........................... + // add v9.4s, v9.4s, v13.4s // ...............................................................................................................................................e.......'........................... 
+ // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..................................................................................................................................e....................'........................... + // mul v13.4s, v24.4s, v0.s[0] // .................................................................................................................................e.....................'........................... + // mls v13.4s, v27.4s, v8.s[0] // ............................................................................................................................................e..........'........................... + // sub v24.4s, v10.4s, v14.4s // ..................................................................................................................................................e....'........................... + // add v10.4s, v10.4s, v14.4s // .................................................................................................................................................e.....'........................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ....~..................................................................................................................................................'....*...................... + // mul v14.4s, v24.4s, v0.s[0] // .....~.................................................................................................................................................'.....*..................... + // mls v14.4s, v27.4s, v8.s[0] // ...............~.......................................................................................................................................'...............*........... + // sub v24.4s, v11.4s, v15.4s // ......................................................................................................................................................e'........................... 
+ // add v11.4s, v11.4s, v15.4s // .....................................................................................................................................................e.'........................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .........~.............................................................................................................................................'.........*................. + // mul v15.4s, v24.4s, v0.s[0] // ........~..............................................................................................................................................'........*.................. + // mls v15.4s, v27.4s, v8.s[0] // .................~.....................................................................................................................................'.................*......... + // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................................................*........................... + // add v12.4s, v12.4s, v16.4s // .~.....................................................................................................................................................'.*......................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ............~..........................................................................................................................................'............*.............. + // mul v16.4s, v24.4s, v0.s[0] // .............~.........................................................................................................................................'.............*............. + // mls v16.4s, v27.4s, v8.s[0] // ...................~...................................................................................................................................'...................*....... 
+ // str q9, [x1], #(16*4) // ....................................................................................................................................................e..'........................... + // str q10, [x1, #(-16*4 + 1*16)] // ..~....................................................................................................................................................'..*........................ + // str q11, [x1, #(-16*4 + 2*16)] // ...~...................................................................................................................................................'...*....................... + // str q12, [x1, #(-16*4 + 3*16)] // ......~................................................................................................................................................'......*.................... + // str q13, [x2], #(16*4) // ...................................................................................................................................................e...'........................... + // str q14, [x2, #(-16*4 + 1*16)] // ....................~..................................................................................................................................'....................*...... + // str q15, [x2, #(-16*4 + 2*16)] // ......................~................................................................................................................................'......................*.... + // str q16, [x2, #(-16*4 + 3*16)] // .........................~.............................................................................................................................'.........................*. + // add x1, x1, #64 // .......~...............................................................................................................................................'.......*................... 
+ // add x2, x2, #64 // ..........................~............................................................................................................................'..........................* sub count, count, #1 cbnz count, layer45678_start - sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ - mls v1.4S, v20.4S, v8.S[0] // ...*......................... - // gap // ............................. - // gap // ............................. - mls v26.4S, v22.4S, v8.4S // .*........................... - mls v31.4S, v12.4S, v8.4S // ..*.......................... - // gap // ............................. - // gap // ............................. - mls v4.4S, v30.4S, v8.S[0] // .....*....................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - mls v7.4S, v13.4S, v8.S[0] // ....*........................ - sub v18.4S, v10.4S, v1.4S // .........*................... - // gap // ............................. - // gap // ............................. - sub v28.4S, v26.4S, v31.4S // ......*...................... - add v12.4S, v10.4S, v1.4S // ........*.................... - // gap // ............................. - // gap // ............................. - add v2.4S, v26.4S, v31.4S // .......*..................... - mul v1.4S, v18.4S, v19.S[0] // ..............*.............. - // gap // ............................. - // gap // ............................. - sub v22.4S, v4.4S, v7.4S // ...........*................. - str q12, [x1, #-16] // .............*............... - sqrdmulh v13.4S, v18.4S, v19.S[1] // ...............*............. - // gap // ............................. - sqrdmulh v20.4S, v28.4S, v19.S[1] // ....................*........ - mul v12.4S, v28.4S, v19.S[0] // .....................*....... - str q2, [x1, #-48] // ............*................ - // gap // ............................. 
- mul v18.4S, v22.4S, v19.S[0] // .................*........... - sqrdmulh v22.4S, v22.4S, v19.S[1] // ................*............ - // gap // ............................. - // gap // ............................. - add v7.4S, v4.4S, v7.4S // ..........*.................. - mls v1.4S, v13.4S, v8.S[0] // ......................*...... - // gap // ............................. - // gap // ............................. - mls v12.4S, v20.4S, v8.S[0] // ........................*.... - // gap // ............................. - // gap // ............................. - // gap // ............................. - str q7, [x1, #-32] // ..................*.......... - add x1, x1, #64 // ...................*......... - mls v18.4S, v22.4S, v8.S[0] // .......................*..... - // gap // ............................. - str q1, [x2, #-16] // .........................*... - // gap // ............................. - // gap // ............................. - // gap // ............................. - str q12, [x2, #-48] // ...........................*. - // gap // ............................. - // gap // ............................. - // gap // ............................. - str q18, [x2, #-32] // ..........................*.. - add x2, x2, #64 // ............................* - // gap // ............................. - // gap // ............................. - - // original source code - // sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ - // mls v26.4S, v22.4S, v8.4S // ..*.......................... - // mls v31.4S, v12.4S, v8.4S // ...*......................... - // mls v1.4S, v20.4S, v8.S[0] // .*........................... - // mls v7.4S, v13.4S, v8.S[0] // .....*....................... - // mls v4.4S, v30.4S, v8.S[0] // ....*........................ - // sub v30.4S, v26.4S, v31.4S // .......*..................... - // add v22.4S, v26.4S, v31.4S // .........*................... - // add v12.4S, v10.4S, v1.4S // ........*.................... 
- // sub v1.4S, v10.4S, v1.4S // ......*...................... - // add v9.4S, v4.4S, v7.4S // ...................*......... - // sub v4.4S, v4.4S, v7.4S // ...........*................. - // str q22, [x1, #-48] // ................*............ - // str q12, [x1, #-16] // ............*................ - // mul v20.4S, v1.4S, v19.S[0] // ..........*.................. - // sqrdmulh v12.4S, v1.4S, v19.S[1] // .............*............... - // sqrdmulh v31.4S, v4.4S, v19.S[1] // ..................*.......... - // mul v13.4S, v4.4S, v19.S[0] // .................*........... - // str q9, [x1, #-32] // ......................*...... - // add x1, x1, #64 // .......................*..... - // sqrdmulh v22.4S, v30.4S, v19.S[1] // ..............*.............. - // mul v1.4S, v30.4S, v19.S[0] // ...............*............. - // mls v20.4S, v12.4S, v8.S[0] // ....................*........ - // mls v13.4S, v31.4S, v8.S[0] // ........................*.... - // mls v1.4S, v22.4S, v8.S[0] // .....................*....... - // str q20, [x2, #-16] // .........................*... - // str q13, [x2, #-32] // ...........................*. - // str q1, [x2, #-48] // ..........................*.. - // add x2, x2, #64 // ............................* + // Instructions: 19 + // Expected cycles: 10 + // Expected IPC: 1.90 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v3.4S, v19.4S, v22.4S // *............................. + add v1.4S, v19.4S, v22.4S // .*............................ + str q5, [x1, #-48] // ..*........................... + // gap // .............................. + str q4, [x1, #-32] // ...*.......................... + sqrdmulh v2.4S, v18.4S, v30.S[1] // ....*......................... + mul v9.4S, v18.4S, v30.S[0] // .....*........................ + // gap // .............................. + str q1, [x1, #-16] // ......*....................... 
+ mul v28.4S, v29.4S, v30.S[0] // ........*..................... + add x1, x1, #64 // .......*...................... + sqrdmulh v22.4S, v29.4S, v30.S[1] // .........*.................... + sqrdmulh v1.4S, v3.4S, v30.S[1] // ..........*................... + mul v21.4S, v3.4S, v30.S[0] // ...........*.................. + // gap // .............................. + // gap // .............................. + mls v9.4S, v2.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v28.4S, v22.4S, v8.S[0] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v21.4S, v1.4S, v8.S[0] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q9, [x2, #-48] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q28, [x2, #-32] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q21, [x2, #-16] // .................*............ + add x2, x2, #64 // ..................*........... + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v0.4S, v19.4S, v22.4S // *.............................. + // add v13.4S, v19.4S, v22.4S // .*............................. + // str q5, [x1, #-48] // ..*............................ + // str q4, [x1, #-32] // ...*........................... + // sqrdmulh v23.4S, v18.4S, v30.S[1] // ....*.......................... 
+ // mul v21.4S, v18.4S, v30.S[0] // .....*......................... + // str q13, [x1, #-16] // ......*........................ + // add x1, x1, #64 // ........*...................... + // mul v1.4S, v29.4S, v30.S[0] // .......*....................... + // sqrdmulh v5.4S, v29.4S, v30.S[1] // .........*..................... + // sqrdmulh v22.4S, v0.4S, v30.S[1] // ..........*.................... + // mul v6.4S, v0.4S, v30.S[0] // ...........*................... + // mls v21.4S, v23.4S, v8.S[0] // ............*.................. + // mls v1.4S, v5.4S, v8.S[0] // .............*................. + // mls v6.4S, v22.4S, v8.S[0] // ..............*................ + // str q21, [x2, #-48] // ...............*............... + // str q1, [x2, #-32] // ................*.............. + // str q6, [x2, #-16] // .................*............. + // add x2, x2, #64 // ..................*............ // ----------------------------------------------------------------------------- @@ -1289,710 +1309,742 @@ layer45678_start: load_roots_123 .p2align 2 - ldr q15, [x0, #0] // *................. - // gap // .................. - // gap // .................. - ldr q5, [x0, #128] // .*................ - ldr q9, [x0, #256] // ....*............. - ldr q17, [x0, #384] // .....*............ - // gap // .................. - // gap // .................. - ldr q10, [x0, #512] // ......*........... - // gap // .................. - // gap // .................. - // gap // .................. - ldr q23, [x0, #768] // ........*......... - // gap // .................. - // gap // .................. - // gap // .................. - sub v27.4S, v15.4S, v5.4S // ..*............... - ldr q6, [x0, #896] // .........*........ - // gap // .................. - // gap // .................. - add v14.4S, v15.4S, v5.4S // ...............*.. - ldr q18, [x0, #640] // .......*.......... - // gap // .................. - sub v12.4S, v9.4S, v17.4S // ..........*....... 
- sqrdmulh v5.4S, v27.4S, v1.S[3] // ...........*...... - mul v16.4S, v27.4S, v1.S[2] // ...*.............. - // gap // .................. - // gap // .................. - add v27.4S, v9.4S, v17.4S // ..............*... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - add v28.4S, v23.4S, v6.4S // .................* - mul v13.4S, v12.4S, v2.S[0] // .............*.... - mls v16.4S, v5.4S, v8.S[0] // ................*. - // gap // .................. - // gap // .................. - sub v17.4S, v10.4S, v18.4S // ............*..... - - // original source code - // ldr q11, [x0, #0] // *................. - // ldr q7, [x0, #128] // .*................ - // sub v19.4S, v11.4S, v7.4S // ......*........... - // mul v16.4S, v19.4S, v1.S[2] // ............*..... - // ldr q24, [x0, #256] // ..*............... - // ldr q14, [x0, #384] // ...*.............. - // ldr q10, [x0, #512] // ....*............. - // ldr q18, [x0, #640] // .........*........ - // ldr q23, [x0, #768] // .....*............ - // ldr q6, [x0, #896] // .......*.......... - // sub v12.4S, v24.4S, v14.4S // ..........*....... - // sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........*...... - // sub v17.4S, v10.4S, v18.4S // .................* - // mul v13.4S, v12.4S, v2.S[0] // ...............*.. - // add v27.4S, v24.4S, v14.4S // .............*.... - // add v14.4S, v11.4S, v7.4S // ........*......... - // mls v16.4S, v22.4S, v8.S[0] // ................*. - // add v28.4S, v23.4S, v6.4S // ..............*... + // Instructions: 15 + // Expected cycles: 8 + // Expected IPC: 1.88 + // + // Wall time: 0.12s + // User time: 0.12s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q24, [x0, #768] // .*............................ + ldr q22, [x0, #896] // .....*........................ + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + ldr q21, [x0, #640] // ..*........................... + // gap // .............................. + ldr q18, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + ldr q29, [x0, #384] // *............................. + ldr q4, [x0, #256] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v13.4S, v24.4S, v22.4S // ........*..................... + // gap // .............................. + // gap // .............................. + add v5.4S, v24.4S, v22.4S // .............*................ + add v15.4S, v18.4S, v21.4S // ............*................. + sub v14.4S, v18.4S, v21.4S // .......*...................... + // gap // .............................. + // gap // .............................. + add v22.4S, v4.4S, v29.4S // ..........*................... + mul v19.4S, v13.4S, v3.S[0] // ...........*.................. + // gap // .............................. + // gap // .............................. + mul v24.4S, v14.4S, v2.S[2] // ..............*............... + ldr q20, [x0, #128] // ....*......................... + // gap // .............................. + sqrdmulh v6.4S, v14.4S, v2.S[3] // .........*.................... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q29, [x0, #384] // ....*.......................... + // ldr q5, [x0, #768] // *.............................. + // ldr q23, [x0, #640] // ..*............................ + // ldr q28, [x0, #512] // ...*........................... + // ldr q20, [x0, #128] // .............*................. + // ldr q17, [x0, #896] // .*............................. + // ldr q4, [x0, #256] // .....*......................... 
+ // sub v12.4S, v28.4S, v23.4S // .........*..................... + // sub v13.4S, v5.4S, v17.4S // ......*........................ + // sqrdmulh v6.4S, v12.4S, v2.S[3] // ..............*................ + // add v22.4S, v4.4S, v29.4S // ..........*.................... + // mul v19.4S, v13.4S, v3.S[0] // ...........*................... + // add v15.4S, v28.4S, v23.4S // ........*...................... + // add v5.4S, v5.4S, v17.4S // .......*....................... + // mul v24.4S, v12.4S, v2.S[2] // ............*.................. sub count, count, #1 layer123_start: + // Instructions: 120 + // Expected cycles: 52 + // Expected IPC: 2.31 + // + // Wall time: 460.93s + // User time: 460.93s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- // gap // ........................................................................................................................ + ldr q9, [x0, #0] // *....................................................................................................................... + sqrdmulh v11.4S, v13.4S, v3.S[1] // .........................*.............................................................................................. + sub v12.4S, v4.4S, v29.4S // .............*.......................................................................................................... + ldr q29, [x0, #400] // ...e.................................................................................................................... // gap // ........................................................................................................................ - sqrdmulh v7.4S, v12.4S, v2.S[1] // ................*....................................................................................................... 
- sub v9.4S, v23.4S, v6.4S // .......................*................................................................................................ - // gap // ........................................................................................................................ - mul v24.4S, v17.4S, v2.S[2] // ....................*................................................................................................... - sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. - // gap // ........................................................................................................................ + add v17.4S, v15.4S, v5.4S // .......................................*................................................................................ + sub v15.4S, v15.4S, v5.4S // ......................................*................................................................................. + ldr q5, [x0, #784] // ......e................................................................................................................. // gap // ........................................................................................................................ + mul v27.4S, v12.4S, v2.S[0] // ................*....................................................................................................... + sqrdmulh v10.4S, v12.4S, v2.S[1] // ...............*........................................................................................................ // gap // ........................................................................................................................ - mul v20.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. 
- add v15.4S, v10.4S, v18.4S // ...................*.................................................................................................... + ldr q23, [x0, #656] // .....e.................................................................................................................. + sqrdmulh v12.4S, v15.4S, v1.S[1] // ........................................*............................................................................... + mls v24.4S, v6.4S, v8.S[0] // ......................*................................................................................................. + ldr q28, [x0, #528] // ....e................................................................................................................... // gap // ........................................................................................................................ + mls v19.4S, v11.4S, v8.S[0] // ...........................*............................................................................................ + add v16.4S, v9.4S, v20.4S // .........*.............................................................................................................. // gap // ........................................................................................................................ - mls v13.4S, v7.4S, v8.S[0] // .................*...................................................................................................... - sub v23.4S, v14.4S, v27.4S // ............................*........................................................................................... - ldr q11, [x0, #16] // e....................................................................................................................... - ldr q7, [x0, #144] // .e...................................................................................................................... 
- mls v24.4S, v17.4S, v8.S[0] // ......................*................................................................................................. - sqrdmulh v18.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + mul v13.4S, v15.4S, v1.S[0] // .........................................*.............................................................................. // gap // ........................................................................................................................ + mls v27.4S, v10.4S, v8.S[0] // .................*...................................................................................................... // gap // ........................................................................................................................ - mul v22.4S, v23.4S, v0.S[2] // ..............................*......................................................................................... - sqrdmulh v5.4S, v23.4S, v0.S[3] // ...............................*........................................................................................ // gap // ........................................................................................................................ + sub v7.4S, v16.4S, v22.4S // ............................*........................................................................................... + sub v4.4S, v9.4S, v20.4S // ........*............................................................................................................... // gap // ........................................................................................................................ - sub v12.4S, v15.4S, v28.4S // ......................................*................................................................................. 
- sub v17.4S, v16.4S, v13.4S // .................................*...................................................................................... // gap // ........................................................................................................................ + add v14.4S, v24.4S, v19.4S // ............................................*........................................................................... + sub v11.4S, v24.4S, v19.4S // ...........................................*............................................................................ // gap // ........................................................................................................................ - mls v20.4S, v18.4S, v8.S[0] // ...........................*............................................................................................ - add v10.4S, v14.4S, v27.4S // .............................*.......................................................................................... // gap // ........................................................................................................................ - mls v22.4S, v5.4S, v8.S[0] // ................................*....................................................................................... + add v15.4S, v16.4S, v22.4S // .............................*.......................................................................................... + mul v21.4S, v4.4S, v1.S[2] // ...........*............................................................................................................ // gap // ........................................................................................................................ - sub v19.4S, v11.4S, v7.4S // ........e............................................................................................................... 
// gap // ........................................................................................................................ + mul v24.4S, v11.4S, v1.S[0] // ..............................................*......................................................................... + sqrdmulh v22.4S, v4.4S, v1.S[3] // ..........*............................................................................................................. // gap // ........................................................................................................................ - mul v9.4S, v17.4S, v0.S[2] // ...................................*.................................................................................... - sqrdmulh v18.4S, v17.4S, v0.S[3] // ....................................*................................................................................... // gap // ........................................................................................................................ + mul v16.4S, v7.4S, v0.S[2] // ...............................*........................................................................................ + sqrdmulh v4.4S, v11.4S, v1.S[1] // .............................................*.......................................................................... // gap // ........................................................................................................................ - mul v23.4S, v12.4S, v1.S[0] // ........................................*............................................................................... - sqrdmulh v29.4S, v12.4S, v1.S[1] // .........................................*.............................................................................. // gap // ........................................................................................................................ 
+ sub v18.4S, v15.4S, v17.4S // ................................................*....................................................................... + sqrdmulh v19.4S, v7.4S, v0.S[3] // ..............................*......................................................................................... + ldr q20, [x0, #144] // .e...................................................................................................................... + mls v21.4S, v22.4S, v8.S[0] // ............*........................................................................................................... + add v11.4S, v15.4S, v17.4S // .................................................*...................................................................... // gap // ........................................................................................................................ - sub v14.4S, v24.4S, v20.4S // ...........................................*............................................................................ - add v4.4S, v16.4S, v13.4S // ..................................*..................................................................................... - mul v16.4S, v19.4S, v1.S[2] // ..........e............................................................................................................. + mul v6.4S, v18.4S, v0.S[0] // ...................................................*.................................................................... + sqrdmulh v17.4S, v18.4S, v0.S[1] // ..................................................*..................................................................... // gap // ........................................................................................................................ - mls v9.4S, v18.4S, v8.S[0] // .....................................*.................................................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ + mls v24.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ + sqrdmulh v7.4S, v11.4S, v26.4S // ........................................................................................*............................... // gap // ........................................................................................................................ - mls v23.4S, v29.4S, v8.S[0] // ..........................................*............................................................................. - add v18.4S, v24.4S, v20.4S // ............................................*........................................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v20.4S, v15.4S, v28.4S // .......................................*................................................................................ - mul v13.4S, v14.4S, v1.S[0] // .............................................*.......................................................................... + mul v22.4S, v11.4S, v25.4S // .........................................................................................*.............................. + add v11.4S, v21.4S, v27.4S // ..................................*..................................................................................... 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v15.4S, v14.4S, v1.S[1] // ..............................................*......................................................................... - sub v17.4S, v4.4S, v18.4S // .....................................................*.................................................................. + mls v6.4S, v17.4S, v8.S[0] // ....................................................*................................................................... + mls v16.4S, v19.4S, v8.S[0] // ................................*....................................................................................... // gap // ........................................................................................................................ + add v18.4S, v11.4S, v14.4S // ......................................................*................................................................. // gap // ........................................................................................................................ - add v14.4S, v10.4S, v20.4S // .................................................*...................................................................... - sub v29.4S, v10.4S, v20.4S // ................................................*....................................................................... + sub v19.4S, v21.4S, v27.4S // .................................*...................................................................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - mul v12.4S, v17.4S, v0.S[0] // .......................................................*................................................................ - sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................*............................................................... + sub v17.4S, v11.4S, v14.4S // .....................................................*.................................................................. + mls v13.4S, v12.4S, v8.S[0] // ..........................................*............................................................................. + mul v14.4S, v18.4S, v25.4S // ............................................................................................*........................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - add v27.4S, v4.4S, v18.4S // ......................................................*................................................................. - mul v18.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... + sqrdmulh v27.4S, v18.4S, v26.4S // ...........................................................................................*............................ + mul v4.4S, v19.4S, v0.S[2] // ....................................*................................................................................... // gap // ........................................................................................................................ 
- mls v13.4S, v15.4S, v8.S[0] // ...............................................*........................................................................ // gap // ........................................................................................................................ - sqrdmulh v24.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + mls v22.4S, v7.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v12.4S, v20.4S, v8.S[0] // .........................................................*.............................................................. - sub v4.4S, v22.4S, v23.4S // ..........................................................*............................................................. + cmge v12.4S, v31.4S, v6.4S // ....................................................................*................................................... + sub v11.4S, v16.4S, v13.4S // ..........................................................*............................................................. // gap // ........................................................................................................................ - sqrdmulh v5.4S, v14.4S, v26.4S // .........................................................................................*.............................. + mls v14.4S, v27.4S, v8.S[0] // .............................................................................................*.......................... 
// gap // ........................................................................................................................ - add v17.4S, v22.4S, v23.4S // ...........................................................*............................................................ + add v7.4S, v16.4S, v13.4S // ...........................................................*............................................................ // gap // ........................................................................................................................ + sqrdmulh v27.4S, v19.4S, v0.S[3] // ...................................*.................................................................................... + mul v21.4S, v11.4S, v0.S[0] // .............................................................*.......................................................... // gap // ........................................................................................................................ - mul v20.4S, v4.4S, v0.S[0] // ............................................................*........................................................... - sub v28.4S, v9.4S, v13.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ - mls v18.4S, v24.4S, v8.S[0] // ....................................................*................................................................... // gap // ........................................................................................................................ - add v23.4S, v9.4S, v13.4S // ................................................................*....................................................... 
+ mul v9.4S, v7.4S, v25.4S // ...............................................................................................*........................ + sqrdmulh v16.4S, v7.4S, v26.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v6.4S, v4.4S, v0.S[1] // .............................................................*.......................................................... - mul v22.4S, v28.4S, v0.S[0] // .................................................................*...................................................... + cmge v7.4S, v14.4S, v30.4S // .........................................................................................................*.............. + cmge v19.4S, v31.4S, v14.4S // ........................................................................................................*............... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v29.4S, v17.4S, v26.4S // ...............................................................................................*........................ - mul v13.4S, v14.4S, v25.4S // ........................................................................................*............................... + cmge v13.4S, v6.4S, v30.4S // .....................................................................*.................................................. 
+ mls v4.4S, v27.4S, v8.S[0] // .....................................*.................................................................................. // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v15.4S, v31.4S, v18.4S // ....................................................................*................................................... - cmge v21.4S, v18.4S, v30.4S // .....................................................................*.................................................. + sub v27.4S, v19.4S, v7.4S // ..........................................................................................................*............. + mls v9.4S, v16.4S, v8.S[0] // ................................................................................................*....................... // gap // ........................................................................................................................ - mls v20.4S, v6.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ - sqrdmulh v9.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... + cmge v16.4S, v31.4S, v22.4S // ....................................................................................................*................... + sub v18.4S, v12.4S, v13.4S // ......................................................................*................................................. 
// gap // ........................................................................................................................ // gap // ........................................................................................................................ - sqrdmulh v28.4S, v23.4S, v26.4S // ..................................................................................................*..................... - sub v24.4S, v15.4S, v21.4S // ......................................................................*................................................. + cmge v13.4S, v22.4S, v30.4S // .....................................................................................................*.................. + sub v12.4S, v4.4S, v24.4S // ...............................................................*........................................................ // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v4.4S, v12.4S, v30.4S // .........................................................................*.............................................. - cmge v6.4S, v31.4S, v12.4S // ........................................................................*............................................... + mls v14.4S, v27.4S, v8.4S // ...........................................................................................................*............ + mls v6.4S, v18.4S, v8.4S // .......................................................................*................................................ // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - cmge v21.4S, v31.4S, v20.4S // ............................................................................*........................................... - cmge v10.4S, v20.4S, v30.4S // .............................................................................*.......................................... + cmge v7.4S, v31.4S, v9.4S // ............................................................................................................*........... + sqrdmulh v15.4S, v12.4S, v0.S[1] // .................................................................*...................................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v22.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... - mls v18.4S, v24.4S, v8.4S // .......................................................................*................................................ + mul v18.4S, v12.4S, v0.S[0] // ..................................................................*..................................................... + add v10.4S, v4.4S, v24.4S // ................................................................*....................................................... + sqrdmulh v19.4S, v17.4S, v0.S[1] // .......................................................*................................................................ + str q6, [x0, #512] // ....................................................................................*................................... 
+ mul v6.4S, v17.4S, v0.S[0] // ........................................................*............................................................... // gap // ........................................................................................................................ + ldr q17, [x0, #912] // .......e................................................................................................................ + mul v27.4S, v10.4S, v25.4S // ..................................................................................................*..................... + sqrdmulh v4.4S, v10.4S, v26.4S // .................................................................................................*...................... + str q14, [x0, #128] // .....................................................................................................................*.. // gap // ........................................................................................................................ - sub v15.4S, v6.4S, v4.4S // ..........................................................................*............................................. - sub v21.4S, v21.4S, v10.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ + cmge v14.4S, v9.4S, v30.4S // .............................................................................................................*.......... + sqrdmulh v12.4S, v11.4S, v0.S[1] // ............................................................*........................................................... + mls v18.4S, v15.4S, v8.S[0] // ...................................................................*.................................................... 
// gap // ........................................................................................................................ - mul v4.4S, v27.4S, v25.4S // ...........................................................................................*............................ - sqrdmulh v14.4S, v27.4S, v26.4S // ............................................................................................*........................... - cmge v24.4S, v22.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ + mls v6.4S, v19.4S, v8.S[0] // .........................................................*.............................................................. + sub v10.4S, v7.4S, v14.4S // ..............................................................................................................*......... // gap // ........................................................................................................................ - mul v9.4S, v17.4S, v25.4S // ..............................................................................................*......................... // gap // ........................................................................................................................ + sub v24.4S, v16.4S, v13.4S // ......................................................................................................*................. + mls v27.4S, v4.4S, v8.S[0] // ...................................................................................................*.................... // gap // ........................................................................................................................ 
- cmge v6.4S, v31.4S, v22.4S // ................................................................................*....................................... - mls v20.4S, v21.4S, v8.4S // ...............................................................................*........................................ - mls v4.4S, v14.4S, v8.S[0] // .............................................................................................*.......................... // gap // ........................................................................................................................ + mls v21.4S, v12.4S, v8.S[0] // ..............................................................*......................................................... // gap // ........................................................................................................................ - mls v13.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. // gap // ........................................................................................................................ - mls v12.4S, v15.4S, v8.4S // ...........................................................................*............................................ - mls v9.4S, v29.4S, v8.S[0] // ................................................................................................*....................... + cmge v16.4S, v6.4S, v30.4S // .........................................................................*.............................................. + cmge v13.4S, v31.4S, v6.4S // ........................................................................*............................................... // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - str q20, [x0, #768] // ......................................................................................*................................. - mul v20.4S, v23.4S, v25.4S // .................................................................................................*...................... - sub v23.4S, v6.4S, v24.4S // ..................................................................................*..................................... - cmge v29.4S, v31.4S, v13.4S // ....................................................................................................*................... + mls v22.4S, v24.4S, v8.4S // .......................................................................................................*................ + cmge v24.4S, v31.4S, v18.4S // ................................................................................*....................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - cmge v21.4S, v13.4S, v30.4S // .....................................................................................................*.................. + cmge v11.4S, v27.4S, v30.4S // .................................................................................................................*...... + sub v15.4S, v13.4S, v16.4S // ..........................................................................*............................................. // gap // ........................................................................................................................ 
// gap // ........................................................................................................................ - cmge v15.4S, v31.4S, v4.4S // ........................................................................................................*............... - cmge v5.4S, v4.4S, v30.4S // .........................................................................................................*.............. + cmge v12.4S, v31.4S, v27.4S // ................................................................................................................*....... + cmge v14.4S, v18.4S, v30.4S // .................................................................................*...................................... // gap // ........................................................................................................................ // gap // ........................................................................................................................ - mls v20.4S, v28.4S, v8.S[0] // ...................................................................................................*.................... - sub v29.4S, v29.4S, v21.4S // ......................................................................................................*................. + cmge v7.4S, v21.4S, v30.4S // .............................................................................*.......................................... + mls v6.4S, v15.4S, v8.4S // ...........................................................................*............................................ // gap // ........................................................................................................................ - ldr q24, [x0, #272] // ..e..................................................................................................................... 
- mls v22.4S, v23.4S, v8.4S // ...................................................................................*.................................... - sub v21.4S, v15.4S, v5.4S // ..........................................................................................................*............. - ldr q14, [x0, #400] // ...e.................................................................................................................... - str q18, [x0, #512] // ....................................................................................*................................... - cmge v27.4S, v31.4S, v9.4S // ............................................................................................................*........... - cmge v28.4S, v9.4S, v30.4S // .............................................................................................................*.......... - ldr q10, [x0, #528] // ....e................................................................................................................... - ldr q18, [x0, #656] // .....e.................................................................................................................. - cmge v17.4S, v20.4S, v30.4S // .................................................................................................................*...... - cmge v6.4S, v31.4S, v20.4S // ................................................................................................................*....... - sub v15.4S, v27.4S, v28.4S // ..............................................................................................................*......... - str q12, [x0, #640] // .....................................................................................*.................................. - mls v13.4S, v29.4S, v8.4S // .......................................................................................................*................ 
// gap // ........................................................................................................................ - mls v4.4S, v21.4S, v8.4S // ...........................................................................................................*............ - ldr q23, [x0, #784] // ......e................................................................................................................. + sub v12.4S, v12.4S, v11.4S // ..................................................................................................................*..... + cmge v13.4S, v31.4S, v21.4S // ............................................................................*........................................... + mls v9.4S, v10.4S, v8.4S // ...............................................................................................................*........ // gap // ........................................................................................................................ - sub v5.4S, v6.4S, v17.4S // ..................................................................................................................*..... - mls v9.4S, v15.4S, v8.4S // ...............................................................................................................*........ - ldr q6, [x0, #912] // .......e................................................................................................................ - str q22, [x0, #896] // .......................................................................................*................................ - sub v12.4S, v24.4S, v14.4S // .............e.......................................................................................................... - sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........e............................................................................................................ 
+ ldr q4, [x0, #272] // ..e..................................................................................................................... + sub v19.4S, v24.4S, v14.4S // ..................................................................................*..................................... // gap // ........................................................................................................................ - mls v20.4S, v5.4S, v8.4S // ...................................................................................................................*.... - str q13, [x0], #(16) // ....................................................................................................................*... - sub v17.4S, v10.4S, v18.4S // ..................e..................................................................................................... - mul v13.4S, v12.4S, v2.S[0] // ...............e........................................................................................................ - // gap // ........................................................................................................................ - str q4, [x0, #112] // .....................................................................................................................*.. - add v27.4S, v24.4S, v14.4S // ..............e......................................................................................................... - add v14.4S, v11.4S, v7.4S // .........e.............................................................................................................. + str q22, [x0], #(16) // ....................................................................................................................*... + mls v27.4S, v12.4S, v8.4S // ...................................................................................................................*.... 
+ sub v15.4S, v13.4S, v7.4S // ..............................................................................*......................................... // gap // ........................................................................................................................ + sub v12.4S, v28.4S, v23.4S // ..................e..................................................................................................... + str q6, [x0, #624] // .....................................................................................*.................................. + mls v18.4S, v19.4S, v8.4S // ...................................................................................*.................................... + sub v13.4S, v5.4S, v17.4S // .......................e................................................................................................ str q9, [x0, #240] // ......................................................................................................................*. - mls v16.4S, v22.4S, v8.S[0] // ............e........................................................................................................... - // gap // ........................................................................................................................ - str q20, [x0, #368] // .......................................................................................................................* - add v28.4S, v23.4S, v6.4S // ........................e............................................................................................... - - // original source code - // ldr q9, [x0, #0] // e...............................................................................................................|.......e.............................................................................................................. 
- // ldr q10, [x0, #(1*(1024/8))] // .e..............................................................................................................|........e............................................................................................................. - // ldr q11, [x0, #(2*(1024/8))] // ...............................................................................e................................|......................................................................................e............................... - // ldr q12, [x0, #(3*(1024/8))] // ..................................................................................e.............................|.........................................................................................e............................ - // ldr q13, [x0, #(4*(1024/8))] // ......................................................................................e.........................|.............................................................................................e........................ - // ldr q14, [x0, #(5*(1024/8))] // .......................................................................................e........................|..............................................................................................e....................... - // ldr q15, [x0, #(6*(1024/8))] // ..............................................................................................e.................|.....................................................................................................e................ - // ldr q16, [x0, #(7*(1024/8))] // .................................................................................................e..............|........................................................................................................e............. 
- // sub v24.4s, v9.4s, v10.4s // ...........e....................................................................................................|..................e................................................................................................... - // add v9.4s, v9.4s, v10.4s // ...........................................................................................................e....|..................................................................................................................e... - // mul v10.4s, v24.4s, v1.s[2] // ..................e.............................................................................................|.........................e............................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................................................e...........|...........................................................................................................e.......... - // mls v10.4s, v24.4s, v8.s[0] // .............................................................................................................e..|....................................................................................................................e. - // sub v24.4s, v11.4s, v12.4s // ...................................................................................................e............|..........................................................................................................e........... - // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e.....|.................................................................................................................e.... 
- // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................................e.......|...............................................................................................................e...... - // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................*...................................................................................................................... - // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................................|.....*................................................................................................................ - // sub v24.4s, v13.4s, v14.4s // .......................................................................................................e........|..............................................................................................................e....... - // add v13.4s, v13.4s, v14.4s // ................................................................................................................|....*................................................................................................................. - // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................|.*.................................................................................................................... - // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................|..*................................................................................................................... 
- // mls v14.4s, v24.4s, v8.s[0] // ..*.............................................................................................................|.........*............................................................................................................ - // sub v24.4s, v15.4s, v16.4s // ................................................................................................................|*..................................................................................................................... - // add v15.4s, v15.4s, v16.4s // ...............................................................................................................e|...................................................................................................................... - // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................................|...*.................................................................................................................. - // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...*............................................................................................................|..........*........................................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ........*.......................................................................................................|...............*...................................................................................................... - // sub v24.4s, v9.4s, v11.4s // ................................................................................................................|......*............................................................................................................... 
- // add v9.4s, v9.4s, v11.4s // .........*......................................................................................................|................*..................................................................................................... - // mul v11.4s, v24.4s, v0.s[2] // ....*...........................................................................................................|...........*.......................................................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*..........................................................................................................|............*......................................................................................................... - // mls v11.4s, v24.4s, v8.s[0] // ..........*.....................................................................................................|.................*.................................................................................................... - // sub v24.4s, v10.4s, v12.4s // .......*........................................................................................................|..............*....................................................................................................... - // add v10.4s, v10.4s, v12.4s // .................*..............................................................................................|........................*............................................................................................. - // mul v12.4s, v24.4s, v0.s[2] // ............*...................................................................................................|...................*.................................................................................................. 
- // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............*..................................................................................................|....................*................................................................................................. - // mls v12.4s, v24.4s, v8.s[0] // ...................*............................................................................................|..........................*........................................................................................... - // sub v24.4s, v13.4s, v15.4s // ......*.........................................................................................................|.............*........................................................................................................ - // add v13.4s, v13.4s, v15.4s // ......................*.........................................................................................|.............................*........................................................................................ - // mul v15.4s, v24.4s, v1.s[0] // ..............*.................................................................................................|.....................*................................................................................................ - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............*................................................................................................|......................*............................................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ....................*...........................................................................................|...........................*.......................................................................................... 
- // sub v24.4s, v14.4s, v16.4s // ................*...............................................................................................|.......................*.............................................................................................. - // add v14.4s, v14.4s, v16.4s // .....................*..........................................................................................|............................*......................................................................................... - // mul v16.4s, v24.4s, v1.s[0] // .......................*........................................................................................|..............................*....................................................................................... - // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................*.......................................................................................|...............................*...................................................................................... - // mls v16.4s, v24.4s, v8.s[0] // ................................*...............................................................................|.......................................*.............................................................................. - // sub v24.4s, v9.4s, v13.4s // ...........................*....................................................................................|..................................*................................................................................... - // add v9.4s, v9.4s, v13.4s // ..........................*.....................................................................................|.................................*.................................................................................... 
- // mul v13.4s, v24.4s, v0.s[0] // ...............................*................................................................................|......................................*............................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..............................................................................|........................................*............................................................................. - // mls v13.4s, v24.4s, v8.s[0] // ........................................*.......................................................................|...............................................*...................................................................... - // sub v24.4s, v10.4s, v14.4s // .........................*......................................................................................|................................*..................................................................................... - // add v10.4s, v10.4s, v14.4s // ..............................*.................................................................................|.....................................*................................................................................ - // mul v14.4s, v24.4s, v0.s[0] // ............................*...................................................................................|...................................*.................................................................................. - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................................................................|....................................*................................................................................. 
- // mls v14.4s, v24.4s, v8.s[0] // ..................................*.............................................................................|.........................................*............................................................................ - // sub v24.4s, v11.4s, v15.4s // ...................................*............................................................................|..........................................*........................................................................... - // add v11.4s, v11.4s, v15.4s // .....................................*..........................................................................|............................................*......................................................................... - // mul v15.4s, v24.4s, v0.s[0] // ......................................*.........................................................................|.............................................*........................................................................ - // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................*.....................................................................|.................................................*.................................................................... - // mls v15.4s, v24.4s, v8.s[0] // ................................................*...............................................................|.......................................................*.............................................................. - // sub v24.4s, v12.4s, v16.4s // .......................................*........................................................................|..............................................*....................................................................... 
- // add v12.4s, v12.4s, v16.4s // .........................................*......................................................................|................................................*..................................................................... - // mul v16.4s, v24.4s, v0.s[0] // ...........................................*....................................................................|..................................................*................................................................... - // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................*..............................................................|........................................................*............................................................. - // mls v16.4s, v24.4s, v8.s[0] // ........................................................*.......................................................|...............................................................*...................................................... - // cmge v27.4s, v31.4s, v13.4s // ..............................................*.................................................................|.....................................................*................................................................ - // cmge v28.4s, v13.4s, v30.4s // ...............................................*................................................................|......................................................*............................................................... - // sub v28.4s, v27.4s, v28.4s // ...................................................*............................................................|..........................................................*........................................................... 
- // mls v13.4s, v28.4s, v8.4s // .........................................................*......................................................|................................................................*..................................................... - // cmge v27.4s, v31.4s, v14.4s // .....................................................*..........................................................|............................................................*......................................................... - // cmge v28.4s, v14.4s, v30.4s // ....................................................*...........................................................|...........................................................*.......................................................... - // sub v28.4s, v27.4s, v28.4s // ..........................................................*.....................................................|.................................................................*.................................................... - // mls v14.4s, v28.4s, v8.4s // ....................................................................*...........................................|...........................................................................*.......................................... - // cmge v27.4s, v31.4s, v15.4s // ......................................................*.........................................................|.............................................................*........................................................ - // cmge v28.4s, v15.4s, v30.4s // .......................................................*........................................................|..............................................................*....................................................... 
- // sub v28.4s, v27.4s, v28.4s // ...........................................................*....................................................|..................................................................*................................................... - // mls v15.4s, v28.4s, v8.4s // .................................................................*..............................................|........................................................................*............................................. - // cmge v27.4s, v31.4s, v16.4s // ................................................................*...............................................|.......................................................................*.............................................. - // cmge v28.4s, v16.4s, v30.4s // ..............................................................*.................................................|.....................................................................*................................................ - // sub v28.4s, v27.4s, v28.4s // ........................................................................*.......................................|...............................................................................*...................................... - // mls v16.4s, v28.4s, v8.4s // ................................................................................*...............................|.......................................................................................*.............................. - // str q13, [x0, #(4*(1024/8))] // ...................................................................................*............................|..........................................................................................*........................... 
- // str q14, [x0, #(5*(1024/8))] // ...........................................................................................*....................|..................................................................................................*................... - // str q15, [x0, #(6*(1024/8))] // ......................................................................*.........................................|.............................................................................*........................................ - // str q16, [x0, #(7*(1024/8))] // ..................................................................................................*.............|.........................................................................................................*............ - // mul v13.4s, v9.4s, v25.4s // .............................................*..................................................................|....................................................*................................................................. - // sqrdmulh v9.4s, v9.4s, v26.4s // ....................................*...........................................................................|...........................................*.......................................................................... - // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*............................................|..........................................................................*........................................... - // mul v14.4s, v10.4s, v25.4s // ............................................................*...................................................|...................................................................*.................................................. 
- // sqrdmulh v10.4s, v10.4s, v26.4s // .............................................................*..................................................|....................................................................*................................................. - // mls v14.4s, v10.4s, v8.s[0] // ..................................................................*.............................................|.........................................................................*............................................ - // mul v15.4s, v11.4s, v25.4s // ...............................................................*................................................|......................................................................*............................................... - // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................*...................................................................|...................................................*.................................................................. - // mls v15.4s, v11.4s, v8.s[0] // .....................................................................*..........................................|............................................................................*......................................... - // mul v16.4s, v12.4s, v25.4s // .......................................................................*........................................|..............................................................................*....................................... - // sqrdmulh v12.4s, v12.4s, v26.4s // ..................................................*.............................................................|.........................................................*............................................................ 
- // mls v16.4s, v12.4s, v8.s[0] // .............................................................................*..................................|....................................................................................*................................. - // cmge v27.4s, v31.4s, v13.4s // .........................................................................*......................................|................................................................................*..................................... - // cmge v28.4s, v13.4s, v30.4s // ..........................................................................*.....................................|.................................................................................*.................................... - // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.................................|.....................................................................................*................................ - // mls v13.4s, v28.4s, v8.4s // ............................................................................................*...................|...................................................................................................*.................. - // cmge v27.4s, v31.4s, v14.4s // ...........................................................................*....................................|..................................................................................*................................... - // cmge v28.4s, v14.4s, v30.4s // ............................................................................*...................................|...................................................................................*.................................. 
- // sub v28.4s, v27.4s, v28.4s // .................................................................................*..............................|........................................................................................*............................. - // mls v14.4s, v28.4s, v8.4s // .............................................................................................*..................|....................................................................................................*................. - // cmge v27.4s, v31.4s, v15.4s // ....................................................................................*...........................|...........................................................................................*.......................... - // cmge v28.4s, v15.4s, v30.4s // .....................................................................................*..........................|............................................................................................*......................... - // sub v28.4s, v27.4s, v28.4s // ..........................................................................................*.....................|.................................................................................................*.................... - // mls v15.4s, v28.4s, v8.4s // ................................................................................................*...............|.......................................................................................................*.............. - // cmge v27.4s, v31.4s, v16.4s // .........................................................................................*......................|................................................................................................*..................... 
- // cmge v28.4s, v16.4s, v30.4s // ........................................................................................*.......................|...............................................................................................*...................... - // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*................|......................................................................................................*............... - // mls v16.4s, v28.4s, v8.4s // .....................................................................................................*..........|............................................................................................................*......... - // str q13, [x0], #(16) // ......................................................................................................*.........|.............................................................................................................*........ - // str q14, [x0, #(-16 + 1*(1024/8))] // .........................................................................................................*......|................................................................................................................*..... - // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................................*...|...................................................................................................................*.. 
- // str q16, [x0, #(-16 + 3*(1024/8))] // ..............................................................................................................*.|.....................................................................................................................* + // gap // ........................................................................................................................ + mls v21.4S, v15.4S, v8.4S // ...............................................................................*........................................ + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v12.4S, v2.S[3] // ....................e................................................................................................... + add v22.4S, v4.4S, v29.4S // ..............e......................................................................................................... + str q27, [x0, #368] // .......................................................................................................................* + mul v19.4S, v13.4S, v3.S[0] // ..........................e............................................................................................. + add v15.4S, v28.4S, v23.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + str q18, [x0, #880] // .......................................................................................*................................ + // gap // ........................................................................................................................ 
+ add v5.4S, v5.4S, v17.4S // ........................e............................................................................................... + mul v24.4S, v12.4S, v2.S[2] // .....................e.................................................................................................. + str q21, [x0, #752] // ......................................................................................*................................. + + // --------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q9, [x0, #0] // .....................................................................................................................*....................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ..........................e..........................................................................................'............................~.......................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // .................................................................................................e...................'...................................................................................................~................... 
+ // ldr q12, [x0, #(3*(1024/8))] // e....................................................................................................................'..~.................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // .........e...........................................................................................................'...........~........................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ......e..............................................................................................................'........~.............................................................................................................. + // ldr q15, [x0, #(6*(1024/8))] // ...e.................................................................................................................'.....~................................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // ........................................................................e............................................'..........................................................................~............................................ + // sub v24.4s, v9.4s, v10.4s // ...............~.....................................................................................................'.................*..................................................................................................... + // add v9.4s, v9.4s, v10.4s // ...........~.........................................................................................................'.............*......................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[3] // .....................~...............................................................................................'.......................*............................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ...................~.................................................................................................'.....................*................................................................................................. + // mls v10.4s, v27.4s, v8.s[0] // ...........................~.........................................................................................'.............................*......................................................................................... + // sub v24.4s, v11.4s, v12.4s // .....................................................................................................................'.*..................................................................................................................... + // add v11.4s, v11.4s, v12.4s // .............................................................................................................e.......'...............................................................................................................~....... + // sqrdmulh v27.4s, v24.4s, v2.s[1] // .....~...............................................................................................................'.......*............................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ....~................................................................................................................'......*................................................................................................................ 
+ // mls v12.4s, v27.4s, v8.s[0] // .............~.......................................................................................................'...............*....................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ......................................................................................................e..............'........................................................................................................~.............. + // add v13.4s, v13.4s, v14.4s // ................................................................................................................e....'..................................................................................................................~.... + // sqrdmulh v27.4s, v24.4s, v2.s[3] // ............................................................................................................e........'..............................................................................................................~........ + // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................e.'.....................................................................................................................~. + // mls v14.4s, v27.4s, v8.s[0] // ........~............................................................................................................'..........*............................................................................................................ + // sub v24.4s, v15.4s, v16.4s // .........................................................................................................e...........'...........................................................................................................~........... 
+ // add v15.4s, v15.4s, v16.4s // ..................................................................................................................e..'....................................................................................................................~.. + // sqrdmulh v27.4s, v24.4s, v3.s[1] // .....................................................................................................................'*...................................................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ...............................................................................................................e.....'.................................................................................................................~..... + // mls v16.4s, v27.4s, v8.s[0] // ..........~..........................................................................................................'............*.......................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............~......................................................................................................'................*...................................................................................................... + // add v9.4s, v9.4s, v11.4s // ..................~..................................................................................................'....................*.................................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .........................~...........................................................................................'...........................*........................................................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ......................~..............................................................................................'........................*.............................................................................................. + // mls v11.4s, v27.4s, v8.s[0] // ....................................~................................................................................'......................................*................................................................................ + // sub v24.4s, v10.4s, v12.4s // ......................................~..............................................................................'........................................*.............................................................................. + // add v10.4s, v10.4s, v12.4s // ..................................~..................................................................................'....................................*.................................................................................. + // sqrdmulh v27.4s, v24.4s, v0.s[3] // .................................................~...................................................................'...................................................*................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...........................................~.........................................................................'.............................................*......................................................................... + // mls v12.4s, v27.4s, v8.s[0] // ........................................................~............................................................'..........................................................*............................................................ 
+ // sub v24.4s, v13.4s, v15.4s // ..~..................................................................................................................'....*.................................................................................................................. + // add v13.4s, v13.4s, v15.4s // .~...................................................................................................................'...*................................................................................................................... + // sqrdmulh v27.4s, v24.4s, v1.s[1] // .......~.............................................................................................................'.........*............................................................................................................. + // mul v15.4s, v24.4s, v1.s[0] // ............~........................................................................................................'..............*........................................................................................................ + // mls v15.4s, v27.4s, v8.s[0] // ........................................~............................................................................'..........................................*............................................................................ + // sub v24.4s, v14.4s, v16.4s // .................~...................................................................................................'...................*................................................................................................... + // add v14.4s, v14.4s, v16.4s // ................~....................................................................................................'..................*.................................................................................................... 
+ // sqrdmulh v27.4s, v24.4s, v1.s[1] // .......................~.............................................................................................'.........................*............................................................................................. + // mul v16.4s, v24.4s, v1.s[0] // ....................~................................................................................................'......................*................................................................................................ + // mls v16.4s, v27.4s, v8.s[0] // ...............................~.....................................................................................'.................................*..................................................................................... + // sub v24.4s, v9.4s, v13.4s // ........................~............................................................................................'..........................*............................................................................................ + // add v9.4s, v9.4s, v13.4s // ............................~........................................................................................'..............................*........................................................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..............................~......................................................................................'................................*...................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // .............................~.......................................................................................'...............................*....................................................................................... 
+ // mls v13.4s, v27.4s, v8.s[0] // ...................................~.................................................................................'.....................................*................................................................................. + // sub v24.4s, v10.4s, v14.4s // .......................................~.............................................................................'.........................................*............................................................................. + // add v10.4s, v10.4s, v14.4s // .....................................~...............................................................................'.......................................*............................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .....................................................................~...............................................'.......................................................................*............................................... + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................~.............................................'.........................................................................*............................................. + // mls v14.4s, v27.4s, v8.s[0] // ...............................................................................~.....................................'.................................................................................*..................................... + // sub v24.4s, v11.4s, v15.4s // ..............................................~......................................................................'................................................*...................................................................... 
+ // add v11.4s, v11.4s, v15.4s // ................................................~....................................................................'..................................................*.................................................................... + // sqrdmulh v27.4s, v24.4s, v0.s[1] // .............................................................................~.......................................'...............................................................................*....................................... + // mul v15.4s, v24.4s, v0.s[0] // ..................................................~..................................................................'....................................................*.................................................................. + // mls v15.4s, v27.4s, v8.s[0] // ...................................................................................~.................................'.....................................................................................*................................. + // sub v24.4s, v12.4s, v16.4s // ..............................................................~......................................................'................................................................*...................................................... + // add v12.4s, v12.4s, v16.4s // ....................................................................~................................................'......................................................................*................................................ + // sqrdmulh v27.4s, v24.4s, v0.s[1] // ..................................................................~..................................................'....................................................................*.................................................. 
+ // mul v16.4s, v24.4s, v0.s[0] // ...................................................................~.................................................'.....................................................................*................................................. + // mls v16.4s, v27.4s, v8.s[0] // ..............................................................................~......................................'................................................................................*...................................... + // cmge v27.4s, v31.4s, v13.4s // .............................................~.......................................................................'...............................................*....................................................................... + // cmge v28.4s, v13.4s, v30.4s // .......................................................~.............................................................'.........................................................*............................................................. + // sub v28.4s, v27.4s, v28.4s // ............................................................~........................................................'..............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................................................................~....................................................'..................................................................*.................................................... + // cmge v27.4s, v31.4s, v14.4s // .....................................................................................~...............................'.......................................................................................*............................... 
+ // cmge v28.4s, v14.4s, v30.4s // ....................................................................................~................................'......................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................................~...........................'...........................................................................................*........................... + // mls v14.4s, v28.4s, v8.4s // .............................................................................................~.......................'...............................................................................................*....................... + // cmge v27.4s, v31.4s, v15.4s // ...............................................................................................~.....................'.................................................................................................*..................... + // cmge v28.4s, v15.4s, v30.4s // ............................................................................................~........................'..............................................................................................*........................ + // sub v28.4s, v27.4s, v28.4s // .....................................................................................................~...............'.......................................................................................................*............... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................~.........'.............................................................................................................*......... 
+ // cmge v27.4s, v31.4s, v16.4s // .......................................................................................~.............................'.........................................................................................*............................. + // cmge v28.4s, v16.4s, v30.4s // ...........................................................................................~.........................'.............................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................~..................'....................................................................................................*.................. + // mls v16.4s, v28.4s, v8.4s // ........................................................................................................~............'..........................................................................................................*............ + // str q13, [x0, #(4*(1024/8))] // ......................................................................~..............................................'........................................................................*.............................................. + // str q14, [x0, #(5*(1024/8))] // .......................................................................................................~.............'.........................................................................................................*............. 
+ // str q15, [x0, #(6*(1024/8))] // ....................................................................................................................~'......................................................................................................................* + // str q16, [x0, #(7*(1024/8))] // .................................................................................................................~...'...................................................................................................................*... + // sqrdmulh v27.4s, v9.4s, v26.4s // ................................~....................................................................................'..................................*.................................................................................... + // mul v9.4s, v9.4s, v25.4s // .................................~...................................................................................'...................................*................................................................................... + // mls v9.4s, v27.4s, v8.s[0] // ............................................~........................................................................'..............................................*........................................................................ + // sqrdmulh v27.4s, v10.4s, v26.4s // ..........................................~..........................................................................'............................................*.......................................................................... + // mul v10.4s, v10.4s, v25.4s // .........................................~...........................................................................'...........................................*........................................................................... 
+ // mls v10.4s, v27.4s, v8.s[0] // ...............................................~.....................................................................'.................................................*..................................................................... + // sqrdmulh v27.4s, v11.4s, v26.4s // ....................................................~................................................................'......................................................*................................................................ + // mul v11.4s, v11.4s, v25.4s // ...................................................~.................................................................'.....................................................*................................................................. + // mls v11.4s, v27.4s, v8.s[0] // ..........................................................~..........................................................'............................................................*.......................................................... + // sqrdmulh v27.4s, v12.4s, v26.4s // ..........................................................................~..........................................'............................................................................*.......................................... + // mul v12.4s, v12.4s, v25.4s // .........................................................................~...........................................'...........................................................................*........................................... + // mls v12.4s, v27.4s, v8.s[0] // ..................................................................................~..................................'....................................................................................*.................................. 
+ // cmge v27.4s, v31.4s, v9.4s // ...........................................................~.........................................................'.............................................................*......................................................... + // cmge v28.4s, v9.4s, v30.4s // .............................................................~.......................................................'...............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................~...................................'...................................................................................*................................... + // mls v9.4s, v28.4s, v8.4s // ......................................................................................~..............................'........................................................................................*.............................. + // cmge v27.4s, v31.4s, v10.4s // ......................................................~..............................................................'........................................................*.............................................................. + // cmge v28.4s, v10.4s, v30.4s // .....................................................~...............................................................'.......................................................*............................................................... + // sub v28.4s, v27.4s, v28.4s // .........................................................~...........................................................'...........................................................*........................................................... 
+ // mls v10.4s, v28.4s, v8.4s // ...............................................................~.....................................................'.................................................................*..................................................... + // cmge v27.4s, v31.4s, v11.4s // .................................................................~...................................................'...................................................................*................................................... + // cmge v28.4s, v11.4s, v30.4s // ............................................................................~........................................'..............................................................................*........................................ + // sub v28.4s, v27.4s, v28.4s // ................................................................................~....................................'..................................................................................*.................................... + // mls v11.4s, v28.4s, v8.4s // ................................................................................................~....................'..................................................................................................*.................... + // cmge v27.4s, v31.4s, v12.4s // ..........................................................................................~..........................'............................................................................................*.......................... + // cmge v28.4s, v12.4s, v30.4s // ........................................................................................~............................'..........................................................................................*............................ 
+ // sub v28.4s, v27.4s, v28.4s // ..............................................................................................~......................'................................................................................................*...................... + // mls v12.4s, v28.4s, v8.4s // ....................................................................................................~................'......................................................................................................*................ + // str q9, [x0], #(16) // ...................................................................................................~.................'.....................................................................................................*................. + // str q10, [x0, #(-16 + 1*(1024/8))] // ...........................................................................~.........................................'.............................................................................*......................................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ..........................................................................................................~..........'............................................................................................................*.......... + // str q12, [x0, #(-16 + 3*(1024/8))] // ..............................................................................................................~......'................................................................................................................*...... sub count, count, #1 cbnz count, layer123_start - // gap // ...................................................................................................... - // gap // ...................................................................................................... 
- sqrdmulh v12.4S, v12.4S, v2.S[1] // *..................................................................................................... - sub v23.4S, v23.4S, v6.4S // .*.................................................................................................... - sqrdmulh v5.4S, v17.4S, v2.S[3] // ...*.................................................................................................. - mul v7.4S, v17.4S, v2.S[2] // ..*................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v20.4S, v23.4S, v3.S[1] // .........*............................................................................................ - // gap // ...................................................................................................... - mul v17.4S, v23.4S, v3.S[0] // ....*................................................................................................. - add v9.4S, v14.4S, v27.4S // ...............*...................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v13.4S, v12.4S, v8.S[0] // ......*............................................................................................... - add v10.4S, v10.4S, v18.4S // .....*................................................................................................ 
- // gap // ...................................................................................................... - mls v7.4S, v5.4S, v8.S[0] // ........*............................................................................................. - // gap // ...................................................................................................... - sub v23.4S, v14.4S, v27.4S // .......*.............................................................................................. - // gap // ...................................................................................................... - mls v17.4S, v20.4S, v8.S[0] // ..............*....................................................................................... - // gap // ...................................................................................................... - sub v27.4S, v10.4S, v28.4S // ............*......................................................................................... - sub v11.4S, v16.4S, v13.4S // .............*........................................................................................ - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v12.4S, v23.4S, v0.S[3] // ...........*.......................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - add v13.4S, v16.4S, v13.4S // ......................*............................................................................... - // gap // ...................................................................................................... 
- // gap // ...................................................................................................... - mul v5.4S, v11.4S, v0.S[2] // .................*.................................................................................... - sub v20.4S, v7.4S, v17.4S // .....................*................................................................................ - mul v23.4S, v23.4S, v0.S[2] // ..........*........................................................................................... - sqrdmulh v15.4S, v11.4S, v0.S[3] // ..................*................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mul v29.4S, v20.4S, v1.S[0] // ...........................*.......................................................................... - // gap // ...................................................................................................... - sqrdmulh v19.4S, v20.4S, v1.S[1] // ............................*......................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - add v17.4S, v7.4S, v17.4S // .........................*............................................................................ - // gap // ...................................................................................................... - mul v4.4S, v27.4S, v1.S[0] // ...................*.................................................................................. 
- sqrdmulh v11.4S, v27.4S, v1.S[1] // ....................*................................................................................. - mls v5.4S, v15.4S, v8.S[0] // .......................*.............................................................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - add v15.4S, v13.4S, v17.4S // ..................................*................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v29.4S, v19.4S, v8.S[0] // ....................................*................................................................. - add v22.4S, v10.4S, v28.4S // ..........................*........................................................................... - mls v23.4S, v12.4S, v8.S[0] // ................*..................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sub v13.4S, v13.4S, v17.4S // .............................*........................................................................ - mls v4.4S, v11.4S, v8.S[0] // ........................*............................................................................. - // gap // ...................................................................................................... 
- // gap // ...................................................................................................... - sqrdmulh v11.4S, v15.4S, v26.4S // .................................................................*.................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - add v24.4S, v5.4S, v29.4S // .............................................*........................................................ - add v7.4S, v9.4S, v22.4S // ..............................*....................................................................... - sub v27.4S, v9.4S, v22.4S // ...............................*...................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mul v10.4S, v24.4S, v25.4S // ...........................................................................*.......................... - sqrdmulh v20.4S, v24.4S, v26.4S // ......................................................*............................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mul v17.4S, v7.4S, v25.4S // .................................................*.................................................... - sub v22.4S, v23.4S, v4.4S // .......................................*.............................................................. 
- // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v12.4S, v7.4S, v26.4S // ........................................*............................................................. - mul v6.4S, v27.4S, v0.S[0] // ...................................*.................................................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v9.4S, v22.4S, v0.S[1] // ..............................................*....................................................... - // gap // ...................................................................................................... - mls v10.4S, v20.4S, v8.S[0] // .................................................................................*.................... - // gap // ...................................................................................................... - mul v14.4S, v22.4S, v0.S[0] // ..........................................*........................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sub v5.4S, v5.4S, v29.4S // ...........................................*.......................................................... - sqrdmulh v16.4S, v27.4S, v0.S[1] // .....................................*................................................................ 
- mls v17.4S, v12.4S, v8.S[0] // .......................................................................*.............................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - cmge v12.4S, v31.4S, v10.4S // .........................................................................................*............ - cmge v20.4S, v10.4S, v30.4S // ........................................................................................*............. - mls v14.4S, v9.4S, v8.S[0] // ....................................................*................................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v24.4S, v5.4S, v0.S[1] // .....................................................*................................................ - cmge v22.4S, v31.4S, v17.4S // .............................................................................*........................ - // gap // ...................................................................................................... - sub v12.4S, v12.4S, v20.4S // ..............................................................................................*....... - // gap // ...................................................................................................... 
- cmge v20.4S, v17.4S, v30.4S // ..............................................................................*....................... - add v7.4S, v23.4S, v4.4S // .........................................*............................................................ - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v10.4S, v12.4S, v8.4S // .................................................................................................*.... - cmge v23.4S, v31.4S, v14.4S // ..........................................................*........................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v6.4S, v16.4S, v8.S[0] // ............................................*......................................................... - cmge v29.4S, v14.4S, v30.4S // ...........................................................*.......................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sub v4.4S, v22.4S, v20.4S // ..................................................................................*................... - mul v18.4S, v7.4S, v25.4S // ...................................................................*.................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... 
- str q10, [x0, #384] // .....................................................................................................* - mul v20.4S, v15.4S, v25.4S // ................................................................*..................................... - // gap // ...................................................................................................... - sub v28.4S, v23.4S, v29.4S // ...............................................................*...................................... - cmge v9.4S, v31.4S, v6.4S // ..................................................*................................................... - mul v16.4S, v5.4S, v0.S[0] // ...............................................*...................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v17.4S, v4.4S, v8.4S // ............................................................................................*......... - cmge v19.4S, v6.4S, v30.4S // ...................................................*.................................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - sqrdmulh v12.4S, v7.4S, v26.4S // ................................................*..................................................... - // gap // ...................................................................................................... - mls v20.4S, v11.4S, v8.S[0] // ......................................................................*............................... 
- // gap // ...................................................................................................... - sub v22.4S, v9.4S, v19.4S // .......................................................*.............................................. - sqrdmulh v9.4S, v13.4S, v0.S[1] // .................................*.................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v14.4S, v28.4S, v8.4S // .....................................................................*................................ - mul v27.4S, v13.4S, v0.S[0] // ................................*..................................................................... - str q17, [x0], #(16) // ..................................................................................................*... - // gap // ...................................................................................................... - mls v6.4S, v22.4S, v8.4S // .............................................................*........................................ - mls v18.4S, v12.4S, v8.S[0] // .........................................................................*............................ - // gap // ...................................................................................................... - // gap // ...................................................................................................... - cmge v4.4S, v31.4S, v20.4S // ...............................................................................*...................... - mls v16.4S, v24.4S, v8.S[0] // ............................................................*......................................... 
- // gap // ...................................................................................................... - // gap // ...................................................................................................... - cmge v13.4S, v20.4S, v30.4S // ................................................................................*..................... - mls v27.4S, v9.4S, v8.S[0] // ......................................*............................................................... - str q14, [x0, #752] // ..........................................................................*........................... - // gap // ...................................................................................................... - cmge v22.4S, v31.4S, v18.4S // ......................................................................................*............... - str q6, [x0, #496] // .....................................................................................*................ - cmge v12.4S, v18.4S, v30.4S // .......................................................................................*.............. - // gap // ...................................................................................................... - sub v13.4S, v4.4S, v13.4S // ....................................................................................*................. - cmge v21.4S, v31.4S, v16.4S // ....................................................................*................................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - cmge v15.4S, v16.4S, v30.4S // ..................................................................*................................... 
- cmge v4.4S, v31.4S, v27.4S // .........................................................*............................................ - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - cmge v24.4S, v27.4S, v30.4S // ........................................................*............................................. - sub v12.4S, v22.4S, v12.4S // ..........................................................................................*........... - mls v20.4S, v13.4S, v8.4S // .............................................................................................*........ - sub v23.4S, v21.4S, v15.4S // ............................................................................*......................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v18.4S, v12.4S, v8.4S // ...............................................................................................*...... - sub v24.4S, v4.4S, v24.4S // ..............................................................*....................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - mls v16.4S, v23.4S, v8.4S // ...................................................................................*.................. 
- // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - str q20, [x0, #112] // ...................................................................................................*.. - mls v27.4S, v24.4S, v8.4S // ........................................................................*............................. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - str q18, [x0, #240] // ....................................................................................................*. - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - str q16, [x0, #880] // ................................................................................................*..... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - str q27, [x0, #624] // ...........................................................................................*.......... 
- // gap // ...................................................................................................... - // gap // ...................................................................................................... - // gap // ...................................................................................................... - - // original source code - // sqrdmulh v7.4S, v12.4S, v2.S[1] // *..................................................................................................... - // sub v9.4S, v23.4S, v6.4S // .*.................................................................................................... - // mul v24.4S, v17.4S, v2.S[2] // ...*.................................................................................................. - // sqrdmulh v17.4S, v17.4S, v2.S[3] // ..*................................................................................................... - // mul v20.4S, v9.4S, v3.S[0] // .....*................................................................................................ - // add v15.4S, v10.4S, v18.4S // ........*............................................................................................. - // mls v13.4S, v7.4S, v8.S[0] // .......*.............................................................................................. - // sub v23.4S, v14.4S, v27.4S // ..........*........................................................................................... - // mls v24.4S, v17.4S, v8.S[0] // .........*............................................................................................ - // sqrdmulh v18.4S, v9.4S, v3.S[1] // ....*................................................................................................. - // mul v22.4S, v23.4S, v0.S[2] // ..................*................................................................................... 
- // sqrdmulh v5.4S, v23.4S, v0.S[3] // ..............*....................................................................................... - // sub v12.4S, v15.4S, v28.4S // ............*......................................................................................... - // sub v17.4S, v16.4S, v13.4S // .............*........................................................................................ - // mls v20.4S, v18.4S, v8.S[0] // ...........*.......................................................................................... - // add v10.4S, v14.4S, v27.4S // ......*............................................................................................... - // mls v22.4S, v5.4S, v8.S[0] // .............................*........................................................................ - // mul v9.4S, v17.4S, v0.S[2] // ................*..................................................................................... - // sqrdmulh v18.4S, v17.4S, v0.S[3] // ...................*.................................................................................. - // mul v23.4S, v12.4S, v1.S[0] // .......................*.............................................................................. - // sqrdmulh v29.4S, v12.4S, v1.S[1] // ........................*............................................................................. - // sub v14.4S, v24.4S, v20.4S // .................*.................................................................................... - // add v4.4S, v16.4S, v13.4S // ...............*...................................................................................... - // mls v9.4S, v18.4S, v8.S[0] // .........................*............................................................................ - // mls v23.4S, v29.4S, v8.S[0] // ...............................*...................................................................... 
- // add v18.4S, v24.4S, v20.4S // ......................*............................................................................... - // add v20.4S, v15.4S, v28.4S // ............................*......................................................................... - // mul v13.4S, v14.4S, v1.S[0] // ....................*................................................................................. - // sqrdmulh v15.4S, v14.4S, v1.S[1] // .....................*................................................................................ - // sub v17.4S, v4.4S, v18.4S // ..............................*....................................................................... - // add v14.4S, v10.4S, v20.4S // ..................................*................................................................... - // sub v29.4S, v10.4S, v20.4S // ...................................*.................................................................. - // mul v12.4S, v17.4S, v0.S[0] // ..........................................................................*........................... - // sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................................*............................. - // add v27.4S, v4.4S, v18.4S // ..........................*........................................................................... - // mul v18.4S, v29.4S, v0.S[0] // .........................................*............................................................ - // mls v13.4S, v15.4S, v8.S[0] // ...........................*.......................................................................... - // sqrdmulh v24.4S, v29.4S, v0.S[1] // ..............................................*....................................................... - // mls v12.4S, v20.4S, v8.S[0] // .................................................................................*.................... 
- // sub v4.4S, v22.4S, v23.4S // .......................................*.............................................................. - // sqrdmulh v5.4S, v14.4S, v26.4S // ........................................*............................................................. - // add v17.4S, v22.4S, v23.4S // .......................................................*.............................................. - // mul v20.4S, v4.4S, v0.S[0] // ............................................*......................................................... - // sub v28.4S, v9.4S, v13.4S // .............................................*........................................................ - // mls v18.4S, v24.4S, v8.S[0] // ..........................................................*........................................... - // add v23.4S, v9.4S, v13.4S // .................................*.................................................................... - // sqrdmulh v6.4S, v4.4S, v0.S[1] // ..........................................*........................................................... - // mul v22.4S, v28.4S, v0.S[0] // ..................................................................*................................... - // sqrdmulh v29.4S, v17.4S, v26.4S // .....................................................................*................................ - // mul v13.4S, v14.4S, v25.4S // ......................................*............................................................... - // cmge v15.4S, v31.4S, v18.4S // .................................................................*.................................... - // cmge v21.4S, v18.4S, v30.4S // ....................................................................*................................. - // mls v20.4S, v6.4S, v8.S[0] // ..................................................*................................................... 
- // sqrdmulh v9.4S, v28.4S, v0.S[1] // ...................................................*.................................................. - // sqrdmulh v28.4S, v23.4S, v26.4S // .....................................*................................................................ - // sub v24.4S, v15.4S, v21.4S // .......................................................................*.............................. - // cmge v4.4S, v12.4S, v30.4S // ..........................................................................................*........... - // cmge v6.4S, v31.4S, v12.4S // .........................................................................................*............ - // cmge v21.4S, v31.4S, v20.4S // .........................................................*............................................ - // cmge v10.4S, v20.4S, v30.4S // ...........................................................*.......................................... - // mls v22.4S, v9.4S, v8.S[0] // ...............................................................................*...................... - // mls v18.4S, v24.4S, v8.4S // ............................................................................*......................... - // sub v15.4S, v6.4S, v4.4S // ...............................................................................................*...... - // sub v21.4S, v21.4S, v10.4S // ................................................................*..................................... - // mul v4.4S, v27.4S, v25.4S // ...............................................................*...................................... - // sqrdmulh v14.4S, v27.4S, v26.4S // ................................*..................................................................... - // cmge v24.4S, v22.4S, v30.4S // ........................................................................................*............. 
- // mul v9.4S, v17.4S, v25.4S // .............................................................*........................................ - // cmge v6.4S, v31.4S, v22.4S // .......................................................................................*.............. - // mls v20.4S, v21.4S, v8.4S // .........................................................................*............................ - // mls v4.4S, v14.4S, v8.S[0] // ......................................................................*............................... - // mls v13.4S, v5.4S, v8.S[0] // ...............................................*...................................................... - // mls v12.4S, v15.4S, v8.4S // ..................................................................................................*... - // mls v9.4S, v29.4S, v8.S[0] // .............................................................................*........................ - // str q20, [x0, #768] // ..................................................................................*................... - // mul v20.4S, v23.4S, v25.4S // ....................................*................................................................. - // sub v23.4S, v6.4S, v24.4S // .............................................................................................*........ - // cmge v29.4S, v31.4S, v13.4S // ....................................................*................................................. - // cmge v21.4S, v13.4S, v30.4S // ......................................................*............................................... - // cmge v15.4S, v31.4S, v4.4S // ..............................................................................*....................... - // cmge v5.4S, v4.4S, v30.4S // ................................................................................*..................... 
- // mls v20.4S, v28.4S, v8.S[0] // ...........................................*.......................................................... - // sub v29.4S, v29.4S, v21.4S // ............................................................*......................................... - // mls v22.4S, v23.4S, v8.4S // ................................................................................................*..... - // sub v21.4S, v15.4S, v5.4S // ......................................................................................*............... - // str q18, [x0, #512] // ....................................................................................*................. - // cmge v27.4S, v31.4S, v9.4S // ...................................................................................*.................. - // cmge v28.4S, v9.4S, v30.4S // .....................................................................................*................ - // cmge v17.4S, v20.4S, v30.4S // .................................................*.................................................... - // cmge v6.4S, v31.4S, v20.4S // ................................................*..................................................... - // sub v15.4S, v27.4S, v28.4S // ...........................................................................................*.......... - // str q12, [x0, #640] // .....................................................................................................* - // mls v13.4S, v29.4S, v8.4S // ...................................................................*.................................. - // mls v4.4S, v21.4S, v8.4S // ............................................................................................*......... - // sub v5.4S, v6.4S, v17.4S // .....................................................*................................................ 
- // mls v9.4S, v15.4S, v8.4S // ..............................................................................................*....... - // str q22, [x0, #896] // ....................................................................................................*. - // mls v20.4S, v5.4S, v8.4S // ........................................................*............................................. - // str q13, [x0], #(16) // ...........................................................................*.......................... - // str q4, [x0, #112] // .................................................................................................*.... - // str q9, [x0, #240] // ...................................................................................................*.. - // str q20, [x0, #368] // ..............................................................*....................................... + // Instructions: 105 + // Expected cycles: 52 + // Expected IPC: 2.02 + // + // Wall time: 24.05s + // User time: 24.05s + // + // ------------------------------------------ original position -------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---- + sub v7.4S, v4.4S, v29.4S // ..*...................................................................................................... + ldr q12, [x0, #0] // *........................................................................................................ + sqrdmulh v16.4S, v13.4S, v3.S[1] // .*....................................................................................................... + // gap // ......................................................................................................... + sub v11.4S, v15.4S, v5.4S // ....*.................................................................................................... 
+ // gap // ......................................................................................................... + // gap // ......................................................................................................... + add v23.4S, v15.4S, v5.4S // ...*..................................................................................................... + mul v9.4S, v7.4S, v2.S[0] // .....*................................................................................................... + mls v24.4S, v6.4S, v8.S[0] // ........*................................................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v19.4S, v16.4S, v8.S[0] // .........*............................................................................................... + sqrdmulh v27.4S, v11.4S, v1.S[1] // .......*................................................................................................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + add v29.4S, v12.4S, v20.4S // ..........*.............................................................................................. + sub v28.4S, v12.4S, v20.4S // ..............*.......................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... 
+ mul v18.4S, v11.4S, v1.S[0] // ...........*............................................................................................. + sqrdmulh v10.4S, v7.4S, v2.S[1] // ......*.................................................................................................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mul v12.4S, v28.4S, v1.S[2] // ..................*...................................................................................... + // gap // ......................................................................................................... + sub v13.4S, v29.4S, v22.4S // .............*........................................................................................... + // gap // ......................................................................................................... + add v17.4S, v29.4S, v22.4S // .................*....................................................................................... + sqrdmulh v21.4S, v28.4S, v1.S[3] // ....................*.................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sqrdmulh v7.4S, v13.4S, v0.S[3] // ........................*................................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... 
+ mul v4.4S, v13.4S, v0.S[2] // .....................*................................................................................... + sub v22.4S, v17.4S, v23.4S // .......................*................................................................................. + sub v29.4S, v24.4S, v19.4S // ................*........................................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v9.4S, v10.4S, v8.S[0] // ............*............................................................................................ + mls v12.4S, v21.4S, v8.S[0] // .........................*............................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v18.4S, v27.4S, v8.S[0] // ......................................*.................................................................. + mls v4.4S, v7.4S, v8.S[0] // ..................................*...................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sqrdmulh v16.4S, v29.4S, v1.S[1] // ......................*.................................................................................. + add v11.4S, v17.4S, v23.4S // ..........................*.............................................................................. 
+ // gap // ......................................................................................................... + // gap // ......................................................................................................... + sub v14.4S, v12.4S, v9.4S // ....................................*.................................................................... + add v20.4S, v12.4S, v9.4S // ................................*........................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sub v5.4S, v4.4S, v18.4S // ............................................*............................................................ + mul v28.4S, v29.4S, v1.S[0] // ...................*..................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mul v21.4S, v14.4S, v0.S[2] // .........................................*............................................................... + sqrdmulh v15.4S, v14.4S, v0.S[3] // ...............................................*......................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sqrdmulh v10.4S, v5.4S, v0.S[1] // ..........................................................................*.............................. 
+ mul v13.4S, v5.4S, v0.S[0] // ................................................*........................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v28.4S, v16.4S, v8.S[0] // .............................*........................................................................... + // gap // ......................................................................................................... + mul v6.4S, v11.4S, v25.4S // ...............................*......................................................................... + // gap // ......................................................................................................... + mls v21.4S, v15.4S, v8.S[0] // ......................................................*.................................................. + add v12.4S, v4.4S, v18.4S // ..............................................*.......................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v13.4S, v10.4S, v8.S[0] // ................................................................................*........................ + // gap // ......................................................................................................... + add v14.4S, v24.4S, v19.4S // ...............*......................................................................................... + // gap // ......................................................................................................... 
+ sqrdmulh v7.4S, v12.4S, v26.4S // ..................................................*...................................................... + mul v5.4S, v12.4S, v25.4S // .................................................*....................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sub v10.4S, v21.4S, v28.4S // ............................................................*............................................ + add v28.4S, v21.4S, v28.4S // ..................................................................*...................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v9.4S, v31.4S, v13.4S // ............................................................................................*............ + cmge v17.4S, v13.4S, v30.4S // .........................................................................................*............... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mul v15.4S, v28.4S, v25.4S // ......................................................................*.................................. + sqrdmulh v23.4S, v28.4S, v26.4S // .......................................................................*................................. + // gap // ......................................................................................................... 
+ // gap // ......................................................................................................... + sub v16.4S, v9.4S, v17.4S // .................................................................................................*....... + mls v5.4S, v7.4S, v8.S[0] // ........................................................*................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + sqrdmulh v21.4S, v10.4S, v0.S[1] // ................................................................*........................................ + add v9.4S, v20.4S, v14.4S // ...................................*..................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v13.4S, v16.4S, v8.4S // .....................................................................................................*... + mls v15.4S, v23.4S, v8.S[0] // ...............................................................................*......................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v29.4S, v5.4S, v30.4S // .........................................................................*............................... + sub v27.4S, v20.4S, v14.4S // .....................................*................................................................... 
+ // gap // ......................................................................................................... + // gap // ......................................................................................................... + sqrdmulh v16.4S, v11.4S, v26.4S // ..............................*.......................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v18.4S, v31.4S, v5.4S // ...............................................................*......................................... + str q13, [x0, #768] // ........................................................................................................* + cmge v23.4S, v31.4S, v15.4S // .......................................................................................*................. + // gap // ......................................................................................................... + cmge v17.4S, v15.4S, v30.4S // .....................................................................................*................... + sub v29.4S, v18.4S, v29.4S // .............................................................................*........................... + mul v19.4S, v10.4S, v0.S[0] // .................................................................*....................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v6.4S, v16.4S, v8.S[0] // ..........................................*.............................................................. 
+ // gap // ......................................................................................................... + sub v17.4S, v23.4S, v17.4S // ...........................................................................................*............. + // gap // ......................................................................................................... + sqrdmulh v14.4S, v9.4S, v26.4S // ........................................*................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v5.4S, v29.4S, v8.4S // .............................................................................................*........... + mls v15.4S, v17.4S, v8.4S // ................................................................................................*........ + mls v19.4S, v21.4S, v8.S[0] // ...........................................................................*............................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mul v10.4S, v22.4S, v0.S[0] // ...........................*............................................................................. + sqrdmulh v24.4S, v22.4S, v0.S[1] // ............................*............................................................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... 
+ str q5, [x0, #256] // ....................................................................................................*.... + mul v20.4S, v9.4S, v25.4S // .......................................*................................................................. + mul v18.4S, v27.4S, v0.S[0] // .....................................................................*................................... + // gap // ......................................................................................................... + cmge v9.4S, v19.4S, v30.4S // ........................................................................................*................ + str q15, [x0, #384] // ......................................................................................................*.. + // gap // ......................................................................................................... + cmge v11.4S, v31.4S, v19.4S // ....................................................................................*.................... + mls v10.4S, v24.4S, v8.S[0] // .................................*....................................................................... + // gap // ......................................................................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ...................................................................*..................................... + // gap // ......................................................................................................... + sub v15.4S, v11.4S, v9.4S // ..............................................................................................*.......... + mls v20.4S, v14.4S, v8.S[0] // .............................................*........................................................... + // gap // ......................................................................................................... 
+ // gap // ......................................................................................................... + cmge v22.4S, v31.4S, v6.4S // .........................................................*............................................... + cmge v14.4S, v6.4S, v30.4S // ...........................................................*............................................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v19.4S, v15.4S, v8.4S // ...................................................................................................*..... + mls v18.4S, v16.4S, v8.S[0] // ............................................................................*............................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v11.4S, v20.4S, v30.4S // ...................................................*..................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v17.4S, v31.4S, v10.4S // ...........................................*............................................................. + cmge v4.4S, v31.4S, v20.4S // ....................................................*.................................................... + cmge v24.4S, v10.4S, v30.4S // .....................................................*................................................... 
+ // gap // ......................................................................................................... + // gap // ......................................................................................................... + str q19, [x0, #896] // .......................................................................................................*. + sub v29.4S, v22.4S, v14.4S // ..............................................................................*.......................... + // gap // ......................................................................................................... + cmge v16.4S, v18.4S, v30.4S // .................................................................................*....................... + sub v14.4S, v4.4S, v11.4S // .......................................................*................................................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + cmge v21.4S, v31.4S, v18.4S // ..................................................................................*...................... + mls v6.4S, v29.4S, v8.4S // ...................................................................................*..................... + sub v23.4S, v17.4S, v24.4S // ..........................................................*.............................................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v20.4S, v14.4S, v8.4S // .............................................................*........................................... 
+ sub v17.4S, v21.4S, v16.4S // ......................................................................................*.................. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + mls v10.4S, v23.4S, v8.4S // ..............................................................*.......................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + str q6, [x0], #(16) // ...............................................................................................*......... + mls v18.4S, v17.4S, v8.4S // ..........................................................................................*.............. + // gap // ......................................................................................................... + // gap // ......................................................................................................... + str q20, [x0, #112] // ........................................................................*................................ + // gap // ......................................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + str q10, [x0, #496] // ....................................................................*.................................... 
+ // gap // ......................................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + str q18, [x0, #624] // ..................................................................................................*...... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + // gap // ......................................................................................................... + + // --------------------------------------------- new position ---------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---- + // ldr q9, [x0, #0] // .*....................................................................................................... + // sqrdmulh v11.4S, v13.4S, v3.S[1] // ..*...................................................................................................... + // sub v12.4S, v4.4S, v29.4S // *........................................................................................................ + // add v17.4S, v15.4S, v5.4S // ....*.................................................................................................... + // sub v15.4S, v15.4S, v5.4S // ...*..................................................................................................... + // mul v27.4S, v12.4S, v2.S[0] // .....*................................................................................................... 
+ // sqrdmulh v10.4S, v12.4S, v2.S[1] // ............*............................................................................................ + // sqrdmulh v12.4S, v15.4S, v1.S[1] // ........*................................................................................................ + // mls v24.4S, v6.4S, v8.S[0] // ......*.................................................................................................. + // mls v19.4S, v11.4S, v8.S[0] // .......*................................................................................................. + // add v16.4S, v9.4S, v20.4S // .........*............................................................................................... + // mul v13.4S, v15.4S, v1.S[0] // ...........*............................................................................................. + // mls v27.4S, v10.4S, v8.S[0] // .....................*................................................................................... + // sub v7.4S, v16.4S, v22.4S // ..............*.......................................................................................... + // sub v4.4S, v9.4S, v20.4S // ..........*.............................................................................................. + // add v14.4S, v24.4S, v19.4S // ........................................*................................................................ + // sub v11.4S, v24.4S, v19.4S // ....................*.................................................................................... + // add v15.4S, v16.4S, v22.4S // ...............*......................................................................................... + // mul v21.4S, v4.4S, v1.S[2] // .............*........................................................................................... + // mul v24.4S, v11.4S, v1.S[0] // ..............................*.......................................................................... 
+ // sqrdmulh v22.4S, v4.4S, v1.S[3] // ................*........................................................................................ + // mul v16.4S, v7.4S, v0.S[2] // ..................*...................................................................................... + // sqrdmulh v4.4S, v11.4S, v1.S[1] // .........................*............................................................................... + // sub v18.4S, v15.4S, v17.4S // ...................*..................................................................................... + // sqrdmulh v19.4S, v7.4S, v0.S[3] // .................*....................................................................................... + // mls v21.4S, v22.4S, v8.S[0] // ......................*.................................................................................. + // add v11.4S, v15.4S, v17.4S // ..........................*.............................................................................. + // mul v6.4S, v18.4S, v0.S[0] // ......................................................................*.................................. + // sqrdmulh v17.4S, v18.4S, v0.S[1] // .......................................................................*................................. + // mls v24.4S, v4.4S, v8.S[0] // ...................................*..................................................................... + // sqrdmulh v7.4S, v11.4S, v26.4S // .........................................................*............................................... + // mul v22.4S, v11.4S, v25.4S // ....................................*.................................................................... + // add v11.4S, v21.4S, v27.4S // ............................*............................................................................ + // mls v6.4S, v17.4S, v8.S[0] // ..............................................................................*.......................... 
+ // mls v16.4S, v19.4S, v8.S[0] // ........................*................................................................................ + // add v18.4S, v11.4S, v14.4S // ....................................................*.................................................... + // sub v19.4S, v21.4S, v27.4S // ...........................*............................................................................. + // sub v17.4S, v11.4S, v14.4S // ........................................................*................................................ + // mls v13.4S, v12.4S, v8.S[0] // .......................*................................................................................. + // mul v14.4S, v18.4S, v25.4S // .........................................................................*............................... + // sqrdmulh v27.4S, v18.4S, v26.4S // ..................................................................*...................................... + // mul v4.4S, v19.4S, v0.S[2] // ...............................*......................................................................... + // mls v22.4S, v7.4S, v8.S[0] // ................................................................*........................................ + // cmge v12.4S, v31.4S, v6.4S // .......................................................................................*................. + // sub v11.4S, v16.4S, v13.4S // .............................*........................................................................... + // mls v14.4S, v27.4S, v8.S[0] // .................................................................................*....................... + // add v7.4S, v16.4S, v13.4S // ......................................*.................................................................. + // sqrdmulh v27.4S, v19.4S, v0.S[3] // ................................*........................................................................ 
+ // mul v21.4S, v11.4S, v0.S[0] // ..................................*...................................................................... + // mul v9.4S, v7.4S, v25.4S // ..........................................*.............................................................. + // sqrdmulh v16.4S, v7.4S, v26.4S // .........................................*............................................................... + // cmge v7.4S, v14.4S, v30.4S // ......................................................................................*.................. + // cmge v19.4S, v31.4S, v14.4S // ........................................................................................*................ + // cmge v13.4S, v6.4S, v30.4S // .........................................................................................*............... + // mls v4.4S, v27.4S, v8.S[0] // .....................................*................................................................... + // sub v27.4S, v19.4S, v7.4S // .............................................................................................*........... + // mls v9.4S, v16.4S, v8.S[0] // ..................................................*...................................................... + // cmge v16.4S, v31.4S, v22.4S // ..................................................................................*...................... + // sub v18.4S, v12.4S, v13.4S // ................................................................................................*........ + // cmge v13.4S, v22.4S, v30.4S // ...................................................................................*..................... + // sub v12.4S, v4.4S, v24.4S // ...........................................*............................................................. + // mls v14.4S, v27.4S, v8.4S // .................................................................................................*....... 
+ // mls v6.4S, v18.4S, v8.4S // ...................................................................................................*..... + // cmge v7.4S, v31.4S, v9.4S // ..........................................................*.............................................. + // sqrdmulh v15.4S, v12.4S, v0.S[1] // ...................................................*..................................................... + // mul v18.4S, v12.4S, v0.S[0] // ...............................................................*......................................... + // add v10.4S, v4.4S, v24.4S // ............................................*............................................................ + // sqrdmulh v19.4S, v17.4S, v0.S[1] // ...............................................................................*......................... + // str q6, [x0, #512] // .......................................................................................................*. + // mul v6.4S, v17.4S, v0.S[0] // ..........................................................................*.............................. + // mul v27.4S, v10.4S, v25.4S // ...............................................*......................................................... + // sqrdmulh v4.4S, v10.4S, v26.4S // ................................................*........................................................ + // str q14, [x0, #128] // ......................................................................................................*.. + // cmge v14.4S, v9.4S, v30.4S // .......................................................*................................................. + // sqrdmulh v12.4S, v11.4S, v0.S[1] // .................................*....................................................................... + // mls v18.4S, v15.4S, v8.S[0] // .....................................................................*................................... 
+ // mls v6.4S, v19.4S, v8.S[0] // .....................................................................................*................... + // sub v10.4S, v7.4S, v14.4S // ..............................................................*.......................................... + // sub v24.4S, v16.4S, v13.4S // ...........................................................................................*............. + // mls v27.4S, v4.4S, v8.S[0] // ......................................................*.................................................. + // mls v21.4S, v12.4S, v8.S[0] // .......................................*................................................................. + // cmge v16.4S, v6.4S, v30.4S // ............................................................................................*............ + // cmge v13.4S, v31.4S, v6.4S // ..............................................................................................*.......... + // mls v22.4S, v24.4S, v8.4S // ...............................................................................................*......... + // cmge v24.4S, v31.4S, v18.4S // .............................................................................*........................... + // cmge v11.4S, v27.4S, v30.4S // .............................................................*........................................... + // sub v15.4S, v13.4S, v16.4S // ..................................................................................................*...... + // cmge v12.4S, v31.4S, v27.4S // ............................................................*............................................ + // cmge v14.4S, v18.4S, v30.4S // ...........................................................................*............................. + // cmge v7.4S, v21.4S, v30.4S // ..............................................*.......................................................... 
+ // mls v6.4S, v15.4S, v8.4S // .....................................................................................................*... + // sub v12.4S, v12.4S, v11.4S // .................................................................*....................................... + // cmge v13.4S, v31.4S, v21.4S // .............................................*........................................................... + // mls v9.4S, v10.4S, v8.4S // ...................................................................*..................................... + // sub v19.4S, v24.4S, v14.4S // ................................................................................*........................ + // str q22, [x0], #(16) // ....................................................................................................*.... + // mls v27.4S, v12.4S, v8.4S // ....................................................................*.................................... + // sub v15.4S, v13.4S, v7.4S // .................................................*....................................................... + // str q6, [x0, #624] // ........................................................................................................* + // mls v18.4S, v19.4S, v8.4S // ....................................................................................*.................... + // str q9, [x0, #240] // ........................................................................*................................ + // mls v21.4S, v15.4S, v8.4S // .....................................................*................................................... + // str q27, [x0, #368] // ............................................................................*............................ + // str q18, [x0, #880] // ..........................................................................................*.............. 
+ // str q21, [x0, #752] // ...........................................................*............................................. pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s index 3b1ae537..9fe4a660 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro 
load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, 
#STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,536 +339,567 @@ _intt_kyber_123_4567_manual_ld4_opt_a55: mov count, #8 .p2align 2 - ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*........................................................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - ldr q22, [x4, #64] // ..*....................................................... - // gap // .......................................................... 
- // gap // .......................................................... - // gap // .......................................................... - sub v19.8H, v5.8H, v6.8H // ...*...................................................... - // gap // .......................................................... - ldr q23, [x4, #80] // ...........*.............................................. - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mul v29.8H, v19.8H, v22.8H // ....*..................................................... - // gap // .......................................................... - ldr q26, [x4, #48] // .....*.................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sub v0.8H, v3.8H, v4.8H // .......*.................................................. - // gap // .......................................................... - ldr q28, [x4, #32] // .........*................................................ - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sqrdmulh v19.8H, v19.8H, v23.8H // .............*............................................ - // gap // .......................................................... - sqrdmulh v22.8H, v0.8H, v26.8H // ..........*............................................... - // gap // .......................................................... - mul v24.8H, v0.8H, v28.8H // ............*............................................. 
- // gap // .......................................................... - add v27.8H, v5.8H, v6.8H // ...............*.......................................... - // gap // .......................................................... - mls v29.8H, v19.8H, v7.H[0] // ................*......................................... - // gap // .......................................................... - add v3.8H, v3.8H, v4.8H // ........*................................................. - // gap // .......................................................... - mls v24.8H, v22.8H, v7.H[0] // ..............*........................................... - // gap // .......................................................... - ldr q23, [x4, #16] // *......................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sub v28.8H, v3.8H, v27.8H // .................*........................................ - // gap // .......................................................... - sub v22.8H, v24.8H, v29.8H // ...................*...................................... - // gap // .......................................................... - ldr q0, [x4], #(6*16) // ......*................................................... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sqrdmulh v19.8H, v22.8H, v23.8H // .......................*.................................. - // gap // .......................................................... - sqrdmulh v23.8H, v28.8H, v23.8H // .....................*.................................... - // gap // .......................................................... 
- mul v22.8H, v22.8H, v0.8H // ......................*................................... - // gap // .......................................................... - mul v0.8H, v28.8H, v0.8H // ....................*..................................... - // gap // .......................................................... - add v28.8H, v24.8H, v29.8H // ........................*................................. - // gap // .......................................................... - add v27.8H, v3.8H, v27.8H // ..................*....................................... - // gap // .......................................................... - mls v22.8H, v19.8H, v7.H[0] // ..........................*............................... - // gap // .......................................................... - mls v0.8H, v23.8H, v7.H[0] // .........................*................................ - // gap // .......................................................... - trn1 v23.4S, v27.4S, v28.4S // ..............................*........................... - // gap // .......................................................... - ldr q11, [x3], #16 // ............................*............................. - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - trn1 v19.4S, v0.4S, v22.4S // ...............................*.......................... - // gap // .......................................................... - trn2 v22.4S, v0.4S, v22.4S // .............................*............................ - // gap // .......................................................... - trn2 v28.4S, v27.4S, v28.4S // ...........................*.............................. - // gap // .......................................................... 
- trn1 v0.2D, v23.2D, v19.2D // ...................................*...................... - // gap // .......................................................... - trn2 v19.2D, v23.2D, v19.2D // ..................................*....................... - // gap // .......................................................... - trn1 v3.2D, v28.2D, v22.2D // .................................*........................ - // gap // .......................................................... - trn2 v23.2D, v28.2D, v22.2D // ................................*......................... - // gap // .......................................................... - add v27.8H, v0.8H, v3.8H // .....................................*.................... - // gap // .......................................................... - add v24.8H, v19.8H, v23.8H // ....................................*..................... - // gap // .......................................................... - sub v23.8H, v19.8H, v23.8H // ...........................................*.............. - // gap // .......................................................... - sqdmulh v22.8H, v27.8H, v7.H[1] // .......................................*.................. - // gap // .......................................................... - sqdmulh v28.8H, v24.8H, v7.H[1] // ......................................*................... - // gap // .......................................................... - sqrdmulh v19.8H, v23.8H, v11.H[5] // ..................................................*....... - // gap // .......................................................... - mul v8.8H, v23.8H, v11.H[4] // ................................................*......... - // gap // .......................................................... - srshr v23.8H, v22.8H, #11 // ..........................................*............... - // gap // .......................................................... 
- srshr v22.8H, v28.8H, #11 // .........................................*................ - // gap // .......................................................... - sub v28.8H, v0.8H, v3.8H // ........................................*................. - // gap // .......................................................... - mls v27.8H, v23.8H, v7.H[0] // .............................................*............ - // gap // .......................................................... - mls v24.8H, v22.8H, v7.H[0] // ............................................*............. - // gap // .......................................................... - mls v8.8H, v19.8H, v7.H[0] // .......................................................*.. - // gap // .......................................................... - sqrdmulh v0.8H, v28.8H, v11.H[3] // ..............................................*........... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - sub v22.8H, v27.8H, v24.8H // .................................................*........ - // gap // .......................................................... - add v23.8H, v27.8H, v24.8H // ...................................................*...... - // gap // .......................................................... - mul v28.8H, v28.8H, v11.H[2] // ...............................................*.......... - // gap // .......................................................... - sqrdmulh v19.8H, v22.8H, v11.H[1] // .....................................................*.... - // gap // .......................................................... - mul v27.8H, v22.8H, v11.H[0] // ....................................................*..... - // gap // .......................................................... 
- str q23, [x1], #(64) // ........................................................*. - // gap // .......................................................... - mls v28.8H, v0.8H, v7.H[0] // ......................................................*... - // gap // .......................................................... - // gap // .......................................................... - // gap // .......................................................... - mls v27.8H, v19.8H, v7.H[0] // .........................................................* - // gap // .......................................................... - - // original source code - // ldr q22, [x4, #16] // ...............*.......................................... - // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // *......................................................... - // ldr q24, [x4, #64] // .*........................................................ - // sub v29.8H, v15.8H, v16.8H // ..*....................................................... - // mul v18.8H, v29.8H, v24.8H // ....*..................................................... - // ldr q27, [x4, #48] // .....*.................................................... - // ldr q19, [x4], #(6*16) // ..................*....................................... - // sub v26.8H, v13.8H, v14.8H // ......*................................................... - // add v0.8H, v13.8H, v14.8H // .............*............................................ - // ldr q23, [x4, #-64] // .......*.................................................. - // sqrdmulh v27.8H, v26.8H, v27.8H // .........*................................................ - // ldr q20, [x4, #-16] // ...*...................................................... - // mul v23.8H, v26.8H, v23.8H // ..........*............................................... - // sqrdmulh v3.8H, v29.8H, v20.8H // ........*................................................. 
- // mls v23.8H, v27.8H, v7.H[0] // ..............*........................................... - // add v27.8H, v15.8H, v16.8H // ...........*.............................................. - // mls v18.8H, v3.8H, v7.H[0] // ............*............................................. - // sub v29.8H, v0.8H, v27.8H // ................*......................................... - // add v28.8H, v0.8H, v27.8H // ........................*................................. - // sub v3.8H, v23.8H, v18.8H // .................*........................................ - // mul v20.8H, v29.8H, v19.8H // ......................*................................... - // sqrdmulh v29.8H, v29.8H, v22.8H // ....................*..................................... - // mul v19.8H, v3.8H, v19.8H // .....................*.................................... - // sqrdmulh v22.8H, v3.8H, v22.8H // ...................*...................................... - // add v9.8H, v23.8H, v18.8H // .......................*.................................. - // mls v20.8H, v29.8H, v7.H[0] // ..........................*............................... - // mls v19.8H, v22.8H, v7.H[0] // .........................*................................ - // trn2 v31.4S, v28.4S, v9.4S // ...............................*.......................... - // ldr q11, [x3], #16 // ............................*............................. - // trn2 v21.4S, v20.4S, v19.4S // ..............................*........................... - // trn1 v13.4S, v28.4S, v9.4S // ...........................*.............................. - // trn1 v28.4S, v20.4S, v19.4S // .............................*............................ - // trn2 v27.2D, v31.2D, v21.2D // ...................................*...................... - // trn1 v19.2D, v31.2D, v21.2D // ..................................*....................... - // trn2 v0.2D, v13.2D, v28.2D // .................................*........................ 
- // trn1 v22.2D, v13.2D, v28.2D // ................................*......................... - // add v9.8H, v0.8H, v27.8H // .....................................*.................... - // add v24.8H, v22.8H, v19.8H // ....................................*..................... - // sqdmulh v23.8H, v9.8H, v7.H[1] // ........................................*................. - // sqdmulh v13.8H, v24.8H, v7.H[1] // .......................................*.................. - // sub v31.8H, v22.8H, v19.8H // .............................................*............ - // srshr v23.8H, v23.8H, #11 // ............................................*............. - // srshr v1.8H, v13.8H, #11 // ...........................................*.............. - // sub v21.8H, v0.8H, v27.8H // ......................................*................... - // mls v9.8H, v23.8H, v7.H[0] // ...............................................*.......... - // mls v24.8H, v1.8H, v7.H[0] // ..............................................*........... - // sqrdmulh v19.8H, v31.8H, v11.H[3] // .................................................*........ - // mul v28.8H, v31.8H, v11.H[2] // ....................................................*..... - // mul v8.8H, v21.8H, v11.H[4] // ..........................................*............... - // sub v22.8H, v24.8H, v9.8H // ..................................................*....... - // sqrdmulh v23.8H, v21.8H, v11.H[5] // .........................................*................ - // add v12.8H, v24.8H, v9.8H // ...................................................*...... - // mul v27.8H, v22.8H, v11.H[0] // ......................................................*... - // sqrdmulh v13.8H, v22.8H, v11.H[1] // .....................................................*.... - // mls v28.8H, v19.8H, v7.H[0] // ........................................................*. 
- // mls v8.8H, v23.8H, v7.H[0] // ................................................*......... - // str q12, [x1], #(64) // .......................................................*.. - // mls v27.8H, v13.8H, v7.H[0] // .........................................................* + // Instructions: 58 + // Expected cycles: 74 + // Expected IPC: 0.78 + // + // Cycle bound: 74.0 + // IPC bound: 0.78 + // + // Wall time: 11.25s + // User time: 11.25s + // + // ------------------- original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------- + ldr q18, [x4, #48] // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // ...*...................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q30, [x4, #32] // *......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v16.8H, v8.8H, v9.8H // .....*.................................................... + // gap // .......................................................... + sub v13.8H, v10.8H, v11.8H // .......*.................................................. + // gap // .......................................................... + ldr q2, [x4, #80] // ....*..................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q26, [x4, #64] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v1.8H, v16.8H, v30.8H // ........*................................................. + // gap // .......................................................... + sqrdmulh v15.8H, v13.8H, v2.8H // ............*............................................. + // gap // .......................................................... + mul v12.8H, v13.8H, v26.8H // ...........*.............................................. + // gap // .......................................................... 
+ sqrdmulh v14.8H, v16.8H, v18.8H // .........*................................................ + // gap // .......................................................... + add v4.8H, v8.8H, v9.8H // ......*................................................... + // gap // .......................................................... + add v26.8H, v10.8H, v11.8H // ..........*............................................... + // gap // .......................................................... + mls v12.8H, v15.8H, v7.H[0] // ...............*.......................................... + // gap // .......................................................... + mls v1.8H, v14.8H, v7.H[0] // .............*............................................ + // gap // .......................................................... + ldr q8, [x4], #(6*16) // ..............*........................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v5.8H, v4.8H, v26.8H // ................*......................................... + // gap // .......................................................... + sub v25.8H, v1.8H, v12.8H // ..................*....................................... + // gap // .......................................................... + ldr q21, [x4, #-80] // .................*........................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v13.8H, v5.8H, v8.8H // ...................*...................................... + // gap // .......................................................... + mul v3.8H, v25.8H, v8.8H // .....................*.................................... 
+ // gap // .......................................................... + sqrdmulh v10.8H, v25.8H, v21.8H // ......................*................................... + // gap // .......................................................... + sqrdmulh v9.8H, v5.8H, v21.8H // ....................*..................................... + // gap // .......................................................... + add v20.8H, v1.8H, v12.8H // .......................*.................................. + // gap // .......................................................... + add v11.8H, v4.8H, v26.8H // ........................*................................. + // gap // .......................................................... + mls v3.8H, v10.8H, v7.H[0] // ..........................*............................... + // gap // .......................................................... + mls v13.8H, v9.8H, v7.H[0] // .........................*................................ + // gap // .......................................................... + ldr q2, [x3], #16 // .......................................*.................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + trn2 v1.4S, v11.4S, v20.4S // ............................*............................. + // gap // .......................................................... + trn2 v17.4S, v13.4S, v3.4S // ..............................*........................... + // gap // .......................................................... + trn1 v3.4S, v13.4S, v3.4S // .............................*............................ + // gap // .......................................................... + trn1 v21.4S, v11.4S, v20.4S // ...........................*.............................. + // gap // .......................................................... 
+ trn1 v28.2D, v1.2D, v17.2D // .................................*........................ + // gap // .......................................................... + trn2 v31.2D, v1.2D, v17.2D // ................................*......................... + // gap // .......................................................... + trn1 v25.2D, v21.2D, v3.2D // ...................................*...................... + // gap // .......................................................... + trn2 v15.2D, v21.2D, v3.2D // ...............................*.......................... + // gap // .......................................................... + add v16.8H, v25.8H, v28.8H // ......................................*................... + // gap // .......................................................... + sub v23.8H, v15.8H, v31.8H // ..................................*....................... + // gap // .......................................................... + sub v21.8H, v25.8H, v28.8H // .....................................*.................... + // gap // .......................................................... + add v27.8H, v15.8H, v31.8H // ....................................*..................... + // gap // .......................................................... + sqdmulh v1.8H, v16.8H, v7.H[1] // .........................................*................ + // gap // .......................................................... + mul v17.8H, v21.8H, v2.H[2] // ..........................................*............... + // gap // .......................................................... + mul v0.8H, v23.8H, v2.H[4] // .............................................*............ + // gap // .......................................................... + sqrdmulh v5.8H, v23.8H, v2.H[5] // ............................................*............. + // gap // .......................................................... 
+ sqdmulh v22.8H, v27.8H, v7.H[1] // ........................................*................. + // gap // .......................................................... + sqrdmulh v4.8H, v21.8H, v2.H[3] // ...........................................*.............. + // gap // .......................................................... + srshr v29.8H, v1.8H, #11 // ..............................................*........... + // gap // .......................................................... + mls v0.8H, v5.8H, v7.H[0] // .................................................*........ + // gap // .......................................................... + srshr v9.8H, v22.8H, #11 // ................................................*......... + // gap // .......................................................... + mls v17.8H, v4.8H, v7.H[0] // ...............................................*.......... + // gap // .......................................................... + mls v16.8H, v29.8H, v7.H[0] // ..................................................*....... + // gap // .......................................................... + mls v27.8H, v9.8H, v7.H[0] // ...................................................*...... + // gap // .......................................................... + sqdmulh v3.8H, v0.8H, v7.H[1] // .....................................................*.... + // gap // .......................................................... + sqdmulh v20.8H, v17.8H, v7.H[1] // ....................................................*..... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v22.8H, v16.8H, v27.8H // ......................................................*... + // gap // .......................................................... 
+ srshr v14.8H, v3.8H, #11 // ........................................................*. + // gap // .......................................................... + srshr v6.8H, v20.8H, #11 // .......................................................*.. + // gap // .......................................................... + str q22, [x1], #(64) // .........................................................* + // gap // .......................................................... + + // --------------------- new position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------- + // ldr q26, [x4, #32] // ..*....................................................... + // ldr q11, [x4, #64] // ......*................................................... + // ldr q0, [x4, #48] // *......................................................... + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // .*........................................................ + // ldr q22, [x4, #80] // .....*.................................................... + // sub v13.8H, v18.8H, v19.8H // ...*...................................................... + // add v31.8H, v18.8H, v19.8H // ...........*.............................................. + // sub v15.8H, v20.8H, v21.8H // ....*..................................................... + // mul v6.8H, v13.8H, v26.8H // .......*.................................................. + // sqrdmulh v0.8H, v13.8H, v0.8H // ..........*............................................... + // add v3.8H, v20.8H, v21.8H // ............*............................................. + // mul v11.8H, v15.8H, v11.8H // .........*................................................ + // sqrdmulh v13.8H, v15.8H, v22.8H // ........*................................................. + // mls v6.8H, v0.8H, v7.H[0] // ..............*........................................... 
+ // ldr q27, [x4], #(6*16) // ...............*.......................................... + // mls v11.8H, v13.8H, v7.H[0] // .............*............................................ + // sub v13.8H, v31.8H, v3.8H // ................*......................................... + // ldr q19, [x4, #-80] // ..................*....................................... + // sub v22.8H, v6.8H, v11.8H // .................*........................................ + // mul v15.8H, v13.8H, v27.8H // ...................*...................................... + // sqrdmulh v0.8H, v13.8H, v19.8H // ......................*................................... + // mul v27.8H, v22.8H, v27.8H // ....................*..................................... + // sqrdmulh v13.8H, v22.8H, v19.8H // .....................*.................................... + // add v21.8H, v6.8H, v11.8H // .......................*.................................. + // add v6.8H, v31.8H, v3.8H // ........................*................................. + // mls v15.8H, v0.8H, v7.H[0] // ..........................*............................... + // mls v27.8H, v13.8H, v7.H[0] // .........................*................................ + // trn1 v3.4S, v6.4S, v21.4S // ...............................*.......................... + // trn2 v6.4S, v6.4S, v21.4S // ............................*............................. + // trn1 v13.4S, v15.4S, v27.4S // ..............................*........................... + // trn2 v15.4S, v15.4S, v27.4S // .............................*............................ + // trn2 v27.2D, v3.2D, v13.2D // ...................................*...................... + // trn2 v26.2D, v6.2D, v15.2D // .................................*........................ + // trn1 v15.2D, v6.2D, v15.2D // ................................*......................... + // sub v25.8H, v27.8H, v26.8H // .....................................*.................... 
+ // trn1 v16.2D, v3.2D, v13.2D // ..................................*....................... + // add v27.8H, v27.8H, v26.8H // .......................................*.................. + // sub v6.8H, v16.8H, v15.8H // ......................................*................... + // add v16.8H, v16.8H, v15.8H // ....................................*..................... + // ldr q2, [x3], #16 // ...........................*.............................. + // sqdmulh v15.8H, v27.8H, v7.H[1] // ............................................*............. + // sqdmulh v13.8H, v16.8H, v7.H[1] // ........................................*................. + // mul v17.8H, v6.8H, v2.H[2] // .........................................*................ + // sqrdmulh v26.8H, v6.8H, v2.H[3] // .............................................*............ + // sqrdmulh v6.8H, v25.8H, v2.H[5] // ...........................................*.............. + // mul v0.8H, v25.8H, v2.H[4] // ..........................................*............... + // srshr v13.8H, v13.8H, #11 // ..............................................*........... + // mls v17.8H, v26.8H, v7.H[0] // .................................................*........ + // srshr v26.8H, v15.8H, #11 // ................................................*......... + // mls v0.8H, v6.8H, v7.H[0] // ...............................................*.......... + // mls v16.8H, v13.8H, v7.H[0] // ..................................................*....... + // mls v27.8H, v26.8H, v7.H[0] // ...................................................*...... + // sqdmulh v15.8H, v17.8H, v7.H[1] // .....................................................*.... + // sqdmulh v13.8H, v0.8H, v7.H[1] // ....................................................*..... + // add v26.8H, v16.8H, v27.8H // ......................................................*... + // srshr v6.8H, v15.8H, #11 // ........................................................*. 
+ // srshr v14.8H, v13.8H, #11 // .......................................................*.. + // str q26, [x1], #(64) // .........................................................* sub count, count, #1 layer4567_start: - ldr q22, [x4, #16] // ..e..................................................................... + // Instructions: 72 + // Expected cycles: 87 + // Expected IPC: 0.83 + // + // Cycle bound: 87.0 + // IPC bound: 0.83 + // + // Wall time: 8.18s + // User time: 8.18s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + mls v17.8H, v6.8H, v7.H[0] // ...................................................*.................... // gap // ........................................................................ + sub v16.8H, v16.8H, v27.8H // ..........................................................*............. // gap // ........................................................................ + mls v0.8H, v14.8H, v7.H[0] // .........................................................*.............. // gap // ........................................................................ - ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // e....................................................................... + ldr q26, [x4, #32] // ...e.................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sqrdmulh v27.8H, v16.8H, v2.H[1] // ............................................................*........... // gap // ........................................................................ + sub v15.8H, v17.8H, v0.8H // ...............................................................*........ 
// gap // ........................................................................ + mul v16.8H, v16.8H, v2.H[0] // .............................................................*.......... // gap // ........................................................................ + add v25.8H, v17.8H, v0.8H // ................................................................*....... // gap // ........................................................................ + mul v14.8H, v15.8H, v2.H[0] // ..................................................................*..... // gap // ........................................................................ + sqrdmulh v0.8H, v15.8H, v2.H[1] // .................................................................*...... // gap // ........................................................................ + mls v16.8H, v27.8H, v7.H[0] // ..............................................................*......... // gap // ........................................................................ + ldr q11, [x4, #64] // .....e.................................................................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + mls v14.8H, v0.8H, v7.H[0] // ...................................................................*.... // gap // ........................................................................ + ldr q0, [x4, #48] // ....e................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ 
- ldr q24, [x4, #64] // .....e.................................................................. + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // e....................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - sub v29.8H, v15.8H, v16.8H // ............e........................................................... // gap // ........................................................................ - sqdmulh v23.8H, v28.8H, v7.H[1] // .................................................*...................... // gap // ........................................................................ - str q27, [x1, #-32] // ......................................................................*. // gap // ........................................................................ - mul v18.8H, v29.8H, v24.8H // ..............e......................................................... // gap // ........................................................................ - ldr q27, [x4, #48] // ....e................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - ldr q19, [x4], #(6*16) // .e...................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ 
- srshr v23.8H, v23.8H, #11 // ..................................................*..................... // gap // ........................................................................ - sqdmulh v24.8H, v8.8H, v7.H[1] // .......................................................*................ // gap // ........................................................................ - sub v26.8H, v13.8H, v14.8H // .......e................................................................ // gap // ........................................................................ - add v0.8H, v13.8H, v14.8H // ........e............................................................... // gap // ........................................................................ - mls v28.8H, v23.8H, v7.H[0] // ...................................................*.................... + ldr q22, [x4, #80] // ......e................................................................. // gap // ........................................................................ - srshr v24.8H, v24.8H, #11 // ........................................................*............... // gap // ........................................................................ - ldr q23, [x4, #-64] // ...e.................................................................... // gap // ........................................................................ + sub v13.8H, v18.8H, v19.8H // .......e................................................................ // gap // ........................................................................ + add v31.8H, v18.8H, v19.8H // ........e............................................................... // gap // ........................................................................ - mls v8.8H, v24.8H, v7.H[0] // .........................................................*.............. + sub v15.8H, v20.8H, v21.8H // ............e........................................................... 
// gap // ........................................................................ - sqrdmulh v27.8H, v26.8H, v27.8H // ..........e............................................................. + mul v6.8H, v13.8H, v26.8H // ..........e............................................................. // gap // ........................................................................ - ldr q20, [x4, #-16] // ......e................................................................. + sqrdmulh v0.8H, v13.8H, v0.8H // .........e.............................................................. // gap // ........................................................................ + add v3.8H, v20.8H, v21.8H // .............e.......................................................... // gap // ........................................................................ + mul v11.8H, v15.8H, v11.8H // ...............e........................................................ // gap // ........................................................................ - sub v30.8H, v28.8H, v8.8H // ...............................................................*........ + sqrdmulh v13.8H, v15.8H, v22.8H // ..............e......................................................... // gap // ........................................................................ - mul v23.8H, v26.8H, v23.8H // .........e.............................................................. + mls v6.8H, v0.8H, v7.H[0] // ...........e............................................................ // gap // ........................................................................ - sqrdmulh v3.8H, v29.8H, v20.8H // ...............e........................................................ + ldr q27, [x4], #(6*16) // .e...................................................................... // gap // ........................................................................ 
- add v28.8H, v28.8H, v8.8H // ................................................................*....... // gap // ........................................................................ - sqrdmulh v26.8H, v30.8H, v11.H[1] // ..................................................................*..... // gap // ........................................................................ - mls v23.8H, v27.8H, v7.H[0] // ...........e............................................................ + mls v11.8H, v13.8H, v7.H[0] // ................e....................................................... // gap // ........................................................................ - add v27.8H, v15.8H, v16.8H // .............e.......................................................... + sub v13.8H, v31.8H, v3.8H // .................e...................................................... // gap // ........................................................................ - mls v18.8H, v3.8H, v7.H[0] // ................e....................................................... + ldr q19, [x4, #-80] // ..e..................................................................... // gap // ........................................................................ - str q28, [x1, #-48] // .....................................................................*.. // gap // ........................................................................ - sub v29.8H, v0.8H, v27.8H // .................e...................................................... // gap // ........................................................................ - add v28.8H, v0.8H, v27.8H // ..................e..................................................... + sub v22.8H, v6.8H, v11.8H // ......................e................................................. // gap // ........................................................................ 
- sub v3.8H, v23.8H, v18.8H // ......................e................................................. + mul v15.8H, v13.8H, v27.8H // ....................e................................................... // gap // ........................................................................ - mul v20.8H, v29.8H, v19.8H // ...................e.................................................... + sqrdmulh v0.8H, v13.8H, v19.8H // ...................e.................................................... // gap // ........................................................................ - sqrdmulh v29.8H, v29.8H, v22.8H // ....................e................................................... + mul v27.8H, v22.8H, v27.8H // .........................e.............................................. // gap // ........................................................................ - mul v19.8H, v3.8H, v19.8H // ........................e............................................... + sqrdmulh v13.8H, v22.8H, v19.8H // ........................e............................................... // gap // ........................................................................ - sqrdmulh v22.8H, v3.8H, v22.8H // .........................e.............................................. + add v21.8H, v6.8H, v11.8H // .......................e................................................ // gap // ........................................................................ - add v9.8H, v23.8H, v18.8H // .......................e................................................ + add v6.8H, v31.8H, v3.8H // ..................e..................................................... // gap // ........................................................................ - mls v20.8H, v29.8H, v7.H[0] // .....................e.................................................. + mls v15.8H, v0.8H, v7.H[0] // .....................e.................................................. 
// gap // ........................................................................ - mul v3.8H, v30.8H, v11.H[0] // .................................................................*...... + mls v27.8H, v13.8H, v7.H[0] // ..........................e............................................. // gap // ........................................................................ - mls v19.8H, v22.8H, v7.H[0] // ..........................e............................................. + trn1 v3.4S, v6.4S, v21.4S // ...........................e............................................ // gap // ........................................................................ - trn2 v31.4S, v28.4S, v9.4S // ............................e........................................... + trn2 v6.4S, v6.4S, v21.4S // ............................e........................................... // gap // ........................................................................ - ldr q11, [x3], #16 // ...................................e.................................... + str q25, [x1, #-48] // .....................................................................*.. // gap // ........................................................................ + trn1 v13.4S, v15.4S, v27.4S // .............................e.......................................... // gap // ........................................................................ + trn2 v15.4S, v15.4S, v27.4S // ..............................e......................................... // gap // ........................................................................ - trn2 v21.4S, v20.4S, v19.4S // ..............................e......................................... + str q16, [x1, #-32] // ......................................................................*. // gap // ........................................................................ 
- trn1 v13.4S, v28.4S, v9.4S // ...........................e............................................ + trn2 v27.2D, v3.2D, v13.2D // ...............................e........................................ // gap // ........................................................................ - trn1 v28.4S, v20.4S, v19.4S // .............................e.......................................... + trn2 v26.2D, v6.2D, v15.2D // ................................e....................................... // gap // ........................................................................ - trn2 v27.2D, v31.2D, v21.2D // ................................e....................................... + trn1 v15.2D, v6.2D, v15.2D // ..................................e..................................... // gap // ........................................................................ - trn1 v19.2D, v31.2D, v21.2D // ..................................e..................................... + sub v25.8H, v27.8H, v26.8H // .........................................e.............................. // gap // ........................................................................ - trn2 v0.2D, v13.2D, v28.2D // ...............................e........................................ + trn1 v16.2D, v3.2D, v13.2D // .................................e...................................... // gap // ........................................................................ - trn1 v22.2D, v13.2D, v28.2D // .................................e...................................... + add v27.8H, v27.8H, v26.8H // ..........................................e............................. // gap // ........................................................................ - add v9.8H, v0.8H, v27.8H // ..........................................e............................. + sub v6.8H, v16.8H, v15.8H // ....................................e................................... 
// gap // ........................................................................ - add v24.8H, v22.8H, v19.8H // .....................................e.................................. + add v16.8H, v16.8H, v15.8H // .....................................e.................................. // gap // ........................................................................ - mls v3.8H, v26.8H, v7.H[0] // ...................................................................*.... + ldr q2, [x3], #16 // ...................................e.................................... // gap // ........................................................................ - sqdmulh v23.8H, v9.8H, v7.H[1] // ....................................................e................... // gap // ........................................................................ - sqdmulh v13.8H, v24.8H, v7.H[1] // ..............................................e......................... // gap // ........................................................................ - sub v31.8H, v22.8H, v19.8H // ....................................e................................... + sqdmulh v15.8H, v27.8H, v7.H[1] // ....................................................e................... // gap // ........................................................................ - str q3, [x1, #-16] // .......................................................................* + sqdmulh v13.8H, v16.8H, v7.H[1] // ..............................................e......................... // gap // ........................................................................ - srshr v23.8H, v23.8H, #11 // .....................................................e.................. + mul v17.8H, v6.8H, v2.H[2] // .......................................e................................ // gap // ........................................................................ 
- srshr v1.8H, v13.8H, #11 // ...............................................e........................ + sqrdmulh v26.8H, v6.8H, v2.H[3] // ......................................e................................. // gap // ........................................................................ - sub v21.8H, v0.8H, v27.8H // .........................................e.............................. + sqrdmulh v6.8H, v25.8H, v2.H[5] // ...........................................e............................ // gap // ........................................................................ - mls v9.8H, v23.8H, v7.H[0] // ......................................................e................. + mul v0.8H, v25.8H, v2.H[4] // ............................................e........................... // gap // ........................................................................ - mls v24.8H, v1.8H, v7.H[0] // ................................................e....................... + srshr v13.8H, v13.8H, #11 // ...............................................e........................ // gap // ........................................................................ - sqrdmulh v19.8H, v31.8H, v11.H[3] // .......................................e................................ + mls v17.8H, v26.8H, v7.H[0] // ........................................e............................... // gap // ........................................................................ - mul v28.8H, v31.8H, v11.H[2] // ......................................e................................. + srshr v26.8H, v15.8H, #11 // .....................................................e.................. // gap // ........................................................................ - mul v8.8H, v21.8H, v11.H[4] // ...........................................e............................ + mls v0.8H, v6.8H, v7.H[0] // .............................................e.......................... 
// gap // ........................................................................ - sub v22.8H, v24.8H, v9.8H // ..........................................................e............. + mls v16.8H, v13.8H, v7.H[0] // ................................................e....................... // gap // ........................................................................ - sqrdmulh v23.8H, v21.8H, v11.H[5] // ............................................e........................... + mls v27.8H, v26.8H, v7.H[0] // ......................................................e................. // gap // ........................................................................ - add v12.8H, v24.8H, v9.8H // ...........................................................e............ + sqdmulh v15.8H, v17.8H, v7.H[1] // .................................................e...................... // gap // ........................................................................ - mul v27.8H, v22.8H, v11.H[0] // ............................................................e........... + sqdmulh v13.8H, v0.8H, v7.H[1] // .......................................................e................ // gap // ........................................................................ - sqrdmulh v13.8H, v22.8H, v11.H[1] // .............................................................e.......... + str q14, [x1, #-16] // .......................................................................* // gap // ........................................................................ - mls v28.8H, v19.8H, v7.H[0] // ........................................e............................... + add v26.8H, v16.8H, v27.8H // ...........................................................e............ // gap // ........................................................................ - mls v8.8H, v23.8H, v7.H[0] // .............................................e.......................... 
+ srshr v6.8H, v15.8H, #11 // ..................................................e..................... // gap // ........................................................................ - str q12, [x1], #(64) // ....................................................................e... + srshr v14.8H, v13.8H, #11 // ........................................................e............... // gap // ........................................................................ - mls v27.8H, v13.8H, v7.H[0] // ..............................................................e......... + str q26, [x1], #(64) // ....................................................................e... // gap // ........................................................................ - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e......................................................................|e..................................................... - // ldr q0, [x4], #(6*16) // ........e...............................................................|.......e.............................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // e.......................................................................e...................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e....................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e............................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ..e.....................................................................|.e.................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..................e.....................................................|.................e.................................... 
- // sub v24.8h, v8.8h, v9.8h // ...........e............................................................|..........e........................................... - // add v8.8h, v8.8h, v9.8h // ............e...........................................................|...........e.......................................... - // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e.................................. - // sqrdmulh v24.8h, v24.8h, v5.8h // .................e......................................................|................e..................................... - // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e.............................. - // sub v24.8h, v10.8h, v11.8h // ...e....................................................................|..e................................................... - // add v10.8h, v10.8h, v11.8h // .........................e..............................................|........................e............................. - // mul v11.8h, v24.8h, v2.8h // ......e.................................................................|.....e................................................ - // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e................................. - // mls v11.8h, v24.8h, v7.h[0] // ..........................e.............................................|.........................e............................ - // sub v24.8h, v8.8h, v10.8h // ............................e...........................................|...........................e.......................... - // add v8.8h, v8.8h, v10.8h // .............................e..........................................|............................e......................... 
- // mul v10.8h, v24.8h, v0.8h // ...............................e........................................|..............................e....................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e...................... - // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e.................. - // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e........................ - // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e................... - // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e..................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................e.....................................|.................................e.................... - // mls v11.8h, v24.8h, v7.h[0] // ......................................e.................................|.....................................e................ - // trn1 v25.4s, v8.4s, v9.4s // ..........................................e.............................|.........................................e............ - // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e............... - // trn1 v27.4s, v10.4s, v11.4s // ...........................................e............................|..........................................e........... - // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e............. 
- // trn2 v10.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e........ - // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.......... - // trn1 v8.2d, v25.2d, v27.2d // ...............................................e........................|..............................................e....... - // trn1 v9.2d, v26.2d, v28.2d // .............................................e..........................|............................................e......... - // ldr q0, [x3], #16 // ........................................e...............................|.......................................e.............. - // sub v24.8h, v8.8h, v9.8h // .....................................................e..................|....................................................e. - // add v8.8h, v8.8h, v9.8h // .................................................e......................|................................................e..... - // mul v9.8h, v24.8h, v0.h[2] // .............................................................e..........|...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................................................e...........|...................................................... - // mls v9.8h, v24.8h, v7.h[0] // ....................................................................e...|...................................................... - // sub v24.8h, v10.8h, v11.8h // .........................................................e..............|...................................................... - // add v10.8h, v10.8h, v11.8h // ................................................e.......................|...............................................e...... 
- // mul v11.8h, v24.8h, v0.h[4] // ..............................................................e.........|...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................e.......|...................................................... - // mls v11.8h, v24.8h, v7.h[0] // .....................................................................e..|...................................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ....................................................e...................|...................................................e.. - // srshr v25.8h, v25.8h, #11 // ........................................................e...............|...................................................... - // mls v8.8h, v25.8h, v7.h[0] // ...........................................................e............|...................................................... - // sqdmulh v25.8h, v9.8h, v7.h[1] // ....*...................................................................|...*.................................................. - // srshr v25.8h, v25.8h, #11 // .........*..............................................................|........*............................................. - // mls v9.8h, v25.8h, v7.h[0] // .............*..........................................................|............*......................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................e....................|..................................................e... - // srshr v25.8h, v25.8h, #11 // .......................................................e................|...................................................... - // mls v10.8h, v25.8h, v7.h[0] // ..........................................................e.............|...................................................... 
- // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........*.............................................................|.........*............................................ - // srshr v25.8h, v25.8h, #11 // ..............*.........................................................|.............*........................................ - // mls v11.8h, v25.8h, v7.h[0] // ................*.......................................................|...............*...................................... - // sub v24.8h, v8.8h, v10.8h // ...............................................................e........|...................................................... - // add v8.8h, v8.8h, v10.8h // .................................................................e......|...................................................... - // mul v10.8h, v24.8h, v0.h[0] // ..................................................................e.....|...................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................e....|...................................................... - // mls v10.8h, v24.8h, v7.h[0] // .......................................................................e|...................................................... - // sub v24.8h, v9.8h, v11.8h // ...................*....................................................|..................*................................... - // add v9.8h, v9.8h, v11.8h // ......................*.................................................|.....................*................................ - // mul v11.8h, v24.8h, v0.h[0] // .....................................*..................................|....................................*................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*................................................|......................*............................... 
- // mls v11.8h, v24.8h, v7.h[0] // ..................................................*.....................|.................................................*.... - // str q8, [x1], #(64) // ......................................................................e.|...................................................... - // str q9, [x1, #(-64 + 16*1)] // ...........................*............................................|..........................*........................... - // str q10, [x1, #(-64 + 16*2)] // .....*..................................................................|....*................................................. - // str q11, [x1, #(-64 + 16*3)] // ......................................................*.................|.....................................................* + // ------------------------------------------------------------- new position -------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // ...........e.........................................................'.............~..................................................... + // ldr q0, [x4], #(6*16) // ......................e..............................................'........................~.......................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .........................e...........................................'...........................~....................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // e....................................................................'..~................................................................ 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ..........e..........................................................'............~...................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ........e............................................................'..........~........................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ............e........................................................'..............~.................................................... + // sub v24.8h, v8.8h, v9.8h // .............e.......................................................'...............~................................................... + // add v8.8h, v8.8h, v9.8h // ..............e......................................................'................~.................................................. + // sqrdmulh v27.8h, v24.8h, v5.8h // .................e...................................................'...................~............................................... + // mul v9.8h, v24.8h, v1.8h // ................e....................................................'..................~................................................ + // mls v9.8h, v27.8h, v7.h[0] // .....................e...............................................'.......................~........................................... + // sub v24.8h, v10.8h, v11.8h // ...............e.....................................................'.................~................................................. + // add v10.8h, v10.8h, v11.8h // ..................e..................................................'....................~.............................................. + // sqrdmulh v27.8h, v24.8h, v6.8h // ....................e................................................'......................~............................................ 
+ // mul v11.8h, v24.8h, v2.8h // ...................e.................................................'.....................~............................................. + // mls v11.8h, v27.8h, v7.h[0] // .......................e.............................................'.........................~......................................... + // sub v24.8h, v8.8h, v10.8h // ........................e............................................'..........................~........................................ + // add v8.8h, v8.8h, v10.8h // ................................e....................................'..................................~................................ + // sqrdmulh v27.8h, v24.8h, v4.8h // ............................e........................................'..............................~.................................... + // mul v10.8h, v24.8h, v0.8h // ...........................e.........................................'.............................~..................................... + // mls v10.8h, v27.8h, v7.h[0] // .................................e...................................'...................................~............................... + // sub v24.8h, v9.8h, v11.8h // ..........................e..........................................'............................~...................................... + // add v9.8h, v9.8h, v11.8h // ...............................e.....................................'.................................~................................. + // sqrdmulh v27.8h, v24.8h, v4.8h // ..............................e......................................'................................~.................................. + // mul v11.8h, v24.8h, v0.8h // .............................e.......................................'...............................~................................... 
+ // mls v11.8h, v27.8h, v7.h[0] // ..................................e..................................'....................................~.............................. + // trn1 v25.4s, v8.4s, v9.4s // ...................................e.................................'.....................................~............................. + // trn2 v26.4s, v8.4s, v9.4s // ....................................e................................'......................................~............................ + // trn1 v27.4s, v10.4s, v11.4s // ......................................e..............................'........................................~.......................... + // trn2 v28.4s, v10.4s, v11.4s // .......................................e.............................'.........................................~......................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e...........................'...........................................~....................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e..........................'............................................~...................... + // trn1 v8.2d, v25.2d, v27.2d // .............................................e.......................'...............................................~................... + // trn1 v9.2d, v26.2d, v28.2d // ...........................................e.........................'.............................................~..................... + // ldr q0, [x3], #16 // .................................................e...................'...................................................~............... + // sub v24.8h, v8.8h, v9.8h // ...............................................e.....................'.................................................~................. 
+ // add v8.8h, v8.8h, v9.8h // ................................................e....................'..................................................~................ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .....................................................e...............'.......................................................~........... + // mul v9.8h, v24.8h, v0.h[2] // ....................................................e................'......................................................~............ + // mls v9.8h, v27.8h, v7.h[0] // .........................................................e...........'...........................................................~....... + // sub v24.8h, v10.8h, v11.8h // ............................................e........................'..............................................~.................... + // add v10.8h, v10.8h, v11.8h // ..............................................e......................'................................................~.................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ......................................................e..............'........................................................~.......... + // mul v11.8h, v24.8h, v0.h[4] // .......................................................e.............'.........................................................~......... + // mls v11.8h, v27.8h, v7.h[0] // ...........................................................e.........'.............................................................~..... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................................................e.................'.....................................................~............. + // srshr v25.8h, v25.8h, #11 // ........................................................e............'..........................................................~........ 
+ // mls v8.8h, v25.8h, v7.h[0] // ............................................................e........'..............................................................~.... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ..............................................................e......'................................................................~.. + // srshr v25.8h, v25.8h, #11 // ..................................................................e..'................................................................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................................................*................................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................................e..................'....................................................~.............. + // srshr v25.8h, v25.8h, #11 // ..........................................................e..........'............................................................~...... + // mls v10.8h, v25.8h, v7.h[0] // .............................................................e.......'...............................................................~... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ...............................................................e.....'.................................................................~. + // srshr v25.8h, v25.8h, #11 // ...................................................................e.'................................................................... + // mls v11.8h, v25.8h, v7.h[0] // .....................................................................'.*................................................................. + // sub v24.8h, v8.8h, v10.8h // .....................................................................'*.................................................................. 
+ // add v8.8h, v8.8h, v10.8h // .................................................................e...'................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .~...................................................................'...*............................................................... + // mul v10.8h, v24.8h, v0.h[0] // ...~.................................................................'.....*............................................................. + // mls v10.8h, v27.8h, v7.h[0] // .......~.............................................................'.........*......................................................... + // sub v24.8h, v9.8h, v11.8h // ..~..................................................................'....*.............................................................. + // add v9.8h, v9.8h, v11.8h // ....~................................................................'......*............................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ......~..............................................................'........*.......................................................... + // mul v11.8h, v24.8h, v0.h[0] // .....~...............................................................'.......*........................................................... + // mls v11.8h, v27.8h, v7.h[0] // .........~...........................................................'...........*....................................................... + // str q8, [x1], #(64) // ....................................................................e'................................................................... + // str q9, [x1, #(-64 + 16*1)] // .....................................~...............................'.......................................*........................... 
+ // str q10, [x1, #(-64 + 16*2)] // ........................................~............................'..........................................*........................ + // str q11, [x1, #(-64 + 16*3)] // ................................................................~....'..................................................................* sub count, count, #1 cbnz count, layer4567_start - sqdmulh v19.8H, v28.8H, v7.H[1] // *............. - // gap // .............. - sqdmulh v23.8H, v8.8H, v7.H[1] // ...*.......... - // gap // .............. - str q27, [x1, #-32] // .*............ - // gap // .............. - // gap // .............. - // gap // .............. - srshr v19.8H, v19.8H, #11 // ..*........... - // gap // .............. - srshr v23.8H, v23.8H, #11 // .....*........ - // gap // .............. - // gap // .............. - // gap // .............. - mls v28.8H, v19.8H, v7.H[0] // ....*......... - // gap // .............. - mls v8.8H, v23.8H, v7.H[0] // ......*....... - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - sub v19.8H, v28.8H, v8.8H // .......*...... - // gap // .............. - add v23.8H, v28.8H, v8.8H // ........*..... - // gap // .............. - // gap // .............. - // gap // .............. - sqrdmulh v22.8H, v19.8H, v11.H[1] // .........*.... - // gap // .............. - mul v19.8H, v19.8H, v11.H[0] // ...........*.. - // gap // .............. - str q23, [x1, #-48] // ..........*... - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - mls v19.8H, v22.8H, v7.H[0] // ............*. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. - // gap // .............. 
- str q19, [x1, #-16] // .............* - // gap // .............. - - // original source code - // sqdmulh v23.8H, v28.8H, v7.H[1] // *............. - // str q27, [x1, #-32] // ..*........... - // srshr v23.8H, v23.8H, #11 // ...*.......... - // sqdmulh v24.8H, v8.8H, v7.H[1] // .*............ - // mls v28.8H, v23.8H, v7.H[0] // .....*........ - // srshr v24.8H, v24.8H, #11 // ....*......... - // mls v8.8H, v24.8H, v7.H[0] // ......*....... - // sub v30.8H, v28.8H, v8.8H // .......*...... - // add v28.8H, v28.8H, v8.8H // ........*..... - // sqrdmulh v26.8H, v30.8H, v11.H[1] // .........*.... - // str q28, [x1, #-48] // ...........*.. - // mul v3.8H, v30.8H, v11.H[0] // ..........*... - // mls v3.8H, v26.8H, v7.H[0] // ............*. - // str q3, [x1, #-16] // .............* + // Instructions: 14 + // Expected cycles: 19 + // Expected IPC: 0.74 + // + // Cycle bound: 19.0 + // IPC bound: 0.74 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mls v17.8H, v6.8H, v7.H[0] // *............................. + // gap // .............................. + sub v15.8H, v16.8H, v27.8H // .*............................ + // gap // .............................. + mls v0.8H, v14.8H, v7.H[0] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v16.8H, v15.8H, v2.H[1] // ...*.......................... + // gap // .............................. + mul v15.8H, v15.8H, v2.H[0] // .....*........................ + // gap // .............................. + sub v27.8H, v17.8H, v0.8H // ....*......................... + // gap // .............................. + add v14.8H, v17.8H, v0.8H // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mul v6.8H, v27.8H, v2.H[0] // .......*...................... + // gap // .............................. + sqrdmulh v27.8H, v27.8H, v2.H[1] // ........*..................... + // gap // .............................. + mls v15.8H, v16.8H, v7.H[0] // .........*.................... + // gap // .............................. + str q14, [x1, #-48] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v6.8H, v27.8H, v7.H[0] // ..........*................... + // gap // .............................. + str q15, [x1, #-32] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q6, [x1, #-16] // .............*................ + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v17.8H, v6.8H, v7.H[0] // *.............................. + // sub v16.8H, v16.8H, v27.8H // .*............................. + // mls v0.8H, v14.8H, v7.H[0] // ..*............................ + // sqrdmulh v27.8H, v16.8H, v2.H[1] // ...*........................... + // sub v15.8H, v17.8H, v0.8H // .....*......................... + // mul v16.8H, v16.8H, v2.H[0] // ....*.......................... + // add v25.8H, v17.8H, v0.8H // ......*........................ + // mul v14.8H, v15.8H, v2.H[0] // .......*....................... + // sqrdmulh v0.8H, v15.8H, v2.H[1] // ........*...................... + // mls v16.8H, v27.8H, v7.H[0] // .........*..................... + // mls v14.8H, v0.8H, v7.H[0] // ...........*................... + // str q25, [x1, #-48] // ..........*.................... + // str q16, [x1, #-32] // ............*.................. + // str q14, [x1, #-16] // .............*................. 
// --------------------------------------------------------------------- @@ -902,580 +918,625 @@ layer4567_start: .p2align 2 - ldr q3, [x0, #256] // *........... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q11, [x0, #448] // .*.......... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q20, [x0, #384] // ..*......... - // gap // ............ - // gap // ............ - // gap // ............ - ldr q26, [x0, #320] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - add v5.8H, v20.8H, v11.8H // .........*.. - // gap // ............ - ldr q28, [x0, #192] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - add v25.8H, v3.8H, v26.8H // ........*... - // gap // ............ - ldr q22, [x0, #128] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - add v14.8H, v25.8H, v5.8H // ...........* - // gap // ............ - ldr q23, [x0, #64] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - add v24.8H, v22.8H, v28.8H // .......*.... - // gap // ............ - ldr q27, [x0, #0] // ..........*. - // gap // ............ - - // original source code - // ldr q3, [x0, #256] // *........... - // ldr q11, [x0, #448] // .*.......... - // ldr q20, [x0, #384] // ..*......... - // ldr q26, [x0, #320] // ...*........ - // ldr q23, [x0, #64] // .........*.. - // ldr q28, [x0, #192] // .....*...... - // ldr q22, [x0, #128] // .......*.... - // add v24.8H, v22.8H, v28.8H // ..........*. - // add v25.8H, v3.8H, v26.8H // ......*..... - // add v5.8H, v20.8H, v11.8H // ....*....... - // ldr q27, [x0, #0] // ...........* - // add v14.8H, v25.8H, v5.8H // ........*... 
+ // Instructions: 7 + // Expected cycles: 13 + // Expected IPC: 0.54 + // + // Cycle bound: 13.0 + // IPC bound: 0.54 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x0, #64] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q14, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q26, [x0, #192] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #256] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #320] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #384] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #448] // ......*....................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0, #64] // *.............................. + // ldr q14, [x0, #128] // .*............................. + // ldr q26, [x0, #192] // ..*............................ + // ldr q11, [x0, #256] // ...*........................... + // ldr q13, [x0, #320] // ....*.......................... + // ldr q25, [x0, #384] // .....*......................... + // ldr q22, [x0, #448] // ......*........................ 
sub count, count, #1 layer123_start: - sub v19.8H, v27.8H, v23.8H // ........*............................................................................... + // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 6.77s + // User time: 6.77s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + ldr q16, [x0, #0] // *....................................................................................... // gap // ........................................................................................ - add v23.8H, v27.8H, v23.8H // .........*.............................................................................. // gap // ........................................................................................ - sub v22.8H, v22.8H, v28.8H // .............*.......................................................................... // gap // ........................................................................................ - mul v28.8H, v19.8H, v0.H[6] // ..........*............................................................................. + sub v15.8H, v14.8H, v26.8H // .............*.......................................................................... // gap // ........................................................................................ - sqrdmulh v19.8H, v19.8H, v0.H[7] // ...........*............................................................................ + add v27.8H, v14.8H, v26.8H // ..............*......................................................................... // gap // ........................................................................................ - sub v27.8H, v23.8H, v24.8H // ............................*........................................................... 
+ sub v14.8H, v16.8H, v6.8H // ........*............................................................................... // gap // ........................................................................................ - add v23.8H, v23.8H, v24.8H // .............................*.......................................................... + add v16.8H, v16.8H, v6.8H // .........*.............................................................................. // gap // ........................................................................................ - mul v24.8H, v22.8H, v1.H[0] // ...............*........................................................................ + sqrdmulh v6.8H, v15.8H, v1.H[1] // ...............*........................................................................ // gap // ........................................................................................ - sqrdmulh v22.8H, v22.8H, v1.H[1] // ................*....................................................................... + mul v15.8H, v15.8H, v1.H[0] // ................*....................................................................... // gap // ........................................................................................ - mls v28.8H, v19.8H, v7.H[0] // ............*........................................................................... + sub v26.8H, v16.8H, v27.8H // ............................*........................................................... // gap // ........................................................................................ - sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + add v16.8H, v16.8H, v27.8H // .............................*.......................................................... // gap // ........................................................................................ 
- mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... + sqrdmulh v27.8H, v14.8H, v0.H[7] // ..........*............................................................................. // gap // ........................................................................................ - sqrdmulh v27.8H, v27.8H, v0.H[3] // ...............................*........................................................ + mul v14.8H, v14.8H, v0.H[6] // ...........*............................................................................ // gap // ........................................................................................ - sub v26.8H, v23.8H, v14.8H // ................................................*....................................... + mls v15.8H, v6.8H, v7.H[0] // .................*...................................................................... // gap // ........................................................................................ - add v23.8H, v23.8H, v14.8H // .................................................*...................................... + sub v6.8H, v11.8H, v13.8H // ..................*..................................................................... // gap // ........................................................................................ - mls v24.8H, v22.8H, v7.H[0] // .................*...................................................................... + add v11.8H, v11.8H, v13.8H // ...................*.................................................................... // gap // ........................................................................................ - mul v22.8H, v19.8H, v1.H[2] // ....................*................................................................... + mls v14.8H, v27.8H, v7.H[0] // ............*........................................................................... 
// gap // ........................................................................................ - sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + sqrdmulh v27.8H, v6.8H, v1.H[3] // ....................*................................................................... // gap // ........................................................................................ - sub v20.8H, v20.8H, v11.8H // .......................*................................................................ + add v3.8H, v25.8H, v22.8H // ........................*............................................................... // gap // ........................................................................................ - sub v11.8H, v28.8H, v24.8H // .................................*...................................................... + sqrdmulh v13.8H, v26.8H, v0.H[3] // ..............................*......................................................... // gap // ........................................................................................ - add v28.8H, v28.8H, v24.8H // ..................................*..................................................... + mul v26.8H, v26.8H, v0.H[2] // ...............................*........................................................ // gap // ........................................................................................ - mls v22.8H, v19.8H, v7.H[0] // ......................*................................................................. + add v31.8H, v11.8H, v3.8H // .......................................*................................................ // gap // ........................................................................................ - mul v19.8H, v20.8H, v1.H[4] // .........................*.............................................................. 
+ mul v6.8H, v6.8H, v1.H[2] // .....................*.................................................................. // gap // ........................................................................................ - mls v3.8H, v27.8H, v7.H[0] // ................................*....................................................... + sub v25.8H, v25.8H, v22.8H // .......................*................................................................ // gap // ........................................................................................ - sqrdmulh v27.8H, v20.8H, v1.H[5] // ..........................*............................................................. + sub v22.8H, v16.8H, v31.8H // ................................................*....................................... // gap // ........................................................................................ - mul v24.8H, v11.8H, v0.H[2] // ...................................*.................................................... + add v16.8H, v16.8H, v31.8H // .................................................*...................................... // gap // ........................................................................................ - sqrdmulh v20.8H, v11.8H, v0.H[3] // ....................................*................................................... + mls v6.8H, v27.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ - mul v4.8H, v26.8H, v0.H[0] // ..................................................*..................................... + sub v27.8H, v14.8H, v15.8H // .................................*...................................................... // gap // ........................................................................................ 
- sqrdmulh v11.8H, v26.8H, v0.H[1] // ...................................................*.................................... + add v15.8H, v14.8H, v15.8H // ..................................*..................................................... // gap // ........................................................................................ - mul v26.8H, v23.8H, v29.8H // ........................................................................*............... + sqrdmulh v14.8H, v25.8H, v1.H[5] // .........................*.............................................................. // gap // ........................................................................................ - sqrdmulh v6.8H, v23.8H, v30.8H // .........................................................................*.............. + mul v31.8H, v25.8H, v1.H[4] // ..........................*............................................................. // gap // ........................................................................................ - mls v19.8H, v27.8H, v7.H[0] // ...........................*............................................................ + sub v11.8H, v11.8H, v3.8H // ......................................*................................................. // gap // ........................................................................................ - mls v24.8H, v20.8H, v7.H[0] // .....................................*.................................................. + mls v26.8H, v13.8H, v7.H[0] // ................................*....................................................... // gap // ........................................................................................ - sub v27.8H, v25.8H, v5.8H // ......................................*................................................. + sqrdmulh v13.8H, v27.8H, v0.H[3] // ...................................*.................................................... 
// gap // ........................................................................................ - mls v4.8H, v11.8H, v7.H[0] // ....................................................*................................... + mls v31.8H, v14.8H, v7.H[0] // ...........................*............................................................ // gap // ........................................................................................ - sub v23.8H, v22.8H, v19.8H // ...........................................*............................................ + mul v27.8H, v27.8H, v0.H[2] // ....................................*................................................... // gap // ........................................................................................ - mul v20.8H, v27.8H, v0.H[4] // ........................................*............................................... + sqrdmulh v14.8H, v11.8H, v0.H[5] // ........................................*............................................... // gap // ........................................................................................ - sqrdmulh v11.8H, v27.8H, v0.H[5] // .........................................*.............................................. + mul v11.8H, v11.8H, v0.H[4] // .........................................*.............................................. // gap // ........................................................................................ - add v27.8H, v22.8H, v19.8H // ............................................*........................................... + sub v25.8H, v6.8H, v31.8H // ...........................................*............................................ // gap // ........................................................................................ - mul v22.8H, v23.8H, v0.H[4] // .............................................*.......................................... 
+ mls v27.8H, v13.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ - sqrdmulh v23.8H, v23.8H, v0.H[5] // ..............................................*......................................... + add v6.8H, v6.8H, v31.8H // ............................................*........................................... // gap // ........................................................................................ - sub v19.8H, v28.8H, v27.8H // .....................................................*.................................. + mls v11.8H, v14.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ - add v10.8H, v28.8H, v27.8H // ......................................................*................................. + sqrdmulh v14.8H, v25.8H, v0.H[5] // .............................................*.......................................... // gap // ........................................................................................ - mls v20.8H, v11.8H, v7.H[0] // ..........................................*............................................. + mul v13.8H, v25.8H, v0.H[4] // ..............................................*......................................... // gap // ........................................................................................ - mls v22.8H, v23.8H, v7.H[0] // ...............................................*........................................ + sqrdmulh v31.8H, v22.8H, v0.H[1] // ..................................................*..................................... // gap // ........................................................................................ 
- mul v28.8H, v19.8H, v0.H[0] // .......................................................*................................ + mul v25.8H, v22.8H, v0.H[0] // ...................................................*.................................... // gap // ........................................................................................ - sqrdmulh v23.8H, v19.8H, v0.H[1] // ........................................................*............................... + sub v22.8H, v15.8H, v6.8H // .....................................................*.................................. // gap // ........................................................................................ - sub v14.8H, v3.8H, v20.8H // ..........................................................*............................. + mls v13.8H, v14.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ - add v27.8H, v3.8H, v20.8H // ...........................................................*............................ + add v15.8H, v15.8H, v6.8H // ......................................................*................................. // gap // ........................................................................................ - ldr q3, [x0, #272] // ....e................................................................................... + sqrdmulh v14.8H, v16.8H, v30.8H // ........................................................................*............... // gap // ........................................................................................ + mul v16.8H, v16.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ 
+ mls v25.8H, v31.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ - sub v20.8H, v24.8H, v22.8H // ...............................................................*........................ + sqrdmulh v6.8H, v22.8H, v0.H[1] // .......................................................*................................ // gap // ........................................................................................ - add v21.8H, v24.8H, v22.8H // ................................................................*....................... + mul v31.8H, v22.8H, v0.H[0] // ........................................................*............................... // gap // ........................................................................................ - mls v26.8H, v6.8H, v7.H[0] // ..........................................................................*............. + sub v22.8H, v26.8H, v11.8H // ..........................................................*............................. // gap // ........................................................................................ - mul v22.8H, v20.8H, v0.H[0] // .................................................................*...................... + add v26.8H, v26.8H, v11.8H // ...........................................................*............................ // gap // ........................................................................................ - sqrdmulh v19.8H, v20.8H, v0.H[1] // ..................................................................*..................... + sub v11.8H, v27.8H, v13.8H // ...............................................................*........................ // gap // ........................................................................................ 
- mls v28.8H, v23.8H, v7.H[0] // .........................................................*.............................. + mls v31.8H, v6.8H, v7.H[0] // .........................................................*.............................. // gap // ........................................................................................ - ldr q11, [x0, #464] // .......e................................................................................ + sqrdmulh v6.8H, v22.8H, v0.H[1] // ............................................................*........................... // gap // ........................................................................................ + mul v22.8H, v22.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ + add v27.8H, v27.8H, v13.8H // ................................................................*....................... // gap // ........................................................................................ - mls v22.8H, v19.8H, v7.H[0] // ...................................................................*.................... + sqrdmulh v13.8H, v11.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ - str q28, [x0, #320] // .....................................................................*.................. + mul v11.8H, v11.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ - mul v23.8H, v10.8H, v29.8H // ...........................................................................*............ 
+ mls v22.8H, v6.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - sqrdmulh v19.8H, v10.8H, v30.8H // ............................................................................*........... + str q25, [x0, #256] // ....................................................................*................... // gap // ........................................................................................ - str q22, [x0, #448] // .......................................................................*................ + mls v16.8H, v14.8H, v7.H[0] // ..........................................................................*............. // gap // ........................................................................................ - mul v28.8H, v27.8H, v29.8H // ..............................................................................*......... + mls v11.8H, v13.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ - sqrdmulh v20.8H, v27.8H, v30.8H // ...............................................................................*........ + str q31, [x0, #320] // .....................................................................*.................. // gap // ........................................................................................ - str q26, [x0], #(16) // ....................................................................................*... + sqrdmulh v14.8H, v15.8H, v30.8H // ...........................................................................*............ // gap // ........................................................................................ 
- mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + str q22, [x0, #384] // ......................................................................*................. // gap // ........................................................................................ - mul v19.8H, v21.8H, v29.8H // .................................................................................*...... + mul v15.8H, v15.8H, v29.8H // ............................................................................*........... // gap // ........................................................................................ - sqrdmulh v22.8H, v21.8H, v30.8H // ..................................................................................*..... + str q11, [x0, #448] // .......................................................................*................ // gap // ........................................................................................ - mls v28.8H, v20.8H, v7.H[0] // ................................................................................*....... + sqrdmulh v6.8H, v26.8H, v30.8H // ..............................................................................*......... // gap // ........................................................................................ - ldr q20, [x0, #384] // ......e................................................................................. + mul v26.8H, v26.8H, v29.8H // ...............................................................................*........ // gap // ........................................................................................ + mls v15.8H, v14.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ 
+ sqrdmulh v14.8H, v27.8H, v30.8H // .................................................................................*...... // gap // ........................................................................................ - ldr q26, [x0, #320] // .....e.................................................................................. + mul v27.8H, v27.8H, v29.8H // ..................................................................................*..... // gap // ........................................................................................ + mls v26.8H, v6.8H, v7.H[0] // ................................................................................*....... // gap // ........................................................................................ + str q16, [x0], #(16) // ....................................................................................*... // gap // ........................................................................................ - str q23, [x0, #48] // .....................................................................................*.. + ldr q6, [x0, #64] // .e...................................................................................... // gap // ........................................................................................ - ldr q23, [x0, #64] // .e...................................................................................... // gap // ........................................................................................ // gap // ........................................................................................ + mls v27.8H, v14.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... 
+ str q15, [x0, #48] // .....................................................................................*.. // gap // ........................................................................................ - str q28, [x0, #112] // ......................................................................................*. + ldr q14, [x0, #128] // ..e..................................................................................... // gap // ........................................................................................ - ldr q28, [x0, #192] // ...e.................................................................................... // gap // ........................................................................................ // gap // ........................................................................................ + str q26, [x0, #112] // ......................................................................................*. // gap // ........................................................................................ - str q19, [x0, #176] // .......................................................................................* + ldr q26, [x0, #192] // ...e.................................................................................... // gap // ........................................................................................ - ldr q22, [x0, #128] // ..e..................................................................................... // gap // ........................................................................................ // gap // ........................................................................................ + str q27, [x0, #176] // .......................................................................................* // gap // ........................................................................................ 
- mul v8.8H, v14.8H, v0.H[0] // ............................................................*........................... + ldr q11, [x0, #256] // ....e................................................................................... // gap // ........................................................................................ - sqrdmulh v27.8H, v14.8H, v0.H[1] // .............................................................*.......................... // gap // ........................................................................................ - add v24.8H, v22.8H, v28.8H // ..............e......................................................................... // gap // ........................................................................................ - str q4, [x0, #240] // ....................................................................*................... + ldr q13, [x0, #320] // .....e.................................................................................. // gap // ........................................................................................ - add v25.8H, v3.8H, v26.8H // ...................e.................................................................... // gap // ........................................................................................ - mls v8.8H, v27.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - add v5.8H, v20.8H, v11.8H // ........................e............................................................... + ldr q25, [x0, #384] // ......e................................................................................. // gap // ........................................................................................ - ldr q27, [x0, #0] // e....................................................................................... 
// gap // ........................................................................................ // gap // ........................................................................................ + ldr q22, [x0, #448] // .......e................................................................................ // gap // ........................................................................................ - str q8, [x0, #368] // ......................................................................*................. // gap // ........................................................................................ - add v14.8H, v25.8H, v5.8H // .......................................e................................................ // gap // ........................................................................................ - // original source code - // ldr q8, [x0, #0] // ....................................e..|....................................................................................e. - // ldr q9, [x0, #(1*(512/8))] // .......................e...............|.......................................................................e.............. - // ldr q10, [x0, #(2*(512/8))] // ............................e..........|............................................................................e......... - // ldr q11, [x0, #(3*(512/8))] // ..........................e............|..........................................................................e........... - // ldr q12, [x0, #(4*(512/8))] // e......................................|................................................e..................................... - // ldr q13, [x0, #(5*(512/8))] // .....................e.................|.....................................................................e................ - // ldr q14, [x0, #(6*(512/8))] // ....................e..................|....................................................................e................. 
- // ldr q15, [x0, #(7*(512/8))] // .......e...............................|.......................................................e.............................. - // sub v24.8h, v8.8h, v9.8h // .......................................*...................................................................................... - // add v8.8h, v8.8h, v9.8h // .......................................|*..................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // .......................................|..*................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......................................|...*.................................................................................. - // mls v9.8h, v24.8h, v7.h[0] // .......................................|........*............................................................................. - // sub v24.8h, v10.8h, v11.8h // .......................................|.*.................................................................................... - // add v10.8h, v10.8h, v11.8h // ...............................e.......|...............................................................................e...... - // mul v11.8h, v24.8h, v1.h[0] // .......................................|......*............................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // .......................................|.......*.............................................................................. - // mls v11.8h, v24.8h, v7.h[0] // .......................................|..............*....................................................................... - // sub v24.8h, v12.8h, v13.8h // .......................................|.........*............................................................................ 
- // add v12.8h, v12.8h, v13.8h // .................................e.....|.................................................................................e.... - // mul v13.8h, v24.8h, v1.h[2] // .......................................|...............*...................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // .......................................|................*..................................................................... - // mls v13.8h, v24.8h, v7.h[0] // .......................................|....................*................................................................. - // sub v24.8h, v14.8h, v15.8h // .......................................|.................*.................................................................... - // add v14.8h, v14.8h, v15.8h // ...................................e...|...................................................................................e.. - // mul v15.8h, v24.8h, v1.h[4] // .......................................|.....................*................................................................ - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .......................................|.......................*.............................................................. - // mls v15.8h, v24.8h, v7.h[0] // .......................................|..............................*....................................................... - // sub v24.8h, v8.8h, v10.8h // .......................................|....*................................................................................. - // add v8.8h, v8.8h, v10.8h // .......................................|.....*................................................................................ - // mul v10.8h, v24.8h, v0.h[2] // .......................................|..........*........................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|...........*.......................................................................... - // mls v10.8h, v24.8h, v7.h[0] // .......................................|......................*............................................................... - // sub v24.8h, v9.8h, v11.8h // .......................................|..................*................................................................... - // add v9.8h, v9.8h, v11.8h // .......................................|...................*.................................................................. - // mul v11.8h, v24.8h, v0.h[2] // .......................................|........................*............................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|.........................*............................................................ - // mls v11.8h, v24.8h, v7.h[0] // .......................................|...............................*...................................................... - // sub v24.8h, v12.8h, v14.8h // .......................................|................................*..................................................... - // add v12.8h, v12.8h, v14.8h // ......................................e|...................................................................................... - // mul v14.8h, v24.8h, v0.h[4] // .......................................|...................................*.................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|....................................*................................................. - // mls v14.8h, v24.8h, v7.h[0] // .......................................|..........................................*........................................... 
- // sub v24.8h, v13.8h, v15.8h // .......................................|..................................*................................................... - // add v13.8h, v13.8h, v15.8h // .......................................|.....................................*................................................ - // mul v15.8h, v24.8h, v0.h[4] // .......................................|......................................*............................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|.......................................*.............................................. - // mls v15.8h, v24.8h, v7.h[0] // .......................................|...........................................*.......................................... - // sub v24.8h, v8.8h, v12.8h // .......................................|............*......................................................................... - // add v8.8h, v8.8h, v12.8h // .......................................|.............*........................................................................ - // mul v12.8h, v24.8h, v0.h[0] // .......................................|..........................*........................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|...........................*.......................................................... - // mls v12.8h, v24.8h, v7.h[0] // .......................................|.................................*.................................................... - // sub v24.8h, v9.8h, v13.8h // .......................................|........................................*............................................. - // add v9.8h, v9.8h, v13.8h // .......................................|.........................................*............................................ 
- // mul v13.8h, v24.8h, v0.h[0] // .......................................|............................................*......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|.............................................*........................................ - // mls v13.8h, v24.8h, v7.h[0] // ......*................................|......................................................*............................... - // sub v24.8h, v10.8h, v14.8h // .......................................|..............................................*....................................... - // add v10.8h, v10.8h, v14.8h // .......................................|...............................................*...................................... - // mul v14.8h, v24.8h, v0.h[0] // .............................*.........|.............................................................................*........ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................*........|..............................................................................*....... - // mls v14.8h, v24.8h, v7.h[0] // ..................................*....|..................................................................................*... - // sub v24.8h, v11.8h, v15.8h // .*.....................................|.................................................*.................................... - // add v11.8h, v11.8h, v15.8h // ..*....................................|..................................................*................................... - // mul v15.8h, v24.8h, v0.h[0] // ....*..................................|....................................................*................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....*.................................|.....................................................*................................ 
- // mls v15.8h, v24.8h, v7.h[0] // ........*..............................|........................................................*............................. - // str q12, [x0, #(4*(512/8))] // ................................*......|................................................................................*..... - // str q13, [x0, #(5*(512/8))] // .........*.............................|.........................................................*............................ - // str q14, [x0, #(6*(512/8))] // .....................................*.|.....................................................................................* - // str q15, [x0, #(7*(512/8))] // ............*..........................|............................................................*......................... - // mul v12.8h, v8.8h, v29.8h // .......................................|............................*......................................................... - // sqrdmulh v8.8h, v8.8h, v30.8h // .......................................|.............................*........................................................ - // mls v12.8h, v8.8h, v7.h[0] // ...*...................................|...................................................*.................................. - // mul v13.8h, v9.8h, v29.8h // ..........*............................|..........................................................*........................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ...........*...........................|...........................................................*.......................... - // mls v13.8h, v9.8h, v7.h[0] // ................*......................|................................................................*..................... - // mul v14.8h, v10.8h, v29.8h // .............*.........................|.............................................................*........................ 
- // sqrdmulh v10.8h, v10.8h, v30.8h // ..............*........................|..............................................................*....................... - // mls v14.8h, v10.8h, v7.h[0] // ...................*...................|...................................................................*.................. - // mul v15.8h, v11.8h, v29.8h // .................*.....................|.................................................................*.................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ..................*....................|..................................................................*................... - // mls v15.8h, v11.8h, v7.h[0] // ........................*..............|........................................................................*............. - // str q12, [x0], #(16) // ...............*.......................|...............................................................*...................... - // str q13, [x0, #(-16 + 1*(512/8))] // ......................*................|......................................................................*............... - // str q14, [x0, #(-16 + 2*(512/8))] // .........................*.............|.........................................................................*............ - // str q15, [x0, #(-16 + 3*(512/8))] // ...........................*...........|...........................................................................*.......... + // ---------------------------------------- new position ----------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + // ldr q8, [x0, #0] // ...........*................................................................................... + // ldr q9, [x0, #(1*(512/8))] // e..........'............................................................................~...... 
+ // ldr q10, [x0, #(2*(512/8))] // ...e.......'...............................................................................~... + // ldr q11, [x0, #(3*(512/8))] // .....e.....'.................................................................................~. + // ldr q12, [x0, #(4*(512/8))] // .......e...'................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e..'................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........e.'................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e'................................................................................... + // sub v24.8h, v8.8h, v9.8h // ...........'..*................................................................................ + // add v8.8h, v8.8h, v9.8h // ...........'...*............................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ...........'........*.......................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...........'.........*......................................................................... + // mls v9.8h, v27.8h, v7.h[0] // ...........'.............*..................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........'*.................................................................................. + // add v10.8h, v10.8h, v11.8h // ...........'.*................................................................................. + // sqrdmulh v27.8h, v24.8h, v1.h[1] // ...........'....*.............................................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...........'.....*............................................................................. 
+ // mls v11.8h, v27.8h, v7.h[0] // ...........'..........*........................................................................ + // sub v24.8h, v12.8h, v13.8h // ...........'...........*....................................................................... + // add v12.8h, v12.8h, v13.8h // ...........'............*...................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ...........'..............*.................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...........'...................*............................................................... + // mls v13.8h, v27.8h, v7.h[0] // ...........'.......................*........................................................... + // sub v24.8h, v14.8h, v15.8h // ...........'....................*.............................................................. + // add v14.8h, v14.8h, v15.8h // ...........'...............*................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[5] // ...........'..........................*........................................................ + // mul v15.8h, v24.8h, v1.h[4] // ...........'...........................*....................................................... + // mls v15.8h, v27.8h, v7.h[0] // ...........'...............................*................................................... + // sub v24.8h, v8.8h, v10.8h // ...........'......*............................................................................ + // add v8.8h, v8.8h, v10.8h // ...........'.......*........................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...........'................*.................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ...........'.................*................................................................. 
+ // mls v10.8h, v27.8h, v7.h[0] // ...........'.............................*..................................................... + // sub v24.8h, v9.8h, v11.8h // ...........'........................*.......................................................... + // add v9.8h, v9.8h, v11.8h // ...........'.........................*......................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...........'..............................*.................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........'................................*.................................................. + // mls v11.8h, v27.8h, v7.h[0] // ...........'....................................*.............................................. + // sub v24.8h, v12.8h, v14.8h // ...........'............................*...................................................... + // add v12.8h, v12.8h, v14.8h // ...........'..................*................................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...........'.................................*................................................. + // mul v14.8h, v24.8h, v0.h[4] // ...........'..................................*................................................ + // mls v14.8h, v27.8h, v7.h[0] // ...........'......................................*............................................ + // sub v24.8h, v13.8h, v15.8h // ...........'...................................*............................................... + // add v13.8h, v13.8h, v15.8h // ...........'.....................................*............................................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...........'.......................................*........................................... + // mul v15.8h, v24.8h, v0.h[4] // ...........'........................................*.......................................... 
+ // mls v15.8h, v27.8h, v7.h[0] // ...........'............................................*...................................... + // sub v24.8h, v8.8h, v12.8h // ...........'.....................*............................................................. + // add v8.8h, v8.8h, v12.8h // ...........'......................*............................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........'.........................................*......................................... + // mul v12.8h, v24.8h, v0.h[0] // ...........'..........................................*........................................ + // mls v12.8h, v27.8h, v7.h[0] // ...........'................................................*.................................. + // sub v24.8h, v9.8h, v13.8h // ...........'...........................................*....................................... + // add v9.8h, v9.8h, v13.8h // ...........'.............................................*..................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........'.................................................*................................. + // mul v13.8h, v24.8h, v0.h[0] // ...........'..................................................*................................ + // mls v13.8h, v27.8h, v7.h[0] // ...........'......................................................*............................ + // sub v24.8h, v10.8h, v14.8h // ...........'...................................................*............................... + // add v10.8h, v10.8h, v14.8h // ...........'....................................................*.............................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........'.......................................................*........................... + // mul v14.8h, v24.8h, v0.h[0] // ...........'........................................................*.......................... 
+ // mls v14.8h, v27.8h, v7.h[0] // ...........'............................................................*...................... + // sub v24.8h, v11.8h, v15.8h // ...........'.....................................................*............................. + // add v11.8h, v11.8h, v15.8h // ...........'.........................................................*......................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........'..........................................................*........................ + // mul v15.8h, v24.8h, v0.h[0] // ...........'...........................................................*....................... + // mls v15.8h, v27.8h, v7.h[0] // ...........'...............................................................*................... + // str q12, [x0, #(4*(512/8))] // ...........'.............................................................*..................... + // str q13, [x0, #(5*(512/8))] // ...........'................................................................*.................. + // str q14, [x0, #(6*(512/8))] // ...........'..................................................................*................ + // str q15, [x0, #(7*(512/8))] // ...........'....................................................................*.............. + // sqrdmulh v27.8h, v8.8h, v30.8h // ...........'..............................................*.................................... + // mul v8.8h, v8.8h, v29.8h // ...........'...............................................*................................... + // mls v8.8h, v27.8h, v7.h[0] // ...........'..............................................................*.................... + // sqrdmulh v27.8h, v9.8h, v30.8h // ...........'.................................................................*................. + // mul v9.8h, v9.8h, v29.8h // ...........'...................................................................*............... 
+ // mls v9.8h, v27.8h, v7.h[0] // ...........'.......................................................................*........... + // sqrdmulh v27.8h, v10.8h, v30.8h // ...........'.....................................................................*............. + // mul v10.8h, v10.8h, v29.8h // ...........'......................................................................*............ + // mls v10.8h, v27.8h, v7.h[0] // ...........'..........................................................................*........ + // sqrdmulh v27.8h, v11.8h, v30.8h // ...........'........................................................................*.......... + // mul v11.8h, v11.8h, v29.8h // ...........'.........................................................................*......... + // mls v11.8h, v27.8h, v7.h[0] // .~.........'.............................................................................*..... + // str q8, [x0], #(16) // ...........'...........................................................................*....... + // str q9, [x0, #(-16 + 1*(512/8))] // ..~........'..............................................................................*.... + // str q10, [x0, #(-16 + 2*(512/8))] // ....~......'................................................................................*.. + // str q11, [x0, #(-16 + 3*(512/8))] // ......~....'..................................................................................* sub count, count, #1 cbnz count, layer123_start - sub v10.8H, v22.8H, v28.8H // ..*......................................................................... - // gap // ............................................................................ - sub v18.8H, v27.8H, v23.8H // *........................................................................... - // gap // ............................................................................ 
- sub v19.8H, v20.8H, v11.8H // ..................*......................................................... - // gap // ............................................................................ - sqrdmulh v22.8H, v10.8H, v1.H[1] // ........*................................................................... - // gap // ............................................................................ - sqrdmulh v15.8H, v18.8H, v0.H[7] // ....*....................................................................... - // gap // ............................................................................ - mul v28.8H, v18.8H, v0.H[6] // ...*........................................................................ - // gap // ............................................................................ - mul v8.8H, v10.8H, v1.H[0] // .......*.................................................................... - // gap // ............................................................................ - sub v25.8H, v25.8H, v5.8H // .................................*.......................................... - // gap // ............................................................................ - mul v10.8H, v19.8H, v1.H[4] // ......................*..................................................... - // gap // ............................................................................ - mls v28.8H, v15.8H, v7.H[0] // .........*.................................................................. - // gap // ............................................................................ - mls v8.8H, v22.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - sqrdmulh v22.8H, v19.8H, v1.H[5] // ........................*................................................... - // gap // ............................................................................ 
- sqrdmulh v19.8H, v25.8H, v0.H[5] // .....................................*...................................... - // gap // ............................................................................ - add v11.8H, v27.8H, v23.8H // .*.......................................................................... - // gap // ............................................................................ - sub v20.8H, v28.8H, v8.8H // ...................*........................................................ - // gap // ............................................................................ - mls v10.8H, v22.8H, v7.H[0] // ...............................*............................................ - // gap // ............................................................................ - add v17.8H, v11.8H, v24.8H // ......*..................................................................... - // gap // ............................................................................ - sqrdmulh v27.8H, v20.8H, v0.H[3] // ..........................*................................................. - // gap // ............................................................................ - mul v5.8H, v20.8H, v0.H[2] // .........................*.................................................. - // gap // ............................................................................ - sub v18.8H, v11.8H, v24.8H // .....*...................................................................... - // gap // ............................................................................ - add v23.8H, v17.8H, v14.8H // ..............*............................................................. - // gap // ............................................................................ - mul v16.8H, v25.8H, v0.H[4] // ....................................*....................................... - // gap // ............................................................................ 
- mls v5.8H, v27.8H, v7.H[0] // ................................*........................................... - // gap // ............................................................................ - sqrdmulh v20.8H, v23.8H, v30.8H // ..............................*............................................. - // gap // ............................................................................ - sqrdmulh v27.8H, v18.8H, v0.H[3] // ............*............................................................... - // gap // ............................................................................ - mul v22.8H, v18.8H, v0.H[2] // ...........*................................................................ - // gap // ............................................................................ - mul v25.8H, v23.8H, v29.8H // .............................*.............................................. - // gap // ............................................................................ - sub v26.8H, v3.8H, v26.8H // ..........*................................................................. - // gap // ............................................................................ - mls v16.8H, v19.8H, v7.H[0] // ...........................................*................................ - // gap // ............................................................................ - mls v22.8H, v27.8H, v7.H[0] // .......................*.................................................... - // gap // ............................................................................ - sqrdmulh v19.8H, v26.8H, v1.H[3] // .................*.......................................................... - // gap // ............................................................................ - mul v11.8H, v26.8H, v1.H[2] // ................*........................................................... - // gap // ............................................................................ 
- add v26.8H, v28.8H, v8.8H // ....................*....................................................... - // gap // ............................................................................ - sub v3.8H, v22.8H, v16.8H // ...............................................*............................ - // gap // ............................................................................ - add v4.8H, v22.8H, v16.8H // ................................................*........................... - // gap // ............................................................................ - mls v11.8H, v19.8H, v7.H[0] // .....................*...................................................... - // gap // ............................................................................ - mul v19.8H, v3.8H, v0.H[0] // .......................................................................*.... - // gap // ............................................................................ - sqrdmulh v9.8H, v3.8H, v0.H[1] // ........................................................................*... - // gap // ............................................................................ - sqrdmulh v23.8H, v4.8H, v30.8H // .............................................................*.............. - // gap // ............................................................................ - sub v31.8H, v11.8H, v10.8H // ...................................*........................................ - // gap // ............................................................................ - add v3.8H, v11.8H, v10.8H // ......................................*..................................... - // gap // ............................................................................ - mul v8.8H, v4.8H, v29.8H // ............................................................*............... - // gap // ............................................................................ 
- mul v24.8H, v31.8H, v0.H[4] // .......................................*.................................... - // gap // ............................................................................ - sub v28.8H, v26.8H, v3.8H // .........................................*.................................. - // gap // ............................................................................ - sqrdmulh v27.8H, v31.8H, v0.H[5] // ........................................*................................... - // gap // ............................................................................ - mls v8.8H, v23.8H, v7.H[0] // ..................................................................*......... - // gap // ............................................................................ - mul v22.8H, v28.8H, v0.H[0] // .............................................*.............................. - // gap // ............................................................................ - sqrdmulh v28.8H, v28.8H, v0.H[1] // ..............................................*............................. - // gap // ............................................................................ - mls v24.8H, v27.8H, v7.H[0] // ............................................*............................... - // gap // ............................................................................ - mls v25.8H, v20.8H, v7.H[0] // ...................................................*........................ - // gap // ............................................................................ - add v11.8H, v26.8H, v3.8H // ..........................................*................................. - // gap // ............................................................................ - mls v22.8H, v28.8H, v7.H[0] // ......................................................*..................... - // gap // ............................................................................ 
- sub v23.8H, v5.8H, v24.8H // .................................................*.......................... - // gap // ............................................................................ - mul v3.8H, v11.8H, v29.8H // .........................................................*.................. - // gap // ............................................................................ - add v28.8H, v5.8H, v24.8H // ..................................................*......................... - // gap // ............................................................................ - sqrdmulh v24.8H, v23.8H, v0.H[1] // .....................................................*...................... - // gap // ............................................................................ - mul v20.8H, v23.8H, v0.H[0] // ....................................................*....................... - // gap // ............................................................................ - mul v27.8H, v28.8H, v29.8H // ................................................................*........... - // gap // ............................................................................ - sqrdmulh v23.8H, v28.8H, v30.8H // .................................................................*.......... - // gap // ............................................................................ - str q22, [x0, #320] // ........................................................*................... - // gap // ............................................................................ - mls v20.8H, v24.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - sub v22.8H, v17.8H, v14.8H // .............*.............................................................. - // gap // ............................................................................ 
- mls v27.8H, v23.8H, v7.H[0] // ....................................................................*....... - // gap // ............................................................................ - str q8, [x0, #128] // .....................................................................*...... - // gap // ............................................................................ - sqrdmulh v26.8H, v11.8H, v30.8H // ..........................................................*................. - // gap // ............................................................................ - str q20, [x0, #448] // ...........................................................*................ - // gap // ............................................................................ - sqrdmulh v28.8H, v22.8H, v0.H[1] // ............................*............................................... - // gap // ............................................................................ - mul v24.8H, v22.8H, v0.H[0] // ...........................*................................................ - // gap // ............................................................................ - str q27, [x0, #192] // ......................................................................*..... - // gap // ............................................................................ - mls v3.8H, v26.8H, v7.H[0] // ...............................................................*............ - // gap // ............................................................................ - str q25, [x0], #(16) // ..............................................................*............. - // gap // ............................................................................ - mls v24.8H, v28.8H, v7.H[0] // ..................................*......................................... - // gap // ............................................................................ 
- mls v19.8H, v9.8H, v7.H[0] // ..........................................................................*. - // gap // ............................................................................ - str q3, [x0, #48] // ...................................................................*........ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str q24, [x0, #240] // .........................................................................*.. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str q19, [x0, #368] // ...........................................................................* - // gap // ............................................................................ - - // original source code - // sub v19.8H, v27.8H, v23.8H // .*.......................................................................... - // add v23.8H, v27.8H, v23.8H // .............*.............................................................. - // sub v22.8H, v22.8H, v28.8H // *........................................................................... - // mul v28.8H, v19.8H, v0.H[6] // .....*...................................................................... - // sqrdmulh v19.8H, v19.8H, v0.H[7] // ....*....................................................................... - // sub v27.8H, v23.8H, v24.8H // ...................*........................................................ - // add v23.8H, v23.8H, v24.8H // ................*........................................................... 
- // mul v24.8H, v22.8H, v1.H[0] // ......*..................................................................... - // sqrdmulh v22.8H, v22.8H, v1.H[1] // ...*........................................................................ - // mls v28.8H, v19.8H, v7.H[0] // .........*.................................................................. - // sub v19.8H, v3.8H, v26.8H // ...........................*................................................ - // mul v3.8H, v27.8H, v0.H[2] // .........................*.................................................. - // sqrdmulh v27.8H, v27.8H, v0.H[3] // ........................*................................................... - // sub v26.8H, v23.8H, v14.8H // .............................................................*.............. - // add v23.8H, v23.8H, v14.8H // ....................*....................................................... - // mls v24.8H, v22.8H, v7.H[0] // ..........*................................................................. - // mul v22.8H, v19.8H, v1.H[2] // ...............................*............................................ - // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..............................*............................................. - // sub v20.8H, v20.8H, v11.8H // ..*......................................................................... - // sub v11.8H, v28.8H, v24.8H // ..............*............................................................. - // add v28.8H, v28.8H, v24.8H // ................................*........................................... - // mls v22.8H, v19.8H, v7.H[0] // ...................................*........................................ - // mul v19.8H, v20.8H, v1.H[4] // ........*................................................................... - // mls v3.8H, v27.8H, v7.H[0] // .............................*.............................................. 
- // sqrdmulh v27.8H, v20.8H, v1.H[5] // ...........*................................................................ - // mul v24.8H, v11.8H, v0.H[2] // ..................*......................................................... - // sqrdmulh v20.8H, v11.8H, v0.H[3] // .................*.......................................................... - // mul v4.8H, v26.8H, v0.H[0] // ...................................................................*........ - // sqrdmulh v11.8H, v26.8H, v0.H[1] // ..................................................................*......... - // mul v26.8H, v23.8H, v29.8H // ..........................*................................................. - // sqrdmulh v6.8H, v23.8H, v30.8H // .......................*.................................................... - // mls v19.8H, v27.8H, v7.H[0] // ...............*............................................................ - // mls v24.8H, v20.8H, v7.H[0] // ......................*..................................................... - // sub v27.8H, v25.8H, v5.8H // .......*.................................................................... - // mls v4.8H, v11.8H, v7.H[0] // .......................................................................*.... - // sub v23.8H, v22.8H, v19.8H // .......................................*.................................... - // mul v20.8H, v27.8H, v0.H[4] // .....................*...................................................... - // sqrdmulh v11.8H, v27.8H, v0.H[5] // ............*............................................................... - // add v27.8H, v22.8H, v19.8H // ........................................*................................... - // mul v22.8H, v23.8H, v0.H[4] // ..........................................*................................. - // sqrdmulh v23.8H, v23.8H, v0.H[5] // ............................................*............................... 
- // sub v19.8H, v28.8H, v27.8H // ...........................................*................................ - // add v10.8H, v28.8H, v27.8H // ..................................................*......................... - // mls v20.8H, v11.8H, v7.H[0] // ............................*............................................... - // mls v22.8H, v23.8H, v7.H[0] // ................................................*........................... - // mul v28.8H, v19.8H, v0.H[0] // ..............................................*............................. - // sqrdmulh v23.8H, v19.8H, v0.H[1] // ...............................................*............................ - // sub v14.8H, v3.8H, v20.8H // .................................*.......................................... - // add v27.8H, v3.8H, v20.8H // ..................................*......................................... - // sub v20.8H, v24.8H, v22.8H // ....................................................*....................... - // add v21.8H, v24.8H, v22.8H // ......................................................*..................... - // mls v26.8H, v6.8H, v7.H[0] // .................................................*.......................... - // mul v22.8H, v20.8H, v0.H[0] // ........................................................*................... - // sqrdmulh v19.8H, v20.8H, v0.H[1] // .......................................................*.................... - // mls v28.8H, v23.8H, v7.H[0] // ...................................................*........................ - // mls v22.8H, v19.8H, v7.H[0] // ............................................................*............... - // str q28, [x0, #320] // ...........................................................*................ - // mul v23.8H, v10.8H, v29.8H // .....................................................*...................... 
- // sqrdmulh v19.8H, v10.8H, v30.8H // ................................................................*........... - // str q22, [x0, #448] // .................................................................*.......... - // mul v28.8H, v27.8H, v29.8H // .........................................*.................................. - // sqrdmulh v20.8H, v27.8H, v30.8H // ......................................*..................................... - // str q26, [x0], #(16) // ......................................................................*..... - // mls v23.8H, v19.8H, v7.H[0] // .....................................................................*...... - // mul v19.8H, v21.8H, v29.8H // .........................................................*.................. - // sqrdmulh v22.8H, v21.8H, v30.8H // ..........................................................*................. - // mls v28.8H, v20.8H, v7.H[0] // .............................................*.............................. - // str q23, [x0, #48] // .........................................................................*.. - // mls v19.8H, v22.8H, v7.H[0] // ..............................................................*............. - // str q28, [x0, #112] // ...............................................................*............ - // str q19, [x0, #176] // ....................................................................*....... - // mul v8.8H, v14.8H, v0.H[0] // ....................................*....................................... - // sqrdmulh v27.8H, v14.8H, v0.H[1] // .....................................*...................................... - // str q4, [x0, #240] // ..........................................................................*. - // mls v8.8H, v27.8H, v7.H[0] // ........................................................................*... 
- // str q8, [x0, #368] // ...........................................................................* + // Instructions: 81 + // Expected cycles: 84 + // Expected IPC: 0.96 + // + // Cycle bound: 84.0 + // IPC bound: 0.96 + // + // Wall time: 30.47s + // User time: 30.47s + // + // ------------------------------ original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|----- + ldr q8, [x0, #0] // *................................................................................ + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + sub v19.8H, v14.8H, v26.8H // .*............................................................................... + // gap // ................................................................................. + add v27.8H, v14.8H, v26.8H // ..*.............................................................................. + // gap // ................................................................................. + sub v23.8H, v8.8H, v6.8H // ...*............................................................................. + // gap // ................................................................................. + mul v14.8H, v19.8H, v1.H[0] // ......*.......................................................................... + // gap // ................................................................................. + add v17.8H, v8.8H, v6.8H // ....*............................................................................ + // gap // ................................................................................. 
+ mul v15.8H, v23.8H, v0.H[6] // ..........*...................................................................... + // gap // ................................................................................. + sqrdmulh v9.8H, v23.8H, v0.H[7] // .........*....................................................................... + // gap // ................................................................................. + add v5.8H, v25.8H, v22.8H // ................*................................................................ + // gap // ................................................................................. + add v2.8H, v11.8H, v13.8H // .............*................................................................... + // gap // ................................................................................. + sub v12.8H, v17.8H, v27.8H // .......*......................................................................... + // gap // ................................................................................. + mls v15.8H, v9.8H, v7.H[0] // ..............*.................................................................. + // gap // ................................................................................. + sub v18.8H, v2.8H, v5.8H // .............................*................................................... + // gap // ................................................................................. + sqrdmulh v4.8H, v12.8H, v0.H[3] // .................*............................................................... + // gap // ................................................................................. + mul v16.8H, v12.8H, v0.H[2] // ..................*.............................................................. + // gap // ................................................................................. + sqrdmulh v20.8H, v18.8H, v0.H[5] // ..................................*.............................................. 
+ // gap // ................................................................................. + mul v3.8H, v18.8H, v0.H[4] // ...................................*............................................. + // gap // ................................................................................. + sqrdmulh v12.8H, v19.8H, v1.H[1] // .....*........................................................................... + // gap // ................................................................................. + add v10.8H, v2.8H, v5.8H // ...................*............................................................. + // gap // ................................................................................. + mls v16.8H, v4.8H, v7.H[0] // ..............................*.................................................. + // gap // ................................................................................. + mls v3.8H, v20.8H, v7.H[0] // .......................................*......................................... + // gap // ................................................................................. + mls v14.8H, v12.8H, v7.H[0] // ...........*..................................................................... + // gap // ................................................................................. + add v23.8H, v17.8H, v27.8H // ........*........................................................................ + // gap // ................................................................................. + sub v24.8H, v11.8H, v13.8H // ............*.................................................................... + // gap // ................................................................................. + sub v18.8H, v16.8H, v3.8H // ....................................................*............................ + // gap // ................................................................................. 
+ add v4.8H, v15.8H, v14.8H // ..........................*...................................................... + // gap // ................................................................................. + sub v6.8H, v23.8H, v10.8H // ......................*.......................................................... + // gap // ................................................................................. + mul v28.8H, v24.8H, v1.H[2] // ....................*............................................................ + // gap // ................................................................................. + sqrdmulh v31.8H, v24.8H, v1.H[3] // ...............*................................................................. + // gap // ................................................................................. + sqrdmulh v8.8H, v6.8H, v0.H[1] // ..........................................*...................................... + // gap // ................................................................................. + mul v27.8H, v18.8H, v0.H[0] // .........................................................*....................... + // gap // ................................................................................. + sub v19.8H, v25.8H, v22.8H // .....................*........................................................... + // gap // ................................................................................. + mls v28.8H, v31.8H, v7.H[0] // ........................*........................................................ + // gap // ................................................................................. + mul v22.8H, v6.8H, v0.H[0] // ...........................................*..................................... + // gap // ................................................................................. + mul v6.8H, v19.8H, v1.H[4] // ............................*.................................................... 
+ // gap // ................................................................................. + sqrdmulh v26.8H, v19.8H, v1.H[5] // ...........................*..................................................... + // gap // ................................................................................. + sub v2.8H, v15.8H, v14.8H // .........................*....................................................... + // gap // ................................................................................. + add v19.8H, v23.8H, v10.8H // .......................*......................................................... + // gap // ................................................................................. + mls v22.8H, v8.8H, v7.H[0] // .................................................*............................... + // gap // ................................................................................. + mls v6.8H, v26.8H, v7.H[0] // ................................*................................................ + // gap // ................................................................................. + sqrdmulh v25.8H, v2.8H, v0.H[3] // ...............................*................................................. + // gap // ................................................................................. + sqrdmulh v14.8H, v19.8H, v30.8H // ...............................................*................................. + // gap // ................................................................................. + mul v23.8H, v19.8H, v29.8H // ................................................*................................ + // gap // ................................................................................. + add v21.8H, v28.8H, v6.8H // ......................................*.......................................... + // gap // ................................................................................. 
+ sub v31.8H, v28.8H, v6.8H // ....................................*............................................ + // gap // ................................................................................. + mul v11.8H, v2.8H, v0.H[2] // .................................*............................................... + // gap // ................................................................................. + add v5.8H, v4.8H, v21.8H // ..............................................*.................................. + // gap // ................................................................................. + sqrdmulh v6.8H, v31.8H, v0.H[5] // ........................................*........................................ + // gap // ................................................................................. + mul v26.8H, v31.8H, v0.H[4] // .........................................*....................................... + // gap // ................................................................................. + sqrdmulh v15.8H, v5.8H, v30.8H // ..................................................................*.............. + // gap // ................................................................................. + mul v31.8H, v5.8H, v29.8H // ....................................................................*............ + // gap // ................................................................................. + mls v11.8H, v25.8H, v7.H[0] // .....................................*........................................... + // gap // ................................................................................. + mls v26.8H, v6.8H, v7.H[0] // .............................................*................................... + // gap // ................................................................................. + str q22, [x0, #256] // ..............................................................*.................. 
+ // gap // ................................................................................. + mls v31.8H, v15.8H, v7.H[0] // ........................................................................*........ + // gap // ................................................................................. + mls v23.8H, v14.8H, v7.H[0] // ...............................................................*................. + // gap // ................................................................................. + sub v15.8H, v11.8H, v26.8H // ......................................................*.......................... + // gap // ................................................................................. + sub v21.8H, v4.8H, v21.8H // ............................................*.................................... + // gap // ................................................................................. + str q31, [x0, #64] // ..............................................................................*.. + // gap // ................................................................................. + mul v14.8H, v15.8H, v0.H[0] // ............................................................*.................... + // gap // ................................................................................. + sqrdmulh v4.8H, v15.8H, v0.H[1] // ...........................................................*..................... + // gap // ................................................................................. + mul v17.8H, v21.8H, v0.H[0] // ...................................................*............................. + // gap // ................................................................................. + sqrdmulh v24.8H, v21.8H, v0.H[1] // ..................................................*.............................. + // gap // ................................................................................. 
+ add v3.8H, v16.8H, v3.8H // .....................................................*........................... + // gap // ................................................................................. + mls v14.8H, v4.8H, v7.H[0] // ................................................................*................ + // gap // ................................................................................. + add v6.8H, v11.8H, v26.8H // ..........................................................*...................... + // gap // ................................................................................. + sqrdmulh v5.8H, v18.8H, v0.H[1] // ........................................................*........................ + // gap // ................................................................................. + mls v17.8H, v24.8H, v7.H[0] // .......................................................*......................... + // gap // ................................................................................. + str q14, [x0, #448] // .....................................................................*........... + // gap // ................................................................................. + mul v25.8H, v6.8H, v29.8H // ..........................................................................*...... + // gap // ................................................................................. + mls v27.8H, v5.8H, v7.H[0] // .............................................................*................... + // gap // ................................................................................. + sqrdmulh v4.8H, v6.8H, v30.8H // .........................................................................*....... + // gap // ................................................................................. + mul v11.8H, v3.8H, v29.8H // .......................................................................*......... 
+ // gap // ................................................................................. + sqrdmulh v15.8H, v3.8H, v30.8H // ......................................................................*.......... + // gap // ................................................................................. + str q27, [x0, #384] // ...................................................................*............. + // gap // ................................................................................. + mls v25.8H, v4.8H, v7.H[0] // .............................................................................*... + // gap // ................................................................................. + str q17, [x0, #320] // .................................................................*............... + // gap // ................................................................................. + mls v11.8H, v15.8H, v7.H[0] // ...........................................................................*..... + // gap // ................................................................................. + str q23, [x0], #(16) // ............................................................................*.... + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + str q25, [x0, #176] // ................................................................................* + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. 
+ str q11, [x0, #112] // ...............................................................................*. + // gap // ................................................................................. + + // --------------------------------- new position ---------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|----- + // ldr q16, [x0, #0] // *................................................................................ + // sub v15.8H, v14.8H, v26.8H // .*............................................................................... + // add v27.8H, v14.8H, v26.8H // ..*.............................................................................. + // sub v14.8H, v16.8H, v6.8H // ...*............................................................................. + // add v16.8H, v16.8H, v6.8H // .....*........................................................................... + // sqrdmulh v6.8H, v15.8H, v1.H[1] // .................*............................................................... + // mul v15.8H, v15.8H, v1.H[0] // ....*............................................................................ + // sub v26.8H, v16.8H, v27.8H // ..........*...................................................................... + // add v16.8H, v16.8H, v27.8H // ......................*.......................................................... + // sqrdmulh v27.8H, v14.8H, v0.H[7] // .......*......................................................................... + // mul v14.8H, v14.8H, v0.H[6] // ......*.......................................................................... + // mls v15.8H, v6.8H, v7.H[0] // .....................*........................................................... + // sub v6.8H, v11.8H, v13.8H // .......................*......................................................... 
+ // add v11.8H, v11.8H, v13.8H // .........*....................................................................... + // mls v14.8H, v27.8H, v7.H[0] // ...........*..................................................................... + // sqrdmulh v27.8H, v6.8H, v1.H[3] // ............................*.................................................... + // add v3.8H, v25.8H, v22.8H // ........*........................................................................ + // sqrdmulh v13.8H, v26.8H, v0.H[3] // .............*................................................................... + // mul v26.8H, v26.8H, v0.H[2] // ..............*.................................................................. + // add v31.8H, v11.8H, v3.8H // ..................*.............................................................. + // mul v6.8H, v6.8H, v1.H[2] // ...........................*..................................................... + // sub v25.8H, v25.8H, v22.8H // ...............................*................................................. + // sub v22.8H, v16.8H, v31.8H // ..........................*...................................................... + // add v16.8H, v16.8H, v31.8H // .....................................*........................................... + // mls v6.8H, v27.8H, v7.H[0] // ................................*................................................ + // sub v27.8H, v14.8H, v15.8H // ....................................*............................................ + // add v15.8H, v14.8H, v15.8H // .........................*....................................................... + // sqrdmulh v14.8H, v25.8H, v1.H[5] // ...................................*............................................. + // mul v31.8H, v25.8H, v1.H[4] // ..................................*.............................................. 
+ // sub v11.8H, v11.8H, v3.8H // ............*.................................................................... + // mls v26.8H, v13.8H, v7.H[0] // ...................*............................................................. + // sqrdmulh v13.8H, v27.8H, v0.H[3] // ........................................*........................................ + // mls v31.8H, v14.8H, v7.H[0] // .......................................*......................................... + // mul v27.8H, v27.8H, v0.H[2] // .............................................*................................... + // sqrdmulh v14.8H, v11.8H, v0.H[5] // ...............*................................................................. + // mul v11.8H, v11.8H, v0.H[4] // ................*................................................................ + // sub v25.8H, v6.8H, v31.8H // ............................................*.................................... + // mls v27.8H, v13.8H, v7.H[0] // ...................................................*............................. + // add v6.8H, v6.8H, v31.8H // ...........................................*..................................... + // mls v11.8H, v14.8H, v7.H[0] // ....................*............................................................ + // sqrdmulh v14.8H, v25.8H, v0.H[5] // ...............................................*................................. + // mul v13.8H, v25.8H, v0.H[4] // ................................................*................................ + // sqrdmulh v31.8H, v22.8H, v0.H[1] // .............................*................................................... + // mul v25.8H, v22.8H, v0.H[0] // .................................*............................................... + // sub v22.8H, v15.8H, v6.8H // .........................................................*....................... 
+ // mls v13.8H, v14.8H, v7.H[0] // ....................................................*............................ + // add v15.8H, v15.8H, v6.8H // ..............................................*.................................. + // sqrdmulh v14.8H, v16.8H, v30.8H // .........................................*....................................... + // mul v16.8H, v16.8H, v29.8H // ..........................................*...................................... + // mls v25.8H, v31.8H, v7.H[0] // ......................................*.......................................... + // sqrdmulh v6.8H, v22.8H, v0.H[1] // ..............................................................*.................. + // mul v31.8H, v22.8H, v0.H[0] // .............................................................*................... + // sub v22.8H, v26.8H, v11.8H // ........................*........................................................ + // add v26.8H, v26.8H, v11.8H // ...............................................................*................. + // sub v11.8H, v27.8H, v13.8H // ........................................................*........................ + // mls v31.8H, v6.8H, v7.H[0] // ...................................................................*............. + // sqrdmulh v6.8H, v22.8H, v0.H[1] // ..................................................................*.............. + // mul v22.8H, v22.8H, v0.H[0] // ..............................*.................................................. + // add v27.8H, v27.8H, v13.8H // .................................................................*............... + // sqrdmulh v13.8H, v11.8H, v0.H[1] // ............................................................*.................... + // mul v11.8H, v11.8H, v0.H[0] // ...........................................................*..................... 
+ // mls v22.8H, v6.8H, v7.H[0] // ......................................................................*.......... + // str q25, [x0, #256] // .....................................................*........................... + // mls v16.8H, v14.8H, v7.H[0] // .......................................................*......................... + // mls v11.8H, v13.8H, v7.H[0] // ................................................................*................ + // str q31, [x0, #320] // ............................................................................*.... + // sqrdmulh v14.8H, v15.8H, v30.8H // .................................................*............................... + // str q22, [x0, #384] // ..........................................................................*...... + // mul v15.8H, v15.8H, v29.8H // ..................................................*.............................. + // str q11, [x0, #448] // ....................................................................*............ + // sqrdmulh v6.8H, v26.8H, v30.8H // .........................................................................*....... + // mul v26.8H, v26.8H, v29.8H // ........................................................................*........ + // mls v15.8H, v14.8H, v7.H[0] // ......................................................*.......................... + // sqrdmulh v14.8H, v27.8H, v30.8H // .......................................................................*......... + // mul v27.8H, v27.8H, v29.8H // .....................................................................*........... + // mls v26.8H, v6.8H, v7.H[0] // .............................................................................*... + // str q16, [x0], #(16) // ..............................................................................*.. + // mls v27.8H, v14.8H, v7.H[0] // ...........................................................................*..... 
+ // str q15, [x0, #48] // ..........................................................*...................... + // str q26, [x0, #112] // ................................................................................* + // str q27, [x0, #176] // ...............................................................................*. pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s index 669bcee5..1e92f32b 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h 
@@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // 
slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,100 +339,113 @@ _intt_kyber_123_4567_manual_ld4_opt_a72: mov count, #8 .p2align 2 - ldr q2, [x4, #64] // ..........*......................................... - ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // *................................................... - ldr q31, [x4, #32] // ............*....................................... - ldr q21, [x4, #16] // ....*............................................... - ldr q23, [x4, #48] // .*.................................................. + // Instructions: 52 + // Expected cycles: 64 + // Expected IPC: 0.81 + // + // Cycle bound: 64.0 + // IPC bound: 0.81 + // + // Wall time: 1.35s + // User time: 1.35s + // + // ---------------- original position ----------------> + // 0 25 50 + // |------------------------|------------------------|- + ldr q0, [x4, #80] // ........*........................................... + ldr q14, [x4], #(6*16) // *................................................... + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // .*.................................................. + ldr q3, [x3], #16 // .........................................*.......... + ldr q26, [x4, #-48] // ....*............................................... // gap // .................................................... - ldr q10, [x4, #80] // ..*................................................. 
- ldr q27, [x4], #(6*16) // ...*................................................ + ldr q6, [x4, #-64] // ...*................................................ // gap // .................................................... // gap // .................................................... + ldr q27, [x4, #-80] // ..*................................................. // gap // .................................................... // gap // .................................................... - sub v12.8H, v17.8H, v18.8H // .......*............................................ - add v9.8H, v19.8H, v20.8H // ......*............................................. + sub v13.8H, v15.8H, v16.8H // .....*.............................................. + add v15.8H, v15.8H, v16.8H // .......*............................................ + ldr q11, [x4, #-32] // ...........*........................................ + add v16.8H, v17.8H, v18.8H // ......*............................................. + sub v31.8H, v17.8H, v18.8H // ..........*......................................... // gap // .................................................... - sub v3.8H, v19.8H, v20.8H // .....*.............................................. - add v20.8H, v17.8H, v18.8H // ........*........................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... + mul v6.8H, v13.8H, v6.8H // .........*.......................................... // gap // .................................................... - mul v29.8H, v12.8H, v31.8H // ................*................................... // gap // .................................................... + add v25.8H, v15.8H, v16.8H // ............*....................................... // gap // .................................................... 
- add v4.8H, v20.8H, v9.8H // ..............*..................................... // gap // .................................................... + sub v15.8H, v15.8H, v16.8H // ..............*..................................... + sqrdmulh v26.8H, v13.8H, v26.8H // .............*...................................... // gap // .................................................... - sqrdmulh v15.8H, v12.8H, v23.8H // ...........*........................................ - sub v1.8H, v20.8H, v9.8H // .............*...................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... - sqrdmulh v13.8H, v3.8H, v10.8H // .........*.......................................... // gap // .................................................... + sqrdmulh v16.8H, v31.8H, v0.8H // ...............*.................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... + mul v0.8H, v31.8H, v11.8H // ................*................................... // gap // .................................................... - mul v28.8H, v3.8H, v2.8H // ...............*.................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... + mls v6.8H, v26.8H, v7.H[0] // .................*.................................. // gap // .................................................... - mls v29.8H, v15.8H, v7.H[0] // .................*.................................. 
// gap // .................................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... + mls v0.8H, v16.8H, v7.H[0] // ..................*................................. // gap // .................................................... - mls v28.8H, v13.8H, v7.H[0] // ..................*................................. - ldr q13, [x3], #16 // .........................................*.......... // gap // .................................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... - mul v10.8H, v1.8H, v27.8H // ......................*............................. // gap // .................................................... // gap // .................................................... + sqrdmulh v16.8H, v15.8H, v27.8H // ...................*................................ // gap // .................................................... // gap // .................................................... // gap // .................................................... + mul v31.8H, v15.8H, v14.8H // .....................*.............................. // gap // .................................................... // gap // .................................................... + sub v26.8H, v6.8H, v0.8H // ....................*............................... // gap // .................................................... - sub v17.8H, v29.8H, v28.8H // ....................*............................... - add v23.8H, v29.8H, v28.8H // .....................*.............................. // gap // .................................................... + add v6.8H, v6.8H, v0.8H // ......................*............................. 
// gap // .................................................... // gap // .................................................... - sqrdmulh v29.8H, v1.8H, v21.8H // ...................*................................ + mls v31.8H, v16.8H, v7.H[0] // ...........................*........................ // gap // .................................................... // gap // .................................................... // gap // .................................................... - trn1 v22.4S, v4.4S, v23.4S // ........................*........................... - sqrdmulh v18.8H, v17.8H, v21.8H // .......................*............................ // gap // .................................................... - trn2 v1.4S, v4.4S, v23.4S // ..........................*......................... // gap // .................................................... + trn1 v0.4S, v25.4S, v6.4S // ........................*........................... + sqrdmulh v27.8H, v26.8H, v27.8H // .......................*............................ // gap // .................................................... - mul v16.8H, v17.8H, v27.8H // .........................*.......................... + trn2 v6.4S, v25.4S, v6.4S // .........................*.......................... // gap // .................................................... // gap // .................................................... + mul v15.8H, v26.8H, v14.8H // ..........................*......................... // gap // .................................................... // gap // .................................................... // gap // .................................................... - mls v10.8H, v29.8H, v7.H[0] // ...........................*........................ // gap // .................................................... // gap // .................................................... // gap // .................................................... 
// gap // .................................................... // gap // .................................................... - mls v16.8H, v18.8H, v7.H[0] // ............................*....................... + mls v15.8H, v27.8H, v7.H[0] // ............................*....................... // gap // .................................................... // gap // .................................................... // gap // .................................................... @@ -462,8 +460,8 @@ _intt_kyber_123_4567_manual_ld4_opt_a72: // gap // .................................................... // gap // .................................................... // gap // .................................................... - trn2 v21.4S, v10.4S, v16.4S // .............................*...................... - trn1 v10.4S, v10.4S, v16.4S // ..............................*..................... + trn1 v14.4S, v31.4S, v15.4S // ..............................*..................... + trn2 v27.4S, v31.4S, v15.4S // .............................*...................... // gap // .................................................... // gap // .................................................... // gap // .................................................... @@ -471,61 +469,63 @@ _intt_kyber_123_4567_manual_ld4_opt_a72: // gap // .................................................... // gap // .................................................... // gap // .................................................... - trn2 v14.2D, v1.2D, v21.2D // ..................................*................. - trn2 v6.2D, v22.2D, v10.2D // .................................*.................. + trn1 v15.2D, v6.2D, v27.2D // .................................*.................. + trn1 v16.2D, v0.2D, v14.2D // ..................................*................. // gap // .................................................... 
- trn1 v17.2D, v22.2D, v10.2D // ................................*................... - trn1 v24.2D, v1.2D, v21.2D // ...............................*.................... + trn2 v27.2D, v6.2D, v27.2D // ...............................*.................... // gap // .................................................... // gap // .................................................... + trn2 v14.2D, v0.2D, v14.2D // ................................*................... // gap // .................................................... // gap // .................................................... - add v18.8H, v6.8H, v14.8H // .....................................*.............. + sub v11.8H, v16.8H, v15.8H // .....................................*.............. + add v0.8H, v16.8H, v15.8H // ....................................*............... // gap // .................................................... // gap // .................................................... - add v23.8H, v17.8H, v24.8H // ...................................*................ - sub v30.8H, v17.8H, v24.8H // ....................................*............... // gap // .................................................... - sub v8.8H, v6.8H, v14.8H // .............................................*...... // gap // .................................................... + add v4.8H, v14.8H, v27.8H // ...................................*................ + sub v6.8H, v14.8H, v27.8H // .......................................*............ // gap // .................................................... + sqdmulh v16.8H, v0.8H, v7.H[1] // ........................................*........... // gap // .................................................... // gap // .................................................... - sqdmulh v4.8H, v18.8H, v7.H[1] // .......................................*............ // gap // .................................................... 
// gap // .................................................... // gap // .................................................... // gap // .................................................... // gap // .................................................... - sqdmulh v19.8H, v23.8H, v7.H[1] // ......................................*............. + sqdmulh v15.8H, v4.8H, v7.H[1] // ......................................*............. // gap // .................................................... // gap // .................................................... // gap // .................................................... + sqrdmulh v9.8H, v11.8H, v3.H[3] // .................................................*.. // gap // .................................................... // gap // .................................................... + srshr v16.8H, v16.8H, #11 // ...........................................*........ // gap // .................................................... - srshr v11.8H, v4.8H, #11 // ..........................................*......... // gap // .................................................... // gap // .................................................... // gap // .................................................... + mul v24.8H, v6.8H, v3.H[4] // ...............................................*.... // gap // .................................................... // gap // .................................................... - srshr v17.8H, v19.8H, #11 // ........................................*........... + srshr v15.8H, v15.8H, #11 // ..........................................*......... + mls v0.8H, v16.8H, v7.H[0] // .............................................*...... // gap // .................................................... // gap // .................................................... - mls v18.8H, v11.8H, v7.H[0] // ............................................*....... // gap // .................................................... 
// gap // .................................................... // gap // .................................................... + mls v4.8H, v15.8H, v7.H[0] // ............................................*....... // gap // .................................................... // gap // .................................................... - mls v23.8H, v17.8H, v7.H[0] // ...........................................*........ // gap // .................................................... // gap // .................................................... // gap // .................................................... + sqrdmulh v27.8H, v6.8H, v3.H[5] // ..............................................*..... // gap // .................................................... // gap // .................................................... - sqrdmulh v11.8H, v8.8H, v13.H[5] // .................................................*.. // gap // .................................................... // gap // .................................................... // gap // .................................................... @@ -534,477 +534,501 @@ _intt_kyber_123_4567_manual_ld4_opt_a72: // gap // .................................................... // gap // .................................................... // gap // .................................................... - sub v26.8H, v23.8H, v18.8H // ................................................*... - add v16.8H, v23.8H, v18.8H // ...............................................*.... + add v15.8H, v0.8H, v4.8H // ................................................*... // gap // .................................................... // gap // .................................................... // gap // .................................................... - sqrdmulh v23.8H, v30.8H, v13.H[3] // ...................................................* + mls v24.8H, v27.8H, v7.H[0] // ..................................................*. 
// gap // .................................................... // gap // .................................................... + str q15, [x1], #(64) // ...................................................* // gap // .................................................... - str q16, [x1], #(64) // ..................................................*. - mul v12.8H, v8.8H, v13.H[4] // ..............................................*..... // gap // .................................................... - // original source code - // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*.................................................. - // ldr q27, [x4, #48] // ....*............................................... - // ldr q24, [x4, #80] // .....*.............................................. - // ldr q11, [x4], #(6*16) // ......*............................................. - // ldr q8, [x4, #-80] // ...*................................................ - // sub v28.8H, v5.8H, v6.8H // .........*.......................................... - // add v29.8H, v5.8H, v6.8H // ........*........................................... - // sub v6.8H, v3.8H, v4.8H // .......*............................................ - // add v0.8H, v3.8H, v4.8H // ..........*......................................... - // sqrdmulh v3.8H, v28.8H, v24.8H // ...............*.................................... - // ldr q18, [x4, #-32] // *................................................... - // sqrdmulh v25.8H, v6.8H, v27.8H // .............*...................................... - // ldr q27, [x4, #-64] // ..*................................................. - // sub v19.8H, v0.8H, v29.8H // ..............*..................................... - // add v15.8H, v0.8H, v29.8H // ............*....................................... - // mul v29.8H, v28.8H, v18.8H // ................*................................... - // mul v27.8H, v6.8H, v27.8H // ...........*........................................ 
- // mls v27.8H, v25.8H, v7.H[0] // .................*.................................. - // mls v29.8H, v3.8H, v7.H[0] // ..................*................................. - // sqrdmulh v18.8H, v19.8H, v8.8H // .......................*............................ - // sub v10.8H, v27.8H, v29.8H // .....................*.............................. - // add v24.8H, v27.8H, v29.8H // ......................*............................. - // mul v0.8H, v19.8H, v11.8H // ....................*............................... - // sqrdmulh v4.8H, v10.8H, v8.8H // .........................*.......................... - // trn1 v9.4S, v15.4S, v24.4S // ........................*........................... - // mul v25.8H, v10.8H, v11.8H // ...........................*........................ - // trn2 v24.4S, v15.4S, v24.4S // ..........................*......................... - // mls v0.8H, v18.8H, v7.H[0] // ............................*....................... - // mls v25.8H, v4.8H, v7.H[0] // .............................*...................... - // trn2 v11.4S, v0.4S, v25.4S // ..............................*..................... - // trn1 v27.4S, v0.4S, v25.4S // ...............................*.................... - // trn1 v20.2D, v24.2D, v11.2D // ...................................*................ - // trn1 v30.2D, v9.2D, v27.2D // ..................................*................. - // trn2 v15.2D, v9.2D, v27.2D // .................................*.................. - // trn2 v16.2D, v24.2D, v11.2D // ................................*................... - // add v29.8H, v30.8H, v20.8H // .....................................*.............. - // sub v30.8H, v30.8H, v20.8H // ......................................*............. - // add v25.8H, v15.8H, v16.8H // ....................................*............... - // sqdmulh v24.8H, v29.8H, v7.H[1] // .........................................*.......... 
- // sqdmulh v26.8H, v25.8H, v7.H[1] // ........................................*........... - // srshr v24.8H, v24.8H, #11 // ...........................................*........ - // ldr q13, [x3], #16 // ...................*................................ - // srshr v23.8H, v26.8H, #11 // ..........................................*......... - // mls v29.8H, v24.8H, v7.H[0] // .............................................*...... - // mls v25.8H, v23.8H, v7.H[0] // ............................................*....... - // sub v14.8H, v15.8H, v16.8H // .......................................*............ - // mul v12.8H, v14.8H, v13.H[4] // ...................................................* - // add v28.8H, v29.8H, v25.8H // ................................................*... - // sub v26.8H, v29.8H, v25.8H // ...............................................*.... - // sqrdmulh v11.8H, v14.8H, v13.H[5] // ..............................................*..... - // str q28, [x1], #(64) // ..................................................*. - // sqrdmulh v23.8H, v30.8H, v13.H[3] // .................................................*.. + // ------------------ new position -------------------> + // 0 25 50 + // |------------------------|------------------------|- + // ldr q8, [x4], #(6*16) // .*.................................................. + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // ..*................................................. + // ldr q23, [x4, #-80] // ......*............................................. + // ldr q30, [x4, #-64] // .....*.............................................. + // ldr q15, [x4, #-48] // ....*............................................... + // sub v2.8H, v18.8H, v19.8H // .......*............................................ + // add v27.8H, v20.8H, v21.8H // ..........*......................................... + // add v28.8H, v18.8H, v19.8H // ........*........................................... 
+ // ldr q18, [x4, #-16] // *................................................... + // mul v25.8H, v2.8H, v30.8H // ............*....................................... + // sub v19.8H, v20.8H, v21.8H // ...........*........................................ + // ldr q20, [x4, #-32] // .........*.......................................... + // add v11.8H, v28.8H, v27.8H // .............*...................................... + // sqrdmulh v12.8H, v2.8H, v15.8H // ...............*.................................... + // sub v31.8H, v28.8H, v27.8H // ..............*..................................... + // sqrdmulh v0.8H, v19.8H, v18.8H // ................*................................... + // mul v14.8H, v19.8H, v20.8H // .................*.................................. + // mls v25.8H, v12.8H, v7.H[0] // ..................*................................. + // mls v14.8H, v0.8H, v7.H[0] // ...................*................................ + // sqrdmulh v0.8H, v31.8H, v23.8H // ....................*............................... + // sub v1.8H, v25.8H, v14.8H // ......................*............................. + // mul v6.8H, v31.8H, v8.8H // .....................*.............................. + // add v14.8H, v25.8H, v14.8H // .......................*............................ + // sqrdmulh v10.8H, v1.8H, v23.8H // ..........................*......................... + // trn1 v9.4S, v11.4S, v14.4S // .........................*.......................... + // trn2 v30.4S, v11.4S, v14.4S // ...........................*........................ + // mul v22.8H, v1.8H, v8.8H // ............................*....................... + // mls v6.8H, v0.8H, v7.H[0] // ........................*........................... + // mls v22.8H, v10.8H, v7.H[0] // .............................*...................... + // trn2 v17.4S, v6.4S, v22.4S // ...............................*.................... 
+ // trn1 v22.4S, v6.4S, v22.4S // ..............................*..................... + // trn2 v27.2D, v30.2D, v17.2D // ..................................*................. + // trn2 v6.2D, v9.2D, v22.2D // ...................................*................ + // trn1 v14.2D, v30.2D, v17.2D // ................................*................... + // trn1 v28.2D, v9.2D, v22.2D // .................................*.................. + // add v4.8H, v6.8H, v27.8H // ......................................*............. + // add v0.8H, v28.8H, v14.8H // .....................................*.............. + // sub v11.8H, v28.8H, v14.8H // ....................................*............... + // sqdmulh v16.8H, v4.8H, v7.H[1] // .........................................*.......... + // sub v14.8H, v6.8H, v27.8H // .......................................*............ + // sqdmulh v27.8H, v0.8H, v7.H[1] // ........................................*........... + // ldr q3, [x3], #16 // ...*................................................ + // srshr v15.8H, v16.8H, #11 // .............................................*...... + // srshr v21.8H, v27.8H, #11 // ...........................................*........ + // mls v4.8H, v15.8H, v7.H[0] // ...............................................*.... + // mls v0.8H, v21.8H, v7.H[0] // ..............................................*..... + // sqrdmulh v8.8H, v14.8H, v3.H[5] // ................................................*... + // mul v24.8H, v14.8H, v3.H[4] // ............................................*....... + // add v15.8H, v0.8H, v4.8H // .................................................*.. + // sqrdmulh v9.8H, v11.8H, v3.H[3] // ..........................................*......... + // mls v24.8H, v8.8H, v7.H[0] // ..................................................*. 
+ // str q15, [x1], #(64) // ...................................................* sub count, count, #1 layer4567_start: - ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................... - ldr q27, [x4, #48] // ....e................................................................... - // gap // ........................................................................ - ldr q24, [x4, #80] // ......e................................................................. - mul v22.8H, v30.8H, v13.H[2] // ......................................*................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v12.8H, v11.8H, v7.H[0] // .............................................*.......................... - ldr q11, [x4], #(6*16) // .e...................................................................... - // gap // ........................................................................ - ldr q8, [x4, #-80] // ..e..................................................................... - sub v28.8H, v5.8H, v6.8H // ............e........................................................... - // gap // ........................................................................ - add v29.8H, v5.8H, v6.8H // .............e.......................................................... - mls v22.8H, v23.8H, v7.H[0] // ........................................*............................... - // gap // ........................................................................ - sub v6.8H, v3.8H, v4.8H // .......e................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - add v0.8H, v3.8H, v4.8H // ........e............................................................... - sqrdmulh v3.8H, v28.8H, v24.8H // ...............e........................................................ - ldr q18, [x4, #-32] // .....e.................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v25.8H, v6.8H, v27.8H // ..........e............................................................. - ldr q27, [x4, #-64] // ...e.................................................................... - // gap // ........................................................................ - sub v19.8H, v0.8H, v29.8H // .................e...................................................... - // gap // ........................................................................ - // gap // ........................................................................ - add v15.8H, v0.8H, v29.8H // ..................e..................................................... - mul v29.8H, v28.8H, v18.8H // ..............e......................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v27.8H, v6.8H, v27.8H // .........e.............................................................. 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v27.8H, v25.8H, v7.H[0] // ...........e............................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v29.8H, v3.8H, v7.H[0] // ................e....................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v28.8H, v12.8H, v7.H[1] // .......................................................*................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - sqrdmulh v18.8H, v19.8H, v8.8H // ....................e................................................... - // gap // ........................................................................ - // gap // ........................................................................ - sub v10.8H, v27.8H, v29.8H // ......................e................................................. - // gap // ........................................................................ - // gap // ........................................................................ - add v24.8H, v27.8H, v29.8H // .......................e................................................ - // gap // ........................................................................ - mul v0.8H, v19.8H, v11.8H // ...................e.................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v4.8H, v10.8H, v8.8H // .........................e.............................................. - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v9.4S, v15.4S, v24.4S // ...........................e............................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v25.8H, v10.8H, v11.8H // ........................e............................................... - trn2 v24.4S, v15.4S, v24.4S // ............................e........................................... 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v0.8H, v18.8H, v7.H[0] // .....................e.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v25.8H, v4.8H, v7.H[0] // ..........................e............................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v3.8H, v22.8H, v7.H[1] // .................................................*...................... - // gap // ........................................................................ - // gap // ........................................................................ - srshr v27.8H, v28.8H, #11 // ........................................................*............... - // gap // ........................................................................ - // gap // ........................................................................ 
- mul v14.8H, v26.8H, v13.H[0] // ............................................................*........... - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v11.4S, v0.4S, v25.4S // ..............................e......................................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v12.8H, v27.8H, v7.H[0] // .........................................................*.............. - trn1 v27.4S, v0.4S, v25.4S // .............................e.......................................... - // gap // ........................................................................ - srshr v3.8H, v3.8H, #11 // ..................................................*..................... - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v2.8H, v26.8H, v13.H[1] // .............................................................*.......... - trn1 v20.2D, v24.2D, v11.2D // ..................................e..................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v30.2D, v9.2D, v27.2D // .................................e...................................... - trn2 v15.2D, v9.2D, v27.2D // ...............................e........................................ - trn2 v16.2D, v24.2D, v11.2D // ................................e....................................... - // gap // ........................................................................ 
- mls v22.8H, v3.8H, v7.H[0] // ...................................................*.................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v29.8H, v30.8H, v20.8H // .....................................e.................................. - mls v14.8H, v2.8H, v7.H[0] // ..............................................................*......... - sub v30.8H, v30.8H, v20.8H // ....................................e................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v25.8H, v15.8H, v16.8H // ..........................................e............................. - sqdmulh v24.8H, v29.8H, v7.H[1] // ..............................................e......................... - // gap // ........................................................................ - // gap // ........................................................................ - sub v28.8H, v22.8H, v12.8H // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - add v11.8H, v22.8H, v12.8H // ................................................................*....... - // gap // ........................................................................ - sqdmulh v26.8H, v25.8H, v7.H[1] // ....................................................e................... 
- str q14, [x1, #-32] // ......................................................................*. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v0.8H, v28.8H, v13.H[1] // ..................................................................*..... - srshr v24.8H, v24.8H, #11 // ...............................................e........................ - str q11, [x1, #-48] // .....................................................................*.. - // gap // ........................................................................ - mul v10.8H, v28.8H, v13.H[0] // .................................................................*...... - ldr q13, [x3], #16 // ...................................e.................................... - // gap // ........................................................................ - srshr v23.8H, v26.8H, #11 // .....................................................e.................. - // gap // ........................................................................ - // gap // ........................................................................ - mls v29.8H, v24.8H, v7.H[0] // ................................................e....................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- mls v25.8H, v23.8H, v7.H[0] // ......................................................e................. - // gap // ........................................................................ - // gap // ........................................................................ - sub v14.8H, v15.8H, v16.8H // .........................................e.............................. - // gap // ........................................................................ - // gap // ........................................................................ - mls v10.8H, v0.8H, v7.H[0] // ...................................................................*.... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v12.8H, v14.8H, v13.H[4] // ...........................................e............................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v28.8H, v29.8H, v25.8H // ...........................................................e............ - sub v26.8H, v29.8H, v25.8H // ..........................................................e............. - sqrdmulh v11.8H, v14.8H, v13.H[5] // ............................................e........................... - // gap // ........................................................................ 
- str q10, [x1, #-16] // .......................................................................* - // gap // ........................................................................ - // gap // ........................................................................ - str q28, [x1], #(64) // ....................................................................e... - // gap // ........................................................................ - sqrdmulh v23.8H, v30.8H, v13.H[3] // .......................................e................................ - - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e..................................................................... - // ldr q0, [x4], #(6*16) // .....e..................................................................|....e................................................................ - // ldr q4, [x4, #(-6*16 + 1*16)] // ......e.................................................................|.....e............................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e...................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // .e......................................................................|e.................................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // .............e..........................................................|............e........................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // ..e.....................................................................|.e................................................................... 
- // sub v24.8h, v8.8h, v9.8h // ..........e.............................................................|.........e........................................................... - // add v8.8h, v8.8h, v9.8h // ...........e............................................................|..........e.......................................................... - // mul v9.8h, v24.8h, v1.8h // ...................e....................................................|..................e.................................................. - // sqrdmulh v24.8h, v24.8h, v5.8h // ..............e.........................................................|.............e....................................................... - // mls v9.8h, v24.8h, v7.h[0] // ....................e...................................................|...................e................................................. - // sub v24.8h, v10.8h, v11.8h // .......e................................................................|......e.............................................................. - // add v10.8h, v10.8h, v11.8h // ........e...............................................................|.......e............................................................. - // mul v11.8h, v24.8h, v2.8h // ..................e.....................................................|.................e................................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ............e...........................................................|...........e......................................................... - // mls v11.8h, v24.8h, v7.h[0] // .....................e..................................................|....................e................................................ - // sub v24.8h, v8.8h, v10.8h // ................e.......................................................|...............e..................................................... 
- // add v8.8h, v8.8h, v10.8h // .................e......................................................|................e.................................................... - // mul v10.8h, v24.8h, v0.8h // ..........................e.............................................|.........................e........................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .......................e................................................|......................e.............................................. - // mls v10.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e...................................... - // sub v24.8h, v9.8h, v11.8h // ........................e...............................................|.......................e............................................. - // add v9.8h, v9.8h, v11.8h // .........................e..............................................|........................e............................................ - // mul v11.8h, v24.8h, v0.8h // .............................e..........................................|............................e........................................ - // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e............................................|..........................e.......................................... - // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e..................................... - // trn1 v25.4s, v8.4s, v9.4s // ............................e...........................................|...........................e......................................... - // trn2 v26.4s, v8.4s, v9.4s // ..............................e.........................................|.............................e....................................... 
- // trn1 v27.4s, v10.4s, v11.4s // ......................................e.................................|.....................................e............................... - // trn2 v28.4s, v10.4s, v11.4s // ....................................e...................................|...................................e................................. - // trn2 v10.2d, v25.2d, v27.2d // ...........................................e............................|..........................................e.......................... - // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e......................... - // trn1 v8.2d, v25.2d, v27.2d // ..........................................e.............................|.........................................e........................... - // trn1 v9.2d, v26.2d, v28.2d // .........................................e..............................|........................................e............................ - // ldr q0, [x3], #16 // ...........................................................e............|..........................................................e.......... - // sub v24.8h, v8.8h, v9.8h // ................................................e.......................|...............................................e..................... - // add v8.8h, v8.8h, v9.8h // ..............................................e.........................|.............................................e....................... - // mul v9.8h, v24.8h, v0.h[2] // ...*....................................................................|..*.................................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................................................e|..................................................................... 
- // mls v9.8h, v24.8h, v7.h[0] // .........*..............................................................|........*............................................................ - // sub v24.8h, v10.8h, v11.8h // ...............................................................e........|..............................................................e...... - // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e.................... - // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................e.... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................................................e...|...................................................................e. - // mls v11.8h, v24.8h, v7.h[0] // ....*...................................................................|...*................................................................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................e.....................|.................................................e................... - // srshr v25.8h, v25.8h, #11 // ........................................................e...............|.......................................................e............. - // mls v8.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e........ - // sqdmulh v25.8h, v9.8h, v7.h[1] // .................................*......................................|................................*.................................... - // srshr v25.8h, v25.8h, #11 // .......................................*................................|......................................*.............................. 
- // mls v9.8h, v25.8h, v7.h[0] // .............................................*..........................|............................................*........................ - // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e................ - // srshr v25.8h, v25.8h, #11 // ............................................................e...........|...........................................................e......... - // mls v10.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e....... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ......................*.................................................|.....................*............................................... - // srshr v25.8h, v25.8h, #11 // ..................................*.....................................|.................................*................................... - // mls v11.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*................................ - // sub v24.8h, v8.8h, v10.8h // ...................................................................e....|..................................................................e.. - // add v8.8h, v8.8h, v10.8h // ..................................................................e.....|.................................................................e... - // mul v10.8h, v24.8h, v0.h[0] // ...................................*....................................|..................................*.................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*...............................|.......................................*............................. 
- // mls v10.8h, v24.8h, v7.h[0] // ...............................................*........................|..............................................*...................... - // sub v24.8h, v9.8h, v11.8h // ...................................................*....................|..................................................*.................. - // add v9.8h, v9.8h, v11.8h // ....................................................*...................|...................................................*................. - // mul v11.8h, v24.8h, v0.h[0] // ..........................................................*.............|.........................................................*........... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................*................|......................................................*.............. - // mls v11.8h, v24.8h, v7.h[0] // ................................................................*.......|...............................................................*..... - // str q8, [x1], #(64) // ......................................................................e.|..................................................................... - // str q9, [x1, #(-64 + 16*1)] // .........................................................*..............|........................................................*............ - // str q10, [x1, #(-64 + 16*2)] // ......................................................*.................|.....................................................*............... 
- // str q11, [x1, #(-64 + 16*3)] // .....................................................................*..|....................................................................* + // Instructions: 72 + // Expected cycles: 65 + // Expected IPC: 1.11 + // + // Cycle bound: 65.0 + // IPC bound: 1.11 + // + // Wall time: 44.53s + // User time: 44.53s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q8, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // e....................................................................... + ldr q23, [x4, #-80] // ..e..................................................................... + mul v26.8H, v11.8H, v3.H[2] // .......................................*................................ + // gap // ........................................................................ + ldr q30, [x4, #-64] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v9.8H, v7.H[0] // ........................................*............................... + // gap // ........................................................................ + // gap // ........................................................................ + ldr q15, [x4, #-48] // ....e................................................................... + sub v2.8H, v18.8H, v19.8H // .......e................................................................ + // gap // ........................................................................ 
+ add v27.8H, v20.8H, v21.8H // .............e.......................................................... + sqdmulh v14.8H, v24.8H, v7.H[1] // .......................................................*................ + // gap // ........................................................................ + add v28.8H, v18.8H, v19.8H // ........e............................................................... + ldr q18, [x4, #-16] // ......e................................................................. + // gap // ........................................................................ + mul v25.8H, v2.8H, v30.8H // ..........e............................................................. + sub v19.8H, v20.8H, v21.8H // ............e........................................................... + ldr q20, [x4, #-32] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v11.8H, v28.8H, v27.8H // ..................e..................................................... + sqrdmulh v12.8H, v2.8H, v15.8H // .........e.............................................................. + // gap // ........................................................................ + sub v31.8H, v28.8H, v27.8H // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v27.8H, v0.8H, v4.8H // ..........................................................*............. + sqrdmulh v0.8H, v19.8H, v18.8H // ..............e......................................................... 
+ // gap // ........................................................................ + srshr v16.8H, v14.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.8H, v19.8H, v20.8H // ...............e........................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v25.8H, v12.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v14.8H, v0.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqdmulh v15.8H, v26.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v0.8H, v31.8H, v23.8H // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v1.8H, v25.8H, v14.8H // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mul v6.8H, v31.8H, v8.8H // ....................e................................................... + add v14.8H, v25.8H, v14.8H // .......................e................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v10.8H, v1.8H, v23.8H // ........................e............................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn1 v9.4S, v11.4S, v14.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v30.4S, v11.4S, v14.4S // ............................e........................................... + mul v22.8H, v1.8H, v8.8H // .........................e.............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v6.8H, v0.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v22.8H, v10.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v15.8H, v15.8H, #11 // ..................................................*..................... 
+ mls v24.8H, v16.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v15.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v17.4S, v6.4S, v22.4S // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v22.4S, v6.4S, v22.4S // .............................e.......................................... + sqrdmulh v16.8H, v27.8H, v3.H[1] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v8.8H, v27.8H, v3.H[0] // .............................................................*.......... + trn2 v27.2D, v30.2D, v17.2D // ................................e....................................... + // gap // ........................................................................ 
+ trn2 v6.2D, v9.2D, v22.2D // ...............................e........................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v14.2D, v30.2D, v17.2D // ..................................e..................................... + sub v15.8H, v26.8H, v24.8H // ...............................................................*........ + // gap // ........................................................................ + trn1 v28.2D, v9.2D, v22.2D // .................................e...................................... + mls v8.8H, v16.8H, v7.H[0] // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v4.8H, v6.8H, v27.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v13.8H, v15.8H, v3.H[1] // .................................................................*...... + add v0.8H, v28.8H, v14.8H // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v11.8H, v28.8H, v14.8H // ....................................e................................... + // gap // ........................................................................ + sqdmulh v16.8H, v4.8H, v7.H[1] // ....................................................e................... 
+ sub v14.8H, v6.8H, v27.8H // .........................................e.............................. + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v0.8H, v7.H[1] // ..............................................e......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v10.8H, v15.8H, v3.H[0] // ..................................................................*..... + ldr q3, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + srshr v15.8H, v16.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v13.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v21.8H, v27.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v4.8H, v15.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v21.8H, v7.H[0] // ................................................e....................... + str q8, [x1, #-32] // ......................................................................*. + add v15.8H, v26.8H, v24.8H // ................................................................*....... + str q10, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v8.8H, v14.8H, v3.H[5] // ...........................................e............................ + str q15, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + mul v24.8H, v14.8H, v3.H[4] // ............................................e........................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ add v15.8H, v0.8H, v4.8H // ...........................................................e............ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v9.8H, v11.8H, v3.H[3] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v24.8H, v8.8H, v7.H[0] // .............................................e.......................... + str q15, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + + // -------------------------------------------------------------- new position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e......................................................................'~................................................................. + // ldr q0, [x4], #(6*16) // e.......................................................................~.................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ..e.....................................................................'.~................................................................ 
+ // ldr q1, [x4, #(-6*16 + 2*16)] // ....e...................................................................'...~.............................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // ......e.................................................................'.....~............................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // ..............e.........................................................'.............~.................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ...........e............................................................'..........~....................................................... + // sub v24.8h, v8.8h, v9.8h // .......e................................................................'......~........................................................... + // add v8.8h, v8.8h, v9.8h // ..........e.............................................................'.........~........................................................ + // sqrdmulh v27.8h, v24.8h, v5.8h // ................e.......................................................'...............~.................................................. + // mul v9.8h, v24.8h, v1.8h // ............e...........................................................'...........~...................................................... + // mls v9.8h, v27.8h, v7.h[0] // ......................e.................................................'.....................~............................................ + // sub v24.8h, v10.8h, v11.8h // .............e..........................................................'............~..................................................... + // add v10.8h, v10.8h, v11.8h // ........e...............................................................'.......~.......................................................... 
+ // sqrdmulh v27.8h, v24.8h, v6.8h // ...................e....................................................'..................~............................................... + // mul v11.8h, v24.8h, v2.8h // .....................e..................................................'....................~............................................. + // mls v11.8h, v27.8h, v7.h[0] // .......................e................................................'......................~........................................... + // sub v24.8h, v8.8h, v10.8h // .................e......................................................'................~................................................. + // add v8.8h, v8.8h, v10.8h // ...............e........................................................'..............~................................................... + // sqrdmulh v27.8h, v24.8h, v4.8h // .........................e..............................................'........................~......................................... + // mul v10.8h, v24.8h, v0.8h // ...........................e............................................'..........................~....................................... + // mls v10.8h, v27.8h, v7.h[0] // .................................e......................................'................................~................................. + // sub v24.8h, v9.8h, v11.8h // ..........................e.............................................'.........................~........................................ + // add v9.8h, v9.8h, v11.8h // ............................e...........................................'...........................~...................................... + // sqrdmulh v27.8h, v24.8h, v4.8h // .............................e..........................................'............................~..................................... 
+ // mul v11.8h, v24.8h, v0.8h // ................................e.......................................'...............................~.................................. + // mls v11.8h, v27.8h, v7.h[0] // ..................................e.....................................'.................................~................................ + // trn1 v25.4s, v8.4s, v9.4s // ..............................e.........................................'.............................~.................................... + // trn2 v26.4s, v8.4s, v9.4s // ...............................e........................................'..............................~................................... + // trn1 v27.4s, v10.4s, v11.4s // .......................................e................................'......................................~........................... + // trn2 v28.4s, v10.4s, v11.4s // ......................................e.................................'.....................................~............................ + // trn2 v10.2d, v25.2d, v27.2d // ...........................................e............................'..........................................~....................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.............................'.........................................~........................ + // trn1 v8.2d, v25.2d, v27.2d // ..............................................e.........................'.............................................~.................... + // trn1 v9.2d, v26.2d, v28.2d // ............................................e...........................'...........................................~...................... + // ldr q0, [x3], #16 // ........................................................e...............'.......................................................~.......... 
+ // sub v24.8h, v8.8h, v9.8h // ...................................................e....................'..................................................~............... + // add v8.8h, v8.8h, v9.8h // ..................................................e.....................'.................................................~................ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .....................................................................e..'.................................................................. + // mul v9.8h, v24.8h, v0.h[2] // ...~....................................................................'..*............................................................... + // mls v9.8h, v27.8h, v7.h[0] // .....~..................................................................'....*............................................................. + // sub v24.8h, v10.8h, v11.8h // .....................................................e..................'....................................................~............. + // add v10.8h, v10.8h, v11.8h // ................................................e.......................'...............................................~.................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .................................................................e......'................................................................~. + // mul v11.8h, v24.8h, v0.h[4] // ...................................................................e....'.................................................................. + // mls v11.8h, v27.8h, v7.h[0] // ......................................................................e.'.................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ......................................................e.................'.....................................................~............ 
+ // srshr v25.8h, v25.8h, #11 // ...........................................................e............'..........................................................~....... + // mls v8.8h, v25.8h, v7.h[0] // .............................................................e..........'............................................................~..... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................~...............................................'.......................*.......................................... + // srshr v25.8h, v25.8h, #11 // ...................................~....................................'..................................*............................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................~..................................'....................................*............................. + // sqdmulh v25.8h, v10.8h, v7.h[1] // ....................................................e...................'...................................................~.............. + // srshr v25.8h, v25.8h, #11 // .........................................................e..............'........................................................~......... + // mls v10.8h, v25.8h, v7.h[0] // ............................................................e...........'...........................................................~...... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .........~..............................................................'........*......................................................... + // srshr v25.8h, v25.8h, #11 // ....................~...................................................'...................*.............................................. + // mls v11.8h, v25.8h, v7.h[0] // ....................................~...................................'...................................*.............................. 
+ // sub v24.8h, v8.8h, v10.8h // ..................~.....................................................'.................*................................................ + // add v8.8h, v8.8h, v10.8h // ....................................................................e...'.................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ........................................~...............................'.......................................*.......................... + // mul v10.8h, v24.8h, v0.h[0] // .........................................~..............................'........................................*......................... + // mls v10.8h, v27.8h, v7.h[0] // ...............................................~........................'..............................................*................... + // sub v24.8h, v9.8h, v11.8h // .............................................~..........................'............................................*..................... + // add v9.8h, v9.8h, v11.8h // ...............................................................~........'..............................................................*... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................................................~......................'................................................*................. + // mul v11.8h, v24.8h, v0.h[0] // .......................................................~................'......................................................*........... + // mls v11.8h, v27.8h, v7.h[0] // ..........................................................~.............'.........................................................*........ + // str q8, [x1], #(64) // .......................................................................e'.................................................................. 
+ // str q9, [x1, #(-64 + 16*1)] // ..................................................................~.....'.................................................................* + // str q10, [x1, #(-64 + 16*2)] // ..............................................................~.........'.............................................................*.... + // str q11, [x1, #(-64 + 16*3)] // ................................................................~.......'...............................................................*.. sub count, count, #1 cbnz count, layer4567_start - mul v19.8H, v30.8H, v13.H[2] // *................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mls v12.8H, v11.8H, v7.H[0] // .*.................. - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mls v19.8H, v23.8H, v7.H[0] // ..*................. - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - sqdmulh v4.8H, v12.8H, v7.H[1] // ...*................ - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - sqdmulh v28.8H, v19.8H, v7.H[1] // ....*............... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - sqrdmulh v8.8H, v26.8H, v13.H[1] // .........*.......... - // gap // .................... - // gap // .................... - srshr v21.8H, v4.8H, #11 // .....*.............. - // gap // .................... 
- // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - srshr v22.8H, v28.8H, #11 // ........*........... - // gap // .................... - // gap // .................... - mls v12.8H, v21.8H, v7.H[0] // .......*............ - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mls v19.8H, v22.8H, v7.H[0] // ..........*......... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mul v23.8H, v26.8H, v13.H[0] // ......*............. - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mls v23.8H, v8.8H, v7.H[0] // ...........*........ - // gap // .................... - // gap // .................... - sub v24.8H, v19.8H, v12.8H // ............*....... - // gap // .................... - // gap // .................... - add v25.8H, v19.8H, v12.8H // .............*...... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - sqrdmulh v26.8H, v24.8H, v13.H[1] // ...............*.... - // gap // .................... - // gap // .................... - str q25, [x1, #-48] // ................*... - // gap // .................... - // gap // .................... - mul v5.8H, v24.8H, v13.H[0] // .................*.. - str q23, [x1, #-32] // ..............*..... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - mls v5.8H, v26.8H, v7.H[0] // ..................*. 
- // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - // gap // .................... - str q5, [x1, #-16] // ...................* - // gap // .................... - // gap // .................... - - // original source code - // mul v22.8H, v30.8H, v13.H[2] // *................... - // mls v12.8H, v11.8H, v7.H[0] // .*.................. - // mls v22.8H, v23.8H, v7.H[0] // ..*................. - // sqdmulh v28.8H, v12.8H, v7.H[1] // ...*................ - // sqdmulh v3.8H, v22.8H, v7.H[1] // ....*............... - // srshr v27.8H, v28.8H, #11 // ......*............. - // mul v14.8H, v26.8H, v13.H[0] // ..........*......... - // mls v12.8H, v27.8H, v7.H[0] // ........*........... - // srshr v3.8H, v3.8H, #11 // .......*............ - // sqrdmulh v2.8H, v26.8H, v13.H[1] // .....*.............. - // mls v22.8H, v3.8H, v7.H[0] // .........*.......... - // mls v14.8H, v2.8H, v7.H[0] // ...........*........ - // sub v28.8H, v22.8H, v12.8H // ............*....... - // add v11.8H, v22.8H, v12.8H // .............*...... - // str q14, [x1, #-32] // .................*.. - // sqrdmulh v0.8H, v28.8H, v13.H[1] // ..............*..... - // str q11, [x1, #-48] // ...............*.... - // mul v10.8H, v28.8H, v13.H[0] // ................*... - // mls v10.8H, v0.8H, v7.H[0] // ..................*. 
- // str q10, [x1, #-16] // ...................* + // Instructions: 20 + // Expected cycles: 34 + // Expected IPC: 0.59 + // + // Cycle bound: 34.0 + // IPC bound: 0.59 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v15.8H, v11.8H, v3.H[2] // *............................. + sub v16.8H, v0.8H, v4.8H // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v15.8H, v9.8H, v7.H[0] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v27.8H, v24.8H, v7.H[1] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v14.8H, v15.8H, v7.H[1] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + srshr v27.8H, v27.8H, #11 // ....*......................... + sqrdmulh v6.8H, v16.8H, v3.H[1] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + mls v24.8H, v27.8H, v7.H[0] // .......*...................... + srshr v27.8H, v14.8H, #11 // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v15.8H, v27.8H, v7.H[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v16.8H, v16.8H, v3.H[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v16.8H, v6.8H, v7.H[0] // ............*................. + // gap // .............................. + // gap // .............................. + sub v27.8H, v15.8H, v24.8H // ...........*.................. + // gap // .............................. + // gap // .............................. + add v15.8H, v15.8H, v24.8H // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v14.8H, v27.8H, v3.H[1] // .............*................ + // gap // .............................. + // gap // .............................. + str q16, [x1, #-32] // ................*............. + str q15, [x1, #-48] // ...................*.......... + // gap // .............................. + mul v15.8H, v27.8H, v3.H[0] // ..............*............... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v15.8H, v14.8H, v7.H[0] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q15, [x1, #-16] // ..................*........... + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mul v26.8H, v11.8H, v3.H[2] // *.............................. + // mls v26.8H, v9.8H, v7.H[0] // ..*............................ + // sqdmulh v14.8H, v24.8H, v7.H[1] // ...*........................... + // sub v27.8H, v0.8H, v4.8H // .*............................. + // srshr v16.8H, v14.8H, #11 // .....*......................... + // sqdmulh v15.8H, v26.8H, v7.H[1] // ....*.......................... + // srshr v15.8H, v15.8H, #11 // ........*...................... + // mls v24.8H, v16.8H, v7.H[0] // .......*....................... + // mls v26.8H, v15.8H, v7.H[0] // .........*..................... + // sqrdmulh v16.8H, v27.8H, v3.H[1] // ......*........................ + // mul v8.8H, v27.8H, v3.H[0] // ..........*.................... 
+ // sub v15.8H, v26.8H, v24.8H // ............*.................. + // mls v8.8H, v16.8H, v7.H[0] // ...........*................... + // sqrdmulh v13.8H, v15.8H, v3.H[1] // ..............*................ + // mul v10.8H, v15.8H, v3.H[0] // .................*............. + // mls v10.8H, v13.8H, v7.H[0] // ..................*............ + // str q8, [x1, #-32] // ...............*............... + // add v15.8H, v26.8H, v24.8H // .............*................. + // str q10, [x1, #-16] // ...................*........... + // str q15, [x1, #-48] // ................*.............. // --------------------------------------------------------------------- @@ -1023,800 +1047,845 @@ layer4567_start: .p2align 2 - ldr q19, [x0, #448] // *......... - ldr q23, [x0, #384] // .*........ - // gap // .......... - ldr q2, [x0, #128] // ..*....... - ldr q21, [x0, #192] // .....*.... - // gap // .......... - ldr q24, [x0, #320] // .........* - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - sub v27.8H, v23.8H, v19.8H // ...*...... - // gap // .......... - // gap // .......... - add v15.8H, v23.8H, v19.8H // ....*..... - // gap // .......... - // gap // .......... - sub v10.8H, v2.8H, v21.8H // ......*... - // gap // .......... - // gap // .......... - sqrdmulh v13.8H, v27.8H, v1.H[5] // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - // gap // .......... - mul v27.8H, v27.8H, v1.H[4] // ........*. - // gap // .......... - // gap // .......... - - // original source code - // ldr q26, [x0, #448] // *......... - // ldr q14, [x0, #384] // .*........ - // ldr q2, [x0, #128] // ..*....... - // sub v18.8H, v14.8H, v26.8H // .....*.... - // add v15.8H, v14.8H, v26.8H // ......*... - // ldr q21, [x0, #192] // ...*...... - // sub v10.8H, v2.8H, v21.8H // .......*.. - // sqrdmulh v13.8H, v18.8H, v1.H[5] // ........*. 
- // mul v27.8H, v18.8H, v1.H[4] // .........* - // ldr q24, [x0, #320] // ....*..... + // Instructions: 14 + // Expected cycles: 10 + // Expected IPC: 1.40 + // + // Cycle bound: 10.0 + // IPC bound: 1.40 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q27, [x0, #448] // .*............................ + // gap // .............................. + ldr q15, [x0, #384] // *............................. + ldr q6, [x0, #192] // ...*.......................... + // gap // .............................. + // gap // .............................. + ldr q14, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v26.8H, v15.8H, v27.8H // ............*................. + sub v16.8H, v15.8H, v27.8H // ....*......................... + ldr q27, [x0, #256] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #0] // .....*........................ + sub v2.8H, v14.8H, v6.8H // ......*....................... + add v17.8H, v14.8H, v6.8H // ........*..................... + sqrdmulh v19.8H, v16.8H, v1.H[5] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q6, [x0, #320] // .........*.................... + sqrdmulh v31.8H, v2.8H, v1.H[1] // ..........*................... + ldr q14, [x0, #64] // .............*................ + // gap // .............................. 
+ + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q22, [x0, #384] // .*............................. + // ldr q13, [x0, #448] // *.............................. + // ldr q11, [x0, #128] // ...*........................... + // ldr q31, [x0, #192] // ..*............................ + // sub v16.8H, v22.8H, v13.8H // .....*......................... + // ldr q25, [x0, #0] // .......*....................... + // sub v2.8H, v11.8H, v31.8H // ........*...................... + // sqrdmulh v19.8H, v16.8H, v1.H[5] // ..........*.................... + // add v17.8H, v11.8H, v31.8H // .........*..................... + // ldr q6, [x0, #320] // ...........*................... + // sqrdmulh v31.8H, v2.8H, v1.H[1] // ............*.................. + // ldr q27, [x0, #256] // ......*........................ + // add v26.8H, v22.8H, v13.8H // ....*.......................... + // ldr q14, [x0, #64] // .............*................. sub count, count, #1 layer123_start: - ldr q28, [x0, #64] // .*...................................................................................... - sqrdmulh v9.8H, v10.8H, v1.H[1] // ................*....................................................................... - ldr q23, [x0, #0] // *....................................................................................... - ldr q26, [x0, #464] // .......e................................................................................ - ldr q14, [x0, #400] // ......e................................................................................. 
+ // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 59.66s + // User time: 59.66s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ // gap // ........................................................................................ - mul v3.8H, v10.8H, v1.H[0] // ...............*........................................................................ - add v8.8H, v2.8H, v21.8H // ..............*......................................................................... - ldr q22, [x0, #256] // ....*................................................................................... - ldr q2, [x0, #144] // ..e..................................................................................... + mul v15.8H, v16.8H, v1.H[4] // ..........................*............................................................. // gap // ........................................................................................ // gap // ........................................................................................ - add v25.8H, v23.8H, v28.8H // .........*.............................................................................. - mls v27.8H, v13.8H, v7.H[0] // ...........................*............................................................ // gap // ........................................................................................ - sub v20.8H, v23.8H, v28.8H // ........*............................................................................... // gap // ........................................................................................ + mls v15.8H, v19.8H, v7.H[0] // ...........................*............................................................ 
// gap // ........................................................................................ - mls v3.8H, v9.8H, v7.H[0] // .................*...................................................................... - add v23.8H, v22.8H, v24.8H // ...................*.................................................................... // gap // ........................................................................................ - sub v28.8H, v22.8H, v24.8H // ..................*..................................................................... + ldr q22, [x0, #400] // ......e................................................................................. // gap // ........................................................................................ + sub v16.8H, v25.8H, v14.8H // ........*............................................................................... + add v3.8H, v25.8H, v14.8H // .........*.............................................................................. + mul v25.8H, v2.8H, v1.H[0] // ................*....................................................................... // gap // ........................................................................................ - sqrdmulh v17.8H, v20.8H, v0.H[7] // ...........*............................................................................ + sub v14.8H, v27.8H, v6.8H // ..................*..................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - sub v19.8H, v23.8H, v15.8H // ......................................*................................................. + add v27.8H, v27.8H, v6.8H // ...................*.................................................................... 
+ mul v23.8H, v16.8H, v0.H[6] // ...........*............................................................................ // gap // ........................................................................................ + add v11.8H, v3.8H, v17.8H // .............................*.......................................................... // gap // ........................................................................................ - sub v12.8H, v25.8H, v8.8H // ............................*........................................................... - mul v21.8H, v20.8H, v0.H[6] // ..........*............................................................................. // gap // ........................................................................................ - sub v18.8H, v14.8H, v26.8H // .......................e................................................................ + sub v20.8H, v3.8H, v17.8H // ............................*........................................................... + sqrdmulh v3.8H, v14.8H, v1.H[3] // ....................*................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - add v31.8H, v23.8H, v15.8H // .......................................*................................................ // gap // ........................................................................................ - sqrdmulh v23.8H, v28.8H, v1.H[3] // .....................*.................................................................. - add v15.8H, v14.8H, v26.8H // ........................e............................................................... + ldr q13, [x0, #464] // .......e................................................................................ 
+ sqrdmulh v6.8H, v16.8H, v0.H[7] // ..........*............................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - mls v21.8H, v17.8H, v7.H[0] // ............*........................................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v25.8H, v31.8H, v7.H[0] // .................*...................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v14.8H, v28.8H, v1.H[2] // ....................*................................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v14.8H, v14.8H, v1.H[2] // .....................*.................................................................. + add v31.8H, v27.8H, v26.8H // .......................................*................................................ // gap // ........................................................................................ + sub v27.8H, v27.8H, v26.8H // ......................................*................................................. 
// gap // ........................................................................................ - mls v14.8H, v23.8H, v7.H[0] // ......................*................................................................. - add v23.8H, v25.8H, v8.8H // .............................*.......................................................... // gap // ........................................................................................ - sub v28.8H, v21.8H, v3.8H // .................................*...................................................... + mls v23.8H, v6.8H, v7.H[0] // ............*........................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - add v3.8H, v21.8H, v3.8H // ..................................*..................................................... - mul v21.8H, v19.8H, v0.H[4] // ........................................*............................................... + sub v6.8H, v11.8H, v31.8H // ................................................*....................................... // gap // ........................................................................................ - add v24.8H, v23.8H, v31.8H // .................................................*...................................... // gap // ........................................................................................ + add v11.8H, v11.8H, v31.8H // .................................................*...................................... + sqrdmulh v31.8H, v20.8H, v0.H[3] // ..............................*......................................................... // gap // ........................................................................................ 
- sqrdmulh v13.8H, v19.8H, v0.H[5] // .........................................*.............................................. - sub v23.8H, v23.8H, v31.8H // ................................................*....................................... // gap // ........................................................................................ - add v22.8H, v14.8H, v27.8H // ............................................*........................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v6.8H, v12.8H, v0.H[2] // ..............................*......................................................... - sub v17.8H, v14.8H, v27.8H // ...........................................*............................................ + mls v14.8H, v3.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ // gap // ........................................................................................ + sub v3.8H, v23.8H, v25.8H // .................................*...................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v20.8H, v23.8H, v0.H[0] // ..................................................*..................................... - sub v10.8H, v3.8H, v22.8H // .....................................................*.................................. + mul v26.8H, v20.8H, v0.H[2] // ...............................*........................................................ 
// gap // ........................................................................................ - add v9.8H, v3.8H, v22.8H // ......................................................*................................. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v14.8H, v17.8H, v0.H[5] // ..............................................*......................................... // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v16.8H, v3.8H, v0.H[3] // ...................................*.................................................... + add v17.8H, v23.8H, v25.8H // ..................................*..................................................... // gap // ........................................................................................ + sub v25.8H, v14.8H, v15.8H // ...........................................*............................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................................*.................................... + add v28.8H, v14.8H, v15.8H // ............................................*........................................... + sqrdmulh v14.8H, v27.8H, v0.H[5] // ........................................*............................................... // gap // ........................................................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ // gap // ........................................................................................ + mls v26.8H, v31.8H, v7.H[0] // ................................*....................................................... // gap // ........................................................................................ - mul v26.8H, v24.8H, v29.8H // ........................................................................*............... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v27.8H, v27.8H, v0.H[4] // .........................................*.............................................. // gap // ........................................................................................ - sqrdmulh v31.8H, v28.8H, v0.H[3] // ....................................*................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v31.8H, v3.8H, v0.H[2] // ....................................*................................................... // gap // ........................................................................................ 
- mls v20.8H, v19.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ + sub v2.8H, v17.8H, v28.8H // .....................................................*.................................. // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v3.8H, v11.8H, v30.8H // ........................................................................*............... + add v28.8H, v17.8H, v28.8H // ......................................................*................................. // gap // ........................................................................................ // gap // ........................................................................................ - mul v4.8H, v28.8H, v0.H[2] // ...................................*.................................................... // gap // ........................................................................................ // gap // ........................................................................................ + mls v31.8H, v16.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v25.8H, v12.8H, v0.H[3] // ...............................*........................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ - str q20, [x0, #256] // ....................................................................*................... + mls v27.8H, v14.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ // gap // ........................................................................................ - mls v21.8H, v13.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v22.8H, v17.8H, v0.H[4] // .............................................*.......................................... + sqrdmulh v16.8H, v25.8H, v0.H[5] // .............................................*.......................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v24.8H, v2.8H, v0.H[1] // .......................................................*................................ // gap // ........................................................................................ 
// gap // ........................................................................................ - mls v6.8H, v25.8H, v7.H[0] // ................................*....................................................... + sub v14.8H, v26.8H, v27.8H // ..........................................................*............................. // gap // ........................................................................................ // gap // ........................................................................................ + add v27.8H, v26.8H, v27.8H // ...........................................................*............................ + sqrdmulh v26.8H, v6.8H, v0.H[1] // ..................................................*..................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v19.8H, v24.8H, v30.8H // .........................................................................*.............. // gap // ........................................................................................ + mul v6.8H, v6.8H, v0.H[0] // ...................................................*.................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v11.8H, v10.8H, v0.H[0] // .......................................................*................................ 
// gap // ........................................................................................ + mul v2.8H, v2.8H, v0.H[0] // ........................................................*............................... // gap // ........................................................................................ - sub v8.8H, v6.8H, v21.8H // ..........................................................*............................. // gap // ........................................................................................ // gap // ........................................................................................ - add v25.8H, v6.8H, v21.8H // ...........................................................*............................ - ldr q21, [x0, #208] // ...e.................................................................................... - mul v24.8H, v9.8H, v29.8H // ...........................................................................*............ // gap // ........................................................................................ // gap // ........................................................................................ + mul v25.8H, v25.8H, v0.H[4] // ..............................................*......................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v3.8H, v8.8H, v0.H[1] // .............................................................*.......................... // gap // ........................................................................................ // gap // ........................................................................................ 
+ mls v25.8H, v16.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ - mul v20.8H, v8.8H, v0.H[0] // ............................................................*........................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v6.8H, v26.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ - mls v4.8H, v31.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v16.8H, v28.8H, v30.8H // ...........................................................................*............ // gap // ........................................................................................ - mls v22.8H, v14.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v15.8H, v28.8H, v29.8H // ............................................................................*........... // gap // ........................................................................................ - mls v26.8H, v19.8H, v7.H[0] // ..........................................................................*............. // gap // ........................................................................................ + str q6, [x0, #256] // ....................................................................*................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v19.8H, v10.8H, v0.H[1] // ........................................................*............................... + sqrdmulh v6.8H, v14.8H, v0.H[1] // ............................................................*........................... // gap // ........................................................................................ // gap // ........................................................................................ - sub v23.8H, v4.8H, v22.8H // ...............................................................*........................ + sub v28.8H, v31.8H, v25.8H // ...............................................................*........................ 
+ mls v15.8H, v16.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ // gap // ........................................................................................ - sub v10.8H, v2.8H, v21.8H // .............e.......................................................................... - mls v20.8H, v3.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - add v14.8H, v4.8H, v22.8H // ................................................................*....................... - str q26, [x0], #(16) // ....................................................................................*... // gap // ........................................................................................ // gap // ........................................................................................ + mul v16.8H, v14.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ - mul v28.8H, v25.8H, v29.8H // ..............................................................................*......... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v11.8H, v19.8H, v7.H[0] // .........................................................*.............................. // gap // ........................................................................................ 
+ mls v16.8H, v6.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - str q20, [x0, #368] // ......................................................................*................. // gap // ........................................................................................ + str q15, [x0, #64] // .....................................................................................*.. // gap // ........................................................................................ - sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................................*........ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v14.8H, v28.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v6.8H, v28.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ - mul v22.8H, v14.8H, v29.8H // .................................................................................*...... - str q11, [x0, #304] // .....................................................................*.................. 
// gap // ........................................................................................ + str q16, [x0, #384] // ......................................................................*................. // gap // ........................................................................................ - mul v19.8H, v23.8H, v0.H[0] // .................................................................*...................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v11.8H, v11.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v6.8H, v23.8H, v0.H[1] // ..................................................................*..................... // gap // ........................................................................................ + mls v6.8H, v14.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v11.8H, v9.8H, v30.8H // ............................................................................*........... 
// gap // ........................................................................................ + sqrdmulh v14.8H, v27.8H, v30.8H // ..............................................................................*......... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v8.8H, v14.8H, v30.8H // ..................................................................................*..... // gap // ........................................................................................ + mls v11.8H, v3.8H, v7.H[0] // ..........................................................................*............. // gap // ........................................................................................ // gap // ........................................................................................ + str q6, [x0, #448] // .......................................................................*................ + add v6.8H, v31.8H, v25.8H // ................................................................*....................... // gap // ........................................................................................ + mul v16.8H, v27.8H, v29.8H // ...............................................................................*........ // gap // ........................................................................................ - mls v19.8H, v6.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v16.8H, v14.8H, v7.H[0] // ................................................................................*....... // gap // ........................................................................................ - mls v24.8H, v11.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ + str q11, [x0], #(16) // ....................................................................................*... // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v27.8H, v6.8H, v30.8H // .................................................................................*...... // gap // ........................................................................................ // gap // ........................................................................................ - mls v28.8H, v12.8H, v7.H[0] // ................................................................................*....... // gap // ........................................................................................ // gap // ........................................................................................ - str q19, [x0, #432] // .......................................................................*................ // gap // ........................................................................................ 
// gap // ........................................................................................ - mls v22.8H, v8.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ + mls v2.8H, v24.8H, v7.H[0] // .........................................................*.............................. + ldr q11, [x0, #128] // ..e..................................................................................... + ldr q31, [x0, #192] // ...e.................................................................................... // gap // ........................................................................................ - str q24, [x0, #48] // .....................................................................................*.. + mul v15.8H, v6.8H, v29.8H // ..................................................................................*..... // gap // ........................................................................................ + str q16, [x0, #112] // ......................................................................................*. + sub v16.8H, v22.8H, v13.8H // .......................e................................................................ + ldr q25, [x0, #0] // e....................................................................................... // gap // ........................................................................................ - sqrdmulh v13.8H, v18.8H, v1.H[5] // ..........................e............................................................. + mls v15.8H, v27.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ // gap // ........................................................................................ 
- str q28, [x0, #112] // ......................................................................................*. + str q2, [x0, #304] // .....................................................................*.................. + sub v2.8H, v11.8H, v31.8H // .............e.......................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v27.8H, v18.8H, v1.H[4] // .........................e.............................................................. + sqrdmulh v19.8H, v16.8H, v1.H[5] // .........................e.............................................................. // gap // ........................................................................................ + add v17.8H, v11.8H, v31.8H // ..............e......................................................................... // gap // ........................................................................................ - str q22, [x0, #176] // .......................................................................................* // gap // ........................................................................................ - ldr q24, [x0, #320] // .....e.................................................................................. + ldr q6, [x0, #320] // .....e.................................................................................. + sqrdmulh v31.8H, v2.8H, v1.H[1] // ...............e........................................................................ + ldr q27, [x0, #256] // ....e................................................................................... + add v26.8H, v22.8H, v13.8H // ........................e............................................................... 
+ str q15, [x0, #176] // .......................................................................................* + ldr q14, [x0, #64] // .e...................................................................................... - // original source code - // ldr q8, [x0, #0] // .....................................................................................|.*.................................................................................... - // ldr q9, [x0, #(1*(512/8))] // .....................................................................................*...................................................................................... - // ldr q10, [x0, #(2*(512/8))] // .....e...............................................................................|.......e.............................................................................. - // ldr q11, [x0, #(3*(512/8))] // ...................................................e.................................|.....................................................e................................ - // ldr q12, [x0, #(4*(512/8))] // ....*................................................................................|......*............................................................................... - // ldr q13, [x0, #(5*(512/8))] // ....................................................................................e|...................................................................................... - // ldr q14, [x0, #(6*(512/8))] // .e...................................................................................|...e.................................................................................. - // ldr q15, [x0, #(7*(512/8))] // e....................................................................................|..e................................................................................... 
- // sub v24.8h, v8.8h, v9.8h // ........*............................................................................|..........*........................................................................... - // add v8.8h, v8.8h, v9.8h // ......*..............................................................................|........*............................................................................. - // mul v9.8h, v24.8h, v0.h[6] // ...............*.....................................................................|.................*.................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............*........................................................................|..............*....................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ....................*................................................................|......................*............................................................... - // sub v24.8h, v10.8h, v11.8h // ............................................................e........................|..............................................................e....................... - // add v10.8h, v10.8h, v11.8h // ...*.................................................................................|.....*................................................................................ - // mul v11.8h, v24.8h, v1.h[0] // ..*..................................................................................|....*................................................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[1] // .....................................................................................|*..................................................................................... 
- // mls v11.8h, v24.8h, v7.h[0] // .........*...........................................................................|...........*.......................................................................... - // sub v24.8h, v12.8h, v13.8h // ...........*.........................................................................|.............*........................................................................ - // add v12.8h, v12.8h, v13.8h // ..........*..........................................................................|............*......................................................................... - // mul v13.8h, v24.8h, v1.h[2] // .....................*...............................................................|.......................*.............................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................*..................................................................|....................*................................................................. - // mls v13.8h, v24.8h, v7.h[0] // ......................*..............................................................|........................*............................................................. - // sub v24.8h, v14.8h, v15.8h // ................e....................................................................|..................e................................................................... - // add v14.8h, v14.8h, v15.8h // ...................e.................................................................|.....................e................................................................ - // mul v15.8h, v24.8h, v1.h[4] // ..................................................................................e..|....................................................................................e. 
- // sqrdmulh v24.8h, v24.8h, v1.h[5] // ................................................................................e....|..................................................................................e... - // mls v15.8h, v24.8h, v7.h[0] // .......*.............................................................................|.........*............................................................................ - // sub v24.8h, v8.8h, v10.8h // ..............*......................................................................|................*..................................................................... - // add v8.8h, v8.8h, v10.8h // .......................*.............................................................|.........................*............................................................ - // mul v10.8h, v24.8h, v0.h[2] // ...............................*.....................................................|.................................*.................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................*..........................................|............................................*......................................... - // mls v10.8h, v24.8h, v7.h[0] // ..............................................*......................................|................................................*..................................... - // sub v24.8h, v9.8h, v11.8h // ........................*............................................................|..........................*........................................................... - // add v9.8h, v9.8h, v11.8h // .........................*...........................................................|...........................*.......................................................... 
- // mul v11.8h, v24.8h, v0.h[2] // .........................................*...........................................|...........................................*.......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................*.............................................|.........................................*............................................ - // mls v11.8h, v24.8h, v7.h[0] // .......................................................*.............................|.........................................................*............................ - // sub v24.8h, v12.8h, v14.8h // .............*.......................................................................|...............*...................................................................... - // add v12.8h, v12.8h, v14.8h // .................*...................................................................|...................*.................................................................. - // mul v14.8h, v24.8h, v0.h[4] // ..........................*..........................................................|............................*......................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............................*........................................................|..............................*....................................................... - // mls v14.8h, v24.8h, v7.h[0] // ............................................*........................................|..............................................*....................................... - // sub v24.8h, v13.8h, v15.8h // ................................*....................................................|..................................*................................................... 
- // add v13.8h, v13.8h, v15.8h // ..............................*......................................................|................................*..................................................... - // mul v15.8h, v24.8h, v0.h[4] // .............................................*.......................................|...............................................*...................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................*................................................|......................................*............................................... - // mls v15.8h, v24.8h, v7.h[0] // ........................................................*............................|..........................................................*........................... - // sub v24.8h, v8.8h, v12.8h // .............................*.......................................................|...............................*...................................................... - // add v8.8h, v8.8h, v12.8h // ...........................*.........................................................|.............................*........................................................ - // mul v12.8h, v24.8h, v0.h[0] // .................................*...................................................|...................................*.................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................*...............................................|.......................................*.............................................. - // mls v12.8h, v24.8h, v7.h[0] // ........................................*............................................|..........................................*........................................... 
- // sub v24.8h, v9.8h, v13.8h // ..................................*..................................................|....................................*................................................. - // add v9.8h, v9.8h, v13.8h // ...................................*.................................................|.....................................*................................................ - // mul v13.8h, v24.8h, v0.h[0] // ................................................*....................................|..................................................*................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*..........................|............................................................*......................... - // mls v13.8h, v24.8h, v7.h[0] // .................................................................*...................|...................................................................*.................. - // sub v24.8h, v10.8h, v14.8h // .................................................*...................................|...................................................*.................................. - // add v10.8h, v10.8h, v14.8h // ..................................................*..................................|....................................................*................................. - // mul v14.8h, v24.8h, v0.h[0] // ......................................................*..............................|........................................................*............................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*...............................|.......................................................*.............................. 
- // mls v14.8h, v24.8h, v7.h[0] // .............................................................*.......................|...............................................................*...................... - // sub v24.8h, v11.8h, v15.8h // ...........................................................*.........................|.............................................................*........................ - // add v11.8h, v11.8h, v15.8h // ..............................................................*......................|................................................................*..................... - // mul v15.8h, v24.8h, v0.h[0] // ......................................................................*..............|........................................................................*............. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................................*.............|.........................................................................*............ - // mls v15.8h, v24.8h, v7.h[0] // ..........................................................................*..........|............................................................................*......... - // str q12, [x0, #(4*(512/8))] // ...........................................*.........................................|.............................................*........................................ - // str q13, [x0, #(5*(512/8))] // .....................................................................*...............|.......................................................................*.............. - // str q14, [x0, #(6*(512/8))] // ..................................................................*..................|....................................................................*................. 
- // str q15, [x0, #(7*(512/8))] // .............................................................................*.......|...............................................................................*...... - // mul v12.8h, v8.8h, v29.8h // ......................................*..............................................|........................................*............................................. - // sqrdmulh v8.8h, v8.8h, v30.8h // ...............................................*.....................................|.................................................*.................................... - // mls v12.8h, v8.8h, v7.h[0] // .........................................................*...........................|...........................................................*.......................... - // mul v13.8h, v9.8h, v29.8h // ....................................................*................................|......................................................*............................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ........................................................................*............|..........................................................................*........... - // mls v13.8h, v9.8h, v7.h[0] // ...........................................................................*.........|.............................................................................*........ - // mul v14.8h, v10.8h, v29.8h // ................................................................*....................|..................................................................*................... - // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................................*.................|.....................................................................*................ 
- // mls v14.8h, v10.8h, v7.h[0] // ............................................................................*........|..............................................................................*....... - // mul v15.8h, v11.8h, v29.8h // ....................................................................*................|......................................................................*............... - // sqrdmulh v11.8h, v11.8h, v30.8h // .........................................................................*...........|...........................................................................*.......... - // mls v15.8h, v11.8h, v7.h[0] // ..............................................................................*......|................................................................................*..... - // str q12, [x0], #(16) // ...............................................................*.....................|.................................................................*.................... - // str q13, [x0, #(-16 + 1*(512/8))] // ...............................................................................*.....|.................................................................................*.... - // str q14, [x0, #(-16 + 2*(512/8))] // .................................................................................*...|...................................................................................*.. 
- // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................................*.|.....................................................................................* + // ------------------------------------------------------------------------------- new position -------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // ..........................................................................e...........'...........................................................................~.......... + // ldr q9, [x0, #(1*(512/8))] // .....................................................................................e'...................................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....................................................................e................'......................................................................~............... + // ldr q11, [x0, #(3*(512/8))] // ......................................................................e...............'.......................................................................~.............. + // ldr q12, [x0, #(4*(512/8))] // ..................................................................................e...'...................................................................................~.. + // ldr q13, [x0, #(5*(512/8))] // ................................................................................e.....'.................................................................................~.... 
+ // ldr q14, [x0, #(6*(512/8))] // e.....................................................................................'.~.................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e...........................................................................'...........~.......................................................................... + // sub v24.8h, v8.8h, v9.8h // .~....................................................................................'..*................................................................................... + // add v8.8h, v8.8h, v9.8h // ..~...................................................................................'...*.................................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ...........~..........................................................................'............*......................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ......~...............................................................................'.......*.............................................................................. + // mls v9.8h, v27.8h, v7.h[0] // ................~.....................................................................'.................*.................................................................... + // sub v24.8h, v10.8h, v11.8h // .............................................................................e........'..............................................................................~....... + // add v10.8h, v10.8h, v11.8h // ...............................................................................e......'................................................................................~..... 
+ // sqrdmulh v27.8h, v24.8h, v1.h[1] // .................................................................................e....'..................................................................................~... + // mul v11.8h, v24.8h, v1.h[0] // ...~..................................................................................'....*................................................................................. + // mls v11.8h, v27.8h, v7.h[0] // ............~.........................................................................'.............*........................................................................ + // sub v24.8h, v12.8h, v13.8h // ....~.................................................................................'.....*................................................................................ + // add v12.8h, v12.8h, v13.8h // .....~................................................................................'......*............................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // .........~............................................................................'..........*........................................................................... + // mul v13.8h, v24.8h, v1.h[2] // .............~........................................................................'..............*....................................................................... + // mls v13.8h, v27.8h, v7.h[0] // ....................~.................................................................'.....................*................................................................ + // sub v24.8h, v14.8h, v15.8h // .........................................................................e............'..........................................................................~........... 
+ // add v14.8h, v14.8h, v15.8h // ...................................................................................e..'....................................................................................~. + // sqrdmulh v27.8h, v24.8h, v1.h[5] // ..............................................................................e.......'...............................................................................~...... + // mul v15.8h, v24.8h, v1.h[4] // ......................................................................................*...................................................................................... + // mls v15.8h, v27.8h, v7.h[0] // ......................................................................................'*..................................................................................... + // sub v24.8h, v8.8h, v10.8h // ........~.............................................................................'.........*............................................................................ + // add v8.8h, v8.8h, v10.8h // .......~..............................................................................'........*............................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...................~..................................................................'....................*................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ......................~...............................................................'.......................*.............................................................. + // mls v10.8h, v27.8h, v7.h[0] // ............................~.........................................................'.............................*........................................................ 
+ // sub v24.8h, v9.8h, v11.8h // .....................~................................................................'......................*............................................................... + // add v9.8h, v9.8h, v11.8h // ........................~.............................................................'.........................*............................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .......................~..............................................................'........................*............................................................. + // mul v11.8h, v24.8h, v0.h[2] // ..............................~.......................................................'...............................*...................................................... + // mls v11.8h, v27.8h, v7.h[0] // ..................................~...................................................'...................................*.................................................. + // sub v24.8h, v12.8h, v14.8h // ...............~......................................................................'................*..................................................................... + // add v12.8h, v12.8h, v14.8h // ..............~.......................................................................'...............*...................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...........................~..........................................................'............................*......................................................... + // mul v14.8h, v24.8h, v0.h[4] // .............................~........................................................'..............................*....................................................... 
+ // mls v14.8h, v27.8h, v7.h[0] // ...................................~..................................................'....................................*................................................. + // sub v24.8h, v13.8h, v15.8h // .........................~............................................................'..........................*........................................................... + // add v13.8h, v13.8h, v15.8h // ..........................~...........................................................'...........................*.......................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ....................................~.................................................'.....................................*................................................ + // mul v15.8h, v24.8h, v0.h[4] // ...........................................~..........................................'............................................*......................................... + // mls v15.8h, v27.8h, v7.h[0] // ............................................~.........................................'.............................................*........................................ + // sub v24.8h, v8.8h, v12.8h // .................~....................................................................'..................*................................................................... + // add v8.8h, v8.8h, v12.8h // ..................~...................................................................'...................*.................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ........................................~.............................................'.........................................*............................................ 
+ // mul v12.8h, v24.8h, v0.h[0] // .........................................~............................................'..........................................*........................................... + // mls v12.8h, v27.8h, v7.h[0] // .............................................~........................................'..............................................*....................................... + // sub v24.8h, v9.8h, v13.8h // ...............................~......................................................'................................*..................................................... + // add v9.8h, v9.8h, v13.8h // .................................~....................................................'..................................*................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .....................................~................................................'......................................*............................................... + // mul v13.8h, v24.8h, v0.h[0] // ..........................................~...........................................'...........................................*.......................................... + // mls v13.8h, v27.8h, v7.h[0] // ....................................................................~.................'.....................................................................*................ + // sub v24.8h, v10.8h, v14.8h // ......................................~...............................................'.......................................*.............................................. + // add v10.8h, v10.8h, v14.8h // .......................................~..............................................'........................................*............................................. 
+ // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................................................~....................................'..................................................*................................... + // mul v14.8h, v24.8h, v0.h[0] // ....................................................~.................................'.....................................................*................................ + // mls v14.8h, v27.8h, v7.h[0] // .....................................................~................................'......................................................*............................... + // sub v24.8h, v11.8h, v15.8h // ..................................................~...................................'...................................................*.................................. + // add v11.8h, v11.8h, v15.8h // ...............................................................~......................'................................................................*..................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .......................................................~..............................'........................................................*............................. + // mul v15.8h, v24.8h, v0.h[0] // ........................................................~.............................'.........................................................*............................ + // mls v15.8h, v27.8h, v7.h[0] // ...........................................................~..........................'............................................................*......................... + // str q12, [x0, #(4*(512/8))] // ................................................~.....................................'.................................................*.................................... 
+ // str q13, [x0, #(5*(512/8))] // ............................................................................~.........'.............................................................................*........ + // str q14, [x0, #(6*(512/8))] // .........................................................~............................'..........................................................*........................... + // str q15, [x0, #(7*(512/8))] // ..............................................................~.......................'...............................................................*...................... + // sqrdmulh v27.8h, v8.8h, v30.8h // ................................~.....................................................'.................................*.................................................... + // mul v8.8h, v8.8h, v29.8h // ..........................................................~...........................'...........................................................*.......................... + // mls v8.8h, v27.8h, v7.h[0] // .............................................................~........................'..............................................................*....................... + // sqrdmulh v27.8h, v9.8h, v30.8h // ..............................................~.......................................'...............................................*...................................... + // mul v9.8h, v9.8h, v29.8h // ...............................................~......................................'................................................*..................................... + // mls v9.8h, v27.8h, v7.h[0] // ...................................................~..................................'....................................................*................................. 
+ // sqrdmulh v27.8h, v10.8h, v30.8h // ............................................................~.........................'.............................................................*........................ + // mul v10.8h, v10.8h, v29.8h // ................................................................~.....................'.................................................................*.................... + // mls v10.8h, v27.8h, v7.h[0] // .................................................................~....................'..................................................................*................... + // sqrdmulh v27.8h, v11.8h, v30.8h // ...................................................................~..................'....................................................................*................. + // mul v11.8h, v11.8h, v29.8h // .......................................................................~..............'........................................................................*............. + // mls v11.8h, v27.8h, v7.h[0] // ...........................................................................~..........'............................................................................*......... + // str q8, [x0], #(16) // ..................................................................~...................'...................................................................*.................. + // str q9, [x0, #(-16 + 1*(512/8))] // ......................................................~...............................'.......................................................*.............................. + // str q10, [x0, #(-16 + 2*(512/8))] // ........................................................................~.............'.........................................................................*............ 
+ // str q11, [x0, #(-16 + 3*(512/8))] // ....................................................................................~.'.....................................................................................* sub count, count, #1 cbnz count, layer123_start - add v19.8H, v2.8H, v21.8H // ....*......................................................................... - ldr q23, [x0, #256] // .....*........................................................................ - sqrdmulh v22.8H, v10.8H, v1.H[1] // .*............................................................................ - ldr q28, [x0, #64] // *............................................................................. - ldr q3, [x0, #0] // ..*........................................................................... - // gap // .............................................................................. - mul v26.8H, v10.8H, v1.H[0] // ...*.......................................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v27.8H, v13.8H, v7.H[0] // .......*...................................................................... - add v20.8H, v23.8H, v24.8H // ..........*................................................................... - // gap // .............................................................................. - sub v23.8H, v23.8H, v24.8H // ...........*.................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - mls v26.8H, v22.8H, v7.H[0] // .........*.................................................................... - add v22.8H, v3.8H, v28.8H // ......*....................................................................... - // gap // .............................................................................. - sub v24.8H, v20.8H, v15.8H // .............*................................................................ - // gap // .............................................................................. - // gap // .............................................................................. - add v20.8H, v20.8H, v15.8H // ................*............................................................. - sqrdmulh v11.8H, v23.8H, v1.H[3] // .................*............................................................ - // gap // .............................................................................. - sub v14.8H, v22.8H, v19.8H // ..............*............................................................... - // gap // .............................................................................. - // gap // .............................................................................. - add v19.8H, v22.8H, v19.8H // .....................*........................................................ - mul v23.8H, v23.8H, v1.H[2] // ...................*.......................................................... - // gap // .............................................................................. - sub v22.8H, v3.8H, v28.8H // ........*..................................................................... - // gap // .............................................................................. - // gap // .............................................................................. 
- mul v28.8H, v24.8H, v0.H[4] // ........................*..................................................... - // gap // .............................................................................. - // gap // .............................................................................. - add v3.8H, v19.8H, v20.8H // .........................*.................................................... - // gap // .............................................................................. - // gap // .............................................................................. - sub v19.8H, v19.8H, v20.8H // ...........................*.................................................. - mls v23.8H, v11.8H, v7.H[0] // ....................*......................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v20.8H, v22.8H, v0.H[7] // ............*................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v22.8H, v22.8H, v0.H[6] // ...............*.............................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - add v11.8H, v23.8H, v27.8H // ............................*................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v23.8H, v23.8H, v27.8H // ..............................*............................................... - sqrdmulh v27.8H, v24.8H, v0.H[5] // ..........................*................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v22.8H, v20.8H, v7.H[0] // ..................*........................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v24.8H, v14.8H, v0.H[2] // .............................*................................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - mul v20.8H, v19.8H, v0.H[0] // ...............................*.............................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v25.8H, v22.8H, v26.8H // ......................*....................................................... - // gap // .............................................................................. - // gap // .............................................................................. - add v22.8H, v22.8H, v26.8H // .......................*...................................................... - sqrdmulh v26.8H, v14.8H, v0.H[3] // ........................................*..................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v14.8H, v23.8H, v0.H[5] // ..................................*........................................... - // gap // .............................................................................. - // gap // .............................................................................. - sub v5.8H, v22.8H, v11.8H // ................................*............................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- sqrdmulh v19.8H, v19.8H, v0.H[1] // ...................................*.......................................... - add v22.8H, v22.8H, v11.8H // .................................*............................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v11.8H, v3.8H, v29.8H // ....................................*......................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v10.8H, v25.8H, v0.H[3] // .....................................*........................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v28.8H, v27.8H, v7.H[0] // ..........................................*................................... - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v27.8H, v3.8H, v30.8H // .............................................*................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v20.8H, v19.8H, v7.H[0] // ......................................*....................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v19.8H, v25.8H, v0.H[2] // .......................................*...................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - mul v23.8H, v23.8H, v0.H[4] // ...........................................*.................................. - // gap // .............................................................................. - // gap // .............................................................................. - str q20, [x0, #256] // .........................................*.................................... - // gap // .............................................................................. - // gap // .............................................................................. - mls v24.8H, v26.8H, v7.H[0] // ............................................*................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v3.8H, v5.8H, v0.H[0] // ..............................................*............................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v26.8H, v22.8H, v29.8H // .................................................*............................ 
- // gap // .............................................................................. - // gap // .............................................................................. - sub v20.8H, v24.8H, v28.8H // ...............................................*.............................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v19.8H, v10.8H, v7.H[0] // ....................................................*......................... - add v28.8H, v24.8H, v28.8H // ................................................*............................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v24.8H, v20.8H, v0.H[1] // ..................................................*........................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v20.8H, v20.8H, v0.H[0] // ...................................................*.......................... - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v23.8H, v14.8H, v7.H[0] // .....................................................*........................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v11.8H, v27.8H, v7.H[0] // ......................................................*....................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v27.8H, v5.8H, v0.H[1] // .......................................................*...................... - // gap // .............................................................................. - // gap // .............................................................................. - sub v14.8H, v19.8H, v23.8H // ........................................................*..................... - // gap // .............................................................................. - // gap // .............................................................................. 
- add v19.8H, v19.8H, v23.8H // ..........................................................*................... - mls v20.8H, v24.8H, v7.H[0] // .........................................................*.................... - // gap // .............................................................................. - str q11, [x0], #(16) // ...........................................................*.................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v23.8H, v28.8H, v29.8H // ............................................................*................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v3.8H, v27.8H, v7.H[0] // .............................................................*................ - // gap // .............................................................................. - // gap // .............................................................................. - str q20, [x0, #368] // ..............................................................*............... - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v28.8H, v28.8H, v30.8H // ...............................................................*.............. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v27.8H, v19.8H, v29.8H // ................................................................*............. - // gap // .............................................................................. - // gap // .............................................................................. - str q3, [x0, #304] // .................................................................*............ - // gap // .............................................................................. - // gap // .............................................................................. - mul v24.8H, v14.8H, v0.H[0] // ..................................................................*........... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v3.8H, v14.8H, v0.H[1] // ...................................................................*.......... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - sqrdmulh v22.8H, v22.8H, v30.8H // ....................................................................*......... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v19.8H, v19.8H, v30.8H // .....................................................................*........ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v24.8H, v3.8H, v7.H[0] // ......................................................................*....... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v26.8H, v22.8H, v7.H[0] // .......................................................................*...... 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v23.8H, v28.8H, v7.H[0] // ........................................................................*..... - // gap // .............................................................................. - // gap // .............................................................................. - str q24, [x0, #432] // .........................................................................*.... - // gap // .............................................................................. - // gap // .............................................................................. - mls v27.8H, v19.8H, v7.H[0] // ..........................................................................*... - // gap // .............................................................................. - // gap // .............................................................................. - str q26, [x0, #48] // ...........................................................................*.. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - str q23, [x0, #112] // ............................................................................*. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - str q27, [x0, #176] // .............................................................................* - // gap // .............................................................................. - // gap // .............................................................................. - - // original source code - // ldr q28, [x0, #64] // ...*.......................................................................... - // sqrdmulh v9.8H, v10.8H, v1.H[1] // ..*........................................................................... - // ldr q23, [x0, #0] // ....*......................................................................... - // mul v3.8H, v10.8H, v1.H[0] // .....*........................................................................ - // add v8.8H, v2.8H, v21.8H // *............................................................................. - // ldr q22, [x0, #256] // .*............................................................................ - // add v25.8H, v23.8H, v28.8H // ..........*................................................................... - // mls v27.8H, v13.8H, v7.H[0] // ......*....................................................................... - // sub v20.8H, v23.8H, v28.8H // .................*............................................................ - // mls v3.8H, v9.8H, v7.H[0] // .........*.................................................................... 
- // add v23.8H, v22.8H, v24.8H // .......*...................................................................... - // sub v28.8H, v22.8H, v24.8H // ........*..................................................................... - // sqrdmulh v17.8H, v20.8H, v0.H[7] // ......................*....................................................... - // sub v19.8H, v23.8H, v15.8H // ...........*.................................................................. - // sub v12.8H, v25.8H, v8.8H // ..............*............................................................... - // mul v21.8H, v20.8H, v0.H[6] // .......................*...................................................... - // add v31.8H, v23.8H, v15.8H // ............*................................................................. - // sqrdmulh v23.8H, v28.8H, v1.H[3] // .............*................................................................ - // mls v21.8H, v17.8H, v7.H[0] // ...........................*.................................................. - // mul v14.8H, v28.8H, v1.H[2] // ................*............................................................. - // mls v14.8H, v23.8H, v7.H[0] // .....................*........................................................ - // add v23.8H, v25.8H, v8.8H // ...............*.............................................................. - // sub v28.8H, v21.8H, v3.8H // ..............................*............................................... - // add v3.8H, v21.8H, v3.8H // ...............................*.............................................. - // mul v21.8H, v19.8H, v0.H[4] // ..................*........................................................... - // add v24.8H, v23.8H, v31.8H // ...................*.......................................................... - // sqrdmulh v13.8H, v19.8H, v0.H[5] // ..........................*................................................... 
- // sub v23.8H, v23.8H, v31.8H // ....................*......................................................... - // add v22.8H, v14.8H, v27.8H // ........................*..................................................... - // mul v6.8H, v12.8H, v0.H[2] // ............................*................................................. - // sub v17.8H, v14.8H, v27.8H // .........................*.................................................... - // mul v20.8H, v23.8H, v0.H[0] // .............................*................................................ - // sub v10.8H, v3.8H, v22.8H // ..................................*........................................... - // add v9.8H, v3.8H, v22.8H // ....................................*......................................... - // sqrdmulh v14.8H, v17.8H, v0.H[5] // .................................*............................................ - // sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................*.......................................... - // mul v26.8H, v24.8H, v29.8H // .....................................*........................................ - // sqrdmulh v31.8H, v28.8H, v0.H[3] // ......................................*....................................... - // mls v20.8H, v19.8H, v7.H[0] // .........................................*.................................... - // mul v4.8H, v28.8H, v0.H[2] // ..........................................*................................... - // sqrdmulh v25.8H, v12.8H, v0.H[3] // ................................*............................................. - // str q20, [x0, #256] // ............................................*................................. - // mls v21.8H, v13.8H, v7.H[0] // .......................................*...................................... - // mul v22.8H, v17.8H, v0.H[4] // ...........................................*.................................. 
- // mls v6.8H, v25.8H, v7.H[0] // .............................................*................................ - // sqrdmulh v19.8H, v24.8H, v30.8H // ........................................*..................................... - // mul v11.8H, v10.8H, v0.H[0] // ..............................................*............................... - // sub v8.8H, v6.8H, v21.8H // ................................................*............................. - // add v25.8H, v6.8H, v21.8H // ..................................................*........................... - // mul v24.8H, v9.8H, v29.8H // ...............................................*.............................. - // sqrdmulh v3.8H, v8.8H, v0.H[1] // ...................................................*.......................... - // mul v20.8H, v8.8H, v0.H[0] // ....................................................*......................... - // mls v4.8H, v31.8H, v7.H[0] // .................................................*............................ - // mls v22.8H, v14.8H, v7.H[0] // .....................................................*........................ - // mls v26.8H, v19.8H, v7.H[0] // ......................................................*....................... - // sqrdmulh v19.8H, v10.8H, v0.H[1] // .......................................................*...................... - // sub v23.8H, v4.8H, v22.8H // ........................................................*..................... - // mls v20.8H, v3.8H, v7.H[0] // ..........................................................*................... - // add v14.8H, v4.8H, v22.8H // .........................................................*.................... - // str q26, [x0], #(16) // ...........................................................*.................. - // mul v28.8H, v25.8H, v29.8H // ............................................................*................. 
- // mls v11.8H, v19.8H, v7.H[0] // .............................................................*................ - // str q20, [x0, #368] // ..............................................................*............... - // sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................*.............. - // mul v22.8H, v14.8H, v29.8H // ................................................................*............. - // str q11, [x0, #304] // .................................................................*............ - // mul v19.8H, v23.8H, v0.H[0] // ..................................................................*........... - // sqrdmulh v6.8H, v23.8H, v0.H[1] // ...................................................................*.......... - // sqrdmulh v11.8H, v9.8H, v30.8H // ....................................................................*......... - // sqrdmulh v8.8H, v14.8H, v30.8H // .....................................................................*........ - // mls v19.8H, v6.8H, v7.H[0] // ......................................................................*....... - // mls v24.8H, v11.8H, v7.H[0] // .......................................................................*...... - // mls v28.8H, v12.8H, v7.H[0] // ........................................................................*..... - // str q19, [x0, #432] // .........................................................................*.... - // mls v22.8H, v8.8H, v7.H[0] // ..........................................................................*... - // str q24, [x0, #48] // ...........................................................................*.. - // str q28, [x0, #112] // ............................................................................*. 
- // str q22, [x0, #176] // .............................................................................* + // Instructions: 74 + // Expected cycles: 96 + // Expected IPC: 0.77 + // + // Cycle bound: 96.0 + // IPC bound: 0.77 + // + // Wall time: 3.19s + // User time: 3.19s + // + // --------------------------- original position ---------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------------- + mul v28.8H, v16.8H, v1.H[4] // *......................................................................... + add v15.8H, v25.8H, v14.8H // ...*...................................................................... + // gap // .......................................................................... + sub v21.8H, v25.8H, v14.8H // ..*....................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sub v12.8H, v27.8H, v6.8H // .....*.................................................................... + mls v28.8H, v19.8H, v7.H[0] // .*........................................................................ + // gap // .......................................................................... + sub v24.8H, v15.8H, v17.8H // .........*................................................................ + // gap // .......................................................................... + // gap // .......................................................................... + add v11.8H, v27.8H, v6.8H // ......*................................................................... + sqrdmulh v27.8H, v21.8H, v0.H[7] // ...........*.............................................................. + // gap // .......................................................................... 
+ add v13.8H, v15.8H, v17.8H // ........*................................................................. + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v14.8H, v24.8H, v0.H[3] // ...................*...................................................... + // gap // .......................................................................... + // gap // .......................................................................... + add v22.8H, v11.8H, v26.8H // ..............*........................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sub v26.8H, v11.8H, v26.8H // ...............*.......................................................... + // gap // .......................................................................... + mul v11.8H, v21.8H, v0.H[6] // .......*.................................................................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v11.8H, v27.8H, v7.H[0] // ................*......................................................... + sub v25.8H, v13.8H, v22.8H // .................*........................................................ + // gap // .......................................................................... + add v13.8H, v13.8H, v22.8H // ..................*....................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ mul v6.8H, v2.8H, v1.H[0] // ....*..................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v6.8H, v31.8H, v7.H[0] // ............*............................................................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v15.8H, v24.8H, v0.H[2] // ......................*................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v27.8H, v26.8H, v0.H[5] // ...........................*.............................................. + // gap // .......................................................................... + // gap // .......................................................................... 
+ add v22.8H, v11.8H, v6.8H // ........................*................................................. + // gap // .......................................................................... + // gap // .......................................................................... + sub v6.8H, v11.8H, v6.8H // .....................*.................................................... + sqrdmulh v11.8H, v12.8H, v1.H[3] // ..........*............................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v26.8H, v26.8H, v0.H[4] // .............................*............................................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v26.8H, v27.8H, v7.H[0] // ...................................*...................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ mul v23.8H, v12.8H, v1.H[2] // .............*............................................................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v23.8H, v11.8H, v7.H[0] // ....................*..................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v2.8H, v6.8H, v0.H[3] // .......................*.................................................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v15.8H, v14.8H, v7.H[0] // ............................*............................................. + // gap // .......................................................................... + // gap // .......................................................................... 
+ sub v16.8H, v23.8H, v28.8H // .........................*................................................ + // gap // .......................................................................... + // gap // .......................................................................... + mul v14.8H, v6.8H, v0.H[2] // ..............................*........................................... + add v31.8H, v23.8H, v28.8H // ..........................*............................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v27.8H, v16.8H, v0.H[5] // ....................................*..................................... + sub v11.8H, v22.8H, v31.8H // ...............................*.......................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v14.8H, v2.8H, v7.H[0] // ..................................*....................................... + sub v2.8H, v15.8H, v26.8H // ......................................*................................... + // gap // .......................................................................... + add v15.8H, v15.8H, v26.8H // .......................................*.................................. + // gap // .......................................................................... + // gap // .......................................................................... 
+ add v31.8H, v22.8H, v31.8H // .................................*........................................ + mul v16.8H, v16.8H, v0.H[4] // ...........................................*.............................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v16.8H, v27.8H, v7.H[0] // ............................................*............................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v6.8H, v11.8H, v0.H[1] // .....................................*.................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v26.8H, v11.8H, v0.H[0] // ..........................................*............................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ sub v11.8H, v14.8H, v16.8H // ..................................................*....................... + // gap // .......................................................................... + // gap // .......................................................................... + add v16.8H, v14.8H, v16.8H // ...............................................................*.......... + sqrdmulh v22.8H, v13.8H, v30.8H // ................................*......................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v13.8H, v13.8H, v29.8H // ..........................................................*............... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v27.8H, v31.8H, v30.8H // ..............................................*........................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ mul v14.8H, v31.8H, v29.8H // ...............................................*.......................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v31.8H, v2.8H, v0.H[1] // .................................................*........................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v14.8H, v27.8H, v7.H[0] // ...................................................*...................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v27.8H, v2.8H, v0.H[0] // ....................................................*..................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ // gap // .......................................................................... + // gap // .......................................................................... + mls v27.8H, v31.8H, v7.H[0] // .....................................................*.................... + // gap // .......................................................................... + // gap // .......................................................................... + str q14, [x0, #64] // ......................................................*................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v14.8H, v11.8H, v0.H[1] // .......................................................*.................. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v11.8H, v11.8H, v0.H[0] // ........................................................*................. + str q27, [x0, #384] // .........................................................*................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ // gap // .......................................................................... + sqrdmulh v27.8H, v15.8H, v30.8H // ............................................................*............. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v11.8H, v14.8H, v7.H[0] // ...........................................................*.............. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v13.8H, v22.8H, v7.H[0] // .............................................................*............ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mul v15.8H, v15.8H, v29.8H // ................................................................*......... + str q11, [x0, #448] // ..............................................................*........... 
+ // gap // .......................................................................... + // gap // .......................................................................... + mls v15.8H, v27.8H, v7.H[0] // .................................................................*........ + // gap // .......................................................................... + // gap // .......................................................................... + str q13, [x0], #(16) // ..................................................................*....... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v27.8H, v16.8H, v30.8H // ...................................................................*...... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v26.8H, v6.8H, v7.H[0] // ....................................................................*..... + // gap // .......................................................................... + // gap // .......................................................................... + str q15, [x0, #112] // ......................................................................*... + // gap // .......................................................................... + // gap // .......................................................................... + mul v15.8H, v16.8H, v29.8H // .....................................................................*.... 
+ // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + sqrdmulh v3.8H, v25.8H, v0.H[1] // ........................................*................................. + // gap // .......................................................................... + // gap // .......................................................................... + str q26, [x0, #304] // ........................................................................*. + // gap // .......................................................................... + // gap // .......................................................................... + mul v25.8H, v25.8H, v0.H[0] // .........................................*................................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + mls v15.8H, v27.8H, v7.H[0] // .......................................................................*.. + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... 
+ // gap // .......................................................................... + mls v25.8H, v3.8H, v7.H[0] // .............................................*............................ + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + str q15, [x0, #176] // .........................................................................* + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + // gap // .......................................................................... + str q25, [x0, #240] // ................................................*......................... + // gap // .......................................................................... + // gap // .......................................................................... + + // ----------------------------- new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------------- + // mul v15.8H, v16.8H, v1.H[4] // *......................................................................... 
+ // mls v15.8H, v19.8H, v7.H[0] // ....*..................................................................... + // sub v16.8H, v25.8H, v14.8H // ..*....................................................................... + // add v3.8H, v25.8H, v14.8H // .*........................................................................ + // mul v25.8H, v2.8H, v1.H[0] // ................*......................................................... + // sub v14.8H, v27.8H, v6.8H // ...*...................................................................... + // add v27.8H, v27.8H, v6.8H // ......*................................................................... + // mul v23.8H, v16.8H, v0.H[6] // ............*............................................................. + // add v11.8H, v3.8H, v17.8H // ........*................................................................. + // sub v20.8H, v3.8H, v17.8H // .....*.................................................................... + // sqrdmulh v3.8H, v14.8H, v1.H[3] // ......................*................................................... + // sqrdmulh v6.8H, v16.8H, v0.H[7] // .......*.................................................................. + // mls v25.8H, v31.8H, v7.H[0] // .................*........................................................ + // mul v14.8H, v14.8H, v1.H[2] // .........................*................................................ + // add v31.8H, v27.8H, v26.8H // ..........*............................................................... + // sub v27.8H, v27.8H, v26.8H // ...........*.............................................................. + // mls v23.8H, v6.8H, v7.H[0] // .............*............................................................ + // sub v6.8H, v11.8H, v31.8H // ..............*........................................................... + // add v11.8H, v11.8H, v31.8H // ...............*.......................................................... 
+ // sqrdmulh v31.8H, v20.8H, v0.H[3] // .........*................................................................ + // mls v14.8H, v3.8H, v7.H[0] // ..........................*............................................... + // sub v3.8H, v23.8H, v25.8H // .....................*.................................................... + // mul v26.8H, v20.8H, v0.H[2] // ..................*....................................................... + // sqrdmulh v16.8H, v3.8H, v0.H[3] // ...........................*.............................................. + // add v17.8H, v23.8H, v25.8H // ....................*..................................................... + // sub v25.8H, v14.8H, v15.8H // .............................*............................................ + // add v28.8H, v14.8H, v15.8H // ...............................*.......................................... + // sqrdmulh v14.8H, v27.8H, v0.H[5] // ...................*...................................................... + // mls v26.8H, v31.8H, v7.H[0] // ............................*............................................. + // mul v27.8H, v27.8H, v0.H[4] // .......................*.................................................. + // mul v31.8H, v3.8H, v0.H[2] // ..............................*........................................... + // sub v2.8H, v17.8H, v28.8H // .................................*........................................ + // sqrdmulh v3.8H, v11.8H, v30.8H // ............................................*............................. + // add v28.8H, v17.8H, v28.8H // .....................................*.................................... + // mls v31.8H, v16.8H, v7.H[0] // ..................................*....................................... + // mls v27.8H, v14.8H, v7.H[0] // ........................*................................................. 
+ // sqrdmulh v16.8H, v25.8H, v0.H[5] // ................................*......................................... + // sqrdmulh v24.8H, v2.8H, v0.H[1] // ........................................*................................. + // sub v14.8H, v26.8H, v27.8H // ...................................*...................................... + // add v27.8H, v26.8H, v27.8H // ....................................*..................................... + // sqrdmulh v26.8H, v6.8H, v0.H[1] // ...................................................................*...... + // mul v6.8H, v6.8H, v0.H[0] // .....................................................................*.... + // mul v2.8H, v2.8H, v0.H[0] // .........................................*................................ + // mul v25.8H, v25.8H, v0.H[4] // ......................................*................................... + // mls v25.8H, v16.8H, v7.H[0] // .......................................*.................................. + // mls v6.8H, v26.8H, v7.H[0] // .......................................................................*.. + // sqrdmulh v16.8H, v28.8H, v30.8H // ..............................................*........................... + // mul v15.8H, v28.8H, v29.8H // ...............................................*.......................... + // str q6, [x0, #256] // .........................................................................* + // sqrdmulh v6.8H, v14.8H, v0.H[1] // ................................................*......................... + // sub v28.8H, v31.8H, v25.8H // ..........................................*............................... + // mls v15.8H, v16.8H, v7.H[0] // .................................................*........................ + // mul v16.8H, v14.8H, v0.H[0] // ..................................................*....................... + // mls v16.8H, v6.8H, v7.H[0] // ...................................................*...................... 
+ // str q15, [x0, #64] // ....................................................*..................... + // sqrdmulh v14.8H, v28.8H, v0.H[1] // .....................................................*.................... + // mul v6.8H, v28.8H, v0.H[0] // ......................................................*................... + // str q16, [x0, #384] // .......................................................*.................. + // mul v11.8H, v11.8H, v29.8H // .............................................*............................ + // mls v6.8H, v14.8H, v7.H[0] // .........................................................*................ + // sqrdmulh v14.8H, v27.8H, v30.8H // ........................................................*................. + // mls v11.8H, v3.8H, v7.H[0] // ..........................................................*............... + // str q6, [x0, #448] // ............................................................*............. + // add v6.8H, v31.8H, v25.8H // ...........................................*.............................. + // mul v16.8H, v27.8H, v29.8H // ...........................................................*.............. + // mls v16.8H, v14.8H, v7.H[0] // .............................................................*............ + // str q11, [x0], #(16) // ..............................................................*........... + // sqrdmulh v27.8H, v6.8H, v30.8H // ...............................................................*.......... + // mls v2.8H, v24.8H, v7.H[0] // ................................................................*......... + // mul v15.8H, v6.8H, v29.8H // ..................................................................*....... + // str q16, [x0, #112] // .................................................................*........ + // mls v15.8H, v27.8H, v7.H[0] // ......................................................................*... 
+ // str q2, [x0, #304] // ....................................................................*..... + // str q15, [x0, #176] // ........................................................................*. pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s index ea0a6c4f..2f434f81 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo 
root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, 
#STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,1010 +339,1071 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: mov count, #8 .p2align 2 - ldr q13, [x3], #16 // ...................................*.............. - ldr q28, [x4, #48] // ...*.............................................. - ldr q3, [x4, #16] // ..*............................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // *................................................. - ldr q12, [x4, #80] // ....*............................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q22, [x4, #32] // .....*............................................ - ldr q26, [x4, #64] // .*................................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v8.8H, v14.8H, v15.8H // .........*........................................ - add v0.8H, v14.8H, v15.8H // ..........*....................................... 
- sub v23.8H, v16.8H, v17.8H // .......*.......................................... - add v27.8H, v16.8H, v17.8H // ........*......................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q24, [x4], #(6*16) // ......*........................................... - sqrdmulh v19.8H, v8.8H, v28.8H // .............*.................................... - sqrdmulh v10.8H, v23.8H, v12.8H // ...........*...................................... - mul v25.8H, v23.8H, v26.8H // ............*..................................... - mul v21.8H, v8.8H, v22.8H // ..............*................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v6.8H, v0.8H, v27.8H // ................*................................. - sub v9.8H, v0.8H, v27.8H // ...............*.................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v22.8H, v9.8H, v24.8H // .................*................................ - sqrdmulh v30.8H, v9.8H, v3.8H // ..................*............................... - mls v25.8H, v10.8H, v7.H[0] // ....................*............................. - mls v21.8H, v19.8H, v7.H[0] // ...................*.............................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v28.8H, v21.8H, v25.8H // ......................*........................... - sub v17.8H, v21.8H, v25.8H // .....................*............................ - mls v22.8H, v30.8H, v7.H[0] // .......................*.......................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn2 v23.4S, v6.4S, v28.4S // ........................*......................... - trn1 v0.4S, v6.4S, v28.4S // ...........................*...................... - sqrdmulh v19.8H, v17.8H, v3.8H // ..........................*....................... - mul v29.8H, v17.8H, v24.8H // .........................*........................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v29.8H, v19.8H, v7.H[0] // ............................*..................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn1 v8.4S, v22.4S, v29.4S // ..............................*................... - trn2 v31.4S, v22.4S, v29.4S // .............................*.................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn2 v26.2D, v0.2D, v8.2D // .................................*................ - trn1 v8.2D, v0.2D, v8.2D // ..................................*............... - trn2 v5.2D, v23.2D, v31.2D // ...............................*.................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - trn1 v22.2D, v23.2D, v31.2D // ................................*................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v19.8H, v8.8H, v22.8H // .....................................*............ - add v28.8H, v26.8H, v5.8H // ....................................*............. - sub v23.8H, v8.8H, v22.8H // ........................................*......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v31.8H, v23.8H, v13.H[2] // ...........................................*...... - sqrdmulh v10.8H, v23.8H, v13.H[3] // ..............................................*... - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqdmulh v22.8H, v28.8H, v7.H[1] // ......................................*........... - sqdmulh v23.8H, v19.8H, v7.H[1] // .......................................*.......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - srshr v21.8H, v23.8H, #11 // .........................................*........ - srshr v24.8H, v22.8H, #11 // ..........................................*....... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v19.8H, v21.8H, v7.H[0] // ............................................*..... - mls v28.8H, v24.8H, v7.H[0] // .............................................*.... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v0.8H, v19.8H, v28.8H // ................................................*. - add v9.8H, v19.8H, v28.8H // ...............................................*.. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - str q9, [x1], #(64) // .................................................* - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - - // original source code - // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ...*.............................................. - // ldr q28, [x4, #64] // ......*........................................... - // ldr q6, [x4, #16] // ..*............................................... - // ldr q24, [x4, #48] // .*................................................ - // ldr q25, [x4, #80] // ....*............................................. - // ldr q10, [x4, #32] // .....*............................................ - // ldr q2, [x4], #(6*16) // ...........*...................................... - // sub v5.8H, v16.8H, v17.8H // .........*........................................ - // add v21.8H, v16.8H, v17.8H // ..........*....................................... - // sub v27.8H, v14.8H, v15.8H // .......*.......................................... - // add v16.8H, v14.8H, v15.8H // ........*......................................... - // sqrdmulh v23.8H, v5.8H, v25.8H // .............*.................................... - // mul v11.8H, v5.8H, v28.8H // ..............*................................... - // sqrdmulh v20.8H, v27.8H, v24.8H // ............*..................................... - // mul v9.8H, v27.8H, v10.8H // ...............*.................................. - // sub v25.8H, v16.8H, v21.8H // .................*................................ - // add v18.8H, v16.8H, v21.8H // ................*................................. - // mul v29.8H, v25.8H, v2.8H // ..................*............................... - // sqrdmulh v14.8H, v25.8H, v6.8H // ...................*.............................. - // mls v9.8H, v20.8H, v7.H[0] // .....................*............................ - // mls v11.8H, v23.8H, v7.H[0] // ....................*............................. 
- // sub v30.8H, v9.8H, v11.8H // .......................*.......................... - // add v5.8H, v9.8H, v11.8H // ......................*........................... - // mls v29.8H, v14.8H, v7.H[0] // ........................*......................... - // trn2 v21.4S, v18.4S, v5.4S // .........................*........................ - // mul v12.8H, v30.8H, v2.8H // ............................*..................... - // sqrdmulh v22.8H, v30.8H, v6.8H // ...........................*...................... - // trn1 v28.4S, v18.4S, v5.4S // ..........................*....................... - // mls v12.8H, v22.8H, v7.H[0] // .............................*.................... - // trn2 v22.4S, v29.4S, v12.4S // ...............................*.................. - // trn1 v25.4S, v29.4S, v12.4S // ..............................*................... - // trn2 v5.2D, v21.2D, v22.2D // ..................................*............... - // trn1 v24.2D, v21.2D, v22.2D // ...................................*.............. - // trn2 v26.2D, v28.2D, v25.2D // ................................*................. - // trn1 v25.2D, v28.2D, v25.2D // .................................*................ - // ldr q13, [x3], #16 // *................................................. - // add v20.8H, v26.8H, v5.8H // .....................................*............ - // add v8.8H, v25.8H, v24.8H // ....................................*............. - // sqdmulh v10.8H, v20.8H, v7.H[1] // .........................................*........ - // sqdmulh v11.8H, v8.8H, v7.H[1] // ..........................................*....... - // sub v23.8H, v25.8H, v24.8H // ......................................*........... - // srshr v27.8H, v11.8H, #11 // ...........................................*...... - // srshr v10.8H, v10.8H, #11 // ............................................*..... - // mul v31.8H, v23.8H, v13.H[2] // .......................................*.......... 
- // mls v8.8H, v27.8H, v7.H[0] // .............................................*.... - // mls v20.8H, v10.8H, v7.H[0] // ..............................................*... - // sqrdmulh v10.8H, v23.8H, v13.H[3] // ........................................*......... - // add v28.8H, v8.8H, v20.8H // ................................................*. - // sub v0.8H, v8.8H, v20.8H // ...............................................*.. - // str q28, [x1], #(64) // .................................................* + // Instructions: 46 + // Expected cycles: 40 + // Expected IPC: 1.15 + // + // Cycle bound: 40.0 + // IPC bound: 1.15 + // + // Wall time: 1.28s + // User time: 1.28s + // + // ------------- original position -------------> + // 0 25 + // |------------------------|-------------------- + ldr q13, [x4, #16] // .................*............................ + ldr q11, [x4], #(6*16) // ..............*............................... + ldr q3, [x4, #-32] // ....*......................................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // *............................................. + ldr q6, [x4, #-64] // ..*........................................... + ldr q14, [x4, #-16] // .*............................................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + ldr q26, [x4, #-48] // ...*.......................................... + // gap // .............................................. 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + add v25.8H, v15.8H, v16.8H // ......*....................................... + add v27.8H, v17.8H, v18.8H // ........*..................................... 
+ sub v31.8H, v15.8H, v16.8H // .....*........................................ + sub v0.8H, v17.8H, v18.8H // .......*...................................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqrdmulh v26.8H, v31.8H, v26.8H // ..........*................................... + mul v6.8H, v31.8H, v6.8H // .........*.................................... + mul v16.8H, v0.8H, v3.8H // ...........*.................................. + sqrdmulh v14.8H, v0.8H, v14.8H // ............*................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + add v15.8H, v25.8H, v27.8H // .............*................................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sub v27.8H, v25.8H, v27.8H // ................*............................. 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v16.8H, v14.8H, v7.H[0] // ...............*.............................. + mls v6.8H, v26.8H, v7.H[0] // ..................*........................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v0.8H, v27.8H, v11.8H // .....................*........................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqrdmulh v27.8H, v27.8H, v13.8H // ......................*....................... + sub v26.8H, v6.8H, v16.8H // ...................*.......................... 
+ add v16.8H, v6.8H, v16.8H // ....................*......................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v14.8H, v26.8H, v11.8H // .........................*.................... + trn2 v6.4S, v15.4S, v16.4S // .......................*...................... + trn1 v15.4S, v15.4S, v16.4S // ........................*..................... + sqrdmulh v16.8H, v26.8H, v13.8H // ..........................*................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v0.8H, v27.8H, v7.H[0] // ...........................*.................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v14.8H, v16.8H, v7.H[0] // ............................*................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + trn1 v27.4S, v0.4S, v14.4S // .............................*................ 
+ trn2 v16.4S, v0.4S, v14.4S // ..............................*............... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + trn1 v8.2D, v15.2D, v27.2D // .................................*............ + trn2 v25.2D, v6.2D, v16.2D // ................................*............. + trn2 v15.2D, v15.2D, v27.2D // ..................................*........... + trn1 v28.2D, v6.2D, v16.2D // ...............................*.............. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ add v16.8H, v15.8H, v25.8H // ...................................*.......... + add v27.8H, v8.8H, v28.8H // ....................................*......... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqdmulh v26.8H, v27.8H, v7.H[1] // .....................................*........ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + sqdmulh v14.8H, v16.8H, v7.H[1] // ......................................*....... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + srshr v6.8H, v14.8H, #11 // .......................................*...... + srshr v14.8H, v26.8H, #11 // ........................................*..... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ mls v16.8H, v6.8H, v7.H[0] // .........................................*.... + mls v27.8H, v14.8H, v7.H[0] // ..........................................*... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + add v14.8H, v27.8H, v16.8H // ...........................................*.. + sub v16.8H, v27.8H, v16.8H // .............................................* + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + str q14, [x1], #(64) // ............................................*. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + + // --------------- new position ----------------> + // 0 25 + // |------------------------|-------------------- + // ld4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1] // ...*.......................................... + // ldr q30, [x4, #80] // .....*........................................ + // ldr q9, [x4, #32] // ....*......................................... + // ldr q13, [x4, #48] // ......*....................................... + // ldr q31, [x4, #64] // ..*........................................... + // sub v0.8H, v2.8H, v3.8H // .........*.................................... + // add v3.8H, v2.8H, v3.8H // .......*...................................... + // sub v14.8H, v4.8H, v5.8H // ..........*................................... + // add v25.8H, v4.8H, v5.8H // ........*..................................... + // mul v22.8H, v0.8H, v9.8H // ............*................................. + // sqrdmulh v13.8H, v0.8H, v13.8H // ...........*.................................. 
+ // mul v31.8H, v14.8H, v31.8H // .............*................................ + // sqrdmulh v0.8H, v14.8H, v30.8H // ..............*............................... + // add v28.8H, v3.8H, v25.8H // ...............*.............................. + // ldr q15, [x4], #(6*16) // .*............................................ + // mls v31.8H, v0.8H, v7.H[0] // .................*............................ + // sub v0.8H, v3.8H, v25.8H // ................*............................. + // ldr q23, [x4, #-80] // *............................................. + // mls v22.8H, v13.8H, v7.H[0] // ..................*........................... + // sub v14.8H, v22.8H, v31.8H // .....................*........................ + // add v8.8H, v22.8H, v31.8H // ......................*....................... + // mul v21.8H, v0.8H, v15.8H // ...................*.......................... + // sqrdmulh v31.8H, v0.8H, v23.8H // ....................*......................... + // trn2 v25.4S, v28.4S, v8.4S // ........................*..................... + // trn1 v19.4S, v28.4S, v8.4S // .........................*.................... + // mul v27.8H, v14.8H, v15.8H // .......................*...................... + // sqrdmulh v15.8H, v14.8H, v23.8H // ..........................*................... + // mls v21.8H, v31.8H, v7.H[0] // ...........................*.................. + // mls v27.8H, v15.8H, v7.H[0] // ............................*................. + // trn1 v13.4S, v21.4S, v27.4S // .............................*................ + // trn2 v0.4S, v21.4S, v27.4S // ..............................*............... + // trn1 v28.2D, v25.2D, v0.2D // ..................................*........... + // trn2 v25.2D, v25.2D, v0.2D // ................................*............. + // trn1 v8.2D, v19.2D, v13.2D // ...............................*.............. + // trn2 v15.2D, v19.2D, v13.2D // .................................*............ 
+ // add v14.8H, v15.8H, v25.8H // ...................................*.......... + // add v0.8H, v8.8H, v28.8H // ....................................*......... + // sqdmulh v27.8H, v0.8H, v7.H[1] // .....................................*........ + // sqdmulh v16.8H, v14.8H, v7.H[1] // ......................................*....... + // srshr v16.8H, v16.8H, #11 // .......................................*...... + // srshr v27.8H, v27.8H, #11 // ........................................*..... + // mls v14.8H, v16.8H, v7.H[0] // .........................................*.... + // mls v0.8H, v27.8H, v7.H[0] // ..........................................*... + // add v16.8H, v0.8H, v14.8H // ...........................................*.. + // str q16, [x1], #(64) // .............................................* + // sub v16.8H, v0.8H, v14.8H // ............................................*. sub count, count, #1 layer4567_start: - ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // e....................................................................... - ldr q28, [x4, #64] // .....e.................................................................. - sqrdmulh v25.8H, v0.8H, v13.H[1] // .............................................................*.......... - mul v19.8H, v0.8H, v13.H[0] // ............................................................*........... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v31.8H, v10.8H, v7.H[0] // ........................................*............................... - ldr q6, [x4, #16] // ..e..................................................................... - sub v8.8H, v26.8H, v5.8H // .........................................*.............................. 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - ldr q24, [x4, #48] // ....e................................................................... - mls v19.8H, v25.8H, v7.H[0] // ..............................................................*......... - sqdmulh v26.8H, v31.8H, v7.H[1] // .................................................*...................... - ldr q25, [x4, #80] // ......e................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v3.8H, v8.8H, v13.H[4] // ...........................................*............................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v30.8H, v8.8H, v13.H[5] // ............................................*........................... - ldr q10, [x4, #32] // ...e.................................................................... - ldr q2, [x4], #(6*16) // .e...................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v5.8H, v16.8H, v17.8H // ............e........................................................... - add v21.8H, v16.8H, v17.8H // .............e.......................................................... - srshr v1.8H, v26.8H, #11 // ..................................................*..................... - sub v27.8H, v14.8H, v15.8H // .......e................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - add v16.8H, v14.8H, v15.8H // ........e............................................................... - mls v3.8H, v30.8H, v7.H[0] // .............................................*.......................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v23.8H, v5.8H, v25.8H // ...............e........................................................ - mul v11.8H, v5.8H, v28.8H // ..............e......................................................... - sqrdmulh v20.8H, v27.8H, v24.8H // ..........e............................................................. - mul v9.8H, v27.8H, v10.8H // .........e.............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v31.8H, v1.8H, v7.H[0] // ...................................................*.................... - sub v25.8H, v16.8H, v21.8H // .................e...................................................... - add v18.8H, v16.8H, v21.8H // ..................e..................................................... 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v17.8H, v3.8H, v7.H[1] // .......................................................*................ - mul v29.8H, v25.8H, v2.8H // ...................e.................................................... - sqrdmulh v14.8H, v25.8H, v6.8H // ....................e................................................... - mls v9.8H, v20.8H, v7.H[0] // ...........e............................................................ - mls v11.8H, v23.8H, v7.H[0] // ................e....................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - srshr v23.8H, v17.8H, #11 // ........................................................*............... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v30.8H, v9.8H, v11.8H // ......................e................................................. - add v5.8H, v9.8H, v11.8H // .......................e................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - mls v29.8H, v14.8H, v7.H[0] // .....................e.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v21.4S, v18.4S, v5.4S // ............................e........................................... - mul v12.8H, v30.8H, v2.8H // ........................e............................................... - sqrdmulh v22.8H, v30.8H, v6.8H // .........................e.............................................. - mls v3.8H, v23.8H, v7.H[0] // .........................................................*.............. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v28.4S, v18.4S, v5.4S // ...........................e............................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v12.8H, v22.8H, v7.H[0] // ..........................e............................................. - add v23.8H, v31.8H, v3.8H // ................................................................*....... - sub v8.8H, v31.8H, v3.8H // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - str q23, [x1, #-48] // .....................................................................*.. - sqrdmulh v23.8H, v8.8H, v13.H[1] // ..................................................................*..... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v22.4S, v29.4S, v12.4S // ..............................e......................................... - trn1 v25.4S, v29.4S, v12.4S // .............................e.......................................... - str q19, [x1, #-32] // ......................................................................*. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v5.2D, v21.2D, v22.2D // ................................e....................................... - trn1 v24.2D, v21.2D, v22.2D // ..................................e..................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v26.2D, v28.2D, v25.2D // ...............................e........................................ - trn1 v25.2D, v28.2D, v25.2D // .................................e...................................... - mul v22.8H, v8.8H, v13.H[0] // .................................................................*...... - ldr q13, [x3], #16 // ...................................e.................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - add v20.8H, v26.8H, v5.8H // ..........................................e............................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v8.8H, v25.8H, v24.8H // .....................................e.................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- sqdmulh v10.8H, v20.8H, v7.H[1] // ....................................................e................... - sqdmulh v11.8H, v8.8H, v7.H[1] // ..............................................e......................... - mls v22.8H, v23.8H, v7.H[0] // ...................................................................*.... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v23.8H, v25.8H, v24.8H // ....................................e................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - srshr v27.8H, v11.8H, #11 // ...............................................e........................ - srshr v10.8H, v10.8H, #11 // .....................................................e.................. - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - str q22, [x1, #-16] // .......................................................................* - mul v31.8H, v23.8H, v13.H[2] // ......................................e................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v8.8H, v27.8H, v7.H[0] // ................................................e....................... - mls v20.8H, v10.8H, v7.H[0] // ......................................................e................. 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v10.8H, v23.8H, v13.H[3] // .......................................e................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v28.8H, v8.8H, v20.8H // ...........................................................e............ - sub v0.8H, v8.8H, v20.8H // ..........................................................e............. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - str q28, [x1], #(64) // ....................................................................e... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ - // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // .....e..................................................................|....e........................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // .............e..........................................................|............e................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e......................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // .e......................................................................|e............................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..........e.............................................................|.........e...................................................... - // sub v24.8h, v8.8h, v9.8h // ..................e.....................................................|.................e.............................................. - // add v8.8h, v8.8h, v9.8h // ...................e....................................................|..................e............................................. - // mul v9.8h, v24.8h, v1.8h // ........................e...............................................|.......................e........................................ 
- // sqrdmulh v24.8h, v24.8h, v5.8h // .......................e................................................|......................e......................................... - // mls v9.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e................................. - // sub v24.8h, v10.8h, v11.8h // ...............e........................................................|..............e................................................. - // add v10.8h, v10.8h, v11.8h // ................e.......................................................|...............e................................................ - // mul v11.8h, v24.8h, v2.8h // ......................e.................................................|.....................e.......................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e........................................... - // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e................................ - // sub v24.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... - // add v8.8h, v8.8h, v10.8h // ...........................e............................................|..........................e..................................... - // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................e.........................................|.............................e.................................. 
- // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ - // sub v24.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. - // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e............................. - // mul v11.8h, v24.8h, v0.8h // ......................................e.................................|.....................................e.......................... - // sqrdmulh v24.8h, v24.8h, v4.8h // .......................................e................................|......................................e......................... - // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.............................|.........................................e...................... - // trn1 v25.4s, v8.4s, v9.4s // .........................................e..............................|........................................e....................... - // trn2 v26.4s, v8.4s, v9.4s // .....................................e..................................|....................................e........................... - // trn1 v27.4s, v10.4s, v11.4s // ................................................e.......................|...............................................e................ - // trn2 v28.4s, v10.4s, v11.4s // ...............................................e........................|..............................................e................. - // trn2 v10.2d, v25.2d, v27.2d // ....................................................e...................|...................................................e............ 
- // trn2 v11.2d, v26.2d, v28.2d // ..................................................e.....................|.................................................e.............. - // trn1 v8.2d, v25.2d, v27.2d // .....................................................e..................|....................................................e........... - // trn1 v9.2d, v26.2d, v28.2d // ...................................................e....................|..................................................e............. - // ldr q0, [x3], #16 // .......................................................e................|......................................................e......... - // sub v24.8h, v8.8h, v9.8h // .............................................................e..........|............................................................e... - // add v8.8h, v8.8h, v9.8h // .........................................................e..............|........................................................e....... - // mul v9.8h, v24.8h, v0.h[2] // .................................................................e......|................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................................................e...|................................................................ - // mls v9.8h, v24.8h, v7.h[0] // ....*...................................................................|...*............................................................ - // sub v24.8h, v10.8h, v11.8h // ......*.................................................................|.....*.......................................................... - // add v10.8h, v10.8h, v11.8h // ........................................................e...............|.......................................................e........ 
- // mul v11.8h, v24.8h, v0.h[4] // ...........*............................................................|..........*..................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............*...........................................................|...........*.................................................... - // mls v11.8h, v24.8h, v7.h[0] // ....................*...................................................|...................*............................................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................................e............|..........................................................e..... - // srshr v25.8h, v25.8h, #11 // ..............................................................e.........|.............................................................e.. - // mls v8.8h, v25.8h, v7.h[0] // ..................................................................e.....|................................................................ - // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... - // srshr v25.8h, v25.8h, #11 // .................*......................................................|................*............................................... - // mls v9.8h, v25.8h, v7.h[0] // .........................*..............................................|........................*....................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ..........................................................e.............|.........................................................e...... - // srshr v25.8h, v25.8h, #11 // ...............................................................e........|..............................................................e. 
- // mls v10.8h, v25.8h, v7.h[0] // ...................................................................e....|................................................................ - // sqdmulh v25.8h, v11.8h, v7.h[1] // ............................*...........................................|...........................*.................................... - // srshr v25.8h, v25.8h, #11 // .................................*......................................|................................*............................... - // mls v11.8h, v25.8h, v7.h[0] // ........................................*...............................|.......................................*........................ - // sub v24.8h, v8.8h, v10.8h // ......................................................................e.|................................................................ - // add v8.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ - // mul v10.8h, v24.8h, v0.h[0] // ...*....................................................................|..*............................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. - // mls v10.8h, v24.8h, v7.h[0] // ........*...............................................................|.......*........................................................ - // sub v24.8h, v9.8h, v11.8h // ............................................*...........................|...........................................*.................... - // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... 
- // mul v11.8h, v24.8h, v0.h[0] // ......................................................*.................|.....................................................*.......... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................|.............................................*.................. - // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... - // str q8, [x1], #(64) // .......................................................................e|................................................................ - // str q9, [x1, #(-64 + 16*1)] // .............................................*..........................|............................................*................... - // str q10, [x1, #(-64 + 16*2)] // .................................................*......................|................................................*............... - // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* + // Instructions: 72 + // Expected cycles: 40 + // Expected IPC: 1.80 + // + // Cycle bound: 40.0 + // IPC bound: 1.80 + // + // Wall time: 11.54s + // User time: 11.54s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ld4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1] // e....................................................................... + sub v15.8H, v15.8H, v25.8H // .........................................*.............................. + ldr q30, [x4, #80] // ......e................................................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q11, [x3], #16 // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q9, [x4, #32] // ...e.................................................................... + ldr q13, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v21.8H, v8.8H, v28.8H // ....................................*................................... + ldr q31, [x4, #64] // .....e.................................................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v6.8H, v15.8H, v11.H[4] // ............................................*........................... + sqrdmulh v27.8H, v15.8H, v11.H[5] // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v26.8H, v21.8H, v11.H[2] // .......................................*................................ + sqrdmulh v15.8H, v21.8H, v11.H[3] // ......................................*................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v0.8H, v2.8H, v3.8H // .......e................................................................ + add v3.8H, v2.8H, v3.8H // ........e............................................................... + sub v14.8H, v4.8H, v5.8H // ............e........................................................... + add v25.8H, v4.8H, v5.8H // .............e.......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v15.8H, v7.H[0] // ........................................*............................... + sqrdmulh v15.8H, v16.8H, v11.H[1] // ............................................................*........... + mul v16.8H, v16.8H, v11.H[0] // .............................................................*.......... + mls v6.8H, v27.8H, v7.H[0] // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v22.8H, v0.8H, v9.8H // ..........e............................................................. + sqrdmulh v13.8H, v0.8H, v13.8H // .........e.............................................................. + mul v31.8H, v14.8H, v31.8H // ...............e........................................................ + sqrdmulh v0.8H, v14.8H, v30.8H // ..............e......................................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v28.8H, v3.8H, v25.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v16.8H, v15.8H, v7.H[0] // ..............................................................*......... + ldr q15, [x4], #(6*16) // .e...................................................................... + sqdmulh v14.8H, v26.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v6.8H, v7.H[1] // .......................................................*................ + mls v31.8H, v0.8H, v7.H[0] // ................e....................................................... 
+ sub v0.8H, v3.8H, v25.8H // .................e...................................................... + ldr q23, [x4, #-80] // ..e..................................................................... + mls v22.8H, v13.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q16, [x1, #-32] // ......................................................................*. + srshr v16.8H, v27.8H, #11 // ........................................................*............... + srshr v2.8H, v14.8H, #11 // ..................................................*..................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + sub v14.8H, v22.8H, v31.8H // ......................e................................................. + add v8.8H, v22.8H, v31.8H // .......................e................................................ + mul v21.8H, v0.8H, v15.8H // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v31.8H, v0.8H, v23.8H // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v25.4S, v28.4S, v8.4S // ............................e........................................... + trn1 v19.4S, v28.4S, v8.4S // ...........................e............................................ + mul v27.8H, v14.8H, v15.8H // .........................e.............................................. + sqrdmulh v15.8H, v14.8H, v23.8H // ........................e............................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v6.8H, v16.8H, v7.H[0] // .........................................................*.............. + mls v26.8H, v2.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v21.8H, v31.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v15.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v15.8H, v26.8H, v6.8H // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v13.4S, v21.4S, v27.4S // .............................e.......................................... 
+ trn2 v0.4S, v21.4S, v27.4S // ..............................e......................................... + sqrdmulh v1.8H, v15.8H, v11.H[1] // .................................................................*...... + mul v16.8H, v15.8H, v11.H[0] // ..................................................................*..... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v28.2D, v25.2D, v0.2D // ..................................e..................................... + trn2 v25.2D, v25.2D, v0.2D // ................................e....................................... + trn1 v8.2D, v19.2D, v13.2D // .................................e...................................... + trn2 v15.2D, v19.2D, v13.2D // ...............................e........................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mls v16.8H, v1.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v14.8H, v15.8H, v25.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v0.8H, v8.8H, v28.8H // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v0.8H, v7.H[1] // ..............................................e......................... + str q16, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v16.8H, v14.8H, v7.H[1] // ....................................................e................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v16.8H, v16.8H, #11 // .....................................................e.................. + srshr v27.8H, v27.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v14.8H, v16.8H, v7.H[0] // ......................................................e................. + mls v0.8H, v27.8H, v7.H[0] // ................................................e....................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v26.8H, v6.8H // ................................................................*....... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q16, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v0.8H, v14.8H // ...........................................................e............ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + str q16, [x1], #(64) // ....................................................................e... + sub v16.8H, v0.8H, v14.8H // ..........................................................e............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + + // --------------------------------------------------------------- new position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................~.................................................................... + // ldr q0, [x4], #(6*16) // ..........................e.............................................'.........................~.......................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............................e........................................'..............................~..................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ....e...................................................................'...~................................................................ 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // .....e..................................................................'....~............................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .......e................................................................'......~............................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ..e.....................................................................'.~.................................................................. + // sub v24.8h, v8.8h, v9.8h // ............e...........................................................'...........~........................................................ + // add v8.8h, v8.8h, v9.8h // .............e..........................................................'............~....................................................... + // sqrdmulh v27.8h, v24.8h, v5.8h // .....................e..................................................'....................~............................................... + // mul v9.8h, v24.8h, v1.8h // ....................e...................................................'...................~................................................ + // mls v9.8h, v27.8h, v7.h[0] // ................................e.......................................'...............................~.................................... + // sub v24.8h, v10.8h, v11.8h // ..............e.........................................................'.............~...................................................... + // add v10.8h, v10.8h, v11.8h // ...............e........................................................'..............~..................................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // .......................e................................................'......................~............................................. 
+ // mul v11.8h, v24.8h, v2.8h // ......................e.................................................'.....................~.............................................. + // mls v11.8h, v27.8h, v7.h[0] // .............................e..........................................'............................~....................................... + // sub v24.8h, v8.8h, v10.8h // ..............................e.........................................'.............................~...................................... + // add v8.8h, v8.8h, v10.8h // ........................e...............................................'.......................~............................................ + // sqrdmulh v27.8h, v24.8h, v4.8h // .......................................e................................'......................................~............................. + // mul v10.8h, v24.8h, v0.8h // ......................................e.................................'.....................................~.............................. + // mls v10.8h, v27.8h, v7.h[0] // ..............................................e.........................'.............................................~...................... + // sub v24.8h, v9.8h, v11.8h // ....................................e...................................'...................................~................................ + // add v9.8h, v9.8h, v11.8h // .....................................e..................................'....................................~............................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ...........................................e............................'..........................................~......................... + // mul v11.8h, v24.8h, v0.8h // ..........................................e.............................'.........................................~.......................... 
+ // mls v11.8h, v27.8h, v7.h[0] // ...............................................e........................'..............................................~..................... + // trn1 v25.4s, v8.4s, v9.4s // .........................................e..............................'........................................~........................... + // trn2 v26.4s, v8.4s, v9.4s // ........................................e...............................'.......................................~............................ + // trn1 v27.4s, v10.4s, v11.4s // .................................................e......................'................................................~................... + // trn2 v28.4s, v10.4s, v11.4s // ..................................................e.....................'.................................................~.................. + // trn2 v10.2d, v25.2d, v27.2d // ........................................................e...............'.......................................................~............ + // trn2 v11.2d, v26.2d, v28.2d // ......................................................e.................'.....................................................~.............. + // trn1 v8.2d, v25.2d, v27.2d // .......................................................e................'......................................................~............. + // trn1 v9.2d, v26.2d, v28.2d // .....................................................e..................'....................................................~............... + // ldr q0, [x3], #16 // ...~....................................................................'..*................................................................. + // sub v24.8h, v8.8h, v9.8h // ......~.................................................................'.....*.............................................................. 
+ // add v8.8h, v8.8h, v9.8h // ...........................................................e............'..........................................................~......... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...........~............................................................'..........*......................................................... + // mul v9.8h, v24.8h, v0.h[2] // ..........~.............................................................'.........*.......................................................... + // mls v9.8h, v27.8h, v7.h[0] // ................~.......................................................'...............*.................................................... + // sub v24.8h, v10.8h, v11.8h // .~......................................................................'*................................................................... + // add v10.8h, v10.8h, v11.8h // ..........................................................e.............'.........................................................~.......... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .........~..............................................................'........*........................................................... + // mul v11.8h, v24.8h, v0.h[4] // ........~...............................................................'.......*............................................................ + // mls v11.8h, v27.8h, v7.h[0] // ...................~....................................................'..................*................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ............................................................e...........'...........................................................~........ + // srshr v25.8h, v25.8h, #11 // ................................................................e.......'...............................................................~.... 
+ // mls v8.8h, v25.8h, v7.h[0] // ..................................................................e.....'.................................................................~.. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ...........................~............................................'..........................*......................................... + // srshr v25.8h, v25.8h, #11 // ...................................~....................................'..................................*................................. + // mls v9.8h, v25.8h, v7.h[0] // .............................................~..........................'............................................*....................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..............................................................e.........'.............................................................~...... + // srshr v25.8h, v25.8h, #11 // ...............................................................e........'..............................................................~..... + // mls v10.8h, v25.8h, v7.h[0] // .................................................................e......'................................................................~... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ............................~...........................................'...........................*........................................ + // srshr v25.8h, v25.8h, #11 // ..................................~.....................................'.................................*.................................. + // mls v11.8h, v25.8h, v7.h[0] // ............................................~...........................'...........................................*........................ + // sub v24.8h, v8.8h, v10.8h // .......................................................................e'.................................................................... 
+ // add v8.8h, v8.8h, v10.8h // .....................................................................e..'.................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................~......................................................'................*................................................... + // mul v10.8h, v24.8h, v0.h[0] // ..................~.....................................................'.................*.................................................. + // mls v10.8h, v27.8h, v7.h[0] // .........................~..............................................'........................*........................................... + // sub v24.8h, v9.8h, v11.8h // ................................................~.......................'...............................................*.................... + // add v9.8h, v9.8h, v11.8h // ...................................................................~....'..................................................................*. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...................................................~....................'..................................................*................. + // mul v11.8h, v24.8h, v0.h[0] // ....................................................~...................'...................................................*................ + // mls v11.8h, v27.8h, v7.h[0] // .........................................................~..............'........................................................*........... + // str q8, [x1], #(64) // ......................................................................e.'.................................................................... 
+ // str q9, [x1, #(-64 + 16*1)] // ....................................................................~...'...................................................................* + // str q10, [x1, #(-64 + 16*2)] // .................................~......................................'................................*................................... + // str q11, [x1, #(-64 + 16*3)] // .............................................................~..........'............................................................*....... sub count, count, #1 cbnz count, layer4567_start - mls v31.8H, v10.8H, v7.H[0] // ..*................... - sub v4.8H, v26.8H, v5.8H // ...*.................. - mul v9.8H, v0.8H, v13.H[0] // .*.................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v25.8H, v0.8H, v13.H[1] // *..................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mul v28.8H, v4.8H, v13.H[4] // ......*............... - sqrdmulh v16.8H, v4.8H, v13.H[5] // .......*.............. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqdmulh v5.8H, v31.8H, v7.H[1] // .....*................ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v9.8H, v25.8H, v7.H[0] // ....*................. - // gap // ...................... - // gap // ...................... 
- // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v28.8H, v16.8H, v7.H[0] // .........*............ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - srshr v21.8H, v5.8H, #11 // ........*............. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - str q9, [x1, #-32] // ..................*... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqdmulh v4.8H, v28.8H, v7.H[1] // ...........*.......... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v31.8H, v21.8H, v7.H[0] // ..........*........... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... 
- srshr v14.8H, v4.8H, #11 // ............*......... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v28.8H, v14.8H, v7.H[0] // .............*........ - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sub v8.8H, v31.8H, v28.8H // ...............*...... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... 
- // gap // ...................... - add v26.8H, v31.8H, v28.8H // ..............*....... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqrdmulh v20.8H, v8.8H, v13.H[1] // .................*.... - mul v13.8H, v8.8H, v13.H[0] // ...................*.. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - str q26, [x1, #-48] // ................*..... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v13.8H, v20.8H, v7.H[0] // ....................*. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... 
- // gap // ...................... - // gap // ...................... - // gap // ...................... - str q13, [x1, #-16] // .....................* - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - - // original source code - // sqrdmulh v25.8H, v0.8H, v13.H[1] // ...*.................. - // mul v19.8H, v0.8H, v13.H[0] // ..*................... - // mls v31.8H, v10.8H, v7.H[0] // *..................... - // sub v8.8H, v26.8H, v5.8H // .*.................... - // mls v19.8H, v25.8H, v7.H[0] // .......*.............. - // sqdmulh v26.8H, v31.8H, v7.H[1] // ......*............... - // mul v3.8H, v8.8H, v13.H[4] // ....*................. - // sqrdmulh v30.8H, v8.8H, v13.H[5] // .....*................ - // srshr v1.8H, v26.8H, #11 // .........*............ - // mls v3.8H, v30.8H, v7.H[0] // ........*............. - // mls v31.8H, v1.8H, v7.H[0] // ............*......... - // sqdmulh v17.8H, v3.8H, v7.H[1] // ...........*.......... - // srshr v23.8H, v17.8H, #11 // .............*........ - // mls v3.8H, v23.8H, v7.H[0] // ..............*....... - // add v23.8H, v31.8H, v3.8H // ................*..... - // sub v8.8H, v31.8H, v3.8H // ...............*...... - // str q23, [x1, #-48] // ...................*.. - // sqrdmulh v23.8H, v8.8H, v13.H[1] // .................*.... - // str q19, [x1, #-32] // ..........*........... - // mul v22.8H, v8.8H, v13.H[0] // ..................*... - // mls v22.8H, v23.8H, v7.H[0] // ....................*. 
- // str q22, [x1, #-16] // .....................* + // Instructions: 26 + // Expected cycles: 28 + // Expected IPC: 0.93 + // + // Cycle bound: 28.0 + // IPC bound: 0.93 + // + // Wall time: 0.27s + // User time: 0.27s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v20.8H, v8.8H, v28.8H // ..*........................... + ldr q10, [x3], #16 // .*............................ + sub v2.8H, v15.8H, v25.8H // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v25.8H, v2.8H, v10.H[4] // ...*.......................... + sqrdmulh v0.8H, v20.8H, v10.H[3] // ......*....................... + mul v22.8H, v20.8H, v10.H[2] // .....*........................ + sqrdmulh v27.8H, v2.8H, v10.H[5] // ....*......................... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v12.8H, v16.8H, v10.H[1] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v25.8H, v27.8H, v7.H[0] // ..........*................... + mls v22.8H, v0.8H, v7.H[0] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ sqdmulh v18.8H, v22.8H, v7.H[1] // ............*................. + sqdmulh v26.8H, v25.8H, v7.H[1] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + srshr v2.8H, v18.8H, #11 // ................*............. + srshr v0.8H, v26.8H, #11 // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v25.8H, v0.8H, v7.H[0] // .................*............ + mls v22.8H, v2.8H, v7.H[0] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v17.8H, v22.8H, v25.8H // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v13.8H, v22.8H, v25.8H // ........................*..... + mul v22.8H, v16.8H, v10.H[0] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ sqrdmulh v16.8H, v17.8H, v10.H[1] // ....................*......... + mul v11.8H, v17.8H, v10.H[0] // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q13, [x1, #-48] // .........................*.... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v22.8H, v12.8H, v7.H[0] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v11.8H, v16.8H, v7.H[0] // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q22, [x1, #-32] // ..............*............... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q11, [x1, #-16] // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v15.8H, v15.8H, v25.8H // ..*............................ + // ldr q11, [x3], #16 // .*............................. + // sub v21.8H, v8.8H, v28.8H // *.............................. + // mul v6.8H, v15.8H, v11.H[4] // ...*........................... + // sqrdmulh v27.8H, v15.8H, v11.H[5] // ......*........................ + // mul v26.8H, v21.8H, v11.H[2] // .....*......................... + // sqrdmulh v15.8H, v21.8H, v11.H[3] // ....*.......................... + // mls v26.8H, v15.8H, v7.H[0] // .........*..................... + // sqrdmulh v15.8H, v16.8H, v11.H[1] // .......*....................... + // mul v16.8H, v16.8H, v11.H[0] // ..................*............ + // mls v6.8H, v27.8H, v7.H[0] // ........*...................... + // mls v16.8H, v15.8H, v7.H[0] // ......................*........ + // sqdmulh v14.8H, v26.8H, v7.H[1] // ..........*.................... + // sqdmulh v27.8H, v6.8H, v7.H[1] // ...........*................... + // str q16, [x1, #-32] // ........................*...... + // srshr v16.8H, v27.8H, #11 // .............*................. + // srshr v2.8H, v14.8H, #11 // ............*.................. + // mls v6.8H, v16.8H, v7.H[0] // ..............*................ + // mls v26.8H, v2.8H, v7.H[0] // ...............*............... 
+ // sub v15.8H, v26.8H, v6.8H // ................*.............. + // sqrdmulh v1.8H, v15.8H, v11.H[1] // ...................*........... + // mul v16.8H, v15.8H, v11.H[0] // ....................*.......... + // mls v16.8H, v1.8H, v7.H[0] // .......................*....... + // str q16, [x1, #-16] // .........................*..... + // add v16.8H, v26.8H, v6.8H // .................*............. + // str q16, [x1, #-48] // .....................*......... // --------------------------------------------------------------------- @@ -1376,594 +1422,631 @@ layer4567_start: .p2align 2 - ldr q19, [x0, #256] // *................................................. - ldr q23, [x0, #192] // .*................................................ - ldr q22, [x0, #128] // ..*............................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q28, [x0, #320] // ...*.............................................. - ldr q27, [x0, #64] // ....*............................................. - ldr q24, [x0, #0] // .....*............................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - ldr q3, [x0, #384] // ......*........................................... - ldr q26, [x0, #448] // .......*.......................................... - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v20.8H, v22.8H, v23.8H // ........*......................................... - add v23.8H, v22.8H, v23.8H // .......................*.......................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v22.8H, v19.8H, v28.8H // ............*..................................... - add v19.8H, v19.8H, v28.8H // .........*........................................ - sub v28.8H, v24.8H, v27.8H // ...........*...................................... - add v27.8H, v24.8H, v27.8H // ..........*....................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v24.8H, v3.8H, v26.8H // .................*................................ 
- sub v3.8H, v3.8H, v26.8H // .............*.................................... - sqrdmulh v26.8H, v20.8H, v1.H[1] // ..............*................................... - mul v20.8H, v20.8H, v1.H[0] // ................*................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v11.8H, v22.8H, v1.H[3] // ....................*............................. - mul v22.8H, v22.8H, v1.H[2] // .....................*............................ - sqrdmulh v14.8H, v28.8H, v0.H[7] // ...............*.................................. - mul v25.8H, v28.8H, v0.H[6] // ......................*........................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v28.8H, v19.8H, v24.8H // .........................*........................ - add v19.8H, v19.8H, v24.8H // ........................*......................... - sqrdmulh v24.8H, v3.8H, v1.H[5] // ..................*............................... - mul v3.8H, v3.8H, v1.H[4] // ...................*.............................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v5.8H, v27.8H, v23.8H // .............................*.................... - add v23.8H, v27.8H, v23.8H // ..............................*................... - mls v20.8H, v26.8H, v7.H[0] // ..........................*....................... 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v25.8H, v14.8H, v7.H[0] // .................................*................ - mls v22.8H, v11.8H, v7.H[0] // ............................*..................... - mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. - sqrdmulh v27.8H, v28.8H, v0.H[5] // ................................*................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v3.8H, v24.8H, v7.H[0] // ...........................*...................... - sqrdmulh v24.8H, v5.8H, v0.H[3] // ..................................*............... - mul v28.8H, v5.8H, v0.H[2] // ...................................*.............. - add v26.8H, v23.8H, v19.8H // .....................................*............ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v19.8H, v23.8H, v19.8H // ........................................*......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - sub v8.8H, v25.8H, v20.8H // .......................................*.......... - add v21.8H, v25.8H, v20.8H // .........................................*........ - mls v14.8H, v27.8H, v7.H[0] // ......................................*........... - mul v12.8H, v26.8H, v29.8H // ...........................................*...... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v10.8H, v22.8H, v3.8H // ....................................*............. - sub v11.8H, v22.8H, v3.8H // ..........................................*....... - mls v28.8H, v24.8H, v7.H[0] // .............................................*.... - sqrdmulh v13.8H, v26.8H, v30.8H // ............................................*..... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v22.8H, v19.8H, v0.H[0] // ..............................................*... - sqrdmulh v15.8H, v19.8H, v0.H[1] // ...............................................*.. - mul v17.8H, v8.8H, v0.H[2] // ................................................*. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sub v4.8H, v21.8H, v10.8H // .................................................* - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - - // original source code - // ldr q16, [x0, #256] // *................................................. - // ldr q5, [x0, #192] // .*................................................ - // ldr q31, [x0, #128] // ..*............................................... - // ldr q18, [x0, #320] // ...*.............................................. - // ldr q20, [x0, #64] // ....*............................................. - // ldr q11, [x0, #0] // .....*............................................ - // ldr q8, [x0, #384] // ......*........................................... - // ldr q10, [x0, #448] // .......*.......................................... - // sub v14.8H, v31.8H, v5.8H // ........*......................................... - // add v24.8H, v16.8H, v18.8H // ...........*...................................... - // add v21.8H, v11.8H, v20.8H // .............*.................................... - // sub v20.8H, v11.8H, v20.8H // ............*..................................... - // sub v19.8H, v16.8H, v18.8H // ..........*....................................... - // sub v18.8H, v8.8H, v10.8H // ...............*.................................. - // sqrdmulh v16.8H, v14.8H, v1.H[1] // ................*................................. - // sqrdmulh v9.8H, v20.8H, v0.H[7] // ....................*............................. - // mul v13.8H, v14.8H, v1.H[0] // .................*................................ - // add v15.8H, v8.8H, v10.8H // ..............*................................... - // sqrdmulh v14.8H, v18.8H, v1.H[5] // ........................*......................... 
- // mul v12.8H, v18.8H, v1.H[4] // .........................*........................ - // sqrdmulh v10.8H, v19.8H, v1.H[3] // ..................*............................... - // mul v25.8H, v19.8H, v1.H[2] // ...................*.............................. - // mul v19.8H, v20.8H, v0.H[6] // .....................*............................ - // add v20.8H, v31.8H, v5.8H // .........*........................................ - // add v2.8H, v24.8H, v15.8H // .......................*.......................... - // sub v28.8H, v24.8H, v15.8H // ......................*........................... - // mls v13.8H, v16.8H, v7.H[0] // ............................*..................... - // mls v12.8H, v14.8H, v7.H[0] // .................................*................ - // mls v25.8H, v10.8H, v7.H[0] // ..............................*................... - // sub v18.8H, v21.8H, v20.8H // ..........................*....................... - // add v11.8H, v21.8H, v20.8H // ...........................*...................... - // mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. - // sqrdmulh v20.8H, v28.8H, v0.H[5] // ................................*................. - // mls v19.8H, v9.8H, v7.H[0] // .............................*.................... - // sqrdmulh v17.8H, v18.8H, v0.H[3] // ..................................*............... - // mul v28.8H, v18.8H, v0.H[2] // ...................................*.............. - // add v10.8H, v25.8H, v12.8H // ..........................................*....... - // add v24.8H, v11.8H, v2.8H // ....................................*............. - // mls v14.8H, v20.8H, v7.H[0] // ........................................*......... - // sub v8.8H, v19.8H, v13.8H // ......................................*........... - // sub v27.8H, v11.8H, v2.8H // .....................................*............ - // add v21.8H, v19.8H, v13.8H // .......................................*.......... 
- // sub v11.8H, v25.8H, v12.8H // ...........................................*...... - // mul v12.8H, v24.8H, v29.8H // .........................................*........ - // sqrdmulh v13.8H, v24.8H, v30.8H // .............................................*.... - // mls v28.8H, v17.8H, v7.H[0] // ............................................*..... - // mul v22.8H, v27.8H, v0.H[0] // ..............................................*... - // sqrdmulh v15.8H, v27.8H, v0.H[1] // ...............................................*.. - // mul v17.8H, v8.8H, v0.H[2] // ................................................*. - // sub v4.8H, v21.8H, v10.8H // .................................................* + // Instructions: 43 + // Expected cycles: 16 + // Expected IPC: 2.69 + // + // Cycle bound: 16.0 + // IPC bound: 2.69 + // + // Wall time: 0.84s + // User time: 0.84s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + ldr q3, [x0, #256] // ....*...................................... + ldr q6, [x0, #448] // ......*.................................... + ldr q22, [x0, #384] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q10, [x0, #320] // *.......................................... + ldr q21, [x0, #128] // ...*....................................... + ldr q19, [x0, #192] // .....*..................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ ldr q15, [x0, #64] // ..*........................................ + ldr q9, [x0, #0] // .*......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v20.8H, v22.8H, v6.8H // ............*.............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v24.8H, v3.8H, v10.8H // ..........*................................ + sub v14.8H, v21.8H, v19.8H // ...........*............................... + add v27.8H, v3.8H, v10.8H // ......................*.................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v2.8H, v9.8H, v15.8H // ........*.................................. 
+ sub v15.8H, v9.8H, v15.8H // .........*................................. + sqrdmulh v5.8H, v20.8H, v1.H[5] // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v25.8H, v24.8H, v1.H[2] // ..............*............................ + sqrdmulh v17.8H, v24.8H, v1.H[3] // ..................*........................ + mul v24.8H, v20.8H, v1.H[4] // ................*.......................... + mul v26.8H, v14.8H, v1.H[0] // ...............*........................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v11.8H, v14.8H, v1.H[1] // ....................*...................... + mul v16.8H, v15.8H, v0.H[6] // .....................*..................... + sqrdmulh v15.8H, v15.8H, v0.H[7] // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v20.8H, v22.8H, v6.8H // ...........................*............... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ add v9.8H, v21.8H, v19.8H // ...................*....................... + mls v24.8H, v5.8H, v7.H[0] // .......................*................... + mls v25.8H, v17.8H, v7.H[0] // ........................*.................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v16.8H, v15.8H, v7.H[0] // ............................*.............. + sub v15.8H, v27.8H, v20.8H // ...............................*........... + mls v26.8H, v11.8H, v7.H[0] // .........................*................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v4.8H, v27.8H, v20.8H // ................................*.......... + add v3.8H, v2.8H, v9.8H // ..................................*........ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v28.8H, v2.8H, v9.8H // ..........................*................ + mul v14.8H, v15.8H, v0.H[4] // ........................................*.. + sqrdmulh v11.8H, v15.8H, v0.H[5] // .........................................*. + sub v15.8H, v25.8H, v24.8H // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + add v6.8H, v25.8H, v24.8H // ..............................*............ + add v17.8H, v16.8H, v26.8H // ...................................*....... + sub v13.8H, v16.8H, v26.8H // .....................................*..... + sub v27.8H, v3.8H, v4.8H // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v23.8H, v28.8H, v0.H[3] // .................................*......... + mul v19.8H, v28.8H, v0.H[2] // ..........................................* + sqrdmulh v22.8H, v15.8H, v0.H[5] // ....................................*...... + mul v20.8H, v15.8H, v0.H[4] // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // ldr q24, [x0, #320] // ...*....................................... + // ldr q6, [x0, #0] // .......*................................... + // ldr q17, [x0, #64] // ......*.................................... + // ldr q10, [x0, #128] // ....*...................................... + // ldr q16, [x0, #256] // *.......................................... + // ldr q3, [x0, #192] // .....*..................................... + // ldr q4, [x0, #448] // .*......................................... + // ldr q21, [x0, #384] // ..*........................................ + // add v25.8H, v6.8H, v17.8H // ............*.............................. + // sub v6.8H, v6.8H, v17.8H // .............*............................. 
+ // sub v14.8H, v16.8H, v24.8H // .........*................................. + // sub v22.8H, v10.8H, v3.8H // ..........*................................ + // sub v19.8H, v21.8H, v4.8H // ........*.................................. + // sqrdmulh v2.8H, v6.8H, v0.H[7] // .....................*..................... + // mul v8.8H, v14.8H, v1.H[2] // ...............*........................... + // mul v9.8H, v22.8H, v1.H[0] // ..................*........................ + // mul v31.8H, v19.8H, v1.H[4] // .................*......................... + // sqrdmulh v12.8H, v19.8H, v1.H[5] // ..............*............................ + // sqrdmulh v19.8H, v14.8H, v1.H[3] // ................*.......................... + // add v17.8H, v10.8H, v3.8H // .......................*................... + // sqrdmulh v3.8H, v22.8H, v1.H[1] // ...................*....................... + // mul v10.8H, v6.8H, v0.H[6] // ....................*...................... + // add v22.8H, v16.8H, v24.8H // ...........*............................... + // mls v31.8H, v12.8H, v7.H[0] // ........................*.................. + // mls v8.8H, v19.8H, v7.H[0] // .........................*................. + // mls v9.8H, v3.8H, v7.H[0] // ............................*.............. + // sub v12.8H, v25.8H, v17.8H // ...............................*........... + // add v4.8H, v21.8H, v4.8H // ......................*.................... + // mls v10.8H, v2.8H, v7.H[0] // ..........................*................ + // sub v15.8H, v8.8H, v31.8H // ..................................*........ + // add v6.8H, v8.8H, v31.8H // ...................................*....... + // sub v31.8H, v22.8H, v4.8H // ...........................*............... + // add v4.8H, v22.8H, v4.8H // .............................*............. + // sqrdmulh v23.8H, v12.8H, v0.H[3] // .......................................*... + // add v3.8H, v25.8H, v17.8H // ..............................*............ 
+ // add v17.8H, v10.8H, v9.8H // ....................................*...... + // sqrdmulh v22.8H, v15.8H, v0.H[5] // .........................................*. + // sub v13.8H, v10.8H, v9.8H // .....................................*..... + // mul v20.8H, v15.8H, v0.H[4] // ..........................................* + // sub v27.8H, v3.8H, v4.8H // ......................................*.... + // mul v14.8H, v31.8H, v0.H[4] // ................................*.......... + // sqrdmulh v11.8H, v31.8H, v0.H[5] // .................................*......... + // mul v19.8H, v12.8H, v0.H[2] // ........................................*.. sub count, count, #1 layer123_start: - ldr q16, [x0, #272] // ....e................................................................................... + // Instructions: 88 + // Expected cycles: 18 + // Expected IPC: 4.89 + // + // Cycle bound: 18.0 + // IPC bound: 4.89 + // + // Wall time: 101.58s + // User time: 101.58s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + mul v12.8H, v13.8H, v0.H[2] // ....................................*................................................... + sub v9.8H, v17.8H, v6.8H // .....................................................*.................................. + add v28.8H, v17.8H, v6.8H // ......................................................*................................. + ldr q24, [x0, #336] // .....e.................................................................................. // gap // ........................................................................................ - mul v25.8H, v11.8H, v0.H[4] // .............................................*.......................................... - sqrdmulh v3.8H, v11.8H, v0.H[5] // ..............................................*......................................... 
- ldr q5, [x0, #208] // ...e.................................................................................... - sqrdmulh v19.8H, v8.8H, v0.H[3] // ....................................*................................................... - ldr q31, [x0, #144] // ..e..................................................................................... - add v27.8H, v21.8H, v10.8H // ......................................................*................................. - mul v9.8H, v4.8H, v0.H[0] // .......................................................*................................ - sqrdmulh v24.8H, v4.8H, v0.H[1] // ........................................................*............................... - add v23.8H, v28.8H, v14.8H // ...........................................................*............................ - ldr q18, [x0, #336] // .....e.................................................................................. - ldr q20, [x0, #80] // .e...................................................................................... - ldr q11, [x0, #16] // e....................................................................................... + ldr q6, [x0, #16] // e....................................................................................... + ldr q17, [x0, #80] // .e...................................................................................... + sqrdmulh v13.8H, v13.8H, v0.H[3] // ...................................*.................................................... + add v26.8H, v3.8H, v4.8H // .................................................*...................................... // gap // ........................................................................................ - mls v12.8H, v13.8H, v7.H[0] // ..........................................................................*............. - sub v4.8H, v28.8H, v14.8H // ..........................................................*............................. 
- mls v22.8H, v15.8H, v7.H[0] // ....................................................*................................... - ldr q8, [x0, #400] // ......e................................................................................. + ldr q10, [x0, #144] // ..e..................................................................................... + mls v20.8H, v22.8H, v7.H[0] // ...............................................*........................................ + ldr q16, [x0, #272] // ....e................................................................................... + ldr q3, [x0, #208] // ...e.................................................................................... + sqrdmulh v2.8H, v27.8H, v0.H[1] // ..................................................*..................................... + mul v31.8H, v27.8H, v0.H[0] // ...................................................*.................................... + mls v19.8H, v23.8H, v7.H[0] // ................................*....................................................... + mls v14.8H, v11.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ + ldr q4, [x0, #464] // .......e................................................................................ + ldr q21, [x0, #400] // ......e................................................................................. // gap // ........................................................................................ - sqrdmulh v2.8H, v27.8H, v30.8H // ............................................................................*........... - ldr q10, [x0, #464] // .......e................................................................................ - mul v27.8H, v27.8H, v29.8H // ...........................................................................*............ 
+ sqrdmulh v23.8H, v9.8H, v0.H[1] // .......................................................*................................ + sqrdmulh v27.8H, v28.8H, v30.8H // ...........................................................................*............ + mul v15.8H, v26.8H, v29.8H // .........................................................................*.............. + sqrdmulh v8.8H, v26.8H, v30.8H // ........................................................................*............... + mls v12.8H, v13.8H, v7.H[0] // .....................................*.................................................. + mul v26.8H, v28.8H, v29.8H // ............................................................................*........... // gap // ........................................................................................ - mls v25.8H, v3.8H, v7.H[0] // ...............................................*........................................ - mul v3.8H, v23.8H, v29.8H // ..............................................................................*......... // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v6.8H, v23.8H, v30.8H // ...............................................................................*........ // gap // ........................................................................................ - mls v17.8H, v19.8H, v7.H[0] // .....................................*.................................................. - mls v9.8H, v24.8H, v7.H[0] // .........................................................*.............................. + mul v18.8H, v9.8H, v0.H[0] // ........................................................*............................... + add v25.8H, v6.8H, v17.8H // .........e.............................................................................. 
+ sub v6.8H, v6.8H, v17.8H // ........e............................................................................... // gap // ........................................................................................ - str q12, [x0], #(16) // ....................................................................................*... - sqrdmulh v26.8H, v4.8H, v0.H[1] // .............................................................*.......................... - sub v14.8H, v31.8H, v5.8H // .............e.......................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v4.8H, v4.8H, v0.H[0] // ............................................................*........................... - add v24.8H, v16.8H, v18.8H // ...................e.................................................................... - str q22, [x0, #240] // ....................................................................*................... - add v21.8H, v11.8H, v20.8H // .........e.............................................................................. - sub v20.8H, v11.8H, v20.8H // ........e............................................................................... // gap // ........................................................................................ + mls v31.8H, v2.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ + add v11.8H, v19.8H, v14.8H // ...........................................................*............................ + sub v17.8H, v19.8H, v14.8H // ..........................................................*............................. 
// gap // ........................................................................................ - mls v27.8H, v2.8H, v7.H[0] // .............................................................................*.......... - sub v19.8H, v16.8H, v18.8H // ..................e..................................................................... - sub v11.8H, v17.8H, v25.8H // ...............................................................*........................ - sub v18.8H, v8.8H, v10.8H // .......................e................................................................ - add v28.8H, v17.8H, v25.8H // ................................................................*....................... + sub v14.8H, v16.8H, v24.8H // ..................e..................................................................... + sub v22.8H, v10.8H, v3.8H // .............e.......................................................................... // gap // ........................................................................................ // gap // ........................................................................................ + mls v15.8H, v8.8H, v7.H[0] // ..........................................................................*............. + sub v19.8H, v21.8H, v4.8H // .......................e................................................................ + sqrdmulh v2.8H, v6.8H, v0.H[7] // ..........e............................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v16.8H, v14.8H, v1.H[1] // ................e....................................................................... - str q9, [x0, #304] // .....................................................................*.................. 
- sqrdmulh v9.8H, v20.8H, v0.H[7] // ...........e............................................................................ - mul v13.8H, v14.8H, v1.H[0] // ...............e........................................................................ - add v15.8H, v8.8H, v10.8H // ........................e............................................................... // gap // ........................................................................................ // gap // ........................................................................................ + sub v28.8H, v12.8H, v20.8H // ...............................................................*........................ // gap // ........................................................................................ - sqrdmulh v14.8H, v18.8H, v1.H[5] // ..........................e............................................................. - mul v12.8H, v18.8H, v1.H[4] // .........................e.............................................................. - sqrdmulh v10.8H, v19.8H, v1.H[3] // .....................e.................................................................. - mul v25.8H, v19.8H, v1.H[2] // ....................e................................................................... + sqrdmulh v13.8H, v17.8H, v0.H[1] // ............................................................*........................... + mul v5.8H, v17.8H, v0.H[0] // .............................................................*.......................... + mul v8.8H, v14.8H, v1.H[2] // .....................e.................................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ 
+ mul v9.8H, v22.8H, v1.H[0] // ................e....................................................................... // gap // ........................................................................................ + str q31, [x0, #256] // ....................................................................*................... + add v20.8H, v12.8H, v20.8H // ................................................................*....................... // gap // ........................................................................................ - mul v23.8H, v28.8H, v29.8H // .................................................................................*...... - mul v19.8H, v20.8H, v0.H[6] // ..........e............................................................................. - sqrdmulh v22.8H, v11.8H, v0.H[1] // ..................................................................*..................... + mul v31.8H, v19.8H, v1.H[4] // ..........................e............................................................. + sqrdmulh v12.8H, v19.8H, v1.H[5] // .........................e.............................................................. + sqrdmulh v19.8H, v14.8H, v1.H[3] // ....................e................................................................... // gap // ........................................................................................ + mul v14.8H, v28.8H, v0.H[0] // ..................................................................*..................... + add v17.8H, v10.8H, v3.8H // ..............e......................................................................... + str q15, [x0], #(16) // ....................................................................................*... + sqrdmulh v3.8H, v22.8H, v1.H[1] // ...............e........................................................................ 
+ mul v10.8H, v6.8H, v0.H[6] // ...........e............................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - add v20.8H, v31.8H, v5.8H // ..............e......................................................................... - add v2.8H, v24.8H, v15.8H // .......................................e................................................ - sqrdmulh v31.8H, v28.8H, v30.8H // ..................................................................................*..... - sub v28.8H, v24.8H, v15.8H // ......................................e................................................. // gap // ........................................................................................ + add v22.8H, v16.8H, v24.8H // ...................e.................................................................... + sqrdmulh v6.8H, v28.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v13.8H, v16.8H, v7.H[0] // .................e...................................................................... - mls v12.8H, v14.8H, v7.H[0] // ...........................e............................................................ - mls v25.8H, v10.8H, v7.H[0] // ......................e................................................................. - sub v18.8H, v21.8H, v20.8H // ............................e........................................................... 
// gap // ........................................................................................ + mul v16.8H, v11.8H, v29.8H // ...............................................................................*........ + mul v28.8H, v20.8H, v29.8H // ..................................................................................*..... + sqrdmulh v15.8H, v20.8H, v30.8H // .................................................................................*...... + sqrdmulh v24.8H, v11.8H, v30.8H // ..............................................................................*......... + mls v31.8H, v12.8H, v7.H[0] // ...........................e............................................................ + mls v8.8H, v19.8H, v7.H[0] // ......................e................................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v5.8H, v11.8H, v0.H[0] // .................................................................*...................... - add v11.8H, v21.8H, v20.8H // .............................e.......................................................... - mul v14.8H, v28.8H, v0.H[4] // ........................................e............................................... - sqrdmulh v20.8H, v28.8H, v0.H[5] // .........................................e.............................................. // gap // ........................................................................................ + mls v9.8H, v3.8H, v7.H[0] // .................e...................................................................... + mls v18.8H, v23.8H, v7.H[0] // .........................................................*.............................. 
// gap // ........................................................................................ + sub v12.8H, v25.8H, v17.8H // ............................e........................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mls v19.8H, v9.8H, v7.H[0] // ............e........................................................................... - mls v23.8H, v31.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - sqrdmulh v17.8H, v18.8H, v0.H[3] // ...............................e........................................................ - mul v28.8H, v18.8H, v0.H[2] // ..............................e......................................................... + mls v26.8H, v27.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ + mls v14.8H, v6.8H, v7.H[0] // ...................................................................*.................... + mls v5.8H, v13.8H, v7.H[0] // ..............................................................*......................... + add v4.8H, v21.8H, v4.8H // ........................e............................................................... + mls v10.8H, v2.8H, v7.H[0] // ............e........................................................................... // gap // ........................................................................................ // gap // ........................................................................................ 
- mls v4.8H, v26.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - mls v3.8H, v6.8H, v7.H[0] // ................................................................................*....... - add v10.8H, v25.8H, v12.8H // ............................................e........................................... - add v24.8H, v11.8H, v2.8H // .................................................e...................................... // gap // ........................................................................................ + mls v16.8H, v24.8H, v7.H[0] // ................................................................................*....... + mls v28.8H, v15.8H, v7.H[0] // ...................................................................................*.... + sub v15.8H, v8.8H, v31.8H // ...........................................e............................................ + add v6.8H, v8.8H, v31.8H // ............................................e........................................... // gap // ........................................................................................ // gap // ........................................................................................ - mls v5.8H, v22.8H, v7.H[0] // ...................................................................*.................... - mls v14.8H, v20.8H, v7.H[0] // ..........................................e............................................. // gap // ........................................................................................ + str q18, [x0, #304] // .....................................................................*.................. + sub v31.8H, v22.8H, v4.8H // ......................................e................................................. 
+ add v4.8H, v22.8H, v4.8H // .......................................e................................................ // gap // ........................................................................................ - str q27, [x0, #48] // .....................................................................................*.. // gap // ........................................................................................ - sub v8.8H, v19.8H, v13.8H // .................................e...................................................... - sub v27.8H, v11.8H, v2.8H // ................................................e....................................... - add v21.8H, v19.8H, v13.8H // ..................................e..................................................... - sub v11.8H, v25.8H, v12.8H // ...........................................e............................................ - mul v12.8H, v24.8H, v29.8H // ........................................................................e............... + str q26, [x0, #48] // .....................................................................................*.. + sqrdmulh v23.8H, v12.8H, v0.H[3] // ..............................e......................................................... + add v3.8H, v25.8H, v17.8H // .............................e.......................................................... + add v17.8H, v10.8H, v9.8H // ..................................e..................................................... + str q14, [x0, #432] // .......................................................................*................ + str q5, [x0, #368] // ......................................................................*................. + sqrdmulh v22.8H, v15.8H, v0.H[5] // .............................................e.......................................... + sub v13.8H, v10.8H, v9.8H // .................................e...................................................... 
// gap // ........................................................................................ - sqrdmulh v13.8H, v24.8H, v30.8H // .........................................................................e.............. - mls v28.8H, v17.8H, v7.H[0] // ................................e....................................................... - str q4, [x0, #368] // ......................................................................*................. // gap // ........................................................................................ - str q23, [x0, #176] // .......................................................................................* - str q3, [x0, #112] // ......................................................................................*. + mul v20.8H, v15.8H, v0.H[4] // ..............................................e......................................... + sub v27.8H, v3.8H, v4.8H // ................................................e....................................... // gap // ........................................................................................ - mul v22.8H, v27.8H, v0.H[0] // ..................................................e..................................... - sqrdmulh v15.8H, v27.8H, v0.H[1] // ...................................................e.................................... - mul v17.8H, v8.8H, v0.H[2] // ...................................e.................................................... + str q28, [x0, #176] // .......................................................................................* + str q16, [x0, #112] // ......................................................................................*. + mul v14.8H, v31.8H, v0.H[4] // .........................................e.............................................. + sqrdmulh v11.8H, v31.8H, v0.H[5] // ........................................e............................................... 
// gap // ........................................................................................ - str q5, [x0, #432] // .......................................................................*................ - sub v4.8H, v21.8H, v10.8H // .....................................................e.................................. - - // original source code - // ldr q8, [x0, #0] // ............e...........................................................................|...........e.......................................................................... - // ldr q9, [x0, #(1*(512/8))] // ...........e............................................................................|..........e........................................................................... - // ldr q10, [x0, #(2*(512/8))] // .....e..................................................................................|....e................................................................................. - // ldr q11, [x0, #(3*(512/8))] // ...e....................................................................................|..e................................................................................... - // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ..........e.............................................................................|.........e............................................................................ - // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... 
- // ldr q15, [x0, #(7*(512/8))] // ..................e.....................................................................|.................e.................................................................... - // sub v24.8h, v8.8h, v9.8h // ................................e.......................................................|...............................e...................................................... - // add v8.8h, v8.8h, v9.8h // ...............................e........................................................|..............................e....................................................... - // mul v9.8h, v24.8h, v0.h[6] // ................................................e.......................................|...............................................e...................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ........................................e...............................................|.......................................e.............................................. - // mls v9.8h, v24.8h, v7.h[0] // ..............................................................e.........................|.............................................................e........................ - // sub v24.8h, v10.8h, v11.8h // ...........................e............................................................|..........................e........................................................... - // add v10.8h, v10.8h, v11.8h // ..................................................e.....................................|.................................................e.................................... - // mul v11.8h, v24.8h, v1.h[0] // .........................................e..............................................|........................................e............................................. 
- // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................................e.................................................|.....................................e................................................ - // mls v11.8h, v24.8h, v7.h[0] // ......................................................e.................................|.....................................................e................................ - // sub v24.8h, v12.8h, v13.8h // ..................................e.....................................................|.................................e.................................................... - // add v12.8h, v12.8h, v13.8h // .............................e..........................................................|............................e......................................................... - // mul v13.8h, v24.8h, v1.h[2] // ..............................................e.........................................|.............................................e........................................ - // sqrdmulh v24.8h, v24.8h, v1.h[3] // .............................................e..........................................|............................................e......................................... - // mls v13.8h, v24.8h, v7.h[0] // ........................................................e...............................|.......................................................e.............................. - // sub v24.8h, v14.8h, v15.8h // ....................................e...................................................|...................................e.................................................. - // add v14.8h, v14.8h, v15.8h // ..........................................e.............................................|.........................................e............................................ 
- // mul v15.8h, v24.8h, v1.h[4] // ............................................e...........................................|...........................................e.......................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........................................e............................................|..........................................e........................................... - // mls v15.8h, v24.8h, v7.h[0] // .......................................................e................................|......................................................e............................... - // sub v24.8h, v8.8h, v10.8h // .........................................................e..............................|........................................................e............................. - // add v8.8h, v8.8h, v10.8h // ...........................................................e............................|..........................................................e........................... - // mul v10.8h, v24.8h, v0.h[2] // .................................................................e......................|................................................................e..................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................................................................e.......................|...............................................................e...................... - // mls v10.8h, v24.8h, v7.h[0] // ...............................................................................e........|..............................................................................e....... - // sub v24.8h, v9.8h, v11.8h // .........................................................................e..............|........................................................................e............. 
- // add v9.8h, v9.8h, v11.8h // ...........................................................................e............|..........................................................................e........... - // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................e..|....................................................................................e. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....*...................................................................................|...*.................................................................................. - // mls v11.8h, v24.8h, v7.h[0] // .......................*................................................................|......................*............................................................... - // sub v24.8h, v12.8h, v14.8h // .....................................................e..................................|....................................................e................................. - // add v12.8h, v12.8h, v14.8h // ...................................................e....................................|..................................................e................................... - // mul v14.8h, v24.8h, v0.h[4] // ............................................................e...........................|...........................................................e.......................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .............................................................e..........................|............................................................e......................... - // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e................|......................................................................e............... 
- // sub v24.8h, v13.8h, v15.8h // ............................................................................e...........|...........................................................................e.......... - // add v13.8h, v13.8h, v15.8h // ....................................................................e...................|...................................................................e.................. - // mul v15.8h, v24.8h, v0.h[4] // .*......................................................................................|*..................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..*.....................................................................................|.*.................................................................................... - // mls v15.8h, v24.8h, v7.h[0] // ....................*...................................................................|...................*.................................................................. - // sub v24.8h, v8.8h, v12.8h // ..........................................................................e.............|.........................................................................e............ - // add v8.8h, v8.8h, v12.8h // .....................................................................e..................|....................................................................e................. - // mul v12.8h, v24.8h, v0.h[0] // ...................................................................................e....|..................................................................................e... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................................e...|...................................................................................e.. 
- // mls v12.8h, v24.8h, v7.h[0] // ...............*........................................................................|..............*....................................................................... - // sub v24.8h, v9.8h, v13.8h // .......................................................................................e|...................................................................................... - // add v9.8h, v9.8h, v13.8h // ......*.................................................................................|.....*................................................................................ - // mul v13.8h, v24.8h, v0.h[0] // .......*................................................................................|......*............................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........*...............................................................................|.......*.............................................................................. - // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. - // sub v24.8h, v10.8h, v14.8h // ..............*.........................................................................|.............*........................................................................ - // add v10.8h, v10.8h, v14.8h // .........*..............................................................................|........*............................................................................. - // mul v14.8h, v24.8h, v0.h[0] // ............................*...........................................................|...........................*.......................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................*.............................................................|.........................*............................................................ - // mls v14.8h, v24.8h, v7.h[0] // ..................................................................*.....................|.................................................................*.................... - // sub v24.8h, v11.8h, v15.8h // ...................................*....................................................|..................................*................................................... - // add v11.8h, v11.8h, v15.8h // .....................................*..................................................|....................................*................................................. - // mul v15.8h, v24.8h, v0.h[0] // ..........................................................*.............................|.........................................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................*......................................|................................................*..................................... - // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ - // str q12, [x0, #(4*(512/8))] // ..............................*.........................................................|.............................*........................................................ - // str q13, [x0, #(5*(512/8))] // .......................................*................................................|......................................*............................................... 
- // str q14, [x0, #(6*(512/8))] // ................................................................................*.......|...............................................................................*...... - // str q15, [x0, #(7*(512/8))] // ......................................................................................*.|.....................................................................................* - // mul v12.8h, v8.8h, v29.8h // .............................................................................e..........|............................................................................e......... - // sqrdmulh v8.8h, v8.8h, v30.8h // ..............................................................................e.........|.............................................................................e........ - // mls v12.8h, v8.8h, v7.h[0] // .............*..........................................................................|............*......................................................................... - // mul v13.8h, v9.8h, v29.8h // ...................*....................................................................|..................*................................................................... - // sqrdmulh v9.8h, v9.8h, v30.8h // .................*......................................................................|................*..................................................................... - // mls v13.8h, v9.8h, v7.h[0] // .................................*......................................................|................................*..................................................... - // mul v14.8h, v10.8h, v29.8h // .....................*..................................................................|....................*................................................................. 
- // sqrdmulh v10.8h, v10.8h, v30.8h // ......................*.................................................................|.....................*................................................................ - // mls v14.8h, v10.8h, v7.h[0] // ...................................................................*....................|..................................................................*................... - // mul v15.8h, v11.8h, v29.8h // ...............................................*........................................|..............................................*....................................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................................*...................................|...................................................*.................................. - // mls v15.8h, v11.8h, v7.h[0] // ...............................................................*........................|..............................................................*....................... - // str q12, [x0], #(16) // .........................*..............................................................|........................*............................................................. - // str q13, [x0, #(-16 + 1*(512/8))] // ........................................................................*...............|.......................................................................*.............. - // str q14, [x0, #(-16 + 2*(512/8))] // ..................................................................................*.....|.................................................................................*.... - // str q15, [x0, #(-16 + 3*(512/8))] // .................................................................................*......|................................................................................*..... 
+ mul v19.8H, v12.8H, v0.H[2] // ...............................e........................................................ + + // ----------------------------------------------------------------------------- new position ------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q8, [x0, #0] // .e...................................................................................'...~................................................................................ + // ldr q9, [x0, #(1*(512/8))] // ..e..................................................................................'....~............................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e...............................................................................'.......~............................................................................ + // ldr q11, [x0, #(3*(512/8))] // ........e............................................................................'..........~......................................................................... + // ldr q12, [x0, #(4*(512/8))] // .......e.............................................................................'.........~.......................................................................... + // ldr q13, [x0, #(5*(512/8))] // e....................................................................................'..~................................................................................. + // ldr q14, [x0, #(6*(512/8))] // ..............e......................................................................'................~................................................................... 
+ // ldr q15, [x0, #(7*(512/8))] // .............e.......................................................................'...............~.................................................................... + // sub v24.8h, v8.8h, v9.8h // .......................e.............................................................'.........................~.......................................................... + // add v8.8h, v8.8h, v9.8h // ......................e..............................................................'........................~........................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ...............................e.....................................................'.................................~.................................................. + // mul v9.8h, v24.8h, v0.h[6] // ..............................................e......................................'................................................~................................... + // mls v9.8h, v27.8h, v7.h[0] // ..............................................................e......................'................................................................~................... + // sub v24.8h, v10.8h, v11.8h // ............................e........................................................'..............................~..................................................... + // add v10.8h, v10.8h, v11.8h // ...........................................e.........................................'.............................................~...................................... + // sqrdmulh v27.8h, v24.8h, v1.h[1] // .............................................e.......................................'...............................................~.................................... 
+ // mul v11.8h, v24.8h, v1.h[0] // ....................................e................................................'......................................~............................................. + // mls v11.8h, v27.8h, v7.h[0] // .......................................................e.............................'.........................................................~.......................... + // sub v24.8h, v12.8h, v13.8h // ...........................e.........................................................'.............................~...................................................... + // add v12.8h, v12.8h, v13.8h // ...............................................e.....................................'.................................................~.................................. + // sqrdmulh v27.8h, v24.8h, v1.h[3] // .........................................e...........................................'...........................................~........................................ + // mul v13.8h, v24.8h, v1.h[2] // ...................................e.................................................'.....................................~.............................................. + // mls v13.8h, v27.8h, v7.h[0] // ......................................................e..............................'........................................................~........................... + // sub v24.8h, v14.8h, v15.8h // ..............................e......................................................'................................~................................................... + // add v14.8h, v14.8h, v15.8h // .............................................................e.......................'...............................................................~.................... 
+ // sqrdmulh v27.8h, v24.8h, v1.h[5] // ........................................e............................................'..........................................~......................................... + // mul v15.8h, v24.8h, v1.h[4] // .......................................e.............................................'.........................................~.......................................... + // mls v15.8h, v27.8h, v7.h[0] // .....................................................e...............................'.......................................................~............................ + // sub v24.8h, v8.8h, v10.8h // .........................................................e...........................'...........................................................~........................ + // add v8.8h, v8.8h, v10.8h // ........................................................................e............'..........................................................................~......... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .......................................................................e.............'.........................................................................~.......... + // mul v10.8h, v24.8h, v0.h[2] // ....................................................................................e'.................................................................................... + // mls v10.8h, v27.8h, v7.h[0] // ...........~.........................................................................'.............*...................................................................... + // sub v24.8h, v9.8h, v11.8h // .............................................................................e.......'...............................................................................~.... 
+ // add v9.8h, v9.8h, v11.8h // .........................................................................e...........'...........................................................................~........ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...~.................................................................................'.....*.............................................................................. + // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................*.................................................................................... + // mls v11.8h, v27.8h, v7.h[0] // ...................~.................................................................'.....................*.............................................................. + // sub v24.8h, v12.8h, v14.8h // ....................................................................e................'......................................................................~............. + // add v12.8h, v12.8h, v14.8h // .....................................................................e...............'.......................................................................~............ + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...................................................................................e.'.................................................................................... + // mul v14.8h, v24.8h, v0.h[4] // ..................................................................................e..'.................................................................................... + // mls v14.8h, v27.8h, v7.h[0] // ............~........................................................................'..............*..................................................................... 
+ // sub v24.8h, v13.8h, v15.8h // .................................................................e...................'...................................................................~................ + // add v13.8h, v13.8h, v15.8h // ..................................................................e..................'....................................................................~............... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ............................................................................e........'..............................................................................~..... + // mul v15.8h, v24.8h, v0.h[4] // ..............................................................................e......'................................................................................~... + // mls v15.8h, v27.8h, v7.h[0] // ......~..............................................................................'........*........................................................................... + // sub v24.8h, v8.8h, v12.8h // ...............................................................................e.....'.................................................................................~.. + // add v8.8h, v8.8h, v12.8h // ....~................................................................................'......*............................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .........~...........................................................................'...........*........................................................................ + // mul v12.8h, v24.8h, v0.h[0] // ..........~..........................................................................'............*....................................................................... 
+ // mls v12.8h, v27.8h, v7.h[0] // ........................~............................................................'..........................*......................................................... + // sub v24.8h, v9.8h, v13.8h // .....................................................................................'*................................................................................... + // add v9.8h, v9.8h, v13.8h // .....................................................................................'.*.................................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...............~.....................................................................'.................*.................................................................. + // mul v13.8h, v24.8h, v0.h[0] // .....................~...............................................................'.......................*............................................................ + // mls v13.8h, v27.8h, v7.h[0] // ........................................................~............................'..........................................................*......................... + // sub v24.8h, v10.8h, v14.8h // ..........................~..........................................................'............................*....................................................... + // add v10.8h, v10.8h, v14.8h // .........................~...........................................................'...........................*........................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................................~...................................................'...................................*................................................ 
+ // mul v14.8h, v24.8h, v0.h[0] // ..................................~..................................................'....................................*............................................... + // mls v14.8h, v27.8h, v7.h[0] // ............................................................~........................'..............................................................*..................... + // sub v24.8h, v11.8h, v15.8h // ................................~....................................................'..................................*................................................. + // add v11.8h, v11.8h, v15.8h // ......................................~..............................................'........................................*........................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ................................................~....................................'..................................................*................................. + // mul v15.8h, v24.8h, v0.h[0] // ..........................................~..........................................'............................................*....................................... + // mls v15.8h, v27.8h, v7.h[0] // ...........................................................~.........................'.............................................................*...................... + // str q12, [x0, #(4*(512/8))] // .....................................~...............................................'.......................................*............................................ + // str q13, [x0, #(5*(512/8))] // ...................................................................~.................'.....................................................................*.............. 
+ // str q14, [x0, #(6*(512/8))] // ...........................................................................~.........'.............................................................................*...... + // str q15, [x0, #(7*(512/8))] // ..........................................................................~..........'............................................................................*....... + // sqrdmulh v27.8h, v8.8h, v30.8h // ..................~..................................................................'....................*............................................................... + // mul v8.8h, v8.8h, v29.8h // .................~...................................................................'...................*................................................................ + // mls v8.8h, v27.8h, v7.h[0] // .............................~.......................................................'...............................*.................................................... + // sqrdmulh v27.8h, v9.8h, v30.8h // ................~....................................................................'..................*................................................................. + // mul v9.8h, v9.8h, v29.8h // ....................~................................................................'......................*............................................................. + // mls v9.8h, v27.8h, v7.h[0] // ..........................................................~..........................'............................................................*....................... + // sqrdmulh v27.8h, v10.8h, v30.8h // ....................................................~................................'......................................................*............................. 
+ // mul v10.8h, v10.8h, v29.8h // .................................................~...................................'...................................................*................................ + // mls v10.8h, v27.8h, v7.h[0] // ...............................................................~.....................'.................................................................*.................. + // sqrdmulh v27.8h, v11.8h, v30.8h // ...................................................~.................................'.....................................................*.............................. + // mul v11.8h, v11.8h, v29.8h // ..................................................~..................................'....................................................*............................... + // mls v11.8h, v27.8h, v7.h[0] // ................................................................~....................'..................................................................*................. + // str q8, [x0], #(16) // ............................................~........................................'..............................................*..................................... + // str q9, [x0, #(-16 + 1*(512/8))] // ......................................................................~..............'........................................................................*........... + // str q10, [x0, #(-16 + 2*(512/8))] // .................................................................................~...'...................................................................................* + // str q11, [x0, #(-16 + 3*(512/8))] // ................................................................................~....'..................................................................................*. sub count, count, #1 cbnz count, layer123_start - mul v6.8H, v11.8H, v0.H[4] // *..................................... 
- sqrdmulh v27.8H, v11.8H, v0.H[5] // .*.................................... - add v11.8H, v21.8H, v10.8H // ...*.................................. - sqrdmulh v3.8H, v8.8H, v0.H[3] // ..*................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v25.8H, v4.8H, v0.H[0] // ....*................................. - sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ - add v26.8H, v28.8H, v14.8H // ......*............................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v16.8H, v28.8H, v14.8H // ........*............................. - sqrdmulh v23.8H, v11.8H, v30.8H // ..........*........................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v6.8H, v27.8H, v7.H[0] // ............*......................... - mls v17.8H, v3.8H, v7.H[0] // ...............*...................... - mul v18.8H, v26.8H, v29.8H // .............*........................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v22.8H, v15.8H, v7.H[0] // .........*............................ - sqrdmulh v26.8H, v26.8H, v30.8H // ..............*....................... 
- mul v4.8H, v11.8H, v29.8H // ...........*.......................... - mls v25.8H, v24.8H, v7.H[0] // ................*..................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v12.8H, v13.8H, v7.H[0] // .......*.............................. - sqrdmulh v9.8H, v16.8H, v0.H[1] // ..................*................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v19.8H, v17.8H, v6.8H // ......................*............... - add v21.8H, v17.8H, v6.8H // .......................*.............. - mul v3.8H, v16.8H, v0.H[0] // ...................*.................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q22, [x0, #256] // ....................*................. - mls v18.8H, v26.8H, v7.H[0] // ...............................*...... - mls v4.8H, v23.8H, v7.H[0] // .....................*................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v11.8H, v19.8H, v0.H[0] // ............................*......... - sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........................*........... - mul v23.8H, v21.8H, v29.8H // .........................*............ 
- sqrdmulh v10.8H, v21.8H, v30.8H // ...........................*.......... - str q25, [x0, #320] // ........................*............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v3.8H, v9.8H, v7.H[0] // ..............................*....... - str q12, [x0], #(16) // .................*.................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q4, [x0, #48] // .................................*.... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q18, [x0, #112] // ....................................*. - mls v11.8H, v19.8H, v7.H[0] // ................................*..... - mls v23.8H, v10.8H, v7.H[0] // .............................*........ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q3, [x0, #368] // ..................................*... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q11, [x0, #432] // .....................................* - str q23, [x0, #176] // ...................................*.. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - - // original source code - // mul v25.8H, v11.8H, v0.H[4] // *..................................... - // sqrdmulh v3.8H, v11.8H, v0.H[5] // .*.................................... - // sqrdmulh v19.8H, v8.8H, v0.H[3] // ...*.................................. - // add v27.8H, v21.8H, v10.8H // ..*................................... - // mul v9.8H, v4.8H, v0.H[0] // ....*................................. - // sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ - // add v23.8H, v28.8H, v14.8H // ......*............................... - // mls v12.8H, v13.8H, v7.H[0] // ................*..................... - // sub v4.8H, v28.8H, v14.8H // .......*.............................. - // mls v22.8H, v15.8H, v7.H[0] // ............*......................... - // sqrdmulh v2.8H, v27.8H, v30.8H // ........*............................. - // mul v27.8H, v27.8H, v29.8H // ..............*....................... - // mls v25.8H, v3.8H, v7.H[0] // .........*............................ - // mul v3.8H, v23.8H, v29.8H // ...........*.......................... 
- // sqrdmulh v6.8H, v23.8H, v30.8H // .............*........................ - // mls v17.8H, v19.8H, v7.H[0] // ..........*........................... - // mls v9.8H, v24.8H, v7.H[0] // ...............*...................... - // str q12, [x0], #(16) // ..............................*....... - // sqrdmulh v26.8H, v4.8H, v0.H[1] // .................*.................... - // mul v4.8H, v4.8H, v0.H[0] // ....................*................. - // str q22, [x0, #240] // .....................*................ - // mls v27.8H, v2.8H, v7.H[0] // .......................*.............. - // sub v11.8H, v17.8H, v25.8H // ..................*................... - // add v28.8H, v17.8H, v25.8H // ...................*.................. - // str q9, [x0, #304] // ............................*......... - // mul v23.8H, v28.8H, v29.8H // ..........................*........... - // sqrdmulh v22.8H, v11.8H, v0.H[1] // .........................*............ - // sqrdmulh v31.8H, v28.8H, v30.8H // ...........................*.......... - // mul v5.8H, v11.8H, v0.H[0] // ........................*............. - // mls v23.8H, v31.8H, v7.H[0] // ..................................*... - // mls v4.8H, v26.8H, v7.H[0] // .............................*........ - // mls v3.8H, v6.8H, v7.H[0] // ......................*............... - // mls v5.8H, v22.8H, v7.H[0] // .................................*.... - // str q27, [x0, #48] // ...............................*...... - // str q4, [x0, #368] // ...................................*.. - // str q23, [x0, #176] // .....................................* - // str q3, [x0, #112] // ................................*..... - // str q5, [x0, #432] // ....................................*. 
+ // Instructions: 45 + // Expected cycles: 15 + // Expected IPC: 3.00 + // + // Cycle bound: 15.0 + // IPC bound: 3.00 + // + // Wall time: 0.83s + // User time: 0.83s + // + // ------------ original position -------------> + // 0 25 + // |------------------------|------------------- + add v16.8H, v17.8H, v6.8H // ..*.......................................... + sub v8.8H, v17.8H, v6.8H // .*........................................... + sqrdmulh v24.8H, v13.8H, v0.H[3] // ...*......................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mul v10.8H, v13.8H, v0.H[2] // *............................................ + mls v19.8H, v23.8H, v7.H[0] // ........*.................................... + mul v23.8H, v27.8H, v0.H[0] // .......*..................................... + mls v14.8H, v11.8H, v7.H[0] // .........*................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + add v28.8H, v3.8H, v4.8H // ....*........................................ + mul v9.8H, v8.8H, v0.H[0] // ................*............................ + sqrdmulh v6.8H, v16.8H, v30.8H // ...........*................................. + mul v5.8H, v16.8H, v29.8H // ...............*............................. + sqrdmulh v16.8H, v8.8H, v0.H[1] // ..........*.................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ mls v10.8H, v24.8H, v7.H[0] // ..............*.............................. + sqrdmulh v25.8H, v28.8H, v30.8H // .............*............................... + mls v20.8H, v22.8H, v7.H[0] // .....*....................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + add v15.8H, v19.8H, v14.8H // ..................*.......................... + sub v2.8H, v19.8H, v14.8H // ...................*......................... + mul v19.8H, v28.8H, v29.8H // ............*................................ + sqrdmulh v18.8H, v27.8H, v0.H[1] // ......*...................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v5.8H, v6.8H, v7.H[0] // ..................................*.......... + mls v9.8H, v16.8H, v7.H[0] // .................................*........... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sqrdmulh v26.8H, v2.8H, v0.H[1] // ......................*...................... + mul v22.8H, v2.8H, v0.H[0] // .......................*..................... + add v24.8H, v10.8H, v20.8H // .........................*................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ // gap // ............................................. + sub v20.8H, v10.8H, v20.8H // .....................*....................... + mls v23.8H, v18.8H, v7.H[0] // .................*........................... + mls v19.8H, v25.8H, v7.H[0] // ....................*........................ + mul v27.8H, v15.8H, v29.8H // .............................*............... + sqrdmulh v18.8H, v15.8H, v30.8H // ................................*............ + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q9, [x0, #320] // .......................................*..... + mul v12.8H, v24.8H, v29.8H // ..............................*.............. + sqrdmulh v15.8H, v24.8H, v30.8H // ...............................*............. + mul v6.8H, v20.8H, v0.H[0] // ..........................*.................. + sqrdmulh v14.8H, v20.8H, v0.H[1] // ............................*................ + str q5, [x0, #64] // ........................................*.... + // gap // ............................................. + // gap // ............................................. + mls v22.8H, v26.8H, v7.H[0] // ....................................*........ + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q19, [x0], #(16) // ...........................*................. + mls v27.8H, v18.8H, v7.H[0] // .....................................*....... + str q23, [x0, #240] // ........................*.................... 
+ // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v12.8H, v15.8H, v7.H[0] // ......................................*...... + mls v6.8H, v14.8H, v7.H[0] // ...................................*......... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q22, [x0, #368] // ..........................................*.. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q27, [x0, #112] // ............................................* + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q6, [x0, #432] // .........................................*... + str q12, [x0, #176] // ...........................................*. + // gap // ............................................. 
+ // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + + // --------------- new position ---------------> + // 0 25 + // |------------------------|------------------- + // mul v12.8H, v13.8H, v0.H[2] // ...*......................................... + // sub v9.8H, v17.8H, v6.8H // .*........................................... + // add v28.8H, v17.8H, v6.8H // *............................................ + // sqrdmulh v13.8H, v13.8H, v0.H[3] // ..*.......................................... + // add v26.8H, v3.8H, v4.8H // .......*..................................... + // mls v20.8H, v22.8H, v7.H[0] // ..............*.............................. + // sqrdmulh v2.8H, v27.8H, v0.H[1] // ..................*.......................... + // mul v31.8H, v27.8H, v0.H[0] // .....*....................................... + // mls v19.8H, v23.8H, v7.H[0] // ....*........................................ + // mls v14.8H, v11.8H, v7.H[0] // ......*...................................... + // sqrdmulh v23.8H, v9.8H, v0.H[1] // ...........*................................. + // sqrdmulh v27.8H, v28.8H, v30.8H // .........*................................... + // mul v15.8H, v26.8H, v29.8H // .................*........................... + // sqrdmulh v8.8H, v26.8H, v30.8H // .............*............................... + // mls v12.8H, v13.8H, v7.H[0] // ............*................................ + // mul v26.8H, v28.8H, v29.8H // ..........*.................................. + // mul v18.8H, v9.8H, v0.H[0] // ........*.................................... + // mls v31.8H, v2.8H, v7.H[0] // .........................*................... + // add v11.8H, v19.8H, v14.8H // ...............*............................. 
+ // sub v17.8H, v19.8H, v14.8H // ................*............................ + // mls v15.8H, v8.8H, v7.H[0] // ..........................*.................. + // sub v28.8H, v12.8H, v20.8H // ........................*.................... + // sqrdmulh v13.8H, v17.8H, v0.H[1] // .....................*....................... + // mul v5.8H, v17.8H, v0.H[0] // ......................*...................... + // str q31, [x0, #256] // ......................................*...... + // add v20.8H, v12.8H, v20.8H // .......................*..................... + // mul v14.8H, v28.8H, v0.H[0] // ................................*............ + // str q15, [x0], #(16) // ....................................*........ + // sqrdmulh v6.8H, v28.8H, v0.H[1] // .................................*........... + // mul v16.8H, v11.8H, v29.8H // ...........................*................. + // mul v28.8H, v20.8H, v29.8H // ..............................*.............. + // sqrdmulh v15.8H, v20.8H, v30.8H // ...............................*............. + // sqrdmulh v24.8H, v11.8H, v30.8H // ............................*................ + // mls v18.8H, v23.8H, v7.H[0] // ....................*........................ + // mls v26.8H, v27.8H, v7.H[0] // ...................*......................... + // mls v14.8H, v6.8H, v7.H[0] // ........................................*.... + // mls v5.8H, v13.8H, v7.H[0] // ...................................*......... + // mls v16.8H, v24.8H, v7.H[0] // .....................................*....... + // mls v28.8H, v15.8H, v7.H[0] // .......................................*..... + // str q18, [x0, #304] // .............................*............... + // str q26, [x0, #48] // ..................................*.......... + // str q14, [x0, #432] // ...........................................*. + // str q5, [x0, #368] // .........................................*... 
+ // str q28, [x0, #176] // ............................................* + // str q16, [x0, #112] // ..........................................*.. pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s index 7973747d..4a402f2e 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr 
qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro 
pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,582 +339,639 @@ _intt_kyber_123_4567_manual_ld4_opt_m1_icestorm: mov count, #8 .p2align 2 - ld4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1] // *..................................................... - ldr q5, [x3], #16 // .........................................*............ - // gap // ...................................................... - // gap // ...................................................... - ldr q17, [x4, #64] // ..*................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - ldr q26, [x4, #80] // ...*.................................................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - ldr q1, [x4, #32] // .....*................................................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - ldr q15, [x4, #48] // ....*................................................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- ldr q23, [x4], #(6*16) // ........*............................................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - add v16.8H, v29.8H, v30.8H // ......*............................................... - sub v25.8H, v29.8H, v30.8H // .......*.............................................. - ldr q19, [x4, #-80] // .*.................................................... - // gap // ...................................................... - sub v21.8H, v27.8H, v28.8H // .........*............................................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mul v17.8H, v25.8H, v17.8H // ...........*.......................................... - sqrdmulh v26.8H, v25.8H, v26.8H // ..........*........................................... - // gap // ...................................................... - // gap // ...................................................... - mul v6.8H, v21.8H, v1.8H // .............*........................................ - sqrdmulh v14.8H, v21.8H, v15.8H // ............*......................................... - // gap // ...................................................... - // gap // ...................................................... - add v11.8H, v27.8H, v28.8H // ..............*....................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v17.8H, v26.8H, v7.H[0] // ...............*...................................... - // gap // ...................................................... 
- // gap // ...................................................... - // gap // ...................................................... - mls v6.8H, v14.8H, v7.H[0] // ................*..................................... - sub v30.8H, v11.8H, v16.8H // .................*.................................... - // gap // ...................................................... - // gap // ...................................................... - add v29.8H, v11.8H, v16.8H // ..................*................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mul v4.8H, v30.8H, v23.8H // ....................*................................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v15.8H, v6.8H, v17.8H // .....................*................................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v9.8H, v30.8H, v19.8H // ...................*.................................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mul v8.8H, v15.8H, v23.8H // .......................*.............................. - sqrdmulh v24.8H, v15.8H, v19.8H // ......................*............................... - // gap // ...................................................... - // gap // ...................................................... - add v26.8H, v6.8H, v17.8H // ........................*............................. 
- // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v4.8H, v9.8H, v7.H[0] // .........................*............................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v8.8H, v24.8H, v7.H[0] // ..........................*........................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - trn1 v16.4S, v29.4S, v26.4S // ...........................*.......................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - trn2 v17.4S, v29.4S, v26.4S // ............................*......................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - trn1 v14.4S, v4.4S, v8.4S // .............................*........................ - trn2 v21.4S, v4.4S, v8.4S // ..............................*....................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- trn2 v28.2D, v17.2D, v21.2D // ...............................*...................... - trn2 v31.2D, v16.2D, v14.2D // ................................*..................... - // gap // ...................................................... - // gap // ...................................................... - trn1 v27.2D, v16.2D, v14.2D // .................................*.................... - trn1 v11.2D, v17.2D, v21.2D // ..................................*................... - // gap // ...................................................... - // gap // ...................................................... - add v4.8H, v31.8H, v28.8H // ...................................*.................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - add v24.8H, v27.8H, v11.8H // ....................................*................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sqdmulh v20.8H, v4.8H, v7.H[1] // .......................................*.............. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v3.8H, v31.8H, v28.8H // .....................................*................ - sqdmulh v8.8H, v24.8H, v7.H[1] // ........................................*............. - // gap // ...................................................... - // gap // ...................................................... - sub v13.8H, v27.8H, v11.8H // ......................................*............... - // gap // ...................................................... 
- // gap // ...................................................... - // gap // ...................................................... - srshr v2.8H, v20.8H, #11 // ..........................................*........... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - srshr v19.8H, v8.8H, #11 // ...........................................*.......... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v10.8H, v13.8H, v5.H[3] // ..............................................*....... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v4.8H, v2.8H, v7.H[0] // ............................................*......... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v24.8H, v19.8H, v7.H[0] // .............................................*........ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mul v28.8H, v3.8H, v5.H[4] // ...............................................*...... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mul v0.8H, v13.8H, v5.H[2] // .................................................*.... 
- // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - add v9.8H, v24.8H, v4.8H // ..................................................*... - sub v21.8H, v24.8H, v4.8H // ...................................................*.. - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v19.8H, v3.8H, v5.H[5] // ................................................*..... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v0.8H, v10.8H, v7.H[0] // ....................................................*. - str q9, [x1], #(64) // .....................................................* - // gap // ...................................................... - // gap // ...................................................... - - // original source code - // ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // *..................................................... - // ldr q30, [x4, #16] // .........*............................................ - // ldr q1, [x4, #64] // ..*................................................... - // ldr q14, [x4, #80] // ...*.................................................. - // ldr q8, [x4, #48] // .....*................................................ - // ldr q13, [x4, #32] // ....*................................................. - // add v22.8H, v17.8H, v18.8H // .......*.............................................. - // sub v6.8H, v17.8H, v18.8H // ........*............................................. - // ldr q29, [x4], #(6*16) // ......*............................................... - // sub v26.8H, v15.8H, v16.8H // ..........*........................................... 
- // sqrdmulh v31.8H, v6.8H, v14.8H // ............*......................................... - // mul v24.8H, v6.8H, v1.8H // ...........*.......................................... - // sqrdmulh v20.8H, v26.8H, v8.8H // ..............*....................................... - // mul v26.8H, v26.8H, v13.8H // .............*........................................ - // add v8.8H, v15.8H, v16.8H // ...............*...................................... - // mls v24.8H, v31.8H, v7.H[0] // ................*..................................... - // mls v26.8H, v20.8H, v7.H[0] // .................*.................................... - // sub v27.8H, v8.8H, v22.8H // ..................*................................... - // add v3.8H, v8.8H, v22.8H // ...................*.................................. - // sqrdmulh v19.8H, v27.8H, v30.8H // ......................*............................... - // mul v16.8H, v27.8H, v29.8H // ....................*................................. - // sub v2.8H, v26.8H, v24.8H // .....................*................................ - // sqrdmulh v14.8H, v2.8H, v30.8H // ........................*............................. - // mul v22.8H, v2.8H, v29.8H // .......................*.............................. - // add v2.8H, v26.8H, v24.8H // .........................*............................ - // mls v16.8H, v19.8H, v7.H[0] // ..........................*........................... - // mls v22.8H, v14.8H, v7.H[0] // ...........................*.......................... - // trn1 v23.4S, v3.4S, v2.4S // ............................*......................... - // trn2 v19.4S, v3.4S, v2.4S // .............................*........................ - // trn1 v20.4S, v16.4S, v22.4S // ..............................*....................... - // trn2 v13.4S, v16.4S, v22.4S // ...............................*...................... - // trn2 v27.2D, v19.2D, v13.2D // ................................*..................... 
- // trn2 v22.2D, v23.2D, v20.2D // .................................*.................... - // trn1 v26.2D, v23.2D, v20.2D // ..................................*................... - // trn1 v24.2D, v19.2D, v13.2D // ...................................*.................. - // add v23.8H, v22.8H, v27.8H // ....................................*................. - // add v10.8H, v26.8H, v24.8H // .....................................*................ - // sub v0.8H, v22.8H, v27.8H // .......................................*.............. - // sub v20.8H, v26.8H, v24.8H // .........................................*............ - // sqdmulh v28.8H, v23.8H, v7.H[1] // ......................................*............... - // sqdmulh v22.8H, v10.8H, v7.H[1] // ........................................*............. - // ldr q5, [x3], #16 // .*.................................................... - // srshr v28.8H, v28.8H, #11 // ..........................................*........... - // srshr v22.8H, v22.8H, #11 // ...........................................*.......... - // mls v23.8H, v28.8H, v7.H[0] // .............................................*........ - // mls v10.8H, v22.8H, v7.H[0] // ..............................................*....... - // sqrdmulh v30.8H, v20.8H, v5.H[3] // ............................................*......... - // mul v28.8H, v0.8H, v5.H[4] // ...............................................*...... - // sqrdmulh v19.8H, v0.8H, v5.H[5] // ...................................................*.. - // mul v0.8H, v20.8H, v5.H[2] // ................................................*..... - // add v25.8H, v10.8H, v23.8H // .................................................*.... - // sub v21.8H, v10.8H, v23.8H // ..................................................*... - // mls v0.8H, v30.8H, v7.H[0] // ....................................................*. 
- // str q25, [x1], #(64) // .....................................................* + // Instructions: 53 + // Expected cycles: 42 + // Expected IPC: 1.26 + // + // Cycle bound: 42.0 + // IPC bound: 1.26 + // + // Wall time: 1.43s + // User time: 1.43s + // + // ---------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // *.................................................... + ldr q6, [x4, #32] // ..*.................................................. + ldr q16, [x4, #64] // .*................................................... + // gap // ..................................................... + ldr q22, [x4, #80] // ....*................................................ + ldr q8, [x4], #(6*16) // ...*................................................. + // gap // ..................................................... + // gap // ..................................................... + ldr q31, [x4, #-48] // .....*............................................... + ldr q3, [x4, #-80] // ......*.............................................. + // gap // ..................................................... + // gap // ..................................................... + ldr q0, [x3], #16 // ........................*............................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + sub v12.8H, v18.8H, v19.8H // .......*............................................. + add v18.8H, v18.8H, v19.8H // ........*............................................ + // gap // ..................................................... + // gap // ..................................................... + sub v19.8H, v20.8H, v21.8H // .........*........................................... + add v20.8H, v20.8H, v21.8H // ..............*...................................... + // gap // ..................................................... + // gap // ..................................................... + mul v6.8H, v12.8H, v6.8H // ...........*......................................... + sqrdmulh v31.8H, v12.8H, v31.8H // ..........*.......................................... + // gap // ..................................................... + // gap // ..................................................... + mul v16.8H, v19.8H, v16.8H // ............*........................................ + sqrdmulh v22.8H, v19.8H, v22.8H // .............*....................................... + // gap // ..................................................... + // gap // ..................................................... + sub v19.8H, v18.8H, v20.8H // .................*................................... + add v18.8H, v18.8H, v20.8H // ..................*.................................. + // gap // ..................................................... + // gap // ..................................................... + mls v6.8H, v31.8H, v7.H[0] // ...............*..................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ mls v16.8H, v22.8H, v7.H[0] // ................*.................................... + sqrdmulh v22.8H, v19.8H, v3.8H // ....................*................................ + // gap // ..................................................... + // gap // ..................................................... + mul v19.8H, v19.8H, v8.8H // .....................*............................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sub v20.8H, v6.8H, v16.8H // ...................*................................. + add v6.8H, v6.8H, v16.8H // .........................*........................... + // gap // ..................................................... + // gap // ..................................................... + mls v19.8H, v22.8H, v7.H[0] // ...........................*......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v16.8H, v20.8H, v8.8H // ......................*.............................. + sqrdmulh v22.8H, v20.8H, v3.8H // .......................*............................. + // gap // ..................................................... + // gap // ..................................................... + trn1 v20.4S, v18.4S, v6.4S // ............................*........................ + trn2 v6.4S, v18.4S, v6.4S // .............................*....................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v16.8H, v22.8H, v7.H[0] // ..........................*.......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn1 v18.4S, v19.4S, v16.4S // ..............................*...................... + trn2 v16.4S, v19.4S, v16.4S // ...............................*..................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + trn2 v19.2D, v6.2D, v16.2D // .................................*................... + trn2 v22.2D, v20.2D, v18.2D // ................................*.................... + // gap // ..................................................... 
+ // gap // ..................................................... + trn1 v18.2D, v20.2D, v18.2D // ..................................*.................. + trn1 v6.2D, v6.2D, v16.2D // ...................................*................. + // gap // ..................................................... + // gap // ..................................................... + add v16.8H, v22.8H, v19.8H // ....................................*................ + sub v1.8H, v22.8H, v19.8H // ........................................*............ + // gap // ..................................................... + // gap // ..................................................... + sub v22.8H, v18.8H, v6.8H // .............................................*....... + add v6.8H, v18.8H, v6.8H // .....................................*............... + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v18.8H, v16.8H, v7.H[1] // ......................................*.............. + mul v30.8H, v1.8H, v0.H[4] // ...........................................*......... + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v19.8H, v6.8H, v7.H[1] // .......................................*............. + mul v17.8H, v22.8H, v0.H[2] // ...............................................*..... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v22.8H, v22.8H, v0.H[3] // ................................................*.... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + srshr v18.8H, v18.8H, #11 // .........................................*........... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + srshr v19.8H, v19.8H, #11 // ..........................................*.......... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v17.8H, v22.8H, v7.H[0] // ...................................................*. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v16.8H, v18.8H, v7.H[0] // ............................................*........ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v6.8H, v19.8H, v7.H[0] // ..............................................*...... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + add v18.8H, v6.8H, v16.8H // .................................................*... 
+ sub v28.8H, v6.8H, v16.8H // ..................................................*.. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + str q18, [x1], #(64) // ....................................................* + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + + // ------------------- new position -------------------> + // 0 25 50 + // |------------------------|------------------------|-- + // ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1] // *.................................................... + // ldr q10, [x4, #64] // ..*.................................................. + // ldr q4, [x4, #32] // .*................................................... + // ldr q2, [x4], #(6*16) // ....*................................................ + // ldr q29, [x4, #-16] // ...*................................................. + // ldr q24, [x4, #-48] // .....*............................................... + // ldr q5, [x4, #-80] // ......*.............................................. + // sub v31.8H, v20.8H, v21.8H // ........*............................................ + // add v25.8H, v20.8H, v21.8H // .........*........................................... + // sub v19.8H, v22.8H, v23.8H // ..........*.......................................... + // sqrdmulh v27.8H, v31.8H, v24.8H // .............*....................................... + // mul v8.8H, v31.8H, v4.8H // ............*........................................ 
+ // mul v31.8H, v19.8H, v10.8H // ..............*...................................... + // sqrdmulh v20.8H, v19.8H, v29.8H // ...............*..................................... + // add v24.8H, v22.8H, v23.8H // ...........*......................................... + // mls v8.8H, v27.8H, v7.H[0] // ..................*.................................. + // mls v31.8H, v20.8H, v7.H[0] // ...................*................................. + // sub v10.8H, v25.8H, v24.8H // ................*.................................... + // add v18.8H, v25.8H, v24.8H // .................*................................... + // sub v22.8H, v8.8H, v31.8H // ......................*.............................. + // sqrdmulh v20.8H, v10.8H, v5.8H // ....................*................................ + // mul v19.8H, v10.8H, v2.8H // .....................*............................... + // mul v14.8H, v22.8H, v2.8H // .........................*........................... + // sqrdmulh v16.8H, v22.8H, v5.8H // ..........................*.......................... + // ldr q0, [x3], #16 // .......*............................................. + // add v15.8H, v8.8H, v31.8H // .......................*............................. + // mls v14.8H, v16.8H, v7.H[0] // .............................*....................... + // mls v19.8H, v20.8H, v7.H[0] // ........................*............................ + // trn1 v24.4S, v18.4S, v15.4S // ...........................*......................... + // trn2 v22.4S, v18.4S, v15.4S // ............................*........................ + // trn1 v16.4S, v19.4S, v14.4S // ..............................*...................... + // trn2 v10.4S, v19.4S, v14.4S // ...............................*..................... + // trn2 v3.2D, v24.2D, v16.2D // .................................*................... + // trn2 v18.2D, v22.2D, v10.2D // ................................*.................... 
+ // trn1 v31.2D, v24.2D, v16.2D // ..................................*.................. + // trn1 v20.2D, v22.2D, v10.2D // ...................................*................. + // add v16.8H, v3.8H, v18.8H // ....................................*................ + // add v22.8H, v31.8H, v20.8H // .......................................*............. + // sqdmulh v6.8H, v16.8H, v7.H[1] // ........................................*............ + // sqdmulh v19.8H, v22.8H, v7.H[1] // ..........................................*.......... + // sub v1.8H, v3.8H, v18.8H // .....................................*............... + // srshr v6.8H, v6.8H, #11 // .............................................*....... + // srshr v19.8H, v19.8H, #11 // ..............................................*...... + // mul v30.8H, v1.8H, v0.H[4] // .........................................*........... + // mls v16.8H, v6.8H, v7.H[0] // ................................................*.... + // sub v6.8H, v31.8H, v20.8H // ......................................*.............. + // mls v22.8H, v19.8H, v7.H[0] // .................................................*... + // mul v17.8H, v6.8H, v0.H[2] // ...........................................*......... + // sqrdmulh v24.8H, v6.8H, v0.H[3] // ............................................*........ + // add v20.8H, v22.8H, v16.8H // ..................................................*.. + // sub v28.8H, v22.8H, v16.8H // ...................................................*. + // mls v17.8H, v24.8H, v7.H[0] // ...............................................*..... + // str q20, [x1], #(64) // ....................................................* sub count, count, #1 layer4567_start: - ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // e....................................................................... - mls v28.8H, v19.8H, v7.H[0] // .............................................*.......................... 
- sqrdmulh v19.8H, v21.8H, v5.H[1] // .............................................................*.......... - ldr q30, [x4, #16] // ..e..................................................................... - ldr q1, [x4, #64] // .....e.................................................................. + // Instructions: 72 + // Expected cycles: 42 + // Expected IPC: 1.71 + // + // Cycle bound: 42.0 + // IPC bound: 1.71 + // + // Wall time: 23.93s + // User time: 23.93s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1] // e....................................................................... + sqrdmulh v16.8H, v1.8H, v0.H[5] // ...........................................*............................ + ldr q10, [x4, #64] // .....e.................................................................. + ldr q4, [x4, #32] // ...e.................................................................... + ldr q2, [x4], #(6*16) // .e...................................................................... + ldr q29, [x4, #-16] // ......e................................................................. // gap // ........................................................................ // gap // ........................................................................ + ldr q24, [x4, #-48] // ....e................................................................... // gap // ........................................................................ - ldr q14, [x4, #80] // ......e................................................................. - mul v11.8H, v21.8H, v5.H[0] // ............................................................*........... // gap // ........................................................................ 
+ sqdmulh v3.8H, v17.8H, v7.H[1] // .................................................*...................... + mls v30.8H, v16.8H, v7.H[0] // .............................................*.......................... + ldr q5, [x4, #-80] // ..e..................................................................... // gap // ........................................................................ - sqdmulh v12.8H, v28.8H, v7.H[1] // .......................................................*................ - ldr q8, [x4, #48] // ....e................................................................... // gap // ........................................................................ // gap // ........................................................................ - sqdmulh v27.8H, v0.8H, v7.H[1] // .................................................*...................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - ldr q13, [x4, #32] // ...e.................................................................... - mls v11.8H, v19.8H, v7.H[0] // ..............................................................*......... + srshr v26.8H, v3.8H, #11 // ..................................................*..................... + sqrdmulh v11.8H, v28.8H, v0.H[1] // ............................................................*........... // gap // ........................................................................ // gap // ........................................................................ - add v22.8H, v17.8H, v18.8H // .............e.......................................................... - sub v6.8H, v17.8H, v18.8H // ............e........................................................... 
- ldr q29, [x4], #(6*16) // .e...................................................................... + sub v31.8H, v20.8H, v21.8H // .......e................................................................ + add v25.8H, v20.8H, v21.8H // ........e............................................................... // gap // ........................................................................ - sub v26.8H, v15.8H, v16.8H // .......e................................................................ // gap // ........................................................................ // gap // ........................................................................ + sqdmulh v18.8H, v30.8H, v7.H[1] // .......................................................*................ // gap // ........................................................................ - sqrdmulh v31.8H, v6.8H, v14.8H // ...............e........................................................ - mul v24.8H, v6.8H, v1.8H // ..............e......................................................... - str q11, [x1, #-32] // ......................................................................*. + sub v19.8H, v22.8H, v23.8H // ............e........................................................... + sqrdmulh v27.8H, v31.8H, v24.8H // .........e.............................................................. + mul v8.8H, v31.8H, v4.8H // ..........e............................................................. // gap // ........................................................................ - sqrdmulh v20.8H, v26.8H, v8.8H // ..........e............................................................. - mul v26.8H, v26.8H, v13.8H // .........e.............................................................. // gap // ........................................................................ + mul v31.8H, v19.8H, v10.8H // ...............e........................................................ 
+ sqrdmulh v20.8H, v19.8H, v29.8H // ..............e......................................................... // gap // ........................................................................ - add v8.8H, v15.8H, v16.8H // ........e............................................................... // gap // ........................................................................ + add v24.8H, v22.8H, v23.8H // .............e.......................................................... + srshr v21.8H, v18.8H, #11 // ........................................................*............... // gap // ........................................................................ // gap // ........................................................................ - srshr v13.8H, v27.8H, #11 // ..................................................*..................... - mls v24.8H, v31.8H, v7.H[0] // ................e....................................................... + mls v8.8H, v27.8H, v7.H[0] // ...........e............................................................ // gap // ........................................................................ // gap // ........................................................................ - mls v26.8H, v20.8H, v7.H[0] // ...........e............................................................ - sub v27.8H, v8.8H, v22.8H // .................e...................................................... + mul v29.8H, v28.8H, v0.H[0] // .............................................................*.......... + mls v31.8H, v20.8H, v7.H[0] // ................e....................................................... + mls v17.8H, v26.8H, v7.H[0] // ...................................................*.................... // gap // ........................................................................ // gap // ........................................................................ 
- add v3.8H, v8.8H, v22.8H // ..................e..................................................... - srshr v22.8H, v12.8H, #11 // ........................................................*............... + mls v30.8H, v21.8H, v7.H[0] // .........................................................*.............. + sub v10.8H, v25.8H, v24.8H // .................e...................................................... // gap // ........................................................................ // gap // ........................................................................ - sqrdmulh v19.8H, v27.8H, v30.8H // ....................e................................................... + mls v29.8H, v11.8H, v7.H[0] // ..............................................................*......... + add v18.8H, v25.8H, v24.8H // ..................e..................................................... // gap // ........................................................................ // gap // ........................................................................ + sub v22.8H, v8.8H, v31.8H // ......................e................................................. + sqrdmulh v20.8H, v10.8H, v5.8H // ...................e.................................................... // gap // ........................................................................ - mul v16.8H, v27.8H, v29.8H // ...................e.................................................... - sub v2.8H, v26.8H, v24.8H // ......................e................................................. // gap // ........................................................................ + mul v19.8H, v10.8H, v2.8H // ....................e................................................... // gap // ........................................................................ - mls v28.8H, v22.8H, v7.H[0] // .........................................................*.............. 
// gap // ........................................................................ + sub v6.8H, v17.8H, v30.8H // ...............................................................*........ + mul v14.8H, v22.8H, v2.8H // .........................e.............................................. + sqrdmulh v16.8H, v22.8H, v5.8H // ........................e............................................... + str q29, [x1, #-32] // ......................................................................*. // gap // ........................................................................ + mul v13.8H, v6.8H, v0.H[0] // ..................................................................*..... + sqrdmulh v6.8H, v6.8H, v0.H[1] // .................................................................*...... + ldr q0, [x3], #16 // ...................................e.................................... // gap // ........................................................................ - sqrdmulh v14.8H, v2.8H, v30.8H // .........................e.............................................. - mul v22.8H, v2.8H, v29.8H // ........................e............................................... + add v15.8H, v8.8H, v31.8H // .......................e................................................ // gap // ........................................................................ // gap // ........................................................................ - add v2.8H, v26.8H, v24.8H // .......................e................................................ - mls v16.8H, v19.8H, v7.H[0] // .....................e.................................................. // gap // ........................................................................ + mls v14.8H, v16.8H, v7.H[0] // ..........................e............................................. + mls v19.8H, v20.8H, v7.H[0] // .....................e.................................................. 
// gap // ........................................................................ // gap // ........................................................................ + mls v13.8H, v6.8H, v7.H[0] // ...................................................................*.... + trn1 v24.4S, v18.4S, v15.4S // ...........................e............................................ // gap // ........................................................................ // gap // ........................................................................ + trn2 v22.4S, v18.4S, v15.4S // ............................e........................................... // gap // ........................................................................ - mls v22.8H, v14.8H, v7.H[0] // ..........................e............................................. - mls v0.8H, v13.8H, v7.H[0] // ...................................................*.................... // gap // ........................................................................ // gap // ........................................................................ - trn1 v23.4S, v3.4S, v2.4S // ...........................e............................................ + trn1 v16.4S, v19.4S, v14.4S // .............................e.......................................... + trn2 v10.4S, v19.4S, v14.4S // ..............................e......................................... // gap // ........................................................................ // gap // ........................................................................ + str q13, [x1, #-16] // .......................................................................* // gap // ........................................................................ - trn2 v19.4S, v3.4S, v2.4S // ............................e........................................... // gap // ........................................................................ 
// gap // ........................................................................ + trn2 v3.2D, v24.2D, v16.2D // ...............................e........................................ + trn2 v18.2D, v22.2D, v10.2D // ................................e....................................... // gap // ........................................................................ - trn1 v20.4S, v16.4S, v22.4S // .............................e.......................................... - trn2 v13.4S, v16.4S, v22.4S // ..............................e......................................... // gap // ........................................................................ + trn1 v31.2D, v24.2D, v16.2D // .................................e...................................... + trn1 v20.2D, v22.2D, v10.2D // ..................................e..................................... // gap // ........................................................................ - sub v6.8H, v0.8H, v28.8H // ...............................................................*........ - add v28.8H, v0.8H, v28.8H // ................................................................*....... // gap // ........................................................................ + add v16.8H, v3.8H, v18.8H // ..........................................e............................. // gap // ........................................................................ - trn2 v27.2D, v19.2D, v13.2D // ................................e....................................... // gap // ........................................................................ // gap // ........................................................................ - trn2 v22.2D, v23.2D, v20.2D // ...............................e........................................ - trn1 v26.2D, v23.2D, v20.2D // .................................e...................................... 
- trn1 v24.2D, v19.2D, v13.2D // ..................................e..................................... - str q28, [x1, #-48] // .....................................................................*.. + add v24.8H, v17.8H, v30.8H // ................................................................*....... + add v22.8H, v31.8H, v20.8H // .....................................e.................................. // gap // ........................................................................ - add v23.8H, v22.8H, v27.8H // ..........................................e............................. // gap // ........................................................................ + sqdmulh v6.8H, v16.8H, v7.H[1] // ....................................................e................... // gap // ........................................................................ // gap // ........................................................................ - add v10.8H, v26.8H, v24.8H // .....................................e.................................. - sub v0.8H, v22.8H, v27.8H // .........................................e.............................. // gap // ........................................................................ + str q24, [x1, #-48] // .....................................................................*.. + sqdmulh v19.8H, v22.8H, v7.H[1] // ..............................................e......................... // gap // ........................................................................ - sub v20.8H, v26.8H, v24.8H // ....................................e................................... - sqdmulh v28.8H, v23.8H, v7.H[1] // ....................................................e................... // gap // ........................................................................ + sub v1.8H, v3.8H, v18.8H // .........................................e.............................. 
// gap // ........................................................................ - sqdmulh v22.8H, v10.8H, v7.H[1] // ..............................................e......................... - mul v19.8H, v6.8H, v5.H[0] // .................................................................*...... // gap // ........................................................................ // gap // ........................................................................ - sqrdmulh v8.8H, v6.8H, v5.H[1] // ..................................................................*..... - ldr q5, [x3], #16 // ...................................e.................................... + srshr v6.8H, v6.8H, #11 // .....................................................e.................. // gap // ........................................................................ // gap // ........................................................................ - srshr v28.8H, v28.8H, #11 // .....................................................e.................. // gap // ........................................................................ + srshr v19.8H, v19.8H, #11 // ...............................................e........................ // gap // ........................................................................ // gap // ........................................................................ - srshr v22.8H, v22.8H, #11 // ...............................................e........................ // gap // ........................................................................ + mul v30.8H, v1.8H, v0.H[4] // ............................................e........................... // gap // ........................................................................ // gap // ........................................................................ - mls v19.8H, v8.8H, v7.H[0] // ...................................................................*.... 
// gap // ........................................................................ + mls v16.8H, v6.8H, v7.H[0] // ......................................................e................. // gap // ........................................................................ // gap // ........................................................................ - mls v23.8H, v28.8H, v7.H[0] // ......................................................e................. // gap // ........................................................................ + sub v6.8H, v31.8H, v20.8H // ....................................e................................... + mls v22.8H, v19.8H, v7.H[0] // ................................................e....................... // gap // ........................................................................ // gap // ........................................................................ - mls v10.8H, v22.8H, v7.H[0] // ................................................e....................... - sqrdmulh v30.8H, v20.8H, v5.H[3] // .......................................e................................ // gap // ........................................................................ // gap // ........................................................................ - str q19, [x1, #-16] // .......................................................................* - mul v28.8H, v0.8H, v5.H[4] // ...........................................e............................ - sqrdmulh v19.8H, v0.8H, v5.H[5] // ............................................e........................... // gap // ........................................................................ - mul v0.8H, v20.8H, v5.H[2] // ......................................e................................. // gap // ........................................................................ + mul v17.8H, v6.8H, v0.H[2] // .......................................e................................ 
+ sqrdmulh v24.8H, v6.8H, v0.H[3] // ......................................e................................. // gap // ........................................................................ // gap // ........................................................................ - add v25.8H, v10.8H, v23.8H // ...........................................................e............ - sub v21.8H, v10.8H, v23.8H // ..........................................................e............. + add v20.8H, v22.8H, v16.8H // ...........................................................e............ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sub v28.8H, v22.8H, v16.8H // ..........................................................e............. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - mls v0.8H, v30.8H, v7.H[0] // ........................................e............................... - str q25, [x1], #(64) // ....................................................................e... + mls v17.8H, v24.8H, v7.H[0] // ........................................e............................... + str q20, [x1], #(64) // ....................................................................e... // gap // ........................................................................ // gap // ........................................................................ - // original source code - // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ 
- // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // ...e....................................................................|..e............................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.............................................................|.........e...................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ........e...............................................................|.......e........................................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // ....e...................................................................|...e............................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // .....e..................................................................|....e........................................................... - // sub v24.8h, v8.8h, v9.8h // ...............e........................................................|..............e................................................. - // add v8.8h, v8.8h, v9.8h // .....................e..................................................|....................e........................................... - // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e............................................ - // sqrdmulh v24.8h, v24.8h, v5.8h // ...................e....................................................|..................e............................................. - // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e........................................ 
- // sub v24.8h, v10.8h, v11.8h // .............e..........................................................|............e................................................... - // add v10.8h, v10.8h, v11.8h // ............e...........................................................|...........e.................................................... - // mul v11.8h, v24.8h, v2.8h // .................e......................................................|................e............................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ................e.......................................................|...............e................................................ - // mls v11.8h, v24.8h, v7.h[0] // .......................e................................................|......................e......................................... - // sub v24.8h, v8.8h, v10.8h // .........................e..............................................|........................e....................................... - // add v8.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... - // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ............................e...........................................|...........................e.................................... - // mls v10.8h, v24.8h, v7.h[0] // ...................................e....................................|..................................e............................. - // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e.................................. 
- // add v9.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. - // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e............................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e................................ - // mls v11.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ - // trn1 v25.4s, v8.4s, v9.4s // ......................................e.................................|.....................................e.......................... - // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e......................... - // trn1 v27.4s, v10.4s, v11.4s // ........................................e...............................|.......................................e........................ - // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e....................... - // trn2 v10.2d, v25.2d, v27.2d // .............................................e..........................|............................................e................... - // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.................... - // trn1 v8.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e.................. 
- // trn1 v9.2d, v26.2d, v28.2d // ...............................................e........................|..............................................e................. - // ldr q0, [x3], #16 // .........................................................e..............|........................................................e....... - // sub v24.8h, v8.8h, v9.8h // ....................................................e...................|...................................................e............ - // add v8.8h, v8.8h, v9.8h // ..................................................e.....................|.................................................e.............. - // mul v9.8h, v24.8h, v0.h[2] // ...................................................................e....|................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................e........|..............................................................e. - // mls v9.8h, v24.8h, v7.h[0] // ......................................................................e.|................................................................ - // sub v24.8h, v10.8h, v11.8h // ...................................................e....................|..................................................e............. - // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e............... - // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................e.....|................................................................ 
- // mls v11.8h, v24.8h, v7.h[0] // .*......................................................................|*............................................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ......................................................e.................|.....................................................e.......... - // srshr v25.8h, v25.8h, #11 // ...........................................................e............|..........................................................e..... - // mls v8.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e.. - // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... - // srshr v25.8h, v25.8h, #11 // ......................*.................................................|.....................*.......................................... - // mls v9.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*........................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e........... - // srshr v25.8h, v25.8h, #11 // ..........................................................e.............|.........................................................e...... - // mls v10.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e... - // sqdmulh v25.8h, v11.8h, v7.h[1] // .......*................................................................|......*......................................................... 
- // srshr v25.8h, v25.8h, #11 // ...........................*............................................|..........................*..................................... - // mls v11.8h, v25.8h, v7.h[0] // ...............................*........................................|..............................*................................. - // sub v24.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ - // add v8.8h, v8.8h, v10.8h // ....................................................................e...|................................................................ - // mul v10.8h, v24.8h, v0.h[0] // ......*.................................................................|.....*.......................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. - // mls v10.8h, v24.8h, v7.h[0] // ...........*............................................................|..........*..................................................... - // sub v24.8h, v9.8h, v11.8h // ..........................................*.............................|.........................................*...................... - // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... - // mul v11.8h, v24.8h, v0.h[0] // .......................................................*................|......................................................*......... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*...............|.......................................................*........ 
- // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... - // str q8, [x1], #(64) // .......................................................................e|................................................................ - // str q9, [x1, #(-64 + 16*1)] // ................................................*.......................|...............................................*................ - // str q10, [x1, #(-64 + 16*2)] // ..................*.....................................................|.................*.............................................. - // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* + // --------------------------------------------------------- new position ----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---- + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................~......................................................... + // ldr q0, [x4], #(6*16) // ....e...................................................................'...~..................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .........e..............................................................'........~................................................ + // ldr q1, [x4, #(-6*16 + 2*16)] // ...e....................................................................'..~...................................................... 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ......e.................................................................'.....~................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ..e.....................................................................'.~....................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .....e..................................................................'....~.................................................... + // sub v24.8h, v8.8h, v9.8h // ............e...........................................................'...........~............................................. + // add v8.8h, v8.8h, v9.8h // .............e..........................................................'............~............................................ + // sqrdmulh v27.8h, v24.8h, v5.8h // ................e.......................................................'...............~......................................... + // mul v9.8h, v24.8h, v1.8h // .................e......................................................'................~........................................ + // mls v9.8h, v27.8h, v7.h[0] // ......................e.................................................'.....................~................................... + // sub v24.8h, v10.8h, v11.8h // ...............e........................................................'..............~.......................................... + // add v10.8h, v10.8h, v11.8h // ....................e...................................................'...................~..................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // ...................e....................................................'..................~...................................... 
+ // mul v11.8h, v24.8h, v2.8h // ..................e.....................................................'.................~....................................... + // mls v11.8h, v27.8h, v7.h[0] // ........................e...............................................'.......................~................................. + // sub v24.8h, v8.8h, v10.8h // ...........................e............................................'..........................~.............................. + // add v8.8h, v8.8h, v10.8h // .............................e..........................................'............................~............................ + // sqrdmulh v27.8h, v24.8h, v4.8h // ...............................e........................................'..............................~.......................... + // mul v10.8h, v24.8h, v0.8h // ................................e.......................................'...............................~......................... + // mls v10.8h, v27.8h, v7.h[0] // ..........................................e.............................'.........................................~............... + // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................'.............................~........................... + // add v9.8h, v9.8h, v11.8h // ........................................e...............................'.......................................~................. + // sqrdmulh v27.8h, v24.8h, v4.8h // ...................................e....................................'..................................~...................... + // mul v11.8h, v24.8h, v0.8h // ..................................e.....................................'.................................~....................... + // mls v11.8h, v27.8h, v7.h[0] // .........................................e..............................'........................................~................ 
+ // trn1 v25.4s, v8.4s, v9.4s // ............................................e...........................'...........................................~............. + // trn2 v26.4s, v8.4s, v9.4s // .............................................e..........................'............................................~............ + // trn1 v27.4s, v10.4s, v11.4s // ..............................................e.........................'.............................................~........... + // trn2 v28.4s, v10.4s, v11.4s // ...............................................e........................'..............................................~.......... + // trn2 v10.2d, v25.2d, v27.2d // .................................................e......................'................................................~........ + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e.....................'.................................................~....... + // trn1 v8.2d, v25.2d, v27.2d // ...................................................e....................'..................................................~...... + // trn1 v9.2d, v26.2d, v28.2d // ....................................................e...................'...................................................~..... + // ldr q0, [x3], #16 // .......................................e................................'......................................~.................. + // sub v24.8h, v8.8h, v9.8h // ................................................................e.......'......................................................... + // add v8.8h, v8.8h, v9.8h // .......................................................e................'......................................................~.. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...................................................................e....'......................................................... 
+ // mul v9.8h, v24.8h, v0.h[2] // ..................................................................e.....'......................................................... + // mls v9.8h, v27.8h, v7.h[0] // ......................................................................e.'......................................................... + // sub v24.8h, v10.8h, v11.8h // ...........................................................e............'......................................................... + // add v10.8h, v10.8h, v11.8h // .....................................................e..................'....................................................~.... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .~......................................................................'*........................................................ + // mul v11.8h, v24.8h, v0.h[4] // ..............................................................e.........'......................................................... + // mls v11.8h, v27.8h, v7.h[0] // ........~...............................................................'.......*................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................................e.............'......................................................... + // srshr v25.8h, v25.8h, #11 // .............................................................e..........'......................................................... + // mls v8.8h, v25.8h, v7.h[0] // .................................................................e......'......................................................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .......~................................................................'......*.................................................. 
+ // srshr v25.8h, v25.8h, #11 // ..........~.............................................................'.........*............................................... + // mls v9.8h, v25.8h, v7.h[0] // .........................~..............................................'........................*................................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // ........................................................e...............'.......................................................~. + // srshr v25.8h, v25.8h, #11 // ............................................................e...........'......................................................... + // mls v10.8h, v25.8h, v7.h[0] // ...............................................................e........'......................................................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..............~.........................................................'.............*........................................... + // srshr v25.8h, v25.8h, #11 // .....................~..................................................'....................*.................................... + // mls v11.8h, v25.8h, v7.h[0] // ..........................~.............................................'.........................*............................... + // sub v24.8h, v8.8h, v10.8h // .....................................................................e..'......................................................... + // add v8.8h, v8.8h, v10.8h // ....................................................................e...'......................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........~............................................................'..........*.............................................. 
+ // mul v10.8h, v24.8h, v0.h[0] // .......................~................................................'......................*.................................. + // mls v10.8h, v27.8h, v7.h[0] // ............................~...........................................'...........................*............................. + // sub v24.8h, v9.8h, v11.8h // .................................~......................................'................................*........................ + // add v9.8h, v9.8h, v11.8h // ......................................................~.................'.....................................................*... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ......................................~.................................'.....................................*................... + // mul v11.8h, v24.8h, v0.h[0] // .....................................~..................................'....................................*.................... + // mls v11.8h, v27.8h, v7.h[0] // ...........................................~............................'..........................................*.............. + // str q8, [x1], #(64) // .......................................................................e'......................................................... + // str q9, [x1, #(-64 + 16*1)] // .........................................................~..............'........................................................* + // str q10, [x1, #(-64 + 16*2)] // ....................................~...................................'...................................*..................... + // str q11, [x1, #(-64 + 16*3)] // ................................................~.......................'...............................................*......... sub count, count, #1 cbnz count, layer4567_start - // gap // .................. - mls v28.8H, v19.8H, v7.H[0] // *................. - // gap // .................. 
- // gap // .................. - mul v27.8H, v21.8H, v5.H[0] // ..*............... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - sqdmulh v19.8H, v0.8H, v7.H[1] // ....*............. - // gap // .................. - // gap // .................. - // gap // .................. - sqdmulh v22.8H, v28.8H, v7.H[1] // ...*.............. - // gap // .................. - // gap // .................. - sqrdmulh v24.8H, v21.8H, v5.H[1] // .*................ - // gap // .................. - // gap // .................. - // gap // .................. - srshr v23.8H, v19.8H, #11 // .......*.......... - // gap // .................. - // gap // .................. - // gap // .................. - srshr v19.8H, v22.8H, #11 // ........*......... - // gap // .................. - // gap // .................. - // gap // .................. - mls v27.8H, v24.8H, v7.H[0] // .....*............ - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mls v28.8H, v19.8H, v7.H[0] // .........*........ - // gap // .................. - mls v0.8H, v23.8H, v7.H[0] // ..........*....... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - sub v19.8H, v0.8H, v28.8H // ...........*...... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - mul v23.8H, v19.8H, v5.H[0] // ..............*... - sqrdmulh v19.8H, v19.8H, v5.H[1] // ...............*.. 
- str q27, [x1, #-32] // ......*........... - // gap // .................. - add v22.8H, v0.8H, v28.8H // ............*..... - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q22, [x1, #-48] // .............*.... - mls v23.8H, v19.8H, v7.H[0] // ................*. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - // gap // .................. - str q23, [x1, #-16] // .................* - // gap // .................. - // gap // .................. - // gap // .................. - - // original source code - // mls v28.8H, v19.8H, v7.H[0] // *................. - // sqrdmulh v19.8H, v21.8H, v5.H[1] // ....*............. - // mul v11.8H, v21.8H, v5.H[0] // .*................ - // sqdmulh v12.8H, v28.8H, v7.H[1] // ...*.............. - // sqdmulh v27.8H, v0.8H, v7.H[1] // ..*............... - // mls v11.8H, v19.8H, v7.H[0] // .......*.......... - // str q11, [x1, #-32] // .............*.... - // srshr v13.8H, v27.8H, #11 // .....*............ - // srshr v22.8H, v12.8H, #11 // ......*........... - // mls v28.8H, v22.8H, v7.H[0] // ........*......... - // mls v0.8H, v13.8H, v7.H[0] // .........*........ - // sub v6.8H, v0.8H, v28.8H // ..........*....... - // add v28.8H, v0.8H, v28.8H // ..............*... - // str q28, [x1, #-48] // ...............*.. - // mul v19.8H, v6.8H, v5.H[0] // ...........*...... - // sqrdmulh v8.8H, v6.8H, v5.H[1] // ............*..... - // mls v19.8H, v8.8H, v7.H[0] // ................*. 
- // str q19, [x1, #-16] // .................* + // Instructions: 19 + // Expected cycles: 24 + // Expected IPC: 0.79 + // + // Cycle bound: 24.0 + // IPC bound: 0.79 + // + // Wall time: 0.13s + // User time: 0.13s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sqrdmulh v24.8H, v1.8H, v0.H[5] // *............................. + mul v10.8H, v28.8H, v0.H[0] // .......*...................... + // gap // .............................. + // gap // .............................. + sqrdmulh v4.8H, v28.8H, v0.H[1] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v30.8H, v24.8H, v7.H[0] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v10.8H, v4.8H, v7.H[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v24.8H, v17.8H, v7.H[1] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v13.8H, v30.8H, v7.H[1] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x1, #-32] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + srshr v24.8H, v24.8H, #11 // ...*.......................... + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + srshr v13.8H, v13.8H, #11 // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v17.8H, v24.8H, v7.H[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v30.8H, v13.8H, v7.H[0] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v24.8H, v17.8H, v30.8H // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v10.8H, v17.8H, v30.8H // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v15.8H, v24.8H, v0.H[1] // ..............*............... + mul v24.8H, v24.8H, v0.H[0] // .............*................ + // gap // .............................. + // gap // .............................. + str q10, [x1, #-48] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v24.8H, v15.8H, v7.H[0] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q24, [x1, #-16] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v16.8H, v1.8H, v0.H[5] // *.............................. + // sqdmulh v3.8H, v17.8H, v7.H[1] // .....*......................... + // mls v30.8H, v16.8H, v7.H[0] // ...*........................... + // srshr v26.8H, v3.8H, #11 // ........*...................... + // sqrdmulh v11.8H, v28.8H, v0.H[1] // ..*............................ + // sqdmulh v18.8H, v30.8H, v7.H[1] // ......*........................ + // srshr v21.8H, v18.8H, #11 // .........*..................... + // mul v29.8H, v28.8H, v0.H[0] // .*............................. + // mls v17.8H, v26.8H, v7.H[0] // ..........*.................... + // mls v30.8H, v21.8H, v7.H[0] // ...........*................... + // mls v29.8H, v11.8H, v7.H[0] // ....*.......................... + // sub v6.8H, v17.8H, v30.8H // ............*.................. + // str q29, [x1, #-32] // .......*....................... + // mul v13.8H, v6.8H, v0.H[0] // ...............*............... 
+ // sqrdmulh v6.8H, v6.8H, v0.H[1] // ..............*................ + // mls v13.8H, v6.8H, v7.H[0] // .................*............. + // str q13, [x1, #-16] // ..................*............ + // add v24.8H, v17.8H, v30.8H // .............*................. + // str q24, [x1, #-48] // ................*.............. // --------------------------------------------------------------------- @@ -948,526 +990,571 @@ layer4567_start: .p2align 2 - ldr q31, [x0, #320] // .*................................ - ldr q22, [x0, #256] // *................................. - // gap // .................................. - // gap // .................................. - ldr q8, [x0, #448] // ....*............................. - ldr q16, [x0, #384] // ..*............................... - // gap // .................................. - // gap // .................................. - ldr q4, [x0, #64] // .......*.......................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - ldr q9, [x0, #0] // .....*............................ - // gap // .................................. - // gap // .................................. - // gap // .................................. - sub v3.8H, v22.8H, v31.8H // ......*........................... - add v6.8H, v22.8H, v31.8H // ........*......................... - ldr q18, [x0, #128] // ...*.............................. - // gap // .................................. - sub v23.8H, v16.8H, v8.8H // ..........*....................... - add v22.8H, v16.8H, v8.8H // .............*.................... - ldr q19, [x0, #192] // ..................*............... - // gap // .................................. - sqrdmulh v31.8H, v3.8H, v1.H[3] // .........*........................ - mul v13.8H, v3.8H, v1.H[2] // ...........*...................... - // gap // .................................. - // gap // .................................. 
- mul v2.8H, v23.8H, v1.H[4] // ..............*................... - sqrdmulh v11.8H, v23.8H, v1.H[5] // ...............*.................. - // gap // .................................. - // gap // .................................. - sub v25.8H, v9.8H, v4.8H // ............*..................... - add v9.8H, v9.8H, v4.8H // ................*................. - // gap // .................................. - // gap // .................................. - sub v21.8H, v18.8H, v19.8H // .............................*.... - mls v13.8H, v31.8H, v7.H[0] // .................*................ - // gap // .................................. - // gap // .................................. - add v26.8H, v18.8H, v19.8H // ....................*............. - mls v2.8H, v11.8H, v7.H[0] // ...................*.............. - // gap // .................................. - // gap // .................................. - sqrdmulh v23.8H, v25.8H, v0.H[7] // ...........................*...... - // gap // .................................. - // gap // .................................. - // gap // .................................. - add v31.8H, v9.8H, v26.8H // ............................*..... - sub v19.8H, v6.8H, v22.8H // ..........................*....... - // gap // .................................. - // gap // .................................. - add v6.8H, v6.8H, v22.8H // ........................*......... - sub v28.8H, v13.8H, v2.8H // ......................*........... - // gap // .................................. - // gap // .................................. - mul v15.8H, v25.8H, v0.H[6] // .....................*............ - // gap // .................................. - // gap // .................................. - // gap // .................................. - mul v16.8H, v28.8H, v0.H[4] // .......................*.......... - sqrdmulh v11.8H, v28.8H, v0.H[5] // .........................*........ - // gap // .................................. 
- // gap // .................................. - add v24.8H, v31.8H, v6.8H // ................................*. - mul v5.8H, v19.8H, v0.H[4] // ...............................*.. - // gap // .................................. - // gap // .................................. - mls v15.8H, v23.8H, v7.H[0] // .................................* - // gap // .................................. - // gap // .................................. - // gap // .................................. - mls v16.8H, v11.8H, v7.H[0] // ..............................*... - // gap // .................................. - // gap // .................................. - // gap // .................................. - - // original source code - // ldr q28, [x0, #256] // .*................................ - // ldr q18, [x0, #320] // *................................. - // ldr q23, [x0, #384] // ...*.............................. - // ldr q10, [x0, #128] // ........*......................... - // ldr q12, [x0, #448] // ..*............................... - // ldr q4, [x0, #0] // .....*............................ - // sub v17.8H, v28.8H, v18.8H // ......*........................... - // ldr q9, [x0, #64] // ....*............................. - // add v18.8H, v28.8H, v18.8H // .......*.......................... - // sqrdmulh v19.8H, v17.8H, v1.H[3] // ............*..................... - // sub v6.8H, v23.8H, v12.8H // .........*........................ - // mul v13.8H, v17.8H, v1.H[2] // .............*.................... - // sub v11.8H, v4.8H, v9.8H // ................*................. - // add v25.8H, v23.8H, v12.8H // ..........*....................... - // mul v2.8H, v6.8H, v1.H[4] // ..............*................... - // sqrdmulh v17.8H, v6.8H, v1.H[5] // ...............*.................. - // add v9.8H, v4.8H, v9.8H // .................*................ - // mls v13.8H, v19.8H, v7.H[0] // ...................*.............. - // ldr q21, [x0, #192] // ...........*...................... 
- // mls v2.8H, v17.8H, v7.H[0] // .....................*............ - // add v26.8H, v10.8H, v21.8H // ....................*............. - // mul v15.8H, v11.8H, v0.H[6] // ...........................*...... - // sub v5.8H, v13.8H, v2.8H // ..........................*....... - // mul v16.8H, v5.8H, v0.H[4] // ............................*..... - // add v6.8H, v18.8H, v25.8H // .........................*........ - // sqrdmulh v22.8H, v5.8H, v0.H[5] // .............................*.... - // sub v19.8H, v18.8H, v25.8H // ........................*......... - // sqrdmulh v20.8H, v11.8H, v0.H[7] // ......................*........... - // add v31.8H, v9.8H, v26.8H // .......................*.......... - // sub v21.8H, v10.8H, v21.8H // ..................*............... - // mls v16.8H, v22.8H, v7.H[0] // .................................* - // mul v5.8H, v19.8H, v0.H[4] // ...............................*.. - // add v24.8H, v31.8H, v6.8H // ..............................*... - // mls v15.8H, v20.8H, v7.H[0] // ................................*. + // Instructions: 24 + // Expected cycles: 14 + // Expected IPC: 1.71 + // + // Cycle bound: 14.0 + // IPC bound: 1.71 + // + // Wall time: 0.18s + // User time: 0.18s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q11, [x0, #0] // .*............................ + ldr q5, [x0, #64] // *............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #192] // ....*......................... + ldr q6, [x0, #128] // .........*.................... + // gap // .............................. + // gap // .............................. + ldr q17, [x0, #256] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q21, [x0, #320] // .............*................ + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + sub v19.8H, v11.8H, v5.8H // .....*........................ + add v25.8H, v11.8H, v5.8H // ..................*........... + // gap // .............................. + // gap // .............................. + sub v18.8H, v6.8H, v22.8H // ..........*................... + ldr q13, [x0, #448] // ..*........................... + // gap // .............................. + // gap // .............................. + sqrdmulh v16.8H, v19.8H, v0.H[7] // ......*....................... + mul v26.8H, v19.8H, v0.H[6] // .......*...................... + ldr q12, [x0, #384] // ..............*............... + // gap // .............................. + mul v31.8H, v18.8H, v1.H[0] // ...........*.................. + sqrdmulh v27.8H, v18.8H, v1.H[1] // ............*................. + // gap // .............................. + // gap // .............................. + add v15.8H, v6.8H, v22.8H // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v26.8H, v16.8H, v7.H[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v31.8H, v27.8H, v7.H[0] // ...............*.............. + sub v22.8H, v12.8H, v13.8H // ...................*.......... + // gap // .............................. + // gap // .............................. + add v3.8H, v12.8H, v13.8H // ....................*......... + add v14.8H, v17.8H, v21.8H // .................*............ + // gap // .............................. + // gap // .............................. + mul v20.8H, v22.8H, v1.H[4] // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ sqrdmulh v16.8H, v22.8H, v1.H[5] // .......................*...... + add v9.8H, v26.8H, v31.8H // .....................*........ + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q5, [x0, #64] // .*............................. + // ldr q28, [x0, #0] // *.............................. + // ldr q27, [x0, #448] // .........*..................... + // ldr q17, [x0, #256] // ....*.......................... + // ldr q23, [x0, #192] // ..*............................ + // sub v31.8H, v28.8H, v5.8H // ......*........................ + // sqrdmulh v6.8H, v31.8H, v0.H[7] // ..........*.................... + // mul v26.8H, v31.8H, v0.H[6] // ...........*................... + // mls v26.8H, v6.8H, v7.H[0] // ................*.............. + // ldr q25, [x0, #128] // ...*........................... + // sub v22.8H, v25.8H, v23.8H // ........*...................... + // mul v31.8H, v22.8H, v1.H[0] // .............*................. + // sqrdmulh v12.8H, v22.8H, v1.H[1] // ..............*................ + // ldr q21, [x0, #320] // .....*......................... + // ldr q13, [x0, #384] // ............*.................. + // mls v31.8H, v12.8H, v7.H[0] // .................*............. + // add v15.8H, v25.8H, v23.8H // ...............*............... + // add v14.8H, v17.8H, v21.8H // ....................*.......... + // add v25.8H, v28.8H, v5.8H // .......*....................... + // sub v16.8H, v13.8H, v27.8H // ..................*............ + // add v3.8H, v13.8H, v27.8H // ...................*........... + // add v9.8H, v26.8H, v31.8H // .......................*....... + // mul v20.8H, v16.8H, v1.H[4] // .....................*......... + // sqrdmulh v16.8H, v16.8H, v1.H[5] // ......................*........ 
sub count, count, #1 layer123_start: - sub v27.8H, v31.8H, v6.8H // ................................................*....................................... - sqrdmulh v22.8H, v19.8H, v0.H[5] // .........................................*.............................................. - ldr q28, [x0, #272] // ....e................................................................................... - ldr q18, [x0, #336] // .....e.................................................................................. + // Instructions: 88 + // Expected cycles: 36 + // Expected IPC: 2.44 + // + // Cycle bound: 36.0 + // IPC bound: 2.44 + // + // Wall time: 173.83s + // User time: 173.83s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + sub v6.8H, v26.8H, v31.8H // .................................*...................................................... + sub v24.8H, v17.8H, v21.8H // ..................*..................................................................... + ldr q5, [x0, #80] // .e...................................................................................... + ldr q28, [x0, #16] // e....................................................................................... + sub v8.8H, v25.8H, v15.8H // ............................*........................................................... + ldr q27, [x0, #464] // .......e................................................................................ + sub v10.8H, v14.8H, v3.8H // ......................................*................................................. + ldr q17, [x0, #272] // ....e................................................................................... + ldr q23, [x0, #208] // ...e.................................................................................... 
+ mul v12.8H, v24.8H, v1.H[2] // .....................*.................................................................. // gap // ........................................................................................ - sub v19.8H, v9.8H, v26.8H // ............................*........................................................... - add v26.8H, v13.8H, v2.8H // ............................................*........................................... - ldr q23, [x0, #400] // ......e................................................................................. - ldr q10, [x0, #144] // ..e..................................................................................... - mul v20.8H, v24.8H, v29.8H // ........................................................................*............... - ldr q12, [x0, #464] // .......e................................................................................ - mul v31.8H, v27.8H, v0.H[0] // ..................................................*..................................... - mul v8.8H, v19.8H, v0.H[2] // ..............................*......................................................... + sqrdmulh v19.8H, v24.8H, v1.H[3] // ....................*................................................................... + sqrdmulh v18.8H, v6.8H, v0.H[3] // ...................................*.................................................... + mul v24.8H, v6.8H, v0.H[2] // ....................................*................................................... // gap // ........................................................................................ - sqrdmulh v14.8H, v19.8H, v0.H[3] // ...............................*........................................................ // gap // ........................................................................................ - sqrdmulh v27.8H, v27.8H, v0.H[1] // ...................................................*.................................... 
- ldr q4, [x0, #16] // e....................................................................................... + sub v31.8H, v28.8H, v5.8H // ........e............................................................................... // gap // ........................................................................................ - sub v17.8H, v28.8H, v18.8H // ..................e..................................................................... - mls v5.8H, v22.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ - ldr q9, [x0, #80] // .e...................................................................................... - add v18.8H, v28.8H, v18.8H // ...................e.................................................................... + sqrdmulh v22.8H, v8.8H, v0.H[3] // ..............................*......................................................... + mls v12.8H, v19.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v19.8H, v17.8H, v1.H[3] // .....................e.................................................................. - sub v6.8H, v23.8H, v12.8H // .......................e................................................................ - mul v13.8H, v17.8H, v1.H[2] // ....................e................................................................... + mul v8.8H, v8.8H, v0.H[2] // ...............................*........................................................ + sqrdmulh v6.8H, v31.8H, v0.H[7] // ..........e............................................................................. 
// gap // ........................................................................................ - mls v31.8H, v27.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ - mul v27.8H, v21.8H, v1.H[0] // ...............*........................................................................ - sqrdmulh v17.8H, v21.8H, v1.H[1] // ................*....................................................................... + mls v20.8H, v16.8H, v7.H[0] // ...........................*............................................................ + mls v24.8H, v18.8H, v7.H[0] // .....................................*.................................................. + mul v26.8H, v31.8H, v0.H[6] // ...........e............................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v8.8H, v14.8H, v7.H[0] // ................................*....................................................... + sqrdmulh v19.8H, v10.8H, v0.H[5] // ........................................*............................................... + add v18.8H, v14.8H, v3.8H // .......................................*................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v11.8H, v4.8H, v9.8H // ........e............................................................................... + add v14.8H, v25.8H, v15.8H // .............................*.......................................................... 
// gap // ........................................................................................ // gap // ........................................................................................ - add v25.8H, v23.8H, v12.8H // ........................e............................................................... - mul v2.8H, v6.8H, v1.H[4] // .........................e.............................................................. - str q31, [x0, #256] // ....................................................................*................... - mls v27.8H, v17.8H, v7.H[0] // .................*...................................................................... + sub v13.8H, v12.8H, v20.8H // ...........................................*............................................ + mls v26.8H, v6.8H, v7.H[0] // ............e........................................................................... + mul v21.8H, v10.8H, v0.H[4] // .........................................*.............................................. // gap // ........................................................................................ - sqrdmulh v17.8H, v6.8H, v1.H[5] // ..........................e............................................................. - sub v3.8H, v8.8H, v5.8H // ..........................................................*............................. - add v28.8H, v8.8H, v5.8H // ...........................................................*............................ // gap // ........................................................................................ + mul v25.8H, v13.8H, v0.H[4] // ..............................................*......................................... + add v12.8H, v12.8H, v20.8H // ............................................*........................................... // gap // ........................................................................................ 
// gap // ........................................................................................ + sqrdmulh v16.8H, v13.8H, v0.H[5] // .............................................*.......................................... // gap // ........................................................................................ - add v9.8H, v4.8H, v9.8H // .........e.............................................................................. - mls v13.8H, v19.8H, v7.H[0] // ......................e................................................................. // gap // ........................................................................................ - ldr q21, [x0, #208] // ...e.................................................................................... - sub v12.8H, v15.8H, v27.8H // .................................*...................................................... - add v23.8H, v15.8H, v27.8H // ..................................*..................................................... + sub v3.8H, v14.8H, v18.8H // ................................................*....................................... + mls v21.8H, v19.8H, v7.H[0] // ..........................................*............................................. + add v6.8H, v9.8H, v12.8H // ......................................................*................................. // gap // ........................................................................................ // gap // ........................................................................................ - mul v27.8H, v28.8H, v29.8H // ..............................................................................*......... - mls v2.8H, v17.8H, v7.H[0] // ...........................e............................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ - mul v17.8H, v12.8H, v0.H[2] // ...................................*.................................................... - add v19.8H, v23.8H, v26.8H // ......................................................*................................. - sqrdmulh v22.8H, v12.8H, v0.H[3] // ....................................*................................................... + mul v19.8H, v3.8H, v0.H[0] // ...................................................*.................................... + mls v8.8H, v22.8H, v7.H[0] // ................................*....................................................... + mul v15.8H, v6.8H, v29.8H // ............................................................................*........... + mls v25.8H, v16.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v31.8H, v23.8H, v26.8H // .....................................................*.................................. + sqrdmulh v22.8H, v6.8H, v30.8H // ...........................................................................*............ + add v31.8H, v14.8H, v18.8H // .................................................*...................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v4.8H, v19.8H, v29.8H // ...........................................................................*............ - add v26.8H, v10.8H, v21.8H // ..............e......................................................................... 
+ add v6.8H, v8.8H, v21.8H // ...........................................................*............................ + sub v8.8H, v8.8H, v21.8H // ..........................................................*............................. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v6.8H, v31.8H, v0.H[1] // ........................................................*............................... - mul v31.8H, v31.8H, v0.H[0] // .......................................................*................................ - mls v17.8H, v22.8H, v7.H[0] // .....................................*.................................................. + sub v16.8H, v24.8H, v25.8H // ...............................................................*........................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v15.8H, v11.8H, v0.H[6] // ..........e............................................................................. - sqrdmulh v8.8H, v19.8H, v30.8H // ............................................................................*........... - sub v5.8H, v13.8H, v2.8H // ...........................................e............................................ + add v20.8H, v24.8H, v25.8H // ................................................................*....................... + mul v11.8H, v6.8H, v29.8H // ...............................................................................*........ + sqrdmulh v6.8H, v6.8H, v30.8H // ..............................................................................*......... + ldr q25, [x0, #144] // ..e..................................................................................... 
// gap // ........................................................................................ + mul v2.8H, v16.8H, v0.H[0] // ..................................................................*..................... + sqrdmulh v24.8H, v16.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ // gap // ........................................................................................ + mul v16.8H, v31.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ - sqrdmulh v23.8H, v28.8H, v30.8H // ...............................................................................*........ - mls v31.8H, v6.8H, v7.H[0] // .........................................................*.............................. - sub v12.8H, v17.8H, v16.8H // ...............................................................*........................ // gap // ........................................................................................ - add v22.8H, v17.8H, v16.8H // ................................................................*....................... + sqrdmulh v18.8H, v20.8H, v30.8H // .................................................................................*...... + mls v11.8H, v6.8H, v7.H[0] // ................................................................................*....... + mul v6.8H, v20.8H, v29.8H // ..................................................................................*..... // gap // ........................................................................................ // gap // ........................................................................................ 
+ mls v2.8H, v24.8H, v7.H[0] // ...................................................................*.................... + mls v15.8H, v22.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ - mul v16.8H, v5.8H, v0.H[4] // .............................................e.......................................... - add v6.8H, v18.8H, v25.8H // .......................................e................................................ - mul v19.8H, v22.8H, v29.8H // .................................................................................*...... - sqrdmulh v22.8H, v22.8H, v30.8H // ..................................................................................*..... // gap // ........................................................................................ + sub v22.8H, v25.8H, v23.8H // .............e.......................................................................... // gap // ........................................................................................ - sqrdmulh v17.8H, v24.8H, v30.8H // .........................................................................*.............. - mul v24.8H, v12.8H, v0.H[0] // .................................................................*...................... - str q31, [x0, #320] // .....................................................................*.................. // gap // ........................................................................................ - sqrdmulh v28.8H, v12.8H, v0.H[1] // ..................................................................*..................... + sqrdmulh v20.8H, v8.8H, v0.H[1] // ............................................................*........................... + str q11, [x0, #128] // ......................................................................................*. 
+ mls v6.8H, v18.8H, v7.H[0] // ...................................................................................*.... + mul v18.8H, v8.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ + sqrdmulh v24.8H, v31.8H, v30.8H // ........................................................................*............... + str q2, [x0, #448] // .......................................................................*................ + mul v31.8H, v22.8H, v1.H[0] // ................e....................................................................... // gap // ........................................................................................ - mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... - mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... - sqrdmulh v14.8H, v3.8H, v0.H[1] // .............................................................*.......................... + sub v4.8H, v9.8H, v12.8H // .....................................................*.................................. + sqrdmulh v12.8H, v22.8H, v1.H[1] // ...............e........................................................................ // gap // ........................................................................................ + ldr q21, [x0, #336] // .....e.................................................................................. + str q6, [x0, #192] // .......................................................................................* + sqrdmulh v6.8H, v3.8H, v0.H[1] // ..................................................*..................................... + mls v18.8H, v20.8H, v7.H[0] // ..............................................................*......................... 
// gap // ........................................................................................ + sqrdmulh v8.8H, v4.8H, v0.H[1] // .......................................................*................................ + mul v22.8H, v4.8H, v0.H[0] // ........................................................*............................... + ldr q13, [x0, #400] // ......e................................................................................. // gap // ........................................................................................ - mls v20.8H, v17.8H, v7.H[0] // ..........................................................................*............. + mls v16.8H, v24.8H, v7.H[0] // ..........................................................................*............. + str q15, [x0, #64] // .....................................................................................*.. + mls v31.8H, v12.8H, v7.H[0] // .................e...................................................................... // gap // ........................................................................................ - mul v12.8H, v3.8H, v0.H[0] // ............................................................*........................... + add v15.8H, v25.8H, v23.8H // ..............e......................................................................... + mls v19.8H, v6.8H, v7.H[0] // ....................................................*................................... + str q18, [x0, #384] // ......................................................................*................. // gap // ........................................................................................ - mls v4.8H, v8.8H, v7.H[0] // .............................................................................*.......... - mls v24.8H, v28.8H, v7.H[0] // ...................................................................*.................... 
- str q27, [x0, #128] // ......................................................................................*. - str q19, [x0, #192] // .......................................................................................* - sqrdmulh v22.8H, v5.8H, v0.H[5] // ..............................................e......................................... + mls v22.8H, v8.8H, v7.H[0] // .........................................................*.............................. // gap // ........................................................................................ - sub v19.8H, v18.8H, v25.8H // ......................................e................................................. - mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... - str q20, [x0], #(16) // ....................................................................................*... - sqrdmulh v20.8H, v11.8H, v0.H[7] // ...........e............................................................................ // gap // ........................................................................................ - str q24, [x0, #432] // .......................................................................*................ - add v31.8H, v9.8H, v26.8H // .............................e.......................................................... - sub v21.8H, v10.8H, v21.8H // .............e.......................................................................... + add v14.8H, v17.8H, v21.8H // ...................e.................................................................... + add v25.8H, v28.8H, v5.8H // .........e.............................................................................. + str q16, [x0], #(16) // ....................................................................................*... + sub v16.8H, v13.8H, v27.8H // .......................e................................................................ 
// gap // ........................................................................................ - mls v16.8H, v22.8H, v7.H[0] // ...............................................e........................................ - str q4, [x0, #48] // .....................................................................................*.. + add v3.8H, v13.8H, v27.8H // ........................e............................................................... + str q19, [x0, #240] // ....................................................................*................... + add v9.8H, v26.8H, v31.8H // ..................................e..................................................... // gap // ........................................................................................ - mul v5.8H, v19.8H, v0.H[4] // ........................................e............................................... - str q12, [x0, #368] // ......................................................................*................. - add v24.8H, v31.8H, v6.8H // .................................................e...................................... + str q22, [x0, #304] // .....................................................................*.................. + mul v20.8H, v16.8H, v1.H[4] // ..........................e............................................................. + sqrdmulh v16.8H, v16.8H, v1.H[5] // .........................e.............................................................. // gap // ........................................................................................ - mls v15.8H, v20.8H, v7.H[0] // ............e........................................................................... - // original source code - // ldr q8, [x0, #0] // ............e.........................................................................|.............e....................................................................... 
- // ldr q9, [x0, #(1*(512/8))] // ...............e......................................................................|................e.................................................................... - // ldr q10, [x0, #(2*(512/8))] // .....e................................................................................|......e.............................................................................. - // ldr q11, [x0, #(3*(512/8))] // ..................................e...................................................|...................................e................................................. - // ldr q12, [x0, #(4*(512/8))] // e.....................................................................................|.e................................................................................... - // ldr q13, [x0, #(5*(512/8))] // .e....................................................................................|..e.................................................................................. - // ldr q14, [x0, #(6*(512/8))] // ....e.................................................................................|.....e............................................................................... - // ldr q15, [x0, #(7*(512/8))] // .......e..............................................................................|........e............................................................................ - // sub v24.8h, v8.8h, v9.8h // ........................e.............................................................|.........................e........................................................... - // add v8.8h, v8.8h, v9.8h // ................................e.....................................................|.................................e................................................... 
- // mul v9.8h, v24.8h, v0.h[6] // ................................................e.....................................|.................................................e................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............................................................................e.........|.............................................................................e....... - // mls v9.8h, v24.8h, v7.h[0] // .....................................................................................e|..................................................................................... - // sub v24.8h, v10.8h, v11.8h // ...............................................................................e......|................................................................................e.... - // add v10.8h, v10.8h, v11.8h // ............................................e.........................................|.............................................e....................................... - // mul v11.8h, v24.8h, v1.h[0] // .....................*................................................................|......................*.............................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................*...............................................................|.......................*............................................................. - // mls v11.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*....................................................... - // sub v24.8h, v12.8h, v13.8h // .............e........................................................................|..............e...................................................................... 
- // add v12.8h, v12.8h, v13.8h // ................e.....................................................................|.................e................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ...................e..................................................................|....................e................................................................ - // sqrdmulh v24.8h, v24.8h, v1.h[3] // .................e....................................................................|..................e.................................................................. - // mls v13.8h, v24.8h, v7.h[0] // .................................e....................................................|..................................e.................................................. - // sub v24.8h, v14.8h, v15.8h // ..................e...................................................................|...................e................................................................. - // add v14.8h, v14.8h, v15.8h // .........................e............................................................|..........................e.......................................................... - // mul v15.8h, v24.8h, v1.h[4] // ..........................e...........................................................|...........................e......................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................e........................................................|..............................e...................................................... - // mls v15.8h, v24.8h, v7.h[0] // ......................................e...............................................|.......................................e............................................. 
- // sub v24.8h, v8.8h, v10.8h // ..*...................................................................................|...*................................................................................. - // add v8.8h, v8.8h, v10.8h // ..............................................................................e.......|...............................................................................e..... - // mul v10.8h, v24.8h, v0.h[2] // .........*............................................................................|..........*.......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........*...........................................................................|...........*......................................................................... - // mls v10.8h, v24.8h, v7.h[0] // .......................*..............................................................|........................*............................................................ - // sub v24.8h, v9.8h, v11.8h // ...................................*..................................................|....................................*................................................ - // add v9.8h, v9.8h, v11.8h // ....................................*.................................................|.....................................*............................................... - // mul v11.8h, v24.8h, v0.h[2] // .......................................*..............................................|........................................*............................................ - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................*............................................|..........................................*.......................................... 
- // mls v11.8h, v24.8h, v7.h[0] // ...............................................*......................................|................................................*.................................... - // sub v24.8h, v12.8h, v14.8h // .........................................................................e............|..........................................................................e.......... - // add v12.8h, v12.8h, v14.8h // ........................................................e.............................|.........................................................e........................... - // mul v14.8h, v24.8h, v0.h[4] // ..................................................................................e...|...................................................................................e. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................................................................................|*.................................................................................... - // mls v14.8h, v24.8h, v7.h[0] // ..............*.......................................................................|...............*..................................................................... - // sub v24.8h, v13.8h, v15.8h // ..................................................e...................................|...................................................e................................. - // add v13.8h, v13.8h, v15.8h // ...*..................................................................................|....*................................................................................ - // mul v15.8h, v24.8h, v0.h[4] // .......................................................e..............................|........................................................e............................ 
- // sqrdmulh v24.8h, v24.8h, v0.h[5] // ........................................................................e.............|.........................................................................e........... - // mls v15.8h, v24.8h, v7.h[0] // ................................................................................e.....|.................................................................................e... - // sub v24.8h, v8.8h, v12.8h // ......................................................................................*..................................................................................... - // add v8.8h, v8.8h, v12.8h // ....................................................................................e.|..................................................................................... - // mul v12.8h, v24.8h, v0.h[0] // ........*.............................................................................|.........*........................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........*..........................................................................|............*........................................................................ - // mls v12.8h, v24.8h, v7.h[0] // ....................*.................................................................|.....................*............................................................... - // sub v24.8h, v9.8h, v13.8h // ..........................................*...........................................|...........................................*......................................... - // add v9.8h, v9.8h, v13.8h // ........................................*.............................................|.........................................*........................................... 
- // mul v13.8h, v24.8h, v0.h[0] // ..............................................*.......................................|...............................................*..................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............................................*........................................|..............................................*...................................... - // mls v13.8h, v24.8h, v7.h[0] // ....................................................*.................................|.....................................................*............................... - // sub v24.8h, v10.8h, v14.8h // ..............................*.......................................................|...............................*..................................................... - // add v10.8h, v10.8h, v14.8h // ...............................*......................................................|................................*.................................................... - // mul v14.8h, v24.8h, v0.h[0] // ...................................................................*..................|....................................................................*................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................................*....................|..................................................................*.................. - // mls v14.8h, v24.8h, v7.h[0] // ..........................................................................*...........|...........................................................................*......... - // sub v24.8h, v11.8h, v15.8h // .....................................................*................................|......................................................*.............................. 
- // add v11.8h, v11.8h, v15.8h // ......................................................*...............................|.......................................................*............................. - // mul v15.8h, v24.8h, v0.h[0] // ............................................................*.........................|.............................................................*....................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.......................|...............................................................*..................... - // mls v15.8h, v24.8h, v7.h[0] // .....................................................................*................|......................................................................*.............. - // str q12, [x0, #(4*(512/8))] // ...........................*..........................................................|............................*........................................................ - // str q13, [x0, #(5*(512/8))] // .............................................................*........................|..............................................................*...................... - // str q14, [x0, #(6*(512/8))] // ...................................................................................*..|....................................................................................* - // str q15, [x0, #(7*(512/8))] // .............................................................................*........|..............................................................................*...... - // mul v12.8h, v8.8h, v29.8h // ......*...............................................................................|.......*............................................................................. 
- // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................................................*..........................|............................................................*........................ - // mls v12.8h, v8.8h, v7.h[0] // ..................................................................*...................|...................................................................*................. - // mul v13.8h, v9.8h, v29.8h // ...........................................*..........................................|............................................*........................................ - // sqrdmulh v9.8h, v9.8h, v30.8h // .................................................*....................................|..................................................*.................................. - // mls v13.8h, v9.8h, v7.h[0] // ....................................................................*.................|.....................................................................*............... - // mul v14.8h, v10.8h, v29.8h // .....................................*................................................|......................................*.............................................. - // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................*..................................|....................................................*................................ - // mls v14.8h, v10.8h, v7.h[0] // ...............................................................*......................|................................................................*.................... - // mul v15.8h, v11.8h, v29.8h // .........................................................*............................|..........................................................*.......................... 
- // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*...........................|...........................................................*......................... - // mls v15.8h, v11.8h, v7.h[0] // ................................................................*.....................|.................................................................*................... - // str q12, [x0], #(16) // ...........................................................................*..........|............................................................................*........ - // str q13, [x0, #(-16 + 1*(512/8))] // .................................................................................*....|..................................................................................*.. - // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................*...............|.......................................................................*............. - // str q15, [x0, #(-16 + 3*(512/8))] // .......................................................................*..............|........................................................................*............ + // ------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------------- + // ldr q8, [x0, #0] // .e....................................................................................'..~.................................................................................. 
+ // ldr q9, [x0, #(1*(512/8))] // e.....................................................................................'.~................................................................................... + // ldr q10, [x0, #(2*(512/8))] // ...........................................e..........................................'............................................~........................................ + // ldr q11, [x0, #(3*(512/8))] // ......e...............................................................................'.......~............................................................................. + // ldr q12, [x0, #(4*(512/8))] // .....e................................................................................'......~.............................................................................. + // ldr q13, [x0, #(5*(512/8))] // ..............................................................e.......................'...............................................................~..................... + // ldr q14, [x0, #(6*(512/8))] // ....................................................................e.................'.....................................................................~............... + // ldr q15, [x0, #(7*(512/8))] // ...e..................................................................................'....~................................................................................ + // sub v24.8h, v8.8h, v9.8h // ...........e..........................................................................'............~........................................................................ + // add v8.8h, v8.8h, v9.8h // .............................................................................e........'..............................................................................~...... 
+ // sqrdmulh v27.8h, v24.8h, v0.h[7] // ...............e......................................................................'................~.................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ..................e...................................................................'...................~................................................................. + // mls v9.8h, v27.8h, v7.h[0] // .......................e..............................................................'........................~............................................................ + // sub v24.8h, v10.8h, v11.8h // ....................................................e.................................'.....................................................~............................... + // add v10.8h, v10.8h, v11.8h // ........................................................................e.............'.........................................................................~........... + // sqrdmulh v27.8h, v24.8h, v1.h[1] // .............................................................e........................'..............................................................~...................... + // mul v11.8h, v24.8h, v1.h[0] // ...........................................................e..........................'............................................................~........................ + // mls v11.8h, v27.8h, v7.h[0] // .......................................................................e..............'........................................................................~............ + // sub v24.8h, v12.8h, v13.8h // ......................................................................................'*.................................................................................... 
+ // add v12.8h, v12.8h, v13.8h // ............................................................................e.........'.............................................................................~....... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ........~.............................................................................'.........*........................................................................... + // mul v13.8h, v24.8h, v1.h[2] // .......~..............................................................................'........*............................................................................ + // mls v13.8h, v27.8h, v7.h[0] // .............~........................................................................'..............*...................................................................... + // sub v24.8h, v14.8h, v15.8h // ...............................................................................e......'................................................................................~.... + // add v14.8h, v14.8h, v15.8h // ................................................................................e.....'.................................................................................~... + // sqrdmulh v27.8h, v24.8h, v1.h[5] // .....................................................................................e'..................................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ....................................................................................e.'..................................................................................... + // mls v15.8h, v27.8h, v7.h[0] // ................~.....................................................................'.................*................................................................... 
+ // sub v24.8h, v8.8h, v10.8h // ..~...................................................................................'...*................................................................................. + // add v8.8h, v8.8h, v10.8h // .....................~................................................................'......................*.............................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ............~.........................................................................'.............*....................................................................... + // mul v10.8h, v24.8h, v0.h[2] // ..............~.......................................................................'...............*..................................................................... + // mls v10.8h, v27.8h, v7.h[0] // ................................~.....................................................'.................................*................................................... + // sub v24.8h, v9.8h, v11.8h // ......................................................................................*..................................................................................... + // add v9.8h, v9.8h, v11.8h // ..................................................................................e...'...................................................................................~. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .........~............................................................................'..........*.......................................................................... + // mul v11.8h, v24.8h, v0.h[2] // ..........~...........................................................................'...........*......................................................................... 
+ // mls v11.8h, v27.8h, v7.h[0] // .................~....................................................................'..................*.................................................................. + // sub v24.8h, v12.8h, v14.8h // ....~.................................................................................'.....*............................................................................... + // add v12.8h, v12.8h, v14.8h // ....................~.................................................................'.....................*............................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...................~..................................................................'....................*................................................................ + // mul v14.8h, v24.8h, v0.h[4] // ........................~.............................................................'.........................*........................................................... + // mls v14.8h, v27.8h, v7.h[0] // .............................~........................................................'..............................*...................................................... + // sub v24.8h, v13.8h, v15.8h // ......................~...............................................................'.......................*............................................................. + // add v13.8h, v13.8h, v15.8h // ..........................~...........................................................'...........................*......................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...........................~..........................................................'............................*........................................................ 
+ // mul v15.8h, v24.8h, v0.h[4] // .........................~............................................................'..........................*.......................................................... + // mls v15.8h, v27.8h, v7.h[0] // ..................................~...................................................'...................................*................................................. + // sub v24.8h, v8.8h, v12.8h // ............................~.........................................................'.............................*....................................................... + // add v8.8h, v8.8h, v12.8h // ....................................~.................................................'.....................................*............................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ................................................................~.....................'.................................................................*................... + // mul v12.8h, v24.8h, v0.h[0] // ...............................~......................................................'................................*.................................................... + // mls v12.8h, v27.8h, v7.h[0] // .........................................................................~............'..........................................................................*.......... + // sub v24.8h, v9.8h, v13.8h // ............................................................~.........................'.............................................................*....................... + // add v9.8h, v9.8h, v13.8h // ..............................~.......................................................'...............................*..................................................... 
+ // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..................................................................~...................'...................................................................*................. + // mul v13.8h, v24.8h, v0.h[0] // ...................................................................~..................'....................................................................*................ + // mls v13.8h, v27.8h, v7.h[0] // ...........................................................................~..........'............................................................................*........ + // sub v24.8h, v10.8h, v14.8h // ......................................~...............................................'.......................................*............................................. + // add v10.8h, v10.8h, v14.8h // .....................................~................................................'......................................*.............................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .....................................................~................................'......................................................*.............................. + // mul v14.8h, v24.8h, v0.h[0] // ........................................................~.............................'.........................................................*........................... + // mls v14.8h, v27.8h, v7.h[0] // .................................................................~....................'..................................................................*.................. + // sub v24.8h, v11.8h, v15.8h // .......................................~..............................................'........................................*............................................ 
+ // add v11.8h, v11.8h, v15.8h // ........................................~.............................................'.........................................*........................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .............................................~........................................'..............................................*...................................... + // mul v15.8h, v24.8h, v0.h[0] // ............................................~.........................................'.............................................*....................................... + // mls v15.8h, v27.8h, v7.h[0] // ..................................................~...................................'...................................................*................................. + // str q12, [x0, #(4*(512/8))] // .................................................................................~....'..................................................................................*.. + // str q13, [x0, #(5*(512/8))] // ...................................................................................~..'....................................................................................* + // str q14, [x0, #(6*(512/8))] // ..........................................................................~...........'...........................................................................*......... + // str q15, [x0, #(7*(512/8))] // ..........................................................~...........................'...........................................................*......................... + // sqrdmulh v27.8h, v8.8h, v30.8h // .........................................................~............................'..........................................................*.......................... 
+ // mul v8.8h, v8.8h, v29.8h // ..............................................~.......................................'...............................................*..................................... + // mls v8.8h, v27.8h, v7.h[0] // .....................................................................~................'......................................................................*.............. + // sqrdmulh v27.8h, v9.8h, v30.8h // ...................................~..................................................'....................................*................................................ + // mul v9.8h, v9.8h, v29.8h // .................................~....................................................'..................................*.................................................. + // mls v9.8h, v27.8h, v7.h[0] // ...................................................~..................................'....................................................*................................ + // sqrdmulh v27.8h, v10.8h, v30.8h // ..........................................~...........................................'...........................................*......................................... + // mul v10.8h, v10.8h, v29.8h // .........................................~............................................'..........................................*.......................................... + // mls v10.8h, v27.8h, v7.h[0] // ................................................~.....................................'.................................................*................................... + // sqrdmulh v27.8h, v11.8h, v30.8h // ...............................................~......................................'................................................*.................................... 
+ // mul v11.8h, v11.8h, v29.8h // .................................................~....................................'..................................................*.................................. + // mls v11.8h, v27.8h, v7.h[0] // .......................................................~..............................'........................................................*............................ + // str q8, [x0], #(16) // ..............................................................................~.......'...............................................................................*..... + // str q9, [x0, #(-16 + 1*(512/8))] // ......................................................................~...............'.......................................................................*............. + // str q10, [x0, #(-16 + 2*(512/8))] // ......................................................~...............................'.......................................................*............................. + // str q11, [x0, #(-16 + 3*(512/8))] // ...............................................................~......................'................................................................*.................... sub count, count, #1 cbnz count, layer123_start - mul v25.8H, v21.8H, v1.H[0] // ...........*.......................................... - sqrdmulh v3.8H, v21.8H, v1.H[1] // ............*......................................... - // gap // ...................................................... - // gap // ...................................................... - sub v12.8H, v31.8H, v6.8H // *..................................................... - sub v23.8H, v9.8H, v26.8H // ..*................................................... - // gap // ...................................................... - // gap // ...................................................... 
- mul v22.8H, v24.8H, v29.8H // ....*................................................. - sqrdmulh v28.8H, v19.8H, v0.H[5] // .*.................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v25.8H, v3.8H, v7.H[0] // ...............*...................................... - sqrdmulh v3.8H, v12.8H, v0.H[1] // ........*............................................. - // gap // ...................................................... - // gap // ...................................................... - mul v11.8H, v23.8H, v0.H[2] // ......*............................................... - sqrdmulh v4.8H, v23.8H, v0.H[3] // .......*.............................................. - // gap // ...................................................... - // gap // ...................................................... - add v27.8H, v13.8H, v2.8H // ...*.................................................. - sqrdmulh v18.8H, v24.8H, v30.8H // ....................................*................. - // gap // ...................................................... - // gap // ...................................................... - sub v10.8H, v15.8H, v25.8H // ..................*................................... - add v25.8H, v15.8H, v25.8H // ...................*.................................. - // gap // ...................................................... - // gap // ...................................................... - mls v11.8H, v4.8H, v7.H[0] // .............*........................................ - mls v5.8H, v28.8H, v7.H[0] // .........*............................................ - // gap // ...................................................... - // gap // ...................................................... - sub v21.8H, v25.8H, v27.8H // ........................*............................. 
- mls v22.8H, v18.8H, v7.H[0] // ...........................................*.......... - // gap // ...................................................... - // gap // ...................................................... - mul v13.8H, v10.8H, v0.H[2] // .....................*................................ - sqrdmulh v14.8H, v10.8H, v0.H[3] // .......................*.............................. - // gap // ...................................................... - // gap // ...................................................... - sub v9.8H, v11.8H, v5.8H // ................*..................................... - mul v24.8H, v21.8H, v0.H[0] // ...........................*.......................... - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v19.8H, v21.8H, v0.H[1] // ..........................*........................... - add v17.8H, v11.8H, v5.8H // .................*.................................... - str q22, [x0], #(16) // ..................................................*... - // gap // ...................................................... - mls v13.8H, v14.8H, v7.H[0] // ............................*......................... - mul v26.8H, v9.8H, v0.H[0] // ............................................*......... - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v4.8H, v9.8H, v0.H[1] // ..........................................*........... - mul v6.8H, v17.8H, v29.8H // ....................*................................. - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v5.8H, v17.8H, v30.8H // ..............................*....................... - mul v31.8H, v12.8H, v0.H[0] // .....*................................................ 
- // gap // ...................................................... - // gap // ...................................................... - add v11.8H, v13.8H, v16.8H // .................................*.................... - add v28.8H, v25.8H, v27.8H // ......................*............................... - // gap // ...................................................... - // gap // ...................................................... - sub v16.8H, v13.8H, v16.8H // ................................*..................... - mls v26.8H, v4.8H, v7.H[0] // .................................................*.... - // gap // ...................................................... - // gap // ...................................................... - mul v18.8H, v11.8H, v29.8H // ..................................*................... - sqrdmulh v10.8H, v11.8H, v30.8H // ...................................*.................. - // gap // ...................................................... - // gap // ...................................................... - mul v2.8H, v16.8H, v0.H[0] // .....................................*................ - sqrdmulh v11.8H, v16.8H, v0.H[1] // .......................................*.............. - // gap // ...................................................... - // gap // ...................................................... - str q26, [x0, #368] // .....................................................* - mls v31.8H, v3.8H, v7.H[0] // ..........*........................................... - sqrdmulh v8.8H, v28.8H, v30.8H // .............................*........................ - // gap // ...................................................... - mls v18.8H, v10.8H, v7.H[0] // .........................................*............ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- mul v20.8H, v28.8H, v29.8H // .........................*............................ - mls v2.8H, v11.8H, v7.H[0] // ..............................................*....... - // gap // ...................................................... - // gap // ...................................................... - mls v6.8H, v5.8H, v7.H[0] // ........................................*............. - str q31, [x0, #240] // ..............*....................................... - // gap // ...................................................... - // gap // ...................................................... - str q18, [x0, #176] // ................................................*..... - mls v24.8H, v19.8H, v7.H[0] // ...............................*...................... - // gap // ...................................................... - // gap // ...................................................... - mls v20.8H, v8.8H, v7.H[0] // .............................................*........ - str q2, [x0, #432] // ...................................................*.. - // gap // ...................................................... - // gap // ...................................................... - str q6, [x0, #112] // ...............................................*...... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - str q24, [x0, #304] // ......................................*............... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - str q20, [x0, #48] // ....................................................*. - // gap // ...................................................... - // gap // ...................................................... 
- // gap // ...................................................... - - // original source code - // sub v27.8H, v31.8H, v6.8H // ..*................................................... - // sqrdmulh v22.8H, v19.8H, v0.H[5] // .....*................................................ - // sub v19.8H, v9.8H, v26.8H // ...*.................................................. - // add v26.8H, v13.8H, v2.8H // ..........*........................................... - // mul v20.8H, v24.8H, v29.8H // ....*................................................. - // mul v31.8H, v27.8H, v0.H[0] // ..............................*....................... - // mul v8.8H, v19.8H, v0.H[2] // ........*............................................. - // sqrdmulh v14.8H, v19.8H, v0.H[3] // .........*............................................ - // sqrdmulh v27.8H, v27.8H, v0.H[1] // .......*.............................................. - // mls v5.8H, v22.8H, v7.H[0] // ...............*...................................... - // mls v31.8H, v27.8H, v7.H[0] // ........................................*............. - // mul v27.8H, v21.8H, v1.H[0] // *..................................................... - // sqrdmulh v17.8H, v21.8H, v1.H[1] // .*.................................................... - // mls v8.8H, v14.8H, v7.H[0] // ..............*....................................... - // str q31, [x0, #256] // ..............................................*....... - // mls v27.8H, v17.8H, v7.H[0] // ......*............................................... - // sub v3.8H, v8.8H, v5.8H // ....................*................................. - // add v28.8H, v8.8H, v5.8H // .......................*.............................. - // sub v12.8H, v15.8H, v27.8H // ............*......................................... - // add v23.8H, v15.8H, v27.8H // .............*........................................ - // mul v27.8H, v28.8H, v29.8H // ............................*......................... 
- // mul v17.8H, v12.8H, v0.H[2] // ..................*................................... - // add v19.8H, v23.8H, v26.8H // ................................*..................... - // sqrdmulh v22.8H, v12.8H, v0.H[3] // ...................*.................................. - // sub v31.8H, v23.8H, v26.8H // ................*..................................... - // mul v4.8H, v19.8H, v29.8H // ...........................................*.......... - // sqrdmulh v6.8H, v31.8H, v0.H[1] // ......................*............................... - // mul v31.8H, v31.8H, v0.H[0] // .....................*................................ - // mls v17.8H, v22.8H, v7.H[0] // .........................*............................ - // sqrdmulh v8.8H, v19.8H, v30.8H // .........................................*............ - // sqrdmulh v23.8H, v28.8H, v30.8H // .............................*........................ - // mls v31.8H, v6.8H, v7.H[0] // ................................................*..... - // sub v12.8H, v17.8H, v16.8H // .................................*.................... - // add v22.8H, v17.8H, v16.8H // ...............................*...................... - // mul v19.8H, v22.8H, v29.8H // ...................................*.................. - // sqrdmulh v22.8H, v22.8H, v30.8H // ....................................*................. - // sqrdmulh v17.8H, v24.8H, v30.8H // ...........*.......................................... - // mul v24.8H, v12.8H, v0.H[0] // .....................................*................ - // str q31, [x0, #320] // ....................................................*. - // sqrdmulh v28.8H, v12.8H, v0.H[1] // ......................................*............... - // mls v27.8H, v23.8H, v7.H[0] // .............................................*........ - // mls v19.8H, v22.8H, v7.H[0] // ..........................................*........... 
- // sqrdmulh v14.8H, v3.8H, v0.H[1] // ...........................*.......................... - // mls v20.8H, v17.8H, v7.H[0] // .................*.................................... - // mul v12.8H, v3.8H, v0.H[0] // ..........................*........................... - // mls v4.8H, v8.8H, v7.H[0] // .................................................*.... - // mls v24.8H, v28.8H, v7.H[0] // ............................................*......... - // str q27, [x0, #128] // ...................................................*.. - // str q19, [x0, #192] // ...............................................*...... - // mls v12.8H, v14.8H, v7.H[0] // ..................................*................... - // str q20, [x0], #(16) // ........................*............................. - // str q24, [x0, #432] // ..................................................*... - // str q4, [x0, #48] // .....................................................* - // str q12, [x0, #368] // .......................................*.............. + // Instructions: 64 + // Expected cycles: 33 + // Expected IPC: 1.94 + // + // Cycle bound: 31.0 + // IPC bound: 2.06 + // + // Wall time: 3600.21s + // User time: 3600.21s + // + // ---------------------- original position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + sub v10.8H, v26.8H, v31.8H // *............................................................... + sub v23.8H, v17.8H, v21.8H // .*.............................................................. + // gap // ................................................................ + // gap // ................................................................ + sub v6.8H, v25.8H, v15.8H // ..*............................................................. + sub v2.8H, v14.8H, v3.8H // ...*............................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + mul v11.8H, v23.8H, v1.H[2] // ....*........................................................... + sqrdmulh v13.8H, v23.8H, v1.H[3] // .....*.......................................................... + // gap // ................................................................ + // gap // ................................................................ + mul v17.8H, v10.8H, v0.H[2] // .......*........................................................ + sqrdmulh v8.8H, v10.8H, v0.H[3] // ......*......................................................... + // gap // ................................................................ + // gap // ................................................................ + mls v20.8H, v16.8H, v7.H[0] // ...........*.................................................... + sqrdmulh v24.8H, v6.8H, v0.H[3] // ........*....................................................... + // gap // ................................................................ + // gap // ................................................................ + mls v11.8H, v13.8H, v7.H[0] // .........*...................................................... + mul v16.8H, v2.8H, v0.H[4] // .................*.............................................. + // gap // ................................................................ + // gap // ................................................................ + mls v17.8H, v8.8H, v7.H[0] // ............*................................................... + sqrdmulh v10.8H, v2.8H, v0.H[5] // .............*.................................................. + // gap // ................................................................ + // gap // ................................................................ + mul v22.8H, v6.8H, v0.H[2] // ..........*..................................................... 
+ add v19.8H, v25.8H, v15.8H // ...............*................................................ + // gap // ................................................................ + // gap // ................................................................ + add v18.8H, v11.8H, v20.8H // ...................*............................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v16.8H, v10.8H, v7.H[0] // ......................*......................................... + sub v20.8H, v11.8H, v20.8H // ................*............................................... + // gap // ................................................................ + // gap // ................................................................ + sub v6.8H, v9.8H, v18.8H // ..................................................*............. + add v8.8H, v9.8H, v18.8H // .......................*........................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v9.8H, v20.8H, v0.H[5] // ....................*........................................... + mls v22.8H, v24.8H, v7.H[0] // .........................*...................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v13.8H, v6.8H, v0.H[1] // ......................................................*......... + mul v6.8H, v6.8H, v0.H[0] // .......................................................*........ + // gap // ................................................................ + // gap // ................................................................ 
+ mul v26.8H, v8.8H, v29.8H // ..........................*..................................... + sqrdmulh v27.8H, v8.8H, v30.8H // ............................*................................... + // gap // ................................................................ + // gap // ................................................................ + sub v2.8H, v22.8H, v16.8H // ...............................*................................ + mul v18.8H, v20.8H, v0.H[4] // ..................*............................................. + // gap // ................................................................ + // gap // ................................................................ + mls v6.8H, v13.8H, v7.H[0] // ............................................................*... + add v21.8H, v14.8H, v3.8H // ..............*................................................. + // gap // ................................................................ + // gap // ................................................................ + mls v26.8H, v27.8H, v7.H[0] // ...........................................*.................... + add v3.8H, v22.8H, v16.8H // ..............................*................................. + // gap // ................................................................ + // gap // ................................................................ + mls v18.8H, v9.8H, v7.H[0] // ...........................*.................................... + add v28.8H, v19.8H, v21.8H // .............................*.................................. + // gap // ................................................................ + // gap // ................................................................ + sub v24.8H, v19.8H, v21.8H // .....................*.......................................... 
+ str q6, [x0, #320] // ...............................................................* + mul v13.8H, v3.8H, v29.8H // ..................................*............................. + // gap // ................................................................ + str q26, [x0, #64] // .........................................................*...... + sqrdmulh v31.8H, v28.8H, v30.8H // ................................................*............... + mul v20.8H, v28.8H, v29.8H // ......................................*......................... + // gap // ................................................................ + add v28.8H, v17.8H, v18.8H // .................................*.............................. + mul v19.8H, v24.8H, v0.H[0] // ........................*....................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v16.8H, v3.8H, v30.8H // ...................................*............................ + sqrdmulh v14.8H, v24.8H, v0.H[1] // ....................................................*........... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v6.8H, v28.8H, v30.8H // .......................................*........................ + sub v12.8H, v17.8H, v18.8H // ................................*............................... + // gap // ................................................................ + // gap // ................................................................ + mul v23.8H, v28.8H, v29.8H // .........................................*...................... + mls v20.8H, v31.8H, v7.H[0] // ........................................................*....... + // gap // ................................................................ 
+ // gap // ................................................................ + mls v19.8H, v14.8H, v7.H[0] // ..........................................................*..... + mul v5.8H, v12.8H, v0.H[0] // ....................................*........................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v18.8H, v2.8H, v0.H[1] // ............................................*................... + sqrdmulh v8.8H, v12.8H, v0.H[1] // .....................................*.......................... + // gap // ................................................................ + // gap // ................................................................ + mls v23.8H, v6.8H, v7.H[0] // ..............................................*................. + mul v2.8H, v2.8H, v0.H[0] // ...............................................*................ + str q20, [x0], #(16) // .............................................................*.. + // gap // ................................................................ + mls v13.8H, v16.8H, v7.H[0] // ........................................*....................... + str q19, [x0, #240] // ..............................................................*. + // gap // ................................................................ + // gap // ................................................................ + mls v5.8H, v8.8H, v7.H[0] // ..........................................*..................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v2.8H, v18.8H, v7.H[0] // .....................................................*.......... 
+ str q23, [x0, #176] // ...................................................*............ + // gap // ................................................................ + // gap // ................................................................ + str q13, [x0, #112] // .............................................*.................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q5, [x0, #432] // .................................................*.............. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q2, [x0, #368] // ...........................................................*.... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + + // ------------------------ new position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + // sub v6.8H, v26.8H, v31.8H // *............................................................... + // sub v24.8H, v17.8H, v21.8H // .*.............................................................. + // sub v8.8H, v25.8H, v15.8H // ..*............................................................. + // sub v10.8H, v14.8H, v3.8H // ...*............................................................ + // mul v12.8H, v24.8H, v1.H[2] // ....*........................................................... + // sqrdmulh v19.8H, v24.8H, v1.H[3] // .....*.......................................................... 
+ // sqrdmulh v18.8H, v6.8H, v0.H[3] // .......*........................................................ + // mul v24.8H, v6.8H, v0.H[2] // ......*......................................................... + // sqrdmulh v22.8H, v8.8H, v0.H[3] // .........*...................................................... + // mls v12.8H, v19.8H, v7.H[0] // ..........*..................................................... + // mul v8.8H, v8.8H, v0.H[2] // ..............*................................................. + // mls v20.8H, v16.8H, v7.H[0] // ........*....................................................... + // mls v24.8H, v18.8H, v7.H[0] // ............*................................................... + // sqrdmulh v19.8H, v10.8H, v0.H[5] // .............*.................................................. + // add v18.8H, v14.8H, v3.8H // ..............................*................................. + // add v14.8H, v25.8H, v15.8H // ...............*................................................ + // sub v13.8H, v12.8H, v20.8H // ..................*............................................. + // mul v21.8H, v10.8H, v0.H[4] // ...........*.................................................... + // mul v25.8H, v13.8H, v0.H[4] // ............................*................................... + // add v12.8H, v12.8H, v20.8H // ................*............................................... + // sqrdmulh v16.8H, v13.8H, v0.H[5] // .....................*.......................................... + // sub v3.8H, v14.8H, v18.8H // ...................................*............................ + // mls v21.8H, v19.8H, v7.H[0] // .................*.............................................. + // add v6.8H, v9.8H, v12.8H // ....................*........................................... + // mul v19.8H, v3.8H, v0.H[0] // ..........................................*..................... 
+ // mls v8.8H, v22.8H, v7.H[0] // ......................*......................................... + // mul v15.8H, v6.8H, v29.8H // .........................*...................................... + // mls v25.8H, v16.8H, v7.H[0] // .................................*.............................. + // sqrdmulh v22.8H, v6.8H, v30.8H // ..........................*..................................... + // add v31.8H, v14.8H, v18.8H // ..................................*............................. + // add v6.8H, v8.8H, v21.8H // ................................*............................... + // sub v8.8H, v8.8H, v21.8H // ...........................*.................................... + // sub v16.8H, v24.8H, v25.8H // ..............................................*................. + // add v20.8H, v24.8H, v25.8H // .........................................*...................... + // mul v11.8H, v6.8H, v29.8H // .....................................*.......................... + // sqrdmulh v6.8H, v6.8H, v30.8H // ...........................................*.................... + // mul v2.8H, v16.8H, v0.H[0] // ..................................................*............. + // sqrdmulh v24.8H, v16.8H, v0.H[1] // ....................................................*........... + // mul v16.8H, v31.8H, v29.8H // ........................................*....................... + // sqrdmulh v18.8H, v20.8H, v30.8H // .............................................*.................. + // mls v11.8H, v6.8H, v7.H[0] // ........................................................*....... + // mul v6.8H, v20.8H, v29.8H // ...............................................*................ + // mls v2.8H, v24.8H, v7.H[0] // ..........................................................*..... + // mls v15.8H, v22.8H, v7.H[0] // ...............................*................................ 
+ // sqrdmulh v20.8H, v8.8H, v0.H[1] // ...................................................*............ + // str q11, [x0, #128] // .............................................................*.. + // mls v6.8H, v18.8H, v7.H[0] // .....................................................*.......... + // mul v18.8H, v8.8H, v0.H[0] // ......................................................*......... + // sqrdmulh v24.8H, v31.8H, v30.8H // .......................................*........................ + // str q2, [x0, #448] // ..............................................................*. + // sub v4.8H, v9.8H, v12.8H // ...................*............................................ + // str q6, [x0, #192] // ............................................................*... + // sqrdmulh v6.8H, v3.8H, v0.H[1] // ............................................*................... + // mls v18.8H, v20.8H, v7.H[0] // ...........................................................*.... + // sqrdmulh v8.8H, v4.8H, v0.H[1] // .......................*........................................ + // mul v22.8H, v4.8H, v0.H[0] // ........................*....................................... + // mls v16.8H, v24.8H, v7.H[0] // ................................................*............... + // str q15, [x0, #64] // ......................................*......................... + // mls v19.8H, v6.8H, v7.H[0] // .................................................*.............. + // str q18, [x0, #384] // ...............................................................* + // mls v22.8H, v8.8H, v7.H[0] // .............................*.................................. + // str q16, [x0], #(16) // .......................................................*........ + // str q19, [x0, #240] // .........................................................*...... + // str q22, [x0, #304] // ....................................*........................... 
pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s index a524ab50..1d063019 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - 
ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from 
PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,570 +339,615 @@ _intt_kyber_123_4567_opt_a55: mov count, #8 .p2align 2 - ldr q23, [x1, #0] // *.......... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q27, [x1, #16] // .*......... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q3, [x1, #32] // ..*........ - // gap // ........... - // gap // ........... - // gap // ........... - ldr q28, [x1, #48] // ...*....... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q29, [x4], #(6*16) // .....*..... - // gap // ........... - // gap // ........... - // gap // ........... - trn1 v26.4S, v3.4S, v28.4S // ....*...... - // gap // ........... - ldr q20, [x4, #-80] // ......*.... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q24, [x4, #-64] // .......*... - // gap // ........... - // gap // ........... - // gap // ........... - ldr q11, [x4, #-48] // ........*.. - // gap // ........... - // gap // ........... - // gap // ........... - ldr q14, [x4, #-32] // .........*. - // gap // ........... - // gap // ........... - // gap // ........... - ldr q25, [x4, #-16] // ..........* - // gap // ........... - - // original source code - // ldr q23, [x1, #0] // *.......... - // ldr q27, [x1, #16] // .*......... - // ldr q3, [x1, #32] // ..*........ - // ldr q28, [x1, #48] // ...*....... - // trn1 v26.4S, v3.4S, v28.4S // .....*..... - // ldr q29, [x4], #(6*16) // ....*...... - // ldr q20, [x4, #-80] // ......*.... - // ldr q24, [x4, #-64] // .......*... - // ldr q11, [x4, #-48] // ........*.. - // ldr q14, [x4, #-32] // .........*. 
- // ldr q25, [x4, #-16] // ..........* + // Instructions: 8 + // Expected cycles: 15 + // Expected IPC: 0.53 + // + // Cycle bound: 15.0 + // IPC bound: 0.53 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q11, [x1, #16] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q27, [x1, #32] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x1, #48] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q31, [x4], #(6*16) // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q3, [x4, #-32] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x4, #-48] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x4, #-80] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q0, [x4, #-64] // .......*...................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q11, [x1, #16] // *.............................. + // ldr q27, [x1, #32] // .*............................. + // ldr q13, [x1, #48] // ..*............................ 
+ // ldr q31, [x4], #(6*16) // ...*........................... + // ldr q3, [x4, #-32] // ....*.......................... + // ldr q22, [x4, #-48] // .....*......................... + // ldr q25, [x4, #-80] // ......*........................ + // ldr q0, [x4, #-64] // .......*....................... sub count, count, #1 layer4567_start: - trn1 v19.4S, v23.4S, v27.4S // ....*.............................................................................. + // Instructions: 83 + // Expected cycles: 94 + // Expected IPC: 0.88 + // + // Cycle bound: 94.0 + // IPC bound: 0.88 + // + // Wall time: 8.59s + // User time: 8.59s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + ldr q16, [x1, #0] // *.................................................................................. // gap // ................................................................................... - trn2 v23.4S, v23.4S, v27.4S // .....*............................................................................. // gap // ................................................................................... - trn2 v22.4S, v3.4S, v28.4S // .......*........................................................................... // gap // ................................................................................... - trn2 v28.2D, v19.2D, v26.2D // ........*.......................................................................... + trn1 v15.4S, v27.4S, v13.4S // ......*............................................................................ // gap // ................................................................................... - trn1 v19.2D, v19.2D, v26.2D // ..........*........................................................................ 
+ trn2 v27.4S, v27.4S, v13.4S // .......*........................................................................... // gap // ................................................................................... - trn2 v0.2D, v23.2D, v22.2D // .........*......................................................................... + trn1 v14.4S, v16.4S, v11.4S // ....*.............................................................................. // gap // ................................................................................... - trn1 v23.2D, v23.2D, v22.2D // ...........*....................................................................... + trn2 v16.4S, v16.4S, v11.4S // .....*............................................................................. // gap // ................................................................................... - sub v22.8H, v28.8H, v0.8H // .......................*........................................................... + ldr q2, [x4, #-16] // .................*................................................................. // gap // ................................................................................... - add v28.8H, v28.8H, v0.8H // ........................*.......................................................... // gap // ................................................................................... - sub v0.8H, v19.8H, v23.8H // ..................*................................................................ // gap // ................................................................................... - add v19.8H, v19.8H, v23.8H // ...................*............................................................... + trn2 v6.2D, v14.2D, v15.2D // ........*.......................................................................... // gap // ................................................................................... 
- mul v23.8H, v22.8H, v14.8H // .........................*......................................................... + trn1 v15.2D, v14.2D, v15.2D // ..........*........................................................................ // gap // ................................................................................... - mul v27.8H, v0.8H, v24.8H // ....................*.............................................................. + trn2 v14.2D, v16.2D, v27.2D // .........*......................................................................... // gap // ................................................................................... - sqrdmulh v0.8H, v0.8H, v11.8H // .....................*............................................................. + trn1 v16.2D, v16.2D, v27.2D // ...........*....................................................................... // gap // ................................................................................... - sqrdmulh v22.8H, v22.8H, v25.8H // ..........................*........................................................ + sub v27.8H, v6.8H, v14.8H // .......................*........................................................... // gap // ................................................................................... - sub v24.8H, v19.8H, v28.8H // ............................*...................................................... + sub v26.8H, v15.8H, v16.8H // ..................*................................................................ // gap // ................................................................................... - add v19.8H, v19.8H, v28.8H // .............................*..................................................... + add v15.8H, v15.8H, v16.8H // ...................*............................................................... // gap // ................................................................................... 
- mls v27.8H, v0.8H, v7.H[0] // ......................*............................................................ + add v16.8H, v6.8H, v14.8H // ........................*.......................................................... // gap // ................................................................................... - mls v23.8H, v22.8H, v7.H[0] // ...........................*....................................................... + sqrdmulh v14.8H, v26.8H, v22.8H // ....................*.............................................................. // gap // ................................................................................... - mul v22.8H, v24.8H, v29.8H // ..............................*.................................................... + mul v6.8H, v26.8H, v0.8H // .....................*............................................................. // gap // ................................................................................... - sqrdmulh v28.8H, v24.8H, v20.8H // ...............................*................................................... + sqrdmulh v26.8H, v27.8H, v2.8H // .........................*......................................................... // gap // ................................................................................... - ldr q0, [x3], #16 // ..............................................*.................................... + mul v27.8H, v27.8H, v3.8H // ..........................*........................................................ // gap // ................................................................................... + sub v0.8H, v15.8H, v16.8H // ............................*...................................................... // gap // ................................................................................... + mls v6.8H, v14.8H, v7.H[0] // ......................*............................................................ 
// gap // ................................................................................... - sub v24.8H, v27.8H, v23.8H // .................................*................................................. + add v15.8H, v15.8H, v16.8H // .............................*..................................................... // gap // ................................................................................... - mls v22.8H, v28.8H, v7.H[0] // ................................*.................................................. + mls v27.8H, v26.8H, v7.H[0] // ...........................*....................................................... // gap // ................................................................................... - add v23.8H, v27.8H, v23.8H // ..................................*................................................ + sqrdmulh v16.8H, v0.8H, v25.8H // ..............................*.................................................... // gap // ................................................................................... - mul v28.8H, v24.8H, v29.8H // ...................................*............................................... + mul v14.8H, v0.8H, v31.8H // ...............................*................................................... // gap // ................................................................................... - sqrdmulh v27.8H, v24.8H, v20.8H // ....................................*.............................................. + ldr q0, [x3], #16 // ..............................................*.................................... // gap // ................................................................................... - trn1 v24.4S, v19.4S, v23.4S // ......................................*............................................ // gap // ................................................................................... 
- trn2 v19.4S, v19.4S, v23.4S // .......................................*........................................... // gap // ................................................................................... - ldr q23, [x1, #64] // e.................................................................................. + sub v26.8H, v6.8H, v27.8H // .................................*................................................. // gap // ................................................................................... + mls v14.8H, v16.8H, v7.H[0] // ................................*.................................................. // gap // ................................................................................... + add v16.8H, v6.8H, v27.8H // ..................................*................................................ // gap // ................................................................................... - mls v28.8H, v27.8H, v7.H[0] // .....................................*............................................. + sqrdmulh v27.8H, v26.8H, v25.8H // ...................................*............................................... // gap // ................................................................................... - ldr q27, [x1, #80] // .e................................................................................. + mul v6.8H, v26.8H, v31.8H // ....................................*.............................................. // gap // ................................................................................... + trn1 v26.4S, v15.4S, v16.4S // ......................................*............................................ // gap // ................................................................................... + trn2 v15.4S, v15.4S, v16.4S // .......................................*........................................... 
// gap // ................................................................................... - ldr q3, [x1, #96] // ..e................................................................................ + ldr q11, [x1, #80] // .e................................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - trn1 v26.4S, v22.4S, v28.4S // ........................................*.......................................... + mls v6.8H, v27.8H, v7.H[0] // .....................................*............................................. // gap // ................................................................................... - trn2 v22.4S, v22.4S, v28.4S // .........................................*......................................... + ldr q27, [x1, #96] // ..e................................................................................ // gap // ................................................................................... - ldr q28, [x1, #112] // ...e............................................................................... // gap // ................................................................................... // gap // ................................................................................... + ldr q13, [x1, #112] // ...e............................................................................... // gap // ................................................................................... - trn2 v29.2D, v24.2D, v26.2D // ..........................................*........................................ // gap // ................................................................................... 
- trn2 v20.2D, v19.2D, v22.2D // ...........................................*....................................... // gap // ................................................................................... - trn1 v24.2D, v24.2D, v26.2D // ............................................*...................................... + trn1 v16.4S, v14.4S, v6.4S // ........................................*.......................................... // gap // ................................................................................... - trn1 v19.2D, v19.2D, v22.2D // .............................................*..................................... + trn2 v14.4S, v14.4S, v6.4S // .........................................*......................................... // gap // ................................................................................... - sub v22.8H, v29.8H, v20.8H // ....................................................*.............................. + ldr q31, [x4], #(6*16) // ............e...................................................................... // gap // ................................................................................... - sub v26.8H, v24.8H, v19.8H // ...............................................*................................... // gap // ................................................................................... - add v19.8H, v24.8H, v19.8H // ................................................*.................................. // gap // ................................................................................... - mul v24.8H, v22.8H, v0.H[4] // ......................................................*............................ + trn2 v6.2D, v26.2D, v16.2D // ..........................................*........................................ // gap // ................................................................................... 
- mul v11.8H, v26.8H, v0.H[2] // .................................................*................................. + trn2 v25.2D, v15.2D, v14.2D // ...........................................*....................................... // gap // ................................................................................... - sqrdmulh v26.8H, v26.8H, v0.H[3] // ..................................................*................................ + trn1 v16.2D, v26.2D, v16.2D // ............................................*...................................... // gap // ................................................................................... - sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................................*........................... + trn1 v15.2D, v15.2D, v14.2D // .............................................*..................................... // gap // ................................................................................... - add v29.8H, v29.8H, v20.8H // .....................................................*............................. + sub v14.8H, v6.8H, v25.8H // ....................................................*.............................. // gap // ................................................................................... - sqdmulh v20.8H, v19.8H, v7.H[1] // .........................................................*......................... + sub v26.8H, v16.8H, v15.8H // ...............................................*................................... // gap // ................................................................................... - mls v11.8H, v26.8H, v7.H[0] // ...................................................*............................... + add v15.8H, v16.8H, v15.8H // ................................................*.................................. // gap // ................................................................................... 
- mls v24.8H, v22.8H, v7.H[0] // ........................................................*.......................... + sqrdmulh v16.8H, v14.8H, v0.H[5] // ......................................................*............................ // gap // ................................................................................... - sqdmulh v22.8H, v29.8H, v7.H[1] // ............................................................*...................... + sqrdmulh v22.8H, v26.8H, v0.H[3] // .................................................*................................. // gap // ................................................................................... - srshr v26.8H, v20.8H, #11 // ..........................................................*........................ + mul v26.8H, v26.8H, v0.H[2] // ..................................................*................................ // gap // ................................................................................... - sqdmulh v20.8H, v11.8H, v7.H[1] // ...............................................................*................... + mul v14.8H, v14.8H, v0.H[4] // .......................................................*........................... // gap // ................................................................................... - sqdmulh v14.8H, v24.8H, v7.H[1] // ..................................................................*................ + add v6.8H, v6.8H, v25.8H // .....................................................*............................. // gap // ................................................................................... - mls v19.8H, v26.8H, v7.H[0] // ...........................................................*....................... + sqdmulh v25.8H, v15.8H, v7.H[1] // .........................................................*......................... // gap // ................................................................................... 
- srshr v22.8H, v22.8H, #11 // .............................................................*..................... + mls v26.8H, v22.8H, v7.H[0] // ...................................................*............................... // gap // ................................................................................... - srshr v26.8H, v20.8H, #11 // ................................................................*.................. + mls v14.8H, v16.8H, v7.H[0] // ........................................................*.......................... // gap // ................................................................................... - srshr v20.8H, v14.8H, #11 // ...................................................................*............... + sqdmulh v16.8H, v6.8H, v7.H[1] // ............................................................*...................... // gap // ................................................................................... - mls v29.8H, v22.8H, v7.H[0] // ..............................................................*.................... + srshr v25.8H, v25.8H, #11 // ..........................................................*........................ // gap // ................................................................................... - mls v11.8H, v26.8H, v7.H[0] // .................................................................*................. + sqdmulh v22.8H, v26.8H, v7.H[1] // ...............................................................*................... // gap // ................................................................................... - mls v24.8H, v20.8H, v7.H[0] // ....................................................................*.............. + sqdmulh v3.8H, v14.8H, v7.H[1] // ..................................................................*................ // gap // ................................................................................... 
- trn1 v26.4S, v3.4S, v28.4S // ......e............................................................................ + mls v15.8H, v25.8H, v7.H[0] // ...........................................................*....................... // gap // ................................................................................... - sub v22.8H, v19.8H, v29.8H // .....................................................................*............. + srshr v16.8H, v16.8H, #11 // .............................................................*..................... // gap // ................................................................................... - add v19.8H, v19.8H, v29.8H // ......................................................................*............ + srshr v25.8H, v22.8H, #11 // ................................................................*.................. // gap // ................................................................................... - sub v29.8H, v11.8H, v24.8H // ..........................................................................*........ + srshr v22.8H, v3.8H, #11 // ...................................................................*............... // gap // ................................................................................... - mul v20.8H, v22.8H, v0.H[0] // .......................................................................*........... + mls v6.8H, v16.8H, v7.H[0] // ..............................................................*.................... // gap // ................................................................................... - sqrdmulh v22.8H, v22.8H, v0.H[1] // ........................................................................*.......... + mls v26.8H, v25.8H, v7.H[0] // .................................................................*................. // gap // ................................................................................... 
- mul v14.8H, v29.8H, v0.H[0] // ............................................................................*...... + mls v14.8H, v22.8H, v7.H[0] // ....................................................................*.............. // gap // ................................................................................... - sqrdmulh v0.8H, v29.8H, v0.H[1] // .............................................................................*..... + ldr q3, [x4, #-32] // ................e.................................................................. // gap // ................................................................................... - add v24.8H, v11.8H, v24.8H // ...........................................................................*....... // gap // ................................................................................... - mls v20.8H, v22.8H, v7.H[0] // .........................................................................*......... // gap // ................................................................................... - str q19, [x1], #(64) // ...............................................................................*... + sub v16.8H, v15.8H, v6.8H // .....................................................................*............. // gap // ................................................................................... - mls v14.8H, v0.8H, v7.H[0] // ..............................................................................*.... + add v15.8H, v15.8H, v6.8H // ......................................................................*............ // gap // ................................................................................... - str q24, [x1, #-48] // ................................................................................*.. + sub v6.8H, v26.8H, v14.8H // ..........................................................................*........ 
// gap // ................................................................................... - ldr q29, [x4], #(6*16) // ............e...................................................................... + sqrdmulh v25.8H, v16.8H, v0.H[1] // .......................................................................*........... // gap // ................................................................................... + mul v16.8H, v16.8H, v0.H[0] // ........................................................................*.......... // gap // ................................................................................... + sqrdmulh v22.8H, v6.8H, v0.H[1] // ............................................................................*...... // gap // ................................................................................... - str q20, [x1, #-32] // .................................................................................*. + mul v6.8H, v6.8H, v0.H[0] // .............................................................................*..... // gap // ................................................................................... - ldr q20, [x4, #-80] // .............e..................................................................... + add v14.8H, v26.8H, v14.8H // ...........................................................................*....... // gap // ................................................................................... + mls v16.8H, v25.8H, v7.H[0] // .........................................................................*......... // gap // ................................................................................... + str q15, [x1], #(64) // ...............................................................................*... // gap // ................................................................................... 
- str q14, [x1, #-16] // ..................................................................................* + mls v6.8H, v22.8H, v7.H[0] // ..............................................................................*.... // gap // ................................................................................... - ldr q24, [x4, #-64] // ..............e.................................................................... + str q14, [x1, #-48] // ................................................................................*.. // gap // ................................................................................... + ldr q22, [x4, #-48] // ...............e................................................................... // gap // ................................................................................... // gap // ................................................................................... - ldr q11, [x4, #-48] // ...............e................................................................... // gap // ................................................................................... + str q6, [x1, #-16] // ..................................................................................* // gap // ................................................................................... + ldr q25, [x4, #-80] // .............e..................................................................... // gap // ................................................................................... - ldr q14, [x4, #-32] // ................e.................................................................. // gap // ................................................................................... // gap // ................................................................................... + str q16, [x1, #-32] // .................................................................................*. 
// gap // ................................................................................... - ldr q25, [x4, #-16] // .................e................................................................. + ldr q0, [x4, #-64] // ..............e.................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - // original source code - // ldr q8, [x1, #(16*0)] // e.....................................................|............................e................................................. - // ldr q9, [x1, #(16*1)] // ..e...................................................|..............................e............................................... - // ldr q10, [x1, #(16*2)] // ...e..................................................|...............................e.............................................. - // ldr q11, [x1, #(16*3)] // ......e...............................................|..................................e........................................... - // trn1 v25.4s, v8.4s, v9.4s // ......................................................*.............................................................................. - // trn2 v26.4s, v8.4s, v9.4s // ......................................................|*............................................................................. - // trn1 v27.4s, v10.4s, v11.4s // .................................e....................|.............................................................e................ - // trn2 v28.4s, v10.4s, v11.4s // ......................................................|.*............................................................................ 
- // trn2 v10.2d, v25.2d, v27.2d // ......................................................|..*........................................................................... - // trn2 v11.2d, v26.2d, v28.2d // ......................................................|....*......................................................................... - // trn1 v8.2d, v25.2d, v27.2d // ......................................................|...*.......................................................................... - // trn1 v9.2d, v26.2d, v28.2d // ......................................................|.....*........................................................................ - // ldr q0, [x4], #(6*16) // ..............................................e.......|..........................................................................e... - // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................e.....|............................................................................e. - // ldr q1, [x4, #(-6*16 + 2*16)] // ..................................................e...|.............................................................................. - // ldr q5, [x4, #(-6*16 + 3*16)] // ...................................................e..|.............................................................................. - // ldr q2, [x4, #(-6*16 + 4*16)] // ....................................................e.|.............................................................................. - // ldr q6, [x4, #(-6*16 + 5*16)] // .....................................................e|.............................................................................. - // sub v24.8h, v8.8h, v9.8h // ......................................................|........*..................................................................... 
- // add v8.8h, v8.8h, v9.8h // ......................................................|.........*.................................................................... - // mul v9.8h, v24.8h, v1.8h // ......................................................|...........*.................................................................. - // sqrdmulh v24.8h, v24.8h, v5.8h // ......................................................|............*................................................................. - // mls v9.8h, v24.8h, v7.h[0] // ......................................................|................*............................................................. - // sub v24.8h, v10.8h, v11.8h // ......................................................|......*....................................................................... - // add v10.8h, v10.8h, v11.8h // ......................................................|.......*...................................................................... - // mul v11.8h, v24.8h, v2.8h // ......................................................|..........*................................................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................................|.............*................................................................ - // mls v11.8h, v24.8h, v7.h[0] // ......................................................|.................*............................................................ - // sub v24.8h, v8.8h, v10.8h // ......................................................|..............*............................................................... - // add v8.8h, v8.8h, v10.8h // ......................................................|...............*.............................................................. 
- // mul v10.8h, v24.8h, v0.8h // ......................................................|..................*........................................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|...................*.......................................................... - // mls v10.8h, v24.8h, v7.h[0] // ......................................................|......................*....................................................... - // sub v24.8h, v9.8h, v11.8h // ......................................................|.....................*........................................................ - // add v9.8h, v9.8h, v11.8h // ......................................................|.......................*...................................................... - // mul v11.8h, v24.8h, v0.8h // ......................................................|........................*..................................................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|.........................*.................................................... - // mls v11.8h, v24.8h, v7.h[0] // .*....................................................|.............................*................................................ - // trn1 v25.4s, v8.4s, v9.4s // ......................................................|..........................*................................................... - // trn2 v26.4s, v8.4s, v9.4s // ......................................................|...........................*.................................................. - // trn1 v27.4s, v10.4s, v11.4s // ....*.................................................|................................*............................................. 
- // trn2 v28.4s, v10.4s, v11.4s // .....*................................................|.................................*............................................ - // trn2 v10.2d, v25.2d, v27.2d // .......*..............................................|...................................*.......................................... - // trn2 v11.2d, v26.2d, v28.2d // ........*.............................................|....................................*......................................... - // trn1 v8.2d, v25.2d, v27.2d // .........*............................................|.....................................*........................................ - // trn1 v9.2d, v26.2d, v28.2d // ..........*...........................................|......................................*....................................... - // ldr q0, [x3], #16 // ......................................................|....................*......................................................... - // sub v24.8h, v8.8h, v9.8h // ............*.........................................|........................................*..................................... - // add v8.8h, v8.8h, v9.8h // .............*........................................|.........................................*.................................... - // mul v9.8h, v24.8h, v0.h[2] // ...............*......................................|...........................................*.................................. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*.....................................|............................................*................................. - // mls v9.8h, v24.8h, v7.h[0] // ....................*.................................|................................................*............................. 
- // sub v24.8h, v10.8h, v11.8h // ...........*..........................................|.......................................*...................................... - // add v10.8h, v10.8h, v11.8h // ..................*...................................|..............................................*............................... - // mul v11.8h, v24.8h, v0.h[4] // ..............*.......................................|..........................................*................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................|.............................................*................................ - // mls v11.8h, v24.8h, v7.h[0] // .....................*................................|.................................................*............................ - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................*..................................|...............................................*.............................. - // srshr v25.8h, v25.8h, #11 // .......................*..............................|...................................................*.......................... - // mls v8.8h, v25.8h, v7.h[0] // ..........................*...........................|......................................................*....................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................*...............................|..................................................*........................... - // srshr v25.8h, v25.8h, #11 // ...........................*..........................|.......................................................*...................... - // mls v10.8h, v25.8h, v7.h[0] // ..............................*.......................|..........................................................*................... 
- // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................*.............................|....................................................*......................... - // srshr v25.8h, v25.8h, #11 // ............................*.........................|........................................................*..................... - // mls v9.8h, v25.8h, v7.h[0] // ...............................*......................|...........................................................*.................. - // sqdmulh v25.8h, v11.8h, v7.h[1] // .........................*............................|.....................................................*........................ - // srshr v25.8h, v25.8h, #11 // .............................*........................|.........................................................*.................... - // mls v11.8h, v25.8h, v7.h[0] // ................................*.....................|............................................................*................. - // sub v24.8h, v8.8h, v10.8h // ..................................*...................|..............................................................*............... - // add v8.8h, v8.8h, v10.8h // ...................................*..................|...............................................................*.............. - // mul v10.8h, v24.8h, v0.h[0] // .....................................*................|.................................................................*............ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............|..................................................................*........... - // mls v10.8h, v24.8h, v7.h[0] // ..........................................*...........|......................................................................*....... 
- // sub v24.8h, v9.8h, v11.8h // ....................................*.................|................................................................*............. - // add v9.8h, v9.8h, v11.8h // .........................................*............|.....................................................................*........ - // mul v11.8h, v24.8h, v0.h[0] // .......................................*..............|...................................................................*.......... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*.............|....................................................................*......... - // mls v11.8h, v24.8h, v7.h[0] // ............................................*.........|........................................................................*..... - // str q8, [x1], #(64) // ...........................................*..........|.......................................................................*...... - // str q9, [x1, #(-64 + 16*1)] // .............................................*........|.........................................................................*.... - // str q10, [x1, #(-64 + 16*2)] // ...............................................*......|...........................................................................*.. - // str q11, [x1, #(-64 + 16*3)] // .................................................*....|.............................................................................* + // ----------------------------------------------------------- new position -----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------- + // ldr q8, [x1, #(16*0)] // ...................................................*................................................................................. 
+ // ldr q9, [x1, #(16*1)] // e..................................................'...............................~................................................. + // ldr q10, [x1, #(16*2)] // ..e................................................'.................................~............................................... + // ldr q11, [x1, #(16*3)] // ...e...............................................'..................................~.............................................. + // trn1 v25.4s, v8.4s, v9.4s // ...................................................'..*.............................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ...................................................'...*............................................................................. + // trn1 v27.4s, v10.4s, v11.4s // ...................................................'*................................................................................ + // trn2 v28.4s, v10.4s, v11.4s // ...................................................'.*............................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...................................................'.....*........................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ...................................................'.......*......................................................................... + // trn1 v8.2d, v25.2d, v27.2d // ...................................................'......*.......................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ...................................................'........*........................................................................ 
+ // ldr q0, [x4], #(6*16) // ......e............................................'.....................................~........................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................e..'...............................................................................~. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..................................................e'................................................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // ..............................................e....'.............................................................................~... + // ldr q2, [x4, #(-6*16 + 4*16)] // .................................e.................'................................................................~................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ...................................................'....*............................................................................ + // sub v24.8h, v8.8h, v9.8h // ...................................................'..........*...................................................................... + // add v8.8h, v8.8h, v9.8h // ...................................................'...........*..................................................................... + // sqrdmulh v27.8h, v24.8h, v5.8h // ...................................................'.............*................................................................... + // mul v9.8h, v24.8h, v1.8h // ...................................................'..............*.................................................................. + // mls v9.8h, v27.8h, v7.h[0] // ...................................................'..................*.............................................................. 
+ // sub v24.8h, v10.8h, v11.8h // ...................................................'.........*....................................................................... + // add v10.8h, v10.8h, v11.8h // ...................................................'............*.................................................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // ...................................................'...............*................................................................. + // mul v11.8h, v24.8h, v2.8h // ...................................................'................*................................................................ + // mls v11.8h, v27.8h, v7.h[0] // ...................................................'....................*............................................................ + // sub v24.8h, v8.8h, v10.8h // ...................................................'.................*............................................................... + // add v8.8h, v8.8h, v10.8h // ...................................................'...................*............................................................. + // sqrdmulh v27.8h, v24.8h, v4.8h // ...................................................'.....................*........................................................... + // mul v10.8h, v24.8h, v0.8h // ...................................................'......................*.......................................................... + // mls v10.8h, v27.8h, v7.h[0] // ...................................................'.........................*....................................................... + // sub v24.8h, v9.8h, v11.8h // ...................................................'........................*........................................................ 
+ // add v9.8h, v9.8h, v11.8h // ...................................................'..........................*...................................................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ...................................................'...........................*..................................................... + // mul v11.8h, v24.8h, v0.8h // ...................................................'............................*.................................................... + // mls v11.8h, v27.8h, v7.h[0] // .~.................................................'................................*................................................ + // trn1 v25.4s, v8.4s, v9.4s // ...................................................'.............................*................................................... + // trn2 v26.4s, v8.4s, v9.4s // ...................................................'..............................*.................................................. + // trn1 v27.4s, v10.4s, v11.4s // ....~..............................................'...................................*............................................. + // trn2 v28.4s, v10.4s, v11.4s // .....~.............................................'....................................*............................................ + // trn2 v10.2d, v25.2d, v27.2d // .......~...........................................'......................................*.......................................... + // trn2 v11.2d, v26.2d, v28.2d // ........~..........................................'.......................................*......................................... + // trn1 v8.2d, v25.2d, v27.2d // .........~.........................................'........................................*........................................ 
+ // trn1 v9.2d, v26.2d, v28.2d // ..........~........................................'.........................................*....................................... + // ldr q0, [x3], #16 // ...................................................'.......................*......................................................... + // sub v24.8h, v8.8h, v9.8h // ............~......................................'...........................................*..................................... + // add v8.8h, v8.8h, v9.8h // .............~.....................................'............................................*.................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...............~...................................'..............................................*.................................. + // mul v9.8h, v24.8h, v0.h[2] // ................~..................................'...............................................*................................. + // mls v9.8h, v27.8h, v7.h[0] // ....................~..............................'...................................................*............................. + // sub v24.8h, v10.8h, v11.8h // ...........~.......................................'..........................................*...................................... + // add v10.8h, v10.8h, v11.8h // ..................~................................'.................................................*............................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ..............~....................................'.............................................*................................... + // mul v11.8h, v24.8h, v0.h[4] // .................~.................................'................................................*................................ 
+ // mls v11.8h, v27.8h, v7.h[0] // .....................~.............................'....................................................*............................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................~...............................'..................................................*.............................. + // srshr v25.8h, v25.8h, #11 // .......................~...........................'......................................................*.......................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................~........................'.........................................................*....................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................~............................'.....................................................*........................... + // srshr v25.8h, v25.8h, #11 // ...........................~.......................'..........................................................*...................... + // mls v10.8h, v25.8h, v7.h[0] // ..............................~....................'.............................................................*................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................~..........................'.......................................................*......................... + // srshr v25.8h, v25.8h, #11 // ............................~......................'...........................................................*..................... + // mls v9.8h, v25.8h, v7.h[0] // ...............................~...................'..............................................................*.................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .........................~.........................'........................................................*........................ 
+ // srshr v25.8h, v25.8h, #11 // .............................~.....................'............................................................*.................... + // mls v11.8h, v25.8h, v7.h[0] // ................................~..................'...............................................................*................. + // sub v24.8h, v8.8h, v10.8h // ..................................~................'.................................................................*............... + // add v8.8h, v8.8h, v10.8h // ...................................~...............'..................................................................*.............. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .....................................~.............'....................................................................*............ + // mul v10.8h, v24.8h, v0.h[0] // ......................................~............'.....................................................................*........... + // mls v10.8h, v27.8h, v7.h[0] // ..........................................~........'.........................................................................*....... + // sub v24.8h, v9.8h, v11.8h // ....................................~..............'...................................................................*............. + // add v9.8h, v9.8h, v11.8h // .........................................~.........'........................................................................*........ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .......................................~...........'......................................................................*.......... + // mul v11.8h, v24.8h, v0.h[0] // ........................................~..........'.......................................................................*......... 
+ // mls v11.8h, v27.8h, v7.h[0] // ............................................~......'...........................................................................*..... + // str q8, [x1], #(64) // ...........................................~.......'..........................................................................*...... + // str q9, [x1, #(-64 + 16*1)] // .............................................~.....'............................................................................*.... + // str q10, [x1, #(-64 + 16*2)] // .................................................~.'................................................................................* + // str q11, [x1, #(-64 + 16*3)] // ...............................................~...'..............................................................................*.. sub count, count, #1 cbnz count, layer4567_start - trn1 v22.4S, v23.4S, v27.4S // *....................................................................... - // gap // ........................................................................ - trn2 v0.4S, v23.4S, v27.4S // .*...................................................................... - // gap // ........................................................................ - trn2 v23.4S, v3.4S, v28.4S // ..*..................................................................... - // gap // ........................................................................ - trn2 v28.2D, v22.2D, v26.2D // ...*.................................................................... - // gap // ........................................................................ - trn1 v26.2D, v22.2D, v26.2D // ....*................................................................... - // gap // ........................................................................ - trn1 v22.2D, v0.2D, v23.2D // ......*................................................................. 
- // gap // ........................................................................ - trn2 v19.2D, v0.2D, v23.2D // .....*.................................................................. - // gap // ........................................................................ - sub v23.8H, v26.8H, v22.8H // .........*.............................................................. - // gap // ........................................................................ - add v3.8H, v28.8H, v19.8H // ........*............................................................... - // gap // ........................................................................ - sub v0.8H, v28.8H, v19.8H // .......*................................................................ - // gap // ........................................................................ - sqrdmulh v19.8H, v23.8H, v11.8H // .............*.......................................................... - // gap // ........................................................................ - mul v28.8H, v23.8H, v24.8H // ............*........................................................... - // gap // ........................................................................ - sqrdmulh v23.8H, v0.8H, v25.8H // ..............*......................................................... - // gap // ........................................................................ - mul v27.8H, v0.8H, v14.8H // ...........*............................................................ - // gap // ........................................................................ - add v24.8H, v26.8H, v22.8H // ..........*............................................................. - // gap // ........................................................................ - mls v28.8H, v19.8H, v7.H[0] // .................*...................................................... - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - mls v27.8H, v23.8H, v7.H[0] // ..................*..................................................... - // gap // ........................................................................ - sub v19.8H, v24.8H, v3.8H // ...............*........................................................ - // gap // ........................................................................ - ldr q11, [x3], #16 // .....................*.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v23.8H, v28.8H, v27.8H // ......................*................................................. - // gap // ........................................................................ - sqrdmulh v22.8H, v19.8H, v20.8H // ....................*................................................... - // gap // ........................................................................ - mul v0.8H, v19.8H, v29.8H // ...................*.................................................... - // gap // ........................................................................ - sqrdmulh v19.8H, v23.8H, v20.8H // ..........................*............................................. - // gap // ........................................................................ - mul v23.8H, v23.8H, v29.8H // .........................*.............................................. - // gap // ........................................................................ - add v28.8H, v28.8H, v27.8H // ........................*............................................... 
- // gap // ........................................................................ - add v27.8H, v24.8H, v3.8H // ................*....................................................... - // gap // ........................................................................ - mls v0.8H, v22.8H, v7.H[0] // .......................*................................................ - // gap // ........................................................................ - mls v23.8H, v19.8H, v7.H[0] // .............................*.......................................... - // gap // ........................................................................ - trn2 v22.4S, v27.4S, v28.4S // ............................*........................................... - // gap // ........................................................................ - trn1 v27.4S, v27.4S, v28.4S // ...........................*............................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v19.4S, v0.4S, v23.4S // ...............................*........................................ - // gap // ........................................................................ - trn1 v0.4S, v0.4S, v23.4S // ..............................*......................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v23.2D, v22.2D, v19.2D // .................................*...................................... - // gap // ........................................................................ 
- trn2 v28.2D, v27.2D, v0.2D // ................................*....................................... - // gap // ........................................................................ - trn1 v22.2D, v22.2D, v19.2D // ...................................*.................................... - // gap // ........................................................................ - sub v19.8H, v28.8H, v23.8H // ....................................*................................... - // gap // ........................................................................ - trn1 v26.2D, v27.2D, v0.2D // ..................................*..................................... - // gap // ........................................................................ - add v0.8H, v28.8H, v23.8H // ...........................................*............................ - // gap // ........................................................................ - sub v27.8H, v26.8H, v22.8H // .....................................*.................................. - // gap // ........................................................................ - sqrdmulh v23.8H, v19.8H, v11.H[5] // ..........................................*............................. - // gap // ........................................................................ - mul v24.8H, v19.8H, v11.H[4] // .......................................*................................ - // gap // ........................................................................ - sqrdmulh v28.8H, v27.8H, v11.H[3] // .........................................*.............................. - // gap // ........................................................................ - mul v3.8H, v27.8H, v11.H[2] // ........................................*............................... - // gap // ........................................................................ 
- sqdmulh v19.8H, v0.8H, v7.H[1] // ...............................................*........................ - // gap // ........................................................................ - mls v24.8H, v23.8H, v7.H[0] // ..............................................*......................... - // gap // ........................................................................ - add v27.8H, v26.8H, v22.8H // ......................................*................................. - // gap // ........................................................................ - mls v3.8H, v28.8H, v7.H[0] // .............................................*.......................... - // gap // ........................................................................ - srshr v19.8H, v19.8H, #11 // ....................................................*................... - // gap // ........................................................................ - sqdmulh v22.8H, v27.8H, v7.H[1] // ............................................*........................... - // gap // ........................................................................ - sqdmulh v26.8H, v24.8H, v7.H[1] // ..................................................*..................... - // gap // ........................................................................ - sqdmulh v23.8H, v3.8H, v7.H[1] // .................................................*...................... - // gap // ........................................................................ - mls v0.8H, v19.8H, v7.H[0] // .......................................................*................ - // gap // ........................................................................ - srshr v22.8H, v22.8H, #11 // ................................................*....................... - // gap // ........................................................................ 
- srshr v19.8H, v26.8H, #11 // ......................................................*................. - // gap // ........................................................................ - srshr v23.8H, v23.8H, #11 // .....................................................*.................. - // gap // ........................................................................ - mls v27.8H, v22.8H, v7.H[0] // ...................................................*.................... - // gap // ........................................................................ - mls v24.8H, v19.8H, v7.H[0] // .........................................................*.............. - // gap // ........................................................................ - mls v3.8H, v23.8H, v7.H[0] // ........................................................*............... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v19.8H, v27.8H, v0.8H // ..........................................................*............. - // gap // ........................................................................ - add v27.8H, v27.8H, v0.8H // ...........................................................*............ - // gap // ........................................................................ - sub v22.8H, v3.8H, v24.8H // ............................................................*........... - // gap // ........................................................................ - mul v0.8H, v19.8H, v11.H[0] // .............................................................*.......... - // gap // ........................................................................ - sqrdmulh v28.8H, v19.8H, v11.H[1] // ..............................................................*......... 
- // gap // ........................................................................ - sqrdmulh v23.8H, v22.8H, v11.H[1] // ................................................................*....... - // gap // ........................................................................ - mul v22.8H, v22.8H, v11.H[0] // ...............................................................*........ - // gap // ........................................................................ - add v19.8H, v3.8H, v24.8H // .................................................................*...... - // gap // ........................................................................ - mls v0.8H, v28.8H, v7.H[0] // ..................................................................*..... - // gap // ........................................................................ - str q27, [x1], #(64) // ...................................................................*.... - // gap // ........................................................................ - mls v22.8H, v23.8H, v7.H[0] // ....................................................................*... - // gap // ........................................................................ - str q19, [x1, #-48] // .....................................................................*.. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - str q0, [x1, #-32] // ......................................................................*. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- str q22, [x1, #-16] // .......................................................................* - // gap // ........................................................................ - - // original source code - // trn1 v19.4S, v23.4S, v27.4S // *....................................................................... - // trn2 v23.4S, v23.4S, v27.4S // .*...................................................................... - // trn2 v22.4S, v3.4S, v28.4S // ..*..................................................................... - // trn2 v28.2D, v19.2D, v26.2D // ...*.................................................................... - // trn1 v19.2D, v19.2D, v26.2D // ....*................................................................... - // trn2 v0.2D, v23.2D, v22.2D // ......*................................................................. - // trn1 v23.2D, v23.2D, v22.2D // .....*.................................................................. - // sub v22.8H, v28.8H, v0.8H // .........*.............................................................. - // add v28.8H, v28.8H, v0.8H // ........*............................................................... - // sub v0.8H, v19.8H, v23.8H // .......*................................................................ - // add v19.8H, v19.8H, v23.8H // ..............*......................................................... - // mul v23.8H, v22.8H, v14.8H // .............*.......................................................... - // mul v27.8H, v0.8H, v24.8H // ...........*............................................................ - // sqrdmulh v0.8H, v0.8H, v11.8H // ..........*............................................................. - // sqrdmulh v22.8H, v22.8H, v25.8H // ............*........................................................... - // sub v24.8H, v19.8H, v28.8H // .................*...................................................... 
- // add v19.8H, v19.8H, v28.8H // .........................*.............................................. - // mls v27.8H, v0.8H, v7.H[0] // ...............*........................................................ - // mls v23.8H, v22.8H, v7.H[0] // ................*....................................................... - // mul v22.8H, v24.8H, v29.8H // .....................*.................................................. - // sqrdmulh v28.8H, v24.8H, v20.8H // ....................*................................................... - // ldr q0, [x3], #16 // ..................*..................................................... - // sub v24.8H, v27.8H, v23.8H // ...................*.................................................... - // mls v22.8H, v28.8H, v7.H[0] // ..........................*............................................. - // add v23.8H, v27.8H, v23.8H // ........................*............................................... - // mul v28.8H, v24.8H, v29.8H // .......................*................................................ - // sqrdmulh v27.8H, v24.8H, v20.8H // ......................*................................................. - // trn1 v24.4S, v19.4S, v23.4S // .............................*.......................................... - // trn2 v19.4S, v19.4S, v23.4S // ............................*........................................... - // mls v28.8H, v27.8H, v7.H[0] // ...........................*............................................ - // trn1 v26.4S, v22.4S, v28.4S // ...............................*........................................ - // trn2 v22.4S, v22.4S, v28.4S // ..............................*......................................... - // trn2 v29.2D, v24.2D, v26.2D // .................................*...................................... - // trn2 v20.2D, v19.2D, v22.2D // ................................*....................................... 
- // trn1 v24.2D, v24.2D, v26.2D // ....................................*................................... - // trn1 v19.2D, v19.2D, v22.2D // ..................................*..................................... - // sub v22.8H, v29.8H, v20.8H // ...................................*.................................... - // sub v26.8H, v24.8H, v19.8H // ......................................*................................. - // add v19.8H, v24.8H, v19.8H // .............................................*.......................... - // mul v24.8H, v22.8H, v0.H[4] // ........................................*............................... - // mul v11.8H, v26.8H, v0.H[2] // ..........................................*............................. - // sqrdmulh v26.8H, v26.8H, v0.H[3] // .........................................*.............................. - // sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................*................................ - // add v29.8H, v29.8H, v20.8H // .....................................*.................................. - // sqdmulh v20.8H, v19.8H, v7.H[1] // ................................................*....................... - // mls v11.8H, v26.8H, v7.H[0] // ..............................................*......................... - // mls v24.8H, v22.8H, v7.H[0] // ............................................*........................... - // sqdmulh v22.8H, v29.8H, v7.H[1] // ...........................................*............................ - // srshr v26.8H, v20.8H, #11 // ....................................................*................... - // sqdmulh v20.8H, v11.8H, v7.H[1] // ..................................................*..................... - // sqdmulh v14.8H, v24.8H, v7.H[1] // .................................................*...................... - // mls v19.8H, v26.8H, v7.H[0] // .......................................................*................ 
- // srshr v22.8H, v22.8H, #11 // ...............................................*........................ - // srshr v26.8H, v20.8H, #11 // ......................................................*................. - // srshr v20.8H, v14.8H, #11 // .....................................................*.................. - // mls v29.8H, v22.8H, v7.H[0] // ...................................................*.................... - // mls v11.8H, v26.8H, v7.H[0] // .........................................................*.............. - // mls v24.8H, v20.8H, v7.H[0] // ........................................................*............... - // sub v22.8H, v19.8H, v29.8H // ..........................................................*............. - // add v19.8H, v19.8H, v29.8H // ...........................................................*............ - // sub v29.8H, v11.8H, v24.8H // ............................................................*........... - // mul v20.8H, v22.8H, v0.H[0] // .............................................................*.......... - // sqrdmulh v22.8H, v22.8H, v0.H[1] // ..............................................................*......... - // mul v14.8H, v29.8H, v0.H[0] // ................................................................*....... - // sqrdmulh v0.8H, v29.8H, v0.H[1] // ...............................................................*........ - // add v24.8H, v11.8H, v24.8H // .................................................................*...... - // mls v20.8H, v22.8H, v7.H[0] // ..................................................................*..... - // str q19, [x1], #(64) // ...................................................................*.... - // mls v14.8H, v0.8H, v7.H[0] // ....................................................................*... - // str q24, [x1, #-48] // .....................................................................*.. 
- // str q20, [x1, #-32] // ......................................................................*. - // str q14, [x1, #-16] // .......................................................................* + // Instructions: 75 + // Expected cycles: 84 + // Expected IPC: 0.89 + // + // Cycle bound: 84.0 + // IPC bound: 0.89 + // + // Wall time: 17.74s + // User time: 17.74s + // + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q17, [x1, #0] // *.......................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + trn1 v2.4S, v27.4S, v13.4S // .*......................................................................... + // gap // ........................................................................... + trn2 v16.4S, v27.4S, v13.4S // ..*........................................................................ + // gap // ........................................................................... + trn2 v26.4S, v17.4S, v11.4S // ....*...................................................................... + // gap // ........................................................................... + trn1 v15.4S, v17.4S, v11.4S // ...*....................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + trn1 v14.2D, v26.2D, v16.2D // .........*................................................................. 
+ // gap // ........................................................................... + trn1 v6.2D, v15.2D, v2.2D // .......*................................................................... + // gap // ........................................................................... + trn2 v11.2D, v15.2D, v2.2D // ......*.................................................................... + // gap // ........................................................................... + sub v15.8H, v6.8H, v14.8H // ...........*............................................................... + // gap // ........................................................................... + trn2 v17.2D, v26.2D, v16.2D // ........*.................................................................. + // gap // ........................................................................... + ldr q29, [x4, #-16] // .....*..................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v27.8H, v11.8H, v17.8H // ..........*................................................................ + // gap // ........................................................................... + sqrdmulh v16.8H, v15.8H, v22.8H // ..............*............................................................ + // gap // ........................................................................... + mul v0.8H, v15.8H, v0.8H // ...............*........................................................... + // gap // ........................................................................... + sqrdmulh v15.8H, v27.8H, v29.8H // ................*.......................................................... 
+ // gap // ........................................................................... + mul v26.8H, v27.8H, v3.8H // .................*......................................................... + // gap // ........................................................................... + add v11.8H, v11.8H, v17.8H // .............*............................................................. + // gap // ........................................................................... + add v2.8H, v6.8H, v14.8H // ............*.............................................................. + // gap // ........................................................................... + mls v0.8H, v16.8H, v7.H[0] // ...................*....................................................... + // gap // ........................................................................... + mls v26.8H, v15.8H, v7.H[0] // .....................*..................................................... + // gap // ........................................................................... + sub v27.8H, v2.8H, v11.8H // ..................*........................................................ + // gap // ........................................................................... + ldr q3, [x3], #16 // ........................*.................................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v16.8H, v0.8H, v26.8H // .........................*................................................. + // gap // ........................................................................... + mul v14.8H, v27.8H, v31.8H // .......................*................................................... 
+ // gap // ........................................................................... + sqrdmulh v6.8H, v27.8H, v25.8H // ......................*.................................................... + // gap // ........................................................................... + mul v15.8H, v16.8H, v31.8H // .............................*............................................. + // gap // ........................................................................... + sqrdmulh v16.8H, v16.8H, v25.8H // ............................*.............................................. + // gap // ........................................................................... + add v27.8H, v0.8H, v26.8H // ...........................*............................................... + // gap // ........................................................................... + add v26.8H, v2.8H, v11.8H // ....................*...................................................... + // gap // ........................................................................... + mls v14.8H, v6.8H, v7.H[0] // ..........................*................................................ + // gap // ........................................................................... + mls v15.8H, v16.8H, v7.H[0] // ................................*.......................................... + // gap // ........................................................................... + trn2 v0.4S, v26.4S, v27.4S // ...............................*........................................... + // gap // ........................................................................... + trn1 v31.4S, v26.4S, v27.4S // ..............................*............................................ + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + trn2 v6.4S, v14.4S, v15.4S // ..................................*........................................ + // gap // ........................................................................... + trn1 v16.4S, v14.4S, v15.4S // .................................*......................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + trn2 v26.2D, v0.2D, v6.2D // ....................................*...................................... + // gap // ........................................................................... + trn2 v15.2D, v31.2D, v16.2D // ...................................*....................................... + // gap // ........................................................................... + trn1 v14.2D, v31.2D, v16.2D // .....................................*..................................... + // gap // ........................................................................... + sub v27.8H, v15.8H, v26.8H // .......................................*................................... + // gap // ........................................................................... + trn1 v8.2D, v0.2D, v6.2D // ......................................*.................................... + // gap // ........................................................................... + add v26.8H, v15.8H, v26.8H // ..............................................*............................ + // gap // ........................................................................... + sub v15.8H, v14.8H, v8.8H // ........................................*.................................. 
+ // gap // ........................................................................... + sqrdmulh v16.8H, v27.8H, v3.H[5] // ..........................................*................................ + // gap // ........................................................................... + mul v0.8H, v27.8H, v3.H[4] // .............................................*............................. + // gap // ........................................................................... + mul v11.8H, v15.8H, v3.H[2] // ............................................*.............................. + // gap // ........................................................................... + sqrdmulh v6.8H, v15.8H, v3.H[3] // ...........................................*............................... + // gap // ........................................................................... + add v13.8H, v14.8H, v8.8H // .........................................*................................. + // gap // ........................................................................... + mls v0.8H, v16.8H, v7.H[0] // .................................................*......................... + // gap // ........................................................................... + sqdmulh v15.8H, v26.8H, v7.H[1] // ..................................................*........................ + // gap // ........................................................................... + mls v11.8H, v6.8H, v7.H[0] // ................................................*.......................... + // gap // ........................................................................... + sqdmulh v27.8H, v13.8H, v7.H[1] // ...............................................*........................... + // gap // ........................................................................... + sqdmulh v16.8H, v0.8H, v7.H[1] // .....................................................*..................... 
+ // gap // ........................................................................... + srshr v15.8H, v15.8H, #11 // .......................................................*................... + // gap // ........................................................................... + sqdmulh v19.8H, v11.8H, v7.H[1] // ....................................................*...................... + // gap // ........................................................................... + srshr v31.8H, v27.8H, #11 // ...................................................*....................... + // gap // ........................................................................... + mls v26.8H, v15.8H, v7.H[0] // ..........................................................*................ + // gap // ........................................................................... + srshr v16.8H, v16.8H, #11 // .........................................................*................. + // gap // ........................................................................... + srshr v27.8H, v19.8H, #11 // ........................................................*.................. + // gap // ........................................................................... + mls v13.8H, v31.8H, v7.H[0] // ......................................................*.................... + // gap // ........................................................................... + mls v0.8H, v16.8H, v7.H[0] // ............................................................*.............. + // gap // ........................................................................... + mls v11.8H, v27.8H, v7.H[0] // ...........................................................*............... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + sub v16.8H, v13.8H, v26.8H // .............................................................*............. + // gap // ........................................................................... + add v26.8H, v13.8H, v26.8H // ..............................................................*............ + // gap // ........................................................................... + sub v15.8H, v11.8H, v0.8H // ...............................................................*........... + // gap // ........................................................................... + mul v14.8H, v16.8H, v3.H[0] // .................................................................*......... + // gap // ........................................................................... + sqrdmulh v6.8H, v16.8H, v3.H[1] // ................................................................*.......... + // gap // ........................................................................... + mul v16.8H, v15.8H, v3.H[0] // ...................................................................*....... + // gap // ........................................................................... + sqrdmulh v27.8H, v15.8H, v3.H[1] // ..................................................................*........ + // gap // ........................................................................... + str q26, [x1], #(64) // ......................................................................*.... + // gap // ........................................................................... + add v15.8H, v11.8H, v0.8H // ....................................................................*...... + // gap // ........................................................................... + mls v14.8H, v6.8H, v7.H[0] // .....................................................................*..... 
+ // gap // ........................................................................... + mls v16.8H, v27.8H, v7.H[0] // .......................................................................*... + // gap // ........................................................................... + str q15, [x1, #-48] // ........................................................................*.. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + str q14, [x1, #-32] // ..........................................................................* + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + str q16, [x1, #-16] // .........................................................................*. + // gap // ........................................................................... + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x1, #0] // *.......................................................................... + // trn1 v15.4S, v27.4S, v13.4S // .*......................................................................... + // trn2 v27.4S, v27.4S, v13.4S // ..*........................................................................ + // trn1 v14.4S, v16.4S, v11.4S // ....*...................................................................... + // trn2 v16.4S, v16.4S, v11.4S // ...*....................................................................... 
+ // ldr q2, [x4, #-16] // ..........*................................................................ + // trn2 v6.2D, v14.2D, v15.2D // .......*................................................................... + // trn1 v15.2D, v14.2D, v15.2D // ......*.................................................................... + // trn2 v14.2D, v16.2D, v27.2D // .........*................................................................. + // trn1 v16.2D, v16.2D, v27.2D // .....*..................................................................... + // sub v27.8H, v6.8H, v14.8H // ...........*............................................................... + // sub v26.8H, v15.8H, v16.8H // ........*.................................................................. + // add v15.8H, v15.8H, v16.8H // .................*......................................................... + // add v16.8H, v6.8H, v14.8H // ................*.......................................................... + // sqrdmulh v14.8H, v26.8H, v22.8H // ............*.............................................................. + // mul v6.8H, v26.8H, v0.8H // .............*............................................................. + // sqrdmulh v26.8H, v27.8H, v2.8H // ..............*............................................................ + // mul v27.8H, v27.8H, v3.8H // ...............*........................................................... + // sub v0.8H, v15.8H, v16.8H // ....................*...................................................... + // mls v6.8H, v14.8H, v7.H[0] // ..................*........................................................ + // add v15.8H, v15.8H, v16.8H // ............................*.............................................. + // mls v27.8H, v26.8H, v7.H[0] // ...................*....................................................... 
+ // sqrdmulh v16.8H, v0.8H, v25.8H // ........................*.................................................. + // mul v14.8H, v0.8H, v31.8H // .......................*................................................... + // ldr q0, [x3], #16 // .....................*..................................................... + // sub v26.8H, v6.8H, v27.8H // ......................*.................................................... + // mls v14.8H, v16.8H, v7.H[0] // .............................*............................................. + // add v16.8H, v6.8H, v27.8H // ...........................*............................................... + // sqrdmulh v27.8H, v26.8H, v25.8H // ..........................*................................................ + // mul v6.8H, v26.8H, v31.8H // .........................*................................................. + // trn1 v26.4S, v15.4S, v16.4S // ................................*.......................................... + // trn2 v15.4S, v15.4S, v16.4S // ...............................*........................................... + // mls v6.8H, v27.8H, v7.H[0] // ..............................*............................................ + // trn1 v16.4S, v14.4S, v6.4S // ..................................*........................................ + // trn2 v14.4S, v14.4S, v6.4S // .................................*......................................... + // trn2 v6.2D, v26.2D, v16.2D // ....................................*...................................... + // trn2 v25.2D, v15.2D, v14.2D // ...................................*....................................... + // trn1 v16.2D, v26.2D, v16.2D // .....................................*..................................... + // trn1 v15.2D, v15.2D, v14.2D // .......................................*................................... + // sub v14.8H, v6.8H, v25.8H // ......................................*.................................... 
+ // sub v26.8H, v16.8H, v15.8H // .........................................*................................. + // add v15.8H, v16.8H, v15.8H // ..............................................*............................ + // sqrdmulh v16.8H, v14.8H, v0.H[5] // ..........................................*................................ + // sqrdmulh v22.8H, v26.8H, v0.H[3] // .............................................*............................. + // mul v26.8H, v26.8H, v0.H[2] // ............................................*.............................. + // mul v14.8H, v14.8H, v0.H[4] // ...........................................*............................... + // add v6.8H, v6.8H, v25.8H // ........................................*.................................. + // sqdmulh v25.8H, v15.8H, v7.H[1] // ..................................................*........................ + // mls v26.8H, v22.8H, v7.H[0] // .................................................*......................... + // mls v14.8H, v16.8H, v7.H[0] // ...............................................*........................... + // sqdmulh v16.8H, v6.8H, v7.H[1] // ................................................*.......................... + // srshr v25.8H, v25.8H, #11 // ......................................................*.................... + // sqdmulh v22.8H, v26.8H, v7.H[1] // .....................................................*..................... + // sqdmulh v3.8H, v14.8H, v7.H[1] // ...................................................*....................... + // mls v15.8H, v25.8H, v7.H[0] // ..........................................................*................ + // srshr v16.8H, v16.8H, #11 // ....................................................*...................... + // srshr v25.8H, v22.8H, #11 // .........................................................*................. 
+ // srshr v22.8H, v3.8H, #11 // ........................................................*.................. + // mls v6.8H, v16.8H, v7.H[0] // .......................................................*................... + // mls v26.8H, v25.8H, v7.H[0] // ............................................................*.............. + // mls v14.8H, v22.8H, v7.H[0] // ...........................................................*............... + // sub v16.8H, v15.8H, v6.8H // .............................................................*............. + // add v15.8H, v15.8H, v6.8H // ..............................................................*............ + // sub v6.8H, v26.8H, v14.8H // ...............................................................*........... + // sqrdmulh v25.8H, v16.8H, v0.H[1] // .................................................................*......... + // mul v16.8H, v16.8H, v0.H[0] // ................................................................*.......... + // sqrdmulh v22.8H, v6.8H, v0.H[1] // ...................................................................*....... + // mul v6.8H, v6.8H, v0.H[0] // ..................................................................*........ + // add v14.8H, v26.8H, v14.8H // .....................................................................*..... + // mls v16.8H, v25.8H, v7.H[0] // ......................................................................*.... + // str q15, [x1], #(64) // ....................................................................*...... + // mls v6.8H, v22.8H, v7.H[0] // .......................................................................*... + // str q14, [x1, #-48] // ........................................................................*.. + // str q6, [x1, #-16] // ..........................................................................* + // str q16, [x1, #-32] // .........................................................................*. 
// --------------------------------------------------------------------- @@ -936,580 +966,625 @@ layer4567_start: .p2align 2 - ldr q24, [x0, #64] // *...... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q22, [x0, #128] // .*..... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q28, [x0, #192] // ..*.... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q3, [x0, #256] // ...*... - // gap // ....... - // gap // ....... - // gap // ....... - ldr q26, [x0, #320] // ....*.. - // gap // ....... - // gap // ....... - // gap // ....... - ldr q11, [x0, #384] // .....*. - // gap // ....... - // gap // ....... - // gap // ....... - ldr q14, [x0, #448] // ......* - // gap // ....... - - // original source code - // ldr q24, [x0, #64] // *...... - // ldr q22, [x0, #128] // .*..... - // ldr q28, [x0, #192] // ..*.... - // ldr q3, [x0, #256] // ...*... - // ldr q26, [x0, #320] // ....*.. - // ldr q11, [x0, #384] // .....*. - // ldr q14, [x0, #448] // ......* + // Instructions: 8 + // Expected cycles: 14 + // Expected IPC: 0.57 + // + // Cycle bound: 14.0 + // IPC bound: 0.57 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q22, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #384] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #320] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v3.8H, v25.8H, v22.8H // .....*........................ + // gap // .............................. + ldr q11, [x0, #256] // ...*.......................... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q14, [x0, #128] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q6, [x0, #64] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q26, [x0, #192] // .......*...................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q22, [x0, #448] // *.............................. + // ldr q25, [x0, #384] // .*............................. + // ldr q13, [x0, #320] // ..*............................ + // ldr q11, [x0, #256] // ....*.......................... + // ldr q14, [x0, #128] // .....*......................... + // add v3.8H, v25.8H, v22.8H // ...*........................... + // ldr q6, [x0, #64] // ......*........................ + // ldr q26, [x0, #192] // .......*....................... sub count, count, #1 layer123_start: - ldr q23, [x0, #0] // *....................................................................................... + // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 7.17s + // User time: 7.17s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + ldr q16, [x0, #0] // *....................................................................................... // gap // ........................................................................................ 
// gap // ........................................................................................ // gap // ........................................................................................ - sub v19.8H, v22.8H, v28.8H // .............*.......................................................................... + sub v15.8H, v14.8H, v26.8H // .............*.......................................................................... // gap // ........................................................................................ - add v22.8H, v22.8H, v28.8H // ..............*......................................................................... + add v27.8H, v14.8H, v26.8H // ..............*......................................................................... // gap // ........................................................................................ - sub v28.8H, v23.8H, v24.8H // ........*............................................................................... + sub v14.8H, v16.8H, v6.8H // ........*............................................................................... // gap // ........................................................................................ - add v23.8H, v23.8H, v24.8H // .........*.............................................................................. + add v16.8H, v16.8H, v6.8H // .........*.............................................................................. // gap // ........................................................................................ - mul v27.8H, v19.8H, v1.H[0] // ...............*........................................................................ + sqrdmulh v6.8H, v15.8H, v1.H[1] // ...............*........................................................................ // gap // ........................................................................................ 
- sqrdmulh v8.8H, v19.8H, v1.H[1] // ................*....................................................................... + mul v15.8H, v15.8H, v1.H[0] // ................*....................................................................... // gap // ........................................................................................ - sub v24.8H, v23.8H, v22.8H // ............................*........................................................... + sub v26.8H, v16.8H, v27.8H // ............................*........................................................... // gap // ........................................................................................ - add v21.8H, v23.8H, v22.8H // .............................*.......................................................... + add v16.8H, v16.8H, v27.8H // .............................*.......................................................... // gap // ........................................................................................ - mul v22.8H, v28.8H, v0.H[6] // ..........*............................................................................. + sqrdmulh v27.8H, v14.8H, v0.H[7] // ..........*............................................................................. // gap // ........................................................................................ - sqrdmulh v28.8H, v28.8H, v0.H[7] // ...........*............................................................................ + mul v14.8H, v14.8H, v0.H[6] // ...........*............................................................................ // gap // ........................................................................................ - mls v27.8H, v8.8H, v7.H[0] // .................*...................................................................... + mls v15.8H, v6.8H, v7.H[0] // .................*...................................................................... 
// gap // ........................................................................................ - sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + sub v6.8H, v11.8H, v13.8H // ..................*..................................................................... // gap // ........................................................................................ - add v3.8H, v3.8H, v26.8H // ...................*.................................................................... + add v11.8H, v11.8H, v13.8H // ...................*.................................................................... // gap // ........................................................................................ - mls v22.8H, v28.8H, v7.H[0] // ............*........................................................................... + mls v14.8H, v27.8H, v7.H[0] // ............*........................................................................... // gap // ........................................................................................ - mul v28.8H, v19.8H, v1.H[2] // ....................*................................................................... + sqrdmulh v27.8H, v6.8H, v1.H[3] // ....................*................................................................... // gap // ........................................................................................ - add v25.8H, v11.8H, v14.8H // ........................*............................................................... + sqrdmulh v13.8H, v26.8H, v0.H[3] // ..............................*......................................................... // gap // ........................................................................................ - mul v26.8H, v24.8H, v0.H[2] // ..............................*......................................................... 
+ mul v26.8H, v26.8H, v0.H[2] // ...............................*........................................................ // gap // ........................................................................................ - sqrdmulh v24.8H, v24.8H, v0.H[3] // ...............................*........................................................ + add v31.8H, v11.8H, v3.8H // .......................................*................................................ // gap // ........................................................................................ - add v20.8H, v3.8H, v25.8H // .......................................*................................................ + mul v6.8H, v6.8H, v1.H[2] // .....................*.................................................................. // gap // ........................................................................................ - sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + sub v25.8H, v25.8H, v22.8H // .......................*................................................................ // gap // ........................................................................................ - sub v11.8H, v11.8H, v14.8H // .......................*................................................................ + sub v22.8H, v16.8H, v31.8H // ................................................*....................................... // gap // ........................................................................................ - add v23.8H, v21.8H, v20.8H // .................................................*...................................... + add v16.8H, v16.8H, v31.8H // .................................................*...................................... // gap // ........................................................................................ 
- sub v14.8H, v21.8H, v20.8H // ................................................*....................................... + mls v6.8H, v27.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ - mls v28.8H, v19.8H, v7.H[0] // ......................*................................................................. + sub v27.8H, v14.8H, v15.8H // .................................*...................................................... // gap // ........................................................................................ - sub v19.8H, v22.8H, v27.8H // .................................*...................................................... + add v15.8H, v14.8H, v15.8H // ..................................*..................................................... // gap // ........................................................................................ - add v22.8H, v22.8H, v27.8H // ..................................*..................................................... + sqrdmulh v14.8H, v25.8H, v1.H[5] // .........................*.............................................................. // gap // ........................................................................................ - mul v27.8H, v11.8H, v1.H[4] // .........................*.............................................................. + mul v31.8H, v25.8H, v1.H[4] // ..........................*............................................................. // gap // ........................................................................................ - sqrdmulh v20.8H, v11.8H, v1.H[5] // ..........................*............................................................. + sub v11.8H, v11.8H, v3.8H // ......................................*................................................. 
// gap // ........................................................................................ - sub v3.8H, v3.8H, v25.8H // ......................................*................................................. + mls v26.8H, v13.8H, v7.H[0] // ................................*....................................................... // gap // ........................................................................................ - mls v26.8H, v24.8H, v7.H[0] // ................................*....................................................... + sqrdmulh v13.8H, v27.8H, v0.H[3] // ...................................*.................................................... // gap // ........................................................................................ - mul v24.8H, v19.8H, v0.H[2] // ...................................*.................................................... + mls v31.8H, v14.8H, v7.H[0] // ...........................*............................................................ // gap // ........................................................................................ - mls v27.8H, v20.8H, v7.H[0] // ...........................*............................................................ + mul v27.8H, v27.8H, v0.H[2] // ....................................*................................................... // gap // ........................................................................................ - sqrdmulh v19.8H, v19.8H, v0.H[3] // ....................................*................................................... + sqrdmulh v14.8H, v11.8H, v0.H[5] // ........................................*............................................... // gap // ........................................................................................ - mul v20.8H, v3.8H, v0.H[4] // ........................................*............................................... 
+ mul v11.8H, v11.8H, v0.H[4] // .........................................*.............................................. // gap // ........................................................................................ - sqrdmulh v3.8H, v3.8H, v0.H[5] // .........................................*.............................................. + sub v25.8H, v6.8H, v31.8H // ...........................................*............................................ // gap // ........................................................................................ - sub v11.8H, v28.8H, v27.8H // ...........................................*............................................ + mls v27.8H, v13.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ - mls v24.8H, v19.8H, v7.H[0] // .....................................*.................................................. + add v6.8H, v6.8H, v31.8H // ............................................*........................................... // gap // ........................................................................................ - add v19.8H, v28.8H, v27.8H // ............................................*........................................... + mls v11.8H, v14.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ - mul v28.8H, v11.8H, v0.H[4] // .............................................*.......................................... + sqrdmulh v14.8H, v25.8H, v0.H[5] // .............................................*.......................................... // gap // ........................................................................................ 
- sqrdmulh v27.8H, v11.8H, v0.H[5] // ..............................................*......................................... + mul v13.8H, v25.8H, v0.H[4] // ..............................................*......................................... // gap // ........................................................................................ - mls v20.8H, v3.8H, v7.H[0] // ..........................................*............................................. + sqrdmulh v31.8H, v22.8H, v0.H[1] // ..................................................*..................................... // gap // ........................................................................................ - mul v3.8H, v14.8H, v0.H[0] // ..................................................*..................................... + mul v25.8H, v22.8H, v0.H[0] // ...................................................*.................................... // gap // ........................................................................................ - sqrdmulh v25.8H, v14.8H, v0.H[1] // ...................................................*.................................... + sub v22.8H, v15.8H, v6.8H // .....................................................*.................................. // gap // ........................................................................................ - add v8.8H, v22.8H, v19.8H // ......................................................*................................. + mls v13.8H, v14.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ - sub v14.8H, v22.8H, v19.8H // .....................................................*.................................. + add v15.8H, v15.8H, v6.8H // ......................................................*................................. 
// gap // ........................................................................................ - mls v28.8H, v27.8H, v7.H[0] // ...............................................*........................................ + sqrdmulh v14.8H, v16.8H, v30.8H // ........................................................................*............... // gap // ........................................................................................ - mul v22.8H, v23.8H, v29.8H // ........................................................................*............... + mul v16.8H, v16.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ - sqrdmulh v11.8H, v14.8H, v0.H[1] // ........................................................*............................... + mls v25.8H, v31.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ - sqrdmulh v23.8H, v23.8H, v30.8H // .........................................................................*.............. + sqrdmulh v6.8H, v22.8H, v0.H[1] // .......................................................*................................ // gap // ........................................................................................ - mls v3.8H, v25.8H, v7.H[0] // ....................................................*................................... + mul v31.8H, v22.8H, v0.H[0] // ........................................................*............................... // gap // ........................................................................................ - mul v27.8H, v14.8H, v0.H[0] // .......................................................*................................ 
+ sub v22.8H, v26.8H, v11.8H // ..........................................................*............................. // gap // ........................................................................................ - sub v14.8H, v26.8H, v20.8H // ..........................................................*............................. + add v26.8H, v26.8H, v11.8H // ...........................................................*............................ // gap // ........................................................................................ - add v26.8H, v26.8H, v20.8H // ...........................................................*............................ + sub v11.8H, v27.8H, v13.8H // ...............................................................*........................ // gap // ........................................................................................ - sub v20.8H, v24.8H, v28.8H // ...............................................................*........................ + mls v31.8H, v6.8H, v7.H[0] // .........................................................*.............................. // gap // ........................................................................................ - mls v27.8H, v11.8H, v7.H[0] // .........................................................*.............................. + sqrdmulh v6.8H, v22.8H, v0.H[1] // ............................................................*........................... // gap // ........................................................................................ - mul v11.8H, v14.8H, v0.H[0] // ............................................................*........................... + mul v22.8H, v22.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ 
- sqrdmulh v14.8H, v14.8H, v0.H[1] // .............................................................*.......................... + add v27.8H, v27.8H, v13.8H // ................................................................*....................... // gap // ........................................................................................ - add v28.8H, v24.8H, v28.8H // ................................................................*....................... + sqrdmulh v13.8H, v11.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ - str q27, [x0, #320] // .....................................................................*.................. + mul v11.8H, v11.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ - mul v24.8H, v20.8H, v0.H[0] // .................................................................*...................... + mls v22.8H, v6.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ - sqrdmulh v20.8H, v20.8H, v0.H[1] // ..................................................................*..................... + str q25, [x0, #256] // ....................................................................*................... // gap // ........................................................................................ - str q3, [x0, #256] // ....................................................................*................... + mls v16.8H, v14.8H, v7.H[0] // ..........................................................................*............. 
// gap // ........................................................................................ - mls v11.8H, v14.8H, v7.H[0] // ..............................................................*......................... + mls v11.8H, v13.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ - mls v22.8H, v23.8H, v7.H[0] // ..........................................................................*............. + str q31, [x0, #320] // .....................................................................*.................. // gap // ........................................................................................ - mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... + sqrdmulh v14.8H, v15.8H, v30.8H // ...........................................................................*............ // gap // ........................................................................................ - mul v23.8H, v8.8H, v29.8H // ...........................................................................*............ + str q22, [x0, #384] // ......................................................................*................. // gap // ........................................................................................ - str q11, [x0, #384] // ......................................................................*................. + mul v15.8H, v15.8H, v29.8H // ............................................................................*........... // gap // ........................................................................................ - sqrdmulh v19.8H, v8.8H, v30.8H // ............................................................................*........... 
+ str q11, [x0, #448] // .......................................................................*................ // gap // ........................................................................................ - str q24, [x0, #448] // .......................................................................*................ + ldr q22, [x0, #464] // .......e................................................................................ // gap // ........................................................................................ - mul v27.8H, v26.8H, v29.8H // ..............................................................................*......... // gap // ........................................................................................ - str q22, [x0], #(16) // ....................................................................................*... // gap // ........................................................................................ - sqrdmulh v24.8H, v26.8H, v30.8H // ...............................................................................*........ + mls v15.8H, v14.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ - mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + ldr q25, [x0, #400] // ......e................................................................................. // gap // ........................................................................................ - mul v19.8H, v28.8H, v29.8H // .................................................................................*...... // gap // ........................................................................................ 
- sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... // gap // ........................................................................................ - mls v27.8H, v24.8H, v7.H[0] // ................................................................................*....... + sqrdmulh v14.8H, v27.8H, v30.8H // .................................................................................*...... // gap // ........................................................................................ - ldr q24, [x0, #64] // .e...................................................................................... + str q15, [x0, #64] // .....................................................................................*.. // gap // ........................................................................................ + ldr q13, [x0, #336] // .....e.................................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - mls v19.8H, v8.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - str q23, [x0, #48] // .....................................................................................*.. + ldr q11, [x0, #272] // ....e................................................................................... // gap // ........................................................................................ - ldr q22, [x0, #128] // ..e..................................................................................... // gap // ........................................................................................ 
// gap // ........................................................................................ + mul v27.8H, v27.8H, v29.8H // ..................................................................................*..... // gap // ........................................................................................ - str q27, [x0, #112] // ......................................................................................*. + str q16, [x0], #(16) // ....................................................................................*... // gap // ........................................................................................ - ldr q28, [x0, #192] // ...e.................................................................................... + sqrdmulh v6.8H, v26.8H, v30.8H // ..............................................................................*......... // gap // ........................................................................................ + mul v26.8H, v26.8H, v29.8H // ...............................................................................*........ // gap // ........................................................................................ + mls v27.8H, v14.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - str q19, [x0, #176] // .......................................................................................* + ldr q14, [x0, #128] // ..e..................................................................................... // gap // ........................................................................................ - ldr q3, [x0, #256] // ....e................................................................................... // gap // ........................................................................................ 
// gap // ........................................................................................ + mls v26.8H, v6.8H, v7.H[0] // ................................................................................*....... // gap // ........................................................................................ - ldr q26, [x0, #320] // .....e.................................................................................. + add v3.8H, v25.8H, v22.8H // ........................e............................................................... // gap // ........................................................................................ + ldr q6, [x0, #64] // .e...................................................................................... // gap // ........................................................................................ // gap // ........................................................................................ - ldr q11, [x0, #384] // ......e................................................................................. // gap // ........................................................................................ + str q26, [x0, #112] // ......................................................................................*. // gap // ........................................................................................ + ldr q26, [x0, #192] // ...e.................................................................................... // gap // ........................................................................................ - ldr q14, [x0, #448] // .......e................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ 
+ str q27, [x0, #176] // .......................................................................................* // gap // ........................................................................................ - // original source code - // ldr q8, [x0, #0] // ...........*................................................................................... - // ldr q9, [x0, #(1*(512/8))] // e..........|............................................................................e...... - // ldr q10, [x0, #(2*(512/8))] // ...e.......|...............................................................................e... - // ldr q11, [x0, #(3*(512/8))] // .....e.....|.................................................................................e. - // ldr q12, [x0, #(4*(512/8))] // .......e...|................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ........e..|................................................................................... - // ldr q14, [x0, #(6*(512/8))] // .........e.|................................................................................... - // ldr q15, [x0, #(7*(512/8))] // ..........e|................................................................................... - // sub v24.8h, v8.8h, v9.8h // ...........|..*................................................................................ - // add v8.8h, v8.8h, v9.8h // ...........|...*............................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ...........|........*.......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*......................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*..................................................................... 
- // sub v24.8h, v10.8h, v11.8h // ...........|*.................................................................................. - // add v10.8h, v10.8h, v11.8h // ...........|.*................................................................................. - // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.............................................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*............................................................................. - // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*........................................................................ - // sub v24.8h, v12.8h, v13.8h // ...........|...........*....................................................................... - // add v12.8h, v12.8h, v13.8h // ...........|............*...................................................................... - // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|...................*............................................................... - // mls v13.8h, v24.8h, v7.h[0] // ...........|.......................*........................................................... - // sub v24.8h, v14.8h, v15.8h // ...........|....................*.............................................................. - // add v14.8h, v14.8h, v15.8h // ...........|...............*................................................................... - // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*........................................................ - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|...........................*....................................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|...............................*................................................... 
- // sub v24.8h, v8.8h, v10.8h // ...........|......*............................................................................ - // add v8.8h, v8.8h, v10.8h // ...........|.......*........................................................................... - // mul v10.8h, v24.8h, v0.h[2] // ...........|................*.................................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|.................*................................................................. - // mls v10.8h, v24.8h, v7.h[0] // ...........|.............................*..................................................... - // sub v24.8h, v9.8h, v11.8h // ...........|........................*.......................................................... - // add v9.8h, v9.8h, v11.8h // ...........|.........................*......................................................... - // mul v11.8h, v24.8h, v0.h[2] // ...........|..............................*.................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*.................................................. - // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.............................................. - // sub v24.8h, v12.8h, v14.8h // ...........|............................*...................................................... - // add v12.8h, v12.8h, v14.8h // ...........|..................*................................................................ - // mul v14.8h, v24.8h, v0.h[4] // ...........|.................................*................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|..................................*................................................ - // mls v14.8h, v24.8h, v7.h[0] // ...........|........................................*.......................................... 
- // sub v24.8h, v13.8h, v15.8h // ...........|...................................*............................................... - // add v13.8h, v13.8h, v15.8h // ...........|.....................................*............................................. - // mul v15.8h, v24.8h, v0.h[4] // ...........|......................................*............................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.......................................*........................................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*..................................... - // sub v24.8h, v8.8h, v12.8h // ...........|......................*............................................................ - // add v8.8h, v8.8h, v12.8h // ...........|.....................*............................................................. - // mul v12.8h, v24.8h, v0.h[0] // ...........|.........................................*......................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................*........................................ - // mls v12.8h, v24.8h, v7.h[0] // ...........|.................................................*................................. - // sub v24.8h, v9.8h, v13.8h // ...........|............................................*...................................... - // add v9.8h, v9.8h, v13.8h // ...........|...........................................*....................................... - // mul v13.8h, v24.8h, v0.h[0] // ...........|..................................................*................................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|...............................................*................................... - // mls v13.8h, v24.8h, v7.h[0] // ...........|......................................................*............................ 
- // sub v24.8h, v10.8h, v14.8h // ...........|...................................................*............................... - // add v10.8h, v10.8h, v14.8h // ...........|....................................................*.............................. - // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*........................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|........................................................*.......................... - // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.................... - // sub v24.8h, v11.8h, v15.8h // ...........|.....................................................*............................. - // add v11.8h, v11.8h, v15.8h // ...........|.........................................................*......................... - // mul v15.8h, v24.8h, v0.h[0] // ...........|...........................................................*....................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|............................................................*...................... - // mls v15.8h, v24.8h, v7.h[0] // ...........|................................................................*.................. - // str q12, [x0, #(4*(512/8))] // ...........|.............................................................*..................... - // str q13, [x0, #(5*(512/8))] // ...........|..........................................................*........................ - // str q14, [x0, #(6*(512/8))] // ...........|..................................................................*................ - // str q15, [x0, #(7*(512/8))] // ...........|....................................................................*.............. - // mul v12.8h, v8.8h, v29.8h // ...........|..............................................*.................................... 
- // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|................................................*.................................. - // mls v12.8h, v8.8h, v7.h[0] // ...........|...............................................................*................... - // mul v13.8h, v9.8h, v29.8h // ...........|.................................................................*................. - // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|...................................................................*............... - // mls v13.8h, v9.8h, v7.h[0] // ...........|........................................................................*.......... - // mul v14.8h, v10.8h, v29.8h // ...........|.....................................................................*............. - // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|.......................................................................*........... - // mls v14.8h, v10.8h, v7.h[0] // ...........|...........................................................................*....... - // mul v15.8h, v11.8h, v29.8h // ...........|.........................................................................*......... - // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|..........................................................................*........ - // mls v15.8h, v11.8h, v7.h[0] // .*.........|.............................................................................*..... - // str q12, [x0], #(16) // ...........|......................................................................*............ - // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|..............................................................................*.... - // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|................................................................................*.. 
- // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|..................................................................................* + // ---------------------------------------------- new position ----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + // ldr q8, [x0, #0] // ...................*....................................................................................... + // ldr q9, [x0, #(1*(512/8))] // ...............e...'...................................................................................~... + // ldr q10, [x0, #(2*(512/8))] // ............e......'................................................................................~...... + // ldr q11, [x0, #(3*(512/8))] // .................e.'.....................................................................................~. + // ldr q12, [x0, #(4*(512/8))] // ......e............'..........................................................................~............ + // ldr q13, [x0, #(5*(512/8))] // .....e.............'.........................................................................~............. + // ldr q14, [x0, #(6*(512/8))] // ..e................'......................................................................~................ + // ldr q15, [x0, #(7*(512/8))] // e..................'....................................................................~.................. + // sub v24.8h, v8.8h, v9.8h // ...................'..*.................................................................................... + // add v8.8h, v8.8h, v9.8h // ...................'...*................................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ...................'........*.............................................................................. 
+ // mul v9.8h, v24.8h, v0.h[6] // ...................'.........*............................................................................. + // mls v9.8h, v27.8h, v7.h[0] // ...................'.............*......................................................................... + // sub v24.8h, v10.8h, v11.8h // ...................'*...................................................................................... + // add v10.8h, v10.8h, v11.8h // ...................'.*..................................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[1] // ...................'....*.................................................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...................'.....*................................................................................. + // mls v11.8h, v27.8h, v7.h[0] // ...................'..........*............................................................................ + // sub v24.8h, v12.8h, v13.8h // ...................'...........*........................................................................... + // add v12.8h, v12.8h, v13.8h // ...................'............*.......................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ...................'..............*........................................................................ + // mul v13.8h, v24.8h, v1.h[2] // ...................'..................*.................................................................... + // mls v13.8h, v27.8h, v7.h[0] // ...................'......................*................................................................ + // sub v24.8h, v14.8h, v15.8h // ...................'...................*................................................................... 
+ // add v14.8h, v14.8h, v15.8h // ..............e....'..................................................................................~.... + // sqrdmulh v27.8h, v24.8h, v1.h[5] // ...................'.........................*............................................................. + // mul v15.8h, v24.8h, v1.h[4] // ...................'..........................*............................................................ + // mls v15.8h, v27.8h, v7.h[0] // ...................'..............................*........................................................ + // sub v24.8h, v8.8h, v10.8h // ...................'......*................................................................................ + // add v8.8h, v8.8h, v10.8h // ...................'.......*............................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...................'...............*....................................................................... + // mul v10.8h, v24.8h, v0.h[2] // ...................'................*...................................................................... + // mls v10.8h, v27.8h, v7.h[0] // ...................'............................*.......................................................... + // sub v24.8h, v9.8h, v11.8h // ...................'.......................*............................................................... + // add v9.8h, v9.8h, v11.8h // ...................'........................*.............................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...................'.............................*......................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...................'...............................*....................................................... 
+ // mls v11.8h, v27.8h, v7.h[0] // ...................'...................................*................................................... + // sub v24.8h, v12.8h, v14.8h // ...................'...........................*........................................................... + // add v12.8h, v12.8h, v14.8h // ...................'.................*..................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...................'................................*...................................................... + // mul v14.8h, v24.8h, v0.h[4] // ...................'.................................*..................................................... + // mls v14.8h, v27.8h, v7.h[0] // ...................'.....................................*................................................. + // sub v24.8h, v13.8h, v15.8h // ...................'..................................*.................................................... + // add v13.8h, v13.8h, v15.8h // ...................'....................................*.................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...................'......................................*................................................ + // mul v15.8h, v24.8h, v0.h[4] // ...................'.......................................*............................................... + // mls v15.8h, v27.8h, v7.h[0] // ...................'...........................................*........................................... + // sub v24.8h, v8.8h, v12.8h // ...................'....................*.................................................................. + // add v8.8h, v8.8h, v12.8h // ...................'.....................*................................................................. 
+ // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...................'........................................*.............................................. + // mul v12.8h, v24.8h, v0.h[0] // ...................'.........................................*............................................. + // mls v12.8h, v27.8h, v7.h[0] // ...................'...............................................*....................................... + // sub v24.8h, v9.8h, v13.8h // ...................'..........................................*............................................ + // add v9.8h, v9.8h, v13.8h // ...................'............................................*.......................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...................'................................................*...................................... + // mul v13.8h, v24.8h, v0.h[0] // ...................'.................................................*..................................... + // mls v13.8h, v27.8h, v7.h[0] // ...................'.....................................................*................................. + // sub v24.8h, v10.8h, v14.8h // ...................'..................................................*.................................... + // add v10.8h, v10.8h, v14.8h // ...................'...................................................*................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...................'......................................................*................................ + // mul v14.8h, v24.8h, v0.h[0] // ...................'.......................................................*............................... + // mls v14.8h, v27.8h, v7.h[0] // ...................'...........................................................*........................... 
+ // sub v24.8h, v11.8h, v15.8h // ...................'....................................................*.................................. + // add v11.8h, v11.8h, v15.8h // ...................'........................................................*.............................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...................'.........................................................*............................. + // mul v15.8h, v24.8h, v0.h[0] // ...................'..........................................................*............................ + // mls v15.8h, v27.8h, v7.h[0] // ...................'..............................................................*........................ + // str q12, [x0, #(4*(512/8))] // ...................'............................................................*.......................... + // str q13, [x0, #(5*(512/8))] // ...................'...............................................................*....................... + // str q14, [x0, #(6*(512/8))] // ...................'.................................................................*..................... + // str q15, [x0, #(7*(512/8))] // ...................'...................................................................*................... + // sqrdmulh v27.8h, v8.8h, v30.8h // ...................'.............................................*......................................... + // mul v8.8h, v8.8h, v29.8h // ...................'..............................................*........................................ + // mls v8.8h, v27.8h, v7.h[0] // ...................'.............................................................*......................... + // sqrdmulh v27.8h, v9.8h, v30.8h // ...................'................................................................*...................... 
+ // mul v9.8h, v9.8h, v29.8h // ...................'..................................................................*.................... + // mls v9.8h, v27.8h, v7.h[0] // .~.................'.....................................................................*................. + // sqrdmulh v27.8h, v10.8h, v30.8h // .........~.........'.............................................................................*......... + // mul v10.8h, v10.8h, v29.8h // ..........~........'..............................................................................*........ + // mls v10.8h, v27.8h, v7.h[0] // .............~.....'.................................................................................*..... + // sqrdmulh v27.8h, v11.8h, v30.8h // ...~...............'.......................................................................*............... + // mul v11.8h, v11.8h, v29.8h // .......~...........'...........................................................................*........... + // mls v11.8h, v27.8h, v7.h[0] // ...........~.......'...............................................................................*....... + // str q8, [x0], #(16) // ........~..........'............................................................................*.......... + // str q9, [x0, #(-16 + 1*(512/8))] // ....~..............'........................................................................*.............. + // str q10, [x0, #(-16 + 2*(512/8))] // ................~..'....................................................................................*.. + // str q11, [x0, #(-16 + 3*(512/8))] // ..................~'......................................................................................* sub count, count, #1 cbnz count, layer123_start - ldr q4, [x0, #0] // *................................................................................ - // gap // ................................................................................. 
- // gap // ................................................................................. - // gap // ................................................................................. - sub v9.8H, v22.8H, v28.8H // .*............................................................................... - // gap // ................................................................................. - sub v5.8H, v3.8H, v26.8H // ............*.................................................................... - // gap // ................................................................................. - sub v6.8H, v4.8H, v24.8H // ...*............................................................................. - // gap // ................................................................................. - sqrdmulh v15.8H, v9.8H, v1.H[1] // ......*.......................................................................... - // gap // ................................................................................. - sub v20.8H, v11.8H, v14.8H // .....................*........................................................... - // gap // ................................................................................. - mul v19.8H, v6.8H, v0.H[6] // .........*....................................................................... - // gap // ................................................................................. - mul v23.8H, v5.8H, v1.H[2] // ...............*................................................................. - // gap // ................................................................................. - mul v27.8H, v20.8H, v1.H[4] // ...........................*..................................................... - // gap // ................................................................................. - sqrdmulh v21.8H, v20.8H, v1.H[5] // ............................*.................................................... 
- // gap // ................................................................................. - sqrdmulh v10.8H, v5.8H, v1.H[3] // ....................*............................................................ - // gap // ................................................................................. - mul v8.8H, v9.8H, v1.H[0] // .....*........................................................................... - // gap // ................................................................................. - sqrdmulh v9.8H, v6.8H, v0.H[7] // ..........*...................................................................... - // gap // ................................................................................. - mls v27.8H, v21.8H, v7.H[0] // ................................*................................................ - // gap // ................................................................................. - mls v23.8H, v10.8H, v7.H[0] // ........................*........................................................ - // gap // ................................................................................. - mls v8.8H, v15.8H, v7.H[0] // ...........*..................................................................... - // gap // ................................................................................. - mls v19.8H, v9.8H, v7.H[0] // ..............*.................................................................. - // gap // ................................................................................. - add v16.8H, v11.8H, v14.8H // ................*................................................................ - // gap // ................................................................................. - sub v20.8H, v23.8H, v27.8H // ....................................*............................................ - // gap // ................................................................................. 
- add v14.8H, v23.8H, v27.8H // ......................................*.......................................... - // gap // ................................................................................. - add v5.8H, v19.8H, v8.8H // ..........................*...................................................... - // gap // ................................................................................. - sub v15.8H, v19.8H, v8.8H // .........................*....................................................... - // gap // ................................................................................. - add v10.8H, v3.8H, v26.8H // .............*................................................................... - // gap // ................................................................................. - sub v27.8H, v5.8H, v14.8H // .............................................*................................... - // gap // ................................................................................. - add v9.8H, v22.8H, v28.8H // ..*.............................................................................. - // gap // ................................................................................. - add v4.8H, v4.8H, v24.8H // ....*............................................................................ - // gap // ................................................................................. - sqrdmulh v19.8H, v27.8H, v0.H[1] // ................................................*................................ - // gap // ................................................................................. - mul v23.8H, v27.8H, v0.H[0] // ...................................................*............................. - // gap // ................................................................................. - sqrdmulh v18.8H, v15.8H, v0.H[3] // .................................*............................................... 
- // gap // ................................................................................. - add v3.8H, v10.8H, v16.8H // ...................*............................................................. - // gap // ................................................................................. - add v24.8H, v4.8H, v9.8H // ........*........................................................................ - // gap // ................................................................................. - mls v23.8H, v19.8H, v7.H[0] // .......................................................*......................... - // gap // ................................................................................. - sqrdmulh v26.8H, v20.8H, v0.H[5] // ........................................*........................................ - // gap // ................................................................................. - sub v22.8H, v24.8H, v3.8H // .......................*......................................................... - // gap // ................................................................................. - mul v11.8H, v20.8H, v0.H[4] // .......................................*......................................... - // gap // ................................................................................. - str q23, [x0, #320] // ...........................................................*..................... - // gap // ................................................................................. - sqrdmulh v19.8H, v22.8H, v0.H[1] // ...........................................*..................................... - // gap // ................................................................................. - mul v25.8H, v22.8H, v0.H[0] // ..........................................*...................................... - // gap // ................................................................................. 
- mls v11.8H, v26.8H, v7.H[0] // ..............................................*.................................. - // gap // ................................................................................. - mul v20.8H, v15.8H, v0.H[2] // ...............................*................................................. - // gap // ................................................................................. - sub v12.8H, v10.8H, v16.8H // .............................*................................................... - // gap // ................................................................................. - mls v25.8H, v19.8H, v7.H[0] // ..................................................*.............................. - // gap // ................................................................................. - add v23.8H, v5.8H, v14.8H // ............................................*.................................... - // gap // ................................................................................. - mls v20.8H, v18.8H, v7.H[0] // .....................................*........................................... - // gap // ................................................................................. - mul v5.8H, v12.8H, v0.H[4] // ..................................*.............................................. - // gap // ................................................................................. - str q25, [x0, #256] // ..............................................................*.................. - // gap // ................................................................................. - mul v28.8H, v23.8H, v29.8H // ..................................................................*.............. - // gap // ................................................................................. - add v27.8H, v20.8H, v11.8H // ..........................................................*...................... 
- // gap // ................................................................................. - sqrdmulh v19.8H, v23.8H, v30.8H // ....................................................................*............ - // gap // ................................................................................. - sqrdmulh v22.8H, v12.8H, v0.H[5] // ...................................*............................................. - // gap // ................................................................................. - mul v23.8H, v27.8H, v29.8H // ..........................................................................*...... - // gap // ................................................................................. - sqrdmulh v14.8H, v27.8H, v30.8H // ...........................................................................*..... - // gap // ................................................................................. - mls v28.8H, v19.8H, v7.H[0] // .........................................................................*....... - // gap // ................................................................................. - mls v5.8H, v22.8H, v7.H[0] // .........................................*....................................... - // gap // ................................................................................. - sub v21.8H, v4.8H, v9.8H // .......*......................................................................... - // gap // ................................................................................. - mls v23.8H, v14.8H, v7.H[0] // .............................................................................*... - // gap // ................................................................................. - str q28, [x0, #64] // ..............................................................................*.. - // gap // ................................................................................. 
- mul v8.8H, v21.8H, v0.H[2] // .................*............................................................... - // gap // ................................................................................. - sqrdmulh v19.8H, v21.8H, v0.H[3] // ..................*.............................................................. - // gap // ................................................................................. - str q23, [x0, #192] // ................................................................................* - // gap // ................................................................................. - sub v27.8H, v20.8H, v11.8H // ......................................................*.......................... - // gap // ................................................................................. - add v25.8H, v24.8H, v3.8H // ......................*.......................................................... - // gap // ................................................................................. - mls v8.8H, v19.8H, v7.H[0] // ..............................*.................................................. - // gap // ................................................................................. - mul v3.8H, v27.8H, v0.H[0] // ............................................................*.................... - // gap // ................................................................................. - sqrdmulh v19.8H, v27.8H, v0.H[1] // .............................................................*................... - // gap // ................................................................................. - sqrdmulh v27.8H, v25.8H, v30.8H // .................................................*............................... - // gap // ................................................................................. - add v23.8H, v8.8H, v5.8H // .....................................................*........................... 
- // gap // ................................................................................. - sub v26.8H, v8.8H, v5.8H // ....................................................*............................ - // gap // ................................................................................. - mls v3.8H, v19.8H, v7.H[0] // .................................................................*............... - // gap // ................................................................................. - sqrdmulh v19.8H, v23.8H, v30.8H // ........................................................................*........ - // gap // ................................................................................. - mul v22.8H, v23.8H, v29.8H // ......................................................................*.......... - // gap // ................................................................................. - sqrdmulh v28.8H, v26.8H, v0.H[1] // .........................................................*....................... - // gap // ................................................................................. - mul v24.8H, v26.8H, v0.H[0] // ........................................................*........................ - // gap // ................................................................................. - mul v23.8H, v25.8H, v29.8H // ...............................................*................................. - // gap // ................................................................................. - mls v22.8H, v19.8H, v7.H[0] // ............................................................................*.... - // gap // ................................................................................. - str q3, [x0, #448] // .....................................................................*........... - // gap // ................................................................................. 
- mls v24.8H, v28.8H, v7.H[0] // ...............................................................*................. - // gap // ................................................................................. - mls v23.8H, v27.8H, v7.H[0] // ................................................................*................ - // gap // ................................................................................. - str q22, [x0, #128] // ...............................................................................*. - // gap // ................................................................................. - // gap // ................................................................................. - // gap // ................................................................................. - str q24, [x0, #384] // ...................................................................*............. - // gap // ................................................................................. - // gap // ................................................................................. - // gap // ................................................................................. - str q23, [x0], #(16) // .......................................................................*......... - // gap // ................................................................................. - - // original source code - // ldr q23, [x0, #0] // *................................................................................ - // sub v19.8H, v22.8H, v28.8H // .*............................................................................... - // add v22.8H, v22.8H, v28.8H // ........................*........................................................ - // sub v28.8H, v23.8H, v24.8H // ...*............................................................................. 
- // add v23.8H, v23.8H, v24.8H // .........................*....................................................... - // mul v27.8H, v19.8H, v1.H[0] // ...........*..................................................................... - // sqrdmulh v8.8H, v19.8H, v1.H[1] // ....*............................................................................ - // sub v24.8H, v23.8H, v22.8H // ......................................................*.......................... - // add v21.8H, v23.8H, v22.8H // ..............................*.................................................. - // mul v22.8H, v28.8H, v0.H[6] // ......*.......................................................................... - // sqrdmulh v28.8H, v28.8H, v0.H[7] // ............*.................................................................... - // mls v27.8H, v8.8H, v7.H[0] // ...............*................................................................. - // sub v19.8H, v3.8H, v26.8H // ..*.............................................................................. - // add v3.8H, v3.8H, v26.8H // ......................*.......................................................... - // mls v22.8H, v28.8H, v7.H[0] // ................*................................................................ - // mul v28.8H, v19.8H, v1.H[2] // .......*......................................................................... - // add v25.8H, v11.8H, v14.8H // .................*............................................................... - // mul v26.8H, v24.8H, v0.H[2] // .........................................................*....................... - // sqrdmulh v24.8H, v24.8H, v0.H[3] // ..........................................................*...................... - // add v20.8H, v3.8H, v25.8H // .............................*................................................... 
- // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..........*...................................................................... - // sub v11.8H, v11.8H, v14.8H // .....*........................................................................... - // add v23.8H, v21.8H, v20.8H // .............................................................*................... - // sub v14.8H, v21.8H, v20.8H // .................................*............................................... - // mls v28.8H, v19.8H, v7.H[0] // ..............*.................................................................. - // sub v19.8H, v22.8H, v27.8H // .....................*........................................................... - // add v22.8H, v22.8H, v27.8H // ....................*............................................................ - // mul v27.8H, v11.8H, v1.H[4] // ........*........................................................................ - // sqrdmulh v20.8H, v11.8H, v1.H[5] // .........*....................................................................... - // sub v3.8H, v3.8H, v25.8H // ........................................*........................................ - // mls v26.8H, v24.8H, v7.H[0] // ..............................................................*.................. - // mul v24.8H, v19.8H, v0.H[2] // .......................................*......................................... - // mls v27.8H, v20.8H, v7.H[0] // .............*................................................................... - // sqrdmulh v19.8H, v19.8H, v0.H[3] // ............................*.................................................... - // mul v20.8H, v3.8H, v0.H[4] // ............................................*.................................... - // sqrdmulh v3.8H, v3.8H, v0.H[5] // .................................................*............................... 
- // sub v11.8H, v28.8H, v27.8H // ..................*.............................................................. - // mls v24.8H, v19.8H, v7.H[0] // ...........................................*..................................... - // add v19.8H, v28.8H, v27.8H // ...................*............................................................. - // mul v28.8H, v11.8H, v0.H[4] // ..................................*.............................................. - // sqrdmulh v27.8H, v11.8H, v0.H[5] // ................................*................................................ - // mls v20.8H, v3.8H, v7.H[0] // .....................................................*........................... - // mul v3.8H, v14.8H, v0.H[0] // .....................................*........................................... - // sqrdmulh v25.8H, v14.8H, v0.H[1] // ....................................*............................................ - // add v8.8H, v22.8H, v19.8H // ..........................................*...................................... - // sub v14.8H, v22.8H, v19.8H // .......................*......................................................... - // mls v28.8H, v27.8H, v7.H[0] // ......................................*.......................................... - // mul v22.8H, v23.8H, v29.8H // .........................................................................*....... - // sqrdmulh v11.8H, v14.8H, v0.H[1] // ..........................*...................................................... - // sqrdmulh v23.8H, v23.8H, v30.8H // .................................................................*............... - // mls v3.8H, v25.8H, v7.H[0] // .........................................*....................................... - // mul v27.8H, v14.8H, v0.H[0] // ...........................*..................................................... 
- // sub v14.8H, v26.8H, v20.8H // ...................................................................*............. - // add v26.8H, v26.8H, v20.8H // ..................................................................*.............. - // sub v20.8H, v24.8H, v28.8H // ............................................................*.................... - // mls v27.8H, v11.8H, v7.H[0] // ...............................*................................................. - // mul v11.8H, v14.8H, v0.H[0] // ........................................................................*........ - // sqrdmulh v14.8H, v14.8H, v0.H[1] // .......................................................................*......... - // add v28.8H, v24.8H, v28.8H // ...............................................*................................. - // str q27, [x0, #320] // ...................................*............................................. - // mul v24.8H, v20.8H, v0.H[0] // ...............................................................*................. - // sqrdmulh v20.8H, v20.8H, v0.H[1] // ................................................................*................ - // str q3, [x0, #256] // .............................................*................................... - // mls v11.8H, v14.8H, v7.H[0] // ............................................................................*.... - // mls v22.8H, v23.8H, v7.H[0] // .............................................................................*... - // mls v24.8H, v20.8H, v7.H[0] // ....................................................................*............ - // mul v23.8H, v8.8H, v29.8H // ..............................................*.................................. - // str q11, [x0, #384] // ...............................................................................*. - // sqrdmulh v19.8H, v8.8H, v30.8H // ................................................*................................ 
- // str q24, [x0, #448] // ...........................................................................*..... - // mul v27.8H, v26.8H, v29.8H // ......................................................................*.......... - // str q22, [x0], #(16) // ................................................................................* - // sqrdmulh v24.8H, v26.8H, v30.8H // .....................................................................*........... - // mls v23.8H, v19.8H, v7.H[0] // ....................................................*............................ - // mul v19.8H, v28.8H, v29.8H // ..................................................*.............................. - // sqrdmulh v8.8H, v28.8H, v30.8H // ...................................................*............................. - // mls v27.8H, v24.8H, v7.H[0] // ..........................................................................*...... - // mls v19.8H, v8.8H, v7.H[0] // .......................................................*......................... - // str q23, [x0, #48] // ........................................................*........................ - // str q27, [x0, #112] // ..............................................................................*.. - // str q19, [x0, #176] // ...........................................................*..................... + // Instructions: 80 + // Expected cycles: 83 + // Expected IPC: 0.96 + // + // Cycle bound: 83.0 + // IPC bound: 0.96 + // + // Wall time: 31.99s + // User time: 31.99s + // + // ------------------------------ original position ------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|---- + ldr q2, [x0, #0] // *............................................................................... + // gap // ................................................................................ 
+ // gap // ................................................................................ + // gap // ................................................................................ + sub v27.8H, v14.8H, v26.8H // .*.............................................................................. + // gap // ................................................................................ + add v26.8H, v14.8H, v26.8H // ..*............................................................................. + // gap // ................................................................................ + sub v14.8H, v2.8H, v6.8H // ...*............................................................................ + // gap // ................................................................................ + add v16.8H, v2.8H, v6.8H // ....*........................................................................... + // gap // ................................................................................ + sqrdmulh v15.8H, v27.8H, v1.H[1] // .....*.......................................................................... + // gap // ................................................................................ + mul v6.8H, v27.8H, v1.H[0] // ......*......................................................................... + // gap // ................................................................................ + sub v27.8H, v16.8H, v26.8H // .......*........................................................................ + // gap // ................................................................................ + add v26.8H, v16.8H, v26.8H // ........*....................................................................... + // gap // ................................................................................ + sqrdmulh v16.8H, v14.8H, v0.H[7] // .........*...................................................................... 
+ // gap // ................................................................................ + mul v28.8H, v14.8H, v0.H[6] // ..........*..................................................................... + // gap // ................................................................................ + mls v6.8H, v15.8H, v7.H[0] // ...........*.................................................................... + // gap // ................................................................................ + sub v15.8H, v11.8H, v13.8H // ............*................................................................... + // gap // ................................................................................ + add v31.8H, v11.8H, v13.8H // .............*.................................................................. + // gap // ................................................................................ + mls v28.8H, v16.8H, v7.H[0] // ..............*................................................................. + // gap // ................................................................................ + sqrdmulh v4.8H, v15.8H, v1.H[3] // ...............*................................................................ + // gap // ................................................................................ + sqrdmulh v14.8H, v27.8H, v0.H[3] // ................*............................................................... + // gap // ................................................................................ + mul v16.8H, v27.8H, v0.H[2] // .................*.............................................................. + // gap // ................................................................................ + add v27.8H, v31.8H, v3.8H // ..................*............................................................. + // gap // ................................................................................ 
+ mul v13.8H, v15.8H, v1.H[2] // ...................*............................................................ + // gap // ................................................................................ + sub v15.8H, v25.8H, v22.8H // ....................*........................................................... + // gap // ................................................................................ + sub v11.8H, v26.8H, v27.8H // .....................*.......................................................... + // gap // ................................................................................ + add v2.8H, v26.8H, v27.8H // ......................*......................................................... + // gap // ................................................................................ + mls v13.8H, v4.8H, v7.H[0] // .......................*........................................................ + // gap // ................................................................................ + sub v26.8H, v28.8H, v6.8H // ........................*....................................................... + // gap // ................................................................................ + add v22.8H, v28.8H, v6.8H // .........................*...................................................... + // gap // ................................................................................ + sqrdmulh v27.8H, v15.8H, v1.H[5] // ..........................*..................................................... + // gap // ................................................................................ + mul v6.8H, v15.8H, v1.H[4] // ...........................*.................................................... + // gap // ................................................................................ + sub v15.8H, v31.8H, v3.8H // ............................*................................................... 
+ // gap // ................................................................................ + mls v16.8H, v14.8H, v7.H[0] // .............................*.................................................. + // gap // ................................................................................ + sqrdmulh v14.8H, v26.8H, v0.H[3] // ..............................*................................................. + // gap // ................................................................................ + mls v6.8H, v27.8H, v7.H[0] // ...............................*................................................ + // gap // ................................................................................ + mul v25.8H, v26.8H, v0.H[2] // ................................*............................................... + // gap // ................................................................................ + sqrdmulh v27.8H, v15.8H, v0.H[5] // .................................*.............................................. + // gap // ................................................................................ + mul v26.8H, v15.8H, v0.H[4] // ..................................*............................................. + // gap // ................................................................................ + sub v15.8H, v13.8H, v6.8H // ...................................*............................................ + // gap // ................................................................................ + mls v25.8H, v14.8H, v7.H[0] // ....................................*........................................... + // gap // ................................................................................ + add v31.8H, v13.8H, v6.8H // .....................................*.......................................... + // gap // ................................................................................ 
+ mls v26.8H, v27.8H, v7.H[0] // ......................................*......................................... + // gap // ................................................................................ + sqrdmulh v13.8H, v15.8H, v0.H[5] // .......................................*........................................ + // gap // ................................................................................ + mul v6.8H, v15.8H, v0.H[4] // ........................................*....................................... + // gap // ................................................................................ + sqrdmulh v27.8H, v11.8H, v0.H[1] // .........................................*...................................... + // gap // ................................................................................ + mul v14.8H, v11.8H, v0.H[0] // ..........................................*..................................... + // gap // ................................................................................ + sub v11.8H, v16.8H, v26.8H // ...................................................*............................ + // gap // ................................................................................ + sub v15.8H, v22.8H, v31.8H // ...........................................*.................................... + // gap // ................................................................................ + add v26.8H, v16.8H, v26.8H // ....................................................*........................... + // gap // ................................................................................ + mls v14.8H, v27.8H, v7.H[0] // ................................................*............................... + // gap // ................................................................................ + mul v27.8H, v15.8H, v0.H[0] // ..................................................*............................. 
+ // gap // ................................................................................ + sqrdmulh v15.8H, v15.8H, v0.H[1] // .................................................*.............................. + // gap // ................................................................................ + mls v6.8H, v13.8H, v7.H[0] // ............................................*................................... + // gap // ................................................................................ + str q14, [x0, #256] // .............................................................*.................. + // gap // ................................................................................ + add v31.8H, v22.8H, v31.8H // .............................................*.................................. + // gap // ................................................................................ + mls v27.8H, v15.8H, v7.H[0] // ......................................................*......................... + // gap // ................................................................................ + add v15.8H, v25.8H, v6.8H // .........................................................*...................... + // gap // ................................................................................ + sub v16.8H, v25.8H, v6.8H // .....................................................*.......................... + // gap // ................................................................................ + mul v13.8H, v2.8H, v29.8H // ...............................................*................................ + // gap // ................................................................................ + str q27, [x0, #320] // ................................................................*............... + // gap // ................................................................................ 
+ sqrdmulh v14.8H, v16.8H, v0.H[1] // ..........................................................*..................... + // gap // ................................................................................ + mul v6.8H, v16.8H, v0.H[0] // ...........................................................*.................... + // gap // ................................................................................ + mul v16.8H, v15.8H, v29.8H // ........................................................................*....... + // gap // ................................................................................ + sqrdmulh v15.8H, v15.8H, v30.8H // ......................................................................*......... + // gap // ................................................................................ + mul v27.8H, v31.8H, v29.8H // ...................................................................*............ + // gap // ................................................................................ + mls v6.8H, v14.8H, v7.H[0] // ...............................................................*................ + // gap // ................................................................................ + mul v19.8H, v26.8H, v29.8H // ...........................................................................*.... + // gap // ................................................................................ + mls v16.8H, v15.8H, v7.H[0] // ............................................................................*... + // gap // ................................................................................ + sqrdmulh v15.8H, v26.8H, v30.8H // ..........................................................................*..... + // gap // ................................................................................ + sqrdmulh v14.8H, v31.8H, v30.8H // .................................................................*.............. 
+ // gap // ................................................................................ + mul v26.8H, v11.8H, v0.H[0] // ........................................................*....................... + // gap // ................................................................................ + str q16, [x0, #192] // ...............................................................................* + // gap // ................................................................................ + mls v19.8H, v15.8H, v7.H[0] // .............................................................................*.. + // gap // ................................................................................ + sqrdmulh v15.8H, v11.8H, v0.H[1] // .......................................................*........................ + // gap // ................................................................................ + mls v27.8H, v14.8H, v7.H[0] // .....................................................................*.......... + // gap // ................................................................................ + sqrdmulh v16.8H, v2.8H, v30.8H // ..............................................*................................. + // gap // ................................................................................ + str q19, [x0, #128] // ..............................................................................*. + // gap // ................................................................................ + mls v26.8H, v15.8H, v7.H[0] // ............................................................*................... + // gap // ................................................................................ + str q27, [x0, #64] // .......................................................................*........ + // gap // ................................................................................ 
+ mls v13.8H, v16.8H, v7.H[0] // ..............................................................*................. + // gap // ................................................................................ + str q6, [x0, #448] // ....................................................................*........... + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + str q26, [x0, #384] // ..................................................................*............. + // gap // ................................................................................ + // gap // ................................................................................ + // gap // ................................................................................ + str q13, [x0], #(16) // .........................................................................*...... + // gap // ................................................................................ + + // -------------------------------- new position ---------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|---- + // ldr q16, [x0, #0] // *............................................................................... + // sub v15.8H, v14.8H, v26.8H // .*.............................................................................. + // add v27.8H, v14.8H, v26.8H // ..*............................................................................. + // sub v14.8H, v16.8H, v6.8H // ...*............................................................................ + // add v16.8H, v16.8H, v6.8H // ....*........................................................................... 
+ // sqrdmulh v6.8H, v15.8H, v1.H[1] // .....*.......................................................................... + // mul v15.8H, v15.8H, v1.H[0] // ......*......................................................................... + // sub v26.8H, v16.8H, v27.8H // .......*........................................................................ + // add v16.8H, v16.8H, v27.8H // ........*....................................................................... + // sqrdmulh v27.8H, v14.8H, v0.H[7] // .........*...................................................................... + // mul v14.8H, v14.8H, v0.H[6] // ..........*..................................................................... + // mls v15.8H, v6.8H, v7.H[0] // ...........*.................................................................... + // sub v6.8H, v11.8H, v13.8H // ............*................................................................... + // add v11.8H, v11.8H, v13.8H // .............*.................................................................. + // mls v14.8H, v27.8H, v7.H[0] // ..............*................................................................. + // sqrdmulh v27.8H, v6.8H, v1.H[3] // ...............*................................................................ + // sqrdmulh v13.8H, v26.8H, v0.H[3] // ................*............................................................... + // mul v26.8H, v26.8H, v0.H[2] // .................*.............................................................. + // add v31.8H, v11.8H, v3.8H // ..................*............................................................. + // mul v6.8H, v6.8H, v1.H[2] // ...................*............................................................ + // sub v25.8H, v25.8H, v22.8H // ....................*........................................................... + // sub v22.8H, v16.8H, v31.8H // .....................*.......................................................... 
+ // add v16.8H, v16.8H, v31.8H // ......................*......................................................... + // mls v6.8H, v27.8H, v7.H[0] // .......................*........................................................ + // sub v27.8H, v14.8H, v15.8H // ........................*....................................................... + // add v15.8H, v14.8H, v15.8H // .........................*...................................................... + // sqrdmulh v14.8H, v25.8H, v1.H[5] // ..........................*..................................................... + // mul v31.8H, v25.8H, v1.H[4] // ...........................*.................................................... + // sub v11.8H, v11.8H, v3.8H // ............................*................................................... + // mls v26.8H, v13.8H, v7.H[0] // .............................*.................................................. + // sqrdmulh v13.8H, v27.8H, v0.H[3] // ..............................*................................................. + // mls v31.8H, v14.8H, v7.H[0] // ...............................*................................................ + // mul v27.8H, v27.8H, v0.H[2] // ................................*............................................... + // sqrdmulh v14.8H, v11.8H, v0.H[5] // .................................*.............................................. + // mul v11.8H, v11.8H, v0.H[4] // ..................................*............................................. + // sub v25.8H, v6.8H, v31.8H // ...................................*............................................ + // mls v27.8H, v13.8H, v7.H[0] // ....................................*........................................... + // add v6.8H, v6.8H, v31.8H // .....................................*.......................................... + // mls v11.8H, v14.8H, v7.H[0] // ......................................*......................................... 
+ // sqrdmulh v14.8H, v25.8H, v0.H[5] // .......................................*........................................ + // mul v13.8H, v25.8H, v0.H[4] // ........................................*....................................... + // sqrdmulh v31.8H, v22.8H, v0.H[1] // .........................................*...................................... + // mul v25.8H, v22.8H, v0.H[0] // ..........................................*..................................... + // sub v22.8H, v15.8H, v6.8H // ............................................*................................... + // mls v13.8H, v14.8H, v7.H[0] // .................................................*.............................. + // add v15.8H, v15.8H, v6.8H // ...................................................*............................ + // sqrdmulh v14.8H, v16.8H, v30.8H // ........................................................................*....... + // mul v16.8H, v16.8H, v29.8H // .......................................................*........................ + // mls v25.8H, v31.8H, v7.H[0] // ..............................................*................................. + // sqrdmulh v6.8H, v22.8H, v0.H[1] // ................................................*............................... + // mul v31.8H, v22.8H, v0.H[0] // ...............................................*................................ + // sub v22.8H, v26.8H, v11.8H // ...........................................*.................................... + // add v26.8H, v26.8H, v11.8H // .............................................*.................................. + // sub v11.8H, v27.8H, v13.8H // ......................................................*......................... + // mls v31.8H, v6.8H, v7.H[0] // ....................................................*........................... 
+ // sqrdmulh v6.8H, v22.8H, v0.H[1] // ......................................................................*......... + // mul v22.8H, v22.8H, v0.H[0] // ...................................................................*............ + // add v27.8H, v27.8H, v13.8H // .....................................................*.......................... + // sqrdmulh v13.8H, v11.8H, v0.H[1] // .........................................................*...................... + // mul v11.8H, v11.8H, v0.H[0] // ..........................................................*..................... + // mls v22.8H, v6.8H, v7.H[0] // ..........................................................................*..... + // str q25, [x0, #256] // ..................................................*............................. + // mls v16.8H, v14.8H, v7.H[0] // ............................................................................*... + // mls v11.8H, v13.8H, v7.H[0] // ..............................................................*................. + // str q31, [x0, #320] // ........................................................*....................... + // sqrdmulh v14.8H, v15.8H, v30.8H // ..................................................................*............. + // str q22, [x0, #384] // ..............................................................................*. + // mul v15.8H, v15.8H, v29.8H // .............................................................*.................. + // str q11, [x0, #448] // .............................................................................*.. + // mls v15.8H, v14.8H, v7.H[0] // .......................................................................*........ + // sqrdmulh v14.8H, v27.8H, v30.8H // ............................................................*................... + // str q15, [x0, #64] // ...........................................................................*.... 
+ // mul v27.8H, v27.8H, v29.8H // ...........................................................*.................... + // str q16, [x0], #(16) // ...............................................................................* + // sqrdmulh v6.8H, v26.8H, v30.8H // .................................................................*.............. + // mul v26.8H, v26.8H, v29.8H // ...............................................................*................ + // mls v27.8H, v14.8H, v7.H[0] // ................................................................*............... + // mls v26.8H, v6.8H, v7.H[0] // .....................................................................*.......... + // str q26, [x0, #112] // .........................................................................*...... + // str q27, [x0, #176] // ....................................................................*........... pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s index 4e82bc77..807716d9 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. 
-.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, 
#(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,673 +339,715 @@ _intt_kyber_123_4567_opt_a72: mov count, #8 .p2align 2 - // gap // ........................................................ 
- ldr q13, [x1, #48] // .*...................................................... - ldr q14, [x1, #32] // *....................................................... - ldr q25, [x1, #0] // ...*.................................................... - ldr q1, [x1, #16] // ..*..................................................... - // gap // ........................................................ - ldr q4, [x4, #80] // .......................*................................ - ldr q6, [x4], #(6*16) // ....*................................................... - // gap // ........................................................ - ldr q11, [x4, #-32] // ...............*........................................ - ldr q10, [x4, #-64] // ............*........................................... - // gap // ........................................................ - trn2 v28.4S, v14.4S, v13.4S // ......*................................................. - trn1 v23.4S, v14.4S, v13.4S // ........*............................................... - ldr q9, [x4, #-48] // ..............*......................................... - trn2 v16.4S, v25.4S, v1.4S // .......*................................................ - trn1 v5.4S, v25.4S, v1.4S // .........*.............................................. - ldr q3, [x4, #-80] // .....*.................................................. - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - trn1 v14.2D, v5.2D, v23.2D // .............*.......................................... - trn1 v12.2D, v16.2D, v28.2D // ..........*............................................. 
- // gap // ........................................................ - trn2 v19.2D, v16.2D, v28.2D // ...........*............................................ - // gap // ........................................................ - // gap // ........................................................ - trn2 v22.2D, v5.2D, v23.2D // ................*....................................... - // gap // ........................................................ - // gap // ........................................................ - sub v26.8H, v14.8H, v12.8H // .................*...................................... - add v13.8H, v14.8H, v12.8H // .....................*.................................. - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sub v27.8H, v22.8H, v19.8H // ..................*..................................... - // gap // ........................................................ - // gap // ........................................................ - add v29.8H, v22.8H, v19.8H // ...................*.................................... - sqrdmulh v5.8H, v26.8H, v9.8H // ......................*................................. - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v30.8H, v27.8H, v4.8H // .........................*.............................. - // gap // ........................................................ - // gap // ........................................................ - sub v4.8H, v13.8H, v29.8H // ..........................*............................. 
- // gap // ........................................................ - // gap // ........................................................ - mul v24.8H, v26.8H, v10.8H // ....................*................................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mls v24.8H, v5.8H, v7.H[0] // ...........................*............................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mul v10.8H, v27.8H, v11.8H // ............................*........................... - add v27.8H, v13.8H, v29.8H // ........................*............................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mls v10.8H, v30.8H, v7.H[0] // .............................*.......................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v15.8H, v4.8H, v3.8H // ..............................*......................... 
- // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mul v25.8H, v4.8H, v6.8H // ................................*....................... - // gap // ........................................................ - // gap // ........................................................ - sub v4.8H, v24.8H, v10.8H // ...............................*........................ - // gap // ........................................................ - // gap // ........................................................ - add v0.8H, v24.8H, v10.8H // .....................................*.................. - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v16.8H, v4.8H, v3.8H // .................................*...................... - // gap // ........................................................ - // gap // ........................................................ - trn2 v3.4S, v27.4S, v0.4S // ......................................*................. - // gap // ........................................................ - // gap // ........................................................ - mul v12.8H, v4.8H, v6.8H // ...................................*.................... - ldr q6, [x3], #16 // ............................................*........... - // gap // ........................................................ - // gap // ........................................................ 
- // gap // ........................................................ - // gap // ........................................................ - mls v25.8H, v15.8H, v7.H[0] // ..................................*..................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mls v12.8H, v16.8H, v7.H[0] // ....................................*................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - trn1 v23.4S, v27.4S, v0.4S // .......................................*................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - trn2 v19.4S, v25.4S, v12.4S // ........................................*............... - trn1 v18.4S, v25.4S, v12.4S // .........................................*.............. - // gap // ........................................................ - // gap // ........................................................ 
- // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - trn2 v29.2D, v3.2D, v19.2D // ..........................................*............. - trn2 v21.2D, v23.2D, v18.2D // ...........................................*............ - // gap // ........................................................ - trn1 v10.2D, v3.2D, v19.2D // .............................................*.......... - trn1 v8.2D, v23.2D, v18.2D // ..............................................*......... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - add v11.8H, v21.8H, v29.8H // ...............................................*........ - sub v1.8H, v21.8H, v29.8H // ................................................*....... - // gap // ........................................................ - sub v23.8H, v8.8H, v10.8H // .................................................*...... - add v0.8H, v8.8H, v10.8H // ..................................................*..... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqdmulh v22.8H, v11.8H, v7.H[1] // ...................................................*.... 
- // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mul v2.8H, v1.8H, v6.H[4] // .......................................................* - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. - // gap // ........................................................ - // gap // ........................................................ - srshr v3.8H, v22.8H, #11 // ......................................................*. - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v24.8H, v23.8H, v6.H[3] // ....................................................*... - // gap // ........................................................ - // gap // ........................................................ - - // original source code - // ldr q5, [x1, #32] // .*...................................................... - // ldr q22, [x1, #48] // *....................................................... - // ldr q12, [x1, #16] // ...*.................................................... - // ldr q17, [x1, #0] // ..*..................................................... - // ldr q25, [x4], #(6*16) // .....*.................................................. - // ldr q16, [x4, #-80] // .............*.......................................... - // trn2 v19.4S, v5.4S, v22.4S // ........*............................................... 
- // trn2 v23.4S, v17.4S, v12.4S // ...........*............................................ - // trn1 v8.4S, v5.4S, v22.4S // .........*.............................................. - // trn1 v22.4S, v17.4S, v12.4S // ............*........................................... - // trn1 v1.2D, v23.2D, v19.2D // ...............*........................................ - // trn2 v15.2D, v23.2D, v19.2D // ................*....................................... - // ldr q23, [x4, #-64] // .......*................................................ - // trn1 v27.2D, v22.2D, v8.2D // ..............*......................................... - // ldr q29, [x4, #-48] // ..........*............................................. - // ldr q14, [x4, #-32] // ......*................................................. - // trn2 v20.2D, v22.2D, v8.2D // .................*...................................... - // sub v3.8H, v27.8H, v1.8H // ..................*..................................... - // sub v21.8H, v20.8H, v15.8H // ....................*................................... - // add v15.8H, v20.8H, v15.8H // .....................*.................................. - // mul v12.8H, v3.8H, v23.8H // .........................*.............................. - // add v5.8H, v27.8H, v1.8H // ...................*.................................... - // sqrdmulh v23.8H, v3.8H, v29.8H // ......................*................................. - // ldr q28, [x4, #-16] // ....*................................................... - // add v29.8H, v5.8H, v15.8H // ............................*........................... - // sqrdmulh v18.8H, v21.8H, v28.8H // .......................*................................ - // sub v9.8H, v5.8H, v15.8H // ........................*............................... - // mls v12.8H, v23.8H, v7.H[0] // ..........................*............................. - // mul v27.8H, v21.8H, v14.8H // ...........................*............................ 
- // mls v27.8H, v18.8H, v7.H[0] // .............................*.......................... - // sqrdmulh v1.8H, v9.8H, v16.8H // ..............................*......................... - // sub v28.8H, v12.8H, v27.8H // ................................*....................... - // mul v31.8H, v9.8H, v25.8H // ...............................*........................ - // sqrdmulh v19.8H, v28.8H, v16.8H // ..................................*..................... - // mls v31.8H, v1.8H, v7.H[0] // ......................................*................. - // mul v1.8H, v28.8H, v25.8H // ....................................*................... - // mls v1.8H, v19.8H, v7.H[0] // .......................................*................ - // add v19.8H, v12.8H, v27.8H // .................................*...................... - // trn2 v3.4S, v29.4S, v19.4S // ...................................*.................... - // trn1 v10.4S, v29.4S, v19.4S // ........................................*............... - // trn2 v20.4S, v31.4S, v1.4S // .........................................*.............. - // trn1 v31.4S, v31.4S, v1.4S // ..........................................*............. - // trn2 v30.2D, v3.2D, v20.2D // ...........................................*............ - // trn2 v28.2D, v10.2D, v31.2D // ............................................*........... - // ldr q6, [x3], #16 // .....................................*.................. - // trn1 v17.2D, v3.2D, v20.2D // .............................................*.......... - // trn1 v5.2D, v10.2D, v31.2D // ..............................................*......... - // add v11.8H, v28.8H, v30.8H // ...............................................*........ - // sub v1.8H, v28.8H, v30.8H // ................................................*....... - // sub v23.8H, v5.8H, v17.8H // .................................................*...... 
- // add v0.8H, v5.8H, v17.8H // ..................................................*..... - // sqdmulh v8.8H, v11.8H, v7.H[1] // ...................................................*.... - // sqrdmulh v24.8H, v23.8H, v6.H[3] // .......................................................* - // sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. - // srshr v3.8H, v8.8H, #11 // ......................................................*. - // mul v2.8H, v1.8H, v6.H[4] // ....................................................*... + // Instructions: 54 + // Expected cycles: 57 + // Expected IPC: 0.95 + // + // Cycle bound: 57.0 + // IPC bound: 0.95 + // + // Wall time: 1.60s + // User time: 1.60s + // + // ----------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|--- + ldr q10, [x1, #32] // *..................................................... + ldr q23, [x1, #48] // .*.................................................... + // gap // ...................................................... + ldr q18, [x1, #0] // ..*................................................... + ldr q15, [x1, #16] // ...*.................................................. + // gap // ...................................................... + ldr q8, [x4], #(6*16) // .........*............................................ + // gap // ...................................................... + // gap // ...................................................... + ldr q28, [x4, #-80] // ..........*........................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v26.4S, v10.4S, v23.4S // .......*.............................................. + trn2 v31.4S, v10.4S, v23.4S // ........*............................................. 
+ ldr q14, [x4, #-32] // .....*................................................ + trn2 v1.4S, v18.4S, v15.4S // ...........*.......................................... + trn1 v18.4S, v18.4S, v15.4S // ............*......................................... + ldr q0, [x4, #-64] // .............*........................................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v23.2D, v1.2D, v31.2D // ................*..................................... + // gap // ...................................................... + trn1 v16.2D, v18.2D, v26.2D // ...............*...................................... + trn2 v19.2D, v18.2D, v26.2D // .................*.................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v3.2D, v1.2D, v31.2D // ..............*....................................... + ldr q1, [x4, #-48] // ......*............................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v21.8H, v16.8H, v23.8H // ..................*................................... + add v16.8H, v16.8H, v23.8H // ....................*................................. + ldr q23, [x4, #-16] // ....*................................................. + // gap // ...................................................... + sub v15.8H, v19.8H, v3.8H // .....................*................................ 
+ // gap // ...................................................... + // gap // ...................................................... + mul v30.8H, v21.8H, v0.8H // .......................*.............................. + add v24.8H, v19.8H, v3.8H // ...................*.................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v11.8H, v15.8H, v23.8H // .........................*............................ + // gap // ...................................................... + // gap // ...................................................... + add v4.8H, v16.8H, v24.8H // ...................................*.................. + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v17.8H, v21.8H, v1.8H // ......................*............................... + sub v19.8H, v16.8H, v24.8H // ........................*............................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v13.8H, v15.8H, v14.8H // ...........................*.......................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v13.8H, v11.8H, v7.H[0] // ............................*......................... 
+ ldr q11, [x3], #16 // ..............................................*....... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v30.8H, v17.8H, v7.H[0] // ..........................*........................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v2.8H, v19.8H, v8.8H // .................................*.................... + // gap // ...................................................... + // gap // ...................................................... + sub v0.8H, v30.8H, v13.8H // .............................*........................ + sqrdmulh v12.8H, v19.8H, v28.8H // ...............................*...................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + sqrdmulh v22.8H, v0.8H, v28.8H // ................................*..................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v17.8H, v0.8H, v8.8H // ..................................*................... + add v0.8H, v30.8H, v13.8H // ..............................*....................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v2.8H, v12.8H, v7.H[0] // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + trn1 v1.4S, v4.4S, v0.4S // .....................................*................ + // gap // ...................................................... + // gap // ...................................................... + mls v17.8H, v22.8H, v7.H[0] // ....................................*................. + trn2 v22.4S, v4.4S, v0.4S // .......................................*.............. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v4.4S, v2.4S, v17.4S // .........................................*............ + // gap // ...................................................... + trn2 v17.4S, v2.4S, v17.4S // ........................................*............. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v31.2D, v1.2D, v4.2D // ...........................................*.......... + trn1 v9.2D, v22.2D, v17.2D // ..........................................*........... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v6.2D, v22.2D, v17.2D // .............................................*........ + // gap // ...................................................... + // gap // ...................................................... + trn2 v10.2D, v1.2D, v4.2D // ............................................*......... + add v0.8H, v31.8H, v9.8H // ................................................*..... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v23.8H, v31.8H, v9.8H // ...............................................*...... + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v13.8H, v0.8H, v7.H[1] // .....................................................* + // gap // ...................................................... + sub v27.8H, v10.8H, v6.8H // .................................................*.... + add v6.8H, v10.8H, v6.8H // ..................................................*... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v14.8H, v23.8H, v11.H[3] // ....................................................*. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v26.8H, v23.8H, v11.H[2] // ...................................................*.. + // gap // ...................................................... + // gap // ...................................................... + + // ------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|--- + // ldr q22, [x1, #32] // *..................................................... + // ldr q20, [x1, #48] // .*.................................................... + // ldr q5, [x1, #0] // ..*................................................... 
+ // ldr q9, [x1, #16] // ...*.................................................. + // ldr q25, [x4, #80] // ...................*.................................. + // ldr q1, [x4, #64] // ........*............................................. + // ldr q3, [x4, #48] // ................*..................................... + // trn1 v19.4S, v22.4S, v20.4S // ......*............................................... + // trn2 v24.4S, v22.4S, v20.4S // .......*.............................................. + // ldr q8, [x4], #(6*16) // ....*................................................. + // ldr q4, [x4, #-80] // .....*................................................ + // trn2 v22.4S, v5.4S, v9.4S // .........*............................................ + // trn1 v12.4S, v5.4S, v9.4S // ..........*........................................... + // ldr q5, [x4, #-64] // ...........*.......................................... + // trn2 v31.2D, v22.2D, v24.2D // ...............*...................................... + // trn1 v23.2D, v12.2D, v19.2D // .............*........................................ + // trn1 v21.2D, v22.2D, v24.2D // ............*......................................... + // trn2 v12.2D, v12.2D, v19.2D // ..............*....................................... + // sub v13.8H, v23.8H, v21.8H // .................*.................................... + // add v17.8H, v12.8H, v31.8H // ......................*............................... + // add v2.8H, v23.8H, v21.8H // ..................*................................... + // sub v29.8H, v12.8H, v31.8H // ....................*................................. + // sqrdmulh v31.8H, v13.8H, v3.8H // .........................*............................ + // mul v3.8H, v13.8H, v5.8H // .....................*................................ + // sub v28.8H, v2.8H, v17.8H // ..........................*........................... 
+ // sqrdmulh v22.8H, v29.8H, v25.8H // .......................*.............................. + // mls v3.8H, v31.8H, v7.H[0] // ..............................*....................... + // mul v25.8H, v29.8H, v1.8H // ...........................*.......................... + // mls v25.8H, v22.8H, v7.H[0] // ............................*......................... + // sub v16.8H, v3.8H, v25.8H // ................................*..................... + // add v18.8H, v3.8H, v25.8H // ....................................*................. + // sqrdmulh v31.8H, v28.8H, v4.8H // .................................*.................... + // sqrdmulh v12.8H, v16.8H, v4.8H // ..................................*................... + // mul v13.8H, v28.8H, v8.8H // ...............................*...................... + // mul v3.8H, v16.8H, v8.8H // ...................................*.................. + // add v14.8H, v2.8H, v17.8H // ........................*............................. + // mls v3.8H, v12.8H, v7.H[0] // .......................................*.............. + // trn1 v22.4S, v14.4S, v18.4S // ......................................*............... + // mls v13.8H, v31.8H, v7.H[0] // .....................................*................ + // trn2 v25.4S, v14.4S, v18.4S // ........................................*............. + // trn2 v14.4S, v13.4S, v3.4S // ..........................................*........... + // trn1 v19.4S, v13.4S, v3.4S // .........................................*............ + // trn1 v21.2D, v25.2D, v14.2D // ............................................*......... + // trn1 v26.2D, v22.2D, v19.2D // ...........................................*.......... + // trn2 v31.2D, v22.2D, v19.2D // ..............................................*....... + // trn2 v13.2D, v25.2D, v14.2D // .............................................*........ + // ldr q11, [x3], #16 // .............................*........................ 
+ // sub v16.8H, v26.8H, v21.8H // ................................................*..... + // add v0.8H, v26.8H, v21.8H // ...............................................*...... + // sub v27.8H, v31.8H, v13.8H // ..................................................*... + // add v6.8H, v31.8H, v13.8H // ...................................................*.. + // mul v26.8H, v16.8H, v11.H[2] // .....................................................* + // sqrdmulh v14.8H, v16.8H, v11.H[3] // ....................................................*. + // sqdmulh v13.8H, v0.8H, v7.H[1] // .................................................*.... sub count, count, #1 layer4567_start: - sqrdmulh v30.8H, v1.8H, v6.H[5] // .......................................................*........................... - ldr q5, [x1, #96] // ..e................................................................................ - ldr q22, [x1, #112] // ...e............................................................................... - srshr v28.8H, v12.8H, #11 // ..........................................................*........................ - ldr q12, [x1, #80] // .e................................................................................. - ldr q17, [x1, #64] // e.................................................................................. - ldr q25, [x4], #(6*16) // ............e...................................................................... - mul v26.8H, v23.8H, v6.H[2] // .................................................*................................. - ldr q16, [x4, #-80] // .............e..................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- trn2 v19.4S, v5.4S, v22.4S // .......e........................................................................... - mls v26.8H, v24.8H, v7.H[0] // ...................................................*............................... - // gap // ................................................................................... - trn2 v23.4S, v17.4S, v12.4S // .....e............................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - mls v2.8H, v30.8H, v7.H[0] // ........................................................*.......................... - trn1 v8.4S, v5.4S, v22.4S // ......e............................................................................ - // gap // ................................................................................... - trn1 v22.4S, v17.4S, v12.4S // ....e.............................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - trn1 v1.2D, v23.2D, v19.2D // ...........e....................................................................... - mls v11.8H, v3.8H, v7.H[0] // ..............................................................*.................... - // gap // ................................................................................... - trn2 v15.2D, v23.2D, v19.2D // .........e......................................................................... - ldr q23, [x4, #-64] // ..............e.................................................................... - // gap // ................................................................................... 
- trn1 v27.2D, v22.2D, v8.2D // ..........e........................................................................ - sqdmulh v24.8H, v26.8H, v7.H[1] // ...............................................................*................... - ldr q29, [x4, #-48] // ...............e................................................................... - ldr q14, [x4, #-32] // ................e.................................................................. - trn2 v20.2D, v22.2D, v8.2D // ........e.......................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqdmulh v19.8H, v2.8H, v7.H[1] // ..................................................................*................ - sub v3.8H, v27.8H, v1.8H // ..................e................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - sub v21.8H, v20.8H, v15.8H // .......................e........................................................... - mls v0.8H, v28.8H, v7.H[0] // ...........................................................*....................... - // gap // ................................................................................... - srshr v22.8H, v24.8H, #11 // ................................................................*.................. - // gap // ................................................................................... - // gap // ................................................................................... 
- add v15.8H, v20.8H, v15.8H // ........................e.......................................................... - mul v12.8H, v3.8H, v23.8H // ....................e.............................................................. - // gap // ................................................................................... - srshr v19.8H, v19.8H, #11 // ...................................................................*............... - // gap // ................................................................................... - // gap // ................................................................................... - add v5.8H, v27.8H, v1.8H // ...................e............................................................... - sqrdmulh v23.8H, v3.8H, v29.8H // .....................e............................................................. - ldr q28, [x4, #-16] // .................e................................................................. - add v10.8H, v0.8H, v11.8H // ......................................................................*............ - // gap // ................................................................................... - // gap // ................................................................................... - mls v2.8H, v19.8H, v7.H[0] // ....................................................................*.............. - // gap // ................................................................................... - // gap // ................................................................................... - add v29.8H, v5.8H, v15.8H // .............................e..................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- str q10, [x1], #(64) // ...............................................................................*... - sqrdmulh v18.8H, v21.8H, v28.8H // ..........................e........................................................ - sub v9.8H, v5.8H, v15.8H // ............................e...................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v12.8H, v23.8H, v7.H[0] // ......................e............................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v27.8H, v21.8H, v14.8H // .........................e......................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v27.8H, v18.8H, v7.H[0] // ...........................e....................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v1.8H, v9.8H, v16.8H // ...............................e................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v26.8H, v22.8H, v7.H[0] // .................................................................*................. - // gap // ................................................................................... - // gap // ................................................................................... - sub v28.8H, v12.8H, v27.8H // .................................e................................................. - // gap // ................................................................................... - // gap // ................................................................................... - mul v31.8H, v9.8H, v25.8H // ..............................e.................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v19.8H, v28.8H, v16.8H // ....................................e.............................................. - // gap // ................................................................................... - // gap // ................................................................................... - add v23.8H, v26.8H, v2.8H // ...........................................................................*....... - // gap // ................................................................................... - // gap // ................................................................................... - mls v31.8H, v1.8H, v7.H[0] // ................................e.................................................. - sub v22.8H, v26.8H, v2.8H // ..........................................................................*........ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - str q23, [x1, #-48] // ................................................................................*.. - mul v1.8H, v28.8H, v25.8H // ...................................e............................................... - sub v28.8H, v0.8H, v11.8H // .....................................................................*............. 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v1.8H, v19.8H, v7.H[0] // .....................................e............................................. - add v19.8H, v12.8H, v27.8H // ..................................e................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v23.8H, v28.8H, v6.H[1] // ........................................................................*.......... - // gap // ................................................................................... - // gap // ................................................................................... - trn2 v3.4S, v29.4S, v19.4S // .......................................e........................................... - // gap // ................................................................................... - // gap // ................................................................................... - trn1 v10.4S, v29.4S, v19.4S // ......................................e............................................ - mul v4.8H, v28.8H, v6.H[0] // .......................................................................*........... - // gap // ................................................................................... - trn2 v20.4S, v31.4S, v1.4S // .........................................e......................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - trn1 v31.4S, v31.4S, v1.4S // ........................................e.......................................... - // gap // ................................................................................... - sqrdmulh v19.8H, v22.8H, v6.H[1] // .............................................................................*..... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v4.8H, v23.8H, v7.H[0] // .........................................................................*......... - trn2 v30.2D, v3.2D, v20.2D // ...........................................e....................................... - // gap // ................................................................................... - trn2 v28.2D, v10.2D, v31.2D // ..........................................e........................................ - // gap // ................................................................................... - // gap // ................................................................................... - mul v22.8H, v22.8H, v6.H[0] // ............................................................................*...... - ldr q6, [x3], #16 // ..............................................e.................................... - trn1 v17.2D, v3.2D, v20.2D // .............................................e..................................... - trn1 v5.2D, v10.2D, v31.2D // ............................................e...................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - add v11.8H, v28.8H, v30.8H // .....................................................e............................. - mls v22.8H, v19.8H, v7.H[0] // ..............................................................................*.... - // gap // ................................................................................... - str q4, [x1, #-32] // .................................................................................*. - sub v1.8H, v28.8H, v30.8H // ....................................................e.............................. - // gap // ................................................................................... - sub v23.8H, v5.8H, v17.8H // ...............................................e................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v0.8H, v5.8H, v17.8H // ................................................e.................................. - sqdmulh v8.8H, v11.8H, v7.H[1] // ............................................................e...................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - str q22, [x1, #-16] // ..................................................................................* - sqrdmulh v24.8H, v23.8H, v6.H[3] // ..................................................e................................ 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqdmulh v12.8H, v0.8H, v7.H[1] // .........................................................e......................... - // gap // ................................................................................... - // gap // ................................................................................... - srshr v3.8H, v8.8H, #11 // .............................................................e..................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v2.8H, v1.8H, v6.H[4] // ......................................................e............................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - - // original source code - // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e......................................................................... 
- // ldr q9, [x1, #(16*1)] // ...e..............................................................................|...e.......................................................................... - // ldr q10, [x1, #(16*2)] // e.................................................................................|e............................................................................. - // ldr q11, [x1, #(16*3)] // .e................................................................................|.e............................................................................ - // trn1 v25.4s, v8.4s, v9.4s // .............e....................................................................|.............e................................................................ - // trn2 v26.4s, v8.4s, v9.4s // ..........e.......................................................................|..........e................................................................... - // trn1 v27.4s, v10.4s, v11.4s // ............e.....................................................................|............e................................................................. - // trn2 v28.4s, v10.4s, v11.4s // ........e.........................................................................|........e..................................................................... - // trn2 v10.2d, v25.2d, v27.2d // ......................e...........................................................|......................e....................................................... - // trn2 v11.2d, v26.2d, v28.2d // ................e.................................................................|................e............................................................. - // trn1 v8.2d, v25.2d, v27.2d // ..................e...............................................................|..................e........................................................... 
- // trn1 v9.2d, v26.2d, v28.2d // ..............e...................................................................|..............e............................................................... - // ldr q0, [x4], #(6*16) // .....e............................................................................|.....e........................................................................ - // ldr q4, [x4, #(-6*16 + 1*16)] // .......e..........................................................................|.......e...................................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // .................e................................................................|.................e............................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ....................e.............................................................|....................e......................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // .....................e............................................................|.....................e........................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // .................................e................................................|.................................e............................................ - // sub v24.8h, v8.8h, v9.8h // ........................e.........................................................|........................e..................................................... - // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e.............................................. - // mul v9.8h, v24.8h, v1.8h // .............................e....................................................|.............................e................................................ 
- // sqrdmulh v24.8h, v24.8h, v5.8h // ................................e.................................................|................................e............................................. - // mls v9.8h, v24.8h, v7.h[0] // ........................................e.........................................|........................................e..................................... - // sub v24.8h, v10.8h, v11.8h // .........................e........................................................|.........................e.................................................... - // add v10.8h, v10.8h, v11.8h // ............................e.....................................................|............................e................................................. - // mul v11.8h, v24.8h, v2.8h // .........................................e........................................|.........................................e.................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................e...........................................|......................................e....................................... - // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.......................................|..........................................e................................... - // sub v24.8h, v8.8h, v10.8h // .......................................e..........................................|.......................................e...................................... - // add v8.8h, v8.8h, v10.8h // ....................................e.............................................|....................................e......................................... - // mul v10.8h, v24.8h, v0.8h // ..............................................e...................................|..............................................e............................... 
- // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................................e......................................|...........................................e.................................. - // mls v10.8h, v24.8h, v7.h[0] // .................................................e................................|.................................................e............................ - // sub v24.8h, v9.8h, v11.8h // .............................................e....................................|.............................................e................................ - // add v9.8h, v9.8h, v11.8h // .......................................................e..........................|.......................................................e...................... - // mul v11.8h, v24.8h, v0.8h // ....................................................e.............................|....................................................e......................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................e..................................|...............................................e.............................. - // mls v11.8h, v24.8h, v7.h[0] // ......................................................e...........................|......................................................e....................... - // trn1 v25.4s, v8.4s, v9.4s // ..........................................................e.......................|..........................................................e................... - // trn2 v26.4s, v8.4s, v9.4s // .........................................................e........................|.........................................................e.................... - // trn1 v27.4s, v10.4s, v11.4s // .............................................................e....................|.............................................................e................ 
- // trn2 v28.4s, v10.4s, v11.4s // ............................................................e.....................|............................................................e................. - // trn2 v10.2d, v25.2d, v27.2d // .................................................................e................|.................................................................e............ - // trn2 v11.2d, v26.2d, v28.2d // ................................................................e.................|................................................................e............. - // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e............|.....................................................................e........ - // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.............|....................................................................e......... - // ldr q0, [x3], #16 // ...................................................................e..............|...................................................................e.......... - // sub v24.8h, v8.8h, v9.8h // ..........................................................................e.......|..........................................................................e... - // add v8.8h, v8.8h, v9.8h // ...........................................................................e......|...........................................................................e.. - // mul v9.8h, v24.8h, v0.h[2] // ......*...........................................................................|......*....................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..............................................................................e...|.............................................................................. 
- // mls v9.8h, v24.8h, v7.h[0] // .........*........................................................................|.........*.................................................................... - // sub v24.8h, v10.8h, v11.8h // .........................................................................e........|.........................................................................e.... - // add v10.8h, v10.8h, v11.8h // ......................................................................e...........|......................................................................e....... - // mul v11.8h, v24.8h, v0.h[4] // .................................................................................e|.............................................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................................*.............................................................................. - // mls v11.8h, v24.8h, v7.h[0] // ...........*......................................................................|...........*.................................................................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...............................................................................e..|.............................................................................. - // srshr v25.8h, v25.8h, #11 // ..*...............................................................................|..*........................................................................... - // mls v8.8h, v25.8h, v7.h[0] // ..........................*.......................................................|..........................*................................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ............................................................................e.....|............................................................................e. 
- // srshr v25.8h, v25.8h, #11 // ................................................................................e.|.............................................................................. - // mls v10.8h, v25.8h, v7.h[0] // ...............*..................................................................|...............*.............................................................. - // sqdmulh v25.8h, v9.8h, v7.h[1] // ...................*..............................................................|...................*.......................................................... - // srshr v25.8h, v25.8h, #11 // ...........................*......................................................|...........................*.................................................. - // mls v9.8h, v25.8h, v7.h[0] // ............................................*.....................................|............................................*................................. - // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................*..........................................................|.......................*...................................................... - // srshr v25.8h, v25.8h, #11 // ..............................*...................................................|..............................*............................................... - // mls v11.8h, v25.8h, v7.h[0] // ...................................*..............................................|...................................*.......................................... - // sub v24.8h, v8.8h, v10.8h // .....................................................*............................|.....................................................*........................ - // add v8.8h, v8.8h, v10.8h // ..................................*...............................................|..................................*........................................... 
- // mul v10.8h, v24.8h, v0.h[0] // ...........................................................*......................|...........................................................*.................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*.........................|........................................................*..................... - // mls v10.8h, v24.8h, v7.h[0] // ...............................................................*..................|...............................................................*.............. - // sub v24.8h, v9.8h, v11.8h // ..................................................*...............................|..................................................*........................... - // add v9.8h, v9.8h, v11.8h // ................................................*.................................|................................................*............................. - // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*...............|..................................................................*........... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*...................|..............................................................*............... - // mls v11.8h, v24.8h, v7.h[0] // .......................................................................*..........|.......................................................................*...... - // str q8, [x1], #(64) // .....................................*............................................|.....................................*........................................ - // str q9, [x1, #(-64 + 16*1)] // ...................................................*..............................|...................................................*.......................... 
- // str q10, [x1, #(-64 + 16*2)] // ........................................................................*.........|........................................................................*..... - // str q11, [x1, #(-64 + 16*3)] // .............................................................................*....|.............................................................................* + // Instructions: 83 + // Expected cycles: 65 + // Expected IPC: 1.28 + // + // Cycle bound: 65.0 + // IPC bound: 1.28 + // + // Wall time: 290.91s + // User time: 290.91s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + ldr q22, [x1, #96] // ..e................................................................................ + sqdmulh v31.8H, v6.8H, v7.H[1] // ............................................................*...................... + ldr q20, [x1, #112] // ...e............................................................................... + ldr q5, [x1, #64] // e.................................................................................. + ldr q9, [x1, #80] // .e................................................................................. + // gap // ................................................................................... + ldr q25, [x4, #80] // .................e................................................................. + ldr q1, [x4, #64] // ................e.................................................................. + sqrdmulh v16.8H, v27.8H, v11.H[5] // ......................................................*............................ + // gap // ................................................................................... + // gap // ................................................................................... 
+ srshr v13.8H, v13.8H, #11 // ..........................................................*........................ + ldr q3, [x4, #48] // ...............e................................................................... + trn1 v19.4S, v22.4S, v20.4S // ......e............................................................................ + mls v26.8H, v14.8H, v7.H[0] // ...................................................*............................... + trn2 v24.4S, v22.4S, v20.4S // .......e........................................................................... + ldr q8, [x4], #(6*16) // ............e...................................................................... + // gap // ................................................................................... + mul v14.8H, v27.8H, v11.H[4] // .......................................................*........................... + ldr q4, [x4, #-80] // .............e..................................................................... + trn2 v22.4S, v5.4S, v9.4S // .....e............................................................................. + trn1 v12.4S, v5.4S, v9.4S // ....e.............................................................................. + ldr q5, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + mls v14.8H, v16.8H, v7.H[0] // ........................................................*.......................... + srshr v28.8H, v31.8H, #11 // .............................................................*..................... + // gap // ................................................................................... + trn2 v31.2D, v22.2D, v24.2D // .........e......................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + mls v0.8H, v13.8H, v7.H[0] // ...........................................................*....................... + trn1 v23.2D, v12.2D, v19.2D // ..........e........................................................................ + // gap // ................................................................................... + trn1 v21.2D, v22.2D, v24.2D // ...........e....................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v12.2D, v12.2D, v19.2D // ........e.......................................................................... + mls v6.8H, v28.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v15.8H, v26.8H, v7.H[1] // ...............................................................*................... + sub v13.8H, v23.8H, v21.8H // ..................e................................................................ + // gap // ................................................................................... + add v17.8H, v12.8H, v31.8H // ........................e.......................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ add v2.8H, v23.8H, v21.8H // ...................e............................................................... + // gap // ................................................................................... + sqdmulh v10.8H, v14.8H, v7.H[1] // ..................................................................*................ + sub v27.8H, v0.8H, v6.8H // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + sub v29.8H, v12.8H, v31.8H // .......................e........................................................... + sqrdmulh v31.8H, v13.8H, v3.8H // ....................e.............................................................. + // gap // ................................................................................... + srshr v16.8H, v15.8H, #11 // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + add v15.8H, v0.8H, v6.8H // ......................................................................*............ + mul v3.8H, v13.8H, v5.8H // .....................e............................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ sub v28.8H, v2.8H, v17.8H // ............................e...................................................... + sqrdmulh v22.8H, v29.8H, v25.8H // .........................e......................................................... + // gap // ................................................................................... + str q15, [x1], #(64) // ...............................................................................*... + srshr v15.8H, v10.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + mls v3.8H, v31.8H, v7.H[0] // ......................e............................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v25.8H, v29.8H, v1.8H // ..........................e........................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v25.8H, v22.8H, v7.H[0] // ...........................e....................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v26.8H, v16.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v14.8H, v15.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + // gap // ................................................................................... + sub v16.8H, v3.8H, v25.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + add v18.8H, v3.8H, v25.8H // ..................................e................................................ + sqrdmulh v31.8H, v28.8H, v4.8H // ..............................e.................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v12.8H, v16.8H, v4.8H // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v15.8H, v26.8H, v14.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + mul v13.8H, v28.8H, v8.8H // ...............................e................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v3.8H, v16.8H, v8.8H // ....................................e.............................................. + str q15, [x1, #-48] // ................................................................................*.. + sub v16.8H, v26.8H, v14.8H // ..........................................................................*........ 
+ add v14.8H, v2.8H, v17.8H // .............................e..................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v3.8H, v12.8H, v7.H[0] // .....................................e............................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v22.4S, v14.4S, v18.4S // ......................................e............................................ + // gap // ................................................................................... + mls v13.8H, v31.8H, v7.H[0] // ................................e.................................................. + trn2 v25.4S, v14.4S, v18.4S // .......................................e........................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v15.8H, v16.8H, v11.H[1] // ............................................................................*...... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v28.8H, v27.8H, v11.H[1] // .......................................................................*........... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v14.4S, v13.4S, v3.4S // .........................................e......................................... + mul v9.8H, v16.8H, v11.H[0] // .............................................................................*..... + trn1 v19.4S, v13.4S, v3.4S // ........................................e.......................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v9.8H, v15.8H, v7.H[0] // ..............................................................................*.... + // gap // ................................................................................... + trn1 v21.2D, v25.2D, v14.2D // .............................................e..................................... + trn1 v26.2D, v22.2D, v19.2D // ............................................e...................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + mul v15.8H, v27.8H, v11.H[0] // ........................................................................*.......... + trn2 v31.2D, v22.2D, v19.2D // ..........................................e........................................ + // gap // ................................................................................... + trn2 v13.2D, v25.2D, v14.2D // ...........................................e....................................... + // gap // ................................................................................... + ldr q11, [x3], #16 // ..............................................e.................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v16.8H, v26.8H, v21.8H // ...............................................e................................... + add v0.8H, v26.8H, v21.8H // ................................................e.................................. + mls v15.8H, v28.8H, v7.H[0] // .........................................................................*......... + str q9, [x1, #-16] // ..................................................................................* + sub v27.8H, v31.8H, v13.8H // ....................................................e.............................. + // gap // ................................................................................... + // gap // ................................................................................... + add v6.8H, v31.8H, v13.8H // .....................................................e............................. + // gap // ................................................................................... 
+ mul v26.8H, v16.8H, v11.H[2] // ..................................................e................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v14.8H, v16.8H, v11.H[3] // .................................................e................................. + // gap // ................................................................................... + str q15, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v13.8H, v0.8H, v7.H[1] // .........................................................e......................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ + // --------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q8, [x1, #(16*0)] // ...e...............................................................................'..~.............................................................................. + // ldr q9, [x1, #(16*1)] // ....e..............................................................................'...~............................................................................. + // ldr q10, [x1, #(16*2)] // e..................................................................................~................................................................................. + // ldr q11, [x1, #(16*3)] // ..e................................................................................'.~............................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .................e.................................................................'................~................................................................ + // trn2 v26.4s, v8.4s, v9.4s // ................e..................................................................'...............~................................................................. + // trn1 v27.4s, v10.4s, v11.4s // ..........e........................................................................'.........~....................................................................... + // trn2 v28.4s, v10.4s, v11.4s // ............e......................................................................'...........~..................................................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // .........................e.........................................................'........................~........................................................ + // trn2 v11.2d, v26.2d, v28.2d // .....................e.............................................................'....................~............................................................ + // trn1 v8.2d, v25.2d, v27.2d // .......................e...........................................................'......................~.......................................................... + // trn1 v9.2d, v26.2d, v28.2d // ........................e..........................................................'.......................~......................................................... + // ldr q0, [x4], #(6*16) // .............e.....................................................................'............~.................................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e...................................................................'..............~.................................................................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..................e................................................................'.................~............................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .........e.........................................................................'........~........................................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // ......e............................................................................'.....~........................................................................... 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // .....e.............................................................................'....~............................................................................ + // sub v24.8h, v8.8h, v9.8h // ............................e......................................................'...........................~..................................................... + // add v8.8h, v8.8h, v9.8h // ..............................e....................................................'.............................~................................................... + // sqrdmulh v27.8h, v24.8h, v5.8h // ..................................e................................................'.................................~............................................... + // mul v9.8h, v24.8h, v1.8h // .....................................e.............................................'....................................~............................................ + // mls v9.8h, v27.8h, v7.h[0] // ..........................................e........................................'.........................................~....................................... + // sub v24.8h, v10.8h, v11.8h // .................................e.................................................'................................~................................................ + // add v10.8h, v10.8h, v11.8h // .............................e.....................................................'............................~.................................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // .......................................e...........................................'......................................~.......................................... 
+ // mul v11.8h, v24.8h, v2.8h // ...........................................e.......................................'..........................................~...................................... + // mls v11.8h, v27.8h, v7.h[0] // ............................................e......................................'...........................................~..................................... + // sub v24.8h, v8.8h, v10.8h // ......................................e............................................'.....................................~........................................... + // add v8.8h, v8.8h, v10.8h // ........................................................e..........................'.......................................................~......................... + // sqrdmulh v27.8h, v24.8h, v4.8h // .................................................e.................................'................................................~................................ + // mul v10.8h, v24.8h, v0.8h // ....................................................e..............................'...................................................~............................. + // mls v10.8h, v27.8h, v7.h[0] // ...........................................................e.......................'..........................................................~...................... + // sub v24.8h, v9.8h, v11.8h // ...............................................e...................................'..............................................~.................................. + // add v9.8h, v9.8h, v11.8h // ................................................e..................................'...............................................~................................. 
+ // sqrdmulh v27.8h, v24.8h, v4.8h // ..................................................e................................'.................................................~............................... + // mul v11.8h, v24.8h, v0.8h // .....................................................e.............................'....................................................~............................ + // mls v11.8h, v27.8h, v7.h[0] // .........................................................e.........................'........................................................~........................ + // trn1 v25.4s, v8.4s, v9.4s // ..........................................................e........................'.........................................................~....................... + // trn2 v26.4s, v8.4s, v9.4s // ............................................................e......................'...........................................................~..................... + // trn1 v27.4s, v10.4s, v11.4s // .................................................................e.................'................................................................~................ + // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e...................'..............................................................~.................. + // trn2 v10.2d, v25.2d, v27.2d // ......................................................................e............'.....................................................................~........... + // trn2 v11.2d, v26.2d, v28.2d // .......................................................................e...........'......................................................................~.......... 
+ // trn1 v8.2d, v25.2d, v27.2d // ....................................................................e..............'...................................................................~............. + // trn1 v9.2d, v26.2d, v28.2d // ...................................................................e...............'..................................................................~.............. + // ldr q0, [x3], #16 // ........................................................................e..........'.......................................................................~......... + // sub v24.8h, v8.8h, v9.8h // .........................................................................e.........'........................................................................~........ + // add v8.8h, v8.8h, v9.8h // ..........................................................................e........'.........................................................................~....... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ................................................................................e..'...............................................................................~. + // mul v9.8h, v24.8h, v0.h[2] // ...............................................................................e...'..............................................................................~.. + // mls v9.8h, v27.8h, v7.h[0] // ...........~.......................................................................'..........*...................................................................... + // sub v24.8h, v10.8h, v11.8h // .............................................................................e.....'............................................................................~.... 
+ // add v10.8h, v10.8h, v11.8h // ..............................................................................e....'.............................................................................~... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .......~...........................................................................'......*.......................................................................... + // mul v11.8h, v24.8h, v0.h[4] // ..............~....................................................................'.............*................................................................... + // mls v11.8h, v27.8h, v7.h[0] // ...................~...............................................................'..................*.............................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................................................e'................................................................................. + // srshr v25.8h, v25.8h, #11 // ........~..........................................................................'.......*......................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ......................~............................................................'.....................*........................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .~.................................................................................'*................................................................................ + // srshr v25.8h, v25.8h, #11 // ....................~..............................................................'...................*............................................................. 
+ // mls v10.8h, v25.8h, v7.h[0] // ..........................~........................................................'.........................*....................................................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ...........................~.......................................................'..........................*...................................................... + // srshr v25.8h, v25.8h, #11 // ...................................~...............................................'..................................*.............................................. + // mls v9.8h, v25.8h, v7.h[0] // .............................................~.....................................'............................................*.................................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ...............................~...................................................'..............................*.................................................. + // srshr v25.8h, v25.8h, #11 // .........................................~.........................................'........................................*........................................ + // mls v11.8h, v25.8h, v7.h[0] // ..............................................~....................................'.............................................*................................... + // sub v24.8h, v8.8h, v10.8h // ................................~..................................................'...............................*................................................. + // add v8.8h, v8.8h, v10.8h // ....................................~..............................................'...................................*............................................. 
+ // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..............................................................~....................'.............................................................*................... + // mul v10.8h, v24.8h, v0.h[0] // .....................................................................~.............'....................................................................*............ + // mls v10.8h, v27.8h, v7.h[0] // ...........................................................................~.......'..........................................................................*...... + // sub v24.8h, v9.8h, v11.8h // .......................................................~...........................'......................................................*.......................... + // add v9.8h, v9.8h, v11.8h // ...................................................~...............................'..................................................*.............................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .............................................................~.....................'............................................................*.................... + // mul v11.8h, v24.8h, v0.h[0] // ................................................................~..................'...............................................................*................. + // mls v11.8h, v27.8h, v7.h[0] // ..................................................................~................'.................................................................*............... + // str q8, [x1], #(64) // ........................................~..........................................'.......................................*......................................... 
+ // str q9, [x1, #(-64 + 16*1)] // ......................................................~............................'.....................................................*........................... + // str q10, [x1, #(-64 + 16*2)] // .................................................................................~.'................................................................................* + // str q11, [x1, #(-64 + 16*3)] // ............................................................................~......'...........................................................................*..... sub count, count, #1 cbnz count, layer4567_start - sqrdmulh v8.8H, v1.8H, v6.H[5] // *.......................... - srshr v22.8H, v12.8H, #11 // .*......................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mul v4.8H, v23.8H, v6.H[2] // ..*........................ - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v4.8H, v24.8H, v7.H[0] // ...*....................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v2.8H, v8.8H, v7.H[0] // ....*...................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - sqdmulh v8.8H, v4.8H, v7.H[1] // ......*.................... - // gap // ........................... - // gap // ........................... 
- // gap // ........................... - // gap // ........................... - // gap // ........................... - sqdmulh v17.8H, v2.8H, v7.H[1] // .......*................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v0.8H, v22.8H, v7.H[0] // ........*.................. - // gap // ........................... - // gap // ........................... - srshr v21.8H, v8.8H, #11 // .........*................. - // gap // ........................... - // gap // ........................... - mls v11.8H, v3.8H, v7.H[0] // .....*..................... - // gap // ........................... - // gap // ........................... - srshr v8.8H, v17.8H, #11 // ..........*................ - // gap // ........................... - // gap // ........................... - mls v4.8H, v21.8H, v7.H[0] // ..............*............ - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v2.8H, v8.8H, v7.H[0] // ............*.............. - // gap // ........................... - // gap // ........................... - sub v8.8H, v0.8H, v11.8H // ..................*........ - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - sqrdmulh v1.8H, v8.8H, v6.H[1] // ...................*....... - // gap // ........................... - // gap // ........................... - sub v25.8H, v4.8H, v2.8H // ................*.......... - // gap // ........................... - // gap // ........................... 
- mul v13.8H, v8.8H, v6.H[0] // ....................*...... - // gap // ........................... - // gap // ........................... - // gap // ........................... - add v8.8H, v0.8H, v11.8H // ...........*............... - // gap // ........................... - sqrdmulh v21.8H, v25.8H, v6.H[1] // .....................*..... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - str q8, [x1], #(64) // .............*............. - add v12.8H, v4.8H, v2.8H // ...............*........... - mul v8.8H, v25.8H, v6.H[0] // .......................*... - // gap // ........................... - // gap // ........................... - // gap // ........................... - mls v13.8H, v1.8H, v7.H[0] // ......................*.... - // gap // ........................... - // gap // ........................... - str q12, [x1, #-48] // .................*......... - // gap // ........................... - // gap // ........................... - mls v8.8H, v21.8H, v7.H[0] // ........................*.. - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - str q13, [x1, #-32] // .........................*. - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - // gap // ........................... - str q8, [x1, #-16] // ..........................* - // gap // ........................... - // gap // ........................... - - // original source code - // sqrdmulh v30.8H, v1.8H, v6.H[5] // *.......................... 
- // srshr v28.8H, v12.8H, #11 // .*......................... - // mul v26.8H, v23.8H, v6.H[2] // ..*........................ - // mls v26.8H, v24.8H, v7.H[0] // ...*....................... - // mls v2.8H, v30.8H, v7.H[0] // ....*...................... - // mls v11.8H, v3.8H, v7.H[0] // .........*................. - // sqdmulh v24.8H, v26.8H, v7.H[1] // .....*..................... - // sqdmulh v19.8H, v2.8H, v7.H[1] // ......*.................... - // mls v0.8H, v28.8H, v7.H[0] // .......*................... - // srshr v22.8H, v24.8H, #11 // ........*.................. - // srshr v19.8H, v19.8H, #11 // ..........*................ - // add v10.8H, v0.8H, v11.8H // .................*......... - // mls v2.8H, v19.8H, v7.H[0] // ............*.............. - // str q10, [x1], #(64) // ...................*....... - // mls v26.8H, v22.8H, v7.H[0] // ...........*............... - // add v23.8H, v26.8H, v2.8H // ....................*...... - // sub v22.8H, v26.8H, v2.8H // ...............*........... - // str q23, [x1, #-48] // .......................*... - // sub v28.8H, v0.8H, v11.8H // .............*............. - // sqrdmulh v23.8H, v28.8H, v6.H[1] // ..............*............ - // mul v4.8H, v28.8H, v6.H[0] // ................*.......... - // sqrdmulh v19.8H, v22.8H, v6.H[1] // ..................*........ - // mls v4.8H, v23.8H, v7.H[0] // ......................*.... - // mul v22.8H, v22.8H, v6.H[0] // .....................*..... - // mls v22.8H, v19.8H, v7.H[0] // ........................*.. - // str q4, [x1, #-32] // .........................*. - // str q22, [x1, #-16] // ..........................* + // Instructions: 29 + // Expected cycles: 40 + // Expected IPC: 0.72 + // + // Cycle bound: 40.0 + // IPC bound: 0.72 + // + // Wall time: 0.27s + // User time: 0.27s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + srshr v23.8H, v13.8H, #11 // ..*........................... 
+ sqrdmulh v2.8H, v27.8H, v11.H[5] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v26.8H, v14.8H, v7.H[0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v19.8H, v27.8H, v11.H[4] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v19.8H, v2.8H, v7.H[0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v5.8H, v6.8H, v7.H[1] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v28.8H, v26.8H, v7.H[1] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqdmulh v10.8H, v19.8H, v7.H[1] // ..........*................... + // gap // .............................. + // gap // .............................. + srshr v1.8H, v5.8H, #11 // ......*....................... + // gap // .............................. + // gap // .............................. 
+ mls v0.8H, v23.8H, v7.H[0] // .......*...................... + // gap // .............................. + // gap // .............................. + srshr v29.8H, v28.8H, #11 // ............*................. + // gap // .............................. + // gap // .............................. + mls v6.8H, v1.8H, v7.H[0] // ........*..................... + // gap // .............................. + // gap // .............................. + srshr v14.8H, v10.8H, #11 // ...............*.............. + // gap // .............................. + // gap // .............................. + mls v26.8H, v29.8H, v7.H[0] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v19.8H, v14.8H, v7.H[0] // .................*............ + // gap // .............................. + // gap // .............................. + sub v1.8H, v0.8H, v6.8H // ...........*.................. + // gap // .............................. + // gap // .............................. + add v16.8H, v0.8H, v6.8H // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v25.8H, v1.8H, v11.H[1] // ......................*....... + // gap // .............................. + // gap // .............................. + str q16, [x1], #(64) // ..............*............... + // gap // .............................. + sub v12.8H, v26.8H, v19.8H // ....................*......... + mul v31.8H, v1.8H, v11.H[0] // .........................*.... + add v24.8H, v26.8H, v19.8H // ..................*........... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v17.8H, v12.8H, v11.H[1] // .....................*........ + str q24, [x1, #-48] // ...................*.......... + // gap // .............................. + // gap // .............................. + mul v10.8H, v12.8H, v11.H[0] // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v31.8H, v25.8H, v7.H[0] // ..........................*... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v10.8H, v17.8H, v7.H[0] // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q31, [x1, #-32] // ............................*. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x1, #-16] // ...........................*.. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqdmulh v31.8H, v6.8H, v7.H[1] // .....*......................... + // sqrdmulh v16.8H, v27.8H, v11.H[5] // .*............................. 
+ // srshr v13.8H, v13.8H, #11 // *.............................. + // mls v26.8H, v14.8H, v7.H[0] // ..*............................ + // mul v14.8H, v27.8H, v11.H[4] // ...*........................... + // mls v14.8H, v16.8H, v7.H[0] // ....*.......................... + // srshr v28.8H, v31.8H, #11 // ........*...................... + // mls v0.8H, v13.8H, v7.H[0] // .........*..................... + // mls v6.8H, v28.8H, v7.H[0] // ...........*................... + // sqdmulh v15.8H, v26.8H, v7.H[1] // ......*........................ + // sqdmulh v10.8H, v14.8H, v7.H[1] // .......*....................... + // sub v27.8H, v0.8H, v6.8H // ...............*............... + // srshr v16.8H, v15.8H, #11 // ..........*.................... + // add v15.8H, v0.8H, v6.8H // ................*.............. + // str q15, [x1], #(64) // ..................*............ + // srshr v15.8H, v10.8H, #11 // ............*.................. + // mls v26.8H, v16.8H, v7.H[0] // .............*................. + // mls v14.8H, v15.8H, v7.H[0] // ..............*................ + // add v15.8H, v26.8H, v14.8H // .....................*......... + // str q15, [x1, #-48] // .......................*....... + // sub v16.8H, v26.8H, v14.8H // ...................*........... + // sqrdmulh v15.8H, v16.8H, v11.H[1] // ......................*........ + // sqrdmulh v28.8H, v27.8H, v11.H[1] // .................*............. + // mul v9.8H, v16.8H, v11.H[0] // ........................*...... + // mls v9.8H, v15.8H, v7.H[0] // ..........................*.... + // mul v15.8H, v27.8H, v11.H[0] // ....................*.......... + // mls v15.8H, v28.8H, v7.H[0] // .........................*..... + // str q9, [x1, #-16] // ............................*.. + // str q15, [x1, #-32] // ...........................*... // --------------------------------------------------------------------- @@ -1039,800 +1066,848 @@ layer4567_start: .p2align 2 - ldr q23, [x0, #384] // *................. 
- ldr q19, [x0, #448] // .*................ - // gap // .................. - ldr q2, [x0, #320] // ..*............... - // gap // .................. - // gap // .................. - ldr q5, [x0, #256] // ....*............. - // gap // .................. - // gap // .................. - ldr q4, [x0, #128] // ......*........... - // gap // .................. - // gap // .................. - add v24.8H, v23.8H, v19.8H // ............*..... - sub v23.8H, v23.8H, v19.8H // .....*............ - ldr q19, [x0, #192] // .......*.......... - ldr q20, [x0, #0] // ..........*....... - // gap // .................. - // gap // .................. - ldr q26, [x0, #64] // ...*.............. - add v17.8H, v5.8H, v2.8H // ........*......... - // gap // .................. - sqrdmulh v13.8H, v23.8H, v1.H[5] // ...........*...... - // gap // .................. - // gap // .................. - sub v9.8H, v4.8H, v19.8H // .........*........ - // gap // .................. - // gap // .................. - add v27.8H, v4.8H, v19.8H // ..............*... - mul v11.8H, v23.8H, v1.H[4] // .............*.... - // gap // .................. - add v25.8H, v20.8H, v26.8H // ................*. - // gap // .................. - // gap // .................. - sub v23.8H, v20.8H, v26.8H // .................* - sqrdmulh v16.8H, v9.8H, v1.H[1] // ...............*.. - // gap // .................. - - // original source code - // ldr q14, [x0, #384] // *................. - // ldr q26, [x0, #448] // .*................ - // ldr q2, [x0, #320] // ..*............... - // ldr q28, [x0, #64] // .........*........ - // ldr q5, [x0, #256] // ...*.............. - // sub v8.8H, v14.8H, v26.8H // ......*........... - // ldr q3, [x0, #128] // ....*............. - // ldr q20, [x0, #192] // .......*.......... - // add v17.8H, v5.8H, v2.8H // ..........*....... - // sub v9.8H, v3.8H, v20.8H // ............*..... - // ldr q23, [x0, #0] // ........*......... - // sqrdmulh v13.8H, v8.8H, v1.H[5] // ...........*...... 
- // add v24.8H, v14.8H, v26.8H // .....*............ - // mul v11.8H, v8.8H, v1.H[4] // ..............*... - // add v27.8H, v3.8H, v20.8H // .............*.... - // sqrdmulh v16.8H, v9.8H, v1.H[1] // .................* - // add v25.8H, v23.8H, v28.8H // ...............*.. - // sub v23.8H, v23.8H, v28.8H // ................*. + // Instructions: 13 + // Expected cycles: 12 + // Expected IPC: 1.08 + // + // Cycle bound: 12.0 + // IPC bound: 1.08 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #384] // .....*........................ + ldr q28, [x0, #448] // *............................. + // gap // .............................. + ldr q21, [x0, #192] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v6.8H, v18.8H, v28.8H // ......*....................... + sub v16.8H, v18.8H, v28.8H // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v14.8H, v4.8H, v21.8H // ........*..................... + sub v28.8H, v4.8H, v21.8H // ...*.......................... + ldr q24, [x0, #0] // ....*......................... + sqrdmulh v15.8H, v16.8H, v1.H[5] // ..........*................... + ldr q22, [x0, #320] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v10.8H, v28.8H, v1.H[0] // ...........*.................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v27.8H, v28.8H, v1.H[1] // .........*.................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q5, [x0, #448] // .*............................. + // ldr q19, [x0, #128] // ...*........................... + // ldr q8, [x0, #192] // ..*............................ + // sub v17.8H, v19.8H, v8.8H // .......*....................... + // ldr q24, [x0, #0] // ........*...................... + // ldr q21, [x0, #384] // *.............................. + // add v6.8H, v21.8H, v5.8H // ....*.......................... + // sub v16.8H, v21.8H, v5.8H // .....*......................... + // add v14.8H, v19.8H, v8.8H // ......*........................ + // sqrdmulh v27.8H, v17.8H, v1.H[1] // ............*.................. + // sqrdmulh v15.8H, v16.8H, v1.H[5] // .........*..................... + // mul v10.8H, v17.8H, v1.H[0] // ...........*................... + // ldr q22, [x0, #320] // ..........*.................... sub count, count, #1 layer123_start: - mls v11.8H, v13.8H, v7.H[0] // ...........................*............................................................ - sub v28.8H, v5.8H, v2.8H // ..................*..................................................................... 
+ // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 47.70s + // User time: 47.70s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + ldr q5, [x0, #464] // .......e................................................................................ // gap // ........................................................................................ - ldr q14, [x0, #400] // ......e................................................................................. + ldr q21, [x0, #64] // .*...................................................................................... + ldr q19, [x0, #144] // ..e..................................................................................... + ldr q8, [x0, #208] // ...e.................................................................................... + mul v3.8H, v16.8H, v1.H[4] // ..........................*............................................................. + ldr q11, [x0, #256] // ....*................................................................................... // gap // ........................................................................................ - sub v5.8H, v25.8H, v27.8H // ............................*........................................................... - mul v19.8H, v23.8H, v0.H[6] // ..........*............................................................................. - add v27.8H, v25.8H, v27.8H // .............................*.......................................................... // gap // ........................................................................................ - add v20.8H, v17.8H, v24.8H // .......................................*................................................ 
+ mls v3.8H, v15.8H, v7.H[0] // ...........................*............................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v6.8H, v17.8H, v24.8H // ......................................*................................................. + sub v20.8H, v24.8H, v21.8H // ........*............................................................................... // gap // ........................................................................................ - sqrdmulh v31.8H, v28.8H, v1.H[3] // .....................*.................................................................. // gap // ........................................................................................ + sub v17.8H, v19.8H, v8.8H // .............e.......................................................................... // gap // ........................................................................................ + mls v10.8H, v27.8H, v7.H[0] // .................*...................................................................... + add v31.8H, v24.8H, v21.8H // .........*.............................................................................. // gap // ........................................................................................ - sqrdmulh v22.8H, v23.8H, v0.H[7] // ...........*............................................................................ // gap // ........................................................................................ - sub v4.8H, v27.8H, v20.8H // ................................................*....................................... - add v27.8H, v27.8H, v20.8H // .................................................*...................................... 
+ sub v27.8H, v11.8H, v22.8H // ..................*..................................................................... // gap // ........................................................................................ + sqrdmulh v25.8H, v20.8H, v0.H[7] // ..........*............................................................................. + add v26.8H, v11.8H, v22.8H // ...................*.................................................................... // gap // ........................................................................................ - mul v13.8H, v6.8H, v0.H[4] // ........................................*............................................... // gap // ........................................................................................ + add v23.8H, v31.8H, v14.8H // .............................*.......................................................... + mul v21.8H, v20.8H, v0.H[6] // ...........*............................................................................ // gap // ........................................................................................ + sub v16.8H, v31.8H, v14.8H // ............................*........................................................... // gap // ........................................................................................ // gap // ........................................................................................ + add v12.8H, v26.8H, v6.8H // .......................................*................................................ // gap // ........................................................................................ - mul v28.8H, v28.8H, v1.H[2] // ....................*................................................................... + sqrdmulh v11.8H, v27.8H, v1.H[3] // ....................*................................................................... 
+ sub v15.8H, v26.8H, v6.8H // ......................................*................................................. // gap // ........................................................................................ // gap // ........................................................................................ + mls v21.8H, v25.8H, v7.H[0] // ............*........................................................................... // gap // ........................................................................................ // gap // ........................................................................................ + add v13.8H, v23.8H, v12.8H // .................................................*...................................... // gap // ........................................................................................ // gap // ........................................................................................ + mul v2.8H, v16.8H, v0.H[2] // ...............................*........................................................ // gap // ........................................................................................ - mul v20.8H, v9.8H, v1.H[0] // ...............*........................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v20.8H, v16.8H, v7.H[0] // .................*...................................................................... // gap // ........................................................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ + sqrdmulh v18.8H, v15.8H, v0.H[5] // ........................................*............................................... + sub v28.8H, v21.8H, v10.8H // .................................*...................................................... // gap // ........................................................................................ // gap // ........................................................................................ + add v24.8H, v21.8H, v10.8H // ..................................*..................................................... // gap // ........................................................................................ + sqrdmulh v4.8H, v16.8H, v0.H[3] // ..............................*......................................................... // gap // ........................................................................................ - mul v24.8H, v5.8H, v0.H[2] // ..............................*......................................................... // gap // ........................................................................................ // gap // ........................................................................................ + mul v25.8H, v15.8H, v0.H[4] // .........................................*.............................................. // gap // ........................................................................................ - mls v19.8H, v22.8H, v7.H[0] // ............*........................................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ + mls v25.8H, v18.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ - mls v28.8H, v31.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v2.8H, v4.8H, v7.H[0] // ................................*....................................................... // gap // ........................................................................................ - sqrdmulh v22.8H, v6.8H, v0.H[5] // .........................................*.............................................. // gap // ........................................................................................ // gap // ........................................................................................ - sub v26.8H, v19.8H, v20.8H // .................................*...................................................... // gap // ........................................................................................ // gap // ........................................................................................ - add v18.8H, v19.8H, v20.8H // ..................................*..................................................... - sqrdmulh v19.8H, v5.8H, v0.H[3] // ...............................*........................................................ 
+ mul v4.8H, v27.8H, v1.H[2] // .....................*.................................................................. // gap // ........................................................................................ - add v23.8H, v28.8H, v11.8H // ............................................*........................................... // gap // ........................................................................................ // gap // ........................................................................................ - sub v10.8H, v28.8H, v11.8H // ...........................................*............................................ - sqrdmulh v11.8H, v4.8H, v0.H[1] // ...................................................*.................................... // gap // ........................................................................................ // gap // ........................................................................................ + mls v4.8H, v11.8H, v7.H[0] // ......................*................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - add v6.8H, v18.8H, v23.8H // ......................................................*................................. // gap // ........................................................................................ - mul v8.8H, v26.8H, v0.H[2] // ...................................*.................................................... // gap // ........................................................................................ + sub v15.8H, v2.8H, v25.8H // ..........................................................*............................. + mul v22.8H, v28.8H, v0.H[2] // ....................................*................................................... 
// gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v28.8H, v26.8H, v0.H[3] // ....................................*................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - ldr q26, [x0, #464] // .......e................................................................................ + sqrdmulh v6.8H, v15.8H, v0.H[1] // ............................................................*........................... // gap // ........................................................................................ - mls v13.8H, v22.8H, v7.H[0] // ..........................................*............................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + add v27.8H, v4.8H, v3.8H // ............................................*........................................... + sqrdmulh v26.8H, v28.8H, v0.H[3] // ...................................*.................................................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v22.8H, v4.8H, v0.H[0] // ..................................................*..................................... 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mul v14.8H, v15.8H, v0.H[0] // .............................................................*.......................... + sub v11.8H, v24.8H, v27.8H // .....................................................*.................................. // gap // ........................................................................................ + add v28.8H, v24.8H, v27.8H // ......................................................*................................. + ldr q24, [x0, #16] // e....................................................................................... // gap // ........................................................................................ - mls v24.8H, v19.8H, v7.H[0] // ................................*....................................................... + sub v27.8H, v4.8H, v3.8H // ...........................................*............................................ + mls v14.8H, v6.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v22.8H, v26.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ 
- mls v8.8H, v28.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v28.8H, v10.8H, v0.H[4] // .............................................*.......................................... // gap // ........................................................................................ + sqrdmulh v15.8H, v27.8H, v0.H[5] // .............................................*.......................................... + str q14, [x0, #384] // ......................................................................*................. // gap // ........................................................................................ - add v5.8H, v24.8H, v13.8H // ...........................................................*............................ // gap // ........................................................................................ + mul v31.8H, v27.8H, v0.H[4] // ..............................................*......................................... // gap // ........................................................................................ - mls v22.8H, v11.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ // gap // ........................................................................................ 
- sub v3.8H, v24.8H, v13.8H // ..........................................................*............................. // gap // ........................................................................................ // gap // ........................................................................................ - sub v24.8H, v18.8H, v23.8H // .....................................................*.................................. - sqrdmulh v23.8H, v10.8H, v0.H[5] // ..............................................*......................................... // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v26.8H, v11.8H, v0.H[1] // .......................................................*................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v25.8H, v3.8H, v0.H[1] // .............................................................*.......................... // gap // ........................................................................................ + mls v31.8H, v15.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ - str q22, [x0, #256] // ....................................................................*................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v22.8H, v3.8H, v0.H[0] // ............................................................*........................... 
// gap // ........................................................................................ // gap // ........................................................................................ + mul v6.8H, v11.8H, v0.H[0] // ........................................................*............................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v28.8H, v23.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v6.8H, v26.8H, v7.H[0] // .........................................................*.............................. // gap // ........................................................................................ // gap // ........................................................................................ + sub v15.8H, v22.8H, v31.8H // ...............................................................*........................ // gap // ........................................................................................ - mls v22.8H, v25.8H, v7.H[0] // ..............................................................*......................... + ldr q21, [x0, #400] // ......e................................................................................. + add v27.8H, v22.8H, v31.8H // ................................................................*....................... + sqrdmulh v16.8H, v13.8H, v30.8H // ........................................................................*............... 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v11.8H, v15.8H, v0.H[1] // .................................................................*...................... // gap // ........................................................................................ - mul v3.8H, v27.8H, v29.8H // ........................................................................*............... // gap // ........................................................................................ + str q6, [x0, #320] // .....................................................................*.................. + add v6.8H, v21.8H, v5.8H // ........................e............................................................... // gap // ........................................................................................ + mul v26.8H, v13.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v27.8H, v27.8H, v30.8H // .........................................................................*.............. // gap // ........................................................................................ // gap // ........................................................................................ 
- str q22, [x0, #384] // ......................................................................*................. + mul v15.8H, v15.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ // gap // ........................................................................................ - mul v22.8H, v6.8H, v29.8H // ...........................................................................*............ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v15.8H, v11.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ - sub v10.8H, v8.8H, v28.8H // ...............................................................*........................ - sqrdmulh v19.8H, v6.8H, v30.8H // ............................................................................*........... - add v28.8H, v8.8H, v28.8H // ................................................................*....................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v3.8H, v27.8H, v7.H[0] // ..........................................................................*............. 
+ mls v26.8H, v16.8H, v7.H[0] // ..........................................................................*............. + sub v16.8H, v21.8H, v5.8H // .......................e................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + sqrdmulh v14.8H, v27.8H, v30.8H // .................................................................................*...... + str q15, [x0, #448] // .......................................................................*................ // gap // ........................................................................................ - sqrdmulh v23.8H, v24.8H, v0.H[1] // ........................................................*............................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v20.8H, v10.8H, v0.H[1] // ..................................................................*..................... + sqrdmulh v22.8H, v28.8H, v30.8H // ...........................................................................*............ + str q26, [x0], #(16) // ....................................................................................*... 
// gap // ........................................................................................ // gap // ........................................................................................ - str q3, [x0], #(16) // ....................................................................................*... + mul v31.8H, v27.8H, v29.8H // ..................................................................................*..... + sub v27.8H, v23.8H, v12.8H // ................................................*....................................... // gap // ........................................................................................ + add v15.8H, v2.8H, v25.8H // ...........................................................*............................ // gap // ........................................................................................ - mul v11.8H, v24.8H, v0.H[0] // .......................................................*................................ // gap // ........................................................................................ + mls v31.8H, v14.8H, v7.H[0] // ...................................................................................*.... + add v14.8H, v19.8H, v8.8H // ..............e......................................................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v11.8H, v23.8H, v7.H[0] // .........................................................*.............................. + mul v2.8H, v15.8H, v29.8H // ...............................................................................*........ 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v23.8H, v5.8H, v30.8H // ...............................................................................*........ + sqrdmulh v15.8H, v15.8H, v30.8H // ..............................................................................*......... // gap // ........................................................................................ // gap // ........................................................................................ + str q31, [x0, #176] // .......................................................................................* // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v27.8H, v5.8H, v29.8H // ..............................................................................*......... // gap // ........................................................................................ + sqrdmulh v11.8H, v27.8H, v0.H[1] // ..................................................*..................................... // gap // ........................................................................................ - str q11, [x0, #304] // .....................................................................*.................. 
// gap // ........................................................................................ // gap // ........................................................................................ - mul v24.8H, v10.8H, v0.H[0] // .................................................................*...................... + mul v26.8H, v27.8H, v0.H[0] // ...................................................*.................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ + mls v2.8H, v15.8H, v7.H[0] // ................................................................................*....... // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... + mul v15.8H, v28.8H, v29.8H // ............................................................................*........... 
// gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v23.8H, v28.8H, v29.8H // .................................................................................*...... - ldr q2, [x0, #320] // .....e.................................................................................. + mls v15.8H, v22.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ // gap // ........................................................................................ - ldr q28, [x0, #64] // .e...................................................................................... + str q2, [x0, #112] // ......................................................................................*. // gap // ........................................................................................ - mls v23.8H, v8.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - ldr q5, [x0, #256] // ....e................................................................................... - str q27, [x0, #112] // ......................................................................................*. - sub v8.8H, v14.8H, v26.8H // .......................e................................................................ 
- ldr q3, [x0, #128] // ..e..................................................................................... - mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... - ldr q20, [x0, #192] // ...e.................................................................................... + mls v26.8H, v11.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mls v22.8H, v19.8H, v7.H[0] // .............................................................................*.......... // gap // ........................................................................................ - add v17.8H, v5.8H, v2.8H // ...................e.................................................................... - str q23, [x0, #176] // .......................................................................................* + sqrdmulh v27.8H, v17.8H, v1.H[1] // ...............e........................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v9.8H, v3.8H, v20.8H // .............e.......................................................................... - ldr q23, [x0, #0] // e....................................................................................... 
- sqrdmulh v13.8H, v8.8H, v1.H[5] // ..........................e............................................................. - str q24, [x0, #432] // .......................................................................*................ - add v24.8H, v14.8H, v26.8H // ........................e............................................................... + str q15, [x0, #48] // .....................................................................................*.. // gap // ........................................................................................ - mul v11.8H, v8.8H, v1.H[4] // .........................e.............................................................. - add v27.8H, v3.8H, v20.8H // ..............e......................................................................... // gap // ........................................................................................ - str q22, [x0, #48] // .....................................................................................*.. + sqrdmulh v15.8H, v16.8H, v1.H[5] // .........................e.............................................................. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v16.8H, v9.8H, v1.H[1] // ................e....................................................................... - add v25.8H, v23.8H, v28.8H // .........e.............................................................................. + str q26, [x0, #240] // ....................................................................*................... // gap // ........................................................................................ - sub v23.8H, v23.8H, v28.8H // ........e............................................................................... 
// gap // ........................................................................................ + mul v10.8H, v17.8H, v1.H[0] // ................e....................................................................... // gap // ........................................................................................ + ldr q22, [x0, #320] // .....e.................................................................................. - // original source code - // ldr q8, [x0, #0] // ............................................................................e.........|.............................................................................e...... - // ldr q9, [x0, #(1*(512/8))] // ................................................................e.....................|.................................................................e.................. - // ldr q10, [x0, #(2*(512/8))] // .....................................................................e................|......................................................................e............. - // ldr q11, [x0, #(3*(512/8))] // .......................................................................e..............|........................................................................e........... - // ldr q12, [x0, #(4*(512/8))] // ..................................................................e...................|...................................................................e................ - // ldr q13, [x0, #(5*(512/8))] // ...............................................................e......................|................................................................e................... - // ldr q14, [x0, #(6*(512/8))] // e.....................................................................................|.e.................................................................................. 
- // ldr q15, [x0, #(7*(512/8))] // ...........................e..........................................................|............................e....................................................... - // sub v24.8h, v8.8h, v9.8h // .....................................................................................e|.................................................................................... - // add v8.8h, v8.8h, v9.8h // ....................................................................................e.|.................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ..*...................................................................................|...*................................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......*..............................................................................|........*........................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ...............*......................................................................|................*................................................................... - // sub v24.8h, v10.8h, v11.8h // ...........................................................................e..........|............................................................................e....... - // add v10.8h, v10.8h, v11.8h // .................................................................................e....|..................................................................................e. - // mul v11.8h, v24.8h, v1.h[0] // ............*.........................................................................|.............*...................................................................... 
- // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...................................................................................e..|.................................................................................... - // mls v11.8h, v24.8h, v7.h[0] // .............*........................................................................|..............*..................................................................... - // sub v24.8h, v12.8h, v13.8h // ......................................................................................|*................................................................................... - // add v12.8h, v12.8h, v13.8h // .........................................................................e............|..........................................................................e......... - // mul v13.8h, v24.8h, v1.h[2] // ...........*..........................................................................|............*....................................................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ......*...............................................................................|.......*............................................................................ - // mls v13.8h, v24.8h, v7.h[0] // ................*.....................................................................|.................*.................................................................. - // sub v24.8h, v14.8h, v15.8h // ....................................................................e.................|.....................................................................e.............. - // add v14.8h, v14.8h, v15.8h // ...............................................................................e......|................................................................................e... 
- // mul v15.8h, v24.8h, v1.h[4] // ................................................................................e.....|.................................................................................e.. - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................................................................e........|..............................................................................e..... - // mls v15.8h, v24.8h, v7.h[0] // ......................................................................................*.................................................................................... - // sub v24.8h, v8.8h, v10.8h // .*....................................................................................|..*................................................................................. - // add v8.8h, v8.8h, v10.8h // ...*..................................................................................|....*............................................................................... - // mul v10.8h, v24.8h, v0.h[2] // ..............*.......................................................................|...............*.................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................*.................................................................|.....................*.............................................................. - // mls v10.8h, v24.8h, v7.h[0] // ..............................*.......................................................|...............................*.................................................... - // sub v24.8h, v9.8h, v11.8h // ..................*...................................................................|...................*................................................................ 
- // add v9.8h, v9.8h, v11.8h // ...................*..................................................................|....................*............................................................... - // mul v11.8h, v24.8h, v0.h[2] // .........................*............................................................|..........................*......................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................*...........................................................|...........................*........................................................ - // mls v11.8h, v24.8h, v7.h[0] // ...............................*......................................................|................................*................................................... - // sub v24.8h, v12.8h, v14.8h // .....*................................................................................|......*............................................................................. - // add v12.8h, v12.8h, v14.8h // ....*.................................................................................|.....*.............................................................................. - // mul v14.8h, v24.8h, v0.h[4] // ..........*...........................................................................|...........*........................................................................ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................................................|..................*................................................................. - // mls v14.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*...................................................... 
- // sub v24.8h, v13.8h, v15.8h // ......................*...............................................................|.......................*............................................................ - // add v13.8h, v13.8h, v15.8h // .....................*................................................................|......................*............................................................. - // mul v15.8h, v24.8h, v0.h[4] // ................................*.....................................................|.................................*.................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................*................................................|......................................*............................................. - // mls v15.8h, v24.8h, v7.h[0] // .........................................*............................................|..........................................*......................................... - // sub v24.8h, v8.8h, v12.8h // ........*.............................................................................|.........*.......................................................................... - // add v8.8h, v8.8h, v12.8h // .........*............................................................................|..........*......................................................................... - // mul v12.8h, v24.8h, v0.h[0] // .............................*........................................................|..............................*..................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*..............................................................|........................*........................................................... 
- // mls v12.8h, v24.8h, v7.h[0] // ..................................*...................................................|...................................*................................................ - // sub v24.8h, v9.8h, v13.8h // ....................................*.................................................|.....................................*.............................................. - // add v9.8h, v9.8h, v13.8h // ........................*.............................................................|.........................*.......................................................... - // mul v13.8h, v24.8h, v0.h[0] // ......................................................*...............................|.......................................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................*..................................|....................................................*............................... - // mls v13.8h, v24.8h, v7.h[0] // .......................................................*..............................|........................................................*........................... - // sub v24.8h, v10.8h, v14.8h // ...................................*..................................................|....................................*............................................... - // add v10.8h, v10.8h, v14.8h // .................................*....................................................|..................................*................................................. - // mul v14.8h, v24.8h, v0.h[0] // ........................................*.............................................|.........................................*.......................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............................................|.......................................*............................................ - // mls v14.8h, v24.8h, v7.h[0] // ..........................................*...........................................|...........................................*........................................ - // sub v24.8h, v11.8h, v15.8h // ...............................................*......................................|................................................*................................... - // add v11.8h, v11.8h, v15.8h // .................................................*....................................|..................................................*................................. - // mul v15.8h, v24.8h, v0.h[0] // ...........................................................*..........................|............................................................*....................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.................................|.....................................................*.............................. - // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*...............|.......................................................................*............ - // str q12, [x0, #(4*(512/8))] // .......................................*..............................................|........................................*........................................... - // str q13, [x0, #(5*(512/8))] // ..........................................................*...........................|...........................................................*........................ 
- // str q14, [x0, #(6*(512/8))] // .............................................*........................................|..............................................*..................................... - // str q15, [x0, #(7*(512/8))] // ..............................................................................*.......|...............................................................................*.... - // mul v12.8h, v8.8h, v29.8h // ...........................................*..........................................|............................................*....................................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ............................................*.........................................|.............................................*...................................... - // mls v12.8h, v8.8h, v7.h[0] // ..................................................*...................................|...................................................*................................ - // mul v13.8h, v9.8h, v29.8h // ..............................................*.......................................|...............................................*.................................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ................................................*.....................................|.................................................*.................................. - // mls v13.8h, v9.8h, v7.h[0] // ........................................................................*.............|.........................................................................*.......... - // mul v14.8h, v10.8h, v29.8h // .........................................................*............................|..........................................................*......................... 
- // sqrdmulh v10.8h, v10.8h, v30.8h // ........................................................*.............................|.........................................................*.......................... - // mls v14.8h, v10.8h, v7.h[0] // .............................................................*........................|..............................................................*..................... - // mul v15.8h, v11.8h, v29.8h // ..............................................................*.......................|...............................................................*.................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................................*.........................|.............................................................*...................... - // mls v15.8h, v11.8h, v7.h[0] // .................................................................*....................|..................................................................*................. - // str q12, [x0], #(16) // .....................................................*................................|......................................................*............................. - // str q13, [x0, #(-16 + 1*(512/8))] // ..................................................................................*...|...................................................................................* - // str q14, [x0, #(-16 + 2*(512/8))] // ...................................................................*..................|....................................................................*............... - // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*...........|...........................................................................*........ 
+ // ------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q8, [x0, #0] // ........................................e...............................................'.......................................~............................................. + // ldr q9, [x0, #(1*(512/8))] // .~......................................................................................'*.................................................................................... + // ldr q10, [x0, #(2*(512/8))] // ..e.....................................................................................'.~................................................................................... + // ldr q11, [x0, #(3*(512/8))] // ...e....................................................................................'..~.................................................................................. + // ldr q12, [x0, #(4*(512/8))] // .....~..................................................................................'....*................................................................................ + // ldr q13, [x0, #(5*(512/8))] // .......................................................................................e'..................................................................................... + // ldr q14, [x0, #(6*(512/8))] // ....................................................e...................................'...................................................~................................. 
+ // ldr q15, [x0, #(7*(512/8))] // e.......................................................................................~..................................................................................... + // sub v24.8h, v8.8h, v9.8h // .......~................................................................................'......*.............................................................................. + // add v8.8h, v8.8h, v9.8h // ..........~.............................................................................'.........*........................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ............~...........................................................................'...........*......................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...............~........................................................................'..............*...................................................................... + // mls v9.8h, v27.8h, v7.h[0] // ....................~...................................................................'...................*................................................................. + // sub v24.8h, v10.8h, v11.8h // ........e...............................................................................'.......~............................................................................. + // add v10.8h, v10.8h, v11.8h // .......................................................................e................'......................................................................~.............. + // sqrdmulh v27.8h, v24.8h, v1.h[1] // ..................................................................................e.....'.................................................................................~... 
+ // mul v11.8h, v24.8h, v1.h[0] // ......................................................................................e.'..................................................................................... + // mls v11.8h, v27.8h, v7.h[0] // .........~..............................................................................'........*............................................................................ + // sub v24.8h, v12.8h, v13.8h // ...........~............................................................................'..........*.......................................................................... + // add v12.8h, v12.8h, v13.8h // .............~..........................................................................'............*........................................................................ + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ..................~.....................................................................'.................*................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ..............................~.........................................................'.............................*....................................................... + // mls v13.8h, v27.8h, v7.h[0] // ...............................~........................................................'..............................*...................................................... + // sub v24.8h, v14.8h, v15.8h // ..............................................................e.........................'.............................................................~....................... + // add v14.8h, v14.8h, v15.8h // .........................................................e..............................'........................................................~............................ 
+ // sqrdmulh v27.8h, v24.8h, v1.h[5] // ....................................................................................e...'...................................................................................~. + // mul v15.8h, v24.8h, v1.h[4] // ....~...................................................................................'...*................................................................................. + // mls v15.8h, v27.8h, v7.h[0] // ......~.................................................................................'.....*............................................................................... + // sub v24.8h, v8.8h, v10.8h // ................~.......................................................................'...............*..................................................................... + // add v8.8h, v8.8h, v10.8h // ..............~.........................................................................'.............*....................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ..........................~.............................................................'.........................*........................................................... + // mul v10.8h, v24.8h, v0.h[2] // ......................~.................................................................'.....................*............................................................... + // mls v10.8h, v27.8h, v7.h[0] // .............................~..........................................................'............................*........................................................ + // sub v24.8h, v9.8h, v11.8h // ........................~...............................................................'.......................*............................................................. 
+ // add v9.8h, v9.8h, v11.8h // .........................~..............................................................'........................*............................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ....................................~...................................................'...................................*................................................. + // mul v11.8h, v24.8h, v0.h[2] // .................................~......................................................'................................*.................................................... + // mls v11.8h, v27.8h, v7.h[0] // ...........................................~............................................'..........................................*.......................................... + // sub v24.8h, v12.8h, v14.8h // ...................~....................................................................'..................*.................................................................. + // add v12.8h, v12.8h, v14.8h // .................~......................................................................'................*.................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .......................~................................................................'......................*.............................................................. + // mul v14.8h, v24.8h, v0.h[4] // ...........................~............................................................'..........................*.......................................................... + // mls v14.8h, v27.8h, v7.h[0] // ............................~...........................................................'...........................*......................................................... 
+ // sub v24.8h, v13.8h, v15.8h // .........................................~..............................................'........................................*............................................ + // add v13.8h, v13.8h, v15.8h // ...................................~....................................................'..................................*.................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ............................................~...........................................'...........................................*......................................... + // mul v15.8h, v24.8h, v0.h[4] // ..............................................~.........................................'.............................................*....................................... + // mls v15.8h, v27.8h, v7.h[0] // ................................................~.......................................'...............................................*..................................... + // sub v24.8h, v8.8h, v12.8h // ....................................................................~...................'...................................................................*................. + // add v8.8h, v8.8h, v12.8h // .....................~..................................................................'....................*................................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........................................................................~............'..........................................................................*.......... + // mul v12.8h, v24.8h, v0.h[0] // ............................................................................~...........'...........................................................................*......... 
+ // mls v12.8h, v27.8h, v7.h[0] // .................................................................................~......'................................................................................*.... + // sub v24.8h, v9.8h, v13.8h // ......................................~.................................................'.....................................*............................................... + // add v9.8h, v9.8h, v13.8h // .......................................~................................................'......................................*.............................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...............................................~........................................'..............................................*...................................... + // mul v13.8h, v24.8h, v0.h[0] // .................................................~......................................'................................................*.................................... + // mls v13.8h, v27.8h, v7.h[0] // ..................................................~.....................................'.................................................*................................... + // sub v24.8h, v10.8h, v14.8h // ................................~.......................................................'...............................*..................................................... + // add v10.8h, v10.8h, v14.8h // .....................................................................~..................'....................................................................*................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..................................~.....................................................'.................................*................................................... 
+ // mul v14.8h, v24.8h, v0.h[0] // .....................................~..................................................'....................................*................................................ + // mls v14.8h, v27.8h, v7.h[0] // ..........................................~.............................................'.........................................*........................................... + // sub v24.8h, v11.8h, v15.8h // ...................................................~....................................'..................................................*.................................. + // add v11.8h, v11.8h, v15.8h // .....................................................~..................................'....................................................*................................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .......................................................~................................'......................................................*.............................. + // mul v15.8h, v24.8h, v0.h[0] // ...........................................................~............................'..........................................................*.......................... + // mls v15.8h, v27.8h, v7.h[0] // ............................................................~...........................'...........................................................*......................... + // str q12, [x0, #(4*(512/8))] // .....................................................................................~..'....................................................................................* + // str q13, [x0, #(5*(512/8))] // ........................................................~...............................'.......................................................*............................. 
+ // str q14, [x0, #(6*(512/8))] // .............................................~..........................................'............................................*........................................ + // str q15, [x0, #(7*(512/8))] // ................................................................~.......................'...............................................................*..................... + // sqrdmulh v27.8h, v8.8h, v30.8h // ......................................................~.................................'.....................................................*............................... + // mul v8.8h, v8.8h, v29.8h // ..........................................................~.............................'.........................................................*........................... + // mls v8.8h, v27.8h, v7.h[0] // .............................................................~..........................'............................................................*........................ + // sqrdmulh v27.8h, v9.8h, v30.8h // .................................................................~......................'................................................................*.................... + // mul v9.8h, v9.8h, v29.8h // ..............................................................................~.........'.............................................................................*....... + // mls v9.8h, v27.8h, v7.h[0] // ...............................................................................~........'..............................................................................*...... + // sqrdmulh v27.8h, v10.8h, v30.8h // .........................................................................~..............'........................................................................*............ 
+ // mul v10.8h, v10.8h, v29.8h // ........................................................................~...............'.......................................................................*............. + // mls v10.8h, v27.8h, v7.h[0] // .............................................................................~..........'............................................................................*........ + // sqrdmulh v27.8h, v11.8h, v30.8h // ...............................................................~........................'..............................................................*...................... + // mul v11.8h, v11.8h, v29.8h // ...................................................................~....................'..................................................................*.................. + // mls v11.8h, v27.8h, v7.h[0] // ......................................................................~.................'.....................................................................*............... + // str q8, [x0], #(16) // ..................................................................~.....................'.................................................................*................... + // str q9, [x0, #(-16 + 1*(512/8))] // ...................................................................................~....'..................................................................................*.. + // str q10, [x0, #(-16 + 2*(512/8))] // ................................................................................~.......'...............................................................................*..... + // str q11, [x0, #(-16 + 3*(512/8))] // ..........................................................................~.............'.........................................................................*........... 
sub count, count, #1 cbnz count, layer123_start - mls v11.8H, v13.8H, v7.H[0] // *..................................................................... - sub v28.8H, v17.8H, v24.8H // ......*............................................................... - // gap // ...................................................................... - add v31.8H, v25.8H, v27.8H // ....*................................................................. - // gap // ...................................................................... - // gap // ...................................................................... - sub v10.8H, v25.8H, v27.8H // ..*................................................................... - mul v12.8H, v9.8H, v1.H[0] // .............*........................................................ - // gap // ...................................................................... - add v24.8H, v17.8H, v24.8H // .....*................................................................ - // gap // ...................................................................... - // gap // ...................................................................... - mul v14.8H, v28.8H, v0.H[4] // ...........*.......................................................... - sub v3.8H, v5.8H, v2.8H // .*.................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v8.8H, v28.8H, v0.H[5] // ..................*................................................... - add v19.8H, v31.8H, v24.8H // ..........*........................................................... - // gap // ...................................................................... 
- sub v15.8H, v31.8H, v24.8H // .........*............................................................ - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v22.8H, v10.8H, v0.H[3] // .....................*................................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v28.8H, v10.8H, v0.H[2] // ...............*...................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v14.8H, v8.8H, v7.H[0] // ............................*......................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v28.8H, v22.8H, v7.H[0] // ..............................*....................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v2.8H, v23.8H, v0.H[7] // ........*............................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v12.8H, v16.8H, v7.H[0] // ..............*....................................................... - // gap // ...................................................................... - // gap // ...................................................................... - add v27.8H, v28.8H, v14.8H // .................................*.................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v18.8H, v28.8H, v14.8H // ...................................*.................................. - mul v28.8H, v23.8H, v0.H[6] // ...*.................................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - sqrdmulh v23.8H, v3.8H, v1.H[3] // .......*.............................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v28.8H, v2.8H, v7.H[0] // ................*..................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v6.8H, v3.8H, v1.H[2] // ............*......................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v6.8H, v23.8H, v7.H[0] // .................*.................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - add v17.8H, v28.8H, v12.8H // ....................*................................................. - mul v23.8H, v27.8H, v29.8H // .........................................................*............ - sub v28.8H, v28.8H, v12.8H // ...................*.................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v10.8H, v18.8H, v0.H[1] // ......................................*............................... - // gap // ...................................................................... - // gap // ...................................................................... - add v16.8H, v6.8H, v11.8H // ......................*............................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v22.8H, v18.8H, v0.H[0] // ........................................*............................. - sub v9.8H, v6.8H, v11.8H // .......................*.............................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - add v6.8H, v17.8H, v16.8H // .........................*............................................ 
- sqrdmulh v20.8H, v27.8H, v30.8H // ........................................................*............. - // gap // ...................................................................... - sub v14.8H, v17.8H, v16.8H // ....................................*................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v22.8H, v10.8H, v7.H[0] // ..........................................*........................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v2.8H, v14.8H, v0.H[1] // ...................................................*.................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v11.8H, v6.8H, v29.8H // ..............................................*....................... - // gap // ...................................................................... - // gap // ...................................................................... - str q22, [x0, #384] // .............................................*........................ - // gap // ...................................................................... 
- // gap // ...................................................................... - mls v23.8H, v20.8H, v7.H[0] // .............................................................*........ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v5.8H, v15.8H, v0.H[0] // .............................*........................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v22.8H, v28.8H, v0.H[3] // ...........................*.......................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q23, [x0, #128] // ................................................................*..... - // gap // ...................................................................... - // gap // ...................................................................... - mul v3.8H, v28.8H, v0.H[2] // ..........................*........................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v26.8H, v6.8H, v30.8H // ................................................*..................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v3.8H, v22.8H, v7.H[0] // ...............................*...................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v17.8H, v9.8H, v0.H[5] // .....................................*................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v28.8H, v15.8H, v0.H[1] // ........................*............................................. 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v23.8H, v9.8H, v0.H[4] // ................................*..................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v23.8H, v17.8H, v7.H[0] // .........................................*............................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v11.8H, v26.8H, v7.H[0] // ..................................................................*... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- mls v5.8H, v28.8H, v7.H[0] // ..................................*................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v24.8H, v3.8H, v23.8H // ...............................................*...................... - // gap // ...................................................................... - // gap // ...................................................................... - add v31.8H, v3.8H, v23.8H // .................................................*.................... - mul v3.8H, v14.8H, v0.H[0] // ......................................................*............... - // gap // ...................................................................... - str q11, [x0, #64] // .....................................................................* - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v4.8H, v24.8H, v0.H[1] // ....................................................*................. - // gap // ...................................................................... - // gap // ...................................................................... - str q5, [x0, #256] // .......................................*.............................. - // gap // ...................................................................... - // gap // ...................................................................... - mul v8.8H, v24.8H, v0.H[0] // ...........................................................*.......... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v27.8H, v19.8H, v30.8H // ............................................*......................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v8.8H, v4.8H, v7.H[0] // .................................................................*.... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v3.8H, v2.8H, v7.H[0] // .......................................................*.............. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sqrdmulh v25.8H, v31.8H, v30.8H // ............................................................*......... 
- str q8, [x0, #448] // ....................................................................*. - // gap // ...................................................................... - // gap // ...................................................................... - mul v12.8H, v31.8H, v29.8H // ..............................................................*....... - // gap // ...................................................................... - // gap // ...................................................................... - str q3, [x0, #320] // ..........................................................*........... - // gap // ...................................................................... - // gap // ...................................................................... - mul v22.8H, v19.8H, v29.8H // ...........................................*.......................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v12.8H, v25.8H, v7.H[0] // ...............................................................*...... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v22.8H, v27.8H, v7.H[0] // ..................................................*................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q12, [x0, #192] // ...................................................................*.. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q22, [x0], #(16) // .....................................................*................ - // gap // ...................................................................... - // gap // ...................................................................... - - // original source code - // mls v11.8H, v13.8H, v7.H[0] // *..................................................................... - // sub v28.8H, v5.8H, v2.8H // .......*.............................................................. - // sub v5.8H, v25.8H, v27.8H // ...*.................................................................. - // mul v19.8H, v23.8H, v0.H[6] // ...................*.................................................. - // add v27.8H, v25.8H, v27.8H // ..*................................................................... 
- // add v20.8H, v17.8H, v24.8H // .....*................................................................ - // sub v6.8H, v17.8H, v24.8H // .*.................................................................... - // sqrdmulh v31.8H, v28.8H, v1.H[3] // ....................*................................................. - // sqrdmulh v22.8H, v23.8H, v0.H[7] // ...............*...................................................... - // sub v4.8H, v27.8H, v20.8H // ..........*........................................................... - // add v27.8H, v27.8H, v20.8H // .........*............................................................ - // mul v13.8H, v6.8H, v0.H[4] // ......*............................................................... - // mul v28.8H, v28.8H, v1.H[2] // ......................*............................................... - // mul v20.8H, v9.8H, v1.H[0] // ....*................................................................. - // mls v20.8H, v16.8H, v7.H[0] // ................*..................................................... - // mul v24.8H, v5.8H, v0.H[2] // ............*......................................................... - // mls v19.8H, v22.8H, v7.H[0] // .....................*................................................ - // mls v28.8H, v31.8H, v7.H[0] // .......................*.............................................. - // sqrdmulh v22.8H, v6.8H, v0.H[5] // ........*............................................................. - // sub v26.8H, v19.8H, v20.8H // ..........................*........................................... - // add v18.8H, v19.8H, v20.8H // ........................*............................................. - // sqrdmulh v19.8H, v5.8H, v0.H[3] // ...........*.......................................................... - // add v23.8H, v28.8H, v11.8H // ............................*......................................... 
- // sub v10.8H, v28.8H, v11.8H // ..............................*....................................... - // sqrdmulh v11.8H, v4.8H, v0.H[1] // ..............................................*....................... - // add v6.8H, v18.8H, v23.8H // ...............................*...................................... - // mul v8.8H, v26.8H, v0.H[2] // ..........................................*........................... - // sqrdmulh v28.8H, v26.8H, v0.H[3] // ........................................*............................. - // mls v13.8H, v22.8H, v7.H[0] // .............*........................................................ - // mul v22.8H, v4.8H, v0.H[0] // .......................................*.............................. - // mls v24.8H, v19.8H, v7.H[0] // ..............*....................................................... - // mls v8.8H, v28.8H, v7.H[0] // ............................................*......................... - // mul v28.8H, v10.8H, v0.H[4] // ...............................................*...................... - // add v5.8H, v24.8H, v13.8H // .................*.................................................... - // mls v22.8H, v11.8H, v7.H[0] // ..................................................*................... - // sub v3.8H, v24.8H, v13.8H // ..................*................................................... - // sub v24.8H, v18.8H, v23.8H // .................................*.................................... - // sqrdmulh v23.8H, v10.8H, v0.H[5] // .............................................*........................ - // sqrdmulh v25.8H, v3.8H, v0.H[1] // ...........................*.......................................... - // str q22, [x0, #256] // ........................................................*............. - // mul v22.8H, v3.8H, v0.H[0] // .............................*........................................ 
- // mls v28.8H, v23.8H, v7.H[0] // ................................................*..................... - // mls v22.8H, v25.8H, v7.H[0] // ..................................*................................... - // mul v3.8H, v27.8H, v29.8H // .................................................................*.... - // sqrdmulh v27.8H, v27.8H, v30.8H // ..........................................................*........... - // str q22, [x0, #384] // .....................................*................................ - // mul v22.8H, v6.8H, v29.8H // ....................................*................................. - // sub v10.8H, v8.8H, v28.8H // ...................................................*.................. - // sqrdmulh v19.8H, v6.8H, v30.8H // ...........................................*.......................... - // add v28.8H, v8.8H, v28.8H // ....................................................*................. - // mls v3.8H, v27.8H, v7.H[0] // ...................................................................*.. - // sqrdmulh v23.8H, v24.8H, v0.H[1] // ...................................*.................................. - // sqrdmulh v20.8H, v10.8H, v0.H[1] // .......................................................*.............. - // str q3, [x0], #(16) // .....................................................................* - // mul v11.8H, v24.8H, v0.H[0] // .....................................................*................ - // mls v11.8H, v23.8H, v7.H[0] // ............................................................*......... - // sqrdmulh v23.8H, v5.8H, v30.8H // ................................*..................................... - // mul v27.8H, v5.8H, v29.8H // .........................*............................................ - // str q11, [x0, #304] // ................................................................*..... 
- // mul v24.8H, v10.8H, v0.H[0] // .........................................................*............ - // sqrdmulh v8.8H, v28.8H, v30.8H // .............................................................*........ - // mls v27.8H, v23.8H, v7.H[0] // ......................................*............................... - // mul v23.8H, v28.8H, v29.8H // ...............................................................*...... - // mls v23.8H, v8.8H, v7.H[0] // ..................................................................*... - // str q27, [x0, #112] // .........................................*............................ - // mls v24.8H, v20.8H, v7.H[0] // ...........................................................*.......... - // mls v22.8H, v19.8H, v7.H[0] // .................................................*.................... - // str q23, [x0, #176] // ....................................................................*. - // str q24, [x0, #432] // ..............................................................*....... - // str q22, [x0, #48] // ......................................................*............... + // Instructions: 75 + // Expected cycles: 95 + // Expected IPC: 0.79 + // + // Cycle bound: 95.0 + // IPC bound: 0.79 + // + // Wall time: 4.48s + // User time: 4.48s + // + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + mul v16.8H, v16.8H, v1.H[4] // .*......................................................................... + ldr q26, [x0, #64] // *.......................................................................... + ldr q11, [x0, #256] // ..*........................................................................ + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + mls v16.8H, v15.8H, v7.H[0] // ...*....................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v15.8H, v24.8H, v26.8H // ......*.................................................................... + sub v26.8H, v24.8H, v26.8H // ....*...................................................................... + // gap // ........................................................................... + mls v10.8H, v27.8H, v7.H[0] // .....*..................................................................... + sub v27.8H, v11.8H, v22.8H // .......*................................................................... + // gap // ........................................................................... + add v11.8H, v11.8H, v22.8H // .........*................................................................. + // gap // ........................................................................... + // gap // ........................................................................... + add v13.8H, v15.8H, v14.8H // ..........*................................................................ + sqrdmulh v31.8H, v26.8H, v0.H[7] // ........*.................................................................. + // gap // ........................................................................... + sub v15.8H, v15.8H, v14.8H // ............*.............................................................. 
+ // gap // ........................................................................... + // gap // ........................................................................... + add v14.8H, v11.8H, v6.8H // .............*............................................................. + sqrdmulh v25.8H, v27.8H, v1.H[3] // ..............*............................................................ + // gap // ........................................................................... + sub v6.8H, v11.8H, v6.8H // ...............*........................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v26.8H, v26.8H, v0.H[6] // ...........*............................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v11.8H, v13.8H, v14.8H // .................*......................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v27.8H, v27.8H, v1.H[2] // ..........................*................................................ + sub v14.8H, v13.8H, v14.8H // ............................................................*.............. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v27.8H, v25.8H, v7.H[0] // ...........................*............................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v26.8H, v31.8H, v7.H[0] // ................*.......................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v13.8H, v15.8H, v0.H[2] // ..................*........................................................ + // gap // ........................................................................... + // gap // ........................................................................... + add v31.8H, v27.8H, v16.8H // ...............................*........................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v16.8H, v27.8H, v16.8H // ....................................*...................................... + sqrdmulh v27.8H, v6.8H, v0.H[5] // ...................*....................................................... + // gap // ........................................................................... 
+ sub v25.8H, v26.8H, v10.8H // ....................*...................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v26.8H, v26.8H, v10.8H // .....................*..................................................... + sqrdmulh v15.8H, v15.8H, v0.H[3] // ......................*.................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v6.8H, v6.8H, v0.H[4] // .......................*................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v22.8H, v26.8H, v31.8H // ..................................*........................................ + // gap // ........................................................................... + // gap // ........................................................................... + add v26.8H, v26.8H, v31.8H // ...................................*....................................... + mls v6.8H, v27.8H, v7.H[0] // ........................*.................................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v13.8H, v15.8H, v7.H[0] // .........................*................................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v15.8H, v25.8H, v0.H[2] // .............................*............................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v27.8H, v25.8H, v0.H[3] // ................................*.......................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v31.8H, v13.8H, v6.8H // ............................*.............................................. + // gap // ........................................................................... + // gap // ........................................................................... + add v6.8H, v13.8H, v6.8H // .............................................................*............. + sqrdmulh v13.8H, v16.8H, v0.H[5] // .......................................*................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v25.8H, v11.8H, v30.8H // ................................................*.......................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v11.8H, v11.8H, v29.8H // ...................................................*....................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v3.8H, v14.8H, v0.H[1] // ..................................................................*........ + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mul v14.8H, v14.8H, v0.H[0] // ...................................................................*....... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v2.8H, v31.8H, v0.H[1] // ..............................*............................................ + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v31.8H, v31.8H, v0.H[0] // .................................*......................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v15.8H, v27.8H, v7.H[0] // ......................................*.................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v31.8H, v2.8H, v7.H[0] // .....................................*..................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v16.8H, v16.8H, v0.H[4] // .........................................*................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v27.8H, v22.8H, v0.H[1] // ..........................................*................................ + // gap // ........................................................................... + // gap // ........................................................................... + str q31, [x0, #384] // ........................................*.................................. + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v16.8H, v13.8H, v7.H[0] // ...........................................*............................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v13.8H, v22.8H, v0.H[0] // ............................................*.............................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v13.8H, v27.8H, v7.H[0] // .............................................*............................. + // gap // ........................................................................... + // gap // ........................................................................... + sub v27.8H, v15.8H, v16.8H // ..............................................*............................ + // gap // ........................................................................... + // gap // ........................................................................... + mls v11.8H, v25.8H, v7.H[0] // ......................................................*.................... + add v15.8H, v15.8H, v16.8H // ...............................................*........................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v16.8H, v27.8H, v0.H[1] // .................................................*......................... + // gap // ........................................................................... + // gap // ........................................................................... + str q13, [x0, #320] // ..................................................*........................ + // gap // ........................................................................... + // gap // ........................................................................... + mul v27.8H, v27.8H, v0.H[0] // ....................................................*...................... + // gap // ........................................................................... + // gap // ........................................................................... + str q11, [x0], #(16) // ..........................................................*................ + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v11.8H, v15.8H, v30.8H // .......................................................*................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v27.8H, v16.8H, v7.H[0] // .....................................................*..................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v16.8H, v26.8H, v30.8H // .........................................................*................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v15.8H, v15.8H, v29.8H // ...........................................................*............... + // gap // ........................................................................... + // gap // ........................................................................... + str q27, [x0, #432] // ........................................................*.................. + // gap // ........................................................................... + // gap // ........................................................................... + mls v15.8H, v11.8H, v7.H[0] // ..............................................................*............ + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v27.8H, v6.8H, v29.8H // ...............................................................*........... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sqrdmulh v6.8H, v6.8H, v30.8H // ................................................................*.......... + // gap // ........................................................................... + // gap // ........................................................................... + str q15, [x0, #176] // .................................................................*......... + // gap // ........................................................................... + // gap // ........................................................................... + mul v15.8H, v26.8H, v29.8H // .....................................................................*..... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v15.8H, v16.8H, v7.H[0] // ......................................................................*.... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v27.8H, v6.8H, v7.H[0] // ....................................................................*...... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mls v14.8H, v3.8H, v7.H[0] // ........................................................................*.. + // gap // ........................................................................... + // gap // ........................................................................... + str q15, [x0, #48] // .........................................................................*. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ str q27, [x0, #112] // .......................................................................*... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + str q14, [x0, #240] // ..........................................................................* + // gap // ........................................................................... + // gap // ........................................................................... + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q21, [x0, #64] // .*......................................................................... + // mul v3.8H, v16.8H, v1.H[4] // *.......................................................................... + // ldr q11, [x0, #256] // ..*........................................................................ + // mls v3.8H, v15.8H, v7.H[0] // ...*....................................................................... + // sub v20.8H, v24.8H, v21.8H // .....*..................................................................... + // mls v10.8H, v27.8H, v7.H[0] // ......*.................................................................... + // add v31.8H, v24.8H, v21.8H // ....*...................................................................... + // sub v27.8H, v11.8H, v22.8H // .......*................................................................... + // sqrdmulh v25.8H, v20.8H, v0.H[7] // ..........*................................................................ 
+ // add v26.8H, v11.8H, v22.8H // ........*.................................................................. + // add v23.8H, v31.8H, v14.8H // .........*................................................................. + // mul v21.8H, v20.8H, v0.H[6] // ...............*........................................................... + // sub v16.8H, v31.8H, v14.8H // ...........*............................................................... + // add v12.8H, v26.8H, v6.8H // ............*.............................................................. + // sqrdmulh v11.8H, v27.8H, v1.H[3] // .............*............................................................. + // sub v15.8H, v26.8H, v6.8H // ..............*............................................................ + // mls v21.8H, v25.8H, v7.H[0] // ....................*...................................................... + // add v13.8H, v23.8H, v12.8H // ................*.......................................................... + // mul v2.8H, v16.8H, v0.H[2] // .....................*..................................................... + // sqrdmulh v18.8H, v15.8H, v0.H[5] // ........................*.................................................. + // sub v28.8H, v21.8H, v10.8H // .........................*................................................. + // add v24.8H, v21.8H, v10.8H // ..........................*................................................ + // sqrdmulh v4.8H, v16.8H, v0.H[3] // ...........................*............................................... + // mul v25.8H, v15.8H, v0.H[4] // ............................*.............................................. + // mls v25.8H, v18.8H, v7.H[0] // ...............................*........................................... + // mls v2.8H, v4.8H, v7.H[0] // ................................*.......................................... 
+ // mul v4.8H, v27.8H, v1.H[2] // .................*......................................................... + // mls v4.8H, v11.8H, v7.H[0] // ...................*....................................................... + // sub v15.8H, v2.8H, v25.8H // ...................................*....................................... + // mul v22.8H, v28.8H, v0.H[2] // .................................*......................................... + // sqrdmulh v6.8H, v15.8H, v0.H[1] // ..........................................*................................ + // add v27.8H, v4.8H, v3.8H // ......................*.................................................... + // sqrdmulh v26.8H, v28.8H, v0.H[3] // ..................................*........................................ + // mul v14.8H, v15.8H, v0.H[0] // ...........................................*............................... + // sub v11.8H, v24.8H, v27.8H // .............................*............................................. + // add v28.8H, v24.8H, v27.8H // ..............................*............................................ + // sub v27.8H, v4.8H, v3.8H // .......................*................................................... + // mls v14.8H, v6.8H, v7.H[0] // .............................................*............................. + // mls v22.8H, v26.8H, v7.H[0] // ............................................*.............................. + // sqrdmulh v15.8H, v27.8H, v0.H[5] // .....................................*..................................... + // str q14, [x0, #384] // ................................................*.......................... + // mul v31.8H, v27.8H, v0.H[4] // ..............................................*............................ + // sqrdmulh v26.8H, v11.8H, v0.H[1] // ...............................................*........................... 
+ // mls v31.8H, v15.8H, v7.H[0] // .................................................*......................... + // mul v6.8H, v11.8H, v0.H[0] // ..................................................*........................ + // mls v6.8H, v26.8H, v7.H[0] // ...................................................*....................... + // sub v15.8H, v22.8H, v31.8H // ....................................................*...................... + // add v27.8H, v22.8H, v31.8H // ......................................................*.................... + // sqrdmulh v16.8H, v13.8H, v30.8H // ......................................*.................................... + // sqrdmulh v11.8H, v15.8H, v0.H[1] // .......................................................*................... + // str q6, [x0, #320] // ........................................................*.................. + // mul v26.8H, v13.8H, v29.8H // .......................................*................................... + // mul v15.8H, v15.8H, v0.H[0] // .........................................................*................. + // mls v15.8H, v11.8H, v7.H[0] // ............................................................*.............. + // mls v26.8H, v16.8H, v7.H[0] // .....................................................*..................... + // sqrdmulh v14.8H, v27.8H, v30.8H // ...........................................................*............... + // str q15, [x0, #448] // ...............................................................*........... + // sqrdmulh v22.8H, v28.8H, v30.8H // .............................................................*............. + // str q26, [x0], #(16) // ..........................................................*................ + // mul v31.8H, v27.8H, v29.8H // ..............................................................*............ 
+ // sub v27.8H, v23.8H, v12.8H // ..................*........................................................ + // add v15.8H, v2.8H, v25.8H // ....................................*...................................... + // mls v31.8H, v14.8H, v7.H[0] // ................................................................*.......... + // mul v2.8H, v15.8H, v29.8H // .................................................................*......... + // sqrdmulh v15.8H, v15.8H, v30.8H // ..................................................................*........ + // str q31, [x0, #176] // ...................................................................*....... + // sqrdmulh v11.8H, v27.8H, v0.H[1] // ........................................*.................................. + // mul v26.8H, v27.8H, v0.H[0] // .........................................*................................. + // mls v2.8H, v15.8H, v7.H[0] // ......................................................................*.... + // mul v15.8H, v28.8H, v29.8H // ....................................................................*...... + // mls v15.8H, v22.8H, v7.H[0] // .....................................................................*..... + // str q2, [x0, #112] // .........................................................................*. + // mls v26.8H, v11.8H, v7.H[0] // .......................................................................*... + // str q15, [x0, #48] // ........................................................................*.. 
+ // str q26, [x0, #240] // ..........................................................................* pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s index d3209bce..88541085 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + 
ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// 
For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,25 +339,36 @@ _intt_kyber_123_4567_opt_m1_firestorm: mov count, #8 .p2align 2 - ldr q27, [x4, #48] // ...........*............................... - ldr q15, [x1, #0] // ....*...................................... - ldr q29, [x4, #32] // ........*.................................. + // Instructions: 43 + // Expected cycles: 26 + // Expected IPC: 1.65 + // + // Cycle bound: 26.0 + // IPC bound: 1.65 + // + // Wall time: 1.09s + // User time: 1.09s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + ldr q15, [x1, #16] // .....*..................................... + ldr q0, [x1, #0] // ...*....................................... + ldr q2, [x1, #48] // .*......................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... - ldr q24, [x1, #16] // .....*..................................... - ldr q16, [x1, #48] // ..*........................................ - ldr q21, [x1, #32] // .*......................................... + ldr q6, [x3], #16 // ......................................*.... + ldr q27, [x1, #32] // ....*...................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... 
// gap // ........................................... - ldr q5, [x3], #16 // ..........................................* - ldr q31, [x4, #64] // ......*.................................... // gap // ........................................... + ldr q12, [x4, #32] // .........*................................. // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -387,18 +383,21 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + ldr q30, [x4, #64] // ..*........................................ + trn1 v20.4S, v0.4S, v15.4S // ............*.............................. + trn2 v14.4S, v0.4S, v15.4S // .............*............................. + ldr q15, [x4, #16] // .......*................................... + ldr q31, [x4], #(6*16) // ......*.................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... + trn1 v4.4S, v27.4S, v2.4S // ..........*................................ + trn2 v0.4S, v27.4S, v2.4S // ...........*............................... + ldr q21, [x4, #-16] // ........*.................................. // gap // ........................................... // gap // ........................................... // gap // ........................................... - trn1 v28.4S, v15.4S, v24.4S // .........*................................. - trn1 v25.4S, v21.4S, v16.4S // ..........*................................ - trn2 v19.4S, v21.4S, v16.4S // ............*.............................. - ldr q21, [x4, #80] // ...*....................................... 
- trn2 v26.4S, v15.4S, v24.4S // .............*............................. // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -409,16 +408,15 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + trn1 v27.2D, v20.2D, v4.2D // .................*......................... + trn2 v13.2D, v20.2D, v4.2D // ...............*........................... + trn1 v5.2D, v14.2D, v0.2D // ................*.......................... + trn2 v3.2D, v14.2D, v0.2D // ..............*............................ // gap // ........................................... - trn1 v9.2D, v26.2D, v19.2D // ................*.......................... - trn2 v12.2D, v26.2D, v19.2D // .................*......................... - trn1 v13.2D, v28.2D, v25.2D // ..............*............................ - trn2 v2.2D, v28.2D, v25.2D // ...............*........................... - ldr q25, [x4, #16] // .......*................................... // gap // ........................................... // gap // ........................................... + ldr q11, [x4, #-48] // *.......................................... // gap // ........................................... - ldr q22, [x4], #(6*16) // *.......................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -426,10 +424,10 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... - sub v1.8H, v13.8H, v9.8H // ...................*....................... 
- add v0.8H, v13.8H, v9.8H // ....................*...................... - sub v11.8H, v2.8H, v12.8H // ..................*........................ - add v17.8H, v2.8H, v12.8H // ...........................*............... + sub v22.8H, v27.8H, v5.8H // ....................*...................... + add v28.8H, v27.8H, v5.8H // .....................*..................... + add v26.8H, v13.8H, v3.8H // ...................*....................... + sub v14.8H, v13.8H, v3.8H // ..................*........................ // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -442,16 +440,15 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... - sqrdmulh v21.8H, v11.8H, v21.8H // ........................*.................. - mul v19.8H, v1.8H, v29.8H // .......................*................... - mul v31.8H, v11.8H, v31.8H // ......................*.................... - sqrdmulh v28.8H, v1.8H, v27.8H // .....................*..................... + sqrdmulh v16.8H, v22.8H, v11.8H // ...........................*............... + mul v11.8H, v14.8H, v30.8H // ......................*.................... + sqrdmulh v0.8H, v14.8H, v21.8H // .......................*................... + mul v14.8H, v22.8H, v12.8H // .........................*................. // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... - add v12.8H, v0.8H, v17.8H // ............................*.............. - sub v0.8H, v0.8H, v17.8H // .............................*............. + sub v27.8H, v28.8H, v26.8H // ..........................*................ 
// gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -459,6 +456,7 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + add v9.8H, v28.8H, v26.8H // ........................*.................. // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -466,10 +464,10 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... - sqrdmulh v13.8H, v0.8H, v25.8H // ................................*.......... - mls v31.8H, v21.8H, v7.H[0] // ..........................*................ - mls v19.8H, v28.8H, v7.H[0] // .........................*................. - mul v8.8H, v0.8H, v22.8H // ...............................*........... + mls v14.8H, v16.8H, v7.H[0] // ..............................*............ + mul v28.8H, v27.8H, v31.8H // .............................*............. + sqrdmulh v21.8H, v27.8H, v15.8H // ............................*.............. + mls v11.8H, v0.8H, v7.H[0] // ...............................*........... // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -490,9 +488,7 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... - mls v8.8H, v13.8H, v7.H[0] // .....................................*..... - sub v4.8H, v19.8H, v31.8H // ..............................*............ 
- add v13.8H, v19.8H, v31.8H // ...................................*....... + sub v1.8H, v14.8H, v11.8H // .................................*......... // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -500,22 +496,23 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + add v12.8H, v14.8H, v11.8H // ..................................*........ // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... - sqrdmulh v14.8H, v4.8H, v25.8H // .................................*......... - mul v21.8H, v4.8H, v22.8H // ..................................*........ - trn2 v23.4S, v12.4S, v13.4S // .......................................*... // gap // ........................................... + sqrdmulh v30.8H, v1.8H, v15.8H // ....................................*...... + mul v15.8H, v1.8H, v31.8H // ...................................*....... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... + trn2 v26.4S, v9.4S, v12.4S // .....................................*..... // gap // ........................................... // gap // ........................................... // gap // ........................................... 
@@ -523,6 +520,7 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + mls v28.8H, v21.8H, v7.H[0] // ................................*.......... // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -530,7 +528,7 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... - mls v21.8H, v14.8H, v7.H[0] // ....................................*...... + mls v15.8H, v30.8H, v7.H[0] // .......................................*... // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -538,6 +536,7 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + trn1 v16.4S, v9.4S, v12.4S // ........................................*.. // gap // ........................................... // gap // ........................................... // gap // ........................................... @@ -553,379 +552,408 @@ _intt_kyber_123_4567_opt_m1_firestorm: // gap // ........................................... // gap // ........................................... // gap // ........................................... + trn2 v17.4S, v28.4S, v15.4S // ..........................................* + trn1 v0.4S, v28.4S, v15.4S // .........................................*. // gap // ........................................... - trn2 v24.4S, v8.4S, v21.4S // .........................................*. 
- trn1 v0.4S, v8.4S, v21.4S // ........................................*.. - trn1 v8.4S, v12.4S, v13.4S // ......................................*.... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... // gap // ........................................... - // original source code - // ldr q13, [x4], #(6*16) // ..................*........................ - // ldr q27, [x1, #32] // .....*..................................... - // ldr q1, [x1, #48] // ....*...................................... - // ldr q4, [x4, #-16] // ...........*............................... - // ldr q17, [x1, #0] // .*......................................... - // ldr q29, [x1, #16] // ...*....................................... - // ldr q2, [x4, #-32] // .......*................................... - // ldr q21, [x4, #-80] // .................*......................... - // ldr q28, [x4, #-64] // ..*........................................ - // trn1 v3.4S, v17.4S, v29.4S // ........*.................................. - // trn1 v26.4S, v27.4S, v1.4S // .........*................................. - // ldr q25, [x4, #-48] // *.......................................... - // trn2 v20.4S, v27.4S, v1.4S // ..........*................................ - // trn2 v29.4S, v17.4S, v29.4S // ............*.............................. - // trn1 v8.2D, v3.2D, v26.2D // ...............*........................... - // trn2 v3.2D, v3.2D, v26.2D // ................*.......................... - // trn1 v1.2D, v29.2D, v20.2D // .............*............................. - // trn2 v20.2D, v29.2D, v20.2D // ..............*............................ - // sub v27.8H, v3.8H, v20.8H // .....................*..................... - // sub v12.8H, v8.8H, v1.8H // ...................*....................... 
- // add v18.8H, v8.8H, v1.8H // ....................*...................... - // sqrdmulh v17.8H, v12.8H, v25.8H // ..........................*................ - // mul v14.8H, v27.8H, v2.8H // .........................*................. - // mul v9.8H, v12.8H, v28.8H // ........................*.................. - // sqrdmulh v31.8H, v27.8H, v4.8H // .......................*................... - // mls v9.8H, v17.8H, v7.H[0] // ...............................*........... - // mls v14.8H, v31.8H, v7.H[0] // ..............................*............ - // add v31.8H, v3.8H, v20.8H // ......................*.................... - // add v25.8H, v18.8H, v31.8H // ...........................*............... - // sub v29.8H, v18.8H, v31.8H // ............................*.............. - // sub v28.8H, v9.8H, v14.8H // ..................................*........ - // mul v27.8H, v29.8H, v13.8H // ................................*.......... - // sqrdmulh v3.8H, v29.8H, v21.8H // .............................*............. - // sqrdmulh v26.8H, v28.8H, v21.8H // ....................................*...... - // mul v1.8H, v28.8H, v13.8H // .....................................*..... - // add v18.8H, v9.8H, v14.8H // ...................................*....... - // mls v1.8H, v26.8H, v7.H[0] // .......................................*... - // mls v27.8H, v3.8H, v7.H[0] // .................................*......... - // trn1 v8.4S, v25.4S, v18.4S // ..........................................* - // trn2 v23.4S, v25.4S, v18.4S // ......................................*.... - // trn1 v0.4S, v27.4S, v1.4S // .........................................*. - // trn2 v24.4S, v27.4S, v1.4S // ........................................*.. - // ldr q5, [x3], #16 // ......*.................................... + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // ldr q12, [x4, #48] // ..................*........................ 
+ // ldr q25, [x1, #48] // ..*........................................ + // ldr q23, [x4, #64] // ......*.................................... + // ldr q13, [x1, #0] // .*......................................... + // ldr q3, [x1, #32] // ....*...................................... + // ldr q17, [x1, #16] // *.......................................... + // ldr q15, [x4], #(6*16) // ..........*................................ + // ldr q19, [x4, #-80] // .........*................................. + // ldr q2, [x4, #-16] // .............*............................. + // ldr q1, [x4, #-64] // .....*..................................... + // trn1 v11.4S, v3.4S, v25.4S // ...........*............................... + // trn2 v14.4S, v3.4S, v25.4S // ............*.............................. + // trn1 v22.4S, v13.4S, v17.4S // .......*................................... + // trn2 v13.4S, v13.4S, v17.4S // ........*.................................. + // trn2 v25.2D, v13.2D, v14.2D // .................*......................... + // trn2 v29.2D, v22.2D, v11.2D // ...............*........................... + // trn1 v14.2D, v13.2D, v14.2D // ................*.......................... + // trn1 v11.2D, v22.2D, v11.2D // ..............*............................ + // sub v13.8H, v29.8H, v25.8H // ......................*.................... + // add v25.8H, v29.8H, v25.8H // .....................*..................... + // sub v3.8H, v11.8H, v14.8H // ...................*....................... + // add v8.8H, v11.8H, v14.8H // ....................*...................... + // mul v20.8H, v13.8H, v23.8H // ........................*.................. + // sqrdmulh v13.8H, v13.8H, v2.8H // .........................*................. + // add v17.8H, v8.8H, v25.8H // ............................*.............. + // mul v23.8H, v3.8H, v1.8H // ..........................*................ + // sub v25.8H, v8.8H, v25.8H // ...........................*............... 
+ // sqrdmulh v3.8H, v3.8H, v12.8H // .......................*................... + // sqrdmulh v11.8H, v25.8H, v19.8H // ...............................*........... + // mul v22.8H, v25.8H, v15.8H // ..............................*............ + // mls v23.8H, v3.8H, v7.H[0] // .............................*............. + // mls v20.8H, v13.8H, v7.H[0] // ................................*.......... + // mls v22.8H, v11.8H, v7.H[0] // ......................................*.... + // sub v3.8H, v23.8H, v20.8H // .................................*......... + // add v25.8H, v23.8H, v20.8H // ..................................*........ + // mul v13.8H, v3.8H, v15.8H // ....................................*...... + // sqrdmulh v31.8H, v3.8H, v19.8H // ...................................*....... + // trn2 v26.4S, v17.4S, v25.4S // .....................................*..... + // ldr q6, [x3], #16 // ...*....................................... + // mls v13.8H, v31.8H, v7.H[0] // .......................................*... + // trn1 v16.4S, v17.4S, v25.4S // ........................................*.. + // trn1 v0.4S, v22.4S, v13.4S // ..........................................* + // trn2 v17.4S, v22.4S, v13.4S // .........................................*. sub count, count, #1 layer4567_start: - ldr q13, [x4], #(6*16) // ............e...................................................................... - trn1 v18.2D, v8.2D, v0.2D // ............................................*...................................... - trn2 v11.2D, v8.2D, v0.2D // ..........................................*........................................ - ldr q27, [x1, #96] // ..e................................................................................ - ldr q1, [x1, #112] // ...e............................................................................... 
+ // Instructions: 83 + // Expected cycles: 28 + // Expected IPC: 2.96 + // + // Cycle bound: 28.0 + // IPC bound: 2.96 + // + // Wall time: 16.90s + // User time: 16.90s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + ldr q12, [x4, #48] // ...............e................................................................... + ldr q25, [x1, #112] // ...e............................................................................... + trn1 v22.2D, v26.2D, v17.2D // .............................................*..................................... + trn2 v4.2D, v26.2D, v17.2D // ...........................................*....................................... + trn1 v14.2D, v16.2D, v0.2D // ............................................*...................................... + trn2 v0.2D, v16.2D, v0.2D // ..........................................*........................................ // gap // ................................................................................... - trn1 v10.2D, v23.2D, v24.2D // .............................................*..................................... - trn2 v24.2D, v23.2D, v24.2D // ...........................................*....................................... - ldr q4, [x4, #-16] // .................e................................................................. - ldr q17, [x1, #64] // e.................................................................................. - ldr q29, [x1, #80] // .e................................................................................. + ldr q23, [x4, #64] // ................e.................................................................. + ldr q13, [x1, #64] // e.................................................................................. 
+ ldr q3, [x1, #96] // ..e................................................................................ + ldr q17, [x1, #80] // .e................................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sub v19.8H, v11.8H, v24.8H // ....................................................*.............................. - add v22.8H, v11.8H, v24.8H // .....................................................*............................. - sub v15.8H, v18.8H, v10.8H // ...............................................*................................... - add v10.8H, v18.8H, v10.8H // ................................................*.................................. - ldr q2, [x4, #-32] // ................e.................................................................. + ldr q15, [x4], #(6*16) // ............e...................................................................... + sub v27.8H, v14.8H, v22.8H // ...............................................*................................... + sub v16.8H, v0.8H, v4.8H // ....................................................*.............................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - ldr q21, [x4, #-80] // .............e..................................................................... 
// gap // ................................................................................... // gap // ................................................................................... + ldr q19, [x4, #-80] // .............e..................................................................... + ldr q2, [x4, #-16] // .................e................................................................. + add v21.8H, v14.8H, v22.8H // ................................................*.................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v23.8H, v15.8H, v5.H[3] // ..................................................*................................ - mul v6.8H, v15.8H, v5.H[2] // .................................................*................................. - sqrdmulh v9.8H, v19.8H, v5.H[5] // .......................................................*........................... - mul v24.8H, v19.8H, v5.H[4] // ......................................................*............................ - ldr q28, [x4, #-64] // ..............e.................................................................... + ldr q1, [x4, #-64] // ..............e.................................................................... + mul v31.8H, v27.8H, v6.H[2] // ..................................................*................................ + sqrdmulh v27.8H, v27.8H, v6.H[3] // .................................................*................................. 
+ mul v26.8H, v16.8H, v6.H[4] // .......................................................*........................... + sqrdmulh v16.8H, v16.8H, v6.H[5] // ......................................................*............................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - trn1 v3.4S, v17.4S, v29.4S // ....e.............................................................................. - trn1 v26.4S, v27.4S, v1.4S // ......e............................................................................ - ldr q25, [x4, #-48] // ...............e................................................................... + trn1 v11.4S, v3.4S, v25.4S // ......e............................................................................ + trn2 v14.4S, v3.4S, v25.4S // .......e........................................................................... + trn1 v22.4S, v13.4S, v17.4S // ....e.............................................................................. + trn2 v13.4S, v13.4S, v17.4S // .....e............................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sqdmulh v0.8H, v10.8H, v7.H[1] // .........................................................*......................... - trn2 v20.4S, v27.4S, v1.4S // .......e........................................................................... 
- trn2 v29.4S, v17.4S, v29.4S // .....e............................................................................. + add v0.8H, v0.8H, v4.8H // .....................................................*............................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v6.8H, v23.8H, v7.H[0] // ...................................................*............................... - trn1 v8.2D, v3.2D, v26.2D // ..........e........................................................................ - mls v24.8H, v9.8H, v7.H[0] // ........................................................*.......................... // gap // ................................................................................... + trn2 v25.2D, v13.2D, v14.2D // .........e......................................................................... + mls v26.8H, v16.8H, v7.H[0] // ........................................................*.......................... + mls v31.8H, v27.8H, v7.H[0] // ...................................................*............................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
- srshr v16.8H, v0.8H, #11 // ..........................................................*........................ - trn2 v3.2D, v3.2D, v26.2D // ........e.......................................................................... - trn1 v1.2D, v29.2D, v20.2D // ...........e....................................................................... - trn2 v20.2D, v29.2D, v20.2D // .........e......................................................................... + trn2 v29.2D, v22.2D, v11.2D // ........e.......................................................................... + trn1 v14.2D, v13.2D, v14.2D // ...........e....................................................................... + trn1 v11.2D, v22.2D, v11.2D // ..........e........................................................................ + sqdmulh v27.8H, v21.8H, v7.H[1] // .........................................................*......................... + sqdmulh v16.8H, v0.8H, v7.H[1] // ............................................................*...................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sqdmulh v0.8H, v22.8H, v7.H[1] // ............................................................*...................... + sub v13.8H, v29.8H, v25.8H // .......................e........................................................... + add v25.8H, v29.8H, v25.8H // ........................e.......................................................... // gap // ................................................................................... // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + sub v3.8H, v11.8H, v14.8H // ..................e................................................................ + add v8.8H, v11.8H, v14.8H // ...................e............................................................... + sqdmulh v4.8H, v31.8H, v7.H[1] // ...............................................................*................... + sqdmulh v10.8H, v26.8H, v7.H[1] // ..................................................................*................ // gap // ................................................................................... - sub v27.8H, v3.8H, v20.8H // .......................e........................................................... - sqdmulh v29.8H, v24.8H, v7.H[1] // ..................................................................*................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mul v20.8H, v13.8H, v23.8H // ..........................e........................................................ + srshr v11.8H, v27.8H, #11 // ..........................................................*........................ + srshr v14.8H, v16.8H, #11 // .............................................................*..................... + sqrdmulh v13.8H, v13.8H, v2.8H // .........................e......................................................... 
// gap // ................................................................................... - sub v12.8H, v8.8H, v1.8H // ..................e................................................................ - sqdmulh v30.8H, v6.8H, v7.H[1] // ...............................................................*................... - mls v10.8H, v16.8H, v7.H[0] // ...........................................................*....................... - add v18.8H, v8.8H, v1.8H // ...................e............................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + add v17.8H, v8.8H, v25.8H // .............................e..................................................... + mul v23.8H, v3.8H, v1.8H // .....................e............................................................. + sub v25.8H, v8.8H, v25.8H // ............................e...................................................... + sqrdmulh v3.8H, v3.8H, v12.8H // ....................e.............................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v17.8H, v12.8H, v25.8H // .....................e............................................................. - mul v14.8H, v27.8H, v2.8H // .........................e......................................................... - mul v9.8H, v12.8H, v28.8H // ....................e.............................................................. 
- sqrdmulh v31.8H, v27.8H, v4.8H // ..........................e........................................................ // gap // ................................................................................... + srshr v16.8H, v4.8H, #11 // ................................................................*.................. + srshr v27.8H, v10.8H, #11 // ...................................................................*............... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - srshr v23.8H, v30.8H, #11 // ................................................................*.................. - srshr v26.8H, v0.8H, #11 // .............................................................*..................... - srshr v0.8H, v29.8H, #11 // ...................................................................*............... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v0.8H, v14.8H, v7.H[0] // ..............................................................*.................... + mls v21.8H, v11.8H, v7.H[0] // ...........................................................*....................... + sqrdmulh v11.8H, v25.8H, v19.8H // ..............................e.................................................... + mul v22.8H, v25.8H, v15.8H // ...............................e................................................... // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v23.8H, v3.8H, v7.H[0] // ......................e............................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v9.8H, v17.8H, v7.H[0] // ......................e............................................................ - mls v14.8H, v31.8H, v7.H[0] // ...........................e....................................................... + mls v20.8H, v13.8H, v7.H[0] // ...........................e....................................................... + mls v31.8H, v16.8H, v7.H[0] // .................................................................*................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
- add v31.8H, v3.8H, v20.8H // ........................e.......................................................... - mls v24.8H, v0.8H, v7.H[0] // ....................................................................*.............. - mls v22.8H, v26.8H, v7.H[0] // ..............................................................*.................... + mls v26.8H, v27.8H, v7.H[0] // ....................................................................*.............. + mls v22.8H, v11.8H, v7.H[0] // ................................e.................................................. + sub v28.8H, v21.8H, v0.8H // .....................................................................*............. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v6.8H, v23.8H, v7.H[0] // .................................................................*................. // gap // ................................................................................... // gap // ................................................................................... + sub v3.8H, v23.8H, v20.8H // .................................e................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
// gap // ................................................................................... - add v25.8H, v18.8H, v31.8H // .............................e..................................................... - sub v29.8H, v18.8H, v31.8H // ............................e...................................................... - sub v28.8H, v9.8H, v14.8H // .................................e................................................. // gap // ................................................................................... + sub v16.8H, v31.8H, v26.8H // ..........................................................................*........ + sqrdmulh v11.8H, v28.8H, v6.H[1] // .......................................................................*........... + mul v14.8H, v28.8H, v6.H[0] // ........................................................................*.......... + add v25.8H, v23.8H, v20.8H // ..................................e................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - add v16.8H, v6.8H, v24.8H // ...........................................................................*....... + mul v13.8H, v3.8H, v15.8H // ....................................e.............................................. + add v15.8H, v31.8H, v26.8H // ...........................................................................*....... + sqrdmulh v31.8H, v3.8H, v19.8H // ...................................e............................................... // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - add v2.8H, v10.8H, v22.8H // ......................................................................*............ - sub v10.8H, v10.8H, v22.8H // .....................................................................*............. - sub v24.8H, v6.8H, v24.8H // ..........................................................................*........ - mul v27.8H, v29.8H, v13.8H // ..............................e.................................................... - sqrdmulh v3.8H, v29.8H, v21.8H // ...............................e................................................... - sqrdmulh v26.8H, v28.8H, v21.8H // ....................................e.............................................. - mul v1.8H, v28.8H, v13.8H // ...................................e............................................... // gap // ................................................................................... + trn2 v26.4S, v17.4S, v25.4S // .......................................e........................................... + mul v27.8H, v16.8H, v6.H[0] // .............................................................................*..... + sqrdmulh v16.8H, v16.8H, v6.H[1] // ............................................................................*...... + ldr q6, [x3], #16 // ..............................................e.................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
- str q16, [x1, #16] // ................................................................................*.. - mul v22.8H, v24.8H, v5.H[0] // ............................................................................*...... - sqrdmulh v11.8H, v24.8H, v5.H[1] // .............................................................................*..... // gap // ................................................................................... + str q15, [x1, #16] // ................................................................................*.. + add v15.8H, v21.8H, v0.8H // ......................................................................*............ + mls v14.8H, v11.8H, v7.H[0] // .........................................................................*......... // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v0.8H, v10.8H, v5.H[1] // ........................................................................*.......... - mul v10.8H, v10.8H, v5.H[0] // .......................................................................*........... - str q2, [x1], #(64) // ...............................................................................*... - add v18.8H, v9.8H, v14.8H // ..................................e................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v13.8H, v31.8H, v7.H[0] // .....................................e............................................. // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... - mls v1.8H, v26.8H, v7.H[0] // .....................................e............................................. - mls v27.8H, v3.8H, v7.H[0] // ................................e.................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v27.8H, v16.8H, v7.H[0] // ..............................................................................*.... + trn1 v16.4S, v17.4S, v25.4S // ......................................e............................................ + str q15, [x1], #(64) // ...............................................................................*... // gap // ................................................................................... // gap // ................................................................................... - mls v22.8H, v11.8H, v7.H[0] // ..............................................................................*.... - mls v10.8H, v0.8H, v7.H[0] // .........................................................................*......... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + str q14, [x1, #-32] // .................................................................................*. 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - trn1 v8.4S, v25.4S, v18.4S // ......................................e............................................ - trn2 v23.4S, v25.4S, v18.4S // .......................................e........................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + trn1 v0.4S, v22.4S, v13.4S // ........................................e.......................................... + trn2 v17.4S, v22.4S, v13.4S // .........................................e......................................... // gap // ................................................................................... // gap // ................................................................................... - trn1 v0.4S, v27.4S, v1.4S // ........................................e.......................................... - trn2 v24.4S, v27.4S, v1.4S // .........................................e......................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
+ str q27, [x1, #-16] // ..................................................................................* // gap // ................................................................................... // gap // ................................................................................... - str q22, [x1, #-16] // ..................................................................................* - str q10, [x1, #-32] // .................................................................................*. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - ldr q5, [x3], #16 // ..............................................e.................................... - // original source code - // ldr q8, [x1, #(16*0)] // ........e..........................................................................|.......e......................................................................... - // ldr q9, [x1, #(16*1)] // .........e.........................................................................|........e........................................................................ - // ldr q10, [x1, #(16*2)] // ...e...............................................................................|..e.............................................................................. - // ldr q11, [x1, #(16*3)] // ....e..............................................................................|...e............................................................................. 
- // trn1 v25.4s, v8.4s, v9.4s // .....................e.............................................................|....................e............................................................ - // trn2 v26.4s, v8.4s, v9.4s // ..........................e........................................................|.........................e....................................................... - // trn1 v27.4s, v10.4s, v11.4s // ......................e............................................................|.....................e........................................................... - // trn2 v28.4s, v10.4s, v11.4s // .........................e.........................................................|........................e........................................................ - // trn2 v10.2d, v25.2d, v27.2d // ...............................e...................................................|..............................e.................................................. - // trn2 v11.2d, v26.2d, v28.2d // .................................e.................................................|................................e................................................ - // trn1 v8.2d, v25.2d, v27.2d // ............................e......................................................|...........................e..................................................... - // trn1 v9.2d, v26.2d, v28.2d // ................................e..................................................|...............................e................................................. - // ldr q0, [x4], #(6*16) // e..................................................................................e................................................................................. 
- // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e...................................................................|..............e.................................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ....................e..............................................................|...................e............................................................. - // ldr q5, [x4, #(-6*16 + 3*16)] // .......................e...........................................................|......................e.......................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ..............e....................................................................|.............e................................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // .......e...........................................................................|......e.......................................................................... - // sub v24.8h, v8.8h, v9.8h // .....................................e.............................................|....................................e............................................ - // add v8.8h, v8.8h, v9.8h // ........................................e..........................................|.......................................e......................................... - // mul v9.8h, v24.8h, v1.8h // ...........................................e.......................................|..........................................e...................................... - // sqrdmulh v24.8h, v24.8h, v5.8h // .........................................e.........................................|........................................e........................................ 
- // mls v9.8h, v24.8h, v7.h[0] // ................................................e..................................|...............................................e................................. - // sub v24.8h, v10.8h, v11.8h // ...................................e...............................................|..................................e.............................................. - // add v10.8h, v10.8h, v11.8h // ..................................................e................................|.................................................e............................... - // mul v11.8h, v24.8h, v2.8h // ..........................................e........................................|.........................................e....................................... - // sqrdmulh v24.8h, v24.8h, v6.8h // ............................................e......................................|...........................................e..................................... - // mls v11.8h, v24.8h, v7.h[0] // .................................................e.................................|................................................e................................ - // sub v24.8h, v8.8h, v10.8h // .......................................................e...........................|......................................................e.......................... - // add v8.8h, v8.8h, v10.8h // ......................................................e............................|.....................................................e........................... - // mul v10.8h, v24.8h, v0.8h // .............................................................e.....................|............................................................e.................... 
- // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................................................e....................|.............................................................e................... - // mls v10.8h, v24.8h, v7.h[0] // .........................................................................e.........|........................................................................e........ - // sub v24.8h, v9.8h, v11.8h // ........................................................e..........................|.......................................................e......................... - // add v9.8h, v9.8h, v11.8h // .......................................................................e...........|......................................................................e.......... - // mul v11.8h, v24.8h, v0.8h // ................................................................e..................|...............................................................e................. - // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................................e...................|..............................................................e.................. - // mls v11.8h, v24.8h, v7.h[0] // ........................................................................e..........|.......................................................................e......... - // trn1 v25.4s, v8.4s, v9.4s // ............................................................................e......|...........................................................................e..... - // trn2 v26.4s, v8.4s, v9.4s // .............................................................................e.....|............................................................................e.... 
- // trn1 v27.4s, v10.4s, v11.4s // ..............................................................................e....|.............................................................................e... - // trn2 v28.4s, v10.4s, v11.4s // ...............................................................................e...|..............................................................................e.. - // trn2 v10.2d, v25.2d, v27.2d // ..*................................................................................|.*............................................................................... - // trn2 v11.2d, v26.2d, v28.2d // ......*............................................................................|.....*........................................................................... - // trn1 v8.2d, v25.2d, v27.2d // .*.................................................................................|*................................................................................ - // trn1 v9.2d, v26.2d, v28.2d // .....*.............................................................................|....*............................................................................ - // ldr q0, [x3], #16 // ..................................................................................e|................................................................................. - // sub v24.8h, v8.8h, v9.8h // ............*......................................................................|...........*..................................................................... - // add v8.8h, v8.8h, v9.8h // .............*.....................................................................|............*.................................................................... - // mul v9.8h, v24.8h, v0.h[2] // .................*.................................................................|................*................................................................ 
- // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*..................................................................|...............*................................................................. - // mls v9.8h, v24.8h, v7.h[0] // ...........................*.......................................................|..........................*...................................................... - // sub v24.8h, v10.8h, v11.8h // ..........*........................................................................|.........*....................................................................... - // add v10.8h, v10.8h, v11.8h // ...........*.......................................................................|..........*...................................................................... - // mul v11.8h, v24.8h, v0.h[4] // ...................*...............................................................|..................*.............................................................. - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................*................................................................|.................*............................................................... - // mls v11.8h, v24.8h, v7.h[0] // .............................*.....................................................|............................*.................................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ........................*..........................................................|.......................*......................................................... - // srshr v25.8h, v25.8h, #11 // ..............................*....................................................|.............................*................................................... 
- // mls v8.8h, v25.8h, v7.h[0] // .......................................*...........................................|......................................*.......................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................*................................................|.................................*............................................... - // srshr v25.8h, v25.8h, #11 // ..............................................*....................................|.............................................*................................... - // mls v10.8h, v25.8h, v7.h[0] // ....................................................*..............................|...................................................*............................. - // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................*............................................|.....................................*........................................... - // srshr v25.8h, v25.8h, #11 // .............................................*.....................................|............................................*.................................... - // mls v9.8h, v25.8h, v7.h[0] // .....................................................*.............................|....................................................*............................ - // sqdmulh v25.8h, v11.8h, v7.h[1] // ....................................*..............................................|...................................*............................................. - // srshr v25.8h, v25.8h, #11 // ...............................................*...................................|..............................................*.................................. 
- // mls v11.8h, v25.8h, v7.h[0] // ...................................................*...............................|..................................................*.............................. - // sub v24.8h, v8.8h, v10.8h // ...........................................................*.......................|..........................................................*...................... - // add v8.8h, v8.8h, v10.8h // ..........................................................*........................|.........................................................*....................... - // mul v10.8h, v24.8h, v0.h[0] // .....................................................................*.............|....................................................................*............ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................*..............|...................................................................*............. - // mls v10.8h, v24.8h, v7.h[0] // ...........................................................................*.......|..........................................................................*...... - // sub v24.8h, v9.8h, v11.8h // ............................................................*......................|...........................................................*..................... - // add v9.8h, v9.8h, v11.8h // .........................................................*.........................|........................................................*........................ - // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*................|.................................................................*............... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................*...............|..................................................................*.............. - // mls v11.8h, v24.8h, v7.h[0] // ..........................................................................*........|.........................................................................*....... - // str q8, [x1], #(64) // ......................................................................*............|.....................................................................*........... - // str q9, [x1, #(-64 + 16*1)] // .................................................................*.................|................................................................*................ - // str q10, [x1, #(-64 + 16*2)] // .................................................................................*.|................................................................................* - // str q11, [x1, #(-64 + 16*3)] // ................................................................................*..|...............................................................................*. + // --------------------------------------------------------------------------- new position ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // .......e...........................................................................'......~........................................................................... + // ldr q9, [x1, #(16*1)] // .........e.........................................................................'........~......................................................................... 
+ // ldr q10, [x1, #(16*2)] // ........e..........................................................................'.......~.......................................................................... + // ldr q11, [x1, #(16*3)] // .e.................................................................................'~................................................................................. + // trn1 v25.4s, v8.4s, v9.4s // .......................e...........................................................'......................~........................................................... + // trn2 v26.4s, v8.4s, v9.4s // ........................e..........................................................'.......................~.......................................................... + // trn1 v27.4s, v10.4s, v11.4s // .....................e.............................................................'....................~............................................................. + // trn2 v28.4s, v10.4s, v11.4s // ......................e............................................................'.....................~............................................................ + // trn2 v10.2d, v25.2d, v27.2d // .............................e.....................................................'............................~..................................................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................e........................................................'.........................~........................................................ + // trn1 v8.2d, v25.2d, v27.2d // ...............................e...................................................'..............................~................................................... 
+ // trn1 v9.2d, v26.2d, v28.2d // ..............................e....................................................'.............................~.................................................... + // ldr q0, [x4], #(6*16) // ..........e........................................................................'.........~........................................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // .............e.....................................................................'............~..................................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ................e..................................................................'...............~.................................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // e..................................................................................~.................................................................................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ......e............................................................................'.....~............................................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ..............e....................................................................'.............~.................................................................... + // sub v24.8h, v8.8h, v9.8h // ....................................e..............................................'...................................~.............................................. + // add v8.8h, v8.8h, v9.8h // .....................................e.............................................'....................................~............................................. 
+ // sqrdmulh v27.8h, v24.8h, v5.8h // ...............................................e...................................'..............................................~................................... + // mul v9.8h, v24.8h, v1.8h // .............................................e.....................................'............................................~..................................... + // mls v9.8h, v27.8h, v7.h[0] // ......................................................e............................'.....................................................~............................ + // sub v24.8h, v10.8h, v11.8h // ..................................e................................................'.................................~................................................ + // add v10.8h, v10.8h, v11.8h // ...................................e...............................................'..................................~............................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // ...........................................e.......................................'..........................................~....................................... + // mul v11.8h, v24.8h, v2.8h // ........................................e..........................................'.......................................~.......................................... + // mls v11.8h, v27.8h, v7.h[0] // .......................................................e...........................'......................................................~........................... + // sub v24.8h, v8.8h, v10.8h // ..............................................e....................................'.............................................~.................................... 
+ // add v8.8h, v8.8h, v10.8h // ............................................e......................................'...........................................~...................................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ....................................................e..............................'...................................................~.............................. + // mul v10.8h, v24.8h, v0.8h // .....................................................e.............................'....................................................~............................. + // mls v10.8h, v27.8h, v7.h[0] // ..........................................................e........................'.........................................................~........................ + // sub v24.8h, v9.8h, v11.8h // ............................................................e......................'...........................................................~...................... + // add v9.8h, v9.8h, v11.8h // ................................................................e..................'...............................................................~.................. + // sqrdmulh v27.8h, v24.8h, v4.8h // ...................................................................e...............'..................................................................~............... + // mul v11.8h, v24.8h, v0.8h // .................................................................e.................'................................................................~................. + // mls v11.8h, v27.8h, v7.h[0] // ...........................................................................e.......'..........................................................................~....... 
+ // trn1 v25.4s, v8.4s, v9.4s // .............................................................................e.....'............................................................................~..... + // trn2 v26.4s, v8.4s, v9.4s // ....................................................................e..............'...................................................................~.............. + // trn1 v27.4s, v10.4s, v11.4s // ................................................................................e..'...............................................................................~.. + // trn2 v28.4s, v10.4s, v11.4s // .................................................................................e.'................................................................................~. + // trn2 v10.2d, v25.2d, v27.2d // .....~.............................................................................'....*............................................................................. + // trn2 v11.2d, v26.2d, v28.2d // ...~...............................................................................'..*............................................................................... + // trn1 v8.2d, v25.2d, v27.2d // ....~..............................................................................'...*.............................................................................. + // trn1 v9.2d, v26.2d, v28.2d // ..~................................................................................'.*................................................................................ + // ldr q0, [x3], #16 // .......................................................................e...........'......................................................................~........... 
+ // sub v24.8h, v8.8h, v9.8h // ...........~.......................................................................'..........*....................................................................... + // add v8.8h, v8.8h, v9.8h // ...............~...................................................................'..............*................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ..................~................................................................'.................*................................................................ + // mul v9.8h, v24.8h, v0.h[2] // .................~.................................................................'................*................................................................. + // mls v9.8h, v27.8h, v7.h[0] // ............................~......................................................'...........................*...................................................... + // sub v24.8h, v10.8h, v11.8h // ............~......................................................................'...........*...................................................................... + // add v10.8h, v10.8h, v11.8h // .........................~.........................................................'........................*......................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ....................~..............................................................'...................*.............................................................. + // mul v11.8h, v24.8h, v0.h[4] // ...................~...............................................................'..................*............................................................... 
+ // mls v11.8h, v27.8h, v7.h[0] // ...........................~.......................................................'..........................*....................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ................................~..................................................'...............................*.................................................. + // srshr v25.8h, v25.8h, #11 // .........................................~.........................................'........................................*......................................... + // mls v8.8h, v25.8h, v7.h[0] // ...................................................~...............................'..................................................*............................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .................................~.................................................'................................*................................................. + // srshr v25.8h, v25.8h, #11 // ..........................................~........................................'.........................................*........................................ + // mls v10.8h, v25.8h, v7.h[0] // ..................................................~................................'.................................................*................................ + // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................~............................................'.....................................*............................................ + // srshr v25.8h, v25.8h, #11 // ................................................~..................................'...............................................*.................................. 
+ // mls v9.8h, v25.8h, v7.h[0] // ........................................................~..........................'.......................................................*.......................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................................~...........................................'......................................*........................................... + // srshr v25.8h, v25.8h, #11 // .................................................~.................................'................................................*................................. + // mls v11.8h, v25.8h, v7.h[0] // .........................................................~.........................'........................................................*......................... + // sub v24.8h, v8.8h, v10.8h // ...........................................................~.......................'..........................................................*....................... + // add v8.8h, v8.8h, v10.8h // .........................................................................~.........'........................................................................*......... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..............................................................~....................'.............................................................*.................... + // mul v10.8h, v24.8h, v0.h[0] // ...............................................................~...................'..............................................................*................... + // mls v10.8h, v27.8h, v7.h[0] // ..........................................................................~........'.........................................................................*........ 
+ // sub v24.8h, v9.8h, v11.8h // .............................................................~.....................'............................................................*..................... + // add v9.8h, v9.8h, v11.8h // ..................................................................~................'.................................................................*................ + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ......................................................................~............'.....................................................................*............ + // mul v11.8h, v24.8h, v0.h[0] // .....................................................................~.............'....................................................................*............. + // mls v11.8h, v27.8h, v7.h[0] // ............................................................................~......'...........................................................................*...... + // str q8, [x1], #(64) // ..............................................................................~....'.............................................................................*.... + // str q9, [x1, #(-64 + 16*1)] // ........................................................................~..........'.......................................................................*.......... + // str q10, [x1, #(-64 + 16*2)] // ...............................................................................~...'..............................................................................*... + // str q11, [x1, #(-64 + 16*3)] // ..................................................................................~'.................................................................................* sub count, count, #1 cbnz count, layer4567_start - trn2 v1.2D, v8.2D, v0.2D // .*...................................... 
- trn1 v14.2D, v8.2D, v0.2D // *....................................... - trn2 v17.2D, v23.2D, v24.2D // ...*.................................... - trn1 v0.2D, v23.2D, v24.2D // ..*..................................... + // Instructions: 40 + // Expected cycles: 28 + // Expected IPC: 1.43 + // + // Cycle bound: 28.0 + // IPC bound: 1.43 + // + // Wall time: 0.64s + // User time: 0.64s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + trn1 v15.2D, v16.2D, v0.2D // ..*..................................... + trn2 v10.2D, v16.2D, v0.2D // ...*.................................... + trn1 v14.2D, v26.2D, v17.2D // *....................................... + trn2 v27.2D, v26.2D, v17.2D // .*...................................... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -938,15 +966,15 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - sub v12.8H, v1.8H, v17.8H // ....*................................... - sub v9.8H, v14.8H, v0.8H // ......*................................. + sub v28.8H, v15.8H, v14.8H // ....*................................... + add v4.8H, v15.8H, v14.8H // ......*................................. + sub v15.8H, v10.8H, v27.8H // .....*.................................. // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - add v16.8H, v1.8H, v17.8H // .....*.................................. // gap // ........................................ // gap // ........................................ 
// gap // ........................................ @@ -954,16 +982,16 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - sqrdmulh v11.8H, v12.8H, v5.H[5] // ..........*............................. - mul v19.8H, v12.8H, v5.H[4] // ...........*............................ - sqrdmulh v25.8H, v9.8H, v5.H[3] // ........*............................... - mul v30.8H, v9.8H, v5.H[2] // .........*.............................. + mul v21.8H, v28.8H, v6.H[2] // .......*................................ + sqrdmulh v16.8H, v28.8H, v6.H[3] // ........*............................... + mul v28.8H, v15.8H, v6.H[4] // .........*.............................. + sqrdmulh v18.8H, v15.8H, v6.H[5] // ..........*............................. // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - add v12.8H, v14.8H, v0.8H // .......*................................ - sqdmulh v2.8H, v16.8H, v7.H[1] // ................*....................... + sqdmulh v26.8H, v4.8H, v7.H[1] // ..............*......................... + add v15.8H, v10.8H, v27.8H // ...........*............................ // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -978,15 +1006,15 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v19.8H, v11.8H, v7.H[0] // ..............*......................... - mls v30.8H, v25.8H, v7.H[0] // .............*.......................... - sqdmulh v25.8H, v12.8H, v7.H[1] // ............*........................... 
+ mls v21.8H, v16.8H, v7.H[0] // .............*.......................... + sqdmulh v16.8H, v15.8H, v7.H[1] // ...............*........................ + mls v28.8H, v18.8H, v7.H[0] // ............*........................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - srshr v15.8H, v2.8H, #11 // .....................*.................. + srshr v20.8H, v26.8H, #11 // ..................*..................... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1002,15 +1030,15 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - srshr v31.8H, v25.8H, #11 // ...............*........................ - sqdmulh v22.8H, v19.8H, v7.H[1] // .................*...................... - sqdmulh v29.8H, v30.8H, v7.H[1] // ..................*..................... + sqdmulh v14.8H, v21.8H, v7.H[1] // ................*....................... + srshr v16.8H, v16.8H, #11 // ...................*.................... + sqdmulh v27.8H, v28.8H, v7.H[1] // .................*...................... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v16.8H, v15.8H, v7.H[0] // ........................*............... + mls v4.8H, v20.8H, v7.H[0] // .......................*................ // gap // ........................................ // gap // ........................................ // gap // ........................................ 
@@ -1026,9 +1054,9 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v12.8H, v31.8H, v7.H[0] // ...................*.................... - srshr v25.8H, v22.8H, #11 // ......................*................. - srshr v20.8H, v29.8H, #11 // ....................*................... + mls v15.8H, v16.8H, v7.H[0] // ......................*................. + srshr v16.8H, v14.8H, #11 // ....................*................... + srshr v9.8H, v27.8H, #11 // .....................*.................. // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1050,15 +1078,15 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v19.8H, v25.8H, v7.H[0] // .......................*................ - mls v30.8H, v20.8H, v7.H[0] // .........................*.............. - add v3.8H, v12.8H, v16.8H // ...........................*............ + add v20.8H, v4.8H, v15.8H // ..................................*..... + sub v15.8H, v4.8H, v15.8H // ..........................*............. + mls v21.8H, v16.8H, v7.H[0] // ........................*............... + mls v28.8H, v9.8H, v7.H[0] // .........................*.............. // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - sub v2.8H, v12.8H, v16.8H // ............................*........... // gap // ........................................ // gap // ........................................ // gap // ........................................ 
@@ -1066,23 +1094,22 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - str q3, [x1], #(64) // ...................................*.... + str q20, [x1], #(64) // .....................................*.. + mul v27.8H, v15.8H, v6.H[0] // .............................*.......... + sqrdmulh v15.8H, v15.8H, v6.H[1] // ............................*........... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + sub v14.8H, v21.8H, v28.8H // ...........................*............ // gap // ........................................ // gap // ........................................ - sub v21.8H, v30.8H, v19.8H // .............................*.......... - add v12.8H, v30.8H, v19.8H // ..........................*............. - mul v8.8H, v2.8H, v5.H[0] // ..................................*..... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ - sqrdmulh v9.8H, v2.8H, v5.H[1] // .................................*...... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1090,10 +1117,10 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mul v31.8H, v21.8H, v5.H[0] // ...............................*........ - sqrdmulh v4.8H, v21.8H, v5.H[1] // ................................*....... 
- str q12, [x1, #-48] // ..............................*......... // gap // ........................................ + mls v27.8H, v15.8H, v7.H[0] // ...................................*.... + mul v16.8H, v14.8H, v6.H[0] // ...............................*........ + sqrdmulh v15.8H, v14.8H, v6.H[1] // ................................*....... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1106,7 +1133,6 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v8.8H, v9.8H, v7.H[0] // .....................................*.. // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1114,15 +1140,17 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - mls v31.8H, v4.8H, v7.H[0] // ....................................*... // gap // ........................................ // gap // ........................................ + str q27, [x1, #-32] // ......................................*. + mls v16.8H, v15.8H, v7.H[0] // ....................................*... // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ // gap // ........................................ + add v15.8H, v21.8H, v28.8H // ..............................*......... // gap // ........................................ // gap // ........................................ // gap // ........................................ 
@@ -1130,7 +1158,6 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - str q8, [x1, #-32] // .......................................* // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1138,8 +1165,9 @@ layer4567_start: // gap // ........................................ // gap // ........................................ // gap // ........................................ - str q31, [x1, #-16] // ......................................*. // gap // ........................................ + str q16, [x1, #-16] // .......................................* + str q15, [x1, #-48] // .................................*...... // gap // ........................................ // gap // ........................................ // gap // ........................................ @@ -1147,47 +1175,49 @@ layer4567_start: // gap // ........................................ // gap // ........................................ - // original source code - // trn1 v18.2D, v8.2D, v0.2D // .*...................................... - // trn2 v11.2D, v8.2D, v0.2D // *....................................... - // trn1 v10.2D, v23.2D, v24.2D // ...*.................................... - // trn2 v24.2D, v23.2D, v24.2D // ..*..................................... - // sub v19.8H, v11.8H, v24.8H // ....*................................... - // add v22.8H, v11.8H, v24.8H // ......*................................. - // sub v15.8H, v18.8H, v10.8H // .....*.................................. - // add v10.8H, v18.8H, v10.8H // ...........*............................ - // sqrdmulh v23.8H, v15.8H, v5.H[3] // .........*.............................. - // mul v6.8H, v15.8H, v5.H[2] // ..........*............................. 
- // sqrdmulh v9.8H, v19.8H, v5.H[5] // .......*................................ - // mul v24.8H, v19.8H, v5.H[4] // ........*............................... - // sqdmulh v0.8H, v10.8H, v7.H[1] // ...............*........................ - // mls v6.8H, v23.8H, v7.H[0] // ..............*......................... - // mls v24.8H, v9.8H, v7.H[0] // .............*.......................... - // srshr v16.8H, v0.8H, #11 // .................*...................... - // sqdmulh v0.8H, v22.8H, v7.H[1] // ............*........................... - // sqdmulh v29.8H, v24.8H, v7.H[1] // ..................*..................... - // sqdmulh v30.8H, v6.8H, v7.H[1] // ...................*.................... - // mls v10.8H, v16.8H, v7.H[0] // .....................*.................. - // srshr v23.8H, v30.8H, #11 // .......................*................ - // srshr v26.8H, v0.8H, #11 // ................*....................... - // srshr v0.8H, v29.8H, #11 // ......................*................. - // mls v24.8H, v0.8H, v7.H[0] // ........................*............... - // mls v22.8H, v26.8H, v7.H[0] // ....................*................... - // mls v6.8H, v23.8H, v7.H[0] // .........................*.............. - // add v16.8H, v6.8H, v24.8H // ..............................*......... - // add v2.8H, v10.8H, v22.8H // ..........................*............. - // sub v10.8H, v10.8H, v22.8H // ...........................*............ - // sub v24.8H, v6.8H, v24.8H // .............................*.......... - // str q16, [x1, #16] // ...................................*.... - // mul v22.8H, v24.8H, v5.H[0] // .................................*...... - // sqrdmulh v11.8H, v24.8H, v5.H[1] // ..................................*..... - // sqrdmulh v0.8H, v10.8H, v5.H[1] // ................................*....... - // mul v10.8H, v10.8H, v5.H[0] // ...............................*........ - // str q2, [x1], #(64) // ............................*........... 
- // mls v22.8H, v11.8H, v7.H[0] // .....................................*.. - // mls v10.8H, v0.8H, v7.H[0] // ....................................*... - // str q22, [x1, #-16] // .......................................* - // str q10, [x1, #-32] // ......................................*. + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // trn1 v22.2D, v26.2D, v17.2D // ..*..................................... + // trn2 v4.2D, v26.2D, v17.2D // ...*.................................... + // trn1 v14.2D, v16.2D, v0.2D // *....................................... + // trn2 v0.2D, v16.2D, v0.2D // .*...................................... + // sub v27.8H, v14.8H, v22.8H // ....*................................... + // sub v16.8H, v0.8H, v4.8H // ......*................................. + // add v21.8H, v14.8H, v22.8H // .....*.................................. + // mul v31.8H, v27.8H, v6.H[2] // .......*................................ + // sqrdmulh v27.8H, v27.8H, v6.H[3] // ........*............................... + // mul v26.8H, v16.8H, v6.H[4] // .........*.............................. + // sqrdmulh v16.8H, v16.8H, v6.H[5] // ..........*............................. + // add v0.8H, v0.8H, v4.8H // ............*........................... + // mls v26.8H, v16.8H, v7.H[0] // ...............*........................ + // mls v31.8H, v27.8H, v7.H[0] // .............*.......................... + // sqdmulh v27.8H, v21.8H, v7.H[1] // ...........*............................ + // sqdmulh v16.8H, v0.8H, v7.H[1] // ..............*......................... + // sqdmulh v4.8H, v31.8H, v7.H[1] // .................*...................... + // sqdmulh v10.8H, v26.8H, v7.H[1] // ...................*.................... + // srshr v11.8H, v27.8H, #11 // ................*....................... + // srshr v14.8H, v16.8H, #11 // ..................*..................... 
+ // srshr v16.8H, v4.8H, #11 // ......................*................. + // srshr v27.8H, v10.8H, #11 // .......................*................ + // mls v0.8H, v14.8H, v7.H[0] // .....................*.................. + // mls v21.8H, v11.8H, v7.H[0] // ....................*................... + // mls v31.8H, v16.8H, v7.H[0] // ..........................*............. + // mls v26.8H, v27.8H, v7.H[0] // ...........................*............ + // sub v28.8H, v21.8H, v0.8H // .........................*.............. + // sub v16.8H, v31.8H, v26.8H // ...............................*........ + // sqrdmulh v11.8H, v28.8H, v6.H[1] // ..............................*......... + // mul v14.8H, v28.8H, v6.H[0] // .............................*.......... + // add v15.8H, v31.8H, v26.8H // .....................................*.. + // mul v27.8H, v16.8H, v6.H[0] // .................................*...... + // sqrdmulh v16.8H, v16.8H, v6.H[1] // ..................................*..... + // str q15, [x1, #16] // .......................................* + // add v15.8H, v21.8H, v0.8H // ........................*............... + // mls v14.8H, v11.8H, v7.H[0] // ................................*....... + // mls v27.8H, v16.8H, v7.H[0] // ....................................*... + // str q15, [x1], #(64) // ............................*........... + // str q14, [x1, #-32] // ...................................*.... + // str q27, [x1, #-16] // ......................................*. // --------------------------------------------------------------------- @@ -1206,570 +1236,639 @@ layer4567_start: .p2align 2 - ldr q23, [x0, #128] // ..*................................................. - ldr q19, [x0, #192] // .*.................................................. - ldr q22, [x0, #0] // *................................................... - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - ldr q28, [x0, #64] // ....*............................................... - ldr q27, [x0, #384] // ...*................................................ - ldr q24, [x0, #448] // .....*.............................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - ldr q3, [x0, #320] // ......*............................................. - ldr q26, [x0, #256] // .......*............................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v20.8H, v23.8H, v19.8H // .........*.......................................... - add v19.8H, v23.8H, v19.8H // ........*........................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v23.8H, v22.8H, v28.8H // ..........*......................................... - add v22.8H, v22.8H, v28.8H // ...........*........................................ - sub v28.8H, v27.8H, v24.8H // .............*...................................... - add v27.8H, v27.8H, v24.8H // ............*....................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v24.8H, v26.8H, v3.8H // ...............*.................................... - add v3.8H, v26.8H, v3.8H // .................*.................................. - mul v26.8H, v20.8H, v1.H[0] // ..............*..................................... - sqrdmulh v20.8H, v20.8H, v1.H[1] // ...................*................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v11.8H, v23.8H, v0.H[7] // ....................*............................... - mul v23.8H, v23.8H, v0.H[6] // .....................*.............................. - sqrdmulh v14.8H, v28.8H, v1.H[5] // ................*................................... - mul v28.8H, v28.8H, v1.H[4] // ..................*................................. 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v25.8H, v24.8H, v1.H[3] // ......................*............................. - mul v24.8H, v24.8H, v1.H[2] // .......................*............................ - sub v5.8H, v3.8H, v27.8H // ........................*........................... - sub v10.8H, v22.8H, v19.8H // .........................*.......................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v19.8H, v22.8H, v19.8H // ............................*....................... - add v22.8H, v3.8H, v27.8H // .............................*...................... - mls v26.8H, v20.8H, v7.H[0] // ...........................*........................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v23.8H, v11.8H, v7.H[0] // ...............................*.................... - mls v28.8H, v14.8H, v7.H[0] // ..........................*......................... - sqrdmulh v3.8H, v5.8H, v0.H[5] // ................................*................... - sqrdmulh v20.8H, v10.8H, v0.H[3] // .................................*.................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - mls v24.8H, v25.8H, v7.H[0] // ..............................*..................... - mul v13.8H, v5.8H, v0.H[4] // ...................................*................ - mul v16.8H, v10.8H, v0.H[2] // ..................................*................. - add v27.8H, v19.8H, v22.8H // ....................................*............... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v25.8H, v19.8H, v22.8H // ...........................................*........ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v11.8H, v23.8H, v26.8H // ......................................*............. - add v15.8H, v23.8H, v26.8H // ........................................*........... - sqrdmulh v22.8H, v27.8H, v30.8H // .................................................*.. - mul v19.8H, v27.8H, v29.8H // ..................................................*. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v23.8H, v24.8H, v28.8H // .......................................*............ - add v27.8H, v24.8H, v28.8H // .....................................*.............. 
- mls v13.8H, v3.8H, v7.H[0] // .........................................*.......... - mls v16.8H, v20.8H, v7.H[0] // ..........................................*......... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mul v4.8H, v11.8H, v0.H[2] // .............................................*...... - sqrdmulh v20.8H, v11.8H, v0.H[3] // ..............................................*..... - mul v5.8H, v25.8H, v0.H[0] // ................................................*... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mul v10.8H, v23.8H, v0.H[4] // ............................................*....... - sqrdmulh v3.8H, v23.8H, v0.H[5] // ...............................................*.... - add v23.8H, v15.8H, v27.8H // ...................................................* - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - - // original source code - // ldr q21, [x0, #0] // ..*................................................. - // ldr q9, [x0, #192] // .*.................................................. - // ldr q16, [x0, #128] // *................................................... - // ldr q15, [x0, #384] // ....*............................................... 
- // ldr q8, [x0, #64] // ...*................................................ - // ldr q25, [x0, #448] // .....*.............................................. - // ldr q18, [x0, #320] // ......*............................................. - // ldr q6, [x0, #256] // .......*............................................ - // add v13.8H, v16.8H, v9.8H // .........*.......................................... - // sub v16.8H, v16.8H, v9.8H // ........*........................................... - // sub v2.8H, v21.8H, v8.8H // ..........*......................................... - // add v17.8H, v21.8H, v8.8H // ...........*........................................ - // add v11.8H, v15.8H, v25.8H // .............*...................................... - // sub v27.8H, v15.8H, v25.8H // ............*....................................... - // mul v15.8H, v16.8H, v1.H[0] // ................*................................... - // sub v9.8H, v6.8H, v18.8H // ..............*..................................... - // sqrdmulh v28.8H, v27.8H, v1.H[5] // ....................*............................... - // add v18.8H, v6.8H, v18.8H // ...............*.................................... - // mul v6.8H, v27.8H, v1.H[4] // .....................*.............................. - // sqrdmulh v5.8H, v16.8H, v1.H[1] // .................*.................................. - // sqrdmulh v16.8H, v2.8H, v0.H[7] // ..................*................................. - // mul v23.8H, v2.8H, v0.H[6] // ...................*................................ - // sqrdmulh v27.8H, v9.8H, v1.H[3] // ......................*............................. - // mul v2.8H, v9.8H, v1.H[2] // .......................*............................ - // sub v25.8H, v18.8H, v11.8H // ........................*........................... - // sub v9.8H, v17.8H, v13.8H // .........................*.......................... 
- // mls v6.8H, v28.8H, v7.H[0] // ..............................*..................... - // mls v15.8H, v5.8H, v7.H[0] // ............................*....................... - // add v5.8H, v17.8H, v13.8H // ..........................*......................... - // add v4.8H, v18.8H, v11.8H // ...........................*........................ - // mls v2.8H, v27.8H, v7.H[0] // .................................*.................. - // mls v23.8H, v16.8H, v7.H[0] // .............................*...................... - // sqrdmulh v11.8H, v25.8H, v0.H[5] // ...............................*.................... - // sqrdmulh v19.8H, v9.8H, v0.H[3] // ................................*................... - // mul v16.8H, v9.8H, v0.H[2] // ...................................*................ - // mul v13.8H, v25.8H, v0.H[4] // ..................................*................. - // add v24.8H, v5.8H, v4.8H // ....................................*............... - // add v27.8H, v2.8H, v6.8H // ...........................................*........ - // sub v14.8H, v23.8H, v15.8H // ......................................*............. - // sub v28.8H, v2.8H, v6.8H // ..........................................*......... - // add v15.8H, v23.8H, v15.8H // .......................................*............ - // mls v13.8H, v11.8H, v7.H[0] // ............................................*....... - // mls v16.8H, v19.8H, v7.H[0] // .............................................*...... - // sub v25.8H, v5.8H, v4.8H // .....................................*.............. - // mul v10.8H, v28.8H, v0.H[4] // .................................................*.. - // mul v4.8H, v14.8H, v0.H[2] // ..............................................*..... - // sqrdmulh v20.8H, v14.8H, v0.H[3] // ...............................................*.... - // sqrdmulh v3.8H, v28.8H, v0.H[5] // ..................................................*. 
- // mul v5.8H, v25.8H, v0.H[0] // ................................................*... - // sqrdmulh v22.8H, v24.8H, v30.8H // ........................................*........... - // mul v19.8H, v24.8H, v29.8H // .........................................*.......... - // add v23.8H, v15.8H, v27.8H // ...................................................* + // Instructions: 43 + // Expected cycles: 17 + // Expected IPC: 2.53 + // + // Cycle bound: 17.0 + // IPC bound: 2.53 + // + // Wall time: 0.89s + // User time: 0.89s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + ldr q16, [x0, #320] // ..*........................................ + ldr q15, [x0, #256] // .*......................................... + ldr q27, [x0, #192] // *.......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q14, [x0, #128] // .....*..................................... + ldr q6, [x0, #0] // ...*....................................... + ldr q26, [x0, #64] // ....*...................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q11, [x0, #384] // ......*.................................... + ldr q13, [x0, #448] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v31.8H, v15.8H, v16.8H // ........*.................................. + add v15.8H, v15.8H, v16.8H // .........*................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v16.8H, v14.8H, v27.8H // ...........*............................... + add v27.8H, v14.8H, v27.8H // ....................*...................... + sub v14.8H, v6.8H, v26.8H // ..........*................................ + add v6.8H, v6.8H, v26.8H // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v26.8H, v11.8H, v13.8H // .................*......................... + sub v11.8H, v11.8H, v13.8H // ...............*........................... + sqrdmulh v13.8H, v31.8H, v1.H[3] // ............*.............................. + mul v31.8H, v31.8H, v1.H[2] // .............*............................. + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + sqrdmulh v22.8H, v16.8H, v1.H[1] // ................*.......................... + mul v16.8H, v16.8H, v1.H[0] // ...................*....................... + mul v3.8H, v14.8H, v0.H[6] // ..............*............................ + sqrdmulh v14.8H, v14.8H, v0.H[7] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v25.8H, v15.8H, v26.8H // ...........................*............... + add v5.8H, v15.8H, v26.8H // .................................*......... + sqrdmulh v26.8H, v11.8H, v1.H[5] // .....................*..................... + mul v11.8H, v11.8H, v1.H[4] // ......................*.................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v15.8H, v6.8H, v27.8H // ..................................*........ + add v12.8H, v6.8H, v27.8H // ....................................*...... + mls v31.8H, v13.8H, v7.H[0] // .........................*................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v16.8H, v22.8H, v7.H[0] // .......................*................... + mls v3.8H, v14.8H, v7.H[0] // ........................*.................. + mul v8.8H, v25.8H, v0.H[4] // ...................................*....... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v11.8H, v26.8H, v7.H[0] // ..........................*................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v27.8H, v3.8H, v16.8H // ..............................*............ + add v16.8H, v3.8H, v16.8H // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v14.8H, v31.8H, v11.8H // ...............................*........... + sub v6.8H, v31.8H, v11.8H // ................................*.......... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v26.8H, v27.8H, v0.H[2] // .....................................*..... + sqrdmulh v11.8H, v27.8H, v0.H[3] // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v13.8H, v16.8H, v14.8H // .......................................*... + sub v17.8H, v16.8H, v14.8H // ..........................................* + mul v9.8H, v6.8H, v0.H[4] // ........................................*.. + sqrdmulh v19.8H, v6.8H, v0.H[5] // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // ldr q14, [x0, #192] // ..*........................................ + // ldr q4, [x0, #256] // .*......................................... + // ldr q25, [x0, #320] // *.......................................... + // ldr q3, [x0, #0] // ....*...................................... + // ldr q20, [x0, #64] // .....*..................................... + // ldr q31, [x0, #128] // ...*....................................... + // ldr q18, [x0, #384] // ......*.................................... + // ldr q12, [x0, #448] // .......*................................... + // sub v19.8H, v4.8H, v25.8H // ........*.................................. 
+ // add v16.8H, v4.8H, v25.8H // .........*................................. + // sub v13.8H, v3.8H, v20.8H // ............*.............................. + // sub v9.8H, v31.8H, v14.8H // ..........*................................ + // sqrdmulh v6.8H, v19.8H, v1.H[3] // ................*.......................... + // mul v21.8H, v19.8H, v1.H[2] // .................*......................... + // mul v22.8H, v13.8H, v0.H[6] // ....................*...................... + // sub v4.8H, v18.8H, v12.8H // ...............*........................... + // sqrdmulh v19.8H, v9.8H, v1.H[1] // ..................*........................ + // add v27.8H, v18.8H, v12.8H // ..............*............................ + // sqrdmulh v28.8H, v13.8H, v0.H[7] // .....................*..................... + // mul v9.8H, v9.8H, v1.H[0] // ...................*....................... + // add v12.8H, v31.8H, v14.8H // ...........*............................... + // sqrdmulh v18.8H, v4.8H, v1.H[5] // ........................*.................. + // mul v4.8H, v4.8H, v1.H[4] // .........................*................. + // mls v9.8H, v19.8H, v7.H[0] // .............................*............. + // mls v22.8H, v28.8H, v7.H[0] // ..............................*............ + // mls v21.8H, v6.8H, v7.H[0] // ............................*.............. + // mls v4.8H, v18.8H, v7.H[0] // ................................*.......... + // sub v25.8H, v16.8H, v27.8H // ......................*.................... + // add v6.8H, v3.8H, v20.8H // .............*............................. + // add v18.8H, v22.8H, v9.8H // ..................................*........ + // sub v3.8H, v22.8H, v9.8H // .................................*......... + // add v23.8H, v21.8H, v4.8H // ...................................*....... + // sub v22.8H, v21.8H, v4.8H // ....................................*...... + // add v5.8H, v16.8H, v27.8H // .......................*................... 
+ // sub v15.8H, v6.8H, v12.8H // ..........................*................ + // mul v8.8H, v25.8H, v0.H[4] // ...............................*........... + // add v12.8H, v6.8H, v12.8H // ...........................*............... + // mul v26.8H, v3.8H, v0.H[2] // .....................................*..... + // sqrdmulh v11.8H, v3.8H, v0.H[3] // ......................................*.... + // add v13.8H, v18.8H, v23.8H // .......................................*... + // mul v9.8H, v22.8H, v0.H[4] // .........................................*. + // sqrdmulh v19.8H, v22.8H, v0.H[5] // ..........................................* + // sub v17.8H, v18.8H, v23.8H // ........................................*.. sub count, count, #1 layer123_start: - sub v26.8H, v16.8H, v13.8H // ..........................................................*............................. + // Instructions: 88 + // Expected cycles: 18 + // Expected IPC: 4.89 + // + // Cycle bound: 18.0 + // IPC bound: 4.89 + // + // Wall time: 135.65s + // User time: 135.65s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + sqrdmulh v16.8H, v25.8H, v0.H[5] // ........................................*............................................... + sub v6.8H, v12.8H, v5.8H // ................................................*....................................... + sqrdmulh v2.8H, v15.8H, v0.H[3] // ..............................*......................................................... // gap // ........................................................................................ - ldr q21, [x0, #16] // e....................................................................................... - ldr q9, [x0, #208] // ...e.................................................................................... 
- sqrdmulh v11.8H, v25.8H, v0.H[1] // ...................................................*.................................... - sub v27.8H, v15.8H, v27.8H // .....................................................*.................................. - add v28.8H, v16.8H, v13.8H // ...........................................................*............................ - ldr q16, [x0, #144] // ..e..................................................................................... - ldr q15, [x0, #400] // ......e................................................................................. + ldr q14, [x0, #208] // ...e.................................................................................... + ldr q4, [x0, #272] // ....e................................................................................... + ldr q25, [x0, #336] // .....e.................................................................................. + mul v15.8H, v15.8H, v0.H[2] // ...............................*........................................................ + mls v26.8H, v11.8H, v7.H[0] // .....................................*.................................................. // gap // ........................................................................................ - ldr q8, [x0, #80] // .e...................................................................................... - ldr q25, [x0, #464] // .......e................................................................................ - mls v4.8H, v20.8H, v7.H[0] // .....................................*.................................................. - mls v10.8H, v3.8H, v7.H[0] // ...............................................*........................................ - mul v31.8H, v23.8H, v29.8H // ...........................................................................*............ - sqrdmulh v17.8H, v23.8H, v30.8H // ............................................................................*........... 
- sqrdmulh v14.8H, v26.8H, v0.H[1] // .............................................................*.......................... - mul v12.8H, v26.8H, v0.H[0] // ............................................................*........................... - mul v26.8H, v27.8H, v0.H[0] // .......................................................*................................ + ldr q3, [x0, #16] // e....................................................................................... + sqrdmulh v27.8H, v17.8H, v0.H[1] // .......................................................*................................ + add v23.8H, v12.8H, v5.8H // .................................................*...................................... + ldr q20, [x0, #80] // .e...................................................................................... + ldr q31, [x0, #144] // ..e..................................................................................... + mul v28.8H, v17.8H, v0.H[0] // ........................................................*............................... + mul v11.8H, v13.8H, v29.8H // ............................................................................*........... + ldr q18, [x0, #400] // ......e................................................................................. // gap // ........................................................................................ + sqrdmulh v5.8H, v6.8H, v0.H[1] // ..................................................*..................................... + mls v9.8H, v19.8H, v7.H[0] // ...............................................*........................................ + sqrdmulh v22.8H, v13.8H, v30.8H // ...........................................................................*............ // gap // ........................................................................................ - ldr q18, [x0, #336] // .....e.................................................................................. 
- ldr q6, [x0, #272] // ....e................................................................................... - sqrdmulh v20.8H, v27.8H, v0.H[1] // ........................................................*............................... // gap // ........................................................................................ - mls v19.8H, v22.8H, v7.H[0] // ..........................................................................*............. // gap // ........................................................................................ // gap // ........................................................................................ + mls v8.8H, v16.8H, v7.H[0] // ..........................................*............................................. + ldr q12, [x0, #464] // .......e................................................................................ + sqrdmulh v10.8H, v23.8H, v30.8H // ........................................................................*............... + mul v17.8H, v23.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ - mul v3.8H, v28.8H, v29.8H // ..............................................................................*......... - sqrdmulh v24.8H, v28.8H, v30.8H // ...............................................................................*........ - mls v5.8H, v11.8H, v7.H[0] // ....................................................*................................... - add v13.8H, v16.8H, v9.8H // ..............e......................................................................... - mls v31.8H, v17.8H, v7.H[0] // .............................................................................*.......... + mls v15.8H, v2.8H, v7.H[0] // ................................*....................................................... 
+ mls v28.8H, v27.8H, v7.H[0] // .........................................................*.............................. + mul v24.8H, v6.8H, v0.H[0] // ...................................................*.................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v16.8H, v16.8H, v9.8H // .............e.......................................................................... - add v23.8H, v4.8H, v10.8H // ................................................................*....................... - sub v2.8H, v21.8H, v8.8H // ........e............................................................................... - add v17.8H, v21.8H, v8.8H // .........e.............................................................................. - add v11.8H, v15.8H, v25.8H // ........................e............................................................... + sub v19.8H, v4.8H, v25.8H // ..................e..................................................................... + add v16.8H, v4.8H, v25.8H // ...................e.................................................................... + add v2.8H, v26.8H, v9.8H // ................................................................*....................... + sub v26.8H, v26.8H, v9.8H // ...............................................................*........................ // gap // ........................................................................................ - sub v27.8H, v15.8H, v25.8H // .......................e................................................................ 
// gap // ........................................................................................ // gap // ........................................................................................ + sub v13.8H, v3.8H, v20.8H // ........e............................................................................... + sub v9.8H, v31.8H, v14.8H // .............e.......................................................................... // gap // ........................................................................................ - mul v15.8H, v16.8H, v1.H[0] // ...............e........................................................................ - sub v9.8H, v6.8H, v18.8H // ..................e..................................................................... - str q19, [x0], #(16) // ....................................................................................*... + sub v23.8H, v15.8H, v8.8H // ..........................................................*............................. + sqrdmulh v6.8H, v19.8H, v1.H[3] // ....................e................................................................... + add v25.8H, v15.8H, v8.8H // ...........................................................*............................ // gap // ........................................................................................ // gap // ........................................................................................ - str q5, [x0, #240] // ....................................................................*................... - mul v22.8H, v23.8H, v29.8H // .................................................................................*...... - sqrdmulh v25.8H, v23.8H, v30.8H // ..................................................................................*..... - sqrdmulh v28.8H, v27.8H, v1.H[5] // ..........................e............................................................. 
- add v18.8H, v6.8H, v18.8H // ...................e.................................................................... - mul v6.8H, v27.8H, v1.H[4] // .........................e.............................................................. // gap // ........................................................................................ + mul v21.8H, v19.8H, v1.H[2] // .....................e.................................................................. // gap // ........................................................................................ + mls v11.8H, v22.8H, v7.H[0] // .............................................................................*.......... + mul v22.8H, v13.8H, v0.H[6] // ...........e............................................................................ + str q28, [x0, #320] // .....................................................................*.................. + sub v4.8H, v18.8H, v12.8H // .......................e................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v5.8H, v16.8H, v1.H[1] // ................e....................................................................... - sqrdmulh v16.8H, v2.8H, v0.H[7] // ...........e............................................................................ - mul v23.8H, v2.8H, v0.H[6] // ..........e............................................................................. // gap // ........................................................................................ + sqrdmulh v19.8H, v9.8H, v1.H[1] // ...............e........................................................................ + mul v8.8H, v2.8H, v29.8H // ..................................................................................*..... 
+ add v27.8H, v18.8H, v12.8H // ........................e............................................................... + sqrdmulh v28.8H, v13.8H, v0.H[7] // ..........e............................................................................. // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v27.8H, v9.8H, v1.H[3] // .....................e.................................................................. - mul v2.8H, v9.8H, v1.H[2] // ....................e................................................................... - mls v22.8H, v25.8H, v7.H[0] // ...................................................................................*.... - sub v25.8H, v18.8H, v11.8H // ......................................e................................................. - sub v9.8H, v17.8H, v13.8H // ............................e........................................................... - sub v19.8H, v4.8H, v10.8H // ...............................................................*........................ + mul v9.8H, v9.8H, v1.H[0] // ................e....................................................................... // gap // ........................................................................................ + add v12.8H, v31.8H, v14.8H // ..............e......................................................................... + sqrdmulh v18.8H, v4.8H, v1.H[5] // .........................e.............................................................. // gap // ........................................................................................ // gap // ........................................................................................ 
// gap // ........................................................................................ - mls v6.8H, v28.8H, v7.H[0] // ...........................e............................................................ - mls v15.8H, v5.8H, v7.H[0] // .................e...................................................................... - add v5.8H, v17.8H, v13.8H // .............................e.......................................................... - add v4.8H, v18.8H, v11.8H // .......................................e................................................ + mul v4.8H, v4.8H, v1.H[4] // ..........................e............................................................. // gap // ........................................................................................ + sqrdmulh v14.8H, v2.8H, v30.8H // .................................................................................*...... + mul v2.8H, v23.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ + str q11, [x0, #64] // .....................................................................................*.. + sqrdmulh v15.8H, v25.8H, v30.8H // ..............................................................................*......... // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v28.8H, v19.8H, v0.H[1] // ..................................................................*..................... - mul v18.8H, v19.8H, v0.H[0] // .................................................................*...................... - mls v2.8H, v27.8H, v7.H[0] // ......................e................................................................. 
- mls v23.8H, v16.8H, v7.H[0] // ............e........................................................................... + sqrdmulh v13.8H, v26.8H, v0.H[1] // .................................................................*...................... + mul v31.8H, v25.8H, v29.8H // ...............................................................................*........ // gap // ........................................................................................ // gap // ........................................................................................ + mls v9.8H, v19.8H, v7.H[0] // .................e...................................................................... + sqrdmulh v19.8H, v23.8H, v0.H[1] // ............................................................*........................... // gap // ........................................................................................ + mls v22.8H, v28.8H, v7.H[0] // ............e........................................................................... + mul v28.8H, v26.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ + mls v8.8H, v14.8H, v7.H[0] // ...................................................................................*.... + mls v17.8H, v10.8H, v7.H[0] // ..........................................................................*............. + mls v21.8H, v6.8H, v7.H[0] // ......................e................................................................. // gap // ........................................................................................ - sqrdmulh v11.8H, v25.8H, v0.H[5] // .........................................e.............................................. - sqrdmulh v19.8H, v9.8H, v0.H[3] // ...............................e........................................................ 
- mul v16.8H, v9.8H, v0.H[2] // ..............................e......................................................... // gap // ........................................................................................ // gap // ........................................................................................ + mls v4.8H, v18.8H, v7.H[0] // ...........................e............................................................ // gap // ........................................................................................ - mul v13.8H, v25.8H, v0.H[4] // ........................................e............................................... - mls v3.8H, v24.8H, v7.H[0] // ................................................................................*....... - mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... - mls v26.8H, v20.8H, v7.H[0] // .........................................................*.............................. - add v24.8H, v5.8H, v4.8H // .................................................e...................................... + sub v25.8H, v16.8H, v27.8H // ......................................e................................................. + mls v31.8H, v15.8H, v7.H[0] // ................................................................................*....... + mls v24.8H, v5.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ 
- add v27.8H, v2.8H, v6.8H // ............................................e........................................... - mls v18.8H, v28.8H, v7.H[0] // ...................................................................*.................... + add v6.8H, v3.8H, v20.8H // .........e.............................................................................. + add v18.8H, v22.8H, v9.8H // ..................................e..................................................... + mls v28.8H, v13.8H, v7.H[0] // ...................................................................*.................... + mls v2.8H, v19.8H, v7.H[0] // ..............................................................*......................... // gap // ........................................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - str q22, [x0, #176] // .......................................................................................* - sub v14.8H, v23.8H, v15.8H // .................................e...................................................... - sub v28.8H, v2.8H, v6.8H // ...........................................e............................................ - add v15.8H, v23.8H, v15.8H // ..................................e..................................................... - mls v13.8H, v11.8H, v7.H[0] // ..........................................e............................................. - mls v16.8H, v19.8H, v7.H[0] // ................................e....................................................... - sub v25.8H, v5.8H, v4.8H // ................................................e....................................... // gap // ........................................................................................ 
+ sub v3.8H, v22.8H, v9.8H // .................................e...................................................... + str q8, [x0, #192] // .......................................................................................* + add v23.8H, v21.8H, v4.8H // ............................................e........................................... + sub v22.8H, v21.8H, v4.8H // ...........................................e............................................ + str q17, [x0], #(16) // ....................................................................................*... + add v5.8H, v16.8H, v27.8H // .......................................e................................................ // gap // ........................................................................................ // gap // ........................................................................................ - str q31, [x0, #48] // .....................................................................................*.. + sub v15.8H, v6.8H, v12.8H // ............................e........................................................... + mul v8.8H, v25.8H, v0.H[4] // .........................................e.............................................. // gap // ........................................................................................ - str q3, [x0, #112] // ......................................................................................*. - str q12, [x0, #368] // ......................................................................*................. - mul v10.8H, v28.8H, v0.H[4] // .............................................e.......................................... + str q24, [x0, #240] // ....................................................................*................... // gap // ........................................................................................ 
- mul v4.8H, v14.8H, v0.H[2] // ...................................e.................................................... - sqrdmulh v20.8H, v14.8H, v0.H[3] // ....................................e................................................... - sqrdmulh v3.8H, v28.8H, v0.H[5] // ..............................................e......................................... - str q18, [x0, #432] // .......................................................................*................ - mul v5.8H, v25.8H, v0.H[0] // ..................................................e..................................... - sqrdmulh v22.8H, v24.8H, v30.8H // .........................................................................e.............. - mul v19.8H, v24.8H, v29.8H // ........................................................................e............... - str q26, [x0, #304] // .....................................................................*.................. + add v12.8H, v6.8H, v12.8H // .............................e.......................................................... + str q31, [x0, #112] // ......................................................................................*. + mul v26.8H, v3.8H, v0.H[2] // ....................................e................................................... + sqrdmulh v11.8H, v3.8H, v0.H[3] // ...................................e.................................................... + str q28, [x0, #432] // .......................................................................*................ + str q2, [x0, #368] // ......................................................................*................. + add v13.8H, v18.8H, v23.8H // ......................................................e................................. + mul v9.8H, v22.8H, v0.H[4] // ..............................................e......................................... 
+ sqrdmulh v19.8H, v22.8H, v0.H[5] // .............................................e.......................................... // gap // ........................................................................................ // gap // ........................................................................................ - add v23.8H, v15.8H, v27.8H // ......................................................e................................. - - // original source code - // ldr q8, [x0, #0] // e......................................................................................|e..................................................................................... - // ldr q9, [x0, #(1*(512/8))] // .......e...............................................................................|.......e.............................................................................. - // ldr q10, [x0, #(2*(512/8))] // .....e.................................................................................|.....e................................................................................ - // ldr q11, [x0, #(3*(512/8))] // .e.....................................................................................|.e.................................................................................... - // ldr q12, [x0, #(4*(512/8))] // .................e.....................................................................|.................e.................................................................... - // ldr q13, [x0, #(5*(512/8))] // ................e......................................................................|................e..................................................................... - // ldr q14, [x0, #(6*(512/8))] // ......e................................................................................|......e............................................................................... 
- // ldr q15, [x0, #(7*(512/8))] // ........e..............................................................................|........e............................................................................. - // sub v24.8h, v8.8h, v9.8h // ...........................e...........................................................|...........................e.......................................................... - // add v8.8h, v8.8h, v9.8h // ............................e..........................................................|............................e......................................................... - // mul v9.8h, v24.8h, v0.h[6] // ..........................................e............................................|..........................................e........................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................................e.............................................|.........................................e............................................ - // mls v9.8h, v24.8h, v7.h[0] // ........................................................e..............................|........................................................e............................. - // sub v24.8h, v10.8h, v11.8h // .........................e.............................................................|.........................e............................................................ - // add v10.8h, v10.8h, v11.8h // .......................e...............................................................|.......................e.............................................................. - // mul v11.8h, v24.8h, v1.h[0] // ...............................e.......................................................|...............................e...................................................... 
- // sqrdmulh v24.8h, v24.8h, v1.h[1] // ........................................e..............................................|........................................e............................................. - // mls v11.8h, v24.8h, v7.h[0] // ..................................................e....................................|..................................................e................................... - // sub v24.8h, v12.8h, v13.8h // ................................e......................................................|................................e..................................................... - // add v12.8h, v12.8h, v13.8h // ......................................e................................................|......................................e............................................... - // mul v13.8h, v24.8h, v1.h[2] // ............................................e..........................................|............................................e......................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........................................e...........................................|...........................................e.......................................... - // mls v13.8h, v24.8h, v7.h[0] // .......................................................e...............................|.......................................................e.............................. - // sub v24.8h, v14.8h, v15.8h // ..............................e........................................................|..............................e....................................................... - // add v14.8h, v14.8h, v15.8h // .............................e.........................................................|.............................e........................................................ 
- // mul v15.8h, v24.8h, v1.h[4] // .......................................e...............................................|.......................................e.............................................. - // sqrdmulh v24.8h, v24.8h, v1.h[5] // .....................................e.................................................|.....................................e................................................ - // mls v15.8h, v24.8h, v7.h[0] // .................................................e.....................................|.................................................e.................................... - // sub v24.8h, v8.8h, v10.8h // ...............................................e.......................................|...............................................e...................................... - // add v8.8h, v8.8h, v10.8h // ...................................................e...................................|...................................................e.................................. - // mul v10.8h, v24.8h, v0.h[2] // ...........................................................e...........................|...........................................................e.......................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................................e............................|..........................................................e........................... - // mls v10.8h, v24.8h, v7.h[0] // ........................................................................e..............|........................................................................e............. - // sub v24.8h, v9.8h, v11.8h // ....................................................................e..................|....................................................................e................. 
- // add v9.8h, v9.8h, v11.8h // ......................................................................e................|......................................................................e............... - // mul v11.8h, v24.8h, v0.h[2] // ..............................................................................e........|..............................................................................e....... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................................e.......|...............................................................................e...... - // mls v11.8h, v24.8h, v7.h[0] // .........*.............................................................................|.........*............................................................................ - // sub v24.8h, v12.8h, v14.8h // ..............................................e........................................|..............................................e....................................... - // add v12.8h, v12.8h, v14.8h // ....................................................e..................................|....................................................e................................. - // mul v14.8h, v24.8h, v0.h[4] // ............................................................e..........................|............................................................e......................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .........................................................e.............................|.........................................................e............................ - // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e...............|.......................................................................e.............. 
- // sub v24.8h, v13.8h, v15.8h // .....................................................................e.................|.....................................................................e................ - // add v13.8h, v13.8h, v15.8h // .................................................................e.....................|.................................................................e.................... - // mul v15.8h, v24.8h, v0.h[4] // .............................................................................e.........|.............................................................................e........ - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................................e......|................................................................................e..... - // mls v15.8h, v24.8h, v7.h[0] // ..........*............................................................................|..........*........................................................................... - // sub v24.8h, v8.8h, v12.8h // .........................................................................e.............|.........................................................................e............ - // add v8.8h, v8.8h, v12.8h // ................................................................e......................|................................................................e..................... - // mul v12.8h, v24.8h, v0.h[0] // ..................................................................................e....|..................................................................................e... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*....................................................................................|..*................................................................................... 
- // mls v12.8h, v24.8h, v7.h[0] // ......................*................................................................|......................*............................................................... - // sub v24.8h, v9.8h, v13.8h // ...*...................................................................................|...*.................................................................................. - // add v9.8h, v9.8h, v13.8h // ......................................................................................e|...................................................................................... - // mul v13.8h, v24.8h, v0.h[0] // ...............*.......................................................................|...............*...................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................*....................................................................|..................*................................................................... - // mls v13.8h, v24.8h, v7.h[0] // ...............................................................*.......................|...............................................................*...................... - // sub v24.8h, v10.8h, v14.8h // .......................................................................................*...................................................................................... - // add v10.8h, v10.8h, v14.8h // ....*..................................................................................|....*................................................................................. - // mul v14.8h, v24.8h, v0.h[0] // ..............*........................................................................|..............*....................................................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............*.........................................................................|.............*........................................................................ - // mls v14.8h, v24.8h, v7.h[0] // ..............................................................*........................|..............................................................*....................... - // sub v24.8h, v11.8h, v15.8h // ................................................*......................................|................................................*..................................... - // add v11.8h, v11.8h, v15.8h // ..........................*............................................................|..........................*........................................................... - // mul v15.8h, v24.8h, v0.h[0] // ......................................................*................................|......................................................*............................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*.................................|.....................................................*................................ - // mls v15.8h, v24.8h, v7.h[0] // ..................................................................*....................|..................................................................*................... - // str q12, [x0, #(4*(512/8))] // ..................................*....................................................|..................................*................................................... 
- // str q13, [x0, #(5*(512/8))] // .....................................................................................*.|.....................................................................................* - // str q14, [x0, #(6*(512/8))] // ............................................................................*..........|............................................................................*......... - // str q15, [x0, #(7*(512/8))] // .................................................................................*.....|.................................................................................*.... - // mul v12.8h, v8.8h, v29.8h // ....................................................................................e..|....................................................................................e. - // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................................e...|...................................................................................e.. - // mls v12.8h, v8.8h, v7.h[0] // ...................*...................................................................|...................*.................................................................. - // mul v13.8h, v9.8h, v29.8h // ...........*...........................................................................|...........*.......................................................................... - // sqrdmulh v9.8h, v9.8h, v30.8h // ............*..........................................................................|............*......................................................................... - // mls v13.8h, v9.8h, v7.h[0] // ........................*..............................................................|........................*............................................................. 
- // mul v14.8h, v10.8h, v29.8h // ....................*..................................................................|....................*................................................................. - // sqrdmulh v10.8h, v10.8h, v30.8h // .....................*.................................................................|.....................*................................................................ - // mls v14.8h, v10.8h, v7.h[0] // .............................................................*.........................|.............................................................*........................ - // mul v15.8h, v11.8h, v29.8h // ...................................*...................................................|...................................*.................................................. - // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................*..................................................|....................................*................................................. - // mls v15.8h, v11.8h, v7.h[0] // .............................................*.........................................|.............................................*........................................ - // str q12, [x0], #(16) // .................................*.....................................................|.................................*.................................................... - // str q13, [x0, #(-16 + 1*(512/8))] // ..........................................................................*............|..........................................................................*........... - // str q14, [x0, #(-16 + 2*(512/8))] // ...........................................................................*...........|...........................................................................*.......... 
- // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................*...................|...................................................................*.................. + sub v17.8H, v18.8H, v23.8H // .....................................................e.................................. + + // ----------------------------------------------------------------------------- new position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q8, [x0, #0] // .....e...............................................................................'.......~........................................................................... + // ldr q9, [x0, #(1*(512/8))] // ........e............................................................................'..........~........................................................................ + // ldr q10, [x0, #(2*(512/8))] // .........e...........................................................................'...........~....................................................................... + // ldr q11, [x0, #(3*(512/8))] // e....................................................................................'..~................................................................................ + // ldr q12, [x0, #(4*(512/8))] // .e...................................................................................'...~............................................................................... + // ldr q13, [x0, #(5*(512/8))] // ..e..................................................................................'....~.............................................................................. 
+ // ldr q14, [x0, #(6*(512/8))] // ............e........................................................................'..............~.................................................................... + // ldr q15, [x0, #(7*(512/8))] // .................e...................................................................'...................~............................................................... + // sub v24.8h, v8.8h, v9.8h // ...........................e.........................................................'.............................~..................................................... + // add v8.8h, v8.8h, v9.8h // ..............................................................e......................'................................................................~.................. + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ........................................e............................................'..........................................~........................................ + // mul v9.8h, v24.8h, v0.h[6] // ..................................e..................................................'....................................~.............................................. + // mls v9.8h, v27.8h, v7.h[0] // .....................................................e...............................'.......................................................~........................... + // sub v24.8h, v10.8h, v11.8h // ............................e........................................................'..............................~.................................................... + // add v10.8h, v10.8h, v11.8h // ..........................................e..........................................'............................................~...................................... 
+ // sqrdmulh v27.8h, v24.8h, v1.h[1] // .....................................e...............................................'.......................................~........................................... + // mul v11.8h, v24.8h, v1.h[0] // .........................................e...........................................'...........................................~....................................... + // mls v11.8h, v27.8h, v7.h[0] // ...................................................e.................................'.....................................................~............................. + // sub v24.8h, v12.8h, v13.8h // .......................e.............................................................'.........................~......................................................... + // add v12.8h, v12.8h, v13.8h // ........................e............................................................'..........................~........................................................ + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ..............................e......................................................'................................~.................................................. + // mul v13.8h, v24.8h, v1.h[2] // ................................e....................................................'..................................~................................................ + // mls v13.8h, v27.8h, v7.h[0] // .........................................................e...........................'...........................................................~....................... + // sub v24.8h, v14.8h, v15.8h // ....................................e................................................'......................................~............................................ 
+ // add v14.8h, v14.8h, v15.8h // .......................................e.............................................'.........................................~......................................... + // sqrdmulh v27.8h, v24.8h, v1.h[5] // ...........................................e.........................................'.............................................~..................................... + // mul v15.8h, v24.8h, v1.h[4] // ............................................e........................................'..............................................~.................................... + // mls v15.8h, v27.8h, v7.h[0] // ..........................................................e..........................'............................................................~...................... + // sub v24.8h, v8.8h, v10.8h // ........................................................................e............'..........................................................................~........ + // add v8.8h, v8.8h, v10.8h // ...........................................................................e.........'.............................................................................~..... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .....................................................................................'.*................................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ...~.................................................................................'.....*............................................................................. + // mls v10.8h, v27.8h, v7.h[0] // ....................~................................................................'......................*............................................................ 
+ // sub v24.8h, v9.8h, v11.8h // ..................................................................e..................'....................................................................~.............. + // add v9.8h, v9.8h, v11.8h // ...............................................................e.....................'.................................................................~................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ..............................................................................e......'................................................................................~.. + // mul v11.8h, v24.8h, v0.h[2] // .............................................................................e.......'...............................................................................~... + // mls v11.8h, v27.8h, v7.h[0] // ....~................................................................................'......*............................................................................ + // sub v24.8h, v12.8h, v14.8h // ...........................................................e.........................'.............................................................~..................... + // add v12.8h, v12.8h, v14.8h // .......................................................................e.............'.........................................................................~......... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .....................................................................................*................................................................................... + // mul v14.8h, v24.8h, v0.h[4] // .........................................................................e...........'...........................................................................~....... 
+ // mls v14.8h, v27.8h, v7.h[0] // ................~....................................................................'..................*................................................................ + // sub v24.8h, v13.8h, v15.8h // .....................................................................e...............'.......................................................................~........... + // add v13.8h, v13.8h, v15.8h // ....................................................................e................'......................................................................~............ + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...................................................................................e.'................................................................................... + // mul v15.8h, v24.8h, v0.h[4] // ..................................................................................e..'................................................................................... + // mls v15.8h, v27.8h, v7.h[0] // ..............~......................................................................'................*.................................................................. + // sub v24.8h, v8.8h, v12.8h // .....................................................................................'*.................................................................................. + // add v8.8h, v8.8h, v12.8h // .......~.............................................................................'.........*......................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .............~.......................................................................'...............*................................................................... 
+ // mul v12.8h, v24.8h, v0.h[0] // ......................~..............................................................'........................*.......................................................... + // mls v12.8h, v27.8h, v7.h[0] // .............................................................~.......................'...............................................................*................... + // sub v24.8h, v9.8h, v13.8h // ....................................................................................e'................................................................................... + // add v9.8h, v9.8h, v13.8h // .................................................................................e...'................................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ......~..............................................................................'........*.......................................................................... + // mul v13.8h, v24.8h, v0.h[0] // ..........~..........................................................................'............*...................................................................... + // mls v13.8h, v27.8h, v7.h[0] // .....................~...............................................................'.......................*........................................................... + // sub v24.8h, v10.8h, v14.8h // .............................~.......................................................'...............................*................................................... + // add v10.8h, v10.8h, v14.8h // ...............................~.....................................................'.................................*................................................. 
+ // sqrdmulh v27.8h, v24.8h, v0.h[1] // ....................................................~................................'......................................................*............................ + // mul v14.8h, v24.8h, v0.h[0] // ..............................................~......................................'................................................*.................................. + // mls v14.8h, v27.8h, v7.h[0] // .................................................................~...................'...................................................................*............... + // sub v24.8h, v11.8h, v15.8h // ..........................~..........................................................'............................*...................................................... + // add v11.8h, v11.8h, v15.8h // .........................~...........................................................'...........................*....................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................................................~...................................'...................................................*............................... + // mul v15.8h, v24.8h, v0.h[0] // ......................................................~..............................'........................................................*.......................... + // mls v15.8h, v27.8h, v7.h[0] // ................................................................~....................'..................................................................*................ + // str q12, [x0, #(4*(512/8))] // ..........................................................................~..........'............................................................................*...... 
+ // str q13, [x0, #(5*(512/8))] // ...................................~.................................................'.....................................*............................................. + // str q14, [x0, #(6*(512/8))] // ................................................................................~....'..................................................................................* + // str q15, [x0, #(7*(512/8))] // ...............................................................................~.....'.................................................................................*. + // sqrdmulh v27.8h, v8.8h, v30.8h // ..................~..................................................................'....................*.............................................................. + // mul v8.8h, v8.8h, v29.8h // ...................~.................................................................'.....................*............................................................. + // mls v8.8h, v27.8h, v7.h[0] // ........................................................~............................'..........................................................*........................ + // sqrdmulh v27.8h, v9.8h, v30.8h // ...............~.....................................................................'.................*................................................................. + // mul v9.8h, v9.8h, v29.8h // ...........~.........................................................................'.............*..................................................................... + // mls v9.8h, v27.8h, v7.h[0] // .................................~...................................................'...................................*............................................... 
+ // sqrdmulh v27.8h, v10.8h, v30.8h // ................................................~....................................'..................................................*................................ + // mul v10.8h, v10.8h, v29.8h // ..................................................~..................................'....................................................*.............................. + // mls v10.8h, v27.8h, v7.h[0] // ............................................................~........................'..............................................................*.................... + // sqrdmulh v27.8h, v11.8h, v30.8h // .............................................~.......................................'...............................................*................................... + // mul v11.8h, v11.8h, v29.8h // ......................................~..............................................'........................................*.......................................... + // mls v11.8h, v27.8h, v7.h[0] // .......................................................~.............................'.........................................................*......................... + // str q8, [x0], #(16) // ......................................................................~..............'........................................................................*.......... + // str q9, [x0, #(-16 + 1*(512/8))] // ...............................................~.....................................'.................................................*................................. + // str q10, [x0, #(-16 + 2*(512/8))] // ............................................................................~........'..............................................................................*.... 
+ // str q11, [x0, #(-16 + 3*(512/8))] // ...................................................................~.................'.....................................................................*............. sub count, count, #1 cbnz count, layer123_start - mls v19.8H, v22.8H, v7.H[0] // ............*....................... - sub v24.8H, v15.8H, v27.8H // ..*................................. - mls v10.8H, v3.8H, v7.H[0] // .....*.............................. - mls v4.8H, v20.8H, v7.H[0] // ....*............................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v17.8H, v16.8H, v13.8H // ...*................................ - sqrdmulh v6.8H, v23.8H, v30.8H // .......*............................ - sub v21.8H, v16.8H, v13.8H // *................................... - mul v26.8H, v23.8H, v29.8H // ......*............................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sqrdmulh v16.8H, v25.8H, v0.H[1] // .*.................................. - mul v2.8H, v24.8H, v0.H[0] // ..........*......................... - sqrdmulh v20.8H, v24.8H, v0.H[1] // ...........*........................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q19, [x0], #(16) // ..................*................. - sqrdmulh v15.8H, v21.8H, v0.H[1] // ........*........................... - mul v31.8H, v21.8H, v0.H[0] // .........*.......................... - add v13.8H, v4.8H, v10.8H // .................*.................. 
- sub v4.8H, v4.8H, v10.8H // .......................*............ - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v26.8H, v6.8H, v7.H[0] // ................*................... - mul v27.8H, v17.8H, v29.8H // .............*...................... - sqrdmulh v17.8H, v17.8H, v30.8H // ..............*..................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v24.8H, v13.8H, v29.8H // ....................*............... - sqrdmulh v3.8H, v13.8H, v30.8H // .....................*.............. - sqrdmulh v25.8H, v4.8H, v0.H[1] // ........................*........... - mul v12.8H, v4.8H, v0.H[0] // .........................*.......... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v31.8H, v15.8H, v7.H[0] // ...........................*........ - mls v5.8H, v16.8H, v7.H[0] // ...............*.................... - mls v2.8H, v20.8H, v7.H[0] // ............................*....... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v27.8H, v17.8H, v7.H[0] // ..........................*......... - str q26, [x0, #48] // ...............................*.... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - mls v24.8H, v3.8H, v7.H[0] // ......................*............. - mls v12.8H, v25.8H, v7.H[0] // .............................*...... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q31, [x0, #368] // .................................*.. - str q5, [x0, #240] // ...................*................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q27, [x0, #112] // ................................*... - str q2, [x0, #304] // ...................................* - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q24, [x0, #176] // ..............................*..... - str q12, [x0, #432] // ..................................*. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - - // original source code - // sub v26.8H, v16.8H, v13.8H // ......*............................. - // sqrdmulh v11.8H, v25.8H, v0.H[1] // ........*........................... - // sub v27.8H, v15.8H, v27.8H // .*.................................. 
- // add v28.8H, v16.8H, v13.8H // ....*............................... - // mls v4.8H, v20.8H, v7.H[0] // ...*................................ - // mls v10.8H, v3.8H, v7.H[0] // ..*................................. - // mul v31.8H, v23.8H, v29.8H // .......*............................ - // sqrdmulh v17.8H, v23.8H, v30.8H // .....*.............................. - // sqrdmulh v14.8H, v26.8H, v0.H[1] // ............*....................... - // mul v12.8H, v26.8H, v0.H[0] // .............*...................... - // mul v26.8H, v27.8H, v0.H[0] // .........*.......................... - // sqrdmulh v20.8H, v27.8H, v0.H[1] // ..........*......................... - // mls v19.8H, v22.8H, v7.H[0] // *................................... - // mul v3.8H, v28.8H, v29.8H // .................*.................. - // sqrdmulh v24.8H, v28.8H, v30.8H // ..................*................. - // mls v5.8H, v11.8H, v7.H[0] // ........................*........... - // mls v31.8H, v17.8H, v7.H[0] // ................*................... - // add v23.8H, v4.8H, v10.8H // ..............*..................... - // str q19, [x0], #(16) // ...........*........................ - // str q5, [x0, #240] // ...............................*.... - // mul v22.8H, v23.8H, v29.8H // ...................*................ - // sqrdmulh v25.8H, v23.8H, v30.8H // ....................*............... - // mls v22.8H, v25.8H, v7.H[0] // ............................*....... - // sub v19.8H, v4.8H, v10.8H // ...............*.................... - // sqrdmulh v28.8H, v19.8H, v0.H[1] // .....................*.............. - // mul v18.8H, v19.8H, v0.H[0] // ......................*............. - // mls v3.8H, v24.8H, v7.H[0] // ..........................*......... - // mls v12.8H, v14.8H, v7.H[0] // .......................*............ - // mls v26.8H, v20.8H, v7.H[0] // .........................*.......... - // mls v18.8H, v28.8H, v7.H[0] // .............................*...... 
- // str q22, [x0, #176] // ..................................*. - // str q31, [x0, #48] // ...........................*........ - // str q3, [x0, #112] // ................................*... - // str q12, [x0, #368] // ..............................*..... - // str q18, [x0, #432] // ...................................* - // str q26, [x0, #304] // .................................*.. + // Instructions: 45 + // Expected cycles: 15 + // Expected IPC: 3.00 + // + // Cycle bound: 15.0 + // IPC bound: 3.00 + // + // Wall time: 0.91s + // User time: 0.91s + // + // ------------ original position -------------> + // 0 25 + // |------------------------|------------------- + sqrdmulh v6.8H, v15.8H, v0.H[3] // ..*.......................................... + sqrdmulh v3.8H, v25.8H, v0.H[5] // *............................................ + mls v26.8H, v11.8H, v7.H[0] // ....*........................................ + mul v11.8H, v15.8H, v0.H[2] // ...*......................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sqrdmulh v31.8H, v17.8H, v0.H[1] // .....*....................................... + add v2.8H, v12.8H, v5.8H // ......*...................................... + mul v18.8H, v17.8H, v0.H[0] // .......*..................................... + mls v9.8H, v19.8H, v7.H[0] // ..........*.................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sqrdmulh v16.8H, v13.8H, v30.8H // ...........*................................. + sub v27.8H, v12.8H, v5.8H // .*........................................... 
+ mul v14.8H, v13.8H, v29.8H // ........*.................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mul v4.8H, v2.8H, v29.8H // ..............*.............................. + sqrdmulh v20.8H, v2.8H, v30.8H // .............*............................... + mls v11.8H, v6.8H, v7.H[0] // ...............*............................. + mls v8.8H, v3.8H, v7.H[0] // ............*................................ + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v18.8H, v31.8H, v7.H[0] // ................*............................ + sub v24.8H, v26.8H, v9.8H // ...................*......................... + add v15.8H, v26.8H, v9.8H // ..................*.......................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v14.8H, v16.8H, v7.H[0] // ......................*...................... + sqrdmulh v28.8H, v27.8H, v0.H[1] // .........*................................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ mul v21.8H, v15.8H, v29.8H // ........................*.................... + sqrdmulh v13.8H, v15.8H, v30.8H // .........................*................... + add v17.8H, v11.8H, v8.8H // .....................*....................... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + sub v15.8H, v11.8H, v8.8H // ....................*........................ + mul v26.8H, v24.8H, v0.H[0] // ................................*............ + sqrdmulh v25.8H, v24.8H, v0.H[1] // .............................*............... + str q18, [x0, #320] // .......................*..................... + mul v16.8H, v27.8H, v0.H[0] // .................*........................... + mls v4.8H, v20.8H, v7.H[0] // ..................................*.......... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mul v6.8H, v15.8H, v0.H[0] // ..........................*.................. + str q14, [x0, #64] // ...........................*................. + mul v14.8H, v17.8H, v29.8H // ..............................*.............. + sqrdmulh v31.8H, v17.8H, v30.8H // ............................*................ + sqrdmulh v5.8H, v15.8H, v0.H[1] // ...............................*............. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v21.8H, v13.8H, v7.H[0] // .................................*........... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v26.8H, v25.8H, v7.H[0] // .....................................*....... + mls v16.8H, v28.8H, v7.H[0] // ....................................*........ + str q4, [x0], #(16) // ........................................*.... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + mls v14.8H, v31.8H, v7.H[0] // ...................................*......... + mls v6.8H, v5.8H, v7.H[0] // ......................................*...... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q21, [x0, #176] // .......................................*..... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q26, [x0, #432] // ...........................................*. + str q16, [x0, #240] // .........................................*... + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. 
+ // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + str q14, [x0, #112] // ..........................................*.. + str q6, [x0, #368] // ............................................* + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + // gap // ............................................. + + // --------------- new position ---------------> + // 0 25 + // |------------------------|------------------- + // sqrdmulh v16.8H, v25.8H, v0.H[5] // .*........................................... + // sub v6.8H, v12.8H, v5.8H // .........*................................... + // sqrdmulh v2.8H, v15.8H, v0.H[3] // *............................................ + // mul v15.8H, v15.8H, v0.H[2] // ...*......................................... + // mls v26.8H, v11.8H, v7.H[0] // ..*.......................................... + // sqrdmulh v27.8H, v17.8H, v0.H[1] // ....*........................................ + // add v23.8H, v12.8H, v5.8H // .....*....................................... + // mul v28.8H, v17.8H, v0.H[0] // ......*...................................... + // mul v11.8H, v13.8H, v29.8H // ..........*.................................. + // sqrdmulh v5.8H, v6.8H, v0.H[1] // ...................*......................... + // mls v9.8H, v19.8H, v7.H[0] // .......*..................................... + // sqrdmulh v22.8H, v13.8H, v30.8H // ........*.................................... + // mls v8.8H, v16.8H, v7.H[0] // ..............*.............................. + // sqrdmulh v10.8H, v23.8H, v30.8H // ............*................................ 
+ // mul v17.8H, v23.8H, v29.8H // ...........*................................. + // mls v15.8H, v2.8H, v7.H[0] // .............*............................... + // mls v28.8H, v27.8H, v7.H[0] // ...............*............................. + // mul v24.8H, v6.8H, v0.H[0] // ...........................*................. + // add v2.8H, v26.8H, v9.8H // .................*........................... + // sub v26.8H, v26.8H, v9.8H // ................*............................ + // sub v23.8H, v15.8H, v8.8H // .......................*..................... + // add v25.8H, v15.8H, v8.8H // ......................*...................... + // mls v11.8H, v22.8H, v7.H[0] // ..................*.......................... + // str q28, [x0, #320] // ..........................*.................. + // mul v8.8H, v2.8H, v29.8H // ....................*........................ + // sqrdmulh v14.8H, v2.8H, v30.8H // .....................*....................... + // mul v2.8H, v23.8H, v0.H[0] // .............................*............... + // str q11, [x0, #64] // ..............................*.............. + // sqrdmulh v15.8H, v25.8H, v30.8H // ................................*............ + // sqrdmulh v13.8H, v26.8H, v0.H[1] // .........................*................... + // mul v31.8H, v25.8H, v29.8H // ...............................*............. + // sqrdmulh v19.8H, v23.8H, v0.H[1] // .................................*........... + // mul v28.8H, v26.8H, v0.H[0] // ........................*.................... + // mls v8.8H, v14.8H, v7.H[0] // ..................................*.......... + // mls v17.8H, v10.8H, v7.H[0] // ............................*................ + // mls v31.8H, v15.8H, v7.H[0] // ......................................*...... + // mls v24.8H, v5.8H, v7.H[0] // ....................................*........ + // mls v28.8H, v13.8H, v7.H[0] // ...................................*......... 
+ // mls v2.8H, v19.8H, v7.H[0] // .......................................*..... + // str q8, [x0, #192] // ........................................*.... + // str q17, [x0], #(16) // .....................................*....... + // str q24, [x0, #240] // ..........................................*.. + // str q31, [x0, #112] // ...........................................*. + // str q28, [x0, #432] // .........................................*... + // str q2, [x0, #368] // ............................................* pop_stack diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s index bbadddf1..ad9296b2 100644 --- a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s @@ -35,21 +35,6 @@ // Eventually, NeLight should include a proper parser for AArch64, // but for initial investigations, the below is enough. -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro gs_butterfly a, b, root, idx0, idx1 @@ -84,12 +69,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - 
vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro gs_butterfly_v a, b, root, root_twisted sub tmp.8h, \a\().8h, \b\().8h add \a\().8h, \a\().8h, \b\().8h @@ -110,21 +89,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -146,7 +125,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +136,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +146,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +154,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,24 +165,30 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // 
@slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + .data .p2align 4 roots: @@ -354,548 +339,593 @@ _intt_kyber_123_4567_opt_m1_icestorm: mov count, #8 .p2align 2 - ldr q19, [x1, #16] // ...*............................................. - ldr q1, [x1, #0] // ..*.............................................. + // Instructions: 49 + // Expected cycles: 31 + // Expected IPC: 1.58 + // + // Cycle bound: 31.0 + // IPC bound: 1.58 + // + // Wall time: 1.37s + // User time: 1.37s + // + // -------------- original position ---------------> + // 0 25 + // |------------------------|----------------------- + ldr q18, [x1, #0] // .*............................................... + ldr q6, [x1, #16] // *................................................ // gap // ................................................. // gap // ................................................. + ldr q16, [x1, #48] // ..*.............................................. + ldr q22, [x1, #32] // ...*............................................. // gap // ................................................. - ldr q0, [x1, #32] // .*............................................... // gap // ................................................. - ldr q22, [x1, #48] // *................................................ 
+ ldr q19, [x4, #80] // ....*............................................ + ldr q20, [x4, #48] // .....*........................................... // gap // ................................................. // gap // ................................................. + ldr q8, [x4, #32] // ...........*..................................... + ldr q31, [x4], #(6*16) // .........*....................................... // gap // ................................................. // gap // ................................................. - ldr q6, [x3], #16 // .......................................*......... + trn1 v3.4S, v18.4S, v6.4S // ......*.......................................... + trn2 v6.4S, v18.4S, v6.4S // .......*......................................... + ldr q18, [x4, #-32] // ............*.................................... + ldr q12, [x4, #-80] // .............*................................... + trn1 v9.4S, v22.4S, v16.4S // ........*........................................ + trn2 v16.4S, v22.4S, v16.4S // ..........*...................................... + ldr q14, [x3], #16 // ............................................*.... // gap // ................................................. // gap // ................................................. // gap // ................................................. - ldr q11, [x4, #48] // ........*........................................ - trn1 v27.4S, v1.4S, v19.4S // ...........*..................................... - trn2 v24.4S, v1.4S, v19.4S // .........*....................................... // gap // ................................................. - trn2 v17.4S, v0.4S, v22.4S // .......*......................................... // gap // ................................................. - ldr q19, [x4, #80] // .................*............................... - trn1 v8.4S, v0.4S, v22.4S // ......*.......................................... 
+ trn2 v15.2D, v6.2D, v16.2D // ...............*................................. + trn2 v22.2D, v3.2D, v9.2D // ..............*.................................. // gap // ................................................. // gap // ................................................. - ldr q0, [x4, #32] // .....*........................................... + trn1 v3.2D, v3.2D, v9.2D // ................*................................ + trn1 v6.2D, v6.2D, v16.2D // .................*............................... // gap // ................................................. - trn1 v4.2D, v27.2D, v8.2D // ...............*................................. // gap // ................................................. + sub v16.8H, v22.8H, v15.8H // ...................*............................. + add v22.8H, v22.8H, v15.8H // ..................*.............................. // gap // ................................................. - trn1 v13.2D, v24.2D, v17.2D // ............*.................................... - ldr q2, [x4, #64] // ....*............................................ - trn2 v22.2D, v27.2D, v8.2D // ................*................................ // gap // ................................................. - trn2 v12.2D, v24.2D, v17.2D // .............*................................... + sub v9.8H, v3.8H, v6.8H // ....................*............................ + add v6.8H, v3.8H, v6.8H // .....................*........................... // gap // ................................................. // gap // ................................................. + sqrdmulh v3.8H, v16.8H, v19.8H // .........................*....................... + mul v18.8H, v16.8H, v18.8H // ........................*........................ // gap // ................................................. - sub v5.8H, v4.8H, v13.8H // ..................*.............................. - add v28.8H, v4.8H, v13.8H // ....................*............................ 
// gap // ................................................. // gap // ................................................. - sub v17.8H, v22.8H, v12.8H // ...................*............................. + sqrdmulh v16.8H, v9.8H, v20.8H // ......................*.......................... // gap // ................................................. - mul v23.8H, v5.8H, v0.8H // ......................*.......................... + mul v20.8H, v9.8H, v8.8H // .......................*......................... + sub v19.8H, v6.8H, v22.8H // ..........................*...................... + add v6.8H, v6.8H, v22.8H // ...................................*............. // gap // ................................................. - sqrdmulh v10.8H, v5.8H, v11.8H // ........................*........................ - mul v30.8H, v17.8H, v2.8H // .......................*......................... // gap // ................................................. + mls v18.8H, v3.8H, v7.H[0] // ............................*.................... // gap // ................................................. - sqrdmulh v19.8H, v17.8H, v19.8H // .....................*........................... - add v16.8H, v22.8H, v12.8H // .........................*....................... - ldr q2, [x4, #16] // ..........*...................................... // gap // ................................................. // gap // ................................................. - mls v23.8H, v10.8H, v7.H[0] // ..........................*...................... + mul v24.8H, v19.8H, v31.8H // ..............................*.................. // gap // ................................................. // gap // ................................................. + mls v20.8H, v16.8H, v7.H[0] // ...........................*..................... + sqrdmulh v22.8H, v19.8H, v12.8H // .............................*................... // gap // ................................................. 
- mls v30.8H, v19.8H, v7.H[0] // ...........................*..................... - sub v11.8H, v28.8H, v16.8H // ............................*.................... // gap // ................................................. - ldr q19, [x4], #(6*16) // ..............*.................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. - sqrdmulh v22.8H, v11.8H, v2.8H // .............................*................... // gap // ................................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. + sub v16.8H, v20.8H, v18.8H // ...............................*................. + mls v24.8H, v22.8H, v7.H[0] // ....................................*............ + add v18.8H, v20.8H, v18.8H // ..................................*.............. // gap // ................................................. // gap // ................................................. - sub v8.8H, v23.8H, v30.8H // ..............................*.................. - mul v21.8H, v11.8H, v19.8H // ...............................*................. + mul v22.8H, v16.8H, v31.8H // .................................*............... // gap // ................................................. // gap // ................................................. + sqrdmulh v16.8H, v16.8H, v12.8H // ................................*................ // gap // ................................................. // gap // ................................................. // gap // ................................................. - mul v3.8H, v8.8H, v19.8H // .................................*............... 
- sqrdmulh v13.8H, v8.8H, v2.8H // ................................*................ // gap // ................................................. + trn1 v20.4S, v6.4S, v18.4S // ......................................*.......... + trn2 v6.4S, v6.4S, v18.4S // .........................................*....... // gap // ................................................. // gap // ................................................. // gap // ................................................. - add v8.8H, v23.8H, v30.8H // ...................................*............. - add v19.8H, v28.8H, v16.8H // ..................................*.............. // gap // ................................................. // gap // ................................................. - mls v21.8H, v22.8H, v7.H[0] // .....................................*........... + mls v22.8H, v16.8H, v7.H[0] // .....................................*........... // gap // ................................................. // gap // ................................................. - mls v3.8H, v13.8H, v7.H[0] // ....................................*............ - trn1 v5.4S, v19.4S, v8.4S // ......................................*.......... // gap // ................................................. // gap // ................................................. // gap // ................................................. - trn2 v8.4S, v19.4S, v8.4S // ........................................*........ // gap // ................................................. // gap // ................................................. // gap // ................................................. - trn1 v13.4S, v21.4S, v3.4S // ..........................................*...... + trn1 v18.4S, v24.4S, v22.4S // ........................................*........ // gap // ................................................. // gap // ................................................. 
- trn2 v30.4S, v21.4S, v3.4S // .........................................*....... + trn2 v16.4S, v24.4S, v22.4S // .......................................*......... // gap // ................................................. // gap // ................................................. // gap // ................................................. // gap // ................................................. - trn2 v14.2D, v8.2D, v30.2D // ............................................*.... + trn2 v19.2D, v6.2D, v16.2D // ...........................................*..... // gap // ................................................. // gap // ................................................. - trn2 v19.2D, v5.2D, v13.2D // ...........................................*..... - trn1 v22.2D, v8.2D, v30.2D // .............................................*... - trn1 v28.2D, v5.2D, v13.2D // ..............................................*.. + trn2 v22.2D, v20.2D, v18.2D // ..........................................*...... + trn1 v9.2D, v20.2D, v18.2D // ..............................................*.. // gap // ................................................. // gap // ................................................. - add v3.8H, v19.8H, v14.8H // ...............................................*. + trn1 v16.2D, v6.2D, v16.2D // .............................................*... + sub v18.8H, v22.8H, v19.8H // ................................................* // gap // ................................................. // gap // ................................................. - sub v19.8H, v19.8H, v14.8H // ................................................* + add v12.8H, v22.8H, v19.8H // ...............................................*. - // original source code - // ldr q30, [x1, #48] // ...*............................................. - // ldr q12, [x1, #32] // ..*.............................................. 
- // ldr q26, [x1, #0] // .*............................................... - // ldr q21, [x1, #16] // *................................................ - // ldr q29, [x4, #64] // ..............*.................................. - // ldr q2, [x4, #32] // ...........*..................................... - // trn1 v13.4S, v12.4S, v30.4S // ..........*...................................... - // trn2 v19.4S, v12.4S, v30.4S // ........*........................................ - // ldr q14, [x4, #48] // .....*........................................... - // trn2 v9.4S, v26.4S, v21.4S // .......*......................................... - // ldr q31, [x4, #16] // .........................*....................... - // trn1 v11.4S, v26.4S, v21.4S // ......*.......................................... - // trn1 v5.2D, v9.2D, v19.2D // .............*................................... - // trn2 v8.2D, v9.2D, v19.2D // ................*................................ - // ldr q9, [x4], #(6*16) // .............................*................... - // trn1 v12.2D, v11.2D, v13.2D // ............*.................................... - // trn2 v11.2D, v11.2D, v13.2D // ...............*................................. - // ldr q15, [x4, #-16] // .........*....................................... - // sub v19.8H, v12.8H, v5.8H // .................*............................... - // sub v26.8H, v11.8H, v8.8H // ...................*............................. - // add v5.8H, v12.8H, v5.8H // ..................*.............................. - // sqrdmulh v0.8H, v26.8H, v15.8H // .......................*......................... - // mul v10.8H, v19.8H, v2.8H // ....................*............................ - // mul v29.8H, v26.8H, v29.8H // ......................*.......................... - // sqrdmulh v14.8H, v19.8H, v14.8H // .....................*........................... - // add v11.8H, v11.8H, v8.8H // ........................*........................ 
- // mls v10.8H, v14.8H, v7.H[0] // ..........................*...................... - // mls v29.8H, v0.8H, v7.H[0] // ...........................*..................... - // sub v0.8H, v5.8H, v11.8H // ............................*.................... - // sqrdmulh v23.8H, v0.8H, v31.8H // ..............................*.................. - // sub v16.8H, v10.8H, v29.8H // ...............................*................. - // mul v25.8H, v0.8H, v9.8H // ................................*................ - // sqrdmulh v28.8H, v16.8H, v31.8H // ..................................*.............. - // mul v20.8H, v16.8H, v9.8H // .................................*............... - // add v5.8H, v5.8H, v11.8H // ....................................*............ - // add v14.8H, v10.8H, v29.8H // ...................................*............. - // mls v20.8H, v28.8H, v7.H[0] // ......................................*.......... - // mls v25.8H, v23.8H, v7.H[0] // .....................................*........... - // trn1 v29.4S, v5.4S, v14.4S // .......................................*......... - // ldr q6, [x3], #16 // ....*............................................ - // trn2 v26.4S, v5.4S, v14.4S // ........................................*........ - // trn2 v19.4S, v25.4S, v20.4S // ..........................................*...... - // trn1 v20.4S, v25.4S, v20.4S // .........................................*....... - // trn2 v27.2D, v29.2D, v20.2D // ............................................*.... - // trn2 v0.2D, v26.2D, v19.2D // ...........................................*..... - // trn1 v22.2D, v26.2D, v19.2D // .............................................*... - // trn1 v28.2D, v29.2D, v20.2D // ..............................................*.. - // add v3.8H, v27.8H, v0.8H // ...............................................*. 
- // sub v19.8H, v27.8H, v0.8H // ................................................* + // ----------------- new position -----------------> + // 0 25 + // |------------------------|----------------------- + // ldr q5, [x1, #16] // .*............................................... + // ldr q29, [x1, #0] // *................................................ + // ldr q17, [x1, #48] // ..*.............................................. + // ldr q26, [x1, #32] // ...*............................................. + // ldr q11, [x4, #80] // ....*............................................ + // ldr q4, [x4, #48] // .....*........................................... + // trn1 v20.4S, v29.4S, v5.4S // ........*........................................ + // trn2 v9.4S, v29.4S, v5.4S // .........*....................................... + // trn1 v16.4S, v26.4S, v17.4S // ............*.................................... + // ldr q21, [x4], #(6*16) // .......*......................................... + // trn2 v17.4S, v26.4S, v17.4S // .............*................................... + // ldr q19, [x4, #-64] // ......*.......................................... + // ldr q26, [x4, #-32] // ..........*...................................... + // ldr q30, [x4, #-80] // ...........*..................................... + // trn2 v25.2D, v20.2D, v16.2D // ................*................................ + // trn2 v10.2D, v9.2D, v17.2D // ...............*................................. + // trn1 v20.2D, v20.2D, v16.2D // .................*............................... + // trn1 v9.2D, v9.2D, v17.2D // ..................*.............................. + // add v2.8H, v25.8H, v10.8H // ....................*............................ + // sub v31.8H, v25.8H, v10.8H // ...................*............................. + // sub v16.8H, v20.8H, v9.8H // .....................*........................... + // add v17.8H, v20.8H, v9.8H // ......................*.......................... 
+ // sqrdmulh v9.8H, v16.8H, v4.8H // .........................*....................... + // mul v16.8H, v16.8H, v19.8H // ..........................*...................... + // mul v26.8H, v31.8H, v26.8H // ........................*........................ + // sqrdmulh v1.8H, v31.8H, v11.8H // .......................*......................... + // sub v4.8H, v17.8H, v2.8H // ...........................*..................... + // mls v16.8H, v9.8H, v7.H[0] // ...............................*................. + // mls v26.8H, v1.8H, v7.H[0] // .............................*................... + // sqrdmulh v25.8H, v4.8H, v30.8H // ................................*................ + // mul v24.8H, v4.8H, v21.8H // ..............................*.................. + // sub v6.8H, v16.8H, v26.8H // .................................*............... + // sqrdmulh v18.8H, v6.8H, v30.8H // .....................................*........... + // mul v19.8H, v6.8H, v21.8H // ....................................*............ + // add v31.8H, v16.8H, v26.8H // ...................................*............. + // add v16.8H, v17.8H, v2.8H // ............................*.................... + // mls v24.8H, v25.8H, v7.H[0] // ..................................*.............. + // mls v19.8H, v18.8H, v7.H[0] // ........................................*........ + // trn1 v11.4S, v16.4S, v31.4S // ......................................*.......... + // trn2 v3.4S, v24.4S, v19.4S // ..........................................*...... + // trn1 v9.4S, v24.4S, v19.4S // .........................................*....... + // trn2 v16.4S, v16.4S, v31.4S // .......................................*......... + // trn2 v22.2D, v11.2D, v9.2D // ............................................*.... + // trn2 v20.2D, v16.2D, v3.2D // ...........................................*..... + // ldr q14, [x3], #16 // ..............*.................................. 
+ // trn1 v16.2D, v16.2D, v3.2D // ..............................................*.. + // trn1 v9.2D, v11.2D, v9.2D // .............................................*... + // add v12.8H, v22.8H, v20.8H // ................................................* + // sub v18.8H, v22.8H, v20.8H // ...............................................*. sub count, count, #1 layer4567_start: - add v25.8H, v28.8H, v22.8H // ................................................*.................................. - ldr q30, [x1, #112] // ...e............................................................................... - sub v23.8H, v28.8H, v22.8H // ...............................................*................................... - ldr q12, [x1, #96] // ..e................................................................................ - sqrdmulh v20.8H, v19.8H, v6.H[5] // .......................................................*........................... - ldr q26, [x1, #64] // e.................................................................................. - ldr q21, [x1, #80] // .e................................................................................. - mul v24.8H, v19.8H, v6.H[4] // ......................................................*............................ - ldr q29, [x4, #64] // ................e.................................................................. - sqdmulh v22.8H, v3.8H, v7.H[1] // ............................................................*...................... - mul v4.8H, v23.8H, v6.H[2] // .................................................*................................. - // gap // ................................................................................... - ldr q2, [x4, #32] // ..............e.................................................................... - sqdmulh v28.8H, v25.8H, v7.H[1] // .........................................................*......................... 
- // gap // ................................................................................... - sqrdmulh v23.8H, v23.8H, v6.H[3] // ..................................................*................................ - trn1 v13.4S, v12.4S, v30.4S // ......e............................................................................ - trn2 v19.4S, v12.4S, v30.4S // .......e........................................................................... - ldr q14, [x4, #48] // ...............e................................................................... - // gap // ................................................................................... - trn2 v9.4S, v26.4S, v21.4S // .....e............................................................................. - ldr q31, [x4, #16] // .............e..................................................................... - // gap // ................................................................................... - srshr v22.8H, v22.8H, #11 // .............................................................*..................... - trn1 v11.4S, v26.4S, v21.4S // ....e.............................................................................. - // gap // ................................................................................... - srshr v28.8H, v28.8H, #11 // ..........................................................*........................ - // gap // ................................................................................... - trn1 v5.2D, v9.2D, v19.2D // ...........e....................................................................... - trn2 v8.2D, v9.2D, v19.2D // .........e......................................................................... - ldr q9, [x4], #(6*16) // ............e...................................................................... - // gap // ................................................................................... 
- trn1 v12.2D, v11.2D, v13.2D // ..........e........................................................................ - trn2 v11.2D, v11.2D, v13.2D // ........e.......................................................................... - // gap // ................................................................................... - ldr q15, [x4, #-16] // .................e................................................................. - mls v3.8H, v22.8H, v7.H[0] // ..............................................................*.................... - mls v25.8H, v28.8H, v7.H[0] // ...........................................................*....................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v19.8H, v12.8H, v5.8H // ..................e................................................................ - sub v26.8H, v11.8H, v8.8H // .......................e........................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v5.8H, v12.8H, v5.8H // ...................e............................................................... - mls v4.8H, v23.8H, v7.H[0] // ...................................................*............................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v0.8H, v26.8H, v15.8H // ..........................e........................................................ - mul v10.8H, v19.8H, v2.8H // ....................e.............................................................. 
- // gap // ................................................................................... - // gap // ................................................................................... - mul v29.8H, v26.8H, v29.8H // .........................e......................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v24.8H, v20.8H, v7.H[0] // ........................................................*.......................... - sqdmulh v23.8H, v4.8H, v7.H[1] // ...............................................................*................... - sub v22.8H, v25.8H, v3.8H // .....................................................................*............. - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v14.8H, v19.8H, v14.8H // .....................e............................................................. - add v27.8H, v25.8H, v3.8H // ......................................................................*............ - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v3.8H, v22.8H, v6.H[1] // ........................................................................*.......... - // gap // ................................................................................... - // gap // ................................................................................... - sqdmulh v19.8H, v24.8H, v7.H[1] // ..................................................................*................ 
- srshr v23.8H, v23.8H, #11 // ................................................................*.................. - str q27, [x1], #(64) // ...............................................................................*... - add v11.8H, v11.8H, v8.8H // ........................e.......................................................... - // gap // ................................................................................... - mls v10.8H, v14.8H, v7.H[0] // ......................e............................................................ - mls v29.8H, v0.8H, v7.H[0] // ...........................e....................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v0.8H, v5.8H, v11.8H // ............................e...................................................... - // gap // ................................................................................... - // gap // ................................................................................... - srshr v19.8H, v19.8H, #11 // ...................................................................*............... - mls v4.8H, v23.8H, v7.H[0] // .................................................................*................. - mul v27.8H, v22.8H, v6.H[0] // .......................................................................*........... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v23.8H, v0.8H, v31.8H // ...............................e................................................... - sub v16.8H, v10.8H, v29.8H // .................................e................................................. 
- // gap // ................................................................................... - // gap // ................................................................................... - mul v25.8H, v0.8H, v9.8H // ..............................e.................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v24.8H, v19.8H, v7.H[0] // ....................................................................*.............. - sqrdmulh v28.8H, v16.8H, v31.8H // ....................................e.............................................. - mul v20.8H, v16.8H, v9.8H // ...................................e............................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v27.8H, v3.8H, v7.H[0] // .........................................................................*......... - add v5.8H, v5.8H, v11.8H // .............................e..................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v14.8H, v10.8H, v29.8H // ..................................e................................................ - // gap // ................................................................................... - // gap // ................................................................................... - sub v19.8H, v4.8H, v24.8H // ..........................................................................*........ 
- mls v20.8H, v28.8H, v7.H[0] // .....................................e............................................. - mls v25.8H, v23.8H, v7.H[0] // ................................e.................................................. - // gap // ................................................................................... - // gap // ................................................................................... - str q27, [x1, #-32] // .................................................................................*. - // gap // ................................................................................... - mul v23.8H, v19.8H, v6.H[0] // ............................................................................*...... - sqrdmulh v22.8H, v19.8H, v6.H[1] // .............................................................................*..... - trn1 v29.4S, v5.4S, v14.4S // ......................................e............................................ - // gap // ................................................................................... - ldr q6, [x3], #16 // ..............................................e.................................... - trn2 v26.4S, v5.4S, v14.4S // .......................................e........................................... - trn2 v19.4S, v25.4S, v20.4S // .........................................e......................................... - trn1 v20.4S, v25.4S, v20.4S // ........................................e.......................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- add v11.8H, v4.8H, v24.8H // ...........................................................................*....... - mls v23.8H, v22.8H, v7.H[0] // ..............................................................................*.... - trn2 v27.2D, v29.2D, v20.2D // ..........................................e........................................ - trn2 v0.2D, v26.2D, v19.2D // ...........................................e....................................... - // gap // ................................................................................... - // gap // ................................................................................... - trn1 v22.2D, v26.2D, v19.2D // .............................................e..................................... - trn1 v28.2D, v29.2D, v20.2D // ............................................e...................................... - // gap // ................................................................................... - str q11, [x1, #-48] // ................................................................................*.. - // gap // ................................................................................... - str q23, [x1, #-16] // ..................................................................................* - add v3.8H, v27.8H, v0.8H // .....................................................e............................. - sub v19.8H, v27.8H, v0.8H // ....................................................e.............................. - - // original source code - // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e........................................................................... - // ldr q9, [x1, #(16*1)] // .....e............................................................................|.....e.......................................................................... 
- // ldr q10, [x1, #(16*2)] // ..e...............................................................................|..e............................................................................. - // ldr q11, [x1, #(16*3)] // e.................................................................................|e............................................................................... - // trn1 v25.4s, v8.4s, v9.4s // ...................e..............................................................|...................e............................................................ - // trn2 v26.4s, v8.4s, v9.4s // ................e.................................................................|................e............................................................... - // trn1 v27.4s, v10.4s, v11.4s // .............e....................................................................|.............e.................................................................. - // trn2 v28.4s, v10.4s, v11.4s // ..............e...................................................................|..............e................................................................. - // trn2 v10.2d, v25.2d, v27.2d // .........................e........................................................|.........................e...................................................... - // trn2 v11.2d, v26.2d, v28.2d // ......................e...........................................................|......................e......................................................... - // trn1 v8.2d, v25.2d, v27.2d // ........................e.........................................................|........................e....................................................... - // trn1 v9.2d, v26.2d, v28.2d // .....................e............................................................|.....................e.......................................................... 
- // ldr q0, [x4], #(6*16) // .......................e..........................................................|.......................e........................................................ - // ldr q4, [x4, #(-6*16 + 1*16)] // .................e................................................................|.................e.............................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.......................................................................|..........e..................................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ...............e..................................................................|...............e................................................................ - // ldr q2, [x4, #(-6*16 + 4*16)] // .......e..........................................................................|.......e........................................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // ..........................e.......................................................|..........................e..................................................... - // sub v24.8h, v8.8h, v9.8h // .............................e....................................................|.............................e.................................................. - // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e................................................ - // mul v9.8h, v24.8h, v1.8h // ..................................e...............................................|..................................e............................................. - // sqrdmulh v24.8h, v24.8h, v5.8h // .......................................e..........................................|.......................................e........................................ 
- // mls v9.8h, v24.8h, v7.h[0] // ..............................................e...................................|..............................................e................................. - // sub v24.8h, v10.8h, v11.8h // ..............................e...................................................|..............................e................................................. - // add v10.8h, v10.8h, v11.8h // .............................................e....................................|.............................................e.................................. - // mul v11.8h, v24.8h, v2.8h // ...................................e..............................................|...................................e............................................ - // sqrdmulh v24.8h, v24.8h, v6.8h // .................................e................................................|.................................e.............................................. - // mls v11.8h, v24.8h, v7.h[0] // ...............................................e..................................|...............................................e................................ - // sub v24.8h, v8.8h, v10.8h // ................................................e.................................|................................................e............................... - // add v8.8h, v8.8h, v10.8h // ...........................................................e......................|...........................................................e.................... - // mul v10.8h, v24.8h, v0.8h // ......................................................e...........................|......................................................e......................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ....................................................e.............................|....................................................e........................... 
- // mls v10.8h, v24.8h, v7.h[0] // ...............................................................e..................|...............................................................e................ - // sub v24.8h, v9.8h, v11.8h // .....................................................e............................|.....................................................e.......................... - // add v9.8h, v9.8h, v11.8h // ............................................................e.....................|............................................................e................... - // mul v11.8h, v24.8h, v0.8h // .........................................................e........................|.........................................................e...................... - // sqrdmulh v24.8h, v24.8h, v4.8h // ........................................................e.........................|........................................................e....................... - // mls v11.8h, v24.8h, v7.h[0] // ..............................................................e...................|..............................................................e................. - // trn1 v25.4s, v8.4s, v9.4s // ...................................................................e..............|...................................................................e............ - // trn2 v26.4s, v8.4s, v9.4s // .....................................................................e............|.....................................................................e.......... - // trn1 v27.4s, v10.4s, v11.4s // .......................................................................e..........|.......................................................................e........ - // trn2 v28.4s, v10.4s, v11.4s // ......................................................................e...........|......................................................................e......... 
- // trn2 v10.2d, v25.2d, v27.2d // ..........................................................................e.......|..........................................................................e..... - // trn2 v11.2d, v26.2d, v28.2d // ...........................................................................e......|...........................................................................e.... - // trn1 v8.2d, v25.2d, v27.2d // .............................................................................e....|.............................................................................e.. - // trn1 v9.2d, v26.2d, v28.2d // ............................................................................e.....|............................................................................e... - // ldr q0, [x3], #16 // ....................................................................e.............|....................................................................e........... - // sub v24.8h, v8.8h, v9.8h // .*................................................................................|.*.............................................................................. - // add v8.8h, v8.8h, v9.8h // ..................................................................................*................................................................................ - // mul v9.8h, v24.8h, v0.h[2] // .........*........................................................................|.........*...................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............*.....................................................................|............*................................................................... - // mls v9.8h, v24.8h, v7.h[0] // ................................*.................................................|................................*............................................... 
- // sub v24.8h, v10.8h, v11.8h // .................................................................................e|................................................................................ - // add v10.8h, v10.8h, v11.8h // ................................................................................e.|................................................................................ - // mul v11.8h, v24.8h, v0.h[4] // ......*...........................................................................|......*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...*..............................................................................|...*............................................................................ - // mls v11.8h, v24.8h, v7.h[0] // ....................................*.............................................|....................................*........................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........*......................................................................|...........*.................................................................... - // srshr v25.8h, v25.8h, #11 // ....................*.............................................................|....................*........................................................... - // mls v8.8h, v25.8h, v7.h[0] // ............................*.....................................................|............................*................................................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ........*.........................................................................|........*....................................................................... 
- // srshr v25.8h, v25.8h, #11 // ..................*...............................................................|..................*............................................................. - // mls v10.8h, v25.8h, v7.h[0] // ...........................*......................................................|...........................*.................................................... - // sqdmulh v25.8h, v9.8h, v7.h[1] // .....................................*............................................|.....................................*.......................................... - // srshr v25.8h, v25.8h, #11 // ...........................................*......................................|...........................................*.................................... - // mls v9.8h, v25.8h, v7.h[0] // ..................................................*...............................|..................................................*............................. - // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........................................*.......................................|..........................................*..................................... - // srshr v25.8h, v25.8h, #11 // .................................................*................................|.................................................*.............................. - // mls v11.8h, v25.8h, v7.h[0] // .......................................................*..........................|.......................................................*........................ - // sub v24.8h, v8.8h, v10.8h // ......................................*...........................................|......................................*......................................... - // add v8.8h, v8.8h, v10.8h // ........................................*.........................................|........................................*....................................... 
- // mul v10.8h, v24.8h, v0.h[0] // ...................................................*..............................|...................................................*............................ - // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................*........................................|.........................................*...................................... - // mls v10.8h, v24.8h, v7.h[0] // ..........................................................*.......................|..........................................................*..................... - // sub v24.8h, v9.8h, v11.8h // .............................................................*....................|.............................................................*.................. - // add v9.8h, v9.8h, v11.8h // ........................................................................*.........|........................................................................*....... - // mul v11.8h, v24.8h, v0.h[0] // .................................................................*................|.................................................................*.............. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................................*...............|..................................................................*............. - // mls v11.8h, v24.8h, v7.h[0] // .........................................................................*........|.........................................................................*...... - // str q8, [x1], #(64) // ............................................*.....................................|............................................*................................... - // str q9, [x1, #(-64 + 16*1)] // ..............................................................................*...|..............................................................................*. 
- // str q10, [x1, #(-64 + 16*2)] // ................................................................*.................|................................................................*............... - // str q11, [x1, #(-64 + 16*3)] // ...............................................................................*..|...............................................................................* + // Instructions: 83 + // Expected cycles: 34 + // Expected IPC: 2.44 + // + // Cycle bound: 34.0 + // IPC bound: 2.44 + // + // Wall time: 123.85s + // User time: 123.85s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + add v15.8H, v9.8H, v16.8H // ................................................*.................................. + sub v16.8H, v9.8H, v16.8H // ...............................................*................................... + ldr q5, [x1, #80] // .e................................................................................. + ldr q29, [x1, #64] // e.................................................................................. + ldr q17, [x1, #112] // ...e............................................................................... + ldr q26, [x1, #96] // ..e................................................................................ + mul v3.8H, v18.8H, v14.H[4] // .......................................................*........................... + sqdmulh v22.8H, v12.8H, v7.H[1] // ............................................................*...................... + // gap // ................................................................................... + sqrdmulh v6.8H, v18.8H, v14.H[5] // ......................................................*............................ + // gap // ................................................................................... 
+ mul v27.8H, v16.8H, v14.H[2] // ..................................................*................................ + sqdmulh v8.8H, v15.8H, v7.H[1] // .........................................................*......................... + sqrdmulh v18.8H, v16.8H, v14.H[3] // .................................................*................................. + // gap // ................................................................................... + // gap // ................................................................................... + ldr q11, [x4, #80] // .................e................................................................. + ldr q4, [x4, #48] // ...............e................................................................... + trn1 v20.4S, v29.4S, v5.4S // ....e.............................................................................. + trn2 v9.4S, v29.4S, v5.4S // .....e............................................................................. + trn1 v16.4S, v26.4S, v17.4S // ......e............................................................................ + // gap // ................................................................................... + ldr q21, [x4], #(6*16) // ............e...................................................................... + trn2 v17.4S, v26.4S, v17.4S // .......e........................................................................... + mls v3.8H, v6.8H, v7.H[0] // ........................................................*.......................... + ldr q19, [x4, #-64] // ..............e.................................................................... + ldr q26, [x4, #-32] // ................e.................................................................. + mls v27.8H, v18.8H, v7.H[0] // ...................................................*............................... + ldr q30, [x4, #-80] // .............e..................................................................... 
+ // gap // ................................................................................... + trn2 v25.2D, v20.2D, v16.2D // ........e.......................................................................... + trn2 v10.2D, v9.2D, v17.2D // .........e......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v20.2D, v20.2D, v16.2D // ..........e........................................................................ + trn1 v9.2D, v9.2D, v17.2D // ...........e....................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v2.8H, v25.8H, v10.8H // ........................e.......................................................... + sqdmulh v18.8H, v27.8H, v7.H[1] // ...............................................................*................... + // gap // ................................................................................... + sub v31.8H, v25.8H, v10.8H // .......................e........................................................... + // gap // ................................................................................... + sub v16.8H, v20.8H, v9.8H // ..................e................................................................ + sqdmulh v6.8H, v3.8H, v7.H[1] // ..................................................................*................ + // gap // ................................................................................... + // gap // ................................................................................... 
+ add v17.8H, v20.8H, v9.8H // ...................e............................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v9.8H, v16.8H, v4.8H // ....................e.............................................................. + mul v16.8H, v16.8H, v19.8H // .....................e............................................................. + mul v26.8H, v31.8H, v26.8H // ..........................e........................................................ + sqrdmulh v1.8H, v31.8H, v11.8H // .........................e......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v20.8H, v8.8H, #11 // ..........................................................*........................ + srshr v18.8H, v18.8H, #11 // ................................................................*.................. + // gap // ................................................................................... + sub v4.8H, v17.8H, v2.8H // ............................e...................................................... + // gap // ................................................................................... + mls v16.8H, v9.8H, v7.H[0] // ......................e............................................................ + srshr v6.8H, v6.8H, #11 // ...................................................................*............... 
+ mls v26.8H, v1.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v22.8H, v22.8H, #11 // .............................................................*..................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v25.8H, v4.8H, v30.8H // ..............................e.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v24.8H, v4.8H, v21.8H // ...............................e................................................... + mls v15.8H, v20.8H, v7.H[0] // ...........................................................*....................... + mls v3.8H, v6.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + // gap // ................................................................................... + sub v6.8H, v16.8H, v26.8H // .................................e................................................. + // gap // ................................................................................... + mls v27.8H, v18.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... 
+ mls v12.8H, v22.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v18.8H, v6.8H, v30.8H // ...................................e............................................... + mul v19.8H, v6.8H, v21.8H // ....................................e.............................................. + add v31.8H, v16.8H, v26.8H // ..................................e................................................ + // gap // ................................................................................... + // gap // ................................................................................... + add v16.8H, v17.8H, v2.8H // .............................e..................................................... + mls v24.8H, v25.8H, v7.H[0] // ................................e.................................................. + sub v6.8H, v27.8H, v3.8H // ..........................................................................*........ + // gap // ................................................................................... + // gap // ................................................................................... + sub v22.8H, v15.8H, v12.8H // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + mls v19.8H, v18.8H, v7.H[0] // .....................................e............................................. + // gap // ................................................................................... 
+ mul v18.8H, v6.8H, v14.H[0] // .............................................................................*..... + // gap // ................................................................................... + sqrdmulh v20.8H, v6.8H, v14.H[1] // ............................................................................*...... + // gap // ................................................................................... + mul v8.8H, v22.8H, v14.H[0] // ........................................................................*.......... + // gap // ................................................................................... + trn1 v11.4S, v16.4S, v31.4S // ......................................e............................................ + sqrdmulh v6.8H, v22.8H, v14.H[1] // .......................................................................*........... + add v27.8H, v27.8H, v3.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v3.4S, v24.4S, v19.4S // .........................................e......................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v18.8H, v20.8H, v7.H[0] // ..............................................................................*.... + trn1 v9.4S, v24.4S, v19.4S // ........................................e.......................................... + trn2 v16.4S, v16.4S, v31.4S // .......................................e........................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + add v31.8H, v15.8H, v12.8H // ......................................................................*............ + mls v8.8H, v6.8H, v7.H[0] // .........................................................................*......... + // gap // ................................................................................... + str q27, [x1, #16] // ................................................................................*.. + trn2 v22.2D, v11.2D, v9.2D // ..........................................e........................................ + trn2 v20.2D, v16.2D, v3.2D // ...........................................e....................................... + str q18, [x1, #48] // ..................................................................................* + ldr q14, [x3], #16 // ..............................................e.................................... + str q31, [x1], #(64) // ...............................................................................*... + // gap // ................................................................................... + trn1 v16.2D, v16.2D, v3.2D // .............................................e..................................... + trn1 v9.2D, v11.2D, v9.2D // ............................................e...................................... + str q8, [x1, #-32] // .................................................................................*. + add v12.8H, v22.8H, v20.8H // .....................................................e............................. + sub v18.8H, v22.8H, v20.8H // ....................................................e.............................. + // gap // ................................................................................... 
+ + // ------------------------------------------------------------------------- new position --------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q8, [x1, #(16*0)] // .e...............................................................................'..~............................................................................. + // ldr q9, [x1, #(16*1)] // e................................................................................'.~.............................................................................. + // ldr q10, [x1, #(16*2)] // ...e.............................................................................'....~........................................................................... + // ldr q11, [x1, #(16*3)] // ..e..............................................................................'...~............................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ............e....................................................................'.............~.................................................................. + // trn2 v26.4s, v8.4s, v9.4s // .............e...................................................................'..............~................................................................. + // trn1 v27.4s, v10.4s, v11.4s // ..............e..................................................................'...............~................................................................ + // trn2 v28.4s, v10.4s, v11.4s // ................e................................................................'.................~.............................................................. 
+ // trn2 v10.2d, v25.2d, v27.2d // ......................e..........................................................'.......................~........................................................ + // trn2 v11.2d, v26.2d, v28.2d // .......................e.........................................................'........................~....................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e........................................................'.........................~...................................................... + // trn1 v9.2d, v26.2d, v28.2d // .........................e.......................................................'..........................~..................................................... + // ldr q0, [x4], #(6*16) // ...............e.................................................................'................~............................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .....................e...........................................................'......................~......................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ..................e..............................................................'...................~............................................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ...........e.....................................................................'............~................................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ...................e.............................................................'....................~........................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........e......................................................................'...........~.................................................................... 
+ // sub v24.8h, v8.8h, v9.8h // .............................e...................................................'..............................~................................................. + // add v8.8h, v8.8h, v9.8h // ...............................e.................................................'................................~............................................... + // sqrdmulh v27.8h, v24.8h, v5.8h // ................................e................................................'.................................~.............................................. + // mul v9.8h, v24.8h, v1.8h // .................................e...............................................'..................................~............................................. + // mls v9.8h, v27.8h, v7.h[0] // .......................................e.........................................'........................................~....................................... + // sub v24.8h, v10.8h, v11.8h // ............................e....................................................'.............................~.................................................. + // add v10.8h, v10.8h, v11.8h // ..........................e......................................................'...........................~.................................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // ...................................e.............................................'....................................~........................................... + // mul v11.8h, v24.8h, v2.8h // ..................................e..............................................'...................................~............................................ + // mls v11.8h, v27.8h, v7.h[0] // .........................................e.......................................'..........................................~..................................... 
+ // sub v24.8h, v8.8h, v10.8h // ......................................e..........................................'.......................................~........................................ + // add v8.8h, v8.8h, v10.8h // .....................................................e...........................'......................................................~......................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ...........................................e.....................................'............................................~................................... + // mul v10.8h, v24.8h, v0.8h // ............................................e....................................'.............................................~.................................. + // mls v10.8h, v27.8h, v7.h[0] // ......................................................e..........................'.......................................................~........................ + // sub v24.8h, v9.8h, v11.8h // ...............................................e.................................'................................................~............................... + // add v9.8h, v9.8h, v11.8h // ....................................................e............................'.....................................................~.......................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ..................................................e..............................'...................................................~............................ + // mul v11.8h, v24.8h, v0.8h // ...................................................e.............................'....................................................~........................... + // mls v11.8h, v27.8h, v7.h[0] // .........................................................e.......................'..........................................................~..................... 
+ // trn1 v25.4s, v8.4s, v9.4s // .............................................................e...................'..............................................................~................. + // trn2 v26.4s, v8.4s, v9.4s // ...................................................................e.............'....................................................................~........... + // trn1 v27.4s, v10.4s, v11.4s // ..................................................................e..............'...................................................................~............ + // trn2 v28.4s, v10.4s, v11.4s // ................................................................e................'.................................................................~.............. + // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e.........'........................................................................~....... + // trn2 v11.2d, v26.2d, v28.2d // ........................................................................e........'.........................................................................~...... + // trn1 v8.2d, v25.2d, v27.2d // .............................................................................e...'..............................................................................~. + // trn1 v9.2d, v26.2d, v28.2d // ............................................................................e....'.............................................................................~.. + // ldr q0, [x3], #16 // ..........................................................................e......'...........................................................................~.... + // sub v24.8h, v8.8h, v9.8h // .................................................................................'*............................................................................... 
+ // add v8.8h, v8.8h, v9.8h // .................................................................................*................................................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .........~.......................................................................'..........*..................................................................... + // mul v9.8h, v24.8h, v0.h[2] // .......~.........................................................................'........*....................................................................... + // mls v9.8h, v27.8h, v7.h[0] // ....................~............................................................'.....................*.......................................................... + // sub v24.8h, v10.8h, v11.8h // ................................................................................e'................................................................................ + // add v10.8h, v10.8h, v11.8h // ...............................................................................e.'................................................................................ + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ......~..........................................................................'.......*........................................................................ + // mul v11.8h, v24.8h, v0.h[4] // ....~............................................................................'.....*.......................................................................... + // mls v11.8h, v27.8h, v7.h[0] // .................~...............................................................'..................*............................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ........~........................................................................'.........*...................................................................... 
+ // srshr v25.8h, v25.8h, #11 // ....................................~............................................'.....................................*.......................................... + // mls v8.8h, v25.8h, v7.h[0] // .............................................~...................................'..............................................*................................. + // sqdmulh v25.8h, v10.8h, v7.h[1] // .....~...........................................................................'......*......................................................................... + // srshr v25.8h, v25.8h, #11 // ..........................................~......................................'...........................................*.................................... + // mls v10.8h, v25.8h, v7.h[0] // .................................................~...............................'..................................................*............................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ...........................~.....................................................'............................*................................................... + // srshr v25.8h, v25.8h, #11 // .....................................~...........................................'......................................*......................................... + // mls v9.8h, v25.8h, v7.h[0] // ................................................~................................'.................................................*.............................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..............................~..................................................'...............................*................................................ + // srshr v25.8h, v25.8h, #11 // ........................................~........................................'.........................................*...................................... 
+ // mls v11.8h, v25.8h, v7.h[0] // ..............................................~..................................'...............................................*................................ + // sub v24.8h, v8.8h, v10.8h // ........................................................~........................'.........................................................*...................... + // add v8.8h, v8.8h, v10.8h // ....................................................................~............'.....................................................................*.......... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..............................................................~..................'...............................................................*................ + // mul v10.8h, v24.8h, v0.h[0] // ............................................................~....................'.............................................................*.................. + // mls v10.8h, v27.8h, v7.h[0] // .....................................................................~...........'......................................................................*......... + // sub v24.8h, v9.8h, v11.8h // .......................................................~.........................'........................................................*....................... + // add v9.8h, v9.8h, v11.8h // ...............................................................~.................'................................................................*............... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........................................................~.....................'............................................................*................... + // mul v11.8h, v24.8h, v0.h[0] // ..........................................................~......................'...........................................................*.................... 
+ // mls v11.8h, v27.8h, v7.h[0] // .................................................................~...............'..................................................................*............. + // str q8, [x1], #(64) // ...........................................................................~.....'............................................................................*... + // str q9, [x1, #(-64 + 16*1)] // ......................................................................~..........'.......................................................................*........ + // str q10, [x1, #(-64 + 16*2)] // ..............................................................................~..'...............................................................................* + // str q11, [x1, #(-64 + 16*3)] // .........................................................................~.......'..........................................................................*..... sub count, count, #1 cbnz count, layer4567_start - sub v27.8H, v28.8H, v22.8H // .*................................ - // gap // .................................. - mul v13.8H, v19.8H, v6.H[4] // ...*.............................. - // gap // .................................. - sqrdmulh v19.8H, v19.8H, v6.H[5] // ..*............................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - mul v1.8H, v27.8H, v6.H[2] // .....*............................ - sqrdmulh v5.8H, v27.8H, v6.H[3] // .......*.......................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - add v28.8H, v28.8H, v22.8H // *................................. - // gap // .................................. - // gap // .................................. 
- mls v13.8H, v19.8H, v7.H[0] // .............*.................... - // gap // .................................. - sqdmulh v26.8H, v3.8H, v7.H[1] // ....*............................. - mls v1.8H, v5.8H, v7.H[0] // ............*..................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - sqdmulh v22.8H, v28.8H, v7.H[1] // ......*........................... - // gap // .................................. - // gap // .................................. - sqdmulh v31.8H, v13.8H, v7.H[1] // ..................*............... - // gap // .................................. - srshr v30.8H, v26.8H, #11 // ........*......................... - // gap // .................................. - sqdmulh v19.8H, v1.8H, v7.H[1] // ..............*................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - srshr v0.8H, v22.8H, #11 // .........*........................ - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mls v3.8H, v30.8H, v7.H[0] // ..........*....................... - srshr v30.8H, v19.8H, #11 // ...................*.............. - srshr v19.8H, v31.8H, #11 // .....................*............ - // gap // .................................. - // gap // .................................. - mls v28.8H, v0.8H, v7.H[0] // ...........*...................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. 
- // gap // .................................. - // gap // .................................. - mls v1.8H, v30.8H, v7.H[0] // ......................*........... - // gap // .................................. - mls v13.8H, v19.8H, v7.H[0] // ........................*......... - // gap // .................................. - sub v19.8H, v28.8H, v3.8H // ...............*.................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - sub v23.8H, v1.8H, v13.8H // ..........................*....... - sqrdmulh v30.8H, v19.8H, v6.H[1] // .................*................ - // gap // .................................. - // gap // .................................. - mul v4.8H, v19.8H, v6.H[0] // .......................*.......... - // gap // .................................. - // gap // .................................. - // gap // .................................. - mul v27.8H, v23.8H, v6.H[0] // ............................*..... - sqrdmulh v21.8H, v23.8H, v6.H[1] // .............................*.... - // gap // .................................. - // gap // .................................. - add v3.8H, v28.8H, v3.8H // ................*................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mls v4.8H, v30.8H, v7.H[0] // .........................*........ - add v20.8H, v1.8H, v13.8H // ..............................*... - // gap // .................................. - // gap // .................................. - mls v27.8H, v21.8H, v7.H[0] // ...............................*.. - str q3, [x1], #(64) // ....................*............. 
- // gap // .................................. - // gap // .................................. - str q20, [x1, #-48] // ................................*. - // gap // .................................. - // gap // .................................. - // gap // .................................. - str q4, [x1, #-32] // ...........................*...... - // gap // .................................. - // gap // .................................. - // gap // .................................. - str q27, [x1, #-16] // .................................* - // gap // .................................. - // gap // .................................. - // gap // .................................. - - // original source code - // add v25.8H, v28.8H, v22.8H // .....*............................ - // sub v23.8H, v28.8H, v22.8H // *................................. - // sqrdmulh v20.8H, v19.8H, v6.H[5] // ..*............................... - // mul v24.8H, v19.8H, v6.H[4] // .*................................ - // sqdmulh v22.8H, v3.8H, v7.H[1] // .......*.......................... - // mul v4.8H, v23.8H, v6.H[2] // ...*.............................. - // sqdmulh v28.8H, v25.8H, v7.H[1] // .........*........................ - // sqrdmulh v23.8H, v23.8H, v6.H[3] // ....*............................. - // srshr v22.8H, v22.8H, #11 // ...........*...................... - // srshr v28.8H, v28.8H, #11 // .............*.................... - // mls v3.8H, v22.8H, v7.H[0] // ..............*................... - // mls v25.8H, v28.8H, v7.H[0] // .................*................ - // mls v4.8H, v23.8H, v7.H[0] // ........*......................... - // mls v24.8H, v20.8H, v7.H[0] // ......*........................... - // sqdmulh v23.8H, v4.8H, v7.H[1] // ............*..................... - // sub v22.8H, v25.8H, v3.8H // ....................*............. - // add v27.8H, v25.8H, v3.8H // ..........................*....... 
- // sqrdmulh v3.8H, v22.8H, v6.H[1] // ......................*........... - // sqdmulh v19.8H, v24.8H, v7.H[1] // ..........*....................... - // srshr v23.8H, v23.8H, #11 // ...............*.................. - // str q27, [x1], #(64) // ..............................*... - // srshr v19.8H, v19.8H, #11 // ................*................. - // mls v4.8H, v23.8H, v7.H[0] // ..................*............... - // mul v27.8H, v22.8H, v6.H[0] // .......................*.......... - // mls v24.8H, v19.8H, v7.H[0] // ...................*.............. - // mls v27.8H, v3.8H, v7.H[0] // ...........................*...... - // sub v19.8H, v4.8H, v24.8H // .....................*............ - // str q27, [x1, #-32] // ................................*. - // mul v23.8H, v19.8H, v6.H[0] // ........................*......... - // sqrdmulh v22.8H, v19.8H, v6.H[1] // .........................*........ - // add v11.8H, v4.8H, v24.8H // ............................*..... - // mls v23.8H, v22.8H, v7.H[0] // .............................*.... - // str q11, [x1, #-48] // ...............................*.. - // str q23, [x1, #-16] // .................................* + // Instructions: 34 + // Expected cycles: 26 + // Expected IPC: 1.31 + // + // Cycle bound: 26.0 + // IPC bound: 1.31 + // + // Wall time: 0.41s + // User time: 0.41s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + sub v22.8H, v9.8H, v16.8H // .*................................ + add v13.8H, v9.8H, v16.8H // *................................. + // gap // .................................. + // gap // .................................. + mul v26.8H, v18.8H, v14.H[4] // ..*............................... + sqrdmulh v16.8H, v18.8H, v14.H[5] // ....*............................. + // gap // .................................. + // gap // .................................. + sqrdmulh v15.8H, v22.8H, v14.H[3] // .......*.......................... 
+ mul v5.8H, v22.8H, v14.H[2] // .....*............................ + // gap // .................................. + // gap // .................................. + sqdmulh v29.8H, v12.8H, v7.H[1] // ...*.............................. + // gap // .................................. + // gap // .................................. + sqdmulh v6.8H, v13.8H, v7.H[1] // ......*........................... + mls v26.8H, v16.8H, v7.H[0] // ........*......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v5.8H, v15.8H, v7.H[0] // .........*........................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v2.8H, v6.8H, #11 // ............*..................... + srshr v20.8H, v29.8H, #11 // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqdmulh v11.8H, v26.8H, v7.H[1] // ...........*...................... + sqdmulh v24.8H, v5.8H, v7.H[1] // ..........*....................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v12.8H, v20.8H, v7.H[0] // ...................*.............. + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v21.8H, v11.8H, #11 // ..............*................... + mls v13.8H, v2.8H, v7.H[0] // ................*................. + // gap // .................................. + // gap // .................................. + srshr v2.8H, v24.8H, #11 // .............*.................... 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v26.8H, v21.8H, v7.H[0] // .................*................ + sub v0.8H, v13.8H, v12.8H // .....................*............ + // gap // .................................. + // gap // .................................. + mls v5.8H, v2.8H, v7.H[0] // ..................*............... + add v15.8H, v13.8H, v12.8H // ............................*..... + // gap // .................................. + // gap // .................................. + sqrdmulh v18.8H, v0.8H, v14.H[1] // .........................*........ + mul v12.8H, v0.8H, v14.H[0] // ........................*......... + // gap // .................................. + // gap // .................................. + str q15, [x1], #(64) // ................................*. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sub v20.8H, v5.8H, v26.8H // ....................*............. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v12.8H, v18.8H, v7.H[0] // .............................*.... + add v22.8H, v5.8H, v26.8H // ..........................*....... + // gap // .................................. + // gap // .................................. + mul v17.8H, v20.8H, v14.H[0] // ......................*........... + sqrdmulh v26.8H, v20.8H, v14.H[1] // .......................*.......... + // gap // .................................. + // gap // .................................. + str q22, [x1, #-48] // ..............................*... + // gap // .................................. 
+ // gap // .................................. + // gap // .................................. + str q12, [x1, #-32] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v17.8H, v26.8H, v7.H[0] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + str q17, [x1, #-16] // ...............................*.. + // gap // .................................. + // gap // .................................. + // gap // .................................. + + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // add v15.8H, v9.8H, v16.8H // .*................................ + // sub v16.8H, v9.8H, v16.8H // *................................. + // mul v3.8H, v18.8H, v14.H[4] // ..*............................... + // sqdmulh v22.8H, v12.8H, v7.H[1] // ......*........................... + // sqrdmulh v6.8H, v18.8H, v14.H[5] // ...*.............................. + // mul v27.8H, v16.8H, v14.H[2] // .....*............................ + // sqdmulh v8.8H, v15.8H, v7.H[1] // .......*.......................... + // sqrdmulh v18.8H, v16.8H, v14.H[3] // ....*............................. + // mls v3.8H, v6.8H, v7.H[0] // ........*......................... + // mls v27.8H, v18.8H, v7.H[0] // .........*........................ + // sqdmulh v18.8H, v27.8H, v7.H[1] // .............*.................... 
+ // sqdmulh v6.8H, v3.8H, v7.H[1] // ............*..................... + // srshr v20.8H, v8.8H, #11 // ..........*....................... + // srshr v18.8H, v18.8H, #11 // .................*................ + // srshr v6.8H, v6.8H, #11 // ...............*.................. + // srshr v22.8H, v22.8H, #11 // ...........*...................... + // mls v15.8H, v20.8H, v7.H[0] // ................*................. + // mls v3.8H, v6.8H, v7.H[0] // ..................*............... + // mls v27.8H, v18.8H, v7.H[0] // ....................*............. + // mls v12.8H, v22.8H, v7.H[0] // ..............*................... + // sub v6.8H, v27.8H, v3.8H // .........................*........ + // sub v22.8H, v15.8H, v12.8H // ...................*.............. + // mul v18.8H, v6.8H, v14.H[0] // ............................*..... + // sqrdmulh v20.8H, v6.8H, v14.H[1] // .............................*.... + // mul v8.8H, v22.8H, v14.H[0] // .......................*.......... + // sqrdmulh v6.8H, v22.8H, v14.H[1] // ......................*........... + // add v27.8H, v27.8H, v3.8H // ...........................*...... + // mls v18.8H, v20.8H, v7.H[0] // ................................*. + // add v31.8H, v15.8H, v12.8H // .....................*............ + // mls v8.8H, v6.8H, v7.H[0] // ..........................*....... + // str q27, [x1, #16] // ..............................*... + // str q18, [x1, #48] // .................................* + // str q31, [x1], #(64) // ........................*......... + // str q8, [x1, #-32] // ...............................*.. // --------------------------------------------------------------------- @@ -914,526 +944,563 @@ layer4567_start: .p2align 2 - ldr q10, [x0, #256] // *.................................... - ldr q23, [x0, #320] // .....*............................... - // gap // ..................................... - // gap // ..................................... 
- ldr q21, [x0, #384] // ......*.............................. - ldr q3, [x0, #448] // ..*.................................. - // gap // ..................................... - // gap // ..................................... - ldr q6, [x0, #0] // ....*................................ - ldr q9, [x0, #64] // .*................................... - // gap // ..................................... - // gap // ..................................... - ldr q15, [x0, #192] // ...*................................. - ldr q25, [x0, #128] // .......*............................. - // gap // ..................................... - // gap // ..................................... - sub v17.8H, v10.8H, v23.8H // .........*........................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v22.8H, v21.8H, v3.8H // ............*........................ - add v19.8H, v21.8H, v3.8H // ..........................*.......... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v27.8H, v17.8H, v1.H[3] // .............*....................... - sub v13.8H, v6.8H, v9.8H // ........*............................ - // gap // ..................................... - // gap // ..................................... - sub v2.8H, v25.8H, v15.8H // ..............*...................... - mul v24.8H, v22.8H, v1.H[4] // ................*.................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v31.8H, v22.8H, v1.H[5] // ...............*..................... - mul v22.8H, v13.8H, v0.H[6] // ...........*......................... - mul v28.8H, v17.8H, v1.H[2] // .................*................... 
- sqrdmulh v3.8H, v2.8H, v1.H[1] // ...................*................. - // gap // ..................................... - // gap // ..................................... - mul v16.8H, v2.8H, v1.H[0] // ......................*.............. - sqrdmulh v13.8H, v13.8H, v0.H[7] // ..........*.......................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v24.8H, v31.8H, v7.H[0] // ....................*................ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v28.8H, v27.8H, v7.H[0] // .....................*............... - // gap // ..................................... - // gap // ..................................... - mls v22.8H, v13.8H, v7.H[0] // ..................*.................. - mls v16.8H, v3.8H, v7.H[0] // ........................*............ - // gap // ..................................... - // gap // ..................................... - add v23.8H, v10.8H, v23.8H // .............................*....... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - add v5.8H, v28.8H, v24.8H // .......................*............. - sub v12.8H, v28.8H, v24.8H // .........................*........... - // gap // ..................................... - // gap // ..................................... - add v31.8H, v22.8H, v16.8H // ...........................*......... - sub v20.8H, v23.8H, v19.8H // ................................*.... - // gap // ..................................... - // gap // ..................................... - add v13.8H, v23.8H, v19.8H // ..................................*.. - // gap // ..................................... - // gap // ..................................... 
- // gap // ..................................... - add v10.8H, v31.8H, v5.8H // ............................*........ - sub v27.8H, v31.8H, v5.8H // .................................*... - // gap // ..................................... - // gap // ..................................... - mul v14.8H, v12.8H, v0.H[4] // ....................................* - // gap // ..................................... - // gap // ..................................... - sqrdmulh v11.8H, v20.8H, v0.H[5] // ...................................*. - mul v4.8H, v10.8H, v29.8H // ..............................*...... - sqrdmulh v19.8H, v10.8H, v30.8H // ...............................*..... - // gap // ..................................... - // gap // ..................................... - - // original source code - // ldr q10, [x0, #256] // *.................................... - // ldr q9, [x0, #64] // .....*............................... - // ldr q17, [x0, #448] // ...*................................. - // ldr q15, [x0, #192] // ......*.............................. - // ldr q6, [x0, #0] // ....*................................ - // ldr q28, [x0, #320] // .*................................... - // ldr q2, [x0, #384] // ..*.................................. - // ldr q25, [x0, #128] // .......*............................. - // sub v22.8H, v6.8H, v9.8H // ............*........................ - // sub v4.8H, v10.8H, v28.8H // ........*............................ - // sqrdmulh v8.8H, v22.8H, v0.H[7] // ....................*................ - // mul v22.8H, v22.8H, v0.H[6] // ................*.................... - // sub v23.8H, v2.8H, v17.8H // .........*........................... - // sqrdmulh v12.8H, v4.8H, v1.H[3] // ...........*......................... - // sub v26.8H, v25.8H, v15.8H // .............*....................... - // sqrdmulh v24.8H, v23.8H, v1.H[5] // ...............*..................... - // mul v31.8H, v23.8H, v1.H[4] // ..............*...................... 
- // mul v21.8H, v4.8H, v1.H[2] // .................*................... - // mls v22.8H, v8.8H, v7.H[0] // .......................*............. - // sqrdmulh v8.8H, v26.8H, v1.H[1] // ..................*.................. - // mls v31.8H, v24.8H, v7.H[0] // .....................*............... - // mls v21.8H, v12.8H, v7.H[0] // ......................*.............. - // mul v16.8H, v26.8H, v1.H[0] // ...................*................. - // add v27.8H, v21.8H, v31.8H // ..........................*.......... - // mls v16.8H, v8.8H, v7.H[0] // ........................*............ - // sub v12.8H, v21.8H, v31.8H // ...........................*......... - // add v31.8H, v2.8H, v17.8H // ..........*.......................... - // add v21.8H, v22.8H, v16.8H // ............................*........ - // add v23.8H, v21.8H, v27.8H // ...............................*..... - // add v26.8H, v10.8H, v28.8H // .........................*........... - // mul v4.8H, v23.8H, v29.8H // ...................................*. - // sqrdmulh v19.8H, v23.8H, v30.8H // ....................................* - // sub v20.8H, v26.8H, v31.8H // .............................*....... - // sub v27.8H, v21.8H, v27.8H // ................................*.... - // add v13.8H, v26.8H, v31.8H // ..............................*...... - // sqrdmulh v11.8H, v20.8H, v0.H[5] // ..................................*.. - // mul v14.8H, v12.8H, v0.H[4] // .................................*... + // Instructions: 48 + // Expected cycles: 24 + // Expected IPC: 2.00 + // + // Cycle bound: 24.0 + // IPC bound: 2.00 + // + // Wall time: 0.95s + // User time: 0.95s + // + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + ldr q12, [x0, #128] // .*.............................................. + ldr q8, [x0, #192] // .....*.......................................... + // gap // ................................................ 
+ // gap // ................................................ + ldr q20, [x0, #448] // .............*.................................. + ldr q6, [x0, #384] // *............................................... + // gap // ................................................ + // gap // ................................................ + ldr q31, [x0, #256] // ......*......................................... + ldr q19, [x0, #0] // ..*............................................. + // gap // ................................................ + // gap // ................................................ + ldr q16, [x0, #64] // ...*............................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v3.8H, v12.8H, v8.8H // .........*...................................... + add v18.8H, v12.8H, v8.8H // .......*........................................ + ldr q9, [x0, #320] // ....*........................................... + // gap // ................................................ + add v22.8H, v6.8H, v20.8H // .........................*...................... + sub v4.8H, v6.8H, v20.8H // ................*............................... + // gap // ................................................ + // gap // ................................................ + mul v21.8H, v3.8H, v1.H[0] // ..........*..................................... + sqrdmulh v12.8H, v3.8H, v1.H[1] // ...........*.................................... + // gap // ................................................ + // gap // ................................................ + add v20.8H, v19.8H, v16.8H // ............*................................... + sub v8.8H, v19.8H, v16.8H // ........*....................................... + // gap // ................................................ + // gap // ................................................ 
+ add v6.8H, v31.8H, v9.8H // .......................*........................ + sub v9.8H, v31.8H, v9.8H // .................*.............................. + // gap // ................................................ + // gap // ................................................ + sub v19.8H, v20.8H, v18.8H // ...............*................................ + add v15.8H, v20.8H, v18.8H // ..............*................................. + // gap // ................................................ + // gap // ................................................ + mul v20.8H, v9.8H, v1.H[2] // ...................*............................ + sqrdmulh v18.8H, v4.8H, v1.H[5] // ..................*............................. + // gap // ................................................ + // gap // ................................................ + mul v16.8H, v8.8H, v0.H[6] // .....................*.......................... + sqrdmulh v8.8H, v8.8H, v0.H[7] // ....................*........................... + // gap // ................................................ + // gap // ................................................ + sub v3.8H, v6.8H, v22.8H // ............................*................... + mul v31.8H, v4.8H, v1.H[4] // ........................*....................... + // gap // ................................................ + // gap // ................................................ + sqrdmulh v9.8H, v9.8H, v1.H[3] // ......................*......................... + mls v21.8H, v12.8H, v7.H[0] // ..........................*..................... + // gap // ................................................ + // gap // ................................................ + add v6.8H, v6.8H, v22.8H // ...............................*................ + mls v16.8H, v8.8H, v7.H[0] // ...........................*.................... + // gap // ................................................ + // gap // ................................................ 
+ mls v31.8H, v18.8H, v7.H[0] // ..............................*................. + // gap // ................................................ + // gap // ................................................ + mul v26.8H, v3.8H, v0.H[4] // ..................................*............. + add v5.8H, v15.8H, v6.8H // ......................................*......... + mls v20.8H, v9.8H, v7.H[0] // .............................*.................. + // gap // ................................................ + // gap // ................................................ + add v18.8H, v16.8H, v21.8H // ................................*............... + sub v16.8H, v16.8H, v21.8H // .................................*.............. + // gap // ................................................ + // gap // ................................................ + sqrdmulh v12.8H, v5.8H, v30.8H // ...........................................*.... + sqrdmulh v23.8H, v3.8H, v0.H[5] // .......................................*........ + // gap // ................................................ + // gap // ................................................ + sub v27.8H, v15.8H, v6.8H // .........................................*...... + add v6.8H, v20.8H, v31.8H // ....................................*........... + // gap // ................................................ + // gap // ................................................ + mul v9.8H, v16.8H, v0.H[2] // ........................................*....... + sqrdmulh v22.8H, v19.8H, v0.H[3] // ...................................*............ + // gap // ................................................ + // gap // ................................................ + sub v21.8H, v20.8H, v31.8H // .....................................*.......... + // gap // ................................................ + // gap // ................................................ + sub v31.8H, v18.8H, v6.8H // ..........................................*..... 
+ sqrdmulh v11.8H, v16.8H, v0.H[3] // ............................................*... + add v4.8H, v18.8H, v6.8H // .............................................*.. + // gap // ................................................ + // gap // ................................................ + mul v6.8H, v31.8H, v0.H[0] // ...............................................* + mul v8.8H, v27.8H, v0.H[0] // ..............................................*. + // gap // ................................................ + // gap // ................................................ + + // ---------------- new position -----------------> + // 0 25 + // |------------------------|---------------------- + // ldr q13, [x0, #384] // ...*............................................ + // ldr q3, [x0, #128] // *............................................... + // ldr q10, [x0, #0] // .....*.......................................... + // ldr q2, [x0, #64] // ......*......................................... + // ldr q11, [x0, #320] // .........*...................................... + // ldr q20, [x0, #192] // .*.............................................. + // ldr q14, [x0, #256] // ....*........................................... + // add v6.8H, v3.8H, v20.8H // ........*....................................... + // sub v28.8H, v10.8H, v2.8H // ...............*................................ + // sub v18.8H, v3.8H, v20.8H // .......*........................................ + // mul v24.8H, v18.8H, v1.H[0] // ............*................................... + // sqrdmulh v22.8H, v18.8H, v1.H[1] // .............*.................................. + // add v15.8H, v10.8H, v2.8H // ..............*................................. + // ldr q31, [x0, #448] // ..*............................................. + // add v8.8H, v15.8H, v6.8H // ...................*............................ + // sub v19.8H, v15.8H, v6.8H // ..................*............................. 
+ // sub v25.8H, v13.8H, v31.8H // ...........*.................................... + // sub v5.8H, v14.8H, v11.8H // .................*.............................. + // sqrdmulh v6.8H, v25.8H, v1.H[5] // .....................*.......................... + // mul v17.8H, v5.8H, v1.H[2] // ....................*........................... + // sqrdmulh v15.8H, v28.8H, v0.H[7] // .......................*........................ + // mul v18.8H, v28.8H, v0.H[6] // ......................*......................... + // sqrdmulh v9.8H, v5.8H, v1.H[3] // ..........................*..................... + // add v16.8H, v14.8H, v11.8H // ................*............................... + // mul v21.8H, v25.8H, v1.H[4] // .........................*...................... + // add v12.8H, v13.8H, v31.8H // ..........*..................................... + // mls v24.8H, v22.8H, v7.H[0] // ...........................*.................... + // mls v18.8H, v15.8H, v7.H[0] // .............................*.................. + // sub v20.8H, v16.8H, v12.8H // ........................*....................... + // mls v17.8H, v9.8H, v7.H[0] // .................................*.............. + // mls v21.8H, v6.8H, v7.H[0] // ..............................*................. + // add v16.8H, v16.8H, v12.8H // ............................*................... + // add v6.8H, v18.8H, v24.8H // ..................................*............. + // sub v18.8H, v18.8H, v24.8H // ...................................*............ + // mul v26.8H, v20.8H, v0.H[4] // ...............................*................ + // sqrdmulh v22.8H, v19.8H, v0.H[3] // .........................................*...... + // add v15.8H, v17.8H, v21.8H // .......................................*........ + // sub v21.8H, v17.8H, v21.8H // ..........................................*..... + // add v5.8H, v8.8H, v16.8H // ................................*............... 
+ // sqrdmulh v23.8H, v20.8H, v0.H[5] // .....................................*.......... + // mul v9.8H, v18.8H, v0.H[2] // ........................................*....... + // sub v27.8H, v8.8H, v16.8H // ......................................*......... + // sub v31.8H, v6.8H, v15.8H // ...........................................*.... + // sqrdmulh v12.8H, v5.8H, v30.8H // ....................................*........... + // sqrdmulh v11.8H, v18.8H, v0.H[3] // ............................................*... + // add v4.8H, v6.8H, v15.8H // .............................................*.. + // mul v8.8H, v27.8H, v0.H[0] // ...............................................* + // mul v6.8H, v31.8H, v0.H[0] // ..............................................*. sub count, count, #1 layer123_start: - ldr q10, [x0, #272] // ....e................................................................................... - sub v3.8H, v22.8H, v16.8H // .................................*...................................................... - add v22.8H, v6.8H, v9.8H // .........*.............................................................................. - ldr q9, [x0, #80] // .e...................................................................................... - ldr q17, [x0, #464] // .......e................................................................................ - add v18.8H, v25.8H, v15.8H // ..............*......................................................................... - ldr q15, [x0, #208] // ...e.................................................................................... - sqrdmulh v5.8H, v12.8H, v0.H[5] // ..............................................*......................................... - ldr q6, [x0, #16] // e....................................................................................... - mls v4.8H, v19.8H, v7.H[0] // .............................................................................*.......... 
- sqrdmulh v19.8H, v27.8H, v0.H[1] // ........................................................*............................... - ldr q28, [x0, #336] // .....e.................................................................................. - add v16.8H, v22.8H, v18.8H // .............................*.......................................................... - mul v23.8H, v27.8H, v0.H[0] // .......................................................*................................ + // Instructions: 88 + // Expected cycles: 36 + // Expected IPC: 2.44 + // + // Cycle bound: 36.0 + // IPC bound: 2.44 + // + // Wall time: 127.98s + // User time: 127.98s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + mul v25.8H, v4.8H, v29.8H // ............................................................................*........... + sqrdmulh v15.8H, v31.8H, v0.H[1] // .......................................................*................................ + ldr q13, [x0, #400] // ......e................................................................................. + ldr q3, [x0, #144] // ..e..................................................................................... + mul v31.8H, v19.8H, v0.H[2] // ...............................*........................................................ + sqrdmulh v19.8H, v27.8H, v0.H[1] // ..................................................*..................................... + ldr q10, [x0, #16] // e....................................................................................... + ldr q2, [x0, #80] // .e...................................................................................... + mls v9.8H, v11.8H, v7.H[0] // .....................................*.................................................. 
+ sqrdmulh v4.8H, v4.8H, v30.8H // ...........................................................................*............ + ldr q11, [x0, #336] // .....e.................................................................................. + ldr q20, [x0, #208] // ...e.................................................................................... + mls v6.8H, v15.8H, v7.H[0] // .........................................................*.............................. + mls v26.8H, v23.8H, v7.H[0] // ..........................................*............................................. + ldr q14, [x0, #272] // ....e................................................................................... // gap // ........................................................................................ + mls v31.8H, v22.8H, v7.H[0] // ................................*....................................................... + sqrdmulh v22.8H, v21.8H, v0.H[5] // .............................................*.......................................... // gap // ........................................................................................ - mls v14.8H, v5.8H, v7.H[0] // ...............................................*........................................ - sub v27.8H, v22.8H, v18.8H // ............................*........................................................... // gap // ........................................................................................ - ldr q2, [x0, #400] // ......e................................................................................. - mul v5.8H, v20.8H, v0.H[4] // ........................................*............................................... - ldr q25, [x0, #144] // ..e..................................................................................... - mul v18.8H, v3.8H, v0.H[2] // ...................................*.................................................... 
- str q4, [x0, #64] // .....................................................................................*.. - sqrdmulh v3.8H, v3.8H, v0.H[3] // ....................................*................................................... + mls v25.8H, v4.8H, v7.H[0] // .............................................................................*.......... + mls v8.8H, v19.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ // gap // ........................................................................................ - sub v22.8H, v6.8H, v9.8H // ........e............................................................................... - sub v4.8H, v10.8H, v28.8H // ..................e..................................................................... - mls v23.8H, v19.8H, v7.H[0] // .........................................................*.............................. + str q6, [x0, #320] // .....................................................................*.................. + add v6.8H, v3.8H, v20.8H // ..............e......................................................................... + sub v28.8H, v10.8H, v2.8H // ........e............................................................................... // gap // ........................................................................................ + add v16.8H, v31.8H, v26.8H // ...........................................................*............................ + sub v18.8H, v3.8H, v20.8H // .............e.......................................................................... // gap // ........................................................................................ - mls v5.8H, v11.8H, v7.H[0] // ..........................................*............................................. 
- sqrdmulh v8.8H, v22.8H, v0.H[7] // ...........e............................................................................ // gap // ........................................................................................ + str q8, [x0, #256] // ....................................................................*................... + sub v24.8H, v31.8H, v26.8H // ..........................................................*............................. // gap // ........................................................................................ - mls v18.8H, v3.8H, v7.H[0] // .....................................*.................................................. + mul v31.8H, v21.8H, v0.H[4] // ..............................................*......................................... + sqrdmulh v19.8H, v16.8H, v30.8H // ..............................................................................*......... + mul v16.8H, v16.8H, v29.8H // ...............................................................................*........ + str q25, [x0, #64] // .....................................................................................*.. // gap // ........................................................................................ + sqrdmulh v26.8H, v24.8H, v0.H[1] // ............................................................*........................... + mul v23.8H, v24.8H, v0.H[0] // .............................................................*.......................... // gap // ........................................................................................ - sqrdmulh v20.8H, v27.8H, v0.H[3] // ...............................*........................................................ - str q23, [x0, #320] // .....................................................................*.................. - sub v19.8H, v16.8H, v13.8H // ................................................*....................................... 
- mul v22.8H, v22.8H, v0.H[6] // ..........e............................................................................. // gap // ........................................................................................ - sub v23.8H, v2.8H, v17.8H // .......................e................................................................ + mul v24.8H, v18.8H, v1.H[0] // ................e....................................................................... + mls v31.8H, v22.8H, v7.H[0] // ...............................................*........................................ // gap // ........................................................................................ - mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... // gap // ........................................................................................ - sqrdmulh v12.8H, v4.8H, v1.H[3] // .....................e.................................................................. + mls v16.8H, v19.8H, v7.H[0] // ................................................................................*....... + sqrdmulh v22.8H, v18.8H, v1.H[1] // ...............e........................................................................ // gap // ........................................................................................ // gap // ........................................................................................ - sub v26.8H, v25.8H, v15.8H // .............e.......................................................................... - sqrdmulh v24.8H, v23.8H, v1.H[5] // ..........................e............................................................. - mul v31.8H, v23.8H, v1.H[4] // .........................e.............................................................. + mls v23.8H, v26.8H, v7.H[0] // ..............................................................*......................... 
+ add v15.8H, v10.8H, v2.8H // .........e.............................................................................. // gap // ........................................................................................ // gap // ........................................................................................ - mls v3.8H, v20.8H, v7.H[0] // ................................*....................................................... - mul v21.8H, v4.8H, v1.H[2] // ....................e................................................................... + sub v3.8H, v9.8H, v31.8H // ...............................................................*........................ // gap // ........................................................................................ + add v17.8H, v9.8H, v31.8H // ................................................................*....................... + ldr q31, [x0, #464] // .......e................................................................................ + str q16, [x0, #128] // ......................................................................................*. + add v8.8H, v15.8H, v6.8H // .............................e.......................................................... + sub v19.8H, v15.8H, v6.8H // ............................e........................................................... // gap // ........................................................................................ - add v11.8H, v16.8H, v13.8H // .................................................*...................................... - mls v22.8H, v8.8H, v7.H[0] // ............e........................................................................... + str q23, [x0, #384] // ......................................................................*................. + mul v18.8H, v17.8H, v29.8H // ..................................................................................*..... 
+ sqrdmulh v6.8H, v17.8H, v30.8H // .................................................................................*...... // gap // ........................................................................................ + sqrdmulh v23.8H, v3.8H, v0.H[1] // .................................................................*...................... + mul v9.8H, v5.8H, v29.8H // .........................................................................*.............. // gap // ........................................................................................ // gap // ........................................................................................ - sqrdmulh v8.8H, v26.8H, v1.H[1] // ................e....................................................................... + sub v25.8H, v13.8H, v31.8H // .......................e................................................................ + sub v5.8H, v14.8H, v11.8H // ..................e..................................................................... // gap // ........................................................................................ - mls v31.8H, v24.8H, v7.H[0] // ...........................e............................................................ - mls v21.8H, v12.8H, v7.H[0] // ......................e................................................................. - mul v16.8H, v26.8H, v1.H[0] // ...............e........................................................................ // gap // ........................................................................................ + mls v18.8H, v6.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - sqrdmulh v23.8H, v19.8H, v0.H[1] // ...................................................*.................................... 
+ mul v20.8H, v3.8H, v0.H[0] // ..................................................................*..................... // gap // ........................................................................................ + sqrdmulh v6.8H, v25.8H, v1.H[5] // .........................e.............................................................. // gap // ........................................................................................ - mul v4.8H, v19.8H, v0.H[0] // ..................................................*..................................... // gap // ........................................................................................ - sub v19.8H, v3.8H, v5.8H // ..........................................................*............................. - add v3.8H, v3.8H, v5.8H // ...........................................................*............................ + mls v9.8H, v12.8H, v7.H[0] // ..........................................................................*............. + mul v17.8H, v5.8H, v1.H[2] // .....................e.................................................................. + sqrdmulh v15.8H, v28.8H, v0.H[7] // ..........e............................................................................. // gap // ........................................................................................ - add v27.8H, v21.8H, v31.8H // ............................................e........................................... // gap // ........................................................................................ + str q18, [x0, #192] // .......................................................................................* + mls v20.8H, v23.8H, v7.H[0] // ...................................................................*.................... // gap // ........................................................................................ 
- mls v16.8H, v8.8H, v7.H[0] // .................e...................................................................... - sub v24.8H, v18.8H, v14.8H // ...............................................................*........................ + mul v18.8H, v28.8H, v0.H[6] // ...........e............................................................................ + str q9, [x0], #(16) // ....................................................................................*... + sqrdmulh v9.8H, v5.8H, v1.H[3] // ....................e................................................................... + add v16.8H, v14.8H, v11.8H // ...................e.................................................................... // gap // ........................................................................................ + mul v21.8H, v25.8H, v1.H[4] // ..........................e............................................................. + add v12.8H, v13.8H, v31.8H // ........................e............................................................... // gap // ........................................................................................ - mul v8.8H, v19.8H, v0.H[0] // ............................................................*........................... - add v5.8H, v18.8H, v14.8H // ................................................................*....................... // gap // ........................................................................................ - sub v12.8H, v21.8H, v31.8H // ...........................................e............................................ + str q20, [x0, #432] // .......................................................................*................ + mls v24.8H, v22.8H, v7.H[0] // .................e...................................................................... // gap // ........................................................................................ 
- add v31.8H, v2.8H, v17.8H // ........................e............................................................... + mls v18.8H, v15.8H, v7.H[0] // ............e........................................................................... + sub v20.8H, v16.8H, v12.8H // ......................................e................................................. + mls v17.8H, v9.8H, v7.H[0] // ......................e................................................................. // gap // ........................................................................................ - mls v4.8H, v23.8H, v7.H[0] // ....................................................*................................... // gap // ........................................................................................ - sqrdmulh v17.8H, v5.8H, v30.8H // ..................................................................................*..... + mls v21.8H, v6.8H, v7.H[0] // ...........................e............................................................ // gap // ........................................................................................ // gap // ........................................................................................ - mul v2.8H, v5.8H, v29.8H // .................................................................................*...... - add v21.8H, v22.8H, v16.8H // ..................................e..................................................... + add v16.8H, v16.8H, v12.8H // .......................................e................................................ + add v6.8H, v18.8H, v24.8H // ..................................e..................................................... + sub v18.8H, v18.8H, v24.8H // .................................e...................................................... // gap // ........................................................................................ 
// gap // ........................................................................................ - mul v14.8H, v3.8H, v29.8H // ..............................................................................*......... - sqrdmulh v5.8H, v19.8H, v0.H[1] // .............................................................*.......................... - str q4, [x0, #256] // ....................................................................*................... - sqrdmulh v20.8H, v24.8H, v0.H[1] // ..................................................................*..................... + mul v26.8H, v20.8H, v0.H[4] // .........................................e.............................................. + sqrdmulh v22.8H, v19.8H, v0.H[3] // ..............................e......................................................... // gap // ........................................................................................ - mul v13.8H, v24.8H, v0.H[0] // .................................................................*...................... - mul v24.8H, v11.8H, v29.8H // ........................................................................*............... // gap // ........................................................................................ + add v15.8H, v17.8H, v21.8H // ............................................e........................................... + sub v21.8H, v17.8H, v21.8H // ...........................................e............................................ // gap // ........................................................................................ - sqrdmulh v19.8H, v11.8H, v30.8H // .........................................................................*.............. // gap // ........................................................................................ + add v5.8H, v8.8H, v16.8H // .................................................e...................................... 
+ sqrdmulh v23.8H, v20.8H, v0.H[5] // ........................................e............................................... // gap // ........................................................................................ - mls v2.8H, v17.8H, v7.H[0] // ...................................................................................*.... // gap // ........................................................................................ - sqrdmulh v3.8H, v3.8H, v30.8H // ...............................................................................*........ - mls v8.8H, v5.8H, v7.H[0] // ..............................................................*......................... + mul v9.8H, v18.8H, v0.H[2] // ....................................e................................................... + sub v27.8H, v8.8H, v16.8H // ................................................e....................................... // gap // ........................................................................................ - add v23.8H, v21.8H, v27.8H // ......................................................e................................. // gap // ........................................................................................ - mls v13.8H, v20.8H, v7.H[0] // ...................................................................*.................... + sub v31.8H, v6.8H, v15.8H // .....................................................e.................................. // gap // ........................................................................................ - mls v24.8H, v19.8H, v7.H[0] // ..........................................................................*............. - str q2, [x0, #192] // .......................................................................................* - add v26.8H, v10.8H, v28.8H // ...................e.................................................................... 
// gap // ........................................................................................ - str q8, [x0, #384] // ......................................................................*................. - mul v4.8H, v23.8H, v29.8H // ...........................................................................e............ - mls v14.8H, v3.8H, v7.H[0] // ................................................................................*....... + sqrdmulh v12.8H, v5.8H, v30.8H // ........................................................................e............... + sqrdmulh v11.8H, v18.8H, v0.H[3] // ...................................e.................................................... + add v4.8H, v6.8H, v15.8H // ......................................................e................................. // gap // ........................................................................................ - sqrdmulh v19.8H, v23.8H, v30.8H // ............................................................................e........... - sub v20.8H, v26.8H, v31.8H // ......................................e................................................. // gap // ........................................................................................ - str q13, [x0, #448] // .......................................................................*................ - sub v27.8H, v21.8H, v27.8H // .....................................................e.................................. - add v13.8H, v26.8H, v31.8H // .......................................e................................................ - str q24, [x0], #(16) // ....................................................................................*... + mul v8.8H, v27.8H, v0.H[0] // ...................................................e.................................... + mul v6.8H, v31.8H, v0.H[0] // ........................................................e............................... 
// gap // ........................................................................................ - sqrdmulh v11.8H, v20.8H, v0.H[5] // .........................................e.............................................. - str q14, [x0, #112] // ......................................................................................*. - mul v14.8H, v12.8H, v0.H[4] // .............................................e.......................................... // gap // ........................................................................................ - // original source code - // ldr q8, [x0, #0] // ........e...............................................................................|.......e.............................................................................. - // ldr q9, [x0, #(1*(512/8))] // ...e....................................................................................|..e................................................................................... - // ldr q10, [x0, #(2*(512/8))] // ..................e.....................................................................|.................e.................................................................... - // ldr q11, [x0, #(3*(512/8))] // ......e.................................................................................|.....e................................................................................ - // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... - // ldr q13, [x0, #(5*(512/8))] // ...........e............................................................................|..........e........................................................................... 
- // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... - // ldr q15, [x0, #(7*(512/8))] // ....e...................................................................................|...e.................................................................................. - // sub v24.8h, v8.8h, v9.8h // ......................e.................................................................|.....................e................................................................ - // add v8.8h, v8.8h, v9.8h // ..*.....................................................................................|.*.................................................................................... - // mul v9.8h, v24.8h, v0.h[6] // ...............................e........................................................|..............................e....................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[7] // ..........................e.............................................................|.........................e............................................................ - // mls v9.8h, v24.8h, v7.h[0] // .........................................e..............................................|........................................e............................................. - // sub v24.8h, v10.8h, v11.8h // ...................................e....................................................|..................................e................................................... - // add v10.8h, v10.8h, v11.8h // .....*..................................................................................|....*................................................................................. 
- // mul v11.8h, v24.8h, v1.h[0] // .............................................e..........................................|............................................e......................................... - // sqrdmulh v24.8h, v24.8h, v1.h[1] // ..........................................e.............................................|.........................................e............................................ - // mls v11.8h, v24.8h, v7.h[0] // ...................................................e....................................|..................................................e................................... - // sub v24.8h, v12.8h, v13.8h // .......................e................................................................|......................e............................................................... - // add v12.8h, v12.8h, v13.8h // ...........................................................................e............|..........................................................................e........... - // mul v13.8h, v24.8h, v1.h[2] // .......................................e................................................|......................................e............................................... - // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................................e.....................................................|.................................e.................................................... - // mls v13.8h, v24.8h, v7.h[0] // ............................................e...........................................|...........................................e.......................................... - // sub v24.8h, v14.8h, v15.8h // ................................e.......................................................|...............................e...................................................... 
- // add v14.8h, v14.8h, v15.8h // ........................................................e...............................|.......................................................e.............................. - // mul v15.8h, v24.8h, v1.h[4] // .....................................e..................................................|....................................e................................................. - // sqrdmulh v24.8h, v24.8h, v1.h[5] // ....................................e...................................................|...................................e.................................................. - // mls v15.8h, v24.8h, v7.h[0] // ...........................................e............................................|..........................................e........................................... - // sub v24.8h, v8.8h, v10.8h // ...............*........................................................................|..............*....................................................................... - // add v8.8h, v8.8h, v10.8h // ............*...........................................................................|...........*.......................................................................... - // mul v10.8h, v24.8h, v0.h[2] // .................................*......................................................|................................*..................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................*...........................................................|...........................*.......................................................... - // mls v10.8h, v24.8h, v7.h[0] // ......................................*.................................................|.....................................*................................................ 
- // sub v24.8h, v9.8h, v11.8h // .*......................................................................................|*..................................................................................... - // add v9.8h, v9.8h, v11.8h // ............................................................e...........................|...........................................................e.......................... - // mul v11.8h, v24.8h, v0.h[2] // ...................*....................................................................|..................*................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[3] // .....................*..................................................................|....................*................................................................. - // mls v11.8h, v24.8h, v7.h[0] // ...........................*............................................................|..........................*........................................................... - // sub v24.8h, v12.8h, v14.8h // ................................................................................e.......|...............................................................................e...... - // add v12.8h, v12.8h, v14.8h // ...................................................................................e....|..................................................................................e... - // mul v14.8h, v24.8h, v0.h[4] // .................*......................................................................|................*..................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................................................................e..|....................................................................................e. 
- // mls v14.8h, v24.8h, v7.h[0] // .........................*..............................................................|........................*............................................................. - // sub v24.8h, v13.8h, v15.8h // .......................................................e................................|......................................................e............................... - // add v13.8h, v13.8h, v15.8h // ..................................................e.....................................|.................................................e.................................... - // mul v15.8h, v24.8h, v0.h[4] // .......................................................................................e|...................................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......*................................................................................|......*............................................................................... - // mls v15.8h, v24.8h, v7.h[0] // ..............*.........................................................................|.............*........................................................................ - // sub v24.8h, v8.8h, v12.8h // ..............................*.........................................................|.............................*........................................................ - // add v8.8h, v8.8h, v12.8h // ........................................*...............................................|.......................................*.............................................. - // mul v12.8h, v24.8h, v0.h[0] // ...............................................*........................................|..............................................*....................................... 
- // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................................|.............................................*........................................ - // mls v12.8h, v24.8h, v7.h[0] // .........................................................*..............................|........................................................*............................. - // sub v24.8h, v9.8h, v13.8h // ..................................................................................e.....|.................................................................................e.... - // add v9.8h, v9.8h, v13.8h // .......................................................................e................|......................................................................e............... - // mul v13.8h, v24.8h, v0.h[0] // .............*..........................................................................|............*......................................................................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*.............................................................................|.........*............................................................................ - // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. - // sub v24.8h, v10.8h, v14.8h // ................................................*.......................................|...............................................*...................................... - // add v10.8h, v10.8h, v14.8h // .................................................*......................................|................................................*..................................... 
- // mul v14.8h, v24.8h, v0.h[0] // .....................................................*..................................|....................................................*................................. - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.........................|.............................................................*........................ - // mls v14.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ - // sub v24.8h, v11.8h, v15.8h // ....................................................*...................................|...................................................*.................................. - // add v11.8h, v11.8h, v15.8h // ......................................................*.................................|.....................................................*................................ - // mul v15.8h, v24.8h, v0.h[0] // .................................................................*......................|................................................................*..................... - // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................................*.......................|...............................................................*...................... - // mls v15.8h, v24.8h, v7.h[0] // ........................................................................*...............|.......................................................................*.............. - // str q12, [x0, #(4*(512/8))] // ...............................................................*........................|..............................................................*....................... 
- // str q13, [x0, #(5*(512/8))] // .............................*..........................................................|............................*......................................................... - // str q14, [x0, #(6*(512/8))] // ............................................................................*...........|...........................................................................*.......... - // str q15, [x0, #(7*(512/8))] // .................................................................................*......|................................................................................*..... - // mul v12.8h, v8.8h, v29.8h // ..................................................................*.....................|.................................................................*.................... - // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................*....................|..................................................................*................... - // mls v12.8h, v8.8h, v7.h[0] // .........................................................................*..............|........................................................................*............. - // mul v13.8h, v9.8h, v29.8h // .............................................................................e..........|............................................................................e......... - // sqrdmulh v9.8h, v9.8h, v30.8h // ...............................................................................e........|..............................................................................e....... - // mls v13.8h, v9.8h, v7.h[0] // .........*..............................................................................|........*............................................................................. 
- // mul v14.8h, v10.8h, v29.8h // .............................................................*..........................|............................................................*......................... - // sqrdmulh v10.8h, v10.8h, v30.8h // .....................................................................*..................|....................................................................*................. - // mls v14.8h, v10.8h, v7.h[0] // ..............................................................................*.........|.............................................................................*........ - // mul v15.8h, v11.8h, v29.8h // ...........................................................*............................|..........................................................*........................... - // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*.............................|.........................................................*............................ - // mls v15.8h, v11.8h, v7.h[0] // ....................................................................*...................|...................................................................*.................. - // str q12, [x0], #(16) // ....................................................................................*...|...................................................................................*.. - // str q13, [x0, #(-16 + 1*(512/8))] // ....................*...................................................................|...................*.................................................................. 
- // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................................*.|.....................................................................................* - // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*.............|.........................................................................*............ + // -------------------------------------------------------------------- new position ---------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q8, [x0, #0] // ....e.................................................................................'.....~........................................................... + // ldr q9, [x0, #(1*(512/8))] // .....e................................................................................'......~.......................................................... + // ldr q10, [x0, #(2*(512/8))] // .e....................................................................................'..~.............................................................. + // ldr q11, [x0, #(3*(512/8))] // .........e............................................................................'..........~...................................................... + // ldr q12, [x0, #(4*(512/8))] // ............e.........................................................................'.............~................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e.............................................................................'.........~....................................................... 
+ // ldr q14, [x0, #(6*(512/8))] // e.....................................................................................'.~............................................................... + // ldr q15, [x0, #(7*(512/8))] // ......................................e...............................................'.......................................~......................... + // sub v24.8h, v8.8h, v9.8h // ...................e..................................................................'....................~............................................ + // add v8.8h, v8.8h, v9.8h // ...................................e..................................................'....................................~............................ + // sqrdmulh v27.8h, v24.8h, v0.h[7] // ......................................................e...............................'.......................................................~......... + // mul v9.8h, v24.8h, v0.h[6] // .........................................................e............................'..........................................................~...... + // mls v9.8h, v27.8h, v7.h[0] // .................................................................e....................'................................................................. + // sub v24.8h, v10.8h, v11.8h // .....................e................................................................'......................~.......................................... + // add v10.8h, v10.8h, v11.8h // ..................e...................................................................'...................~............................................. + // sqrdmulh v27.8h, v24.8h, v1.h[1] // .................................e....................................................'..................................~.............................. 
+ // mul v11.8h, v24.8h, v1.h[0] // ..............................e.......................................................'...............................~................................. + // mls v11.8h, v27.8h, v7.h[0] // ................................................................e.....................'................................................................. + // sub v24.8h, v12.8h, v13.8h // ................................................e.....................................'.................................................~............... + // add v12.8h, v12.8h, v13.8h // ............................................................e.........................'.............................................................~... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // ...........................................................e..........................'............................................................~.... + // mul v13.8h, v24.8h, v1.h[2] // .....................................................e................................'......................................................~.......... + // mls v13.8h, v27.8h, v7.h[0] // ...................................................................e..................'................................................................. + // sub v24.8h, v14.8h, v15.8h // ...............................................e......................................'................................................~................ + // add v14.8h, v14.8h, v15.8h // ..............................................................e.......................'...............................................................~. + // sqrdmulh v27.8h, v24.8h, v1.h[5] // ...................................................e..................................'....................................................~............ 
+ // mul v15.8h, v24.8h, v1.h[4] // .............................................................e........................'..............................................................~.. + // mls v15.8h, v27.8h, v7.h[0] // ....................................................................e.................'................................................................. + // sub v24.8h, v8.8h, v10.8h // .........................................e............................................'..........................................~...................... + // add v8.8h, v8.8h, v10.8h // ........................................e.............................................'.........................................~....................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .........................................................................e............'................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ..~...................................................................................'...*............................................................. + // mls v10.8h, v27.8h, v7.h[0] // .............~........................................................................'..............*.................................................. + // sub v24.8h, v9.8h, v11.8h // .......................................................................e..............'................................................................. + // add v9.8h, v9.8h, v11.8h // ......................................................................e...............'................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ..................................................................................e...'................................................................. 
+ // mul v11.8h, v24.8h, v0.h[2] // ..............................................................................e.......'................................................................. + // mls v11.8h, v27.8h, v7.h[0] // ......~...............................................................................'.......*......................................................... + // sub v24.8h, v12.8h, v14.8h // ..................................................................e...................'................................................................. + // add v12.8h, v12.8h, v14.8h // .....................................................................e................'................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // .............................................................................e........'................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ........................................................................e.............'................................................................. + // mls v14.8h, v27.8h, v7.h[0] // ...........~..........................................................................'............*.................................................... + // sub v24.8h, v13.8h, v15.8h // ...........................................................................e..........'................................................................. + // add v13.8h, v13.8h, v15.8h // ..........................................................................e...........'................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ..............~.......................................................................'...............*................................................. 
+ // mul v15.8h, v24.8h, v0.h[4] // ........................~.............................................................'.........................*....................................... + // mls v15.8h, v27.8h, v7.h[0] // ...............................~......................................................'................................*................................ + // sub v24.8h, v8.8h, v12.8h // ...............................................................................e......'................................................................. + // add v8.8h, v8.8h, v12.8h // ............................................................................e.........'................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...~..................................................................................'....*............................................................ + // mul v12.8h, v24.8h, v0.h[0] // ....................................................................................e.'................................................................. + // mls v12.8h, v27.8h, v7.h[0] // ................~.....................................................................'.................*............................................... + // sub v24.8h, v9.8h, v13.8h // ................................................................................e.....'................................................................. + // add v9.8h, v9.8h, v13.8h // ...................................................................................e..'................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ......................................................................................'*................................................................ 
+ // mul v13.8h, v24.8h, v0.h[0] // .....................................................................................e'................................................................. + // mls v13.8h, v27.8h, v7.h[0] // ..........~...........................................................................'...........*..................................................... + // sub v24.8h, v10.8h, v14.8h // .......................~..............................................................'........................*........................................ + // add v10.8h, v10.8h, v14.8h // ....................~.................................................................'.....................*........................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ............................~.........................................................'.............................*................................... + // mul v14.8h, v24.8h, v0.h[0] // .............................~........................................................'..............................*.................................. + // mls v14.8h, v27.8h, v7.h[0] // ..................................~...................................................'...................................*............................. + // sub v24.8h, v11.8h, v15.8h // ....................................~.................................................'.....................................*........................... + // add v11.8h, v11.8h, v15.8h // .....................................~................................................'......................................*.......................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .............................................~........................................'..............................................*.................. 
+ // mul v15.8h, v24.8h, v0.h[0] // ..................................................~...................................'...................................................*............. + // mls v15.8h, v27.8h, v7.h[0] // ........................................................~.............................'.........................................................*....... + // str q12, [x0, #(4*(512/8))] // ......................~...............................................................'.......................*......................................... + // str q13, [x0, #(5*(512/8))] // .................~....................................................................'..................*.............................................. + // str q14, [x0, #(6*(512/8))] // ..........................................~...........................................'...........................................*..................... + // str q15, [x0, #(7*(512/8))] // ...............................................................~......................'................................................................* + // sqrdmulh v27.8h, v8.8h, v30.8h // .................................................................................e....'................................................................. + // mul v8.8h, v8.8h, v29.8h // ..............................................~.......................................'...............................................*................. + // mls v8.8h, v27.8h, v7.h[0] // ....................................................~.................................'.....................................................*........... + // sqrdmulh v27.8h, v9.8h, v30.8h // .......~..............................................................................'........*........................................................ 
+ // mul v9.8h, v9.8h, v29.8h // ......................................................................................*................................................................. + // mls v9.8h, v27.8h, v7.h[0] // ...............~......................................................................'................*................................................ + // sqrdmulh v27.8h, v10.8h, v30.8h // .........................~............................................................'..........................*...................................... + // mul v10.8h, v10.8h, v29.8h // ..........................~...........................................................'...........................*..................................... + // mls v10.8h, v27.8h, v7.h[0] // ................................~.....................................................'.................................*............................... + // sqrdmulh v27.8h, v11.8h, v30.8h // ............................................~.........................................'.............................................*................... + // mul v11.8h, v11.8h, v29.8h // ...........................................~..........................................'............................................*.................... + // mls v11.8h, v27.8h, v7.h[0] // .................................................~....................................'..................................................*.............. + // str q8, [x0], #(16) // ..........................................................~...........................'...........................................................*..... + // str q9, [x0, #(-16 + 1*(512/8))] // ...........................~..........................................................'............................*.................................... 
+ // str q10, [x0, #(-16 + 2*(512/8))] // .......................................~..............................................'........................................*........................ + // str q11, [x0, #(-16 + 3*(512/8))] // .......................................................~..............................'........................................................*........ sub count, count, #1 cbnz count, layer123_start - sub v23.8H, v22.8H, v16.8H // *.................................................. - add v22.8H, v6.8H, v9.8H // .*................................................. - // gap // ................................................... - // gap // ................................................... - sqrdmulh v24.8H, v12.8H, v0.H[5] // ...*............................................... - mul v3.8H, v20.8H, v0.H[4] // ..........*........................................ - // gap // ................................................... - // gap // ................................................... - add v26.8H, v25.8H, v15.8H // ..*................................................ - sqrdmulh v20.8H, v27.8H, v0.H[1] // .....*............................................. - // gap // ................................................... - // gap // ................................................... - mul v28.8H, v27.8H, v0.H[0] // .......*........................................... - mls v4.8H, v19.8H, v7.H[0] // ....*.............................................. - // gap // ................................................... - // gap // ................................................... - mls v3.8H, v11.8H, v7.H[0] // ...............*................................... - mls v14.8H, v24.8H, v7.H[0] // ........*.......................................... - // gap // ................................................... - // gap // ................................................... 
- add v19.8H, v22.8H, v26.8H // ......*............................................ - sqrdmulh v27.8H, v23.8H, v0.H[3] // .............*..................................... - // gap // ................................................... - // gap // ................................................... - mul v23.8H, v23.8H, v0.H[2] // ...........*....................................... - sub v22.8H, v22.8H, v26.8H // .........*......................................... - str q4, [x0, #64] // ............*...................................... - // gap // ................................................... - sub v24.8H, v19.8H, v13.8H // ...................*............................... - add v19.8H, v19.8H, v13.8H // ......................*............................ - // gap // ................................................... - // gap // ................................................... - mls v28.8H, v20.8H, v7.H[0] // ..............*.................................... - sqrdmulh v26.8H, v22.8H, v0.H[3] // .................*................................. - // gap // ................................................... - // gap // ................................................... - mls v23.8H, v27.8H, v7.H[0] // ................*.................................. - mul v22.8H, v22.8H, v0.H[2] // ....................*.............................. - // gap // ................................................... - // gap // ................................................... - sqrdmulh v27.8H, v24.8H, v0.H[1] // .......................*........................... - mul v24.8H, v24.8H, v0.H[0] // ........................*.......................... - // gap // ................................................... - // gap // ................................................... - mul v20.8H, v19.8H, v29.8H // ......................................*............ - sqrdmulh v19.8H, v19.8H, v30.8H // .......................................*........... 
- str q28, [x0, #320] // ..................*................................ - // gap // ................................................... - sub v28.8H, v23.8H, v14.8H // ...........................*....................... - mls v22.8H, v26.8H, v7.H[0] // .....................*............................. - // gap // ................................................... - // gap // ................................................... - add v23.8H, v23.8H, v14.8H // .............................*..................... - mls v24.8H, v27.8H, v7.H[0] // ..............................*.................... - // gap // ................................................... - // gap // ................................................... - sqrdmulh v27.8H, v28.8H, v0.H[1] // ....................................*.............. - mul v28.8H, v28.8H, v0.H[0] // .....................................*............. - // gap // ................................................... - // gap // ................................................... - sub v26.8H, v22.8H, v3.8H // .........................*......................... - add v22.8H, v22.8H, v3.8H // ..........................*........................ - // gap // ................................................... - // gap // ................................................... - sqrdmulh v3.8H, v23.8H, v30.8H // ...............................*................... - mul v23.8H, v23.8H, v29.8H // ................................*.................. - str q24, [x0, #256] // ...................................*............... - // gap // ................................................... - mul v24.8H, v26.8H, v0.H[0] // ............................*...................... - mul v11.8H, v22.8H, v29.8H // .................................*................. - // gap // ................................................... - // gap // ................................................... 
- mls v28.8H, v27.8H, v7.H[0] // ...........................................*....... - sqrdmulh v27.8H, v26.8H, v0.H[1] // ..................................*................ - // gap // ................................................... - // gap // ................................................... - sqrdmulh v22.8H, v22.8H, v30.8H // .........................................*......... - mls v23.8H, v3.8H, v7.H[0] // ........................................*.......... - // gap // ................................................... - // gap // ................................................... - mls v20.8H, v19.8H, v7.H[0] // ............................................*...... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v24.8H, v27.8H, v7.H[0] // ..........................................*........ - str q28, [x0, #448] // ................................................*.. - // gap // ................................................... - // gap // ................................................... - mls v11.8H, v22.8H, v7.H[0] // ...............................................*... - str q23, [x0, #192] // .............................................*..... - // gap // ................................................... - // gap // ................................................... - str q20, [x0], #(16) // .................................................*. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str q24, [x0, #368] // ..............................................*.... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- str q11, [x0, #112] // ..................................................* - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - - // original source code - // sub v3.8H, v22.8H, v16.8H // *.................................................. - // add v22.8H, v6.8H, v9.8H // .*................................................. - // add v18.8H, v25.8H, v15.8H // ....*.............................................. - // sqrdmulh v5.8H, v12.8H, v0.H[5] // ..*................................................ - // mls v4.8H, v19.8H, v7.H[0] // .......*........................................... - // sqrdmulh v19.8H, v27.8H, v0.H[1] // .....*............................................. - // add v16.8H, v22.8H, v18.8H // ..........*........................................ - // mul v23.8H, v27.8H, v0.H[0] // ......*............................................ - // mls v14.8H, v5.8H, v7.H[0] // .........*......................................... - // sub v27.8H, v22.8H, v18.8H // .............*..................................... - // mul v5.8H, v20.8H, v0.H[4] // ...*............................................... - // mul v18.8H, v3.8H, v0.H[2] // ............*...................................... - // str q4, [x0, #64] // ..............*.................................... - // sqrdmulh v3.8H, v3.8H, v0.H[3] // ...........*....................................... - // mls v23.8H, v19.8H, v7.H[0] // .................*................................. - // mls v5.8H, v11.8H, v7.H[0] // ........*.......................................... - // mls v18.8H, v3.8H, v7.H[0] // ...................*............................... - // sqrdmulh v20.8H, v27.8H, v0.H[3] // ..................*................................ - // str q23, [x0, #320] // .........................*......................... 
- // sub v19.8H, v16.8H, v13.8H // ...............*................................... - // mul v3.8H, v27.8H, v0.H[2] // ....................*.............................. - // mls v3.8H, v20.8H, v7.H[0] // ...........................*....................... - // add v11.8H, v16.8H, v13.8H // ................*.................................. - // sqrdmulh v23.8H, v19.8H, v0.H[1] // .....................*............................. - // mul v4.8H, v19.8H, v0.H[0] // ......................*............................ - // sub v19.8H, v3.8H, v5.8H // ................................*.................. - // add v3.8H, v3.8H, v5.8H // .................................*................. - // sub v24.8H, v18.8H, v14.8H // ..........................*........................ - // mul v8.8H, v19.8H, v0.H[0] // .....................................*............. - // add v5.8H, v18.8H, v14.8H // ............................*...................... - // mls v4.8H, v23.8H, v7.H[0] // .............................*..................... - // sqrdmulh v17.8H, v5.8H, v30.8H // ..................................*................ - // mul v2.8H, v5.8H, v29.8H // ...................................*............... - // mul v14.8H, v3.8H, v29.8H // ......................................*............ - // sqrdmulh v5.8H, v19.8H, v0.H[1] // ........................................*.......... - // str q4, [x0, #256] // ....................................*.............. - // sqrdmulh v20.8H, v24.8H, v0.H[1] // ..............................*.................... - // mul v13.8H, v24.8H, v0.H[0] // ...............................*................... - // mul v24.8H, v11.8H, v29.8H // .......................*........................... - // sqrdmulh v19.8H, v11.8H, v30.8H // ........................*.......................... - // mls v2.8H, v17.8H, v7.H[0] // ..........................................*........ 
- // sqrdmulh v3.8H, v3.8H, v30.8H // .........................................*......... - // mls v8.8H, v5.8H, v7.H[0] // ............................................*...... - // mls v13.8H, v20.8H, v7.H[0] // .......................................*........... - // mls v24.8H, v19.8H, v7.H[0] // ...........................................*....... - // str q2, [x0, #192] // ...............................................*... - // str q8, [x0, #384] // .................................................*. - // mls v14.8H, v3.8H, v7.H[0] // ..............................................*.... - // str q13, [x0, #448] // .............................................*..... - // str q24, [x0], #(16) // ................................................*.. - // str q14, [x0, #112] // ..................................................* + // Instructions: 40 + // Expected cycles: 21 + // Expected IPC: 1.90 + // + // Cycle bound: 19.0 + // IPC bound: 2.11 + // + // Wall time: 3600.13s + // User time: 3600.13s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + mul v17.8H, v4.8H, v29.8H // *....................................... + sqrdmulh v3.8H, v4.8H, v30.8H // .....*.................................. + // gap // ........................................ + // gap // ........................................ + sqrdmulh v25.8H, v21.8H, v0.H[5] // .........*.............................. + mul v20.8H, v21.8H, v0.H[4] // ................*....................... + // gap // ........................................ + // gap // ........................................ + mls v26.8H, v23.8H, v7.H[0] // .......*................................ + sqrdmulh v28.8H, v27.8H, v0.H[1] // ...*.................................... + // gap // ........................................ + // gap // ........................................ + mls v17.8H, v3.8H, v7.H[0] // ..........*............................. 
+ mls v9.8H, v11.8H, v7.H[0] // ....*................................... + // gap // ........................................ + // gap // ........................................ + mul v14.8H, v19.8H, v0.H[2] // ..*..................................... + mls v20.8H, v25.8H, v7.H[0] // ......................*................. + // gap // ........................................ + // gap // ........................................ + mls v8.8H, v28.8H, v7.H[0] // ...........*............................ + sqrdmulh v4.8H, v31.8H, v0.H[1] // .*...................................... + // gap // ........................................ + // gap // ........................................ + str q17, [x0, #64] // ...................*.................... + mul v2.8H, v5.8H, v29.8H // ................................*....... + // gap // ........................................ + // gap // ........................................ + add v24.8H, v9.8H, v20.8H // ..........................*............. + sub v9.8H, v9.8H, v20.8H // .........................*.............. + // gap // ........................................ + // gap // ........................................ + mls v14.8H, v22.8H, v7.H[0] // ........*............................... + str q8, [x0, #256] // ..............*......................... + mls v6.8H, v4.8H, v7.H[0] // ......*................................. + // gap // ........................................ + sqrdmulh v20.8H, v9.8H, v0.H[1] // ...............................*........ + mul v19.8H, v9.8H, v0.H[0] // ..................................*..... + // gap // ........................................ + // gap // ........................................ + mls v2.8H, v12.8H, v7.H[0] // ...................................*.... + // gap // ........................................ + // gap // ........................................ + mul v13.8H, v24.8H, v29.8H // .............................*.......... 
+ str q6, [x0, #320] // ............*........................... + sub v21.8H, v14.8H, v26.8H // ...............*........................ + add v16.8H, v14.8H, v26.8H // .............*.......................... + // gap // ........................................ + mls v19.8H, v20.8H, v7.H[0] // .....................................*.. + sqrdmulh v15.8H, v24.8H, v30.8H // ..............................*......... + // gap // ........................................ + // gap // ........................................ + sqrdmulh v22.8H, v16.8H, v30.8H // .................*...................... + mul v16.8H, v16.8H, v29.8H // ..................*..................... + str q2, [x0], #(16) // ......................................*. + // gap // ........................................ + sqrdmulh v18.8H, v21.8H, v0.H[1] // ....................*................... + mul v21.8H, v21.8H, v0.H[0] // .....................*.................. + // gap // ........................................ + // gap // ........................................ + str q19, [x0, #432] // .......................................* + mls v13.8H, v15.8H, v7.H[0] // .................................*...... + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v22.8H, v7.H[0] // .......................*................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v21.8H, v18.8H, v7.H[0] // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q13, [x0, #176] // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ str q16, [x0, #112] // ...........................*............ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q21, [x0, #368] // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // mul v25.8H, v4.8H, v29.8H // *....................................... + // sqrdmulh v15.8H, v31.8H, v0.H[1] // ...........*............................ + // mul v31.8H, v19.8H, v0.H[2] // ........*............................... + // sqrdmulh v19.8H, v27.8H, v0.H[1] // .....*.................................. + // mls v9.8H, v11.8H, v7.H[0] // .......*................................ + // sqrdmulh v4.8H, v4.8H, v30.8H // .*...................................... + // mls v6.8H, v15.8H, v7.H[0] // ..................*..................... + // mls v26.8H, v23.8H, v7.H[0] // ....*................................... + // mls v31.8H, v22.8H, v7.H[0] // ................*....................... + // sqrdmulh v22.8H, v21.8H, v0.H[5] // ..*..................................... + // mls v25.8H, v4.8H, v7.H[0] // ......*................................. + // mls v8.8H, v19.8H, v7.H[0] // ..........*............................. + // str q6, [x0, #320] // .......................*................ + // add v16.8H, v31.8H, v26.8H // .........................*.............. + // str q8, [x0, #256] // .................*...................... + // sub v24.8H, v31.8H, v26.8H // ........................*............... + // mul v31.8H, v21.8H, v0.H[4] // ...*.................................... + // sqrdmulh v19.8H, v16.8H, v30.8H // ............................*........... 
+ // mul v16.8H, v16.8H, v29.8H // .............................*.......... + // str q25, [x0, #64] // ............*........................... + // sqrdmulh v26.8H, v24.8H, v0.H[1] // ...............................*........ + // mul v23.8H, v24.8H, v0.H[0] // ................................*....... + // mls v31.8H, v22.8H, v7.H[0] // .........*.............................. + // mls v16.8H, v19.8H, v7.H[0] // ...................................*.... + // mls v23.8H, v26.8H, v7.H[0] // ....................................*... + // sub v3.8H, v9.8H, v31.8H // ...............*........................ + // add v17.8H, v9.8H, v31.8H // ..............*......................... + // str q16, [x0, #128] // ......................................*. + // str q23, [x0, #384] // .......................................* + // mul v18.8H, v17.8H, v29.8H // ......................*................. + // sqrdmulh v6.8H, v17.8H, v30.8H // ...........................*............ + // sqrdmulh v23.8H, v3.8H, v0.H[1] // ...................*.................... + // mul v9.8H, v5.8H, v29.8H // .............*.......................... + // mls v18.8H, v6.8H, v7.H[0] // ..................................*..... + // mul v20.8H, v3.8H, v0.H[0] // ....................*................... + // mls v9.8H, v12.8H, v7.H[0] // .....................*.................. + // str q18, [x0, #192] // .....................................*.. + // mls v20.8H, v23.8H, v7.H[0] // ..........................*............. + // str q9, [x0], #(16) // ..............................*......... + // str q20, [x0, #432] // .................................*...... 
pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_a55.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_a55.s new file mode 100644 index 00000000..b39b618d --- /dev/null +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_a55.s @@ -0,0 +1,1425 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), vector by vector, 4 x 32-bit lanes
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmla d,a,b // d += a * b, vector by vector, 4 x 32-bit lanes
+        mla \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, lane i of b)
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i // d = a * (lane i of b), low 32 bits per lane
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlaq d,a,b,i // d += a * (lane i of b)
+        mla \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i // d -= a * (lane i of b)
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const[idx0] + sqrdmulh(src, const[idx1]) * modulus; clobbers t2
+        vqrdmulhq t2, \src, \const, \idx1 // high-half product with the lane at idx1 (presumably the twisted constant - confirm against the twiddle table)
+        vmulq \dst, \src, \const, \idx0 // low 32 bits of src * const[idx0]
+        vmla \dst, t2, modulus // modulus is loaded from modulus_addr, which holds -8380417, so this subtracts t2 * 8380417
+.endm
+
+.macro mulmod dst, src, const, const_twisted // same computation as mulmodq, but with full-vector constants; clobbers t2
+        vqrdmulh t2, \src, \const_twisted
+        mul \dst\().4s, \src\().4s, \const\().4s
+        vmla \dst, t2, modulus
+.endm
+
+.macro ct_butterfly a, b, root, idx0, idx1 // (a, b) <- (a + r*b, a - r*b), r taken from lanes idx0/idx1 of root; clobbers tmp and t2
+        mulmodq tmp, \b, \root, \idx0, \idx1
+        sub \b\().4s, \a\().4s, tmp.4s // b is overwritten first; a is still the old value here
+        add \a\().4s, \a\().4s, tmp.4s
+.endm
+
+.macro ct_butterfly_v a, b, root, root_twisted // as ct_butterfly, but with a separate root per lane; clobbers tmp and t2
+        mulmod tmp, \b, \root, \root_twisted
+        sub \b\().4s, \a\().4s, tmp.4s
+        add \a\().4s, \a\().4s, tmp.4s
+.endm
+
+.macro load_roots_1234 // load root0..root7 (8 x 16 bytes) from r_ptr0 and advance r_ptr0 by 8*16
+        ldr qform_root0, [r_ptr0], #(8*16) // post-increment: r_ptr0 already points past the block for the loads below
+        ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] // negative offsets compensate for the post-increment above
+        ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)]
+        ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)]
+        ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)]
+        ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)]
+        ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)]
+        ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)]
+.endm
+
+.macro load_next_roots_56 root0, r_ptr0 // load one 16-byte root vector and advance the pointer by 16
+        ldr qform_\root0, [\r_ptr0], #16
+.endm
+
+.macro load_next_roots_6 root0, r_ptr0 // load 16 bytes but advance the pointer by only 8
+        ldr qform_\root0, [\r_ptr0], #8 // NOTE(review): consecutive loads overlap by 8 bytes - looks intentional, confirm against the twiddle layout
+.endm
+
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 // load three (root, twisted root) vector pairs and advance r_ptr1 by 6*16
+        ldr qform_\root0, [ \r_ptr1], #(6*16) // post-increment; the loads below use compensating negative offsets
+        ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)]
+        ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)]
+        ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)]
+        ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)]
+        ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)]
+.endm
+
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc // store four vectors at addr + 16*k (k = 0..3), then leave addr advanced by inc
+        str qform_\a0, [\addr], #\inc // post-increment by inc
+        str qform_\a1, [\addr, #(-(\inc) + 16*1)] // offsets are relative to the already advanced addr
+        str qform_\a2, [\addr, #(-(\inc) + 16*2)]
+        str qform_\a3, [\addr, #(-(\inc) + 16*3)]
+.endm
+
+.macro transpose4 data0, data1, data2, data3 // in-place 4x4 transpose of 32-bit elements; clobbers t0..t3
+        trn1 t0.4s, \data0\().4s, \data1\().4s // step 1: interleave 32-bit lanes of row pairs
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d // step 2: interleave 64-bit halves to finish the transpose
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+.macro save_gprs // @slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes for x19..x29
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): exact duplicate of the previous store - redundant but harmless, candidate for removal
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // @slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // mirror of save_gprs: reload x19..x29, then release the 96 bytes
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+.macro save_vregs // @slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes for d8..d15 (only the low 64 bits of v8..v15 are saved)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // @slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // mirror of save_vregs: reload d8..d15, then release the 64 bytes
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16 // bytes of scratch stack reserved by push_stack
+#define STACK0 0 // offset of the first scratch slot within that area
+
+.macro restore a, loc // @slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // reload register a from scratch slot loc
+.endm
+.macro save loc, a // @slothy:no-unfold
+        str \a, [sp, #\loc\()] // spill register a to scratch slot loc
+.endm
+.macro push_stack // @slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // scratch area sits below the saved registers
+.endm
+
+.macro pop_stack // @slothy:no-unfold
+        add sp, sp, #STACK_SIZE // exact reverse of push_stack
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "ntt_dilithium_1234_5678_twiddles.s"
+.text
+
+        .global ntt_dilithium_1234_5678_manual_st4_opt_a55
+        .global _ntt_dilithium_1234_5678_manual_st4_opt_a55 // must name the underscore-prefixed label defined below, otherwise that entry point is not exported
+
+.p2align 4
+modulus_addr:   .quad -8380417 // broadcast into the modulus register by the ld1r below, which reads one 32-bit element
+ntt_dilithium_1234_5678_manual_st4_opt_a55:
+_ntt_dilithium_1234_5678_manual_st4_opt_a55: // alias with leading underscore (presumably for toolchains that prefix C symbols with an underscore - confirm)
+        push_stack
+
+    in .req x0
+    inp .req x1
+    count .req x2
+    r_ptr0 .req x3
+    r_ptr1 .req x4
+    xtmp .req x5
+
+    data0 .req v8
+    data1 .req v9
+    data2 .req v10
+    data3 .req v11
+    data4 .req v12
+    data5 .req v13
+    data6 .req v14
+    data7 .req v15
+    data8 .req v16
+    data9 .req v17
+    data10 .req v18
+    data11 .req v19
+    data12 .req v20
+    data13 .req v21
+    data14 .req v22
+    data15 .req v23
+
+    qform_data0 .req q8
+    qform_data1 .req q9
+    qform_data2 .req q10
+    qform_data3 .req q11
+    qform_data4 .req q12
+    qform_data5 .req q13
+    qform_data6 .req q14
+    qform_data7 .req q15
+    qform_data8 .req q16
+    qform_data9 .req q17
+    qform_data10 .req q18
+    qform_data11 .req q19
+    qform_data12 .req q20
+    qform_data13 .req q21
+    qform_data14 .req q22
+    qform_data15 .req q23
+
+    root0 .req v0
+    root1 .req v1
+    root2 .req v2
+    root3 .req v3
+    root4 .req v4
+    root5 .req v5
+    root6 .req v6
+    root7 .req v7
+
+    qform_root0 .req q0
+    qform_root1 .req q1
+    qform_root2 .req q2
+    qform_root3 .req q3
+    qform_root4 .req q4
+    qform_root5 .req q5
+    qform_root6 .req q6
+    qform_root7 .req q7
+
+
+    tmp .req v24
+    t0 .req v25
+    t1 .req v26
+    t2 .req v27
+    t3 .req v28
+
+    modulus .req v29
+
+        ASM_LOAD(r_ptr0, roots)
+        ASM_LOAD(r_ptr1, roots_l67)
+
+        ASM_LOAD(xtmp, modulus_addr)
+        ld1r {modulus.4s}, [xtmp] // replicate the 32-bit constant at modulus_addr into all four lanes
+
+        save STACK0, in // keep the original input pointer in scratch slot STACK0
+        mov count, #4 // presumably the iteration count of the first loop - confirm against the loop tail
+
+        load_roots_1234
+
+        .p2align 2
+        ldr q22, [x0, #0] // *...............................................................................................................................................................................................
+ ldr q11, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q10, [x0, #832] // .............*.................................................................................................................................................................................. + sqrdmulh v12.4S, v11.4S, v0.S[1] // ................*............................................................................................................................................................................... + mul v11.4S, v11.4S, v0.S[0] // .................*.............................................................................................................................................................................. + sqrdmulh v8.4S, v10.4S, v0.S[1] // .........................................*...................................................................................................................................................... + ldr q27, [x0, #64] // .*.............................................................................................................................................................................................. + mla v11.4S, v12.4S, v29.4S // ..................*............................................................................................................................................................................. + mul v10.4S, v10.4S, v0.S[0] // ..........................................*..................................................................................................................................................... 
+ ldr q12, [x0, #128] // ..*............................................................................................................................................................................................. + sub v24.4S, v22.4S, v11.4S // ...................*............................................................................................................................................................................ + add v22.4S, v22.4S, v11.4S // ....................*........................................................................................................................................................................... + mla v10.4S, v8.4S, v29.4S // ...........................................*.................................................................................................................................................... + ldr q8, [x0, #192] // ...*............................................................................................................................................................................................ + ldr q11, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q16, [x0, #768] // ............*................................................................................................................................................................................... + sqrdmulh v18.4S, v11.4S, v0.S[1] // .....................*.......................................................................................................................................................................... 
+ mul v11.4S, v11.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sqrdmulh v23.4S, v16.4S, v0.S[1] // ....................................*........................................................................................................................................................... + ldr q15, [x0, #704] // ...........*.................................................................................................................................................................................... + mla v11.4S, v18.4S, v29.4S // .......................*........................................................................................................................................................................ + mul v17.4S, v16.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + ldr q16, [x0, #640] // ..........*..................................................................................................................................................................................... + add v13.4S, v27.4S, v11.4S // .........................*...................................................................................................................................................................... + sub v31.4S, v27.4S, v11.4S // ........................*....................................................................................................................................................................... 
+ mul v18.4S, v16.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + sqrdmulh v11.4S, v16.4S, v0.S[1] // ..........................*..................................................................................................................................................................... + mla v17.4S, v23.4S, v29.4S // ......................................*......................................................................................................................................................... + ldr q23, [x0, #256] // ....*........................................................................................................................................................................................... + mla v18.4S, v11.4S, v29.4S // ............................*................................................................................................................................................................... + ldr q16, [x0, #320] // .....*.......................................................................................................................................................................................... + add v27.4S, v23.4S, v17.4S // ........................................*....................................................................................................................................................... + add v20.4S, v12.4S, v18.4S // ..............................*................................................................................................................................................................. 
+ sub v25.4S, v12.4S, v18.4S // .............................*.................................................................................................................................................................. + mul v11.4S, v27.4S, v0.S[2] // .........................................................*...................................................................................................................................... + sqrdmulh v21.4S, v27.4S, v0.S[3] // ........................................................*....................................................................................................................................... + add v12.4S, v16.4S, v10.4S // .............................................*.................................................................................................................................................. + sub v18.4S, v16.4S, v10.4S // ............................................*................................................................................................................................................... + sqrdmulh v10.4S, v15.4S, v0.S[1] // ...............................*................................................................................................................................................................ + mul v27.4S, v15.4S, v0.S[0] // ................................*............................................................................................................................................................... + mla v11.4S, v21.4S, v29.4S // ..........................................................*..................................................................................................................................... 
+ sub v23.4S, v23.4S, v17.4S // .......................................*........................................................................................................................................................ + sqrdmulh v9.4S, v12.4S, v0.S[3] // .............................................................*.................................................................................................................................. + mla v27.4S, v10.4S, v29.4S // .................................*.............................................................................................................................................................. + add v15.4S, v22.4S, v11.4S // ............................................................*................................................................................................................................... + sub v11.4S, v22.4S, v11.4S // ...........................................................*.................................................................................................................................... + sqrdmulh v10.4S, v23.4S, v1.S[1] // ............................................................................*................................................................................................................... + sub v19.4S, v8.4S, v27.4S // ..................................*............................................................................................................................................................. + mul v17.4S, v23.4S, v1.S[0] // .............................................................................*.................................................................................................................. 
+ add v28.4S, v8.4S, v27.4S // ...................................*............................................................................................................................................................ + sqrdmulh v22.4S, v18.4S, v1.S[1] // .................................................................................*.............................................................................................................. + mul v27.4S, v18.4S, v1.S[0] // ..................................................................................*............................................................................................................. + ldr q8, [x0, #896] // ..............*................................................................................................................................................................................. + mla v17.4S, v10.4S, v29.4S // ..............................................................................*................................................................................................................. + mla v27.4S, v22.4S, v29.4S // ...................................................................................*............................................................................................................ + sqrdmulh v22.4S, v8.4S, v0.S[1] // ..............................................*................................................................................................................................................. + mul v16.4S, v8.4S, v0.S[0] // ...............................................*................................................................................................................................................ 
+ sub v18.4S, v24.4S, v17.4S // ...............................................................................*................................................................................................................ + mla v16.4S, v22.4S, v29.4S // ................................................*............................................................................................................................................... + ldr q30, [x0, #384] // ......*......................................................................................................................................................................................... + ldr q10, [x0, #960] // ...............*................................................................................................................................................................................ + sub v8.4S, v30.4S, v16.4S // .................................................*.............................................................................................................................................. + add v26.4S, v31.4S, v27.4S // .....................................................................................*.......................................................................................................... + mul v21.4S, v10.4S, v0.S[0] // ....................................................*........................................................................................................................................... + mul v23.4S, v8.4S, v1.S[0] // .......................................................................................*........................................................................................................ 
+ sqrdmulh v8.4S, v8.4S, v1.S[1] // ......................................................................................*......................................................................................................... + sqrdmulh v14.4S, v10.4S, v0.S[1] // ...................................................*............................................................................................................................................ + add v22.4S, v30.4S, v16.4S // ..................................................*............................................................................................................................................. + sub v10.4S, v31.4S, v27.4S // ....................................................................................*........................................................................................................... + mla v23.4S, v8.4S, v29.4S // ........................................................................................*....................................................................................................... + sqrdmulh v16.4S, v22.4S, v0.S[3] // ..................................................................*............................................................................................................................. + mul v27.4S, v22.4S, v0.S[2] // ...................................................................*............................................................................................................................ + mul v8.4S, v12.4S, v0.S[2] // ..............................................................*................................................................................................................................. 
+ sub v30.4S, v25.4S, v23.4S // .........................................................................................*...................................................................................................... + add v31.4S, v25.4S, v23.4S // ..........................................................................................*..................................................................................................... + mla v27.4S, v16.4S, v29.4S // ....................................................................*........................................................................................................................... + mul v16.4S, v30.4S, v3.S[0] // ...............................................................................................................................*................................................................ + sqrdmulh v25.4S, v30.4S, v3.S[1] // ..............................................................................................................................*................................................................. + mla v21.4S, v14.4S, v29.4S // .....................................................*.......................................................................................................................................... + ldr q12, [x0, #448] // .......*........................................................................................................................................................................................ + mla v16.4S, v25.4S, v29.4S // ................................................................................................................................*............................................................... 
+ mul v25.4S, v31.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sub v22.4S, v12.4S, v21.4S // ......................................................*......................................................................................................................................... + sqrdmulh v14.4S, v31.4S, v2.S[3] // ....................................................................................................................*........................................................................... + add v31.4S, v18.4S, v16.4S // ..................................................................................................................................*............................................................. + sqrdmulh v23.4S, v22.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + mul v30.4S, v22.4S, v1.S[0] // ............................................................................................*................................................................................................... + sub v22.4S, v20.4S, v27.4S // .....................................................................*.......................................................................................................................... + add v21.4S, v12.4S, v21.4S // .......................................................*........................................................................................................................................ 
+ sub v12.4S, v18.4S, v16.4S // .................................................................................................................................*.............................................................. + mla v30.4S, v23.4S, v29.4S // .............................................................................................*.................................................................................................. + mul v18.4S, v22.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + sqrdmulh v16.4S, v22.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mul v22.4S, v21.4S, v0.S[2] // ........................................................................*....................................................................................................................... + sub v23.4S, v19.4S, v30.4S // ..............................................................................................*................................................................................................. + sqrdmulh v21.4S, v21.4S, v0.S[3] // .......................................................................*........................................................................................................................ 
+ sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 96 + // Expected IPC: 2.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + mla v22.4S, v21.4S, v29.4S // ...*............................................................................................................................................................................................ + mla v8.4S, v9.4S, v29.4S // .......*........................................................................................................................................................................................ + add v27.4S, v20.4S, v27.4S // .....*.......................................................................................................................................................................................... + mla v18.4S, v16.4S, v29.4S // ...................................................*............................................................................................................................................ + sub v9.4S, v28.4S, v22.4S // .................*.............................................................................................................................................................................. + sub v21.4S, v13.4S, v8.4S // ..................................................*............................................................................................................................................. 
+ add v28.4S, v28.4S, v22.4S // ............*................................................................................................................................................................................... + mul v16.4S, v9.4S, v2.S[0] // .............................................*.................................................................................................................................................. + sqrdmulh v9.4S, v9.4S, v2.S[1] // ............................................*................................................................................................................................................... + mul v20.4S, v28.4S, v1.S[2] // ..................*............................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v1.S[3] // ...................*............................................................................................................................................................................ + add v13.4S, v13.4S, v8.4S // ......................*......................................................................................................................................................................... + mla v16.4S, v9.4S, v29.4S // .................................................*.............................................................................................................................................. + sqrdmulh v22.4S, v27.4S, v1.S[3] // .........*...................................................................................................................................................................................... 
+ mla v20.4S, v28.4S, v29.4S // .......................*........................................................................................................................................................................ + mul v27.4S, v27.4S, v1.S[2] // ........*....................................................................................................................................................................................... + add v8.4S, v21.4S, v16.4S // .....................................................*.......................................................................................................................................... + sub v28.4S, v21.4S, v16.4S // ......................................................*......................................................................................................................................... + sub v16.4S, v13.4S, v20.4S // ...........................*.................................................................................................................................................................... + add v20.4S, v13.4S, v20.4S // .............................*.................................................................................................................................................................. + sqrdmulh v13.4S, v28.4S, v5.S[1] // ..........................................................*..................................................................................................................................... + mul v9.4S, v28.4S, v5.S[0] // .........................................................*...................................................................................................................................... 
+ mul v28.4S, v8.4S, v4.S[2] // ........................................................*....................................................................................................................................... + sqrdmulh v8.4S, v8.4S, v4.S[3] // ...........................................................*.................................................................................................................................... + mla v27.4S, v22.4S, v29.4S // ..............*................................................................................................................................................................................. + mla v25.4S, v14.4S, v29.4S // .....................*.......................................................................................................................................................................... + sub v22.4S, v11.4S, v18.4S // .............................................................*.................................................................................................................................. + add v11.4S, v11.4S, v18.4S // ............................................................*................................................................................................................................... + add v18.4S, v15.4S, v27.4S // ............................................................................*................................................................................................................... + sub v27.4S, v15.4S, v27.4S // ...............................................................*................................................................................................................................ 
+ sqrdmulh v15.4S, v16.4S, v4.S[1] // ...................................................................................*............................................................................................................ + mul v16.4S, v16.4S, v4.S[0] // ..................................................................................*............................................................................................................. + add v30.4S, v19.4S, v30.4S // *............................................................................................................................................................................................... + mul v19.4S, v23.4S, v3.S[0] // .*.............................................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v3.S[1] // ..*............................................................................................................................................................................................. + add v24.4S, v24.4S, v17.4S // ....................................*........................................................................................................................................................... + sqrdmulh v17.4S, v30.4S, v2.S[3] // ..........................*..................................................................................................................................................................... + mul v30.4S, v30.4S, v2.S[2] // ....*........................................................................................................................................................................................... 
+ mla v19.4S, v23.4S, v29.4S // ......*......................................................................................................................................................................................... + sqrdmulh v23.4S, v20.4S, v3.S[3] // ................................*............................................................................................................................................................... + mul v20.4S, v20.4S, v3.S[2] // ................................................................*............................................................................................................................... + mla v30.4S, v17.4S, v29.4S // ..............................*................................................................................................................................................................. + mla v9.4S, v13.4S, v29.4S // ..............................................................*................................................................................................................................. + mla v28.4S, v8.4S, v29.4S // .....................................................................*.......................................................................................................................... + mla v16.4S, v15.4S, v29.4S // .......................................................................................*........................................................................................................ + sub v8.4S, v26.4S, v30.4S // ..................................*............................................................................................................................................................. 
+ add v13.4S, v26.4S, v30.4S // ...................................*............................................................................................................................................................ + sub v17.4S, v24.4S, v25.4S // ...........................................*.................................................................................................................................................... + add v24.4S, v24.4S, v25.4S // ................................................*............................................................................................................................................... + sub v25.4S, v10.4S, v19.4S // ...........*.................................................................................................................................................................................... + add v10.4S, v10.4S, v19.4S // ..........*..................................................................................................................................................................................... + sub v15.4S, v11.4S, v28.4S // ..........................................................................*..................................................................................................................... + add v11.4S, v11.4S, v28.4S // ................................................................................*............................................................................................................... + sub v26.4S, v27.4S, v16.4S // .............................................................................................*.................................................................................................. 
+ add v27.4S, v27.4S, v16.4S // ...........................................................................................*.................................................................................................... + add v16.4S, v22.4S, v9.4S // ..................................................................*............................................................................................................................. + sub v22.4S, v22.4S, v9.4S // ...................................................................*............................................................................................................................ + mla v20.4S, v23.4S, v29.4S // ..............................................................................*................................................................................................................. + str q15, [x0, #320] // .............................................................................*.................................................................................................................. + mul v23.4S, v25.4S, v7.S[0] // ...............*................................................................................................................................................................................ + sqrdmulh v25.4S, v25.4S, v7.S[1] // ................*............................................................................................................................................................................... + mul v15.4S, v10.4S, v6.S[2] // .................................*.............................................................................................................................................................. 
+ sqrdmulh v10.4S, v10.4S, v6.S[3] // .............*.................................................................................................................................................................................. + add v9.4S, v18.4S, v20.4S // .........................................................................................*...................................................................................................... + sub v18.4S, v18.4S, v20.4S // .....................................................................................*.......................................................................................................... + sqrdmulh v20.4S, v8.4S, v6.S[1] // ......................................*......................................................................................................................................................... + mul v8.4S, v8.4S, v6.S[0] // .....................................*.......................................................................................................................................................... + mul v30.4S, v13.4S, v5.S[2] // .........................................*...................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v5.S[3] // ........................................*....................................................................................................................................................... + str q11, [x0, #256] // ......................................................................................*......................................................................................................... 
+ mla v23.4S, v25.4S, v29.4S // ....................*........................................................................................................................................................................... + str q26, [x0, #192] // ...............................................................................................*................................................................................................ + mla v15.4S, v10.4S, v29.4S // .......................................*........................................................................................................................................................ + str q27, [x0, #128] // ..............................................................................................*................................................................................................. + sub v11.4S, v12.4S, v23.4S // ........................*....................................................................................................................................................................... + add v10.4S, v12.4S, v23.4S // .........................*...................................................................................................................................................................... + add v12.4S, v31.4S, v15.4S // ........................................................................*....................................................................................................................... + sub v27.4S, v31.4S, v15.4S // .................................................................*.............................................................................................................................. 
+ str q16, [x0, #384] // .................................................................................*.............................................................................................................. + mla v8.4S, v20.4S, v29.4S // ..........................................*..................................................................................................................................................... + str q22, [x0, #448] // ..........................................................................................*..................................................................................................... + mla v30.4S, v13.4S, v29.4S // ...............................................*................................................................................................................................................ + str q9, [x0], #(16) // ............................................................................................*................................................................................................... + sub v22.4S, v17.4S, v8.4S // ......................................................................*......................................................................................................................... + add v8.4S, v17.4S, v8.4S // ..............................................*................................................................................................................................................. + str q18, [x0, #48] // ........................................................................................*....................................................................................................... + add v16.4S, v24.4S, v30.4S // ....................................................*........................................................................................................................................... 
+ sub v24.4S, v24.4S, v30.4S // ....................................................................*........................................................................................................................... + str q11, [x0, #944] // ............................*................................................................................................................................................................... + ldr q11, [x0, #896] // ....................................................................................................................................................*........................................... + str q10, [x0, #880] // ...............................*................................................................................................................................................................ + ldr q10, [x0, #960] // ............................................................................................................................................................*................................... + str q12, [x0, #752] // ...........................................................................*.................................................................................................................... + sqrdmulh v12.4S, v11.4S, v0.S[1] // .......................................................................................................................................................*........................................ + str q27, [x0, #816] // ....................................................................................*........................................................................................................... + mul v21.4S, v10.4S, v0.S[0] // ...............................................................................................................................................................*................................ 
+ str q22, [x0, #688] // .........................................................................*...................................................................................................................... + sqrdmulh v22.4S, v10.4S, v0.S[1] // ..................................................................................................................................................................*............................. + str q8, [x0, #624] // ...............................................................................*................................................................................................................ + mul v11.4S, v11.4S, v0.S[0] // ........................................................................................................................................................*....................................... + str q16, [x0, #496] // .......................................................*........................................................................................................................................ + mla v21.4S, v22.4S, v29.4S // ..............................................................................................................................................................................*................. + str q24, [x0, #560] // .......................................................................*........................................................................................................................ + mla v11.4S, v12.4S, v29.4S // ..........................................................................................................................................................*..................................... + ldr q22, [x0, #320] // ..............................................................................................................................*................................................................. 
+ ldr q10, [x0, #256] // ............................................................................................................................*................................................................... + ldr q12, [x0, #448] // ...............................................................................................................................................................................*................ + ldr q8, [x0, #832] // ..................................................................................................*............................................................................................. + sub v27.4S, v12.4S, v21.4S // ..................................................................................................................................................................................*............. + add v12.4S, v12.4S, v21.4S // ........................................................................................................................................................................................*....... + ldr q24, [x0, #384] // ...........................................................................................................................................................*.................................... + sqrdmulh v16.4S, v8.4S, v0.S[1] // .....................................................................................................*.......................................................................................... + mul v8.4S, v8.4S, v0.S[0] // ........................................................................................................*....................................................................................... + sub v18.4S, v24.4S, v11.4S // .............................................................................................................................................................*.................................. 
+ add v11.4S, v24.4S, v11.4S // ...................................................................................................................................................................*............................ + sqrdmulh v24.4S, v27.4S, v1.S[1] // .....................................................................................................................................................................................*.......... + mla v8.4S, v16.4S, v29.4S // ............................................................................................................*................................................................................... + mul v16.4S, v18.4S, v1.S[0] // ................................................................................................................................................................*............................... + ldr q23, [x0, #768] // ...............................................................................................................*................................................................................ + sub v31.4S, v22.4S, v8.4S // .....................................................................................................................................*.......................................................... + add v8.4S, v22.4S, v8.4S // ....................................................................................................................................*........................................................... + sqrdmulh v25.4S, v23.4S, v0.S[1] // ..................................................................................................................*............................................................................. 
+ mul v23.4S, v23.4S, v0.S[0] // .....................................................................................................................*.......................................................................... + mul v30.4S, v27.4S, v1.S[0] // ......................................................................................................................................................................................*......... + sqrdmulh v27.4S, v31.4S, v1.S[1] // ..................................................................................................................................................*............................................. + mul v22.4S, v12.4S, v0.S[2] // .............................................................................................................................................................................................*.. + mla v23.4S, v25.4S, v29.4S // ...........................................................................................................................*.................................................................... + sqrdmulh v21.4S, v12.4S, v0.S[3] // ...............................................................................................................................................................................................* + mul v12.4S, v31.4S, v1.S[0] // ...................................................................................................................................................*............................................ + sqrdmulh v9.4S, v8.4S, v0.S[3] // ..........................................................................................................................................*..................................................... 
+ add v31.4S, v10.4S, v23.4S // ...............................................................................................................................*................................................................ + sub v10.4S, v10.4S, v23.4S // .........................................................................................................................................*...................................................... + mla v12.4S, v27.4S, v29.4S // ......................................................................................................................................................*......................................... + mul v8.4S, v8.4S, v0.S[2] // ........................................................................................................................................................................*....................... + mul v23.4S, v31.4S, v0.S[2] // ..................................................................................................................................*............................................................. + sqrdmulh v18.4S, v18.4S, v1.S[1] // .................................................................................................................................................................*.............................. + sqrdmulh v31.4S, v31.4S, v0.S[3] // ...................................................................................................................................*............................................................ + sqrdmulh v25.4S, v11.4S, v0.S[3] // ......................................................................................................................................................................*......................... 
+ mul v27.4S, v11.4S, v0.S[2] // .......................................................................................................................................................................*........................ + mla v30.4S, v24.4S, v29.4S // ..........................................................................................................................................................................................*..... + sqrdmulh v11.4S, v10.4S, v1.S[1] // ..............................................................................................................................................*................................................. + mul v17.4S, v10.4S, v1.S[0] // ................................................................................................................................................*............................................... + mla v23.4S, v31.4S, v29.4S // ........................................................................................................................................*....................................................... + mla v16.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.......................... + mla v27.4S, v25.4S, v29.4S // ...........................................................................................................................................................................*.................... + mla v17.4S, v11.4S, v29.4S // .....................................................................................................................................................*.......................................... 
+ ldr q11, [x0, #576] // ..............................................................................................................*................................................................................. + ldr q10, [x0, #512] // .................................................................................................*.............................................................................................. + mul v24.4S, v11.4S, v0.S[0] // .................................................................................................................*.............................................................................. + sqrdmulh v11.4S, v11.4S, v0.S[1] // ................................................................................................................*............................................................................... + mul v18.4S, v10.4S, v0.S[0] // ....................................................................................................*........................................................................................... + sqrdmulh v10.4S, v10.4S, v0.S[1] // ...................................................................................................*............................................................................................ + ldr q31, [x0, #704] // ...................................................................................................................*............................................................................ + mla v24.4S, v11.4S, v29.4S // ....................................................................................................................*........................................................................... 
+ mla v18.4S, v10.4S, v29.4S // .......................................................................................................*........................................................................................ + sqrdmulh v11.4S, v31.4S, v0.S[1] // ......................................................................................................................................*......................................................... + mul v25.4S, v31.4S, v0.S[0] // .......................................................................................................................................*........................................................ + ldr q10, [x0, #64] // ......................................................................................................*......................................................................................... + ldr q31, [x0, #0] // ................................................................................................*............................................................................................... + mla v25.4S, v11.4S, v29.4S // ...........................................................................................................................................*.................................................... + sub v11.4S, v10.4S, v24.4S // ........................................................................................................................*....................................................................... + add v13.4S, v10.4S, v24.4S // .......................................................................................................................*........................................................................ 
+ sub v24.4S, v31.4S, v18.4S // ..........................................................................................................*..................................................................................... + add v26.4S, v11.4S, v12.4S // ..............................................................................................................................................................*................................. + ldr q20, [x0, #640] // ......................................................................................................................*......................................................................... + sub v10.4S, v11.4S, v12.4S // ....................................................................................................................................................................*........................... + add v11.4S, v31.4S, v18.4S // ...........................................................................................................*.................................................................................... + mul v12.4S, v20.4S, v0.S[0] // .........................................................................................................................*...................................................................... + sqrdmulh v18.4S, v20.4S, v0.S[1] // ..........................................................................................................................*..................................................................... + add v15.4S, v11.4S, v23.4S // ............................................................................................................................................*................................................... 
+ ldr q31, [x0, #128] // .........................................................................................................*...................................................................................... + mla v12.4S, v18.4S, v29.4S // .............................................................................................................................*.................................................................. + sub v11.4S, v11.4S, v23.4S // .............................................................................................................................................*.................................................. + ldr q28, [x0, #192] // .............................................................................................................*.................................................................................. + sub v18.4S, v31.4S, v12.4S // .................................................................................................................................*.............................................................. + sub v23.4S, v24.4S, v17.4S // .........................................................................................................................................................*...................................... + sub v19.4S, v28.4S, v25.4S // ...............................................................................................................................................*................................................ + sub v14.4S, v18.4S, v16.4S // .........................................................................................................................................................................*...................... 
+ add v20.4S, v31.4S, v12.4S // ................................................................................................................................*............................................................... + add v12.4S, v18.4S, v16.4S // ..........................................................................................................................................................................*..................... + mul v31.4S, v14.4S, v3.S[0] // ............................................................................................................................................................................*................... + sqrdmulh v18.4S, v14.4S, v3.S[1] // .............................................................................................................................................................................*.................. + sqrdmulh v14.4S, v12.4S, v2.S[3] // ...................................................................................................................................................................................*............ + add v28.4S, v28.4S, v25.4S // .................................................................................................................................................*.............................................. + sub v16.4S, v20.4S, v27.4S // .......................................................................................................................................................................................*........ + mla v31.4S, v18.4S, v29.4S // ................................................................................................................................................................................*............... 
+ mul v25.4S, v12.4S, v2.S[2] // .................................................................................................................................................................................*.............. + mul v18.4S, v16.4S, v2.S[0] // ...........................................................................................................................................................................................*.... + sqrdmulh v16.4S, v16.4S, v2.S[1] // ............................................................................................................................................................................................*... + sub v12.4S, v23.4S, v31.4S // .........................................................................................................................................................................................*...... + add v31.4S, v23.4S, v31.4S // ....................................................................................................................................................................................*........... + sub v23.4S, v19.4S, v30.4S // ..............................................................................................................................................................................................*. 
+ + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // add v30.4S, v19.4S, v30.4S // ................................*............................................................................................................................................................... + // mul v19.4S, v23.4S, v3.S[0] // .................................*.............................................................................................................................................................. + // sqrdmulh v23.4S, v23.4S, v3.S[1] // ..................................*............................................................................................................................................................. + // mla v22.4S, v21.4S, v29.4S // *............................................................................................................................................................................................... + // mul v21.4S, v30.4S, v2.S[2] // .....................................*.......................................................................................................................................................... + // add v20.4S, v20.4S, v27.4S // ..*............................................................................................................................................................................................. 
+ // mla v19.4S, v23.4S, v29.4S // ......................................*......................................................................................................................................................... + // mla v8.4S, v9.4S, v29.4S // .*.............................................................................................................................................................................................. + // mul v27.4S, v20.4S, v1.S[2] // ...............*................................................................................................................................................................................ + // sqrdmulh v20.4S, v20.4S, v1.S[3] // .............*.................................................................................................................................................................................. + // add v23.4S, v10.4S, v19.4S // ..................................................*............................................................................................................................................. + // sub v19.4S, v10.4S, v19.4S // .................................................*.............................................................................................................................................. + // add v10.4S, v28.4S, v22.4S // ......*......................................................................................................................................................................................... + // sqrdmulh v9.4S, v23.4S, v6.S[3] // ..............................................................*................................................................................................................................. 
+ // mla v27.4S, v20.4S, v29.4S // ........................*....................................................................................................................................................................... + // mul v20.4S, v19.4S, v7.S[0] // ...........................................................*.................................................................................................................................... + // sqrdmulh v19.4S, v19.4S, v7.S[1] // ............................................................*................................................................................................................................... + // sub v22.4S, v28.4S, v22.4S // ....*........................................................................................................................................................................................... + // mul v28.4S, v10.4S, v1.S[2] // .........*...................................................................................................................................................................................... + // sqrdmulh v10.4S, v10.4S, v1.S[3] // ..........*..................................................................................................................................................................................... + // mla v20.4S, v19.4S, v29.4S // ......................................................................*......................................................................................................................... + // mla v25.4S, v14.4S, v29.4S // .........................*...................................................................................................................................................................... 
+ // add v19.4S, v13.4S, v8.4S // ...........*.................................................................................................................................................................................... + // mla v28.4S, v10.4S, v29.4S // ..............*................................................................................................................................................................................. + // sub v10.4S, v12.4S, v20.4S // ..........................................................................*..................................................................................................................... + // add v14.4S, v12.4S, v20.4S // ...........................................................................*.................................................................................................................... + // sqrdmulh v12.4S, v30.4S, v2.S[3] // ....................................*........................................................................................................................................................... + // sub v20.4S, v19.4S, v28.4S // ..................*............................................................................................................................................................................. + // str q10, [x0, #960] // ........................................................................................*....................................................................................................... + // add v19.4S, v19.4S, v28.4S // ...................*............................................................................................................................................................................ 
+ // mla v21.4S, v12.4S, v29.4S // .........................................*...................................................................................................................................................... + // str q14, [x0, #896] // ..........................................................................................*..................................................................................................... + // sqrdmulh v10.4S, v19.4S, v3.S[3] // .......................................*........................................................................................................................................................ + // mul v14.4S, v23.4S, v6.S[2] // .............................................................*.................................................................................................................................. + // sub v23.4S, v26.4S, v21.4S // .............................................*.................................................................................................................................................. + // add v21.4S, v26.4S, v21.4S // ..............................................*................................................................................................................................................. + // add v12.4S, v24.4S, v17.4S // ...................................*............................................................................................................................................................ + // mul v30.4S, v23.4S, v6.S[0] // ..................................................................*............................................................................................................................. 
+ // sqrdmulh v26.4S, v23.4S, v6.S[1] // .................................................................*.............................................................................................................................. + // mla v14.4S, v9.4S, v29.4S // ........................................................................*....................................................................................................................... + // sqrdmulh v28.4S, v21.4S, v5.S[3] // ....................................................................*........................................................................................................................... + // mul v24.4S, v21.4S, v5.S[2] // ...................................................................*............................................................................................................................ + // mla v30.4S, v26.4S, v29.4S // ...............................................................................*................................................................................................................ + // sub v17.4S, v12.4S, v25.4S // ...............................................*................................................................................................................................................ + // sqrdmulh v21.4S, v22.4S, v2.S[1] // ........*....................................................................................................................................................................................... + // mul v9.4S, v22.4S, v2.S[0] // .......*........................................................................................................................................................................................ 
+ // add v23.4S, v17.4S, v30.4S // ....................................................................................*........................................................................................................... + // mla v24.4S, v28.4S, v29.4S // .................................................................................*.............................................................................................................. + // add v22.4S, v12.4S, v25.4S // ................................................*............................................................................................................................................... + // mla v9.4S, v21.4S, v29.4S // ............*................................................................................................................................................................................... + // sub v21.4S, v13.4S, v8.4S // .....*.......................................................................................................................................................................................... + // mla v18.4S, v16.4S, v29.4S // ...*............................................................................................................................................................................................ + // add v28.4S, v22.4S, v24.4S // ......................................................................................*......................................................................................................... + // add v25.4S, v21.4S, v9.4S // ................*............................................................................................................................................................................... 
+ // sub v26.4S, v21.4S, v9.4S // .................*.............................................................................................................................................................................. + // str q28, [x0, #512] // ....................................................................................................*........................................................................................... + // mul v21.4S, v25.4S, v4.S[2] // ......................*......................................................................................................................................................................... + // mul v16.4S, v26.4S, v5.S[0] // .....................*.......................................................................................................................................................................... + // sqrdmulh v12.4S, v26.4S, v5.S[1] // ....................*........................................................................................................................................................................... + // sqrdmulh v13.4S, v25.4S, v4.S[3] // .......................*........................................................................................................................................................................ + // add v26.4S, v11.4S, v18.4S // ...........................*.................................................................................................................................................................... + // sub v11.4S, v11.4S, v18.4S // ..........................*..................................................................................................................................................................... 
+ // mla v16.4S, v12.4S, v29.4S // ..........................................*..................................................................................................................................................... + // sub v18.4S, v15.4S, v27.4S // .............................*.................................................................................................................................................................. + // mul v25.4S, v19.4S, v3.S[2] // ........................................*....................................................................................................................................................... + // sub v9.4S, v31.4S, v14.4S // .............................................................................*.................................................................................................................. + // add v28.4S, v11.4S, v16.4S // .......................................................*........................................................................................................................................ + // sub v8.4S, v11.4S, v16.4S // ........................................................*....................................................................................................................................... + // sub v19.4S, v22.4S, v24.4S // .......................................................................................*........................................................................................................ + // mla v21.4S, v13.4S, v29.4S // ...........................................*.................................................................................................................................................... 
+ // sub v11.4S, v17.4S, v30.4S // ...................................................................................*............................................................................................................ + // str q19, [x0, #576] // ......................................................................................................*......................................................................................... + // add v14.4S, v31.4S, v14.4S // ............................................................................*................................................................................................................... + // str q11, [x0, #704] // ................................................................................................*............................................................................................... + // sub v16.4S, v26.4S, v21.4S // ...................................................*............................................................................................................................................ + // str q14, [x0, #768] // ............................................................................................*................................................................................................... + // add v24.4S, v15.4S, v27.4S // ............................*................................................................................................................................................................... + // str q16, [x0, #320] // ..........................................................*..................................................................................................................................... 
+ // mla v25.4S, v10.4S, v29.4S // .........................................................*...................................................................................................................................... + // str q23, [x0, #640] // ..................................................................................................*............................................................................................. + // add v31.4S, v26.4S, v21.4S // ....................................................*........................................................................................................................................... + // str q28, [x0, #384] // ..............................................................................*................................................................................................................. + // mul v17.4S, v20.4S, v4.S[0] // ...............................*................................................................................................................................................................ + // sqrdmulh v16.4S, v20.4S, v4.S[1] // ..............................*................................................................................................................................................................. + // str q9, [x0, #832] // ..............................................................................................*................................................................................................. + // sub v27.4S, v24.4S, v25.4S // ................................................................*............................................................................................................................... 
+ // str q31, [x0, #256] // .....................................................................*.......................................................................................................................... + // mla v17.4S, v16.4S, v29.4S // ............................................*................................................................................................................................................... + // str q27, [x0, #64] // .....................................................................................*.......................................................................................................... + // add v14.4S, v24.4S, v25.4S // ...............................................................*................................................................................................................................ + // str q8, [x0, #448] // ................................................................................*............................................................................................................... + // add v9.4S, v18.4S, v17.4S // ......................................................*......................................................................................................................................... + // str q14, [x0], #(16) // ..................................................................................*............................................................................................................. + // sub v19.4S, v18.4S, v17.4S // .....................................................*.......................................................................................................................................... 
+ // str q9, [x0, #112] // .........................................................................*...................................................................................................................... + // str q19, [x0, #176] // .......................................................................*........................................................................................................................ + // ldr q22, [x0, #0] // ..............................................................................................................................................................*................................. + // ldr q11, [x0, #512] // ...................................................................................................................................................*............................................ + // ldr q10, [x0, #832] // ...........................................................................................................*.................................................................................... + // sqrdmulh v12.4S, v11.4S, v0.S[1] // .......................................................................................................................................................*........................................ + // mul v11.4S, v11.4S, v0.S[0] // ......................................................................................................................................................*......................................... + // sqrdmulh v8.4S, v10.4S, v0.S[1] // ...............................................................................................................*................................................................................ 
+ // ldr q27, [x0, #64] // .............................................................................................................................................................*.................................. + // mla v11.4S, v12.4S, v29.4S // ..........................................................................................................................................................*..................................... + // mul v10.4S, v10.4S, v0.S[0] // ................................................................................................................*............................................................................... + // ldr q12, [x0, #128] // ..........................................................................................................................................................................*..................... + // sub v24.4S, v22.4S, v11.4S // ..................................................................................................................................................................*............................. + // add v22.4S, v22.4S, v11.4S // ......................................................................................................................................................................*......................... + // mla v10.4S, v8.4S, v29.4S // ....................................................................................................................*........................................................................... + // ldr q8, [x0, #192] // .............................................................................................................................................................................*.................. 
+ // ldr q11, [x0, #576] // ..................................................................................................................................................*............................................. + // ldr q16, [x0, #768] // ......................................................................................................................*......................................................................... + // sqrdmulh v18.4S, v11.4S, v0.S[1] // .....................................................................................................................................................*.......................................... + // mul v11.4S, v11.4S, v0.S[0] // ....................................................................................................................................................*........................................... + // sqrdmulh v23.4S, v16.4S, v0.S[1] // .........................................................................................................................*...................................................................... + // ldr q15, [x0, #704] // ........................................................................................................................................................*....................................... + // mla v11.4S, v18.4S, v29.4S // .........................................................................................................................................................*...................................... + // mul v17.4S, v16.4S, v0.S[0] // ..........................................................................................................................*..................................................................... 
+ // ldr q16, [x0, #640] // ....................................................................................................................................................................*........................... + // add v13.4S, v27.4S, v11.4S // .................................................................................................................................................................*.............................. + // sub v31.4S, v27.4S, v11.4S // ................................................................................................................................................................*............................... + // mul v18.4S, v16.4S, v0.S[0] // .......................................................................................................................................................................*........................ + // sqrdmulh v11.4S, v16.4S, v0.S[1] // ........................................................................................................................................................................*....................... + // mla v17.4S, v23.4S, v29.4S // ..............................................................................................................................*................................................................. + // ldr q23, [x0, #256] // .........................................................................................................*...................................................................................... + // mla v18.4S, v11.4S, v29.4S // ...........................................................................................................................................................................*.................... 
+ // ldr q16, [x0, #320] // ........................................................................................................*....................................................................................... + // add v27.4S, v23.4S, v17.4S // ..................................................................................................................................*............................................................. + // add v20.4S, v12.4S, v18.4S // ..................................................................................................................................................................................*............. + // sub v25.4S, v12.4S, v18.4S // ..............................................................................................................................................................................*................. + // mul v11.4S, v27.4S, v0.S[2] // ......................................................................................................................................*......................................................... + // sqrdmulh v21.4S, v27.4S, v0.S[3] // ........................................................................................................................................*....................................................... + // add v12.4S, v16.4S, v10.4S // ........................................................................................................................*....................................................................... + // sub v18.4S, v16.4S, v10.4S // .......................................................................................................................*........................................................................ 
+ // sqrdmulh v10.4S, v15.4S, v0.S[1] // ...........................................................................................................................................................*.................................... + // mul v27.4S, v15.4S, v0.S[0] // ............................................................................................................................................................*................................... + // mla v11.4S, v21.4S, v29.4S // ..............................................................................................................................................*................................................. + // sub v23.4S, v23.4S, v17.4S // ...................................................................................................................................*............................................................ + // sqrdmulh v9.4S, v12.4S, v0.S[3] // .................................................................................................................................*.............................................................. + // mla v27.4S, v10.4S, v29.4S // ...............................................................................................................................................................*................................ + // add v15.4S, v22.4S, v11.4S // .........................................................................................................................................................................*...................... + // sub v11.4S, v22.4S, v11.4S // ............................................................................................................................................................................*................... 
+ // sqrdmulh v10.4S, v23.4S, v1.S[1] // ............................................................................................................................................*................................................... + // sub v19.4S, v8.4S, v27.4S // ................................................................................................................................................................................*............... + // mul v17.4S, v23.4S, v1.S[0] // .............................................................................................................................................*.................................................. + // add v28.4S, v8.4S, v27.4S // .......................................................................................................................................................................................*........ + // sqrdmulh v22.4S, v18.4S, v1.S[1] // ............................................................................................................................*................................................................... + // mul v27.4S, v18.4S, v1.S[0] // ................................................................................................................................*............................................................... + // ldr q8, [x0, #896] // .........................................................................................*...................................................................................................... + // mla v17.4S, v10.4S, v29.4S // .................................................................................................................................................*.............................................. 
+ // mla v27.4S, v22.4S, v29.4S // ....................................................................................................................................*........................................................... + // sqrdmulh v22.4S, v8.4S, v0.S[1] // .............................................................................................*.................................................................................................. + // mul v16.4S, v8.4S, v0.S[0] // ...................................................................................................*............................................................................................ + // sub v18.4S, v24.4S, v17.4S // ...............................................................................................................................................................................*................ + // mla v16.4S, v22.4S, v29.4S // .......................................................................................................*........................................................................................ + // ldr q30, [x0, #384] // ..............................................................................................................*................................................................................. + // ldr q10, [x0, #960] // ...........................................................................................*.................................................................................................... + // sub v8.4S, v30.4S, v16.4S // .................................................................................................................*.............................................................................. 
+ // add v26.4S, v31.4S, v27.4S // ...................................................................................................................................................................*............................ + // mul v21.4S, v10.4S, v0.S[0] // ...............................................................................................*................................................................................................ + // mul v23.4S, v8.4S, v1.S[0] // .....................................................................................................................*.......................................................................... + // sqrdmulh v8.4S, v8.4S, v1.S[1] // .......................................................................................................................................*........................................................ + // sqrdmulh v14.4S, v10.4S, v0.S[1] // .................................................................................................*.............................................................................................. + // add v22.4S, v30.4S, v16.4S // ..................................................................................................................*............................................................................. + // sub v10.4S, v31.4S, v27.4S // .....................................................................................................................................................................*.......................... + // mla v23.4S, v8.4S, v29.4S // ...............................................................................................................................................*................................................ 
+ // sqrdmulh v16.4S, v22.4S, v0.S[3] // .........................................................................................................................................*...................................................... + // mul v27.4S, v22.4S, v0.S[2] // ..........................................................................................................................................*..................................................... + // mul v8.4S, v12.4S, v0.S[2] // .....................................................................................................................................*.......................................................... + // sub v30.4S, v25.4S, v23.4S // .................................................................................................................................................................................*.............. + // add v31.4S, v25.4S, v23.4S // ...................................................................................................................................................................................*............ + // mla v27.4S, v16.4S, v29.4S // ................................................................................................................................................*............................................... + // mul v16.4S, v30.4S, v3.S[0] // ....................................................................................................................................................................................*........... + // sqrdmulh v25.4S, v30.4S, v3.S[1] // .....................................................................................................................................................................................*.......... 
+ // mla v21.4S, v14.4S, v29.4S // .....................................................................................................*.......................................................................................... + // ldr q12, [x0, #448] // ..........................................................................................................*..................................................................................... + // mla v16.4S, v25.4S, v29.4S // .........................................................................................................................................................................................*...... + // mul v25.4S, v31.4S, v2.S[2] // ..........................................................................................................................................................................................*..... + // sub v22.4S, v12.4S, v21.4S // ............................................................................................................*................................................................................... + // sqrdmulh v14.4S, v31.4S, v2.S[3] // ......................................................................................................................................................................................*......... + // add v31.4S, v18.4S, v16.4S // ..............................................................................................................................................................................................*. + // sqrdmulh v23.4S, v22.4S, v1.S[1] // ...................................................................................................................*............................................................................ 
+ // mul v30.4S, v22.4S, v1.S[0] // ...........................................................................................................................*.................................................................... + // sub v22.4S, v20.4S, v27.4S // ........................................................................................................................................................................................*....... + // add v21.4S, v12.4S, v21.4S // .............................................................................................................*.................................................................................. + // sub v12.4S, v18.4S, v16.4S // .............................................................................................................................................................................................*.. + // mla v30.4S, v23.4S, v29.4S // ...........................................................................................................................................*.................................................... + // mul v18.4S, v22.4S, v2.S[0] // ...........................................................................................................................................................................................*.... + // sqrdmulh v16.4S, v22.4S, v2.S[1] // ............................................................................................................................................................................................*... + // mul v22.4S, v21.4S, v0.S[2] // .............................................................................................................................*.................................................................. 
+ // sub v23.4S, v19.4S, v30.4S // ...............................................................................................................................................................................................* + // sqrdmulh v21.4S, v21.4S, v0.S[3] // ...............................................................................................................................*................................................................ + + sub count, count, #1 + cbnz count, layer1234_start + add v30.4S, v19.4S, v30.4S // ...............................................................................................*................................................................................................ + mul v19.4S, v23.4S, v3.S[0] // ....................................................................................................................................*........................................................... + sqrdmulh v23.4S, v23.4S, v3.S[1] // ...................................................................................................................................*............................................................ + mla v22.4S, v21.4S, v29.4S // .........................................................................*...................................................................................................................... + mul v21.4S, v30.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + add v20.4S, v20.4S, v27.4S // ......................................................................*......................................................................................................................... 
+ mla v19.4S, v23.4S, v29.4S // .....................................................................................................................................*.......................................................... + mla v8.4S, v9.4S, v29.4S // ...............................................................*................................................................................................................................ + mul v27.4S, v20.4S, v1.S[2] // .................................................................................................*.............................................................................................. + sqrdmulh v20.4S, v20.4S, v1.S[3] // ................................................................................................*............................................................................................... + add v23.4S, v10.4S, v19.4S // .......................................................................................................................................*........................................................ + sub v19.4S, v10.4S, v19.4S // ......................................................................................................................................*......................................................... + add v10.4S, v28.4S, v22.4S // ...........................................................................*.................................................................................................................... + sqrdmulh v9.4S, v23.4S, v6.S[3] // ......................................................................................................................................................................*......................... 
+ mla v27.4S, v20.4S, v29.4S // ..................................................................................................*............................................................................................. + mul v20.4S, v19.4S, v7.S[0] // ............................................................................................................................................................................*................... + sqrdmulh v19.4S, v19.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + sub v22.4S, v28.4S, v22.4S // ..........................................................................*..................................................................................................................... + mul v28.4S, v10.4S, v1.S[2] // ......................................................................................................*......................................................................................... + sqrdmulh v10.4S, v10.4S, v1.S[3] // .....................................................................................................*.......................................................................................... + mla v20.4S, v19.4S, v29.4S // .............................................................................................................................................................................*.................. + mla v25.4S, v14.4S, v29.4S // ......................................................................................................................*......................................................................... 
+ add v19.4S, v13.4S, v8.4S // .................................................................*.............................................................................................................................. + mla v28.4S, v10.4S, v29.4S // .......................................................................................................*........................................................................................ + sub v10.4S, v12.4S, v20.4S // ..............................................................................................................................................................................*................. + add v14.4S, v12.4S, v20.4S // ...............................................................................................................................................................................*................ + sqrdmulh v12.4S, v30.4S, v2.S[3] // .........................................................................................................................*...................................................................... + sub v20.4S, v19.4S, v28.4S // ........................................................................................................*....................................................................................... + str q10, [x0, #960] // ...............................................................................................................................................................................................* + add v19.4S, v19.4S, v28.4S // .........................................................................................................*...................................................................................... 
+ mla v21.4S, v12.4S, v29.4S // ...........................................................................................................................*.................................................................... + str q14, [x0, #896] // ..............................................................................................................................................................................................*. + sqrdmulh v10.4S, v19.4S, v3.S[3] // ........................................................................................................................................*....................................................... + mul v14.4S, v23.4S, v6.S[2] // .......................................................................................................................................................................*........................ + sub v23.4S, v26.4S, v21.4S // ............................................................................................................................*................................................................... + add v21.4S, v26.4S, v21.4S // .............................................................................................................................*.................................................................. + add v12.4S, v24.4S, v17.4S // ................................................................................*............................................................................................................... + mul v30.4S, v23.4S, v6.S[0] // ..................................................................................................................................................................*............................. 
+ sqrdmulh v26.4S, v23.4S, v6.S[1] // .................................................................................................................................................................*.............................. + mla v14.4S, v9.4S, v29.4S // ........................................................................................................................................................................*....................... + sqrdmulh v28.4S, v21.4S, v5.S[3] // ............................................................................................................................................................*................................... + mul v24.4S, v21.4S, v5.S[2] // .............................................................................................................................................................*.................................. + mla v30.4S, v26.4S, v29.4S // ...................................................................................................................................................................*............................ + sub v17.4S, v12.4S, v25.4S // .......................................................................................................................*........................................................................ + sqrdmulh v21.4S, v22.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mul v9.4S, v22.4S, v2.S[0] // ................................................................................................................*............................................................................... 
+ add v23.4S, v17.4S, v30.4S // .....................................................................................................................................................................*.......................... + mla v24.4S, v28.4S, v29.4S // ..............................................................................................................................................................*................................. + add v22.4S, v12.4S, v25.4S // ........................................................................................................................*....................................................................... + mla v9.4S, v21.4S, v29.4S // .................................................................................................................*.............................................................................. + sub v21.4S, v13.4S, v8.4S // ................................................................*............................................................................................................................... + mla v18.4S, v16.4S, v29.4S // ............................................................................................................*................................................................................... + add v28.4S, v22.4S, v24.4S // ................................................................................................................................................................*............................... + add v25.4S, v21.4S, v9.4S // ...................................................................................................................*............................................................................ 
+ sub v26.4S, v21.4S, v9.4S // ..................................................................................................................*............................................................................. + str q28, [x0, #512] // ........................................................................................................................................................................................*....... + mul v21.4S, v25.4S, v4.S[2] // ...................................................................................................................................................*............................................ + mul v16.4S, v26.4S, v5.S[0] // ........................................................................................................................................................*....................................... + sqrdmulh v12.4S, v26.4S, v5.S[1] // .......................................................................................................................................................*........................................ + sqrdmulh v13.4S, v25.4S, v4.S[3] // ..................................................................................................................................................*............................................. + add v26.4S, v11.4S, v18.4S // ..............................................................................................................*................................................................................. + sub v11.4S, v11.4S, v18.4S // .............................................................................................................*.................................................................................. 
+ mla v16.4S, v12.4S, v29.4S // .........................................................................................................................................................*...................................... + sub v18.4S, v15.4S, v27.4S // ...................................................................................................*............................................................................................ + mul v25.4S, v19.4S, v3.S[2] // .........................................................................................................................................*...................................................... + sub v9.4S, v31.4S, v14.4S // .........................................................................................................................................................................*...................... + add v28.4S, v11.4S, v16.4S // ...........................................................................................................................................................*.................................... + sub v8.4S, v11.4S, v16.4S // ..........................................................................................................................................................*..................................... + sub v19.4S, v22.4S, v24.4S // ...............................................................................................................................................................*................................ + mla v21.4S, v13.4S, v29.4S // ....................................................................................................................................................*........................................... 
+ sub v11.4S, v17.4S, v30.4S // ....................................................................................................................................................................*........................... + str q19, [x0, #576] // .........................................................................................................................................................................................*...... + add v14.4S, v31.4S, v14.4S // ..........................................................................................................................................................................*..................... + str q11, [x0, #704] // ...........................................................................................................................................................................................*.... + sub v16.4S, v26.4S, v21.4S // .....................................................................................................................................................*.......................................... + str q14, [x0, #768] // ............................................................................................................................................................................................*... + add v24.4S, v15.4S, v27.4S // ....................................................................................................*........................................................................................... + str q16, [x0, #320] // .....................................................................................................................................................................................*.......... + mla v25.4S, v10.4S, v29.4S // ..........................................................................................................................................*..................................................... 
+ str q23, [x0, #640] // ..........................................................................................................................................................................................*..... + add v31.4S, v26.4S, v21.4S // ......................................................................................................................................................*......................................... + str q28, [x0, #384] // ......................................................................................................................................................................................*......... + mul v17.4S, v20.4S, v4.S[0] // ..............................................................................................................................................*................................................. + sqrdmulh v16.4S, v20.4S, v4.S[1] // .............................................................................................................................................*.................................................. + str q9, [x0, #832] // .............................................................................................................................................................................................*.. + sub v27.4S, v24.4S, v25.4S // ...........................................................................................................................................*.................................................... + str q31, [x0, #256] // ....................................................................................................................................................................................*........... 
+ mla v17.4S, v16.4S, v29.4S // ...............................................................................................................................................*................................................ + str q27, [x0, #64] // .................................................................................................................................................................................*.............. + add v14.4S, v24.4S, v25.4S // ............................................................................................................................................*................................................... + str q8, [x0, #448] // .......................................................................................................................................................................................*........ + add v9.4S, v18.4S, v17.4S // .................................................................................................................................................*.............................................. + str q14, [x0], #(16) // ................................................................................................................................................................................*............... + sub v19.4S, v18.4S, v17.4S // ................................................................................................................................................*............................................... + str q9, [x0, #112] // ..................................................................................................................................................................................*............. + str q19, [x0, #176] // ...................................................................................................................................................................................*............ 
+ + restore inp, STACK0 + mov count, #16 + + .unreq root4 + .unreq root5 + .unreq root6 + .unreq root7 + .unreq qform_root4 + .unreq qform_root5 + .unreq qform_root6 + .unreq qform_root7 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 66 + // Expected cycles: 80 + // Expected IPC: 0.82 + // + // Wall time: 7.73s + // User time: 7.73s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + ldr q27, [x3], #16 // .*................................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q10, [x1, #48] // *................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q2, [x1, #32] // ..*............................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v24.4S, v10.4S, v27.S[0] // ...*.............................................................. + // gap // .................................................................. + sqrdmulh v1.4S, v10.4S, v27.S[1] // ....*............................................................. 
+ // gap // .................................................................. + ldr q8, [x1, #16] // ......*........................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v11.4S, v2.4S, v27.S[0] // ........*......................................................... + // gap // .................................................................. + mla v24.4S, v1.4S, v29.4S // .......*.......................................................... + // gap // .................................................................. + sqrdmulh v21.4S, v2.4S, v27.S[1] // .....*............................................................ + // gap // .................................................................. + ldr q18, [x3], #8 // .........*........................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sub v25.4S, v8.4S, v24.4S // ..........*....................................................... + // gap // .................................................................. + mla v11.4S, v21.4S, v29.4S // ............*..................................................... + // gap // .................................................................. + add v20.4S, v8.4S, v24.4S // ...........*...................................................... + // gap // .................................................................. + mul v21.4S, v25.4S, v18.S[0] // .............*.................................................... + // gap // .................................................................. 
+ sqrdmulh v15.4S, v25.4S, v18.S[1] // ................*................................................. + // gap // .................................................................. + sqrdmulh v10.4S, v20.4S, v27.S[3] // ..............*................................................... + // gap // .................................................................. + mul v2.4S, v20.4S, v27.S[2] // ...............*.................................................. + // gap // .................................................................. + ldr q1, [x1, #0] // .................*................................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mla v21.4S, v15.4S, v29.4S // ...................*.............................................. + // gap // .................................................................. + mla v2.4S, v10.4S, v29.4S // ..................*............................................... + // gap // .................................................................. + sub v15.4S, v1.4S, v11.4S // ....................*............................................. + // gap // .................................................................. + add v3.4S, v1.4S, v11.4S // ........................*......................................... + // gap // .................................................................. + ldr q26, [x4], #(6*16) // .....................*............................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ sub v31.4S, v15.4S, v21.4S // ......................*........................................... + // gap // .................................................................. + add v11.4S, v15.4S, v21.4S // .......................*.......................................... + // gap // .................................................................. + add v14.4S, v3.4S, v2.4S // ...........................*...................................... + // gap // .................................................................. + sub v28.4S, v3.4S, v2.4S // ..........................*....................................... + // gap // .................................................................. + trn2 v21.4S, v11.4S, v31.4S // ............................*..................................... + // gap // .................................................................. + trn1 v9.4S, v11.4S, v31.4S // .............................*.................................... + // gap // .................................................................. + trn2 v7.4S, v14.4S, v28.4S // ..............................*................................... + // gap // .................................................................. + ldr q23, [x4, #-80] // .........................*........................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v12.2D, v7.2D, v21.2D // ..................................*............................... + // gap // .................................................................. + trn1 v19.2D, v7.2D, v21.2D // ....................................*............................. + // gap // .................................................................. 
+ mul v1.4S, v12.4S, v26.4S // .......................................*.......................... + // gap // .................................................................. + sqrdmulh v24.4S, v12.4S, v23.4S // ......................................*........................... + // gap // .................................................................. + ldr q17, [x4, #-16] // ..............................................*................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v21.4S, v14.4S, v28.4S // ...............................*.................................. + // gap // .................................................................. + mla v1.4S, v24.4S, v29.4S // ..........................................*....................... + // gap // .................................................................. + ldr q22, [x4, #-48] // .........................................*........................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v15.2D, v21.2D, v9.2D // .................................*................................ + // gap // .................................................................. + add v30.4S, v19.4S, v1.4S // ...............................................*.................. + // gap // .................................................................. + mul v10.4S, v15.4S, v26.4S // ........................................*......................... + // gap // .................................................................. 
+ sub v13.4S, v19.4S, v1.4S // .............................................*.................... + // gap // .................................................................. + sqrdmulh v19.4S, v30.4S, v22.4S // ...................................................*.............. + // gap // .................................................................. + sqrdmulh v12.4S, v15.4S, v23.4S // .....................................*............................ + // gap // .................................................................. + sqrdmulh v28.4S, v13.4S, v17.4S // .................................................*................ + // gap // .................................................................. + ldr q14, [x4, #-32] // ................................*................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q15, [x4, #-64] // ............................................*..................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v1.4S, v13.4S, v14.4S // ................................................*................. + // gap // .................................................................. + mla v10.4S, v12.4S, v29.4S // ...........................................*...................... + // gap // .................................................................. + mul v24.4S, v30.4S, v15.4S // ..................................................*............... + // gap // .................................................................. 
+ trn1 v17.2D, v21.2D, v9.2D // ...................................*.............................. + // gap // .................................................................. + mla v1.4S, v28.4S, v29.4S // ....................................................*............. + // gap // .................................................................. + sub v14.4S, v17.4S, v10.4S // .....................................................*............ + // gap // .................................................................. + mla v24.4S, v19.4S, v29.4S // ......................................................*........... + // gap // .................................................................. + add v21.4S, v17.4S, v10.4S // .......................................................*.......... + // gap // .................................................................. + add v22.4S, v14.4S, v1.4S // ........................................................*......... + // gap // .................................................................. + sub v2.4S, v14.4S, v1.4S // .........................................................*........ + // gap // .................................................................. + add v19.4S, v21.4S, v24.4S // ...........................................................*...... + // gap // .................................................................. + sub v21.4S, v21.4S, v24.4S // ..........................................................*....... + // gap // .................................................................. + trn2 v7.4S, v22.4S, v2.4S // ............................................................*..... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ trn2 v14.4S, v19.4S, v21.4S // .............................................................*.... + // gap // .................................................................. + trn1 v24.4S, v19.4S, v21.4S // ...............................................................*.. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v21.2D, v14.2D, v7.2D // ..............................................................*... + // gap // .................................................................. + trn2 v1.2D, v14.2D, v7.2D // .................................................................* + // gap // .................................................................. + str q21, [x1, #16] // ................................................................*. + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // ldr q19, [x1, #48] // .*................................................................ + // ldr q17, [x3], #16 // *................................................................. + // ldr q6, [x1, #32] // ..*............................................................... + // mul v8.4S, v19.4S, v17.S[0] // ...*.............................................................. + // sqrdmulh v19.4S, v19.4S, v17.S[1] // ....*............................................................. + // sqrdmulh v11.4S, v6.4S, v17.S[1] // ........*......................................................... + // ldr q31, [x1, #16] // .....*............................................................ + // mla v8.4S, v19.4S, v29.4S // .......*.......................................................... 
+ // mul v27.4S, v6.4S, v17.S[0] // ......*........................................................... + // ldr q26, [x3], #8 // .........*........................................................ + // sub v4.4S, v31.4S, v8.4S // ..........*....................................................... + // add v25.4S, v31.4S, v8.4S // ............*..................................................... + // mla v27.4S, v11.4S, v29.4S // ...........*...................................................... + // mul v10.4S, v4.4S, v26.S[0] // .............*.................................................... + // sqrdmulh v15.4S, v25.4S, v17.S[3] // ...............*.................................................. + // mul v8.4S, v25.4S, v17.S[2] // ................*................................................. + // sqrdmulh v4.4S, v4.4S, v26.S[1] // ..............*................................................... + // ldr q21, [x1, #0] // .................*................................................ + // mla v8.4S, v15.4S, v29.4S // ...................*.............................................. + // mla v10.4S, v4.4S, v29.4S // ..................*............................................... + // sub v12.4S, v21.4S, v27.4S // ....................*............................................. + // ldr q6, [x4], #(6*16) // ......................*........................................... + // sub v19.4S, v12.4S, v10.4S // .......................*.......................................... + // add v17.4S, v12.4S, v10.4S // ........................*......................................... + // add v10.4S, v21.4S, v27.4S // .....................*............................................ + // ldr q16, [x4, #-80] // ..............................*................................... + // sub v5.4S, v10.4S, v8.4S // ..........................*....................................... 
+ // add v14.4S, v10.4S, v8.4S // .........................*........................................ + // trn2 v21.4S, v17.4S, v19.4S // ...........................*...................................... + // trn1 v30.4S, v17.4S, v19.4S // ............................*..................................... + // trn2 v28.4S, v14.4S, v5.4S // .............................*.................................... + // trn1 v2.4S, v14.4S, v5.4S // ....................................*............................. + // ldr q4, [x4, #-32] // ..............................................*................... + // trn2 v9.2D, v2.2D, v30.2D // .......................................*.......................... + // trn2 v13.2D, v28.2D, v21.2D // ...............................*.................................. + // trn1 v15.2D, v2.2D, v30.2D // ...................................................*.............. + // trn1 v2.2D, v28.2D, v21.2D // ................................*................................. + // sqrdmulh v19.4S, v9.4S, v16.4S // ............................................*..................... + // sqrdmulh v14.4S, v13.4S, v16.4S // ..................................*............................... + // mul v18.4S, v13.4S, v6.4S // .................................*................................ + // mul v11.4S, v9.4S, v6.4S // .........................................*........................ + // ldr q6, [x4, #-48] // ......................................*........................... + // mla v18.4S, v14.4S, v29.4S // .....................................*............................ + // mla v11.4S, v19.4S, v29.4S // .................................................*................ + // ldr q10, [x4, #-64] // ...............................................*.................. + // sub v26.4S, v2.4S, v18.4S // ..........................................*....................... 
+ // ldr q25, [x4, #-16] // ...................................*.............................. + // add v17.4S, v2.4S, v18.4S // ........................................*......................... + // mul v2.4S, v26.4S, v4.4S // ................................................*................. + // sqrdmulh v16.4S, v26.4S, v25.4S // .............................................*.................... + // mul v5.4S, v17.4S, v10.4S // ..................................................*............... + // sqrdmulh v25.4S, v17.4S, v6.4S // ...........................................*...................... + // mla v2.4S, v16.4S, v29.4S // ....................................................*............. + // sub v10.4S, v15.4S, v11.4S // .....................................................*............ + // mla v5.4S, v25.4S, v29.4S // ......................................................*........... + // add v21.4S, v15.4S, v11.4S // .......................................................*.......... + // add v22.4S, v10.4S, v2.4S // ........................................................*......... + // sub v2.4S, v10.4S, v2.4S // .........................................................*........ + // sub v12.4S, v21.4S, v5.4S // ...........................................................*...... + // add v26.4S, v21.4S, v5.4S // ..........................................................*....... + // trn2 v28.4S, v22.4S, v2.4S // ............................................................*..... + // trn2 v13.4S, v26.4S, v12.4S // .............................................................*.... + // trn1 v23.2D, v13.2D, v28.2D // ...............................................................*.. + // trn1 v24.4S, v26.4S, v12.4S // ..............................................................*... 
+ // str q23, [x1, #16] // .................................................................* + // trn2 v1.2D, v13.2D, v28.2D // ................................................................*. + + sub count, count, #1 +layer5678_start: + // Instructions: 72 + // Expected cycles: 84 + // Expected IPC: 0.86 + // + // Wall time: 25.46s + // User time: 25.46s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q19, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q17, [x3], #16 // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q6, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v8.4S, v19.4S, v17.S[0] // ............e........................................................... + // gap // ........................................................................ + sqrdmulh v19.4S, v19.4S, v17.S[1] // ...........e............................................................ + // gap // ........................................................................ 
+ sqrdmulh v11.4S, v6.4S, v17.S[1] // ......e................................................................. + // gap // ........................................................................ + ldr q31, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v8.4S, v19.4S, v29.4S // .............e.......................................................... + // gap // ........................................................................ + mul v27.4S, v6.4S, v17.S[0] // .......e................................................................ + // gap // ........................................................................ + ldr q26, [x3], #8 // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v4.4S, v31.4S, v8.4S // ..............e......................................................... + // gap // ........................................................................ + add v25.4S, v31.4S, v8.4S // ...............e........................................................ + // gap // ........................................................................ + mla v27.4S, v11.4S, v29.4S // ........e............................................................... + // gap // ........................................................................ + mul v10.4S, v4.4S, v26.S[0] // ......................e................................................. 
+ // gap // ........................................................................ + sqrdmulh v15.4S, v25.4S, v17.S[3] // ................e....................................................... + // gap // ........................................................................ + mul v8.4S, v25.4S, v17.S[2] // .................e...................................................... + // gap // ........................................................................ + sqrdmulh v4.4S, v4.4S, v26.S[1] // .....................e.................................................. + // gap // ........................................................................ + ldr q21, [x1, #64] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v8.4S, v15.4S, v29.4S // ..................e..................................................... + // gap // ........................................................................ + mla v10.4S, v4.4S, v29.4S // .......................e................................................ + // gap // ........................................................................ + sub v12.4S, v21.4S, v27.4S // .........e.............................................................. + // gap // ........................................................................ + ldr q6, [x4], #(6*16) // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v19.4S, v12.4S, v10.4S // ........................e............................................... + // gap // ........................................................................ + add v17.4S, v12.4S, v10.4S // .........................e.............................................. + // gap // ........................................................................ + add v10.4S, v21.4S, v27.4S // ..........e............................................................. + // gap // ........................................................................ + ldr q16, [x4, #-80] // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v5.4S, v10.4S, v8.4S // ...................e.................................................... + // gap // ........................................................................ + add v14.4S, v10.4S, v8.4S // ....................e................................................... + // gap // ........................................................................ + trn2 v21.4S, v17.4S, v19.4S // .............................e.......................................... + // gap // ........................................................................ + trn1 v30.4S, v17.4S, v19.4S // ............................e........................................... + // gap // ........................................................................ + trn2 v28.4S, v14.4S, v5.4S // ...........................e............................................ + // gap // ........................................................................ + trn1 v23.4S, v22.4S, v2.4S // ..............................................................*......... 
+ // gap // ........................................................................ + trn1 v2.4S, v14.4S, v5.4S // ..........................e............................................. + // gap // ........................................................................ + ldr q4, [x4, #-32] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v9.2D, v2.2D, v30.2D // ..............................e......................................... + // gap // ........................................................................ + trn2 v13.2D, v28.2D, v21.2D // ...............................e........................................ + // gap // ........................................................................ + trn1 v15.2D, v2.2D, v30.2D // ................................e....................................... + // gap // ........................................................................ + trn1 v2.2D, v28.2D, v21.2D // .................................e...................................... + // gap // ........................................................................ + sqrdmulh v19.4S, v9.4S, v16.4S // ........................................e............................... + // gap // ........................................................................ + sqrdmulh v14.4S, v13.4S, v16.4S // .............................................e.......................... + // gap // ........................................................................ + mul v18.4S, v13.4S, v6.4S // ..............................................e......................... + // gap // ........................................................................ 
+ mul v11.4S, v9.4S, v6.4S // .........................................e.............................. + // gap // ........................................................................ + ldr q6, [x4, #-48] // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v18.4S, v14.4S, v29.4S // ...............................................e........................ + // gap // ........................................................................ + mla v11.4S, v19.4S, v29.4S // ..........................................e............................. + // gap // ........................................................................ + ldr q10, [x4, #-64] // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v26.4S, v2.4S, v18.4S // ................................................e....................... + // gap // ........................................................................ + ldr q25, [x4, #-16] // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v17.4S, v2.4S, v18.4S // .................................................e...................... 
+ // gap // ........................................................................ + mul v2.4S, v26.4S, v4.4S // ........................................................e............... + // gap // ........................................................................ + sqrdmulh v16.4S, v26.4S, v25.4S // .......................................................e................ + // gap // ........................................................................ + mul v5.4S, v17.4S, v10.4S // ...................................................e.................... + // gap // ........................................................................ + sqrdmulh v25.4S, v17.4S, v6.4S // ..................................................e..................... + // gap // ........................................................................ + trn1 v7.2D, v24.2D, v23.2D // ..................................................................*..... + // gap // ........................................................................ + mla v2.4S, v16.4S, v29.4S // .........................................................e.............. + // gap // ........................................................................ + sub v10.4S, v15.4S, v11.4S // ...........................................e............................ + // gap // ........................................................................ + mla v5.4S, v25.4S, v29.4S // ....................................................e................... + // gap // ........................................................................ + add v21.4S, v15.4S, v11.4S // ............................................e........................... + // gap // ........................................................................ + add v22.4S, v10.4S, v2.4S // ...........................................................e............ + // gap // ........................................................................ 
+ sub v2.4S, v10.4S, v2.4S // ..........................................................e............. + // gap // ........................................................................ + sub v12.4S, v21.4S, v5.4S // .....................................................e.................. + // gap // ........................................................................ + add v26.4S, v21.4S, v5.4S // ......................................................e................. + // gap // ........................................................................ + str q7, [x1], #64 // ....................................................................*... + // gap // ........................................................................ + trn2 v28.4S, v22.4S, v2.4S // ...............................................................e........ + // gap // ........................................................................ + trn2 v13.4S, v26.4S, v12.4S // .............................................................e.......... + // gap // ........................................................................ + trn2 v8.2D, v24.2D, v23.2D // ................................................................*....... + // gap // ........................................................................ + str q1, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + trn1 v23.2D, v13.2D, v28.2D // ...................................................................e.... + // gap // ........................................................................ + trn1 v24.4S, v26.4S, v12.4S // ............................................................e........... + // gap // ........................................................................ + str q23, [x1, #16] // .....................................................................e.. 
+ // gap // ........................................................................ + trn2 v1.2D, v13.2D, v28.2D // .................................................................e...... + // gap // ........................................................................ + str q8, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q8, [x1, #(16*0)] // .................e......................................................'................~...................................................... + // ldr q9, [x1, #(16*1)] // ......e.................................................................'.....~................................................................. + // ldr q10, [x1, #(16*2)] // ..e.....................................................................'.~..................................................................... + // ldr q11, [x1, #(16*3)] // e.......................................................................~....................................................................... + // ldr q0, [x3], #16 // .e......................................................................'~...................................................................... + // ldr q1, [x3], #8 // .........e..............................................................'........~.............................................................. 
+ // sqrdmulh v27.4s, v10.4s, v0.s[1] // .....e..................................................................'....~.................................................................. + // mul v24.4s, v10.4s, v0.s[0] // ........e...............................................................'.......~............................................................... + // mla v24.4s, v27.4s, v29.4s // ............e...........................................................'...........~........................................................... + // sub v10.4s, v8.4s, v24.4s // ....................e...................................................'...................~................................................... + // add v8.4s, v8.4s, v24.4s // ........................e...............................................'.......................~............................................... + // sqrdmulh v27.4s, v11.4s, v0.s[1] // ....e...................................................................'...~................................................................... + // mul v24.4s, v11.4s, v0.s[0] // ...e....................................................................'..~.................................................................... + // mla v24.4s, v27.4s, v29.4s // .......e................................................................'......~................................................................ + // sub v11.4s, v9.4s, v24.4s // ..........e.............................................................'.........~............................................................. + // add v9.4s, v9.4s, v24.4s // ...........e............................................................'..........~............................................................ + // sqrdmulh v27.4s, v9.4s, v0.s[3] // ..............e.........................................................'.............~......................................................... 
+ // mul v24.4s, v9.4s, v0.s[2] // ...............e........................................................'..............~........................................................ + // mla v24.4s, v27.4s, v29.4s // ..................e.....................................................'.................~..................................................... + // sub v9.4s, v8.4s, v24.4s // ..........................e.............................................'.........................~............................................. + // add v8.4s, v8.4s, v24.4s // ...........................e............................................'..........................~............................................ + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ................e.......................................................'...............~....................................................... + // mul v24.4s, v11.4s, v1.s[0] // .............e..........................................................'............~.......................................................... + // mla v24.4s, v27.4s, v29.4s // ...................e....................................................'..................~.................................................... + // sub v11.4s, v10.4s, v24.4s // ......................e.................................................'.....................~................................................. + // add v10.4s, v10.4s, v24.4s // .......................e................................................'......................~................................................ + // trn1 v25.4s, v8.4s, v9.4s // ................................e.......................................'...............................~....................................... + // trn2 v26.4s, v8.4s, v9.4s // ..............................e.........................................'.............................~......................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // .............................e..........................................'............................~.......................................... + // trn2 v28.4s, v10.4s, v11.4s // ............................e...........................................'...........................~........................................... + // trn2 v10.2d, v25.2d, v27.2d // ..................................e.....................................'.................................~..................................... + // trn2 v11.2d, v26.2d, v28.2d // ...................................e....................................'..................................~.................................... + // trn1 v8.2d, v25.2d, v27.2d // ....................................e...................................'...................................~................................... + // trn1 v9.2d, v26.2d, v28.2d // .....................................e..................................'....................................~.................................. + // ldr q0, [ x4], #(6*16) // .....................e..................................................'....................~.................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .........................e..............................................'........................~.............................................. + // ldr q1, [ x4, #(-6*16 + 2*16)] // .............................................e..........................'............................................~.......................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..........................................e.............................'.........................................~............................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // .................................e......................................'................................~...................................... 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // ...............................................e........................'..............................................~........................ + // sqrdmulh v27.4s, v10.4s, v4.4s // ......................................e.................................'.....................................~................................. + // mul v24.4s, v10.4s, v0.4s // .........................................e..............................'........................................~.............................. + // mla v24.4s, v27.4s, v29.4s // ............................................e...........................'...........................................~........................... + // sub v10.4s, v8.4s, v24.4s // .......................................................e................'......................................................~................ + // add v8.4s, v8.4s, v24.4s // .........................................................e..............'........................................................~.............. + // sqrdmulh v27.4s, v11.4s, v4.4s // .......................................e................................'......................................~................................ + // mul v24.4s, v11.4s, v0.4s // ........................................e...............................'.......................................~............................... + // mla v24.4s, v27.4s, v29.4s // ...........................................e............................'..........................................~............................ + // sub v11.4s, v9.4s, v24.4s // ..............................................e.........................'.............................................~......................... + // add v9.4s, v9.4s, v24.4s // ................................................e.......................'...............................................~....................... 
+ // sqrdmulh v27.4s, v9.4s, v5.4s // ....................................................e...................'...................................................~................... + // mul v24.4s, v9.4s, v1.4s // ...................................................e....................'..................................................~.................... + // mla v24.4s, v27.4s, v29.4s // ........................................................e...............'.......................................................~............... + // sub v9.4s, v8.4s, v24.4s // ............................................................e...........'...........................................................~........... + // add v8.4s, v8.4s, v24.4s // .............................................................e..........'............................................................~.......... + // sqrdmulh v27.4s, v11.4s, v6.4s // ..................................................e.....................'.................................................~..................... + // mul v24.4s, v11.4s, v2.4s // .................................................e......................'................................................~...................... + // mla v24.4s, v27.4s, v29.4s // ......................................................e.................'.....................................................~................. + // sub v11.4s, v10.4s, v24.4s // ...........................................................e............'..........................................................~............ + // add v10.4s, v10.4s, v24.4s // ..........................................................e.............'.........................................................~............. + // trn1 v25.4s, v8.4s, v9.4s // ....................................................................e...'...................................................................~... 
+ // trn2 v26.4s, v8.4s, v9.4s // ................................................................e.......'...............................................................~....... + // trn1 v27.4s, v10.4s, v11.4s // ...............................~........................................'..............................*........................................ + // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e........'..............................................................~........ + // trn2 v10.2d, v25.2d, v27.2d // .................................................................~......'................................................................*...... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e.'.....................................................................~. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................~..................'....................................................*.................. + // trn1 v9.2d, v26.2d, v28.2d // ...................................................................e....'..................................................................~.... + // str q8, [x1], #64 // ..............................................................~.........'.............................................................*......... + // str q9, [x1, #(-(64) + 16*1)] // .....................................................................e..'....................................................................~.. + // str q10, [x1, #(-(64) + 16*2)] // .......................................................................~'......................................................................* + // str q11, [x1, #(-(64) + 16*3)] // ..................................................................~.....'.................................................................*..... 
+ + sub count, count, #1 + cbnz count, layer5678_start + // Instructions: 6 + // Expected cycles: 8 + // Expected IPC: 0.75 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + trn1 v21.4S, v22.4S, v2.4S // *............................. + // gap // .............................. + str q1, [x1, #48] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + trn2 v22.2D, v24.2D, v21.2D // ...*.......................... + // gap // .............................. + trn1 v11.2D, v24.2D, v21.2D // .*............................ + // gap // .............................. + str q22, [x1, #32] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q11, [x1], #64 // ..*........................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // trn1 v23.4S, v22.4S, v2.4S // *.............................. + // trn1 v7.2D, v24.2D, v23.2D // ...*........................... + // str q7, [x1], #64 // .....*......................... + // trn2 v8.2D, v24.2D, v23.2D // ..*............................ + // str q1, [x1, #-16] // .*............................. + // str q8, [x1, #-32] // ....*.......................... 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm.s index 7af235b0..c0358b3c 100644 --- a/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -67,15 +46,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +63,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 -.endm - 
.macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -97,38 +70,38 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, 
[\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data0, data1, data2, data3 @@ -143,7 +116,7 @@ trn1 \data1\().2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -154,7 +127,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -164,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -172,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -183,19 +156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -296,589 +269,598 @@ _ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm: load_roots_1234 .p2align 2 - ldr q16, [x0, #896] // ..............*................................................................................................................................................................................. - ldr q21, [x0, #960] // ...............*................................................................................................................................................................................ 
- ldr q15, [x0, #448] // .......*........................................................................................................................................................................................ - ldr q9, [x0, #512] // ........*....................................................................................................................................................................................... - ldr q30, [x0, #832] // .............*.................................................................................................................................................................................. - ldr q20, [x0, #576] // .........*...................................................................................................................................................................................... - ldr q27, [x0, #768] // ............*................................................................................................................................................................................... - ldr q28, [x0, #384] // ......*......................................................................................................................................................................................... - ldr q23, [x0, #256] // ....*........................................................................................................................................................................................... - ldr q17, [x0, #704] // ...........*.................................................................................................................................................................................... - ldr q11, [x0, #0] // *............................................................................................................................................................................................... 
- ldr q22, [x0, #320] // .....*.......................................................................................................................................................................................... - sqrdmulh v24.4S, v16.4S, v0.S[1] // ...............................................*................................................................................................................................................ - mul v16.4S, v16.4S, v0.S[0] // ..............................................*................................................................................................................................................. - sqrdmulh v18.4S, v21.4S, v0.S[1] // ....................................................*........................................................................................................................................... - mul v21.4S, v21.4S, v0.S[0] // ...................................................*............................................................................................................................................ - ldr q12, [x0, #640] // ..........*..................................................................................................................................................................................... - ldr q14, [x0, #192] // ...*............................................................................................................................................................................................ - ldr q19, [x0, #128] // ..*............................................................................................................................................................................................. 
- sqrdmulh v26.4S, v30.4S, v0.S[1] // ..........................................*..................................................................................................................................................... - mul v13.4S, v30.4S, v0.S[0] // .........................................*...................................................................................................................................................... - mla v16.4S, v24.4S, v29.4S // ................................................*............................................................................................................................................... - mul v24.4S, v9.4S, v0.S[0] // ................*............................................................................................................................................................................... - mla v21.4S, v18.4S, v29.4S // .....................................................*.......................................................................................................................................... - sqrdmulh v25.4S, v27.4S, v0.S[1] // .....................................*.......................................................................................................................................................... - mul v30.4S, v27.4S, v0.S[0] // ....................................*........................................................................................................................................................... - sqrdmulh v8.4S, v9.4S, v0.S[1] // .................*.............................................................................................................................................................................. 
- sqrdmulh v10.4S, v12.4S, v0.S[1] // ...........................*.................................................................................................................................................................... - sqrdmulh v31.4S, v17.4S, v0.S[1] // ................................*............................................................................................................................................................... - mul v18.4S, v12.4S, v0.S[0] // ..........................*..................................................................................................................................................................... - mul v27.4S, v17.4S, v0.S[0] // ...............................*................................................................................................................................................................ - sub v12.4S, v28.4S, v16.4S // .................................................*.............................................................................................................................................. - mla v13.4S, v26.4S, v29.4S // ...........................................*.................................................................................................................................................... - add v26.4S, v28.4S, v16.4S // ..................................................*............................................................................................................................................. - sqrdmulh v28.4S, v20.4S, v0.S[1] // ......................*......................................................................................................................................................................... 
- mul v16.4S, v20.4S, v0.S[0] // .....................*.......................................................................................................................................................................... - add v20.4S, v15.4S, v21.4S // .......................................................*........................................................................................................................................ - mla v30.4S, v25.4S, v29.4S // ......................................*......................................................................................................................................................... - mla v18.4S, v10.4S, v29.4S // ............................*................................................................................................................................................................... - mla v27.4S, v31.4S, v29.4S // .................................*.............................................................................................................................................................. - sub v21.4S, v15.4S, v21.4S // ......................................................*......................................................................................................................................... - sqrdmulh v15.4S, v12.4S, v1.S[1] // .......................................................................................*........................................................................................................ - ldr q9, [x0, #64] // .*.............................................................................................................................................................................................. 
- mul v17.4S, v20.4S, v0.S[2] // .......................................................................*........................................................................................................................ - sqrdmulh v20.4S, v20.4S, v0.S[3] // ........................................................................*....................................................................................................................... - mul v12.4S, v12.4S, v1.S[0] // ......................................................................................*......................................................................................................... - mla v16.4S, v28.4S, v29.4S // .......................*........................................................................................................................................................................ - sub v28.4S, v22.4S, v13.4S // ............................................*................................................................................................................................................... - add v22.4S, v22.4S, v13.4S // .............................................*.................................................................................................................................................. - sub v13.4S, v23.4S, v30.4S // .......................................*........................................................................................................................................................ - add v30.4S, v23.4S, v30.4S // ........................................*....................................................................................................................................................... 
- sub v23.4S, v19.4S, v18.4S // .............................*.................................................................................................................................................................. - add v18.4S, v19.4S, v18.4S // ..............................*................................................................................................................................................................. - add v19.4S, v14.4S, v27.4S // ...................................*............................................................................................................................................................ - sub v27.4S, v14.4S, v27.4S // ..................................*............................................................................................................................................................. - mul v14.4S, v21.4S, v1.S[0] // ...........................................................................................*.................................................................................................... - sqrdmulh v21.4S, v21.4S, v1.S[1] // ............................................................................................*................................................................................................... - mla v24.4S, v8.4S, v29.4S // ..................*............................................................................................................................................................................. - sqrdmulh v25.4S, v28.4S, v1.S[1] // ..................................................................................*............................................................................................................. 
- mla v17.4S, v20.4S, v29.4S // .........................................................................*...................................................................................................................... - mul v20.4S, v26.4S, v0.S[2] // ..................................................................*............................................................................................................................. - sqrdmulh v26.4S, v26.4S, v0.S[3] // ...................................................................*............................................................................................................................ - mla v14.4S, v21.4S, v29.4S // .............................................................................................*.................................................................................................. - sqrdmulh v21.4S, v22.4S, v0.S[3] // ..............................................................*................................................................................................................................. - mul v31.4S, v22.4S, v0.S[2] // .............................................................*.................................................................................................................................. - sqrdmulh v10.4S, v13.4S, v1.S[1] // .............................................................................*.................................................................................................................. - mul v8.4S, v13.4S, v1.S[0] // ............................................................................*................................................................................................................... 
- mla v12.4S, v15.4S, v29.4S // ........................................................................................*....................................................................................................... - mul v13.4S, v30.4S, v0.S[2] // ........................................................*....................................................................................................................................... - mul v28.4S, v28.4S, v1.S[0] // .................................................................................*.............................................................................................................. - mla v20.4S, v26.4S, v29.4S // ....................................................................*........................................................................................................................... - add v15.4S, v19.4S, v17.4S // ...........................................................................*.................................................................................................................... - sqrdmulh v26.4S, v30.4S, v0.S[3] // .........................................................*...................................................................................................................................... - sub v19.4S, v19.4S, v17.4S // ..........................................................................*..................................................................................................................... - sub v17.4S, v27.4S, v14.4S // ..............................................................................................*................................................................................................. 
- add v27.4S, v27.4S, v14.4S // ...............................................................................................*................................................................................................ - mla v31.4S, v21.4S, v29.4S // ...............................................................*................................................................................................................................ - sqrdmulh v21.4S, v19.4S, v2.S[1] // ................................................................................................................*............................................................................... - mul v14.4S, v19.4S, v2.S[0] // ...............................................................................................................*................................................................................ - mul v19.4S, v15.4S, v1.S[2] // .....................................................................................................*.......................................................................................... - sqrdmulh v15.4S, v15.4S, v1.S[3] // ......................................................................................................*......................................................................................... - sqrdmulh v30.4S, v17.4S, v3.S[1] // ....................................................................................................................................*........................................................... - mul v17.4S, v17.4S, v3.S[0] // ...................................................................................................................................*............................................................ 
- add v22.4S, v9.4S, v16.4S // .........................*...................................................................................................................................................................... - mla v28.4S, v25.4S, v29.4S // ...................................................................................*............................................................................................................ - sub v16.4S, v9.4S, v16.4S // ........................*....................................................................................................................................................................... - add v25.4S, v18.4S, v20.4S // ......................................................................*......................................................................................................................... - mul v9.4S, v27.4S, v2.S[2] // .........................................................................................................................*...................................................................... - sqrdmulh v27.4S, v27.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - mla v14.4S, v21.4S, v29.4S // .................................................................................................................*.............................................................................. - mla v19.4S, v15.4S, v29.4S // .......................................................................................................*........................................................................................ 
- sub v15.4S, v22.4S, v31.4S // ................................................................*............................................................................................................................... - sub v21.4S, v23.4S, v12.4S // .........................................................................................*...................................................................................................... - add v22.4S, v22.4S, v31.4S // .................................................................*.............................................................................................................................. - mla v17.4S, v30.4S, v29.4S // .....................................................................................................................................*.......................................................... - sub v30.4S, v16.4S, v28.4S // ....................................................................................*........................................................................................................... + ldr q15, [x0, #256] // ....*........................................................................................................................................................................................... + ldr q20, [x0, #704] // ...........*.................................................................................................................................................................................... + ldr q30, [x0, #960] // ...............*................................................................................................................................................................................ + ldr q16, [x0, #768] // ............*................................................................................................................................................................................... 
+ ldr q10, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q12, [x0, #896] // ..............*................................................................................................................................................................................. + ldr q8, [x0, #832] // .............*.................................................................................................................................................................................. + ldr q19, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q28, [x0, #640] // ..........*..................................................................................................................................................................................... + ldr q31, [x0, #448] // .......*........................................................................................................................................................................................ + sqrdmulh v26.4S, v30.4S, v0.S[1] // ...................................................*............................................................................................................................................ + mul v11.4S, v30.4S, v0.S[0] // ....................................................*........................................................................................................................................... + mul v9.4S, v20.4S, v0.S[0] // ................................*............................................................................................................................................................... 
+ sqrdmulh v24.4S, v20.4S, v0.S[1] // ...............................*................................................................................................................................................................ + ldr q20, [x0, #192] // ...*............................................................................................................................................................................................ + sqrdmulh v22.4S, v16.4S, v0.S[1] // ....................................*........................................................................................................................................................... + mul v17.4S, v16.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + mul v27.4S, v12.4S, v0.S[0] // ...............................................*................................................................................................................................................ + sqrdmulh v13.4S, v10.4S, v0.S[1] // ................*............................................................................................................................................................................... + ldr q14, [x0, #384] // ......*......................................................................................................................................................................................... + sqrdmulh v30.4S, v12.4S, v0.S[1] // ..............................................*................................................................................................................................................. 
+ mla v11.4S, v26.4S, v29.4S // .....................................................*.......................................................................................................................................... + mul v26.4S, v28.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + mla v17.4S, v22.4S, v29.4S // ......................................*......................................................................................................................................................... + sqrdmulh v16.4S, v8.4S, v0.S[1] // .........................................*...................................................................................................................................................... + mla v9.4S, v24.4S, v29.4S // .................................*.............................................................................................................................................................. + mul v24.4S, v8.4S, v0.S[0] // ..........................................*..................................................................................................................................................... + sqrdmulh v21.4S, v28.4S, v0.S[1] // ..........................*..................................................................................................................................................................... + sqrdmulh v22.4S, v19.4S, v0.S[1] // .....................*.......................................................................................................................................................................... 
+ mla v27.4S, v30.4S, v29.4S // ................................................*............................................................................................................................................... + sub v25.4S, v31.4S, v11.4S // ......................................................*......................................................................................................................................... + add v8.4S, v31.4S, v11.4S // .......................................................*........................................................................................................................................ + sub v12.4S, v15.4S, v17.4S // .......................................*........................................................................................................................................................ + ldr q28, [x0, #320] // .....*.......................................................................................................................................................................................... + add v11.4S, v15.4S, v17.4S // ........................................*....................................................................................................................................................... + mla v24.4S, v16.4S, v29.4S // ...........................................*.................................................................................................................................................... + mul v15.4S, v19.4S, v0.S[0] // ......................*......................................................................................................................................................................... 
+ mla v26.4S, v21.4S, v29.4S // ............................*................................................................................................................................................................... + sqrdmulh v19.4S, v25.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + mul v25.4S, v25.4S, v1.S[0] // ............................................................................................*................................................................................................... + ldr q17, [x0, #128] // ..*............................................................................................................................................................................................. + sub v21.4S, v14.4S, v27.4S // .................................................*.............................................................................................................................................. + add v31.4S, v14.4S, v27.4S // ..................................................*............................................................................................................................................. + sub v23.4S, v20.4S, v9.4S // ..................................*............................................................................................................................................................. + sqrdmulh v27.4S, v8.4S, v0.S[3] // .......................................................................*........................................................................................................................ 
+ sub v14.4S, v28.4S, v24.4S // ............................................*................................................................................................................................................... + mul v16.4S, v8.4S, v0.S[2] // ........................................................................*....................................................................................................................... + add v18.4S, v28.4S, v24.4S // .............................................*.................................................................................................................................................. + mul v30.4S, v21.4S, v1.S[0] // .......................................................................................*........................................................................................................ + sqrdmulh v28.4S, v21.4S, v1.S[1] // ......................................................................................*......................................................................................................... + mul v8.4S, v11.4S, v0.S[2] // .........................................................*...................................................................................................................................... + ldr q21, [x0, #64] // .*.............................................................................................................................................................................................. + mla v25.4S, v19.4S, v29.4S // .............................................................................................*.................................................................................................. 
+ sqrdmulh v19.4S, v14.4S, v1.S[1] // .................................................................................*.............................................................................................................. + mul v24.4S, v14.4S, v1.S[0] // ..................................................................................*............................................................................................................. + mul v14.4S, v10.4S, v0.S[0] // .................*.............................................................................................................................................................................. + mla v15.4S, v22.4S, v29.4S // .......................*........................................................................................................................................................................ + mul v10.4S, v12.4S, v1.S[0] // .............................................................................*.................................................................................................................. + add v22.4S, v20.4S, v9.4S // ...................................*............................................................................................................................................................ + mla v30.4S, v28.4S, v29.4S // ........................................................................................*....................................................................................................... + sqrdmulh v20.4S, v31.4S, v0.S[3] // ..................................................................*............................................................................................................................. 
+ mul v31.4S, v31.4S, v0.S[2] // ...................................................................*............................................................................................................................ + ldr q28, [x0, #0] // *............................................................................................................................................................................................... + sqrdmulh v9.4S, v11.4S, v0.S[3] // ........................................................*....................................................................................................................................... + sub v11.4S, v17.4S, v26.4S // .............................*.................................................................................................................................................................. + mla v14.4S, v13.4S, v29.4S // ..................*............................................................................................................................................................................. + mla v16.4S, v27.4S, v29.4S // .........................................................................*...................................................................................................................... + add v13.4S, v23.4S, v25.4S // ...............................................................................................*................................................................................................ + add v26.4S, v17.4S, v26.4S // ..............................*................................................................................................................................................................. 
+ add v17.4S, v21.4S, v15.4S // .........................*...................................................................................................................................................................... + add v27.4S, v11.4S, v30.4S // ..........................................................................................*..................................................................................................... + mla v31.4S, v20.4S, v29.4S // ....................................................................*........................................................................................................................... + sub v20.4S, v11.4S, v30.4S // .........................................................................................*...................................................................................................... + sqrdmulh v30.4S, v12.4S, v1.S[1] // ............................................................................*................................................................................................................... + add v11.4S, v28.4S, v14.4S // ....................*........................................................................................................................................................................... + sub v28.4S, v28.4S, v14.4S // ...................*............................................................................................................................................................................ + sub v23.4S, v23.4S, v25.4S // ..............................................................................................*................................................................................................. 
+ mla v24.4S, v19.4S, v29.4S // ...................................................................................*............................................................................................................ + mla v8.4S, v9.4S, v29.4S // ..........................................................*..................................................................................................................................... + sqrdmulh v14.4S, v27.4S, v2.S[3] // ....................................................................................................................*........................................................................... + mul v9.4S, v27.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sqrdmulh v19.4S, v18.4S, v0.S[3] // .............................................................*.................................................................................................................................. + sub v12.4S, v26.4S, v31.4S // .....................................................................*.......................................................................................................................... + add v25.4S, v22.4S, v16.4S // ...........................................................................*.................................................................................................................... + sub v16.4S, v22.4S, v16.4S // ..........................................................................*..................................................................................................................... 
+ mul v22.4S, v13.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + mla v10.4S, v30.4S, v29.4S // ..............................................................................*................................................................................................................. + mul v18.4S, v18.4S, v0.S[2] // ..............................................................*................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v2.S[3] // .........................................................................................................................*...................................................................... + add v30.4S, v26.4S, v31.4S // ......................................................................*......................................................................................................................... + mul v26.4S, v20.4S, v3.S[0] // ...............................................................................................................................*................................................................ + sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................................................*................................................................. + sub v27.4S, v11.4S, v8.4S // ...........................................................*.................................................................................................................................... 
+ mla v9.4S, v14.4S, v29.4S // ......................................................................................................................*......................................................................... + sub v21.4S, v21.4S, v15.4S // ........................*....................................................................................................................................................................... + mul v14.4S, v23.4S, v3.S[0] // ....................................................................................................................................*........................................................... sub count, count, #1 layer1234_start: - mla v13.4S, v26.4S, v29.4S // ..........................................................*..................................................................................................................................... - mla v9.4S, v27.4S, v29.4S // ...........................................................................................................................*.................................................................... - add v28.4S, v16.4S, v28.4S // .....................................................................................*.......................................................................................................... - add v31.4S, v30.4S, v17.4S // .......................................................................................................................................*........................................................ - sub v26.4S, v30.4S, v17.4S // ......................................................................................................................................*......................................................... 
- sub v16.4S, v18.4S, v20.4S // .....................................................................*.......................................................................................................................... - mla v8.4S, v10.4S, v29.4S // ..............................................................................*................................................................................................................. - sqrdmulh v30.4S, v21.4S, v3.S[1] // ...............................................................................................................................*................................................................ - mul v20.4S, v21.4S, v3.S[0] // ..............................................................................................................................*................................................................. - add v21.4S, v23.4S, v12.4S // ..........................................................................................*..................................................................................................... - sub v23.4S, v28.4S, v9.4S // ............................................................................................................................*................................................................... - add v28.4S, v28.4S, v9.4S // .............................................................................................................................*.................................................................. - mul v9.4S, v31.4S, v6.S[2] // ......................................................................................................................................................................*......................... 
- sqrdmulh v10.4S, v16.4S, v2.S[1] // ...........................................................................................................*.................................................................................... - add v27.4S, v15.4S, v14.4S // ...................................................................................................................*............................................................................ - sub v17.4S, v11.4S, v24.4S // ...................*............................................................................................................................................................................ - add v18.4S, v11.4S, v24.4S // ....................*........................................................................................................................................................................... - sub v12.4S, v15.4S, v14.4S // ..................................................................................................................*............................................................................. - mul v15.4S, v16.4S, v2.S[0] // ..........................................................................................................*..................................................................................... - mla v20.4S, v30.4S, v29.4S // ................................................................................................................................*............................................................... - mla v15.4S, v10.4S, v29.4S // ............................................................................................................*................................................................................... 
- sqrdmulh v10.4S, v21.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - mul v14.4S, v21.4S, v2.S[2] // ....................................................................................................................*........................................................................... - sub v11.4S, v18.4S, v13.4S // ...........................................................*.................................................................................................................................... - sub v21.4S, v17.4S, v8.4S // ...............................................................................*................................................................................................................ - add v13.4S, v18.4S, v13.4S // ............................................................*................................................................................................................................... - sqrdmulh v18.4S, v23.4S, v6.S[1] // ..................................................................................................................................................................*............................. - sqrdmulh v16.4S, v28.4S, v5.S[3] // .............................................................................................................................................................*.................................. - mul v30.4S, v28.4S, v5.S[2] // ............................................................................................................................................................*................................... 
- mul v23.4S, v23.4S, v6.S[0] // .................................................................................................................................................................*.............................. - sqrdmulh v24.4S, v31.4S, v6.S[3] // .......................................................................................................................................................................*........................ - mul v28.4S, v12.4S, v5.S[0] // .......................................................................................................................................................*........................................ - mla v14.4S, v10.4S, v29.4S // ......................................................................................................................*......................................................................... - sub v10.4S, v11.4S, v15.4S // .............................................................................................................*.................................................................................. - ldr q31, [x0, #528] // ........e....................................................................................................................................................................................... - add v11.4S, v11.4S, v15.4S // ..............................................................................................................*................................................................................. - add v15.4S, v17.4S, v8.4S // ................................................................................*............................................................................................................... 
- mul v17.4S, v27.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v8.4S, v25.4S, v1.S[3] // .................................................................................................*.............................................................................................. - mla v30.4S, v16.4S, v29.4S // ..............................................................................................................................................................*................................. - mul v16.4S, v25.4S, v1.S[2] // ................................................................................................*............................................................................................... - sqrdmulh v25.4S, v12.4S, v5.S[1] // ........................................................................................................................................................*....................................... - sqrdmulh v27.4S, v27.4S, v4.S[3] // ...................................................................................................................................................*............................................ - mla v9.4S, v24.4S, v29.4S // ........................................................................................................................................................................*....................... - add v24.4S, v15.4S, v14.4S // ........................................................................................................................*....................................................................... 
- sub v12.4S, v22.4S, v19.4S // ........................................................................................................*....................................................................................... - add v22.4S, v22.4S, v19.4S // .........................................................................................................*...................................................................................... - sub v15.4S, v15.4S, v14.4S // .......................................................................................................................*........................................................................ - mla v23.4S, v18.4S, v29.4S // ...................................................................................................................................................................*............................ - add v14.4S, v21.4S, v20.4S // ..................................................................................................................................*............................................................. - mla v16.4S, v8.4S, v29.4S // ..................................................................................................*............................................................................................. - sub v19.4S, v24.4S, v30.4S // ...............................................................................................................................................................*................................ - add v30.4S, v24.4S, v30.4S // ................................................................................................................................................................*............................... 
- mla v28.4S, v25.4S, v29.4S // .........................................................................................................................................................*...................................... - add v18.4S, v14.4S, v9.4S // ..........................................................................................................................................................................*..................... - sub v9.4S, v14.4S, v9.4S // .........................................................................................................................................................................*...................... - sqrdmulh v24.4S, v22.4S, v3.S[3] // .........................................................................................................................................*...................................................... - ldr q8, [x0, #976] // ...............e................................................................................................................................................................................ - str q18, [x0, #768] // ............................................................................................................................................................................................*... - str q19, [x0, #576] // .........................................................................................................................................................................................*...... - sqrdmulh v19.4S, v8.4S, v0.S[1] // ....................................................e........................................................................................................................................... 
- add v25.4S, v10.4S, v28.4S // ...........................................................................................................................................................*.................................... - mla v17.4S, v27.4S, v29.4S // ....................................................................................................................................................*........................................... - ldr q27, [x0, #720] // ...........e.................................................................................................................................................................................... - mul v18.4S, v8.4S, v0.S[0] // ...................................................e............................................................................................................................................ - ldr q8, [x0, #784] // ............e................................................................................................................................................................................... - str q9, [x0, #832] // .............................................................................................................................................................................................*.. - ldr q9, [x0, #848] // .............e.................................................................................................................................................................................. - str q30, [x0, #512] // ........................................................................................................................................................................................*....... - mul v30.4S, v22.4S, v3.S[2] // ........................................................................................................................................*....................................................... 
- sub v22.4S, v13.4S, v16.4S // ...................................................................................................*............................................................................................ - add v14.4S, v13.4S, v16.4S // ....................................................................................................*........................................................................................... - sub v16.4S, v10.4S, v28.4S // ..........................................................................................................................................................*..................................... - sqrdmulh v10.4S, v12.4S, v4.S[1] // ..............................................................................................................................................*................................................. - sub v28.4S, v15.4S, v23.4S // ....................................................................................................................................................................*........................... - add v23.4S, v15.4S, v23.4S // .....................................................................................................................................................................*.......................... - mul v15.4S, v12.4S, v4.S[0] // .............................................................................................................................................*.................................................. - str q16, [x0, #448] // .......................................................................................................................................................................................*........ 
- str q23, [x0, #640] // ..........................................................................................................................................................................................*..... - sqrdmulh v12.4S, v26.4S, v7.S[1] // ............................................................................................................................................................................*................... - mul v13.4S, v26.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - ldr q26, [x0, #592] // .........e...................................................................................................................................................................................... - mul v23.4S, v9.4S, v0.S[0] // .........................................e...................................................................................................................................................... - sub v21.4S, v21.4S, v20.4S // .................................................................................................................................*.............................................................. - sqrdmulh v20.4S, v27.4S, v0.S[1] // ................................e............................................................................................................................................................... - str q25, [x0, #384] // ......................................................................................................................................................................................*......... 
- mla v15.4S, v10.4S, v29.4S // ...............................................................................................................................................*................................................ - ldr q25, [x0, #464] // .......e........................................................................................................................................................................................ - sqrdmulh v9.4S, v9.4S, v0.S[1] // ..........................................e..................................................................................................................................................... - mla v30.4S, v24.4S, v29.4S // ..........................................................................................................................................*..................................................... - mla v18.4S, v19.4S, v29.4S // .....................................................e.......................................................................................................................................... - sub v16.4S, v11.4S, v17.4S // .....................................................................................................................................................*.......................................... - str q28, [x0, #704] // ...........................................................................................................................................................................................*.... - ldr q10, [x0, #656] // ..........e..................................................................................................................................................................................... 
- mla v13.4S, v12.4S, v29.4S // .............................................................................................................................................................................*.................. - sqrdmulh v12.4S, v8.4S, v0.S[1] // .....................................e.......................................................................................................................................................... - add v19.4S, v11.4S, v17.4S // ......................................................................................................................................................*......................................... - add v24.4S, v22.4S, v15.4S // .................................................................................................................................................*.............................................. - ldr q28, [x0, #336] // .....e.......................................................................................................................................................................................... - str q16, [x0, #320] // .....................................................................................................................................................................................*.......... - mul v16.4S, v10.4S, v0.S[0] // ..........................e..................................................................................................................................................................... - sqrdmulh v17.4S, v10.4S, v0.S[1] // ...........................e.................................................................................................................................................................... 
- ldr q10, [x0, #912] // ..............e................................................................................................................................................................................. - mla v23.4S, v9.4S, v29.4S // ...........................................e.................................................................................................................................................... - sub v9.4S, v25.4S, v18.4S // ......................................................e......................................................................................................................................... - mul v11.4S, v27.4S, v0.S[0] // ...............................e................................................................................................................................................................ - sub v27.4S, v21.4S, v13.4S // ..............................................................................................................................................................................*................. - str q24, [x0, #128] // ..................................................................................................................................................................................*............. - add v24.4S, v21.4S, v13.4S // ...............................................................................................................................................................................*................ - sub v21.4S, v22.4S, v15.4S // ................................................................................................................................................*............................................... 
- mul v13.4S, v8.4S, v0.S[0] // ....................................e........................................................................................................................................................... - add v8.4S, v25.4S, v18.4S // .......................................................e........................................................................................................................................ - str q19, [x0, #256] // ....................................................................................................................................................................................*........... - mla v16.4S, v17.4S, v29.4S // ............................e................................................................................................................................................................... - mul v25.4S, v9.4S, v1.S[0] // ...........................................................................................e.................................................................................................... - str q27, [x0, #960] // ...............................................................................................................................................................................................* - sqrdmulh v17.4S, v9.4S, v1.S[1] // ............................................................................................e................................................................................................... - sqrdmulh v22.4S, v31.4S, v0.S[1] // .................e.............................................................................................................................................................................. 
- ldr q15, [x0, #208] // ...e............................................................................................................................................................................................ - sub v27.4S, v28.4S, v23.4S // ............................................e................................................................................................................................................... - str q21, [x0, #192] // ...................................................................................................................................................................................*............ - sqrdmulh v18.4S, v10.4S, v0.S[1] // ...............................................e................................................................................................................................................ - mul v21.4S, v10.4S, v0.S[0] // ..............................................e................................................................................................................................................. - mla v11.4S, v20.4S, v29.4S // .................................e.............................................................................................................................................................. - mul v10.4S, v8.4S, v0.S[2] // .......................................................................e........................................................................................................................ - add v19.4S, v28.4S, v23.4S // .............................................e.................................................................................................................................................. 
- sqrdmulh v20.4S, v8.4S, v0.S[3] // ........................................................................e....................................................................................................................... - mla v13.4S, v12.4S, v29.4S // ......................................e......................................................................................................................................................... - mla v25.4S, v17.4S, v29.4S // .............................................................................................e.................................................................................................. - ldr q12, [x0, #400] // ......e......................................................................................................................................................................................... - mul v28.4S, v27.4S, v1.S[0] // .................................................................................e.............................................................................................................. - str q24, [x0, #896] // ..............................................................................................................................................................................................*. - mul v24.4S, v31.4S, v0.S[0] // ................e............................................................................................................................................................................... - sqrdmulh v9.4S, v27.4S, v1.S[1] // ..................................................................................e............................................................................................................. 
- mul v31.4S, v19.4S, v0.S[2] // .............................................................e.................................................................................................................................. - sub v23.4S, v15.4S, v11.4S // ..................................e............................................................................................................................................................. - sqrdmulh v19.4S, v19.4S, v0.S[3] // ..............................................................e................................................................................................................................. - mla v21.4S, v18.4S, v29.4S // ................................................e............................................................................................................................................... - ldr q27, [x0, #272] // ....e........................................................................................................................................................................................... - mla v10.4S, v20.4S, v29.4S // .........................................................................e...................................................................................................................... - ldr q8, [x0, #144] // ..e............................................................................................................................................................................................. - add v20.4S, v14.4S, v30.4S // ............................................................................................................................................*................................................... 
- sub v18.4S, v14.4S, v30.4S // ...........................................................................................................................................*.................................................... - add v17.4S, v15.4S, v11.4S // ...................................e............................................................................................................................................................ - sub v14.4S, v23.4S, v25.4S // ..............................................................................................e................................................................................................. - mla v28.4S, v9.4S, v29.4S // ...................................................................................e............................................................................................................ - add v11.4S, v23.4S, v25.4S // ...............................................................................................e................................................................................................ - mul v15.4S, v26.4S, v0.S[0] // .....................e.......................................................................................................................................................................... - mla v31.4S, v19.4S, v29.4S // ...............................................................e................................................................................................................................ - str q20, [x0], #(16) // ................................................................................................................................................................................*............... 
- str q18, [x0, #48] // .................................................................................................................................................................................*.............. - sqrdmulh v19.4S, v26.4S, v0.S[1] // ......................e......................................................................................................................................................................... - sub v20.4S, v12.4S, v21.4S // .................................................e.............................................................................................................................................. - add v9.4S, v12.4S, v21.4S // ..................................................e............................................................................................................................................. - add v26.4S, v17.4S, v10.4S // ...........................................................................e.................................................................................................................... - sub v25.4S, v17.4S, v10.4S // ..........................................................................e..................................................................................................................... - ldr q10, [x0, #64] // .e.............................................................................................................................................................................................. - sub v21.4S, v27.4S, v13.4S // .......................................e........................................................................................................................................................ 
- sqrdmulh v30.4S, v14.4S, v3.S[1] // ....................................................................................................................................e........................................................... - add v18.4S, v8.4S, v16.4S // ..............................e................................................................................................................................................................. - sub v23.4S, v8.4S, v16.4S // .............................e.................................................................................................................................................................. - mul v12.4S, v20.4S, v1.S[0] // ......................................................................................e......................................................................................................... - sqrdmulh v8.4S, v20.4S, v1.S[1] // .......................................................................................e........................................................................................................ - sqrdmulh v16.4S, v9.4S, v0.S[3] // ...................................................................e............................................................................................................................ - mul v20.4S, v9.4S, v0.S[2] // ..................................................................e............................................................................................................................. - mla v15.4S, v19.4S, v29.4S // .......................e........................................................................................................................................................................ 
- mul v19.4S, v26.4S, v1.S[2] // .....................................................................................................e.......................................................................................... - sqrdmulh v26.4S, v26.4S, v1.S[3] // ......................................................................................................e......................................................................................... - mul v17.4S, v14.4S, v3.S[0] // ...................................................................................................................................e............................................................ - mul v14.4S, v25.4S, v2.S[0] // ...............................................................................................................e................................................................................ - mul v9.4S, v11.4S, v2.S[2] // .........................................................................................................................e...................................................................... - add v13.4S, v27.4S, v13.4S // ........................................e....................................................................................................................................................... - mla v12.4S, v8.4S, v29.4S // ........................................................................................e....................................................................................................... - sqrdmulh v27.4S, v11.4S, v2.S[3] // ..........................................................................................................................e..................................................................... 
- sqrdmulh v25.4S, v25.4S, v2.S[1] // ................................................................................................................e............................................................................... - mul v8.4S, v21.4S, v1.S[0] // ............................................................................e................................................................................................................... - mla v20.4S, v16.4S, v29.4S // ....................................................................e........................................................................................................................... - sub v16.4S, v10.4S, v15.4S // ........................e....................................................................................................................................................................... - add v15.4S, v10.4S, v15.4S // .........................e...................................................................................................................................................................... - mla v19.4S, v26.4S, v29.4S // .......................................................................................................e........................................................................................ - sqrdmulh v26.4S, v13.4S, v0.S[3] // .........................................................e...................................................................................................................................... - sqrdmulh v10.4S, v21.4S, v1.S[1] // .............................................................................e.................................................................................................................. 
- mul v13.4S, v13.4S, v0.S[2] // ........................................................e....................................................................................................................................... - mla v24.4S, v22.4S, v29.4S // ..................e............................................................................................................................................................................. - add v22.4S, v15.4S, v31.4S // .................................................................e.............................................................................................................................. - mla v14.4S, v25.4S, v29.4S // .................................................................................................................e.............................................................................. - sub v21.4S, v23.4S, v12.4S // .........................................................................................e...................................................................................................... - sub v15.4S, v15.4S, v31.4S // ................................................................e............................................................................................................................... - mla v17.4S, v30.4S, v29.4S // .....................................................................................................................................e.......................................................... - ldr q11, [x0, #0] // e............................................................................................................................................................................................... 
- sub v30.4S, v16.4S, v28.4S // ....................................................................................e........................................................................................................... - add v25.4S, v18.4S, v20.4S // ......................................................................e......................................................................................................................... + // Instructions: 192 + // Expected cycles: 24 + // Expected IPC: 8.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + sqrdmulh v23.4S, v23.4S, v3.S[1] // .......*........................................................................................................................................................................................ + sqrdmulh v31.4S, v25.4S, v1.S[3] // ....*........................................................................................................................................................................................... + mul v15.4S, v25.4S, v1.S[2] // ..*............................................................................................................................................................................................. + sqrdmulh v25.4S, v16.4S, v2.S[1] // ........*....................................................................................................................................................................................... 
+ mla v18.4S, v19.4S, v29.4S // .*.............................................................................................................................................................................................. + mul v19.4S, v16.4S, v2.S[0] // ........................*....................................................................................................................................................................... + sqrdmulh v16.4S, v30.4S, v1.S[3] // .............................*.................................................................................................................................................................. + mul v30.4S, v30.4S, v1.S[2] // .........*...................................................................................................................................................................................... + mla v22.4S, v13.4S, v29.4S // ...*............................................................................................................................................................................................ + add v8.4S, v11.4S, v8.4S // .....*.......................................................................................................................................................................................... + mul v11.4S, v12.4S, v2.S[0] // .....................*.......................................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v2.S[1] // ...................*............................................................................................................................................................................ 
+ mla v26.4S, v20.4S, v29.4S // ..........*..................................................................................................................................................................................... + mla v15.4S, v31.4S, v29.4S // ..............*................................................................................................................................................................................. + mla v14.4S, v23.4S, v29.4S // ..................*............................................................................................................................................................................. + add v13.4S, v21.4S, v24.4S // ......*......................................................................................................................................................................................... + add v23.4S, v17.4S, v18.4S // ...............*................................................................................................................................................................................ + sub v21.4S, v21.4S, v24.4S // .......................*........................................................................................................................................................................ + mla v30.4S, v16.4S, v29.4S // ........................................*....................................................................................................................................................... + sub v24.4S, v17.4S, v18.4S // ................*............................................................................................................................................................................... 
+ sub v18.4S, v21.4S, v14.4S // ................................*............................................................................................................................................................... + add v31.4S, v8.4S, v30.4S // ..................................................*............................................................................................................................................. + mla v11.4S, v12.4S, v29.4S // ..............................*................................................................................................................................................................. + sub v12.4S, v8.4S, v30.4S // ...................................................................*............................................................................................................................ + mla v19.4S, v25.4S, v29.4S // .....................................*.......................................................................................................................................................... + sub v8.4S, v28.4S, v10.4S // .................*.............................................................................................................................................................................. + sub v25.4S, v23.4S, v15.4S // ..........................*..................................................................................................................................................................... + add v16.4S, v21.4S, v14.4S // .................................*.............................................................................................................................................................. 
+ add v23.4S, v23.4S, v15.4S // ...........................*.................................................................................................................................................................... + add v28.4S, v28.4S, v10.4S // *............................................................................................................................................................................................... + add v30.4S, v13.4S, v22.4S // ............*................................................................................................................................................................................... + sub v14.4S, v27.4S, v11.4S // ......................................................................*......................................................................................................................... + mul v20.4S, v25.4S, v4.S[0] // ..........................................*..................................................................................................................................................... + add v27.4S, v27.4S, v11.4S // ................................................................................*............................................................................................................... + sqrdmulh v10.4S, v25.4S, v4.S[1] // ................................................*............................................................................................................................................... + add v17.4S, v24.4S, v19.4S // .................................................*.............................................................................................................................................. 
+ sub v21.4S, v8.4S, v26.4S // ...................................*............................................................................................................................................................ + sqrdmulh v25.4S, v23.4S, v3.S[3] // ....................................*........................................................................................................................................................... + mul v11.4S, v17.4S, v4.S[2] // ......................................................*......................................................................................................................................... + sqrdmulh v15.4S, v17.4S, v4.S[3] // ............................................................*................................................................................................................................... + sub v22.4S, v13.4S, v22.4S // ...........*.................................................................................................................................................................................... + ldr q17, [x0, #848] // ......................................................................................................*......................................................................................... + sub v13.4S, v24.4S, v19.4S // ...............................................*................................................................................................................................................ + mla v20.4S, v10.4S, v29.4S // .............................................................*.................................................................................................................................. 
+ add v10.4S, v28.4S, v9.4S // ..................................................................*............................................................................................................................. + sub v28.4S, v28.4S, v9.4S // .............*.................................................................................................................................................................................. + sqrdmulh v19.4S, v30.4S, v5.S[3] // ......................*......................................................................................................................................................................... + mul v9.4S, v16.4S, v6.S[2] // ...........................................*.................................................................................................................................................... + mul v24.4S, v30.4S, v5.S[2] // ...............................*................................................................................................................................................................ + sqrdmulh v30.4S, v16.4S, v6.S[3] // .............................................*.................................................................................................................................................. + sqrdmulh v16.4S, v22.4S, v6.S[1] // .........................*...................................................................................................................................................................... + mul v23.4S, v23.4S, v3.S[2] // ..................................*............................................................................................................................................................. 
+ mla v11.4S, v15.4S, v29.4S // ..........................................................................*..................................................................................................................... + add v15.4S, v8.4S, v26.4S // ............................*................................................................................................................................................................... + mul v22.4S, v22.4S, v6.S[0] // ....................*........................................................................................................................................................................... + mul v26.4S, v17.4S, v0.S[0] // ..........................................................................................................................*..................................................................... + sqrdmulh v8.4S, v17.4S, v0.S[1] // ........................................................................................................................*....................................................................... + sub v17.4S, v12.4S, v20.4S // ...............................................................................*................................................................................................................ + str q17, [x0, #192] // ..........................................................................................*..................................................................................................... + mla v23.4S, v25.4S, v29.4S // ..............................................*................................................................................................................................................. 
+ mla v9.4S, v30.4S, v29.4S // .........................................................*...................................................................................................................................... + mla v22.4S, v16.4S, v29.4S // .........................................*...................................................................................................................................................... + add v16.4S, v12.4S, v20.4S // ..............................................................................*................................................................................................................. + ldr q12, [x0, #912] // .....................................................................................................*.......................................................................................... + mla v24.4S, v19.4S, v29.4S // ............................................*................................................................................................................................................... + sqrdmulh v30.4S, v13.4S, v5.S[1] // ........................................................*....................................................................................................................................... + mul v19.4S, v13.4S, v5.S[0] // .......................................................*........................................................................................................................................ + add v13.4S, v27.4S, v11.4S // ........................................................................................*....................................................................................................... 
+ mla v26.4S, v8.4S, v29.4S // ...................................................................................................................................*............................................................ + str q16, [x0, #128] // ......................................................................................*......................................................................................................... + sqrdmulh v25.4S, v18.4S, v7.S[1] // ......................................*......................................................................................................................................................... + mul v20.4S, v18.4S, v7.S[0] // .......................................*........................................................................................................................................................ + ldr q16, [x0, #592] // .......................................................................................................*........................................................................................ + sub v17.4S, v27.4S, v11.4S // .........................................................................................*...................................................................................................... + ldr q27, [x0, #784] // ...................................................................................................*............................................................................................ + ldr q11, [x0, #528] // ....................................................................................................*........................................................................................... + str q13, [x0, #256] // ..............................................................................................*................................................................................................. 
+ add v13.4S, v15.4S, v9.4S // .........................................................................*...................................................................................................................... + str q17, [x0, #320] // ...............................................................................................*................................................................................................ + sqrdmulh v18.4S, v12.4S, v0.S[1] // ....................................................................................................................*........................................................................... + sub v9.4S, v15.4S, v9.4S // .....................................................................*.......................................................................................................................... + mul v15.4S, v12.4S, v0.S[0] // .................................................................................................................*.............................................................................. + ldr q8, [x0, #656] // ........................................................................................................*....................................................................................... + mla v20.4S, v25.4S, v29.4S // ....................................................*........................................................................................................................................... + sub v12.4S, v28.4S, v22.4S // .....................................................*.......................................................................................................................................... 
+ add v25.4S, v10.4S, v24.4S // ..................................................................................*............................................................................................................. + sub v24.4S, v10.4S, v24.4S // ............................................................................*................................................................................................................... + ldr q10, [x0, #400] // ...................................................................................................................*............................................................................ + add v17.4S, v28.4S, v22.4S // ...................................................*............................................................................................................................................ + str q13, [x0, #768] // ...................................................................................*............................................................................................................ + sqrdmulh v22.4S, v11.4S, v0.S[1] // ..................................................................................................................*............................................................................. + mla v19.4S, v30.4S, v29.4S // ........................................................................*....................................................................................................................... + mul v28.4S, v11.4S, v0.S[0] // .......................................................................................................................................................*........................................ 
+ str q24, [x0, #576] // ....................................................................................*........................................................................................................... + add v24.4S, v31.4S, v23.4S // ...........................................................*.................................................................................................................................... + str q12, [x0, #704] // .................................................................*.............................................................................................................................. + ldr q13, [x0, #720] // .................................................................................................*.............................................................................................. + ldr q30, [x0, #16] // ..............................................................................................................................................................*................................. + str q25, [x0, #512] // ...........................................................................................*.................................................................................................... + ldr q25, [x0, #976] // ..................................................................................................*............................................................................................. + mul v12.4S, v8.4S, v0.S[0] // ......................................................................................................................*......................................................................... + sqrdmulh v11.4S, v8.4S, v0.S[1] // ...........................................................................................................................*.................................................................... 
+ mul v8.4S, v27.4S, v0.S[0] // ................................................................................................................*............................................................................... + mla v15.4S, v18.4S, v29.4S // .............................................................................................................................*.................................................................. + add v18.4S, v21.4S, v20.4S // ...............................................................*................................................................................................................................ + mla v28.4S, v22.4S, v29.4S // .................................................................................................................................................................*.............................. + sub v23.4S, v31.4S, v23.4S // ..........................................................*..................................................................................................................................... + ldr q31, [x0, #144] // ........................................................................................................................................*....................................................... + ldr q22, [x0, #272] // ................................................................................................*............................................................................................... + str q17, [x0, #640] // ..............................................................*................................................................................................................................. + str q24, [x0], #(16) // .......................................................................*........................................................................................................................ 
+ sub v21.4S, v21.4S, v20.4S // ................................................................*............................................................................................................................... + ldr q17, [x0, #64] // ...................................................................................................................................................*............................................ + str q9, [x0, #816] // .................................................................................*.............................................................................................................. + ldr q9, [x0, #192] // ..............................................................................................................*................................................................................. + sqrdmulh v24.4S, v27.4S, v0.S[1] // ...............................................................................................................*................................................................................ + mla v12.4S, v11.4S, v29.4S // .....................................................................................................................................*.......................................................... + sub v27.4S, v10.4S, v15.4S // .........................................................................................................................................*...................................................... + str q18, [x0, #880] // ...........................................................................*.................................................................................................................... + add v18.4S, v10.4S, v15.4S // ..........................................................................................................................................*..................................................... 
+ add v11.4S, v30.4S, v28.4S // ..........................................................................................................................................................................*..................... + sub v28.4S, v30.4S, v28.4S // ...........................................................................................................................................................................*.................... + sqrdmulh v10.4S, v25.4S, v0.S[1] // ..........................................................................................................*..................................................................................... + mul v20.4S, v25.4S, v0.S[0] // ...........................................................................................................*.................................................................................... + str q21, [x0, #944] // .............................................................................*.................................................................................................................. + str q23, [x0, #48] // ....................................................................*........................................................................................................................... + mul v23.4S, v27.4S, v1.S[0] // ................................................................................................................................................*............................................... + sqrdmulh v21.4S, v13.4S, v0.S[1] // .............................................................................................................*.................................................................................. 
+ mul v25.4S, v13.4S, v0.S[0] // ............................................................................................................*................................................................................... + mul v13.4S, v16.4S, v0.S[0] // ....................................................................................................................................*........................................................... + add v30.4S, v31.4S, v12.4S // ....................................................................................................................................................................*........................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // ............................................................................................................................*................................................................... + sub v15.4S, v31.4S, v12.4S // ................................................................................................................................................................*............................... + mla v8.4S, v24.4S, v29.4S // .......................................................................................................................*........................................................................ + ldr q12, [x0, #320] // .................................................................................................................................*.............................................................. + ldr q24, [x0, #448] // .........................................................................................................*...................................................................................... 
+ mla v20.4S, v10.4S, v29.4S // .....................................................................................................................*.......................................................................... + mul v31.4S, v18.4S, v0.S[2] // .............................................................................................................................................................*.................................. + sqrdmulh v18.4S, v18.4S, v0.S[3] // ............................................................................................................................................................*................................... + sub v10.4S, v14.4S, v19.4S // .....................................................................................*.......................................................................................................... + mla v25.4S, v21.4S, v29.4S // .........................................................................................................................*...................................................................... + sqrdmulh v21.4S, v27.4S, v1.S[1] // .................................................................................................................................................*.............................................. + add v27.4S, v14.4S, v19.4S // .......................................................................................*........................................................................................................ + str q10, [x0, #432] // ............................................................................................*................................................................................................... 
+ mla v13.4S, v16.4S, v29.4S // ........................................................................................................................................................*....................................... + sub v19.4S, v22.4S, v8.4S // ................................................................................................................................*............................................................... + add v22.4S, v22.4S, v8.4S // ..................................................................................................................................*............................................................. + add v10.4S, v24.4S, v20.4S // ...............................................................................................................................*................................................................ + sub v14.4S, v24.4S, v20.4S // ..............................................................................................................................*................................................................. + mla v31.4S, v18.4S, v29.4S // .......................................................................................................................................................................*........................ + sub v16.4S, v12.4S, v26.4S // .............................................................................................................................................*.................................................. + str q27, [x0, #368] // .............................................................................................*.................................................................................................. 
+ mul v8.4S, v22.4S, v0.S[2] // ..................................................................................................................................................*............................................. + sqrdmulh v18.4S, v22.4S, v0.S[3] // ...............................................................................................................................................................*................................ + mla v23.4S, v21.4S, v29.4S // ...........................................................................................................................................................*.................................... + add v24.4S, v12.4S, v26.4S // ...............................................................................................................................................*................................................ + sqrdmulh v26.4S, v14.4S, v1.S[1] // ......................................................................................................................................*......................................................... + mul v27.4S, v14.4S, v1.S[0] // .......................................................................................................................................*........................................................ + sqrdmulh v14.4S, v10.4S, v0.S[3] // ............................................................................................................................................*................................................... + mul v22.4S, v10.4S, v0.S[2] // ..............................................................................................................................................*................................................. 
+ sub v12.4S, v30.4S, v31.4S // ..................................................................................................................................................................................*............. + sub v21.4S, v17.4S, v13.4S // ..............................................................................................................................................................................................*. + add v30.4S, v30.4S, v31.4S // .........................................................................................................................................................................................*...... + mul v10.4S, v19.4S, v1.S[0] // .........................................................................................................................................................*...................................... + mla v8.4S, v18.4S, v29.4S // ..............................................................................................................................................................................*................. + sub v20.4S, v15.4S, v23.4S // ........................................................................................................................................................................*....................... + add v18.4S, v15.4S, v23.4S // ......................................................................................................................................................................*......................... + sub v23.4S, v9.4S, v25.4S // ...........................................................................................................................................*.................................................... 
+ mla v27.4S, v26.4S, v29.4S // ....................................................................................................................................................*........................................... + add v25.4S, v9.4S, v25.4S // ..........................................................................................................................................................*..................................... + sqrdmulh v31.4S, v16.4S, v1.S[1] // .....................................................................................................................................................*.......................................... + mla v22.4S, v14.4S, v29.4S // ..................................................................................................................................................................*............................. + sqrdmulh v14.4S, v18.4S, v2.S[3] // ...............................................................................................................................................................................*................ + sqrdmulh v15.4S, v19.4S, v1.S[1] // .........................................................................................................................................................................*...................... + mul v26.4S, v20.4S, v3.S[0] // ..........................................................................................................................................................................................*..... + mul v9.4S, v18.4S, v2.S[2] // ................................................................................................................................................................................*............... 
+ mul v18.4S, v24.4S, v0.S[2] // .......................................................................................................................................................................................*........ + sqrdmulh v19.4S, v24.4S, v0.S[3] // .................................................................................................................................................................................*.............. + mul v24.4S, v16.4S, v1.S[0] // ......................................................................................................................................................*......................................... + sqrdmulh v20.4S, v20.4S, v3.S[1] // ...........................................................................................................................................................................................*.... + sub v16.4S, v25.4S, v22.4S // ....................................................................................................................................................................................*........... + add v25.4S, v25.4S, v22.4S // ...................................................................................................................................................................................*............ + add v22.4S, v23.4S, v27.4S // ...................................................................................................................................................................*............................ + sub v23.4S, v23.4S, v27.4S // ............................................................................................................................................................................*................... 
+ mla v10.4S, v15.4S, v29.4S // ......................................................................................................................................................................................*......... + mla v9.4S, v14.4S, v29.4S // .............................................................................................................................................................................................*.. + sub v27.4S, v11.4S, v8.4S // ............................................................................................................................................................................................*... + add v17.4S, v17.4S, v13.4S // .....................................................................................................................................................................*.......................... + sqrdmulh v13.4S, v22.4S, v2.S[3] // ........................................................................................................................................................................................*....... + mul v22.4S, v22.4S, v2.S[2] // .....................................................................................................................................................................................*.......... + mla v24.4S, v31.4S, v29.4S // .............................................................................................................................................................................*.................. 
+ mul v14.4S, v23.4S, v3.S[0] // ...............................................................................................................................................................................................* - // original source code - // ldr q8, [x0, #0] // ...........................................................................................................................................................e..|...................................................................................................................................................... - // ldr q9, [x0, #(1*(512/8))] // ..........................................................................................................................e...................................|...................................................................................................................................................... - // ldr q10, [x0, #(2*(512/8))] // ..........................................................................................................e...................................................|...........................................................................................................................................e.......... - // ldr q11, [x0, #(3*(512/8))] // ....................................................................................e.........................................................................|.....................................................................................................................e................................ - // ldr q12, [x0, #(4*(512/8))] // ........................................................................................................e.....................................................|.........................................................................................................................................e............ 
- // ldr q13, [x0, #(5*(512/8))] // ................................................................e.............................................................................................|.................................................................................................e.................................................... - // ldr q14, [x0, #(6*(512/8))] // ...............................................................................................e..............................................................|................................................................................................................................e..................... - // ldr q15, [x0, #(7*(512/8))] // .....................................................e........................................................................................................|......................................................................................e............................................................... - // ldr q16, [x0, #(8*(512/8))] // e.............................................................................................................................................................|.................................e.................................................................................................................... - // ldr q17, [x0, #(9*(512/8))] // ...............................................e..............................................................................................................|................................................................................e..................................................................... 
- // ldr q18, [x0, #(10*(512/8))] // ...........................................................e..................................................................................................|............................................................................................e......................................................... - // ldr q19, [x0, #(11*(512/8))] // .............................e................................................................................................................................|..............................................................e....................................................................................... - // ldr q20, [x0, #(12*(512/8))] // ...............................e..............................................................................................................................|................................................................e..................................................................................... - // ldr q21, [x0, #(13*(512/8))] // .................................e............................................................................................................................|..................................................................e................................................................................... - // ldr q22, [x0, #(14*(512/8))] // ....................................................................e.........................................................................................|.....................................................................................................e................................................ 
- // ldr q23, [x0, #(15*(512/8))] // .......................e......................................................................................................................................|........................................................e............................................................................................. - // mul v24.4s, v16.4s, v0.s[0] // ..................................................................................................e...........................................................|...................................................................................................................................e.................. - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ...................................................................................e..........................................................................|....................................................................................................................e................................. - // mla v24.4s, v16.4s, v29.4s // .....................................................................................................................................................e........|...................................................................................................................................................... - // sub v16.4s, v8.4s, v24.4s // ..............................................................................................................................................................|..............*....................................................................................................................................... 
- // add v8.4s, v8.4s, v24.4s // ..............................................................................................................................................................|...............*...................................................................................................................................... - // mul v24.4s, v17.4s, v0.s[0] // .................................................................................................................e............................................|..................................................................................................................................................e... - // sqrdmulh v17.4s, v17.4s, v0.s[1] // .....................................................................................................................e........................................|...................................................................................................................................................... - // mla v24.4s, v17.4s, v29.4s // ...................................................................................................................................e..........................|...................................................................................................................................................... - // sub v17.4s, v9.4s, v24.4s // ...............................................................................................................................................e..............|...................................................................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ................................................................................................................................................e.............|...................................................................................................................................................... - // mul v24.4s, v18.4s, v0.s[0] // ..................................................................e...........................................................................................|...................................................................................................e.................................................. - // sqrdmulh v18.4s, v18.4s, v0.s[1] // ...................................................................e..........................................................................................|....................................................................................................e................................................. - // mla v24.4s, v18.4s, v29.4s // ...............................................................................e..............................................................................|................................................................................................................e..................................... - // sub v18.4s, v10.4s, v24.4s // ..............................................................................................................................e...............................|...................................................................................................................................................... 
- // add v10.4s, v10.4s, v24.4s // .............................................................................................................................e................................|...................................................................................................................................................... - // mul v24.4s, v19.4s, v0.s[0] // .......................................................................e......................................................................................|........................................................................................................e............................................. - // sqrdmulh v19.4s, v19.4s, v0.s[1] // ..................................................e...........................................................................................................|...................................................................................e.................................................................. - // mla v24.4s, v19.4s, v29.4s // .........................................................................................e....................................................................|..........................................................................................................................e........................... - // sub v19.4s, v11.4s, v24.4s // .....................................................................................................e........................................................|......................................................................................................................................e............... 
- // add v11.4s, v11.4s, v24.4s // .............................................................................................................e................................................|..............................................................................................................................................e....... - // mul v24.4s, v20.4s, v0.s[0] // ............................................................................e.................................................................................|.............................................................................................................e........................................ - // sqrdmulh v20.4s, v20.4s, v0.s[1] // .............................................................e................................................................................................|..............................................................................................e....................................................... - // mla v24.4s, v20.4s, v29.4s // .............................................................................................e................................................................|..............................................................................................................................e....................... - // sub v20.4s, v12.4s, v24.4s // ...........................................................................................................................e..................................|...................................................................................................................................................... 
- // add v12.4s, v12.4s, v24.4s // .........................................................................................................................................e....................|...................................................................................................................................................... - // mul v24.4s, v21.4s, v0.s[0] // ................................................e.............................................................................................................|.................................................................................e.................................................................... - // sqrdmulh v21.4s, v21.4s, v0.s[1] // ......................................................e.......................................................................................................|.......................................................................................e.............................................................. - // mla v24.4s, v21.4s, v29.4s // .....................................................................e........................................................................................|......................................................................................................e............................................... - // sub v21.4s, v13.4s, v24.4s // .....................................................................................e........................................................................|......................................................................................................................e............................... 
- // add v13.4s, v13.4s, v24.4s // ...........................................................................................e..................................................................|............................................................................................................................e......................... - // mul v24.4s, v22.4s, v0.s[0] // ........................................................................................e.....................................................................|.........................................................................................................................e............................ - // sqrdmulh v22.4s, v22.4s, v0.s[1] // .......................................................................................e......................................................................|........................................................................................................................e............................. - // mla v24.4s, v22.4s, v29.4s // .......................................................................................................e......................................................|........................................................................................................................................e............. - // sub v22.4s, v14.4s, v24.4s // ......................................................................................................................e.......................................|...................................................................................................................................................... 
- // add v14.4s, v14.4s, v24.4s // .......................................................................................................................e......................................|...................................................................................................................................................... - // mul v24.4s, v23.4s, v0.s[0] // ..............................e...............................................................................................................................|...............................................................e...................................................................................... - // sqrdmulh v23.4s, v23.4s, v0.s[1] // ..........................e...................................................................................................................................|...........................................................e.......................................................................................... - // mla v24.4s, v23.4s, v29.4s // ........................................................e.....................................................................................................|.........................................................................................e............................................................ - // sub v23.4s, v15.4s, v24.4s // ......................................................................e.......................................................................................|.......................................................................................................e.............................................. 
- // add v15.4s, v15.4s, v24.4s // .............................................................................e................................................................................|..............................................................................................................e....................................... - // mul v24.4s, v12.4s, v0.s[2] // ....................................................................................................................................................e.........|...................................................................................................................................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // ..................................................................................................................................................e...........|...................................................................................................................................................... - // mla v24.4s, v12.4s, v29.4s // ..............................................................................................................................................................*...................................................................................................................................................... - // sub v12.4s, v8.4s, v24.4s // ..............................................................................................................................................................|......................*............................................................................................................................... 
- // add v8.4s, v8.4s, v24.4s // ..............................................................................................................................................................|........................*............................................................................................................................. - // mul v24.4s, v13.4s, v0.s[2] // ....................................................................................................e.........................................................|.....................................................................................................................................e................ - // sqrdmulh v13.4s, v13.4s, v0.s[3] // ......................................................................................................e.......................................................|.......................................................................................................................................e.............. - // mla v24.4s, v13.4s, v29.4s // ..................................................................................................................e...........................................|...................................................................................................................................................e.. - // sub v13.4s, v9.4s, v24.4s // .........................................................................................................................................................e....|...................................................................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ......................................................................................................................................................e.......|...................................................................................................................................................... - // mul v24.4s, v14.4s, v0.s[2] // ..................................................................................................................................e...........................|...................................................................................................................................................... - // sqrdmulh v14.4s, v14.4s, v0.s[3] // .................................................................................................................................e............................|...................................................................................................................................................... - // mla v24.4s, v14.4s, v29.4s // ..............................................................................................................................................e...............|...................................................................................................................................................... - // sub v14.4s, v10.4s, v24.4s // ..............................................................................................................................................................|....*................................................................................................................................................. 
- // add v10.4s, v10.4s, v24.4s // .............................................................................................................................................................e|...................................................................................................................................................... - // mul v24.4s, v15.4s, v0.s[2] // ..........................................................................................e...................................................................|...........................................................................................................................e.......................... - // sqrdmulh v15.4s, v15.4s, v0.s[3] // ............................................................................................e.................................................................|.............................................................................................................................e........................ - // mla v24.4s, v15.4s, v29.4s // .........................................................................................................e....................................................|..........................................................................................................................................e........... - // sub v15.4s, v11.4s, v24.4s // .........................................................................................................................e....................................|...................................................................................................................................................... 
- // add v11.4s, v11.4s, v24.4s // ........................................................................................................................e.....................................|...................................................................................................................................................... - // mul v24.4s, v20.4s, v1.s[0] // .............................................................................................................................................e................|...................................................................................................................................................... - // sqrdmulh v20.4s, v20.4s, v1.s[1] // ...................................................................................................................................................e..........|...................................................................................................................................................... - // mla v24.4s, v20.4s, v29.4s // ..............................................................................................................................................................|.....*................................................................................................................................................ - // sub v20.4s, v16.4s, v24.4s // ..............................................................................................................................................................|.......................*.............................................................................................................................. 
- // add v16.4s, v16.4s, v24.4s // ..*...........................................................................................................................................................|...................................*.................................................................................................................. - // mul v24.4s, v21.4s, v1.s[0] // ................................................................................................e.............................................................|.................................................................................................................................e.................... - // sqrdmulh v21.4s, v21.4s, v1.s[1] // ...................................................................................................e..........................................................|....................................................................................................................................e................. - // mla v24.4s, v21.4s, v29.4s // ...............................................................................................................e..............................................|................................................................................................................................................e..... - // sub v21.4s, v17.4s, v24.4s // ............................................................................................................................................................e.|...................................................................................................................................................... 
- // add v17.4s, v17.4s, v24.4s // ..............................................................................................................................................................|.*.................................................................................................................................................... - // mul v24.4s, v22.4s, v1.s[0] // ...............................................................................................................................e..............................|...................................................................................................................................................... - // sqrdmulh v22.4s, v22.4s, v1.s[1] // ................................................................................................................................e.............................|...................................................................................................................................................... - // mla v24.4s, v22.4s, v29.4s // ..........................................................................................................................................e...................|...................................................................................................................................................... - // sub v22.4s, v18.4s, v24.4s // ........................................................................................................................................................e.....|...................................................................................................................................................... 
- // add v18.4s, v18.4s, v24.4s // ..............................................................................................................................................................|........*............................................................................................................................................. - // mul v24.4s, v23.4s, v1.s[0] // ................................................................................e.............................................................................|.................................................................................................................e.................................... - // sqrdmulh v23.4s, v23.4s, v1.s[1] // ..................................................................................e...........................................................................|...................................................................................................................e.................................. - // mla v24.4s, v23.4s, v29.4s // ..............................................................................................e...............................................................|...............................................................................................................................e...................... - // sub v23.4s, v19.4s, v24.4s // ..............................................................................................................e...............................................|...............................................................................................................................................e...... 
- // add v19.4s, v19.4s, v24.4s // ................................................................................................................e.............................................|.................................................................................................................................................e.... - // mul v24.4s, v10.4s, v1.s[2] // ......*.......................................................................................................................................................|.......................................*.............................................................................................................. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ....*.........................................................................................................................................................|.....................................*................................................................................................................ - // mla v24.4s, v10.4s, v29.4s // ................*.............................................................................................................................................|.................................................*.................................................................................................... - // sub v10.4s, v8.4s, v24.4s // ....................................*.........................................................................................................................|.....................................................................*................................................................................ 
- // add v8.4s, v8.4s, v24.4s // .....................................*........................................................................................................................|......................................................................*............................................................................... - // mul v24.4s, v11.4s, v1.s[2] // ....................................................................................................................................e.........................|...................................................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v1.s[3] // .....................................................................................................................................e........................|...................................................................................................................................................... - // mla v24.4s, v11.4s, v29.4s // .................................................................................................................................................e............|...................................................................................................................................................... - // sub v11.4s, v9.4s, v24.4s // ...........*..................................................................................................................................................|............................................*......................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ............*.................................................................................................................................................|.............................................*........................................................................................................ - // mul v24.4s, v14.4s, v2.s[0] // ..............................................................................................................................................................|.................*.................................................................................................................................... - // sqrdmulh v14.4s, v14.4s, v2.s[1] // ..............................................................................................................................................................|............*......................................................................................................................................... - // mla v24.4s, v14.4s, v29.4s // ..............................................................................................................................................................|...................*.................................................................................................................................. - // sub v14.4s, v12.4s, v24.4s // ..............................................................................................................................................................|................................*..................................................................................................................... 
- // add v12.4s, v12.4s, v24.4s // .*............................................................................................................................................................|..................................*................................................................................................................... - // mul v24.4s, v15.4s, v2.s[0] // .......................................................................................................................................e......................|...................................................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v2.s[1] // ............................................................................................................................................e.................|...................................................................................................................................................... - // mla v24.4s, v15.4s, v29.4s // .......................................................................................................................................................e......|...................................................................................................................................................... - // sub v15.4s, v13.4s, v24.4s // ..............................................................................................................................................................|................*..................................................................................................................................... 
- // add v13.4s, v13.4s, v24.4s // ..............................................................................................................................................................|.............*........................................................................................................................................ - // mul v24.4s, v18.4s, v2.s[2] // ..............................................................................................................................................................|.....................*................................................................................................................................ - // sqrdmulh v18.4s, v18.4s, v2.s[3] // ..............................................................................................................................................................|....................*................................................................................................................................. - // mla v24.4s, v18.4s, v29.4s // ..............................................................................................................................................................|...............................*...................................................................................................................... - // sub v18.4s, v16.4s, v24.4s // .............*................................................................................................................................................|..............................................*....................................................................................................... 
- // add v16.4s, v16.4s, v24.4s // ..........*...................................................................................................................................................|...........................................*.......................................................................................................... - // mul v24.4s, v19.4s, v2.s[2] // ........................................................................................................................................e.....................|...................................................................................................................................................... - // sqrdmulh v19.4s, v19.4s, v2.s[3] // ...........................................................................................................................................e..................|...................................................................................................................................................... - // mla v24.4s, v19.4s, v29.4s // ..............................................................................................................................................................|*..................................................................................................................................................... - // sub v19.4s, v17.4s, v24.4s // ..............................................................................................................................................................|.........*............................................................................................................................................ 
- // add v17.4s, v17.4s, v24.4s // ..............................................................................................................................................................|..........*........................................................................................................................................... - // mul v24.4s, v22.4s, v3.s[0] // ..............................................................................................................................................................|.......*.............................................................................................................................................. - // sqrdmulh v22.4s, v22.4s, v3.s[1] // ..............................................................................................................................................................|......*............................................................................................................................................... - // mla v24.4s, v22.4s, v29.4s // ..............................................................................................................................................................|..................*................................................................................................................................... - // sub v22.4s, v20.4s, v24.4s // .................................................*............................................................................................................|..................................................................................*................................................................... 
- // add v20.4s, v20.4s, v24.4s // ...............*..............................................................................................................................................|................................................*..................................................................................................... - // mul v24.4s, v23.4s, v3.s[0] // ......................................................................................................................................e.......................|...................................................................................................................................................... - // sqrdmulh v23.4s, v23.4s, v3.s[1] // ............................................................................................................................e.................................|...................................................................................................................................................... - // mla v24.4s, v23.4s, v29.4s // ..........................................................................................................................................................e...|...................................................................................................................................................... - // sub v23.4s, v21.4s, v24.4s // ..............................................................................................................................................................|...*.................................................................................................................................................. 
- // add v21.4s, v21.4s, v24.4s // ..............................................................................................................................................................|..*................................................................................................................................................... - // mul v24.4s, v9.4s, v3.s[2] // ...................................*..........................................................................................................................|....................................................................*................................................................................. - // sqrdmulh v9.4s, v9.4s, v3.s[3] // ......................*.......................................................................................................................................|.......................................................*.............................................................................................. - // mla v24.4s, v9.4s, v29.4s // .......................................................*......................................................................................................|........................................................................................*............................................................. - // sub v9.4s, v8.4s, v24.4s // ............................................................................................................*.................................................|.............................................................................................................................................*........ 
- // add v8.4s, v8.4s, v24.4s // ...........................................................................................................*..................................................|............................................................................................................................................*......... - // mul v24.4s, v11.4s, v4.s[0] // ..........................................*...................................................................................................................|...........................................................................*.......................................................................... - // sqrdmulh v11.4s, v11.4s, v4.s[1] // .......................................*......................................................................................................................|........................................................................*............................................................................. - // mla v24.4s, v11.4s, v29.4s // ....................................................*.........................................................................................................|.....................................................................................*................................................................ - // sub v11.4s, v10.4s, v24.4s // ...........................................................................*..................................................................................|............................................................................................................*......................................... 
- // add v10.4s, v10.4s, v24.4s // ...............................................................*..............................................................................................|................................................................................................*..................................................... - // mul v24.4s, v13.4s, v4.s[2] // ...*..........................................................................................................................................................|....................................*................................................................................................................. - // sqrdmulh v13.4s, v13.4s, v4.s[3] // ........*.....................................................................................................................................................|.........................................*............................................................................................................ - // mla v24.4s, v13.4s, v29.4s // ............................*.................................................................................................................................|.............................................................*........................................................................................ - // sub v13.4s, v12.4s, v24.4s // .........................................................*....................................................................................................|..........................................................................................*........................................................... 
- // add v12.4s, v12.4s, v24.4s // ..............................................................*...............................................................................................|...............................................................................................*...................................................... - // mul v24.4s, v15.4s, v5.s[0] // ..............................................................................................................................................................|..............................*....................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v5.s[1] // .......*......................................................................................................................................................|........................................*............................................................................................................. - // mla v24.4s, v15.4s, v29.4s // ...................*..........................................................................................................................................|....................................................*................................................................................................. - // sub v15.4s, v14.4s, v24.4s // ......................................*.......................................................................................................................|.......................................................................*.............................................................................. 
- // add v14.4s, v14.4s, v24.4s // ...........................*..................................................................................................................................|............................................................*......................................................................................... - // mul v24.4s, v17.4s, v5.s[2] // ..............................................................................................................................................................|...........................*.......................................................................................................................... - // sqrdmulh v17.4s, v17.4s, v5.s[3] // ..............................................................................................................................................................|..........................*........................................................................................................................... - // mla v24.4s, v17.4s, v29.4s // .....*........................................................................................................................................................|......................................*............................................................................................................... - // sub v17.4s, v16.4s, v24.4s // .................*............................................................................................................................................|..................................................*................................................................................................... 
- // add v16.4s, v16.4s, v24.4s // ..................*...........................................................................................................................................|...................................................*.................................................................................................. - // mul v24.4s, v19.4s, v6.s[0] // ..............................................................................................................................................................|............................*......................................................................................................................... - // sqrdmulh v19.4s, v19.4s, v6.s[1] // ..............................................................................................................................................................|.........................*............................................................................................................................ - // mla v24.4s, v19.4s, v29.4s // ..............*...............................................................................................................................................|...............................................*...................................................................................................... - // sub v19.4s, v18.4s, v24.4s // ........................................*.....................................................................................................................|.........................................................................*............................................................................ 
- // add v18.4s, v18.4s, v24.4s // .........................................*....................................................................................................................|..........................................................................*........................................................................... - // mul v24.4s, v21.4s, v6.s[2] // ..............................................................................................................................................................|...........*.......................................................................................................................................... - // sqrdmulh v21.4s, v21.4s, v6.s[3] // ..............................................................................................................................................................|.............................*........................................................................................................................ - // mla v24.4s, v21.4s, v29.4s // .........*....................................................................................................................................................|..........................................*........................................................................................................... - // sub v21.4s, v20.4s, v24.4s // .....................*........................................................................................................................................|......................................................*............................................................................................... 
- // add v20.4s, v20.4s, v24.4s // ....................*.........................................................................................................................................|.....................................................*................................................................................................ - // mul v24.4s, v23.4s, v7.s[0] // ..............................................*...............................................................................................................|...............................................................................*...................................................................... - // sqrdmulh v23.4s, v23.4s, v7.s[1] // .............................................*................................................................................................................|..............................................................................*....................................................................... - // mla v24.4s, v23.4s, v29.4s // ............................................................*.................................................................................................|.............................................................................................*........................................................ - // sub v23.4s, v22.4s, v24.4s // ........................................................................*.....................................................................................|.........................................................................................................*............................................ 
- // add v22.4s, v22.4s, v24.4s // ..........................................................................*...................................................................................|...........................................................................................................*.......................................... - // str q8, [x0], #(16) // ...................................................................................................................*..........................................|....................................................................................................................................................*. - // str q9, [x0, #(-16 + 1*(512/8))] // ....................................................................................................................*.........................................|.....................................................................................................................................................* - // str q10, [x0, #(-16 + 2*(512/8))] // .........................................................................*....................................................................................|..........................................................................................................*........................................... - // str q11, [x0, #(-16 + 3*(512/8))] // ......................................................................................*.......................................................................|.......................................................................................................................*.............................. 
- // str q12, [x0, #(-16 + 4*(512/8))] // ..............................................................................*...............................................................................|...............................................................................................................*...................................... - // str q13, [x0, #(-16 + 5*(512/8))] // .................................................................*............................................................................................|..................................................................................................*................................................... - // str q14, [x0, #(-16 + 6*(512/8))] // ...................................................*..........................................................................................................|....................................................................................*................................................................. - // str q15, [x0, #(-16 + 7*(512/8))] // ...........................................*..................................................................................................................|............................................................................*......................................................................... - // str q16, [x0, #(-16 + 8*(512/8))] // ..................................*...........................................................................................................................|...................................................................*.................................................................................. 
- // str q17, [x0, #(-16 + 9*(512/8))] // .........................*....................................................................................................................................|..........................................................*........................................................................................... - // str q18, [x0, #(-16 + 10*(512/8))] // ............................................*.................................................................................................................|.............................................................................*........................................................................ - // str q19, [x0, #(-16 + 11*(512/8))] // ..........................................................*...................................................................................................|...........................................................................................*.......................................................... - // str q20, [x0, #(-16 + 12*(512/8))] // ........................*.....................................................................................................................................|.........................................................*............................................................................................ - // str q21, [x0, #(-16 + 13*(512/8))] // ................................*.............................................................................................................................|.................................................................*.................................................................................... 
- // str q22, [x0, #(-16 + 14*(512/8))] // .................................................................................................*............................................................|..................................................................................................................................*................... - // str q23, [x0, #(-16 + 15*(512/8))] // .................................................................................*............................................................................|..................................................................................................................*................................... + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // add v31.4S, v28.4S, v10.4S // .............................*.................................................................................................................................................................. + // mla v18.4S, v19.4S, v29.4S // ....*........................................................................................................................................................................................... + // mul v19.4S, v25.4S, v1.S[2] // ..*............................................................................................................................................................................................. 
+ // mla v22.4S, v13.4S, v29.4S // ........*....................................................................................................................................................................................... + // sqrdmulh v13.4S, v25.4S, v1.S[3] // .*.............................................................................................................................................................................................. + // add v11.4S, v11.4S, v8.4S // .........*...................................................................................................................................................................................... + // add v8.4S, v21.4S, v24.4S // ...............*................................................................................................................................................................................ + // sqrdmulh v15.4S, v23.4S, v3.S[1] // *............................................................................................................................................................................................... + // sqrdmulh v23.4S, v16.4S, v2.S[1] // ...*............................................................................................................................................................................................ + // mul v25.4S, v30.4S, v1.S[2] // .......*........................................................................................................................................................................................ + // mla v26.4S, v20.4S, v29.4S // ............*................................................................................................................................................................................... 
+ // sub v20.4S, v8.4S, v22.4S // ........................................*....................................................................................................................................................... + // add v8.4S, v8.4S, v22.4S // ..............................*................................................................................................................................................................. + // sub v22.4S, v31.4S, v9.4S // .............................................*.................................................................................................................................................. + // mla v19.4S, v13.4S, v29.4S // .............*.................................................................................................................................................................................. + // add v13.4S, v17.4S, v18.4S // ................*............................................................................................................................................................................... + // sub v17.4S, v17.4S, v18.4S // ...................*............................................................................................................................................................................ + // sub v18.4S, v28.4S, v10.4S // .........................*...................................................................................................................................................................... + // mla v14.4S, v15.4S, v29.4S // ..............*................................................................................................................................................................................. 
+ // sqrdmulh v15.4S, v12.4S, v2.S[1] // ...........*.................................................................................................................................................................................... + // mul v10.4S, v20.4S, v6.S[0] // ......................................................*......................................................................................................................................... + // mul v28.4S, v12.4S, v2.S[0] // ..........*..................................................................................................................................................................................... + // sqrdmulh v12.4S, v8.4S, v5.S[3] // ..............................................*................................................................................................................................................. + // sub v21.4S, v21.4S, v24.4S // .................*.............................................................................................................................................................................. + // mul v24.4S, v16.4S, v2.S[0] // .....*.......................................................................................................................................................................................... + // sqrdmulh v20.4S, v20.4S, v6.S[1] // ..................................................*............................................................................................................................................. + // sub v16.4S, v13.4S, v19.4S // ..........................*..................................................................................................................................................................... 
+ // add v13.4S, v13.4S, v19.4S // ............................*................................................................................................................................................................... + // add v19.4S, v18.4S, v26.4S // .....................................................*.......................................................................................................................................... + // sqrdmulh v30.4S, v30.4S, v1.S[3] // ......*......................................................................................................................................................................................... + // mla v28.4S, v15.4S, v29.4S // ......................*......................................................................................................................................................................... + // mul v15.4S, v8.4S, v5.S[2] // ................................................*............................................................................................................................................... + // sub v8.4S, v21.4S, v14.4S // ....................*........................................................................................................................................................................... + // add v14.4S, v21.4S, v14.4S // ...........................*.................................................................................................................................................................... + // mul v21.4S, v13.4S, v3.S[2] // ...................................................*............................................................................................................................................ 
+ // sub v18.4S, v18.4S, v26.4S // ....................................*........................................................................................................................................................... + // sqrdmulh v26.4S, v13.4S, v3.S[3] // .....................................*.......................................................................................................................................................... + // mla v24.4S, v23.4S, v29.4S // ........................*....................................................................................................................................................................... + // sqrdmulh v13.4S, v8.4S, v7.S[1] // ......................................................................*......................................................................................................................... + // mul v8.4S, v8.4S, v7.S[0] // .......................................................................*........................................................................................................................ + // mla v25.4S, v30.4S, v29.4S // ..................*............................................................................................................................................................................. + // mla v10.4S, v20.4S, v29.4S // .............................................................*.................................................................................................................................. + // mul v30.4S, v16.4S, v4.S[0] // ................................*............................................................................................................................................................... 
+ // mul v20.4S, v14.4S, v6.S[2] // ...............................................*................................................................................................................................................ + // mla v15.4S, v12.4S, v29.4S // ................................................................*............................................................................................................................... + // sqrdmulh v14.4S, v14.4S, v6.S[3] // .................................................*.............................................................................................................................................. + // mla v21.4S, v26.4S, v29.4S // ...........................................................*.................................................................................................................................... + // sub v23.4S, v17.4S, v24.4S // ..........................................*..................................................................................................................................................... + // sqrdmulh v16.4S, v16.4S, v4.S[1] // ..................................*............................................................................................................................................................. + // add v26.4S, v17.4S, v24.4S // ...................................*............................................................................................................................................................ + // add v12.4S, v11.4S, v25.4S // .....................*.......................................................................................................................................................................... 
+ // add v17.4S, v22.4S, v10.4S // ........................................................................................*....................................................................................................... + // mla v8.4S, v13.4S, v29.4S // ...................................................................................*............................................................................................................ + // sub v22.4S, v22.4S, v10.4S // ....................................................................................*........................................................................................................... + // mul v13.4S, v26.4S, v4.S[2] // ......................................*......................................................................................................................................................... + // mul v24.4S, v23.4S, v5.S[0] // ..................................................................*............................................................................................................................. + // sqrdmulh v23.4S, v23.4S, v5.S[1] // .................................................................*.............................................................................................................................. + // mla v20.4S, v14.4S, v29.4S // ............................................................*................................................................................................................................... + // sub v10.4S, v12.4S, v21.4S // ..........................................................................................................*..................................................................................... 
+ // add v12.4S, v12.4S, v21.4S // ..............................................................................................*................................................................................................. + // sqrdmulh v14.4S, v26.4S, v4.S[3] // .......................................*........................................................................................................................................................ + // mla v30.4S, v16.4S, v29.4S // ...........................................*.................................................................................................................................................... + // str q17, [x0, #640] // .............................................................................................................*.................................................................................. + // add v26.4S, v18.4S, v8.4S // ........................................................................................................*....................................................................................... + // sub v8.4S, v18.4S, v8.4S // ...............................................................................................................*................................................................................ + // str q22, [x0, #704] // ...............................................................................................*................................................................................................ + // add v17.4S, v31.4S, v9.4S // ............................................*................................................................................................................................................... 
+ // sub v21.4S, v11.4S, v25.4S // .......................*........................................................................................................................................................................ + // str q10, [x0, #64] // .............................................................................................................................*.................................................................. + // sub v22.4S, v19.4S, v20.4S // ................................................................................*............................................................................................................... + // sub v11.4S, v27.4S, v28.4S // ...............................*................................................................................................................................................................ + // str q12, [x0], #(16) // ..............................................................................................................*................................................................................. + // mla v24.4S, v23.4S, v29.4S // ...........................................................................................*.................................................................................................... + // add v9.4S, v19.4S, v20.4S // .............................................................................*.................................................................................................................. + // mla v13.4S, v14.4S, v29.4S // ....................................................*........................................................................................................................................... 
+ // str q26, [x0, #880] // ......................................................................................................................*......................................................................... + // sub v12.4S, v17.4S, v15.4S // ......................................................................................*......................................................................................................... + // str q8, [x0, #944] // ............................................................................................................................*................................................................... + // add v10.4S, v21.4S, v30.4S // ..............................................................*................................................................................................................................. + // sub v8.4S, v21.4S, v30.4S // .........................................................*...................................................................................................................................... + // add v16.4S, v27.4S, v28.4S // .................................*.............................................................................................................................................................. + // str q22, [x0, #816] // .................................................................................................................*.............................................................................. + // add v31.4S, v17.4S, v15.4S // .....................................................................................*.......................................................................................................... 
+ // str q9, [x0, #752] // .........................................................................................*...................................................................................................... + // str q12, [x0, #560] // .............................................................................................*.................................................................................................. + // sub v12.4S, v11.4S, v24.4S // ...........................................................................................................................................*.................................................... + // str q10, [x0, #112] // .....................................................................*.......................................................................................................................... + // add v26.4S, v11.4S, v24.4S // ..............................................................................................................................................*................................................. + // add v28.4S, v16.4S, v13.4S // ...................................................................*............................................................................................................................ + // sub v19.4S, v16.4S, v13.4S // .........................................................................*...................................................................................................................... + // str q8, [x0, #176] // ..........................................................*..................................................................................................................................... 
+ // str q31, [x0, #496] // ..................................................................................................*............................................................................................. + // str q12, [x0, #432] // ...............................................................................................................................................*................................................ + // str q26, [x0, #368] // .......................................................................................................................................................*........................................ + // str q28, [x0, #240] // ............................................................................*................................................................................................................... + // str q19, [x0, #304] // ..............................................................................*................................................................................................................. + // ldr q15, [x0, #256] // ............................................................................................................*................................................................................... + // ldr q20, [x0, #704] // ................................................................................................*............................................................................................... + // ldr q30, [x0, #960] // ...................................................................................................*............................................................................................ + // ldr q16, [x0, #768] // ..........................................................................*..................................................................................................................... 
+ // ldr q10, [x0, #512] // ...........................................................................*.................................................................................................................... + // ldr q12, [x0, #896] // ...............................................................*................................................................................................................................ + // ldr q8, [x0, #832] // .........................................*...................................................................................................................................................... + // ldr q19, [x0, #576] // ........................................................................*....................................................................................................................... + // ldr q28, [x0, #640] // ..................................................................................*............................................................................................................. + // ldr q31, [x0, #448] // .......................................................................................................................................*........................................................ + // sqrdmulh v26.4S, v30.4S, v0.S[1] // ..........................................................................................................................*..................................................................... + // mul v11.4S, v30.4S, v0.S[0] // ...........................................................................................................................*.................................................................... 
+ // mul v9.4S, v20.4S, v0.S[0] // ................................................................................................................................*............................................................... + // sqrdmulh v24.4S, v20.4S, v0.S[1] // ...............................................................................................................................*................................................................ + // ldr q20, [x0, #192] // ..................................................................................................................*............................................................................. + // sqrdmulh v22.4S, v16.4S, v0.S[1] // ...................................................................................................................*............................................................................ + // mul v17.4S, v16.4S, v0.S[0] // ......................................................................................................*......................................................................................... + // mul v27.4S, v12.4S, v0.S[0] // .................................................................................*.............................................................................................................. + // sqrdmulh v13.4S, v10.4S, v0.S[1] // ..........................................................................................*..................................................................................................... + // ldr q14, [x0, #384] // .......................................................................................*........................................................................................................ 
+ // sqrdmulh v30.4S, v12.4S, v0.S[1] // ...............................................................................*................................................................................................................ + // mla v11.4S, v26.4S, v29.4S // ........................................................................................................................................*....................................................... + // mul v26.4S, v28.4S, v0.S[0] // ....................................................................................................*........................................................................................... + // mla v17.4S, v22.4S, v29.4S // .....................................................................................................................................*.......................................................... + // sqrdmulh v16.4S, v8.4S, v0.S[1] // ........................................................*....................................................................................................................................... + // mla v9.4S, v24.4S, v29.4S // ............................................................................................................................................*................................................... + // mul v24.4S, v8.4S, v0.S[0] // .......................................................*........................................................................................................................................ + // sqrdmulh v21.4S, v28.4S, v0.S[1] // .....................................................................................................*.......................................................................................... 
+ // sqrdmulh v22.4S, v19.4S, v0.S[1] // ...................................................................................................................................*............................................................ + // mla v27.4S, v30.4S, v29.4S // .......................................................................................................*........................................................................................ + // sub v25.4S, v31.4S, v11.4S // ....................................................................................................................................................*........................................... + // add v8.4S, v31.4S, v11.4S // ...................................................................................................................................................*............................................ + // sub v12.4S, v15.4S, v17.4S // .................................................................................................................................................*.............................................. + // ldr q28, [x0, #320] // ......................................................................................................................................*......................................................... + // add v11.4S, v15.4S, v17.4S // ..................................................................................................................................................*............................................. + // mla v24.4S, v16.4S, v29.4S // ....................................................................*........................................................................................................................... 
+ // mul v15.4S, v19.4S, v0.S[0] // .................................................................................................................................*.............................................................. + // mla v26.4S, v21.4S, v29.4S // ....................................................................................................................*........................................................................... + // sqrdmulh v19.4S, v25.4S, v1.S[1] // ............................................................................................................................................................*................................... + // mul v25.4S, v25.4S, v1.S[0] // .............................................................................................................................................................*.................................. + // ldr q17, [x0, #128] // ...........................................................................................................*.................................................................................... + // sub v21.4S, v14.4S, v27.4S // .....................................................................................................................*.......................................................................... + // add v31.4S, v14.4S, v27.4S // .......................................................................................................................*........................................................................ + // sub v23.4S, v20.4S, v9.4S // .......................................................................................................................................................................*........................ 
+ // sqrdmulh v27.4S, v8.4S, v0.S[3] // ..............................................................................................................................................................*................................. + // sub v14.4S, v28.4S, v24.4S // ......................................................................................................................................................*......................................... + // mul v16.4S, v8.4S, v0.S[2] // ...............................................................................................................................................................*................................ + // add v18.4S, v28.4S, v24.4S // ...........................................................................................................................................................*.................................... + // mul v30.4S, v21.4S, v1.S[0] // ..............................................................................................................................*................................................................. + // sqrdmulh v28.4S, v21.4S, v1.S[1] // .............................................................................................................................................*.................................................. + // mul v8.4S, v11.4S, v0.S[2] // ........................................................................................................................................................*....................................... + // ldr q21, [x0, #64] // ................................................................................................................*............................................................................... 
+ // mla v25.4S, v19.4S, v29.4S // ........................................................................................................................................................................*....................... + // sqrdmulh v19.4S, v14.4S, v1.S[1] // ..........................................................................................................................................................................*..................... + // mul v24.4S, v14.4S, v1.S[0] // ..................................................................................................................................................................................*............. + // mul v14.4S, v10.4S, v0.S[0] // ............................................................................................*................................................................................................... + // mla v15.4S, v22.4S, v29.4S // ................................................................................................................................................*............................................... + // mul v10.4S, v12.4S, v1.S[0] // ...................................................................................................................................................................*............................ + // add v22.4S, v20.4S, v9.4S // .........................................................................................................................................................................*...................... + // mla v30.4S, v28.4S, v29.4S // ..........................................................................................................................................................*..................................... 
+ // sqrdmulh v20.4S, v31.4S, v0.S[3] // ..........................................................................................................................................*..................................................... + // mul v31.4S, v31.4S, v0.S[2] // .........................................................................................................................................*...................................................... + // ldr q28, [x0, #0] // .................................................................................................*.............................................................................................. + // sqrdmulh v9.4S, v11.4S, v0.S[3] // .........................................................................................................................................................*...................................... + // sub v11.4S, v17.4S, v26.4S // ....................................................................................................................................*........................................................... + // mla v14.4S, v13.4S, v29.4S // .........................................................................................................*...................................................................................... + // mla v16.4S, v27.4S, v29.4S // ...........................................................................................................................................................................*.................... + // add v13.4S, v23.4S, v25.4S // ......................................................................................................................................................................................*......... 
+ // add v26.4S, v17.4S, v26.4S // ..................................................................................................................................*............................................................. + // add v17.4S, v21.4S, v15.4S // ...........................................................................................................................................................................................*.... + // add v27.4S, v11.4S, v30.4S // ......................................................................................................................................................................*......................... + // mla v31.4S, v20.4S, v29.4S // .....................................................................................................................................................*.......................................... + // sub v20.4S, v11.4S, v30.4S // .....................................................................................................................................................................*.......................... + // sqrdmulh v30.4S, v12.4S, v1.S[1] // .............................................................................................................................................................................*.................. + // add v11.4S, v28.4S, v14.4S // ........................................................................................................................*....................................................................... + // sub v28.4S, v28.4S, v14.4S // .........................................................................................................................*...................................................................... 
+ // sub v23.4S, v23.4S, v25.4S // .......................................................................................................................................................................................*........ + // mla v24.4S, v19.4S, v29.4S // ..............................................................................................................................................................................................*. + // mla v8.4S, v9.4S, v29.4S // ....................................................................................................................................................................*........................... + // sqrdmulh v14.4S, v27.4S, v2.S[3] // ............................................................................................................................................................................*................... + // mul v9.4S, v27.4S, v2.S[2] // ...............................................................................................................................................................................*................ + // sqrdmulh v19.4S, v18.4S, v0.S[3] // .................................................................................................................................................................................*.............. + // sub v12.4S, v26.4S, v31.4S // ................................................................................................................................................................*............................... + // add v25.4S, v22.4S, v16.4S // .....................................................................................................................................................................................*.......... 
+ // sub v16.4S, v22.4S, v16.4S // ....................................................................................................................................................................................*........... + // mul v22.4S, v13.4S, v2.S[2] // .............................................................................................................................................................................................*.. + // mla v10.4S, v30.4S, v29.4S // ........................................................................................................................................................................................*....... + // mul v18.4S, v18.4S, v0.S[2] // ................................................................................................................................................................................*............... + // sqrdmulh v13.4S, v13.4S, v2.S[3] // ............................................................................................................................................................................................*... + // add v30.4S, v26.4S, v31.4S // ..................................................................................................................................................................*............................. + // mul v26.4S, v20.4S, v3.S[0] // ..............................................................................................................................................................................*................. + // sqrdmulh v20.4S, v20.4S, v3.S[1] // ...................................................................................................................................................................................*............ 
+ // sub v27.4S, v11.4S, v8.4S // ..........................................................................................................................................................................................*..... + // mla v9.4S, v14.4S, v29.4S // .........................................................................................................................................................................................*...... + // sub v21.4S, v21.4S, v15.4S // .................................................................................................................................................................*.............................. + // mul v14.4S, v23.4S, v3.S[0] // ...............................................................................................................................................................................................* sub count, count, #1 cbnz count, layer1234_start - mul v31.4S, v21.4S, v3.S[0] // ..............................................................................................................................*................................................................. - mla v9.4S, v27.4S, v29.4S // ...........................................................................................................................*.................................................................... - sqrdmulh v27.4S, v21.4S, v3.S[1] // ...............................................................................................................................*................................................................ - add v21.4S, v30.4S, v17.4S // .......................................................................................................................................*........................................................ 
- add v28.4S, v16.4S, v28.4S // .....................................................................................*.......................................................................................................... - sub v17.4S, v30.4S, v17.4S // ......................................................................................................................................*......................................................... - mla v8.4S, v10.4S, v29.4S // ..............................................................................*................................................................................................................. - mla v13.4S, v26.4S, v29.4S // ..........................................................*..................................................................................................................................... - sqrdmulh v10.4S, v21.4S, v6.S[3] // .......................................................................................................................................................................*........................ - mul v21.4S, v21.4S, v6.S[2] // ......................................................................................................................................................................*......................... - add v30.4S, v23.4S, v12.4S // ..........................................................................................*..................................................................................................... - sub v12.4S, v11.4S, v24.4S // ...................*............................................................................................................................................................................ 
- add v16.4S, v28.4S, v9.4S // .............................................................................................................................*.................................................................. - sub v23.4S, v18.4S, v20.4S // .....................................................................*.......................................................................................................................... - mla v31.4S, v27.4S, v29.4S // ................................................................................................................................*............................................................... - sub v27.4S, v28.4S, v9.4S // ............................................................................................................................*................................................................... - sqrdmulh v28.4S, v30.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - mul v20.4S, v30.4S, v2.S[2] // ....................................................................................................................*........................................................................... - sub v30.4S, v22.4S, v19.4S // ........................................................................................................*....................................................................................... - sub v26.4S, v12.4S, v8.4S // ...............................................................................*................................................................................................................ 
- mla v21.4S, v10.4S, v29.4S // ........................................................................................................................................................................*....................... - add v22.4S, v22.4S, v19.4S // .........................................................................................................*...................................................................................... - mul v10.4S, v16.4S, v5.S[2] // ............................................................................................................................................................*................................... - sqrdmulh v9.4S, v16.4S, v5.S[3] // .............................................................................................................................................................*.................................. - sub v18.4S, v26.4S, v31.4S // .................................................................................................................................*.............................................................. - sqrdmulh v19.4S, v30.4S, v4.S[1] // ..............................................................................................................................................*................................................. - add v16.4S, v26.4S, v31.4S // ..................................................................................................................................*............................................................. - mul v31.4S, v30.4S, v4.S[0] // .............................................................................................................................................*.................................................. 
- mla v20.4S, v28.4S, v29.4S // ......................................................................................................................*......................................................................... - add v8.4S, v12.4S, v8.4S // ................................................................................*............................................................................................................... - sqrdmulh v26.4S, v17.4S, v7.S[1] // ............................................................................................................................................................................*................... - mul v30.4S, v17.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - mla v10.4S, v9.4S, v29.4S // ..............................................................................................................................................................*................................. - add v17.4S, v16.4S, v21.4S // ..........................................................................................................................................................................*..................... - sqrdmulh v28.4S, v25.4S, v1.S[3] // .................................................................................................*.............................................................................................. - mul v9.4S, v25.4S, v1.S[2] // ................................................................................................*............................................................................................... 
- mla v31.4S, v19.4S, v29.4S // ...............................................................................................................................................*................................................ - sub v16.4S, v16.4S, v21.4S // .........................................................................................................................................................................*...................... - mul v19.4S, v27.4S, v6.S[0] // .................................................................................................................................................................*.............................. - sub v25.4S, v15.4S, v14.4S // ..................................................................................................................*............................................................................. - mla v30.4S, v26.4S, v29.4S // .............................................................................................................................................................................*.................. - sub v21.4S, v8.4S, v20.4S // .......................................................................................................................*........................................................................ - add v24.4S, v11.4S, v24.4S // ....................*........................................................................................................................................................................... - str q17, [x0, #768] // ............................................................................................................................................................................................*... 
- sqrdmulh v11.4S, v27.4S, v6.S[1] // ..................................................................................................................................................................*............................. - add v17.4S, v15.4S, v14.4S // ...................................................................................................................*............................................................................ - mul v14.4S, v23.4S, v2.S[0] // ..........................................................................................................*..................................................................................... - mla v9.4S, v28.4S, v29.4S // ..................................................................................................*............................................................................................. - sqrdmulh v28.4S, v25.4S, v5.S[1] // ........................................................................................................................................................*....................................... - sqrdmulh v12.4S, v22.4S, v3.S[3] // .........................................................................................................................................*...................................................... - add v15.4S, v24.4S, v13.4S // ............................................................*................................................................................................................................... - sqrdmulh v26.4S, v23.4S, v2.S[1] // ...........................................................................................................*.................................................................................... 
- mul v27.4S, v25.4S, v5.S[0] // .......................................................................................................................................................*........................................ - sub v23.4S, v24.4S, v13.4S // ...........................................................*.................................................................................................................................... - add v25.4S, v8.4S, v20.4S // ........................................................................................................................*....................................................................... - mul v8.4S, v22.4S, v3.S[2] // ........................................................................................................................................*....................................................... - sub v22.4S, v18.4S, v30.4S // ..............................................................................................................................................................................*................. - mul v24.4S, v17.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v13.4S, v17.4S, v4.S[3] // ...................................................................................................................................................*............................................ - sub v17.4S, v15.4S, v9.4S // ...................................................................................................*............................................................................................ 
- add v18.4S, v18.4S, v30.4S // ...............................................................................................................................................................................*................ - sub v20.4S, v25.4S, v10.4S // ...............................................................................................................................................................*................................ - mla v14.4S, v26.4S, v29.4S // ............................................................................................................*................................................................................... - mla v19.4S, v11.4S, v29.4S // ...................................................................................................................................................................*............................ - mla v27.4S, v28.4S, v29.4S // .........................................................................................................................................................*...................................... - str q16, [x0, #832] // .............................................................................................................................................................................................*.. - sub v16.4S, v17.4S, v31.4S // ................................................................................................................................................*............................................... - add v30.4S, v17.4S, v31.4S // .................................................................................................................................................*.............................................. 
- mla v8.4S, v12.4S, v29.4S // ..........................................................................................................................................*..................................................... - add v11.4S, v15.4S, v9.4S // ....................................................................................................*........................................................................................... - str q18, [x0, #896] // ..............................................................................................................................................................................................*. - mla v24.4S, v13.4S, v29.4S // ....................................................................................................................................................*........................................... - add v17.4S, v25.4S, v10.4S // ................................................................................................................................................................*............................... - str q20, [x0, #576] // .........................................................................................................................................................................................*...... - sub v9.4S, v21.4S, v19.4S // ....................................................................................................................................................................*........................... - sub v20.4S, v23.4S, v14.4S // .............................................................................................................*.................................................................................. 
- add v23.4S, v23.4S, v14.4S // ..............................................................................................................*................................................................................. - str q30, [x0, #128] // ..................................................................................................................................................................................*............. - add v15.4S, v21.4S, v19.4S // .....................................................................................................................................................................*.......................... - str q22, [x0, #960] // ...............................................................................................................................................................................................* - sub v21.4S, v11.4S, v8.4S // ...........................................................................................................................................*.................................................... - add v28.4S, v11.4S, v8.4S // ............................................................................................................................................*................................................... - str q17, [x0, #512] // ........................................................................................................................................................................................*....... - str q16, [x0, #192] // ...................................................................................................................................................................................*............ - str q9, [x0, #704] // ...........................................................................................................................................................................................*.... 
- str q15, [x0, #640] // ..........................................................................................................................................................................................*..... - sub v9.4S, v20.4S, v27.4S // ..........................................................................................................................................................*..................................... - sub v15.4S, v23.4S, v24.4S // .....................................................................................................................................................*.......................................... - add v16.4S, v23.4S, v24.4S // ......................................................................................................................................................*......................................... - str q21, [x0, #64] // .................................................................................................................................................................................*.............. - add v21.4S, v20.4S, v27.4S // ...........................................................................................................................................................*.................................... - str q28, [x0], #(16) // ................................................................................................................................................................................*............... - str q9, [x0, #432] // .......................................................................................................................................................................................*........ - str q15, [x0, #304] // .....................................................................................................................................................................................*.......... 
- str q16, [x0, #240] // ....................................................................................................................................................................................*........... - str q21, [x0, #368] // ......................................................................................................................................................................................*......... + add v31.4S, v28.4S, v10.4S // ................................................................................*............................................................................................................... + mla v18.4S, v19.4S, v29.4S // ...............................................................*................................................................................................................................ + mul v19.4S, v25.4S, v1.S[2] // ......................................................................................................*......................................................................................... + mla v22.4S, v13.4S, v29.4S // ...........................................................................................................................*.................................................................... + sqrdmulh v13.4S, v25.4S, v1.S[3] // .....................................................................................................*.......................................................................................... + add v11.4S, v11.4S, v8.4S // ............................................................*................................................................................................................................... 
+ add v8.4S, v21.4S, v24.4S // .....................................................................................*.......................................................................................................... + sqrdmulh v15.4S, v23.4S, v3.S[1] // ...................................................................................................................................*............................................................ + sqrdmulh v23.4S, v16.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mul v25.4S, v30.4S, v1.S[2] // .................................................................................................*.............................................................................................. + mla v26.4S, v20.4S, v29.4S // ................................................................................................................................*............................................................... + sub v20.4S, v8.4S, v22.4S // ............................................................................................................................*................................................................... + add v8.4S, v8.4S, v22.4S // .............................................................................................................................*.................................................................. + sub v22.4S, v31.4S, v9.4S // .......................................................................................................................*........................................................................ 
+ mla v19.4S, v13.4S, v29.4S // .......................................................................................................*........................................................................................ + add v13.4S, v17.4S, v18.4S // .................................................................*.............................................................................................................................. + sub v17.4S, v17.4S, v18.4S // ................................................................*............................................................................................................................... + sub v18.4S, v28.4S, v10.4S // ...............................................................................*................................................................................................................ + mla v14.4S, v15.4S, v29.4S // .....................................................................................................................................*.......................................................... + sqrdmulh v15.4S, v12.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mul v10.4S, v20.4S, v6.S[0] // ..................................................................................................................................................................*............................. + mul v28.4S, v12.4S, v2.S[0] // ...........................................................................................................*.................................................................................... 
+ sqrdmulh v12.4S, v8.4S, v5.S[3] // ............................................................................................................................................................*................................... + sub v21.4S, v21.4S, v24.4S // ....................................................................................*........................................................................................................... + mul v24.4S, v16.4S, v2.S[0] // ................................................................................................................*............................................................................... + sqrdmulh v20.4S, v20.4S, v6.S[1] // .................................................................................................................................................................*.............................. + sub v16.4S, v13.4S, v19.4S // ........................................................................................................*....................................................................................... + add v13.4S, v13.4S, v19.4S // .........................................................................................................*...................................................................................... + add v19.4S, v18.4S, v26.4S // ..................................................................................................................................*............................................................. + sqrdmulh v30.4S, v30.4S, v1.S[3] // ................................................................................................*............................................................................................... 
+ mla v28.4S, v15.4S, v29.4S // ............................................................................................................*................................................................................... + mul v15.4S, v8.4S, v5.S[2] // .............................................................................................................................................................*.................................. + sub v8.4S, v21.4S, v14.4S // ......................................................................................................................................*......................................................... + add v14.4S, v21.4S, v14.4S // .......................................................................................................................................*........................................................ + mul v21.4S, v13.4S, v3.S[2] // .........................................................................................................................................*...................................................... + sub v18.4S, v18.4S, v26.4S // .................................................................................................................................*.............................................................. + sqrdmulh v26.4S, v13.4S, v3.S[3] // ........................................................................................................................................*....................................................... + mla v24.4S, v23.4S, v29.4S // .................................................................................................................*.............................................................................. 
+ sqrdmulh v13.4S, v8.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + mul v8.4S, v8.4S, v7.S[0] // ............................................................................................................................................................................*................... + mla v25.4S, v30.4S, v29.4S // ..................................................................................................*............................................................................................. + mla v10.4S, v20.4S, v29.4S // ...................................................................................................................................................................*............................ + mul v30.4S, v16.4S, v4.S[0] // ..............................................................................................................................................*................................................. + mul v20.4S, v14.4S, v6.S[2] // .......................................................................................................................................................................*........................ + mla v15.4S, v12.4S, v29.4S // ..............................................................................................................................................................*................................. + sqrdmulh v14.4S, v14.4S, v6.S[3] // ......................................................................................................................................................................*......................... 
+ mla v21.4S, v26.4S, v29.4S // ..........................................................................................................................................*..................................................... + sub v23.4S, v17.4S, v24.4S // ..................................................................................................................*............................................................................. + sqrdmulh v16.4S, v16.4S, v4.S[1] // .............................................................................................................................................*.................................................. + add v26.4S, v17.4S, v24.4S // ...................................................................................................................*............................................................................ + add v12.4S, v11.4S, v25.4S // ....................................................................................................*........................................................................................... + add v17.4S, v22.4S, v10.4S // .....................................................................................................................................................................*.......................... + mla v8.4S, v13.4S, v29.4S // .............................................................................................................................................................................*.................. + sub v22.4S, v22.4S, v10.4S // ....................................................................................................................................................................*........................... 
+ mul v13.4S, v26.4S, v4.S[2] // ...................................................................................................................................................*............................................ + mul v24.4S, v23.4S, v5.S[0] // ........................................................................................................................................................*....................................... + sqrdmulh v23.4S, v23.4S, v5.S[1] // .......................................................................................................................................................*........................................ + mla v20.4S, v14.4S, v29.4S // ........................................................................................................................................................................*....................... + sub v10.4S, v12.4S, v21.4S // ...........................................................................................................................................*.................................................... + add v12.4S, v12.4S, v21.4S // ............................................................................................................................................*................................................... + sqrdmulh v14.4S, v26.4S, v4.S[3] // ..................................................................................................................................................*............................................. + mla v30.4S, v16.4S, v29.4S // ...............................................................................................................................................*................................................ 
+ str q17, [x0, #640] // ..........................................................................................................................................................................................*..... + add v26.4S, v18.4S, v8.4S // ...............................................................................................................................................................................*................ + sub v8.4S, v18.4S, v8.4S // ..............................................................................................................................................................................*................. + str q22, [x0, #704] // ...........................................................................................................................................................................................*.... + add v17.4S, v31.4S, v9.4S // ........................................................................................................................*....................................................................... + sub v21.4S, v11.4S, v25.4S // ...................................................................................................*............................................................................................ + str q10, [x0, #64] // .................................................................................................................................................................................*.............. + sub v22.4S, v19.4S, v20.4S // .........................................................................................................................................................................*...................... + sub v11.4S, v27.4S, v28.4S // .............................................................................................................*.................................................................................. 
+ str q12, [x0], #(16) // ................................................................................................................................................................................*............... + mla v24.4S, v23.4S, v29.4S // .........................................................................................................................................................*...................................... + add v9.4S, v19.4S, v20.4S // ..........................................................................................................................................................................*..................... + mla v13.4S, v14.4S, v29.4S // ....................................................................................................................................................*........................................... + str q26, [x0, #880] // ..............................................................................................................................................................................................*. + sub v12.4S, v17.4S, v15.4S // ...............................................................................................................................................................*................................ + str q8, [x0, #944] // ...............................................................................................................................................................................................* + add v10.4S, v21.4S, v30.4S // .................................................................................................................................................*.............................................. 
+ sub v8.4S, v21.4S, v30.4S // ................................................................................................................................................*............................................... + add v16.4S, v27.4S, v28.4S // ..............................................................................................................*................................................................................. + str q22, [x0, #816] // .............................................................................................................................................................................................*.. + add v31.4S, v17.4S, v15.4S // ................................................................................................................................................................*............................... + str q9, [x0, #752] // ............................................................................................................................................................................................*... + str q12, [x0, #560] // .........................................................................................................................................................................................*...... + sub v12.4S, v11.4S, v24.4S // ..........................................................................................................................................................*..................................... + str q10, [x0, #112] // ..................................................................................................................................................................................*............. + add v26.4S, v11.4S, v24.4S // ...........................................................................................................................................................*.................................... 
+ add v28.4S, v16.4S, v13.4S // ......................................................................................................................................................*......................................... + sub v19.4S, v16.4S, v13.4S // .....................................................................................................................................................*.......................................... + str q8, [x0, #176] // ...................................................................................................................................................................................*............ + str q31, [x0, #496] // ........................................................................................................................................................................................*....... + str q12, [x0, #432] // .......................................................................................................................................................................................*........ + str q26, [x0, #368] // ......................................................................................................................................................................................*......... + str q28, [x0, #240] // ....................................................................................................................................................................................*........... + str q19, [x0, #304] // .....................................................................................................................................................................................*.......... restore inp, STACK0 mov count, #16 @@ -901,714 +883,750 @@ layer1234_start: qform_root3_tw .req q7 .p2align 2 - ldr q30, [x1, #32] // *................................. - ldr q31, [x1, #48] // ..*............................... 
- ldr q5, [x3], #16 // .*................................ - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - ldr q6, [x1, #0] // ....*............................. - ldr q18, [x3], #8 // .....*............................ - ldr q3, [x4, #16] // ...*.............................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - ldr q22, [x1, #16] // ...........*...................... - ldr q14, [x4], #(6*16) // ..........*....................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mul v23.4S, v30.4S, v5.S[0] // ......*........................... - sqrdmulh v30.4S, v30.4S, v5.S[1] // .......*.......................... - mul v13.4S, v31.4S, v5.S[0] // ........*......................... - sqrdmulh v31.4S, v31.4S, v5.S[1] // .........*........................ - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. 
- // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mla v13.4S, v31.4S, v29.4S // .............*.................... - mla v23.4S, v30.4S, v29.4S // ............*..................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - add v21.4S, v6.4S, v23.4S // ................*................. 
- sub v30.4S, v6.4S, v23.4S // .................*................ - add v31.4S, v22.4S, v13.4S // ..............*................... - sub v6.4S, v22.4S, v13.4S // ...............*.................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mul v22.4S, v31.4S, v5.S[2] // ....................*............. - sqrdmulh v31.4S, v31.4S, v5.S[3] // .....................*............ - mul v5.4S, v6.4S, v18.S[0] // ..................*............... - sqrdmulh v6.4S, v6.4S, v18.S[1] // ...................*.............. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. 
- mla v5.4S, v6.4S, v29.4S // .......................*.......... - mla v22.4S, v31.4S, v29.4S // ......................*........... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - add v31.4S, v21.4S, v22.4S // ..........................*....... - sub v6.4S, v21.4S, v22.4S // ...........................*...... - sub v18.4S, v30.4S, v5.4S // ........................*......... - add v30.4S, v30.4S, v5.4S // .........................*........ - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. 
- trn2 v22.4S, v31.4S, v6.4S // ..............................*... - trn1 v4.4S, v31.4S, v6.4S // ...............................*.. - trn2 v13.4S, v30.4S, v18.4S // ............................*..... - trn1 v24.4S, v30.4S, v18.4S // .............................*.... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - trn2 v21.2D, v22.2D, v13.2D // ................................*. - trn2 v0.2D, v4.2D, v24.2D // .................................* - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. + // Instructions: 33 + // Expected cycles: 23 + // Expected IPC: 1.43 + // + // Wall time: 0.57s + // User time: 0.57s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + ldr q6, [x1, #32] // *................................ + ldr q19, [x1, #48] // ..*.............................. + ldr q28, [x3], #16 // .*............................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q22, [x1, #16] // ...*............................. + ldr q4, [x1, #0] // .........*....................... 
+ ldr q13, [x3], #8 // ....*............................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q8, [x4, #16] // ......................*.......... + ldr q26, [x4], #(6*16) // ..............................*.. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v0.4S, v6.4S, v28.S[1] // .......*......................... + mul v6.4S, v6.4S, v28.S[0] // ........*........................ + sqrdmulh v14.4S, v19.4S, v28.S[1] // .....*........................... + mul v19.4S, v19.4S, v28.S[0] // ......*.......................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mla v6.4S, v0.4S, v29.4S // ...........*..................... + mla v19.4S, v14.4S, v29.4S // ..........*...................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v0.4S, v22.4S, v19.4S // .............*................... + sub v19.4S, v22.4S, v19.4S // ..............*.................. + add v22.4S, v4.4S, v6.4S // ............*.................... + sub v6.4S, v4.4S, v6.4S // ...................*............. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v4.4S, v0.4S, v28.S[3] // .................*............... + mul v28.4S, v0.4S, v28.S[2] // ..................*.............. + sqrdmulh v0.4S, v19.4S, v13.S[1] // ...............*................. + mul v19.4S, v19.4S, v13.S[0] // ................*................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mla v28.4S, v4.4S, v29.4S // .....................*........... + mla v19.4S, v0.4S, v29.4S // ....................*............ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v4.4S, v22.4S, v28.4S // .........................*....... + sub v28.4S, v22.4S, v28.4S // ..........................*...... + sub v22.4S, v6.4S, v19.4S // .......................*......... + add v6.4S, v6.4S, v19.4S // ........................*........ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + trn2 v21.4S, v4.4S, v28.4S // ...........................*..... + trn1 v14.4S, v4.4S, v28.4S // ............................*.... + trn2 v24.4S, v6.4S, v22.4S // .............................*... + trn1 v0.4S, v6.4S, v22.4S // ...............................*. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + trn2 v6.2D, v21.2D, v24.2D // ................................* + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. - // original source code - // ldr q5, [x1, #32] // *................................. - // ldr q4, [x3], #16 // ..*............................... - // ldr q9, [x1, #48] // .*................................ - // ldr q3, [x4, #16] // .....*............................ - // ldr q22, [x1, #0] // ...*.............................. - // ldr q28, [x3], #8 // ....*............................. - // mul v16.4S, v5.4S, v4.S[0] // ........*......................... - // sqrdmulh v21.4S, v5.4S, v4.S[1] // .........*........................ - // mul v6.4S, v9.4S, v4.S[0] // ..........*....................... - // sqrdmulh v8.4S, v9.4S, v4.S[1] // ...........*...................... - // ldr q14, [x4], #(6*16) // .......*.......................... - // ldr q13, [x1, #16] // ......*........................... - // mla v16.4S, v21.4S, v29.4S // .............*.................... - // mla v6.4S, v8.4S, v29.4S // ............*..................... - // add v18.4S, v13.4S, v6.4S // ................*................. - // sub v10.4S, v13.4S, v6.4S // .................*................ - // add v17.4S, v22.4S, v16.4S // ..............*................... - // sub v25.4S, v22.4S, v16.4S // ...............*.................. - // mul v2.4S, v10.4S, v28.S[0] // ....................*............. - // sqrdmulh v10.4S, v10.4S, v28.S[1] // .....................*............ 
- // mul v19.4S, v18.4S, v4.S[2] // ..................*............... - // sqrdmulh v9.4S, v18.4S, v4.S[3] // ...................*.............. - // mla v19.4S, v9.4S, v29.4S // .......................*.......... - // mla v2.4S, v10.4S, v29.4S // ......................*........... - // sub v7.4S, v25.4S, v2.4S // ..........................*....... - // add v15.4S, v25.4S, v2.4S // ...........................*...... - // add v11.4S, v17.4S, v19.4S // ........................*......... - // sub v28.4S, v17.4S, v19.4S // .........................*........ - // trn2 v13.4S, v15.4S, v7.4S // ..............................*... - // trn1 v24.4S, v15.4S, v7.4S // ...............................*.. - // trn2 v22.4S, v11.4S, v28.4S // ............................*..... - // trn1 v4.4S, v11.4S, v28.4S // .............................*.... - // trn2 v21.2D, v22.2D, v13.2D // ................................*. - // trn2 v0.2D, v4.2D, v24.2D // .................................* + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // ldr q21, [x1, #32] // *................................ + // ldr q9, [x3], #16 // ..*.............................. + // ldr q31, [x1, #48] // .*............................... + // ldr q2, [x1, #16] // ...*............................. + // ldr q16, [x3], #8 // .....*........................... + // sqrdmulh v11.4S, v31.4S, v9.S[1] // ..........*...................... + // mul v5.4S, v31.4S, v9.S[0] // ...........*..................... + // sqrdmulh v0.4S, v21.4S, v9.S[1] // ........*........................ + // mul v18.4S, v21.4S, v9.S[0] // .........*....................... + // ldr q15, [x1, #0] // ....*............................ + // mla v5.4S, v11.4S, v29.4S // .............*................... + // mla v18.4S, v0.4S, v29.4S // ............*.................... + // add v22.4S, v15.4S, v18.4S // ................*................ + // add v3.4S, v2.4S, v5.4S // ..............*.................. 
+ // sub v13.4S, v2.4S, v5.4S // ...............*................. + // sqrdmulh v4.4S, v13.4S, v16.S[1] // ....................*............ + // mul v28.4S, v13.4S, v16.S[0] // .....................*........... + // sqrdmulh v14.4S, v3.4S, v9.S[3] // ..................*.............. + // mul v13.4S, v3.4S, v9.S[2] // ...................*............. + // sub v30.4S, v15.4S, v18.4S // .................*............... + // mla v28.4S, v4.4S, v29.4S // .......................*......... + // mla v13.4S, v14.4S, v29.4S // ......................*.......... + // ldr q8, [x4, #16] // ......*.......................... + // sub v12.4S, v30.4S, v28.4S // ..........................*...... + // add v28.4S, v30.4S, v28.4S // ...........................*..... + // add v4.4S, v22.4S, v13.4S // ........................*........ + // sub v27.4S, v22.4S, v13.4S // .........................*....... + // trn2 v21.4S, v4.4S, v27.4S // ............................*.... + // trn1 v14.4S, v4.4S, v27.4S // .............................*... + // trn2 v24.4S, v28.4S, v12.4S // ..............................*.. + // ldr q26, [x4], #(6*16) // .......*......................... + // trn1 v0.4S, v28.4S, v12.4S // ...............................*. + // trn2 v6.2D, v21.2D, v24.2D // ................................* sub count, count, #1 layer5678_start: - ldr q5, [x1, #96] // ..e..................................................................... - mul v23.4S, v0.4S, v14.4S // ........................................*............................... - trn1 v15.2D, v4.2D, v24.2D // ................................*....................................... - ldr q4, [x3], #16 // ....e................................................................... - mul v18.4S, v21.4S, v14.4S // .............................................*.......................... - ldr q9, [x1, #112] // ...e.................................................................... 
- sqrdmulh v20.4S, v21.4S, v3.4S // ..............................................*......................... - // gap // ........................................................................ - sqrdmulh v27.4S, v0.4S, v3.4S // .........................................*.............................. - ldr q12, [x4, #-32] // ......................................*................................. - ldr q3, [x4, #16] // ...................................e.................................... - ldr q11, [x4, #-16] // .......................................*................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v31.2D, v22.2D, v13.2D // .................................*...................................... - ldr q22, [x1, #64] // e....................................................................... - ldr q28, [x3], #8 // .....e.................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mla v18.4S, v20.4S, v29.S[0] // ...............................................*........................ - ldr q30, [x4, #-64] // ....................................*................................... - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v16.4S, v5.4S, v4.S[0] // ......e................................................................. - sqrdmulh v21.4S, v5.4S, v4.S[1] // .......e................................................................ - mul v6.4S, v9.4S, v4.S[0] // ...........e............................................................ - sqrdmulh v8.4S, v9.4S, v4.S[1] // ............e........................................................... - ldr q0, [x4, #-48] // .....................................*.................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mla v23.4S, v27.4S, v29.S[0] // ..........................................*............................. - ldr q14, [x4], #(6*16) // ..................................e..................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- add v5.4S, v31.4S, v18.4S // .................................................*...................... - sub v17.4S, v31.4S, v18.4S // ................................................*....................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - ldr q13, [x1, #80] // .e...................................................................... - mla v16.4S, v21.4S, v29.4S // ........e............................................................... - mla v6.4S, v8.4S, v29.4S // .............e.......................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v2.4S, v5.4S, v0.4S // ...................................................*.................... - sqrdmulh v9.4S, v17.4S, v11.4S // ........................................................*............... - mul v11.4S, v5.4S, v30.4S // ..................................................*..................... - mul v30.4S, v17.4S, v12.4S // .......................................................*................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v0.4S, v15.4S, v23.4S // ...........................................*............................ - add v23.4S, v15.4S, v23.4S // ............................................*........................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v18.4S, v13.4S, v6.4S // ...............e........................................................ - sub v10.4S, v13.4S, v6.4S // ..............e......................................................... - add v17.4S, v22.4S, v16.4S // ..........e............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mla v11.4S, v2.4S, v29.S[0] // ....................................................*................... - mla v30.4S, v9.4S, v29.S[0] // .........................................................*.............. 
- sub v25.4S, v22.4S, v16.4S // .........e.............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v2.4S, v10.4S, v28.S[0] // .....................e.................................................. - sqrdmulh v10.4S, v10.4S, v28.S[1] // ......................e................................................. - mul v19.4S, v18.4S, v4.S[2] // ................e....................................................... - sqrdmulh v9.4S, v18.4S, v4.S[3] // .................e...................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- sub v18.4S, v23.4S, v11.4S // .....................................................*.................. - add v31.4S, v23.4S, v11.4S // ......................................................*................. - sub v22.4S, v0.4S, v30.4S // ..........................................................*............. - add v11.4S, v0.4S, v30.4S // ...........................................................*............ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mla v19.4S, v9.4S, v29.4S // ..................e..................................................... - mla v2.4S, v10.4S, v29.4S // .......................e................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v1.4S, v31.4S, v18.4S // ............................................................*........... - trn2 v21.4S, v31.4S, v18.4S // .............................................................*.......... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - trn2 v12.4S, v11.4S, v22.4S // ...............................................................*........ - trn1 v6.4S, v11.4S, v22.4S // ..............................................................*......... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v7.4S, v25.4S, v2.4S // ........................e............................................... - add v15.4S, v25.4S, v2.4S // .........................e.............................................. - add v11.4S, v17.4S, v19.4S // ....................e................................................... - sub v28.4S, v17.4S, v19.4S // ...................e.................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v25.2D, v1.2D, v6.2D // ..................................................................*..... - trn2 v1.2D, v1.2D, v6.2D // ................................................................*....... 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v9.2D, v21.2D, v12.2D // .................................................................*...... - trn2 v13.4S, v15.4S, v7.4S // .............................e.......................................... - trn1 v24.4S, v15.4S, v7.4S // ............................e........................................... - trn2 v22.4S, v11.4S, v28.4S // ...........................e............................................ - trn1 v4.4S, v11.4S, v28.4S // ..........................e............................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v16.2D, v21.2D, v12.2D // ...................................................................*.... - str q25, [x1], #64 // ....................................................................*... - str q1, [x1, #-32] // ......................................................................*. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - str q9, [x1, #-16] // .......................................................................* - trn2 v21.2D, v22.2D, v13.2D // ...............................e........................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v0.2D, v4.2D, v24.2D // ..............................e......................................... - str q16, [x1, #-48] // .....................................................................*.. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ + // Instructions: 72 + // Expected cycles: 24 + // Expected IPC: 3.00 + // + // Wall time: 12.90s + // User time: 12.90s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + sqrdmulh v25.4S, v6.4S, v8.4S // .............................................*.......................... 
+ trn1 v17.2D, v14.2D, v0.2D // ................................*....................................... + trn1 v13.2D, v21.2D, v24.2D // .................................*...................................... + ldr q21, [x1, #96] // ..e..................................................................... + mul v6.4S, v6.4S, v26.4S // ..............................................*......................... + ldr q9, [x3], #16 // ....e................................................................... + ldr q31, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + trn2 v28.2D, v14.2D, v0.2D // ..............................*......................................... + ldr q22, [x4, #-48] // .....................................*.................................. + ldr q20, [x4, #-32] // ......................................*................................. + ldr q2, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q4, [x4, #-64] // ....................................*................................... + ldr q30, [x4, #-16] // .......................................*................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mla v6.4S, v25.4S, v29.4S // ...............................................*........................ + sqrdmulh v19.4S, v28.4S, v8.4S // ........................................*............................... + mul v28.4S, v28.4S, v26.4S // .........................................*.............................. + ldr q16, [x3], #8 // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v11.4S, v31.4S, v9.S[1] // ...........e............................................................ + mul v5.4S, v31.4S, v9.S[0] // ............e........................................................... + sqrdmulh v0.4S, v21.4S, v9.S[1] // ......e................................................................. + mul v18.4S, v21.4S, v9.S[0] // .......e................................................................ + ldr q15, [x1, #64] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v28.4S, v19.4S, v29.4S // ..........................................*............................. + sub v19.4S, v13.4S, v6.4S // ................................................*....................... + add v6.4S, v13.4S, v6.4S // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v5.4S, v11.4S, v29.4S // .............e.......................................................... + mla v18.4S, v0.4S, v29.4S // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v8.4S, v6.4S, v4.4S // ...................................................*.................... 
+ sqrdmulh v26.4S, v6.4S, v22.4S // ..................................................*..................... + mul v6.4S, v19.4S, v20.4S // ........................................................*............... + sqrdmulh v19.4S, v19.4S, v30.4S // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v22.4S, v15.4S, v18.4S // ..........e............................................................. + add v3.4S, v2.4S, v5.4S // ...............e........................................................ + sub v13.4S, v2.4S, v5.4S // ..............e......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mla v8.4S, v26.4S, v29.4S // ....................................................*................... + mla v6.4S, v19.4S, v29.4S // .........................................................*.............. + add v19.4S, v17.4S, v28.4S // ............................................*........................... + sub v26.4S, v17.4S, v28.4S // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v4.4S, v13.4S, v16.S[1] // .....................e.................................................. + mul v28.4S, v13.4S, v16.S[0] // ......................e................................................. + sqrdmulh v14.4S, v3.4S, v9.S[3] // ................e....................................................... + mul v13.4S, v3.4S, v9.S[2] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v30.4S, v15.4S, v18.4S // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v16.4S, v19.4S, v8.4S // .....................................................*.................. + add v8.4S, v19.4S, v8.4S // ......................................................*................. + add v2.4S, v26.4S, v6.4S // ...........................................................*............ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v19.4S, v26.4S, v6.4S // ..........................................................*............. + mla v28.4S, v4.4S, v29.4S // .......................e................................................ + mla v13.4S, v14.4S, v29.4S // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v26.4S, v8.4S, v16.4S // ............................................................*........... 
+ trn2 v16.4S, v8.4S, v16.4S // .............................................................*.......... + ldr q8, [x4, #16] // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v18.4S, v2.4S, v19.4S // ..............................................................*......... + trn2 v6.4S, v2.4S, v19.4S // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v12.4S, v30.4S, v28.4S // ........................e............................................... + add v28.4S, v30.4S, v28.4S // .........................e.............................................. + add v4.4S, v22.4S, v13.4S // ....................e................................................... + sub v27.4S, v22.4S, v13.4S // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + trn2 v17.2D, v26.2D, v18.2D // ................................................................*....... + trn2 v9.2D, v16.2D, v6.2D // .................................................................*...... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v19.2D, v26.2D, v18.2D // ..................................................................*..... + trn2 v21.4S, v4.4S, v27.4S // ...........................e............................................ + trn1 v14.4S, v4.4S, v27.4S // ..........................e............................................. + trn2 v24.4S, v28.4S, v12.4S // .............................e.......................................... + ldr q26, [x4], #(6*16) // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v2.2D, v16.2D, v6.2D // ...................................................................*.... + trn1 v0.4S, v28.4S, v12.4S // ............................e........................................... + str q17, [x1, #32] // ......................................................................*. 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q19, [x1], #64 // ....................................................................*... + trn2 v6.2D, v21.2D, v24.2D // ...............................e........................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q9, [x1, #-16] // .......................................................................* + str q2, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
- // original source code - // ldr q8, [x1, #(16*0)] // ............e...........................................................|...........e........................................................... - // ldr q9, [x1, #(16*1)] // .........................e..............................................|........................e.............................................. - // ldr q10, [x1, #(16*2)] // e.......................................................................e....................................................................... - // ldr q11, [x1, #(16*3)] // .....e..................................................................|....e.................................................................. - // ldr q0, [x3], #16 // ...e....................................................................|..e.................................................................... - // ldr q1, [x3], #8 // .............e..........................................................|............e.......................................................... - // mul v24.4s, v10.4s, v0.s[0] // ................e.......................................................|...............e....................................................... - // sqrdmulh v10.4s, v10.4s, v0.s[1] // .................e......................................................|................e...................................................... - // mla v24.4s, v10.4s, v29.4s // ..........................e.............................................|.........................e............................................. - // sub v10.4s, v8.4s, v24.4s // .......................................e................................|......................................e................................ - // add v8.4s, v8.4s, v24.4s // ....................................e...................................|...................................e................................... 
- // mul v24.4s, v11.4s, v0.s[0] // ..................e.....................................................|.................e..................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[1] // ...................e....................................................|..................e.................................................... - // mla v24.4s, v11.4s, v29.4s // ...........................e............................................|..........................e............................................ - // sub v11.4s, v9.4s, v24.4s // ...................................e....................................|..................................e.................................... - // add v9.4s, v9.4s, v24.4s // ..................................e.....................................|.................................e..................................... - // mul v24.4s, v9.4s, v0.s[2] // ..........................................e.............................|.........................................e............................. - // sqrdmulh v9.4s, v9.4s, v0.s[3] // ...........................................e............................|..........................................e............................ - // mla v24.4s, v9.4s, v29.4s // ................................................e.......................|...............................................e....................... - // sub v9.4s, v8.4s, v24.4s // .........................................................e..............|........................................................e.............. - // add v8.4s, v8.4s, v24.4s // ........................................................e...............|.......................................................e............... - // mul v24.4s, v11.4s, v1.s[0] // ........................................e...............................|.......................................e............................... 
- // sqrdmulh v11.4s, v11.4s, v1.s[1] // .........................................e..............................|........................................e.............................. - // mla v24.4s, v11.4s, v29.4s // .................................................e......................|................................................e...................... - // sub v11.4s, v10.4s, v24.4s // ......................................................e.................|.....................................................e................. - // add v10.4s, v10.4s, v24.4s // .......................................................e................|......................................................e................ - // trn1 v25.4s, v8.4s, v9.4s // ................................................................e.......|...............................................................e....... - // trn2 v26.4s, v8.4s, v9.4s // ...............................................................e........|..............................................................e........ - // trn1 v27.4s, v10.4s, v11.4s // ..............................................................e.........|.............................................................e......... - // trn2 v28.4s, v10.4s, v11.4s // .............................................................e..........|............................................................e.......... - // trn2 v10.2d, v25.2d, v27.2d // ......................................................................e.|.....................................................................e. - // trn2 v11.2d, v26.2d, v28.2d // .....................................................................e..|....................................................................e.. - // trn1 v8.2d, v25.2d, v27.2d // ..*.....................................................................|.*..................................................................... 
- // trn1 v9.2d, v26.2d, v28.2d // ...........*............................................................|..........*............................................................ - // ldr q0, [x4], #(6*16) // ......................e.................................................|.....................e................................................. - // ldr q4, [x4, #(-6*16 + 1*16)] // .........e..............................................................|........e.............................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ...............*........................................................|..............*........................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ....................*...................................................|...................*................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ........*...............................................................|.......*............................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..........*.............................................................|.........*............................................................. - // mul v24.4s, v10.4s, v0.4s // .*......................................................................|*...................................................................... - // sqrdmulh v10.4s, v10.4s, v4.4s // .......*................................................................|......*................................................................ - // mla v24.4s, v10.4s, v29.s[0] // .....................*..................................................|....................*.................................................. - // sub v10.4s, v8.4s, v24.4s // ................................*.......................................|...............................*....................................... 
- // add v8.4s, v8.4s, v24.4s // .................................*......................................|................................*...................................... - // mul v24.4s, v11.4s, v0.4s // ....*...................................................................|...*................................................................... - // sqrdmulh v11.4s, v11.4s, v4.4s // ......*.................................................................|.....*................................................................. - // mla v24.4s, v11.4s, v29.s[0] // ..............*.........................................................|.............*......................................................... - // sub v11.4s, v9.4s, v24.4s // ........................*...............................................|.......................*............................................... - // add v9.4s, v9.4s, v24.4s // .......................*................................................|......................*................................................ - // mul v24.4s, v9.4s, v1.4s // ..............................*.........................................|.............................*......................................... - // sqrdmulh v9.4s, v9.4s, v5.4s // ............................*...........................................|...........................*........................................... - // mla v24.4s, v9.4s, v29.s[0] // .....................................*..................................|....................................*.................................. - // sub v9.4s, v8.4s, v24.4s // ............................................*...........................|...........................................*........................... - // add v8.4s, v8.4s, v24.4s // .............................................*..........................|............................................*.......................... 
- // mul v24.4s, v11.4s, v2.4s // ...............................*........................................|..............................*........................................ - // sqrdmulh v11.4s, v11.4s, v6.4s // .............................*..........................................|............................*.......................................... - // mla v24.4s, v11.4s, v29.s[0] // ......................................*.................................|.....................................*................................. - // sub v11.4s, v10.4s, v24.4s // ..............................................*.........................|.............................................*......................... - // add v10.4s, v10.4s, v24.4s // ...............................................*........................|..............................................*........................ - // trn1 v25.4s, v8.4s, v9.4s // ..................................................*.....................|.................................................*..................... - // trn2 v26.4s, v8.4s, v9.4s // ...................................................*....................|..................................................*.................... - // trn1 v27.4s, v10.4s, v11.4s // .....................................................*..................|....................................................*.................. - // trn2 v28.4s, v10.4s, v11.4s // ....................................................*...................|...................................................*................... - // trn2 v10.2d, v25.2d, v27.2d // ...........................................................*............|..........................................................*............ - // trn2 v11.2d, v26.2d, v28.2d // ............................................................*...........|...........................................................*........... 
- // trn1 v8.2d, v25.2d, v27.2d // ..........................................................*.............|.........................................................*............. - // trn1 v9.2d, v26.2d, v28.2d // .................................................................*......|................................................................*...... - // str q8, [x1], #64 // ..................................................................*.....|.................................................................*..... - // str q9, [x1, #(-(64) + 16*1)] // .......................................................................*|......................................................................* - // str q10, [x1, #(-(64) + 16*2)] // ...................................................................*....|..................................................................*.... - // str q11, [x1, #(-(64) + 16*3)] // ....................................................................*...|...................................................................*... + // --------------------------------------------------------------- new position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // ..................e..................................................'....................~.................................................. + // ldr q9, [x1, #(16*1)] // .......e.............................................................'.........~............................................................. + // ldr q10, [x1, #(16*2)] // e....................................................................'..~.................................................................... 
+ // ldr q11, [x1, #(16*3)] // ...e.................................................................'.....~................................................................. + // ldr q0, [x3], #16 // ..e..................................................................'....~.................................................................. + // ldr q1, [x3], #8 // .............e.......................................................'...............~....................................................... + // sqrdmulh v27.4s, v10.4s, v0.s[1] // ................e....................................................'..................~.................................................... + // mul v24.4s, v10.4s, v0.s[0] // .................e...................................................'...................~................................................... + // mla v24.4s, v27.4s, v29.4s // .......................e.............................................'.........................~............................................. + // sub v10.4s, v8.4s, v24.4s // .......................................e.............................'.........................................~............................. + // add v8.4s, v8.4s, v24.4s // ............................e........................................'..............................~........................................ + // sqrdmulh v27.4s, v11.4s, v0.s[1] // ..............e......................................................'................~...................................................... + // mul v24.4s, v11.4s, v0.s[0] // ...............e.....................................................'.................~..................................................... + // mla v24.4s, v27.4s, v29.4s // ......................e..............................................'........................~.............................................. 
+ // sub v11.4s, v9.4s, v24.4s // ..............................e......................................'................................~...................................... + // add v9.4s, v9.4s, v24.4s // .............................e.......................................'...............................~....................................... + // sqrdmulh v27.4s, v9.4s, v0.s[3] // .....................................e...............................'.......................................~............................... + // mul v24.4s, v9.4s, v0.s[2] // ......................................e..............................'........................................~.............................. + // mla v24.4s, v27.4s, v29.4s // .............................................e.......................'...............................................~....................... + // sub v9.4s, v8.4s, v24.4s // ......................................................e..............'........................................................~.............. + // add v8.4s, v8.4s, v24.4s // .....................................................e...............'.......................................................~............... + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ...................................e.................................'.....................................~................................. + // mul v24.4s, v11.4s, v1.s[0] // ....................................e................................'......................................~................................ + // mla v24.4s, v27.4s, v29.4s // ............................................e........................'..............................................~........................ + // sub v11.4s, v10.4s, v24.4s // ...................................................e.................'.....................................................~................. 
+ // add v10.4s, v10.4s, v24.4s // ....................................................e................'......................................................~................ + // trn1 v25.4s, v8.4s, v9.4s // ...........................................................e.........'.............................................................~......... + // trn2 v26.4s, v8.4s, v9.4s // ..........................................................e..........'............................................................~.......... + // trn1 v27.4s, v10.4s, v11.4s // ...............................................................e.....'.................................................................~..... + // trn2 v28.4s, v10.4s, v11.4s // ............................................................e........'..............................................................~........ + // trn2 v10.2d, v25.2d, v27.2d // ....~................................................................'......*................................................................ + // trn2 v11.2d, v26.2d, v28.2d // ..................................................................e..'....................................................................~.. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................'*...................................................................... + // trn1 v9.2d, v26.2d, v28.2d // .....................................................................'.*..................................................................... + // ldr q0, [ x4], #(6*16) // .............................................................e.......'...............................................................~....... + // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................e....................'..................................................~.................... 
+ // ldr q1, [ x4, #(-6*16 + 2*16)] // ........~............................................................'..........*............................................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // .....~...............................................................'.......*............................................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ......~..............................................................'........*.............................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // .........~...........................................................'...........*........................................................... + // sqrdmulh v27.4s, v10.4s, v4.4s // ...........~.........................................................'.............*......................................................... + // mul v24.4s, v10.4s, v0.4s // ............~........................................................'..............*........................................................ + // mla v24.4s, v27.4s, v29.4s // ...................~.................................................'.....................*................................................. + // sub v10.4s, v8.4s, v24.4s // ..................................~..................................'....................................*.................................. + // add v8.4s, v8.4s, v24.4s // .................................~...................................'...................................*................................... + // sqrdmulh v27.4s, v11.4s, v4.4s // .....................................................................*....................................................................... + // mul v24.4s, v11.4s, v0.4s // .~...................................................................'...*................................................................... 
+ // mla v24.4s, v27.4s, v29.4s // ..........~..........................................................'............*.......................................................... + // sub v11.4s, v9.4s, v24.4s // ....................~................................................'......................*................................................ + // add v9.4s, v9.4s, v24.4s // .....................~...............................................'.......................*............................................... + // sqrdmulh v27.4s, v9.4s, v5.4s // .........................~...........................................'...........................*........................................... + // mul v24.4s, v9.4s, v1.4s // ........................~............................................'..........................*............................................ + // mla v24.4s, v27.4s, v29.4s // ...............................~.....................................'.................................*..................................... + // sub v9.4s, v8.4s, v24.4s // ........................................~............................'..........................................*............................ + // add v8.4s, v8.4s, v24.4s // .........................................~...........................'...........................................*........................... + // sqrdmulh v27.4s, v11.4s, v6.4s // ...........................~.........................................'.............................*......................................... + // mul v24.4s, v11.4s, v2.4s // ..........................~..........................................'............................*.......................................... + // mla v24.4s, v27.4s, v29.4s // ................................~....................................'..................................*.................................... 
+ // sub v11.4s, v10.4s, v24.4s // ...........................................~.........................'.............................................*......................... + // add v10.4s, v10.4s, v24.4s // ..........................................~..........................'............................................*.......................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................................~......................'................................................*...................... + // trn2 v26.4s, v8.4s, v9.4s // ...............................................~.....................'.................................................*..................... + // trn1 v27.4s, v10.4s, v11.4s // .................................................~...................'...................................................*................... + // trn2 v28.4s, v10.4s, v11.4s // ..................................................~..................'....................................................*.................. + // trn2 v10.2d, v25.2d, v27.2d // .......................................................~.............'.........................................................*............. + // trn2 v11.2d, v26.2d, v28.2d // ........................................................~............'..........................................................*............ + // trn1 v8.2d, v25.2d, v27.2d // .........................................................~...........'...........................................................*........... + // trn1 v9.2d, v26.2d, v28.2d // ..............................................................~......'................................................................*...... + // str q8, [x1], #64 // .................................................................~...'...................................................................*... 
+ // str q9, [x1, #(-(64) + 16*1)] // ....................................................................~'......................................................................* + // str q10, [x1, #(-(64) + 16*2)] // ................................................................~....'..................................................................*.... + // str q11, [x1, #(-(64) + 16*3)] // ...................................................................~.'.....................................................................*. sub count, count, #1 cbnz count, layer5678_start - mul v30.4S, v0.4S, v14.4S // *..................................... - sqrdmulh v5.4S, v0.4S, v3.4S // ....*................................. - sqrdmulh v0.4S, v21.4S, v3.4S // ...*.................................. - mul v31.4S, v21.4S, v14.4S // ..*................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v18.2D, v22.2D, v13.2D // .......*.............................. - ldr q21, [x4, #-32] // .....*................................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - ldr q27, [x4, #-16] // ......*............................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- mla v31.4S, v0.4S, v29.S[0] // ........*............................. - ldr q3, [x4, #-48] // ..........*........................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - ldr q22, [x4, #-64] // .........*............................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mla v30.4S, v5.4S, v29.S[0] // ...........*.......................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v20.4S, v18.4S, v31.4S // .............*........................ - add v6.4S, v18.4S, v31.4S // ............*......................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v18.2D, v4.2D, v24.2D // .*.................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v31.4S, v6.4S, v3.4S // ..............*....................... - sqrdmulh v12.4S, v20.4S, v27.4S // ...............*...................... - mul v3.4S, v6.4S, v22.4S // ................*..................... - mul v26.4S, v20.4S, v21.4S // .................*.................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v6.4S, v18.4S, v30.4S // ..................*................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - add v5.4S, v18.4S, v30.4S // ...................*.................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mla v3.4S, v31.4S, v29.S[0] // ....................*................. - mla v26.4S, v12.4S, v29.S[0] // .....................*................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v23.4S, v6.4S, v26.4S // ........................*............. - add v31.4S, v6.4S, v26.4S // .........................*............ - add v27.4S, v5.4S, v3.4S // .......................*.............. - sub v30.4S, v5.4S, v3.4S // ......................*............... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn2 v20.4S, v31.4S, v23.4S // ............................*......... - trn1 v21.4S, v31.4S, v23.4S // .............................*........ - trn2 v1.4S, v27.4S, v30.4S // ...........................*.......... - trn1 v28.4S, v27.4S, v30.4S // ..........................*........... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v24.2D, v28.2D, v21.2D // ..............................*....... - trn2 v16.2D, v1.2D, v20.2D // ................................*..... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v23.2D, v1.2D, v20.2D // .................................*.... - trn2 v11.2D, v28.2D, v21.2D // ...............................*...... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q24, [x1], #64 // ..................................*... - str q16, [x1, #-16] // ....................................*. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- str q23, [x1, #-48] // .....................................* - str q11, [x1, #-32] // ...................................*.. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... + // Instructions: 39 + // Expected cycles: 22 + // Expected IPC: 1.77 + // + // Wall time: 0.93s + // User time: 0.93s + // + // --------- original position ----------> + // 0 25 + // |------------------------|------------- + sqrdmulh v28.4S, v6.4S, v8.4S // *...................................... + trn1 v16.2D, v14.2D, v0.2D // .*..................................... + mul v10.4S, v6.4S, v26.4S // ...*................................... + trn2 v6.2D, v14.2D, v0.2D // ....*.................................. + ldr q4, [x4, #-48] // .....*................................. + ldr q22, [x4, #-32] // ......*................................ + // gap // ....................................... + // gap // ....................................... + trn1 v13.2D, v21.2D, v24.2D // ..*.................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + sqrdmulh v19.4S, v6.4S, v8.4S // ..........*............................ + ldr q8, [x4, #-16] // ........*.............................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + mla v10.4S, v28.4S, v29.4S // .........*............................. + mul v28.4S, v6.4S, v26.4S // ...........*........................... + ldr q26, [x4, #-64] // .......*............................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v6.4S, v13.4S, v10.4S // ..............*........................ + sub v30.4S, v13.4S, v10.4S // .............*......................... + mla v28.4S, v19.4S, v29.4S // ............*.......................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + sqrdmulh v19.4S, v6.4S, v4.4S // ................*...................... + mul v25.4S, v6.4S, v26.4S // ...............*....................... + sqrdmulh v6.4S, v30.4S, v8.4S // ..................*.................... + mul v8.4S, v30.4S, v22.4S // .................*..................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v26.4S, v16.4S, v28.4S // .....................*................. + sub v2.4S, v16.4S, v28.4S // ......................*................ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mla v8.4S, v6.4S, v29.4S // ....................*.................. + mla v25.4S, v19.4S, v29.4S // ...................*................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v19.4S, v2.4S, v8.4S // .........................*............. + sub v6.4S, v2.4S, v8.4S // ..........................*............ + add v8.4S, v26.4S, v25.4S // ........................*.............. + sub v26.4S, v26.4S, v25.4S // .......................*............... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + trn1 v7.4S, v8.4S, v26.4S // ...........................*........... 
+ trn2 v8.4S, v8.4S, v26.4S // ............................*.......... + trn1 v18.4S, v19.4S, v6.4S // .............................*......... + trn2 v6.4S, v19.4S, v6.4S // ..............................*........ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + trn1 v16.2D, v8.2D, v6.2D // ..................................*.... + trn2 v8.2D, v8.2D, v6.2D // ................................*...... + trn2 v26.2D, v7.2D, v18.2D // ...............................*....... + trn1 v19.2D, v7.2D, v18.2D // .................................*..... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + str q16, [x1, #16] // ......................................* + str q8, [x1, #48] // .....................................*. + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + str q26, [x1, #32] // ...................................*... + str q19, [x1], #64 // ....................................*.. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... - // original source code - // mul v23.4S, v0.4S, v14.4S // *..................................... - // trn1 v15.2D, v4.2D, v24.2D // .............*........................ - // mul v18.4S, v21.4S, v14.4S // ...*.................................. - // sqrdmulh v20.4S, v21.4S, v3.4S // ..*................................... - // sqrdmulh v27.4S, v0.4S, v3.4S // .*.................................... - // ldr q12, [x4, #-32] // .....*................................ - // ldr q11, [x4, #-16] // ......*............................... - // trn1 v31.2D, v22.2D, v13.2D // ....*................................. - // mla v18.4S, v20.4S, v29.S[0] // .......*.............................. - // ldr q30, [x4, #-64] // .........*............................ - // ldr q0, [x4, #-48] // ........*............................. - // mla v23.4S, v27.4S, v29.S[0] // ..........*........................... - // add v5.4S, v31.4S, v18.4S // ............*......................... - // sub v17.4S, v31.4S, v18.4S // ...........*.......................... - // sqrdmulh v2.4S, v5.4S, v0.4S // ..............*....................... - // sqrdmulh v9.4S, v17.4S, v11.4S // ...............*...................... - // mul v11.4S, v5.4S, v30.4S // ................*..................... - // mul v30.4S, v17.4S, v12.4S // .................*.................... 
- // sub v0.4S, v15.4S, v23.4S // ..................*................... - // add v23.4S, v15.4S, v23.4S // ...................*.................. - // mla v11.4S, v2.4S, v29.S[0] // ....................*................. - // mla v30.4S, v9.4S, v29.S[0] // .....................*................ - // sub v18.4S, v23.4S, v11.4S // .........................*............ - // add v31.4S, v23.4S, v11.4S // ........................*............. - // sub v22.4S, v0.4S, v30.4S // ......................*............... - // add v11.4S, v0.4S, v30.4S // .......................*.............. - // trn1 v1.4S, v31.4S, v18.4S // .............................*........ - // trn2 v21.4S, v31.4S, v18.4S // ............................*......... - // trn2 v12.4S, v11.4S, v22.4S // ..........................*........... - // trn1 v6.4S, v11.4S, v22.4S // ...........................*.......... - // trn1 v25.2D, v1.2D, v6.2D // ..............................*....... - // trn2 v1.2D, v1.2D, v6.2D // .................................*.... - // trn2 v9.2D, v21.2D, v12.2D // ...............................*...... - // trn1 v16.2D, v21.2D, v12.2D // ................................*..... - // str q25, [x1], #64 // ..................................*... - // str q1, [x1, #-32] // .....................................* - // str q9, [x1, #-16] // ...................................*.. - // str q16, [x1, #-48] // ....................................*. + // ------------ new position ------------> + // 0 25 + // |------------------------|------------- + // sqrdmulh v25.4S, v6.4S, v8.4S // *...................................... + // trn1 v17.2D, v14.2D, v0.2D // .*..................................... + // trn1 v13.2D, v21.2D, v24.2D // ......*................................ + // mul v6.4S, v6.4S, v26.4S // ..*.................................... + // trn2 v28.2D, v14.2D, v0.2D // ...*................................... + // ldr q22, [x4, #-48] // ....*.................................. 
+ // ldr q20, [x4, #-32] // .....*................................. + // ldr q4, [x4, #-64] // ...........*........................... + // ldr q30, [x4, #-16] // ........*.............................. + // mla v6.4S, v25.4S, v29.4S // .........*............................. + // sqrdmulh v19.4S, v28.4S, v8.4S // .......*............................... + // mul v28.4S, v28.4S, v26.4S // ..........*............................ + // mla v28.4S, v19.4S, v29.4S // ..............*........................ + // sub v19.4S, v13.4S, v6.4S // .............*......................... + // add v6.4S, v13.4S, v6.4S // ............*.......................... + // mul v8.4S, v6.4S, v4.4S // ................*...................... + // sqrdmulh v26.4S, v6.4S, v22.4S // ...............*....................... + // mul v6.4S, v19.4S, v20.4S // ..................*.................... + // sqrdmulh v19.4S, v19.4S, v30.4S // .................*..................... + // mla v8.4S, v26.4S, v29.4S // ......................*................ + // mla v6.4S, v19.4S, v29.4S // .....................*................. + // add v19.4S, v17.4S, v28.4S // ...................*................... + // sub v26.4S, v17.4S, v28.4S // ....................*.................. + // sub v16.4S, v19.4S, v8.4S // ..........................*............ + // add v8.4S, v19.4S, v8.4S // .........................*............. + // add v2.4S, v26.4S, v6.4S // .......................*............... + // sub v19.4S, v26.4S, v6.4S // ........................*.............. + // trn1 v26.4S, v8.4S, v16.4S // ...........................*........... + // trn2 v16.4S, v8.4S, v16.4S // ............................*.......... + // trn1 v18.4S, v2.4S, v19.4S // .............................*......... + // trn2 v6.4S, v2.4S, v19.4S // ..............................*........ + // trn2 v17.2D, v26.2D, v18.2D // .................................*..... + // trn2 v9.2D, v16.2D, v6.2D // ................................*...... 
+ // trn1 v19.2D, v26.2D, v18.2D // ..................................*.... + // trn1 v2.2D, v16.2D, v6.2D // ...............................*....... + // str q17, [x1, #32] // .....................................*. + // str q19, [x1], #64 // ......................................* + // str q9, [x1, #-16] // ....................................*.. + // str q2, [x1, #-48] // ...................................*... pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm.s index d0d34aa1..c473cc3e 100644 --- a/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -67,15 +46,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +63,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -97,38 +70,38 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, 
[\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data0, data1, data2, data3 @@ -143,7 +116,7 @@ trn1 \data1\().2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -154,7 +127,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -164,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -172,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -183,19 
+156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -296,589 +269,598 @@ _ntt_dilithium_1234_5678_manual_st4_opt_m1_icestorm: load_roots_1234 .p2align 2 - ldr q8, [x0, #576] // .........*...................................................................................................................................................................................... - ldr q24, [x0, #960] // ...............*................................................................................................................................................................................ - ldr q28, [x0, #64] // .*.............................................................................................................................................................................................. - ldr q25, [x0, #832] // .............*.................................................................................................................................................................................. - ldr q27, [x0, #704] // ...........*.................................................................................................................................................................................... - ldr q14, [x0, #640] // ..........*..................................................................................................................................................................................... 
- sqrdmulh v15.4S, v24.4S, v0.S[1] // ....................................................*........................................................................................................................................... - mul v23.4S, v24.4S, v0.S[0] // ...................................................*............................................................................................................................................ - ldr q24, [x0, #896] // ..............*................................................................................................................................................................................. - sqrdmulh v17.4S, v25.4S, v0.S[1] // ..........................................*..................................................................................................................................................... - mul v20.4S, v25.4S, v0.S[0] // .........................................*...................................................................................................................................................... - mul v9.4S, v27.4S, v0.S[0] // ...............................*................................................................................................................................................................ - sqrdmulh v31.4S, v8.4S, v0.S[1] // ......................*......................................................................................................................................................................... - ldr q12, [x0, #448] // .......*........................................................................................................................................................................................ 
- sqrdmulh v11.4S, v27.4S, v0.S[1] // ................................*............................................................................................................................................................... - ldr q19, [x0, #768] // ............*................................................................................................................................................................................... - mla v23.4S, v15.4S, v29.4S // .....................................................*.......................................................................................................................................... - ldr q30, [x0, #320] // .....*.......................................................................................................................................................................................... - mul v16.4S, v24.4S, v0.S[0] // ..............................................*................................................................................................................................................. - mul v10.4S, v8.4S, v0.S[0] // .....................*.......................................................................................................................................................................... + ldr q23, [x0, #896] // ..............*................................................................................................................................................................................. + ldr q9, [x0, #960] // ...............*................................................................................................................................................................................ ldr q21, [x0, #192] // ...*............................................................................................................................................................................................ 
- ldr q13, [x0, #512] // ........*....................................................................................................................................................................................... - sub v27.4S, v12.4S, v23.4S // ......................................................*......................................................................................................................................... - mla v20.4S, v17.4S, v29.4S // ...........................................*.................................................................................................................................................... - mul v17.4S, v19.4S, v0.S[0] // ....................................*........................................................................................................................................................... - sqrdmulh v15.4S, v19.4S, v0.S[1] // .....................................*.......................................................................................................................................................... - sqrdmulh v18.4S, v27.4S, v1.S[1] // ............................................................................................*................................................................................................... - mul v19.4S, v27.4S, v1.S[0] // ...........................................................................................*.................................................................................................... - mla v10.4S, v31.4S, v29.4S // .......................*........................................................................................................................................................................ 
- mla v9.4S, v11.4S, v29.4S // .................................*.............................................................................................................................................................. + ldr q25, [x0, #768] // ............*................................................................................................................................................................................... + ldr q27, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q19, [x0, #64] // .*.............................................................................................................................................................................................. + ldr q22, [x0, #704] // ...........*.................................................................................................................................................................................... + sqrdmulh v17.4S, v9.4S, v0.S[1] // ...................................................*............................................................................................................................................ + ldr q16, [x0, #384] // ......*......................................................................................................................................................................................... + mul v9.4S, v9.4S, v0.S[0] // ....................................................*........................................................................................................................................... + sqrdmulh v10.4S, v23.4S, v0.S[1] // ..............................................*................................................................................................................................................. 
+ mul v28.4S, v23.4S, v0.S[0] // ...............................................*................................................................................................................................................ ldr q11, [x0, #256] // ....*........................................................................................................................................................................................... - mul v8.4S, v13.4S, v0.S[0] // ................*............................................................................................................................................................................... - mla v17.4S, v15.4S, v29.4S // ......................................*......................................................................................................................................................... - sqrdmulh v15.4S, v13.4S, v0.S[1] // .................*.............................................................................................................................................................................. - add v22.4S, v28.4S, v10.4S // .........................*...................................................................................................................................................................... - mla v19.4S, v18.4S, v29.4S // .............................................................................................*.................................................................................................. - sub v26.4S, v21.4S, v9.4S // ..................................*............................................................................................................................................................. 
- add v9.4S, v21.4S, v9.4S // ...................................*............................................................................................................................................................ - ldr q27, [x0, #384] // ......*......................................................................................................................................................................................... - sub v28.4S, v28.4S, v10.4S // ........................*....................................................................................................................................................................... - sqrdmulh v10.4S, v24.4S, v0.S[1] // ...............................................*................................................................................................................................................ - add v24.4S, v12.4S, v23.4S // .......................................................*........................................................................................................................................ - sub v23.4S, v30.4S, v20.4S // ............................................*................................................................................................................................................... - add v18.4S, v30.4S, v20.4S // .............................................*.................................................................................................................................................. - sub v30.4S, v11.4S, v17.4S // .......................................*........................................................................................................................................................ 
- mul v12.4S, v14.4S, v0.S[0] // ..........................*..................................................................................................................................................................... - mla v16.4S, v10.4S, v29.4S // ................................................*............................................................................................................................................... - sqrdmulh v25.4S, v24.4S, v0.S[3] // ........................................................................*....................................................................................................................... - mul v20.4S, v24.4S, v0.S[2] // .......................................................................*........................................................................................................................ - sqrdmulh v31.4S, v18.4S, v0.S[3] // ..............................................................*................................................................................................................................. - sqrdmulh v21.4S, v14.4S, v0.S[1] // ...........................*.................................................................................................................................................................... - mul v14.4S, v18.4S, v0.S[2] // .............................................................*.................................................................................................................................. - add v13.4S, v27.4S, v16.4S // ..................................................*............................................................................................................................................. 
- add v24.4S, v11.4S, v17.4S // ........................................*....................................................................................................................................................... - sub v17.4S, v26.4S, v19.4S // ..............................................................................................*................................................................................................. - mla v20.4S, v25.4S, v29.4S // .........................................................................*...................................................................................................................... - add v26.4S, v26.4S, v19.4S // ...............................................................................................*................................................................................................ - mla v14.4S, v31.4S, v29.4S // ...............................................................*................................................................................................................................ - add v18.4S, v9.4S, v20.4S // ...........................................................................*.................................................................................................................... - ldr q19, [x0, #128] // ..*............................................................................................................................................................................................. - sub v20.4S, v9.4S, v20.4S // ..........................................................................*..................................................................................................................... 
- sqrdmulh v10.4S, v23.4S, v1.S[1] // ..................................................................................*............................................................................................................. - mla v12.4S, v21.4S, v29.4S // ............................*................................................................................................................................................................... - sqrdmulh v9.4S, v20.4S, v2.S[1] // ................................................................................................................*............................................................................... - mul v31.4S, v20.4S, v2.S[0] // ...............................................................................................................*................................................................................ - mul v11.4S, v17.4S, v3.S[0] // ...................................................................................................................................*............................................................ - sub v20.4S, v27.4S, v16.4S // .................................................*.............................................................................................................................................. - sqrdmulh v17.4S, v17.4S, v3.S[1] // ....................................................................................................................................*........................................................... - mul v16.4S, v23.4S, v1.S[0] // .................................................................................*.............................................................................................................. 
- mla v31.4S, v9.4S, v29.4S // .................................................................................................................*.............................................................................. - mul v9.4S, v18.4S, v1.S[2] // .....................................................................................................*.......................................................................................... - sqrdmulh v21.4S, v18.4S, v1.S[3] // ......................................................................................................*......................................................................................... - mla v16.4S, v10.4S, v29.4S // ...................................................................................*............................................................................................................ - add v10.4S, v22.4S, v14.4S // .................................................................*.............................................................................................................................. - sub v27.4S, v22.4S, v14.4S // ................................................................*............................................................................................................................... - sqrdmulh v22.4S, v20.4S, v1.S[1] // .......................................................................................*........................................................................................................ - mla v9.4S, v21.4S, v29.4S // .......................................................................................................*........................................................................................ 
- mul v18.4S, v13.4S, v0.S[2] // ..................................................................*............................................................................................................................. - sqrdmulh v25.4S, v13.4S, v0.S[3] // ...................................................................*............................................................................................................................ - add v21.4S, v10.4S, v9.4S // .........................................................................................................*...................................................................................... - sub v14.4S, v10.4S, v9.4S // ........................................................................................................*....................................................................................... - ldr q9, [x0, #0] // *............................................................................................................................................................................................... - add v10.4S, v28.4S, v16.4S // .....................................................................................*.......................................................................................................... - mla v8.4S, v15.4S, v29.4S // ..................*............................................................................................................................................................................. - sub v23.4S, v28.4S, v16.4S // ....................................................................................*........................................................................................................... 
- mla v18.4S, v25.4S, v29.4S // ....................................................................*........................................................................................................................... - mul v13.4S, v20.4S, v1.S[0] // ......................................................................................*......................................................................................................... - add v25.4S, v19.4S, v12.4S // ..............................*................................................................................................................................................................. - sub v20.4S, v9.4S, v8.4S // ...................*............................................................................................................................................................................ - add v15.4S, v9.4S, v8.4S // ....................*........................................................................................................................................................................... - sub v28.4S, v25.4S, v18.4S // .....................................................................*.......................................................................................................................... - add v25.4S, v25.4S, v18.4S // ......................................................................*......................................................................................................................... - mla v13.4S, v22.4S, v29.4S // ........................................................................................*....................................................................................................... 
- mla v11.4S, v17.4S, v29.4S // .....................................................................................................................................*.......................................................... - mul v17.4S, v25.4S, v1.S[2] // ................................................................................................*............................................................................................... - sqrdmulh v18.4S, v25.4S, v1.S[3] // .................................................................................................*.............................................................................................. + ldr q15, [x0, #128] // ..*............................................................................................................................................................................................. + mul v23.4S, v27.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sqrdmulh v18.4S, v25.4S, v0.S[1] // ....................................*........................................................................................................................................................... + ldr q13, [x0, #640] // ..........*..................................................................................................................................................................................... + mla v9.4S, v17.4S, v29.4S // .....................................................*.......................................................................................................................................... 
+ sqrdmulh v12.4S, v27.4S, v0.S[1] // .....................*.......................................................................................................................................................................... + ldr q27, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q26, [x0, #832] // .............*.................................................................................................................................................................................. + mla v28.4S, v10.4S, v29.4S // ................................................*............................................................................................................................................... + mul v24.4S, v22.4S, v0.S[0] // ................................*............................................................................................................................................................... + sqrdmulh v10.4S, v22.4S, v0.S[1] // ...............................*................................................................................................................................................................ + ldr q31, [x0, #320] // .....*.......................................................................................................................................................................................... + sqrdmulh v14.4S, v13.4S, v0.S[1] // ..........................*..................................................................................................................................................................... 
+ mul v17.4S, v13.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + mla v23.4S, v12.4S, v29.4S // .......................*........................................................................................................................................................................ + mul v8.4S, v25.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + ldr q25, [x0, #448] // .......*........................................................................................................................................................................................ + add v30.4S, v16.4S, v28.4S // ..................................................*............................................................................................................................................. + mla v24.4S, v10.4S, v29.4S // .................................*.............................................................................................................................................................. + sqrdmulh v22.4S, v26.4S, v0.S[1] // .........................................*...................................................................................................................................................... + mul v26.4S, v26.4S, v0.S[0] // ..........................................*..................................................................................................................................................... 
+ sqrdmulh v13.4S, v30.4S, v0.S[3] // ..................................................................*............................................................................................................................. + sub v20.4S, v16.4S, v28.4S // .................................................*.............................................................................................................................................. + mul v10.4S, v30.4S, v0.S[2] // ...................................................................*............................................................................................................................ + sub v28.4S, v25.4S, v9.4S // ......................................................*......................................................................................................................................... + mul v12.4S, v20.4S, v1.S[0] // .......................................................................................*........................................................................................................ + add v25.4S, v25.4S, v9.4S // .......................................................*........................................................................................................................................ + mla v8.4S, v18.4S, v29.4S // ......................................*......................................................................................................................................................... + sqrdmulh v9.4S, v20.4S, v1.S[1] // ......................................................................................*......................................................................................................... 
+ sub v20.4S, v21.4S, v24.4S // ..................................*............................................................................................................................................................. + mla v10.4S, v13.4S, v29.4S // ....................................................................*........................................................................................................................... + sqrdmulh v13.4S, v25.4S, v0.S[3] // .......................................................................*........................................................................................................................ + mla v26.4S, v22.4S, v29.4S // ...........................................*.................................................................................................................................................... + sub v16.4S, v11.4S, v8.4S // .......................................*........................................................................................................................................................ + mul v25.4S, v25.4S, v0.S[2] // ........................................................................*....................................................................................................................... + mul v30.4S, v27.4S, v0.S[0] // .................*.............................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v0.S[1] // ................*............................................................................................................................................................................... 
+ add v18.4S, v31.4S, v26.4S // .............................................*.................................................................................................................................................. + add v8.4S, v11.4S, v8.4S // ........................................*....................................................................................................................................................... + mla v25.4S, v13.4S, v29.4S // .........................................................................*...................................................................................................................... + sub v13.4S, v31.4S, v26.4S // ............................................*................................................................................................................................................... + mul v11.4S, v18.4S, v0.S[2] // ..............................................................*................................................................................................................................. + mla v30.4S, v27.4S, v29.4S // ..................*............................................................................................................................................................................. + add v24.4S, v21.4S, v24.4S // ...................................*............................................................................................................................................................ + sqrdmulh v31.4S, v18.4S, v0.S[3] // .............................................................*.................................................................................................................................. 
+ mul v26.4S, v28.4S, v1.S[0] // ............................................................................................*................................................................................................... + sqrdmulh v22.4S, v28.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + ldr q28, [x0, #0] // *............................................................................................................................................................................................... + add v27.4S, v24.4S, v25.4S // ...........................................................................*.................................................................................................................... + sqrdmulh v21.4S, v16.4S, v1.S[1] // ............................................................................*................................................................................................................... + mul v18.4S, v13.4S, v1.S[0] // ..................................................................................*............................................................................................................. + mul v16.4S, v16.4S, v1.S[0] // .............................................................................*.................................................................................................................. + sqrdmulh v13.4S, v13.4S, v1.S[1] // .................................................................................*.............................................................................................................. 
+ mla v26.4S, v22.4S, v29.4S // .............................................................................................*.................................................................................................. + add v22.4S, v28.4S, v30.4S // ....................*........................................................................................................................................................................... + sub v28.4S, v28.4S, v30.4S // ...................*............................................................................................................................................................................ + mla v12.4S, v9.4S, v29.4S // ........................................................................................*....................................................................................................... + mla v16.4S, v21.4S, v29.4S // ..............................................................................*................................................................................................................. + mla v11.4S, v31.4S, v29.4S // ...............................................................*................................................................................................................................ + sub v31.4S, v20.4S, v26.4S // ..............................................................................................*................................................................................................. + mla v18.4S, v13.4S, v29.4S // ...................................................................................*............................................................................................................ 
+ add v9.4S, v20.4S, v26.4S // ...............................................................................................*................................................................................................ + mul v26.4S, v31.4S, v3.S[0] // ....................................................................................................................................*........................................................... + sqrdmulh v30.4S, v31.4S, v3.S[1] // ...................................................................................................................................*............................................................ + mul v21.4S, v27.4S, v1.S[2] // ......................................................................................................*......................................................................................... + sqrdmulh v31.4S, v27.4S, v1.S[3] // .....................................................................................................*.......................................................................................... + add v27.4S, v19.4S, v23.4S // .........................*...................................................................................................................................................................... + sub v23.4S, v19.4S, v23.4S // ........................*....................................................................................................................................................................... + mul v13.4S, v9.4S, v2.S[2] // ..........................................................................................................................*..................................................................... 
+ sqrdmulh v9.4S, v9.4S, v2.S[3] // .........................................................................................................................*...................................................................... + add v20.4S, v23.4S, v18.4S // .....................................................................................*.......................................................................................................... + mla v21.4S, v31.4S, v29.4S // .......................................................................................................*........................................................................................ + add v19.4S, v27.4S, v11.4S // .................................................................*.............................................................................................................................. + mla v17.4S, v14.4S, v29.4S // ............................*................................................................................................................................................................... + sub v23.4S, v23.4S, v18.4S // ....................................................................................*........................................................................................................... + mla v13.4S, v9.4S, v29.4S // ...........................................................................................................................*.................................................................... + mul v9.4S, v8.4S, v0.S[2] // .........................................................*...................................................................................................................................... 
+ sqrdmulh v18.4S, v8.4S, v0.S[3] // ........................................................*....................................................................................................................................... + add v31.4S, v19.4S, v21.4S // .........................................................................................................*...................................................................................... + sub v8.4S, v15.4S, v17.4S // .............................*.................................................................................................................................................................. + sub v14.4S, v20.4S, v13.4S // ............................................................................................................................*................................................................... + sub v24.4S, v24.4S, v25.4S // ..........................................................................*..................................................................................................................... + mla v9.4S, v18.4S, v29.4S // ..........................................................*..................................................................................................................................... sub count, count, #1 layer1234_start: - mul v25.4S, v24.4S, v0.S[2] // ........................................................*....................................................................................................................................... - sqrdmulh v24.4S, v24.4S, v0.S[3] // .........................................................*...................................................................................................................................... 
- mul v22.4S, v21.4S, v3.S[2] // ........................................................................................................................................*....................................................... - sqrdmulh v16.4S, v21.4S, v3.S[3] // .........................................................................................................................................*...................................................... - mul v8.4S, v28.4S, v2.S[0] // ..........................................................................................................*..................................................................................... - sqrdmulh v9.4S, v28.4S, v2.S[1] // ...........................................................................................................*.................................................................................... - mla v25.4S, v24.4S, v29.4S // ..........................................................*..................................................................................................................................... - add v28.4S, v23.4S, v11.4S // .......................................................................................................................................*........................................................ - mla v22.4S, v16.4S, v29.4S // ..........................................................................................................................................*..................................................... - add v16.4S, v27.4S, v31.4S // ...................................................................................................................*............................................................................ 
- mla v17.4S, v18.4S, v29.4S // ..................................................................................................*............................................................................................. - mla v8.4S, v9.4S, v29.4S // ............................................................................................................*................................................................................... - add v24.4S, v15.4S, v25.4S // ............................................................*................................................................................................................................... - sub v9.4S, v15.4S, v25.4S // ...........................................................*.................................................................................................................................... - mul v21.4S, v16.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v16.4S, v16.4S, v4.S[3] // ...................................................................................................................................................*............................................ - add v25.4S, v9.4S, v8.4S // ..............................................................................................................*................................................................................. - sub v9.4S, v9.4S, v8.4S // .............................................................................................................*.................................................................................. 
- sqrdmulh v18.4S, v26.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - sub v15.4S, v27.4S, v31.4S // ..................................................................................................................*............................................................................. - sqrdmulh v8.4S, v30.4S, v1.S[1] // .............................................................................*.................................................................................................................. - mul v27.4S, v30.4S, v1.S[0] // ............................................................................*................................................................................................................... - mul v31.4S, v26.4S, v2.S[2] // .........................................................................................................................*...................................................................... - sqrdmulh v30.4S, v15.4S, v5.S[1] // ........................................................................................................................................................*....................................... - mul v15.4S, v15.4S, v5.S[0] // .......................................................................................................................................................*........................................ - sub v26.4S, v19.4S, v12.4S // .............................*.................................................................................................................................................................. 
- sqrdmulh v12.4S, v28.4S, v6.S[3] // .......................................................................................................................................................................*........................ - mla v21.4S, v16.4S, v29.4S // ....................................................................................................................................................*........................................... - mla v27.4S, v8.4S, v29.4S // ..............................................................................*................................................................................................................. - add v8.4S, v26.4S, v13.4S // ..........................................................................................*..................................................................................................... - ldr q19, [x0, #848] // .............e.................................................................................................................................................................................. - mla v15.4S, v30.4S, v29.4S // .........................................................................................................................................................*...................................... - mla v31.4S, v18.4S, v29.4S // ...........................................................................................................................*.................................................................... - mul v16.4S, v28.4S, v6.S[2] // ......................................................................................................................................................................*......................... 
- sqrdmulh v28.4S, v8.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - sub v30.4S, v20.4S, v27.4S // ...............................................................................*................................................................................................................ - add v18.4S, v9.4S, v15.4S // ...........................................................................................................................................................*.................................... - add v27.4S, v20.4S, v27.4S // ................................................................................*............................................................................................................... - str q18, [x0, #384] // ......................................................................................................................................................................................*......... - sqrdmulh v18.4S, v19.4S, v0.S[1] // ..........................................e..................................................................................................................................................... - sub v9.4S, v9.4S, v15.4S // ..........................................................................................................................................................*..................................... - mul v15.4S, v19.4S, v0.S[0] // .........................................e...................................................................................................................................................... 
- mul v19.4S, v8.4S, v2.S[2] // ....................................................................................................................*........................................................................... - ldr q8, [x0, #528] // ........e....................................................................................................................................................................................... - str q9, [x0, #448] // .......................................................................................................................................................................................*........ - sub v11.4S, v23.4S, v11.4S // ......................................................................................................................................*......................................................... - add v20.4S, v10.4S, v31.4S // .............................................................................................................................*.................................................................. - mla v16.4S, v12.4S, v29.4S // ........................................................................................................................................................................*....................... - sub v9.4S, v25.4S, v21.4S // .....................................................................................................................................................*.......................................... - mla v15.4S, v18.4S, v29.4S // ...........................................e.................................................................................................................................................... 
- sqrdmulh v12.4S, v20.4S, v5.S[3] // .............................................................................................................................................................*.................................. - add v25.4S, v25.4S, v21.4S // ......................................................................................................................................................*......................................... - sub v21.4S, v26.4S, v13.4S // .........................................................................................*...................................................................................................... - str q9, [x0, #320] // .....................................................................................................................................................................................*.......... - mul v13.4S, v20.4S, v5.S[2] // ............................................................................................................................................................*................................... - mul v20.4S, v8.4S, v0.S[0] // ................e............................................................................................................................................................................... - str q25, [x0, #256] // ....................................................................................................................................................................................*........... - mul v9.4S, v21.4S, v3.S[0] // ..............................................................................................................................*................................................................. 
- add v26.4S, v24.4S, v17.4S // ....................................................................................................*........................................................................................... - sqrdmulh v25.4S, v21.4S, v3.S[1] // ...............................................................................................................................*................................................................ - mla v19.4S, v28.4S, v29.4S // ......................................................................................................................*......................................................................... - ldr q28, [x0, #656] // ..........e..................................................................................................................................................................................... - sub v23.4S, v24.4S, v17.4S // ...................................................................................................*............................................................................................ - sqrdmulh v21.4S, v8.4S, v0.S[1] // .................e.............................................................................................................................................................................. - ldr q24, [x0, #976] // ...............e................................................................................................................................................................................ - mul v18.4S, v11.4S, v7.S[0] // ...........................................................................................................................................................................*.................... 
- mla v13.4S, v12.4S, v29.4S // ..............................................................................................................................................................*................................. - mla v9.4S, v25.4S, v29.4S // ................................................................................................................................*............................................................... - ldr q25, [x0, #720] // ...........e.................................................................................................................................................................................... - sub v8.4S, v27.4S, v19.4S // .......................................................................................................................*........................................................................ - add v12.4S, v27.4S, v19.4S // ........................................................................................................................*....................................................................... - sqrdmulh v17.4S, v11.4S, v7.S[1] // ............................................................................................................................................................................*................... - sqrdmulh v19.4S, v28.4S, v0.S[1] // ...........................e.................................................................................................................................................................... - mul v27.4S, v24.4S, v0.S[0] // ...................................................e............................................................................................................................................ 
- sub v11.4S, v30.4S, v9.4S // .................................................................................................................................*.............................................................. - add v9.4S, v30.4S, v9.4S // ..................................................................................................................................*............................................................. - sub v30.4S, v26.4S, v22.4S // ...........................................................................................................................................*.................................................... - add v22.4S, v26.4S, v22.4S // ............................................................................................................................................*................................................... - sqrdmulh v24.4S, v24.4S, v0.S[1] // ....................................................e........................................................................................................................................... - sqrdmulh v26.4S, v14.4S, v4.S[1] // ..............................................................................................................................................*................................................. - str q30, [x0, #64] // .................................................................................................................................................................................*.............. - sub v30.4S, v12.4S, v13.4S // ...............................................................................................................................................................*................................ 
- mla v20.4S, v21.4S, v29.4S // ..................e............................................................................................................................................................................. - add v21.4S, v9.4S, v16.4S // ..........................................................................................................................................................................*..................... - str q22, [x0], #(16) // ................................................................................................................................................................................*............... - ldr q22, [x0, #448] // .......e........................................................................................................................................................................................ - mla v18.4S, v17.4S, v29.4S // .............................................................................................................................................................................*.................. - mla v27.4S, v24.4S, v29.4S // .....................................................e.......................................................................................................................................... - ldr q17, [x0, #320] // .....e.......................................................................................................................................................................................... - add v13.4S, v12.4S, v13.4S // ................................................................................................................................................................*............................... - str q21, [x0, #752] // ............................................................................................................................................................................................*... 
- sqrdmulh v21.4S, v25.4S, v0.S[1] // ................................e............................................................................................................................................................... - mul v25.4S, v25.4S, v0.S[0] // ...............................e................................................................................................................................................................ - mul v14.4S, v14.4S, v4.S[0] // .............................................................................................................................................*.................................................. - str q30, [x0, #560] // .........................................................................................................................................................................................*...... - sub v30.4S, v10.4S, v31.4S // ............................................................................................................................*................................................................... - str q13, [x0, #496] // ........................................................................................................................................................................................*....... - mla v14.4S, v26.4S, v29.4S // ...............................................................................................................................................*................................................ - ldr q31, [x0, #192] // ...e............................................................................................................................................................................................ 
- sub v13.4S, v22.4S, v27.4S // ......................................................e......................................................................................................................................... - mla v25.4S, v21.4S, v29.4S // .................................e.............................................................................................................................................................. - sqrdmulh v12.4S, v30.4S, v6.S[1] // ..................................................................................................................................................................*............................. - mul v10.4S, v13.4S, v1.S[0] // ...........................................................................................e.................................................................................................... - sqrdmulh v24.4S, v13.4S, v1.S[1] // ............................................................................................e................................................................................................... - ldr q13, [x0, #896] // ..............e................................................................................................................................................................................. - sub v26.4S, v23.4S, v14.4S // ................................................................................................................................................*............................................... - sub v21.4S, v17.4S, v15.4S // ............................................e................................................................................................................................................... 
- add v17.4S, v17.4S, v15.4S // .............................................e.................................................................................................................................................. - add v15.4S, v31.4S, v25.4S // ...................................e............................................................................................................................................................ - mul v30.4S, v30.4S, v6.S[0] // .................................................................................................................................................................*.............................. - mla v10.4S, v24.4S, v29.4S // .............................................................................................e.................................................................................................. - ldr q24, [x0, #768] // ............e................................................................................................................................................................................... - add v14.4S, v23.4S, v14.4S // .................................................................................................................................................*.............................................. - sub v23.4S, v31.4S, v25.4S // ..................................e............................................................................................................................................................. - str q26, [x0, #176] // ...................................................................................................................................................................................*............ 
- sqrdmulh v26.4S, v13.4S, v0.S[1] // ...............................................e................................................................................................................................................ - add v22.4S, v22.4S, v27.4S // .......................................................e........................................................................................................................................ - mul v31.4S, v13.4S, v0.S[0] // ..............................................e................................................................................................................................................. - mla v30.4S, v12.4S, v29.4S // ...................................................................................................................................................................*............................ - str q14, [x0, #112] // ..................................................................................................................................................................................*............. - sqrdmulh v14.4S, v24.4S, v0.S[1] // .....................................e.......................................................................................................................................................... - mul v12.4S, v22.4S, v0.S[2] // .......................................................................e........................................................................................................................ - sqrdmulh v27.4S, v22.4S, v0.S[3] // ........................................................................e....................................................................................................................... 
- ldr q22, [x0, #384] // ......e......................................................................................................................................................................................... - mul v25.4S, v24.4S, v0.S[0] // ....................................e........................................................................................................................................................... - sub v13.4S, v8.4S, v30.4S // ....................................................................................................................................................................*........................... - mla v31.4S, v26.4S, v29.4S // ................................................e............................................................................................................................................... - ldr q26, [x0, #576] // .........e...................................................................................................................................................................................... - add v24.4S, v8.4S, v30.4S // .....................................................................................................................................................................*.......................... - sub v8.4S, v11.4S, v18.4S // ..............................................................................................................................................................................*................. - str q13, [x0, #688] // ...........................................................................................................................................................................................*.... 
- add v18.4S, v11.4S, v18.4S // ...............................................................................................................................................................................*................ - sqrdmulh v13.4S, v17.4S, v0.S[3] // ..............................................................e................................................................................................................................. - ldr q30, [x0, #256] // ....e........................................................................................................................................................................................... - mla v12.4S, v27.4S, v29.4S // .........................................................................e...................................................................................................................... - str q24, [x0, #624] // ..........................................................................................................................................................................................*..... - add v24.4S, v22.4S, v31.4S // ..................................................e............................................................................................................................................. - mul v11.4S, v26.4S, v0.S[0] // .....................e.......................................................................................................................................................................... - mla v25.4S, v14.4S, v29.4S // ......................................e......................................................................................................................................................... 
- mul v14.4S, v17.4S, v0.S[2] // .............................................................e.................................................................................................................................. - str q18, [x0, #880] // ..............................................................................................................................................................................................*. - mul v18.4S, v24.4S, v0.S[2] // ..................................................................e............................................................................................................................. - sub v17.4S, v15.4S, v12.4S // ..........................................................................e..................................................................................................................... - add v12.4S, v15.4S, v12.4S // ...........................................................................e.................................................................................................................... - str q8, [x0, #944] // ...............................................................................................................................................................................................* - sqrdmulh v8.4S, v24.4S, v0.S[3] // ...................................................................e............................................................................................................................ - add v24.4S, v30.4S, v25.4S // ........................................e....................................................................................................................................................... 
- sub v22.4S, v22.4S, v31.4S // .................................................e.............................................................................................................................................. - sqrdmulh v31.4S, v12.4S, v1.S[3] // ......................................................................................................e......................................................................................... - sub v30.4S, v30.4S, v25.4S // .......................................e........................................................................................................................................................ - mul v25.4S, v12.4S, v1.S[2] // .....................................................................................................e.......................................................................................... - mla v14.4S, v13.4S, v29.4S // ...............................................................e................................................................................................................................ - mul v13.4S, v22.4S, v1.S[0] // ......................................................................................e......................................................................................................... - mul v12.4S, v28.4S, v0.S[0] // ..........................e..................................................................................................................................................................... - sqrdmulh v22.4S, v22.4S, v1.S[1] // .......................................................................................e........................................................................................................ 
- sqrdmulh v28.4S, v21.4S, v1.S[1] // ..................................................................................e............................................................................................................. - mla v25.4S, v31.4S, v29.4S // .......................................................................................................e........................................................................................ - sqrdmulh v26.4S, v26.4S, v0.S[1] // ......................e......................................................................................................................................................................... - mul v31.4S, v17.4S, v2.S[0] // ...............................................................................................................e................................................................................ - sub v16.4S, v9.4S, v16.4S // .........................................................................................................................................................................*...................... - ldr q9, [x0, #0] // e............................................................................................................................................................................................... - mla v18.4S, v8.4S, v29.4S // ....................................................................e........................................................................................................................... - sub v8.4S, v23.4S, v10.4S // ..............................................................................................e................................................................................................. 
- ldr q27, [x0, #64] // .e.............................................................................................................................................................................................. - mla v11.4S, v26.4S, v29.4S // .......................e........................................................................................................................................................................ - sqrdmulh v26.4S, v17.4S, v2.S[1] // ................................................................................................................e............................................................................... - str q16, [x0, #816] // .............................................................................................................................................................................................*.. - mla v12.4S, v19.4S, v29.4S // ............................e................................................................................................................................................................... - ldr q19, [x0, #128] // ..e............................................................................................................................................................................................. - mul v16.4S, v21.4S, v1.S[0] // .................................................................................e.............................................................................................................. - sqrdmulh v17.4S, v8.4S, v3.S[1] // ....................................................................................................................................e........................................................... 
- add v15.4S, v9.4S, v20.4S // ....................e........................................................................................................................................................................... - sub v21.4S, v27.4S, v11.4S // ........................e....................................................................................................................................................................... - add v27.4S, v27.4S, v11.4S // .........................e...................................................................................................................................................................... - mla v16.4S, v28.4S, v29.4S // ...................................................................................e............................................................................................................ - mul v11.4S, v8.4S, v3.S[0] // ...................................................................................................................................e............................................................ - add v8.4S, v27.4S, v14.4S // .................................................................e.............................................................................................................................. - add v28.4S, v19.4S, v12.4S // ..............................e................................................................................................................................................................. - mla v13.4S, v22.4S, v29.4S // ........................................................................................e....................................................................................................... 
- sub v27.4S, v27.4S, v14.4S // ................................................................e............................................................................................................................... - add v22.4S, v28.4S, v18.4S // ......................................................................e......................................................................................................................... - mla v31.4S, v26.4S, v29.4S // .................................................................................................................e.............................................................................. - add v26.4S, v23.4S, v10.4S // ...............................................................................................e................................................................................................ - add v10.4S, v21.4S, v16.4S // .....................................................................................e.......................................................................................................... - sub v23.4S, v21.4S, v16.4S // ....................................................................................e........................................................................................................... - sub v28.4S, v28.4S, v18.4S // .....................................................................e.......................................................................................................................... - mla v11.4S, v17.4S, v29.4S // .....................................................................................................................................e.......................................................... 
- sub v14.4S, v8.4S, v25.4S // ........................................................................................................e....................................................................................... - mul v17.4S, v22.4S, v1.S[2] // ................................................................................................e............................................................................................... - sqrdmulh v18.4S, v22.4S, v1.S[3] // .................................................................................................e.............................................................................................. - sub v20.4S, v9.4S, v20.4S // ...................e............................................................................................................................................................................ - add v21.4S, v8.4S, v25.4S // .........................................................................................................e...................................................................................... + // Instructions: 192 + // Expected cycles: 48 + // Expected IPC: 4.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + mla v26.4S, v30.4S, v29.4S // .*.............................................................................................................................................................................................. 
+ add v17.4S, v15.4S, v17.4S // *............................................................................................................................................................................................... + sub v30.4S, v8.4S, v12.4S // ......................................................*......................................................................................................................................... + sub v27.4S, v27.4S, v11.4S // ...*............................................................................................................................................................................................ + add v15.4S, v8.4S, v12.4S // .................................*.............................................................................................................................................................. + sub v12.4S, v17.4S, v10.4S // ..............................*................................................................................................................................................................. + sub v18.4S, v23.4S, v26.4S // .......*........................................................................................................................................................................................ + add v25.4S, v23.4S, v26.4S // ......*......................................................................................................................................................................................... + mul v23.4S, v14.4S, v6.S[0] // ...........................*.................................................................................................................................................................... 
+ sqrdmulh v26.4S, v12.4S, v2.S[1] // ....................................*........................................................................................................................................................... + sqrdmulh v11.4S, v14.4S, v6.S[1] // ..........................*..................................................................................................................................................................... + mul v14.4S, v12.4S, v2.S[0] // .....................................*.......................................................................................................................................................... + sub v12.4S, v19.4S, v21.4S // .....*.......................................................................................................................................................................................... + mul v21.4S, v24.4S, v2.S[0] // ...............*................................................................................................................................................................................ + sqrdmulh v8.4S, v24.4S, v2.S[1] // .................*.............................................................................................................................................................................. + add v24.4S, v20.4S, v13.4S // .......................*........................................................................................................................................................................ + mla v14.4S, v26.4S, v29.4S // ...........................................*.................................................................................................................................................... 
+ sub v26.4S, v22.4S, v9.4S // ...................*............................................................................................................................................................................ + sqrdmulh v13.4S, v15.4S, v2.S[3] // .......................................*........................................................................................................................................................ + sqrdmulh v20.4S, v31.4S, v3.S[3] // ..*............................................................................................................................................................................................. + mul v19.4S, v15.4S, v2.S[2] // ......................................*......................................................................................................................................................... + sub v15.4S, v26.4S, v14.4S // ...................................................*............................................................................................................................................ + add v26.4S, v26.4S, v14.4S // .................................................*.............................................................................................................................................. + add v9.4S, v22.4S, v9.4S // .............*.................................................................................................................................................................................. + sub v22.4S, v28.4S, v16.4S // ..........................................................*..................................................................................................................................... 
+ mla v23.4S, v11.4S, v29.4S // ...................................*............................................................................................................................................................ + mla v19.4S, v13.4S, v29.4S // ............................................*................................................................................................................................................... + add v14.4S, v28.4S, v16.4S // .............................................*.................................................................................................................................................. + mul v16.4S, v31.4S, v3.S[2] // ............*................................................................................................................................................................................... + sqrdmulh v13.4S, v24.4S, v5.S[3] // ...........................................................*.................................................................................................................................... + add v28.4S, v14.4S, v19.4S // ........................................................................*....................................................................................................................... + sub v11.4S, v14.4S, v19.4S // ....................................................*........................................................................................................................................... + mul v19.4S, v30.4S, v3.S[0] // ..............................................................*................................................................................................................................. 
+ mla v16.4S, v20.4S, v29.4S // ..................*............................................................................................................................................................................. + ldr q20, [x0, #784] // ...................................................................................................*............................................................................................ + add v14.4S, v11.4S, v23.4S // ................................................................................*............................................................................................................... + mul v31.4S, v24.4S, v5.S[2] // .............................................................*.................................................................................................................................. + mla v21.4S, v8.4S, v29.4S // ......................*......................................................................................................................................................................... + sub v24.4S, v27.4S, v21.4S // ............................*................................................................................................................................................................... + add v17.4S, v17.4S, v10.4S // ....*........................................................................................................................................................................................... + str q14, [x0, #640] // ......................................................................................*......................................................................................................... 
+ ldr q14, [x0, #656] // ................................................................................................................*............................................................................... + mul v10.4S, v12.4S, v4.S[0] // ..........*..................................................................................................................................................................................... + sqrdmulh v8.4S, v12.4S, v4.S[1] // ...........*.................................................................................................................................................................................... + sub v23.4S, v11.4S, v23.4S // .......................................................*........................................................................................................................................ + mul v12.4S, v24.4S, v5.S[0] // ..................................*............................................................................................................................................................. + sqrdmulh v11.4S, v17.4S, v1.S[3] // .........*...................................................................................................................................................................................... + mul v17.4S, v17.4S, v1.S[2] // ........*....................................................................................................................................................................................... + mla v10.4S, v8.4S, v29.4S // ................*............................................................................................................................................................................... 
+ mla v31.4S, v13.4S, v29.4S // ....................................................................*........................................................................................................................... + sqrdmulh v13.4S, v20.4S, v0.S[1] // ...............................................................................................................*................................................................................ + str q23, [x0, #704] // ...............................................................*................................................................................................................................ + mul v8.4S, v25.4S, v6.S[2] // ...................................................................*............................................................................................................................ + mla v17.4S, v11.4S, v29.4S // ..............*................................................................................................................................................................................. + mul v20.4S, v20.4S, v0.S[0] // ............................................................................................................................*................................................................... + sqrdmulh v30.4S, v30.4S, v3.S[1] // .................................................................*.............................................................................................................................. + add v11.4S, v28.4S, v31.4S // ...............................................................................*................................................................................................................ 
+ sqrdmulh v25.4S, v25.4S, v6.S[3] // ................................................................*............................................................................................................................... + ldr q23, [x0, #848] // ....................................................................................................................*........................................................................... + mla v19.4S, v30.4S, v29.4S // ......................................................................*......................................................................................................................... + add v30.4S, v9.4S, v17.4S // .....................*.......................................................................................................................................................................... + mla v20.4S, v13.4S, v29.4S // ........................................................................................................................................*....................................................... + ldr q13, [x0, #976] // .................................................................................................*.............................................................................................. + str q11, [x0, #512] // ....................................................................................*........................................................................................................... + mla v8.4S, v25.4S, v29.4S // .........................................................................*...................................................................................................................... 
+ sub v25.4S, v28.4S, v31.4S // ............................................................................*................................................................................................................... + add v28.4S, v30.4S, v16.4S // ..................................................................*............................................................................................................................. + add v11.4S, v22.4S, v19.4S // ..............................................................................*................................................................................................................. + sub v22.4S, v22.4S, v19.4S // .......................................................................................*........................................................................................................ + str q25, [x0, #576] // .................................................................................*.............................................................................................................. + sqrdmulh v31.4S, v18.4S, v7.S[1] // ...........................................................................*.................................................................................................................... + mul v25.4S, v18.4S, v7.S[0] // .....................................................................*.......................................................................................................................... + sub v9.4S, v9.4S, v17.4S // ....................*........................................................................................................................................................................... 
+ str q28, [x0], #(16) // .......................................................................*........................................................................................................................ + mul v17.4S, v14.4S, v0.S[0] // ..........................................................................................................................*..................................................................... + sqrdmulh v19.4S, v14.4S, v0.S[1] // .........................................................................................................................*...................................................................... + mul v18.4S, v23.4S, v0.S[0] // .................................................................................................................................*.............................................................. + ldr q28, [x0, #576] // ....................................................................................................*........................................................................................... + sqrdmulh v14.4S, v13.4S, v0.S[1] // .......................................................................................................*........................................................................................ + mul v13.4S, v13.4S, v0.S[0] // .........................................................................................................*...................................................................................... + mla v25.4S, v31.4S, v29.4S // ...................................................................................*............................................................................................................ 
+ add v31.4S, v9.4S, v10.4S // ........................*....................................................................................................................................................................... + sqrdmulh v24.4S, v24.4S, v5.S[1] // ..........................................*..................................................................................................................................................... + sub v9.4S, v9.4S, v10.4S // ..............................................*................................................................................................................................................. + ldr q10, [x0, #448] // .............................................................................................................................*.................................................................. + str q31, [x0, #112] // ...............................*................................................................................................................................................................ + mla v13.4S, v14.4S, v29.4S // .................................................................................................................*.............................................................................. + sub v14.4S, v30.4S, v16.4S // .........................*...................................................................................................................................................................... + sqrdmulh v16.4S, v28.4S, v0.S[1] // ..................................................................................................................*............................................................................. 
+ mul v30.4S, v28.4S, v0.S[0] // ..............................................................................................................*................................................................................. + ldr q28, [x0, #64] // .....................................................................................................*.......................................................................................... + str q9, [x0, #176] // ..................................................*............................................................................................................................................. + mla v12.4S, v24.4S, v29.4S // ................................................*............................................................................................................................................... + add v9.4S, v22.4S, v25.4S // ..........................................................................................*..................................................................................................... + str q14, [x0, #48] // .............................*.................................................................................................................................................................. + sub v14.4S, v10.4S, v13.4S // .....................................................................................................................................*.......................................................... + str q9, [x0, #880] // ..............................................................................................*................................................................................................. + add v9.4S, v27.4S, v21.4S // ................................*............................................................................................................................................................... 
+ sqrdmulh v24.4S, v23.4S, v0.S[1] // ................................................................................................................................*............................................................... + ldr q23, [x0, #256] // ............................................................................................................*................................................................................... + add v31.4S, v15.4S, v12.4S // ........................................................*....................................................................................................................................... + add v21.4S, v10.4S, v13.4S // .......................................................................................................................................*........................................................ + ldr q13, [x0, #512] // ...................................................................................................................*............................................................................ + ldr q27, [x0, #320] // ........................................................................................................................*....................................................................... + sub v25.4S, v22.4S, v25.4S // ............................................................................................*................................................................................................... + add v10.4S, v11.4S, v8.4S // ..................................................................................*............................................................................................................. 
+ sub v12.4S, v15.4S, v12.4S // .........................................................................................*...................................................................................................... + str q31, [x0, #368] // ............................................................*................................................................................................................................... + mla v18.4S, v24.4S, v29.4S // .............................................................................................................................................*.................................................. + ldr q24, [x0, #704] // ......................................................................................................*......................................................................................... + sub v15.4S, v11.4S, v8.4S // .....................................................................................*.......................................................................................................... + str q10, [x0, #752] // ........................................................................................*....................................................................................................... + sub v10.4S, v23.4S, v20.4S // ..............................................................................................................................................*................................................. + str q25, [x0, #944] // ...............................................................................................*................................................................................................ + ldr q11, [x0, #896] // ................................................................................................*............................................................................................... 
+ sqrdmulh v31.4S, v9.4S, v4.S[3] // .........................................*...................................................................................................................................................... + sqrdmulh v25.4S, v14.4S, v1.S[1] // ...........................................................................................................................................................*.................................... + add v8.4S, v23.4S, v20.4S // ...................................................................................................................................................*............................................ + str q15, [x0, #816] // ...........................................................................................*.................................................................................................... + sub v15.4S, v27.4S, v18.4S // .....................................................................................................................................................*.......................................... + mul v20.4S, v14.4S, v1.S[0] // ..........................................................................................................................................................*..................................... + mul v14.4S, v9.4S, v4.S[2] // ........................................*....................................................................................................................................................... + mla v30.4S, v16.4S, v29.4S // ...........................................................................................................................*.................................................................... 
+ mul v16.4S, v24.4S, v0.S[0] // ......................................................................................................................*......................................................................... + str q12, [x0, #432] // .............................................................................................*.................................................................................................. + sqrdmulh v12.4S, v24.4S, v0.S[1] // .......................................................................................................................*........................................................................ + ldr q24, [x0, #192] // ..................................................................................................*............................................................................................. + mul v23.4S, v11.4S, v0.S[0] // ...........................................................................................................*.................................................................................... + mla v14.4S, v31.4S, v29.4S // ...............................................*................................................................................................................................................ + add v22.4S, v27.4S, v18.4S // ..................................................................................................................................................*............................................. + add v27.4S, v28.4S, v30.4S // ...............................................................................................................................................................................*................ 
+ sqrdmulh v9.4S, v11.4S, v0.S[1] // ..........................................................................................................*..................................................................................... + mla v16.4S, v12.4S, v29.4S // ...............................................................................................................................*................................................................ + sqrdmulh v12.4S, v21.4S, v0.S[3] // ............................................................................................................................................*................................................... + sub v31.4S, v26.4S, v14.4S // ..........................................................................*..................................................................................................................... + mul v11.4S, v22.4S, v0.S[2] // ......................................................................................................................................................*......................................... + sub v18.4S, v28.4S, v30.4S // ................................................................................................................................................................................*............... + add v28.4S, v26.4S, v14.4S // .....................................................*.......................................................................................................................................... + sqrdmulh v14.4S, v22.4S, v0.S[3] // .........................................................................................................................................................*...................................... 
+ sqrdmulh v30.4S, v8.4S, v0.S[3] // ..........................................................................................................................................................................................*..... + str q31, [x0, #304] // .............................................................................*.................................................................................................................. + ldr q26, [x0, #384] // ........................................................................................................*....................................................................................... + mla v20.4S, v25.4S, v29.4S // ..................................................................................................................................................................*............................. + mla v23.4S, v9.4S, v29.4S // .....................................................................................................................*.......................................................................... + mla v17.4S, v19.4S, v29.4S // ......................................................................................................................................................................................*......... + sub v31.4S, v24.4S, v16.4S // ..........................................................................................................................................*..................................................... + mla v11.4S, v14.4S, v29.4S // .......................................................................................................................................................................*........................ 
+ mul v21.4S, v21.4S, v0.S[2] // ...............................................................................................................................................*................................................ + mul v22.4S, v15.4S, v1.S[0] // ...............................................................................................................................................................*................................ + sub v9.4S, v26.4S, v23.4S // ...................................................................................................................................*............................................................ + add v25.4S, v31.4S, v20.4S // ..........................................................................................................................................................................*..................... + str q28, [x0, #240] // .........................................................*...................................................................................................................................... + add v19.4S, v27.4S, v11.4S // .....................................................................................................................................................................................*.......... + mul v28.4S, v13.4S, v0.S[0] // ................................................................................................................................................*............................................... + mla v21.4S, v12.4S, v29.4S // ....................................................................................................................................................*........................................... 
+ mul v12.4S, v9.4S, v1.S[0] // ......................................................................................................................................*......................................................... + add v24.4S, v24.4S, v16.4S // ........................................................................................................................................................*....................................... + sqrdmulh v16.4S, v9.4S, v1.S[1] // .........................................................................................................................................*...................................................... + mul v9.4S, v8.4S, v0.S[2] // .........................................................................................................................................................................................*...... + sqrdmulh v8.4S, v15.4S, v1.S[1] // .................................................................................................................................................................*.............................. + sqrdmulh v13.4S, v13.4S, v0.S[1] // .................................................................................................................................................*.............................................. + add v15.4S, v24.4S, v21.4S // .............................................................................................................................................................*.................................. + mla v12.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.......................... 
+ sub v24.4S, v24.4S, v21.4S // ..............................................................................................................................................................................................*. + mul v21.4S, v15.4S, v1.S[2] // .............................................................................................................................................................................*.................. + sqrdmulh v16.4S, v15.4S, v1.S[3] // ..............................................................................................................................................................................*................. + ldr q15, [x0, #128] // .............................................................................................................*.................................................................................. + mla v22.4S, v8.4S, v29.4S // .........................................................................................................................................................................*...................... + mla v9.4S, v30.4S, v29.4S // ...............................................................................................................................................................................................* + mla v28.4S, v13.4S, v29.4S // .......................................................................................................................................................*........................................ + mul v13.4S, v25.4S, v2.S[2] // .................................................................................................................................................................................*.............. 
+ mla v21.4S, v16.4S, v29.4S // ....................................................................................................................................................................................*........... + sqrdmulh v25.4S, v25.4S, v2.S[3] // ..................................................................................................................................................................................*............. + add v26.4S, v26.4S, v23.4S // ..............................................................................................................................*................................................................. + sqrdmulh v8.4S, v10.4S, v1.S[1] // ..............................................................................................................................................................*................................. + sub v31.4S, v31.4S, v20.4S // ........................................................................................................................................................................*....................... + mul v16.4S, v10.4S, v1.S[0] // ................................................................................................................................................................*............................... + sqrdmulh v23.4S, v26.4S, v0.S[3] // ..................................................................................................................................*............................................................. + mul v10.4S, v26.4S, v0.S[2] // ....................................................................................................................................*........................................................... 
+ mul v26.4S, v31.4S, v3.S[0] // ...........................................................................................................................................................................*.................... + mla v13.4S, v25.4S, v29.4S // ........................................................................................................................................................................................*....... + add v20.4S, v18.4S, v22.4S // ...................................................................................................................................................................................*............ + mla v16.4S, v8.4S, v29.4S // ......................................................................................................................................................................*......................... + ldr q25, [x0, #0] // ............................................................................................................................................................*................................... + mla v10.4S, v23.4S, v29.4S // ...........................................................................................................................................*.................................................... + sub v23.4S, v18.4S, v22.4S // .......................................................................................................................................................................................*........ + sqrdmulh v30.4S, v31.4S, v3.S[1] // ............................................................................................................................................................................*................... 
+ sub v14.4S, v20.4S, v13.4S // .............................................................................................................................................................................................*.. + sub v8.4S, v15.4S, v17.4S // ............................................................................................................................................................................................*... + add v31.4S, v19.4S, v21.4S // ...........................................................................................................................................................................................*.... + add v22.4S, v25.4S, v28.4S // ...................................................................................................................................................................*............................ + sub v28.4S, v25.4S, v28.4S // ....................................................................................................................................................................*........................... - // original source code - // ldr q8, [x0, #0] // ..................................................................................................................................e...............................|...............................................................................................................................................................e...... - // ldr q9, [x0, #(1*(512/8))] // .....................................................................................................................................e............................|..................................................................................................................................................................e... 
- // ldr q10, [x0, #(2*(512/8))] // ..........................................................................................................................................e.......................|...................................................................................................................................................................... - // ldr q11, [x0, #(3*(512/8))] // ....................................................................e.............................................................................................|.................................................................................................e.................................................................... - // ldr q12, [x0, #(4*(512/8))] // .......................................................................................................e..........................................................|....................................................................................................................................e................................. - // ldr q13, [x0, #(5*(512/8))] // ..........................................................e.......................................................................................................|.......................................................................................e.............................................................................. - // ldr q14, [x0, #(6*(512/8))] // .............................................................................................e....................................................................|..........................................................................................................................e........................................... 
- // ldr q15, [x0, #(7*(512/8))] // .......................................................e..........................................................................................................|....................................................................................e................................................................................. - // ldr q16, [x0, #(8*(512/8))] // .............e....................................................................................................................................................|..........................................e........................................................................................................................... - // ldr q17, [x0, #(9*(512/8))] // .................................................................................................e................................................................|..............................................................................................................................e....................................... - // ldr q18, [x0, #(10*(512/8))] // ...............................e..................................................................................................................................|............................................................e......................................................................................................... - // ldr q19, [x0, #(11*(512/8))] // ......................................e...........................................................................................................................|...................................................................e.................................................................................................. 
- // ldr q20, [x0, #(12*(512/8))] // .................................................................................e................................................................................|..............................................................................................................e....................................................... - // ldr q21, [x0, #(13*(512/8))] // e.................................................................................................................................................................|.............................e........................................................................................................................................ - // ldr q22, [x0, #(14*(512/8))] // ..........................................................................e.......................................................................................|.......................................................................................................e.............................................................. - // ldr q23, [x0, #(15*(512/8))] // ..................................e...............................................................................................................................|...............................................................e...................................................................................................... - // mul v24.4s, v16.4s, v0.s[0] // .........................e........................................................................................................................................|......................................................e............................................................................................................... 
- // sqrdmulh v16.4s, v16.4s, v0.s[1] // .................................e................................................................................................................................|..............................................................e....................................................................................................... - // mla v24.4s, v16.4s, v29.4s // ....................................................e.............................................................................................................|.................................................................................e.................................................................................... - // sub v16.4s, v8.4s, v24.4s // ................................................................................................................................................................e.|...................................................................................................................................................................... - // add v8.4s, v8.4s, v24.4s // .............................................................................................................................................e....................|...................................................................................................................................................................... - // mul v24.4s, v17.4s, v0.s[0] // ...........................................................................................................e......................................................|........................................................................................................................................e............................. 
- // sqrdmulh v17.4s, v17.4s, v0.s[1] // ...............................................................................................................................e..................................|............................................................................................................................................................e......... - // mla v24.4s, v17.4s, v29.4s // ......................................................................................................................................e...........................|...................................................................................................................................................................e.. - // sub v17.4s, v9.4s, v24.4s // ..............................................................................................................................................e...................|...................................................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // ...............................................................................................................................................e..................|...................................................................................................................................................................... - // mul v24.4s, v18.4s, v0.s[0] // ...........................................................................................................................e......................................|........................................................................................................................................................e............. 
- // sqrdmulh v18.4s, v18.4s, v0.s[1] // ..........................................e.......................................................................................................................|.......................................................................e.............................................................................................. - // mla v24.4s, v18.4s, v29.4s // .........................................................................................................................................e........................|...................................................................................................................................................................... - // sub v18.4s, v10.4s, v24.4s // ..................................................................................................................................................................|........................*............................................................................................................................................. - // add v10.4s, v10.4s, v24.4s // ...................................................................................................................................................e..............|...................................................................................................................................................................... - // mul v24.4s, v19.4s, v0.s[0] // ..............................................................e...................................................................................................|...........................................................................................e.......................................................................... 
- // sqrdmulh v19.4s, v19.4s, v0.s[1] // .............................................................e....................................................................................................|..........................................................................................e........................................................................... - // mla v24.4s, v19.4s, v29.4s // ......................................................................e...........................................................................................|...................................................................................................e.................................................................. - // sub v19.4s, v11.4s, v24.4s // ...................................................................................e..............................................................................|................................................................................................................e..................................................... - // add v11.4s, v11.4s, v24.4s // ..............................................................................e...................................................................................|...........................................................................................................e.......................................................... - // mul v24.4s, v20.4s, v0.s[0] // ..............................................................................................e...................................................................|...........................................................................................................................e.......................................... 
- // sqrdmulh v20.4s, v20.4s, v0.s[1] // ..........................................................................................e.......................................................................|.......................................................................................................................e.............................................. - // mla v24.4s, v20.4s, v29.4s // ............................................................................................................e.....................................................|.........................................................................................................................................e............................ - // sub v20.4s, v12.4s, v24.4s // .......................................................................................................................e..........................................|....................................................................................................................................................e................. - // add v12.4s, v12.4s, v24.4s // ....................................................................................................................e.............................................|.................................................................................................................................................e.................... - // mul v24.4s, v21.4s, v0.s[0] // ...........e......................................................................................................................................................|........................................e............................................................................................................................. 
- // sqrdmulh v21.4s, v21.4s, v0.s[1] // .........e........................................................................................................................................................|......................................e............................................................................................................................... - // mla v24.4s, v21.4s, v29.4s // ...................e..............................................................................................................................................|................................................e..................................................................................................................... - // sub v21.4s, v13.4s, v24.4s // ............................................................................e.....................................................................................|.........................................................................................................e............................................................ - // add v13.4s, v13.4s, v24.4s // .............................................................................e....................................................................................|..........................................................................................................e........................................................... - // mul v24.4s, v22.4s, v0.s[0] // .......................................................................................e..........................................................................|....................................................................................................................e................................................. 
- // sqrdmulh v22.4s, v22.4s, v0.s[1] // .....................................................................................e............................................................................|..................................................................................................................e................................................... - // mla v24.4s, v22.4s, v29.4s // ................................................................................................e.................................................................|.............................................................................................................................e........................................ - // sub v22.4s, v14.4s, v24.4s // .....................................................................................................................e............................................|..................................................................................................................................................e................... - // add v14.4s, v14.4s, v24.4s // ..........................................................................................................e.......................................................|.......................................................................................................................................e.............................. - // mul v24.4s, v23.4s, v0.s[0] // ...........................................e......................................................................................................................|........................................................................e............................................................................................. 
- // sqrdmulh v23.4s, v23.4s, v0.s[1] // ................................................e.................................................................................................................|.............................................................................e........................................................................................ - // mla v24.4s, v23.4s, v29.4s // .........................................................e........................................................................................................|......................................................................................e............................................................................... - // sub v23.4s, v15.4s, v24.4s // .....................................................................e............................................................................................|..................................................................................................e................................................................... - // add v15.4s, v15.4s, v24.4s // ......................................................................................e...........................................................................|...................................................................................................................e.................................................. - // mul v24.4s, v12.4s, v0.s[2] // ..................................................................................................................................................................*...................................................................................................................................................................... 
- // sqrdmulh v12.4s, v12.4s, v0.s[3] // ..................................................................................................................................................................|*..................................................................................................................................................................... - // mla v24.4s, v12.4s, v29.4s // ..................................................................................................................................................................|.....*................................................................................................................................................................ - // sub v12.4s, v8.4s, v24.4s // ..................................................................................................................................................................|............*......................................................................................................................................................... - // add v8.4s, v8.4s, v24.4s // ..................................................................................................................................................................|...........*.......................................................................................................................................................... - // mul v24.4s, v13.4s, v0.s[2] // .............................................................................................................e....................................................|..........................................................................................................................................e........................... 
- // sqrdmulh v13.4s, v13.4s, v0.s[3] // ......................................................................................................e...........................................................|...................................................................................................................................e.................................. - // mla v24.4s, v13.4s, v29.4s // .........................................................................................................................e........................................|......................................................................................................................................................e............... - // sub v13.4s, v9.4s, v24.4s // .....................................................................................................................................................e............|...................................................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // ..................................................................................................................................................e...............|...................................................................................................................................................................... - // mul v24.4s, v14.4s, v0.s[2] // ...............................................................................................................e..................................................|............................................................................................................................................e......................... 
- // sqrdmulh v14.4s, v14.4s, v0.s[3] // ...................................................................................................................e..............................................|................................................................................................................................................e..................... - // mla v24.4s, v14.4s, v29.4s // ...................................................................................................................................e..............................|................................................................................................................................................................e..... - // sub v14.4s, v10.4s, v24.4s // ...........................................................................................................................................................e......|...................................................................................................................................................................... - // add v10.4s, v10.4s, v24.4s // ......................................................................................................................................................e...........|...................................................................................................................................................................... - // mul v24.4s, v15.4s, v0.s[2] // ...........................................................................................e......................................................................|........................................................................................................................e............................................. 
- // sqrdmulh v15.4s, v15.4s, v0.s[3] // ............................................................................................e.....................................................................|.........................................................................................................................e............................................ - // mla v24.4s, v15.4s, v29.4s // ........................................................................................................e.........................................................|.....................................................................................................................................e................................ - // sub v15.4s, v11.4s, v24.4s // ................................................................................................................e.................................................|.............................................................................................................................................e........................ - // add v11.4s, v11.4s, v24.4s // .................................................................................................................e................................................|..............................................................................................................................................e....................... - // mul v24.4s, v20.4s, v1.s[0] // ..................................................................................................................................................................|....................*................................................................................................................................................. 
- // sqrdmulh v20.4s, v20.4s, v1.s[1] // ..................................................................................................................................................................|...................*.................................................................................................................................................. - // mla v24.4s, v20.4s, v29.4s // ..................................................................................................................................................................|...........................*.......................................................................................................................................... - // sub v20.4s, v16.4s, v24.4s // .....*............................................................................................................................................................|..................................*................................................................................................................................... - // add v16.4s, v16.4s, v24.4s // .......*..........................................................................................................................................................|....................................*................................................................................................................................. - // mul v24.4s, v21.4s, v1.s[0] // ...........................................................................................................................................e......................|...................................................................................................................................................................... 
- // sqrdmulh v21.4s, v21.4s, v1.s[1] // .............................................................................................................................e....................................|..........................................................................................................................................................e........... - // mla v24.4s, v21.4s, v29.4s // ................................................................................................................................................e.................|...................................................................................................................................................................... - // sub v21.4s, v17.4s, v24.4s // ..........................................................................................................................................................e.......|...................................................................................................................................................................... - // add v17.4s, v17.4s, v24.4s // .........................................................................................................................................................e........|...................................................................................................................................................................... - // mul v24.4s, v22.4s, v1.s[0] // ..........................................................................................................................e.......................................|.......................................................................................................................................................e.............. 
- // sqrdmulh v22.4s, v22.4s, v1.s[1] // ............................................................................................................................e.....................................|.........................................................................................................................................................e............ - // mla v24.4s, v22.4s, v29.4s // ....................................................................................................................................................e.............|...................................................................................................................................................................... - // sub v22.4s, v18.4s, v24.4s // ......................*...........................................................................................................................................|...................................................*.................................................................................................................. - // add v18.4s, v18.4s, v24.4s // ..................................................................................................................................................................|............................*......................................................................................................................................... - // mul v24.4s, v23.4s, v1.s[0] // ........................................................................e.........................................................................................|.....................................................................................................e................................................................ 
- // sqrdmulh v23.4s, v23.4s, v1.s[1] // .........................................................................e........................................................................................|......................................................................................................e............................................................... - // mla v24.4s, v23.4s, v29.4s // ................................................................................e.................................................................................|.............................................................................................................e........................................................ - // sub v23.4s, v19.4s, v24.4s // ....................................................................................................................................e.............................|.................................................................................................................................................................e.... - // add v19.4s, v19.4s, v24.4s // ........................................................................................................................................................e.........|...................................................................................................................................................................... - // mul v24.4s, v10.4s, v1.s[2] // ..............................................................................................................................................................e...|...................................................................................................................................................................... 
- // sqrdmulh v10.4s, v10.4s, v1.s[3] // ...............................................................................................................................................................e..|...................................................................................................................................................................... - // mla v24.4s, v10.4s, v29.4s // ..................................................................................................................................................................|.........*............................................................................................................................................................ - // sub v10.4s, v8.4s, v24.4s // ................................*.................................................................................................................................|.............................................................*........................................................................................................ - // add v8.4s, v8.4s, v24.4s // ............................*.....................................................................................................................................|.........................................................*............................................................................................................ - // mul v24.4s, v11.4s, v1.s[2] // ........................................................................................................................e.........................................|.....................................................................................................................................................e................ 
- // sqrdmulh v11.4s, v11.4s, v1.s[3] // ......................................................................................................................e...........................................|...................................................................................................................................................e.................. - // mla v24.4s, v11.4s, v29.4s // ..............................................................................................................................e...................................|...........................................................................................................................................................e.......... - // sub v11.4s, v9.4s, v24.4s // .............................................................................................................................................................e....|...................................................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // .................................................................................................................................................................e|...................................................................................................................................................................... - // mul v24.4s, v14.4s, v2.s[0] // ..................................................................................................................................................................|...*.................................................................................................................................................................. 
- // sqrdmulh v14.4s, v14.4s, v2.s[1] // ..................................................................................................................................................................|....*................................................................................................................................................................. - // mla v24.4s, v14.4s, v29.4s // ..................................................................................................................................................................|..........*........................................................................................................................................................... - // sub v14.4s, v12.4s, v24.4s // ..................................................................................................................................................................|................*..................................................................................................................................................... - // add v12.4s, v12.4s, v24.4s // ..................................................................................................................................................................|...............*...................................................................................................................................................... - // mul v24.4s, v15.4s, v2.s[0] // ................................................................................................................................e.................................|.............................................................................................................................................................e........ 
- // sqrdmulh v15.4s, v15.4s, v2.s[1] // .......................................................................................................................................e..........................|....................................................................................................................................................................e. - // mla v24.4s, v15.4s, v29.4s // .......................................................................................................................................................e..........|...................................................................................................................................................................... - // sub v15.4s, v13.4s, v24.4s // ..................................................................................................................................................................|..................*................................................................................................................................................... - // add v13.4s, v13.4s, v24.4s // ..................................................................................................................................................................|........*............................................................................................................................................................. - // mul v24.4s, v18.4s, v2.s[2] // ............*.....................................................................................................................................................|.........................................*............................................................................................................................ 
- // sqrdmulh v18.4s, v18.4s, v2.s[3] // ....*.............................................................................................................................................................|.................................*.................................................................................................................................... - // mla v24.4s, v18.4s, v29.4s // ..............................*...................................................................................................................................|...........................................................*.......................................................................................................... - // sub v18.4s, v16.4s, v24.4s // .......................................*..........................................................................................................................|....................................................................*................................................................................................. - // add v16.4s, v16.4s, v24.4s // ........................................*.........................................................................................................................|.....................................................................*................................................................................................ - // mul v24.4s, v19.4s, v2.s[2] // ..................................................................................................................................................................|.....................*................................................................................................................................................ 
- // sqrdmulh v19.4s, v19.4s, v2.s[3] // ..................................................................................................................................................................|.................*.................................................................................................................................................... - // mla v24.4s, v19.4s, v29.4s // ..*...............................................................................................................................................................|...............................*...................................................................................................................................... - // sub v19.4s, v17.4s, v24.4s // .................................................................*................................................................................................|..............................................................................................*....................................................................... - // add v17.4s, v17.4s, v24.4s // ................*.................................................................................................................................................|.............................................*........................................................................................................................ - // mul v24.4s, v22.4s, v3.s[0] // ...........................*......................................................................................................................................|........................................................*............................................................................................................. 
- // sqrdmulh v22.4s, v22.4s, v3.s[1] // .............................*....................................................................................................................................|..........................................................*........................................................................................................... - // mla v24.4s, v22.4s, v29.4s // .....................................*............................................................................................................................|..................................................................*................................................................................................... - // sub v22.4s, v20.4s, v24.4s // ............................................*.....................................................................................................................|.........................................................................*............................................................................................ - // add v20.4s, v20.4s, v24.4s // .............................................*....................................................................................................................|..........................................................................*........................................................................................... - // mul v24.4s, v23.4s, v3.s[0] // .................................................................................................................................................e................|...................................................................................................................................................................... 
- // sqrdmulh v23.4s, v23.4s, v3.s[1] // ............................................................................................................................................e.....................|...................................................................................................................................................................... - // mla v24.4s, v23.4s, v29.4s // ............................................................................................................................................................e.....|...................................................................................................................................................................... - // sub v23.4s, v21.4s, v24.4s // ...............*..................................................................................................................................................|............................................*......................................................................................................................... - // add v21.4s, v21.4s, v24.4s // ..................................................................................................................................................................|......*............................................................................................................................................................... - // mul v24.4s, v9.4s, v3.s[2] // ..................................................................................................................................................................|.*.................................................................................................................................................................... 
- // sqrdmulh v9.4s, v9.4s, v3.s[3] // ..................................................................................................................................................................|..*................................................................................................................................................................... - // mla v24.4s, v9.4s, v29.4s // ..................................................................................................................................................................|.......*.............................................................................................................................................................. - // sub v9.4s, v8.4s, v24.4s // ..............................................*...................................................................................................................|...........................................................................*.......................................................................................... - // add v8.4s, v8.4s, v24.4s // ...............................................*..................................................................................................................|............................................................................*......................................................................................... - // mul v24.4s, v11.4s, v4.s[0] // ...............................................................*..................................................................................................|............................................................................................*......................................................................... 
- // sqrdmulh v11.4s, v11.4s, v4.s[1] // .................................................*................................................................................................................|..............................................................................*....................................................................................... - // mla v24.4s, v11.4s, v29.4s // ...................................................................*..............................................................................................|................................................................................................*..................................................................... - // sub v11.4s, v10.4s, v24.4s // ...........................................................................*......................................................................................|........................................................................................................*............................................................. - // add v10.4s, v10.4s, v24.4s // ..................................................................................*...............................................................................|...............................................................................................................*...................................................... - // mul v24.4s, v13.4s, v4.s[2] // ..................................................................................................................................................................|.............*........................................................................................................................................................ 
- // sqrdmulh v13.4s, v13.4s, v4.s[3] // ..................................................................................................................................................................|..............*....................................................................................................................................................... - // mla v24.4s, v13.4s, v29.4s // ..................................................................................................................................................................|..........................*........................................................................................................................................... - // sub v13.4s, v12.4s, v24.4s // ..................*...............................................................................................................................................|...............................................*...................................................................................................................... - // add v12.4s, v12.4s, v24.4s // .....................*............................................................................................................................................|..................................................*................................................................................................................... - // mul v24.4s, v15.4s, v5.s[0] // ..................................................................................................................................................................|.......................*.............................................................................................................................................. 
- // sqrdmulh v15.4s, v15.4s, v5.s[1] // ..................................................................................................................................................................|......................*............................................................................................................................................... - // mla v24.4s, v15.4s, v29.4s // .*................................................................................................................................................................|..............................*....................................................................................................................................... - // sub v15.4s, v14.4s, v24.4s // ..........*.......................................................................................................................................................|.......................................*.............................................................................................................................. - // add v14.4s, v14.4s, v24.4s // ......*...........................................................................................................................................................|...................................*.................................................................................................................................. - // mul v24.4s, v17.4s, v5.s[2] // ........................*.........................................................................................................................................|.....................................................*................................................................................................................ 
- // sqrdmulh v17.4s, v17.4s, v5.s[3] // ....................*.............................................................................................................................................|.................................................*.................................................................................................................... - // mla v24.4s, v17.4s, v29.4s // ....................................*.............................................................................................................................|.................................................................*.................................................................................................... - // sub v17.4s, v16.4s, v24.4s // ...................................................*..............................................................................................................|................................................................................*..................................................................................... - // add v16.4s, v16.4s, v24.4s // ...........................................................*......................................................................................................|........................................................................................*............................................................................. - // mul v24.4s, v19.4s, v6.s[0] // ...............................................................................*..................................................................................|............................................................................................................*......................................................... 
- // sqrdmulh v19.4s, v19.4s, v6.s[1] // .......................................................................*..........................................................................................|....................................................................................................*................................................................. - // mla v24.4s, v19.4s, v29.4s // ........................................................................................*.........................................................................|.....................................................................................................................*................................................ - // sub v19.4s, v18.4s, v24.4s // ...............................................................................................*..................................................................|............................................................................................................................*......................................... - // add v18.4s, v18.4s, v24.4s // ..................................................................................................*...............................................................|...............................................................................................................................*...................................... - // mul v24.4s, v21.4s, v6.s[2] // ...*..............................................................................................................................................................|................................*..................................................................................................................................... 
- // sqrdmulh v21.4s, v21.4s, v6.s[3] // ..................................................................................................................................................................|.........................*............................................................................................................................................ - // mla v24.4s, v21.4s, v29.4s // .................*................................................................................................................................................|..............................................*....................................................................................................................... - // sub v21.4s, v20.4s, v24.4s // .................................................................................................................................*................................|..............................................................................................................................................................*....... - // add v20.4s, v20.4s, v24.4s // .....................................................*............................................................................................................|..................................................................................*................................................................................... - // mul v24.4s, v23.4s, v7.s[0] // ...................................*..............................................................................................................................|................................................................*..................................................................................................... 
- // sqrdmulh v23.4s, v23.4s, v7.s[1] // .........................................*........................................................................................................................|......................................................................*............................................................................................... - // mla v24.4s, v23.4s, v29.4s // ........................................................*.........................................................................................................|.....................................................................................*................................................................................ - // sub v23.4s, v22.4s, v24.4s // ...................................................................................................*..............................................................|................................................................................................................................*..................................... - // add v22.4s, v22.4s, v24.4s // .....................................................................................................*............................................................|..................................................................................................................................*................................... - // str q8, [x0], #(16) // ......................................................*...........................................................................................................|...................................................................................*.................................................................................. 
- // str q9, [x0, #(-16 + 1*(512/8))] // ..................................................*...............................................................................................................|...............................................................................*...................................................................................... - // str q10, [x0, #(-16 + 2*(512/8))] // .........................................................................................*........................................................................|......................................................................................................................*............................................... - // str q11, [x0, #(-16 + 3*(512/8))] // ....................................................................................*.............................................................................|.................................................................................................................*.................................................... - // str q12, [x0, #(-16 + 4*(512/8))] // ..........................*.......................................................................................................................................|.......................................................*.............................................................................................................. - // str q13, [x0, #(-16 + 5*(512/8))] // .......................*..........................................................................................................................................|....................................................*................................................................................................................. 
- // str q14, [x0, #(-16 + 6*(512/8))] // ........*.........................................................................................................................................................|.....................................*................................................................................................................................ - // str q15, [x0, #(-16 + 7*(512/8))] // ..............*...................................................................................................................................................|...........................................*.......................................................................................................................... - // str q16, [x0, #(-16 + 8*(512/8))] // ..................................................................*...............................................................................................|...............................................................................................*...................................................................... - // str q17, [x0, #(-16 + 9*(512/8))] // ................................................................*.................................................................................................|.............................................................................................*........................................................................ - // str q18, [x0, #(-16 + 10*(512/8))] // .........................................................................................................*........................................................|......................................................................................................................................*............................... 
- // str q19, [x0, #(-16 + 11*(512/8))] // ....................................................................................................*.............................................................|.................................................................................................................................*.................................... - // str q20, [x0, #(-16 + 12*(512/8))] // ............................................................*.....................................................................................................|.........................................................................................*............................................................................ - // str q21, [x0, #(-16 + 13*(512/8))] // ........................................................................................................................................*.........................|.....................................................................................................................................................................* - // str q22, [x0, #(-16 + 14*(512/8))] // ..............................................................................................................*...................................................|...........................................................................................................................................*.......................... - // str q23, [x0, #(-16 + 15*(512/8))] // ..................................................................................................................*...............................................|...............................................................................................................................................*...................... 
+ // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // add v17.4S, v15.4S, v17.4S // .*.............................................................................................................................................................................................. + // mla v26.4S, v30.4S, v29.4S // *............................................................................................................................................................................................... + // sqrdmulh v25.4S, v31.4S, v3.S[3] // ...................*............................................................................................................................................................................ + // sub v11.4S, v27.4S, v11.4S // ...*............................................................................................................................................................................................ + // add v18.4S, v17.4S, v10.4S // .......................................*........................................................................................................................................................ + // sub v19.4S, v19.4S, v21.4S // ............*................................................................................................................................................................................... 
+ // add v15.4S, v23.4S, v26.4S // .......*........................................................................................................................................................................................ + // sub v27.4S, v23.4S, v26.4S // ......*......................................................................................................................................................................................... + // mul v23.4S, v18.4S, v1.S[2] // ...............................................*................................................................................................................................................ + // sqrdmulh v26.4S, v18.4S, v1.S[3] // ..............................................*................................................................................................................................................. + // mul v21.4S, v19.4S, v4.S[0] // ..........................................*..................................................................................................................................................... + // sqrdmulh v19.4S, v19.4S, v4.S[1] // ...........................................*.................................................................................................................................................... + // mul v18.4S, v31.4S, v3.S[2] // ............................*................................................................................................................................................................... + // add v30.4S, v22.4S, v9.4S // .......................*........................................................................................................................................................................ 
+ // mla v23.4S, v26.4S, v29.4S // .....................................................*.......................................................................................................................................... + // mul v26.4S, v24.4S, v2.S[0] // .............*.................................................................................................................................................................................. + // mla v21.4S, v19.4S, v29.4S // ................................................*............................................................................................................................................... + // sqrdmulh v24.4S, v24.4S, v2.S[1] // ..............*................................................................................................................................................................................. + // mla v18.4S, v25.4S, v29.4S // .................................*.............................................................................................................................................................. + // sub v31.4S, v22.4S, v9.4S // .................*.............................................................................................................................................................................. + // sub v9.4S, v30.4S, v23.4S // ........................................................................*....................................................................................................................... + // add v23.4S, v30.4S, v23.4S // ............................................................*................................................................................................................................... 
+ // mla v26.4S, v24.4S, v29.4S // .....................................*.......................................................................................................................................................... + // add v19.4S, v20.4S, v13.4S // ...............*................................................................................................................................................................................ + // add v22.4S, v9.4S, v21.4S // .................................................................................*.............................................................................................................. + // sub v30.4S, v23.4S, v18.4S // .......................................................................................*........................................................................................................ + // sqrdmulh v24.4S, v14.4S, v6.S[1] // ..........*..................................................................................................................................................................................... + // mul v13.4S, v14.4S, v6.S[0] // ........*....................................................................................................................................................................................... + // sub v25.4S, v11.4S, v26.4S // ......................................*......................................................................................................................................................... + // str q30, [x0, #64] // ..............................................................................................*................................................................................................. 
+ // sub v10.4S, v17.4S, v10.4S // .....*.......................................................................................................................................................................................... + // str q22, [x0, #128] // .....................................................................................*.......................................................................................................... + // add v20.4S, v11.4S, v26.4S // .................................................................................................*.............................................................................................. + // add v30.4S, v8.4S, v12.4S // ....*........................................................................................................................................................................................... + // mul v14.4S, v25.4S, v5.S[0] // .............................................*.................................................................................................................................................. + // mla v13.4S, v24.4S, v29.4S // .........................*...................................................................................................................................................................... + // sqrdmulh v24.4S, v10.4S, v2.S[1] // .........*...................................................................................................................................................................................... + // mul v26.4S, v10.4S, v2.S[0] // ...........*.................................................................................................................................................................................... 
+ // mul v11.4S, v30.4S, v2.S[2] // ....................*........................................................................................................................................................................... + // sqrdmulh v17.4S, v30.4S, v2.S[3] // ..................*............................................................................................................................................................................. + // mul v10.4S, v20.4S, v4.S[2] // .........................................................................................................................*...................................................................... + // sqrdmulh v20.4S, v20.4S, v4.S[3] // ...................................................................................................................*............................................................................ + // sqrdmulh v30.4S, v25.4S, v5.S[1] // ..................................................................................*............................................................................................................. + // mla v26.4S, v24.4S, v29.4S // ................*............................................................................................................................................................................... + // mla v11.4S, v17.4S, v29.4S // ..........................*..................................................................................................................................................................... + // add v17.4S, v28.4S, v16.4S // ...........................*.................................................................................................................................................................... 
+ // sub v9.4S, v9.4S, v21.4S // ...................................................................................*............................................................................................................ + // mla v10.4S, v20.4S, v29.4S // ................................................................................................................................*............................................................... + // mla v14.4S, v30.4S, v29.4S // ............................................................................................*................................................................................................... + // add v25.4S, v31.4S, v26.4S // ......................*......................................................................................................................................................................... + // str q9, [x0, #192] // ...........................................................................................*.................................................................................................... + // sub v30.4S, v31.4S, v26.4S // .....................*.......................................................................................................................................................................... + // sub v21.4S, v17.4S, v11.4S // ...............................*................................................................................................................................................................ + // add v22.4S, v25.4S, v10.4S // .........................................................................................................................................*...................................................... 
+ // sub v12.4S, v8.4S, v12.4S // ..*............................................................................................................................................................................................. + // sub v24.4S, v21.4S, v13.4S // ............................................*................................................................................................................................................... + // add v9.4S, v30.4S, v14.4S // ....................................................................................................*........................................................................................... + // str q22, [x0, #256] // .......................................................................................................................................................*........................................ + // sub v31.4S, v28.4S, v16.4S // ........................*....................................................................................................................................................................... + // sqrdmulh v26.4S, v19.4S, v5.S[3] // .............................*.................................................................................................................................................................. + // str q9, [x0, #384] // ...........................................................................................................*.................................................................................... + // mul v19.4S, v19.4S, v5.S[2] // ....................................*........................................................................................................................................................... 
+ // mul v16.4S, v12.4S, v3.S[0] // ................................*............................................................................................................................................................... + // str q24, [x0, #704] // ...................................................*............................................................................................................................................ + // sqrdmulh v24.4S, v15.4S, v6.S[3] // .........................................................*...................................................................................................................................... + // sqrdmulh v20.4S, v12.4S, v3.S[1] // .......................................................*........................................................................................................................................ + // add v9.4S, v23.4S, v18.4S // ..................................................................*............................................................................................................................. + // mul v22.4S, v15.4S, v6.S[2] // ....................................................*........................................................................................................................................... + // mla v19.4S, v26.4S, v29.4S // .................................................*.............................................................................................................................................. + // mul v18.4S, v27.4S, v7.S[0] // .......................................................................*........................................................................................................................ 
+ // mla v16.4S, v20.4S, v29.4S // ...........................................................*.................................................................................................................................... + // str q9, [x0], #(16) // .........................................................................*...................................................................................................................... + // add v8.4S, v17.4S, v11.4S // ..............................*................................................................................................................................................................. + // mla v22.4S, v24.4S, v29.4S // ................................................................*............................................................................................................................... + // sub v26.4S, v25.4S, v10.4S // ......................................................................................................................................*......................................................... + // sqrdmulh v11.4S, v27.4S, v7.S[1] // ......................................................................*......................................................................................................................... + // sub v15.4S, v8.4S, v19.4S // .................................................................*.............................................................................................................................. + // str q26, [x0, #304] // ............................................................................................................................................*................................................... 
+ // add v12.4S, v31.4S, v16.4S // ...................................................................*............................................................................................................................ + // add v10.4S, v8.4S, v19.4S // ........................................................*....................................................................................................................................... + // add v8.4S, v21.4S, v13.4S // ...................................*............................................................................................................................................................ + // str q15, [x0, #560] // .....................................................................*.......................................................................................................................... + // add v9.4S, v12.4S, v22.4S // .........................................................................................................*...................................................................................... + // mla v18.4S, v11.4S, v29.4S // ................................................................................*............................................................................................................... + // str q10, [x0, #496] // ...............................................................*................................................................................................................................ + // sub v27.4S, v12.4S, v22.4S // ..............................................................................................................*................................................................................. 
+ // str q8, [x0, #624] // ........................................*....................................................................................................................................................... + // sub v10.4S, v31.4S, v16.4S // ....................................................................*........................................................................................................................... + // str q9, [x0, #752] // ...............................................................................................................*................................................................................ + // sub v26.4S, v30.4S, v14.4S // ..........................................................................................................*..................................................................................... + // add v13.4S, v10.4S, v18.4S // .............................................................................................*.................................................................................................. + // str q27, [x0, #816] // ......................................................................................................................*......................................................................... + // sub v9.4S, v10.4S, v18.4S // ........................................................................................................*....................................................................................... + // str q26, [x0, #432] // ............................................................................................................................*................................................................... 
+ // str q13, [x0, #880] // ................................................................................................*............................................................................................... + // str q9, [x0, #944] // .................................................................................................................*.............................................................................. + // ldr q23, [x0, #896] // ..................................................................................................................*............................................................................. + // ldr q9, [x0, #960] // ..............................................................*................................................................................................................................. + // ldr q21, [x0, #192] // ..............................................................................................................................*................................................................. + // ldr q25, [x0, #768] // ..................................*............................................................................................................................................................. + // ldr q27, [x0, #576] // .............................................................................*.................................................................................................................. + // ldr q19, [x0, #64] // ..........................................................................................*..................................................................................................... + // ldr q22, [x0, #704] // .............................................................................................................*.................................................................................. 
+ // sqrdmulh v17.4S, v9.4S, v0.S[1] // ..............................................................................*................................................................................................................. + // ldr q16, [x0, #384] // .............................................................................................................................................*.................................................. + // mul v9.4S, v9.4S, v0.S[0] // ...............................................................................*................................................................................................................ + // sqrdmulh v10.4S, v23.4S, v0.S[1] // ...................................................................................................................................*............................................................ + // mul v28.4S, v23.4S, v0.S[0] // ...............................................................................................................................*................................................................ + // ldr q11, [x0, #256] // ...................................................................................................*............................................................................................ + // ldr q15, [x0, #128] // ......................................................................................................................................................................*......................... + // mul v23.4S, v27.4S, v0.S[0] // .........................................................................................*...................................................................................................... 
+ // sqrdmulh v18.4S, v25.4S, v0.S[1] // ..................................................*............................................................................................................................................. + // ldr q13, [x0, #640] // .........................................*...................................................................................................................................................... + // mla v9.4S, v17.4S, v29.4S // ......................................................................................*......................................................................................................... + // sqrdmulh v12.4S, v27.4S, v0.S[1] // ........................................................................................*....................................................................................................... + // ldr q27, [x0, #512] // ......................................................................................................*......................................................................................... + // ldr q26, [x0, #832] // ..........................................................*..................................................................................................................................... + // mla v28.4S, v10.4S, v29.4S // ...............................................................................................................................................*................................................ + // mul v24.4S, v22.4S, v0.S[0] // ...........................................................................................................................*.................................................................... 
+ // sqrdmulh v10.4S, v22.4S, v0.S[1] // .............................................................................................................................*.................................................................. + // ldr q31, [x0, #320] // .......................................................................................................*........................................................................................ + // sqrdmulh v14.4S, v13.4S, v0.S[1] // ...........................................................................*.................................................................................................................... + // mul v17.4S, v13.4S, v0.S[0] // ..........................................................................*..................................................................................................................... + // mla v23.4S, v12.4S, v29.4S // ..........................................................................................................................*..................................................................... + // mul v8.4S, v25.4S, v0.S[0] // ......................................................*......................................................................................................................................... + // ldr q25, [x0, #448] // ....................................................................................*........................................................................................................... + // add v30.4S, v16.4S, v28.4S // .............................................................................................................................................................................*.................. 
+ // mla v24.4S, v10.4S, v29.4S // ....................................................................................................................................*........................................................... + // sqrdmulh v22.4S, v26.4S, v0.S[1] // ..................................................................................................*............................................................................................. + // mul v26.4S, v26.4S, v0.S[0] // ............................................................................*................................................................................................................... + // sqrdmulh v13.4S, v30.4S, v0.S[3] // .................................................................................................................................................................................*.............. + // sub v20.4S, v16.4S, v28.4S // .....................................................................................................................................................*.......................................... + // mul v10.4S, v30.4S, v0.S[2] // ..................................................................................................................................................................................*............. + // sub v28.4S, v25.4S, v9.4S // ...............................................................................................*................................................................................................ + // mul v12.4S, v20.4S, v1.S[0] // ...........................................................................................................................................................*.................................... 
+ // add v25.4S, v25.4S, v9.4S // .....................................................................................................*.......................................................................................... + // mla v8.4S, v18.4S, v29.4S // .............................................................*.................................................................................................................................. + // sqrdmulh v9.4S, v20.4S, v1.S[1] // .............................................................................................................................................................*.................................. + // sub v20.4S, v21.4S, v24.4S // .................................................................................................................................................*.............................................. + // mla v10.4S, v13.4S, v29.4S // ........................................................................................................................................................................................*....... + // sqrdmulh v13.4S, v25.4S, v0.S[3] // .....................................................................................................................................*.......................................................... + // mla v26.4S, v22.4S, v29.4S // ............................................................................................................*................................................................................... + // sub v16.4S, v11.4S, v8.4S // ................................................................................................................*............................................................................... 
+ // mul v25.4S, v25.4S, v0.S[2] // ...................................................................................................................................................*............................................ + // mul v30.4S, v27.4S, v0.S[0] // .........................................................................................................................................................*...................................... + // sqrdmulh v27.4S, v27.4S, v0.S[1] // ................................................................................................................................................................*............................... + // add v18.4S, v31.4S, v26.4S // .................................................................................................................................*.............................................................. + // add v8.4S, v11.4S, v8.4S // .....................................................................................................................*.......................................................................... + // mla v25.4S, v13.4S, v29.4S // ..........................................................................................................................................................*..................................... + // sub v13.4S, v31.4S, v26.4S // .......................................................................................................................*........................................................................ + // mul v11.4S, v18.4S, v0.S[2] // .......................................................................................................................................*........................................................ 
+ // mla v30.4S, v27.4S, v29.4S // .........................................................................................................................................................................*...................... + // add v24.4S, v21.4S, v24.4S // ............................................................................................................................................................*................................... + // sqrdmulh v31.4S, v18.4S, v0.S[3] // ..........................................................................................................................................*..................................................... + // mul v26.4S, v28.4S, v1.S[0] // ........................................................................................................................*....................................................................... + // sqrdmulh v22.4S, v28.4S, v1.S[1] // ....................................................................................................................*........................................................................... + // ldr q28, [x0, #0] // .......................................................................................................................................................................................*........ + // add v27.4S, v24.4S, v25.4S // .................................................................................................................................................................*.............................. + // sqrdmulh v21.4S, v16.4S, v1.S[1] // ..............................................................................................................................................................................*................. 
+ // mul v18.4S, v13.4S, v1.S[0] // ....................................................................................................................................................*........................................... + // mul v16.4S, v16.4S, v1.S[0] // ................................................................................................................................................................................*............... + // sqrdmulh v13.4S, v13.4S, v1.S[1] // ...............................................................................................................................................................*................................ + // mla v26.4S, v22.4S, v29.4S // ..............................................................................................................................................*................................................. + // add v22.4S, v28.4S, v30.4S // ..............................................................................................................................................................................................*. + // sub v28.4S, v28.4S, v30.4S // ...............................................................................................................................................................................................* + // mla v12.4S, v9.4S, v29.4S // ..................................................................................................................................................................*............................. + // mla v16.4S, v21.4S, v29.4S // ......................................................................................................................................................................................*......... 
+ // mla v11.4S, v31.4S, v29.4S // ..................................................................................................................................................*............................................. + // sub v31.4S, v20.4S, v26.4S // ...............................................................................................................................................................................*................ + // mla v18.4S, v13.4S, v29.4S // .......................................................................................................................................................................*........................ + // add v9.4S, v20.4S, v26.4S // ......................................................................................................................................................*......................................... + // mul v26.4S, v31.4S, v3.S[0] // ...................................................................................................................................................................................*............ + // sqrdmulh v30.4S, v31.4S, v3.S[1] // ..........................................................................................................................................................................................*..... + // mul v21.4S, v27.4S, v1.S[2] // ....................................................................................................................................................................*........................... + // sqrdmulh v31.4S, v27.4S, v1.S[3] // .....................................................................................................................................................................*.......................... 
+ // add v27.4S, v19.4S, v23.4S // ..................................................................................................................................*............................................................. + // sub v23.4S, v19.4S, v23.4S // ........................................................................................................................................*....................................................... + // mul v13.4S, v9.4S, v2.S[2] // ..........................................................................................................................................................................*..................... + // sqrdmulh v9.4S, v9.4S, v2.S[3] // ............................................................................................................................................................................*................... + // add v20.4S, v23.4S, v18.4S // .....................................................................................................................................................................................*.......... + // mla v21.4S, v31.4S, v29.4S // ...........................................................................................................................................................................*.................... + // add v19.4S, v27.4S, v11.4S // ........................................................................................................................................................*....................................... + // mla v17.4S, v14.4S, v29.4S // ................................................................................................................................................*............................................... 
+ // sub v23.4S, v23.4S, v18.4S // .........................................................................................................................................................................................*...... + // mla v13.4S, v9.4S, v29.4S // ....................................................................................................................................................................................*........... + // mul v9.4S, v8.4S, v0.S[2] // ..............................................................................................................................................................*................................. + // sqrdmulh v18.4S, v8.4S, v0.S[3] // ...........................................................................................................................................*.................................................... + // add v31.4S, v19.4S, v21.4S // .............................................................................................................................................................................................*.. + // sub v8.4S, v15.4S, v17.4S // ............................................................................................................................................................................................*... + // sub v14.4S, v20.4S, v13.4S // ...........................................................................................................................................................................................*.... + // sub v24.4S, v24.4S, v25.4S // ...................................................................................................................................................................*............................ 
+ // mla v9.4S, v18.4S, v29.4S // ........................................................................................................................................................................*....................... sub count, count, #1 cbnz count, layer1234_start - mul v16.4S, v24.4S, v0.S[2] // ........................................................*....................................................................................................................................... - add v25.4S, v27.4S, v31.4S // ...................................................................................................................*............................................................................ - sqrdmulh v8.4S, v30.4S, v1.S[1] // .............................................................................*.................................................................................................................. - sqrdmulh v24.4S, v24.4S, v0.S[3] // .........................................................*...................................................................................................................................... - mul v22.4S, v25.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v9.4S, v25.4S, v4.S[3] // ...................................................................................................................................................*............................................ - mul v30.4S, v30.4S, v1.S[0] // ............................................................................*................................................................................................................... 
- mul v25.4S, v26.4S, v2.S[2] // .........................................................................................................................*...................................................................... - mla v16.4S, v24.4S, v29.4S // ..........................................................*..................................................................................................................................... - sqrdmulh v24.4S, v26.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - mla v22.4S, v9.4S, v29.4S // ....................................................................................................................................................*........................................... - sub v12.4S, v19.4S, v12.4S // .............................*.................................................................................................................................................................. - add v9.4S, v23.4S, v11.4S // .......................................................................................................................................*........................................................ - mla v30.4S, v8.4S, v29.4S // ..............................................................................*................................................................................................................. - add v8.4S, v15.4S, v16.4S // ............................................................*................................................................................................................................... 
- mla v25.4S, v24.4S, v29.4S // ...........................................................................................................................*.................................................................... - sqrdmulh v19.4S, v9.4S, v6.S[3] // .......................................................................................................................................................................*........................ - add v26.4S, v12.4S, v13.4S // ..........................................................................................*..................................................................................................... - sub v24.4S, v12.4S, v13.4S // .........................................................................................*...................................................................................................... - mul v13.4S, v9.4S, v6.S[2] // ......................................................................................................................................................................*......................... - sqrdmulh v9.4S, v26.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - mul v26.4S, v26.4S, v2.S[2] // ....................................................................................................................*........................................................................... - mul v12.4S, v24.4S, v3.S[0] // ..............................................................................................................................*................................................................. 
- sqrdmulh v24.4S, v24.4S, v3.S[1] // ...............................................................................................................................*................................................................ - mla v13.4S, v19.4S, v29.4S // ........................................................................................................................................................................*....................... - sub v19.4S, v23.4S, v11.4S // ......................................................................................................................................*......................................................... - mla v26.4S, v9.4S, v29.4S // ......................................................................................................................*......................................................................... - mul v23.4S, v14.4S, v4.S[0] // .............................................................................................................................................*.................................................. - mla v12.4S, v24.4S, v29.4S // ................................................................................................................................*............................................................... - sqrdmulh v24.4S, v14.4S, v4.S[1] // ..............................................................................................................................................*................................................. - sub v14.4S, v20.4S, v30.4S // ...............................................................................*................................................................................................................ 
- sqrdmulh v9.4S, v19.4S, v7.S[1] // ............................................................................................................................................................................*................... - add v11.4S, v20.4S, v30.4S // ................................................................................*............................................................................................................... - mul v20.4S, v19.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - mla v17.4S, v18.4S, v29.4S // ..................................................................................................*............................................................................................. - add v19.4S, v14.4S, v12.4S // ..................................................................................................................................*............................................................. - sub v15.4S, v15.4S, v16.4S // ...........................................................*.................................................................................................................................... - add v16.4S, v11.4S, v26.4S // ........................................................................................................................*....................................................................... - mla v20.4S, v9.4S, v29.4S // .............................................................................................................................................................................*.................. 
- sub v18.4S, v14.4S, v12.4S // .................................................................................................................................*.............................................................. - mla v23.4S, v24.4S, v29.4S // ...............................................................................................................................................*................................................ - add v24.4S, v19.4S, v13.4S // ..........................................................................................................................................................................*..................... - sub v14.4S, v8.4S, v17.4S // ...................................................................................................*............................................................................................ - sqrdmulh v9.4S, v28.4S, v2.S[1] // ...........................................................................................................*.................................................................................... - mul v30.4S, v28.4S, v2.S[0] // ..........................................................................................................*..................................................................................... - add v28.4S, v18.4S, v20.4S // ...............................................................................................................................................................................*................ - str q24, [x0, #768] // ............................................................................................................................................................................................*... 
- add v24.4S, v14.4S, v23.4S // .................................................................................................................................................*.............................................. - sub v13.4S, v19.4S, v13.4S // .........................................................................................................................................................................*...................... - add v12.4S, v10.4S, v25.4S // .............................................................................................................................*.................................................................. - sub v20.4S, v18.4S, v20.4S // ..............................................................................................................................................................................*................. - mla v30.4S, v9.4S, v29.4S // ............................................................................................................*................................................................................... - str q13, [x0, #832] // .............................................................................................................................................................................................*.. - sub v18.4S, v27.4S, v31.4S // ..................................................................................................................*............................................................................. 
- str q20, [x0, #960] // ...............................................................................................................................................................................................* - sqrdmulh v20.4S, v12.4S, v5.S[3] // .............................................................................................................................................................*.................................. - mul v27.4S, v12.4S, v5.S[2] // ............................................................................................................................................................*................................... - sub v13.4S, v10.4S, v25.4S // ............................................................................................................................*................................................................... - mul v12.4S, v18.4S, v5.S[0] // .......................................................................................................................................................*........................................ - add v25.4S, v15.4S, v30.4S // ..............................................................................................................*................................................................................. - sqrdmulh v31.4S, v18.4S, v5.S[1] // ........................................................................................................................................................*....................................... - sqrdmulh v9.4S, v13.4S, v6.S[1] // ..................................................................................................................................................................*............................. 
- mla v27.4S, v20.4S, v29.4S // ..............................................................................................................................................................*................................. - sub v19.4S, v15.4S, v30.4S // .............................................................................................................*.................................................................................. - add v30.4S, v25.4S, v22.4S // ......................................................................................................................................................*......................................... - mla v12.4S, v31.4S, v29.4S // .........................................................................................................................................................*...................................... - mul v18.4S, v21.4S, v3.S[2] // ........................................................................................................................................*....................................................... - str q28, [x0, #896] // ..............................................................................................................................................................................................*. - sub v10.4S, v16.4S, v27.4S // ...............................................................................................................................................................*................................ - str q30, [x0, #256] // ....................................................................................................................................................................................*........... 
- mul v30.4S, v13.4S, v6.S[0] // .................................................................................................................................................................*.............................. - sqrdmulh v21.4S, v21.4S, v3.S[3] // .........................................................................................................................................*...................................................... - sub v15.4S, v14.4S, v23.4S // ................................................................................................................................................*............................................... - str q24, [x0, #128] // ..................................................................................................................................................................................*............. - sub v20.4S, v25.4S, v22.4S // .....................................................................................................................................................*.......................................... - sub v31.4S, v19.4S, v12.4S // ..........................................................................................................................................................*..................................... - str q10, [x0, #576] // .........................................................................................................................................................................................*...... - add v28.4S, v19.4S, v12.4S // ...........................................................................................................................................................*.................................... 
- str q15, [x0, #192] // ...................................................................................................................................................................................*............ - mla v30.4S, v9.4S, v29.4S // ...................................................................................................................................................................*............................ - str q31, [x0, #448] // .......................................................................................................................................................................................*........ - add v16.4S, v16.4S, v27.4S // ................................................................................................................................................................*............................... - sub v9.4S, v11.4S, v26.4S // .......................................................................................................................*........................................................................ - str q28, [x0, #384] // ......................................................................................................................................................................................*......... - add v25.4S, v8.4S, v17.4S // ....................................................................................................*........................................................................................... - mla v18.4S, v21.4S, v29.4S // ..........................................................................................................................................*..................................................... - str q16, [x0, #512] // ........................................................................................................................................................................................*....... 
- sub v28.4S, v9.4S, v30.4S // ....................................................................................................................................................................*........................... - str q20, [x0, #320] // .....................................................................................................................................................................................*.......... - add v24.4S, v9.4S, v30.4S // .....................................................................................................................................................................*.......................... - add v16.4S, v25.4S, v18.4S // ............................................................................................................................................*................................................... - sub v15.4S, v25.4S, v18.4S // ...........................................................................................................................................*.................................................... - str q28, [x0, #704] // ...........................................................................................................................................................................................*.... - str q24, [x0, #640] // ..........................................................................................................................................................................................*..... - str q15, [x0, #64] // .................................................................................................................................................................................*.............. - str q16, [x0], #(16) // ................................................................................................................................................................................*............... 
+ add v17.4S, v15.4S, v17.4S // ..............................*................................................................................................................................................................. + mla v26.4S, v30.4S, v29.4S // .....................................................................................................................................*.......................................................... + sqrdmulh v25.4S, v31.4S, v3.S[3] // ........................................................................................................................................*....................................................... + sub v11.4S, v27.4S, v11.4S // ................................................................*............................................................................................................................... + add v18.4S, v17.4S, v10.4S // ......................................................................*......................................................................................................................... + sub v19.4S, v19.4S, v21.4S // ........................................................................................................*....................................................................................... + add v15.4S, v23.4S, v26.4S // .......................................................................................................................................*........................................................ + sub v27.4S, v23.4S, v26.4S // ......................................................................................................................................*......................................................... 
+ mul v23.4S, v18.4S, v1.S[2] // .................................................................................................*.............................................................................................. + sqrdmulh v26.4S, v18.4S, v1.S[3] // ................................................................................................*............................................................................................... + mul v21.4S, v19.4S, v4.S[0] // ..............................................................................................................................................*................................................. + sqrdmulh v19.4S, v19.4S, v4.S[1] // .............................................................................................................................................*.................................................. + mul v18.4S, v31.4S, v3.S[2] // .........................................................................................................................................*...................................................... + add v30.4S, v22.4S, v9.4S // ............................................................*................................................................................................................................... + mla v23.4S, v26.4S, v29.4S // ..................................................................................................*............................................................................................. + mul v26.4S, v24.4S, v2.S[0] // ................................................................................................................*............................................................................... 
+ mla v21.4S, v19.4S, v29.4S // ...............................................................................................................................................*................................................ + sqrdmulh v24.4S, v24.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mla v18.4S, v25.4S, v29.4S // ..........................................................................................................................................*..................................................... + sub v31.4S, v22.4S, v9.4S // ...........................................................*.................................................................................................................................... + sub v9.4S, v30.4S, v23.4S // ...................................................................................................*............................................................................................ + add v23.4S, v30.4S, v23.4S // ....................................................................................................*........................................................................................... + mla v26.4S, v24.4S, v29.4S // .................................................................................................................*.............................................................................. + add v19.4S, v20.4S, v13.4S // .............................................................................................................................*.................................................................. 
+ add v22.4S, v9.4S, v21.4S // .................................................................................................................................................*.............................................. + sub v30.4S, v23.4S, v18.4S // ...........................................................................................................................................*.................................................... + sqrdmulh v24.4S, v14.4S, v6.S[1] // .................................................................................................................................................................*.............................. + mul v13.4S, v14.4S, v6.S[0] // ..................................................................................................................................................................*............................. + sub v25.4S, v11.4S, v26.4S // ..................................................................................................................*............................................................................. + str q30, [x0, #64] // .................................................................................................................................................................................*.............. + sub v10.4S, v17.4S, v10.4S // .....................................................................*.......................................................................................................................... + str q22, [x0, #128] // ..................................................................................................................................................................................*............. 
+ add v20.4S, v11.4S, v26.4S // ...................................................................................................................*............................................................................ + add v30.4S, v8.4S, v12.4S // ..........................................................................................*..................................................................................................... + mul v14.4S, v25.4S, v5.S[0] // ........................................................................................................................................................*....................................... + mla v13.4S, v24.4S, v29.4S // ...................................................................................................................................................................*............................ + sqrdmulh v24.4S, v10.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mul v26.4S, v10.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + mul v11.4S, v30.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sqrdmulh v17.4S, v30.4S, v2.S[3] // ....................................................................................................................*........................................................................... 
+ mul v10.4S, v20.4S, v4.S[2] // ...................................................................................................................................................*............................................ + sqrdmulh v20.4S, v20.4S, v4.S[3] // ..................................................................................................................................................*............................................. + sqrdmulh v30.4S, v25.4S, v5.S[1] // .......................................................................................................................................................*........................................ + mla v26.4S, v24.4S, v29.4S // ............................................................................................................*................................................................................... + mla v11.4S, v17.4S, v29.4S // ......................................................................................................................*......................................................................... + add v17.4S, v28.4S, v16.4S // ................................................................................*............................................................................................................... + sub v9.4S, v9.4S, v21.4S // ................................................................................................................................................*............................................... + mla v10.4S, v20.4S, v29.4S // ....................................................................................................................................................*........................................... 
+ mla v14.4S, v30.4S, v29.4S // .........................................................................................................................................................*...................................... + add v25.4S, v31.4S, v26.4S // ..............................................................................................................*................................................................................. + str q9, [x0, #192] // ...................................................................................................................................................................................*............ + sub v30.4S, v31.4S, v26.4S // .............................................................................................................*.................................................................................. + sub v21.4S, v17.4S, v11.4S // .......................................................................................................................*........................................................................ + add v22.4S, v25.4S, v10.4S // ......................................................................................................................................................*......................................... + sub v12.4S, v8.4S, v12.4S // .........................................................................................*...................................................................................................... + sub v24.4S, v21.4S, v13.4S // ....................................................................................................................................................................*........................... 
+ add v9.4S, v30.4S, v14.4S // ...........................................................................................................................................................*.................................... + str q22, [x0, #256] // ....................................................................................................................................................................................*........... + sub v31.4S, v28.4S, v16.4S // ...............................................................................*................................................................................................................ + sqrdmulh v26.4S, v19.4S, v5.S[3] // ............................................................................................................................................................*................................... + str q9, [x0, #384] // ......................................................................................................................................................................................*......... + mul v19.4S, v19.4S, v5.S[2] // .............................................................................................................................................................*.................................. + mul v16.4S, v12.4S, v3.S[0] // ...............................................................................................................................*................................................................ + str q24, [x0, #704] // ...........................................................................................................................................................................................*.... 
+ sqrdmulh v24.4S, v15.4S, v6.S[3] // ......................................................................................................................................................................*......................... + sqrdmulh v20.4S, v12.4S, v3.S[1] // ..............................................................................................................................*................................................................. + add v9.4S, v23.4S, v18.4S // ............................................................................................................................................*................................................... + mul v22.4S, v15.4S, v6.S[2] // .......................................................................................................................................................................*........................ + mla v19.4S, v26.4S, v29.4S // ..............................................................................................................................................................*................................. + mul v18.4S, v27.4S, v7.S[0] // ............................................................................................................................................................................*................... + mla v16.4S, v20.4S, v29.4S // ................................................................................................................................*............................................................... + str q9, [x0], #(16) // ................................................................................................................................................................................*............... 
+ add v8.4S, v17.4S, v11.4S // ........................................................................................................................*....................................................................... + mla v22.4S, v24.4S, v29.4S // ........................................................................................................................................................................*....................... + sub v26.4S, v25.4S, v10.4S // .....................................................................................................................................................*.......................................... + sqrdmulh v11.4S, v27.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + sub v15.4S, v8.4S, v19.4S // ...............................................................................................................................................................*................................ + str q26, [x0, #304] // .....................................................................................................................................................................................*.......... + add v12.4S, v31.4S, v16.4S // ..................................................................................................................................*............................................................. + add v10.4S, v8.4S, v19.4S // ................................................................................................................................................................*............................... 
+ add v8.4S, v21.4S, v13.4S // .....................................................................................................................................................................*.......................... + str q15, [x0, #560] // .........................................................................................................................................................................................*...... + add v9.4S, v12.4S, v22.4S // ..........................................................................................................................................................................*..................... + mla v18.4S, v11.4S, v29.4S // .............................................................................................................................................................................*.................. + str q10, [x0, #496] // ........................................................................................................................................................................................*....... + sub v27.4S, v12.4S, v22.4S // .........................................................................................................................................................................*...................... + str q8, [x0, #624] // ..........................................................................................................................................................................................*..... + sub v10.4S, v31.4S, v16.4S // .................................................................................................................................*.............................................................. + str q9, [x0, #752] // ............................................................................................................................................................................................*... 
+ sub v26.4S, v30.4S, v14.4S // ..........................................................................................................................................................*..................................... + add v13.4S, v10.4S, v18.4S // ...............................................................................................................................................................................*................ + str q27, [x0, #816] // .............................................................................................................................................................................................*.. + sub v9.4S, v10.4S, v18.4S // ..............................................................................................................................................................................*................. + str q26, [x0, #432] // .......................................................................................................................................................................................*........ + str q13, [x0, #880] // ..............................................................................................................................................................................................*. + str q9, [x0, #944] // ...............................................................................................................................................................................................* restore inp, STACK0 mov count, #16 @@ -901,474 +883,510 @@ layer1234_start: qform_root3_tw .req q7 .p2align 2 - ldr q17, [x3], #16 // .*.............................. - ldr q9, [x1, #48] // *............................... - // gap // ................................ - // gap // ................................ - ldr q28, [x3], #8 // ...........*.................... - ldr q7, [x1, #32] // ..*............................. 
- // gap // ................................ - // gap // ................................ - ldr q24, [x1, #16] // ....*........................... - // gap // ................................ - // gap // ................................ - // gap // ................................ - ldr q4, [x4, #16] // ...*............................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - sqrdmulh v12.4S, v9.4S, v17.S[1] // .......*........................ - mul v9.4S, v9.4S, v17.S[0] // ......*......................... - ldr q23, [x1, #0] // ............*................... - // gap // ................................ - mul v27.4S, v7.4S, v17.S[0] // ........*....................... - sqrdmulh v20.4S, v7.4S, v17.S[1] // .........*...................... - ldr q1, [x4], #(6*16) // .....*.......................... - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - mla v9.4S, v12.4S, v29.4S // ..........*..................... - // gap // ................................ - // gap // ................................ - // gap // ................................ - mla v27.4S, v20.4S, v29.4S // .............*.................. - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - sub v6.4S, v24.4S, v9.4S // ...................*............ - add v24.4S, v24.4S, v9.4S // ..............*................. - // gap // ................................ - // gap // ................................ - add v7.4S, v23.4S, v27.4S // ...............*................ 
- sub v13.4S, v23.4S, v27.4S // ................*............... - // gap // ................................ - // gap // ................................ - mul v23.4S, v24.4S, v17.S[2] // ..................*............. - sqrdmulh v19.4S, v24.4S, v17.S[3] // .................*.............. - // gap // ................................ - // gap // ................................ - mul v9.4S, v6.4S, v28.S[0] // .....................*.......... - sqrdmulh v12.4S, v6.4S, v28.S[1] // ....................*........... - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - mla v23.4S, v19.4S, v29.4S // ......................*......... - // gap // ................................ - // gap // ................................ - // gap // ................................ - mla v9.4S, v12.4S, v29.4S // .......................*........ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - add v19.4S, v7.4S, v23.4S // .........................*...... - sub v11.4S, v7.4S, v23.4S // ........................*....... - // gap // ................................ - // gap // ................................ - sub v0.4S, v13.4S, v9.4S // ...........................*.... - add v21.4S, v13.4S, v9.4S // ..........................*..... - // gap // ................................ - // gap // ................................ - trn2 v16.4S, v19.4S, v11.4S // .............................*.. - trn1 v25.4S, v19.4S, v11.4S // ............................*... - // gap // ................................ 
- // gap // ................................ - trn2 v28.4S, v21.4S, v0.4S // ..............................*. - trn1 v11.4S, v21.4S, v0.4S // ...............................* - // gap // ................................ - // gap // ................................ + // Instructions: 39 + // Expected cycles: 26 + // Expected IPC: 1.50 + // + // Wall time: 0.81s + // User time: 0.81s + // + // --------- original position ----------> + // 0 25 + // |------------------------|------------- + ldr q17, [x3], #16 // *...................................... + ldr q0, [x1, #48] // .*..................................... + // gap // ....................................... + // gap // ....................................... + ldr q9, [x1, #32] // ....*.................................. + // gap // ....................................... + // gap // ....................................... + ldr q22, [x1, #0] // ..*.................................... + ldr q21, [x3], #8 // ...*................................... + ldr q31, [x4, #32] // ...............*....................... + // gap // ....................................... + // gap // ....................................... + ldr q11, [x4, #80] // ......................*................ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mul v27.4S, v0.4S, v17.S[0] // ......*................................ + sqrdmulh v13.4S, v0.4S, v17.S[1] // .....*................................. + ldr q28, [x4, #48] // ..................................*.... + // gap // ....................................... + mul v3.4S, v9.4S, v17.S[0] // .........*............................. + sqrdmulh v9.4S, v9.4S, v17.S[1] // ........*.............................. + ldr q24, [x4, #64] // ................*...................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + // gap // ....................................... + ldr q1, [x1, #16] // .......*............................... + mla v27.4S, v13.4S, v29.4S // ..........*............................ + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mla v3.4S, v9.4S, v29.4S // ...........*........................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + add v0.4S, v1.4S, v27.4S // ..............*........................ + sub v18.4S, v1.4S, v27.4S // .............*......................... + // gap // ....................................... + // gap // ....................................... + sub v13.4S, v22.4S, v3.4S // ...........................*........... + add v1.4S, v22.4S, v3.4S // ........................*.............. + // gap // ....................................... + // gap // ....................................... + mul v23.4S, v0.4S, v17.S[2] // .................*..................... + sqrdmulh v0.4S, v0.4S, v17.S[3] // ..................*.................... + // gap // ....................................... + // gap // ....................................... + sqrdmulh v9.4S, v18.4S, v21.S[1] // .....................*................. + mul v5.4S, v18.4S, v21.S[0] // ....................*.................. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ // gap // ....................................... + mla v23.4S, v0.4S, v29.4S // .......................*............... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mla v5.4S, v9.4S, v29.4S // .........................*............. + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + sub v27.4S, v1.4S, v23.4S // ............................*.......... + add v10.4S, v1.4S, v23.4S // ..........................*............ + // gap // ....................................... + // gap // ....................................... + sub v9.4S, v13.4S, v5.4S // ..............................*........ + add v0.4S, v13.4S, v5.4S // .............................*......... + // gap // ....................................... + // gap // ....................................... + trn1 v20.4S, v10.4S, v27.4S // ......................................* + trn2 v16.4S, v10.4S, v27.4S // ...............................*....... + // gap // ....................................... + // gap // ....................................... + trn1 v25.4S, v0.4S, v9.4S // .....................................*. + trn2 v6.4S, v0.4S, v9.4S // ................................*...... + ldr q12, [x4], #(6*16) // ............*.......................... + ldr q8, [x4, #-80] // ...................*................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... 
+ trn2 v9.2D, v16.2D, v6.2D // .................................*..... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + // gap // ....................................... + mul v13.4S, v9.4S, v12.4S // ....................................*.. + sqrdmulh v1.4S, v9.4S, v8.4S // ...................................*... + // gap // ....................................... + // gap // ....................................... - // original source code - // ldr q23, [x1, #48] // .*.............................. - // ldr q13, [x3], #16 // *............................... - // ldr q21, [x1, #32] // ...*............................ - // ldr q4, [x4, #16] // .....*.......................... - // ldr q22, [x1, #16] // ....*........................... - // ldr q1, [x4], #(6*16) // ...........*.................... - // mul v30.4S, v23.4S, v13.S[0] // .......*........................ - // sqrdmulh v18.4S, v23.4S, v13.S[1] // ......*......................... - // mul v9.4S, v21.4S, v13.S[0] // .........*...................... - // sqrdmulh v27.4S, v21.4S, v13.S[1] // ..........*..................... - // mla v30.4S, v18.4S, v29.4S // ............*................... - // ldr q10, [x3], #8 // ..*............................. - // ldr q18, [x1, #0] // ........*....................... - // mla v9.4S, v27.4S, v29.4S // .............*.................. - // add v31.4S, v22.4S, v30.4S // ...............*................ - // add v21.4S, v18.4S, v9.4S // ................*............... - // sub v20.4S, v18.4S, v9.4S // .................*.............. - // sqrdmulh v14.4S, v31.4S, v13.S[3] // ...................*............ - // mul v31.4S, v31.4S, v13.S[2] // ..................*............. 
- // sub v3.4S, v22.4S, v30.4S // ..............*................. - // sqrdmulh v23.4S, v3.4S, v10.S[1] // .....................*.......... - // mul v3.4S, v3.4S, v10.S[0] // ....................*........... - // mla v31.4S, v14.4S, v29.4S // ......................*......... - // mla v3.4S, v23.4S, v29.4S // .......................*........ - // sub v30.4S, v21.4S, v31.4S // .........................*...... - // add v18.4S, v21.4S, v31.4S // ........................*....... - // add v5.4S, v20.4S, v3.4S // ...........................*.... - // sub v20.4S, v20.4S, v3.4S // ..........................*..... - // trn1 v25.4S, v18.4S, v30.4S // .............................*.. - // trn2 v16.4S, v18.4S, v30.4S // ............................*... - // trn2 v28.4S, v5.4S, v20.4S // ..............................*. - // trn1 v11.4S, v5.4S, v20.4S // ...............................* + // ------------ new position ------------> + // 0 25 + // |------------------------|------------- + // ldr q26, [x3], #16 // *...................................... + // ldr q17, [x1, #48] // .*..................................... + // ldr q22, [x1, #0] // ...*................................... + // ldr q21, [x3], #8 // ....*.................................. + // ldr q1, [x1, #32] // ..*.................................... + // sqrdmulh v2.4S, v17.4S, v26.S[1] // ........*.............................. + // mul v9.4S, v17.4S, v26.S[0] // .......*............................... + // ldr q17, [x1, #16] // .............*......................... + // sqrdmulh v14.4S, v1.4S, v26.S[1] // ...........*........................... + // mul v1.4S, v1.4S, v26.S[0] // ..........*............................ + // mla v9.4S, v2.4S, v29.4S // ..............*........................ + // mla v1.4S, v14.4S, v29.4S // ...............*....................... + // ldr q12, [x4], #(6*16) // ..................................*.... + // sub v15.4S, v17.4S, v9.4S // .................*..................... 
+ // add v2.4S, v17.4S, v9.4S // ................*...................... + // ldr q31, [x4, #-64] // .....*................................. + // ldr q24, [x4, #-32] // ............*.......................... + // mul v0.4S, v2.4S, v26.S[2] // ....................*.................. + // sqrdmulh v16.4S, v2.4S, v26.S[3] // .....................*................. + // ldr q8, [x4, #-80] // ...................................*... + // mul v9.4S, v15.4S, v21.S[0] // .......................*............... + // sqrdmulh v2.4S, v15.4S, v21.S[1] // ......................*................ + // ldr q11, [x4, #-16] // ......*................................ + // mla v0.4S, v16.4S, v29.4S // ........................*.............. + // add v16.4S, v22.4S, v1.4S // ...................*................... + // mla v9.4S, v2.4S, v29.4S // .........................*............. + // add v19.4S, v16.4S, v0.4S // ...........................*........... + // sub v14.4S, v22.4S, v1.4S // ..................*.................... + // sub v15.4S, v16.4S, v0.4S // ..........................*............ + // add v30.4S, v14.4S, v9.4S // .............................*......... + // sub v2.4S, v14.4S, v9.4S // ............................*.......... + // trn2 v16.4S, v19.4S, v15.4S // ...............................*....... + // trn2 v6.4S, v30.4S, v2.4S // .................................*..... + // trn2 v4.2D, v16.2D, v6.2D // ....................................*.. + // ldr q28, [x4, #-48] // .........*............................. + // sqrdmulh v1.4S, v4.4S, v8.4S // ......................................* + // mul v13.4S, v4.4S, v12.4S // .....................................*. + // trn1 v25.4S, v30.4S, v2.4S // ................................*...... + // trn1 v20.4S, v19.4S, v15.4S // ..............................*........ sub count, count, #1 layer5678_start: - // gap // ........................................................................ 
- ldr q23, [x1, #112] // ...e.................................................................... - trn2 v12.2D, v16.2D, v28.2D // ...............................*........................................ - ldr q13, [x3], #16 // ....e................................................................... - ldr q21, [x1, #96] // ..e..................................................................... - trn2 v20.2D, v25.2D, v11.2D // ..............................*......................................... - // gap // ........................................................................ - trn1 v2.2D, v16.2D, v28.2D // .................................*...................................... - sqrdmulh v7.4S, v12.4S, v4.4S // ..............................................*......................... - // gap // ........................................................................ - // gap // ........................................................................ - mul v5.4S, v12.4S, v1.4S // .............................................*.......................... - sqrdmulh v8.4S, v20.4S, v4.4S // .........................................*.............................. - ldr q4, [x4, #16] // ...................................e.................................... - mul v3.4S, v20.4S, v1.4S // ........................................*............................... - ldr q22, [x1, #80] // .e...................................................................... - ldr q12, [x4, #-16] // .......................................*................................ - ldr q1, [x4], #(6*16) // ..................................e..................................... - mul v30.4S, v23.4S, v13.S[0] // ...........e............................................................ - sqrdmulh v18.4S, v23.4S, v13.S[1] // ............e........................................................... - ldr q6, [x4, #-128] // ......................................*................................. 
- mla v5.4S, v7.4S, v29.S[0] // ...............................................*........................ - mul v9.4S, v21.4S, v13.S[0] // ......e................................................................. - // gap // ........................................................................ - mla v3.4S, v8.4S, v29.S[0] // ..........................................*............................. - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v27.4S, v21.4S, v13.S[1] // .......e................................................................ - trn1 v16.2D, v25.2D, v11.2D // ................................*....................................... - ldr q14, [x4, #-160] // ....................................*................................... - mla v30.4S, v18.4S, v29.4S // .............e.......................................................... - ldr q31, [x4, #-144] // .....................................*.................................. - add v28.4S, v2.4S, v5.4S // .................................................*...................... - ldr q10, [x3], #8 // .....e.................................................................. - sub v20.4S, v2.4S, v5.4S // ................................................*....................... - ldr q18, [x1, #64] // e....................................................................... - // gap // ........................................................................ - mla v9.4S, v27.4S, v29.4S // ........e............................................................... - // gap // ........................................................................ - add v2.4S, v16.4S, v3.4S // ............................................*........................... - sqrdmulh v19.4S, v20.4S, v12.4S // ........................................................*............... 
- mul v6.4S, v20.4S, v6.4S // .......................................................*................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v5.4S, v28.4S, v14.4S // ..................................................*..................... - sqrdmulh v27.4S, v28.4S, v31.4S // ...................................................*.................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v31.4S, v22.4S, v30.4S // ...............e........................................................ - // gap // ........................................................................ - add v21.4S, v18.4S, v9.4S // ..........e............................................................. - mla v6.4S, v19.4S, v29.S[0] // .........................................................*.............. - // gap // ........................................................................ - // gap // ........................................................................ - sub v20.4S, v18.4S, v9.4S // .........e.............................................................. - sub v16.4S, v16.4S, v3.4S // ...........................................*............................ - mla v5.4S, v27.4S, v29.S[0] // ....................................................*................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v14.4S, v31.4S, v13.S[3] // .................e...................................................... 
- mul v31.4S, v31.4S, v13.S[2] // ................e....................................................... - // gap // ........................................................................ - add v17.4S, v16.4S, v6.4S // ...........................................................*............ - // gap // ........................................................................ - // gap // ........................................................................ - sub v7.4S, v16.4S, v6.4S // ..........................................................*............. - // gap // ........................................................................ - sub v3.4S, v22.4S, v30.4S // ..............e......................................................... - // gap // ........................................................................ - sub v8.4S, v2.4S, v5.4S // .....................................................*.................. - add v6.4S, v2.4S, v5.4S // ......................................................*................. - // gap // ........................................................................ - trn1 v2.4S, v17.4S, v7.4S // ..............................................................*......... - // gap // ........................................................................ - sqrdmulh v23.4S, v3.4S, v10.S[1] // ......................e................................................. - // gap // ........................................................................ - mul v3.4S, v3.4S, v10.S[0] // .....................e.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v26.4S, v6.4S, v8.4S // ............................................................*........... 
- mla v31.4S, v14.4S, v29.4S // ..................e..................................................... - trn2 v15.4S, v6.4S, v8.4S // .............................................................*.......... - trn2 v12.4S, v17.4S, v7.4S // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v13.2D, v26.2D, v2.2D // ..................................................................*..... - // gap // ........................................................................ - mla v3.4S, v23.4S, v29.4S // .......................e................................................ - // gap // ........................................................................ - sub v30.4S, v21.4S, v31.4S // ...................e.................................................... - // gap // ........................................................................ - trn1 v0.2D, v15.2D, v12.2D // ...................................................................*.... - // gap // ........................................................................ - trn2 v9.2D, v26.2D, v2.2D // ................................................................*....... - // gap // ........................................................................ - add v18.4S, v21.4S, v31.4S // ....................e................................................... - // gap // ........................................................................ - add v5.4S, v20.4S, v3.4S // .........................e.............................................. - sub v20.4S, v20.4S, v3.4S // ........................e............................................... - str q0, [x1, #16] // .....................................................................*.. 
- // gap // ........................................................................ - str q9, [x1, #32] // ......................................................................*. - trn2 v21.2D, v15.2D, v12.2D // .................................................................*...... - // gap // ........................................................................ - trn1 v25.4S, v18.4S, v30.4S // ..........................e............................................. - // gap // ........................................................................ - trn2 v16.4S, v18.4S, v30.4S // ...........................e............................................ - trn2 v28.4S, v5.4S, v20.4S // .............................e.......................................... - str q13, [x1], #64 // ....................................................................*... - str q21, [x1, #-16] // .......................................................................* - trn1 v11.4S, v5.4S, v20.4S // ............................e........................................... - // gap // ........................................................................ - // gap // ........................................................................ + // Instructions: 72 + // Expected cycles: 29 + // Expected IPC: 2.48 + // + // Wall time: 75.02s + // User time: 75.02s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + mla v13.4S, v1.4S, v29.4S // ...............................................*........................ + ldr q26, [x3], #16 // ....e................................................................... + ldr q17, [x1, #112] // ...e.................................................................... + trn2 v5.2D, v20.2D, v25.2D // ..............................*......................................... 
+ ldr q22, [x1, #64] // e....................................................................... + trn1 v7.2D, v16.2D, v6.2D // .................................*...................................... + ldr q21, [x3], #8 // .....e.................................................................. + trn1 v4.2D, v20.2D, v25.2D // ................................*....................................... + // gap // ........................................................................ + sqrdmulh v8.4S, v5.4S, v8.4S // ........................................*............................... + mul v0.4S, v5.4S, v12.4S // .........................................*.............................. + ldr q1, [x1, #96] // ..e..................................................................... + sub v20.4S, v7.4S, v13.4S // ................................................*....................... + add v5.4S, v7.4S, v13.4S // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v2.4S, v17.4S, v26.S[1] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v9.4S, v17.4S, v26.S[0] // ............e........................................................... + ldr q17, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + sqrdmulh v7.4S, v20.4S, v11.4S // .......................................................*................ + mul v10.4S, v20.4S, v24.4S // ........................................................*............... 
+ // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.4S, v1.4S, v26.S[1] // ......e................................................................. + mul v1.4S, v1.4S, v26.S[0] // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mla v9.4S, v2.4S, v29.4S // .............e.......................................................... + mul v3.4S, v5.4S, v31.4S // ...................................................*.................... + mla v10.4S, v7.4S, v29.4S // .........................................................*.............. + // gap // ........................................................................ + mla v0.4S, v8.4S, v29.4S // ..........................................*............................. + // gap // ........................................................................ + // gap // ........................................................................ + mla v1.4S, v14.4S, v29.4S // ........e............................................................... + sqrdmulh v25.4S, v5.4S, v28.4S // ..................................................*..................... + ldr q12, [x4], #(6*16) // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v15.4S, v17.4S, v9.4S // ..............e......................................................... + add v2.4S, v17.4S, v9.4S // ...............e........................................................ 
+ sub v18.4S, v4.4S, v0.4S // ...........................................*............................ + add v5.4S, v4.4S, v0.4S // ............................................*........................... + ldr q31, [x4, #-64] // ....................................e................................... + ldr q24, [x4, #-32] // ......................................e................................. + // gap // ........................................................................ + mul v0.4S, v2.4S, v26.S[2] // .................e...................................................... + sqrdmulh v16.4S, v2.4S, v26.S[3] // ................e....................................................... + // gap // ........................................................................ + add v28.4S, v18.4S, v10.4S // ...........................................................*............ + // gap // ........................................................................ + // gap // ........................................................................ + mla v3.4S, v25.4S, v29.4S // ....................................................*................... + ldr q8, [x4, #-80] // ...................................e.................................... + // gap // ........................................................................ + mul v9.4S, v15.4S, v21.S[0] // ......................e................................................. + sqrdmulh v2.4S, v15.4S, v21.S[1] // .....................e.................................................. + // gap // ........................................................................ + ldr q11, [x4, #-16] // .......................................e................................ + sub v25.4S, v18.4S, v10.4S // ..........................................................*............. + mla v0.4S, v16.4S, v29.4S // ..................e..................................................... 
+ add v18.4S, v5.4S, v3.4S // ......................................................*................. + // gap // ........................................................................ + // gap // ........................................................................ + add v16.4S, v22.4S, v1.4S // ..........e............................................................. + sub v6.4S, v5.4S, v3.4S // .....................................................*.................. + // gap // ........................................................................ + // gap // ........................................................................ + mla v9.4S, v2.4S, v29.4S // .......................e................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v19.4S, v16.4S, v0.4S // ....................e................................................... + sub v14.4S, v22.4S, v1.4S // .........e.............................................................. + sub v15.4S, v16.4S, v0.4S // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v20.4S, v18.4S, v6.4S // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + add v30.4S, v14.4S, v9.4S // .........................e.............................................. + sub v2.4S, v14.4S, v9.4S // ........................e............................................... + trn2 v18.4S, v18.4S, v6.4S // .............................................................*.......... 
+ // gap // ........................................................................ + // gap // ........................................................................ + trn1 v7.4S, v28.4S, v25.4S // ..............................................................*......... + trn2 v16.4S, v19.4S, v15.4S // ...........................e............................................ + // gap // ........................................................................ + trn2 v6.4S, v30.4S, v2.4S // .............................e.......................................... + // gap // ........................................................................ + trn2 v5.4S, v28.4S, v25.4S // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v1.2D, v20.2D, v7.2D // ..................................................................*..... + trn2 v4.2D, v16.2D, v6.2D // ...............................e........................................ + // gap // ........................................................................ + ldr q28, [x4, #-48] // .....................................e.................................. + trn2 v7.2D, v20.2D, v7.2D // ................................................................*....... + trn2 v25.2D, v18.2D, v5.2D // .................................................................*...... + str q1, [x1], #64 // ....................................................................*... + trn1 v5.2D, v18.2D, v5.2D // ...................................................................*.... + // gap // ........................................................................ + sqrdmulh v1.4S, v4.4S, v8.4S // .............................................e.......................... 
+ mul v13.4S, v4.4S, v12.4S // ..............................................e......................... + // gap // ........................................................................ + str q7, [x1, #-32] // ......................................................................*. + str q25, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + trn1 v25.4S, v30.4S, v2.4S // ............................e........................................... + trn1 v20.4S, v19.4S, v15.4S // ..........................e............................................. + str q5, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ - // original source code - // ldr q8, [x1, #(16*0)] // ............................e...........................................|...........................e.......................................... - // ldr q9, [x1, #(16*1)] // ...........e............................................................|..........e........................................................... - // ldr q10, [x1, #(16*2)] // ...e....................................................................|..e................................................................... - // ldr q11, [x1, #(16*3)] // e.......................................................................e...................................................................... - // ldr q0, [x3], #16 // ..e.....................................................................|.e.................................................................... 
- // ldr q1, [x3], #8 // ..........................e.............................................|.........................e............................................ - // mul v24.4s, v10.4s, v0.s[0] // ..................e.....................................................|.................e.................................................... - // sqrdmulh v10.4s, v10.4s, v0.s[1] // ....................e...................................................|...................e.................................................. - // mla v24.4s, v10.4s, v29.4s // .............................e..........................................|............................e......................................... - // sub v10.4s, v8.4s, v24.4s // ......................................e.................................|.....................................e................................ - // add v8.4s, v8.4s, v24.4s // ....................................e...................................|...................................e.................................. - // mul v24.4s, v11.4s, v0.s[0] // ..............e.........................................................|.............e........................................................ - // sqrdmulh v11.4s, v11.4s, v0.s[1] // ...............e........................................................|..............e....................................................... - // mla v24.4s, v11.4s, v29.4s // .......................e................................................|......................e............................................... - // sub v11.4s, v9.4s, v24.4s // .............................................e..........................|............................................e......................... - // add v9.4s, v9.4s, v24.4s // ...................................e....................................|..................................e................................... 
- // mul v24.4s, v9.4s, v0.s[2] // ..........................................e.............................|.........................................e............................ - // sqrdmulh v9.4s, v9.4s, v0.s[3] // .........................................e..............................|........................................e............................. - // mla v24.4s, v9.4s, v29.4s // ....................................................e...................|...................................................e.................. - // sub v9.4s, v8.4s, v24.4s // .........................................................e..............|........................................................e............. - // add v8.4s, v8.4s, v24.4s // ............................................................e...........|...........................................................e.......... - // mul v24.4s, v11.4s, v1.s[0] // ..................................................e.....................|.................................................e.................... - // sqrdmulh v11.4s, v11.4s, v1.s[1] // .................................................e......................|................................................e..................... - // mla v24.4s, v11.4s, v29.4s // ........................................................e...............|.......................................................e.............. - // sub v11.4s, v10.4s, v24.4s // ..............................................................e.........|.............................................................e........ - // add v10.4s, v10.4s, v24.4s // .............................................................e..........|............................................................e......... - // trn1 v25.4s, v8.4s, v9.4s // ..................................................................e.....|.................................................................e.... 
- // trn2 v26.4s, v8.4s, v9.4s // ...................................................................e....|..................................................................e... - // trn1 v27.4s, v10.4s, v11.4s // .......................................................................e|...................................................................... - // trn2 v28.4s, v10.4s, v11.4s // ....................................................................e...|...................................................................e.. - // trn2 v10.2d, v25.2d, v27.2d // ....*...................................................................|...*.................................................................. - // trn2 v11.2d, v26.2d, v28.2d // .*......................................................................|*..................................................................... - // trn1 v8.2d, v25.2d, v27.2d // .....................*..................................................|....................*................................................. - // trn1 v9.2d, v26.2d, v28.2d // .....*..................................................................|....*................................................................. - // ldr q0, [x4], #(6*16) // .............e..........................................................|............e......................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // .........e..............................................................|........e............................................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ......................*.................................................|.....................*................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ........................*...............................................|.......................*.............................................. 
- // ldr q2, [x4, #(-6*16 + 4*16)] // ................*.......................................................|...............*...................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ............*...........................................................|...........*.......................................................... - // mul v24.4s, v10.4s, v0.4s // ..........*.............................................................|.........*............................................................ - // sqrdmulh v10.4s, v10.4s, v4.4s // ........*...............................................................|.......*.............................................................. - // mla v24.4s, v10.4s, v29.s[0] // ...................*....................................................|..................*................................................... - // sub v10.4s, v8.4s, v24.4s // .......................................*................................|......................................*............................... - // add v8.4s, v8.4s, v24.4s // ..............................*.........................................|.............................*........................................ - // mul v24.4s, v11.4s, v0.4s // .......*................................................................|......*............................................................... - // sqrdmulh v11.4s, v11.4s, v4.4s // ......*.................................................................|.....*................................................................ - // mla v24.4s, v11.4s, v29.s[0] // .................*......................................................|................*..................................................... - // sub v11.4s, v9.4s, v24.4s // ...........................*............................................|..........................*........................................... 
- // add v9.4s, v9.4s, v24.4s // .........................*..............................................|........................*............................................. - // mul v24.4s, v9.4s, v1.4s // .................................*......................................|................................*..................................... - // sqrdmulh v9.4s, v9.4s, v5.4s // ..................................*.....................................|.................................*.................................... - // mla v24.4s, v9.4s, v29.s[0] // ........................................*...............................|.......................................*.............................. - // sub v9.4s, v8.4s, v24.4s // ..............................................*.........................|.............................................*........................ - // add v8.4s, v8.4s, v24.4s // ...............................................*........................|..............................................*....................... - // mul v24.4s, v11.4s, v2.4s // ................................*.......................................|...............................*...................................... - // sqrdmulh v11.4s, v11.4s, v6.4s // ...............................*........................................|..............................*....................................... - // mla v24.4s, v11.4s, v29.s[0] // .....................................*..................................|....................................*................................. - // sub v11.4s, v10.4s, v24.4s // ............................................*...........................|...........................................*.......................... - // add v10.4s, v10.4s, v24.4s // ...........................................*............................|..........................................*........................... 
- // trn1 v25.4s, v8.4s, v9.4s // ...................................................*....................|..................................................*................... - // trn2 v26.4s, v8.4s, v9.4s // .....................................................*..................|....................................................*................. - // trn1 v27.4s, v10.4s, v11.4s // ................................................*.......................|...............................................*...................... - // trn2 v28.4s, v10.4s, v11.4s // ......................................................*.................|.....................................................*................ - // trn2 v10.2d, v25.2d, v27.2d // ...........................................................*............|..........................................................*........... - // trn2 v11.2d, v26.2d, v28.2d // .................................................................*......|................................................................*..... - // trn1 v8.2d, v25.2d, v27.2d // .......................................................*................|......................................................*............... - // trn1 v9.2d, v26.2d, v28.2d // ..........................................................*.............|.........................................................*............ - // str q8, [x1], #64 // .....................................................................*..|....................................................................*. - // str q9, [x1, #(-(64) + 16*1)] // ...............................................................*........|..............................................................*....... - // str q10, [x1, #(-(64) + 16*2)] // ................................................................*.......|...............................................................*...... 
- // str q11, [x1, #(-(64) + 16*3)] // ......................................................................*.|.....................................................................* + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // ...e...................................................................'...~................................................................... + // ldr q9, [x1, #(16*1)] // ..............e........................................................'..............~........................................................ + // ldr q10, [x1, #(16*2)] // .........e.............................................................'.........~............................................................. + // ldr q11, [x1, #(16*3)] // .e.....................................................................'.~..................................................................... + // ldr q0, [x3], #16 // e......................................................................'~...................................................................... + // ldr q1, [x3], #8 // .....e.................................................................'.....~................................................................. + // sqrdmulh v27.4s, v10.4s, v0.s[1] // .................e.....................................................'.................~..................................................... + // mul v24.4s, v10.4s, v0.s[0] // ..................e....................................................'..................~.................................................... 
+ // mla v24.4s, v27.4s, v29.4s // .......................e...............................................'.......................~............................................... + // sub v10.4s, v8.4s, v24.4s // ...............................................e.......................'...............................................~....................... + // add v8.4s, v8.4s, v24.4s // ...........................................e...........................'...........................................~........................... + // sqrdmulh v27.4s, v11.4s, v0.s[1] // ............e..........................................................'............~.......................................................... + // mul v24.4s, v11.4s, v0.s[0] // .............e.........................................................'.............~......................................................... + // mla v24.4s, v27.4s, v29.4s // ...................e...................................................'...................~................................................... + // sub v11.4s, v9.4s, v24.4s // ..........................e............................................'..........................~............................................ + // add v9.4s, v9.4s, v24.4s // ...........................e...........................................'...........................~........................................... + // sqrdmulh v27.4s, v9.4s, v0.s[3] // .................................e.....................................'.................................~..................................... + // mul v24.4s, v9.4s, v0.s[2] // ................................e......................................'................................~...................................... + // mla v24.4s, v27.4s, v29.4s // .........................................e.............................'.........................................~............................. 
+ // sub v9.4s, v8.4s, v24.4s // ................................................e......................'................................................~...................... + // add v8.4s, v8.4s, v24.4s // ..............................................e........................'..............................................~........................ + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ......................................e................................'......................................~................................ + // mul v24.4s, v11.4s, v1.s[0] // .....................................e.................................'.....................................~................................. + // mla v24.4s, v27.4s, v29.4s // .............................................e.........................'.............................................~......................... + // sub v11.4s, v10.4s, v24.4s // ...................................................e...................'...................................................~................... + // add v10.4s, v10.4s, v24.4s // ..................................................e....................'..................................................~.................... + // trn1 v25.4s, v8.4s, v9.4s // .....................................................................e.'.....................................................................~. + // trn2 v26.4s, v8.4s, v9.4s // ......................................................e................'......................................................~................ + // trn1 v27.4s, v10.4s, v11.4s // ....................................................................e..'....................................................................~.. + // trn2 v28.4s, v10.4s, v11.4s // .......................................................e...............'.......................................................~............... 
+ // trn2 v10.2d, v25.2d, v27.2d // ..~....................................................................'..*.................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................................e............'..........................................................~............ + // trn1 v8.2d, v25.2d, v27.2d // ......~................................................................'......*................................................................ + // trn1 v9.2d, v26.2d, v28.2d // ....~..................................................................'....*.................................................................. + // ldr q0, [ x4], #(6*16) // .........................e.............................................'.........................~............................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ....................................e..................................'....................................~.................................. + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..............................e........................................'..............................~........................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ...........................................................e...........'...........................................................~........... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ...............................e.......................................'...............................~....................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .......................................e...............................'.......................................~............................... + // sqrdmulh v27.4s, v10.4s, v4.4s // .......~...............................................................'.......*............................................................... 
+ // mul v24.4s, v10.4s, v0.4s // ........~..............................................................'........*.............................................................. + // mla v24.4s, v27.4s, v29.4s // ......................~................................................'......................*................................................ + // sub v10.4s, v8.4s, v24.4s // ............................~..........................................'............................*.......................................... + // add v8.4s, v8.4s, v24.4s // .............................~.........................................'.............................*......................................... + // sqrdmulh v27.4s, v11.4s, v4.4s // ................................................................e......'................................................................~...... + // mul v24.4s, v11.4s, v0.4s // .................................................................e.....'.................................................................~..... + // mla v24.4s, v27.4s, v29.4s // .......................................................................*....................................................................... + // sub v11.4s, v9.4s, v24.4s // ..........~............................................................'..........*............................................................ + // add v9.4s, v9.4s, v24.4s // ...........~...........................................................'...........*........................................................... + // sqrdmulh v27.4s, v9.4s, v5.4s // ........................~..............................................'........................*.............................................. + // mul v24.4s, v9.4s, v1.4s // ....................~..................................................'....................*.................................................. 
+ // mla v24.4s, v27.4s, v29.4s // ...................................~...................................'...................................*................................... + // sub v9.4s, v8.4s, v24.4s // ............................................~..........................'............................................*.......................... + // add v8.4s, v8.4s, v24.4s // ..........................................~............................'..........................................*............................ + // sqrdmulh v27.4s, v11.4s, v6.4s // ...............~.......................................................'...............*....................................................... + // mul v24.4s, v11.4s, v2.4s // ................~......................................................'................*...................................................... + // mla v24.4s, v27.4s, v29.4s // .....................~.................................................'.....................*................................................. + // sub v11.4s, v10.4s, v24.4s // ........................................~..............................'........................................*.............................. + // add v10.4s, v10.4s, v24.4s // ..................................~....................................'..................................*.................................... + // trn1 v25.4s, v8.4s, v9.4s // .................................................~.....................'.................................................*..................... + // trn2 v26.4s, v8.4s, v9.4s // ....................................................~..................'....................................................*.................. + // trn1 v27.4s, v10.4s, v11.4s // .....................................................~.................'.....................................................*................. 
+ // trn2 v28.4s, v10.4s, v11.4s // ........................................................~..............'........................................................*.............. + // trn2 v10.2d, v25.2d, v27.2d // ............................................................~..........'............................................................*.......... + // trn2 v11.2d, v26.2d, v28.2d // .............................................................~.........'.............................................................*......... + // trn1 v8.2d, v25.2d, v27.2d // .........................................................~.............'.........................................................*............. + // trn1 v9.2d, v26.2d, v28.2d // ...............................................................~.......'...............................................................*....... + // str q8, [x1], #64 // ..............................................................~........'..............................................................*........ + // str q9, [x1, #(-(64) + 16*1)] // ......................................................................~'......................................................................* + // str q10, [x1, #(-(64) + 16*2)] // ..................................................................~....'..................................................................*.... + // str q11, [x1, #(-(64) + 16*3)] // ...................................................................~...'...................................................................*... sub count, count, #1 cbnz count, layer5678_start - ldr q6, [x4, #-16] // .......*................................ - ldr q18, [x4, #-64] // ............*........................... - trn2 v30.2D, v16.2D, v28.2D // *....................................... - // gap // ........................................ - trn2 v9.2D, v25.2D, v11.2D // .*...................................... 
- ldr q21, [x4, #-48] // .............*.......................... - // gap // ........................................ - // gap // ........................................ - sqrdmulh v31.4S, v30.4S, v4.4S // ...*.................................... - mul v3.4S, v30.4S, v1.4S // ....*................................... - ldr q14, [x4, #-32] // ........*............................... - // gap // ........................................ - mul v5.4S, v9.4S, v1.4S // ......*................................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn1 v30.2D, v16.2D, v28.2D // ..*..................................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mla v3.4S, v31.4S, v29.S[0] // .........*.............................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqrdmulh v31.4S, v9.4S, v4.4S // .....*.................................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v22.4S, v30.4S, v3.4S // ...............*........................ - add v3.4S, v30.4S, v3.4S // ..............*......................... - // gap // ........................................ - // gap // ........................................ - trn1 v30.2D, v25.2D, v11.2D // ...........*............................ - // gap // ........................................ 
- // gap // ........................................ - mla v5.4S, v31.4S, v29.S[0] // ..........*............................. - sqrdmulh v31.4S, v22.4S, v6.4S // .................*...................... - mul v6.4S, v22.4S, v14.4S // ..................*..................... - // gap // ........................................ - // gap // ........................................ - mul v20.4S, v3.4S, v18.4S // ...................*.................... - sqrdmulh v3.4S, v3.4S, v21.4S // ....................*................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mla v6.4S, v31.4S, v29.S[0] // .....................*.................. - sub v31.4S, v30.4S, v5.4S // ......................*................. - // gap // ........................................ - // gap // ........................................ - add v30.4S, v30.4S, v5.4S // ................*....................... - mla v20.4S, v3.4S, v29.S[0] // .......................*................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v3.4S, v31.4S, v6.4S // .........................*.............. - add v31.4S, v31.4S, v6.4S // ........................*............... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v5.4S, v30.4S, v20.4S // ...........................*............ 
- sub v30.4S, v30.4S, v20.4S // ..........................*............. - trn2 v6.4S, v31.4S, v3.4S // ...............................*........ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn2 v22.4S, v5.4S, v30.4S // ..............................*......... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn1 v9.4S, v5.4S, v30.4S // .............................*.......... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn1 v30.4S, v31.4S, v3.4S // ............................*........... - // gap // ........................................ - // gap // ........................................ - trn2 v3.2D, v22.2D, v6.2D // .....................................*.. - trn1 v19.2D, v22.2D, v6.2D // .................................*...... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - str q3, [x1, #48] // .......................................* - // gap // ........................................ - // gap // ........................................ - trn1 v17.2D, v9.2D, v30.2D // ................................*....... - trn2 v11.2D, v9.2D, v30.2D // ..................................*..... - str q19, [x1, #16] // ...................................*.... - // gap // ........................................ - // gap // ........................................ - str q17, [x1], #64 // ......................................*. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- str q11, [x1, #-32] // ....................................*... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ + // Instructions: 33 + // Expected cycles: 23 + // Expected IPC: 1.43 + // + // Wall time: 0.38s + // User time: 0.38s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + mla v13.4S, v1.4S, v29.4S // *................................ + trn2 v18.2D, v20.2D, v25.2D // .*............................... + // gap // ................................. + // gap // ................................. + trn1 v23.2D, v20.2D, v25.2D // ...*............................. + trn1 v30.2D, v16.2D, v6.2D // ..*.............................. + // gap // ................................. + // gap // ................................. + sqrdmulh v8.4S, v18.4S, v8.4S // ....*............................ + mul v4.4S, v18.4S, v12.4S // .....*........................... + // gap // ................................. + // gap // ................................. + add v12.4S, v30.4S, v13.4S // .......*......................... + sub v18.4S, v30.4S, v13.4S // ......*.......................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v27.4S, v12.4S, v28.4S // .............*................... + mul v21.4S, v12.4S, v31.4S // ..........*...................... + // gap // ................................. + // gap // ................................. + mla v4.4S, v8.4S, v29.4S // ............*.................... + sqrdmulh v10.4S, v18.4S, v11.4S // ........*........................ + // gap // ................................. + // gap // ................................. 
+ mul v30.4S, v18.4S, v24.4S // .........*....................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + mla v21.4S, v27.4S, v29.4S // .................*............... + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v11.4S, v23.4S, v4.4S // ..............*.................. + add v3.4S, v23.4S, v4.4S // ...............*................. + // gap // ................................. + // gap // ................................. + mla v30.4S, v10.4S, v29.4S // ...........*..................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v8.4S, v3.4S, v21.4S // ....................*............ + // gap // ................................. + // gap // ................................. + add v6.4S, v3.4S, v21.4S // ...................*............. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v14.4S, v11.4S, v30.4S // ................*................ + sub v15.4S, v11.4S, v30.4S // ..................*.............. + // gap // ................................. + // gap // ................................. + trn1 v1.4S, v6.4S, v8.4S // .....................*........... + trn2 v30.4S, v6.4S, v8.4S // ......................*.......... + // gap // ................................. + // gap // ................................. + trn1 v11.4S, v14.4S, v15.4S // .......................*......... + trn2 v9.4S, v14.4S, v15.4S // ........................*........ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + trn1 v8.2D, v1.2D, v11.2D // .........................*....... + // gap // ................................. + // gap // ................................. + // gap // ................................. + trn2 v22.2D, v1.2D, v11.2D // ..........................*...... + // gap // ................................. + // gap // ................................. + // gap // ................................. + str q8, [x1], #64 // ............................*.... + trn2 v15.2D, v30.2D, v9.2D // ...........................*..... + // gap // ................................. + // gap // ................................. + trn1 v31.2D, v30.2D, v9.2D // .............................*... + str q22, [x1, #-32] // ..............................*.. + // gap // ................................. + // gap // ................................. + str q15, [x1, #-16] // ...............................*. + // gap // ................................. + // gap // ................................. + // gap // ................................. + str q31, [x1, #-48] // ................................* + // gap // ................................. + // gap // ................................. + // gap // ................................. - // original source code - // trn2 v12.2D, v16.2D, v28.2D // ..*..................................... - // trn2 v20.2D, v25.2D, v11.2D // ...*.................................... - // trn1 v2.2D, v16.2D, v28.2D // .........*.............................. - // sqrdmulh v7.4S, v12.4S, v4.4S // .....*.................................. - // mul v5.4S, v12.4S, v1.4S // ......*................................. - // sqrdmulh v8.4S, v20.4S, v4.4S // ...........*............................ - // mul v3.4S, v20.4S, v1.4S // ........*............................... - // ldr q12, [x4, #-16] // *....................................... 
- // ldr q6, [x4, #-32] // .......*................................ - // mla v5.4S, v7.4S, v29.S[0] // ..........*............................. - // mla v3.4S, v8.4S, v29.S[0] // ...............*........................ - // trn1 v16.2D, v25.2D, v11.2D // ..............*......................... - // ldr q14, [x4, #-64] // .*...................................... - // ldr q31, [x4, #-48] // ....*................................... - // add v28.4S, v2.4S, v5.4S // .............*.......................... - // sub v20.4S, v2.4S, v5.4S // ............*........................... - // add v2.4S, v16.4S, v3.4S // ......................*................. - // sqrdmulh v19.4S, v20.4S, v12.4S // ................*....................... - // mul v6.4S, v20.4S, v6.4S // .................*...................... - // mul v5.4S, v28.4S, v14.4S // ..................*..................... - // sqrdmulh v27.4S, v28.4S, v31.4S // ...................*.................... - // mla v6.4S, v19.4S, v29.S[0] // ....................*................... - // sub v16.4S, v16.4S, v3.4S // .....................*.................. - // mla v5.4S, v27.4S, v29.S[0] // .......................*................ - // add v17.4S, v16.4S, v6.4S // .........................*.............. - // sub v7.4S, v16.4S, v6.4S // ........................*............... - // sub v8.4S, v2.4S, v5.4S // ...........................*............ - // add v6.4S, v2.4S, v5.4S // ..........................*............. - // trn1 v2.4S, v17.4S, v7.4S // ...............................*........ - // trn1 v26.4S, v6.4S, v8.4S // ..............................*......... - // trn2 v15.4S, v6.4S, v8.4S // .............................*.......... - // trn2 v12.4S, v17.4S, v7.4S // ............................*........... - // trn1 v13.2D, v26.2D, v2.2D // ...................................*.... - // trn1 v0.2D, v15.2D, v12.2D // .................................*...... 
- // trn2 v9.2D, v26.2D, v2.2D // ....................................*... - // str q0, [x1, #16] // .....................................*.. - // str q9, [x1, #32] // .......................................* - // trn2 v21.2D, v15.2D, v12.2D // ................................*....... - // str q13, [x1], #64 // ......................................*. - // str q21, [x1, #-16] // ..................................*..... + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // mla v13.4S, v1.4S, v29.4S // *................................ + // trn2 v5.2D, v20.2D, v25.2D // .*............................... + // trn1 v7.2D, v16.2D, v6.2D // ...*............................. + // trn1 v4.2D, v20.2D, v25.2D // ..*.............................. + // sqrdmulh v8.4S, v5.4S, v8.4S // ....*............................ + // mul v0.4S, v5.4S, v12.4S // .....*........................... + // sub v20.4S, v7.4S, v13.4S // .......*......................... + // add v5.4S, v7.4S, v13.4S // ......*.......................... + // sqrdmulh v7.4S, v20.4S, v11.4S // ...........*..................... + // mul v10.4S, v20.4S, v24.4S // ............*.................... + // mul v3.4S, v5.4S, v31.4S // .........*....................... + // mla v10.4S, v7.4S, v29.4S // ................*................ + // mla v0.4S, v8.4S, v29.4S // ..........*...................... + // sqrdmulh v25.4S, v5.4S, v28.4S // ........*........................ + // sub v18.4S, v4.4S, v0.4S // ..............*.................. + // add v5.4S, v4.4S, v0.4S // ...............*................. + // add v28.4S, v18.4S, v10.4S // ...................*............. + // mla v3.4S, v25.4S, v29.4S // .............*................... + // sub v25.4S, v18.4S, v10.4S // ....................*............ + // add v18.4S, v5.4S, v3.4S // ..................*.............. + // sub v6.4S, v5.4S, v3.4S // .................*............... 
+ // trn1 v20.4S, v18.4S, v6.4S // .....................*........... + // trn2 v18.4S, v18.4S, v6.4S // ......................*.......... + // trn1 v7.4S, v28.4S, v25.4S // .......................*......... + // trn2 v5.4S, v28.4S, v25.4S // ........................*........ + // trn1 v1.2D, v20.2D, v7.2D // .........................*....... + // trn2 v7.2D, v20.2D, v7.2D // ..........................*...... + // trn2 v25.2D, v18.2D, v5.2D // ............................*.... + // str q1, [x1], #64 // ...........................*..... + // trn1 v5.2D, v18.2D, v5.2D // .............................*... + // str q7, [x1, #-32] // ..............................*.. + // str q25, [x1, #-16] // ...............................*. + // str q5, [x1, #-48] // ................................* pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a55.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a55.s new file mode 100644 index 00000000..4cc1e5f7 --- /dev/null +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a55.s @@ -0,0 +1,1394 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmla d,a,b + mla \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlaq d,a,b,i + mla \d\().4s, \a\().4s, \b\().s[\i] + .endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] + .endm + + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 // t2 = sqrdmulh(src, const[idx1]); clobbers t2 + vmulq \dst, \src, \const, \idx0 // dst = src * const[idx0] + vmla \dst, t2, modulus // dst += t2 * modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted // t2 = sqrdmulh(src, const_twisted); clobbers t2 + mul \dst\().4s, \src\().4s, \const\().4s // dst = src * const (lane-wise) + vmla \dst, t2, modulus // dst += t2 * modulus +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 // tmp = mulmodq(b, root[idx0], root[idx1]) + sub \b\().4s, \a\().4s, tmp.4s // b = a - tmp + add \a\().4s, \a\().4s, tmp.4s // a = a + tmp +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted // tmp = mulmod(b, root, root_twisted) + sub \b\().4s, \a\().4s, tmp.4s // b = a - tmp + add \a\().4s, \a\().4s, tmp.4s // a = a + tmp +.endm + +.macro load_roots_1234 + ldr qform_root0, [r_ptr0], #(8*16) // post-increments r_ptr0 by 8*16 bytes; the loads below index back into that window + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] +.endm + +.macro
load_next_roots_56 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #16 // load one q register of roots, then advance the pointer by 16 bytes +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #8 // NOTE(review): a q register is loaded but the pointer advances by only 8 bytes - confirm the overlap between consecutive loads is intended +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr qform_\root0, [ \r_ptr1], #(6*16) // post-increments r_ptr1 by 6*16 bytes; the loads below index back into that window + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s // first pass: interleave 32-bit lanes of \data0/\data1 and \data2/\data3 into scratch t0-t3 + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d // second pass: recombine 64-bit halves of t0-t3 back into \data0..\data3 + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) // reserve 6*16 bytes of stack for x19-x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store verbatim; redundant + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the area reserved by save_gprs +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) // reserve 4*16 bytes of stack for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the area reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub
sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_dilithium_1234_5678_twiddles.s" +.text + + .global ntt_dilithium_1234_5678_opt_a55 + .global _ntt_dilithium_1234_5678_opt_a55 // must name the underscore-prefixed label defined below, otherwise that entry point is not exported + +.p2align 4 +modulus_addr: .quad -8380417 +ntt_dilithium_1234_5678_opt_a55: +_ntt_dilithium_1234_5678_opt_a55: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + + qform_root0 .req q0 + qform_root1 .req q1 +
qform_root2 .req q2 + qform_root3 .req q3 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, in + mov count, #4 + + load_roots_1234 + + .p2align 2 + ldr q22, [x0, #0] // *............................................................................................................................................................................................... + ldr q11, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q10, [x0, #832] // .............*.................................................................................................................................................................................. + sqrdmulh v12.4S, v11.4S, v0.S[1] // ................*............................................................................................................................................................................... + mul v11.4S, v11.4S, v0.S[0] // .................*.............................................................................................................................................................................. + sqrdmulh v8.4S, v10.4S, v0.S[1] // .........................................*...................................................................................................................................................... + ldr q27, [x0, #64] // .*.............................................................................................................................................................................................. 
+ mla v11.4S, v12.4S, v29.4S // ..................*............................................................................................................................................................................. + mul v10.4S, v10.4S, v0.S[0] // ..........................................*..................................................................................................................................................... + ldr q12, [x0, #128] // ..*............................................................................................................................................................................................. + sub v24.4S, v22.4S, v11.4S // ...................*............................................................................................................................................................................ + add v22.4S, v22.4S, v11.4S // ....................*........................................................................................................................................................................... + mla v10.4S, v8.4S, v29.4S // ...........................................*.................................................................................................................................................... + ldr q11, [x0, #192] // ...*............................................................................................................................................................................................ + ldr q8, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q16, [x0, #768] // ............*................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v8.4S, v0.S[1] // .....................*.......................................................................................................................................................................... + mul v8.4S, v8.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sqrdmulh v31.4S, v16.4S, v0.S[1] // ....................................*........................................................................................................................................................... + ldr q15, [x0, #704] // ...........*.................................................................................................................................................................................... + mla v8.4S, v18.4S, v29.4S // .......................*........................................................................................................................................................................ + mul v25.4S, v16.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + ldr q16, [x0, #640] // ..........*..................................................................................................................................................................................... + sub v23.4S, v27.4S, v8.4S // ........................*....................................................................................................................................................................... 
+ add v17.4S, v27.4S, v8.4S // .........................*...................................................................................................................................................................... + sqrdmulh v8.4S, v16.4S, v0.S[1] // ..........................*..................................................................................................................................................................... + mul v16.4S, v16.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + mla v25.4S, v31.4S, v29.4S // ......................................*......................................................................................................................................................... + ldr q18, [x0, #256] // ....*........................................................................................................................................................................................... + mla v16.4S, v8.4S, v29.4S // ............................*................................................................................................................................................................... + sqrdmulh v31.4S, v15.4S, v0.S[1] // ...............................*................................................................................................................................................................ + sub v8.4S, v18.4S, v25.4S // .......................................*........................................................................................................................................................ 
+ add v27.4S, v18.4S, v25.4S // ........................................*....................................................................................................................................................... + sub v20.4S, v12.4S, v16.4S // .............................*.................................................................................................................................................................. + add v13.4S, v12.4S, v16.4S // ..............................*................................................................................................................................................................. + sqrdmulh v12.4S, v8.4S, v1.S[1] // ............................................................................*................................................................................................................... + mul v8.4S, v8.4S, v1.S[0] // .............................................................................*.................................................................................................................. + mul v18.4S, v15.4S, v0.S[0] // ................................*............................................................................................................................................................... + ldr q26, [x0, #896] // ..............*................................................................................................................................................................................. + ldr q15, [x0, #320] // .....*.......................................................................................................................................................................................... 
+ mul v30.4S, v26.4S, v0.S[0] // ...............................................*................................................................................................................................................ + sqrdmulh v19.4S, v26.4S, v0.S[1] // ..............................................*................................................................................................................................................. + sub v28.4S, v15.4S, v10.4S // ............................................*................................................................................................................................................... + ldr q14, [x0, #384] // ......*......................................................................................................................................................................................... + mla v30.4S, v19.4S, v29.4S // ................................................*............................................................................................................................................... + mla v8.4S, v12.4S, v29.4S // ..............................................................................*................................................................................................................. + ldr q26, [x0, #960] // ...............*................................................................................................................................................................................ + sub v16.4S, v14.4S, v30.4S // .................................................*.............................................................................................................................................. 
+ add v19.4S, v24.4S, v8.4S // ................................................................................*............................................................................................................... + mul v25.4S, v26.4S, v0.S[0] // ....................................................*........................................................................................................................................... + sqrdmulh v26.4S, v26.4S, v0.S[1] // ...................................................*............................................................................................................................................ + add v14.4S, v14.4S, v30.4S // ..................................................*............................................................................................................................................. + ldr q21, [x0, #448] // .......*........................................................................................................................................................................................ + mla v25.4S, v26.4S, v29.4S // .....................................................*.......................................................................................................................................... + mul v30.4S, v14.4S, v0.S[2] // ...................................................................*............................................................................................................................ + sqrdmulh v26.4S, v14.4S, v0.S[3] // ..................................................................*............................................................................................................................. 
+ add v12.4S, v15.4S, v10.4S // .............................................*.................................................................................................................................................. + add v9.4S, v21.4S, v25.4S // .......................................................*........................................................................................................................................ + mla v30.4S, v26.4S, v29.4S // ....................................................................*........................................................................................................................... + mul v14.4S, v28.4S, v1.S[0] // ..................................................................................*............................................................................................................. + sqrdmulh v15.4S, v28.4S, v1.S[1] // .................................................................................*.............................................................................................................. + mul v10.4S, v27.4S, v0.S[2] // .........................................................*...................................................................................................................................... + sub v28.4S, v13.4S, v30.4S // .....................................................................*.......................................................................................................................... + sqrdmulh v26.4S, v27.4S, v0.S[3] // ........................................................*....................................................................................................................................... 
+ mla v14.4S, v15.4S, v29.4S // ...................................................................................*............................................................................................................ + sqrdmulh v15.4S, v28.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mul v27.4S, v28.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + mla v10.4S, v26.4S, v29.4S // ..........................................................*..................................................................................................................................... + mla v18.4S, v31.4S, v29.4S // .................................*.............................................................................................................................................................. + mul v31.4S, v9.4S, v0.S[2] // ........................................................................*....................................................................................................................... + mla v27.4S, v15.4S, v29.4S // ............................................................................................................*................................................................................... + sub v26.4S, v22.4S, v10.4S // ...........................................................*.................................................................................................................................... 
+ sqrdmulh v9.4S, v9.4S, v0.S[3] // .......................................................................*........................................................................................................................ + sub v15.4S, v21.4S, v25.4S // ......................................................*......................................................................................................................................... + add v28.4S, v26.4S, v27.4S // ..............................................................................................................*................................................................................. + add v25.4S, v11.4S, v18.4S // ...................................*............................................................................................................................................................ + mla v31.4S, v9.4S, v29.4S // .........................................................................*...................................................................................................................... + add v9.4S, v22.4S, v10.4S // ............................................................*................................................................................................................................... + mul v10.4S, v16.4S, v1.S[0] // .......................................................................................*........................................................................................................ + sqrdmulh v22.4S, v12.4S, v0.S[3] // .............................................................*.................................................................................................................................. 
+ sub v21.4S, v25.4S, v31.4S // ..........................................................................*..................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..............................................................*................................................................................................................................. + add v25.4S, v25.4S, v31.4S // ...........................................................................*.................................................................................................................... + sqrdmulh v31.4S, v21.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mul v21.4S, v21.4S, v2.S[0] // ................................................................................................................*............................................................................... + mla v12.4S, v22.4S, v29.4S // ...............................................................*................................................................................................................................ + mul v22.4S, v25.4S, v1.S[2] // ......................................................................................................*......................................................................................... + sqrdmulh v25.4S, v25.4S, v1.S[3] // .....................................................................................................*.......................................................................................... 
+ mla v21.4S, v31.4S, v29.4S // .................................................................................................................*.............................................................................. + sub v31.4S, v17.4S, v12.4S // ................................................................*............................................................................................................................... + sub v27.4S, v26.4S, v27.4S // .............................................................................................................*.................................................................................. + add v26.4S, v17.4S, v12.4S // .................................................................*.............................................................................................................................. + add v17.4S, v31.4S, v21.4S // ...................................................................................................................*............................................................................ + mla v22.4S, v25.4S, v29.4S // .......................................................................................................*........................................................................................ + sub v25.4S, v11.4S, v18.4S // ..................................*............................................................................................................................................................. + sqrdmulh v18.4S, v17.4S, v4.S[3] // ..................................................................................................................................................*............................................. 
+ sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 96 + // Expected IPC: 2.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + add v11.4S, v13.4S, v30.4S // ................*............................................................................................................................................................................... + mul v13.4S, v17.4S, v4.S[2] // *............................................................................................................................................................................................... + add v17.4S, v26.4S, v22.4S // ...*............................................................................................................................................................................................ + sqrdmulh v12.4S, v11.4S, v1.S[3] // .....................*.......................................................................................................................................................................... + mul v30.4S, v11.4S, v1.S[2] // ....................*........................................................................................................................................................................... + mul v11.4S, v17.4S, v3.S[2] // ......*......................................................................................................................................................................................... 
+ sqrdmulh v17.4S, v17.4S, v3.S[3] // .......*........................................................................................................................................................................................ + mla v13.4S, v18.4S, v29.4S // ....*........................................................................................................................................................................................... + mla v30.4S, v12.4S, v29.4S // .........................*...................................................................................................................................................................... + sub v26.4S, v26.4S, v22.4S // ..*............................................................................................................................................................................................. + mla v11.4S, v17.4S, v29.4S // ............*................................................................................................................................................................................... + sub v18.4S, v28.4S, v13.4S // ........*....................................................................................................................................................................................... + sub v12.4S, v9.4S, v30.4S // ..............................................*................................................................................................................................................. + add v22.4S, v9.4S, v30.4S // .............................*.................................................................................................................................................................. 
+ mul v30.4S, v26.4S, v4.S[0] // ...................................................................*............................................................................................................................ + sqrdmulh v9.4S, v26.4S, v4.S[1] // .........................................*...................................................................................................................................................... + sub v17.4S, v22.4S, v11.4S // .................................*.............................................................................................................................................................. + sqrdmulh v26.4S, v16.4S, v1.S[1] // ..............*................................................................................................................................................................................. + add v16.4S, v22.4S, v11.4S // ................................*............................................................................................................................................................... + str q17, [x0, #64] // .....................................*.......................................................................................................................................................... + sub v22.4S, v24.4S, v8.4S // ..............................*................................................................................................................................................................. + mla v30.4S, v9.4S, v29.4S // .......................................................................*........................................................................................................................ 
+ sqrdmulh v11.4S, v15.4S, v1.S[1] // ..................*............................................................................................................................................................................. + mul v8.4S, v15.4S, v1.S[0] // ...............*................................................................................................................................................................................ + add v24.4S, v23.4S, v14.4S // .............................................*.................................................................................................................................................. + sub v23.4S, v23.4S, v14.4S // ......................................................*......................................................................................................................................... + str q16, [x0], #(16) // ...................................*............................................................................................................................................................ + mla v10.4S, v26.4S, v29.4S // ...................*............................................................................................................................................................................ + sub v16.4S, v31.4S, v21.4S // .*.............................................................................................................................................................................................. + add v31.4S, v12.4S, v30.4S // ..............................................................................*................................................................................................................. 
+ sub v12.4S, v12.4S, v30.4S // ...........................................................................*.................................................................................................................... + sub v17.4S, v20.4S, v10.4S // .......................*........................................................................................................................................................................ + add v10.4S, v20.4S, v10.4S // ........................*....................................................................................................................................................................... + str q18, [x0, #304] // ...........*.................................................................................................................................................................................... + add v18.4S, v28.4S, v13.4S // ..........*..................................................................................................................................................................................... + mla v8.4S, v11.4S, v29.4S // ......................*......................................................................................................................................................................... + sqrdmulh v11.4S, v16.4S, v5.S[1] // .........*...................................................................................................................................................................................... + mul v16.4S, v16.4S, v5.S[0] // .....*.......................................................................................................................................................................................... 
+ mla v16.4S, v11.4S, v29.4S // .............*.................................................................................................................................................................................. + sqrdmulh v11.4S, v10.4S, v2.S[3] // ........................................*....................................................................................................................................................... + mul v10.4S, v10.4S, v2.S[2] // ...........................*.................................................................................................................................................................... + sub v20.4S, v25.4S, v8.4S // ............................*................................................................................................................................................................... + add v8.4S, v25.4S, v8.4S // ..........................*..................................................................................................................................................................... + mul v25.4S, v17.4S, v3.S[0] // ......................................*......................................................................................................................................................... + sqrdmulh v13.4S, v17.4S, v3.S[1] // ..........................................*..................................................................................................................................................... + str q31, [x0, #112] // ..................................................................................*............................................................................................................. 
+ sub v31.4S, v27.4S, v16.4S // ...............................................................................*................................................................................................................ + add v27.4S, v27.4S, v16.4S // .................*.............................................................................................................................................................................. + mla v25.4S, v13.4S, v29.4S // ...............................................*................................................................................................................................................ + str q12, [x0, #176] // ......................................................................................*......................................................................................................... + mla v10.4S, v11.4S, v29.4S // ............................................*................................................................................................................................................... + str q18, [x0, #240] // .................................................................*.............................................................................................................................. + sub v11.4S, v22.4S, v25.4S // ...............................................................*................................................................................................................................ + add v22.4S, v22.4S, v25.4S // ...................................................*............................................................................................................................................ 
+ add v12.4S, v19.4S, v10.4S // .................................................*.............................................................................................................................................. + sub v10.4S, v19.4S, v10.4S // ..................................................*............................................................................................................................................. + mul v16.4S, v8.4S, v2.S[2] // ...........................................*.................................................................................................................................................... + sqrdmulh v8.4S, v8.4S, v2.S[3] // ....................................*........................................................................................................................................................... + str q31, [x0, #432] // ..........................................................................................*..................................................................................................... + mul v25.4S, v20.4S, v3.S[0] // ...............................*................................................................................................................................................................ + sqrdmulh v18.4S, v20.4S, v3.S[1] // ..................................*............................................................................................................................................................. + mla v16.4S, v8.4S, v29.4S // ................................................*............................................................................................................................................... 
+ str q27, [x0, #368] // .......................................*........................................................................................................................................................ + ldr q20, [x0, #512] // .................................................................................................*.............................................................................................. + add v27.4S, v24.4S, v16.4S // .....................................................*.......................................................................................................................................... + sub v8.4S, v24.4S, v16.4S // .......................................................*........................................................................................................................................ + mla v25.4S, v18.4S, v29.4S // ....................................................*........................................................................................................................................... + mul v18.4S, v27.4S, v5.S[2] // ...................................................................................*............................................................................................................ + sqrdmulh v24.4S, v27.4S, v5.S[3] // ........................................................*....................................................................................................................................... + mul v27.4S, v8.4S, v6.S[0] // .....................................................................*.......................................................................................................................... 
+ sqrdmulh v8.4S, v8.4S, v6.S[1] // ..........................................................*..................................................................................................................................... + sub v31.4S, v23.4S, v25.4S // .........................................................*...................................................................................................................................... + mla v18.4S, v24.4S, v29.4S // .......................................................................................*........................................................................................................ + add v23.4S, v23.4S, v25.4S // ...........................................................*.................................................................................................................................... + mla v27.4S, v8.4S, v29.4S // .........................................................................*...................................................................................................................... + mul v25.4S, v31.4S, v7.S[0] // .............................................................*.................................................................................................................................. + add v16.4S, v12.4S, v18.4S // ...........................................................................................*.................................................................................................... + sub v12.4S, v12.4S, v18.4S // .............................................................................................*.................................................................................................. 
+ sub v8.4S, v10.4S, v27.4S // .....................................................................................*.......................................................................................................... + add v10.4S, v10.4S, v27.4S // .............................................................................*.................................................................................................................. + sqrdmulh v27.4S, v31.4S, v7.S[1] // ............................................................*................................................................................................................................... + sqrdmulh v24.4S, v23.4S, v6.S[3] // ..............................................................*................................................................................................................................. + mul v18.4S, v23.4S, v6.S[2] // ................................................................*............................................................................................................................... + sqrdmulh v23.4S, v20.4S, v0.S[1] // ...................................................................................................*............................................................................................ + mul v31.4S, v20.4S, v0.S[0] // ....................................................................................................*........................................................................................... + mla v25.4S, v27.4S, v29.4S // ..................................................................*............................................................................................................................. 
+ str q16, [x0, #496] // ..............................................................................................*................................................................................................. + mla v18.4S, v24.4S, v29.4S // ....................................................................*........................................................................................................................... + str q12, [x0, #560] // ...............................................................................................*................................................................................................ + sub v12.4S, v11.4S, v25.4S // ......................................................................*......................................................................................................................... + add v11.4S, v11.4S, v25.4S // .........................................................................................*...................................................................................................... + sub v27.4S, v22.4S, v18.4S // ........................................................................*....................................................................................................................... + add v22.4S, v22.4S, v18.4S // .................................................................................*.............................................................................................................. + str q8, [x0, #688] // ........................................................................................*....................................................................................................... 
+ mla v31.4S, v23.4S, v29.4S // .......................................................................................................*........................................................................................ + str q10, [x0, #624] // ................................................................................*............................................................................................................... + str q12, [x0, #944] // ..........................................................................*..................................................................................................................... + ldr q10, [x0, #384] // ...........................................................................................................................................*.................................................... + str q27, [x0, #816] // ............................................................................*................................................................................................................... + ldr q12, [x0, #0] // ................................................................................................*............................................................................................... + ldr q8, [x0, #896] // ......................................................................................................................................*......................................................... + sub v24.4S, v12.4S, v31.4S // ..........................................................................................................*..................................................................................... + add v12.4S, v12.4S, v31.4S // ...........................................................................................................*.................................................................................... 
+ str q11, [x0, #880] // ............................................................................................*................................................................................................... + mul v11.4S, v8.4S, v0.S[0] // ........................................................................................................................................*....................................................... + sqrdmulh v8.4S, v8.4S, v0.S[1] // .........................................................................................................................................*...................................................... + str q22, [x0, #752] // ....................................................................................*........................................................................................................... + ldr q22, [x0, #832] // ..................................................................................................*............................................................................................. + mla v11.4S, v8.4S, v29.4S // ............................................................................................................................................*................................................... + ldr q8, [x0, #64] // ......................................................................................................*......................................................................................... + sqrdmulh v27.4S, v22.4S, v0.S[1] // .....................................................................................................*.......................................................................................... 
+ add v18.4S, v10.4S, v11.4S // ...................................................................................................................................................*............................................ + sub v16.4S, v10.4S, v11.4S // ...............................................................................................................................................*................................................ + ldr q11, [x0, #768] // ...............................................................................................................*................................................................................ + mul v22.4S, v22.4S, v0.S[0] // ........................................................................................................*....................................................................................... + mul v30.4S, v18.4S, v0.S[2] // ......................................................................................................................................................*......................................... + sqrdmulh v10.4S, v11.4S, v0.S[1] // ..................................................................................................................*............................................................................. + mul v11.4S, v11.4S, v0.S[0] // .....................................................................................................................*.......................................................................... + mla v22.4S, v27.4S, v29.4S // ............................................................................................................*................................................................................... 
+ ldr q27, [x0, #256] // ............................................................................................................................*................................................................... + mla v11.4S, v10.4S, v29.4S // ...........................................................................................................................*.................................................................... + sqrdmulh v10.4S, v18.4S, v0.S[3] // .......................................................................................................................................................*........................................ + ldr q18, [x0, #576] // ..............................................................................................................*................................................................................. + add v23.4S, v27.4S, v11.4S // ................................................................................................................................*............................................................... + sub v11.4S, v27.4S, v11.4S // ...............................................................................................................................*................................................................ + sqrdmulh v27.4S, v18.4S, v0.S[1] // ................................................................................................................*............................................................................... + mul v31.4S, v23.4S, v0.S[2] // .............................................................................................................................................................*.................................. 
+ sqrdmulh v23.4S, v23.4S, v0.S[3] // ...............................................................................................................................................................*................................ + mul v18.4S, v18.4S, v0.S[0] // .................................................................................................................*.............................................................................. + sqrdmulh v25.4S, v11.4S, v1.S[1] // ...................................................................................................................................*............................................................ + mla v30.4S, v10.4S, v29.4S // ..........................................................................................................................................................*..................................... + mla v31.4S, v23.4S, v29.4S // ...................................................................................................................................................................*............................ + mla v18.4S, v27.4S, v29.4S // ....................................................................................................................*........................................................................... + ldr q27, [x0, #320] // .......................................................................................................................................*........................................................ + sub v28.4S, v12.4S, v31.4S // .......................................................................................................................................................................*........................ 
+ add v9.4S, v12.4S, v31.4S // .............................................................................................................................................................................*.................. + add v12.4S, v8.4S, v18.4S // ........................................................................................................................*....................................................................... + sub v23.4S, v8.4S, v18.4S // .......................................................................................................................*........................................................................ + mul v10.4S, v16.4S, v1.S[0] // ..............................................................................................................................................................................*................. + sub v18.4S, v27.4S, v22.4S // ..........................................................................................................................................*..................................................... + add v22.4S, v27.4S, v22.4S // ........................................................................................................................................................*....................................... + mul v8.4S, v11.4S, v1.S[0] // ....................................................................................................................................*........................................................... + mul v14.4S, v18.4S, v1.S[0] // ...........................................................................................................................................................*.................................... 
+ sqrdmulh v11.4S, v18.4S, v1.S[1] // ............................................................................................................................................................*................................... + sqrdmulh v27.4S, v22.4S, v0.S[3] // ...............................................................................................................................................................................*................ + mla v8.4S, v25.4S, v29.4S // .............................................................................................................................................*.................................................. + mul v22.4S, v22.4S, v0.S[2] // .................................................................................................................................................................................*.............. + mla v14.4S, v11.4S, v29.4S // ................................................................................................................................................................*............................... + ldr q11, [x0, #128] // .........................................................................................................*...................................................................................... + add v19.4S, v24.4S, v8.4S // ................................................................................................................................................*............................................... + mla v22.4S, v27.4S, v29.4S // .....................................................................................................................................................................................*.......... 
+ ldr q27, [x0, #704] // ...................................................................................................................*............................................................................ + ldr q18, [x0, #640] // ......................................................................................................................*......................................................................... + sqrdmulh v25.4S, v27.4S, v0.S[1] // ..............................................................................................................................*................................................................. + ldr q31, [x0, #960] // ..............................................................................................................................................*................................................. + mul v27.4S, v27.4S, v0.S[0] // .....................................................................................................................................*.......................................................... + sqrdmulh v20.4S, v18.4S, v0.S[1] // .........................................................................................................................*...................................................................... + mul v13.4S, v31.4S, v0.S[0] // .................................................................................................................................................*.............................................. + sqrdmulh v31.4S, v31.4S, v0.S[1] // ..................................................................................................................................................*............................................. 
+ mul v18.4S, v18.4S, v0.S[0] // ..........................................................................................................................*..................................................................... + ldr q17, [x0, #448] // ....................................................................................................................................................*........................................... + mla v13.4S, v31.4S, v29.4S // .....................................................................................................................................................*.......................................... + sub v31.4S, v12.4S, v22.4S // .........................................................................................................................................................................................*...... + add v26.4S, v12.4S, v22.4S // ...........................................................................................................................................................................................*.... + mla v27.4S, v25.4S, v29.4S // ....................................................................................................................................................................*........................... + add v22.4S, v17.4S, v13.4S // .........................................................................................................................................................*...................................... + mla v18.4S, v20.4S, v29.4S // .............................................................................................................................*.................................................................. 
+ sub v15.4S, v17.4S, v13.4S // .........................................................................................................................................................................*...................... + mul v12.4S, v22.4S, v0.S[2] // .....................................................................................................................................................................*.......................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // ........................................................................................................................................................................*....................... + ldr q25, [x0, #192] // .............................................................................................................*.................................................................................. + add v13.4S, v11.4S, v18.4S // ..................................................................................................................................*............................................................. + mla v12.4S, v22.4S, v29.4S // ............................................................................................................................................................................*................... + add v22.4S, v25.4S, v27.4S // ...........................................................................................................................................................................*.................... + sub v20.4S, v11.4S, v18.4S // .................................................................................................................................*.............................................................. 
+ sub v11.4S, v13.4S, v30.4S // ..............................................................................................................................................................*................................. + sub v21.4S, v22.4S, v12.4S // ................................................................................................................................................................................*............... + add v12.4S, v22.4S, v12.4S // ..................................................................................................................................................................................*............. + sqrdmulh v18.4S, v11.4S, v2.S[1] // .................................................................................................................................................................*.............................. + sqrdmulh v17.4S, v21.4S, v2.S[1] // ...................................................................................................................................................................................*............ + mul v21.4S, v21.4S, v2.S[0] // ....................................................................................................................................................................................*........... + mul v11.4S, v11.4S, v2.S[0] // ..................................................................................................................................................................*............................. + mul v22.4S, v12.4S, v1.S[2] // ......................................................................................................................................................................................*......... 
+ sqrdmulh v12.4S, v12.4S, v1.S[3] // .......................................................................................................................................................................................*........ + mla v21.4S, v17.4S, v29.4S // ........................................................................................................................................................................................*....... + mla v11.4S, v18.4S, v29.4S // ......................................................................................................................................................................*......................... + sub v25.4S, v25.4S, v27.4S // ..............................................................................................................................................................................................*. + mla v22.4S, v12.4S, v29.4S // .............................................................................................................................................................................................*.. + add v17.4S, v31.4S, v21.4S // ............................................................................................................................................................................................*... + sub v27.4S, v28.4S, v11.4S // ..........................................................................................................................................................................................*..... + add v28.4S, v28.4S, v11.4S // ..........................................................................................................................................................................*..................... 
+ sqrdmulh v18.4S, v17.4S, v4.S[3] // ...............................................................................................................................................................................................* + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mul v17.4S, v17.4S, v4.S[2] // .*.............................................................................................................................................................................................. + // sub v31.4S, v31.4S, v21.4S // ............................*................................................................................................................................................................... + // sub v21.4S, v26.4S, v22.4S // .........*...................................................................................................................................................................................... + // add v22.4S, v26.4S, v22.4S // ..*............................................................................................................................................................................................. + // mla v17.4S, v18.4S, v29.4S // .......*........................................................................................................................................................................................ 
+ // mul v26.4S, v31.4S, v5.S[0] // .....................................*.......................................................................................................................................................... + // mul v12.4S, v22.4S, v3.S[2] // .....*.......................................................................................................................................................................................... + // sqrdmulh v18.4S, v22.4S, v3.S[3] // ......*......................................................................................................................................................................................... + // sub v11.4S, v28.4S, v17.4S // ...........*.................................................................................................................................................................................... + // sqrdmulh v22.4S, v31.4S, v5.S[1] // ....................................*........................................................................................................................................................... + // add v28.4S, v28.4S, v17.4S // ..................................*............................................................................................................................................................. + // str q11, [x0, #320] // .................................*.............................................................................................................................................................. + // mla v12.4S, v18.4S, v29.4S // ..........*..................................................................................................................................................................................... 
+ // mla v26.4S, v22.4S, v29.4S // ......................................*......................................................................................................................................................... + // sqrdmulh v18.4S, v16.4S, v1.S[1] // .................*.............................................................................................................................................................................. + // mul v16.4S, v15.4S, v1.S[0] // .......................*........................................................................................................................................................................ + // add v22.4S, v13.4S, v30.4S // *............................................................................................................................................................................................... + // add v17.4S, v27.4S, v26.4S // ...............................................*................................................................................................................................................ + // sqrdmulh v11.4S, v15.4S, v1.S[1] // ......................*......................................................................................................................................................................... + // mla v10.4S, v18.4S, v29.4S // ...........................*.................................................................................................................................................................... + // mul v31.4S, v22.4S, v1.S[2] // ....*........................................................................................................................................................................................... 
+ // sqrdmulh v22.4S, v22.4S, v1.S[3] // ...*............................................................................................................................................................................................ + // mla v16.4S, v11.4S, v29.4S // ...................................*............................................................................................................................................................ + // sub v13.4S, v20.4S, v10.4S // ...............................*................................................................................................................................................................ + // add v15.4S, v20.4S, v10.4S // ................................*............................................................................................................................................................... + // mla v31.4S, v22.4S, v29.4S // ........*....................................................................................................................................................................................... + // add v20.4S, v25.4S, v16.4S // ..........................................*..................................................................................................................................................... + // mul v18.4S, v15.4S, v2.S[2] // ........................................*....................................................................................................................................................... + // sub v10.4S, v25.4S, v16.4S // .........................................*...................................................................................................................................................... 
+ // add v22.4S, v9.4S, v31.4S // .............*.................................................................................................................................................................................. + // sub v30.4S, v24.4S, v8.4S // ....................*........................................................................................................................................................................... + // mul v16.4S, v10.4S, v3.S[0] // ...........................................................*.................................................................................................................................... + // add v11.4S, v22.4S, v12.4S // ..................*............................................................................................................................................................................. + // sub v22.4S, v22.4S, v12.4S // ................*............................................................................................................................................................................... + // sqrdmulh v8.4S, v10.4S, v3.S[1] // ............................................................*................................................................................................................................... + // str q11, [x0], #(16) // ..........................*..................................................................................................................................................................... + // sqrdmulh v12.4S, v20.4S, v2.S[3] // .........................................................*...................................................................................................................................... 
+ // str q22, [x0, #48] // ...................*............................................................................................................................................................................ + // mul v24.4S, v13.4S, v3.S[0] // ...........................................*.................................................................................................................................................... + // str q17, [x0, #368] // ..............................................................*................................................................................................................................. + // sqrdmulh v22.4S, v15.4S, v2.S[3] // .......................................*........................................................................................................................................................ + // sqrdmulh v15.4S, v21.4S, v4.S[1] // ...............*................................................................................................................................................................................ + // sqrdmulh v11.4S, v13.4S, v3.S[1] // ............................................*................................................................................................................................................... + // mul v10.4S, v20.4S, v2.S[2] // ........................................................*....................................................................................................................................... + // mla v18.4S, v22.4S, v29.4S // ..................................................*............................................................................................................................................. 
+ // add v22.4S, v23.4S, v14.4S // ........................*....................................................................................................................................................................... + // sub v17.4S, v9.4S, v31.4S // ............*................................................................................................................................................................................... + // mla v24.4S, v11.4S, v29.4S // ................................................*............................................................................................................................................... + // mla v10.4S, v12.4S, v29.4S // .............................................................*.................................................................................................................................. + // add v13.4S, v19.4S, v18.4S // ......................................................*......................................................................................................................................... + // sub v20.4S, v19.4S, v18.4S // .......................................................*........................................................................................................................................ + // add v25.4S, v30.4S, v24.4S // .....................................................*.......................................................................................................................................... + // mla v16.4S, v8.4S, v29.4S // ..................................................................*............................................................................................................................. 
+ // add v31.4S, v22.4S, v10.4S // ................................................................*............................................................................................................................... + // sub v11.4S, v23.4S, v14.4S // .........................*...................................................................................................................................................................... + // sub v8.4S, v22.4S, v10.4S // .................................................................*.............................................................................................................................. + // sqrdmulh v23.4S, v31.4S, v5.S[3] // ....................................................................*........................................................................................................................... + // sub v22.4S, v11.4S, v16.4S // .......................................................................*........................................................................................................................ + // sqrdmulh v12.4S, v8.4S, v6.S[1] // ......................................................................*......................................................................................................................... + // add v10.4S, v11.4S, v16.4S // .........................................................................*...................................................................................................................... + // sqrdmulh v11.4S, v22.4S, v7.S[1] // ................................................................................*............................................................................................................... 
+ // mul v18.4S, v22.4S, v7.S[0] // ...........................................................................*.................................................................................................................... + // sqrdmulh v22.4S, v10.4S, v6.S[3] // .................................................................................*.............................................................................................................. + // sub v16.4S, v30.4S, v24.4S // ....................................................*........................................................................................................................................... + // mul v24.4S, v10.4S, v6.S[2] // ..................................................................................*............................................................................................................. + // str q28, [x0, #240] // ...................................................*............................................................................................................................................ + // mla v18.4S, v11.4S, v29.4S // .....................................................................................*.......................................................................................................... + // mul v10.4S, v21.4S, v4.S[0] // ..............*................................................................................................................................................................................. + // mla v24.4S, v22.4S, v29.4S // .......................................................................................*........................................................................................................ 
+ // mul v8.4S, v8.4S, v6.S[0] // .....................................................................*.......................................................................................................................... + // sub v11.4S, v16.4S, v18.4S // .........................................................................................*...................................................................................................... + // mla v10.4S, v15.4S, v29.4S // .....................*.......................................................................................................................................................................... + // sub v22.4S, v25.4S, v24.4S // ...........................................................................................*.................................................................................................... + // mla v8.4S, v12.4S, v29.4S // ..........................................................................*..................................................................................................................... + // str q11, [x0, #944] // ................................................................................................*............................................................................................... + // sub v12.4S, v17.4S, v10.4S // ..............................*................................................................................................................................................................. + // str q22, [x0, #816] // ..................................................................................................*............................................................................................. 
+ // add v22.4S, v20.4S, v8.4S // ...............................................................................*................................................................................................................ + // add v10.4S, v17.4S, v10.4S // .............................*.................................................................................................................................................................. + // sub v11.4S, v27.4S, v26.4S // ..............................................*................................................................................................................................................. + // str q22, [x0, #624] // ...............................................................................................*................................................................................................ + // add v22.4S, v25.4S, v24.4S // ............................................................................................*................................................................................................... + // str q10, [x0, #112] // .............................................*.................................................................................................................................................. + // mul v10.4S, v31.4S, v5.S[2] // ...................................................................*............................................................................................................................ + // str q22, [x0, #752] // ..........................................................................................................*..................................................................................... 
+ // sub v22.4S, v20.4S, v8.4S // ..............................................................................*................................................................................................................. + // str q12, [x0, #176] // .................................................*.............................................................................................................................................. + // mla v10.4S, v23.4S, v29.4S // ........................................................................*....................................................................................................................... + // str q22, [x0, #688] // .............................................................................................*.................................................................................................. + // add v22.4S, v16.4S, v18.4S // ..........................................................................................*..................................................................................................... + // str q11, [x0, #432] // ..........................................................*..................................................................................................................................... + // add v11.4S, v13.4S, v10.4S // ............................................................................*................................................................................................................... + // str q22, [x0, #880] // .......................................................................................................*........................................................................................ 
+ // sub v22.4S, v13.4S, v10.4S // .............................................................................*.................................................................................................................. + // str q11, [x0, #496] // ......................................................................................*......................................................................................................... + // str q22, [x0, #560] // ........................................................................................*....................................................................................................... + // ldr q22, [x0, #0] // ...................................................................................................*............................................................................................ + // ldr q11, [x0, #512] // ...............................................................*................................................................................................................................ + // ldr q10, [x0, #832] // ...........................................................................................................*.................................................................................... + // sqrdmulh v12.4S, v11.4S, v0.S[1] // ...................................................................................*............................................................................................................ + // mul v11.4S, v11.4S, v0.S[0] // ....................................................................................*........................................................................................................... 
+ // sqrdmulh v8.4S, v10.4S, v0.S[1] // ..............................................................................................................*................................................................................. + // ldr q27, [x0, #64] // .............................................................................................................*.................................................................................. + // mla v11.4S, v12.4S, v29.4S // ..............................................................................................*................................................................................................. + // mul v10.4S, v10.4S, v0.S[0] // ..................................................................................................................*............................................................................. + // ldr q12, [x0, #128] // ....................................................................................................................................................*........................................... + // sub v24.4S, v22.4S, v11.4S // .....................................................................................................*.......................................................................................... + // add v22.4S, v22.4S, v11.4S // ......................................................................................................*......................................................................................... + // mla v10.4S, v8.4S, v29.4S // ......................................................................................................................*......................................................................... 
+ // ldr q11, [x0, #192] // ..........................................................................................................................................................................*..................... + // ldr q8, [x0, #576] // ..........................................................................................................................*..................................................................... + // ldr q16, [x0, #768] // .................................................................................................................*.............................................................................. + // sqrdmulh v18.4S, v8.4S, v0.S[1] // .............................................................................................................................*.................................................................. + // mul v8.4S, v8.4S, v0.S[0] // ................................................................................................................................*............................................................... + // sqrdmulh v31.4S, v16.4S, v0.S[1] // ....................................................................................................................*........................................................................... + // ldr q15, [x0, #704] // .......................................................................................................................................................*........................................ + // mla v8.4S, v18.4S, v29.4S // ....................................................................................................................................*........................................................... 
+ // mul v25.4S, v16.4S, v0.S[0] // .....................................................................................................................*.......................................................................... + // ldr q16, [x0, #640] // ........................................................................................................................................................*....................................... + // sub v23.4S, v27.4S, v8.4S // .........................................................................................................................................*...................................................... + // add v17.4S, v27.4S, v8.4S // ........................................................................................................................................*....................................................... + // sqrdmulh v8.4S, v16.4S, v0.S[1] // ............................................................................................................................................................*................................... + // mul v16.4S, v16.4S, v0.S[0] // ...............................................................................................................................................................*................................ + // mla v25.4S, v31.4S, v29.4S // ........................................................................................................................*....................................................................... + // ldr q18, [x0, #256] // .......................................................................................................................*........................................................................ 
+ // mla v16.4S, v8.4S, v29.4S // ......................................................................................................................................................................*......................... + // sqrdmulh v31.4S, v15.4S, v0.S[1] // .........................................................................................................................................................*...................................... + // sub v8.4S, v18.4S, v25.4S // ............................................................................................................................*................................................................... + // add v27.4S, v18.4S, v25.4S // ...........................................................................................................................*.................................................................... + // sub v20.4S, v12.4S, v16.4S // ..............................................................................................................................................................................*................. + // add v13.4S, v12.4S, v16.4S // ...........................................................................................................................................................................*.................... + // sqrdmulh v12.4S, v8.4S, v1.S[1] // .................................................................................................................................*.............................................................. + // mul v8.4S, v8.4S, v1.S[0] // .............................................................................................................................................*.................................................. 
+ // mul v18.4S, v15.4S, v0.S[0] // ...........................................................................................................................................................*.................................... + // ldr q26, [x0, #896] // ....................................................................................................*........................................................................................... + // ldr q15, [x0, #320] // .....................................................................................................................................*.......................................................... + // mul v30.4S, v26.4S, v0.S[0] // ........................................................................................................*....................................................................................... + // sqrdmulh v19.4S, v26.4S, v0.S[1] // .........................................................................................................*...................................................................................... + // sub v28.4S, v15.4S, v10.4S // ...........................................................................................................................................*.................................................... + // ldr q14, [x0, #384] // .................................................................................................*.............................................................................................. + // mla v30.4S, v19.4S, v29.4S // ............................................................................................................*................................................................................... 
+ // mla v8.4S, v12.4S, v29.4S // .................................................................................................................................................*.............................................. + // ldr q26, [x0, #960] // ..........................................................................................................................................................*..................................... + // sub v16.4S, v14.4S, v30.4S // ................................................................................................................*............................................................................... + // add v19.4S, v24.4S, v8.4S // .....................................................................................................................................................*.......................................... + // mul v25.4S, v26.4S, v0.S[0] // .............................................................................................................................................................*.................................. + // sqrdmulh v26.4S, v26.4S, v0.S[1] // ..............................................................................................................................................................*................................. + // add v14.4S, v14.4S, v30.4S // ...............................................................................................................*................................................................................ + // ldr q21, [x0, #448] // ................................................................................................................................................................*............................... 
+ // mla v25.4S, v26.4S, v29.4S // .................................................................................................................................................................*.............................. + // mul v30.4S, v14.4S, v0.S[2] // ...................................................................................................................*............................................................................ + // sqrdmulh v26.4S, v14.4S, v0.S[3] // .........................................................................................................................*...................................................................... + // add v12.4S, v15.4S, v10.4S // ............................................................................................................................................*................................................... + // add v9.4S, v21.4S, v25.4S // .....................................................................................................................................................................*.......................... + // mla v30.4S, v26.4S, v29.4S // ..................................................................................................................................*............................................................. + // mul v14.4S, v28.4S, v1.S[0] // ..............................................................................................................................................*................................................. + // sqrdmulh v15.4S, v28.4S, v1.S[1] // ...............................................................................................................................................*................................................ 
+ // mul v10.4S, v27.4S, v0.S[2] // ..............................................................................................................................*................................................................. + // sub v28.4S, v13.4S, v30.4S // ...............................................................................................................................................................................*................ + // sqrdmulh v26.4S, v27.4S, v0.S[3] // ...............................................................................................................................*................................................................ + // mla v14.4S, v15.4S, v29.4S // ...................................................................................................................................................*............................................ + // sqrdmulh v15.4S, v28.4S, v2.S[1] // ..................................................................................................................................................................................*............. + // mul v27.4S, v28.4S, v2.S[0] // .....................................................................................................................................................................................*.......... + // mla v10.4S, v26.4S, v29.4S // ...................................................................................................................................*............................................................ + // mla v18.4S, v31.4S, v29.4S // ....................................................................................................................................................................*........................... 
+ // mul v31.4S, v9.4S, v0.S[2] // ........................................................................................................................................................................*....................... + // mla v27.4S, v15.4S, v29.4S // .........................................................................................................................................................................................*...... + // sub v26.4S, v22.4S, v10.4S // ......................................................................................................................................*......................................................... + // sqrdmulh v9.4S, v9.4S, v0.S[3] // .........................................................................................................................................................................*...................... + // sub v15.4S, v21.4S, v25.4S // .......................................................................................................................................................................*........................ + // add v28.4S, v26.4S, v27.4S // ..............................................................................................................................................................................................*. + // add v25.4S, v11.4S, v18.4S // .............................................................................................................................................................................*.................. + // mla v31.4S, v9.4S, v29.4S // ............................................................................................................................................................................*................... 
+ // add v9.4S, v22.4S, v10.4S // .......................................................................................................................................*........................................................ + // mul v10.4S, v16.4S, v1.S[0] // ..........................................................................................................................................*..................................................... + // sqrdmulh v22.4S, v12.4S, v0.S[3] // ................................................................................................................................................*............................................... + // sub v21.4S, v25.4S, v31.4S // ................................................................................................................................................................................*............... + // mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................................................................*............................................. + // add v25.4S, v25.4S, v31.4S // .................................................................................................................................................................................*.............. + // sqrdmulh v31.4S, v21.4S, v2.S[1] // ...................................................................................................................................................................................*............ + // mul v21.4S, v21.4S, v2.S[0] // ....................................................................................................................................................................................*........... 
+ // mla v12.4S, v22.4S, v29.4S // ......................................................................................................................................................*......................................... + // mul v22.4S, v25.4S, v1.S[2] // ......................................................................................................................................................................................*......... + // sqrdmulh v25.4S, v25.4S, v1.S[3] // .......................................................................................................................................................................................*........ + // mla v21.4S, v31.4S, v29.4S // ........................................................................................................................................................................................*....... + // sub v31.4S, v17.4S, v12.4S // ..................................................................................................................................................................*............................. + // sub v27.4S, v26.4S, v27.4S // .............................................................................................................................................................................................*.. + // add v26.4S, v17.4S, v12.4S // ...................................................................................................................................................................*............................ + // add v17.4S, v31.4S, v21.4S // ............................................................................................................................................................................................*... 
+ // mla v22.4S, v25.4S, v29.4S // ...........................................................................................................................................................................................*.... + // sub v25.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................*..... + // sqrdmulh v18.4S, v17.4S, v4.S[3] // ...............................................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer1234_start + mul v17.4S, v17.4S, v4.S[2] // ...................................................................................................................................................*............................................ + sub v31.4S, v31.4S, v21.4S // ..................................................................................................................*............................................................................. + sub v21.4S, v26.4S, v22.4S // ........................................................................................................*....................................................................................... + add v22.4S, v26.4S, v22.4S // .........................................................................................................*...................................................................................... + mla v17.4S, v18.4S, v29.4S // ....................................................................................................................................................*........................................... 
+ mul v26.4S, v31.4S, v5.S[0] // ........................................................................................................................................................*....................................... + mul v12.4S, v22.4S, v3.S[2] // .........................................................................................................................................*...................................................... + sqrdmulh v18.4S, v22.4S, v3.S[3] // ........................................................................................................................................*....................................................... + sub v11.4S, v28.4S, v17.4S // .....................................................................................................................................................*.......................................... + sqrdmulh v22.4S, v31.4S, v5.S[1] // .......................................................................................................................................................*........................................ + add v28.4S, v28.4S, v17.4S // ......................................................................................................................................................*......................................... + str q11, [x0, #320] // .....................................................................................................................................................................................*.......... + mla v12.4S, v18.4S, v29.4S // ..........................................................................................................................................*..................................................... 
+ mla v26.4S, v22.4S, v29.4S // .........................................................................................................................................................*...................................... + sqrdmulh v18.4S, v16.4S, v1.S[1] // ......................................................................................*......................................................................................................... + mul v16.4S, v15.4S, v1.S[0] // ............................................................................................*................................................................................................... + add v22.4S, v13.4S, v30.4S // ......................................................................*......................................................................................................................... + add v17.4S, v27.4S, v26.4S // ...........................................................................................................................................................*.................................... + sqrdmulh v11.4S, v15.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + mla v10.4S, v18.4S, v29.4S // ........................................................................................*....................................................................................................... + mul v31.4S, v22.4S, v1.S[2] // .................................................................................................*.............................................................................................. 
+ sqrdmulh v22.4S, v22.4S, v1.S[3] // ................................................................................................*............................................................................................... + mla v16.4S, v11.4S, v29.4S // .............................................................................................*.................................................................................................. + sub v13.4S, v20.4S, v10.4S // .........................................................................................*...................................................................................................... + add v15.4S, v20.4S, v10.4S // ..........................................................................................*..................................................................................................... + mla v31.4S, v22.4S, v29.4S // ..................................................................................................*............................................................................................. + add v20.4S, v25.4S, v16.4S // ...............................................................................................*................................................................................................ + mul v18.4S, v15.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sub v10.4S, v25.4S, v16.4S // ..............................................................................................*................................................................................................. 
+ add v22.4S, v9.4S, v31.4S // ....................................................................................................*........................................................................................... + sub v30.4S, v24.4S, v8.4S // ...............................................................................*................................................................................................................ + mul v16.4S, v10.4S, v3.S[0] // ....................................................................................................................................*........................................................... + add v11.4S, v22.4S, v12.4S // ............................................................................................................................................*................................................... + sub v22.4S, v22.4S, v12.4S // ...........................................................................................................................................*.................................................... + sqrdmulh v8.4S, v10.4S, v3.S[1] // ...................................................................................................................................*............................................................ + str q11, [x0], #(16) // ................................................................................................................................................................................*............... + sqrdmulh v12.4S, v20.4S, v2.S[3] // .........................................................................................................................*...................................................................... 
+ str q22, [x0, #48] // .................................................................................................................................................................................*.............. + mul v24.4S, v13.4S, v3.S[0] // ...............................................................................................................................*................................................................ + str q17, [x0, #368] // ......................................................................................................................................................................................*......... + sqrdmulh v22.4S, v15.4S, v2.S[3] // ....................................................................................................................*........................................................................... + sqrdmulh v15.4S, v21.4S, v4.S[1] // .............................................................................................................................................*.................................................. + sqrdmulh v11.4S, v13.4S, v3.S[1] // ..............................................................................................................................*................................................................. + mul v10.4S, v20.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + mla v18.4S, v22.4S, v29.4S // ......................................................................................................................*......................................................................... 
+ add v22.4S, v23.4S, v14.4S // .....................................................................................*.......................................................................................................... + sub v17.4S, v9.4S, v31.4S // ...................................................................................................*............................................................................................ + mla v24.4S, v11.4S, v29.4S // ................................................................................................................................*............................................................... + mla v10.4S, v12.4S, v29.4S // ...........................................................................................................................*.................................................................... + add v13.4S, v19.4S, v18.4S // ........................................................................................................................*....................................................................... + sub v20.4S, v19.4S, v18.4S // .......................................................................................................................*........................................................................ + add v25.4S, v30.4S, v24.4S // ..................................................................................................................................*............................................................. + mla v16.4S, v8.4S, v29.4S // .....................................................................................................................................*.......................................................... 
+ add v31.4S, v22.4S, v10.4S // .............................................................................................................................*.................................................................. + sub v11.4S, v23.4S, v14.4S // ....................................................................................*........................................................................................................... + sub v8.4S, v22.4S, v10.4S // ............................................................................................................................*................................................................... + sqrdmulh v23.4S, v31.4S, v5.S[3] // ............................................................................................................................................................*................................... + sub v22.4S, v11.4S, v16.4S // ......................................................................................................................................*......................................................... + sqrdmulh v12.4S, v8.4S, v6.S[1] // .................................................................................................................................................................*.............................. + add v10.4S, v11.4S, v16.4S // .......................................................................................................................................*........................................................ + sqrdmulh v11.4S, v22.4S, v7.S[1] // ...........................................................................................................................................................................*.................... 
+ mul v18.4S, v22.4S, v7.S[0] // ............................................................................................................................................................................*................... + sqrdmulh v22.4S, v10.4S, v6.S[3] // ......................................................................................................................................................................*......................... + sub v16.4S, v30.4S, v24.4S // .................................................................................................................................*.............................................................. + mul v24.4S, v10.4S, v6.S[2] // .......................................................................................................................................................................*........................ + str q28, [x0, #240] // ....................................................................................................................................................................................*........... + mla v18.4S, v11.4S, v29.4S // .............................................................................................................................................................................*.................. + mul v10.4S, v21.4S, v4.S[0] // ..............................................................................................................................................*................................................. + mla v24.4S, v22.4S, v29.4S // ........................................................................................................................................................................*....................... 
+ mul v8.4S, v8.4S, v6.S[0] // ..................................................................................................................................................................*............................. + sub v11.4S, v16.4S, v18.4S // ..............................................................................................................................................................................*................. + mla v10.4S, v15.4S, v29.4S // ...............................................................................................................................................*................................................ + sub v22.4S, v25.4S, v24.4S // .........................................................................................................................................................................*...................... + mla v8.4S, v12.4S, v29.4S // ...................................................................................................................................................................*............................ + str q11, [x0, #944] // ...............................................................................................................................................................................................* + sub v12.4S, v17.4S, v10.4S // ................................................................................................................................................*............................................... + str q22, [x0, #816] // .............................................................................................................................................................................................*.. 
+ add v22.4S, v20.4S, v8.4S // .....................................................................................................................................................................*.......................... + add v10.4S, v17.4S, v10.4S // .................................................................................................................................................*.............................................. + sub v11.4S, v27.4S, v26.4S // ..........................................................................................................................................................*..................................... + str q22, [x0, #624] // ..........................................................................................................................................................................................*..... + add v22.4S, v25.4S, v24.4S // ..........................................................................................................................................................................*..................... + str q10, [x0, #112] // ..................................................................................................................................................................................*............. + mul v10.4S, v31.4S, v5.S[2] // .............................................................................................................................................................*.................................. + str q22, [x0, #752] // ............................................................................................................................................................................................*... 
+ sub v22.4S, v20.4S, v8.4S // ....................................................................................................................................................................*........................... + str q12, [x0, #176] // ...................................................................................................................................................................................*............ + mla v10.4S, v23.4S, v29.4S // ..............................................................................................................................................................*................................. + str q22, [x0, #688] // ...........................................................................................................................................................................................*.... + add v22.4S, v16.4S, v18.4S // ...............................................................................................................................................................................*................ + str q11, [x0, #432] // .......................................................................................................................................................................................*........ + add v11.4S, v13.4S, v10.4S // ................................................................................................................................................................*............................... + str q22, [x0, #880] // ..............................................................................................................................................................................................*. + sub v22.4S, v13.4S, v10.4S // ...............................................................................................................................................................*................................ 
+ str q11, [x0, #496] // ........................................................................................................................................................................................*....... + str q22, [x0, #560] // .........................................................................................................................................................................................*...... + + restore inp, STACK0 + mov count, #16 + + .unreq root4 + .unreq root5 + .unreq root6 + .unreq root7 + .unreq qform_root4 + .unreq qform_root5 + .unreq qform_root6 + .unreq qform_root7 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q15, [x4, #64] // *............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q15, [x4, #64] // *.............................. + + sub count, count, #1 +layer5678_start: + // Instructions: 61 + // Expected cycles: 77 + // Expected IPC: 0.79 + // + // Wall time: 7.39s + // User time: 7.39s + // + // -------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + ldr q30, [x3], #16 // ....*........................................................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + ldr q31, [x1, #0] // *............................................................ 
+ // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + ldr q8, [x1, #32] // ..*.......................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + ldr q28, [x1, #48] // ...*......................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v22.4S, v8.4S, v30.S[1] // ......*...................................................... + // gap // ............................................................. + mul v21.4S, v8.4S, v30.S[0] // .......*..................................................... + // gap // ............................................................. + sqrdmulh v7.4S, v28.4S, v30.S[1] // ...........*................................................. + // gap // ............................................................. + mul v1.4S, v28.4S, v30.S[0] // ............*................................................ + // gap // ............................................................. + ldr q12, [x1, #16] // .*........................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v21.4S, v22.4S, v29.4S // ........*.................................................... + // gap // ............................................................. 
+ mla v1.4S, v7.4S, v29.4S // .............*............................................... + // gap // ............................................................. + ldr q17, [x3], #8 // .....*....................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v16.4S, v31.4S, v21.4S // .........*................................................... + // gap // ............................................................. + add v10.4S, v12.4S, v1.4S // ...............*............................................. + // gap // ............................................................. + add v14.4S, v31.4S, v21.4S // ..........*.................................................. + // gap // ............................................................. + sub v28.4S, v12.4S, v1.4S // ..............*.............................................. + // gap // ............................................................. + sqrdmulh v22.4S, v10.4S, v30.S[3] // ................*............................................ + // gap // ............................................................. + mul v8.4S, v10.4S, v30.S[2] // .................*........................................... + // gap // ............................................................. + sqrdmulh v18.4S, v28.4S, v17.S[1] // .....................*....................................... + // gap // ............................................................. + mul v30.4S, v28.4S, v17.S[0] // ......................*...................................... + // gap // ............................................................. + ldr q10, [x4], #(6*16) // ..................................*.......................... + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + mla v8.4S, v22.4S, v29.4S // ..................*.......................................... + // gap // ............................................................. + mla v30.4S, v18.4S, v29.4S // .......................*..................................... + // gap // ............................................................. + ldr q11, [x4, #-64] // ....................................*........................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v21.4S, v14.4S, v8.4S // ...................*......................................... + // gap // ............................................................. + add v17.4S, v14.4S, v8.4S // ....................*........................................ + // gap // ............................................................. + sub v0.4S, v16.4S, v30.4S // ........................*.................................... + // gap // ............................................................. + add v6.4S, v16.4S, v30.4S // .........................*................................... + // gap // ............................................................. + trn1 v13.4S, v17.4S, v21.4S // ..........................*.................................. + // gap // ............................................................. + trn2 v5.4S, v17.4S, v21.4S // ...........................*................................. + // gap // ............................................................. + trn1 v25.4S, v6.4S, v0.4S // ............................*................................ + // gap // ............................................................. 
+ trn2 v19.4S, v6.4S, v0.4S // .............................*............................... + // gap // ............................................................. + ldr q14, [x4, #-80] // ...................................*......................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + trn2 v1.2D, v13.2D, v25.2D // ..............................*.............................. + // gap // ............................................................. + trn2 v3.2D, v5.2D, v19.2D // ...............................*............................. + // gap // ............................................................. + trn1 v28.2D, v5.2D, v19.2D // .................................*........................... + // gap // ............................................................. + sqrdmulh v30.4S, v3.4S, v14.4S // .............................................*............... + // gap // ............................................................. + mul v31.4S, v3.4S, v10.4S // ..............................................*.............. + // gap // ............................................................. + mul v2.4S, v1.4S, v10.4S // .........................................*................... + // gap // ............................................................. + ldr q23, [x4, #-48] // .....................................*....................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v31.4S, v30.4S, v29.4S // ...............................................*............. + // gap // ............................................................. 
+ sqrdmulh v30.4S, v1.4S, v14.4S // ........................................*.................... + // gap // ............................................................. + ldr q0, [x4, #-16] // .......................................*..................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + add v18.4S, v28.4S, v31.4S // .................................................*........... + // gap // ............................................................. + sub v4.4S, v28.4S, v31.4S // ................................................*............ + // gap // ............................................................. + mla v2.4S, v30.4S, v29.4S // ..........................................*.................. + // gap // ............................................................. + sqrdmulh v31.4S, v18.4S, v23.4S // ..................................................*.......... + // gap // ............................................................. + mul v28.4S, v18.4S, v11.4S // ...................................................*......... + // gap // ............................................................. + sqrdmulh v22.4S, v4.4S, v0.4S // .......................................................*..... + // gap // ............................................................. + mul v14.4S, v4.4S, v15.4S // ........................................................*.... + // gap // ............................................................. + trn1 v19.2D, v13.2D, v25.2D // ................................*............................ + // gap // ............................................................. + mla v28.4S, v31.4S, v29.4S // ....................................................*........ + // gap // ............................................................. 
+ add v24.4S, v19.4S, v2.4S // ............................................*................ + // gap // ............................................................. + mla v14.4S, v22.4S, v29.4S // .........................................................*... + // gap // ............................................................. + sub v13.4S, v19.4S, v2.4S // ...........................................*................. + // gap // ............................................................. + sub v23.4S, v24.4S, v28.4S // .....................................................*....... + // gap // ............................................................. + add v22.4S, v24.4S, v28.4S // ......................................................*...... + // gap // ............................................................. + sub v25.4S, v13.4S, v14.4S // ..........................................................*.. + // gap // ............................................................. + add v24.4S, v13.4S, v14.4S // ...........................................................*. + // gap // ............................................................. + ldr q15, [x4, #64] // ......................................e...................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ............................................................* + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + + // ------------------------ new position ------------------------> + // 0 25 50 + // |------------------------|------------------------|------------ + // ldr q8, [x1, #(16*0)] // ..'*........................................................... + // ldr q9, [x1, #(16*1)] // ..'.......*.................................................... + // ldr q10, [x1, #(16*2)] // ..'.*.......................................................... + // ldr q11, [x1, #(16*3)] // ..'..*......................................................... + // ldr q0, [x3], #16 // ..*............................................................ + // ldr q1, [x3], #8 // ..'..........*................................................. + // sqrdmulh v27.4s, v10.4s, v0.s[1] // ..'...*........................................................ + // mul v24.4s, v10.4s, v0.s[0] // ..'....*....................................................... + // mla v24.4s, v27.4s, v29.4s // ..'........*................................................... + // sub v10.4s, v8.4s, v24.4s // ..'...........*................................................ + // add v8.4s, v8.4s, v24.4s // ..'.............*.............................................. + // sqrdmulh v27.4s, v11.4s, v0.s[1] // ..'.....*...................................................... + // mul v24.4s, v11.4s, v0.s[0] // ..'......*..................................................... + // mla v24.4s, v27.4s, v29.4s // ..'.........*.................................................. + // sub v11.4s, v9.4s, v24.4s // ..'..............*............................................. 
+ // add v9.4s, v9.4s, v24.4s // ..'............*............................................... + // sqrdmulh v27.4s, v9.4s, v0.s[3] // ..'...............*............................................ + // mul v24.4s, v9.4s, v0.s[2] // ..'................*........................................... + // mla v24.4s, v27.4s, v29.4s // ..'....................*....................................... + // sub v9.4s, v8.4s, v24.4s // ..'.......................*.................................... + // add v8.4s, v8.4s, v24.4s // ..'........................*................................... + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ..'.................*.......................................... + // mul v24.4s, v11.4s, v1.s[0] // ..'..................*......................................... + // mla v24.4s, v27.4s, v29.4s // ..'.....................*...................................... + // sub v11.4s, v10.4s, v24.4s // ..'.........................*.................................. + // add v10.4s, v10.4s, v24.4s // ..'..........................*................................. + // trn1 v25.4s, v8.4s, v9.4s // ..'...........................*................................ + // trn2 v26.4s, v8.4s, v9.4s // ..'............................*............................... + // trn1 v27.4s, v10.4s, v11.4s // ..'.............................*.............................. + // trn2 v28.4s, v10.4s, v11.4s // ..'..............................*............................. + // trn2 v10.2d, v25.2d, v27.2d // ..'................................*........................... + // trn2 v11.2d, v26.2d, v28.2d // ..'.................................*.......................... + // trn1 v8.2d, v25.2d, v27.2d // ..'.................................................*.......... + // trn1 v9.2d, v26.2d, v28.2d // ..'..................................*......................... + // ldr q0, [ x4], #(6*16) // ..'...................*........................................ 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // ..'...............................*............................ + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..'......................*..................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..'......................................*..................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // e.'..........................................................~. + // ldr q6, [x4, #(-6*16 + 5*16)] // ..'.........................................*.................. + // sqrdmulh v27.4s, v10.4s, v4.4s // ..'........................................*................... + // mul v24.4s, v10.4s, v0.4s // ..'.....................................*...................... + // mla v24.4s, v27.4s, v29.4s // ..'............................................*............... + // sub v10.4s, v8.4s, v24.4s // ..'.....................................................*...... + // add v8.4s, v8.4s, v24.4s // ..'...................................................*........ + // sqrdmulh v27.4s, v11.4s, v4.4s // ..'...................................*........................ + // mul v24.4s, v11.4s, v0.4s // ..'....................................*....................... + // mla v24.4s, v27.4s, v29.4s // ..'.......................................*.................... + // sub v11.4s, v9.4s, v24.4s // ..'...........................................*................ + // add v9.4s, v9.4s, v24.4s // ..'..........................................*................. + // sqrdmulh v27.4s, v9.4s, v5.4s // ..'.............................................*.............. + // mul v24.4s, v9.4s, v1.4s // ..'..............................................*............. + // mla v24.4s, v27.4s, v29.4s // ..'..................................................*......... + // sub v9.4s, v8.4s, v24.4s // ..'......................................................*..... + // add v8.4s, v8.4s, v24.4s // ..'.......................................................*.... 
+ // sqrdmulh v27.4s, v11.4s, v6.4s // ..'...............................................*............ + // mul v24.4s, v11.4s, v2.4s // ..'................................................*........... + // mla v24.4s, v27.4s, v29.4s // ..'....................................................*....... + // sub v11.4s, v10.4s, v24.4s // ..'........................................................*... + // add v10.4s, v10.4s, v24.4s // ..'.........................................................*.. + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .~'...........................................................* + + sub count, count, #1 + cbnz count, layer5678_start + // Instructions: 60 + // Expected cycles: 76 + // Expected IPC: 0.79 + // + // Wall time: 7.69s + // User time: 7.69s + // + // -------------------- original position --------------------> + // 0 25 50 + // |------------------------|------------------------|--------- + ldr q22, [x3], #16 // *........................................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + ldr q11, [x1, #48] // ...*........................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + ldr q10, [x1, #32] // ..*......................................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + sqrdmulh v12.4S, v11.4S, v22.S[1] // ......*..................................................... 
+ // gap // ............................................................ + mul v11.4S, v11.4S, v22.S[0] // .......*.................................................... + // gap // ............................................................ + sqrdmulh v6.4S, v10.4S, v22.S[1] // ....*....................................................... + // gap // ............................................................ + ldr q8, [x1, #16] // ........*................................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + mla v11.4S, v12.4S, v29.4S // ..........*................................................. + // gap // ............................................................ + mul v10.4S, v10.4S, v22.S[0] // .....*...................................................... + // gap // ............................................................ + ldr q12, [x3], #8 // ...........*................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + add v27.4S, v8.4S, v11.4S // .............*.............................................. + // gap // ............................................................ + mla v10.4S, v6.4S, v29.4S // .........*.................................................. + // gap // ............................................................ + sub v11.4S, v8.4S, v11.4S // ...............*............................................ + // gap // ............................................................ + sqrdmulh v6.4S, v27.4S, v22.S[3] // ................*........................................... 
+ // gap // ............................................................ + mul v22.4S, v27.4S, v22.S[2] // .................*.......................................... + // gap // ............................................................ + sqrdmulh v8.4S, v11.4S, v12.S[1] // ..................*......................................... + // gap // ............................................................ + mul v11.4S, v11.4S, v12.S[0] // ...................*........................................ + // gap // ............................................................ + ldr q12, [x1, #0] // .*.......................................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + mla v22.4S, v6.4S, v29.4S // .....................*...................................... + // gap // ............................................................ + mla v11.4S, v8.4S, v29.4S // ......................*..................................... + // gap // ............................................................ + sub v6.4S, v12.4S, v10.4S // ............*............................................... + // gap // ............................................................ + add v10.4S, v12.4S, v10.4S // ..............*............................................. + // gap // ............................................................ + ldr q12, [x4], #(6*16) // ....................*....................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + sub v8.4S, v10.4S, v22.4S // ........................*................................... 
+ // gap // ............................................................ + add v22.4S, v10.4S, v22.4S // .........................*.................................. + // gap // ............................................................ + sub v10.4S, v6.4S, v11.4S // ..........................*................................. + // gap // ............................................................ + add v11.4S, v6.4S, v11.4S // ...........................*................................ + // gap // ............................................................ + trn1 v6.4S, v22.4S, v8.4S // ............................*............................... + // gap // ............................................................ + trn2 v22.4S, v22.4S, v8.4S // .............................*.............................. + // gap // ............................................................ + trn2 v8.4S, v11.4S, v10.4S // ...............................*............................ + // gap // ............................................................ + ldr q27, [x4, #-80] // ................................*........................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + trn2 v4.2D, v22.2D, v8.2D // ..................................*......................... + // gap // ............................................................ + trn1 v11.4S, v11.4S, v10.4S // ..............................*............................. + // gap // ............................................................ + sqrdmulh v10.4S, v4.4S, v27.4S // ....................................*....................... + // gap // ............................................................ + mul v4.4S, v4.4S, v12.4S // .....................................*...................... 
+ // gap // ............................................................ + trn2 v2.2D, v6.2D, v11.2D // .................................*.......................... + // gap // ............................................................ + trn1 v22.2D, v22.2D, v8.2D // ...................................*........................ + // gap // ............................................................ + mul v12.4S, v2.4S, v12.4S // ......................................*..................... + // gap // ............................................................ + mla v4.4S, v10.4S, v29.4S // ........................................*................... + // gap // ............................................................ + sqrdmulh v10.4S, v2.4S, v27.4S // .........................................*.................. + // gap // ............................................................ + ldr q8, [x4, #-64] // .......................*.................................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + sub v27.4S, v22.4S, v4.4S // ............................................*............... + // gap // ............................................................ + add v22.4S, v22.4S, v4.4S // ...........................................*................ + // gap // ............................................................ + mla v12.4S, v10.4S, v29.4S // .............................................*.............. + // gap // ............................................................ + mul v10.4S, v27.4S, v15.4S // .................................................*.......... + // gap // ............................................................ + ldr q4, [x4, #-16] // ..........................................*................. 
+ // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + ldr q2, [x4, #-48] // .......................................*.................... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + sqrdmulh v27.4S, v27.4S, v4.4S // ................................................*........... + // gap // ............................................................ + mul v8.4S, v22.4S, v8.4S // ...............................................*............ + // gap // ............................................................ + sqrdmulh v22.4S, v22.4S, v2.4S // ..............................................*............. + // gap // ............................................................ + trn1 v11.2D, v6.2D, v11.2D // ..................................................*......... + // gap // ............................................................ + mla v10.4S, v27.4S, v29.4S // .....................................................*...... + // gap // ............................................................ + sub v6.4S, v11.4S, v12.4S // ......................................................*..... + // gap // ............................................................ + mla v8.4S, v22.4S, v29.4S // ...................................................*........ + // gap // ............................................................ + add v22.4S, v11.4S, v12.4S // ....................................................*....... + // gap // ............................................................ + sub v11.4S, v6.4S, v10.4S // .........................................................*.. 
+ // gap // ............................................................ + add v10.4S, v6.4S, v10.4S // ..........................................................*. + // gap // ............................................................ + sub v9.4S, v22.4S, v8.4S // .......................................................*.... + // gap // ............................................................ + add v8.4S, v22.4S, v8.4S // ........................................................*... + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ...........................................................* + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + // gap // ............................................................ + + // ---------------------- new position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------- + // ldr q30, [x3], #16 // *........................................................... + // ldr q31, [x1, #0] // .................*.......................................... + // ldr q8, [x1, #32] // ..*......................................................... + // ldr q28, [x1, #48] // .*.......................................................... 
+ // sqrdmulh v22.4S, v8.4S, v30.S[1] // .....*...................................................... + // mul v21.4S, v8.4S, v30.S[0] // ........*................................................... + // sqrdmulh v7.4S, v28.4S, v30.S[1] // ...*........................................................ + // mul v1.4S, v28.4S, v30.S[0] // ....*....................................................... + // ldr q12, [x1, #16] // ......*..................................................... + // mla v21.4S, v22.4S, v29.4S // ...........*................................................ + // mla v1.4S, v7.4S, v29.4S // .......*.................................................... + // ldr q17, [x3], #8 // .........*.................................................. + // sub v16.4S, v31.4S, v21.4S // ....................*....................................... + // add v10.4S, v12.4S, v1.4S // ..........*................................................. + // add v14.4S, v31.4S, v21.4S // .....................*...................................... + // sub v28.4S, v12.4S, v1.4S // ............*............................................... + // sqrdmulh v22.4S, v10.4S, v30.S[3] // .............*.............................................. + // mul v8.4S, v10.4S, v30.S[2] // ..............*............................................. + // sqrdmulh v18.4S, v28.4S, v17.S[1] // ...............*............................................ + // mul v30.4S, v28.4S, v17.S[0] // ................*........................................... + // ldr q10, [x4], #(6*16) // ......................*..................................... + // mla v8.4S, v22.4S, v29.4S // ..................*......................................... + // mla v30.4S, v18.4S, v29.4S // ...................*........................................ + // ldr q11, [x4, #-64] // ........................................*................... 
+ // sub v21.4S, v14.4S, v8.4S // .......................*.................................... + // add v17.4S, v14.4S, v8.4S // ........................*................................... + // sub v0.4S, v16.4S, v30.4S // .........................*.................................. + // add v6.4S, v16.4S, v30.4S // ..........................*................................. + // trn1 v13.4S, v17.4S, v21.4S // ...........................*................................ + // trn2 v5.4S, v17.4S, v21.4S // ............................*............................... + // trn1 v25.4S, v6.4S, v0.4S // ................................*........................... + // trn2 v19.4S, v6.4S, v0.4S // .............................*.............................. + // ldr q14, [x4, #-80] // ..............................*............................. + // trn2 v1.2D, v13.2D, v25.2D // ...................................*........................ + // trn2 v3.2D, v5.2D, v19.2D // ...............................*............................ + // trn1 v28.2D, v5.2D, v19.2D // ....................................*....................... + // sqrdmulh v30.4S, v3.4S, v14.4S // .................................*.......................... + // mul v31.4S, v3.4S, v10.4S // ..................................*......................... + // mul v2.4S, v1.4S, v10.4S // .....................................*...................... + // ldr q23, [x4, #-48] // ..............................................*............. + // mla v31.4S, v30.4S, v29.4S // ......................................*..................... + // sqrdmulh v30.4S, v1.4S, v14.4S // .......................................*.................... + // ldr q0, [x4, #-16] // .............................................*.............. + // add v18.4S, v28.4S, v31.4S // ..........................................*................. + // sub v4.4S, v28.4S, v31.4S // .........................................*.................. 
+ // mla v2.4S, v30.4S, v29.4S // ...........................................*................ + // sqrdmulh v31.4S, v18.4S, v23.4S // .................................................*.......... + // mul v28.4S, v18.4S, v11.4S // ................................................*........... + // sqrdmulh v22.4S, v4.4S, v0.4S // ...............................................*............ + // mul v14.4S, v4.4S, v15.4S // ............................................*............... + // trn1 v19.2D, v13.2D, v25.2D // ..................................................*......... + // mla v28.4S, v31.4S, v29.4S // .....................................................*...... + // add v24.4S, v19.4S, v2.4S // ......................................................*..... + // mla v14.4S, v22.4S, v29.4S // ...................................................*........ + // sub v13.4S, v19.4S, v2.4S // ....................................................*....... + // sub v23.4S, v24.4S, v28.4S // .........................................................*.. + // add v22.4S, v24.4S, v28.4S // ..........................................................*. + // sub v25.4S, v13.4S, v14.4S // .......................................................*.... + // add v24.4S, v13.4S, v14.4S // ........................................................*... 
+ // st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ...........................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a72.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a72.s index 7d31d123..073f7f2d 100644 --- a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a72.s +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_a72.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] -.endm -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -68,15 +47,15 @@ .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -85,12 +64,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, 
\src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -98,46 +71,46 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, 
\data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -148,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -158,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -166,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -177,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -202,7 +175,7 @@ roots: .text .global ntt_dilithium_1234_5678_opt_a72 - .global _ntt_dilithium_1234_5678_opt_a72 + .global _ntt_dilithium_1234_5678 .p2align 4 modulus_addr: .quad -8380417 @@ -234,6 +207,23 @@ _ntt_dilithium_1234_5678_opt_a72: 
data14 .req v22 data15 .req v23 + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + qform_v0 .req q0 qform_v1 .req q1 qform_v2 .req q2 @@ -306,395 +296,598 @@ _ntt_dilithium_1234_5678_opt_a72: load_roots_1234 .p2align 2 - ldr_vo v10, x0, 512 - ldr_vo v31, x0, 0 - ldr_vo v16, x0, 64 - ldr_vo v17, x0, 128 - ldr_vo v8, x0, 256 - ldr_vo v25, x0, 192 - ldr_vo v27, x0, 320 - ldr_vo v12, x0, 384 - sqrdmulh v9.4S, v10.4S, v0.S[1] - ldr_vo v23, x0, 576 - ldr_vo v18, x0, 448 - ldr_vo v13, x0, 640 - ldr_vo v11, x0, 704 - mul v10.4S, v10.4S, v0.S[0] - ldr_vo v24, x0, 768 - ldr_vo v28, x0, 832 - sqrdmulh v15.4S, v23.4S, v0.S[1] - mla v10.4S, v9.4S, v29.4S - mul v9.4S, v23.4S, v0.S[0] - sub v22.4S, v31.4S, v10.4S - ldr_vo v20, x0, 896 - sqrdmulh v23.4S, v13.4S, v0.S[1] - add v26.4S, v31.4S, v10.4S - sqrdmulh v31.4S, v24.4S, v0.S[1] - sqrdmulh v14.4S, v20.4S, v0.S[1] - mul v24.4S, v24.4S, v0.S[0] - mul v10.4S, v20.4S, v0.S[0] - mla v24.4S, v31.4S, v29.4S - mla v10.4S, v14.4S, v29.4S - mul v13.4S, v13.4S, v0.S[0] - add v31.4S, v8.4S, v24.4S - mla v13.4S, v23.4S, v29.4S - sub v30.4S, v8.4S, v24.4S - add v8.4S, v12.4S, v10.4S - mul v20.4S, v31.4S, v0.S[2] - sub v19.4S, v12.4S, v10.4S - sqrdmulh v12.4S, v31.4S, v0.S[3] - mul v21.4S, v28.4S, v0.S[0] - ldr_vo v14, x0, 960 - add v23.4S, v17.4S, v13.4S - mla v9.4S, v15.4S, v29.4S - sub v13.4S, v17.4S, v13.4S - mla v20.4S, v12.4S, v29.4S - sqrdmulh v17.4S, v11.4S, v0.S[1] - add v12.4S, v16.4S, v9.4S - sqrdmulh v24.4S, v19.4S, v1.S[1] - sub v16.4S, v16.4S, v9.4S - sqrdmulh v10.4S, v30.4S, v1.S[1] - mul v11.4S, v11.4S, v0.S[0] - sqrdmulh v15.4S, v8.4S, v0.S[3] - mul v9.4S, v30.4S, v1.S[0] - mla v9.4S, v10.4S, v29.4S - mla 
v11.4S, v17.4S, v29.4S - mul v10.4S, v19.4S, v1.S[0] - add v19.4S, v22.4S, v9.4S - mul v31.4S, v8.4S, v0.S[2] - add v30.4S, v26.4S, v20.4S - sub v17.4S, v22.4S, v9.4S - mla v31.4S, v15.4S, v29.4S - sub v8.4S, v26.4S, v20.4S - add v9.4S, v25.4S, v11.4S - mla v10.4S, v24.4S, v29.4S - sub v25.4S, v25.4S, v11.4S - sqrdmulh v11.4S, v28.4S, v0.S[1] - add v24.4S, v23.4S, v31.4S - sub v23.4S, v23.4S, v31.4S - mul v28.4S, v14.4S, v0.S[0] - add v31.4S, v13.4S, v10.4S - sqrdmulh v20.4S, v14.4S, v0.S[1] - sub v10.4S, v13.4S, v10.4S - sqrdmulh v14.4S, v24.4S, v1.S[3] - mla v21.4S, v11.4S, v29.4S - mla v28.4S, v20.4S, v29.4S - sqrdmulh v13.4S, v31.4S, v2.S[3] - sub v11.4S, v27.4S, v21.4S - add v27.4S, v27.4S, v21.4S - mul v15.4S, v31.4S, v2.S[2] - mul v26.4S, v10.4S, v3.S[0] - sub v22.4S, v18.4S, v28.4S - add v18.4S, v18.4S, v28.4S - sqrdmulh v31.4S, v23.4S, v2.S[1] - sqrdmulh v21.4S, v18.4S, v0.S[3] - mul v20.4S, v27.4S, v0.S[2] - mul v28.4S, v18.4S, v0.S[2] - mla v28.4S, v21.4S, v29.4S - sqrdmulh v27.4S, v27.4S, v0.S[3] - mul v21.4S, v24.4S, v1.S[2] - add v18.4S, v9.4S, v28.4S - sub v28.4S, v9.4S, v28.4S - mla v21.4S, v14.4S, v29.4S - mul v9.4S, v18.4S, v1.S[2] - mla v20.4S, v27.4S, v29.4S - mul v27.4S, v11.4S, v1.S[0] - sqrdmulh v24.4S, v28.4S, v2.S[1] - add v14.4S, v12.4S, v20.4S - sub v20.4S, v12.4S, v20.4S + ldr q20, [x0, #960] // ...............*................................................................................................................................................................................ + ldr q18, [x0, #448] // .......*........................................................................................................................................................................................ + ldr q19, [x0, #832] // .............*.................................................................................................................................................................................. 
+ ldr q22, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q11, [x0, #640] // ..........*..................................................................................................................................................................................... + ldr q30, [x0, #0] // *............................................................................................................................................................................................... + ldr q13, [x0, #320] // .....*.......................................................................................................................................................................................... + ldr q27, [x0, #704] // ...........*.................................................................................................................................................................................... + sqrdmulh v17.4S, v20.4S, v0.S[1] // ...................................................*............................................................................................................................................ + ldr q12, [x0, #768] // ............*................................................................................................................................................................................... + ldr q14, [x0, #64] // .*.............................................................................................................................................................................................. + ldr q26, [x0, #512] // ........*....................................................................................................................................................................................... 
+ ldr q8, [x0, #896] // ..............*................................................................................................................................................................................. + mul v20.4S, v20.4S, v0.S[0] // ....................................................*........................................................................................................................................... + ldr q31, [x0, #128] // ..*............................................................................................................................................................................................. + ldr q25, [x0, #192] // ...*............................................................................................................................................................................................ + ldr q16, [x0, #384] // ......*......................................................................................................................................................................................... + ldr q10, [x0, #256] // ....*........................................................................................................................................................................................... + sqrdmulh v23.4S, v19.4S, v0.S[1] // .........................................*...................................................................................................................................................... + mla v20.4S, v17.4S, v29.4S // .....................................................*.......................................................................................................................................... + mul v19.4S, v19.4S, v0.S[0] // ..........................................*..................................................................................................................................................... 
+ mla v19.4S, v23.4S, v29.4S // ...........................................*.................................................................................................................................................... + sub v17.4S, v18.4S, v20.4S // ......................................................*......................................................................................................................................... + add v20.4S, v18.4S, v20.4S // .......................................................*........................................................................................................................................ + sqrdmulh v18.4S, v11.4S, v0.S[1] // ..........................*..................................................................................................................................................................... + mul v11.4S, v11.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + add v23.4S, v13.4S, v19.4S // .............................................*.................................................................................................................................................. + sub v19.4S, v13.4S, v19.4S // ............................................*................................................................................................................................................... + sqrdmulh v13.4S, v26.4S, v0.S[1] // ................*............................................................................................................................................................................... 
+ mul v26.4S, v26.4S, v0.S[0] // .................*.............................................................................................................................................................................. + sqrdmulh v28.4S, v22.4S, v0.S[1] // .....................*.......................................................................................................................................................................... + mul v22.4S, v22.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sqrdmulh v9.4S, v27.4S, v0.S[1] // ...............................*................................................................................................................................................................ + mul v27.4S, v27.4S, v0.S[0] // ................................*............................................................................................................................................................... + sqrdmulh v24.4S, v12.4S, v0.S[1] // ....................................*........................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + sqrdmulh v21.4S, v8.4S, v0.S[1] // ..............................................*................................................................................................................................................. 
+ mul v8.4S, v8.4S, v0.S[0] // ...............................................*................................................................................................................................................ + mla v26.4S, v13.4S, v29.4S // ..................*............................................................................................................................................................................. + mla v22.4S, v28.4S, v29.4S // .......................*........................................................................................................................................................................ + mla v11.4S, v18.4S, v29.4S // ............................*................................................................................................................................................................... + sub v18.4S, v30.4S, v26.4S // ...................*............................................................................................................................................................................ + add v30.4S, v30.4S, v26.4S // ....................*........................................................................................................................................................................... + mla v8.4S, v21.4S, v29.4S // ................................................*............................................................................................................................................... + sub v13.4S, v14.4S, v22.4S // ........................*....................................................................................................................................................................... 
+ mla v27.4S, v9.4S, v29.4S // .................................*.............................................................................................................................................................. + add v22.4S, v14.4S, v22.4S // .........................*...................................................................................................................................................................... + sub v14.4S, v31.4S, v11.4S // .............................*.................................................................................................................................................................. + add v11.4S, v31.4S, v11.4S // ..............................*................................................................................................................................................................. + mla v12.4S, v24.4S, v29.4S // ......................................*......................................................................................................................................................... + add v26.4S, v16.4S, v8.4S // ..................................................*............................................................................................................................................. + sub v8.4S, v16.4S, v8.4S // .................................................*.............................................................................................................................................. + sqrdmulh v31.4S, v20.4S, v0.S[3] // .......................................................................*........................................................................................................................ 
+ sub v16.4S, v25.4S, v27.4S // ..................................*............................................................................................................................................................. + add v27.4S, v25.4S, v27.4S // ...................................*............................................................................................................................................................ + mul v20.4S, v20.4S, v0.S[2] // ........................................................................*....................................................................................................................... + sub v25.4S, v10.4S, v12.4S // .......................................*........................................................................................................................................................ + add v12.4S, v10.4S, v12.4S // ........................................*....................................................................................................................................................... + mla v20.4S, v31.4S, v29.4S // .........................................................................*...................................................................................................................... + mul v10.4S, v23.4S, v0.S[2] // ..............................................................*................................................................................................................................. + sqrdmulh v31.4S, v23.4S, v0.S[3] // .............................................................*.................................................................................................................................. 
+ add v23.4S, v27.4S, v20.4S // ...........................................................................*.................................................................................................................... + sub v20.4S, v27.4S, v20.4S // ..........................................................................*..................................................................................................................... + sqrdmulh v27.4S, v17.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + mul v17.4S, v17.4S, v1.S[0] // ............................................................................................*................................................................................................... + sqrdmulh v28.4S, v19.4S, v1.S[1] // .................................................................................*.............................................................................................................. + mul v19.4S, v19.4S, v1.S[0] // ..................................................................................*............................................................................................................. + sqrdmulh v9.4S, v25.4S, v1.S[1] // ............................................................................*................................................................................................................... + mul v25.4S, v25.4S, v1.S[0] // .............................................................................*.................................................................................................................. 
+ sqrdmulh v24.4S, v12.4S, v0.S[3] // ........................................................*....................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // .........................................................*...................................................................................................................................... + sqrdmulh v21.4S, v8.4S, v1.S[1] // ......................................................................................*......................................................................................................... + mul v8.4S, v8.4S, v1.S[0] // .......................................................................................*........................................................................................................ + sqrdmulh v15.4S, v26.4S, v0.S[3] // ..................................................................*............................................................................................................................. + mul v26.4S, v26.4S, v0.S[2] // ...................................................................*............................................................................................................................ + mla v25.4S, v9.4S, v29.4S // ..............................................................................*................................................................................................................. + mla v12.4S, v24.4S, v29.4S // ..........................................................*..................................................................................................................................... 
+ mla v19.4S, v28.4S, v29.4S // ...................................................................................*............................................................................................................ + add v28.4S, v18.4S, v25.4S // ................................................................................*............................................................................................................... + sub v18.4S, v18.4S, v25.4S // ...............................................................................*................................................................................................................ + mla v8.4S, v21.4S, v29.4S // ........................................................................................*....................................................................................................... + sub v25.4S, v30.4S, v12.4S // ...........................................................*.................................................................................................................................... + add v30.4S, v30.4S, v12.4S // ............................................................*................................................................................................................................... + mla v26.4S, v15.4S, v29.4S // ....................................................................*........................................................................................................................... + sub v12.4S, v13.4S, v19.4S // ....................................................................................*........................................................................................................... 
+ add v19.4S, v13.4S, v19.4S // .....................................................................................*.......................................................................................................... + mla v10.4S, v31.4S, v29.4S // ...............................................................*................................................................................................................................ + sub v13.4S, v14.4S, v8.4S // .........................................................................................*...................................................................................................... + add v14.4S, v14.4S, v8.4S // ..........................................................................................*..................................................................................................... + mla v17.4S, v27.4S, v29.4S // .............................................................................................*.................................................................................................. + add v27.4S, v11.4S, v26.4S // ......................................................................*......................................................................................................................... + sub v11.4S, v11.4S, v26.4S // .....................................................................*.......................................................................................................................... + sqrdmulh v31.4S, v23.4S, v1.S[3] // .....................................................................................................*.......................................................................................... 
+ sub v9.4S, v22.4S, v10.4S // ................................................................*............................................................................................................................... + mul v23.4S, v23.4S, v1.S[2] // ......................................................................................................*......................................................................................... + add v22.4S, v22.4S, v10.4S // .................................................................*.............................................................................................................................. sub count, count, #1 -.p2align 2 layer1234_start: - mul v23.4S, v23.4S, v2.S[0] // gap(s) to follow - sqrdmulh v11.4S, v11.4S, v1.S[1] // gap(s) to follow - sqrdmulh v12.4S, v22.4S, v1.S[1] // gap(s) to follow - mul v22.4S, v22.4S, v1.S[0] // gap(s) to follow - mla v27.4S, v11.4S, v29.4S // gap(s) to follow - mla v22.4S, v12.4S, v29.4S // gap(s) to follow - mul v28.4S, v28.4S, v2.S[0] // gap(s) to follow - sub v11.4S, v16.4S, v27.4S // gap(s) to follow - add v27.4S, v16.4S, v27.4S - mla v28.4S, v24.4S, v29.4S // gap(s) to follow - sub v24.4S, v25.4S, v22.4S // gap(s) to follow - add v22.4S, v25.4S, v22.4S - mla v23.4S, v31.4S, v29.4S // gap(s) to follow - mla v15.4S, v13.4S, v29.4S // gap(s) to follow - add v13.4S, v20.4S, v28.4S // gap(s) to follow - sub v12.4S, v20.4S, v28.4S - sqrdmulh v31.4S, v22.4S, v2.S[3] // gap(s) to follow - add v25.4S, v8.4S, v23.4S // gap(s) to follow - sub v23.4S, v8.4S, v23.4S - sqrdmulh v16.4S, v24.4S, v3.S[1] - ldr_vo v20, x0, 912 // gap(s) to follow - sqrdmulh v8.4S, v18.4S, v1.S[3] // gap(s) to follow - add v28.4S, v19.4S, v15.4S // gap(s) to follow - sub v18.4S, v19.4S, v15.4S - sqrdmulh v15.4S, v20.4S, v0.S[1] // gap(s) to follow - sqrdmulh v19.4S, v10.4S, v3.S[1] // gap(s) to follow - mul v10.4S, v24.4S, v3.S[0] // gap(s) to follow - mla v10.4S, v16.4S, v29.4S // 
gap(s) to follow - mla v26.4S, v19.4S, v29.4S // gap(s) to follow - mla v9.4S, v8.4S, v29.4S // gap(s) to follow - add v24.4S, v11.4S, v10.4S // gap(s) to follow - mul v8.4S, v22.4S, v2.S[2] // gap(s) to follow - add v22.4S, v17.4S, v26.4S - ldr_vo v16, x0, 784 - sub v26.4S, v17.4S, v26.4S - sqrdmulh v17.4S, v24.4S, v6.S[3] - sub v19.4S, v14.4S, v9.4S // gap(s) to follow - mla v8.4S, v31.4S, v29.4S // gap(s) to follow - sub v10.4S, v11.4S, v10.4S - mul v31.4S, v13.4S, v4.S[2] // gap(s) to follow - sqrdmulh v13.4S, v13.4S, v4.S[3] - add v11.4S, v30.4S, v21.4S // gap(s) to follow - sub v21.4S, v30.4S, v21.4S - mul v30.4S, v24.4S, v6.S[2] // gap(s) to follow - add v24.4S, v14.4S, v9.4S // gap(s) to follow - mla v30.4S, v17.4S, v29.4S // gap(s) to follow - sqrdmulh v17.4S, v12.4S, v5.S[1] // gap(s) to follow - mul v12.4S, v12.4S, v5.S[0] // gap(s) to follow - mul v20.4S, v20.4S, v0.S[0] // gap(s) to follow - mla v12.4S, v17.4S, v29.4S - add v17.4S, v22.4S, v30.4S // gap(s) to follow - sqrdmulh v9.4S, v10.4S, v7.S[1] // gap(s) to follow - str_vo v17, x0, 768 // gap(s) to follow - mul v17.4S, v10.4S, v7.S[0] // gap(s) to follow - sub v14.4S, v22.4S, v30.4S // gap(s) to follow - add v22.4S, v27.4S, v8.4S - mla v20.4S, v15.4S, v29.4S // gap(s) to follow - ldr_vo v15, x0, 848 - sub v10.4S, v27.4S, v8.4S - str_vo v14, x0, 832 - sqrdmulh v30.4S, v22.4S, v5.S[3] - sub v27.4S, v23.4S, v12.4S // gap(s) to follow - mul v22.4S, v22.4S, v5.S[2] // gap(s) to follow - str_vo v27, x0, 448 - sqrdmulh v27.4S, v19.4S, v4.S[1] // gap(s) to follow - mla v22.4S, v30.4S, v29.4S // gap(s) to follow - mul v19.4S, v19.4S, v4.S[0] // gap(s) to follow - mla v17.4S, v9.4S, v29.4S // gap(s) to follow - sub v8.4S, v28.4S, v22.4S // gap(s) to follow - mla v19.4S, v27.4S, v29.4S // gap(s) to follow - sqrdmulh v30.4S, v10.4S, v6.S[1] // gap(s) to follow - add v9.4S, v26.4S, v17.4S // gap(s) to follow - mul v14.4S, v10.4S, v6.S[0] - add v10.4S, v23.4S, v12.4S // gap(s) to follow - sub v27.4S, v21.4S, 
v19.4S // gap(s) to follow - mla v31.4S, v13.4S, v29.4S - add v12.4S, v21.4S, v19.4S - mla v14.4S, v30.4S, v29.4S - str_vo v9, x0, 896 - sub v9.4S, v25.4S, v31.4S - add v13.4S, v28.4S, v22.4S - str_vo v27, x0, 192 - ldr_vo v22, x0, 656 - mul v28.4S, v24.4S, v3.S[2] - add v19.4S, v25.4S, v31.4S - ldr_vo v25, x0, 528 - str_vo v9, x0, 320 - ldr_vo v9, x0, 400 - ldr_vo v21, x0, 976 - sub v26.4S, v26.4S, v17.4S - ldr_vo v27, x0, 592 - sqrdmulh v31.4S, v16.4S, v0.S[1] - str_vo v10, x0, 384 - str_vo v19, x0, 256 - add v10.4S, v18.4S, v14.4S - str_vo v13, x0, 512 - add v23.4S, v9.4S, v20.4S - str_vo v8, x0, 576 - mul v30.4S, v16.4S, v0.S[0] - str_vo v10, x0, 640 - sub v10.4S, v9.4S, v20.4S - str_vo v26, x0, 960 - sub v8.4S, v18.4S, v14.4S - str_vo v12, x0, 128 - mla v30.4S, v31.4S, v29.4S - ldr_vo v9, x0, 272 // gap(s) to follow - sqrdmulh v18.4S, v10.4S, v1.S[1] // gap(s) to follow - str_vo v8, x0, 704 // gap(s) to follow - sqrdmulh v17.4S, v27.4S, v0.S[1] // gap(s) to follow - add v16.4S, v9.4S, v30.4S // gap(s) to follow - sqrdmulh v14.4S, v25.4S, v0.S[1] // gap(s) to follow - sqrdmulh v26.4S, v16.4S, v0.S[3] // gap(s) to follow - sqrdmulh v13.4S, v24.4S, v3.S[3] // gap(s) to follow - mul v8.4S, v16.4S, v0.S[2] // gap(s) to follow - mla v8.4S, v26.4S, v29.4S // gap(s) to follow - mul v31.4S, v21.4S, v0.S[0] // gap(s) to follow - mul v16.4S, v27.4S, v0.S[0] // gap(s) to follow - mul v20.4S, v10.4S, v1.S[0] // gap(s) to follow - sqrdmulh v26.4S, v15.4S, v0.S[1] // gap(s) to follow - mul v27.4S, v15.4S, v0.S[0] // gap(s) to follow - sqrdmulh v15.4S, v23.4S, v0.S[3] // gap(s) to follow - mla v27.4S, v26.4S, v29.4S - ldr_vo v26, x0, 336 // gap(s) to follow - sqrdmulh v19.4S, v22.4S, v0.S[1] // gap(s) to follow - mla v28.4S, v13.4S, v29.4S // gap(s) to follow - mul v10.4S, v22.4S, v0.S[0] - add v22.4S, v26.4S, v27.4S // gap(s) to follow - sqrdmulh v24.4S, v21.4S, v0.S[1] // gap(s) to follow - sub v21.4S, v11.4S, v28.4S // gap(s) to follow - add v12.4S, v11.4S, v28.4S - ldr_vo 
v28, x0, 144 - sqrdmulh v11.4S, v22.4S, v0.S[3] // gap(s) to follow - mla v10.4S, v19.4S, v29.4S - str_vo v21, x0, 64 // gap(s) to follow - mul v19.4S, v23.4S, v0.S[2] // gap(s) to follow - mla v19.4S, v15.4S, v29.4S // gap(s) to follow - add v15.4S, v28.4S, v10.4S // gap(s) to follow - str_vi v12, x0, 16 - mul v12.4S, v25.4S, v0.S[0] // gap(s) to follow - mla v12.4S, v14.4S, v29.4S - sub v14.4S, v9.4S, v30.4S // gap(s) to follow - sub v23.4S, v15.4S, v19.4S // gap(s) to follow - sub v30.4S, v28.4S, v10.4S - mla v31.4S, v24.4S, v29.4S // gap(s) to follow - mla v16.4S, v17.4S, v29.4S // gap(s) to follow - ldr_vo v25, x0, 0 - ldr_vo v28, x0, 448 // gap(s) to follow - ldr_vo v24, x0, 64 // gap(s) to follow - mul v9.4S, v22.4S, v0.S[2] // gap(s) to follow - mla v9.4S, v11.4S, v29.4S - ldr_vo v17, x0, 704 - sub v11.4S, v26.4S, v27.4S // gap(s) to follow - sub v22.4S, v28.4S, v31.4S - mla v20.4S, v18.4S, v29.4S // gap(s) to follow - add v13.4S, v25.4S, v12.4S - add v10.4S, v28.4S, v31.4S // gap(s) to follow - add v21.4S, v24.4S, v16.4S - mul v26.4S, v17.4S, v0.S[0] // gap(s) to follow - sub v16.4S, v24.4S, v16.4S // gap(s) to follow - sqrdmulh v28.4S, v10.4S, v0.S[3] - sub v12.4S, v25.4S, v12.4S - ldr_vo v31, x0, 192 // gap(s) to follow - add v24.4S, v30.4S, v20.4S // gap(s) to follow - sqrdmulh v25.4S, v17.4S, v0.S[1] // gap(s) to follow - add v17.4S, v15.4S, v19.4S // gap(s) to follow - mul v15.4S, v10.4S, v0.S[2] // gap(s) to follow - sub v10.4S, v30.4S, v20.4S // gap(s) to follow - mul v19.4S, v14.4S, v1.S[0] // gap(s) to follow - add v30.4S, v13.4S, v8.4S - sub v8.4S, v13.4S, v8.4S // gap(s) to follow - mla v26.4S, v25.4S, v29.4S // gap(s) to follow - mla v15.4S, v28.4S, v29.4S // gap(s) to follow - sqrdmulh v13.4S, v24.4S, v2.S[3] // gap(s) to follow - sub v25.4S, v31.4S, v26.4S // gap(s) to follow - add v20.4S, v31.4S, v26.4S - mul v27.4S, v11.4S, v1.S[0] // gap(s) to follow - sqrdmulh v14.4S, v14.4S, v1.S[1] // gap(s) to follow - add v18.4S, v20.4S, v15.4S // 
gap(s) to follow - sub v28.4S, v20.4S, v15.4S - mul v26.4S, v10.4S, v3.S[0] // gap(s) to follow - sub v20.4S, v21.4S, v9.4S // gap(s) to follow - mul v15.4S, v24.4S, v2.S[2] // gap(s) to follow - mla v19.4S, v14.4S, v29.4S // gap(s) to follow - sqrdmulh v31.4S, v17.4S, v1.S[3] // gap(s) to follow - add v14.4S, v21.4S, v9.4S // gap(s) to follow - mul v21.4S, v17.4S, v1.S[2] // gap(s) to follow - sqrdmulh v24.4S, v28.4S, v2.S[1] // gap(s) to follow - sub v17.4S, v12.4S, v19.4S // gap(s) to follow - mla v21.4S, v31.4S, v29.4S // gap(s) to follow - sqrdmulh v31.4S, v23.4S, v2.S[1] // gap(s) to follow - mul v9.4S, v18.4S, v1.S[2] // gap(s) to follow - add v19.4S, v12.4S, v19.4S // gap(s) to follow - subs count, count, #1 + // Instructions: 192 + // Expected cycles: 64 + // Expected IPC: 3.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + mul v10.4S, v20.4S, v2.S[0] // ..........*..................................................................................................................................................................................... + ldr q21, [x0, #784] // .........................................................................................................*...................................................................................... + add v8.4S, v16.4S, v17.4S // .*.............................................................................................................................................................................................. 
+ sqrdmulh v26.4S, v13.4S, v3.S[1] // .......*........................................................................................................................................................................................ + sqrdmulh v24.4S, v8.4S, v2.S[3] // ................................*............................................................................................................................................................... + mul v15.4S, v13.4S, v3.S[0] // ........*....................................................................................................................................................................................... + mla v15.4S, v26.4S, v29.4S // ................*............................................................................................................................................................................... + mul v26.4S, v8.4S, v2.S[2] // ..................................*............................................................................................................................................................. + sqrdmulh v8.4S, v21.4S, v0.S[1] // ..................................................................................................................................*............................................................. + sub v13.4S, v18.4S, v15.4S // ......................*......................................................................................................................................................................... + add v15.4S, v18.4S, v15.4S // .....................*.......................................................................................................................................................................... 
+ mul v21.4S, v21.4S, v0.S[0] // ...................................................................................................................................*............................................................ + mla v23.4S, v31.4S, v29.4S // *............................................................................................................................................................................................... + mla v26.4S, v24.4S, v29.4S // ...................................*............................................................................................................................................................ + mla v21.4S, v8.4S, v29.4S // .................................................................................................................................................*.............................................. + sub v8.4S, v22.4S, v23.4S // .....*.......................................................................................................................................................................................... + add v24.4S, v22.4S, v23.4S // ......*......................................................................................................................................................................................... + sqrdmulh v31.4S, v20.4S, v2.S[1] // .........*...................................................................................................................................................................................... + sub v23.4S, v16.4S, v17.4S // ..*............................................................................................................................................................................................. 
+ sqrdmulh v20.4S, v23.4S, v3.S[1] // ..........................*..................................................................................................................................................................... + mla v10.4S, v31.4S, v29.4S // .......................*........................................................................................................................................................................ + sqrdmulh v17.4S, v11.4S, v2.S[1] // ...........*.................................................................................................................................................................................... + mul v18.4S, v23.4S, v3.S[0] // .............................*.................................................................................................................................................................. + sub v22.4S, v9.4S, v10.4S // ..............................*................................................................................................................................................................. + mla v18.4S, v20.4S, v29.4S // .................................*.............................................................................................................................................................. + sqrdmulh v20.4S, v22.4S, v5.S[1] // ..................................................*............................................................................................................................................. + mul v16.4S, v11.4S, v2.S[0] // ............*................................................................................................................................................................................... 
+ mla v16.4S, v17.4S, v29.4S // .................*.............................................................................................................................................................................. + add v17.4S, v9.4S, v10.4S // ...............................*................................................................................................................................................................ + mul v10.4S, v22.4S, v5.S[0] // ...................................................*............................................................................................................................................ + sub v22.4S, v12.4S, v18.4S // .....................................*.......................................................................................................................................................... + mla v10.4S, v20.4S, v29.4S // ..........................................................*..................................................................................................................................... + sub v23.4S, v25.4S, v16.4S // ........................*....................................................................................................................................................................... + sqrdmulh v31.4S, v22.4S, v7.S[1] // ............................................*................................................................................................................................................... + add v11.4S, v12.4S, v18.4S // ....................................*........................................................................................................................................................... 
+ add v18.4S, v25.4S, v16.4S // .........................*...................................................................................................................................................................... + mul v16.4S, v22.4S, v7.S[0] // .................................................*.............................................................................................................................................. + add v9.4S, v23.4S, v10.4S // ......................................................................................*......................................................................................................... + ldr q20, [x0, #720] // .......................................................................................................*........................................................................................ + str q9, [x0, #384] // ...........................................................................................*.................................................................................................... + mul v9.4S, v8.4S, v4.S[0] // .......................................*........................................................................................................................................................ + add v25.4S, v19.4S, v26.4S // ........................................*....................................................................................................................................................... + sub v12.4S, v19.4S, v26.4S // .........................................*...................................................................................................................................................... 
+ sqrdmulh v22.4S, v14.4S, v2.S[3] // ...*............................................................................................................................................................................................ + sub v26.4S, v23.4S, v10.4S // .................................................................*.............................................................................................................................. + sqrdmulh v19.4S, v25.4S, v5.S[3] // .............................................*.................................................................................................................................................. + str q26, [x0, #448] // ......................................................................*......................................................................................................................... + mul v10.4S, v25.4S, v5.S[2] // ..............................................*................................................................................................................................................. + sqrdmulh v23.4S, v8.4S, v4.S[1] // ......................................*......................................................................................................................................................... + mla v10.4S, v19.4S, v29.4S // ..................................................................*............................................................................................................................. + mul v25.4S, v14.4S, v2.S[2] // ....*........................................................................................................................................................................................... 
+ mla v9.4S, v23.4S, v29.4S // .......................................................*........................................................................................................................................ + sqrdmulh v23.4S, v12.4S, v6.S[1] // ...............................................*................................................................................................................................................ + mul v26.4S, v12.4S, v6.S[0] // ................................................*............................................................................................................................................... + mul v8.4S, v24.4S, v3.S[2] // ...........................................*.................................................................................................................................................... + sqrdmulh v14.4S, v24.4S, v3.S[3] // ..........................................*..................................................................................................................................................... + mla v16.4S, v31.4S, v29.4S // ................................................................*............................................................................................................................... + mla v25.4S, v22.4S, v29.4S // ...............*................................................................................................................................................................................ + sub v31.4S, v13.4S, v16.4S // .........................................................................*...................................................................................................................... 
+ mul v22.4S, v20.4S, v0.S[0] // .................................................................................................................................*.............................................................. + add v12.4S, v13.4S, v16.4S // ........................................................................*....................................................................................................................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ................................................................................................................................*............................................................... + str q31, [x0, #960] // ................................................................................*............................................................................................................... + str q12, [x0, #896] // ...............................................................................*................................................................................................................ + mul v12.4S, v11.4S, v6.S[2] // ..............................................................................*................................................................................................................. + sqrdmulh v11.4S, v11.4S, v6.S[3] // .......................................................................*........................................................................................................................ + mla v8.4S, v14.4S, v29.4S // .....................................................*.......................................................................................................................................... 
+ sqrdmulh v16.4S, v27.4S, v1.S[3] // .............*.................................................................................................................................................................................. + mla v12.4S, v11.4S, v29.4S // .................................................................................*.............................................................................................................. + mul v13.4S, v27.4S, v1.S[2] // ..............*................................................................................................................................................................................. + add v27.4S, v28.4S, v25.4S // ..................*............................................................................................................................................................................. + mla v13.4S, v16.4S, v29.4S // ....................*........................................................................................................................................................................... + sub v16.4S, v28.4S, v25.4S // ...................*............................................................................................................................................................................ + sub v28.4S, v15.4S, v12.4S // .........................................................................................*...................................................................................................... + add v14.4S, v15.4S, v12.4S // ..........................................................................................*..................................................................................................... 
+ sqrdmulh v11.4S, v17.4S, v4.S[3] // ....................................................*........................................................................................................................................... + add v15.4S, v27.4S, v10.4S // ............................................................................*................................................................................................................... + str q28, [x0, #832] // ..............................................................................................*................................................................................................. + mla v26.4S, v23.4S, v29.4S // ..........................................................................*..................................................................................................................... + str q14, [x0, #768] // ...............................................................................................*................................................................................................ + str q15, [x0, #512] // ..................................................................................*............................................................................................................. + sub v19.4S, v27.4S, v10.4S // .............................................................................*.................................................................................................................. + ldr q27, [x0, #400] // ................................................................................................................*............................................................................... 
+ mul v17.4S, v17.4S, v4.S[2] // ......................................................*......................................................................................................................................... + sub v12.4S, v30.4S, v13.4S // ............................*................................................................................................................................................................... + ldr q14, [x0, #912] // ............................................................................................................*................................................................................... + add v30.4S, v30.4S, v13.4S // ...........................*.................................................................................................................................................................... + ldr q13, [x0, #272] // .................................................................................................................*.............................................................................. + ldr q31, [x0, #144] // ..............................................................................................................*................................................................................. + mla v17.4S, v11.4S, v29.4S // ...........................................................*.................................................................................................................................... + str q19, [x0, #576] // ...................................................................................*............................................................................................................ + ldr q19, [x0, #656] // ....................................................................................................*........................................................................................... 
+ sub v11.4S, v12.4S, v9.4S // ...............................................................*................................................................................................................................ + ldr q25, [x0, #208] // ...............................................................................................................*................................................................................ + ldr q10, [x0, #976] // ................................................................................................*............................................................................................... + add v12.4S, v12.4S, v9.4S // ..............................................................*................................................................................................................................. + mla v22.4S, v20.4S, v29.4S // .............................................................................................................................................*.................................................. + str q11, [x0, #192] // ....................................................................*........................................................................................................................... + str q12, [x0, #128] // ...................................................................*............................................................................................................................ + sub v20.4S, v16.4S, v26.4S // .....................................................................................*.......................................................................................................... + ldr q11, [x0, #528] // ...........................................................................................................*.................................................................................... 
+ ldr q12, [x0, #464] // .................................................................................................*.............................................................................................. + add v26.4S, v16.4S, v26.4S // ....................................................................................*........................................................................................................... + sqrdmulh v23.4S, v14.4S, v0.S[1] // ....................................................................................................................................*........................................................... + ldr q28, [x0, #592] // ...................................................................................................*............................................................................................ + add v16.4S, v18.4S, v17.4S // .....................................................................*.......................................................................................................................... + ldr q9, [x0, #80] // ..........................................................................................................*..................................................................................... + ldr q24, [x0, #336] // ......................................................................................................*......................................................................................... + mul v14.4S, v14.4S, v0.S[0] // .....................................................................................................................................*.......................................................... 
+ sub v18.4S, v18.4S, v17.4S // .......................................................................................*........................................................................................................ + str q20, [x0, #704] // ............................................................................................*................................................................................................... + str q26, [x0, #640] // ........................................................................................*....................................................................................................... + ldr q20, [x0, #848] // ..................................................................................................*............................................................................................. + sub v17.4S, v13.4S, v21.4S // ........................................................................................................................................................*....................................... + add v13.4S, v13.4S, v21.4S // .........................................................................................................................................................*...................................... + str q16, [x0, #256] // ...........................................................................*.................................................................................................................... + sqrdmulh v26.4S, v19.4S, v0.S[1] // ........................................................................................................................*....................................................................... + str q18, [x0, #320] // .............................................................................................*.................................................................................................. 
+ sub v18.4S, v30.4S, v8.4S // .........................................................*...................................................................................................................................... + ldr q21, [x0, #16] // .....................................................................................................*.......................................................................................... + mul v19.4S, v19.4S, v0.S[0] // .........................................................................................................................*...................................................................... + add v30.4S, v30.4S, v8.4S // ........................................................*....................................................................................................................................... + sub v16.4S, v25.4S, v22.4S // .....................................................................................................................................................*.......................................... + add v22.4S, v25.4S, v22.4S // ......................................................................................................................................................*......................................... + str q18, [x0, #64] // .............................................................*.................................................................................................................................. + sqrdmulh v18.4S, v10.4S, v0.S[1] // ........................................................................................................*....................................................................................... 
+ str q30, [x0], #(16) // ............................................................*................................................................................................................................... + mul v30.4S, v10.4S, v0.S[0] // .............................................................................................................*.................................................................................. + mla v14.4S, v23.4S, v29.4S // ...........................................................................................................................................*.................................................... + mla v19.4S, v26.4S, v29.4S // ........................................................................................................................................*....................................................... + sqrdmulh v26.4S, v11.4S, v0.S[1] // ............................................................................................................................*................................................................... + add v8.4S, v27.4S, v14.4S // ..................................................................................................................................................*............................................. + sub v27.4S, v27.4S, v14.4S // ...................................................................................................................................................*............................................ + mul v11.4S, v11.4S, v0.S[0] // .............................................................................................................................*.................................................................. 
+ sub v14.4S, v31.4S, v19.4S // ...............................................................................................................................................*................................................ + add v19.4S, v31.4S, v19.4S // ................................................................................................................................................*............................................... + mla v30.4S, v18.4S, v29.4S // ...................................................................................................................*............................................................................ + sqrdmulh v18.4S, v28.4S, v0.S[1] // ..............................................................................................................................*................................................................. + mul v31.4S, v28.4S, v0.S[0] // ...............................................................................................................................*................................................................ + sub v25.4S, v12.4S, v30.4S // ......................................................................................................................*......................................................................... + add v30.4S, v12.4S, v30.4S // .......................................................................................................................*........................................................................ + sqrdmulh v12.4S, v20.4S, v0.S[1] // ..................................................................................................................*............................................................................. 
+ mla v31.4S, v18.4S, v29.4S // .......................................................................................................................................*........................................................ + mul v20.4S, v20.4S, v0.S[0] // ....................................................................................................................*........................................................................... + mla v20.4S, v12.4S, v29.4S // .....................................................................................................................*.......................................................................... + sub v18.4S, v9.4S, v31.4S // ............................................................................................................................................*................................................... + add v31.4S, v9.4S, v31.4S // ..............................................................................................................................................*................................................. + mul v10.4S, v17.4S, v1.S[0] // ....................................................................................................................................................................*........................... + sqrdmulh v23.4S, v17.4S, v1.S[1] // ...................................................................................................................................................................*............................ + add v12.4S, v24.4S, v20.4S // ..........................................................................................................................*..................................................................... 
+ sub v28.4S, v24.4S, v20.4S // ...........................................................................................................................*.................................................................... + sqrdmulh v17.4S, v13.4S, v0.S[3] // .....................................................................................................................................................................*.......................... + mul v9.4S, v13.4S, v0.S[2] // ......................................................................................................................................................................*......................... + mla v11.4S, v26.4S, v29.4S // ......................................................................................................................................*......................................................... + sqrdmulh v20.4S, v30.4S, v0.S[3] // ....................................................................................................................................................*........................................... + sqrdmulh v13.4S, v27.4S, v1.S[1] // .......................................................................................................................................................................*........................ + sub v26.4S, v21.4S, v11.4S // .........................................................................................................................................*...................................................... + add v24.4S, v21.4S, v11.4S // ..........................................................................................................................................*..................................................... 
+ mul v11.4S, v30.4S, v0.S[2] // .......................................................................................................................................................*........................................ + mla v11.4S, v20.4S, v29.4S // ..........................................................................................................................................................*..................................... + mul v30.4S, v27.4S, v1.S[0] // ........................................................................................................................................................................*....................... + sqrdmulh v27.4S, v8.4S, v0.S[3] // .........................................................................................................................................................................*...................... + add v21.4S, v22.4S, v11.4S // .............................................................................................................................................................*.................................. + sub v20.4S, v22.4S, v11.4S // ..............................................................................................................................................................*................................. + mul v22.4S, v8.4S, v0.S[2] // ..........................................................................................................................................................................*..................... + mla v30.4S, v13.4S, v29.4S // ................................................................................................................................................................................*............... 
+ mla v22.4S, v27.4S, v29.4S // ...................................................................................................................................................................................*............ + mla v9.4S, v17.4S, v29.4S // ............................................................................................................................................................................*................... + sub v13.4S, v14.4S, v30.4S // .......................................................................................................................................................................................*........ + add v14.4S, v14.4S, v30.4S // ........................................................................................................................................................................................*....... + sqrdmulh v8.4S, v25.4S, v1.S[1] // ...............................................................................................................................................................*................................ + sub v11.4S, v19.4S, v22.4S // ...........................................................................................................................................................................................*.... + add v27.4S, v19.4S, v22.4S // ..........................................................................................................................................................................................*..... + mul v17.4S, v25.4S, v1.S[0] // ................................................................................................................................................................*............................... 
+ add v30.4S, v24.4S, v9.4S // ..................................................................................................................................................................................*............. + sub v25.4S, v24.4S, v9.4S // .................................................................................................................................................................................*.............. + sqrdmulh v19.4S, v28.4S, v1.S[1] // .................................................................................................................................................................*.............................. + mul v22.4S, v28.4S, v1.S[0] // ..................................................................................................................................................................*............................. + sqrdmulh v28.4S, v12.4S, v0.S[3] // ............................................................................................................................................................*................................... + mla v22.4S, v19.4S, v29.4S // .............................................................................................................................................................................*.................. + mul v24.4S, v12.4S, v0.S[2] // ...........................................................................................................................................................*.................................... + mla v24.4S, v28.4S, v29.4S // ......................................................................................................................................................................................*......... 
+ add v19.4S, v18.4S, v22.4S // .....................................................................................................................................................................................*.......... + sub v12.4S, v18.4S, v22.4S // ....................................................................................................................................................................................*........... + mla v10.4S, v23.4S, v29.4S // ...........................................................................................................................................................................*.................... + mla v17.4S, v8.4S, v29.4S // .........................................................................................................................................................................................*...... + sub v9.4S, v31.4S, v24.4S // .............................................................................................................................................................................................*.. + add v22.4S, v31.4S, v24.4S // ...............................................................................................................................................................................................* + mul v23.4S, v21.4S, v1.S[2] // ..............................................................................................................................................................................................*. + sub v18.4S, v26.4S, v10.4S // ...............................................................................................................................................................................*................ 
+ add v28.4S, v26.4S, v10.4S // ..............................................................................................................................................................................*................. + sqrdmulh v31.4S, v21.4S, v1.S[3] // ............................................................................................................................................................................................*... + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mla v23.4S, v31.4S, v29.4S // ............*................................................................................................................................................................................... + // add v26.4S, v16.4S, v17.4S // ..*............................................................................................................................................................................................. + // sub v17.4S, v16.4S, v17.4S // ..................*............................................................................................................................................................................. + // sqrdmulh v8.4S, v14.4S, v2.S[3] // ...........................................*.................................................................................................................................................... 
+ // mul v14.4S, v14.4S, v2.S[2] // ..................................................*............................................................................................................................................. + // sub v31.4S, v22.4S, v23.4S // ...............*................................................................................................................................................................................ + // add v22.4S, v22.4S, v23.4S // ................*............................................................................................................................................................................... + // sqrdmulh v16.4S, v13.4S, v3.S[1] // ...*............................................................................................................................................................................................ + // mul v13.4S, v13.4S, v3.S[0] // .....*.......................................................................................................................................................................................... + // sqrdmulh v10.4S, v20.4S, v2.S[1] // .................*.............................................................................................................................................................................. + // mul v20.4S, v20.4S, v2.S[0] // *............................................................................................................................................................................................... + // sqrdmulh v23.4S, v11.4S, v2.S[1] // .....................*.......................................................................................................................................................................... 
+ // mul v11.4S, v11.4S, v2.S[0] // ..........................*..................................................................................................................................................................... + // sqrdmulh v24.4S, v27.4S, v1.S[3] // ...................................................................*............................................................................................................................ + // mul v27.4S, v27.4S, v1.S[2] // .....................................................................*.......................................................................................................................... + // mla v14.4S, v8.4S, v29.4S // .........................................................*...................................................................................................................................... + // mla v13.4S, v16.4S, v29.4S // ......*......................................................................................................................................................................................... + // mla v11.4S, v23.4S, v29.4S // ...........................*.................................................................................................................................................................... + // add v8.4S, v28.4S, v14.4S // ......................................................................*......................................................................................................................... + // sub v14.4S, v28.4S, v14.4S // ........................................................................*....................................................................................................................... 
+ // mla v27.4S, v24.4S, v29.4S // .......................................................................*........................................................................................................................ + // add v16.4S, v18.4S, v13.4S // ..........*..................................................................................................................................................................................... + // sub v18.4S, v18.4S, v13.4S // .........*...................................................................................................................................................................................... + // mla v20.4S, v10.4S, v29.4S // ....................*........................................................................................................................................................................... + // sub v13.4S, v25.4S, v11.4S // ................................*............................................................................................................................................................... + // add v11.4S, v25.4S, v11.4S // ...................................*............................................................................................................................................................ + // sqrdmulh v25.4S, v17.4S, v3.S[1] // ...................*............................................................................................................................................................................ + // add v10.4S, v30.4S, v27.4S // ......................................................................................*......................................................................................................... 
+ // sub v30.4S, v30.4S, v27.4S // ....................................................................................*........................................................................................................... + // mul v27.4S, v17.4S, v3.S[0] // ......................*......................................................................................................................................................................... + // sub v17.4S, v9.4S, v20.4S // .......................*........................................................................................................................................................................ + // add v20.4S, v9.4S, v20.4S // ............................*................................................................................................................................................................... + // sqrdmulh v23.4S, v26.4S, v2.S[3] // ....*........................................................................................................................................................................................... + // mla v27.4S, v25.4S, v29.4S // ........................*....................................................................................................................................................................... + // mul v26.4S, v26.4S, v2.S[2] // .......*........................................................................................................................................................................................ + // mla v26.4S, v23.4S, v29.4S // .............*.................................................................................................................................................................................. 
+ // add v25.4S, v12.4S, v27.4S // ..................................*............................................................................................................................................................. + // sub v27.4S, v12.4S, v27.4S // ..............................*................................................................................................................................................................. + // sqrdmulh v12.4S, v31.4S, v4.S[1] // ................................................*............................................................................................................................................... + // mul v31.4S, v31.4S, v4.S[0] // ........................................*....................................................................................................................................................... + // add v23.4S, v19.4S, v26.4S // .........................................*...................................................................................................................................................... + // sub v19.4S, v19.4S, v26.4S // ..........................................*..................................................................................................................................................... + // sqrdmulh v26.4S, v22.4S, v3.S[3] // .......................................................*........................................................................................................................................ + // mul v22.4S, v22.4S, v3.S[2] // ......................................................*......................................................................................................................................... 
+ // sqrdmulh v28.4S, v27.4S, v7.S[1] // .................................*.............................................................................................................................................................. + // sqrdmulh v9.4S, v23.4S, v5.S[3] // .............................................*.................................................................................................................................................. + // mul v23.4S, v23.4S, v5.S[2] // ...............................................*................................................................................................................................................ + // sqrdmulh v24.4S, v19.4S, v6.S[1] // ....................................................*........................................................................................................................................... + // mul v19.4S, v19.4S, v6.S[0] // .....................................................*.......................................................................................................................................... + // mul v27.4S, v27.4S, v7.S[0] // ....................................*........................................................................................................................................................... + // sqrdmulh v21.4S, v17.4S, v5.S[1] // .........................*...................................................................................................................................................................... + // mul v17.4S, v17.4S, v5.S[0] // .............................*.................................................................................................................................................................. 
+ // sqrdmulh v15.4S, v20.4S, v4.S[3] // ...........................................................................*.................................................................................................................... + // mla v22.4S, v26.4S, v29.4S // ..................................................................*............................................................................................................................. + // mul v20.4S, v20.4S, v4.S[2] // ...................................................................................*............................................................................................................ + // mla v31.4S, v12.4S, v29.4S // ...................................................*............................................................................................................................................ + // add v12.4S, v10.4S, v22.4S // .........................................................................................................................*...................................................................... + // sub v22.4S, v10.4S, v22.4S // ......................................................................................................................*......................................................................... + // mla v17.4S, v21.4S, v29.4S // ...............................*................................................................................................................................................................ + // mla v20.4S, v15.4S, v29.4S // .........................................................................................*...................................................................................................... 
+ // str q12, [x0], #(16) // ..............................................................................................................................*................................................................. + // str q22, [x0, #48] // ............................................................................................................................*................................................................... + // add v22.4S, v30.4S, v31.4S // ...............................................................................................*................................................................................................ + // sub v30.4S, v30.4S, v31.4S // ............................................................................................*................................................................................................... + // mla v27.4S, v28.4S, v29.4S // ........................................................*....................................................................................................................................... + // sub v12.4S, v13.4S, v17.4S // ............................................*................................................................................................................................................... + // mla v23.4S, v9.4S, v29.4S // .................................................*.............................................................................................................................................. + // str q22, [x0, #112] // ..................................................................................................*............................................................................................. 
+ // str q30, [x0, #176] // .................................................................................................*.............................................................................................. + // add v22.4S, v11.4S, v20.4S // .........................................................................................................*...................................................................................... + // str q12, [x0, #432] // ..............................................*................................................................................................................................................. + // sqrdmulh v30.4S, v25.4S, v6.S[3] // .................................................................*.............................................................................................................................. + // add v12.4S, v18.4S, v27.4S // ............................................................*................................................................................................................................... + // sub v18.4S, v18.4S, v27.4S // ..........................................................*..................................................................................................................................... + // mla v19.4S, v24.4S, v29.4S // ..............................................................................*................................................................................................................. + // str q22, [x0, #240] // ...................................................................................................................*............................................................................ 
+ // add v22.4S, v8.4S, v23.4S // ............................................................................*................................................................................................................... + // sub v27.4S, v8.4S, v23.4S // .................................................................................*.............................................................................................................. + // mul v26.4S, v25.4S, v6.S[2] // ................................................................*............................................................................................................................... + // str q12, [x0, #880] // ...............................................................*................................................................................................................................ + // str q18, [x0, #944] // ..............................................................*................................................................................................................................. + // mla v26.4S, v30.4S, v29.4S // ....................................................................*........................................................................................................................... + // str q22, [x0, #496] // ................................................................................*............................................................................................................... + // str q27, [x0, #560] // ..........................................................................................*..................................................................................................... 
+ // add v18.4S, v14.4S, v19.4S // ......................................................................................................*......................................................................................... + // sub v19.4S, v14.4S, v19.4S // ...................................................................................................*............................................................................................ + // add v22.4S, v13.4S, v17.4S // .....................................*.......................................................................................................................................................... + // sub v20.4S, v11.4S, v20.4S // .............................................................................................................*.................................................................................. + // str q18, [x0, #624] // ...............................................................................................................*................................................................................ + // sub v18.4S, v16.4S, v26.4S // .........................................................................*...................................................................................................................... + // add v11.4S, v16.4S, v26.4S // ..........................................................................*..................................................................................................................... + // str q22, [x0, #368] // .......................................*........................................................................................................................................................ 
+ // str q19, [x0, #688] // ..............................................................................................................*................................................................................. + // str q20, [x0, #304] // .....................................................................................................................*.......................................................................... + // str q18, [x0, #816] // .............................................................................*.................................................................................................................. + // str q11, [x0, #752] // ...............................................................................*................................................................................................................ + // ldr q20, [x0, #960] // ..............................................................................................*................................................................................................. + // ldr q18, [x0, #448] // .....................................................................................................*.......................................................................................... + // ldr q19, [x0, #832] // ................................................................................................................*............................................................................... + // ldr q22, [x0, #576] // ........................................................................................................*....................................................................................... + // ldr q11, [x0, #640] // ...........................................................................................*.................................................................................................... 
+ // ldr q30, [x0, #0] // .......................................................................................................................*........................................................................ + // ldr q13, [x0, #320] // ...........................................................................................................*.................................................................................... + // ldr q27, [x0, #704] // ......................................*......................................................................................................................................................... + // sqrdmulh v17.4S, v20.4S, v0.S[1] // .............................................................................................................................*.................................................................. + // ldr q12, [x0, #768] // .*.............................................................................................................................................................................................. + // ldr q14, [x0, #64] // ..........................................................................................................*..................................................................................... + // ldr q26, [x0, #512] // ....................................................................................................*........................................................................................... + // ldr q8, [x0, #896] // .....................................................................................*.......................................................................................................... 
+ // mul v20.4S, v20.4S, v0.S[0] // ...............................................................................................................................*................................................................ + // ldr q31, [x0, #128] // ........................................................................................*....................................................................................................... + // ldr q25, [x0, #192] // .............................................................................................*.................................................................................................. + // ldr q16, [x0, #384] // ..................................................................................*............................................................................................................. + // ldr q10, [x0, #256] // .......................................................................................*........................................................................................................ + // sqrdmulh v23.4S, v19.4S, v0.S[1] // .............................................................................................................................................*.................................................. + // mla v20.4S, v17.4S, v29.4S // ........................................................................................................................................*....................................................... + // mul v19.4S, v19.4S, v0.S[0] // ...............................................................................................................................................*................................................ 
+ // mla v19.4S, v23.4S, v29.4S // ................................................................................................................................................*............................................... + // sub v17.4S, v18.4S, v20.4S // ...........................................................................................................................................*.................................................... + // add v20.4S, v18.4S, v20.4S // ............................................................................................................................................*................................................... + // sqrdmulh v18.4S, v11.4S, v0.S[1] // ....................................................................................................................*........................................................................... + // mul v11.4S, v11.4S, v0.S[0] // ........................................................................................................................*....................................................................... + // add v23.4S, v13.4S, v19.4S // .....................................................................................................................................................*.......................................... + // sub v19.4S, v13.4S, v19.4S // ......................................................................................................................................................*......................................... + // sqrdmulh v13.4S, v26.4S, v0.S[1] // ..................................................................................................................................*............................................................. 
+ // mul v26.4S, v26.4S, v0.S[0] // .....................................................................................................................................*.......................................................... + // sqrdmulh v28.4S, v22.4S, v0.S[1] // .........................................................................................................................................*...................................................... + // mul v22.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*..................................................... + // sqrdmulh v9.4S, v27.4S, v0.S[1] // .............................................................*.................................................................................................................................. + // mul v27.4S, v27.4S, v0.S[0] // ...........................................................*.................................................................................................................................... + // sqrdmulh v24.4S, v12.4S, v0.S[1] // ........*....................................................................................................................................................................................... + // mul v12.4S, v12.4S, v0.S[0] // ...........*.................................................................................................................................................................................... + // sqrdmulh v21.4S, v8.4S, v0.S[1] // .......................................................................................................*........................................................................................ 
+ // mul v8.4S, v8.4S, v0.S[0] // ............................................................................................................*................................................................................... + // mla v26.4S, v13.4S, v29.4S // .........................................................................................................................................................*...................................... + // mla v22.4S, v28.4S, v29.4S // ..............................................................................................................................................*................................................. + // mla v11.4S, v18.4S, v29.4S // .................................................................................................................................*.............................................................. + // sub v18.4S, v30.4S, v26.4S // ............................................................................................................................................................*................................... + // add v30.4S, v30.4S, v26.4S // .............................................................................................................................................................*.................................. + // mla v8.4S, v21.4S, v29.4S // ................................................................................................................................*............................................................... + // sub v13.4S, v14.4S, v22.4S // .................................................................................................................................................*.............................................. 
+ // mla v27.4S, v9.4S, v29.4S // ................................................................................................*............................................................................................... + // add v22.4S, v14.4S, v22.4S // ..................................................................................................................................................*............................................. + // sub v14.4S, v31.4S, v11.4S // ......................................................................................................................................*......................................................... + // add v11.4S, v31.4S, v11.4S // .......................................................................................................................................*........................................................ + // mla v12.4S, v24.4S, v29.4S // ..............*................................................................................................................................................................................. + // add v26.4S, v16.4S, v8.4S // ...................................................................................................................................*............................................................ + // sub v8.4S, v16.4S, v8.4S // ....................................................................................................................................*........................................................... + // sqrdmulh v31.4S, v20.4S, v0.S[3] // ..........................................................................................................................................................*..................................... 
+ // sub v16.4S, v25.4S, v27.4S // ..........................................................................................................................*..................................................................... + // add v27.4S, v25.4S, v27.4S // ...........................................................................................................................*.................................................................... + // mul v20.4S, v20.4S, v0.S[2] // ..............................................................................................................................................................*................................. + // sub v25.4S, v10.4S, v12.4S // .................................................................................................................*.............................................................................. + // add v12.4S, v10.4S, v12.4S // ..................................................................................................................*............................................................................. + // mla v20.4S, v31.4S, v29.4S // ...............................................................................................................................................................*................................ + // mul v10.4S, v23.4S, v0.S[2] // ....................................................................................................................................................................................*........... + // sqrdmulh v31.4S, v23.4S, v0.S[3] // ..................................................................................................................................................................................*............. 
+ // add v23.4S, v27.4S, v20.4S // ..................................................................................................................................................................*............................. + // sub v20.4S, v27.4S, v20.4S // ...................................................................................................................................................................*............................ + // sqrdmulh v27.4S, v17.4S, v1.S[1] // ..........................................................................................................................................................................*..................... + // mul v17.4S, v17.4S, v1.S[0] // .............................................................................................................................................................................*.................. + // sqrdmulh v28.4S, v19.4S, v1.S[1] // ................................................................................................................................................................................*............... + // mul v19.4S, v19.4S, v1.S[0] // .................................................................................................................................................................................*.............. + // sqrdmulh v9.4S, v25.4S, v1.S[1] // ....................................................................................................................................................*........................................... + // mul v25.4S, v25.4S, v1.S[0] // ...................................................................................................................................................*............................................ 
+ // sqrdmulh v24.4S, v12.4S, v0.S[3] // .......................................................................................................................................................*........................................ + // mul v12.4S, v12.4S, v0.S[2] // ........................................................................................................................................................*....................................... + // sqrdmulh v21.4S, v8.4S, v1.S[1] // ...........................................................................................................................................................*.................................... + // mul v8.4S, v8.4S, v1.S[0] // ................................................................................................................................................................*............................... + // sqrdmulh v15.4S, v26.4S, v0.S[3] // .................................................................................................................................................................*.............................. + // mul v26.4S, v26.4S, v0.S[2] // ....................................................................................................................................................................*........................... + // mla v25.4S, v9.4S, v29.4S // ........................................................................................................................................................................................*....... + // mla v12.4S, v24.4S, v29.4S // .......................................................................................................................................................................*........................ 
+ // mla v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*............ + // add v28.4S, v18.4S, v25.4S // ..............................................................................................................................................................................................*. + // sub v18.4S, v18.4S, v25.4S // .............................................................................................................................................................................................*.. + // mla v8.4S, v21.4S, v29.4S // .....................................................................................................................................................................*.......................... + // sub v25.4S, v30.4S, v12.4S // ...............................................................................................................................................................................*................ + // add v30.4S, v30.4S, v12.4S // ..............................................................................................................................................................................*................. + // mla v26.4S, v15.4S, v29.4S // ......................................................................................................................................................................*......................... + // sub v12.4S, v13.4S, v19.4S // .......................................................................................................................................................................................*........ 
+ // add v19.4S, v13.4S, v19.4S // ......................................................................................................................................................................................*......... + // mla v10.4S, v31.4S, v29.4S // .....................................................................................................................................................................................*.......... + // sub v13.4S, v14.4S, v8.4S // ........................................................................................................................................................................*....................... + // add v14.4S, v14.4S, v8.4S // .........................................................................................................................................................................*...................... + // mla v17.4S, v27.4S, v29.4S // .........................................................................................................................................................................................*...... + // add v27.4S, v11.4S, v26.4S // ............................................................................................................................................................................*................... + // sub v11.4S, v11.4S, v26.4S // ...........................................................................................................................................................................*.................... 
+ // sqrdmulh v31.4S, v23.4S, v1.S[3] // ...............................................................................................................................................................................................* + // sub v9.4S, v22.4S, v10.4S // ..........................................................................................................................................................................................*..... + // mul v23.4S, v23.4S, v1.S[2] // ............................................................................................................................................................................................*... + // add v22.4S, v22.4S, v10.4S // ...........................................................................................................................................................................................*.... + + sub count, count, #1 cbnz count, layer1234_start - sqrdmulh v12.4S, v22.4S, v1.S[1] - sqrdmulh v11.4S, v11.4S, v1.S[1] - mul v22.4S, v22.4S, v1.S[0] - mla v22.4S, v12.4S, v29.4S - mla v15.4S, v13.4S, v29.4S - mla v27.4S, v11.4S, v29.4S - sub v13.4S, v25.4S, v22.4S - mul v11.4S, v28.4S, v2.S[0] - sqrdmulh v28.4S, v13.4S, v3.S[1] - sub v12.4S, v16.4S, v27.4S - mla v11.4S, v24.4S, v29.4S - add v27.4S, v16.4S, v27.4S - mul v16.4S, v13.4S, v3.S[0] - mla v16.4S, v28.4S, v29.4S - sub v24.4S, v20.4S, v11.4S - add v28.4S, v20.4S, v11.4S - sqrdmulh v18.4S, v18.4S, v1.S[3] - add v20.4S, v19.4S, v15.4S - sub v11.4S, v19.4S, v15.4S - sqrdmulh v15.4S, v10.4S, v3.S[1] - add v13.4S, v12.4S, v16.4S - add v10.4S, v25.4S, v22.4S - mul v25.4S, v24.4S, v5.S[0] - mla v9.4S, v18.4S, v29.4S - sub v18.4S, v12.4S, v16.4S - sqrdmulh v22.4S, v13.4S, v6.S[3] - mla v26.4S, v15.4S, v29.4S - add v15.4S, v14.4S, v9.4S - mul v13.4S, v13.4S, v6.S[2] - mla v13.4S, v22.4S, v29.4S - add v22.4S, v17.4S, v26.4S - sub v26.4S, v17.4S, v26.4S - sqrdmulh v19.4S, v24.4S, v5.S[1] - mul v16.4S, v23.4S, 
v2.S[0] - add v12.4S, v22.4S, v13.4S - mla v16.4S, v31.4S, v29.4S - sub v17.4S, v22.4S, v13.4S - sub v23.4S, v14.4S, v9.4S - mla v25.4S, v19.4S, v29.4S - str_vo v12, x0, 768 - add v12.4S, v30.4S, v21.4S - sqrdmulh v24.4S, v15.4S, v3.S[3] - sub v31.4S, v8.4S, v16.4S - mul v15.4S, v15.4S, v3.S[2] - add v14.4S, v8.4S, v16.4S - str_vo v17, x0, 832 - sqrdmulh v22.4S, v28.4S, v4.S[3] - sub v17.4S, v31.4S, v25.4S - add v8.4S, v31.4S, v25.4S - mul v25.4S, v28.4S, v4.S[2] - str_vo v17, x0, 448 - mul v28.4S, v23.4S, v4.S[0] - str_vo v8, x0, 384 - sqrdmulh v17.4S, v10.4S, v2.S[3] - sub v16.4S, v30.4S, v21.4S - mul v10.4S, v10.4S, v2.S[2] - sqrdmulh v21.4S, v18.4S, v7.S[1] - mla v10.4S, v17.4S, v29.4S - mla v25.4S, v22.4S, v29.4S - mul v9.4S, v18.4S, v7.S[0] - sub v13.4S, v27.4S, v10.4S - mla v9.4S, v21.4S, v29.4S - sub v31.4S, v14.4S, v25.4S - mul v18.4S, v13.4S, v6.S[0] - str_vo v31, x0, 320 - sqrdmulh v22.4S, v23.4S, v4.S[1] - sub v8.4S, v26.4S, v9.4S - add v31.4S, v26.4S, v9.4S - mla v15.4S, v24.4S, v29.4S - add v26.4S, v27.4S, v10.4S - str_vo v8, x0, 960 - sqrdmulh v10.4S, v13.4S, v6.S[1] - mla v28.4S, v22.4S, v29.4S - sqrdmulh v21.4S, v26.4S, v5.S[3] - sub v24.4S, v12.4S, v15.4S - mla v18.4S, v10.4S, v29.4S - sub v19.4S, v16.4S, v28.4S - add v30.4S, v16.4S, v28.4S - str_vo v24, x0, 64 - mul v22.4S, v26.4S, v5.S[2] - str_vo v19, x0, 192 - mla v22.4S, v21.4S, v29.4S - str_vo v30, x0, 128 - sub v9.4S, v11.4S, v18.4S - add v21.4S, v11.4S, v18.4S - add v18.4S, v14.4S, v25.4S - str_vo v31, x0, 896 - str_vo v9, x0, 704 - add v14.4S, v12.4S, v15.4S - sub v11.4S, v20.4S, v22.4S - str_vo v21, x0, 640 - add v27.4S, v20.4S, v22.4S - str_vo v18, x0, 256 - str_vi v14, x0, 16 - str_vo v11, x0, 560 - str_vo v27, x0, 496 + mla v23.4S, v31.4S, v29.4S // .......................................................................................................*........................................................................................ 
+ add v26.4S, v16.4S, v17.4S // ...............................................................................................*................................................................................................ + sub v17.4S, v16.4S, v17.4S // ..............................................................................................*................................................................................................. + sqrdmulh v8.4S, v14.4S, v2.S[3] // ....................................................................................................................*........................................................................... + mul v14.4S, v14.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sub v31.4S, v22.4S, v23.4S // ........................................................................................................*....................................................................................... + add v22.4S, v22.4S, v23.4S // .........................................................................................................*...................................................................................... + sqrdmulh v16.4S, v13.4S, v3.S[1] // ..............................................................................................................................*................................................................. + mul v13.4S, v13.4S, v3.S[0] // ...............................................................................................................................*................................................................ 
+ sqrdmulh v10.4S, v20.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mul v20.4S, v20.4S, v2.S[0] // ................................................................................................................*............................................................................... + sqrdmulh v23.4S, v11.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mul v11.4S, v11.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + sqrdmulh v24.4S, v27.4S, v1.S[3] // ................................................................................................*............................................................................................... + mul v27.4S, v27.4S, v1.S[2] // .................................................................................................*.............................................................................................. + mla v14.4S, v8.4S, v29.4S // ......................................................................................................................*......................................................................... + mla v13.4S, v16.4S, v29.4S // ................................................................................................................................*............................................................... 
+ mla v11.4S, v23.4S, v29.4S // ............................................................................................................*................................................................................... + add v8.4S, v28.4S, v14.4S // ........................................................................................................................*....................................................................... + sub v14.4S, v28.4S, v14.4S // .......................................................................................................................*........................................................................ + mla v27.4S, v24.4S, v29.4S // ..................................................................................................*............................................................................................. + add v16.4S, v18.4S, v13.4S // ..................................................................................................................................*............................................................. + sub v18.4S, v18.4S, v13.4S // .................................................................................................................................*.............................................................. + mla v20.4S, v10.4S, v29.4S // .................................................................................................................*.............................................................................. + sub v13.4S, v25.4S, v11.4S // .............................................................................................................*.................................................................................. 
+ add v11.4S, v25.4S, v11.4S // ..............................................................................................................*................................................................................. + sqrdmulh v25.4S, v17.4S, v3.S[1] // ...................................................................................................................................*............................................................ + add v10.4S, v30.4S, v27.4S // ....................................................................................................*........................................................................................... + sub v30.4S, v30.4S, v27.4S // ...................................................................................................*............................................................................................ + mul v27.4S, v17.4S, v3.S[0] // ....................................................................................................................................*........................................................... + sub v17.4S, v9.4S, v20.4S // ..................................................................................................................*............................................................................. + add v20.4S, v9.4S, v20.4S // ...................................................................................................................*............................................................................ + sqrdmulh v23.4S, v26.4S, v2.S[3] // .........................................................................................................................*...................................................................... 
+ mla v27.4S, v25.4S, v29.4S // .....................................................................................................................................*.......................................................... + mul v26.4S, v26.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + mla v26.4S, v23.4S, v29.4S // ...........................................................................................................................*.................................................................... + add v25.4S, v12.4S, v27.4S // .......................................................................................................................................*........................................................ + sub v27.4S, v12.4S, v27.4S // ......................................................................................................................................*......................................................... + sqrdmulh v12.4S, v31.4S, v4.S[1] // .............................................................................................................................................*.................................................. + mul v31.4S, v31.4S, v4.S[0] // ..............................................................................................................................................*................................................. + add v23.4S, v19.4S, v26.4S // .............................................................................................................................*.................................................................. 
+ sub v19.4S, v19.4S, v26.4S // ............................................................................................................................*................................................................... + sqrdmulh v26.4S, v22.4S, v3.S[3] // ........................................................................................................................................*....................................................... + mul v22.4S, v22.4S, v3.S[2] // .........................................................................................................................................*...................................................... + sqrdmulh v28.4S, v27.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + sqrdmulh v9.4S, v23.4S, v5.S[3] // ............................................................................................................................................................*................................... + mul v23.4S, v23.4S, v5.S[2] // .............................................................................................................................................................*.................................. + sqrdmulh v24.4S, v19.4S, v6.S[1] // .................................................................................................................................................................*.............................. + mul v19.4S, v19.4S, v6.S[0] // ..................................................................................................................................................................*............................. 
+ mul v27.4S, v27.4S, v7.S[0] // ............................................................................................................................................................................*................... + sqrdmulh v21.4S, v17.4S, v5.S[1] // .......................................................................................................................................................*........................................ + mul v17.4S, v17.4S, v5.S[0] // ........................................................................................................................................................*....................................... + sqrdmulh v15.4S, v20.4S, v4.S[3] // ..................................................................................................................................................*............................................. + mla v22.4S, v26.4S, v29.4S // ..........................................................................................................................................*..................................................... + mul v20.4S, v20.4S, v4.S[2] // ...................................................................................................................................................*............................................ + mla v31.4S, v12.4S, v29.4S // ...............................................................................................................................................*................................................ + add v12.4S, v10.4S, v22.4S // ............................................................................................................................................*................................................... 
+ sub v22.4S, v10.4S, v22.4S // ...........................................................................................................................................*.................................................... + mla v17.4S, v21.4S, v29.4S // .........................................................................................................................................................*...................................... + mla v20.4S, v15.4S, v29.4S // ....................................................................................................................................................*........................................... + str q12, [x0], #(16) // ................................................................................................................................................................................*............... + str q22, [x0, #48] // .................................................................................................................................................................................*.............. + add v22.4S, v30.4S, v31.4S // .................................................................................................................................................*.............................................. + sub v30.4S, v30.4S, v31.4S // ................................................................................................................................................*............................................... + mla v27.4S, v28.4S, v29.4S // .............................................................................................................................................................................*.................. 
+ sub v12.4S, v13.4S, v17.4S // ..........................................................................................................................................................*..................................... + mla v23.4S, v9.4S, v29.4S // ..............................................................................................................................................................*................................. + str q22, [x0, #112] // ..................................................................................................................................................................................*............. + str q30, [x0, #176] // ...................................................................................................................................................................................*............ + add v22.4S, v11.4S, v20.4S // ......................................................................................................................................................*......................................... + str q12, [x0, #432] // .......................................................................................................................................................................................*........ + sqrdmulh v30.4S, v25.4S, v6.S[3] // ......................................................................................................................................................................*......................... + add v12.4S, v18.4S, v27.4S // ...............................................................................................................................................................................*................ 
+ sub v18.4S, v18.4S, v27.4S // ..............................................................................................................................................................................*................. + mla v19.4S, v24.4S, v29.4S // ...................................................................................................................................................................*............................ + str q22, [x0, #240] // ....................................................................................................................................................................................*........... + add v22.4S, v8.4S, v23.4S // ................................................................................................................................................................*............................... + sub v27.4S, v8.4S, v23.4S // ...............................................................................................................................................................*................................ + mul v26.4S, v25.4S, v6.S[2] // .......................................................................................................................................................................*........................ + str q12, [x0, #880] // ..............................................................................................................................................................................................*. 
+ str q18, [x0, #944] // ...............................................................................................................................................................................................* + mla v26.4S, v30.4S, v29.4S // ........................................................................................................................................................................*....................... + str q22, [x0, #496] // ........................................................................................................................................................................................*....... + str q27, [x0, #560] // .........................................................................................................................................................................................*...... + add v18.4S, v14.4S, v19.4S // .....................................................................................................................................................................*.......................... + sub v19.4S, v14.4S, v19.4S // ....................................................................................................................................................................*........................... + add v22.4S, v13.4S, v17.4S // ...........................................................................................................................................................*.................................... + sub v20.4S, v11.4S, v20.4S // .....................................................................................................................................................*.......................................... + str q18, [x0, #624] // ..........................................................................................................................................................................................*..... 
+ sub v18.4S, v16.4S, v26.4S // .........................................................................................................................................................................*...................... + add v11.4S, v16.4S, v26.4S // ..........................................................................................................................................................................*..................... + str q22, [x0, #368] // ......................................................................................................................................................................................*......... + str q19, [x0, #688] // ...........................................................................................................................................................................................*.... + str q20, [x0, #304] // .....................................................................................................................................................................................*.......... + str q18, [x0, #816] // .............................................................................................................................................................................................*.. + str q11, [x0, #752] // ............................................................................................................................................................................................*... restore inp, STACK0 mov count, #16 @@ -717,529 +910,573 @@ layer1234_start: qform_root3_tw .req q7 .p2align 2 - ldr_vi v26, x3, 16 // *....................................... - ldr_vo v28, x1, 48 // .*...................................... - // gap // ........................................ - ldr_vo v3, x1, 32 // ...*.................................... - ldr_vi v24, x4, 96 // ..*..................................... 
- // gap // ........................................ - ldr_vo v10, x1, 16 // .......*................................ - // gap // ........................................ - // gap // ........................................ - ldr_vo v16, x4, -16 // ..........................*............. - // gap // ........................................ - // gap // ........................................ - sqrdmulh v9.4S, v28.4S, v26.S[1] // ....*................................... - // gap // ........................................ - ldr_vo v21, x4, -64 // .......................................* - ldr_vo v6, x1, 0 // ..................*..................... - // gap // ........................................ - // gap // ........................................ - mul v18.4S, v28.4S, v26.S[0] // ......*................................. - ldr_vo v11, x4, -80 // ..............................*......... - // gap // ........................................ - ldr_vi v2, x3, 8 // .........*.............................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mla v18.4S, v9.4S, v29.4S // ........*............................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqrdmulh v25.4S, v3.4S, v26.S[1] // .....*.................................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- mul v31.4S, v3.4S, v26.S[0] // ..........*............................. - // gap // ........................................ - // gap // ........................................ - add v9.4S, v10.4S, v18.4S // ...........*............................ - // gap // ........................................ - // gap // ........................................ - sub v15.4S, v10.4S, v18.4S // ..............*......................... - // gap // ........................................ - // gap // ........................................ - mla v31.4S, v25.4S, v29.4S // ............*........................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v12.4S, v9.4S, v26.S[2] // .............*.......................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqrdmulh v3.4S, v9.4S, v26.S[3] // ...............*........................ - // gap // ........................................ - // gap // ........................................ - add v25.4S, v6.4S, v31.4S // .....................*.................. - // gap // ........................................ - // gap // ........................................ - sqrdmulh v9.4S, v15.4S, v2.S[1] // ................*....................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v15.4S, v15.4S, v2.S[0] // .................*...................... 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mla v12.4S, v3.4S, v29.4S // ...................*.................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mla v15.4S, v9.4S, v29.4S // ....................*................... - sub v9.4S, v6.4S, v31.4S // ......................*................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v3.4S, v25.4S, v12.4S // ........................*............... - sub v26.4S, v25.4S, v12.4S // .......................*................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v25.4S, v9.4S, v15.4S // .........................*.............. - sub v9.4S, v9.4S, v15.4S // ...........................*............ - // gap // ........................................ - trn2 v31.4S, v3.4S, v26.4S // ..................................*..... - trn1 v30.4S, v3.4S, v26.4S // ............................*........... - ldr_vo v3, x4, -32 // ................................*....... - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn1 v7.4S, v25.4S, v9.4S // .............................*.......... - trn2 v15.4S, v25.4S, v9.4S // .................................*...... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - trn2 v9.2D, v30.2D, v7.2D // ...............................*........ - trn2 v1.2D, v31.2D, v15.2D // ....................................*... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v17.4S, v9.4S, v24.4S // .....................................*.. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v0.4S, v1.4S, v24.4S // ......................................*. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqrdmulh v5.4S, v9.4S, v11.4S // ...................................*.... - // gap // ........................................ - // gap // ........................................ 
- - // original source code - // ldr_vi v8, x3, 16 // *....................................... || *.............................................. - // ldr_vo v24, x1, 48 // .*...................................... || *.............................................. - // ldr_vi v9, x4, 96 // ...*.................................... || .*............................................. - // ldr_vo v6, x1, 32 // ..*..................................... || .*............................................. - // sqrdmulh v2.4S, v24.4S, v8.S[1] // ......*................................. || ....*.......................................... - // sqrdmulh v26.4S, v6.4S, v8.S[1] // .............*.......................... || ...........*................................... - // mul v19.4S, v24.4S, v8.S[0] // .........*.............................. || ......*........................................ - // ldr_vo v12, x1, 16 // ....*................................... || ..*............................................ - // mla v19.4S, v2.4S, v29.4S // ............*........................... || .........*..................................... - // ldr_vi v13, x3, 8 // ...........*............................ || .......*....................................... - // mul v27.4S, v6.4S, v8.S[0] // ..............*......................... || .............*................................. - // add v18.4S, v12.4S, v19.4S // ...............*........................ || ..............*................................ - // mla v27.4S, v26.4S, v29.4S // .................*...................... || ................*.............................. - // mul v1.4S, v18.4S, v8.S[2] // ..................*..................... || ..................*............................ - // sub v25.4S, v12.4S, v19.4S // ................*....................... || ...............*............................... - // sqrdmulh v4.4S, v18.4S, v8.S[3] // ...................*.................... 
|| ....................*.......................... - // sqrdmulh v28.4S, v25.4S, v13.S[1] // .....................*.................. || ......................*........................ - // mul v14.4S, v25.4S, v13.S[0] // ......................*................. || ........................*...................... - // ldr_vo v24, x1, 0 // ........*............................... || .....*......................................... - // mla v1.4S, v4.4S, v29.4S // .......................*................ || ..........................*.................... - // mla v14.4S, v28.4S, v29.4S // ........................*............... || ............................*.................. - // add v30.4S, v24.4S, v27.4S // ....................*................... || .....................*......................... - // sub v13.4S, v24.4S, v27.4S // .........................*.............. || ............................*.................. - // sub v25.4S, v30.4S, v1.4S // ...........................*............ || ...............................*............... - // add v0.4S, v30.4S, v1.4S // ..........................*............. || ...............................*............... - // add v10.4S, v13.4S, v14.4S // ............................*........... || .................................*............. - // ldr_vo v16, x4, -16 // .....*.................................. || ...*........................................... - // sub v5.4S, v13.4S, v14.4S // .............................*.......... || .................................*............. - // trn1 v30.4S, v0.4S, v25.4S // ...............................*........ || ..................................*............ - // trn1 v7.4S, v10.4S, v5.4S // .................................*...... || ....................................*.......... - // ldr_vo v11, x4, -80 // ..........*............................. || ......*........................................ 
- // trn2 v23.2D, v30.2D, v7.2D // ...................................*.... || .......................................*....... - // ldr_vo v3, x4, -32 // ................................*....... || ..................................*............ - // trn2 v15.4S, v10.4S, v5.4S // ..................................*..... || .....................................*......... - // trn2 v31.4S, v0.4S, v25.4S // ..............................*......... || ..................................*............ - // sqrdmulh v5.4S, v23.4S, v11.4S // .......................................* || ..............................................* - // trn2 v1.2D, v31.2D, v15.2D // ....................................*... || ........................................*...... - // mul v17.4S, v23.4S, v9.4S // .....................................*.. || ..........................................*.... - // mul v0.4S, v1.4S, v9.4S // ......................................*. || ............................................*.. - // ldr_vo v21, x4, -64 // .......*................................ || ....*.......................................... - + // Instructions: 21 + // Expected cycles: 29 + // Expected IPC: 0.72 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q27, [x1, #48] // ..*........................... + ldr q8, [x3], #16 // .*............................ + // gap // .............................. + ldr q3, [x1, #32] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x1, #16] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ sqrdmulh v26.4S, v27.4S, v8.S[1] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v22.4S, v27.4S, v8.S[0] // .........*.................... + // gap // .............................. + // gap // .............................. + ldr q27, [x3], #8 // .......*...................... + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v3.4S, v8.S[1] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v22.4S, v26.4S, v29.4S // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v18.4S, v3.4S, v8.S[0] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v1.4S, v15.4S, v22.4S // ...........*.................. + // gap // .............................. + add v15.4S, v15.4S, v22.4S // ............*................. + mla v18.4S, v20.4S, v29.4S // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mul v30.4S, v15.4S, v8.S[2] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v15.4S, v15.4S, v8.S[3] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v16.4S, v1.4S, v27.S[1] // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v17.4S, v1.4S, v27.S[0] // ..................*........... + ldr q1, [x1, #0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v30.4S, v15.4S, v29.4S // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v17.4S, v16.4S, v29.4S // ....................*......... + add v6.4S, v1.4S, v18.4S // ................*............. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q18, [x1, #32] // ..*............................ + // ldr q19, [x3], #16 // .*............................. + // ldr q12, [x1, #48] // *.............................. + // sqrdmulh v14.4S, v18.4S, v19.S[1] // .......*....................... + // ldr q8, [x1, #16] // ...*........................... 
+ // ldr q1, [x1, #0] // .................*............. + // mul v18.4S, v18.4S, v19.S[0] // .........*..................... + // ldr q25, [x3], #8 // ......*........................ + // sqrdmulh v30.4S, v12.4S, v19.S[1] // ....*.......................... + // mul v12.4S, v12.4S, v19.S[0] // .....*......................... + // mla v12.4S, v30.4S, v29.4S // ........*...................... + // sub v13.4S, v8.4S, v12.4S // ..........*.................... + // add v6.4S, v8.4S, v12.4S // ...........*................... + // mla v18.4S, v14.4S, v29.4S // ............*.................. + // sqrdmulh v20.4S, v6.4S, v19.S[3] // ..............*................ + // mul v30.4S, v6.4S, v19.S[2] // .............*................. + // add v6.4S, v1.4S, v18.4S // ....................*.......... + // sqrdmulh v19.4S, v13.4S, v25.S[1] // ...............*............... + // mul v17.4S, v13.4S, v25.S[0] // ................*.............. + // mla v30.4S, v20.4S, v29.4S // ..................*............ + // mla v17.4S, v19.4S, v29.4S // ...................*........... + sub count, count, #1 -.p2align 2 layer5678_start: - mla v17.4S, v5.4S, v29.4S // ..........................................*.................. - ldr_vi v8, x3, 16 // ....e........................................................ - ldr_vo v24, x1, 112 // ...e......................................................... - ldr_vi v9, x4, 96 // ..................................e.......................... - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v10.4S, v1.4S, v11.4S // ..............................................*.............. - // gap // ............................................................. - ldr_vo v6, x1, 96 // ..e.......................................................... - ldr_vo v23, x4, -144 // .....................................*....................... 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v2.4S, v24.4S, v8.S[1] // ............e................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v26.4S, v6.4S, v8.S[1] // .......e..................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v19.4S, v24.4S, v8.S[0] // ...........e................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - ldr_vo v12, x1, 80 // .e........................................................... - mla v19.4S, v2.4S, v29.4S // .............e............................................... - // gap // ............................................................. - // gap // ............................................................. - ldr_vi v13, x3, 8 // .....e....................................................... - // gap // ............................................................. 
- // gap // ............................................................. - mul v27.4S, v6.4S, v8.S[0] // ......e...................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v0.4S, v10.4S, v29.4S // ...............................................*............. - // gap // ............................................................. - // gap // ............................................................. - add v18.4S, v12.4S, v19.4S // ...............e............................................. - // gap // ............................................................. - // gap // ............................................................. - mla v27.4S, v26.4S, v29.4S // ........e.................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v1.4S, v18.4S, v8.S[2] // ................e............................................ - // gap // ............................................................. - // gap // ............................................................. - sub v25.4S, v12.4S, v19.4S // ..............e.............................................. - // gap // ............................................................. - // gap // ............................................................. 
- sqrdmulh v4.4S, v18.4S, v8.S[3] // .................e........................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v28.4S, v25.4S, v13.S[1] // ......................e...................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v14.4S, v25.4S, v13.S[0] // .....................e....................................... - trn1 v25.2D, v31.2D, v15.2D // .................................*........................... - ldr_vo v24, x1, 64 // e............................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn1 v31.2D, v30.2D, v7.2D // ................................*............................ - // gap // ............................................................. - mla v1.4S, v4.4S, v29.4S // ..................e.......................................... - // gap // ............................................................. - add v7.4S, v25.4S, v0.4S // .................................................*........... - // gap // ............................................................. 
- mla v14.4S, v28.4S, v29.4S // .......................e..................................... - add v30.4S, v24.4S, v27.4S // ..........e.................................................. - // gap // ............................................................. - sub v11.4S, v25.4S, v0.4S // ................................................*............ - // gap // ............................................................. - // gap // ............................................................. - sub v13.4S, v24.4S, v27.4S // .........e................................................... - // gap // ............................................................. - sqrdmulh v15.4S, v7.4S, v23.4S // ...................................................*......... - sub v25.4S, v30.4S, v1.4S // ...................e......................................... - // gap // ............................................................. - // gap // ............................................................. - add v0.4S, v30.4S, v1.4S // ....................e........................................ - sqrdmulh v19.4S, v11.4S, v16.4S // ........................................................*.... - // gap // ............................................................. - add v10.4S, v13.4S, v14.4S // .........................e................................... - // gap // ............................................................. - // gap // ............................................................. - ldr_vo v16, x4, -16 // .......................................e..................... - mul v2.4S, v7.4S, v21.4S // ..................................................*.......... - sub v5.4S, v13.4S, v14.4S // ........................e.................................... - trn1 v30.4S, v0.4S, v25.4S // ..........................e.................................. - // gap // ............................................................. 
- // gap // ............................................................. - sub v6.4S, v31.4S, v17.4S // ...........................................*................. - mul v20.4S, v11.4S, v3.4S // .......................................................*..... - // gap // ............................................................. - trn1 v7.4S, v10.4S, v5.4S // ............................e................................ - // gap // ............................................................. - // gap // ............................................................. - mla v20.4S, v19.4S, v29.4S // .........................................................*... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - ldr_vo v11, x4, -80 // ...................................e......................... - // gap // ............................................................. - mla v2.4S, v15.4S, v29.4S // ....................................................*........ - trn2 v23.2D, v30.2D, v7.2D // ..............................e.............................. - // gap // ............................................................. - // gap // ............................................................. - ldr_vo v3, x4, -32 // ......................................e...................... - trn2 v15.4S, v10.4S, v5.4S // .............................e............................... - add v17.4S, v31.4S, v17.4S // ............................................*................ - trn2 v31.4S, v0.4S, v25.4S // ...........................e................................. - // gap // ............................................................. - sub v22.4S, v6.4S, v20.4S // ..........................................................*.. 
- sqrdmulh v5.4S, v23.4S, v11.4S // .........................................e................... - // gap // ............................................................. - add v21.4S, v6.4S, v20.4S // ...........................................................*. - // gap // ............................................................. - // gap // ............................................................. - sub v20.4S, v17.4S, v2.4S // .....................................................*....... - trn2 v1.2D, v31.2D, v15.2D // ...............................e............................. - // gap // ............................................................. - add v19.4S, v17.4S, v2.4S // ......................................................*...... - mul v17.4S, v23.4S, v9.4S // ........................................e.................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v0.4S, v1.4S, v9.4S // .............................................e............... - // gap // ............................................................. - // gap // ............................................................. - st4 {v19.4S,v20.4S,v21.4S,v22.4S}, [x1], #64 // ............................................................* - ldr_vo v21, x4, -64 // ....................................e........................ - // gap // ............................................................. - - // original source code - // ldr_vo v8, x1, 0 // ......................e................................................................................................. || ........................e........................................................................... 
- // ldr_vo v9, x1, 16 // .........e.............................................................................................................. || .........e.......................................................................................... - // ldr_vo v10, x1, 32 // ....e................................................................................................................... || ..e................................................................................................. - // ldr_vo v11, x1, 48 // .e...................................................................................................................... || e................................................................................................... - // ldr_vi v0, x3, 16 // e....................................................................................................................... || e................................................................................................... - // ldr_vi v1, x3, 8 // ...........e............................................................................................................ || ...........e........................................................................................ - // mul v24.4S, v10.4S, v0.S[0] // ............e........................................................................................................... || ............e....................................................................................... - // sqrdmulh v10.4S, v10.4S, v0.S[1] // .......e................................................................................................................ || ......e............................................................................................. - // mla v24.4S, v10.4S, v29.4S // ...............e........................................................................................................ 
|| ................e................................................................................... - // sub v10.4S, v8.4S, v24.4S // .............................e.......................................................................................... || ..............................e..................................................................... - // add v8.4S, v8.4S, v24.4S // ...........................e............................................................................................ || ............................e....................................................................... - // mul v24.4S, v11.4S, v0.S[0] // ........e............................................................................................................... || ........e........................................................................................... - // sqrdmulh v11.4S, v11.4S, v0.S[1] // ......e................................................................................................................. || ....e............................................................................................... - // mla v24.4S, v11.4S, v29.4S // ..........e............................................................................................................. || ..........e......................................................................................... - // sub v11.4S, v9.4S, v24.4S // .................e...................................................................................................... || ...................e................................................................................ - // add v9.4S, v9.4S, v24.4S // ..............e......................................................................................................... || ...............e.................................................................................... 
- // mul v24.4S, v9.4S, v0.S[2] // ................e....................................................................................................... || ..................e................................................................................. - // sqrdmulh v9.4S, v9.4S, v0.S[3] // ..................e..................................................................................................... || ....................e............................................................................... - // mla v24.4S, v9.4S, v29.4S // ........................e............................................................................................... || ..........................e......................................................................... - // sub v9.4S, v8.4S, v24.4S // ...............................e........................................................................................ || ...............................e.................................................................... - // add v8.4S, v8.4S, v24.4S // ................................e....................................................................................... || ................................e................................................................... - // mul v24.4S, v11.4S, v1.S[0] // ....................e................................................................................................... || ........................e........................................................................... - // sqrdmulh v11.4S, v11.4S, v1.S[1] // ...................e.................................................................................................... || ......................e............................................................................. - // mla v24.4S, v11.4S, v29.4S // ..........................e............................................................................................. 
|| ............................e....................................................................... - // sub v11.4S, v10.4S, v24.4S // .....................................e.................................................................................. || ..................................e................................................................. - // add v10.4S, v10.4S, v24.4S // ..................................e..................................................................................... || .................................e.................................................................. - // trn1 v25.4S, v8.4S, v9.4S // ......................................e................................................................................. || ...................................e................................................................ - // trn2 v26.4S, v8.4S, v9.4S // .................................................e...................................................................... || ..........................................e......................................................... - // trn1 v27.4S, v10.4S, v11.4S // .........................................e.............................................................................. || .....................................e.............................................................. - // trn2 v28.4S, v10.4S, v11.4S // ...............................................e........................................................................ || .........................................e.......................................................... - // trn2 v10.2D, v25.2D, v27.2D // .............................................e.......................................................................... || ........................................e........................................................... 
- // trn2 v11.2D, v26.2D, v28.2D // ......................................................e................................................................. || .............................................e...................................................... - // trn1 v8.2D, v25.2D, v27.2D // ....................................................................................*................................... || ............................................................................*....................... - // trn1 v9.2D, v26.2D, v28.2D // ..................................................................................*..................................... || ..........................................................................*......................... - // ldr_vi v0, x4, 96 // ..e..................................................................................................................... || .e.................................................................................................. - // ldr_vo v4, x4, -80 // ...........................................e............................................................................ || .......................................e............................................................ - // ldr_vo v1, x4, -64 // ...........................................................e............................................................ || .................................................e.................................................. - // ldr_vo v5, x4, -48 // ..................................................................*..................................................... || .....................................................*.............................................. - // ldr_vo v2, x4, -32 // ..............................................e......................................................................... 
|| .........................................e.......................................................... - // ldr_vo v6, x4, -16 // ...................................e.................................................................................... || ..................................e................................................................. - // mul v24.4S, v10.4S, v0.4S // ........................................................e............................................................... || ..............................................e..................................................... - // sqrdmulh v10.4S, v10.4S, v4.4S // ...................................................e.................................................................... || ...........................................e........................................................ - // mla v24.4S, v10.4S, v29.4S // ............................................................*........................................................... || ..................................................*................................................. - // sub v10.4S, v8.4S, v24.4S // ....................................................................................................*................... || ......................................................................................*............. - // add v8.4S, v8.4S, v24.4S // .............................................................................................................*.......... || ............................................................................................*....... - // mul v24.4S, v11.4S, v0.4S // .........................................................e.............................................................. || ................................................e................................................... 
- // sqrdmulh v11.4S, v11.4S, v4.4S // ................................................................*....................................................... || ....................................................*............................................... - // mla v24.4S, v11.4S, v29.4S // ..........................................................................*............................................. || ................................................................*................................... - // sub v11.4S, v9.4S, v24.4S // .........................................................................................*.............................. || ...............................................................................*.................... - // add v9.4S, v9.4S, v24.4S // ......................................................................................*................................. || .............................................................................*...................... - // mul v24.4S, v9.4S, v1.4S // .................................................................................................*...................... || ....................................................................................*............... - // sqrdmulh v9.4S, v9.4S, v5.4S // ...........................................................................................*............................ || ................................................................................*................... - // mla v24.4S, v9.4S, v29.4S // .........................................................................................................*.............. || ..........................................................................................*......... - // sub v9.4S, v8.4S, v24.4S // ..................................................................................................................*..... 
|| ...............................................................................................*.... - // add v8.4S, v8.4S, v24.4S // ....................................................................................................................*... || ................................................................................................*... - // mul v24.4S, v11.4S, v2.4S // .....................................................................................................*.................. || ......................................................................................*............. - // sqrdmulh v11.4S, v11.4S, v6.4S // ..............................................................................................*......................... || ..................................................................................*................. - // mla v24.4S, v11.4S, v29.4S // .......................................................................................................*................ || ........................................................................................*........... - // sub v11.4S, v10.4S, v24.4S // ...............................................................................................................*........ || .............................................................................................*...... - // add v10.4S, v10.4S, v24.4S // .................................................................................................................*...... || ..............................................................................................*..... 
- // st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // .......................................................................................................................* || ...................................................................................................* - - subs count, count, #1 + // Instructions: 61 + // Expected cycles: 52 + // Expected IPC: 1.17 + // + // Wall time: 9.38s + // User time: 9.38s + // + // -------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + sub v20.4S, v1.4S, v18.4S // .........*................................................... + ldr q18, [x1, #96] // ..e.......................................................... + ldr q19, [x3], #16 // ....e........................................................ + ldr q0, [x4, #16] // ...................................*......................... + sub v11.4S, v6.4S, v30.4S // ...................*......................................... + ldr q22, [x4], #(6*16) // ..................................*.......................... + add v30.4S, v6.4S, v30.4S // ....................*........................................ + ldr q13, [x4, #-64] // ....................................*........................ + ldr q6, [x4, #-48] // .....................................*....................... + sub v27.4S, v20.4S, v17.4S // ........................*.................................... + add v20.4S, v20.4S, v17.4S // .........................*................................... + ldr q17, [x4, #-32] // ......................................*...................... + ldr q12, [x1, #112] // ...e......................................................... + ldr q5, [x4, #-16] // .......................................*..................... + sqrdmulh v14.4S, v18.4S, v19.S[1] // ......e...................................................... 
+ ldr q8, [x1, #80] // .e........................................................... + trn2 v26.4S, v30.4S, v11.4S // ...........................*................................. + ldr q1, [x1, #64] // e............................................................ + mul v18.4S, v18.4S, v19.S[0] // .......e..................................................... + trn2 v31.4S, v20.4S, v27.4S // .............................*............................... + ldr q25, [x3], #8 // .....e....................................................... + trn1 v11.4S, v30.4S, v11.4S // ..........................*.................................. + // gap // ............................................................. + // gap // ............................................................. + trn1 v20.4S, v20.4S, v27.4S // ............................*................................ + sqrdmulh v30.4S, v12.4S, v19.S[1] // ...........e................................................. + // gap // ............................................................. + trn2 v27.2D, v26.2D, v31.2D // ...............................*............................. + // gap // ............................................................. + // gap // ............................................................. + mul v12.4S, v12.4S, v19.S[0] // ............e................................................ + trn1 v26.2D, v26.2D, v31.2D // .................................*........................... + // gap // ............................................................. + trn2 v31.2D, v11.2D, v20.2D // ..............................*.............................. + // gap // ............................................................. + // gap // ............................................................. + trn1 v20.2D, v11.2D, v20.2D // ................................*............................ + sqrdmulh v11.4S, v27.4S, v0.4S // .............................................*............... 
+ // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v0.4S, v31.4S, v0.4S // ........................................*.................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mul v27.4S, v27.4S, v22.4S // ..............................................*.............. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v27.4S, v11.4S, v29.4S // ...............................................*............. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mul v11.4S, v31.4S, v22.4S // .........................................*................... + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v12.4S, v30.4S, v29.4S // .............e............................................... + // gap // ............................................................. + // gap // ............................................................. + add v22.4S, v26.4S, v27.4S // .................................................*........... + // gap // ............................................................. + // gap // ............................................................. + sub v30.4S, v26.4S, v27.4S // ................................................*............ + mla v11.4S, v0.4S, v29.4S // ..........................................*.................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mul v0.4S, v22.4S, v13.4S // ...................................................*......... + // gap // ............................................................. + // gap // ............................................................. + sub v13.4S, v8.4S, v12.4S // ..............e.............................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v22.4S, v22.4S, v6.4S // ..................................................*.......... + add v6.4S, v8.4S, v12.4S // ...............e............................................. + // gap // ............................................................. 
+ sub v27.4S, v20.4S, v11.4S // ...........................................*................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v12.4S, v30.4S, v5.4S // .......................................................*..... + add v20.4S, v20.4S, v11.4S // ............................................*................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mul v11.4S, v30.4S, v17.4S // ........................................................*.... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v0.4S, v22.4S, v29.4S // ....................................................*........ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v11.4S, v12.4S, v29.4S // .........................................................*... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + mla v18.4S, v14.4S, v29.4S // ........e.................................................... + // gap // ............................................................. + // gap // ............................................................. + sub v22.4S, v20.4S, v0.4S // .....................................................*....... + // gap // ............................................................. + // gap // ............................................................. + add v21.4S, v20.4S, v0.4S // ......................................................*...... + sqrdmulh v20.4S, v6.4S, v19.S[3] // ................e............................................ + // gap // ............................................................. + sub v24.4S, v27.4S, v11.4S // ..........................................................*.. + // gap // ............................................................. + // gap // ............................................................. + mul v30.4S, v6.4S, v19.S[2] // .................e........................................... + add v23.4S, v27.4S, v11.4S // ...........................................................*. + // gap // ............................................................. + add v6.4S, v1.4S, v18.4S // ..........e.................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v19.4S, v13.4S, v25.S[1] // .....................e....................................... + // gap // ............................................................. + // gap // ............................................................. 
+ st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x1], #64 // ............................................................* + // gap // ............................................................. + // gap // ............................................................. + mul v17.4S, v13.4S, v25.S[0] // ......................e...................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v30.4S, v20.4S, v29.4S // ..................e.......................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v17.4S, v19.4S, v29.4S // .......................e..................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ + // --------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // ................e...........................................'................~........................................ + // ldr q9, [x1, #(16*1)] // ..............e.............................................'..............~.......................................... + // ldr q10, [x1, #(16*2)] // e...........................................................'~........................................................ + // ldr q11, [x1, #(16*3)] // ...........e................................................'...........~............................................. + // ldr q0, [x3], #16 // .e..........................................................'.~....................................................... + // ldr q1, [x3], #8 // ...................e........................................'...................~..................................... + // sqrdmulh v27.4s, v10.4s, v0.s[1] // .............e..............................................'.............~........................................... + // mul v24.4s, v10.4s, v0.s[0] // .................e..........................................'.................~....................................... + // mla v24.4s, v27.4s, v29.4s // ...............................................e............'...............................................~......... + // sub v10.4s, v8.4s, v24.4s // ............................................................*......................................................... + // add v8.4s, v8.4s, v24.4s // ......................................................e.....'......................................................~.. 
+ // sqrdmulh v27.4s, v11.4s, v0.s[1] // ......................e.....................................'......................~.................................. + // mul v24.4s, v11.4s, v0.s[0] // ........................e...................................'........................~................................ + // mla v24.4s, v27.4s, v29.4s // .................................e..........................'.................................~....................... + // sub v11.4s, v9.4s, v24.4s // ......................................e.....................'......................................~.................. + // add v9.4s, v9.4s, v24.4s // ........................................e...................'........................................~................ + // sqrdmulh v27.4s, v9.4s, v0.s[3] // ..................................................e.........'..................................................~...... + // mul v24.4s, v9.4s, v0.s[2] // ....................................................e.......'....................................................~.... + // mla v24.4s, v27.4s, v29.4s // ..........................................................e.'......................................................... + // sub v9.4s, v8.4s, v24.4s // ...~........................................................'...*..................................................... + // add v8.4s, v8.4s, v24.4s // .....~......................................................'.....*................................................... + // sqrdmulh v27.4s, v11.4s, v1.s[1] // .......................................................e....'.......................................................~. + // mul v24.4s, v11.4s, v1.s[0] // .........................................................e..'......................................................... 
+ // mla v24.4s, v27.4s, v29.4s // ...........................................................e'......................................................... + // sub v11.4s, v10.4s, v24.4s // ........~...................................................'........*................................................ + // add v10.4s, v10.4s, v24.4s // .........~..................................................'.........*............................................... + // trn1 v25.4s, v8.4s, v9.4s // ....................~.......................................'....................*.................................... + // trn2 v26.4s, v8.4s, v9.4s // ...............~............................................'...............*......................................... + // trn1 v27.4s, v10.4s, v11.4s // .....................~......................................'.....................*................................... + // trn2 v28.4s, v10.4s, v11.4s // ..................~.........................................'..................*...................................... + // trn2 v10.2d, v25.2d, v27.2d // ..........................~.................................'..........................*.............................. + // trn2 v11.2d, v26.2d, v28.2d // .......................~....................................'.......................*................................. + // trn1 v8.2d, v25.2d, v27.2d // ...........................~................................'...........................*............................. + // trn1 v9.2d, v26.2d, v28.2d // .........................~..................................'.........................*............................... + // ldr q0, [ x4], #(6*16) // ....~.......................................................'....*.................................................... 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // ..~.........................................................'..*...................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ......~.....................................................'......*.................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // .......~....................................................'.......*................................................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // ..........~.................................................'..........*.............................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ............~...............................................'............*............................................ + // sqrdmulh v27.4s, v10.4s, v4.4s // .............................~..............................'.............................*........................... + // mul v24.4s, v10.4s, v0.4s // ................................~...........................'................................*........................ + // mla v24.4s, v27.4s, v29.4s // ....................................~.......................'....................................*.................... + // sub v10.4s, v8.4s, v24.4s // .........................................~..................'.........................................*............... + // add v8.4s, v8.4s, v24.4s // ...........................................~................'...........................................*............. + // sqrdmulh v27.4s, v11.4s, v4.4s // ............................~...............................'............................*............................ + // mul v24.4s, v11.4s, v0.4s // ..............................~.............................'..............................*.......................... 
+ // mla v24.4s, v27.4s, v29.4s // ...............................~............................'...............................*......................... + // sub v11.4s, v9.4s, v24.4s // ...................................~........................'...................................*..................... + // add v9.4s, v9.4s, v24.4s // ..................................~.........................'..................................*...................... + // sqrdmulh v27.4s, v9.4s, v5.4s // .......................................~....................'.......................................*................. + // mul v24.4s, v9.4s, v1.4s // .....................................~......................'.....................................*................... + // mla v24.4s, v27.4s, v29.4s // .............................................~..............'.............................................*........... + // sub v9.4s, v8.4s, v24.4s // ................................................~...........'................................................*........ + // add v8.4s, v8.4s, v24.4s // .................................................~..........'.................................................*....... + // sqrdmulh v27.4s, v11.4s, v6.4s // ..........................................~.................'..........................................*.............. + // mul v24.4s, v11.4s, v2.4s // ............................................~...............'............................................*............ + // mla v24.4s, v27.4s, v29.4s // ..............................................~.............'..............................................*.......... + // sub v11.4s, v10.4s, v24.4s // ...................................................~........'...................................................*..... + // add v10.4s, v10.4s, v24.4s // .....................................................~......'.....................................................*... 
+ // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ........................................................~...'........................................................* + + sub count, count, #1 cbnz count, layer5678_start - trn1 v9.2D, v30.2D, v7.2D // .....*............... - sqrdmulh v27.4S, v1.4S, v11.4S // .*................... - ldr_vo v2, x4, -48 // ..*.................. - trn1 v15.2D, v31.2D, v15.2D // ....*................ - // gap // ..................... - // gap // ..................... - mla v17.4S, v5.4S, v29.4S // *.................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - mla v0.4S, v27.4S, v29.4S // ...*................. - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - sub v14.4S, v9.4S, v17.4S // ...........*......... - add v9.4S, v9.4S, v17.4S // ...............*..... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - sub v31.4S, v15.4S, v0.4S // .......*............. - // gap // ..................... - // gap // ..................... - add v1.4S, v15.4S, v0.4S // ......*.............. - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - sqrdmulh v25.4S, v31.4S, v16.4S // .........*........... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... 
- mul v5.4S, v1.4S, v21.4S // ..........*.......... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - sqrdmulh v2.4S, v1.4S, v2.4S // ........*............ - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - mul v21.4S, v31.4S, v3.4S // ............*........ - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - mla v21.4S, v25.4S, v29.4S // .............*....... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - mla v5.4S, v2.4S, v29.4S // ..............*...... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - sub v11.4S, v14.4S, v21.4S // ................*.... - add v10.4S, v14.4S, v21.4S // .................*... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - add v8.4S, v9.4S, v5.4S // ...................*. - sub v9.4S, v9.4S, v5.4S // ..................*.. - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - // gap // ..................... - st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // ....................* - // gap // ..................... - // gap // ..................... 
- - // original source code - // mla v17.4S, v5.4S, v29.4S // ....*................ || ..*............................. - // sqrdmulh v10.4S, v1.4S, v11.4S // .*................... || *............................... - // ldr_vo v23, x4, -48 // ..*.................. || *............................... - // mla v0.4S, v10.4S, v29.4S // .....*............... || .....*.......................... - // trn1 v25.2D, v31.2D, v15.2D // ...*................. || .*.............................. - // trn1 v31.2D, v30.2D, v7.2D // *.................... || *............................... - // add v7.4S, v25.4S, v0.4S // .........*........... || ...........*.................... - // sub v11.4S, v25.4S, v0.4S // ........*............ || ..........*..................... - // sqrdmulh v15.4S, v7.4S, v23.4S // ............*........ || .................*.............. - // sqrdmulh v19.4S, v11.4S, v16.4S // ..........*.......... || .............*.................. - // mul v2.4S, v7.4S, v21.4S // ...........*......... || ...............*................ - // sub v6.4S, v31.4S, v17.4S // ......*.............. || .......*........................ - // mul v20.4S, v11.4S, v3.4S // .............*....... || ...................*............ - // mla v20.4S, v19.4S, v29.4S // ..............*...... || .....................*.......... - // mla v2.4S, v15.4S, v29.4S // ...............*..... || .......................*........ - // add v17.4S, v31.4S, v17.4S // .......*............. || .......*........................ - // sub v22.4S, v6.4S, v20.4S // ................*.... || ..........................*..... - // add v21.4S, v6.4S, v20.4S // .................*... || ..........................*..... - // sub v20.4S, v17.4S, v2.4S // ...................*. || ............................*... - // add v19.4S, v17.4S, v2.4S // ..................*.. || ............................*... 
- // st4 {v19.4S,v20.4S,v21.4S,v22.4S}, [x1], #64 // ....................* || ...............................* - + // Instructions: 40 + // Expected cycles: 51 + // Expected IPC: 0.78 + // + // Wall time: 0.62s + // User time: 0.62s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + sub v11.4S, v1.4S, v18.4S // *....................................... + ldr q20, [x4, #16] // .*...................................... + add v15.4S, v6.4S, v30.4S // ....*................................... + ldr q23, [x4, #32] // .....*.................................. + // gap // ........................................ + // gap // ........................................ + sub v19.4S, v6.4S, v30.4S // ..*..................................... + ldr q30, [x4], #(6*16) // ...*.................................... + ldr q7, [x4, #-48] // ......*................................. + sub v10.4S, v11.4S, v17.4S // .......*................................ + add v1.4S, v11.4S, v17.4S // ........*............................... + ldr q4, [x4, #-32] // .........*.............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v14.4S, v15.4S, v19.4S // ...........*............................ + trn1 v27.4S, v15.4S, v19.4S // .............*.......................... + // gap // ........................................ + trn2 v26.4S, v1.4S, v10.4S // ............*........................... + trn1 v28.4S, v1.4S, v10.4S // ..............*......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + trn2 v19.2D, v14.2D, v26.2D // ...............*........................ + // gap // ........................................ + // gap // ........................................ + trn2 v15.2D, v27.2D, v28.2D // .................*...................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v24.4S, v19.4S, v20.4S // ...................*.................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v17.4S, v15.4S, v20.4S // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v21.4S, v19.4S, v30.4S // .....................*.................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mla v21.4S, v24.4S, v29.4S // ......................*................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ mul v11.4S, v15.4S, v30.4S // .......................*................ + trn1 v15.2D, v14.2D, v26.2D // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mla v11.4S, v17.4S, v29.4S // ..........................*............. + // gap // ........................................ + // gap // ........................................ + add v22.4S, v15.4S, v21.4S // ........................*............... + // gap // ........................................ + // gap // ........................................ + trn1 v20.2D, v27.2D, v28.2D // ..................*..................... + // gap // ........................................ + // gap // ........................................ + sub v28.4S, v15.4S, v21.4S // .........................*.............. + // gap // ........................................ + // gap // ........................................ + mul v24.4S, v22.4S, v23.4S // ...........................*............ + ldr q23, [x4, #-16] // ..........*............................. + // gap // ........................................ + sub v18.4S, v20.4S, v11.4S // .............................*.......... + // gap // ........................................ + // gap // ........................................ + sqrdmulh v0.4S, v22.4S, v7.4S // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v15.4S, v28.4S, v23.4S // ..............................*......... + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v23.4S, v28.4S, v4.4S // ................................*....... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mla v24.4S, v0.4S, v29.4S // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mla v23.4S, v15.4S, v29.4S // ..................................*..... + add v15.4S, v20.4S, v11.4S // ...............................*........ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v26.4S, v15.4S, v24.4S // ...................................*.... + // gap // ........................................ + // gap // ........................................ + add v25.4S, v15.4S, v24.4S // ....................................*... + // gap // ........................................ + // gap // ........................................ + sub v28.4S, v18.4S, v23.4S // .....................................*.. + add v27.4S, v18.4S, v23.4S // ......................................*. + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // sub v20.4S, v1.4S, v18.4S // *....................................... + // ldr q0, [x4, #16] // .*...................................... + // sub v11.4S, v6.4S, v30.4S // ....*................................... + // ldr q22, [x4], #(6*16) // .....*.................................. + // add v30.4S, v6.4S, v30.4S // ..*..................................... + // ldr q13, [x4, #-64] // ...*.................................... 
+ // ldr q6, [x4, #-48] // ......*................................. + // sub v27.4S, v20.4S, v17.4S // .......*................................ + // add v20.4S, v20.4S, v17.4S // ........*............................... + // ldr q17, [x4, #-32] // .........*.............................. + // ldr q5, [x4, #-16] // ...........................*............ + // trn2 v26.4S, v30.4S, v11.4S // ..........*............................. + // trn2 v31.4S, v20.4S, v27.4S // ............*........................... + // trn1 v11.4S, v30.4S, v11.4S // ...........*............................ + // trn1 v20.4S, v20.4S, v27.4S // .............*.......................... + // trn2 v27.2D, v26.2D, v31.2D // ..............*......................... + // trn1 v26.2D, v26.2D, v31.2D // .....................*.................. + // trn2 v31.2D, v11.2D, v20.2D // ...............*........................ + // trn1 v20.2D, v11.2D, v20.2D // ........................*............... + // sqrdmulh v11.4S, v27.4S, v0.4S // ................*....................... + // sqrdmulh v0.4S, v31.4S, v0.4S // .................*...................... + // mul v27.4S, v27.4S, v22.4S // ..................*..................... + // mla v27.4S, v11.4S, v29.4S // ...................*.................... + // mul v11.4S, v31.4S, v22.4S // ....................*................... + // add v22.4S, v26.4S, v27.4S // .......................*................ + // sub v30.4S, v26.4S, v27.4S // .........................*.............. + // mla v11.4S, v0.4S, v29.4S // ......................*................. + // mul v0.4S, v22.4S, v13.4S // ..........................*............. + // sqrdmulh v22.4S, v22.4S, v6.4S // .............................*.......... + // sub v27.4S, v20.4S, v11.4S // ............................*........... + // sqrdmulh v12.4S, v30.4S, v5.4S // ..............................*......... + // add v20.4S, v20.4S, v11.4S // ..................................*..... 
+ // mul v11.4S, v30.4S, v17.4S // ...............................*........ + // mla v0.4S, v22.4S, v29.4S // ................................*....... + // mla v11.4S, v12.4S, v29.4S // .................................*...... + // sub v22.4S, v20.4S, v0.4S // ...................................*.... + // add v21.4S, v20.4S, v0.4S // ....................................*... + // sub v24.4S, v27.4S, v11.4S // .....................................*.. + // add v23.4S, v27.4S, v11.4S // ......................................*. + // st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x1], #64 // .......................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_firestorm.s index 10a36170..f0acf06f 100644 --- a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_firestorm.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -68,15 +47,15 @@ .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -85,12 +64,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -98,31 +71,31 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 
.endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -137,7 +110,7 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -148,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -158,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -166,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -177,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm 
-.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -323,589 +296,598 @@ _ntt_dilithium_1234_5678_opt_m1_firestorm: load_roots_1234 .p2align 2 - ldr q12, [x0, #192] // ...*............................................................................................................................................................................................ - ldr q31, [x0, #640] // ..........*..................................................................................................................................................................................... - ldr q26, [x0, #960] // ...............*................................................................................................................................................................................ - ldr q20, [x0, #768] // ............*................................................................................................................................................................................... - ldr q14, [x0, #832] // .............*.................................................................................................................................................................................. - ldr q13, [x0, #896] // ..............*................................................................................................................................................................................. - ldr q18, [x0, #704] // ...........*.................................................................................................................................................................................... - ldr q10, [x0, #512] // ........*....................................................................................................................................................................................... 
- ldr q11, [x0, #448] // .......*........................................................................................................................................................................................ - ldr q15, [x0, #384] // ......*......................................................................................................................................................................................... - mul v9.4S, v26.4S, v0.S[0] // ...................................................*............................................................................................................................................ - sqrdmulh v8.4S, v26.4S, v0.S[1] // ....................................................*........................................................................................................................................... - mul v25.4S, v31.4S, v0.S[0] // ..........................*..................................................................................................................................................................... - sqrdmulh v31.4S, v31.4S, v0.S[1] // ...........................*.................................................................................................................................................................... - ldr q19, [x0, #576] // .........*...................................................................................................................................................................................... - mul v28.4S, v13.4S, v0.S[0] // ..............................................*................................................................................................................................................. 
- sqrdmulh v17.4S, v13.4S, v0.S[1] // ...............................................*................................................................................................................................................ - sqrdmulh v24.4S, v14.4S, v0.S[1] // ..........................................*..................................................................................................................................................... - mul v14.4S, v14.4S, v0.S[0] // .........................................*...................................................................................................................................................... - ldr q27, [x0, #128] // ..*............................................................................................................................................................................................. - mla v28.4S, v17.4S, v29.4S // ................................................*............................................................................................................................................... - sqrdmulh v30.4S, v18.4S, v0.S[1] // ................................*............................................................................................................................................................... - mla v9.4S, v8.4S, v29.4S // .....................................................*.......................................................................................................................................... - mul v17.4S, v18.4S, v0.S[0] // ...............................*................................................................................................................................................................ 
- ldr q8, [x0, #320] // .....*.......................................................................................................................................................................................... - mul v22.4S, v19.4S, v0.S[0] // .....................*.......................................................................................................................................................................... - sqrdmulh v18.4S, v20.4S, v0.S[1] // .....................................*.......................................................................................................................................................... - mla v14.4S, v24.4S, v29.4S // ...........................................*.................................................................................................................................................... - sqrdmulh v21.4S, v19.4S, v0.S[1] // ......................*......................................................................................................................................................................... - sqrdmulh v23.4S, v10.4S, v0.S[1] // .................*.............................................................................................................................................................................. - mul v24.4S, v20.4S, v0.S[0] // ....................................*........................................................................................................................................................... - mla v25.4S, v31.4S, v29.4S // ............................*................................................................................................................................................................... 
- ldr q26, [x0, #64] // .*.............................................................................................................................................................................................. - add v31.4S, v11.4S, v9.4S // .......................................................*........................................................................................................................................ - sub v13.4S, v11.4S, v9.4S // ......................................................*......................................................................................................................................... - add v16.4S, v15.4S, v28.4S // ..................................................*............................................................................................................................................. - mla v17.4S, v30.4S, v29.4S // .................................*.............................................................................................................................................................. - add v30.4S, v8.4S, v14.4S // .............................................*.................................................................................................................................................. - mla v24.4S, v18.4S, v29.4S // ......................................*......................................................................................................................................................... - sub v8.4S, v8.4S, v14.4S // ............................................*................................................................................................................................................... 
- mul v18.4S, v31.4S, v0.S[2] // .......................................................................*........................................................................................................................ - mla v22.4S, v21.4S, v29.4S // .......................*........................................................................................................................................................................ - ldr q20, [x0, #256] // ....*........................................................................................................................................................................................... - sqrdmulh v9.4S, v31.4S, v0.S[3] // ........................................................................*....................................................................................................................... - sqrdmulh v11.4S, v16.4S, v0.S[3] // ...................................................................*............................................................................................................................ - sub v21.4S, v27.4S, v25.4S // .............................*.................................................................................................................................................................. - add v31.4S, v27.4S, v25.4S // ..............................*................................................................................................................................................................. - mul v14.4S, v8.4S, v1.S[0] // .................................................................................*.............................................................................................................. 
- mul v19.4S, v10.4S, v0.S[0] // ................*............................................................................................................................................................................... - sqrdmulh v10.4S, v8.4S, v1.S[1] // ..................................................................................*............................................................................................................. - mul v27.4S, v16.4S, v0.S[2] // ..................................................................*............................................................................................................................. - mul v25.4S, v13.4S, v1.S[0] // ...........................................................................................*.................................................................................................... - add v16.4S, v26.4S, v22.4S // .........................*...................................................................................................................................................................... - sub v8.4S, v26.4S, v22.4S // ........................*....................................................................................................................................................................... - sub v26.4S, v12.4S, v17.4S // ..................................*............................................................................................................................................................. - add v22.4S, v20.4S, v24.4S // ........................................*....................................................................................................................................................... 
- mla v18.4S, v9.4S, v29.4S // .........................................................................*...................................................................................................................... - add v9.4S, v12.4S, v17.4S // ...................................*............................................................................................................................................................ - sub v15.4S, v15.4S, v28.4S // .................................................*.............................................................................................................................................. - sub v20.4S, v20.4S, v24.4S // .......................................*........................................................................................................................................................ - ldr q28, [x0, #0] // *............................................................................................................................................................................................... - sqrdmulh v12.4S, v30.4S, v0.S[3] // ..............................................................*................................................................................................................................. - mul v17.4S, v30.4S, v0.S[2] // .............................................................*.................................................................................................................................. - mla v19.4S, v23.4S, v29.4S // ..................*............................................................................................................................................................................. 
- mul v23.4S, v22.4S, v0.S[2] // ........................................................*....................................................................................................................................... - mla v27.4S, v11.4S, v29.4S // ....................................................................*........................................................................................................................... - mla v14.4S, v10.4S, v29.4S // ...................................................................................*............................................................................................................ - sqrdmulh v11.4S, v22.4S, v0.S[3] // .........................................................*...................................................................................................................................... - mul v24.4S, v15.4S, v1.S[0] // ......................................................................................*......................................................................................................... - sqrdmulh v15.4S, v15.4S, v1.S[1] // .......................................................................................*........................................................................................................ - add v30.4S, v9.4S, v18.4S // ...........................................................................*.................................................................................................................... - sub v18.4S, v9.4S, v18.4S // ..........................................................................*..................................................................................................................... 
- mla v17.4S, v12.4S, v29.4S // ...............................................................*................................................................................................................................ - add v22.4S, v31.4S, v27.4S // ......................................................................*......................................................................................................................... - sub v27.4S, v31.4S, v27.4S // .....................................................................*.......................................................................................................................... - add v9.4S, v8.4S, v14.4S // .....................................................................................*.......................................................................................................... - sqrdmulh v13.4S, v13.4S, v1.S[1] // ............................................................................................*................................................................................................... - mul v31.4S, v20.4S, v1.S[0] // ............................................................................*................................................................................................................... - mla v25.4S, v13.4S, v29.4S // .............................................................................................*.................................................................................................. - mla v24.4S, v15.4S, v29.4S // ........................................................................................*....................................................................................................... 
- sqrdmulh v13.4S, v20.4S, v1.S[1] // .............................................................................*.................................................................................................................. - add v15.4S, v28.4S, v19.4S // ....................*........................................................................................................................................................................... - sub v28.4S, v28.4S, v19.4S // ...................*............................................................................................................................................................................ - sub v14.4S, v8.4S, v14.4S // ....................................................................................*........................................................................................................... - mul v8.4S, v18.4S, v2.S[0] // ...............................................................................................................*................................................................................ - add v12.4S, v16.4S, v17.4S // .................................................................*.............................................................................................................................. - sqrdmulh v10.4S, v18.4S, v2.S[1] // ................................................................................................................*............................................................................... - sqrdmulh v18.4S, v27.4S, v2.S[1] // ...........................................................................................................*.................................................................................... 
- mul v27.4S, v27.4S, v2.S[0] // ..........................................................................................................*..................................................................................... - mla v31.4S, v13.4S, v29.4S // ..............................................................................*................................................................................................................. - sub v20.4S, v26.4S, v25.4S // ..............................................................................................*................................................................................................. - add v25.4S, v26.4S, v25.4S // ...............................................................................................*................................................................................................ - add v13.4S, v21.4S, v24.4S // ..........................................................................................*..................................................................................................... - mla v23.4S, v11.4S, v29.4S // ..........................................................*..................................................................................................................................... - sub v26.4S, v21.4S, v24.4S // .........................................................................................*...................................................................................................... - sub v21.4S, v16.4S, v17.4S // ................................................................*............................................................................................................................... 
+ ldr q8, [x0, #704] // ...........*.................................................................................................................................................................................... + ldr q12, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q24, [x0, #768] // ............*................................................................................................................................................................................... + ldr q18, [x0, #640] // ..........*..................................................................................................................................................................................... + ldr q11, [x0, #832] // .............*.................................................................................................................................................................................. + ldr q30, [x0, #960] // ...............*................................................................................................................................................................................ + ldr q28, [x0, #896] // ..............*................................................................................................................................................................................. + ldr q9, [x0, #576] // .........*...................................................................................................................................................................................... + ldr q17, [x0, #64] // .*.............................................................................................................................................................................................. 
+ ldr q22, [x0, #448] // .......*........................................................................................................................................................................................ + mul v16.4S, v24.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + sqrdmulh v13.4S, v24.4S, v0.S[1] // ....................................*........................................................................................................................................................... + mul v14.4S, v8.4S, v0.S[0] // ................................*............................................................................................................................................................... + sqrdmulh v21.4S, v8.4S, v0.S[1] // ...............................*................................................................................................................................................................ + sqrdmulh v10.4S, v11.4S, v0.S[1] // .........................................*...................................................................................................................................................... + mul v19.4S, v11.4S, v0.S[0] // ..........................................*..................................................................................................................................................... + mul v11.4S, v30.4S, v0.S[0] // ....................................................*........................................................................................................................................... 
+ sqrdmulh v25.4S, v30.4S, v0.S[1] // ...................................................*............................................................................................................................................ + ldr q30, [x0, #0] // *............................................................................................................................................................................................... + mul v8.4S, v12.4S, v0.S[0] // .................*.............................................................................................................................................................................. + mul v26.4S, v28.4S, v0.S[0] // ...............................................*................................................................................................................................................ + sqrdmulh v20.4S, v28.4S, v0.S[1] // ..............................................*................................................................................................................................................. + ldr q28, [x0, #192] // ...*............................................................................................................................................................................................ + mla v11.4S, v25.4S, v29.4S // .....................................................*.......................................................................................................................................... + ldr q23, [x0, #320] // .....*.......................................................................................................................................................................................... 
+ ldr q25, [x0, #256] // ....*........................................................................................................................................................................................... + mla v19.4S, v10.4S, v29.4S // ...........................................*.................................................................................................................................................... + mla v16.4S, v13.4S, v29.4S // ......................................*......................................................................................................................................................... + sqrdmulh v31.4S, v12.4S, v0.S[1] // ................*............................................................................................................................................................................... + mla v14.4S, v21.4S, v29.4S // .................................*.............................................................................................................................................................. + mul v10.4S, v18.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + sqrdmulh v21.4S, v9.4S, v0.S[1] // .....................*.......................................................................................................................................................................... + sqrdmulh v15.4S, v18.4S, v0.S[1] // ..........................*..................................................................................................................................................................... 
+ mul v27.4S, v9.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sub v13.4S, v22.4S, v11.4S // ......................................................*......................................................................................................................................... + add v18.4S, v22.4S, v11.4S // .......................................................*........................................................................................................................................ + mla v26.4S, v20.4S, v29.4S // ................................................*............................................................................................................................................... + ldr q20, [x0, #384] // ......*......................................................................................................................................................................................... + mla v27.4S, v21.4S, v29.4S // .......................*........................................................................................................................................................................ + mla v8.4S, v31.4S, v29.4S // ..................*............................................................................................................................................................................. + sub v9.4S, v25.4S, v16.4S // .......................................*........................................................................................................................................................ 
+ sqrdmulh v21.4S, v18.4S, v0.S[3] // .......................................................................*........................................................................................................................ + add v24.4S, v25.4S, v16.4S // ........................................*....................................................................................................................................................... + mul v16.4S, v18.4S, v0.S[2] // ........................................................................*....................................................................................................................... + sub v12.4S, v20.4S, v26.4S // .................................................*.............................................................................................................................................. + mul v22.4S, v13.4S, v1.S[0] // ............................................................................................*................................................................................................... + add v11.4S, v20.4S, v26.4S // ..................................................*............................................................................................................................................. + sqrdmulh v31.4S, v9.4S, v1.S[1] // ............................................................................*................................................................................................................... + mul v25.4S, v9.4S, v1.S[0] // .............................................................................*.................................................................................................................. 
+ add v20.4S, v23.4S, v19.4S // .............................................*.................................................................................................................................................. + sqrdmulh v26.4S, v24.4S, v0.S[3] // ........................................................*....................................................................................................................................... + sub v23.4S, v23.4S, v19.4S // ............................................*................................................................................................................................................... + mul v9.4S, v12.4S, v1.S[0] // .......................................................................................*........................................................................................................ + sub v18.4S, v30.4S, v8.4S // ...................*............................................................................................................................................................................ + mla v16.4S, v21.4S, v29.4S // .........................................................................*...................................................................................................................... + add v21.4S, v30.4S, v8.4S // ....................*........................................................................................................................................................................... + sqrdmulh v30.4S, v12.4S, v1.S[1] // ......................................................................................*......................................................................................................... 
+ mul v24.4S, v24.4S, v0.S[2] // .........................................................*...................................................................................................................................... + sub v19.4S, v28.4S, v14.4S // ..................................*............................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + add v12.4S, v28.4S, v14.4S // ...................................*............................................................................................................................................................ + ldr q28, [x0, #128] // ..*............................................................................................................................................................................................. + mla v10.4S, v15.4S, v29.4S // ............................*................................................................................................................................................................... + sqrdmulh v15.4S, v11.4S, v0.S[3] // ..................................................................*............................................................................................................................. + mul v11.4S, v11.4S, v0.S[2] // ...................................................................*............................................................................................................................ 
+ mul v8.4S, v20.4S, v0.S[2] // ..............................................................*................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[3] // .............................................................*.................................................................................................................................. + add v14.4S, v17.4S, v27.4S // .........................*...................................................................................................................................................................... + mla v25.4S, v31.4S, v29.4S // ..............................................................................*................................................................................................................. + sub v31.4S, v17.4S, v27.4S // ........................*....................................................................................................................................................................... + add v17.4S, v12.4S, v16.4S // ...........................................................................*.................................................................................................................... + mla v9.4S, v30.4S, v29.4S // ........................................................................................*....................................................................................................... + mla v22.4S, v13.4S, v29.4S // .............................................................................................*.................................................................................................. 
+ sqrdmulh v27.4S, v23.4S, v1.S[1] // .................................................................................*.............................................................................................................. + sub v30.4S, v12.4S, v16.4S // ..........................................................................*..................................................................................................................... + mla v8.4S, v20.4S, v29.4S // ...............................................................*................................................................................................................................ + mla v11.4S, v15.4S, v29.4S // ....................................................................*........................................................................................................................... + sqrdmulh v20.4S, v30.4S, v2.S[1] // ...............................................................................................................*................................................................................ + mul v16.4S, v17.4S, v1.S[2] // ......................................................................................................*......................................................................................... + mul v15.4S, v30.4S, v2.S[0] // ................................................................................................................*............................................................................... + sub v12.4S, v28.4S, v10.4S // .............................*.................................................................................................................................................................. 
+ mla v24.4S, v26.4S, v29.4S // ..........................................................*..................................................................................................................................... + sub v30.4S, v19.4S, v22.4S // ..............................................................................................*................................................................................................. + add v22.4S, v19.4S, v22.4S // ...............................................................................................*................................................................................................ + add v28.4S, v28.4S, v10.4S // ..............................*................................................................................................................................................................. + mul v10.4S, v23.4S, v1.S[0] // ..................................................................................*............................................................................................................. + sub v26.4S, v12.4S, v9.4S // .........................................................................................*...................................................................................................... + add v13.4S, v14.4S, v8.4S // .................................................................*.............................................................................................................................. + sqrdmulh v17.4S, v17.4S, v1.S[3] // .....................................................................................................*.......................................................................................... 
+ mla v15.4S, v20.4S, v29.4S // .................................................................................................................*.............................................................................. + add v23.4S, v12.4S, v9.4S // ..........................................................................................*..................................................................................................... + sub v19.4S, v14.4S, v8.4S // ................................................................*............................................................................................................................... + add v12.4S, v28.4S, v11.4S // ......................................................................*......................................................................................................................... + sqrdmulh v9.4S, v30.4S, v3.S[1] // ...................................................................................................................................*............................................................ + mul v20.4S, v30.4S, v3.S[0] // ....................................................................................................................................*........................................................... + mul v14.4S, v26.4S, v3.S[0] // ...............................................................................................................................*................................................................ sub count, count, #1 layer1234_start: - mla v8.4S, v10.4S, v29.4S // .................................................................................................................*.............................................................................. 
- mul v24.4S, v25.4S, v2.S[2] // .........................................................................................................................*...................................................................... - mul v16.4S, v30.4S, v1.S[2] // .....................................................................................................*.......................................................................................... - sqrdmulh v11.4S, v30.4S, v1.S[3] // ......................................................................................................*......................................................................................... - sqrdmulh v30.4S, v25.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - sqrdmulh v25.4S, v20.4S, v3.S[1] // ....................................................................................................................................*........................................................... - mul v20.4S, v20.4S, v3.S[0] // ...................................................................................................................................*............................................................ - sqrdmulh v19.4S, v22.4S, v1.S[3] // .................................................................................................*.............................................................................................. - sub v10.4S, v15.4S, v23.4S // ...........................................................*.................................................................................................................................... 
- mla v27.4S, v18.4S, v29.4S // ............................................................................................................*................................................................................... - add v17.4S, v15.4S, v23.4S // ............................................................*................................................................................................................................... - sqrdmulh v23.4S, v13.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - mul v13.4S, v13.4S, v2.S[2] // ....................................................................................................................*........................................................................... - mla v16.4S, v11.4S, v29.4S // .......................................................................................................*........................................................................................ - sub v15.4S, v21.4S, v8.4S // ..................................................................................................................*............................................................................. - add v8.4S, v21.4S, v8.4S // ...................................................................................................................*............................................................................ - mla v20.4S, v25.4S, v29.4S // .....................................................................................................................................*.......................................................... 
- mul v21.4S, v26.4S, v3.S[0] // ..............................................................................................................................*................................................................. - mla v24.4S, v30.4S, v29.4S // ...........................................................................................................................*.................................................................... - sub v30.4S, v10.4S, v27.4S // .............................................................................................................*.................................................................................. - sqrdmulh v11.4S, v26.4S, v3.S[1] // ...............................................................................................................................*................................................................ - mul v18.4S, v8.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v26.4S, v8.4S, v4.S[3] // ...................................................................................................................................................*............................................ - add v25.4S, v9.4S, v24.4S // .............................................................................................................................*.................................................................. - sub v9.4S, v9.4S, v24.4S // ............................................................................................................................*................................................................... 
- mul v24.4S, v22.4S, v1.S[2] // ................................................................................................*............................................................................................... - add v22.4S, v12.4S, v16.4S // .........................................................................................................*...................................................................................... - sub v16.4S, v12.4S, v16.4S // ........................................................................................................*....................................................................................... - add v8.4S, v10.4S, v27.4S // ..............................................................................................................*................................................................................. - add v10.4S, v14.4S, v20.4S // .......................................................................................................................................*........................................................ - mla v13.4S, v23.4S, v29.4S // ......................................................................................................................*......................................................................... - mla v18.4S, v26.4S, v29.4S // ....................................................................................................................................................*........................................... - sqrdmulh v23.4S, v22.4S, v3.S[3] // .........................................................................................................................................*...................................................... 
- mul v27.4S, v22.4S, v3.S[2] // ........................................................................................................................................*....................................................... - mul v12.4S, v9.4S, v6.S[0] // .................................................................................................................................................................*.............................. - ldr q22, [x0, #976] // ...............e................................................................................................................................................................................ - sqrdmulh v9.4S, v9.4S, v6.S[1] // ..................................................................................................................................................................*............................. - mla v24.4S, v19.4S, v29.4S // ..................................................................................................*............................................................................................. - mla v12.4S, v9.4S, v29.4S // ...................................................................................................................................................................*............................ - mul v19.4S, v15.4S, v5.S[0] // .......................................................................................................................................................*........................................ - mla v21.4S, v11.4S, v29.4S // ................................................................................................................................*............................................................... 
- add v26.4S, v8.4S, v18.4S // ......................................................................................................................................................*......................................... - sqrdmulh v9.4S, v15.4S, v5.S[1] // ........................................................................................................................................................*....................................... - add v15.4S, v28.4S, v31.4S // ................................................................................*............................................................................................................... - sub v8.4S, v8.4S, v18.4S // .....................................................................................................................................................*.......................................... - sub v18.4S, v17.4S, v24.4S // ...................................................................................................*............................................................................................ - sub v28.4S, v28.4S, v31.4S // ...............................................................................*................................................................................................................ - mul v11.4S, v10.4S, v6.S[2] // ......................................................................................................................................................................*......................... - sqrdmulh v31.4S, v25.4S, v5.S[3] // .............................................................................................................................................................*.................................. 
- sub v14.4S, v14.4S, v20.4S // ......................................................................................................................................*......................................................... - add v20.4S, v17.4S, v24.4S // ....................................................................................................*........................................................................................... - mla v27.4S, v23.4S, v29.4S // ..........................................................................................................................................*..................................................... - str q8, [x0, #320] // .....................................................................................................................................................................................*.......... - str q26, [x0, #256] // ....................................................................................................................................................................................*........... - mul v17.4S, v16.4S, v4.S[0] // .............................................................................................................................................*.................................................. - ldr q26, [x0, #912] // ..............e................................................................................................................................................................................. - add v23.4S, v15.4S, v13.4S // ........................................................................................................................*....................................................................... 
- sub v24.4S, v15.4S, v13.4S // .......................................................................................................................*........................................................................ - sqrdmulh v15.4S, v10.4S, v6.S[3] // .......................................................................................................................................................................*........................ - mla v19.4S, v9.4S, v29.4S // .........................................................................................................................................................*...................................... - add v9.4S, v20.4S, v27.4S // ............................................................................................................................................*................................................... - sub v20.4S, v20.4S, v27.4S // ...........................................................................................................................................*.................................................... - mul v25.4S, v25.4S, v5.S[2] // ............................................................................................................................................................*................................... - sqrdmulh v13.4S, v16.4S, v4.S[1] // ..............................................................................................................................................*................................................. - ldr q16, [x0, #784] // ............e................................................................................................................................................................................... 
- mul v10.4S, v14.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - sub v27.4S, v28.4S, v21.4S // .................................................................................................................................*.............................................................. - add v21.4S, v28.4S, v21.4S // ..................................................................................................................................*............................................................. - mul v28.4S, v26.4S, v0.S[0] // ..............................................e................................................................................................................................................. - sqrdmulh v8.4S, v26.4S, v0.S[1] // ...............................................e................................................................................................................................................ - mla v11.4S, v15.4S, v29.4S // ........................................................................................................................................................................*....................... - str q20, [x0, #64] // .................................................................................................................................................................................*.............. - sqrdmulh v26.4S, v14.4S, v7.S[1] // ............................................................................................................................................................................*................... 
- str q9, [x0], #(16) // ................................................................................................................................................................................*............... - add v9.4S, v30.4S, v19.4S // ...........................................................................................................................................................*.................................... - add v14.4S, v24.4S, v12.4S // .....................................................................................................................................................................*.......................... - ldr q20, [x0, #640] // ..........e..................................................................................................................................................................................... - str q9, [x0, #368] // ......................................................................................................................................................................................*......... - mla v10.4S, v26.4S, v29.4S // .............................................................................................................................................................................*.................. - mla v17.4S, v13.4S, v29.4S // ...............................................................................................................................................*................................................ - mul v13.4S, v22.4S, v0.S[0] // ...................................................e............................................................................................................................................ 
- sqrdmulh v9.4S, v22.4S, v0.S[1] // ....................................................e........................................................................................................................................... - str q14, [x0, #624] // ..........................................................................................................................................................................................*..... - ldr q14, [x0, #384] // ......e......................................................................................................................................................................................... - sqrdmulh v26.4S, v16.4S, v0.S[1] // .....................................e.......................................................................................................................................................... - mla v28.4S, v8.4S, v29.4S // ................................................e............................................................................................................................................... - mul v22.4S, v16.4S, v0.S[0] // ....................................e........................................................................................................................................................... - mla v25.4S, v31.4S, v29.4S // ..............................................................................................................................................................*................................. - ldr q31, [x0, #704] // ...........e.................................................................................................................................................................................... 
- sub v8.4S, v21.4S, v11.4S // .........................................................................................................................................................................*...................... - add v15.4S, v21.4S, v11.4S // ..........................................................................................................................................................................*..................... - ldr q21, [x0, #448] // .......e........................................................................................................................................................................................ - mul v16.4S, v20.4S, v0.S[0] // ..........................e..................................................................................................................................................................... - sub v11.4S, v24.4S, v12.4S // ....................................................................................................................................................................*........................... - mla v13.4S, v9.4S, v29.4S // .....................................................e.......................................................................................................................................... - sub v12.4S, v27.4S, v10.4S // ..............................................................................................................................................................................*................. - str q15, [x0, #752] // ............................................................................................................................................................................................*... 
- str q11, [x0, #688] // ...........................................................................................................................................................................................*.... - add v15.4S, v23.4S, v25.4S // ................................................................................................................................................................*............................... - sqrdmulh v9.4S, v20.4S, v0.S[1] // ...........................e.................................................................................................................................................................... - sub v25.4S, v23.4S, v25.4S // ...............................................................................................................................................................*................................ - ldr q23, [x0, #256] // ....e........................................................................................................................................................................................... - ldr q11, [x0, #832] // .............e.................................................................................................................................................................................. - add v24.4S, v21.4S, v13.4S // .......................................................e........................................................................................................................................ - sub v20.4S, v21.4S, v13.4S // ......................................................e......................................................................................................................................... 
- str q12, [x0, #944] // ...............................................................................................................................................................................................* - add v21.4S, v14.4S, v28.4S // ..................................................e............................................................................................................................................. - str q8, [x0, #816] // .............................................................................................................................................................................................*.. - add v8.4S, v18.4S, v17.4S // .................................................................................................................................................*.............................................. - ldr q13, [x0, #576] // .........e...................................................................................................................................................................................... - mla v22.4S, v26.4S, v29.4S // ......................................e......................................................................................................................................................... - ldr q12, [x0, #512] // ........e....................................................................................................................................................................................... - str q25, [x0, #560] // .........................................................................................................................................................................................*...... - sub v25.4S, v18.4S, v17.4S // ................................................................................................................................................*............................................... 
- sqrdmulh v17.4S, v31.4S, v0.S[1] // ................................e............................................................................................................................................................... - sub v26.4S, v30.4S, v19.4S // ..........................................................................................................................................................*..................................... - mul v19.4S, v31.4S, v0.S[0] // ...............................e................................................................................................................................................................ - add v30.4S, v27.4S, v10.4S // ...............................................................................................................................................................................*................ - sqrdmulh v18.4S, v24.4S, v0.S[3] // ........................................................................e....................................................................................................................... - mla v16.4S, v9.4S, v29.4S // ............................e................................................................................................................................................................... - mul v9.4S, v21.4S, v0.S[2] // ..................................................................e............................................................................................................................. - str q8, [x0, #112] // ..................................................................................................................................................................................*............. 
- sub v27.4S, v23.4S, v22.4S // .......................................e........................................................................................................................................................ - sub v10.4S, v14.4S, v28.4S // .................................................e.............................................................................................................................................. - sqrdmulh v31.4S, v11.4S, v0.S[1] // ..........................................e..................................................................................................................................................... - mul v28.4S, v11.4S, v0.S[0] // .........................................e...................................................................................................................................................... - mla v19.4S, v17.4S, v29.4S // .................................e.............................................................................................................................................................. - mul v17.4S, v24.4S, v0.S[2] // .......................................................................e........................................................................................................................ - sqrdmulh v11.4S, v13.4S, v0.S[1] // ......................e......................................................................................................................................................................... - mul v13.4S, v13.4S, v0.S[0] // .....................e.......................................................................................................................................................................... 
- add v14.4S, v23.4S, v22.4S // ........................................e....................................................................................................................................................... - sqrdmulh v22.4S, v12.4S, v0.S[1] // .................e.............................................................................................................................................................................. - str q25, [x0, #176] // ...................................................................................................................................................................................*............ - mul v24.4S, v12.4S, v0.S[0] // ................e............................................................................................................................................................................... - sqrdmulh v12.4S, v21.4S, v0.S[3] // ...................................................................e............................................................................................................................ - ldr q21, [x0, #320] // .....e.......................................................................................................................................................................................... - ldr q25, [x0, #192] // ...e............................................................................................................................................................................................ - sqrdmulh v8.4S, v27.4S, v1.S[1] // .............................................................................e.................................................................................................................. 
- str q15, [x0, #496] // ........................................................................................................................................................................................*....... - ldr q15, [x0, #128] // ..e............................................................................................................................................................................................. - mla v28.4S, v31.4S, v29.4S // ...........................................e.................................................................................................................................................... - mla v13.4S, v11.4S, v29.4S // .......................e........................................................................................................................................................................ - mul v11.4S, v20.4S, v1.S[0] // ...........................................................................................e.................................................................................................... - str q26, [x0, #432] // .......................................................................................................................................................................................*........ - mla v17.4S, v18.4S, v29.4S // .........................................................................e...................................................................................................................... - mul v31.4S, v27.4S, v1.S[0] // ............................................................................e................................................................................................................... 
- sqrdmulh v27.4S, v20.4S, v1.S[1] // ............................................................................................e................................................................................................... - sqrdmulh v20.4S, v10.4S, v1.S[1] // .......................................................................................e........................................................................................................ - mul v18.4S, v10.4S, v1.S[0] // ......................................................................................e......................................................................................................... - mla v9.4S, v12.4S, v29.4S // ....................................................................e........................................................................................................................... - str q30, [x0, #880] // ..............................................................................................................................................................................................*. - mul v23.4S, v14.4S, v0.S[2] // ........................................................e....................................................................................................................................... - sub v26.4S, v21.4S, v28.4S // ............................................e................................................................................................................................................... - add v21.4S, v21.4S, v28.4S // .............................................e.................................................................................................................................................. 
- ldr q28, [x0, #64] // .e.............................................................................................................................................................................................. - add v30.4S, v25.4S, v19.4S // ...................................e............................................................................................................................................................ - mla v31.4S, v8.4S, v29.4S // ..............................................................................e................................................................................................................. - add v8.4S, v15.4S, v16.4S // ..............................e................................................................................................................................................................. - mla v24.4S, v22.4S, v29.4S // ..................e............................................................................................................................................................................. - sub v15.4S, v15.4S, v16.4S // .............................e.................................................................................................................................................................. - mla v11.4S, v27.4S, v29.4S // .............................................................................................e.................................................................................................. - mul v12.4S, v26.4S, v1.S[0] // .................................................................................e.............................................................................................................. 
- sqrdmulh v22.4S, v26.4S, v1.S[1] // ..................................................................................e............................................................................................................. - sub v19.4S, v25.4S, v19.4S // ..................................e............................................................................................................................................................. - sqrdmulh v10.4S, v21.4S, v0.S[3] // ..............................................................e................................................................................................................................. - mul v16.4S, v21.4S, v0.S[2] // .............................................................e.................................................................................................................................. - mla v18.4S, v20.4S, v29.4S // ........................................................................................e....................................................................................................... - ldr q20, [x0, #0] // e............................................................................................................................................................................................... - sub v27.4S, v30.4S, v17.4S // ..........................................................................e..................................................................................................................... - add v30.4S, v30.4S, v17.4S // ...........................................................................e.................................................................................................................... 
- sqrdmulh v17.4S, v14.4S, v0.S[3] // .........................................................e...................................................................................................................................... - sub v14.4S, v28.4S, v13.4S // ........................e....................................................................................................................................................................... - mla v12.4S, v22.4S, v29.4S // ...................................................................................e............................................................................................................ - sub v21.4S, v8.4S, v9.4S // .....................................................................e.......................................................................................................................... - add v25.4S, v19.4S, v11.4S // ...............................................................................................e................................................................................................ - add v28.4S, v28.4S, v13.4S // .........................e...................................................................................................................................................................... - sub v26.4S, v15.4S, v18.4S // .........................................................................................e...................................................................................................... - add v13.4S, v15.4S, v18.4S // ..........................................................................................e..................................................................................................... 
- mla v16.4S, v10.4S, v29.4S // ...............................................................e................................................................................................................................ - sqrdmulh v10.4S, v27.4S, v2.S[1] // ................................................................................................................e............................................................................... - add v15.4S, v20.4S, v24.4S // ....................e........................................................................................................................................................................... - add v22.4S, v8.4S, v9.4S // ......................................................................e......................................................................................................................... - mul v8.4S, v27.4S, v2.S[0] // ...............................................................................................................e................................................................................ - mul v27.4S, v21.4S, v2.S[0] // ..........................................................................................................e..................................................................................... - sqrdmulh v18.4S, v21.4S, v2.S[1] // ...........................................................................................................e.................................................................................... - add v9.4S, v14.4S, v12.4S // .....................................................................................e.......................................................................................................... 
- mla v23.4S, v17.4S, v29.4S // ..........................................................e..................................................................................................................................... - sub v14.4S, v14.4S, v12.4S // ....................................................................................e........................................................................................................... - sub v21.4S, v28.4S, v16.4S // ................................................................e............................................................................................................................... - add v12.4S, v28.4S, v16.4S // .................................................................e.............................................................................................................................. - sub v28.4S, v20.4S, v24.4S // ...................e............................................................................................................................................................................ - sub v20.4S, v19.4S, v11.4S // ..............................................................................................e................................................................................................. 
+ // Instructions: 192 + // Expected cycles: 24 + // Expected IPC: 8.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + add v30.4S, v18.4S, v25.4S // ...........*.................................................................................................................................................................................... + sub v8.4S, v18.4S, v25.4S // *............................................................................................................................................................................................... + sqrdmulh v25.4S, v12.4S, v1.S[3] // .....................*.......................................................................................................................................................................... + mul v12.4S, v12.4S, v1.S[2] // ............................*................................................................................................................................................................... + mla v10.4S, v27.4S, v29.4S // ....*........................................................................................................................................................................................... + sub v11.4S, v28.4S, v11.4S // ........*....................................................................................................................................................................................... 
+ sqrdmulh v26.4S, v26.4S, v3.S[1] // ..........................*..................................................................................................................................................................... + add v18.4S, v19.4S, v15.4S // ...*............................................................................................................................................................................................ + mla v16.4S, v17.4S, v29.4S // .*.............................................................................................................................................................................................. + sub v28.4S, v19.4S, v15.4S // ............*................................................................................................................................................................................... + ldr q19, [x0, #656] // ...................................................................................................*............................................................................................ + mla v20.4S, v9.4S, v29.4S // ..........*..................................................................................................................................................................................... + mla v12.4S, v25.4S, v29.4S // ........................................*....................................................................................................................................................... + sqrdmulh v25.4S, v23.4S, v2.S[3] // ......*......................................................................................................................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v4.S[3] // ....................*........................................................................................................................................................................... + mul v27.4S, v18.4S, v4.S[2] // .........*...................................................................................................................................................................................... + mul v18.4S, v22.4S, v2.S[2] // .......*........................................................................................................................................................................................ + sqrdmulh v15.4S, v28.4S, v5.S[1] // ...............................................*................................................................................................................................................ + add v9.4S, v31.4S, v10.4S // ......................*......................................................................................................................................................................... + mla v14.4S, v26.4S, v29.4S // .............................................*.................................................................................................................................................. + mul v26.4S, v11.4S, v2.S[0] // .......................*........................................................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v2.S[1] // ...................*............................................................................................................................................................................ 
+ sqrdmulh v22.4S, v22.4S, v2.S[3] // ..*............................................................................................................................................................................................. + sub v10.4S, v31.4S, v10.4S // .................*.............................................................................................................................................................................. + add v31.4S, v21.4S, v24.4S // ....................................*........................................................................................................................................................... + sub v21.4S, v21.4S, v24.4S // ..............*................................................................................................................................................................................. + mul v24.4S, v23.4S, v2.S[2] // .....*.......................................................................................................................................................................................... + mla v27.4S, v17.4S, v29.4S // .........................................*...................................................................................................................................................... + mul v28.4S, v28.4S, v5.S[0] // .......................................................*........................................................................................................................................ + mla v26.4S, v11.4S, v29.4S // ................................*............................................................................................................................................................... 
+ mla v18.4S, v22.4S, v29.4S // ................*............................................................................................................................................................................... + sub v22.4S, v10.4S, v20.4S // .........................*...................................................................................................................................................................... + add v17.4S, v31.4S, v12.4S // .....................................................*.......................................................................................................................................... + sub v11.4S, v13.4S, v16.4S // .............*.................................................................................................................................................................................. + add v23.4S, v13.4S, v16.4S // ...............*................................................................................................................................................................................ + sub v13.4S, v31.4S, v12.4S // ....................................................*........................................................................................................................................... + mla v24.4S, v25.4S, v29.4S // ..................*............................................................................................................................................................................. + add v16.4S, v10.4S, v20.4S // ........................*....................................................................................................................................................................... 
+ add v10.4S, v21.4S, v26.4S // ...................................................*............................................................................................................................................ + mul v31.4S, v11.4S, v4.S[0] // ...........................*.................................................................................................................................................................... + sqrdmulh v12.4S, v11.4S, v4.S[1] // ..................................*............................................................................................................................................................. + sqrdmulh v11.4S, v16.4S, v6.S[3] // .......................................*........................................................................................................................................................ + add v20.4S, v9.4S, v18.4S // .................................*.............................................................................................................................................................. + sub v9.4S, v9.4S, v18.4S // ...............................*................................................................................................................................................................ + sub v25.4S, v21.4S, v26.4S // .................................................*.............................................................................................................................................. + ldr q26, [x0, #912] // ......................................................................................................*......................................................................................... 
+ mul v21.4S, v16.4S, v6.S[2] // ......................................*......................................................................................................................................................... + mla v28.4S, v15.4S, v29.4S // .................................................................*.............................................................................................................................. + sub v16.4S, v10.4S, v27.4S // ...........................................................*.................................................................................................................................... + add v27.4S, v10.4S, v27.4S // ........................................................*....................................................................................................................................... + sqrdmulh v10.4S, v22.4S, v7.S[1] // .........................................................*...................................................................................................................................... + mul v22.4S, v22.4S, v7.S[0] // ......................................................*......................................................................................................................................... + mla v31.4S, v12.4S, v29.4S // ..................................................*............................................................................................................................................. + sqrdmulh v12.4S, v20.4S, v5.S[3] // ...........................................*.................................................................................................................................................... 
+ str q27, [x0, #256] // ....................................................................*........................................................................................................................... + str q16, [x0, #320] // ..................................................................*............................................................................................................................. + mla v21.4S, v11.4S, v29.4S // ...................................................................*............................................................................................................................ + mul v27.4S, v20.4S, v5.S[2] // ..........................................*..................................................................................................................................................... + mul v15.4S, v9.4S, v6.S[0] // ............................................*................................................................................................................................................... + add v16.4S, v8.4S, v14.4S // ......................................................................*......................................................................................................................... + sqrdmulh v20.4S, v9.4S, v6.S[1] // ..............................................*................................................................................................................................................. + sub v9.4S, v8.4S, v14.4S // ................................................................*............................................................................................................................... 
+ ldr q14, [x0, #848] // ....................................................................................................*........................................................................................... + mla v27.4S, v12.4S, v29.4S // ..........................................................*..................................................................................................................................... + add v12.4S, v25.4S, v28.4S // ........................................................................................*....................................................................................................... + sub v25.4S, v25.4S, v28.4S // ...................................................................................*............................................................................................................ + ldr q28, [x0, #528] // .................................................................................................*.............................................................................................. + add v18.4S, v30.4S, v24.4S // .............................*.................................................................................................................................................................. + mla v22.4S, v10.4S, v29.4S // .......................................................................*........................................................................................................................ + sqrdmulh v10.4S, v26.4S, v0.S[1] // .....................................................................................................................*.......................................................................... 
+ mul v8.4S, v26.4S, v0.S[0] // ....................................................................................................................*........................................................................... + sub v26.4S, v13.4S, v31.4S // .....................................................................*.......................................................................................................................... + str q12, [x0, #384] // ..............................................................................................*................................................................................................. + add v12.4S, v13.4S, v31.4S // .............................................................*.................................................................................................................................. + add v31.4S, v16.4S, v21.4S // .................................................................................*.............................................................................................................. + sub v11.4S, v16.4S, v21.4S // ......................................................................................*......................................................................................................... + sub v16.4S, v30.4S, v24.4S // ..............................*................................................................................................................................................................. + str q26, [x0, #192] // ...............................................................................*................................................................................................................ 
+ ldr q13, [x0, #400] // .....................................................................................................................................*.......................................................... + mla v15.4S, v20.4S, v29.4S // ............................................................*................................................................................................................................... + mla v8.4S, v10.4S, v29.4S // ....................................................................................................................................*........................................................... + ldr q30, [x0, #592] // .......................................................................................................*........................................................................................ + mul v20.4S, v23.4S, v3.S[2] // .....................................*.......................................................................................................................................................... + ldr q10, [x0, #976] // .....................................................................................................*.......................................................................................... + str q12, [x0, #128] // ..............................................................................*................................................................................................................. + str q31, [x0, #768] // ..........................................................................................*..................................................................................................... + sqrdmulh v21.4S, v28.4S, v0.S[1] // ............................................................................................................................*................................................................... 
+ sqrdmulh v31.4S, v19.4S, v0.S[1] // ................................................................................................................................*............................................................... + add v12.4S, v18.4S, v27.4S // ........................................................................*....................................................................................................................... + mul v26.4S, v19.4S, v0.S[0] // ..............................................................................................................................*................................................................. + str q11, [x0, #832] // ............................................................................................*................................................................................................... + sqrdmulh v19.4S, v23.4S, v3.S[3] // ...................................*............................................................................................................................................................ + ldr q11, [x0, #720] // ................................................................................................*............................................................................................... + sqrdmulh v23.4S, v14.4S, v0.S[1] // ..............................................................................................................*................................................................................. + mul v24.4S, v14.4S, v0.S[0] // ...............................................................................................................*................................................................................ 
+ sub v14.4S, v18.4S, v27.4S // .........................................................................*...................................................................................................................... + mla v20.4S, v19.4S, v29.4S // ................................................*............................................................................................................................................... + sqrdmulh v19.4S, v10.4S, v0.S[1] // .................................................................................................................*.............................................................................. + str q14, [x0, #576] // ..................................................................................*............................................................................................................. + ldr q18, [x0, #784] // ..................................................................................................*............................................................................................. + mul v14.4S, v10.4S, v0.S[0] // ................................................................................................................*............................................................................... + str q12, [x0, #512] // ................................................................................*............................................................................................................... + ldr q10, [x0, #336] // ........................................................................................................................*....................................................................... 
+ add v12.4S, v13.4S, v8.4S // ..............................................................................................................................................*................................................. + mla v24.4S, v23.4S, v29.4S // ..........................................................................................................................*..................................................................... + sub v23.4S, v16.4S, v15.4S // .............................................................................*.................................................................................................................. + str q25, [x0, #448] // .........................................................................................*...................................................................................................... + mla v26.4S, v31.4S, v29.4S // ..............................................................................................................................................................*................................. + sub v31.4S, v9.4S, v22.4S // .......................................................................................*........................................................................................................ + add v25.4S, v16.4S, v15.4S // ............................................................................*................................................................................................................... + mul v27.4S, v30.4S, v0.S[0] // .................................................................................................................................*.............................................................. 
+ ldr q15, [x0, #464] // .........................................................................................................*...................................................................................... + sqrdmulh v16.4S, v12.4S, v0.S[3] // ...............................................................................................................................................................*................................ + mla v14.4S, v19.4S, v29.4S // .......................................................................................................................*........................................................................ + str q23, [x0, #704] // ....................................................................................*........................................................................................................... + str q31, [x0, #960] // .............................................................................................*.................................................................................................. + mul v23.4S, v28.4S, v0.S[0] // ...................................................................................................................*............................................................................ + sub v19.4S, v17.4S, v20.4S // ...............................................................*................................................................................................................................ + add v28.4S, v17.4S, v20.4S // ..............................................................*................................................................................................................................. 
+ mul v20.4S, v18.4S, v0.S[0] // ..........................................................................................................*..................................................................................... + sqrdmulh v18.4S, v18.4S, v0.S[1] // ...........................................................................................................*.................................................................................... + add v9.4S, v9.4S, v22.4S // ...........................................................................................*.................................................................................................... + str q25, [x0, #640] // .....................................................................................*.......................................................................................................... + add v31.4S, v10.4S, v24.4S // .................................................................................................................................................*.............................................. + sqrdmulh v17.4S, v11.4S, v0.S[1] // .............................................................................................................*.................................................................................. + sub v25.4S, v13.4S, v8.4S // ............................................................................................................................................*................................................... + mul v13.4S, v11.4S, v0.S[0] // ............................................................................................................*................................................................................... 
+ ldr q22, [x0, #144] // .............................................................................................................................................................*.................................. + mul v11.4S, v12.4S, v0.S[2] // ................................................................................................................................................................*............................... + str q19, [x0, #64] // ..........................................................................*..................................................................................................................... + str q28, [x0], #(16) // ...........................................................................*.................................................................................................................... + add v28.4S, v15.4S, v14.4S // ...................................................................................................................................*............................................................ + sub v14.4S, v15.4S, v14.4S // ..................................................................................................................................*............................................................. + sqrdmulh v15.4S, v31.4S, v0.S[3] // ..................................................................................................................................................................*............................. + str q9, [x0, #880] // ...............................................................................................*................................................................................................ 
+ sqrdmulh v9.4S, v30.4S, v0.S[1] // ...............................................................................................................................*................................................................ + ldr q30, [x0, #256] // .........................................................................................................................*...................................................................... + sqrdmulh v12.4S, v25.4S, v1.S[1] // ........................................................................................................................................................*....................................... + mla v20.4S, v18.4S, v29.4S // ...........................................................................................................................*.................................................................... + ldr q18, [x0, #192] // ......................................................................................................................*......................................................................... + mul v25.4S, v25.4S, v1.S[0] // ....................................................................................................................................................*........................................... + mla v23.4S, v21.4S, v29.4S // .......................................................................................................................................*........................................................ + mla v11.4S, v16.4S, v29.4S // ............................................................................................................................................................................*................... 
+ mla v13.4S, v17.4S, v29.4S // .............................................................................................................................*.................................................................. + sqrdmulh v8.4S, v28.4S, v0.S[3] // .........................................................................................................................................*...................................................... + mul v19.4S, v28.4S, v0.S[2] // ...........................................................................................................................................*.................................................... + mla v27.4S, v9.4S, v29.4S // ......................................................................................................................................*......................................................... + sub v9.4S, v10.4S, v24.4S // ...................................................................................................................................................*............................................ + add v28.4S, v22.4S, v26.4S // ....................................................................................................................................................................................*........... + ldr q21, [x0, #0] // ..................................................................................................................*............................................................................. + mul v16.4S, v31.4S, v0.S[2] // .................................................................................................................................................................*.............................. 
+ mul v10.4S, v14.4S, v1.S[0] // .............................................................................................................................................*.................................................. + sub v31.4S, v30.4S, v20.4S // ........................................................................................................................................*....................................................... + sqrdmulh v17.4S, v14.4S, v1.S[1] // ...........................................................................................................................................................*.................................... + add v14.4S, v30.4S, v20.4S // ..........................................................................................................................................*..................................................... + mla v25.4S, v12.4S, v29.4S // .......................................................................................................................................................................*........................ + add v12.4S, v28.4S, v11.4S // ............................................................................................................................................................................................*... + mla v19.4S, v8.4S, v29.4S // ......................................................................................................................................................*......................................... + add v20.4S, v18.4S, v13.4S // ............................................................................................................................................................*................................... 
+ ldr q8, [x0, #64] // ........................................................................................................*....................................................................................... + mla v16.4S, v15.4S, v29.4S // ...........................................................................................................................................................................*.................... + sub v30.4S, v22.4S, v26.4S // ................................................................................................................................................................................*............... + sub v22.4S, v18.4S, v13.4S // ..........................................................................................................................................................*..................................... + sqrdmulh v13.4S, v14.4S, v0.S[3] // ..................................................................................................................................................*............................................. + mul v24.4S, v14.4S, v0.S[2] // .........................................................................................................................................................*...................................... + sqrdmulh v14.4S, v31.4S, v1.S[1] // ...............................................................................................................................................*................................................ + sub v18.4S, v21.4S, v23.4S // .....................................................................................................................................................*.......................................... 
+ mla v10.4S, v17.4S, v29.4S // ........................................................................................................................................................................*....................... + sub v17.4S, v20.4S, v19.4S // ..........................................................................................................................................................................*..................... + add v21.4S, v21.4S, v23.4S // .......................................................................................................................................................*........................................ + add v20.4S, v20.4S, v19.4S // ......................................................................................................................................................................*......................... + sub v26.4S, v30.4S, v25.4S // ......................................................................................................................................................................................*......... + add v19.4S, v8.4S, v27.4S // ...................................................................................................................................................................*............................ + add v23.4S, v30.4S, v25.4S // ..........................................................................................................................................................................................*..... + mul v25.4S, v31.4S, v1.S[0] // ................................................................................................................................................*............................................... 
+ sub v31.4S, v8.4S, v27.4S // .....................................................................................................................................................................*.......................... + mul v15.4S, v17.4S, v2.S[0] // ...............................................................................................................................................................................*................ + sqrdmulh v30.4S, v17.4S, v2.S[1] // .............................................................................................................................................................................*.................. + sqrdmulh v17.4S, v20.4S, v1.S[3] // ........................................................................................................................................................................................*....... + sub v8.4S, v22.4S, v10.4S // ..................................................................................................................................................................................*............. + mla v24.4S, v13.4S, v29.4S // .................................................................................................................................................................................*.............. + add v13.4S, v19.4S, v16.4S // .......................................................................................................................................................................................*........ + sub v19.4S, v19.4S, v16.4S // ...........................................................................................................................................................................................*.... 
+ mul v16.4S, v20.4S, v1.S[2] // ..............................................................................................................................................................................*................. + sqrdmulh v27.4S, v9.4S, v1.S[1] // .........................................................................................................................................................................*...................... + mul v20.4S, v8.4S, v3.S[0] // ..............................................................................................................................................................................................*. + add v22.4S, v22.4S, v10.4S // ...................................................................................................................................................................................*............ + mul v10.4S, v9.4S, v1.S[0] // .....................................................................................................................................................................................*.......... + mla v25.4S, v14.4S, v29.4S // ....................................................................................................................................................................*........................... + sqrdmulh v9.4S, v8.4S, v3.S[1] // .............................................................................................................................................................................................*.. + mla v15.4S, v30.4S, v29.4S // .........................................................................................................................................................................................*...... 
+ mul v14.4S, v26.4S, v3.S[0] // ...............................................................................................................................................................................................* - // original source code - // ldr q8, [x0, #0] // ....................................................................................................................................e........................|...................................................................................................................................................... - // ldr q9, [x0, #(1*(512/8))] // .......................................................................................................................e.....................................|...................................................................................................................................................... - // ldr q10, [x0, #(2*(512/8))] // ........................................................................................................e....................................................|..........................................................................................................................................e........... - // ldr q11, [x0, #(3*(512/8))] // .....................................................................................................e.......................................................|.......................................................................................................................................e.............. - // ldr q12, [x0, #(4*(512/8))] // ..................................................................e..........................................................................................|....................................................................................................e................................................. 
- // ldr q13, [x0, #(5*(512/8))] // ....................................................................................................e........................................................|......................................................................................................................................e............... - // ldr q14, [x0, #(6*(512/8))] // ................................................e............................................................................................................|..................................................................................e................................................................... - // ldr q15, [x0, #(7*(512/8))] // ........................................................e....................................................................................................|..........................................................................................e........................................................... - // ldr q16, [x0, #(8*(512/8))] // ............................................................................e................................................................................|..............................................................................................................e....................................... - // ldr q17, [x0, #(9*(512/8))] // ..........................................................................e..................................................................................|............................................................................................................e......................................... 
- // ldr q18, [x0, #(10*(512/8))] // .........................................e...................................................................................................................|...........................................................................e.......................................................................... - // ldr q19, [x0, #(11*(512/8))] // .....................................................e.......................................................................................................|.......................................................................................e.............................................................. - // ldr q20, [x0, #(12*(512/8))] // .............................e...............................................................................................................................|...............................................................e...................................................................................... - // ldr q21, [x0, #(13*(512/8))] // ...................................................................e.........................................................................................|.....................................................................................................e................................................ - // ldr q22, [x0, #(14*(512/8))] // ....................e........................................................................................................................................|......................................................e............................................................................................... 
- // ldr q23, [x0, #(15*(512/8))] // e............................................................................................................................................................|..................................e................................................................................................................... - // mul v24.4s, v16.4s, v0.s[0] // ..................................................................................................e..........................................................|....................................................................................................................................e................. - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ................................................................................................e............................................................|..................................................................................................................................e................... - // mla v24.4s, v16.4s, v29.4s // ...........................................................................................................................e.................................|...................................................................................................................................................... - // sub v16.4s, v8.4s, v24.4s // ...........................................................................................................................................................e.|...................................................................................................................................................... 
- // add v8.4s, v8.4s, v24.4s // .................................................................................................................................................e...........|...................................................................................................................................................... - // mul v24.4s, v17.4s, v0.s[0] // ..............................................................................................e..............................................................|................................................................................................................................e..................... - // sqrdmulh v17.4s, v17.4s, v0.s[1] // .............................................................................................e...............................................................|...............................................................................................................................e...................... - // mla v24.4s, v17.4s, v29.4s // ..........................................................................................................e..................................................|............................................................................................................................................e......... - // sub v17.4s, v9.4s, v24.4s // ........................................................................................................................................e....................|...................................................................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ............................................................................................................................................e................|...................................................................................................................................................... - // mul v24.4s, v18.4s, v0.s[0] // .........................................................e...................................................................................................|...........................................................................................e.......................................................... - // sqrdmulh v18.4s, v18.4s, v0.s[1] // ................................................................e............................................................................................|..................................................................................................e................................................... - // mla v24.4s, v18.4s, v29.4s // ....................................................................................e........................................................................|......................................................................................................................e............................... - // sub v18.4s, v10.4s, v24.4s // ............................................................................................................................e................................|...................................................................................................................................................... 
- // add v10.4s, v10.4s, v24.4s // ..........................................................................................................................e..................................|...................................................................................................................................................... - // mul v24.4s, v19.4s, v0.s[0] // .................................................................................e...........................................................................|...................................................................................................................e.................................. - // sqrdmulh v19.4s, v19.4s, v0.s[1] // ...............................................................................e.............................................................................|.................................................................................................................e.................................... - // mla v24.4s, v19.4s, v29.4s // ...........................................................................................e.................................................................|.............................................................................................................................e........................ - // sub v19.4s, v11.4s, v24.4s // ................................................................................................................................e............................|...................................................................................................................................................... 
- // add v11.4s, v11.4s, v24.4s // ........................................................................................................................e....................................|...................................................................................................................................................... - // mul v24.4s, v20.4s, v0.s[0] // ...................................................e.........................................................................................................|.....................................................................................e................................................................ - // sqrdmulh v20.4s, v20.4s, v0.s[1] // .................................................e...........................................................................................................|...................................................................................e.................................................................. - // mla v24.4s, v20.4s, v29.4s // ...........................................................................e.................................................................................|.............................................................................................................e........................................ - // sub v20.4s, v12.4s, v24.4s // .......................................................................................e.....................................................................|.........................................................................................................................e............................ 
- // add v12.4s, v12.4s, v24.4s // ...............................................................................................e.............................................................|.................................................................................................................................e.................... - // mul v24.4s, v21.4s, v0.s[0] // ..........................................................................................e..................................................................|............................................................................................................................e......................... - // sqrdmulh v21.4s, v21.4s, v0.s[1] // .........................................................................................e...................................................................|...........................................................................................................................e.......................... - // mla v24.4s, v21.4s, v29.4s // .........................................................................................................e...................................................|...........................................................................................................................................e.......... - // sub v21.4s, v13.4s, v24.4s // .....................................................................................................................e.......................................|...................................................................................................................................................... 
- // add v13.4s, v13.4s, v24.4s // ......................................................................................................................e......................................|...................................................................................................................................................... - // mul v24.4s, v22.4s, v0.s[0] // .................................e...........................................................................................................................|...................................................................e.................................................................................. - // sqrdmulh v22.4s, v22.4s, v0.s[1] // ..................................e..........................................................................................................................|....................................................................e................................................................................. - // mla v24.4s, v22.4s, v29.4s // ..................................................e..........................................................................................................|....................................................................................e................................................................. - // sub v22.4s, v14.4s, v24.4s // ........................................................................................e....................................................................|..........................................................................................................................e........................... 
- // add v14.4s, v14.4s, v24.4s // .......................................................................e.....................................................................................|.........................................................................................................e............................................ - // mul v24.4s, v23.4s, v0.s[0] // .............................................e...............................................................................................................|...............................................................................e...................................................................... - // sqrdmulh v23.4s, v23.4s, v0.s[1] // ..............................................e..............................................................................................................|................................................................................e..................................................................... - // mla v24.4s, v23.4s, v29.4s // ...........................................................e.................................................................................................|.............................................................................................e........................................................ - // sub v23.4s, v15.4s, v24.4s // .....................................................................e.......................................................................................|.......................................................................................................e.............................................. 
- // add v15.4s, v15.4s, v24.4s // ....................................................................e........................................................................................|......................................................................................................e............................................... - // mul v24.4s, v12.4s, v0.s[2] // ....................................................................................................................e........................................|...................................................................................................................................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // .......................................................................................................................................e.....................|...................................................................................................................................................... - // mla v24.4s, v12.4s, v29.4s // .......................................................................................................................................................e.....|...................................................................................................................................................... - // sub v12.4s, v8.4s, v24.4s // .............................................................................................................................................................|.......*.............................................................................................................................................. 
- // add v8.4s, v8.4s, v24.4s // .............................................................................................................................................................|.........*............................................................................................................................................ - // mul v24.4s, v13.4s, v0.s[2] // ..................................................................................................................................e..........................|...................................................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v0.s[3] // .................................................................................................................................e...........................|...................................................................................................................................................... - // mla v24.4s, v13.4s, v29.4s // ...............................................................................................................................................e.............|...................................................................................................................................................... - // sub v13.4s, v9.4s, v24.4s // .........................................................................................................................................................e...|...................................................................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ..........................................................................................................................................................e..|...................................................................................................................................................... - // mul v24.4s, v14.4s, v0.s[2] // .....................................................................................e.......................................................................|.......................................................................................................................e.............................. - // sqrdmulh v14.4s, v14.4s, v0.s[3] // ...................................................................................................e.........................................................|.....................................................................................................................................e................ - // mla v24.4s, v14.4s, v29.4s // ..................................................................................................................e..........................................|....................................................................................................................................................e. - // sub v14.4s, v10.4s, v24.4s // ..........................................................................................................................................e..................|...................................................................................................................................................... 
- // add v10.4s, v10.4s, v24.4s // ..................................................................................................................................................e..........|...................................................................................................................................................... - // mul v24.4s, v15.4s, v0.s[2] // ............................................................................................e................................................................|..............................................................................................................................e....................... - // sqrdmulh v15.4s, v15.4s, v0.s[3] // ...................................................................................e.........................................................................|.....................................................................................................................e................................ - // mla v24.4s, v15.4s, v29.4s // .............................................................................................................e...............................................|...............................................................................................................................................e...... - // sub v15.4s, v11.4s, v24.4s // .....................................................................................................................................e.......................|...................................................................................................................................................... 
- // add v11.4s, v11.4s, v24.4s // ......................................................................................................................................e......................|...................................................................................................................................................... - // mul v24.4s, v20.4s, v1.s[0] // ..............................................................................................................e..............................................|................................................................................................................................................e..... - // sqrdmulh v20.4s, v20.4s, v1.s[1] // ......................................................................................................e......................................................|........................................................................................................................................e............. - // mla v24.4s, v20.4s, v29.4s // .........................................................................................................................e...................................|...................................................................................................................................................... - // sub v20.4s, v16.4s, v24.4s // ...........*.................................................................................................................................................|.............................................*........................................................................................................ 
- // add v16.4s, v16.4s, v24.4s // ........*....................................................................................................................................................|..........................................*........................................................................................................... - // mul v24.4s, v21.4s, v1.s[0] // ..............................................................................................................................e..............................|...................................................................................................................................................... - // sqrdmulh v21.4s, v21.4s, v1.s[1] // ...............................................................................................................................e.............................|...................................................................................................................................................... - // mla v24.4s, v21.4s, v29.4s // .........................................................................................................................................e...................|...................................................................................................................................................... - // sub v21.4s, v17.4s, v24.4s // ........................................................................................................................................................e....|...................................................................................................................................................... 
- // add v17.4s, v17.4s, v24.4s // ......................................................................................................................................................e......|...................................................................................................................................................... - // mul v24.4s, v22.4s, v1.s[0] // .................................................................................................................e...........................................|...................................................................................................................................................e.. - // sqrdmulh v22.4s, v22.4s, v1.s[1] // ................................................................................................................e............................................|..................................................................................................................................................e... - // mla v24.4s, v22.4s, v29.4s // ...................................................................................................................................e.........................|...................................................................................................................................................... - // sub v22.4s, v18.4s, v24.4s // .............................................................................................................................................e...............|...................................................................................................................................................... 
- // add v18.4s, v18.4s, v24.4s // ..............................................................................................................................................e..............|...................................................................................................................................................... - // mul v24.4s, v23.4s, v1.s[0] // ...........................................................................................................e.................................................|.............................................................................................................................................e........ - // sqrdmulh v23.4s, v23.4s, v1.s[1] // ...............................................................................................................e.............................................|.................................................................................................................................................e.... - // mla v24.4s, v23.4s, v29.4s // .............................................................................................................................e...............................|...................................................................................................................................................... - // sub v23.4s, v19.4s, v24.4s // ............................................................................................................................................................e|...................................................................................................................................................... 
- // add v19.4s, v19.4s, v24.4s // ...........................................................................................................................................e.................|...................................................................................................................................................... - // mul v24.4s, v10.4s, v1.s[2] // .............................................................................................................................................................|........................*............................................................................................................................. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .............................................................................................................................................................|......*............................................................................................................................................... - // mla v24.4s, v10.4s, v29.4s // ..*..........................................................................................................................................................|....................................*................................................................................................................. - // sub v10.4s, v8.4s, v24.4s // ..........*..................................................................................................................................................|............................................*......................................................................................................... 
- // add v8.4s, v8.4s, v24.4s // ...............*.............................................................................................................................................|.................................................*.................................................................................................... - // mul v24.4s, v11.4s, v1.s[2] // .............................................................................................................................................................|.*.................................................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v1.s[3] // .............................................................................................................................................................|..*................................................................................................................................................... - // mla v24.4s, v11.4s, v29.4s // .............................................................................................................................................................|............*......................................................................................................................................... - // sub v11.4s, v9.4s, v24.4s // .............................................................................................................................................................|..........................*........................................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // .............................................................................................................................................................|.........................*............................................................................................................................ - // mul v24.4s, v14.4s, v2.s[0] // ....................................................................................................................................................e........|...................................................................................................................................................... - // sqrdmulh v14.4s, v14.4s, v2.s[1] // .....................................................................................................................................................e.......|...................................................................................................................................................... - // mla v24.4s, v14.4s, v29.4s // .............................................................................................................................................................|........*............................................................................................................................................. - // sub v14.4s, v12.4s, v24.4s // .............................................................................................................................................................|..................*................................................................................................................................... 
- // add v12.4s, v12.4s, v24.4s // .............................................................................................................................................................|...........................*.......................................................................................................................... - // mul v24.4s, v15.4s, v2.s[0] // ...................................................................................................................................................e.........|...................................................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v2.s[1] // ................................................................................................................................................e............|...................................................................................................................................................... - // mla v24.4s, v15.4s, v29.4s // .............................................................................................................................................................*...................................................................................................................................................... - // sub v15.4s, v13.4s, v24.4s // .............................................................................................................................................................|.............*........................................................................................................................................ 
- // add v13.4s, v13.4s, v24.4s // .............................................................................................................................................................|..............*....................................................................................................................................... - // mul v24.4s, v18.4s, v2.s[2] // .............................................................................................................................................................|...........*.......................................................................................................................................... - // sqrdmulh v18.4s, v18.4s, v2.s[3] // .............................................................................................................................................................|..........*........................................................................................................................................... - // mla v24.4s, v18.4s, v29.4s // .............................................................................................................................................................|.............................*........................................................................................................................ - // sub v18.4s, v16.4s, v24.4s // ......................*......................................................................................................................................|........................................................*............................................................................................. 
- // add v16.4s, v16.4s, v24.4s // .....................*.......................................................................................................................................|.......................................................*.............................................................................................. - // mul v24.4s, v19.4s, v2.s[2] // .............................................................................................................................................................|*..................................................................................................................................................... - // sqrdmulh v19.4s, v19.4s, v2.s[3] // .............................................................................................................................................................|...*.................................................................................................................................................. - // mla v24.4s, v19.4s, v29.4s // .............................................................................................................................................................|.................*.................................................................................................................................... - // sub v19.4s, v17.4s, v24.4s // .............................................................................................................................................................|.......................*.............................................................................................................................. 
- // add v17.4s, v17.4s, v24.4s // .............................................................................................................................................................|......................*............................................................................................................................... - // mul v24.4s, v22.4s, v3.s[0] // .............................................................................................................................................................|................*..................................................................................................................................... - // sqrdmulh v22.4s, v22.4s, v3.s[1] // .............................................................................................................................................................|...................*.................................................................................................................................. - // mla v24.4s, v22.4s, v29.4s // .....*.......................................................................................................................................................|.......................................*.............................................................................................................. - // sub v22.4s, v20.4s, v24.4s // ...............................*.............................................................................................................................|.................................................................*.................................................................................... 
- // add v20.4s, v20.4s, v24.4s // ................................*............................................................................................................................|..................................................................*................................................................................... - // mul v24.4s, v23.4s, v3.s[0] // .............................................................................................................................................................|.....*................................................................................................................................................ - // sqrdmulh v23.4s, v23.4s, v3.s[1] // .............................................................................................................................................................|....*................................................................................................................................................. - // mla v24.4s, v23.4s, v29.4s // .............................................................................................................................................................|...............*...................................................................................................................................... - // sub v23.4s, v21.4s, v24.4s // ..............*..............................................................................................................................................|................................................*..................................................................................................... 
- // add v21.4s, v21.4s, v24.4s // .............................................................................................................................................................|............................*......................................................................................................................... - // mul v24.4s, v9.4s, v3.s[2] // .............................................................................................................................................................|................................*..................................................................................................................... - // sqrdmulh v9.4s, v9.4s, v3.s[3] // .............................................................................................................................................................|...............................*...................................................................................................................... - // mla v24.4s, v9.4s, v29.4s // ................*............................................................................................................................................|..................................................*................................................................................................... - // sub v9.4s, v8.4s, v24.4s // ..........................*..................................................................................................................................|............................................................*......................................................................................... 
- // add v8.4s, v8.4s, v24.4s // .........................*...................................................................................................................................|...........................................................*.......................................................................................... - // mul v24.4s, v11.4s, v4.s[0] // ...................*.........................................................................................................................................|.....................................................*................................................................................................ - // sqrdmulh v11.4s, v11.4s, v4.s[1] // ............................*................................................................................................................................|..............................................................*....................................................................................... - // mla v24.4s, v11.4s, v29.4s // ............................................*................................................................................................................|..............................................................................*....................................................................... - // sub v11.4s, v10.4s, v24.4s // ..............................................................................*..............................................................................|................................................................................................................*..................................... 
- // add v10.4s, v10.4s, v24.4s // .........................................................................*...................................................................................|...........................................................................................................*.......................................... - // mul v24.4s, v13.4s, v4.s[2] // .............................................................................................................................................................|....................*................................................................................................................................. - // sqrdmulh v13.4s, v13.4s, v4.s[3] // .............................................................................................................................................................|.....................*................................................................................................................................ - // mla v24.4s, v13.4s, v29.4s // .............................................................................................................................................................|..............................*....................................................................................................................... - // sub v13.4s, v12.4s, v24.4s // .........*...................................................................................................................................................|...........................................*.......................................................................................................... 
- // add v12.4s, v12.4s, v24.4s // ......*......................................................................................................................................................|........................................*............................................................................................................. - // mul v24.4s, v15.4s, v5.s[0] // ....*........................................................................................................................................................|......................................*............................................................................................................... - // sqrdmulh v15.4s, v15.4s, v5.s[1] // .......*.....................................................................................................................................................|.........................................*............................................................................................................ - // mla v24.4s, v15.4s, v29.4s // ........................*....................................................................................................................................|..........................................................*........................................................................................... - // sub v15.4s, v14.4s, v24.4s // ................................................................................*............................................................................|..................................................................................................................*................................... 
- // add v14.4s, v14.4s, v24.4s // .......................................*.....................................................................................................................|.........................................................................*............................................................................ - // mul v24.4s, v17.4s, v5.s[2] // ...........................*.................................................................................................................................|.............................................................*........................................................................................ - // sqrdmulh v17.4s, v17.4s, v5.s[3] // .............*...............................................................................................................................................|...............................................*...................................................................................................... - // mla v24.4s, v17.4s, v29.4s // ....................................................*........................................................................................................|......................................................................................*............................................................... - // sub v17.4s, v16.4s, v24.4s // .................................................................*...........................................................................................|...................................................................................................*.................................................. 
- // add v16.4s, v16.4s, v24.4s // ...............................................................*.............................................................................................|.................................................................................................*.................................................... - // mul v24.4s, v19.4s, v6.s[0] // .............................................................................................................................................................|.................................*.................................................................................................................... - // sqrdmulh v19.4s, v19.4s, v6.s[1] // .*...........................................................................................................................................................|...................................*.................................................................................................................. - // mla v24.4s, v19.4s, v29.4s // ...*.........................................................................................................................................................|.....................................*................................................................................................................ - // sub v19.4s, v18.4s, v24.4s // ..........................................................*..................................................................................................|............................................................................................*......................................................... 
- // add v18.4s, v18.4s, v24.4s // ........................................*....................................................................................................................|..........................................................................*........................................................................... - // mul v24.4s, v21.4s, v6.s[2] // ............*................................................................................................................................................|..............................................*....................................................................................................... - // sqrdmulh v21.4s, v21.4s, v6.s[3] // .......................*.....................................................................................................................................|.........................................................*............................................................................................ - // mla v24.4s, v21.4s, v29.4s // ...................................*.........................................................................................................................|.....................................................................*................................................................................ - // sub v21.4s, v20.4s, v24.4s // ......................................................*......................................................................................................|........................................................................................*............................................................. 
- // add v20.4s, v20.4s, v24.4s // .......................................................*.....................................................................................................|.........................................................................................*............................................................ - // mul v24.4s, v23.4s, v7.s[0] // ..............................*..............................................................................................................................|................................................................*..................................................................................... - // sqrdmulh v23.4s, v23.4s, v7.s[1] // .....................................*.......................................................................................................................|.......................................................................*.............................................................................. - // mla v24.4s, v23.4s, v29.4s // ...........................................*.................................................................................................................|.............................................................................*........................................................................ - // sub v23.4s, v22.4s, v24.4s // ............................................................*................................................................................................|..............................................................................................*....................................................... 
- // add v22.4s, v22.4s, v24.4s // ..................................................................................*..........................................................................|....................................................................................................................*................................. - // str q8, [x0], #(16) // ......................................*......................................................................................................................|........................................................................*............................................................................. - // str q9, [x0, #(-16 + 1*(512/8))] // ....................................*........................................................................................................................|......................................................................*............................................................................... - // str q10, [x0, #(-16 + 2*(512/8))] // ......................................................................................*......................................................................|........................................................................................................................*............................. - // str q11, [x0, #(-16 + 3*(512/8))] // .................................................................................................*...........................................................|...................................................................................................................................*.................. 
- // str q12, [x0, #(-16 + 4*(512/8))] // ..................*..........................................................................................................................................|....................................................*................................................................................................. - // str q13, [x0, #(-16 + 5*(512/8))] // .................*...........................................................................................................................................|...................................................*.................................................................................................. - // str q14, [x0, #(-16 + 6*(512/8))] // ..........................................*..................................................................................................................|............................................................................*......................................................................... - // str q15, [x0, #(-16 + 7*(512/8))] // ............................................................................................................*................................................|..............................................................................................................................................*....... - // str q16, [x0, #(-16 + 8*(512/8))] // .......................................................................................................*.....................................................|.........................................................................................................................................*............ 
- // str q17, [x0, #(-16 + 9*(512/8))] // .............................................................................*...............................................................................|...............................................................................................................*...................................... - // str q18, [x0, #(-16 + 10*(512/8))] // ...............................................*.............................................................................................................|.................................................................................*.................................................................... - // str q19, [x0, #(-16 + 11*(512/8))] // ..............................................................*..............................................................................................|................................................................................................*..................................................... - // str q20, [x0, #(-16 + 12*(512/8))] // .............................................................*...............................................................................................|...............................................................................................*...................................................... - // str q21, [x0, #(-16 + 13*(512/8))] // ........................................................................*....................................................................................|..........................................................................................................*........................................... 
- // str q22, [x0, #(-16 + 14*(512/8))] // ...................................................................................................................*.........................................|.....................................................................................................................................................* - // str q23, [x0, #(-16 + 15*(512/8))] // ......................................................................*......................................................................................|........................................................................................................*............................................. + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // sub v30.4S, v18.4S, v25.4S // .*.............................................................................................................................................................................................. + // mla v16.4S, v17.4S, v29.4S // ........*....................................................................................................................................................................................... + // sqrdmulh v8.4S, v22.4S, v2.S[3] // ......................*......................................................................................................................................................................... 
+ // add v17.4S, v19.4S, v15.4S // .......*........................................................................................................................................................................................ + // mla v10.4S, v27.4S, v29.4S // ....*........................................................................................................................................................................................... + // mul v27.4S, v23.4S, v2.S[2] // ..........................*..................................................................................................................................................................... + // sqrdmulh v23.4S, v23.4S, v2.S[3] // .............*.................................................................................................................................................................................. + // mul v22.4S, v22.4S, v2.S[2] // ................*............................................................................................................................................................................... + // sub v28.4S, v28.4S, v11.4S // .....*.......................................................................................................................................................................................... + // mul v11.4S, v17.4S, v4.S[2] // ...............*................................................................................................................................................................................ + // mla v20.4S, v9.4S, v29.4S // ...........*.................................................................................................................................................................................... 
+ // add v25.4S, v18.4S, v25.4S // *............................................................................................................................................................................................... + // sub v15.4S, v19.4S, v15.4S // .........*...................................................................................................................................................................................... + // sub v19.4S, v13.4S, v16.4S // .................................*.............................................................................................................................................................. + // sub v9.4S, v21.4S, v24.4S // .........................*...................................................................................................................................................................... + // add v18.4S, v13.4S, v16.4S // ..................................*............................................................................................................................................................. + // mla v22.4S, v8.4S, v29.4S // ..............................*................................................................................................................................................................. + // sub v16.4S, v31.4S, v10.4S // .......................*........................................................................................................................................................................ + // mla v27.4S, v23.4S, v29.4S // ....................................*........................................................................................................................................................... 
+ // sqrdmulh v23.4S, v28.4S, v2.S[1] // .....................*.......................................................................................................................................................................... + // sqrdmulh v17.4S, v17.4S, v4.S[3] // ..............*................................................................................................................................................................................. + // sqrdmulh v8.4S, v12.4S, v1.S[3] // ..*............................................................................................................................................................................................. + // add v13.4S, v31.4S, v10.4S // ..................*............................................................................................................................................................................. + // mul v31.4S, v28.4S, v2.S[0] // ....................*........................................................................................................................................................................... + // add v10.4S, v16.4S, v20.4S // .....................................*.......................................................................................................................................................... + // sub v20.4S, v16.4S, v20.4S // ...............................*................................................................................................................................................................ + // sqrdmulh v28.4S, v26.4S, v3.S[1] // ......*......................................................................................................................................................................................... 
+ // mul v26.4S, v19.4S, v4.S[0] // .......................................*........................................................................................................................................................ + // mul v12.4S, v12.4S, v1.S[2] // ...*............................................................................................................................................................................................ + // add v16.4S, v25.4S, v27.4S // ...................................................................*............................................................................................................................ + // sub v27.4S, v25.4S, v27.4S // ............................................................................*................................................................................................................... + // sub v25.4S, v13.4S, v22.4S // ...........................................*.................................................................................................................................................... + // mla v31.4S, v23.4S, v29.4S // .............................*.................................................................................................................................................................. + // add v23.4S, v13.4S, v22.4S // ..........................................*..................................................................................................................................................... + // sqrdmulh v22.4S, v19.4S, v4.S[1] // ........................................*....................................................................................................................................................... 
+ // sqrdmulh v19.4S, v18.4S, v3.S[3] // ...........................................................................................*.................................................................................................... + // add v21.4S, v21.4S, v24.4S // ........................*....................................................................................................................................................................... + // mul v24.4S, v18.4S, v3.S[2] // ..................................................................................*............................................................................................................. + // mul v13.4S, v10.4S, v6.S[2] // ..............................................*................................................................................................................................................. + // sqrdmulh v18.4S, v10.4S, v6.S[3] // .........................................*...................................................................................................................................................... + // mla v12.4S, v8.4S, v29.4S // ............*................................................................................................................................................................................... + // mla v11.4S, v17.4S, v29.4S // ...........................*.................................................................................................................................................................... + // mul v10.4S, v23.4S, v5.S[2] // .........................................................*...................................................................................................................................... 
+ // sqrdmulh v8.4S, v23.4S, v5.S[3] // .....................................................*.......................................................................................................................................... + // mul v17.4S, v25.4S, v6.S[0] // ..........................................................*..................................................................................................................................... + // mla v14.4S, v28.4S, v29.4S // ...................*............................................................................................................................................................................ + // sqrdmulh v28.4S, v25.4S, v6.S[1] // ............................................................*................................................................................................................................... + // sqrdmulh v25.4S, v15.4S, v5.S[1] // .................*.............................................................................................................................................................................. + // mla v24.4S, v19.4S, v29.4S // ................................................................................................*............................................................................................... + // sub v23.4S, v9.4S, v31.4S // ............................................*................................................................................................................................................... + // mla v26.4S, v22.4S, v29.4S // ....................................................*........................................................................................................................................... 
+ // add v9.4S, v9.4S, v31.4S // ......................................*......................................................................................................................................................... + // sub v22.4S, v21.4S, v12.4S // ...................................*............................................................................................................................................................ + // add v21.4S, v21.4S, v12.4S // ................................*............................................................................................................................................................... + // mul v19.4S, v20.4S, v7.S[0] // ...................................................*............................................................................................................................................ + // mul v15.4S, v15.4S, v5.S[0] // ............................*................................................................................................................................................................... + // add v12.4S, v9.4S, v11.4S // .................................................*.............................................................................................................................................. + // sqrdmulh v31.4S, v20.4S, v7.S[1] // ..................................................*............................................................................................................................................. + // mla v10.4S, v8.4S, v29.4S // ...............................................................*................................................................................................................................ 
+ // sub v8.4S, v9.4S, v11.4S // ................................................*............................................................................................................................................... + // mla v17.4S, v28.4S, v29.4S // ...............................................................................*................................................................................................................ + // add v28.4S, v22.4S, v26.4S // .........................................................................*...................................................................................................................... + // add v11.4S, v21.4S, v24.4S // ......................................................................................................................*......................................................................... + // sub v9.4S, v21.4S, v24.4S // .....................................................................................................................*.......................................................................... + // sub v20.4S, v30.4S, v14.4S // .............................................................*.................................................................................................................................. + // mla v15.4S, v25.4S, v29.4S // ...............................................*................................................................................................................................................ + // str q8, [x0, #320] // .......................................................*........................................................................................................................................ 
+ // mla v13.4S, v18.4S, v29.4S // ........................................................*....................................................................................................................................... + // str q12, [x0, #256] // ......................................................*......................................................................................................................................... + // sub v22.4S, v22.4S, v26.4S // .......................................................................*........................................................................................................................ + // add v21.4S, v30.4S, v14.4S // ...........................................................*.................................................................................................................................... + // mla v19.4S, v31.4S, v29.4S // ....................................................................*........................................................................................................................... + // add v26.4S, v16.4S, v10.4S // ........................................................................................*....................................................................................................... + // sub v18.4S, v16.4S, v10.4S // ...............................................................................................*................................................................................................ + // str q9, [x0, #64] // .................................................................................................................................*.............................................................. 
+ // str q11, [x0], #(16) // ..................................................................................................................................*............................................................. + // add v24.4S, v27.4S, v17.4S // .............................................................................................................*.................................................................................. + // sub v16.4S, v27.4S, v17.4S // .........................................................................................................*...................................................................................... + // str q28, [x0, #112] // ....................................................................................*........................................................................................................... + // str q22, [x0, #176] // .............................................................................*.................................................................................................................. + // str q26, [x0, #496] // .....................................................................................................*.......................................................................................... + // add v22.4S, v21.4S, v13.4S // ..........................................................................*..................................................................................................................... + // str q18, [x0, #560] // ..................................................................................................*............................................................................................. 
+ // sub v26.4S, v23.4S, v15.4S // .................................................................*.............................................................................................................................. + // str q16, [x0, #688] // ..................................................................................................................*............................................................................. + // str q24, [x0, #624] // ..........................................................................................................................*..................................................................... + // sub v8.4S, v21.4S, v13.4S // ...........................................................................*.................................................................................................................... + // sub v30.4S, v20.4S, v19.4S // ............................................................................................................*................................................................................... + // add v18.4S, v23.4S, v15.4S // ................................................................*............................................................................................................................... + // str q26, [x0, #432] // ..........................................................................................................*..................................................................................... + // str q22, [x0, #752] // .....................................................................................*.......................................................................................................... 
+ // add v31.4S, v20.4S, v19.4S // .........................................................................................................................*...................................................................... + // str q8, [x0, #816] // ..........................................................................................*..................................................................................................... + // str q30, [x0, #944] // ...................................................................................................................*............................................................................ + // str q18, [x0, #368] // ........................................................................*....................................................................................................................... + // str q31, [x0, #880] // ......................................................................................................................................*......................................................... + // ldr q8, [x0, #704] // ............................................................................................*................................................................................................... + // ldr q12, [x0, #512] // ..................................................................*............................................................................................................................. + // ldr q24, [x0, #768] // ...................................................................................................*............................................................................................ + // ldr q18, [x0, #640] // ..........*..................................................................................................................................................................................... 
+ // ldr q11, [x0, #832] // ..............................................................*................................................................................................................................. + // ldr q30, [x0, #960] // ...................................................................................*............................................................................................................ + // ldr q28, [x0, #896] // .............................................*.................................................................................................................................................. + // ldr q9, [x0, #576] // .................................................................................*.............................................................................................................. + // ldr q17, [x0, #64] // ...............................................................................................................................................................*................................ + // ldr q22, [x0, #448] // ...............................................................................................................*................................................................................ + // mul v16.4S, v24.4S, v0.S[0] // .......................................................................................................................*........................................................................ + // sqrdmulh v13.4S, v24.4S, v0.S[1] // ........................................................................................................................*....................................................................... 
+ // mul v14.4S, v8.4S, v0.S[0] // ..............................................................................................................................*................................................................. + // sqrdmulh v21.4S, v8.4S, v0.S[1] // ............................................................................................................................*................................................................... + // sqrdmulh v10.4S, v11.4S, v0.S[1] // .............................................................................................*.................................................................................................. + // mul v19.4S, v11.4S, v0.S[0] // ..............................................................................................*................................................................................................. + // mul v11.4S, v30.4S, v0.S[0] // ....................................................................................................*........................................................................................... + // sqrdmulh v25.4S, v30.4S, v0.S[1] // .................................................................................................*.............................................................................................. + // ldr q30, [x0, #0] // .....................................................................................................................................................*.......................................... + // mul v8.4S, v12.4S, v0.S[0] // ....................................................................................................................*........................................................................... 
+ // mul v26.4S, v28.4S, v0.S[0] // ......................................................................*......................................................................................................................... + // sqrdmulh v20.4S, v28.4S, v0.S[1] // .....................................................................*.......................................................................................................................... + // ldr q28, [x0, #192] // ...........................................................................................................................................*.................................................... + // mla v11.4S, v25.4S, v29.4S // .................................................................................................................*.............................................................................. + // ldr q23, [x0, #320] // ......................................................................................................*......................................................................................... + // ldr q25, [x0, #256] // ........................................................................................................................................*....................................................... + // mla v19.4S, v10.4S, v29.4S // ........................................................................................................*....................................................................................... + // mla v16.4S, v13.4S, v29.4S // ..........................................................................................................................................*..................................................... 
+ // sqrdmulh v31.4S, v12.4S, v0.S[1] // ......................................................................................*......................................................................................................... + // mla v14.4S, v21.4S, v29.4S // ...............................................................................................................................................*................................................ + // mul v10.4S, v18.4S, v0.S[0] // .........................................................................................*...................................................................................................... + // sqrdmulh v21.4S, v9.4S, v0.S[1] // .......................................................................................................................................*........................................................ + // sqrdmulh v15.4S, v18.4S, v0.S[1] // .......................................................................................*........................................................................................................ + // mul v27.4S, v9.4S, v0.S[0] // ..............................................................................................................*................................................................................. + // sub v13.4S, v22.4S, v11.4S // ....................................................................................................................................*........................................................... + // add v18.4S, v22.4S, v11.4S // ...................................................................................................................................*............................................................ 
+ // mla v26.4S, v20.4S, v29.4S // ................................................................................*............................................................................................................... + // ldr q20, [x0, #384] // ..............................................................................*................................................................................................................. + // mla v27.4S, v21.4S, v29.4S // ..................................................................................................................................................*............................................. + // mla v8.4S, v31.4S, v29.4S // .............................................................................................................................................*.................................................. + // sub v9.4S, v25.4S, v16.4S // ........................................................................................................................................................*....................................... + // sqrdmulh v21.4S, v18.4S, v0.S[3] // ................................................................................................................................................*............................................... + // add v24.4S, v25.4S, v16.4S // ..........................................................................................................................................................*..................................... + // mul v16.4S, v18.4S, v0.S[2] // .................................................................................................................................................*.............................................. 
+ // sub v12.4S, v20.4S, v26.4S // .............................................................................................................................*.................................................................. + // mul v22.4S, v13.4S, v1.S[0] // .......................................................................................................................................................*........................................ + // add v11.4S, v20.4S, v26.4S // .......................................................................................................*........................................................................................ + // sqrdmulh v31.4S, v9.4S, v1.S[1] // .....................................................................................................................................................................*.......................... + // mul v25.4S, v9.4S, v1.S[0] // ..............................................................................................................................................................................*................. + // add v20.4S, v23.4S, v19.4S // ...........................................................................................................................*.................................................................... + // sqrdmulh v26.4S, v24.4S, v0.S[3] // ...................................................................................................................................................................*............................ + // sub v23.4S, v23.4S, v19.4S // ...................................................................................................................................................*............................................ 
+ // mul v9.4S, v12.4S, v1.S[0] // ............................................................................................................................................*................................................... + // sub v18.4S, v30.4S, v8.4S // ......................................................................................................................................................................*......................... + // mla v16.4S, v21.4S, v29.4S // .............................................................................................................................................................*.................................. + // add v21.4S, v30.4S, v8.4S // .........................................................................................................................................................................*...................... + // sqrdmulh v30.4S, v12.4S, v1.S[1] // .........................................................................................................................................*...................................................... + // mul v24.4S, v24.4S, v0.S[2] // ....................................................................................................................................................................*........................... + // sub v19.4S, v28.4S, v14.4S // ..................................................................................................................................................................*............................. + // sqrdmulh v13.4S, v13.4S, v1.S[1] // .........................................................................................................................................................*...................................... 
+ // add v12.4S, v28.4S, v14.4S // ..............................................................................................................................................................*................................. + // ldr q28, [x0, #128] // ...............................................................................................................................*................................................................ + // mla v10.4S, v15.4S, v29.4S // ...........................................................................................................*.................................................................................... + // sqrdmulh v15.4S, v11.4S, v0.S[3] // ................................................................................................................*............................................................................... + // mul v11.4S, v11.4S, v0.S[2] // ................................................................................................................................*............................................................... + // mul v8.4S, v20.4S, v0.S[2] // ......................................................................................................................................................*......................................... + // sqrdmulh v20.4S, v20.4S, v0.S[3] // .....................................................................................................................................*.......................................................... + // add v14.4S, v17.4S, v27.4S // ............................................................................................................................................................................*................... 
+ // mla v25.4S, v31.4S, v29.4S // ............................................................................................................................................................................................*... + // sub v31.4S, v17.4S, v27.4S // ...............................................................................................................................................................................*................ + // add v17.4S, v12.4S, v16.4S // ..........................................................................................................................................................................*..................... + // mla v9.4S, v30.4S, v29.4S // ...........................................................................................................................................................*.................................... + // mla v22.4S, v13.4S, v29.4S // .......................................................................................................................................................................*........................ + // sqrdmulh v27.4S, v23.4S, v1.S[1] // ........................................................................................................................................................................................*....... + // sub v30.4S, v12.4S, v16.4S // ........................................................................................................................................................................*....................... + // mla v8.4S, v20.4S, v29.4S // ................................................................................................................................................................*............................... 
+ // mla v11.4S, v15.4S, v29.4S // ..............................................................................................................................................*................................................. + // sqrdmulh v20.4S, v30.4S, v2.S[1] // .................................................................................................................................................................................*.............. + // mul v16.4S, v17.4S, v1.S[2] // .......................................................................................................................................................................................*........ + // mul v15.4S, v30.4S, v2.S[0] // ................................................................................................................................................................................*............... + // sub v12.4S, v28.4S, v10.4S // .................................................................................................................................................................*.............................. + // mla v24.4S, v26.4S, v29.4S // ....................................................................................................................................................................................*........... + // sub v30.4S, v19.4S, v22.4S // ...................................................................................................................................................................................*............ + // add v22.4S, v19.4S, v22.4S // ..........................................................................................................................................................................................*..... 
+ // add v28.4S, v28.4S, v10.4S // ....................................................................................................................................................*........................................... + // mul v10.4S, v23.4S, v1.S[0] // ...........................................................................................................................................................................................*.... + // sub v26.4S, v12.4S, v9.4S // ...........................................................................................................................................................................*.................... + // add v13.4S, v14.4S, v8.4S // .....................................................................................................................................................................................*.......... + // sqrdmulh v17.4S, v17.4S, v1.S[3] // ..................................................................................................................................................................................*............. + // mla v15.4S, v20.4S, v29.4S // ..............................................................................................................................................................................................*. + // add v23.4S, v12.4S, v9.4S // .............................................................................................................................................................................*.................. + // sub v19.4S, v14.4S, v8.4S // ......................................................................................................................................................................................*......... 
+ // add v12.4S, v28.4S, v11.4S // ............................................................................................................................................................*................................... + // sqrdmulh v9.4S, v30.4S, v3.S[1] // .............................................................................................................................................................................................*.. + // mul v20.4S, v30.4S, v3.S[0] // .........................................................................................................................................................................................*...... + // mul v14.4S, v26.4S, v3.S[0] // ...............................................................................................................................................................................................* sub count, count, #1 cbnz count, layer1234_start - mla v27.4S, v18.4S, v29.4S // ............................................................................................................*................................................................................... - sqrdmulh v17.4S, v20.4S, v3.S[1] // ....................................................................................................................................*........................................................... - mla v8.4S, v10.4S, v29.4S // .................................................................................................................*.............................................................................. - mul v20.4S, v20.4S, v3.S[0] // ...................................................................................................................................*............................................................ 
- sub v16.4S, v15.4S, v23.4S // ...........................................................*.................................................................................................................................... - mul v19.4S, v25.4S, v2.S[2] // .........................................................................................................................*...................................................................... - sqrdmulh v24.4S, v13.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - sqrdmulh v11.4S, v25.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - sqrdmulh v25.4S, v26.4S, v3.S[1] // ...............................................................................................................................*................................................................ - mul v18.4S, v26.4S, v3.S[0] // ..............................................................................................................................*................................................................. - mul v26.4S, v30.4S, v1.S[2] // .....................................................................................................*.......................................................................................... - sqrdmulh v10.4S, v30.4S, v1.S[3] // ......................................................................................................*......................................................................................... 
- mla v20.4S, v17.4S, v29.4S // .....................................................................................................................................*.......................................................... - mul v30.4S, v22.4S, v1.S[2] // ................................................................................................*............................................................................................... - add v17.4S, v21.4S, v8.4S // ...................................................................................................................*............................................................................ - sub v21.4S, v21.4S, v8.4S // ..................................................................................................................*............................................................................. - mla v19.4S, v11.4S, v29.4S // ...........................................................................................................................*.................................................................... - sub v8.4S, v16.4S, v27.4S // .............................................................................................................*.................................................................................. - add v27.4S, v16.4S, v27.4S // ..............................................................................................................*................................................................................. - sqrdmulh v11.4S, v22.4S, v1.S[3] // .................................................................................................*.............................................................................................. 
- mul v22.4S, v17.4S, v4.S[2] // ..................................................................................................................................................*............................................. - mla v26.4S, v10.4S, v29.4S // .......................................................................................................*........................................................................................ - mul v10.4S, v21.4S, v5.S[0] // .......................................................................................................................................................*........................................ - sqrdmulh v16.4S, v21.4S, v5.S[1] // ........................................................................................................................................................*....................................... - mla v18.4S, v25.4S, v29.4S // ................................................................................................................................*............................................................... - sub v21.4S, v14.4S, v20.4S // ......................................................................................................................................*......................................................... - add v14.4S, v14.4S, v20.4S // .......................................................................................................................................*........................................................ - add v25.4S, v15.4S, v23.4S // ............................................................*................................................................................................................................... 
- sub v23.4S, v9.4S, v19.4S // ............................................................................................................................*................................................................... - mla v30.4S, v11.4S, v29.4S // ..................................................................................................*............................................................................................. - add v15.4S, v9.4S, v19.4S // .............................................................................................................................*.................................................................. - mul v13.4S, v13.4S, v2.S[2] // ....................................................................................................................*........................................................................... - add v20.4S, v12.4S, v26.4S // .........................................................................................................*...................................................................................... - mul v19.4S, v14.4S, v6.S[2] // ......................................................................................................................................................................*......................... - mla v10.4S, v16.4S, v29.4S // .........................................................................................................................................................*...................................... - sqrdmulh v11.4S, v14.4S, v6.S[3] // .......................................................................................................................................................................*........................ 
- sqrdmulh v9.4S, v21.4S, v7.S[1] // ............................................................................................................................................................................*................... - sqrdmulh v14.4S, v15.4S, v5.S[3] // .............................................................................................................................................................*.................................. - mul v16.4S, v15.4S, v5.S[2] // ............................................................................................................................................................*................................... - mul v15.4S, v21.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - sub v26.4S, v12.4S, v26.4S // ........................................................................................................*....................................................................................... - mul v21.4S, v20.4S, v3.S[2] // ........................................................................................................................................*....................................................... - sqrdmulh v20.4S, v20.4S, v3.S[3] // .........................................................................................................................................*...................................................... - mla v13.4S, v24.4S, v29.4S // ......................................................................................................................*......................................................................... 
- mla v19.4S, v11.4S, v29.4S // ........................................................................................................................................................................*....................... - sub v11.4S, v28.4S, v31.4S // ...............................................................................*................................................................................................................ - sub v24.4S, v8.4S, v10.4S // ..........................................................................................................................................................*..................................... - add v12.4S, v28.4S, v31.4S // ................................................................................*............................................................................................................... - mla v16.4S, v14.4S, v29.4S // ..............................................................................................................................................................*................................. - mul v31.4S, v26.4S, v4.S[0] // .............................................................................................................................................*.................................................. - mla v15.4S, v9.4S, v29.4S // .............................................................................................................................................................................*.................. - sqrdmulh v28.4S, v26.4S, v4.S[1] // ..............................................................................................................................................*................................................. 
- sub v14.4S, v11.4S, v18.4S // .................................................................................................................................*.............................................................. - add v18.4S, v11.4S, v18.4S // ..................................................................................................................................*............................................................. - mla v21.4S, v20.4S, v29.4S // ..........................................................................................................................................*..................................................... - add v9.4S, v12.4S, v13.4S // ........................................................................................................................*....................................................................... - sqrdmulh v11.4S, v17.4S, v4.S[3] // ...................................................................................................................................................*............................................ - str q24, [x0, #448] // .......................................................................................................................................................................................*........ - sqrdmulh v24.4S, v23.4S, v6.S[1] // ..................................................................................................................................................................*............................. - mul v17.4S, v23.4S, v6.S[0] // .................................................................................................................................................................*.............................. 
- add v23.4S, v8.4S, v10.4S // ...........................................................................................................................................................*.................................... - sub v10.4S, v9.4S, v16.4S // ...............................................................................................................................................................*................................ - add v16.4S, v9.4S, v16.4S // ................................................................................................................................................................*............................... - sub v20.4S, v18.4S, v19.4S // .........................................................................................................................................................................*...................... - add v9.4S, v14.4S, v15.4S // ...............................................................................................................................................................................*................ - sub v8.4S, v14.4S, v15.4S // ..............................................................................................................................................................................*................. - sub v14.4S, v12.4S, v13.4S // .......................................................................................................................*........................................................................ - str q23, [x0, #384] // ......................................................................................................................................................................................*......... 
- add v23.4S, v25.4S, v30.4S // ....................................................................................................*........................................................................................... - mla v31.4S, v28.4S, v29.4S // ...............................................................................................................................................*................................................ - sub v12.4S, v25.4S, v30.4S // ...................................................................................................*............................................................................................ - str q10, [x0, #576] // .........................................................................................................................................................................................*...... - mla v17.4S, v24.4S, v29.4S // ...................................................................................................................................................................*............................ - str q16, [x0, #512] // ........................................................................................................................................................................................*....... - add v16.4S, v18.4S, v19.4S // ..........................................................................................................................................................................*..................... - str q20, [x0, #832] // .............................................................................................................................................................................................*.. 
- mla v22.4S, v11.4S, v29.4S // ....................................................................................................................................................*........................................... - str q9, [x0, #896] // ..............................................................................................................................................................................................*. - add v20.4S, v23.4S, v21.4S // ............................................................................................................................................*................................................... - sub v9.4S, v23.4S, v21.4S // ...........................................................................................................................................*.................................................... - str q16, [x0, #768] // ............................................................................................................................................................................................*... - sub v15.4S, v12.4S, v31.4S // ................................................................................................................................................*............................................... - add v30.4S, v12.4S, v31.4S // .................................................................................................................................................*.............................................. 
- str q8, [x0, #960] // ...............................................................................................................................................................................................* - sub v21.4S, v14.4S, v17.4S // ....................................................................................................................................................................*........................... - add v16.4S, v14.4S, v17.4S // .....................................................................................................................................................................*.......................... + sub v30.4S, v18.4S, v25.4S // ...............................................................................*................................................................................................................ + mla v16.4S, v17.4S, v29.4S // .......................................................................................................*........................................................................................ + sqrdmulh v8.4S, v22.4S, v2.S[3] // .........................................................................................................................*...................................................................... + add v17.4S, v19.4S, v15.4S // ...................................................................................................................*............................................................................ + mla v10.4S, v27.4S, v29.4S // ...................................................................................*............................................................................................................ 
+ mul v27.4S, v23.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[3] // ....................................................................................................................*........................................................................... + mul v22.4S, v22.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + sub v28.4S, v28.4S, v11.4S // .....................................................................*.......................................................................................................................... + mul v11.4S, v17.4S, v4.S[2] // ...................................................................................................................................................*............................................ + mla v20.4S, v9.4S, v29.4S // .....................................................................................................................................*.......................................................... + add v25.4S, v18.4S, v25.4S // ................................................................................*............................................................................................................... + sub v15.4S, v19.4S, v15.4S // ..................................................................................................................*............................................................................. 
+ sub v19.4S, v13.4S, v16.4S // ........................................................................................................*....................................................................................... + sub v9.4S, v21.4S, v24.4S // ...........................................................*.................................................................................................................................... + add v18.4S, v13.4S, v16.4S // .........................................................................................................*...................................................................................... + mla v22.4S, v8.4S, v29.4S // ...........................................................................................................................*.................................................................... + sub v16.4S, v31.4S, v10.4S // ....................................................................................*........................................................................................................... + mla v27.4S, v23.4S, v29.4S // ......................................................................................................................*......................................................................... + sqrdmulh v23.4S, v28.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + sqrdmulh v17.4S, v17.4S, v4.S[3] // ..................................................................................................................................................*............................................. 
+ sqrdmulh v8.4S, v12.4S, v1.S[3] // ................................................................................................*............................................................................................... + add v13.4S, v31.4S, v10.4S // .....................................................................................*.......................................................................................................... + mul v31.4S, v28.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + add v10.4S, v16.4S, v20.4S // .......................................................................................................................................*........................................................ + sub v20.4S, v16.4S, v20.4S // ......................................................................................................................................*......................................................... + sqrdmulh v28.4S, v26.4S, v3.S[1] // ..............................................................................................................................*................................................................. + mul v26.4S, v19.4S, v4.S[0] // ..............................................................................................................................................*................................................. + mul v12.4S, v12.4S, v1.S[2] // .................................................................................................*.............................................................................................. 
+ add v16.4S, v25.4S, v27.4S // ........................................................................................................................*....................................................................... + sub v27.4S, v25.4S, v27.4S // .......................................................................................................................*........................................................................ + sub v25.4S, v13.4S, v22.4S // ............................................................................................................................*................................................................... + mla v31.4S, v23.4S, v29.4S // ............................................................................................................*................................................................................... + add v23.4S, v13.4S, v22.4S // .............................................................................................................................*.................................................................. + sqrdmulh v22.4S, v19.4S, v4.S[1] // .............................................................................................................................................*.................................................. + sqrdmulh v19.4S, v18.4S, v3.S[3] // ........................................................................................................................................*....................................................... + add v21.4S, v21.4S, v24.4S // ............................................................*................................................................................................................................... 
+ mul v24.4S, v18.4S, v3.S[2] // .........................................................................................................................................*...................................................... + mul v13.4S, v10.4S, v6.S[2] // .......................................................................................................................................................................*........................ + sqrdmulh v18.4S, v10.4S, v6.S[3] // ......................................................................................................................................................................*......................... + mla v12.4S, v8.4S, v29.4S // ..................................................................................................*............................................................................................. + mla v11.4S, v17.4S, v29.4S // ....................................................................................................................................................*........................................... + mul v10.4S, v23.4S, v5.S[2] // .............................................................................................................................................................*.................................. + sqrdmulh v8.4S, v23.4S, v5.S[3] // ............................................................................................................................................................*................................... + mul v17.4S, v25.4S, v6.S[0] // ..................................................................................................................................................................*............................. 
+ mla v14.4S, v28.4S, v29.4S // ................................................................................................................................*............................................................... + sqrdmulh v28.4S, v25.4S, v6.S[1] // .................................................................................................................................................................*.............................. + sqrdmulh v25.4S, v15.4S, v5.S[1] // .......................................................................................................................................................*........................................ + mla v24.4S, v19.4S, v29.4S // ..........................................................................................................................................*..................................................... + sub v23.4S, v9.4S, v31.4S // .............................................................................................................*.................................................................................. + mla v26.4S, v22.4S, v29.4S // ...............................................................................................................................................*................................................ + add v9.4S, v9.4S, v31.4S // ..............................................................................................................*................................................................................. + sub v22.4S, v21.4S, v12.4S // ...................................................................................................*............................................................................................ 
+ add v21.4S, v21.4S, v12.4S // ....................................................................................................*........................................................................................... + mul v19.4S, v20.4S, v7.S[0] // ............................................................................................................................................................................*................... + mul v15.4S, v15.4S, v5.S[0] // ........................................................................................................................................................*....................................... + add v12.4S, v9.4S, v11.4S // ......................................................................................................................................................*......................................... + sqrdmulh v31.4S, v20.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + mla v10.4S, v8.4S, v29.4S // ..............................................................................................................................................................*................................. + sub v8.4S, v9.4S, v11.4S // .....................................................................................................................................................*.......................................... + mla v17.4S, v28.4S, v29.4S // ...................................................................................................................................................................*............................ 
+ add v28.4S, v22.4S, v26.4S // .................................................................................................................................................*.............................................. + add v11.4S, v21.4S, v24.4S // ............................................................................................................................................*................................................... + sub v9.4S, v21.4S, v24.4S // ...........................................................................................................................................*.................................................... + sub v20.4S, v30.4S, v14.4S // .................................................................................................................................*.............................................................. + mla v15.4S, v25.4S, v29.4S // .........................................................................................................................................................*...................................... + str q8, [x0, #320] // .....................................................................................................................................................................................*.......... + mla v13.4S, v18.4S, v29.4S // ........................................................................................................................................................................*....................... + str q12, [x0, #256] // ....................................................................................................................................................................................*........... 
+ sub v22.4S, v22.4S, v26.4S // ................................................................................................................................................*............................................... + add v21.4S, v30.4S, v14.4S // ..................................................................................................................................*............................................................. + mla v19.4S, v31.4S, v29.4S // .............................................................................................................................................................................*.................. + add v26.4S, v16.4S, v10.4S // ................................................................................................................................................................*............................... + sub v18.4S, v16.4S, v10.4S // ...............................................................................................................................................................*................................ str q9, [x0, #64] // .................................................................................................................................................................................*.............. - str q20, [x0], #(16) // ................................................................................................................................................................................*............... - str q30, [x0, #112] // ..................................................................................................................................................................................*............. 
- sub v30.4S, v27.4S, v22.4S // .....................................................................................................................................................*.......................................... - add v9.4S, v27.4S, v22.4S // ......................................................................................................................................................*......................................... - str q15, [x0, #176] // ...................................................................................................................................................................................*............ - str q21, [x0, #688] // ...........................................................................................................................................................................................*.... - str q16, [x0, #624] // ..........................................................................................................................................................................................*..... - str q9, [x0, #240] // ....................................................................................................................................................................................*........... - str q30, [x0, #304] // .....................................................................................................................................................................................*.......... + str q11, [x0], #(16) // ................................................................................................................................................................................*............... + add v24.4S, v27.4S, v17.4S // .....................................................................................................................................................................*.......................... 
+ sub v16.4S, v27.4S, v17.4S // ....................................................................................................................................................................*........................... + str q28, [x0, #112] // ..................................................................................................................................................................................*............. + str q22, [x0, #176] // ...................................................................................................................................................................................*............ + str q26, [x0, #496] // ........................................................................................................................................................................................*....... + add v22.4S, v21.4S, v13.4S // ..........................................................................................................................................................................*..................... + str q18, [x0, #560] // .........................................................................................................................................................................................*...... + sub v26.4S, v23.4S, v15.4S // ..........................................................................................................................................................*..................................... + str q16, [x0, #688] // ...........................................................................................................................................................................................*.... + str q24, [x0, #624] // ..........................................................................................................................................................................................*..... 
+ sub v8.4S, v21.4S, v13.4S // .........................................................................................................................................................................*...................... + sub v30.4S, v20.4S, v19.4S // ..............................................................................................................................................................................*................. + add v18.4S, v23.4S, v15.4S // ...........................................................................................................................................................*.................................... + str q26, [x0, #432] // .......................................................................................................................................................................................*........ + str q22, [x0, #752] // ............................................................................................................................................................................................*... + add v31.4S, v20.4S, v19.4S // ...............................................................................................................................................................................*................ + str q8, [x0, #816] // .............................................................................................................................................................................................*.. + str q30, [x0, #944] // ...............................................................................................................................................................................................* + str q18, [x0, #368] // ......................................................................................................................................................................................*......... 
+ str q31, [x0, #880] // ..............................................................................................................................................................................................*. restore inp, STACK0 mov count, #16 @@ -928,425 +910,721 @@ layer1234_start: qform_root3_tw .req q7 .p2align 2 + // Instructions: 35 + // Expected cycles: 23 + // Expected IPC: 1.52 + // + // Wall time: 0.62s + // User time: 0.62s + // + // ------- original position --------> + // 0 25 + // |------------------------|--------- + ldr q6, [x1, #16] // *.................................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q2, [x3], #16 // .*................................. + ldr q10, [x1, #48] // ..*................................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqrdmulh v17.4S, v10.4S, v2.S[1] // .....*............................. + mul v8.4S, v10.4S, v2.S[0] // ......*............................ + ldr q31, [x1, #32] // ....*.............................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mla v8.4S, v17.4S, v29.4S // ........*.......................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqrdmulh v0.4S, v31.4S, v2.S[1] // ..........*........................ + ldr q24, [x3], #8 // .........*......................... + // gap // ................................... 
+ mul v1.4S, v31.4S, v2.S[0] // .......*........................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + add v10.4S, v6.4S, v8.4S // ............*...................... + sub v8.4S, v6.4S, v8.4S // ...........*....................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q18, [x1, #0] // ...*............................... + mla v1.4S, v0.4S, v29.4S // ...................*............... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q30, [x4, #16] // .............*..................... + mul v11.4S, v8.4S, v24.S[0] // ................*.................. + sqrdmulh v3.4S, v8.4S, v24.S[1] // ...............*................... + mul v15.4S, v10.4S, v2.S[2] // ..................*................ + sqrdmulh v21.4S, v10.4S, v2.S[3] // .................*................. + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q26, [x4], #(6*16) // ........................*.......... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + add v24.4S, v18.4S, v1.4S // ......................*............ + mla v11.4S, v3.4S, v29.4S // ....................*.............. + mla v15.4S, v21.4S, v29.4S // .....................*............. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sub v12.4S, v18.4S, v1.4S // .......................*........... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + add v17.4S, v24.4S, v15.4S // ............................*...... + sub v23.4S, v24.4S, v15.4S // .............................*..... + add v20.4S, v12.4S, v11.4S // ..........................*........ + sub v16.4S, v12.4S, v11.4S // ...........................*....... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q9, [x4, #-32] // ..............*.................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + trn2 v6.4S, v20.4S, v16.4S // ..............................*.... + trn1 v20.4S, v20.4S, v16.4S // ...............................*... + trn1 v14.4S, v17.4S, v23.4S // ................................*.. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + trn2 v19.4S, v17.4S, v23.4S // .................................*. + ldr q8, [x4, #-16] // .........................*......... + trn2 v28.2D, v14.2D, v20.2D // ..................................* + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + + // ---------- new position ----------> + // 0 25 + // |------------------------|--------- + // ldr q18, [x1, #16] // *.................................. + // ldr q21, [x3], #16 // .*................................. + // ldr q22, [x1, #48] // ..*................................ + // ldr q25, [x1, #0] // ............*...................... + // ldr q27, [x1, #32] // .....*............................. + // sqrdmulh v10.4S, v22.4S, v21.S[1] // ...*............................... + // mul v30.4S, v22.4S, v21.S[0] // ....*.............................. + // mul v15.4S, v27.4S, v21.S[0] // .........*......................... + // mla v30.4S, v10.4S, v29.4S // ......*............................ + // ldr q10, [x3], #8 // ........*.......................... + // sqrdmulh v13.4S, v27.4S, v21.S[1] // .......*........................... + // sub v5.4S, v18.4S, v30.4S // ...........*....................... + // add v22.4S, v18.4S, v30.4S // ..........*........................ + // ldr q30, [x4, #16] // ..............*.................... + // ldr q9, [x4, #64] // ............................*...... + // sqrdmulh v16.4S, v5.4S, v10.S[1] // ................*.................. + // mul v4.4S, v5.4S, v10.S[0] // ...............*................... + // sqrdmulh v0.4S, v22.4S, v21.S[3] // ..................*................ + // mul v23.4S, v22.4S, v21.S[2] // .................*................. + // mla v15.4S, v13.4S, v29.4S // .............*..................... + // mla v4.4S, v16.4S, v29.4S // .....................*............. + // mla v23.4S, v0.4S, v29.4S // ......................*............ + // add v2.4S, v25.4S, v15.4S // ....................*.............. + // sub v3.4S, v25.4S, v15.4S // .......................*........... 
+ // ldr q26, [x4], #(6*16) // ...................*............... + // ldr q8, [x4, #-16] // .................................*. + // add v28.4S, v3.4S, v4.4S // ..........................*........ + // sub v22.4S, v3.4S, v4.4S // ...........................*....... + // add v4.4S, v2.4S, v23.4S // ........................*.......... + // sub v2.4S, v2.4S, v23.4S // .........................*......... + // trn2 v6.4S, v28.4S, v22.4S // .............................*..... + // trn1 v20.4S, v28.4S, v22.4S // ..............................*.... + // trn1 v14.4S, v4.4S, v2.4S // ...............................*... + // trn2 v19.4S, v4.4S, v2.4S // ................................*.. + // trn2 v28.2D, v14.2D, v20.2D // ..................................* + + sub count, count, #1 layer5678_start: - ldr q8, [x1, #32] // ..*.......................................................... - ldr q11, [x1, #48] // ...*......................................................... - ldr q13, [x3], #16 // ....*........................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - ldr q31, [x3], #8 // .....*....................................................... - ldr q19, [x1, #16] // .*........................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - ldr q16, [x1, #0] // *............................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v3.4S, v8.4S, v13.S[0] // ......*...................................................... - sqrdmulh v0.4S, v8.4S, v13.S[1] // .......*..................................................... - mul v26.4S, v11.4S, v13.S[0] // ...........*................................................. - sqrdmulh v6.4S, v11.4S, v13.S[1] // ............*................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v3.4S, v0.4S, v29.4S // ........*.................................................... - mla v26.4S, v6.4S, v29.4S // .............*............................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sub v25.4S, v16.4S, v3.4S // .........*................................................... - add v10.4S, v16.4S, v3.4S // ..........*.................................................. - sub v7.4S, v19.4S, v26.4S // ..............*.............................................. - add v2.4S, v19.4S, v26.4S // ...............*............................................. - ldr q19, [x4, #32] // ....................................*........................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v22.4S, v7.4S, v31.S[0] // .....................*....................................... - sqrdmulh v6.4S, v7.4S, v31.S[1] // ......................*...................................... - mul v31.4S, v2.4S, v13.S[2] // ................*............................................ - sqrdmulh v5.4S, v2.4S, v13.S[3] // .................*........................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v31.4S, v5.4S, v29.4S // ..................*.......................................... - mla v22.4S, v6.4S, v29.4S // .......................*..................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - sub v1.4S, v25.4S, v22.4S // ........................*.................................... - add v5.4S, v10.4S, v31.4S // ....................*........................................ - add v25.4S, v25.4S, v22.4S // .........................*................................... - sub v12.4S, v10.4S, v31.4S // ...................*......................................... - ldr q31, [x4, #16] // ...................................*......................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn2 v6.4S, v25.4S, v1.4S // .............................*............................... - trn1 v27.4S, v25.4S, v1.4S // ............................*................................ - trn2 v26.4S, v5.4S, v12.4S // ...........................*................................. - trn1 v13.4S, v5.4S, v12.4S // ..........................*.................................. - ldr q5, [x4], #(6*16) // ..................................*.......................... - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn2 v10.2D, v26.2D, v6.2D // ...............................*............................. - trn1 v25.2D, v26.2D, v6.2D // .................................*........................... - ldr q26, [x4, #-48] // .....................................*....................... - ldr q6, [x4, #-16] // .......................................*..................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn2 v2.2D, v13.2D, v27.2D // ..............................*.............................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - sqrdmulh v8.4S, v10.4S, v31.4S // ..............................................*.............. - mul v10.4S, v10.4S, v5.4S // .............................................*............... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v14.4S, v2.4S, v31.4S // .........................................*................... - mul v2.4S, v2.4S, v5.4S // ........................................*.................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - ldr q31, [x4, #-32] // ......................................*...................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- mla v10.4S, v8.4S, v29.4S // ...............................................*............. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v2.4S, v14.4S, v29.4S // ..........................................*.................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - add v24.4S, v25.4S, v10.4S // .................................................*........... 
- sub v25.4S, v25.4S, v10.4S // ................................................*............ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v7.4S, v24.4S, v19.4S // ..................................................*.......... - sqrdmulh v20.4S, v25.4S, v6.4S // ........................................................*.... - sqrdmulh v6.4S, v24.4S, v26.4S // ...................................................*......... - mul v8.4S, v25.4S, v31.4S // .......................................................*..... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn1 v26.2D, v13.2D, v27.2D // ................................*............................ - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v7.4S, v6.4S, v29.4S // ....................................................*........ - mla v8.4S, v20.4S, v29.4S // .........................................................*... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - add v10.4S, v26.4S, v2.4S // ............................................*................ - sub v9.4S, v26.4S, v2.4S // ...........................................*................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sub v12.4S, v9.4S, v8.4S // ..........................................................*.. - add v11.4S, v9.4S, v8.4S // ...........................................................*. - add v9.4S, v10.4S, v7.4S // ......................................................*...... - sub v10.4S, v10.4S, v7.4S // .....................................................*....... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ............................................................* - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. + // Instructions: 61 + // Expected cycles: 23 + // Expected IPC: 2.65 + // + // Wall time: 8.15s + // User time: 8.15s + // + // -------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + trn1 v13.2D, v19.2D, v6.2D // .................................*........................... + ldr q18, [x1, #80] // .e........................................................... + ldr q21, [x3], #16 // ....e........................................................ + trn2 v6.2D, v19.2D, v6.2D // ...............................*............................. + // gap // ............................................................. + // gap // ............................................................. + ldr q22, [x1, #112] // ...e......................................................... + // gap // ............................................................. + ldr q25, [x1, #64] // e............................................................ + ldr q27, [x1, #96] // ..e.......................................................... + sqrdmulh v19.4S, v28.4S, v30.4S // ........................................*.................... + mul v2.4S, v28.4S, v26.4S // .........................................*................... + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v16.4S, v6.4S, v30.4S // .............................................*............... + mul v6.4S, v6.4S, v26.4S // ..............................................*.............. + ldr q4, [x4, #-64] // ....................................*........................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v2.4S, v19.4S, v29.4S // ..........................................*.................. + sqrdmulh v10.4S, v22.4S, v21.S[1] // ...........e................................................. + mul v30.4S, v22.4S, v21.S[0] // ............e................................................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + // gap // ............................................................. + mla v6.4S, v16.4S, v29.4S // ...............................................*............. + ldr q16, [x4, #-48] // .....................................*....................... + mul v15.4S, v27.4S, v21.S[0] // .......e..................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v30.4S, v10.4S, v29.4S // .............e............................................... + ldr q10, [x3], #8 // .....e....................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + sub v17.4S, v13.4S, v6.4S // ................................................*............ + add v11.4S, v13.4S, v6.4S // .................................................*........... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v13.4S, v27.4S, v21.S[1] // ......e...................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v5.4S, v18.4S, v30.4S // ..............e.............................................. + add v22.4S, v18.4S, v30.4S // ...............e............................................. + sqrdmulh v19.4S, v17.4S, v8.4S // .......................................................*..... + mul v6.4S, v17.4S, v9.4S // ........................................................*.... + ldr q30, [x4, #16] // ...................................e......................... + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + sqrdmulh v26.4S, v11.4S, v16.4S // ..................................................*.......... + mul v8.4S, v11.4S, v4.4S // ...................................................*......... + ldr q9, [x4, #64] // ......................................e...................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v16.4S, v5.4S, v10.S[1] // .....................e....................................... + mul v4.4S, v5.4S, v10.S[0] // ......................e...................................... + sqrdmulh v0.4S, v22.4S, v21.S[3] // ................e............................................ + mul v23.4S, v22.4S, v21.S[2] // .................e........................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mla v6.4S, v19.4S, v29.4S // .........................................................*... + trn1 v19.2D, v14.2D, v20.2D // ................................*............................ + mla v15.4S, v13.4S, v29.4S // ........e.................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ // gap // ............................................................. + mla v8.4S, v26.4S, v29.4S // ....................................................*........ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v26.4S, v19.4S, v2.4S // ...........................................*................. + add v19.4S, v19.4S, v2.4S // ............................................*................ + mla v4.4S, v16.4S, v29.4S // .......................e..................................... + mla v23.4S, v0.4S, v29.4S // ..................e.......................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + add v2.4S, v25.4S, v15.4S // ..........e.................................................. + sub v3.4S, v25.4S, v15.4S // .........e................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ sub v18.4S, v26.4S, v6.4S // ..........................................................*.. + add v17.4S, v26.4S, v6.4S // ...........................................................*. + sub v16.4S, v19.4S, v8.4S // .....................................................*....... + add v15.4S, v19.4S, v8.4S // ......................................................*...... + ldr q26, [x4], #(6*16) // ..................................e.......................... + ldr q8, [x4, #-16] // .......................................e..................... + // gap // ............................................................. + // gap // ............................................................. + add v28.4S, v3.4S, v4.4S // .........................e................................... + sub v22.4S, v3.4S, v4.4S // ........................e.................................... + add v4.4S, v2.4S, v23.4S // ....................e........................................ + sub v2.4S, v2.4S, v23.4S // ...................e......................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1], #64 // ............................................................* + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. 
+ trn2 v6.4S, v28.4S, v22.4S // .............................e............................... + trn1 v20.4S, v28.4S, v22.4S // ............................e................................ + trn1 v14.4S, v4.4S, v2.4S // ..........................e.................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + trn2 v19.4S, v4.4S, v2.4S // ...........................e................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + trn2 v28.2D, v14.2D, v20.2D // ..............................e.............................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. - // original source code - // ldr q8, [x1, #(16*0)] // .....*....................................................... 
- // ldr q9, [x1, #(16*1)] // ....*........................................................ - // ldr q10, [x1, #(16*2)] // *............................................................ - // ldr q11, [x1, #(16*3)] // .*........................................................... - // ldr q0, [x3], #16 // ..*.......................................................... - // ldr q1, [x3], #8 // ...*......................................................... - // mul v24.4s, v10.4s, v0.s[0] // ......*...................................................... - // sqrdmulh v10.4s, v10.4s, v0.s[1] // .......*..................................................... - // mla v24.4s, v10.4s, v29.4s // ..........*.................................................. - // sub v10.4s, v8.4s, v24.4s // ............*................................................ - // add v8.4s, v8.4s, v24.4s // .............*............................................... - // mul v24.4s, v11.4s, v0.s[0] // ........*.................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[1] // .........*................................................... - // mla v24.4s, v11.4s, v29.4s // ...........*................................................. - // sub v11.4s, v9.4s, v24.4s // ..............*.............................................. - // add v9.4s, v9.4s, v24.4s // ...............*............................................. - // mul v24.4s, v9.4s, v0.s[2] // ...................*......................................... - // sqrdmulh v9.4s, v9.4s, v0.s[3] // ....................*........................................ - // mla v24.4s, v9.4s, v29.4s // .....................*....................................... - // sub v9.4s, v8.4s, v24.4s // ..........................*.................................. - // add v8.4s, v8.4s, v24.4s // ........................*.................................... 
- // mul v24.4s, v11.4s, v1.s[0] // .................*........................................... - // sqrdmulh v11.4s, v11.4s, v1.s[1] // ..................*.......................................... - // mla v24.4s, v11.4s, v29.4s // ......................*...................................... - // sub v11.4s, v10.4s, v24.4s // .......................*..................................... - // add v10.4s, v10.4s, v24.4s // .........................*................................... - // trn1 v25.4s, v8.4s, v9.4s // ...............................*............................. - // trn2 v26.4s, v8.4s, v9.4s // ..............................*.............................. - // trn1 v27.4s, v10.4s, v11.4s // .............................*............................... - // trn2 v28.4s, v10.4s, v11.4s // ............................*................................ - // trn2 v10.2d, v25.2d, v27.2d // .....................................*....................... - // trn2 v11.2d, v26.2d, v28.2d // .................................*........................... - // trn1 v8.2d, v25.2d, v27.2d // ...................................................*......... - // trn1 v9.2d, v26.2d, v28.2d // ..................................*.......................... - // ldr q0, [x4], #(6*16) // ................................*............................ - // ldr q4, [x4, #(-6*16 + 1*16)] // ...........................*................................. - // ldr q1, [x4, #(-6*16 + 2*16)] // ................*............................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ...................................*......................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ..........................................*.................. - // ldr q6, [x4, #(-6*16 + 5*16)] // ....................................*........................ - // mul v24.4s, v10.4s, v0.4s // .........................................*................... 
- // sqrdmulh v10.4s, v10.4s, v4.4s // ........................................*.................... - // mla v24.4s, v10.4s, v29.4s // ............................................*................ - // sub v10.4s, v8.4s, v24.4s // .......................................................*..... - // add v8.4s, v8.4s, v24.4s // ......................................................*...... - // mul v24.4s, v11.4s, v0.4s // .......................................*..................... - // sqrdmulh v11.4s, v11.4s, v4.4s // ......................................*...................... - // mla v24.4s, v11.4s, v29.4s // ...........................................*................. - // sub v11.4s, v9.4s, v24.4s // ..............................................*.............. - // add v9.4s, v9.4s, v24.4s // .............................................*............... - // mul v24.4s, v9.4s, v1.4s // ...............................................*............. - // sqrdmulh v9.4s, v9.4s, v5.4s // .................................................*........... - // mla v24.4s, v9.4s, v29.4s // ....................................................*........ - // sub v9.4s, v8.4s, v24.4s // ...........................................................*. - // add v8.4s, v8.4s, v24.4s // ..........................................................*.. - // mul v24.4s, v11.4s, v2.4s // ..................................................*.......... - // sqrdmulh v11.4s, v11.4s, v6.4s // ................................................*............ - // mla v24.4s, v11.4s, v29.4s // .....................................................*....... - // sub v11.4s, v10.4s, v24.4s // ........................................................*.... - // add v10.4s, v10.4s, v24.4s // .........................................................*... 
- // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ............................................................* + // -------------------------------------------------- new position ---------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // ....e.......................................................'....~.................................................. + // ldr q9, [x1, #(16*1)] // e...........................................................'~...................................................... + // ldr q10, [x1, #(16*2)] // .....e......................................................'.....~................................................. + // ldr q11, [x1, #(16*3)] // ...e........................................................'...~................................................... + // ldr q0, [x3], #16 // .e..........................................................'.~..................................................... + // ldr q1, [x3], #8 // ..................e.........................................'..................~.................................... + // sqrdmulh v27.4s, v10.4s, v0.s[1] // .....................e......................................'.....................~................................. + // mul v24.4s, v10.4s, v0.s[0] // ................e...........................................'................~...................................... + // mla v24.4s, v27.4s, v29.4s // ....................................e.......................'....................................~.................. + // sub v10.4s, v8.4s, v24.4s // ...........................................e................'...........................................~........... 
+ // add v8.4s, v8.4s, v24.4s // ..........................................e.................'..........................................~............ + // sqrdmulh v27.4s, v11.4s, v0.s[1] // ............e...............................................'............~.......................................... + // mul v24.4s, v11.4s, v0.s[0] // .............e..............................................'.............~......................................... + // mla v24.4s, v27.4s, v29.4s // .................e..........................................'.................~..................................... + // sub v11.4s, v9.4s, v24.4s // ......................e.....................................'......................~................................ + // add v9.4s, v9.4s, v24.4s // .......................e....................................'.......................~............................... + // sqrdmulh v27.4s, v9.4s, v0.s[3] // ................................e...........................'................................~...................... + // mul v24.4s, v9.4s, v0.s[2] // .................................e..........................'.................................~..................... + // mla v24.4s, v27.4s, v29.4s // .........................................e..................'.........................................~............. + // sub v9.4s, v8.4s, v24.4s // .....................................................e......'.....................................................~. + // add v8.4s, v8.4s, v24.4s // ....................................................e.......'....................................................~.. + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ..............................e.............................'..............................~........................ + // mul v24.4s, v11.4s, v1.s[0] // ...............................e............................'...............................~....................... 
+ // mla v24.4s, v27.4s, v29.4s // ........................................e...................'........................................~.............. + // sub v11.4s, v10.4s, v24.4s // ...................................................e........'...................................................~... + // add v10.4s, v10.4s, v24.4s // ..................................................e.........'..................................................~.... + // trn1 v25.4s, v8.4s, v9.4s // .........................................................e..'....................................................... + // trn2 v26.4s, v8.4s, v9.4s // ..........................................................e.'....................................................... + // trn1 v27.4s, v10.4s, v11.4s // ........................................................e...'....................................................... + // trn2 v28.4s, v10.4s, v11.4s // .......................................................e....'....................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...........................................................e'....................................................... + // trn2 v11.2d, v26.2d, v28.2d // ..~.........................................................'..*.................................................... + // trn1 v8.2d, v25.2d, v27.2d // ...................................~........................'...................................*................... + // trn1 v9.2d, v26.2d, v28.2d // ............................................................*....................................................... + // ldr q0, [ x4], #(6*16) // ................................................e...........'................................................~...... + // ldr q4, [x4, #(-6*16 + 1*16)] // ..........................e.................................'..........................~............................ 
+ // ldr q1, [ x4, #(-6*16 + 2*16)] // ..........~.................................................'..........*............................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ...............~............................................'...............*....................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // .............................e..............................'.............................~......................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .................................................e..........'.................................................~..... + // sqrdmulh v27.4s, v10.4s, v4.4s // ......~.....................................................'......*................................................ + // mul v24.4s, v10.4s, v0.4s // .......~....................................................'.......*............................................... + // mla v24.4s, v27.4s, v29.4s // ...........~................................................'...........*........................................... + // sub v10.4s, v8.4s, v24.4s // ......................................~.....................'......................................*................ + // add v8.4s, v8.4s, v24.4s // .......................................~....................'.......................................*............... + // sqrdmulh v27.4s, v11.4s, v4.4s // ........~...................................................'........*.............................................. + // mul v24.4s, v11.4s, v0.4s // .........~..................................................'.........*............................................. + // mla v24.4s, v27.4s, v29.4s // ..............~.............................................'..............*........................................ + // sub v11.4s, v9.4s, v24.4s // ...................~........................................'...................*................................... 
+ // add v9.4s, v9.4s, v24.4s // ....................~.......................................'....................*.................................. + // sqrdmulh v27.4s, v9.4s, v5.4s // ...........................~................................'...........................*........................... + // mul v24.4s, v9.4s, v1.4s // ............................~...............................'............................*.......................... + // mla v24.4s, v27.4s, v29.4s // .....................................~......................'.....................................*................. + // sub v9.4s, v8.4s, v24.4s // ..............................................~.............'..............................................*........ + // add v8.4s, v8.4s, v24.4s // ...............................................~............'...............................................*....... + // sqrdmulh v27.4s, v11.4s, v6.4s // ........................~...................................'........................*.............................. + // mul v24.4s, v11.4s, v2.4s // .........................~..................................'.........................*............................. + // mla v24.4s, v27.4s, v29.4s // ..................................~.........................'..................................*.................... + // sub v11.4s, v10.4s, v24.4s // ............................................~...............'............................................*.......... + // add v10.4s, v10.4s, v24.4s // .............................................~..............'.............................................*......... 
+ // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................~.....'......................................................* sub count, count, #1 cbnz count, layer5678_start + // Instructions: 26 + // Expected cycles: 22 + // Expected IPC: 1.18 + // + // Wall time: 0.31s + // User time: 0.31s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + trn1 v0.2D, v19.2D, v6.2D // *............................. + trn2 v6.2D, v19.2D, v6.2D // .*............................ + mul v4.4S, v28.4S, v26.4S // ...*.......................... + ldr q11, [x4, #-48] // .........*.................... + sqrdmulh v19.4S, v28.4S, v30.4S // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + trn1 v20.2D, v14.2D, v20.2D // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v16.4S, v6.4S, v30.4S // ....*......................... + mul v5.4S, v6.4S, v26.4S // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v4.4S, v19.4S, v29.4S // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v5.4S, v16.4S, v29.4S // ........*..................... + ldr q28, [x4, #-64] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v6.4S, v20.4S, v4.4S // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v13.4S, v0.4S, v5.4S // ..........*................... + add v16.4S, v0.4S, v5.4S // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v22.4S, v13.4S, v8.4S // ............*................. + mul v26.4S, v13.4S, v9.4S // .............*................ + sqrdmulh v19.4S, v16.4S, v11.4S // ..............*............... + mul v8.4S, v16.4S, v28.4S // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v26.4S, v22.4S, v29.4S // ................*............. + mla v8.4S, v19.4S, v29.4S // ..................*........... + sub v19.4S, v20.4S, v4.4S // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v16.4S, v19.4S, v26.4S // .....................*........ + add v15.4S, v19.4S, v26.4S // ......................*....... + sub v14.4S, v6.4S, v8.4S // .......................*...... + add v13.4S, v6.4S, v8.4S // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1], #64 // .........................*.... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // trn1 v13.2D, v19.2D, v6.2D // *.............................. + // trn2 v6.2D, v19.2D, v6.2D // .*............................. + // sqrdmulh v19.4S, v28.4S, v30.4S // ....*.......................... + // mul v2.4S, v28.4S, v26.4S // ..*............................ + // sqrdmulh v16.4S, v6.4S, v30.4S // ......*........................ + // mul v6.4S, v6.4S, v26.4S // .......*....................... + // ldr q4, [x4, #-64] // ..........*.................... + // mla v2.4S, v19.4S, v29.4S // ........*...................... + // mla v6.4S, v16.4S, v29.4S // .........*..................... + // ldr q16, [x4, #-48] // ...*........................... + // sub v17.4S, v13.4S, v6.4S // ............*.................. + // add v11.4S, v13.4S, v6.4S // .............*................. + // sqrdmulh v19.4S, v17.4S, v8.4S // ..............*................ + // mul v6.4S, v17.4S, v9.4S // ...............*............... + // sqrdmulh v26.4S, v11.4S, v16.4S // ................*.............. + // mul v8.4S, v11.4S, v4.4S // .................*............. + // mla v6.4S, v19.4S, v29.4S // ..................*............ + // trn1 v19.2D, v14.2D, v20.2D // .....*......................... + // mla v8.4S, v26.4S, v29.4S // ...................*........... 
+ // sub v26.4S, v19.4S, v2.4S // ....................*.......... + // add v19.4S, v19.4S, v2.4S // ...........*................... + // sub v18.4S, v26.4S, v6.4S // .....................*......... + // add v17.4S, v26.4S, v6.4S // ......................*........ + // sub v16.4S, v19.4S, v8.4S // .......................*....... + // add v15.4S, v19.4S, v8.4S // ........................*...... + // st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1], #64 // .........................*..... + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_icestorm.s index cce759fb..5a4d5d1b 100644 --- a/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_opt_m1_icestorm.s @@ -26,27 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -68,15 +47,15 @@ .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -85,12 +64,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -98,31 +71,31 @@ .endm .macro load_roots_1234 - ldr_vi root0, r_ptr0, (8*16) - ldr_vo root1, r_ptr0, (-8*16 + 1*16) - ldr_vo root2, r_ptr0, (-8*16 + 2*16) - ldr_vo root3, r_ptr0, (-8*16 + 3*16) - ldr_vo root4, r_ptr0, (-8*16 + 4*16) - ldr_vo root5, r_ptr0, (-8*16 + 5*16) - ldr_vo root6, r_ptr0, (-8*16 + 6*16) - ldr_vo root7, r_ptr0, (-8*16 + 7*16) + ldr qform_root0, [r_ptr0], #(8*16) + ldr qform_root1, [r_ptr0, #(-8*16 + 1*16)] + ldr qform_root2, [r_ptr0, #(-8*16 + 2*16)] + ldr qform_root3, [r_ptr0, #(-8*16 + 3*16)] + ldr qform_root4, [r_ptr0, #(-8*16 + 4*16)] + ldr qform_root5, [r_ptr0, #(-8*16 + 5*16)] + ldr qform_root6, [r_ptr0, #(-8*16 + 6*16)] + ldr qform_root7, [r_ptr0, #(-8*16 + 7*16)] .endm .macro load_next_roots_56 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 
.endm .macro load_next_roots_6 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 8 + ldr qform_\root0, [\r_ptr0], #8 .endm .macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -137,7 +110,7 @@ trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -148,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -158,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -166,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -177,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm 
-.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -323,589 +296,598 @@ _ntt_dilithium_1234_5678_opt_m1_icestorm: load_roots_1234 .p2align 2 - ldr q9, [x0, #896] // ..............*................................................................................................................................................................................. - ldr q10, [x0, #704] // ...........*.................................................................................................................................................................................... - ldr q24, [x0, #128] // ..*............................................................................................................................................................................................. - ldr q16, [x0, #768] // ............*................................................................................................................................................................................... - ldr q12, [x0, #576] // .........*...................................................................................................................................................................................... - ldr q26, [x0, #192] // ...*............................................................................................................................................................................................ - ldr q21, [x0, #384] // ......*......................................................................................................................................................................................... - ldr q30, [x0, #832] // .............*.................................................................................................................................................................................. 
- mul v14.4S, v9.4S, v0.S[0] // ..............................................*................................................................................................................................................. - ldr q15, [x0, #512] // ........*....................................................................................................................................................................................... - mul v31.4S, v10.4S, v0.S[0] // ...............................*................................................................................................................................................................ - sqrdmulh v19.4S, v16.4S, v0.S[1] // .....................................*.......................................................................................................................................................... - sqrdmulh v27.4S, v9.4S, v0.S[1] // ...............................................*................................................................................................................................................ - mul v25.4S, v12.4S, v0.S[0] // .....................*.......................................................................................................................................................................... - mul v18.4S, v16.4S, v0.S[0] // ....................................*........................................................................................................................................................... - ldr q22, [x0, #960] // ...............*................................................................................................................................................................................ 
- ldr q9, [x0, #256] // ....*........................................................................................................................................................................................... - mul v8.4S, v30.4S, v0.S[0] // .........................................*...................................................................................................................................................... - sqrdmulh v20.4S, v30.4S, v0.S[1] // ..........................................*..................................................................................................................................................... - ldr q23, [x0, #64] // .*.............................................................................................................................................................................................. - sqrdmulh v16.4S, v12.4S, v0.S[1] // ......................*......................................................................................................................................................................... - ldr q30, [x0, #640] // ..........*..................................................................................................................................................................................... - mla v14.4S, v27.4S, v29.4S // ................................................*............................................................................................................................................... - mla v8.4S, v20.4S, v29.4S // ...........................................*.................................................................................................................................................... 
- mla v18.4S, v19.4S, v29.4S // ......................................*......................................................................................................................................................... - ldr q12, [x0, #320] // .....*.......................................................................................................................................................................................... - mul v19.4S, v15.4S, v0.S[0] // ................*............................................................................................................................................................................... - sqrdmulh v28.4S, v15.4S, v0.S[1] // .................*.............................................................................................................................................................................. - mla v25.4S, v16.4S, v29.4S // .......................*........................................................................................................................................................................ - add v13.4S, v21.4S, v14.4S // ..................................................*............................................................................................................................................. - sqrdmulh v11.4S, v30.4S, v0.S[1] // ...........................*.................................................................................................................................................................... - mul v30.4S, v30.4S, v0.S[0] // ..........................*..................................................................................................................................................................... 
- sqrdmulh v15.4S, v13.4S, v0.S[3] // ...................................................................*............................................................................................................................ - sub v14.4S, v21.4S, v14.4S // .................................................*.............................................................................................................................................. - mul v17.4S, v13.4S, v0.S[2] // ..................................................................*............................................................................................................................. - mla v19.4S, v28.4S, v29.4S // ..................*............................................................................................................................................................................. - mla v30.4S, v11.4S, v29.4S // ............................*................................................................................................................................................................... - mul v11.4S, v14.4S, v1.S[0] // ......................................................................................*......................................................................................................... - sqrdmulh v21.4S, v22.4S, v0.S[1] // ....................................................*........................................................................................................................................... - mul v27.4S, v22.4S, v0.S[0] // ...................................................*............................................................................................................................................ 
- ldr q28, [x0, #448] // .......*........................................................................................................................................................................................ - sqrdmulh v22.4S, v14.4S, v1.S[1] // .......................................................................................*........................................................................................................ - add v13.4S, v12.4S, v8.4S // .............................................*.................................................................................................................................................. - mla v17.4S, v15.4S, v29.4S // ....................................................................*........................................................................................................................... - mla v27.4S, v21.4S, v29.4S // .....................................................*.......................................................................................................................................... - sqrdmulh v14.4S, v10.4S, v0.S[1] // ................................*............................................................................................................................................................... - mul v16.4S, v13.4S, v0.S[2] // .............................................................*.................................................................................................................................. - mla v11.4S, v22.4S, v29.4S // ........................................................................................*....................................................................................................... 
- ldr q15, [x0, #0] // *............................................................................................................................................................................................... - sub v10.4S, v12.4S, v8.4S // ............................................*................................................................................................................................................... - add v12.4S, v9.4S, v18.4S // ........................................*....................................................................................................................................................... - add v21.4S, v28.4S, v27.4S // .......................................................*........................................................................................................................................ - sub v20.4S, v28.4S, v27.4S // ......................................................*......................................................................................................................................... - mla v31.4S, v14.4S, v29.4S // .................................*.............................................................................................................................................................. - add v8.4S, v23.4S, v25.4S // .........................*...................................................................................................................................................................... - sqrdmulh v28.4S, v21.4S, v0.S[3] // ........................................................................*....................................................................................................................... 
- mul v21.4S, v21.4S, v0.S[2] // .......................................................................*........................................................................................................................ - add v27.4S, v15.4S, v19.4S // ....................*........................................................................................................................................................................... - sqrdmulh v22.4S, v12.4S, v0.S[3] // .........................................................*...................................................................................................................................... - add v14.4S, v24.4S, v30.4S // ..............................*................................................................................................................................................................. - sub v18.4S, v9.4S, v18.4S // .......................................*........................................................................................................................................................ - mla v21.4S, v28.4S, v29.4S // .........................................................................*...................................................................................................................... - mul v12.4S, v12.4S, v0.S[2] // ........................................................*....................................................................................................................................... - add v9.4S, v14.4S, v17.4S // ......................................................................*......................................................................................................................... 
- sub v23.4S, v23.4S, v25.4S // ........................*....................................................................................................................................................................... - sub v28.4S, v14.4S, v17.4S // .....................................................................*.......................................................................................................................... - mul v25.4S, v9.4S, v1.S[2] // ................................................................................................*............................................................................................... - sqrdmulh v9.4S, v9.4S, v1.S[3] // .................................................................................................*.............................................................................................. - sqrdmulh v14.4S, v13.4S, v0.S[3] // ..............................................................*................................................................................................................................. - mla v12.4S, v22.4S, v29.4S // ..........................................................*..................................................................................................................................... - mul v13.4S, v20.4S, v1.S[0] // ...........................................................................................*.................................................................................................... - sqrdmulh v17.4S, v20.4S, v1.S[1] // ............................................................................................*................................................................................................... 
- sub v20.4S, v15.4S, v19.4S // ...................*............................................................................................................................................................................ - mla v25.4S, v9.4S, v29.4S // ..................................................................................................*............................................................................................. - sub v24.4S, v24.4S, v30.4S // .............................*.................................................................................................................................................................. - add v30.4S, v27.4S, v12.4S // ............................................................*................................................................................................................................... - mla v13.4S, v17.4S, v29.4S // .............................................................................................*.................................................................................................. - sub v19.4S, v26.4S, v31.4S // ..................................*............................................................................................................................................................. - add v15.4S, v26.4S, v31.4S // ...................................*............................................................................................................................................................ - mul v26.4S, v28.4S, v2.S[0] // ..........................................................................................................*..................................................................................... 
- mla v16.4S, v14.4S, v29.4S // ...............................................................*................................................................................................................................ - add v9.4S, v19.4S, v13.4S // ...............................................................................................*................................................................................................ - sub v31.4S, v15.4S, v21.4S // ..........................................................................*..................................................................................................................... - sqrdmulh v22.4S, v28.4S, v2.S[1] // ...........................................................................................................*.................................................................................... - add v15.4S, v15.4S, v21.4S // ...........................................................................*.................................................................................................................... - mul v21.4S, v31.4S, v2.S[0] // ...............................................................................................................*................................................................................ - sqrdmulh v17.4S, v31.4S, v2.S[1] // ................................................................................................................*............................................................................... - mul v28.4S, v15.4S, v1.S[2] // .....................................................................................................*.......................................................................................... 
- sqrdmulh v15.4S, v15.4S, v1.S[3] // ......................................................................................................*......................................................................................... - mla v26.4S, v22.4S, v29.4S // ............................................................................................................*................................................................................... - sub v31.4S, v27.4S, v12.4S // ...........................................................*.................................................................................................................................... - add v14.4S, v30.4S, v25.4S // ....................................................................................................*........................................................................................... - mla v21.4S, v17.4S, v29.4S // .................................................................................................................*.............................................................................. - sub v27.4S, v8.4S, v16.4S // ................................................................*............................................................................................................................... - mla v28.4S, v15.4S, v29.4S // .......................................................................................................*........................................................................................ - sub v17.4S, v31.4S, v26.4S // .............................................................................................................*.................................................................................. 
+ ldr q24, [x0, #512] // ........*....................................................................................................................................................................................... + ldr q15, [x0, #832] // .............*.................................................................................................................................................................................. + ldr q10, [x0, #192] // ...*............................................................................................................................................................................................ + ldr q19, [x0, #768] // ............*................................................................................................................................................................................... + ldr q25, [x0, #64] // .*.............................................................................................................................................................................................. + ldr q21, [x0, #960] // ...............*................................................................................................................................................................................ + ldr q27, [x0, #704] // ...........*.................................................................................................................................................................................... + ldr q22, [x0, #896] // ..............*................................................................................................................................................................................. + ldr q14, [x0, #448] // .......*........................................................................................................................................................................................ 
+ ldr q23, [x0, #576] // .........*...................................................................................................................................................................................... + sqrdmulh v20.4S, v15.4S, v0.S[1] // .........................................*...................................................................................................................................................... + mul v30.4S, v15.4S, v0.S[0] // ..........................................*..................................................................................................................................................... + mul v17.4S, v19.4S, v0.S[0] // .....................................*.......................................................................................................................................................... + ldr q11, [x0, #640] // ..........*..................................................................................................................................................................................... + sqrdmulh v31.4S, v19.4S, v0.S[1] // ....................................*........................................................................................................................................................... + sqrdmulh v16.4S, v24.4S, v0.S[1] // ................*............................................................................................................................................................................... + sqrdmulh v19.4S, v21.4S, v0.S[1] // ...................................................*............................................................................................................................................ 
+ ldr q9, [x0, #320] // .....*.......................................................................................................................................................................................... + mla v30.4S, v20.4S, v29.4S // ...........................................*.................................................................................................................................................... + ldr q13, [x0, #256] // ....*........................................................................................................................................................................................... + mul v8.4S, v22.4S, v0.S[0] // ...............................................*................................................................................................................................................ + sub v15.4S, v9.4S, v30.4S // ............................................*................................................................................................................................................... + mul v26.4S, v21.4S, v0.S[0] // ....................................................*........................................................................................................................................... + sqrdmulh v21.4S, v11.4S, v0.S[1] // ..........................*..................................................................................................................................................................... + mul v28.4S, v27.4S, v0.S[0] // ................................*............................................................................................................................................................... 
+ mul v18.4S, v15.4S, v1.S[0] // ..................................................................................*............................................................................................................. + add v20.4S, v9.4S, v30.4S // .............................................*.................................................................................................................................................. + ldr q12, [x0, #0] // *............................................................................................................................................................................................... + mul v9.4S, v24.4S, v0.S[0] // .................*.............................................................................................................................................................................. + sqrdmulh v24.4S, v15.4S, v1.S[1] // .................................................................................*.............................................................................................................. + mla v26.4S, v19.4S, v29.4S // .....................................................*.......................................................................................................................................... + sqrdmulh v15.4S, v20.4S, v0.S[3] // .............................................................*.................................................................................................................................. + mul v30.4S, v20.4S, v0.S[2] // ..............................................................*................................................................................................................................. 
+ mla v9.4S, v16.4S, v29.4S // ..................*............................................................................................................................................................................. + sqrdmulh v19.4S, v22.4S, v0.S[1] // ..............................................*................................................................................................................................................. + mla v17.4S, v31.4S, v29.4S // ......................................*......................................................................................................................................................... + ldr q31, [x0, #384] // ......*......................................................................................................................................................................................... + mla v18.4S, v24.4S, v29.4S // ...................................................................................*............................................................................................................ + add v20.4S, v14.4S, v26.4S // .......................................................*........................................................................................................................................ + sqrdmulh v22.4S, v27.4S, v0.S[1] // ...............................*................................................................................................................................................................ + sqrdmulh v27.4S, v23.4S, v0.S[1] // .....................*.......................................................................................................................................................................... 
+ mul v16.4S, v23.4S, v0.S[0] // ......................*......................................................................................................................................................................... + sqrdmulh v24.4S, v20.4S, v0.S[3] // .......................................................................*........................................................................................................................ + mul v20.4S, v20.4S, v0.S[2] // ........................................................................*....................................................................................................................... + mla v28.4S, v22.4S, v29.4S // .................................*.............................................................................................................................................................. + mla v8.4S, v19.4S, v29.4S // ................................................*............................................................................................................................................... + add v22.4S, v13.4S, v17.4S // ........................................*....................................................................................................................................................... + sub v14.4S, v14.4S, v26.4S // ......................................................*......................................................................................................................................... + mla v20.4S, v24.4S, v29.4S // .........................................................................*...................................................................................................................... 
+ mla v16.4S, v27.4S, v29.4S // .......................*........................................................................................................................................................................ + add v19.4S, v10.4S, v28.4S // ...................................*............................................................................................................................................................ + mul v27.4S, v11.4S, v0.S[0] // ...........................*.................................................................................................................................................................... + mul v26.4S, v22.4S, v0.S[2] // .........................................................*...................................................................................................................................... + add v23.4S, v31.4S, v8.4S // ..................................................*............................................................................................................................................. + sub v24.4S, v31.4S, v8.4S // .................................................*.............................................................................................................................................. + sub v31.4S, v19.4S, v20.4S // ..........................................................................*..................................................................................................................... + sub v11.4S, v25.4S, v16.4S // ........................*....................................................................................................................................................................... 
+ mla v27.4S, v21.4S, v29.4S // ............................*................................................................................................................................................................... + mul v21.4S, v14.4S, v1.S[0] // ............................................................................................*................................................................................................... + sub v28.4S, v10.4S, v28.4S // ..................................*............................................................................................................................................................. + sub v10.4S, v13.4S, v17.4S // .......................................*........................................................................................................................................................ + sqrdmulh v8.4S, v14.4S, v1.S[1] // ...........................................................................................*.................................................................................................... + mul v14.4S, v24.4S, v1.S[0] // .......................................................................................*........................................................................................................ + sqrdmulh v17.4S, v24.4S, v1.S[1] // ......................................................................................*......................................................................................................... + sqrdmulh v13.4S, v22.4S, v0.S[3] // ........................................................*....................................................................................................................................... 
+ ldr q22, [x0, #128] // ..*............................................................................................................................................................................................. + sqrdmulh v24.4S, v10.4S, v1.S[1] // ............................................................................*................................................................................................................... + mla v30.4S, v15.4S, v29.4S // ...............................................................*................................................................................................................................ + mla v21.4S, v8.4S, v29.4S // .............................................................................................*.................................................................................................. + mla v14.4S, v17.4S, v29.4S // ........................................................................................*....................................................................................................... + add v17.4S, v19.4S, v20.4S // ...........................................................................*.................................................................................................................... + mla v26.4S, v13.4S, v29.4S // ..........................................................*..................................................................................................................................... + sqrdmulh v15.4S, v17.4S, v1.S[3] // .....................................................................................................*.......................................................................................... 
+ sub v13.4S, v22.4S, v27.4S // .............................*.................................................................................................................................................................. + add v8.4S, v22.4S, v27.4S // ..............................*................................................................................................................................................................. + add v27.4S, v12.4S, v9.4S // ....................*........................................................................................................................................................................... + sub v19.4S, v28.4S, v21.4S // ..............................................................................................*................................................................................................. + mul v20.4S, v17.4S, v1.S[2] // ......................................................................................................*......................................................................................... + add v21.4S, v28.4S, v21.4S // ...............................................................................................*................................................................................................ + sub v28.4S, v11.4S, v18.4S // ....................................................................................*........................................................................................................... + mul v22.4S, v23.4S, v0.S[2] // ...................................................................*............................................................................................................................ 
+ sqrdmulh v17.4S, v21.4S, v2.S[3] // .........................................................................................................................*...................................................................... + mul v21.4S, v21.4S, v2.S[2] // ..........................................................................................................................*..................................................................... + add v18.4S, v11.4S, v18.4S // .....................................................................................*.......................................................................................................... + sqrdmulh v11.4S, v23.4S, v0.S[3] // ..................................................................*............................................................................................................................. + add v23.4S, v13.4S, v14.4S // ..........................................................................................*..................................................................................................... + mul v10.4S, v10.4S, v1.S[0] // .............................................................................*.................................................................................................................. + mla v21.4S, v17.4S, v29.4S // ...........................................................................................................................*.................................................................... + sub v9.4S, v12.4S, v9.4S // ...................*............................................................................................................................................................................ 
+ sub v13.4S, v13.4S, v14.4S // .........................................................................................*...................................................................................................... + mla v22.4S, v11.4S, v29.4S // ....................................................................*........................................................................................................................... + mla v10.4S, v24.4S, v29.4S // ..............................................................................*................................................................................................................. + mul v14.4S, v23.4S, v2.S[2] // .....................................................................................................................*.......................................................................... + sub v17.4S, v18.4S, v21.4S // ............................................................................................................................*................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[3] // ....................................................................................................................*........................................................................... + sub v11.4S, v8.4S, v22.4S // .....................................................................*.......................................................................................................................... sub count, count, #1 layer1234_start: - mul v22.4S, v10.4S, v1.S[0] // .................................................................................*.............................................................................................................. 
- add v12.4S, v27.4S, v21.4S // ...................................................................................................................*............................................................................ - sub v15.4S, v19.4S, v13.4S // ..............................................................................................*................................................................................................. - sqrdmulh v13.4S, v10.4S, v1.S[1] // ..................................................................................*............................................................................................................. - mul v10.4S, v12.4S, v4.S[2] // ..................................................................................................................................................*............................................. - add v8.4S, v8.4S, v16.4S // .................................................................*.............................................................................................................................. - mul v16.4S, v15.4S, v3.S[0] // ...................................................................................................................................*............................................................ - sqrdmulh v19.4S, v15.4S, v3.S[1] // ....................................................................................................................................*........................................................... - mla v22.4S, v13.4S, v29.4S // ...................................................................................*............................................................................................................ 
- add v13.4S, v8.4S, v28.4S // .........................................................................................................*...................................................................................... - sqrdmulh v12.4S, v12.4S, v4.S[3] // ...................................................................................................................................................*............................................ - sub v15.4S, v27.4S, v21.4S // ..................................................................................................................*............................................................................. - sqrdmulh v27.4S, v13.4S, v3.S[3] // .........................................................................................................................................*...................................................... - mla v16.4S, v19.4S, v29.4S // .....................................................................................................................................*.......................................................... - mul v21.4S, v13.4S, v3.S[2] // ........................................................................................................................................*....................................................... - sub v13.4S, v23.4S, v22.4S // ....................................................................................*........................................................................................................... - mla v10.4S, v12.4S, v29.4S // ....................................................................................................................................................*........................................... 
- sub v25.4S, v30.4S, v25.4S // ...................................................................................................*............................................................................................ - sub v19.4S, v13.4S, v16.4S // ......................................................................................................................................*......................................................... - mul v12.4S, v15.4S, v5.S[0] // .......................................................................................................................................................*........................................ - sqrdmulh v30.4S, v15.4S, v5.S[1] // ........................................................................................................................................................*....................................... - mla v21.4S, v27.4S, v29.4S // ..........................................................................................................................................*..................................................... - add v15.4S, v13.4S, v16.4S // .......................................................................................................................................*........................................................ - mul v13.4S, v18.4S, v1.S[0] // ............................................................................*................................................................................................................... - sub v16.4S, v8.4S, v28.4S // ........................................................................................................*....................................................................................... 
- add v26.4S, v31.4S, v26.4S // ..............................................................................................................*................................................................................. - mla v12.4S, v30.4S, v29.4S // .........................................................................................................................................................*...................................... - ldr q8, [x0, #592] // .........e...................................................................................................................................................................................... - sqrdmulh v30.4S, v16.4S, v4.S[1] // ..............................................................................................................................................*................................................. - sqrdmulh v27.4S, v18.4S, v1.S[1] // .............................................................................*.................................................................................................................. - ldr q31, [x0, #720] // ...........e.................................................................................................................................................................................... - sub v18.4S, v26.4S, v10.4S // .....................................................................................................................................................*.......................................... - mul v28.4S, v16.4S, v4.S[0] // .............................................................................................................................................*.................................................. 
- add v16.4S, v17.4S, v12.4S // ...........................................................................................................................................................*.................................... - sub v12.4S, v17.4S, v12.4S // ..........................................................................................................................................................*..................................... - mul v17.4S, v8.4S, v0.S[0] // .....................e.......................................................................................................................................................................... - mla v13.4S, v27.4S, v29.4S // ..............................................................................*................................................................................................................. - str q18, [x0, #320] // .....................................................................................................................................................................................*.......... - add v27.4S, v14.4S, v21.4S // ............................................................................................................................................*................................................... - str q16, [x0, #384] // ......................................................................................................................................................................................*......... - sub v18.4S, v24.4S, v11.4S // .........................................................................................*...................................................................................................... 
- ldr q16, [x0, #912] // ..............e................................................................................................................................................................................. - add v10.4S, v26.4S, v10.4S // ......................................................................................................................................................*......................................... - sub v21.4S, v14.4S, v21.4S // ...........................................................................................................................................*.................................................... - ldr q26, [x0, #400] // ......e......................................................................................................................................................................................... - str q12, [x0, #448] // .......................................................................................................................................................................................*........ - mul v12.4S, v18.4S, v3.S[0] // ..............................................................................................................................*................................................................. - str q27, [x0], #(16) // ................................................................................................................................................................................*............... - sqrdmulh v18.4S, v18.4S, v3.S[1] // ...............................................................................................................................*................................................................ 
- mul v14.4S, v19.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - str q10, [x0, #240] // ....................................................................................................................................................................................*........... - sqrdmulh v19.4S, v19.4S, v7.S[1] // ............................................................................................................................................................................*................... - mul v27.4S, v16.4S, v0.S[0] // ..............................................e................................................................................................................................................. - sqrdmulh v10.4S, v16.4S, v0.S[1] // ...............................................e................................................................................................................................................ - str q21, [x0, #48] // .................................................................................................................................................................................*.............. - mul v21.4S, v31.4S, v0.S[0] // ...............................e................................................................................................................................................................ - mla v12.4S, v18.4S, v29.4S // ................................................................................................................................*............................................................... 
- sqrdmulh v18.4S, v31.4S, v0.S[1] // ................................e............................................................................................................................................................... - add v11.4S, v24.4S, v11.4S // ..........................................................................................*..................................................................................................... - ldr q31, [x0, #960] // ...............e................................................................................................................................................................................ - sqrdmulh v24.4S, v8.4S, v0.S[1] // ......................e......................................................................................................................................................................... - mul v8.4S, v9.4S, v2.S[2] // .........................................................................................................................*...................................................................... - sqrdmulh v9.4S, v9.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - mla v27.4S, v10.4S, v29.4S // ................................................e............................................................................................................................................... - add v22.4S, v23.4S, v22.4S // .....................................................................................*.......................................................................................................... 
- add v23.4S, v20.4S, v13.4S // ................................................................................*............................................................................................................... - mul v16.4S, v11.4S, v2.S[2] // ....................................................................................................................*........................................................................... - sub v20.4S, v20.4S, v13.4S // ...............................................................................*................................................................................................................ - mla v8.4S, v9.4S, v29.4S // ...........................................................................................................................*.................................................................... - sqrdmulh v9.4S, v11.4S, v2.S[3] // .....................................................................................................................*.......................................................................... - mla v14.4S, v19.4S, v29.4S // .............................................................................................................................................................................*.................. - sub v11.4S, v20.4S, v12.4S // .................................................................................................................................*.............................................................. - mul v13.4S, v31.4S, v0.S[0] // ...................................................e............................................................................................................................................ 
- sub v19.4S, v22.4S, v8.4S // ............................................................................................................................*................................................................... - add v8.4S, v22.4S, v8.4S // .............................................................................................................................*.................................................................. - mla v16.4S, v9.4S, v29.4S // ......................................................................................................................*......................................................................... - sqrdmulh v10.4S, v31.4S, v0.S[1] // ....................................................e........................................................................................................................................... - add v31.4S, v20.4S, v12.4S // ..................................................................................................................................*............................................................. - ldr q9, [x0, #832] // .............e.................................................................................................................................................................................. - ldr q22, [x0, #192] // ...e............................................................................................................................................................................................ - sqrdmulh v20.4S, v15.4S, v6.S[3] // .......................................................................................................................................................................*........................ 
- mul v15.4S, v15.4S, v6.S[2] // ......................................................................................................................................................................*......................... - mla v21.4S, v18.4S, v29.4S // .................................e.............................................................................................................................................................. - mul v18.4S, v19.4S, v6.S[0] // .................................................................................................................................................................*.............................. - sqrdmulh v12.4S, v19.4S, v6.S[1] // ..................................................................................................................................................................*............................. - mla v17.4S, v24.4S, v29.4S // .......................e........................................................................................................................................................................ - ldr q24, [x0, #640] // ..........e..................................................................................................................................................................................... - mla v28.4S, v30.4S, v29.4S // ...............................................................................................................................................*................................................ - sqrdmulh v30.4S, v9.4S, v0.S[1] // ..........................................e..................................................................................................................................................... 
- sub v19.4S, v22.4S, v21.4S // ..................................e............................................................................................................................................................. - mla v18.4S, v12.4S, v29.4S // ...................................................................................................................................................................*............................ - mla v15.4S, v20.4S, v29.4S // ........................................................................................................................................................................*....................... - add v12.4S, v25.4S, v28.4S // .................................................................................................................................................*.............................................. - sub v20.4S, v25.4S, v28.4S // ................................................................................................................................................*............................................... - sub v25.4S, v23.4S, v16.4S // .......................................................................................................................*........................................................................ - add v23.4S, v23.4S, v16.4S // ........................................................................................................................*....................................................................... - str q20, [x0, #176] // ...................................................................................................................................................................................*............ 
- add v20.4S, v26.4S, v27.4S // ..................................................e............................................................................................................................................. - add v16.4S, v11.4S, v14.4S // ...............................................................................................................................................................................*................ - str q12, [x0, #112] // ..................................................................................................................................................................................*............. - ldr q12, [x0, #768] // ............e................................................................................................................................................................................... - sub v28.4S, v11.4S, v14.4S // ..............................................................................................................................................................................*................. - sub v11.4S, v31.4S, v15.4S // .........................................................................................................................................................................*...................... - str q16, [x0, #880] // ..............................................................................................................................................................................................*. - sqrdmulh v14.4S, v8.4S, v5.S[3] // .............................................................................................................................................................*.................................. 
- sqrdmulh v16.4S, v20.4S, v0.S[3] // ...................................................................e............................................................................................................................ - str q11, [x0, #816] // .............................................................................................................................................................................................*.. - mul v20.4S, v20.4S, v0.S[2] // ..................................................................e............................................................................................................................. - mla v13.4S, v10.4S, v29.4S // .....................................................e.......................................................................................................................................... - ldr q11, [x0, #512] // ........e....................................................................................................................................................................................... - mul v10.4S, v9.4S, v0.S[0] // .........................................e...................................................................................................................................................... - mul v9.4S, v8.4S, v5.S[2] // ............................................................................................................................................................*................................... 
- str q28, [x0, #944] // ...............................................................................................................................................................................................* - ldr q8, [x0, #448] // .......e........................................................................................................................................................................................ - sub v28.4S, v26.4S, v27.4S // .................................................e.............................................................................................................................................. - sqrdmulh v26.4S, v12.4S, v0.S[1] // .....................................e.......................................................................................................................................................... - add v27.4S, v22.4S, v21.4S // ...................................e............................................................................................................................................................ - mul v22.4S, v12.4S, v0.S[0] // ....................................e........................................................................................................................................................... - add v12.4S, v31.4S, v15.4S // ..........................................................................................................................................................................*..................... - sqrdmulh v21.4S, v11.4S, v0.S[1] // .................e.............................................................................................................................................................................. 
- add v15.4S, v8.4S, v13.4S // .......................................................e........................................................................................................................................ - sqrdmulh v31.4S, v24.4S, v0.S[1] // ...........................e.................................................................................................................................................................... - sub v13.4S, v8.4S, v13.4S // ......................................................e......................................................................................................................................... - mul v8.4S, v24.4S, v0.S[0] // ..........................e..................................................................................................................................................................... - str q12, [x0, #752] // ............................................................................................................................................................................................*... - ldr q24, [x0, #128] // ..e............................................................................................................................................................................................. - mla v9.4S, v14.4S, v29.4S // ..............................................................................................................................................................*................................. - mul v12.4S, v15.4S, v0.S[2] // .......................................................................e........................................................................................................................ 
- sqrdmulh v14.4S, v15.4S, v0.S[3] // ........................................................................e....................................................................................................................... - mul v15.4S, v11.4S, v0.S[0] // ................e............................................................................................................................................................................... - mla v8.4S, v31.4S, v29.4S // ............................e................................................................................................................................................................... - mla v20.4S, v16.4S, v29.4S // ....................................................................e........................................................................................................................... - sub v31.4S, v23.4S, v9.4S // ...............................................................................................................................................................*................................ - sqrdmulh v11.4S, v13.4S, v1.S[1] // ............................................................................................e................................................................................................... - mla v15.4S, v21.4S, v29.4S // ..................e............................................................................................................................................................................. - add v9.4S, v23.4S, v9.4S // ................................................................................................................................................................*............................... 
- ldr q23, [x0, #64] // .e.............................................................................................................................................................................................. - str q31, [x0, #560] // .........................................................................................................................................................................................*...... - add v16.4S, v24.4S, v8.4S // ..............................e................................................................................................................................................................. - mla v12.4S, v14.4S, v29.4S // .........................................................................e...................................................................................................................... - str q9, [x0, #496] // ........................................................................................................................................................................................*....... - sqrdmulh v21.4S, v28.4S, v1.S[1] // .......................................................................................e........................................................................................................ - mul v13.4S, v13.4S, v1.S[0] // ...........................................................................................e.................................................................................................... - sub v31.4S, v16.4S, v20.4S // .....................................................................e.......................................................................................................................... 
- add v14.4S, v25.4S, v18.4S // .....................................................................................................................................................................*.......................... - ldr q9, [x0, #256] // ....e........................................................................................................................................................................................... - sub v25.4S, v25.4S, v18.4S // ....................................................................................................................................................................*........................... - mla v22.4S, v26.4S, v29.4S // ......................................e......................................................................................................................................................... - mla v13.4S, v11.4S, v29.4S // .............................................................................................e.................................................................................................. - mul v11.4S, v28.4S, v1.S[0] // ......................................................................................e......................................................................................................... - str q14, [x0, #624] // ..........................................................................................................................................................................................*..... - mul v26.4S, v31.4S, v2.S[0] // ..........................................................................................................e..................................................................................... 
- ldr q14, [x0, #320] // .....e.......................................................................................................................................................................................... - sqrdmulh v31.4S, v31.4S, v2.S[1] // ...........................................................................................................e.................................................................................... - mla v10.4S, v30.4S, v29.4S // ...........................................e.................................................................................................................................................... - str q25, [x0, #688] // ...........................................................................................................................................................................................*.... - add v25.4S, v9.4S, v22.4S // ........................................e....................................................................................................................................................... - mla v11.4S, v21.4S, v29.4S // ........................................................................................e....................................................................................................... - sub v18.4S, v9.4S, v22.4S // .......................................e........................................................................................................................................................ - mul v9.4S, v25.4S, v0.S[2] // ........................................................e....................................................................................................................................... 
- sqrdmulh v25.4S, v25.4S, v0.S[3] // .........................................................e...................................................................................................................................... - add v22.4S, v14.4S, v10.4S // .............................................e.................................................................................................................................................. - sub v10.4S, v14.4S, v10.4S // ............................................e................................................................................................................................................... - add v14.4S, v16.4S, v20.4S // ......................................................................e......................................................................................................................... - sub v21.4S, v27.4S, v12.4S // ..........................................................................e..................................................................................................................... - mul v16.4S, v22.4S, v0.S[2] // .............................................................e.................................................................................................................................. - sqrdmulh v30.4S, v22.4S, v0.S[3] // ..............................................................e................................................................................................................................. - sqrdmulh v22.4S, v21.4S, v2.S[1] // ................................................................................................................e............................................................................... 
- ldr q20, [x0, #0] // e............................................................................................................................................................................................... - mla v26.4S, v31.4S, v29.4S // ............................................................................................................e................................................................................... - mla v9.4S, v25.4S, v29.4S // ..........................................................e..................................................................................................................................... - mul v25.4S, v14.4S, v1.S[2] // ................................................................................................e............................................................................................... - sub v24.4S, v24.4S, v8.4S // .............................e.................................................................................................................................................................. - mla v16.4S, v30.4S, v29.4S // ...............................................................e................................................................................................................................ - add v8.4S, v23.4S, v17.4S // .........................e...................................................................................................................................................................... - add v30.4S, v27.4S, v12.4S // ...........................................................................e.................................................................................................................... 
- sqrdmulh v31.4S, v14.4S, v1.S[3] // .................................................................................................e.............................................................................................. - add v12.4S, v20.4S, v15.4S // ....................e........................................................................................................................................................................... - sqrdmulh v14.4S, v30.4S, v1.S[3] // ......................................................................................................e......................................................................................... - sub v27.4S, v8.4S, v16.4S // ................................................................e............................................................................................................................... - mul v28.4S, v30.4S, v1.S[2] // .....................................................................................................e.......................................................................................... - add v30.4S, v12.4S, v9.4S // ............................................................e................................................................................................................................... - mla v25.4S, v31.4S, v29.4S // ..................................................................................................e............................................................................................. - sub v31.4S, v12.4S, v9.4S // ...........................................................e.................................................................................................................................... 
- mul v21.4S, v21.4S, v2.S[0] // ...............................................................................................................e................................................................................ - add v9.4S, v19.4S, v13.4S // ...............................................................................................e................................................................................................ - sub v23.4S, v23.4S, v17.4S // ........................e....................................................................................................................................................................... - sub v17.4S, v31.4S, v26.4S // .............................................................................................................e.................................................................................. - mla v28.4S, v14.4S, v29.4S // .......................................................................................................e........................................................................................ - add v14.4S, v30.4S, v25.4S // ....................................................................................................e........................................................................................... - sub v20.4S, v20.4S, v15.4S // ...................e............................................................................................................................................................................ - mla v21.4S, v22.4S, v29.4S // .................................................................................................................e.............................................................................. 
+ // Instructions: 192 + // Expected cycles: 48 + // Expected IPC: 4.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + sqrdmulh v24.4S, v13.4S, v3.S[1] // ...............*................................................................................................................................................................................ + mul v13.4S, v13.4S, v3.S[0] // ..................*............................................................................................................................................................................. + sqrdmulh v12.4S, v31.4S, v2.S[1] // .............*.................................................................................................................................................................................. + add v18.4S, v18.4S, v21.4S // .*.............................................................................................................................................................................................. + mul v21.4S, v19.4S, v3.S[0] // ...*............................................................................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v3.S[1] // ..*............................................................................................................................................................................................. 
+ add v16.4S, v25.4S, v16.4S // ......*......................................................................................................................................................................................... + mla v13.4S, v24.4S, v29.4S // ...........................*.................................................................................................................................................................... + mla v14.4S, v23.4S, v29.4S // *............................................................................................................................................................................................... + sub v23.4S, v9.4S, v10.4S // ..............*................................................................................................................................................................................. + mla v21.4S, v19.4S, v29.4S // ........*....................................................................................................................................................................................... + add v24.4S, v16.4S, v30.4S // ........................................*....................................................................................................................................................... + sub v19.4S, v23.4S, v13.4S // .................................*.............................................................................................................................................................. + add v8.4S, v8.4S, v22.4S // ............................*................................................................................................................................................................... 
+ add v10.4S, v9.4S, v10.4S // .......*........................................................................................................................................................................................ + sub v22.4S, v27.4S, v26.4S // .....................................*.......................................................................................................................................................... + add v25.4S, v28.4S, v21.4S // ......................................*......................................................................................................................................................... + sub v28.4S, v28.4S, v21.4S // ...................*............................................................................................................................................................................ + mla v20.4S, v15.4S, v29.4S // ................................*............................................................................................................................................................... + mul v9.4S, v8.4S, v1.S[2] // ...........................................*.................................................................................................................................................... + sqrdmulh v15.4S, v8.4S, v1.S[3] // .......................................*........................................................................................................................................................ + mul v21.4S, v18.4S, v5.S[2] // .....*.......................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v18.4S, v5.S[3] // ....*........................................................................................................................................................................................... + mul v8.4S, v31.4S, v2.S[0] // ...........*.................................................................................................................................................................................... + sub v16.4S, v16.4S, v30.4S // ..........................*..................................................................................................................................................................... + mla v9.4S, v15.4S, v29.4S // ...................................................*............................................................................................................................................ + mul v31.4S, v25.4S, v6.S[2] // .....................................................................*.......................................................................................................................... + mla v21.4S, v18.4S, v29.4S // ..........*..................................................................................................................................................................................... + sqrdmulh v18.4S, v25.4S, v6.S[3] // ....................................................................*........................................................................................................................... + add v25.4S, v23.4S, v13.4S // ...........................................................................*.................................................................................................................... 
+ sub v23.4S, v24.4S, v20.4S // ..............................................................*................................................................................................................................. + add v15.4S, v24.4S, v20.4S // .............................................*.................................................................................................................................................. + sqrdmulh v30.4S, v11.4S, v2.S[1] // .........*...................................................................................................................................................................................... + mla v8.4S, v12.4S, v29.4S // ......................*......................................................................................................................................................................... + mla v31.4S, v18.4S, v29.4S // ...............................................................................*................................................................................................................ + mul v18.4S, v11.4S, v2.S[0] // .....................*.......................................................................................................................................................................... + mul v11.4S, v28.4S, v7.S[0] // .........................*...................................................................................................................................................................... + sqrdmulh v13.4S, v28.4S, v7.S[1] // ........................*....................................................................................................................................................................... 
+ mla v11.4S, v13.4S, v29.4S // ...............................*................................................................................................................................................................ + mul v13.4S, v23.4S, v4.S[0] // ..................................................................*............................................................................................................................. + ldr q24, [x0, #976] // .....................................................................................................*.......................................................................................... + sqrdmulh v28.4S, v23.4S, v4.S[1] // .................................................................*.............................................................................................................................. + add v20.4S, v25.4S, v31.4S // ........................................................................................*....................................................................................................... + add v23.4S, v16.4S, v8.4S // ............................................*................................................................................................................................................... + sub v12.4S, v25.4S, v31.4S // .......................................................................................*........................................................................................................ + sub v25.4S, v19.4S, v11.4S // ............................................................*................................................................................................................................... 
+ add v31.4S, v19.4S, v11.4S // ....................................*........................................................................................................................................................... + str q20, [x0, #768] // ..........................................................................................*..................................................................................................... + add v19.4S, v27.4S, v26.4S // ..............................................*................................................................................................................................................. + sqrdmulh v26.4S, v24.4S, v0.S[1] // ................................................................................................................*............................................................................... + str q12, [x0, #832] // ............................................................................................*................................................................................................... + mul v27.4S, v24.4S, v0.S[0] // ......................................................................................................................*......................................................................... + mul v11.4S, v17.4S, v6.S[0] // ......................................................*......................................................................................................................................... + str q31, [x0, #896] // .....................................................*.......................................................................................................................................... 
+ sqrdmulh v12.4S, v15.4S, v3.S[3] // .................................................*.............................................................................................................................................. + sub v16.4S, v16.4S, v8.4S // ..............................*................................................................................................................................................................. + str q25, [x0, #960] // ..............................................................................*................................................................................................................. + add v8.4S, v19.4S, v9.4S // ...........................................................*.................................................................................................................................... + sqrdmulh v25.4S, v23.4S, v4.S[3] // .........................................................*...................................................................................................................................... + ldr q31, [x0, #720] // ......................................................................................................*......................................................................................... + mla v13.4S, v28.4S, v29.4S // ........................................................................*....................................................................................................................... + ldr q28, [x0, #464] // ........................................................................................................*....................................................................................... 
+ mla v27.4S, v26.4S, v29.4S // ..............................................................................................................................*................................................................. + sub v24.4S, v19.4S, v9.4S // ............................................................................*................................................................................................................... + sub v9.4S, v10.4S, v14.4S // .....................................................................................*.......................................................................................................... + mul v20.4S, v15.4S, v3.S[2] // ................................................*............................................................................................................................................... + mla v18.4S, v30.4S, v29.4S // .............................*.................................................................................................................................................................. + add v26.4S, v24.4S, v13.4S // ................................................................................*............................................................................................................... + mul v30.4S, v31.4S, v0.S[0] // ........................................................................................................................*....................................................................... + sub v19.4S, v28.4S, v27.4S // ...............................................................................................................................................*................................................ 
+ add v27.4S, v28.4S, v27.4S // ......................................................................................................................................*......................................................... + str q26, [x0, #128] // .........................................................................................*...................................................................................................... + sqrdmulh v15.4S, v31.4S, v0.S[1] // .......................................................................................................................................*........................................................ + sqrdmulh v28.4S, v19.4S, v1.S[1] // .............................................................................................................................................................*.................................. + mul v19.4S, v19.4S, v1.S[0] // ..........................................................................................................................................................*..................................... + ldr q31, [x0, #208] // ..................................................................................................*............................................................................................. + sqrdmulh v26.4S, v27.4S, v0.S[3] // ..........................................................................................................................................*..................................................... + sub v13.4S, v24.4S, v13.4S // .................................................................................*.............................................................................................................. 
+ ldr q24, [x0, #912] // .......................................................................................................*........................................................................................ + mla v19.4S, v28.4S, v29.4S // ....................................................................................................................................................................*........................... + sub v28.4S, v22.4S, v18.4S // ..........................................*..................................................................................................................................................... + mul v23.4S, v23.4S, v4.S[2] // .......................................................*........................................................................................................................................ + mla v20.4S, v12.4S, v29.4S // ..........................................................*..................................................................................................................................... + str q13, [x0, #192] // ......................................................................................*......................................................................................................... + add v13.4S, v22.4S, v18.4S // .............................................................*.................................................................................................................................. + sqrdmulh v12.4S, v16.4S, v5.S[1] // ...................................*............................................................................................................................................................ 
+ sqrdmulh v22.4S, v17.4S, v6.S[1] // .........................................................................*...................................................................................................................... + sqrdmulh v17.4S, v24.4S, v0.S[1] // ..................................................................................................................................*............................................................. + mla v23.4S, v25.4S, v29.4S // ...............................................................*................................................................................................................................ + sub v25.4S, v8.4S, v20.4S // ......................................................................*......................................................................................................................... + add v18.4S, v8.4S, v20.4S // ................................................................*............................................................................................................................... + mla v11.4S, v22.4S, v29.4S // ....................................................................................*........................................................................................................... + mul v20.4S, v16.4S, v5.S[0] // ..................................*............................................................................................................................................................. + ldr q16, [x0, #848] // .................................................................................................*.............................................................................................. 
+ mla v30.4S, v15.4S, v29.4S // ............................................................................................................................................*................................................... + add v10.4S, v10.4S, v14.4S // ............*................................................................................................................................................................................... + str q18, [x0], #(16) // ...................................................................*............................................................................................................................ + add v18.4S, v13.4S, v23.4S // .......................................................................*........................................................................................................................ + mla v20.4S, v12.4S, v29.4S // .........................................*...................................................................................................................................................... + ldr q15, [x0, #768] // ...................................................................................................*............................................................................................ + sub v8.4S, v13.4S, v23.4S // .............................................................................*.................................................................................................................. + mul v23.4S, v24.4S, v0.S[0] // ....................................................................................................................*........................................................................... 
+ str q25, [x0, #48] // ..................................................................................*............................................................................................................. + ldr q25, [x0, #640] // .............................................................................................................*.................................................................................. + str q18, [x0, #240] // ..........................................................................*..................................................................................................................... + sub v14.4S, v31.4S, v30.4S // ...........................................................................................................................................................*.................................... + mul v22.4S, v27.4S, v0.S[2] // ...........................................................................................................................................*.................................................... + add v18.4S, v28.4S, v20.4S // ....................................................*........................................................................................................................................... + sqrdmulh v12.4S, v16.4S, v0.S[1] // ..........................................................................................................*..................................................................................... + mla v23.4S, v17.4S, v29.4S // .............................................................................................................................................*.................................................. 
+ ldr q17, [x0, #576] // .........................................................................................................*...................................................................................... + sub v28.4S, v28.4S, v20.4S // ...............................................*................................................................................................................................................ + sqrdmulh v24.4S, v15.4S, v0.S[1] // ..............................................................................................................*................................................................................. + mul v13.4S, v15.4S, v0.S[0] // ............................................................................................................*................................................................................... + sqrdmulh v15.4S, v25.4S, v0.S[1] // .......................................................................................................................*........................................................................ + add v27.4S, v14.4S, v19.4S // ..............................................................................................................................................................................*................. + str q8, [x0, #304] // ...................................................................................*............................................................................................................ + mul v8.4S, v16.4S, v0.S[0] // ...........................................................................................................*.................................................................................... 
+ mul v20.4S, v25.4S, v0.S[0] // ...................................................................................................................................................*............................................ + mla v13.4S, v24.4S, v29.4S // ...................................................................................................................................*............................................................ + str q18, [x0, #368] // ........................................................*....................................................................................................................................... + mla v22.4S, v26.4S, v29.4S // ................................................................................................................................................*............................................... + ldr q26, [x0, #256] // ...................................................................................................................*............................................................................ + add v16.4S, v10.4S, v21.4S // .................*.............................................................................................................................................................................. + sub v18.4S, v10.4S, v21.4S // ................*............................................................................................................................................................................... + mla v20.4S, v15.4S, v29.4S // .........................................................................................................................................................*...................................... 
+ mla v8.4S, v12.4S, v29.4S // ..................................................................................................................*............................................................................. + ldr q15, [x0, #384] // ....................................................................................................................................*........................................................... + add v24.4S, v31.4S, v30.4S // ..................................................................................................................................................*............................................. + ldr q12, [x0, #320] // .................................................................................................................*.............................................................................. + str q18, [x0, #560] // ....................*........................................................................................................................................................................... + mul v21.4S, v27.4S, v2.S[2] // ..................................................................................................................................................................................*............. + sqrdmulh v30.4S, v27.4S, v2.S[3] // .................................................................................................................................................................................*.............. + add v18.4S, v9.4S, v11.4S // ...........................................................................................*.................................................................................................... 
+ str q16, [x0, #496] // .......................*........................................................................................................................................................................ + sqrdmulh v31.4S, v17.4S, v0.S[1] // ........................................................................................................................................*....................................................... + mul v16.4S, v17.4S, v0.S[0] // .........................................................................................................................................*...................................................... + sub v10.4S, v26.4S, v13.4S // ............................................................................................................................................................*................................... + sub v17.4S, v15.4S, v23.4S // ......................................................................................................................................................*......................................... + str q28, [x0, #432] // ..................................................*............................................................................................................................................. + sub v27.4S, v12.4S, v8.4S // .....................................................................................................................*.......................................................................... + sub v11.4S, v9.4S, v11.4S // .............................................................................................*.................................................................................................. 
+ str q18, [x0, #624] // ..............................................................................................*................................................................................................. + sqrdmulh v18.4S, v10.4S, v1.S[1] // ..................................................................................................................................................................*............................. + mul v28.4S, v17.4S, v1.S[0] // ..............................................................................................................................................................*................................. + ldr q25, [x0, #64] // ....................................................................................................*........................................................................................... + sqrdmulh v9.4S, v27.4S, v1.S[1] // .............................................................................................................................*.................................................................. + mla v16.4S, v31.4S, v29.4S // .................................................................................................................................................*.............................................. + str q11, [x0, #688] // ...............................................................................................*................................................................................................ + ldr q11, [x0, #128] // .................................................................................................................................................................*.............................. 
+ sub v19.4S, v14.4S, v19.4S // ............................................................................................................................................................................*................... + sub v31.4S, v24.4S, v22.4S // .......................................................................................................................................................*........................................ + sqrdmulh v17.4S, v17.4S, v1.S[1] // ...............................................................................................................................................................*................................ + mul v10.4S, v10.4S, v1.S[0] // ......................................................................................................................................................................................*......... + sub v14.4S, v25.4S, v16.4S // ........................................................................................................................................................*....................................... + mla v21.4S, v30.4S, v29.4S // .......................................................................................................................................................................................*........ + add v26.4S, v26.4S, v13.4S // ..............................................................................................................................................*................................................. + sub v13.4S, v11.4S, v20.4S // .........................................................................................................................................................................*...................... 
+ ldr q30, [x0, #512] // ................................................................................................*............................................................................................... + mla v28.4S, v17.4S, v29.4S // .....................................................................................................................................................................*.......................... + mul v17.4S, v27.4S, v1.S[0] // .........................................................................................................................*...................................................................... + add v27.4S, v12.4S, v8.4S // ..........................................................................................................................*..................................................................... + sqrdmulh v8.4S, v26.4S, v0.S[3] // ................................................................................................................................................................*............................... + add v12.4S, v24.4S, v22.4S // ......................................................................................................................................................................*......................... + mul v26.4S, v26.4S, v0.S[2] // ....................................................................................................................................................*........................................... + mla v17.4S, v9.4S, v29.4S // .....................................................................................................................................*.......................................................... 
+ sqrdmulh v22.4S, v30.4S, v0.S[1] // ...............................................................................................................*................................................................................ + mul v9.4S, v30.4S, v0.S[0] // ............................................................................................................................*................................................................... + mul v30.4S, v27.4S, v0.S[2] // ................................................................................................................................*............................................................... + add v24.4S, v13.4S, v28.4S // .....................................................................................................................................................................................*.......... + mla v26.4S, v8.4S, v29.4S // .......................................................................................................................................................................*........................ + add v15.4S, v15.4S, v23.4S // .....................................................................................................................................................*.......................................... + add v8.4S, v11.4S, v20.4S // ..........................................................................................................................................................................*..................... + ldr q23, [x0, #0] // ...........................................................................................................................*.................................................................... 
+ mul v20.4S, v12.4S, v1.S[2] // .............................................................................................................................................................................*.................. + mla v9.4S, v22.4S, v29.4S // .................................................................................................................................*.............................................................. + mul v22.4S, v15.4S, v0.S[2] // ................................................................................................................................................................................*............... + sqrdmulh v15.4S, v15.4S, v0.S[3] // ....................................................................................................................................................................................*........... + sqrdmulh v11.4S, v27.4S, v0.S[3] // ...............................................................................................................................*................................................................ + mla v10.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................*.... + add v27.4S, v23.4S, v9.4S // ...........................................................................................................................................................................*.................... + sub v9.4S, v23.4S, v9.4S // ........................................................................................................................................................................................*....... 
+ add v18.4S, v14.4S, v17.4S // ...................................................................................................................................................................................*............ + sqrdmulh v23.4S, v24.4S, v2.S[3] // ..............................................................................................................................................................................................*. + mla v22.4S, v15.4S, v29.4S // ..........................................................................................................................................................................................*..... + mla v30.4S, v11.4S, v29.4S // ...................................................................................................................................................................*............................ + sqrdmulh v15.4S, v12.4S, v1.S[3] // ........................................................................................................................................................................*....................... + sub v13.4S, v13.4S, v28.4S // .........................................................................................................................................................................................*...... + sub v28.4S, v14.4S, v17.4S // ...............................................................................................................................................................................*................ + sub v17.4S, v18.4S, v21.4S // .............................................................................................................................................................................................*.. 
+ sub v11.4S, v8.4S, v22.4S // ...............................................................................................................................................................................................* + mul v14.4S, v24.4S, v2.S[2] // ............................................................................................................................................................................................*... - // original source code - // ldr q8, [x0, #0] // .............................................................................................................................................e.......................|........................................................................................................................................................... - // ldr q9, [x0, #(1*(512/8))] // .............................................................................................................e.......................................................|.......................................................................................................................................e................... - // ldr q10, [x0, #(2*(512/8))] // ..................................................................................................e..................................................................|............................................................................................................................e.............................. - // ldr q11, [x0, #(3*(512/8))] // ....................................................e................................................................................................................|..............................................................................e............................................................................ 
- // ldr q12, [x0, #(4*(512/8))] // ......................................................................................................................e..............................................|................................................................................................................................................e.......... - // ldr q13, [x0, #(5*(512/8))] // .............................................................................................................................e.......................................|.......................................................................................................................................................e... - // ldr q14, [x0, #(6*(512/8))] // .................e...................................................................................................................................................|...........................................e............................................................................................................... - // ldr q15, [x0, #(7*(512/8))] // ......................................................................................e..............................................................................|................................................................................................................e.......................................... - // ldr q16, [x0, #(8*(512/8))] // ..................................................................................e..................................................................................|............................................................................................................e.............................................. 
- // ldr q17, [x0, #(9*(512/8))] // e....................................................................................................................................................................|..........................e................................................................................................................................ - // ldr q18, [x0, #(10*(512/8))] // ...........................................................e.........................................................................................................|.....................................................................................e..................................................................... - // ldr q19, [x0, #(11*(512/8))] // ...e.................................................................................................................................................................|.............................e............................................................................................................................. - // ldr q20, [x0, #(12*(512/8))] // .........................................................................e...........................................................................................|...................................................................................................e....................................................... - // ldr q21, [x0, #(13*(512/8))] // ...................................................e.................................................................................................................|.............................................................................e............................................................................. 
- // ldr q22, [x0, #(14*(512/8))] // ..............e......................................................................................................................................................|........................................e.................................................................................................................. - // ldr q23, [x0, #(15*(512/8))] // ................................e....................................................................................................................................|..........................................................e................................................................................................ - // mul v24.4s, v16.4s, v0.s[0] // ......................................................................................................e..............................................................|................................................................................................................................e.......................... - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ............................................................................................e........................................................................|......................................................................................................................e.................................... - // mla v24.4s, v16.4s, v29.4s // ...........................................................................................................e.........................................................|.....................................................................................................................................e..................... 
- // sub v16.4s, v8.4s, v24.4s // ...................................................................................................................................................................e.|........................................................................................................................................................... - // add v8.4s, v8.4s, v24.4s // ......................................................................................................................................................e..............|........................................................................................................................................................... - // mul v24.4s, v17.4s, v0.s[0] // ........e............................................................................................................................................................|..................................e........................................................................................................................ - // sqrdmulh v17.4s, v17.4s, v0.s[1] // .................................e...................................................................................................................................|...........................................................e............................................................................................... - // mla v24.4s, v17.4s, v29.4s // ..........................................................e..........................................................................................................|....................................................................................e...................................................................... 
- // sub v17.4s, v9.4s, v24.4s // ...............................................................................................................................................................e.....|........................................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // ...................................................................................................................................................e.................|........................................................................................................................................................... - // mul v24.4s, v18.4s, v0.s[0] // ................................................................................................e....................................................................|..........................................................................................................................e................................ - // sqrdmulh v18.4s, v18.4s, v0.s[1] // ..............................................................................................e......................................................................|........................................................................................................................e.................................. - // mla v24.4s, v18.4s, v29.4s // .......................................................................................................e.............................................................|.................................................................................................................................e......................... 
- // sub v18.4s, v10.4s, v24.4s // .................................................................................................................................................e...................|........................................................................................................................................................... - // add v10.4s, v10.4s, v24.4s // ...............................................................................................................e.....................................................|.........................................................................................................................................e................. - // mul v24.4s, v19.4s, v0.s[0] // ............................e........................................................................................................................................|......................................................e.................................................................................................... - // sqrdmulh v19.4s, v19.4s, v0.s[1] // ..............................e......................................................................................................................................|........................................................e.................................................................................................. - // mla v24.4s, v19.4s, v29.4s // .......................................................e.............................................................................................................|.................................................................................e......................................................................... 
- // sub v19.4s, v11.4s, v24.4s // ..............................................................e......................................................................................................|........................................................................................e.................................................................. - // add v11.4s, v11.4s, v24.4s // .........................................................................................e...........................................................................|...................................................................................................................e....................................... - // mul v24.4s, v20.4s, v0.s[0] // ..........................................................................................e..........................................................................|....................................................................................................................e...................................... - // sqrdmulh v20.4s, v20.4s, v0.s[1] // ........................................................................................e............................................................................|..................................................................................................................e........................................ - // mla v24.4s, v20.4s, v29.4s // ........................................................................................................................e............................................|..................................................................................................................................................e........ 
- // sub v20.4s, v12.4s, v24.4s // ...................................................................................................................................e.................................|........................................................................................................................................................... - // add v12.4s, v12.4s, v24.4s // .................................................................................................................................e...................................|........................................................................................................................................................... - // mul v24.4s, v21.4s, v0.s[0] // ...................................................................................e.................................................................................|.............................................................................................................e............................................. - // sqrdmulh v21.4s, v21.4s, v0.s[1] // .............................................................e.......................................................................................................|.......................................................................................e................................................................... - // mla v24.4s, v21.4s, v29.4s // ...............................................................................................................................e.....................................|.........................................................................................................................................................e. 
- // sub v21.4s, v13.4s, v24.4s // .......................................................................................................................................e.............................|........................................................................................................................................................... - // add v13.4s, v13.4s, v24.4s // ......................................................................................................................................e..............................|........................................................................................................................................................... - // mul v24.4s, v22.4s, v0.s[0] // .........................e...........................................................................................................................................|...................................................e....................................................................................................... - // sqrdmulh v22.4s, v22.4s, v0.s[1] // ..........................e..........................................................................................................................................|....................................................e...................................................................................................... - // mla v24.4s, v22.4s, v29.4s // ....................................e................................................................................................................................|..............................................................e............................................................................................ 
- // sub v22.4s, v14.4s, v24.4s // .......................................................................................e.............................................................................|.................................................................................................................e......................................... - // add v14.4s, v14.4s, v24.4s // ......................................................................e..............................................................................................|................................................................................................e.......................................................... - // mul v24.4s, v23.4s, v0.s[0] // .............................................e.......................................................................................................................|.......................................................................e................................................................................... - // sqrdmulh v23.4s, v23.4s, v0.s[1] // .................................................e...................................................................................................................|...........................................................................e............................................................................... - // mla v24.4s, v23.4s, v29.4s // .................................................................................e...................................................................................|...........................................................................................................e............................................... 
- // sub v23.4s, v15.4s, v24.4s // ...............................................................................................e.....................................................................|.........................................................................................................................e................................. - // add v15.4s, v15.4s, v24.4s // .............................................................................................e.......................................................................|.......................................................................................................................e................................... - // mul v24.4s, v12.4s, v0.s[2] // ....................................................................................................................................e................................|........................................................................................................................................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // .....................................................................................................................................e...............................|........................................................................................................................................................... - // mla v24.4s, v12.4s, v29.4s // ...............................................................................................................................................e.....................|........................................................................................................................................................... 
- // sub v12.4s, v8.4s, v24.4s // ............................................................................................................................................................e........|........................................................................................................................................................... - // add v8.4s, v8.4s, v24.4s // ..........................................................................................................................................................e..........|........................................................................................................................................................... - // mul v24.4s, v13.4s, v0.s[2] // ..........................................................................................................................................e..........................|........................................................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v0.s[3] // ...........................................................................................................................................e.........................|........................................................................................................................................................... - // mla v24.4s, v13.4s, v29.4s // ..................................................................................................................................................e..................|........................................................................................................................................................... 
- // sub v13.4s, v9.4s, v24.4s // ........................................................................................................................................................e............|........................................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // .....................................................................................................................................................................|....*...................................................................................................................................................... - // mul v24.4s, v14.4s, v0.s[2] // ................................................................................e....................................................................................|..........................................................................................................e................................................ - // sqrdmulh v14.4s, v14.4s, v0.s[3] // ..............................................................................e......................................................................................|........................................................................................................e.................................................. - // mla v24.4s, v14.4s, v29.4s // ........................................................................................................e............................................................|..................................................................................................................................e........................ 
- // sub v14.4s, v10.4s, v24.4s // ....................................................................................................................e................................................|..............................................................................................................................................e............ - // add v10.4s, v10.4s, v24.4s // ........................................................................................................................................e............................|........................................................................................................................................................... - // mul v24.4s, v15.4s, v0.s[2] // ....................................................................................................e................................................................|..............................................................................................................................e............................ - // sqrdmulh v15.4s, v15.4s, v0.s[3] // .....................................................................................................e...............................................................|...............................................................................................................................e........................... - // mla v24.4s, v15.4s, v29.4s // ................................................................................................................e....................................................|..........................................................................................................................................e................ 
- // sub v15.4s, v11.4s, v24.4s // .........................................................................................................................................e...........................|........................................................................................................................................................... - // add v11.4s, v11.4s, v24.4s // ....................................................................................................................................................e................|........................................................................................................................................................... - // mul v24.4s, v20.4s, v1.s[0] // .....................................................................................................................................................................|......................*.................................................................................................................................... - // sqrdmulh v20.4s, v20.4s, v1.s[1] // ..*..................................................................................................................................................................|............................*.............................................................................................................................. - // mla v24.4s, v20.4s, v29.4s // .........*...........................................................................................................................................................|...................................*....................................................................................................................... 
- // sub v20.4s, v16.4s, v24.4s // ........................................*............................................................................................................................|..................................................................*........................................................................................ - // add v16.4s, v16.4s, v24.4s // ......................................*..............................................................................................................................|................................................................*.......................................................................................... - // mul v24.4s, v21.4s, v1.s[0] // .....................................................................................................................................................................*........................................................................................................................................................... - // sqrdmulh v21.4s, v21.4s, v1.s[1] // .....................................................................................................................................................................|..*........................................................................................................................................................ - // mla v24.4s, v21.4s, v29.4s // .....................................................................................................................................................................|.......*................................................................................................................................................... 
- // sub v21.4s, v17.4s, v24.4s // .....................................................................................................................................................................|..............*............................................................................................................................................ - // add v17.4s, v17.4s, v24.4s // .....................................*...............................................................................................................................|...............................................................*........................................................................................... - // mul v24.4s, v22.4s, v1.s[0] // ..........................................................................................................................e..........................................|....................................................................................................................................................e...... - // sqrdmulh v22.4s, v22.4s, v1.s[1] // ..................................................................................................................e..................................................|............................................................................................................................................e.............. - // mla v24.4s, v22.4s, v29.4s // ..................................................................................................................................e..................................|........................................................................................................................................................... 
- // sub v22.4s, v18.4s, v24.4s // .............*.......................................................................................................................................................|.......................................*................................................................................................................... - // add v18.4s, v18.4s, v24.4s // ...............................*.....................................................................................................................................|.........................................................*................................................................................................. - // mul v24.4s, v23.4s, v1.s[0] // ...................................................................................................................e.................................................|.............................................................................................................................................e............. - // sqrdmulh v23.4s, v23.4s, v1.s[1] // ..........................................................................................................e..........................................................|....................................................................................................................................e...................... - // mla v24.4s, v23.4s, v29.4s // .........................................................................................................................e...........................................|...................................................................................................................................................e....... 
- // sub v23.4s, v19.4s, v24.4s // .....................................................................................................................................................................|.*......................................................................................................................................................... - // add v19.4s, v19.4s, v24.4s // ..............................................................................................................................................................e......|........................................................................................................................................................... - // mul v24.4s, v10.4s, v1.s[2] // ................................................................................................................................................e....................|........................................................................................................................................................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .....................................................................................................................................................e...............|........................................................................................................................................................... - // mla v24.4s, v10.4s, v29.4s // ...........................................................................................................................................................e.........|........................................................................................................................................................... 
- // sub v10.4s, v8.4s, v24.4s // .....................................................................................................................................................................|................*.......................................................................................................................................... - // add v8.4s, v8.4s, v24.4s // ..................................................................................................................................................................e..|........................................................................................................................................................... - // mul v24.4s, v11.4s, v1.s[2] // .........................................................................................................................................................e...........|........................................................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v1.s[3] // .......................................................................................................................................................e.............|........................................................................................................................................................... - // mla v24.4s, v11.4s, v29.4s // .................................................................................................................................................................e...|........................................................................................................................................................... 
- // sub v11.4s, v9.4s, v24.4s // .....................................................................................................................................................................|.......................*................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // .....................................................................................................................................................................|........*.................................................................................................................................................. - // mul v24.4s, v14.4s, v2.s[0] // ............................................................................................................................e........................................|......................................................................................................................................................e.... - // sqrdmulh v14.4s, v14.4s, v2.s[1] // ..............................................................................................................................e......................................|........................................................................................................................................................e.. - // mla v24.4s, v14.4s, v29.4s // ..............................................................................................................................................e......................|........................................................................................................................................................... 
- // sub v14.4s, v12.4s, v24.4s // ................................................................................................................................................................e....|........................................................................................................................................................... - // add v12.4s, v12.4s, v24.4s // .....................................................................................................................................................................|........................*.................................................................................................................................. - // mul v24.4s, v15.4s, v2.s[0] // .............................................................................................................................................................e.......|........................................................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v2.s[1] // ............................................................................................................................................e........................|........................................................................................................................................................... - // mla v24.4s, v15.4s, v29.4s // ....................................................................................................................................................................e|........................................................................................................................................................... 
- // sub v15.4s, v13.4s, v24.4s // .....................................................................................................................................................................|..........*................................................................................................................................................ - // add v13.4s, v13.4s, v24.4s // .....................................................................................................................................................................|*.......................................................................................................................................................... - // mul v24.4s, v18.4s, v2.s[2] // .......................................*.............................................................................................................................|.................................................................*......................................................................................... - // sqrdmulh v18.4s, v18.4s, v2.s[3] // ..........................................*..........................................................................................................................|....................................................................*...................................................................................... - // mla v24.4s, v18.4s, v29.4s // ................................................*....................................................................................................................|..........................................................................*................................................................................ 
- // sub v18.4s, v16.4s, v24.4s // ...................................................................*.................................................................................................|.............................................................................................*............................................................. - // add v16.4s, v16.4s, v24.4s // ....................................................................*................................................................................................|..............................................................................................*............................................................ - // mul v24.4s, v19.4s, v2.s[2] // ..................................*..................................................................................................................................|............................................................*.............................................................................................. - // sqrdmulh v19.4s, v19.4s, v2.s[3] // ...................................*.................................................................................................................................|.............................................................*............................................................................................. - // mla v24.4s, v19.4s, v29.4s // .........................................*...........................................................................................................................|...................................................................*....................................................................................... 
- // sub v19.4s, v17.4s, v24.4s // ..............................................*......................................................................................................................|........................................................................*.................................................................................. - // add v17.4s, v17.4s, v24.4s // ...............................................*.....................................................................................................................|.........................................................................*................................................................................. - // mul v24.4s, v22.4s, v3.s[0] // ...................*.................................................................................................................................................|.............................................*............................................................................................................. - // sqrdmulh v22.4s, v22.4s, v3.s[1] // .....................*...............................................................................................................................................|...............................................*........................................................................................................... - // mla v24.4s, v22.4s, v29.4s // .............................*.......................................................................................................................................|.......................................................*................................................................................................... 
- // sub v22.4s, v20.4s, v24.4s // ............................................*........................................................................................................................|......................................................................*.................................................................................... - // add v20.4s, v20.4s, v24.4s // ..................................................*..................................................................................................................|............................................................................*.............................................................................. - // mul v24.4s, v23.4s, v3.s[0] // .....................................................................................................................................................................|.....*..................................................................................................................................................... - // sqrdmulh v23.4s, v23.4s, v3.s[1] // .....................................................................................................................................................................|......*.................................................................................................................................................... - // mla v24.4s, v23.4s, v29.4s // .....................................................................................................................................................................|............*.............................................................................................................................................. 
- // sub v23.4s, v21.4s, v24.4s // .....................................................................................................................................................................|.................*......................................................................................................................................... - // add v21.4s, v21.4s, v24.4s // .....................................................................................................................................................................|.....................*..................................................................................................................................... - // mul v24.4s, v9.4s, v3.s[2] // .....................................................................................................................................................................|.............*............................................................................................................................................. - // sqrdmulh v9.4s, v9.4s, v3.s[3] // .....................................................................................................................................................................|...........*............................................................................................................................................... - // mla v24.4s, v9.4s, v29.4s // .....................................................................................................................................................................|....................*...................................................................................................................................... 
- // sub v9.4s, v8.4s, v24.4s // ................*....................................................................................................................................................|..........................................*................................................................................................................ - // add v8.4s, v8.4s, v24.4s // ...........*.........................................................................................................................................................|.....................................*..................................................................................................................... - // mul v24.4s, v11.4s, v4.s[0] // .....*...............................................................................................................................................................|...............................*........................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v4.s[1] // .*...................................................................................................................................................................|...........................*............................................................................................................................... - // mla v24.4s, v11.4s, v29.4s // ............................................................*........................................................................................................|......................................................................................*.................................................................... 
- // sub v11.4s, v10.4s, v24.4s // ..................................................................*..................................................................................................|............................................................................................*.............................................................. - // add v10.4s, v10.4s, v24.4s // .................................................................*...................................................................................................|...........................................................................................*............................................................... - // mul v24.4s, v13.4s, v4.s[2] // .....................................................................................................................................................................|...*....................................................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v4.s[3] // .....................................................................................................................................................................|.........*................................................................................................................................................. - // mla v24.4s, v13.4s, v29.4s // .....................................................................................................................................................................|...............*........................................................................................................................................... 
- // sub v13.4s, v12.4s, v24.4s // ....*................................................................................................................................................................|..............................*............................................................................................................................ - // add v12.4s, v12.4s, v24.4s // ...............*.....................................................................................................................................................|.........................................*................................................................................................................. - // mul v24.4s, v15.4s, v5.s[0] // .....................................................................................................................................................................|..................*........................................................................................................................................ - // sqrdmulh v15.4s, v15.4s, v5.s[1] // .....................................................................................................................................................................|...................*....................................................................................................................................... - // mla v24.4s, v15.4s, v29.4s // .....................................................................................................................................................................|.........................*................................................................................................................................. 
- // sub v15.4s, v14.4s, v24.4s // .......*.............................................................................................................................................................|.................................*......................................................................................................................... - // add v14.4s, v14.4s, v24.4s // ......*..............................................................................................................................................................|................................*.......................................................................................................................... - // mul v24.4s, v17.4s, v5.s[2] // ....................................................................................*................................................................................|..............................................................................................................*............................................ - // sqrdmulh v17.4s, v17.4s, v5.s[3] // .............................................................................*.......................................................................................|.......................................................................................................*................................................... - // mla v24.4s, v17.4s, v29.4s // ...................................................................................................*.................................................................|.............................................................................................................................*............................. 
- // sub v17.4s, v16.4s, v24.4s // .........................................................................................................*...........................................................|...................................................................................................................................*....................... - // add v16.4s, v16.4s, v24.4s // ............................................................................................................*........................................................|......................................................................................................................................*.................... - // mul v24.4s, v19.4s, v6.s[0] // ........................................................*............................................................................................................|..................................................................................*........................................................................ - // sqrdmulh v19.4s, v19.4s, v6.s[1] // .........................................................*...........................................................................................................|...................................................................................*....................................................................... - // mla v24.4s, v19.4s, v29.4s // ...............................................................*.....................................................................................................|.........................................................................................*................................................................. 
- // sub v19.4s, v18.4s, v24.4s // .......................................................................................................................*.............................................|.................................................................................................................................................*......... - // add v18.4s, v18.4s, v24.4s // .....................................................................................................................*...............................................|...............................................................................................................................................*........... - // mul v24.4s, v21.4s, v6.s[2] // ......................................................*..............................................................................................................|................................................................................*.......................................................................... - // sqrdmulh v21.4s, v21.4s, v6.s[3] // .....................................................*...............................................................................................................|...............................................................................*........................................................................... - // mla v24.4s, v21.4s, v29.4s // ................................................................*....................................................................................................|..........................................................................................*................................................................ 
- // sub v21.4s, v20.4s, v24.4s // ...........................................................................*.........................................................................................|.....................................................................................................*..................................................... - // add v20.4s, v20.4s, v24.4s // ...........................................................................................*.........................................................................|.....................................................................................................................*..................................... - // mul v24.4s, v23.4s, v7.s[0] // ......................*..............................................................................................................................................|................................................*.......................................................................................................... - // sqrdmulh v23.4s, v23.4s, v7.s[1] // ........................*............................................................................................................................................|..................................................*........................................................................................................ - // mla v24.4s, v23.4s, v29.4s // ...........................................*.........................................................................................................................|.....................................................................*..................................................................................... 
- // sub v23.4s, v22.4s, v24.4s // ..........................................................................*..........................................................................................|....................................................................................................*...................................................... - // add v22.4s, v22.4s, v24.4s // .......................................................................*.............................................................................................|.................................................................................................*......................................................... - // str q8, [x0], #(16) // ....................*................................................................................................................................................|..............................................*............................................................................................................ - // str q9, [x0, #(-16 + 1*(512/8))] // ...........................*.........................................................................................................................................|.....................................................*..................................................................................................... - // str q10, [x0, #(-16 + 2*(512/8))] // ........................................................................*............................................................................................|..................................................................................................*........................................................ 
- // str q11, [x0, #(-16 + 3*(512/8))] // .....................................................................*...............................................................................................|...............................................................................................*........................................................... - // str q12, [x0, #(-16 + 4*(512/8))] // .......................*.............................................................................................................................................|.................................................*......................................................................................................... - // str q13, [x0, #(-16 + 5*(512/8))] // ..........*..........................................................................................................................................................|....................................*...................................................................................................................... - // str q14, [x0, #(-16 + 6*(512/8))] // ............*........................................................................................................................................................|......................................*.................................................................................................................... - // str q15, [x0, #(-16 + 7*(512/8))] // ..................*..................................................................................................................................................|............................................*.............................................................................................................. 
- // str q16, [x0, #(-16 + 8*(512/8))] // .................................................................................................................*...................................................|...........................................................................................................................................*............... - // str q17, [x0, #(-16 + 9*(512/8))] // ..............................................................................................................*......................................................|........................................................................................................................................*.................. - // str q18, [x0, #(-16 + 10*(512/8))] // ...........................................................................................................................*.........................................|.....................................................................................................................................................*..... - // str q19, [x0, #(-16 + 11*(512/8))] // ................................................................................................................................*....................................|..........................................................................................................................................................* - // str q20, [x0, #(-16 + 12*(512/8))] // .................................................................................................*...................................................................|...........................................................................................................................*............................... 
- // str q21, [x0, #(-16 + 13*(512/8))] // ...............................................................................*.....................................................................................|.........................................................................................................*................................................. - // str q22, [x0, #(-16 + 14*(512/8))] // ............................................................................*........................................................................................|......................................................................................................*.................................................... - // str q23, [x0, #(-16 + 15*(512/8))] // .....................................................................................*...............................................................................|...............................................................................................................*........................................... + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mla v14.4S, v23.4S, v29.4S // ........*....................................................................................................................................................................................... + // add v23.4S, v18.4S, v21.4S // ...*............................................................................................................................................................................................ 
+ // sqrdmulh v18.4S, v19.4S, v3.S[1] // .....*.......................................................................................................................................................................................... + // mul v21.4S, v19.4S, v3.S[0] // ....*........................................................................................................................................................................................... + // sqrdmulh v12.4S, v23.4S, v5.S[3] // ......................*......................................................................................................................................................................... + // mul v24.4S, v23.4S, v5.S[2] // .....................*.......................................................................................................................................................................... + // add v19.4S, v25.4S, v16.4S // ......*......................................................................................................................................................................................... + // add v16.4S, v9.4S, v10.4S // ..............*................................................................................................................................................................................. + // mla v21.4S, v18.4S, v29.4S // ..........*..................................................................................................................................................................................... + // sqrdmulh v23.4S, v11.4S, v2.S[1] // ................................*............................................................................................................................................................... 
+ // mla v24.4S, v12.4S, v29.4S // ...........................*.................................................................................................................................................................... + // mul v12.4S, v31.4S, v2.S[0] // .......................*........................................................................................................................................................................ + // add v25.4S, v16.4S, v14.4S // ...............................................................................................*................................................................................................ + // sqrdmulh v31.4S, v31.4S, v2.S[1] // ..*............................................................................................................................................................................................. + // sub v10.4S, v9.4S, v10.4S // .........*...................................................................................................................................................................................... + // sqrdmulh v18.4S, v13.4S, v3.S[1] // *............................................................................................................................................................................................... + // sub v9.4S, v25.4S, v24.4S // ............................................................................................................................*................................................................... + // add v25.4S, v25.4S, v24.4S // ...........................................................................................................................*.................................................................... 
+ // mul v24.4S, v13.4S, v3.S[0] // .*.............................................................................................................................................................................................. + // sub v13.4S, v28.4S, v21.4S // .................*.............................................................................................................................................................................. + // str q9, [x0, #576] // ..................................................................................................................................*............................................................. + // mul v9.4S, v11.4S, v2.S[0] // ...................................*............................................................................................................................................................ + // mla v12.4S, v31.4S, v29.4S // .................................*.............................................................................................................................................................. + // str q25, [x0, #512] // ......................................................................................................................................*......................................................... + // sqrdmulh v25.4S, v13.4S, v7.S[1] // .....................................*.......................................................................................................................................................... + // mul v11.4S, v13.4S, v7.S[0] // ....................................*........................................................................................................................................................... 
+ // sub v31.4S, v19.4S, v30.4S // ........................*....................................................................................................................................................................... + // mla v24.4S, v18.4S, v29.4S // .......*........................................................................................................................................................................................ + // add v8.4S, v8.4S, v22.4S // .............*.................................................................................................................................................................................. + // mla v9.4S, v23.4S, v29.4S // ..................................................................*............................................................................................................................. + // sub v22.4S, v31.4S, v12.4S // .......................................................*........................................................................................................................................ + // mla v11.4S, v25.4S, v29.4S // ......................................*......................................................................................................................................................... + // mla v20.4S, v15.4S, v29.4S // ..................*............................................................................................................................................................................. + // sub v18.4S, v10.4S, v24.4S // ............*................................................................................................................................................................................... 
+ // mul v23.4S, v22.4S, v5.S[0] // ............................................................................................*................................................................................................... + // sqrdmulh v13.4S, v22.4S, v5.S[1] // .....................................................................................*.......................................................................................................... + // add v25.4S, v18.4S, v11.4S // ..............................................*................................................................................................................................................. + // sub v22.4S, v27.4S, v26.4S // ...............*................................................................................................................................................................................ + // add v21.4S, v28.4S, v21.4S // ................*............................................................................................................................................................................... + // sqrdmulh v15.4S, v8.4S, v1.S[3] // ....................*........................................................................................................................................................................... + // add v28.4S, v19.4S, v30.4S // ...........*.................................................................................................................................................................................... + // mla v23.4S, v13.4S, v29.4S // ..................................................................................................*............................................................................................. 
+ // sub v30.4S, v22.4S, v9.4S // ................................................................................*............................................................................................................... + // mul v19.4S, v8.4S, v1.S[2] // ...................*............................................................................................................................................................................ + // add v8.4S, v31.4S, v12.4S // ...........................................*.................................................................................................................................................... + // add v12.4S, v28.4S, v20.4S // ...............................*................................................................................................................................................................ + // add v31.4S, v27.4S, v26.4S // ................................................*............................................................................................................................................... + // sub v26.4S, v30.4S, v23.4S // ...............................................................................................................*................................................................................ + // mul v27.4S, v12.4S, v3.S[2] // .................................................................*.............................................................................................................................. + // sqrdmulh v12.4S, v12.4S, v3.S[3] // ......................................................*......................................................................................................................................... 
+ // str q26, [x0, #448] // ...........................................................................................................................................*.................................................... + // mla v19.4S, v15.4S, v29.4S // .........................*...................................................................................................................................................................... + // add v13.4S, v30.4S, v23.4S // ...........................................................................................................*.................................................................................... + // str q25, [x0, #896] // .....................................................*.......................................................................................................................................... + // mul v25.4S, v17.4S, v6.S[0] // ....................................................*........................................................................................................................................... + // mul v26.4S, v8.4S, v4.S[2] // .................................................................................*.............................................................................................................. + // str q13, [x0, #384] // ........................................................................................................................*....................................................................... + // sqrdmulh v13.4S, v8.4S, v4.S[3] // ..........................................................*..................................................................................................................................... 
+ // mla v27.4S, v12.4S, v29.4S // ..................................................................................*............................................................................................................. + // add v8.4S, v31.4S, v19.4S // .........................................................*...................................................................................................................................... + // sub v23.4S, v18.4S, v11.4S // .............................................*.................................................................................................................................................. + // add v11.4S, v22.4S, v9.4S // ....................................................................................*........................................................................................................... + // sub v12.4S, v28.4S, v20.4S // ..............................*................................................................................................................................................................. + // mla v26.4S, v13.4S, v29.4S // ........................................................................................*....................................................................................................... + // add v15.4S, v8.4S, v27.4S // ..........................................................................................*..................................................................................................... + // sqrdmulh v20.4S, v12.4S, v4.S[1] // .........................................*...................................................................................................................................................... 
+ // mul v12.4S, v12.4S, v4.S[0] // .......................................*........................................................................................................................................................ + // str q15, [x0], #(16) // ................................................................................................*............................................................................................... + // sqrdmulh v9.4S, v21.4S, v6.S[3] // ............................*................................................................................................................................................................... + // mul v13.4S, v21.4S, v6.S[2] // ..........................*..................................................................................................................................................................... + // sub v15.4S, v8.4S, v27.4S // .........................................................................................*...................................................................................................... + // add v28.4S, v11.4S, v26.4S // .................................................................................................*.............................................................................................. + // mla v12.4S, v20.4S, v29.4S // ............................................................*................................................................................................................................... + // sqrdmulh v27.4S, v17.4S, v6.S[1] // ......................................................................................*......................................................................................................... 
+ // str q28, [x0, #240] // ........................................................................................................*....................................................................................... + // add v21.4S, v10.4S, v24.4S // .............................*.................................................................................................................................................................. + // sub v19.4S, v31.4S, v19.4S // ...............................................................*................................................................................................................................ + // sub v22.4S, v11.4S, v26.4S // ....................................................................................................*........................................................................................... + // str q23, [x0, #944] // ........................................................*....................................................................................................................................... + // mla v13.4S, v9.4S, v29.4S // ..................................*............................................................................................................................................................. + // add v30.4S, v19.4S, v12.4S // ...................................................................*............................................................................................................................ + // sub v12.4S, v19.4S, v12.4S // .............................................................................*.................................................................................................................. 
+ // str q15, [x0, #48] // ......................................................................................................*......................................................................................... + // str q22, [x0, #304] // ....................................................................................................................*........................................................................... + // mla v25.4S, v27.4S, v29.4S // ...........................................................................................*.................................................................................................... + // sub v14.4S, v16.4S, v14.4S // ................................................................*............................................................................................................................... + // str q12, [x0, #176] // ...................................................................................*............................................................................................................ + // sub v27.4S, v21.4S, v13.4S // ............................................*................................................................................................................................................... + // add v9.4S, v21.4S, v13.4S // ..........................................*..................................................................................................................................................... + // str q30, [x0, #112] // .......................................................................*........................................................................................................................ 
+ // str q9, [x0, #752] // ...............................................*................................................................................................................................................ + // add v30.4S, v14.4S, v25.4S // .....................................................................................................................................*.......................................................... + // str q27, [x0, #816] // ..................................................*............................................................................................................................................. + // sub v20.4S, v14.4S, v25.4S // .............................................................................................................................................*.................................................. + // str q30, [x0, #624] // ..............................................................................................................................................*................................................. + // str q20, [x0, #688] // ....................................................................................................................................................*........................................... + // ldr q24, [x0, #512] // ..............................................................................................................................................................*................................. + // ldr q15, [x0, #832] // .............................................................................................*.................................................................................................. 
+ // ldr q10, [x0, #192] // ...........................................................................*.................................................................................................................... + // ldr q19, [x0, #768] // ...................................................................................................*............................................................................................ + // ldr q25, [x0, #64] // .................................................................................................................................................*.............................................. + // ldr q21, [x0, #960] // ........................................*....................................................................................................................................................... + // ldr q27, [x0, #704] // ...........................................................*.................................................................................................................................... + // ldr q22, [x0, #896] // ..............................................................................*................................................................................................................. + // ldr q14, [x0, #448] // .............................................................*.................................................................................................................................. + // ldr q23, [x0, #576] // ..............................................................................................................*................................................................................. 
+ // sqrdmulh v20.4S, v15.4S, v0.S[1] // ............................................................................................................*................................................................................... + // mul v30.4S, v15.4S, v0.S[0] // .....................................................................................................................*.......................................................................... + // mul v17.4S, v19.4S, v0.S[0] // .................................................................................................................*.............................................................................. + // ldr q11, [x0, #640] // .......................................................................................................*........................................................................................ + // sqrdmulh v31.4S, v19.4S, v0.S[1] // ................................................................................................................*............................................................................... + // sqrdmulh v16.4S, v24.4S, v0.S[1] // ......................................................................................................................................................................*......................... + // sqrdmulh v19.4S, v21.4S, v0.S[1] // .................................................*.............................................................................................................................................. + // ldr q9, [x0, #320] // .................................................................................................................................*.............................................................. 
+ // mla v30.4S, v20.4S, v29.4S // ..............................................................................................................................*................................................................. + // ldr q13, [x0, #256] // ..........................................................................................................................*..................................................................... + // mul v8.4S, v22.4S, v0.S[0] // .....................................................................................................*.......................................................................................... + // sub v15.4S, v9.4S, v30.4S // ............................................................................................................................................*................................................... + // mul v26.4S, v21.4S, v0.S[0] // ...................................................*............................................................................................................................................ + // sqrdmulh v21.4S, v11.4S, v0.S[1] // ..................................................................................................................*............................................................................. + // mul v28.4S, v27.4S, v0.S[0] // ....................................................................*........................................................................................................................... + // mul v18.4S, v15.4S, v1.S[0] // ................................................................................................................................................................*............................... 
+ // add v20.4S, v9.4S, v30.4S // .................................................................................................................................................................*.............................. + // ldr q12, [x0, #0] // .............................................................................................................................................................................*.................. + // mul v9.4S, v24.4S, v0.S[0] // .......................................................................................................................................................................*........................ + // sqrdmulh v24.4S, v15.4S, v1.S[1] // ..................................................................................................................................................*............................................. + // mla v26.4S, v19.4S, v29.4S // ..............................................................*................................................................................................................................. + // sqrdmulh v15.4S, v20.4S, v0.S[3] // ..................................................................................................................................................................................*............. + // mul v30.4S, v20.4S, v0.S[2] // ........................................................................................................................................................................*....................... + // mla v9.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*................ 
+ // sqrdmulh v19.4S, v22.4S, v0.S[1] // .......................................................................................*........................................................................................................ + // mla v17.4S, v31.4S, v29.4S // .......................................................................................................................*........................................................................ + // ldr q31, [x0, #384] // ...............................................................................................................................*................................................................ + // mla v18.4S, v24.4S, v29.4S // .....................................................................................................................................................................*.......................... + // add v20.4S, v14.4S, v26.4S // ......................................................................*......................................................................................................................... + // sqrdmulh v22.4S, v27.4S, v0.S[1] // ........................................................................*....................................................................................................................... + // sqrdmulh v27.4S, v23.4S, v0.S[1] // .......................................................................................................................................*........................................................ + // mul v16.4S, v23.4S, v0.S[0] // ........................................................................................................................................*....................................................... 
+ // sqrdmulh v24.4S, v20.4S, v0.S[3] // ............................................................................*................................................................................................................... + // mul v20.4S, v20.4S, v0.S[2] // ..........................................................................................................*..................................................................................... + // mla v28.4S, v22.4S, v29.4S // ..............................................................................................*................................................................................................. + // mla v8.4S, v19.4S, v29.4S // .............................................................................................................*.................................................................................. + // add v22.4S, v13.4S, v17.4S // ............................................................................................................................................................*................................... + // sub v14.4S, v14.4S, v26.4S // .....................................................................*.......................................................................................................................... + // mla v20.4S, v24.4S, v29.4S // .........................................................................................................................*...................................................................... + // mla v16.4S, v27.4S, v29.4S // ...................................................................................................................................................*............................................ 
+ // add v19.4S, v10.4S, v28.4S // ................................................................................................................................*............................................................... + // mul v27.4S, v11.4S, v0.S[0] // ......................................................................................................................*......................................................................... + // mul v26.4S, v22.4S, v0.S[2] // ....................................................................................................................................................................*........................... + // add v23.4S, v31.4S, v8.4S // ...........................................................................................................................................................................*.................... + // sub v24.4S, v31.4S, v8.4S // ..........................................................................................................................................*..................................................... + // sub v31.4S, v19.4S, v20.4S // .......................................................................................................................................................*........................................ + // sub v11.4S, v25.4S, v16.4S // ..........................................................................................................................................................*..................................... + // mla v27.4S, v21.4S, v29.4S // .............................................................................................................................*.................................................................. 
+ // mul v21.4S, v14.4S, v1.S[0] // ..........................................................................*..................................................................................................................... + // sub v28.4S, v10.4S, v28.4S // .........................................................................................................*...................................................................................... + // sub v10.4S, v13.4S, v17.4S // .........................................................................................................................................*...................................................... + // sqrdmulh v8.4S, v14.4S, v1.S[1] // .........................................................................*...................................................................................................................... + // mul v14.4S, v24.4S, v1.S[0] // ................................................................................................................................................*............................................... + // sqrdmulh v17.4S, v24.4S, v1.S[1] // ........................................................................................................................................................*....................................... + // sqrdmulh v13.4S, v22.4S, v0.S[3] // ..................................................................................................................................................................*............................. + // ldr q22, [x0, #128] // .....................................................................................................................................................*.......................................... 
+ // sqrdmulh v24.4S, v10.4S, v1.S[1] // ...............................................................................................................................................*................................................ + // mla v30.4S, v15.4S, v29.4S // .........................................................................................................................................................................................*...... + // mla v21.4S, v8.4S, v29.4S // ...............................................................................*................................................................................................................ + // mla v14.4S, v17.4S, v29.4S // ...............................................................................................................................................................*................................ + // add v17.4S, v19.4S, v20.4S // ...................................................................................................................................................................*............................ + // mla v26.4S, v13.4S, v29.4S // ..........................................................................................................................................................................*..................... + // sqrdmulh v15.4S, v17.4S, v1.S[3] // ..........................................................................................................................................................................................*..... + // sub v13.4S, v22.4S, v27.4S // .............................................................................................................................................................*.................................. 
+ // add v8.4S, v22.4S, v27.4S // ............................................................................................................................................................................*................... + // add v27.4S, v12.4S, v9.4S // ....................................................................................................................................................................................*........... + // sub v19.4S, v28.4S, v21.4S // ......................................................................................................................................................*......................................... + // mul v20.4S, v17.4S, v1.S[2] // ..............................................................................................................................................................................*................. + // add v21.4S, v28.4S, v21.4S // ...................................................................................................................*............................................................................ + // sub v28.4S, v11.4S, v18.4S // ............................................................................................................................................................................................*... + // mul v22.4S, v23.4S, v0.S[2] // ................................................................................................................................................................................*............... + // sqrdmulh v17.4S, v21.4S, v2.S[3] // ....................................................................................................................................*........................................................... 
+ // mul v21.4S, v21.4S, v2.S[2] // ...................................................................................................................................*............................................................ + // add v18.4S, v11.4S, v18.4S // ......................................................................................................................................................................................*......... + // sqrdmulh v11.4S, v23.4S, v0.S[3] // .................................................................................................................................................................................*.............. + // add v23.4S, v13.4S, v14.4S // .........................................................................................................................................................................*...................... + // mul v10.4S, v10.4S, v1.S[0] // .........................................................................................................................................................*...................................... + // mla v21.4S, v17.4S, v29.4S // ...........................................................................................................................................................*.................................... + // sub v9.4S, v12.4S, v9.4S // .....................................................................................................................................................................................*.......... + // sub v13.4S, v13.4S, v14.4S // ...........................................................................................................................................................................................*.... 
+ // mla v22.4S, v11.4S, v29.4S // ........................................................................................................................................................................................*....... + // mla v10.4S, v24.4S, v29.4S // ...................................................................................................................................................................................*............ + // mul v14.4S, v23.4S, v2.S[2] // ...............................................................................................................................................................................................* + // sub v17.4S, v18.4S, v21.4S // .............................................................................................................................................................................................*.. + // sqrdmulh v23.4S, v23.4S, v2.S[3] // .......................................................................................................................................................................................*........ + // sub v11.4S, v8.4S, v22.4S // ..............................................................................................................................................................................................*. sub count, count, #1 cbnz count, layer1234_start - add v16.4S, v8.4S, v16.4S // .................................................................*.............................................................................................................................. - add v12.4S, v27.4S, v21.4S // ...................................................................................................................*............................................................................ 
- sub v19.4S, v19.4S, v13.4S // ..............................................................................................*................................................................................................. - sqrdmulh v22.4S, v10.4S, v1.S[1] // ..................................................................................*............................................................................................................. - mul v15.4S, v12.4S, v4.S[2] // ..................................................................................................................................................*............................................. - sqrdmulh v8.4S, v12.4S, v4.S[3] // ...................................................................................................................................................*............................................ - mul v13.4S, v19.4S, v3.S[0] // ...................................................................................................................................*............................................................ - sqrdmulh v19.4S, v19.4S, v3.S[1] // ....................................................................................................................................*........................................................... - sqrdmulh v12.4S, v18.4S, v1.S[1] // .............................................................................*.................................................................................................................. - mul v10.4S, v10.4S, v1.S[0] // .................................................................................*.............................................................................................................. 
- add v31.4S, v31.4S, v26.4S // ..............................................................................................................*................................................................................. - mla v15.4S, v8.4S, v29.4S // ....................................................................................................................................................*........................................... - sub v26.4S, v24.4S, v11.4S // .........................................................................................*...................................................................................................... - mla v13.4S, v19.4S, v29.4S // .....................................................................................................................................*.......................................................... - mla v10.4S, v22.4S, v29.4S // ...................................................................................*............................................................................................................ - mul v22.4S, v18.4S, v1.S[0] // ............................................................................*................................................................................................................... - add v18.4S, v31.4S, v15.4S // ......................................................................................................................................................*......................................... - sqrdmulh v19.4S, v26.4S, v3.S[1] // ...............................................................................................................................*................................................................ 
- mul v8.4S, v9.4S, v2.S[2] // .........................................................................................................................*...................................................................... - mul v26.4S, v26.4S, v3.S[0] // ..............................................................................................................................*................................................................. - str q18, [x0, #256] // ....................................................................................................................................................................................*........... - add v11.4S, v24.4S, v11.4S // ..........................................................................................*..................................................................................................... - mla v22.4S, v12.4S, v29.4S // ..............................................................................*................................................................................................................. - sqrdmulh v9.4S, v9.4S, v2.S[3] // ..........................................................................................................................*..................................................................... - sub v18.4S, v23.4S, v10.4S // ....................................................................................*........................................................................................................... - sqrdmulh v24.4S, v11.4S, v2.S[3] // .....................................................................................................................*.......................................................................... 
- mla v26.4S, v19.4S, v29.4S // ................................................................................................................................*............................................................... - sub v12.4S, v20.4S, v22.4S // ...............................................................................*................................................................................................................ - sub v19.4S, v18.4S, v13.4S // ......................................................................................................................................*......................................................... - mla v8.4S, v9.4S, v29.4S // ...........................................................................................................................*.................................................................... - add v18.4S, v18.4S, v13.4S // .......................................................................................................................................*........................................................ - sub v13.4S, v12.4S, v26.4S // .................................................................................................................................*.............................................................. - add v12.4S, v12.4S, v26.4S // ..................................................................................................................................*............................................................. - sqrdmulh v26.4S, v18.4S, v6.S[3] // .......................................................................................................................................................................*........................ 
- mul v9.4S, v18.4S, v6.S[2] // ......................................................................................................................................................................*......................... - mul v11.4S, v11.4S, v2.S[2] // ....................................................................................................................*........................................................................... - mul v18.4S, v19.4S, v7.S[0] // ...........................................................................................................................................................................*.................... - add v23.4S, v23.4S, v10.4S // .....................................................................................*.......................................................................................................... - sqrdmulh v10.4S, v19.4S, v7.S[1] // ............................................................................................................................................................................*................... - mla v9.4S, v26.4S, v29.4S // ........................................................................................................................................................................*....................... - add v19.4S, v16.4S, v28.4S // .........................................................................................................*...................................................................................... - sub v26.4S, v23.4S, v8.4S // ............................................................................................................................*................................................................... 
- mla v11.4S, v24.4S, v29.4S // ......................................................................................................................*......................................................................... - mla v18.4S, v10.4S, v29.4S // .............................................................................................................................................................................*.................. - sub v10.4S, v31.4S, v15.4S // .....................................................................................................................................................*.......................................... - add v8.4S, v23.4S, v8.4S // .............................................................................................................................*.................................................................. - add v31.4S, v12.4S, v9.4S // ..........................................................................................................................................................................*..................... - str q10, [x0, #320] // .....................................................................................................................................................................................*.......... - sqrdmulh v15.4S, v26.4S, v6.S[1] // ..................................................................................................................................................................*............................. - add v10.4S, v20.4S, v22.4S // ................................................................................*............................................................................................................... 
- str q31, [x0, #768] // ............................................................................................................................................................................................*... - mul v26.4S, v26.4S, v6.S[0] // .................................................................................................................................................................*.............................. - sub v23.4S, v13.4S, v18.4S // ..............................................................................................................................................................................*................. - mul v20.4S, v8.4S, v5.S[2] // ............................................................................................................................................................*................................... - sqrdmulh v22.4S, v8.4S, v5.S[3] // .............................................................................................................................................................*.................................. - sub v24.4S, v10.4S, v11.4S // .......................................................................................................................*........................................................................ - add v11.4S, v10.4S, v11.4S // ........................................................................................................................*....................................................................... - sub v8.4S, v27.4S, v21.4S // ..................................................................................................................*............................................................................. 
- str q23, [x0, #960] // ...............................................................................................................................................................................................* - mla v26.4S, v15.4S, v29.4S // ...................................................................................................................................................................*............................ - mla v20.4S, v22.4S, v29.4S // ..............................................................................................................................................................*................................. - sub v22.4S, v12.4S, v9.4S // .........................................................................................................................................................................*...................... - mul v12.4S, v8.4S, v5.S[0] // .......................................................................................................................................................*........................................ - add v15.4S, v13.4S, v18.4S // ...............................................................................................................................................................................*................ - add v31.4S, v24.4S, v26.4S // .....................................................................................................................................................................*.......................... - sqrdmulh v9.4S, v8.4S, v5.S[1] // ........................................................................................................................................................*....................................... 
- str q22, [x0, #832] // .............................................................................................................................................................................................*.. - sub v27.4S, v16.4S, v28.4S // ........................................................................................................*....................................................................................... - sub v10.4S, v11.4S, v20.4S // ...............................................................................................................................................................*................................ - str q15, [x0, #896] // ..............................................................................................................................................................................................*. - str q31, [x0, #640] // ..........................................................................................................................................................................................*..... - sqrdmulh v22.4S, v19.4S, v3.S[3] // .........................................................................................................................................*...................................................... - mul v21.4S, v19.4S, v3.S[2] // ........................................................................................................................................*....................................................... - str q10, [x0, #576] // .........................................................................................................................................................................................*...... 
- sqrdmulh v18.4S, v27.4S, v4.S[1] // ..............................................................................................................................................*................................................. - mla v12.4S, v9.4S, v29.4S // .........................................................................................................................................................*...................................... - mul v27.4S, v27.4S, v4.S[0] // .............................................................................................................................................*.................................................. - sub v9.4S, v24.4S, v26.4S // ....................................................................................................................................................................*........................... - mla v21.4S, v22.4S, v29.4S // ..........................................................................................................................................*..................................................... - add v24.4S, v11.4S, v20.4S // ................................................................................................................................................................*............................... - add v16.4S, v17.4S, v12.4S // ...........................................................................................................................................................*.................................... - sub v17.4S, v17.4S, v12.4S // ..........................................................................................................................................................*..................................... 
- str q9, [x0, #704] // ...........................................................................................................................................................................................*.... - str q24, [x0, #512] // ........................................................................................................................................................................................*....... - sub v20.4S, v30.4S, v25.4S // ...................................................................................................*............................................................................................ - mla v27.4S, v18.4S, v29.4S // ...............................................................................................................................................*................................................ - add v24.4S, v14.4S, v21.4S // ............................................................................................................................................*................................................... - sub v8.4S, v14.4S, v21.4S // ...........................................................................................................................................*.................................................... - str q17, [x0, #448] // .......................................................................................................................................................................................*........ - str q16, [x0, #384] // ......................................................................................................................................................................................*......... - str q24, [x0], #(16) // ................................................................................................................................................................................*............... 
- sub v13.4S, v20.4S, v27.4S // ................................................................................................................................................*............................................... - add v19.4S, v20.4S, v27.4S // .................................................................................................................................................*.............................................. - str q8, [x0, #48] // .................................................................................................................................................................................*.............. - str q13, [x0, #176] // ...................................................................................................................................................................................*............ - str q19, [x0, #112] // ..................................................................................................................................................................................*............. + mla v14.4S, v23.4S, v29.4S // ......................................................................................................................*......................................................................... + add v23.4S, v18.4S, v21.4S // .............................................................................................................................*.................................................................. + sqrdmulh v18.4S, v19.4S, v3.S[1] // ...................................................................................................................................*............................................................ 
+ mul v21.4S, v19.4S, v3.S[0] // ....................................................................................................................................*........................................................... + sqrdmulh v12.4S, v23.4S, v5.S[3] // ............................................................................................................................................................*................................... + mul v24.4S, v23.4S, v5.S[2] // .............................................................................................................................................................*.................................. + add v19.4S, v25.4S, v16.4S // .........................*...................................................................................................................................................................... + add v16.4S, v9.4S, v10.4S // ................................................................................*............................................................................................................... + mla v21.4S, v18.4S, v29.4S // .....................................................................................................................................*.......................................................... + sqrdmulh v23.4S, v11.4S, v2.S[1] // ..........................................................................................................*..................................................................................... + mla v24.4S, v12.4S, v29.4S // ..............................................................................................................................................................*................................. 
+ mul v12.4S, v31.4S, v2.S[0] // ................................................................................................................*............................................................................... + add v25.4S, v16.4S, v14.4S // ........................................................................................................................*....................................................................... + sqrdmulh v31.4S, v31.4S, v2.S[1] // ...............................................................................................................*................................................................................ + sub v10.4S, v9.4S, v10.4S // ...............................................................................*................................................................................................................ + sqrdmulh v18.4S, v13.4S, v3.S[1] // ..............................................................................................................................*................................................................. + sub v9.4S, v25.4S, v24.4S // ...............................................................................................................................................................*................................ + add v25.4S, v25.4S, v24.4S // ................................................................................................................................................................*............................... + mul v24.4S, v13.4S, v3.S[0] // ...............................................................................................................................*................................................................ 
+ sub v13.4S, v28.4S, v21.4S // ......................................................................................................................................*......................................................... + str q9, [x0, #576] // .........................................................................................................................................................................................*...... + mul v9.4S, v11.4S, v2.S[0] // ...........................................................................................................*.................................................................................... + mla v12.4S, v31.4S, v29.4S // .................................................................................................................*.............................................................................. + str q25, [x0, #512] // ........................................................................................................................................................................................*....... + sqrdmulh v25.4S, v13.4S, v7.S[1] // ...........................................................................................................................................................................*.................... + mul v11.4S, v13.4S, v7.S[0] // ............................................................................................................................................................................*................... + sub v31.4S, v19.4S, v30.4S // ................................................................*............................................................................................................................... 
+ mla v24.4S, v18.4S, v29.4S // ................................................................................................................................*............................................................... + add v8.4S, v8.4S, v22.4S // ......................................................................*......................................................................................................................... + mla v9.4S, v23.4S, v29.4S // ............................................................................................................*................................................................................... + sub v22.4S, v31.4S, v12.4S // ..................................................................................................................*............................................................................. + mla v11.4S, v25.4S, v29.4S // .............................................................................................................................................................................*.................. + mla v20.4S, v15.4S, v29.4S // .......................................................................................................*........................................................................................ + sub v18.4S, v10.4S, v24.4S // .................................................................................................................................*.............................................................. + mul v23.4S, v22.4S, v5.S[0] // ........................................................................................................................................................*....................................... 
+ sqrdmulh v13.4S, v22.4S, v5.S[1] // .......................................................................................................................................................*........................................ + add v25.4S, v18.4S, v11.4S // ...............................................................................................................................................................................*................ + sub v22.4S, v27.4S, v26.4S // ...........................................................*.................................................................................................................................... + add v21.4S, v28.4S, v21.4S // .......................................................................................................................................*........................................................ + sqrdmulh v15.4S, v8.4S, v1.S[3] // ................................................................................................*............................................................................................... + add v28.4S, v19.4S, v30.4S // .................................................................*.............................................................................................................................. + mla v23.4S, v13.4S, v29.4S // .........................................................................................................................................................*...................................... + sub v30.4S, v22.4S, v9.4S // .............................................................................................................*.................................................................................. 
+ mul v19.4S, v8.4S, v1.S[2] // .................................................................................................*.............................................................................................. + add v8.4S, v31.4S, v12.4S // ...................................................................................................................*............................................................................ + add v12.4S, v28.4S, v20.4S // .........................................................................................................*...................................................................................... + add v31.4S, v27.4S, v26.4S // ............................................................*................................................................................................................................... + sub v26.4S, v30.4S, v23.4S // ..........................................................................................................................................................*..................................... + mul v27.4S, v12.4S, v3.S[2] // .........................................................................................................................................*...................................................... + sqrdmulh v12.4S, v12.4S, v3.S[3] // ........................................................................................................................................*....................................................... + str q26, [x0, #448] // .......................................................................................................................................................................................*........ 
+ mla v19.4S, v15.4S, v29.4S // ..................................................................................................*............................................................................................. + add v13.4S, v30.4S, v23.4S // ...........................................................................................................................................................*.................................... + str q25, [x0, #896] // ..............................................................................................................................................................................................*. + mul v25.4S, v17.4S, v6.S[0] // ..................................................................................................................................................................*............................. + mul v26.4S, v8.4S, v4.S[2] // ...................................................................................................................................................*............................................ + str q13, [x0, #384] // ......................................................................................................................................................................................*......... + sqrdmulh v13.4S, v8.4S, v4.S[3] // ..................................................................................................................................................*............................................. + mla v27.4S, v12.4S, v29.4S // ..........................................................................................................................................*..................................................... 
+ add v8.4S, v31.4S, v19.4S // ....................................................................................................*........................................................................................... + sub v23.4S, v18.4S, v11.4S // ..............................................................................................................................................................................*................. + add v11.4S, v22.4S, v9.4S // ..............................................................................................................*................................................................................. + sub v12.4S, v28.4S, v20.4S // ........................................................................................................*....................................................................................... + mla v26.4S, v13.4S, v29.4S // ....................................................................................................................................................*........................................... + add v15.4S, v8.4S, v27.4S // ............................................................................................................................................*................................................... + sqrdmulh v20.4S, v12.4S, v4.S[1] // .............................................................................................................................................*.................................................. + mul v12.4S, v12.4S, v4.S[0] // ..............................................................................................................................................*................................................. 
+ str q15, [x0], #(16) // ................................................................................................................................................................................*............... + sqrdmulh v9.4S, v21.4S, v6.S[3] // ......................................................................................................................................................................*......................... + mul v13.4S, v21.4S, v6.S[2] // .......................................................................................................................................................................*........................ + sub v15.4S, v8.4S, v27.4S // ...........................................................................................................................................*.................................................... + add v28.4S, v11.4S, v26.4S // ......................................................................................................................................................*......................................... + mla v12.4S, v20.4S, v29.4S // ...............................................................................................................................................*................................................ + sqrdmulh v27.4S, v17.4S, v6.S[1] // .................................................................................................................................................................*.............................. + str q28, [x0, #240] // ....................................................................................................................................................................................*........... 
+ add v21.4S, v10.4S, v24.4S // ..................................................................................................................................*............................................................. + sub v19.4S, v31.4S, v19.4S // ...................................................................................................*............................................................................................ + sub v22.4S, v11.4S, v26.4S // .....................................................................................................................................................*.......................................... + str q23, [x0, #944] // ...............................................................................................................................................................................................* + mla v13.4S, v9.4S, v29.4S // ........................................................................................................................................................................*....................... + add v30.4S, v19.4S, v12.4S // .................................................................................................................................................*.............................................. + sub v12.4S, v19.4S, v12.4S // ................................................................................................................................................*............................................... + str q15, [x0, #48] // .................................................................................................................................................................................*.............. 
+ str q22, [x0, #304] // .....................................................................................................................................................................................*.......... + mla v25.4S, v27.4S, v29.4S // ...................................................................................................................................................................*............................ + sub v14.4S, v16.4S, v14.4S // .......................................................................................................................*........................................................................ + str q12, [x0, #176] // ...................................................................................................................................................................................*............ + sub v27.4S, v21.4S, v13.4S // .........................................................................................................................................................................*...................... + add v9.4S, v21.4S, v13.4S // ..........................................................................................................................................................................*..................... + str q30, [x0, #112] // ..................................................................................................................................................................................*............. + str q9, [x0, #752] // ............................................................................................................................................................................................*... + add v30.4S, v14.4S, v25.4S // .....................................................................................................................................................................*.......................... 
+ str q27, [x0, #816] // .............................................................................................................................................................................................*.. + sub v20.4S, v14.4S, v25.4S // ....................................................................................................................................................................*........................... + str q30, [x0, #624] // ..........................................................................................................................................................................................*..... + str q20, [x0, #688] // ...........................................................................................................................................................................................*.... restore inp, STACK0 mov count, #16 @@ -928,257 +910,465 @@ layer1234_start: qform_root3_tw .req q7 .p2align 2 + // Instructions: 29 + // Expected cycles: 21 + // Expected IPC: 1.38 + // + // Wall time: 0.40s + // User time: 0.40s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q3, [x3], #16 // *............................. + ldr q9, [x1, #48] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q27, [x1, #32] // ...*.......................... + ldr q10, [x3], #8 // ..........*................... + // gap // .............................. + // gap // .............................. + ldr q5, [x4, #16] // ............................*. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q19, [x4, #48] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mul v23.4S, v9.4S, v3.S[0] // ....*......................... + sqrdmulh v9.4S, v9.4S, v3.S[1] // .....*........................ + // gap // .............................. + // gap // .............................. + mul v22.4S, v27.4S, v3.S[0] // .........*.................... + sqrdmulh v0.4S, v27.4S, v3.S[1] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x1, #16] // ......*....................... + mla v23.4S, v9.4S, v29.4S // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v22.4S, v0.4S, v29.4S // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v9.4S, v13.4S, v23.4S // ............*................. + sub v7.4S, v13.4S, v23.4S // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v0.4S, v9.4S, v3.S[3] // ...............*.............. + mul v13.4S, v9.4S, v3.S[2] // ..............*............... + ldr q3, [x1, #0] // ..*........................... + // gap // .............................. + mul v27.4S, v7.4S, v10.S[0] // ................*............. + sqrdmulh v9.4S, v7.4S, v10.S[1] // ...................*.......... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v13.4S, v0.4S, v29.4S // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mla v27.4S, v9.4S, v29.4S // .....................*........ + add v0.4S, v3.4S, v22.4S // ....................*......... + // gap // .............................. + // gap // .............................. + sub v9.4S, v3.4S, v22.4S // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v30.4S, v0.4S, v13.4S // .......................*...... + add v21.4S, v0.4S, v13.4S // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v18.4S, v9.4S, v27.4S // .........................*.... + add v1.4S, v9.4S, v27.4S // ..........................*... + trn2 v25.4S, v21.4S, v30.4S // ...........................*.. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q2, [x3], #16 // *.............................. + // ldr q30, [x1, #48] // .*............................. + // ldr q8, [x1, #0] // .................*............. + // ldr q16, [x1, #32] // ..*............................ + // mul v9.4S, v30.4S, v2.S[0] // ......*........................ + // sqrdmulh v26.4S, v30.4S, v2.S[1] // .......*....................... + // ldr q22, [x1, #16] // ..........*.................... + // sqrdmulh v25.4S, v16.4S, v2.S[1] // .........*..................... 
+ // mla v9.4S, v26.4S, v29.4S // ...........*................... + // mul v16.4S, v16.4S, v2.S[0] // ........*...................... + // ldr q28, [x3], #8 // ...*........................... + // sub v18.4S, v22.4S, v9.4S // ..............*................ + // add v23.4S, v22.4S, v9.4S // .............*................. + // ldr q19, [x4, #48] // .....*......................... + // mul v6.4S, v23.4S, v2.S[2] // ................*.............. + // sqrdmulh v22.4S, v23.4S, v2.S[3] // ...............*............... + // mul v12.4S, v18.4S, v28.S[0] // ..................*............ + // mla v6.4S, v22.4S, v29.4S // ....................*.......... + // mla v16.4S, v25.4S, v29.4S // ............*.................. + // sqrdmulh v22.4S, v18.4S, v28.S[1] // ...................*........... + // add v31.4S, v8.4S, v16.4S // ......................*........ + // mla v12.4S, v22.4S, v29.4S // .....................*......... + // sub v0.4S, v8.4S, v16.4S // .......................*....... + // sub v30.4S, v31.4S, v6.4S // ........................*...... + // add v21.4S, v31.4S, v6.4S // .........................*..... + // sub v18.4S, v0.4S, v12.4S // ..........................*.... + // add v1.4S, v0.4S, v12.4S // ...........................*... + // trn2 v25.4S, v21.4S, v30.4S // ............................*.. + // ldr q5, [x4, #16] // ....*.......................... + + sub count, count, #1 layer5678_start: - ldr q21, [x3], #16 // ....*........................................................ - // gap // ............................................................. - // gap // ............................................................. - ldr q20, [x1, #48] // ...*......................................................... - ldr q9, [x4, #80] // .......................................*..................... - ldr q12, [x1, #32] // ..*.......................................................... - // gap // ............................................................. 
- // gap // ............................................................. - ldr q31, [x1, #16] // .*........................................................... - ldr q23, [x3], #8 // .....*....................................................... - // gap // ............................................................. - // gap // ............................................................. - ldr q10, [x1, #0] // *............................................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v16.4S, v20.4S, v21.S[0] // ...........*................................................. - sqrdmulh v3.4S, v20.4S, v21.S[1] // ............*................................................ - ldr q30, [x4, #32] // ....................................*........................ - // gap // ............................................................. - mul v4.4S, v12.4S, v21.S[0] // ......*...................................................... - sqrdmulh v26.4S, v12.4S, v21.S[1] // .......*..................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v16.4S, v3.4S, v29.4S // .............*............................................... - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - mla v4.4S, v26.4S, v29.4S // ........*.................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - add v14.4S, v31.4S, v16.4S // ...............*............................................. - sub v25.4S, v31.4S, v16.4S // ..............*.............................................. - // gap // ............................................................. - // gap // ............................................................. - sub v6.4S, v10.4S, v4.4S // .........*................................................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mul v3.4S, v14.4S, v21.S[2] // ................*............................................ - sqrdmulh v16.4S, v14.4S, v21.S[3] // .................*........................................... - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v8.4S, v25.4S, v23.S[1] // ......................*...................................... - mul v25.4S, v25.4S, v23.S[0] // .....................*....................................... - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v3.4S, v16.4S, v29.4S // ..................*.......................................... - add v16.4S, v10.4S, v4.4S // ..........*.................................................. - ldr q4, [x4, #16] // ...................................*......................... - // gap // ............................................................. - mla v25.4S, v8.4S, v29.4S // .......................*..................................... - ldr q8, [x4], #(6*16) // ..................................*.......................... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - add v19.4S, v16.4S, v3.4S // ....................*........................................ - sub v26.4S, v16.4S, v3.4S // ...................*......................................... - // gap // ............................................................. - // gap // ............................................................. - add v16.4S, v6.4S, v25.4S // .........................*................................... - sub v3.4S, v6.4S, v25.4S // ........................*.................................... - // gap // ............................................................. 
- // gap // ............................................................. - trn2 v12.4S, v19.4S, v26.4S // ...........................*................................. - trn1 v20.4S, v19.4S, v26.4S // ..........................*.................................. - // gap // ............................................................. - // gap // ............................................................. - trn2 v11.4S, v16.4S, v3.4S // .............................*............................... - trn1 v0.4S, v16.4S, v3.4S // ............................*................................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - trn1 v10.2D, v12.2D, v11.2D // .................................*........................... - trn2 v16.2D, v12.2D, v11.2D // ...............................*............................. - // gap // ............................................................. - // gap // ............................................................. - trn2 v21.2D, v20.2D, v0.2D // ..............................*.............................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v3.4S, v16.4S, v4.4S // ..............................................*.............. - mul v16.4S, v16.4S, v8.4S // .............................................*............... - // gap // ............................................................. 
- // gap // ............................................................. - mul v15.4S, v21.4S, v8.4S // ........................................*.................... - sqrdmulh v11.4S, v21.4S, v4.4S // .........................................*................... - ldr q4, [x4, #-32] // ......................................*...................... - // gap // ............................................................. - trn1 v8.2D, v20.2D, v0.2D // ................................*............................ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v16.4S, v3.4S, v29.4S // ...............................................*............. - ldr q25, [x4, #-48] // .....................................*....................... - // gap // ............................................................. - // gap // ............................................................. - mla v15.4S, v11.4S, v29.4S // ..........................................*.................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sub v19.4S, v10.4S, v16.4S // ................................................*............ - add v16.4S, v10.4S, v16.4S // .................................................*........... - // gap // ............................................................. 
- // gap // ............................................................. - sub v3.4S, v8.4S, v15.4S // ...........................................*................. - add v24.4S, v8.4S, v15.4S // ............................................*................ - // gap // ............................................................. - // gap // ............................................................. - sqrdmulh v8.4S, v16.4S, v25.4S // ...................................................*......... - mul v21.4S, v16.4S, v30.4S // ..................................................*.......... - // gap // ............................................................. - // gap // ............................................................. - mul v16.4S, v19.4S, v4.4S // .......................................................*..... - sqrdmulh v26.4S, v19.4S, v9.4S // ........................................................*.... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v21.4S, v8.4S, v29.4S // ....................................................*........ - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - mla v16.4S, v26.4S, v29.4S // .........................................................*... - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - sub v8.4S, v24.4S, v21.4S // .....................................................*....... - add v7.4S, v24.4S, v21.4S // ......................................................*...... - // gap // ............................................................. - // gap // ............................................................. - sub v10.4S, v3.4S, v16.4S // ..........................................................*.. - add v9.4S, v3.4S, v16.4S // ...........................................................*. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - st4 {v7.4S, v8.4S, v9.4S, v10.4S}, [x1], #64 // ............................................................* - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. 
- // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. + // Instructions: 61 + // Expected cycles: 26 + // Expected IPC: 2.35 + // + // Wall time: 111.11s + // User time: 111.11s + // + // -------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + ldr q20, [x4], #(6*16) // ..................................*.......................... + trn1 v17.4S, v1.4S, v18.4S // ............................*................................ + trn2 v9.4S, v1.4S, v18.4S // .............................*............................... + // gap // ............................................................. + trn1 v14.4S, v21.4S, v30.4S // ..........................*.................................. + ldr q2, [x3], #16 // ....e........................................................ + // gap // ............................................................. + ldr q30, [x1, #112] // ...e......................................................... + trn1 v10.2D, v25.2D, v9.2D // .................................*........................... + ldr q8, [x1, #64] // e............................................................ + trn2 v15.2D, v25.2D, v9.2D // ...............................*............................. + // gap // ............................................................. + ldr q27, [x4, #-32] // ......................................*...................... 
+ trn2 v1.2D, v14.2D, v17.2D // ..............................*.............................. + // gap // ............................................................. + // gap // ............................................................. + mul v13.4S, v15.4S, v20.4S // ..............................................*.............. + sqrdmulh v21.4S, v15.4S, v5.4S // .............................................*............... + // gap // ............................................................. + ldr q16, [x1, #96] // ..e.......................................................... + // gap // ............................................................. + // gap // ............................................................. + mul v9.4S, v30.4S, v2.S[0] // ............e................................................ + sqrdmulh v26.4S, v30.4S, v2.S[1] // ...........e................................................. + mul v24.4S, v1.4S, v20.4S // .........................................*................... + // gap // ............................................................. + // gap // ............................................................. + sqrdmulh v4.4S, v1.4S, v5.4S // ........................................*.................... + trn1 v15.2D, v14.2D, v17.2D // ................................*............................ + // gap // ............................................................. + ldr q22, [x1, #80] // .e........................................................... + mla v13.4S, v21.4S, v29.4S // ...............................................*............. + sqrdmulh v25.4S, v16.4S, v2.S[1] // ......e...................................................... + // gap // ............................................................. + mla v9.4S, v26.4S, v29.4S // .............e............................................... + ldr q17, [x4, #-16] // .......................................*..................... 
+ mla v24.4S, v4.4S, v29.4S // ..........................................*.................. + // gap // ............................................................. + mul v16.4S, v16.4S, v2.S[0] // .......e..................................................... + ldr q26, [x4, #-64] // ....................................*........................ + add v7.4S, v10.4S, v13.4S // .................................................*........... + // gap // ............................................................. + // gap // ............................................................. + sub v1.4S, v10.4S, v13.4S // ................................................*............ + // gap // ............................................................. + ldr q28, [x3], #8 // .....e....................................................... + sub v18.4S, v22.4S, v9.4S // ..............e.............................................. + add v23.4S, v22.4S, v9.4S // ...............e............................................. + sqrdmulh v5.4S, v1.4S, v17.4S // .......................................................*..... + sqrdmulh v20.4S, v7.4S, v19.4S // ..................................................*.......... + ldr q19, [x4, #48] // .....................................e....................... + // gap // ............................................................. + mul v11.4S, v1.4S, v27.4S // ........................................................*.... + mul v31.4S, v7.4S, v26.4S // ...................................................*......... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mul v6.4S, v23.4S, v2.S[2] // .................e........................................... 
+ sqrdmulh v22.4S, v23.4S, v2.S[3] // ................e............................................ + mul v12.4S, v18.4S, v28.S[0] // ......................e...................................... + add v1.4S, v15.4S, v24.4S // ............................................*................ + // gap // ............................................................. + // gap // ............................................................. + mla v31.4S, v20.4S, v29.4S // ....................................................*........ + // gap // ............................................................. + // gap // ............................................................. + mla v11.4S, v5.4S, v29.4S // .........................................................*... + sub v20.4S, v15.4S, v24.4S // ...........................................*................. + // gap // ............................................................. + // gap // ............................................................. + mla v6.4S, v22.4S, v29.4S // ..................e.......................................... + // gap // ............................................................. + // gap // ............................................................. + mla v16.4S, v25.4S, v29.4S // ........e.................................................... + sqrdmulh v22.4S, v18.4S, v28.S[1] // .....................e....................................... + add v4.4S, v20.4S, v11.4S // ...........................................................*. + sub v3.4S, v1.4S, v31.4S // .....................................................*....... + // gap // ............................................................. + // gap // ............................................................. + sub v5.4S, v20.4S, v11.4S // ..........................................................*.. + // gap // ............................................................. 
+ // gap // ............................................................. + add v2.4S, v1.4S, v31.4S // ......................................................*...... + add v31.4S, v8.4S, v16.4S // ..........e.................................................. + // gap // ............................................................. + mla v12.4S, v22.4S, v29.4S // .......................e..................................... + // gap // ............................................................. + st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ............................................................* + sub v0.4S, v8.4S, v16.4S // .........e................................................... + // gap // ............................................................. + // gap // ............................................................. + sub v30.4S, v31.4S, v6.4S // ...................e......................................... + // gap // ............................................................. + // gap // ............................................................. + add v21.4S, v31.4S, v6.4S // ....................e........................................ + sub v18.4S, v0.4S, v12.4S // ........................e.................................... + // gap // ............................................................. + add v1.4S, v0.4S, v12.4S // .........................e................................... + // gap // ............................................................. + trn2 v25.4S, v21.4S, v30.4S // ...........................e................................. + // gap // ............................................................. + // gap // ............................................................. + ldr q5, [x4, #16] // ...................................e......................... - // original source code - // ldr q8, [x1, #(16*0)] // ......*...................................................... 
- // ldr q9, [x1, #(16*1)] // ....*........................................................ - // ldr q10, [x1, #(16*2)] // ...*......................................................... - // ldr q11, [x1, #(16*3)] // .*........................................................... - // ldr q0, [x3], #16 // *............................................................ - // ldr q1, [x3], #8 // .....*....................................................... - // mul v24.4s, v10.4s, v0.s[0] // ..........*.................................................. - // sqrdmulh v10.4s, v10.4s, v0.s[1] // ...........*................................................. - // mla v24.4s, v10.4s, v29.4s // .............*............................................... - // sub v10.4s, v8.4s, v24.4s // ................*............................................ - // add v8.4s, v8.4s, v24.4s // ......................*...................................... - // mul v24.4s, v11.4s, v0.s[0] // .......*..................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[1] // ........*.................................................... - // mla v24.4s, v11.4s, v29.4s // ............*................................................ - // sub v11.4s, v9.4s, v24.4s // ...............*............................................. - // add v9.4s, v9.4s, v24.4s // ..............*.............................................. - // mul v24.4s, v9.4s, v0.s[2] // .................*........................................... - // sqrdmulh v9.4s, v9.4s, v0.s[3] // ..................*.......................................... - // mla v24.4s, v9.4s, v29.4s // .....................*....................................... - // sub v9.4s, v8.4s, v24.4s // ...........................*................................. - // add v8.4s, v8.4s, v24.4s // ..........................*.................................. 
- // mul v24.4s, v11.4s, v1.s[0] // ....................*........................................ - // sqrdmulh v11.4s, v11.4s, v1.s[1] // ...................*......................................... - // mla v24.4s, v11.4s, v29.4s // ........................*.................................... - // sub v11.4s, v10.4s, v24.4s // .............................*............................... - // add v10.4s, v10.4s, v24.4s // ............................*................................ - // trn1 v25.4s, v8.4s, v9.4s // ...............................*............................. - // trn2 v26.4s, v8.4s, v9.4s // ..............................*.............................. - // trn1 v27.4s, v10.4s, v11.4s // .................................*........................... - // trn2 v28.4s, v10.4s, v11.4s // ................................*............................ - // trn2 v10.2d, v25.2d, v27.2d // ....................................*........................ - // trn2 v11.2d, v26.2d, v28.2d // ...................................*......................... - // trn1 v8.2d, v25.2d, v27.2d // ..........................................*.................. - // trn1 v9.2d, v26.2d, v28.2d // ..................................*.......................... - // ldr q0, [x4], #(6*16) // .........................*................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // .......................*..................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // .........*................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ............................................*................ - // ldr q2, [x4, #(-6*16 + 4*16)] // .........................................*................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..*.......................................................... - // mul v24.4s, v10.4s, v0.4s // .......................................*..................... 
- // sqrdmulh v10.4s, v10.4s, v4.4s // ........................................*.................... - // mla v24.4s, v10.4s, v29.4s // .............................................*............... - // sub v10.4s, v8.4s, v24.4s // ................................................*............ - // add v8.4s, v8.4s, v24.4s // .................................................*........... - // mul v24.4s, v11.4s, v0.4s // ......................................*...................... - // sqrdmulh v11.4s, v11.4s, v4.4s // .....................................*....................... - // mla v24.4s, v11.4s, v29.4s // ...........................................*................. - // sub v11.4s, v9.4s, v24.4s // ..............................................*.............. - // add v9.4s, v9.4s, v24.4s // ...............................................*............. - // mul v24.4s, v9.4s, v1.4s // ...................................................*......... - // sqrdmulh v9.4s, v9.4s, v5.4s // ..................................................*.......... - // mla v24.4s, v9.4s, v29.4s // ......................................................*...... - // sub v9.4s, v8.4s, v24.4s // ........................................................*.... - // add v8.4s, v8.4s, v24.4s // .........................................................*... - // mul v24.4s, v11.4s, v2.4s // ....................................................*........ - // sqrdmulh v11.4s, v11.4s, v6.4s // .....................................................*....... - // mla v24.4s, v11.4s, v29.4s // .......................................................*..... - // sub v11.4s, v10.4s, v24.4s // ..........................................................*.. - // add v10.4s, v10.4s, v24.4s // ...........................................................*. 
- // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ............................................................* + // ------------------------------------------------ new position ------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---------- + // ldr q8, [x1, #(16*0)] // ...e.....................................................'......~.............................................. + // ldr q9, [x1, #(16*1)] // ...............e.........................................'..................~.................................. + // ldr q10, [x1, #(16*2)] // .........e...............................................'............~........................................ + // ldr q11, [x1, #(16*3)] // .e.......................................................'....~................................................ + // ldr q0, [x3], #16 // e........................................................'...~................................................. + // ldr q1, [x3], #8 // .........................e...............................'............................~........................ + // sqrdmulh v27.4s, v10.4s, v0.s[1] // .................e.......................................'....................~................................ + // mul v24.4s, v10.4s, v0.s[0] // .....................e...................................'........................~............................ + // mla v24.4s, v27.4s, v29.4s // .........................................e...............'............................................~........ + // sub v10.4s, v8.4s, v24.4s // ..................................................e......'..................................................... + // add v8.4s, v8.4s, v24.4s // ...............................................e.........'..................................................~.. 
+ // sqrdmulh v27.4s, v11.4s, v0.s[1] // ...........e.............................................'..............~...................................... + // mul v24.4s, v11.4s, v0.s[0] // ..........e..............................................'.............~....................................... + // mla v24.4s, v27.4s, v29.4s // ..................e......................................'.....................~............................... + // sub v11.4s, v9.4s, v24.4s // ..........................e..............................'.............................~....................... + // add v9.4s, v9.4s, v24.4s // ...........................e.............................'..............................~...................... + // sqrdmulh v27.4s, v9.4s, v0.s[3] // ..................................e......................'.....................................~............... + // mul v24.4s, v9.4s, v0.s[2] // .................................e.......................'....................................~................ + // mla v24.4s, v27.4s, v29.4s // ........................................e................'...........................................~......... + // sub v9.4s, v8.4s, v24.4s // ...................................................e.....'..................................................... + // add v8.4s, v8.4s, v24.4s // ....................................................e....'..................................................... + // sqrdmulh v27.4s, v11.4s, v1.s[1] // ..........................................e..............'.............................................~....... + // mul v24.4s, v11.4s, v1.s[0] // ...................................e.....................'......................................~.............. + // mla v24.4s, v27.4s, v29.4s // ................................................e........'...................................................~. 
+ // sub v11.4s, v10.4s, v24.4s // .....................................................e...'..................................................... + // add v10.4s, v10.4s, v24.4s // ......................................................e..'..................................................... + // trn1 v25.4s, v8.4s, v9.4s // .........................................................'..*.................................................. + // trn2 v26.4s, v8.4s, v9.4s // .......................................................e.'..................................................... + // trn1 v27.4s, v10.4s, v11.4s // .........................................................'*.................................................... + // trn2 v28.4s, v10.4s, v11.4s // .........................................................'.*................................................... + // trn2 v10.2d, v25.2d, v27.2d // ......~..................................................'.........*........................................... + // trn2 v11.2d, v26.2d, v28.2d // ....~....................................................'.......*............................................. + // trn1 v8.2d, v25.2d, v27.2d // ..............~..........................................'.................*................................... + // trn1 v9.2d, v26.2d, v28.2d // ..~......................................................'.....*............................................... + // ldr q0, [ x4], #(6*16) // .........................................................*..................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ........................................................e'..................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ......................~..................................'.........................*........................... 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ..............................e..........................'.................................~................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // .....~...................................................'........*............................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ...................~.....................................'......................*.............................. + // sqrdmulh v27.4s, v10.4s, v4.4s // .............~...........................................'................*.................................... + // mul v24.4s, v10.4s, v0.4s // ............~............................................'...............*..................................... + // mla v24.4s, v27.4s, v29.4s // ....................~....................................'.......................*............................. + // sub v10.4s, v8.4s, v24.4s // .......................................~.................'..........................................*.......... + // add v8.4s, v8.4s, v24.4s // ....................................~....................'.......................................*............. + // sqrdmulh v27.4s, v11.4s, v4.4s // ........~................................................'...........*......................................... + // mul v24.4s, v11.4s, v0.4s // .......~.................................................'..........*.......................................... + // mla v24.4s, v27.4s, v29.4s // ................~........................................'...................*................................. + // sub v11.4s, v9.4s, v24.4s // ........................~................................'...........................*......................... + // add v9.4s, v9.4s, v24.4s // .......................~.................................'..........................*.......................... 
+ // sqrdmulh v27.4s, v9.4s, v5.4s // .............................~...........................'................................*.................... + // mul v24.4s, v9.4s, v1.4s // ................................~........................'...................................*................. + // mla v24.4s, v27.4s, v29.4s // .....................................~...................'........................................*............ + // sub v9.4s, v8.4s, v24.4s // ............................................~............'...............................................*..... + // add v8.4s, v8.4s, v24.4s // ..............................................~..........'.................................................*... + // sqrdmulh v27.4s, v11.4s, v6.4s // ............................~............................'...............................*..................... + // mul v24.4s, v11.4s, v2.4s // ...............................~.........................'..................................*.................. + // mla v24.4s, v27.4s, v29.4s // ......................................~..................'.........................................*........... + // sub v11.4s, v10.4s, v24.4s // .............................................~...........'................................................*.... + // add v10.4s, v10.4s, v24.4s // ...........................................~.............'..............................................*...... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .................................................~.......'....................................................* sub count, count, #1 cbnz count, layer5678_start + // Instructions: 32 + // Expected cycles: 25 + // Expected IPC: 1.28 + // + // Wall time: 0.43s + // User time: 0.43s + // + // ------ original position ------> + // 0 25 + // |------------------------|------ + trn2 v15.4S, v1.4S, v18.4S // ..*............................. 
+ trn1 v16.4S, v1.4S, v18.4S // .*.............................. + ldr q23, [x4], #(6*16) // *............................... + // gap // ................................ + trn1 v31.4S, v21.4S, v30.4S // ...*............................ + ldr q11, [x4, #-32] // ......*......................... + // gap // ................................ + // gap // ................................ + trn1 v4.2D, v25.2D, v15.2D // ....*........................... + trn2 v1.2D, v25.2D, v15.2D // .....*.......................... + ldr q18, [x4, #-16] // ..............*................. + // gap // ................................ + trn2 v12.2D, v31.2D, v16.2D // .......*........................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v24.4S, v1.4S, v23.4S // ........*....................... + sqrdmulh v9.4S, v1.4S, v5.4S // .........*...................... + // gap // ................................ + // gap // ................................ + sqrdmulh v14.4S, v12.4S, v5.4S // ...........*.................... + mul v0.4S, v12.4S, v23.4S // ..........*..................... + // gap // ................................ + // gap // ................................ + trn1 v26.2D, v31.2D, v16.2D // ............*................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + mla v24.4S, v9.4S, v29.4S // .............*.................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + mla v0.4S, v14.4S, v29.4S // ...............*................ + ldr q14, [x4, #-64] // ................*............... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + sub v6.4S, v4.4S, v24.4S // ..................*............. + add v24.4S, v4.4S, v24.4S // .................*.............. + // gap // ................................ + // gap // ................................ + add v25.4S, v26.4S, v0.4S // .......................*........ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v16.4S, v6.4S, v11.4S // .....................*.......... + sqrdmulh v3.4S, v6.4S, v18.4S // ...................*............ + // gap // ................................ + // gap // ................................ + sqrdmulh v15.4S, v24.4S, v19.4S // ....................*........... + mul v17.4S, v24.4S, v14.4S // ......................*......... + // gap // ................................ + // gap // ................................ + sub v10.4S, v26.4S, v0.4S // ..........................*..... + // gap // ................................ + // gap // ................................ + // gap // ................................ + mla v16.4S, v3.4S, v29.4S // .........................*...... + // gap // ................................ + // gap // ................................ + // gap // ................................ + mla v17.4S, v15.4S, v29.4S // ........................*....... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v8.4S, v10.4S, v16.4S // ...........................*.... + sub v9.4S, v10.4S, v16.4S // .............................*.. + // gap // ................................ + // gap // ................................ 
+ sub v7.4S, v25.4S, v17.4S // ............................*... + add v6.4S, v25.4S, v17.4S // ..............................*. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + st4 {v6.4S, v7.4S, v8.4S, v9.4S}, [x1], #64 // ...............................* + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + + // -------- new position ---------> + // 0 25 + // |------------------------|------ + // ldr q20, [x4], #(6*16) // ..*............................. + // trn1 v17.4S, v1.4S, v18.4S // .*.............................. + // trn2 v9.4S, v1.4S, v18.4S // *............................... + // trn1 v14.4S, v21.4S, v30.4S // ...*............................ + // trn1 v10.2D, v25.2D, v9.2D // .....*.......................... + // trn2 v15.2D, v25.2D, v9.2D // ......*......................... + // ldr q27, [x4, #-32] // ....*........................... + // trn2 v1.2D, v14.2D, v17.2D // ........*....................... + // mul v13.4S, v15.4S, v20.4S // .........*...................... + // sqrdmulh v21.4S, v15.4S, v5.4S // ..........*..................... + // mul v24.4S, v1.4S, v20.4S // ............*................... 
+ // sqrdmulh v4.4S, v1.4S, v5.4S // ...........*.................... + // trn1 v15.2D, v14.2D, v17.2D // .............*.................. + // mla v13.4S, v21.4S, v29.4S // ..............*................. + // ldr q17, [x4, #-16] // .......*........................ + // mla v24.4S, v4.4S, v29.4S // ...............*................ + // ldr q26, [x4, #-64] // ................*............... + // add v7.4S, v10.4S, v13.4S // ..................*............. + // sub v1.4S, v10.4S, v13.4S // .................*.............. + // sqrdmulh v5.4S, v1.4S, v17.4S // .....................*.......... + // sqrdmulh v20.4S, v7.4S, v19.4S // ......................*......... + // mul v11.4S, v1.4S, v27.4S // ....................*........... + // mul v31.4S, v7.4S, v26.4S // .......................*........ + // add v1.4S, v15.4S, v24.4S // ...................*............ + // mla v31.4S, v20.4S, v29.4S // ..........................*..... + // mla v11.4S, v5.4S, v29.4S // .........................*...... + // sub v20.4S, v15.4S, v24.4S // ........................*....... + // add v4.4S, v20.4S, v11.4S // ...........................*.... + // sub v3.4S, v1.4S, v31.4S // .............................*.. + // sub v5.4S, v20.4S, v11.4S // ............................*... + // add v2.4S, v1.4S, v31.4S // ..............................*. 
+ // st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ...............................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a55.s b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a55.s index f838ca3c..16bc9e8f 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a55.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a55.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, 
\a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -131,40 +103,40 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro 
load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, 
sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -251,7 +223,7 @@ roots: .text .global ntt_dilithium_123_45678_manual_st4_opt_a55 - .global _ntt_dilithium_123_45678_manual_st4_opt_a55 + .global _ntt_dilithium_123_45678_manual_st4 .p2align 4 const_addr: .word 8380417 @@ -375,856 +347,1687 @@ _ntt_dilithium_123_45678_manual_st4_opt_a55: load_roots_123 .p2align 2 - ldr_vo v24, x0, 512 // ....*..... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v15, x0, 0 // *......... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v23.4S, v24.4S, v0.S[1] // ......*... - // gap // .......... - ldr_vo v29, x0, 128 // .*........ - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v11, x0, 256 // ..*....... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v14, x0, 384 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v4, x0, 896 // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... 
- ldr_vo v21, x0, 640 // .....*.... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v28.4S, v4.4S, v0.S[1] // .........* - // gap // .......... - ldr_vo v13, x0, 768 // ........*. - // gap // .......... - - // original source code - // ldr_vo v15, x0, 0 // .*........ || ..*.............. - // ldr_vo v29, x0, 128 // ...*...... || .....*........... - // ldr_vo v11, x0, 256 // ....*..... || .......*......... - // ldr_vo v14, x0, 384 // .....*.... || .........*....... - // ldr_vo v24, x0, 512 // *......... || *................ - // ldr_vo v21, x0, 640 // .......*.. || .............*... - // sqrdmulh v23.4S, v24.4S, v0.S[1] // ..*....... || ....*............ - // ldr_vo v4, x0, 896 // ......*... || ...........*..... - // ldr_vo v13, x0, 768 // .........* || ................* - // sqrdmulh v28.4S, v4.4S, v0.S[1] // ........*. || ...............*. - + // Instructions: 10 + // Expected cycles: 17 + // Expected IPC: 0.59 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x0, #512] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #896] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q9, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #256] // ..*........................... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + ldr q26, [x0, #384] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #640] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v12.4S, v20.4S, v0.S[0] // .........*.................... + // gap // .............................. + mul v19.4S, v6.4S, v0.S[0] // ......*....................... + // gap // .............................. + ldr q31, [x0, #768] // ........*..................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q9, [x0, #0] // ..*............................ + // ldr q15, [x0, #128] // ...*........................... + // ldr q4, [x0, #256] // ....*.......................... + // ldr q26, [x0, #384] // .....*......................... + // ldr q6, [x0, #512] // *.............................. + // ldr q22, [x0, #640] // ......*........................ + // mul v19.4S, v6.4S, v0.S[0] // ........*...................... + // ldr q20, [x0, #896] // .*............................. + // ldr q31, [x0, #768] // .........*..................... + // mul v12.4S, v20.4S, v0.S[0] // .......*....................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v24.4S, v24.4S, v0.S[0] // ........*................................................................... + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Wall time: 3.97s + // User time: 3.97s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v6.4S, v6.4S, v0.S[1] // ........*................................................................... 
+ // gap // ............................................................................ + sqrdmulh v18.4S, v22.4S, v0.S[1] // .............*.............................................................. + // gap // ............................................................................ + mul v16.4S, v22.4S, v0.S[0] // ..............*............................................................. + // gap // ............................................................................ + sqrdmulh v22.4S, v31.4S, v0.S[1] // ..................*......................................................... + // gap // ............................................................................ + mls v19.4S, v6.4S, v8.S[0] // ..........*................................................................. + // gap // ............................................................................ + mul v6.4S, v31.4S, v0.S[0] // ...................*........................................................ + // gap // ............................................................................ + mls v16.4S, v18.4S, v8.S[0] // ...............*............................................................ + // gap // ............................................................................ + sqrdmulh v18.4S, v20.4S, v0.S[1] // .......................*.................................................... + // gap // ............................................................................ + sub v20.4S, v9.4S, v19.4S // ...........*................................................................ + // gap // ............................................................................ + mls v6.4S, v22.4S, v8.S[0] // ....................*....................................................... + // gap // ............................................................................ + add v22.4S, v9.4S, v19.4S // ............*............................................................... 
+ // gap // ............................................................................ + sub v19.4S, v15.4S, v16.4S // ................*........................................................... + // gap // ............................................................................ + add v16.4S, v15.4S, v16.4S // .................*.......................................................... + // gap // ............................................................................ + sub v31.4S, v4.4S, v6.4S // .....................*...................................................... + // gap // ............................................................................ + add v6.4S, v4.4S, v6.4S // ......................*..................................................... + // gap // ............................................................................ + mls v12.4S, v18.4S, v8.S[0] // .........................*.................................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v31.4S, v1.S[1] // ......................................*..................................... + // gap // ............................................................................ + mul v31.4S, v31.4S, v1.S[0] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v9.4S, v6.4S, v0.S[3] // ............................*............................................... + // gap // ............................................................................ + sub v15.4S, v26.4S, v12.4S // ..........................*................................................. + // gap // ............................................................................ + add v12.4S, v26.4S, v12.4S // ...........................*................................................ 
+ // gap // ............................................................................ + mls v31.4S, v18.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + mul v6.4S, v6.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v15.4S, v1.S[1] // ...........................................*................................ + // gap // ............................................................................ + mul v15.4S, v15.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + sub v4.4S, v20.4S, v31.4S // .........................................*.................................. + // gap // ............................................................................ + add v20.4S, v20.4S, v31.4S // ..........................................*................................. + // gap // ............................................................................ + mls v6.4S, v9.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v31.4S, v12.4S, v0.S[3] // .................................*.......................................... + // gap // ............................................................................ + mls v15.4S, v18.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + mul v18.4S, v12.4S, v0.S[2] // ..................................*......................................... 
+ // gap // ............................................................................ + sub v12.4S, v22.4S, v6.4S // ...............................*............................................ + // gap // ............................................................................ + add v6.4S, v22.4S, v6.4S // ................................*........................................... + // gap // ............................................................................ + sub v22.4S, v19.4S, v15.4S // ..............................................*............................. + // gap // ............................................................................ + add v19.4S, v19.4S, v15.4S // ...............................................*............................ + // gap // ............................................................................ + mls v18.4S, v31.4S, v8.S[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v22.4S, v3.S[1] // ...............................................................*............ + // gap // ............................................................................ + sqrdmulh v9.4S, v19.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + mul v19.4S, v19.4S, v2.S[2] // ...........................................................*................ + // gap // ............................................................................ + sub v15.4S, v16.4S, v18.4S // ....................................*....................................... + // gap // ............................................................................ + add v18.4S, v16.4S, v18.4S // .....................................*...................................... 
+ // gap // ............................................................................ + mul v16.4S, v22.4S, v3.S[0] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v22.4S, v15.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v26.4S, v18.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + mul v18.4S, v18.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ + mul v15.4S, v15.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ + mls v19.4S, v9.4S, v8.S[0] // ............................................................*............... + // gap // ............................................................................ + mls v16.4S, v31.4S, v8.S[0] // .................................................................*.......... + // gap // ............................................................................ + mls v18.4S, v26.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + mls v15.4S, v22.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + sub v22.4S, v20.4S, v19.4S // .............................................................*.............. 
+ // gap // ............................................................................ + sub v31.4S, v4.4S, v16.4S // ..................................................................*......... + // gap // ............................................................................ + add v16.4S, v4.4S, v16.4S // ...................................................................*........ + // gap // ............................................................................ + add v19.4S, v20.4S, v19.4S // ..............................................................*............. + // gap // ............................................................................ + sub v20.4S, v6.4S, v18.4S // ...................................................*........................ + // gap // ............................................................................ + add v6.4S, v6.4S, v18.4S // ....................................................*....................... + // gap // ............................................................................ + sub v18.4S, v12.4S, v15.4S // ........................................................*................... + // gap // ............................................................................ + add v12.4S, v12.4S, v15.4S // .........................................................*.................. + // gap // ............................................................................ + str q6, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + ldr q9, [x0, #0] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + str q20, [x0, #112] // .....................................................................*...... + // gap // ............................................................................ + ldr q15, [x0, #128] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q12, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + ldr q4, [x0, #256] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q18, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + ldr q26, [x0, #384] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #496] // ........................................................................*... + // gap // ............................................................................ 
+ ldr q6, [x0, #512] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q22, [x0, #624] // .........................................................................*.. + // gap // ............................................................................ + ldr q22, [x0, #640] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ + mul v19.4S, v6.4S, v0.S[0] // .........e.................................................................. + // gap // ............................................................................ + str q31, [x0, #880] // ...........................................................................* + // gap // ............................................................................ + ldr q20, [x0, #896] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q31, [x0, #768] // ......e..................................................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v12.4S, v20.4S, v0.S[0] // ........................e................................................... + // gap // ............................................................................ + + // ------------------------------------- new position --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------------- + // ldr q9, [x0, #0] // e................'..........................................................~............. + // ldr q10, [x0, #(1*(1024/8))] // ..e..............'............................................................~........... + // ldr q11, [x0, #(2*(1024/8))] // ....e............'..............................................................~......... + // ldr q12, [x0, #(3*(1024/8))] // ......e..........'................................................................~....... + // ldr q13, [x0, #(4*(1024/8))] // ........e........'..................................................................~..... + // ldr q14, [x0, #(5*(1024/8))] // ..........e......'....................................................................~... + // ldr q15, [x0, #(6*(1024/8))] // ...............e.'........................................................................ + // ldr q16, [x0, #(7*(1024/8))] // ..............e..'........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .................*........................................................................ + // mul v24.4s, v13.4s, v0.s[0] // ............e....'......................................................................~. 
+ // mls v24.4s, v27.4s, v8.s[0] // .................'...*.................................................................... + // sub v13.4s, v9.4s, v24.4s // .................'.......*................................................................ + // add v9.4s, v9.4s, v24.4s // .................'.........*.............................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .................'*....................................................................... + // mul v24.4s, v14.4s, v0.s[0] // .................'.*...................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'.....*.................................................................. + // sub v14.4s, v10.4s, v24.4s // .................'..........*............................................................. + // add v10.4s, v10.4s, v24.4s // .................'...........*............................................................ + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .................'..*..................................................................... + // mul v24.4s, v15.4s, v0.s[0] // .................'....*................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'........*............................................................... + // sub v15.4s, v11.4s, v24.4s // .................'............*........................................................... + // add v11.4s, v11.4s, v24.4s // .................'.............*.......................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................'......*................................................................. + // mul v24.4s, v16.4s, v0.s[0] // ................e'........................................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // .................'..............*......................................................... + // sub v16.4s, v12.4s, v24.4s // .................'..................*..................................................... + // add v12.4s, v12.4s, v24.4s // .................'...................*.................................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................'.................*...................................................... + // mul v24.4s, v11.4s, v0.s[2] // .................'.....................*.................................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'..........................*............................................. + // sub v11.4s, v9.4s, v24.4s // .................'..............................*......................................... + // add v9.4s, v9.4s, v24.4s // .................'...............................*........................................ + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .................'...........................*............................................ + // mul v24.4s, v12.4s, v0.s[2] // .................'.............................*.......................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..................................*..................................... + // sub v12.4s, v10.4s, v24.4s // .................'......................................*................................. + // add v10.4s, v10.4s, v24.4s // .................'.......................................*................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .................'...............*........................................................ + // mul v24.4s, v15.4s, v1.s[0] // .................'................*....................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // .................'....................*................................................... + // sub v15.4s, v13.4s, v24.4s // .................'........................*............................................... + // add v13.4s, v13.4s, v24.4s // .................'.........................*.............................................. + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .................'......................*................................................. + // mul v24.4s, v16.4s, v1.s[0] // .................'.......................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................'............................*........................................... + // sub v16.4s, v14.4s, v24.4s // .................'................................*....................................... + // add v14.4s, v14.4s, v24.4s // .................'.................................*...................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................'..........................................*............................. + // mul v24.4s, v10.4s, v1.s[2] // .................'...........................................*............................ + // mls v24.4s, v27.4s, v8.s[0] // .................'...............................................*........................ + // sub v10.4s, v9.4s, v24.4s // .................'.....................................................*.................. + // add v9.4s, v9.4s, v24.4s // .................'......................................................*................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .................'.........................................*.............................. + // mul v24.4s, v12.4s, v2.s[0] // .................'............................................*........................... 
+ // mls v24.4s, v27.4s, v8.s[0] // .................'................................................*....................... + // sub v12.4s, v11.4s, v24.4s // .................'.......................................................*................ + // add v11.4s, v11.4s, v24.4s // .................'........................................................*............... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .................'....................................*................................... + // mul v24.4s, v14.4s, v2.s[2] // .................'.....................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'.............................................*.......................... + // sub v14.4s, v13.4s, v24.4s // .................'.................................................*...................... + // add v13.4s, v13.4s, v24.4s // .................'....................................................*................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................'...................................*.................................... + // mul v24.4s, v16.4s, v3.s[0] // .................'........................................*............................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..............................................*......................... + // sub v16.4s, v15.4s, v24.4s // .................'..................................................*..................... + // add v15.4s, v15.4s, v24.4s // .................'...................................................*.................... + // str q9, [x0], #(16) // .................'.........................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // .~...............'...........................................................*............ 
+ // str q11, [x0, #(-16 + 2*(1024/8))] // ...~.............'.............................................................*.......... + // str q12, [x0, #(-16 + 3*(1024/8))] // .....~...........'...............................................................*........ + // str q13, [x0, #(-16 + 4*(1024/8))] // .......~.........'.................................................................*...... + // str q14, [x0, #(-16 + 5*(1024/8))] // .........~.......'...................................................................*.... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...........~.....'.....................................................................*.. + // str q16, [x0, #(-16 + 7*(1024/8))] // .............~...'.......................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 66 + // Expected cycles: 67 + // Expected IPC: 0.99 + // + // Wall time: 6.65s + // User time: 6.65s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + sqrdmulh v16.4S, v31.4S, v0.S[1] // ...*.............................................................. + // gap // .................................................................. + sqrdmulh v25.4S, v20.4S, v0.S[1] // .......*.......................................................... + // gap // .................................................................. + mul v20.4S, v31.4S, v0.S[0] // .....*............................................................ + // gap // .................................................................. + mul v24.4S, v22.4S, v0.S[0] // ..*............................................................... + // gap // .................................................................. + sqrdmulh v31.4S, v22.4S, v0.S[1] // .*................................................................ 
+ // gap // .................................................................. + mls v12.4S, v25.4S, v8.S[0] // ...............*.................................................. + // gap // .................................................................. + mls v20.4S, v16.4S, v8.S[0] // .........*........................................................ + // gap // .................................................................. + sqrdmulh v27.4S, v6.4S, v0.S[1] // *................................................................. + // gap // .................................................................. + mls v24.4S, v31.4S, v8.S[0] // ......*........................................................... + // gap // .................................................................. + sub v16.4S, v26.4S, v12.4S // ...................*.............................................. + // gap // .................................................................. + sub v10.4S, v4.4S, v20.4S // .............*.................................................... + // gap // .................................................................. + mls v19.4S, v27.4S, v8.S[0] // ....*............................................................. + // gap // .................................................................. + sub v23.4S, v15.4S, v24.4S // ...........*...................................................... + // gap // .................................................................. + mul v22.4S, v16.4S, v1.S[0] // ........................*......................................... + // gap // .................................................................. + sqrdmulh v29.4S, v10.4S, v1.S[1] // ................*................................................. + // gap // .................................................................. + sqrdmulh v13.4S, v16.4S, v1.S[1] // .......................*.......................................... 
+ // gap // .................................................................. + add v5.4S, v15.4S, v24.4S // ............*..................................................... + // gap // .................................................................. + mul v16.4S, v10.4S, v1.S[0] // .................*................................................ + // gap // .................................................................. + sub v6.4S, v9.4S, v19.4S // ........*......................................................... + // gap // .................................................................. + add v10.4S, v26.4S, v12.4S // ....................*............................................. + // gap // .................................................................. + mls v22.4S, v13.4S, v8.S[0] // .............................*.................................... + // gap // .................................................................. + mls v16.4S, v29.4S, v8.S[0] // .....................*............................................ + // gap // .................................................................. + sqrdmulh v25.4S, v10.4S, v0.S[3] // ............................*..................................... + // gap // .................................................................. + add v18.4S, v4.4S, v20.4S // ..............*................................................... + // gap // .................................................................. + mul v26.4S, v10.4S, v0.S[2] // ..............................*................................... + // gap // .................................................................. + add v21.4S, v23.4S, v22.4S // ..................................*............................... + // gap // .................................................................. + sqrdmulh v7.4S, v18.4S, v0.S[3] // ..................*............................................... 
+ // gap // .................................................................. + add v4.4S, v6.4S, v16.4S // ..........................*....................................... + // gap // .................................................................. + sqrdmulh v11.4S, v21.4S, v2.S[3] // .....................................*............................ + // gap // .................................................................. + mls v26.4S, v25.4S, v8.S[0] // ...................................*.............................. + // gap // .................................................................. + mul v24.4S, v18.4S, v0.S[2] // ......................*........................................... + // gap // .................................................................. + mul v27.4S, v21.4S, v2.S[2] // ......................................*........................... + // gap // .................................................................. + add v14.4S, v9.4S, v19.4S // ..........*....................................................... + // gap // .................................................................. + sub v28.4S, v5.4S, v26.4S // .......................................*.......................... + // gap // .................................................................. + mls v24.4S, v7.4S, v8.S[0] // ...........................*...................................... + // gap // .................................................................. + mls v27.4S, v11.4S, v8.S[0] // ..............................................*................... + // gap // .................................................................. + add v31.4S, v5.4S, v26.4S // ........................................*......................... + // gap // .................................................................. + sub v11.4S, v6.4S, v16.4S // .........................*........................................ 
+ // gap // .................................................................. + sub v13.4S, v23.4S, v22.4S // .................................*................................ + // gap // .................................................................. + sqrdmulh v18.4S, v31.4S, v1.S[3] // ...........................................*...................... + // gap // .................................................................. + sub v5.4S, v14.4S, v24.4S // ...............................*.................................. + // gap // .................................................................. + sub v29.4S, v4.4S, v27.4S // ..................................................*............... + // gap // .................................................................. + sqrdmulh v10.4S, v28.4S, v2.S[1] // ..........................................*....................... + // gap // .................................................................. + add v23.4S, v14.4S, v24.4S // ................................*................................. + // gap // .................................................................. + mul v20.4S, v31.4S, v1.S[2] // ............................................*..................... + // gap // .................................................................. + mul v24.4S, v28.4S, v2.S[0] // .............................................*.................... + // gap // .................................................................. + add v25.4S, v4.4S, v27.4S // .....................................................*............ + // gap // .................................................................. + mul v30.4S, v13.4S, v3.S[0] // .........................................*........................ + // gap // .................................................................. + sqrdmulh v22.4S, v13.4S, v3.S[1] // ....................................*............................. 
+ // gap // .................................................................. + mls v20.4S, v18.4S, v8.S[0] // ................................................*................. + // gap // .................................................................. + mls v24.4S, v10.4S, v8.S[0] // .................................................*................ + // gap // .................................................................. + str q29, [x0, #640] // ...............................................................*.. + // gap // .................................................................. + mls v30.4S, v22.4S, v8.S[0] // ...............................................*.................. + // gap // .................................................................. + add v9.4S, v23.4S, v20.4S // .......................................................*.......... + // gap // .................................................................. + str q25, [x0, #512] // ..............................................................*... + // gap // .................................................................. + add v10.4S, v5.4S, v24.4S // .........................................................*........ + // gap // .................................................................. + str q9, [x0], #(16) // ..........................................................*....... + // gap // .................................................................. + sub v15.4S, v5.4S, v24.4S // ........................................................*......... + // gap // .................................................................. + str q10, [x0, #240] // ............................................................*..... + // gap // .................................................................. + add v10.4S, v11.4S, v30.4S // ....................................................*............. 
+ // gap // .................................................................. + str q15, [x0, #368] // .............................................................*.... + // gap // .................................................................. + sub v29.4S, v23.4S, v20.4S // ......................................................*........... + // gap // .................................................................. + str q10, [x0, #752] // ................................................................*. + // gap // .................................................................. + sub v10.4S, v11.4S, v30.4S // ...................................................*.............. + // gap // .................................................................. + str q29, [x0, #112] // ...........................................................*...... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q10, [x0, #880] // .................................................................* + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // sqrdmulh v6.4S, v6.4S, v0.S[1] // .......*.......................................................... + // sqrdmulh v18.4S, v22.4S, v0.S[1] // ....*............................................................. + // mul v16.4S, v22.4S, v0.S[0] // ...*.............................................................. + // sqrdmulh v22.4S, v31.4S, v0.S[1] // *................................................................. + // mls v19.4S, v6.4S, v8.S[0] // ...........*...................................................... 
+ // mul v6.4S, v31.4S, v0.S[0] // ..*............................................................... + // mls v16.4S, v18.4S, v8.S[0] // ........*......................................................... + // sqrdmulh v18.4S, v20.4S, v0.S[1] // .*................................................................ + // sub v20.4S, v9.4S, v19.4S // ..................*............................................... + // mls v6.4S, v22.4S, v8.S[0] // ......*........................................................... + // add v22.4S, v9.4S, v19.4S // ................................*................................. + // sub v19.4S, v15.4S, v16.4S // ............*..................................................... + // add v16.4S, v15.4S, v16.4S // ................*................................................. + // sub v31.4S, v4.4S, v6.4S // ..........*....................................................... + // add v6.4S, v4.4S, v6.4S // .......................*.......................................... + // mls v12.4S, v18.4S, v8.S[0] // .....*............................................................ + // sqrdmulh v18.4S, v31.4S, v1.S[1] // ..............*................................................... + // mul v31.4S, v31.4S, v1.S[0] // .................*................................................ + // sqrdmulh v9.4S, v6.4S, v0.S[3] // ..........................*....................................... + // sub v15.4S, v26.4S, v12.4S // .........*........................................................ + // add v12.4S, v26.4S, v12.4S // ...................*.............................................. + // mls v31.4S, v18.4S, v8.S[0] // .....................*............................................ + // mul v6.4S, v6.4S, v0.S[2] // ..............................*................................... + // sqrdmulh v18.4S, v15.4S, v1.S[1] // ...............*.................................................. 
+ // mul v15.4S, v15.4S, v1.S[0] // .............*.................................................... + // sub v4.4S, v20.4S, v31.4S // .....................................*............................ + // add v20.4S, v20.4S, v31.4S // ...........................*...................................... + // mls v6.4S, v9.4S, v8.S[0] // ..................................*............................... + // sqrdmulh v31.4S, v12.4S, v0.S[3] // ......................*........................................... + // mls v15.4S, v18.4S, v8.S[0] // ....................*............................................. + // mul v18.4S, v12.4S, v0.S[2] // ........................*......................................... + // sub v12.4S, v22.4S, v6.4S // ........................................*......................... + // add v6.4S, v22.4S, v6.4S // ...........................................*...................... + // sub v22.4S, v19.4S, v15.4S // ......................................*........................... + // add v19.4S, v19.4S, v15.4S // .........................*........................................ + // mls v18.4S, v31.4S, v8.S[0] // .............................*.................................... + // sqrdmulh v31.4S, v22.4S, v3.S[1] // ................................................*................. + // sqrdmulh v9.4S, v19.4S, v2.S[3] // ............................*..................................... + // mul v19.4S, v19.4S, v2.S[2] // ...............................*.................................. + // sub v15.4S, v16.4S, v18.4S // .................................*................................ + // add v18.4S, v16.4S, v18.4S // ....................................*............................. + // mul v16.4S, v22.4S, v3.S[0] // ...............................................*.................. + // sqrdmulh v22.4S, v15.4S, v2.S[1] // ..........................................*....................... 
+ // sqrdmulh v26.4S, v18.4S, v1.S[3] // .......................................*.......................... + // mul v18.4S, v18.4S, v1.S[2] // ............................................*..................... + // mul v15.4S, v15.4S, v2.S[0] // .............................................*.................... + // mls v19.4S, v9.4S, v8.S[0] // ...................................*.............................. + // mls v16.4S, v31.4S, v8.S[0] // ....................................................*............. + // mls v18.4S, v26.4S, v8.S[0] // .................................................*................ + // mls v15.4S, v22.4S, v8.S[0] // ..................................................*............... + // sub v22.4S, v20.4S, v19.4S // .........................................*........................ + // sub v31.4S, v4.4S, v16.4S // ...............................................................*.. + // add v16.4S, v4.4S, v16.4S // ...........................................................*...... + // add v19.4S, v20.4S, v19.4S // ..............................................*................... + // sub v20.4S, v6.4S, v18.4S // .............................................................*.... + // add v6.4S, v6.4S, v18.4S // .....................................................*............ + // sub v18.4S, v12.4S, v15.4S // .........................................................*........ + // add v12.4S, v12.4S, v15.4S // .......................................................*.......... + // str q6, [x0], #(16) // ........................................................*......... + // str q20, [x0, #112] // ................................................................*. + // str q12, [x0, #240] // ..........................................................*....... + // str q18, [x0, #368] // ............................................................*..... 
+ // str q19, [x0, #496] // ......................................................*........... + // str q22, [x0, #624] // ...................................................*.............. + // str q16, [x0, #752] // ..............................................................*... + // str q31, [x0, #880] // .................................................................* + + + restore inp, STACK0 + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 76 + // Expected cycles: 98 + // Expected IPC: 0.78 + // + // Wall time: 27.86s + // User time: 27.86s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q11, [x4], #64 // ........*................................................................... // gap // ............................................................................ - mul v9.4S, v21.4S, v0.S[0] // .............*.............................................................. // gap // ............................................................................ - sqrdmulh v25.4S, v21.4S, v0.S[1] // ..............*............................................................. // gap // ............................................................................ - mul v5.4S, v13.4S, v0.S[0] // ..................*......................................................... + ldr q21, [x2, #48] // .......*.................................................................... // gap // ............................................................................ - mls v24.4S, v23.4S, v8.S[0] // ..........*................................................................. 
// gap // ............................................................................ - sqrdmulh v21.4S, v13.4S, v0.S[1] // ...................*........................................................ // gap // ............................................................................ - mls v9.4S, v25.4S, v8.S[0] // ...............*............................................................ + ldr q0, [x2, #16] // .....*...................................................................... // gap // ............................................................................ - mul v25.4S, v4.4S, v0.S[0] // .......................*.................................................... // gap // ............................................................................ - sub v23.4S, v15.4S, v24.4S // ...........*................................................................ // gap // ............................................................................ - mls v5.4S, v21.4S, v8.S[0] // ....................*....................................................... + sqrdmulh v4.4S, v21.4S, v11.S[1] // ..................*......................................................... // gap // ............................................................................ - add v24.4S, v15.4S, v24.4S // ............*............................................................... + mul v31.4S, v21.4S, v11.S[0] // ......................*..................................................... // gap // ............................................................................ - sub v21.4S, v29.4S, v9.4S // ................*........................................................... + mul v20.4S, v0.4S, v11.S[0] // .............*.............................................................. // gap // ............................................................................ 
- add v9.4S, v29.4S, v9.4S // .................*.......................................................... + ldr q26, [x1, #48] // ...*........................................................................ // gap // ............................................................................ - sub v4.4S, v11.4S, v5.4S // .....................*...................................................... // gap // ............................................................................ - add v5.4S, v11.4S, v5.4S // ......................*..................................................... // gap // ............................................................................ - mls v25.4S, v28.4S, v8.S[0] // .........................*.................................................. + mls v31.4S, v4.4S, v8.S[0] // ..........................*................................................. // gap // ............................................................................ - mul v11.4S, v4.4S, v1.S[0] // ......................................*..................................... + ldr q12, [x2, #0] // ....*....................................................................... // gap // ............................................................................ - sqrdmulh v4.4S, v4.4S, v1.S[1] // .......................................*.................................... // gap // ............................................................................ - mul v15.4S, v5.4S, v0.S[2] // ............................*............................................... // gap // ............................................................................ - sub v29.4S, v14.4S, v25.4S // ..........................*................................................. + sqrdmulh v18.4S, v0.4S, v11.S[1] // ............*............................................................... // gap // ............................................................................ 
- add v25.4S, v14.4S, v25.4S // ...........................*................................................ + add v13.4S, v26.4S, v31.4S // ...............................*............................................ // gap // ............................................................................ - mls v11.4S, v4.4S, v8.S[0] // ........................................*................................... + ldr q22, [x1, #16] // .*.......................................................................... // gap // ............................................................................ - sqrdmulh v5.4S, v5.4S, v0.S[3] // .............................*.............................................. // gap // ............................................................................ - mul v4.4S, v29.4S, v1.S[0] // ...........................................*................................ // gap // ............................................................................ - sqrdmulh v29.4S, v29.4S, v1.S[1] // ............................................*............................... + mul v15.4S, v13.4S, v11.S[2] // ...................................*........................................ // gap // ............................................................................ - sub v14.4S, v23.4S, v11.4S // .........................................*.................................. + sqrdmulh v13.4S, v13.4S, v11.S[3] // ..................................*......................................... // gap // ............................................................................ - add v23.4S, v23.4S, v11.4S // ..........................................*................................. + mls v20.4S, v18.4S, v8.S[0] // .................*.......................................................... // gap // ............................................................................ 
- mls v15.4S, v5.4S, v8.S[0] // ..............................*............................................. + sqrdmulh v16.4S, v12.4S, v11.S[1] // ..........*................................................................. // gap // ............................................................................ - mul v5.4S, v25.4S, v0.S[2] // .................................*.......................................... + mul v18.4S, v12.4S, v11.S[0] // ...........*................................................................ // gap // ............................................................................ - mls v4.4S, v29.4S, v8.S[0] // .............................................*.............................. + mls v15.4S, v13.4S, v8.S[0] // .......................................*.................................... // gap // ............................................................................ - sqrdmulh v25.4S, v25.4S, v0.S[3] // ..................................*......................................... + add v25.4S, v22.4S, v20.4S // .....................*...................................................... // gap // ............................................................................ - sub v11.4S, v24.4S, v15.4S // ...............................*............................................ + ldr q21, [x4, #-32] // .......................*.................................................... // gap // ............................................................................ - add v24.4S, v24.4S, v15.4S // ................................*........................................... // gap // ............................................................................ - sub v15.4S, v21.4S, v4.4S // ..............................................*............................. // gap // ............................................................................ 
- add v21.4S, v21.4S, v4.4S // ...............................................*............................ + sub v17.4S, v25.4S, v15.4S // ...........................................*................................ // gap // ............................................................................ - mls v5.4S, v25.4S, v8.S[0] // ...................................*........................................ + ldr q19, [x1, #0] // *........................................................................... // gap // ............................................................................ - mul v25.4S, v15.4S, v3.S[0] // ...............................................................*............ // gap // ............................................................................ - mul v4.4S, v21.4S, v2.S[2] // ..........................................................*................. // gap // ............................................................................ - sqrdmulh v21.4S, v21.4S, v2.S[3] // ...........................................................*................ + mul v3.4S, v17.4S, v21.S[0] // .................................................*.......................... // gap // ............................................................................ - sub v29.4S, v9.4S, v5.4S // ....................................*....................................... + sqrdmulh v27.4S, v17.4S, v21.S[1] // ..............................................*............................. // gap // ............................................................................ - add v9.4S, v9.4S, v5.4S // .....................................*...................................... + mls v18.4S, v16.4S, v8.S[0] // ...............*............................................................ // gap // ............................................................................ 
- sqrdmulh v5.4S, v15.4S, v3.S[1] // ................................................................*........... + ldr q14, [x2, #32] // ......*..................................................................... // gap // ............................................................................ - mul v15.4S, v29.4S, v2.S[0] // .....................................................*...................... // gap // ............................................................................ - mul v13.4S, v9.4S, v1.S[2] // ................................................*........................... // gap // ............................................................................ - sqrdmulh v9.4S, v9.4S, v1.S[3] // .................................................*.......................... + mls v3.4S, v27.4S, v8.S[0] // ....................................................*....................... // gap // ............................................................................ - sqrdmulh v29.4S, v29.4S, v2.S[1] // ......................................................*..................... + add v16.4S, v19.4S, v18.4S // ...................*........................................................ // gap // ............................................................................ - mls v4.4S, v21.4S, v8.S[0] // ............................................................*............... + sqrdmulh v6.4S, v14.4S, v11.S[1] // ..............*............................................................. // gap // ............................................................................ - mls v25.4S, v5.4S, v8.S[0] // .................................................................*.......... + mul v9.4S, v14.4S, v11.S[0] // ................*........................................................... // gap // ............................................................................ 
- mls v13.4S, v9.4S, v8.S[0] // ..................................................*......................... + add v12.4S, v25.4S, v15.4S // ............................................*............................... // gap // ............................................................................ - mls v15.4S, v29.4S, v8.S[0] // .......................................................*.................... + ldr q4, [x4, #-48] // .........*.................................................................. // gap // ............................................................................ - sub v9.4S, v23.4S, v4.4S // .............................................................*.............. // gap // ............................................................................ - sub v5.4S, v14.4S, v25.4S // ..................................................................*......... // gap // ............................................................................ - add v25.4S, v14.4S, v25.4S // ...................................................................*........ + ldr q7, [x5], #(12*16) // ..................................................*......................... // gap // ............................................................................ - add v21.4S, v23.4S, v4.4S // ..............................................................*............. // gap // ............................................................................ - sub v23.4S, v24.4S, v13.4S // ...................................................*........................ // gap // ............................................................................ - add v24.4S, v24.4S, v13.4S // ....................................................*....................... + sqrdmulh v10.4S, v12.4S, v4.S[3] // ...............................................*............................ // gap // ............................................................................ 
- sub v4.4S, v11.4S, v15.4S // ........................................................*................... + mul v17.4S, v12.4S, v4.S[2] // ................................................*........................... // gap // ............................................................................ - add v11.4S, v11.4S, v15.4S // .........................................................*.................. + mls v9.4S, v6.4S, v8.S[0] // ....................*....................................................... // gap // ............................................................................ - str_vi v24, x0, 16 // ....................................................................*....... + ldr q30, [x1, #32] // ..*......................................................................... // gap // ............................................................................ - ldr_vo v15, x0, 0 // e........................................................................... // gap // ............................................................................ // gap // ............................................................................ + mls v17.4S, v10.4S, v8.S[0] // ...................................................*........................ // gap // ............................................................................ - str_vo v23, x0, 112 // .....................................................................*...... + sub v23.4S, v26.4S, v31.4S // ..............................*............................................. // gap // ............................................................................ - ldr_vo v29, x0, 128 // .e.......................................................................... + add v28.4S, v30.4S, v9.4S // .........................*.................................................. // gap // ............................................................................ 
+ ldr q26, [x5, #-176] // .....................................................*...................... // gap // ............................................................................ // gap // ............................................................................ - str_vo v11, x0, 240 // ......................................................................*..... // gap // ............................................................................ - ldr_vo v11, x0, 256 // ..e......................................................................... + sqrdmulh v31.4S, v28.4S, v11.S[3] // ............................*............................................... // gap // ............................................................................ + sqrdmulh v6.4S, v23.4S, v4.S[1] // ........................................*................................... // gap // ............................................................................ + mul v12.4S, v28.4S, v11.S[2] // .............................*.............................................. // gap // ............................................................................ - str_vo v4, x0, 368 // .......................................................................*.... + sub v29.4S, v30.4S, v9.4S // ........................*................................................... // gap // ............................................................................ - ldr_vo v14, x0, 384 // ...e........................................................................ + ldr q11, [x5, #-96] // ...................................................................*........ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ 
- str_vo v21, x0, 496 // ........................................................................*... + mls v12.4S, v31.4S, v8.S[0] // .................................*.......................................... // gap // ............................................................................ - ldr_vo v24, x0, 512 // ....e....................................................................... + mul v15.4S, v23.4S, v4.S[0] // .........................................*.................................. // gap // ............................................................................ + ldr q5, [x5, #-80] // .....................................................................*...... // gap // ............................................................................ // gap // ............................................................................ - str_vo v9, x0, 624 // .........................................................................*.. // gap // ............................................................................ - ldr_vo v21, x0, 640 // .....e...................................................................... + add v24.4S, v16.4S, v12.4S // ......................................*..................................... // gap // ............................................................................ + sub v16.4S, v16.4S, v12.4S // .....................................*...................................... // gap // ............................................................................ + mls v15.4S, v6.4S, v8.S[0] // .............................................*.............................. // gap // ............................................................................ - str_vo v25, x0, 752 // ..........................................................................*. + sub v12.4S, v24.4S, v17.4S // ......................................................*..................... 
// gap // ............................................................................ - sqrdmulh v23.4S, v24.4S, v0.S[1] // .........e.................................................................. + add v9.4S, v16.4S, v3.4S // .........................................................*.................. // gap // ............................................................................ - str_vo v5, x0, 880 // ...........................................................................* + add v27.4S, v24.4S, v17.4S // .......................................................*.................... // gap // ............................................................................ - ldr_vo v4, x0, 896 // .......e.................................................................... + sub v2.4S, v16.4S, v3.4S // ........................................................*................... // gap // ............................................................................ + sqrdmulh v31.4S, v29.4S, v4.S[1] // ...........................*................................................ // gap // ............................................................................ + trn2 v1.4S, v27.4S, v12.4S // ..........................................................*................. // gap // ............................................................................ - ldr_vo v13, x0, 768 // ......e..................................................................... + trn2 v13.4S, v9.4S, v2.4S // ............................................................*............... // gap // ............................................................................ + ldr q16, [x5, #-112] // ..................................................................*......... // gap // ............................................................................ // gap // ............................................................................ 
- sqrdmulh v28.4S, v4.4S, v0.S[1] // ........................e................................................... // gap // ............................................................................ - - // original source code - // ldr_vo v9, x0, 0 // e......................................................................................... || e....................................................................................................... - // ldr_vo v10, x0, 128 // ..e....................................................................................... || ...e.................................................................................................... - // ldr_vo v11, x0, 256 // ....e..................................................................................... || ......e................................................................................................. - // ldr_vo v12, x0, 384 // ......e................................................................................... || .........e.............................................................................................. - // ldr_vo v13, x0, 512 // ........e................................................................................. || ............e........................................................................................... - // ldr_vo v14, x0, 640 // ..........e............................................................................... || ...............e........................................................................................ - // ldr_vo v15, x0, 768 // ...............e.......................................................................... || ......................e................................................................................. - // ldr_vo v16, x0, 896 // ..............e........................................................................... 
|| ....................e................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // .................*........................................................................ || .........................*.............................................................................. - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ............e............................................................................. || ..................e..................................................................................... - // mls v24.4S, v13.4S, v8.S[0] // .....................*.................................................................... || .............................*.......................................................................... - // sub v13.4S, v9.4S, v24.4S // .........................*................................................................ || .................................*...................................................................... - // add v9.4S, v9.4S, v24.4S // ...........................*.............................................................. || ...................................*.................................................................... - // mul v24.4S, v14.4S, v0.S[0] // ..................*....................................................................... || ..........................*............................................................................. - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ...................*...................................................................... || ...........................*............................................................................ - // mls v24.4S, v14.4S, v8.S[0] // .......................*.................................................................. || ...............................*........................................................................ 
- // sub v14.4S, v10.4S, v24.4S // ............................*............................................................. || ....................................*................................................................... - // add v10.4S, v10.4S, v24.4S // .............................*............................................................ || .....................................*.................................................................. - // mul v24.4S, v15.4S, v0.S[0] // ....................*..................................................................... || ............................*........................................................................... - // sqrdmulh v15.4S, v15.4S, v0.S[1] // ......................*................................................................... || ..............................*......................................................................... - // mls v24.4S, v15.4S, v8.S[0] // ..........................*............................................................... || ..................................*..................................................................... - // sub v15.4S, v11.4S, v24.4S // ..............................*........................................................... || ......................................*................................................................. - // add v11.4S, v11.4S, v24.4S // ...............................*.......................................................... || .......................................*................................................................ - // mul v24.4S, v16.4S, v0.S[0] // ........................*................................................................. || ................................*....................................................................... 
- // sqrdmulh v16.4S, v16.4S, v0.S[1] // ................e......................................................................... || ........................e............................................................................... - // mls v24.4S, v16.4S, v8.S[0] // ................................*......................................................... || ........................................*............................................................... - // sub v16.4S, v12.4S, v24.4S // ....................................*..................................................... || ............................................*........................................................... - // add v12.4S, v12.4S, v24.4S // .....................................*.................................................... || .............................................*.......................................................... - // mul v24.4S, v11.4S, v0.S[2] // ...................................*...................................................... || ...........................................*............................................................ - // sqrdmulh v11.4S, v11.4S, v0.S[3] // .......................................*.................................................. || ...............................................*........................................................ - // mls v24.4S, v11.4S, v8.S[0] // ............................................*............................................. || ....................................................*................................................... - // sub v11.4S, v9.4S, v24.4S // ................................................*......................................... || ........................................................*............................................... 
- // add v9.4S, v9.4S, v24.4S // .................................................*........................................ || .........................................................*.............................................. - // mul v24.4S, v12.4S, v0.S[2] // .............................................*............................................ || .....................................................*.................................................. - // sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................................*.......................................... || .......................................................*................................................ - // mls v24.4S, v12.4S, v8.S[0] // ....................................................*..................................... || ............................................................*........................................... - // sub v12.4S, v10.4S, v24.4S // ........................................................*................................. || ................................................................*....................................... - // add v10.4S, v10.4S, v24.4S // .........................................................*................................ || .................................................................*...................................... - // mul v24.4S, v15.4S, v1.S[0] // .................................*........................................................ || .........................................*.............................................................. - // sqrdmulh v15.4S, v15.4S, v1.S[1] // ..................................*....................................................... || ..........................................*............................................................. 
- // mls v24.4S, v15.4S, v8.S[0] // ......................................*................................................... || ..............................................*......................................................... - // sub v15.4S, v13.4S, v24.4S // ..........................................*............................................... || ..................................................*..................................................... - // add v13.4S, v13.4S, v24.4S // ...........................................*.............................................. || ...................................................*.................................................... - // mul v24.4S, v16.4S, v1.S[0] // ........................................*................................................. || ................................................*....................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[1] // .........................................*................................................ || .................................................*...................................................... - // mls v24.4S, v16.4S, v8.S[0] // ..............................................*........................................... || ......................................................*................................................. - // sub v16.4S, v14.4S, v24.4S // ..................................................*....................................... || ..........................................................*............................................. - // add v14.4S, v14.4S, v24.4S // ...................................................*...................................... || ...........................................................*............................................ 
- // mul v24.4S, v10.4S, v1.S[2] // ............................................................*............................. || ....................................................................*................................... - // sqrdmulh v10.4S, v10.4S, v1.S[3] // .............................................................*............................ || .....................................................................*.................................. - // mls v24.4S, v10.4S, v8.S[0] // .................................................................*........................ || .........................................................................*.............................. - // sub v10.4S, v9.4S, v24.4S // .......................................................................*.................. || ...............................................................................*........................ - // add v9.4S, v9.4S, v24.4S // ........................................................................*................. || ................................................................................*....................... - // mul v24.4S, v12.4S, v2.S[0] // ...........................................................*.............................. || ...................................................................*.................................... - // sqrdmulh v12.4S, v12.4S, v2.S[1] // ..............................................................*........................... || ......................................................................*................................. - // mls v24.4S, v12.4S, v8.S[0] // ..................................................................*....................... || ..........................................................................*............................. 
- // sub v12.4S, v11.4S, v24.4S // .........................................................................*................ || .................................................................................*...................... - // add v11.4S, v11.4S, v24.4S // ..........................................................................*............... || ..................................................................................*..................... - // mul v24.4S, v14.4S, v2.S[2] // ......................................................*................................... || ..............................................................*......................................... - // sqrdmulh v14.4S, v14.4S, v2.S[3] // .......................................................*.................................. || ...............................................................*........................................ - // mls v24.4S, v14.4S, v8.S[0] // ...............................................................*.......................... || .......................................................................*................................ - // sub v14.4S, v13.4S, v24.4S // ...................................................................*...................... || ...........................................................................*............................ - // add v13.4S, v13.4S, v24.4S // ......................................................................*................... || ..............................................................................*......................... - // mul v24.4S, v16.4S, v3.S[0] // .....................................................*.................................... || .............................................................*.......................................... 
- // sqrdmulh v16.4S, v16.4S, v3.S[1] // ..........................................................*............................... || ..................................................................*..................................... - // mls v24.4S, v16.4S, v8.S[0] // ................................................................*......................... || ........................................................................*............................... - // sub v16.4S, v15.4S, v24.4S // ....................................................................*..................... || ............................................................................*........................... - // add v15.4S, v15.4S, v24.4S // .....................................................................*.................... || .............................................................................*.......................... - // str_vi v9, x0, 16 // ...........................................................................*.............. || ...................................................................................*.................... - // str_vo v10, x0, 112 // .............................................................................*............ || ......................................................................................*................. - // str_vo v11, x0, 240 // ...............................................................................*.......... || .........................................................................................*.............. - // str_vo v12, x0, 368 // .................................................................................*........ || ............................................................................................*........... - // str_vo v13, x0, 496 // ...................................................................................*...... 
|| ...............................................................................................*........ - // str_vo v14, x0, 624 // .....................................................................................*.... || ..................................................................................................*..... - // str_vo v15, x0, 752 // .......................................................................................*.. || .....................................................................................................*.. - // str_vo v16, x0, 880 // .........................................................................................* || .......................................................................................................* - - subs count, count, #1 - cbnz count, layer123_start - mul v5.4S, v4.4S, v0.S[0] // .......*.......................................................... - // gap // .................................................................. - mul v9.4S, v24.4S, v0.S[0] // *................................................................. - // gap // .................................................................. - sqrdmulh v19.4S, v21.4S, v0.S[1] // ..*............................................................... - // gap // .................................................................. - mul v12.4S, v21.4S, v0.S[0] // .*................................................................ - // gap // .................................................................. - mls v5.4S, v28.4S, v8.S[0] // ...............*.................................................. - // gap // .................................................................. - mul v30.4S, v13.4S, v0.S[0] // ...*.............................................................. - // gap // .................................................................. 
- sqrdmulh v7.4S, v13.4S, v0.S[1] // .....*............................................................ - // gap // .................................................................. - mls v12.4S, v19.4S, v8.S[0] // ......*........................................................... - // gap // .................................................................. - add v18.4S, v14.4S, v5.4S // ....................*............................................. - // gap // .................................................................. - mls v9.4S, v23.4S, v8.S[0] // ....*............................................................. - // gap // .................................................................. - mls v30.4S, v7.4S, v8.S[0] // .........*........................................................ - // gap // .................................................................. - sub v23.4S, v14.4S, v5.4S // ...................*.............................................. - // gap // .................................................................. - add v5.4S, v29.4S, v12.4S // ............*..................................................... - // gap // .................................................................. - sub v26.4S, v15.4S, v9.4S // ........*......................................................... - // gap // .................................................................. - add v20.4S, v11.4S, v30.4S // ..............*................................................... - // gap // .................................................................. - sub v24.4S, v11.4S, v30.4S // .............*.................................................... - // gap // .................................................................. - sqrdmulh v10.4S, v23.4S, v1.S[1] // ........................*......................................... - // gap // .................................................................. 
- mul v4.4S, v20.4S, v0.S[2] // ..................*............................................... - // gap // .................................................................. - sqrdmulh v20.4S, v20.4S, v0.S[3] // ......................*........................................... - // gap // .................................................................. - mul v22.4S, v24.4S, v1.S[0] // ................*................................................. - // gap // .................................................................. - sqrdmulh v27.4S, v24.4S, v1.S[1] // .................*................................................ - // gap // .................................................................. - add v24.4S, v15.4S, v9.4S // ..........*....................................................... - // gap // .................................................................. - mul v19.4S, v23.4S, v1.S[0] // .......................*.......................................... - // gap // .................................................................. - sub v7.4S, v29.4S, v12.4S // ...........*...................................................... - // gap // .................................................................. - mul v11.4S, v18.4S, v0.S[2] // ............................*..................................... - // gap // .................................................................. - sqrdmulh v28.4S, v18.4S, v0.S[3] // ..............................*................................... - // gap // .................................................................. - mls v4.4S, v20.4S, v8.S[0] // ...........................*...................................... - // gap // .................................................................. - mls v19.4S, v10.4S, v8.S[0] // .............................*.................................... - // gap // .................................................................. 
- mls v22.4S, v27.4S, v8.S[0] // .....................*............................................ - // gap // .................................................................. - mls v11.4S, v28.4S, v8.S[0] // ...................................*.............................. - // gap // .................................................................. - sub v10.4S, v24.4S, v4.4S // ...............................*.................................. - // gap // .................................................................. - add v6.4S, v24.4S, v4.4S // ................................*................................. - // gap // .................................................................. - sub v30.4S, v26.4S, v22.4S // .........................*........................................ - // gap // .................................................................. - sub v12.4S, v5.4S, v11.4S // .......................................*.......................... - // gap // .................................................................. - add v27.4S, v5.4S, v11.4S // ........................................*......................... - // gap // .................................................................. - sub v18.4S, v7.4S, v19.4S // .................................*................................ - // gap // .................................................................. - add v24.4S, v7.4S, v19.4S // ..................................*............................... - // gap // .................................................................. - sqrdmulh v5.4S, v27.4S, v1.S[3] // ............................................*..................... - // gap // .................................................................. - mul v15.4S, v12.4S, v2.S[0] // ..........................................*....................... - // gap // .................................................................. 
- mul v14.4S, v24.4S, v2.S[2] // .....................................*............................ - // gap // .................................................................. - sqrdmulh v13.4S, v24.4S, v2.S[3] // ......................................*........................... - // gap // .................................................................. - mul v24.4S, v27.4S, v1.S[2] // ...........................................*...................... - // gap // .................................................................. - add v16.4S, v26.4S, v22.4S // ..........................*....................................... - // gap // .................................................................. - sqrdmulh v27.4S, v12.4S, v2.S[1] // .............................................*.................... - // gap // .................................................................. - mls v14.4S, v13.4S, v8.S[0] // ..............................................*................... - // gap // .................................................................. - mls v24.4S, v5.4S, v8.S[0] // ................................................*................. - // gap // .................................................................. - mul v4.4S, v18.4S, v3.S[0] // ....................................*............................. - // gap // .................................................................. - mls v15.4S, v27.4S, v8.S[0] // .................................................*................ - // gap // .................................................................. - add v17.4S, v16.4S, v14.4S // .....................................................*............ - // gap // .................................................................. - add v12.4S, v6.4S, v24.4S // .......................................................*.......... - // gap // .................................................................. 
- sub v9.4S, v6.4S, v24.4S // ......................................................*........... - // gap // .................................................................. - sqrdmulh v24.4S, v18.4S, v3.S[1] // .........................................*........................ - // gap // .................................................................. - str_vi v12, x0, 16 // ..........................................................*....... - // gap // .................................................................. - add v20.4S, v10.4S, v15.4S // .........................................................*........ - // gap // .................................................................. - str_vo v9, x0, 112 // ...........................................................*...... - // gap // .................................................................. - sub v25.4S, v10.4S, v15.4S // ........................................................*......... - // gap // .................................................................. - str_vo v20, x0, 240 // ............................................................*..... - // gap // .................................................................. - mls v4.4S, v24.4S, v8.S[0] // ...............................................*.................. - // gap // .................................................................. - str_vo v25, x0, 368 // .............................................................*.... - // gap // .................................................................. - sub v24.4S, v16.4S, v14.4S // ..................................................*............... - // gap // .................................................................. - str_vo v17, x0, 496 // ..............................................................*... - // gap // .................................................................. 
- add v25.4S, v30.4S, v4.4S // ....................................................*............. - // gap // .................................................................. - str_vo v24, x0, 624 // ...............................................................*.. - // gap // .................................................................. - sub v24.4S, v30.4S, v4.4S // ...................................................*.............. - // gap // .................................................................. - str_vo v25, x0, 752 // ................................................................*. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str_vo v24, x0, 880 // .................................................................* - // gap // .................................................................. - - // original source code - // mul v24.4S, v24.4S, v0.S[0] // .*................................................................ || .*................................................................. - // mul v9.4S, v21.4S, v0.S[0] // ...*.............................................................. || ...*............................................................... - // sqrdmulh v25.4S, v21.4S, v0.S[1] // ..*............................................................... || ..*................................................................ - // mul v5.4S, v13.4S, v0.S[0] // .....*............................................................ || .....*............................................................. - // mls v24.4S, v23.4S, v8.S[0] // .........*........................................................ || .........*......................................................... 
- // sqrdmulh v21.4S, v13.4S, v0.S[1] // ......*........................................................... || ......*............................................................ - // mls v9.4S, v25.4S, v8.S[0] // .......*.......................................................... || .......*........................................................... - // mul v25.4S, v4.4S, v0.S[0] // *................................................................. || *.................................................................. - // sub v23.4S, v15.4S, v24.4S // .............*.................................................... || .............*..................................................... - // mls v5.4S, v21.4S, v8.S[0] // ..........*....................................................... || ..........*........................................................ - // add v24.4S, v15.4S, v24.4S // .....................*............................................ || .....................*............................................. - // sub v21.4S, v29.4S, v9.4S // .......................*.......................................... || .......................*........................................... - // add v9.4S, v29.4S, v9.4S // ............*..................................................... || ............*...................................................... - // sub v4.4S, v11.4S, v5.4S // ...............*.................................................. || ...............*................................................... - // add v5.4S, v11.4S, v5.4S // ..............*................................................... || ..............*.................................................... - // mls v25.4S, v28.4S, v8.S[0] // ....*............................................................. || ....*.............................................................. 
- // mul v11.4S, v4.4S, v1.S[0] // ...................*.............................................. || ...................*............................................... - // sqrdmulh v4.4S, v4.4S, v1.S[1] // ....................*............................................. || ....................*.............................................. - // mul v15.4S, v5.4S, v0.S[2] // .................*................................................ || .................*................................................. - // sub v29.4S, v14.4S, v25.4S // ...........*...................................................... || ...........*....................................................... - // add v25.4S, v14.4S, v25.4S // ........*......................................................... || ........*.......................................................... - // mls v11.4S, v4.4S, v8.S[0] // ............................*..................................... || ............................*...................................... - // sqrdmulh v5.4S, v5.4S, v0.S[3] // ..................*............................................... || ..................*................................................ - // mul v4.4S, v29.4S, v1.S[0] // ......................*........................................... || ......................*............................................ - // sqrdmulh v29.4S, v29.4S, v1.S[1] // ................*................................................. || ................*.................................................. - // sub v14.4S, v23.4S, v11.4S // ................................*................................. || ................................*.................................. - // add v23.4S, v23.4S, v11.4S // ..........................................*....................... || ..........................................*........................ 
- // mls v15.4S, v5.4S, v8.S[0] // ..........................*....................................... || ..........................*........................................ - // mul v5.4S, v25.4S, v0.S[2] // ........................*......................................... || ........................*.......................................... - // mls v4.4S, v29.4S, v8.S[0] // ...........................*...................................... || ...........................*....................................... - // sqrdmulh v25.4S, v25.4S, v0.S[3] // .........................*........................................ || .........................*......................................... - // sub v11.4S, v24.4S, v15.4S // ..............................*................................... || ..............................*.................................... - // add v24.4S, v24.4S, v15.4S // ...............................*.................................. || ...............................*................................... - // sub v15.4S, v21.4S, v4.4S // ...................................*.............................. || ...................................*............................... - // add v21.4S, v21.4S, v4.4S // ....................................*............................. || ....................................*.............................. - // mls v5.4S, v25.4S, v8.S[0] // .............................*.................................... || .............................*..................................... - // mul v25.4S, v15.4S, v3.S[0] // ..............................................*................... || ..............................................*.................... - // mul v4.4S, v21.4S, v2.S[2] // .......................................*.......................... || .......................................*........................... 
- // sqrdmulh v21.4S, v21.4S, v2.S[3] // ........................................*......................... || ........................................*.......................... - // sub v29.4S, v9.4S, v5.4S // .................................*................................ || .................................*................................. - // add v9.4S, v9.4S, v5.4S // ..................................*............................... || ..................................*................................ - // sqrdmulh v5.4S, v15.4S, v3.S[1] // ...................................................*.............. || ...................................................*............... - // mul v15.4S, v29.4S, v2.S[0] // ......................................*........................... || ......................................*............................ - // mul v13.4S, v9.4S, v1.S[2] // .........................................*........................ || .........................................*......................... - // sqrdmulh v9.4S, v9.4S, v1.S[3] // .....................................*............................ || .....................................*............................. - // sqrdmulh v29.4S, v29.4S, v2.S[1] // ...........................................*...................... || ...........................................*....................... - // mls v4.4S, v21.4S, v8.S[0] // ............................................*..................... || ............................................*...................... - // mls v25.4S, v5.4S, v8.S[0] // .........................................................*........ || .........................................................*......... - // mls v13.4S, v9.4S, v8.S[0] // .............................................*.................... || .............................................*..................... 
- // mls v15.4S, v29.4S, v8.S[0] // ...............................................*.................. || ...............................................*................... - // sub v9.4S, v23.4S, v4.4S // ...........................................................*...... || ...........................................................*....... - // sub v5.4S, v14.4S, v25.4S // ...............................................................*.. || ...............................................................*... - // add v25.4S, v14.4S, v25.4S // .............................................................*.... || .............................................................*..... - // add v21.4S, v23.4S, v4.4S // ................................................*................. || ................................................*.................. - // sub v23.4S, v24.4S, v13.4S // ..................................................*............... || ..................................................*................ - // add v24.4S, v24.4S, v13.4S // .................................................*................ || .................................................*................. - // sub v4.4S, v11.4S, v15.4S // .......................................................*.......... || .......................................................*........... - // add v11.4S, v11.4S, v15.4S // .....................................................*............ || .....................................................*............. - // str_vi v24, x0, 16 // ....................................................*............. || ....................................................*.............. - // str_vo v23, x0, 112 // ......................................................*........... || ......................................................*............ - // str_vo v11, x0, 240 // ........................................................*......... 
|| ........................................................*.......... - // str_vo v4, x0, 368 // ..........................................................*....... || ..........................................................*........ - // str_vo v21, x0, 496 // ............................................................*..... || ............................................................*...... - // str_vo v9, x0, 624 // ..............................................................*... || ..............................................................*.... - // str_vo v25, x0, 752 // ................................................................*. || ................................................................*.. - // str_vo v5, x0, 880 // .................................................................* || ..................................................................* - - - restore inp, STACK0 - add inpp, inp, #64 - mov count, #8 + trn2 v28.2D, v1.2D, v13.2D // ..............................................................*............. + // gap // ............................................................................ + mul v3.4S, v29.4S, v4.S[0] // ................................*........................................... + // gap // ............................................................................ + mul v14.4S, v28.4S, v7.4S // .................................................................*.......... + // gap // ............................................................................ + sqrdmulh v25.4S, v28.4S, v26.4S // ................................................................*........... + // gap // ............................................................................ + ldr q4, [x4, #-16] // ..........................................*................................. + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mls v3.4S, v31.4S, v8.S[0] // ....................................*....................................... + // gap // ............................................................................ + mls v14.4S, v25.4S, v8.S[0] // ....................................................................*....... + // gap // ............................................................................ + trn1 v25.2D, v1.2D, v13.2D // ...............................................................*............ + // gap // ............................................................................ + ldr q30, [x5, #-160] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v31.4S, v25.4S, v14.4S // .......................................................................*.... + // gap // ............................................................................ + ldr q17, [x5, #-144] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v0.4S, v31.4S, v16.4S // .........................................................................*.. + // gap // ............................................................................ 
+ ldr q23, [x5, #-64] // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q29, [x5, #-16] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q28, [x5, #-32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q1, [x5, #-48] // ........................................................................*... + // gap // ............................................................................ - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - root3_tw .req v7 - qform_root0_tw .req q4 - qform_root1_tw .req q5 - qform_root2_tw .req q6 - qform_root3_tw .req q7 + // ------------------------------ new position -------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------- + // ldr q19, [x1, #0] // .....................*...................................................... + // ldr q22, [x1, #16] // ...........*................................................................ + // ldr q6, [x1, #32] // ....................................*....................................... 
+ // ldr q16, [x1, #48] // ......*..................................................................... + // ldr q18, [x2, #0] // ........*................................................................... + // ldr q20, [x2, #16] // ..*......................................................................... + // ldr q31, [x2, #32] // .........................*.................................................. + // ldr q3, [x2, #48] // .*.......................................................................... + // ldr q12, [x4], #64 // *........................................................................... + // ldr q9, [x4, #-48] // ...............................*............................................ + // sqrdmulh v15.4S, v18.4S, v12.S[1] // ...............*............................................................ + // mul v18.4S, v18.4S, v12.S[0] // ................*........................................................... + // sqrdmulh v4.4S, v20.4S, v12.S[1] // .........*.................................................................. + // mul v20.4S, v20.4S, v12.S[0] // .....*...................................................................... + // sqrdmulh v26.4S, v31.4S, v12.S[1] // ............................*............................................... + // mls v18.4S, v15.4S, v8.S[0] // ........................*................................................... + // mul v31.4S, v31.4S, v12.S[0] // .............................*.............................................. + // mls v20.4S, v4.4S, v8.S[0] // ..............*............................................................. + // sqrdmulh v15.4S, v3.4S, v12.S[1] // ...*........................................................................ + // add v4.4S, v19.4S, v18.4S // ...........................*................................................ + // mls v31.4S, v26.4S, v8.S[0] // ...................................*........................................ 
+ // add v26.4S, v22.4S, v20.4S // ..................*......................................................... + // mul v3.4S, v3.4S, v12.S[0] // ....*....................................................................... + // ldr q21, [x4, #-32] // ...................*........................................................ + // sub v7.4S, v6.4S, v31.4S // ............................................*............................... + // add v6.4S, v6.4S, v31.4S // .......................................*.................................... + // mls v3.4S, v15.4S, v8.S[0] // .......*.................................................................... + // sqrdmulh v31.4S, v7.4S, v9.S[1] // ........................................................*................... + // sqrdmulh v15.4S, v6.4S, v12.S[3] // .........................................*.................................. + // mul v6.4S, v6.4S, v12.S[2] // ...........................................*................................ + // sub v27.4S, v16.4S, v3.4S // ......................................*..................................... + // add v16.4S, v16.4S, v3.4S // ..........*................................................................. + // mul v3.4S, v7.4S, v9.S[0] // .............................................................*.............. + // mls v6.4S, v15.4S, v8.S[0] // ..............................................*............................. + // sqrdmulh v15.4S, v16.4S, v12.S[3] // .............*.............................................................. + // mul v16.4S, v16.4S, v12.S[2] // ............*............................................................... + // mls v3.4S, v31.4S, v8.S[0] // .................................................................*.......... + // sub v31.4S, v4.4S, v6.4S // ..................................................*......................... 
+ // add v6.4S, v4.4S, v6.4S // .................................................*.......................... + // mls v16.4S, v15.4S, v8.S[0] // .................*.......................................................... + // sqrdmulh v12.4S, v27.4S, v9.S[1] // ..........................................*................................. + // mul v15.4S, v27.4S, v9.S[0] // ...............................................*............................ + // ldr q4, [x4, #-16] // ................................................................*........... + // sub v7.4S, v26.4S, v16.4S // ....................*....................................................... + // add v16.4S, v26.4S, v16.4S // ..............................*............................................. + // mls v15.4S, v12.4S, v8.S[0] // ...................................................*........................ + // sqrdmulh v12.4S, v7.4S, v21.S[1] // .......................*.................................................... + // sqrdmulh v26.4S, v16.4S, v9.S[3] // .................................*.......................................... + // mul v16.4S, v16.4S, v9.S[2] // ..................................*......................................... + // mul v9.4S, v7.4S, v21.S[0] // ......................*..................................................... + // ldr q7, [x5], #(12*16) // ................................*........................................... + // mls v16.4S, v26.4S, v8.S[0] // .....................................*...................................... + // mls v9.4S, v12.4S, v8.S[0] // ..........................*................................................. + // ldr q26, [x5, #-176] // ........................................*................................... + // sub v12.4S, v6.4S, v16.4S // ....................................................*....................... 
+ // add v27.4S, v6.4S, v16.4S // ......................................................*..................... + // sub v2.4S, v31.4S, v9.4S // .......................................................*.................... + // add v9.4S, v31.4S, v9.4S // .....................................................*...................... + // trn2 v6.4S, v27.4S, v12.4S // .........................................................*.................. + // ldr q30, [x5, #-160] // ....................................................................*....... + // trn2 v16.4S, v9.4S, v2.4S // ..........................................................*................. + // ldr q17, [x5, #-144] // ......................................................................*..... + // trn2 v31.2D, v6.2D, v16.2D // ............................................................*............... + // trn1 v25.2D, v6.2D, v16.2D // ...................................................................*........ + // sqrdmulh v6.4S, v31.4S, v26.4S // ...............................................................*............ + // mul v14.4S, v31.4S, v7.4S // ..............................................................*............. + // ldr q16, [x5, #-112] // ...........................................................*................ + // ldr q11, [x5, #-96] // .............................................*.............................. + // mls v14.4S, v6.4S, v8.S[0] // ..................................................................*......... + // ldr q5, [x5, #-80] // ................................................*........................... + // ldr q23, [x5, #-64] // ........................................................................*... + // sub v31.4S, v25.4S, v14.4S // .....................................................................*...... 
+ // ldr q1, [x5, #-48] // ...........................................................................* + // sqrdmulh v0.4S, v31.4S, v16.4S // .......................................................................*.... + // ldr q28, [x5, #-32] // ..........................................................................*. + // ldr q29, [x5, #-16] // .........................................................................*.. - .p2align 2 - ldr_vo v1, x1, 0 - ldr_vo v20, x2, 0 - ldr_vi v23, x4, 64 - ldr_vo v11, x2, 48 - mul v17.4S, v20.4S, v23.S[0] - sqrdmulh v20.4S, v20.4S, v23.S[1] - mul v10.4S, v11.4S, v23.S[0] - ldr_vo v24, x1, 32 - mls v17.4S, v20.4S, v8.S[0] - sqrdmulh v20.4S, v11.4S, v23.S[1] - ldr_vo v11, x2, 32 - sub v25.4S, v1.4S, v17.4S - add v1.4S, v1.4S, v17.4S - mul v17.4S, v11.4S, v23.S[0] - sqrdmulh v11.4S, v11.4S, v23.S[1] - mls v10.4S, v20.4S, v8.S[0] - ldr_vo v20, x1, 48 - mls v17.4S, v11.4S, v8.S[0] - ldr_vo v11, x2, 16 - add v18.4S, v20.4S, v10.4S - add v14.4S, v24.4S, v17.4S - mul v15.4S, v11.4S, v23.S[0] - sqrdmulh v11.4S, v11.4S, v23.S[1] - mul v9.4S, v14.4S, v23.S[2] - sqrdmulh v14.4S, v14.4S, v23.S[3] - mul v16.4S, v18.4S, v23.S[2] - sqrdmulh v23.4S, v18.4S, v23.S[3] - sub v17.4S, v24.4S, v17.4S - ldr_vo v24, x4, -48 - mls v9.4S, v14.4S, v8.S[0] - mls v15.4S, v11.4S, v8.S[0] - mul v11.4S, v17.4S, v24.S[0] - sqrdmulh v17.4S, v17.4S, v24.S[1] - sub v18.4S, v1.4S, v9.4S - add v1.4S, v1.4S, v9.4S - sub v20.4S, v20.4S, v10.4S - mls v11.4S, v17.4S, v8.S[0] - mls v16.4S, v23.4S, v8.S[0] - mul v23.4S, v20.4S, v24.S[0] - sqrdmulh v20.4S, v20.4S, v24.S[1] - sub v17.4S, v25.4S, v11.4S - add v11.4S, v25.4S, v11.4S - mls v23.4S, v20.4S, v8.S[0] - ldr_vo v20, x1, 16 - ldr_vo v10, x4, -32 - add v25.4S, v20.4S, v15.4S - ldr_vo v14, x4, -16 - add v9.4S, v25.4S, v16.4S - sub v20.4S, v20.4S, v15.4S - sub v25.4S, v25.4S, v16.4S - mul v15.4S, v9.4S, v24.S[2] - sqrdmulh v24.4S, v9.4S, v24.S[3] - mul v9.4S, v25.4S, v10.S[0] - sqrdmulh v25.4S, v25.4S, v10.S[1] - 
sub v16.4S, v20.4S, v23.4S - mls v15.4S, v24.4S, v8.S[0] - add v20.4S, v20.4S, v23.4S - mls v9.4S, v25.4S, v8.S[0] - mul v23.4S, v16.4S, v14.S[0] - sub v24.4S, v1.4S, v15.4S - add v1.4S, v1.4S, v15.4S - sub v25.4S, v18.4S, v9.4S - add v18.4S, v18.4S, v9.4S - sqrdmulh v14.4S, v16.4S, v14.S[1] - mul v15.4S, v20.4S, v10.S[2] - sqrdmulh v20.4S, v20.4S, v10.S[3] - trn1 v10.4S, v1.4S, v24.4S - mls v23.4S, v14.4S, v8.S[0] - trn2 v1.4S, v1.4S, v24.4S - mls v15.4S, v20.4S, v8.S[0] - trn1 v20.4S, v18.4S, v25.4S - sub v24.4S, v17.4S, v23.4S - add v23.4S, v17.4S, v23.4S - sub v17.4S, v11.4S, v15.4S - add v11.4S, v11.4S, v15.4S - trn2 v25.4S, v18.4S, v25.4S - trn2 v18.2D, v10.2D, v20.2D - trn1 v14.4S, v11.4S, v17.4S - trn2 v15.2D, v1.2D, v25.2D - trn1 v20.2D, v10.2D, v20.2D - trn1 v1.2D, v1.2D, v25.2D - trn2 v11.4S, v11.4S, v17.4S sub count, count, #1 -.p2align 2 layer45678_start: - trn2 v12.4S, v23.4S, v24.4S // gap(s) to follow - ldr_vo v2, x5, 112 // gap(s) to follow - ldr_vo v0, x5, 96 // gap(s) to follow - trn2 v13.2D, v11.2D, v12.2D // gap(s) to follow - trn1 v27.4S, v23.4S, v24.4S // gap(s) to follow - sqrdmulh v30.4S, v13.4S, v2.4S // gap(s) to follow - mul v9.4S, v13.4S, v0.4S // gap(s) to follow - trn2 v19.2D, v14.2D, v27.2D // gap(s) to follow - ldr_vo v16, x5, 160 // gap(s) to follow - mls v9.4S, v30.4S, v8.S[0] // gap(s) to follow - trn1 v31.2D, v11.2D, v12.2D // gap(s) to follow - ldr_vo v29, x5, 176 // gap(s) to follow - sub v7.4S, v31.4S, v9.4S // gap(s) to follow - ldr_vo v3, x5, 32 // gap(s) to follow - sqrdmulh v26.4S, v7.4S, v29.4S // gap(s) to follow - mul v7.4S, v7.4S, v16.4S // gap(s) to follow - trn1 v5.2D, v14.2D, v27.2D // gap(s) to follow - ldr_vo v10, x5, 48 // gap(s) to follow - sqrdmulh v4.4S, v19.4S, v2.4S // gap(s) to follow - mls v7.4S, v26.4S, v8.S[0] // gap(s) to follow - ldr_vo v23, x5, 80 // gap(s) to follow - ldr_vo v2, x5, 64 // gap(s) to follow - ldr_vo v27, x5, 128 // gap(s) to follow - ldr_vi v16, x5, 192 // gap(s) to follow - mul 
v0.4S, v19.4S, v0.4S // gap(s) to follow - ldr_vo v22, x5, -176 // gap(s) to follow - mul v30.4S, v15.4S, v16.4S // gap(s) to follow - mls v0.4S, v4.4S, v8.S[0] // gap(s) to follow - sqrdmulh v25.4S, v15.4S, v22.4S // gap(s) to follow - add v26.4S, v31.4S, v9.4S // gap(s) to follow - ldr_vo v6, x5, -48 // gap(s) to follow - sqrdmulh v17.4S, v18.4S, v22.4S // gap(s) to follow - add v9.4S, v5.4S, v0.4S // gap(s) to follow - mls v30.4S, v25.4S, v8.S[0] // gap(s) to follow - mul v14.4S, v26.4S, v27.4S // gap(s) to follow - sqrdmulh v12.4S, v26.4S, v6.4S // gap(s) to follow - mul v4.4S, v18.4S, v16.4S // gap(s) to follow - sub v28.4S, v1.4S, v30.4S // gap(s) to follow - add v22.4S, v1.4S, v30.4S // gap(s) to follow - sub v11.4S, v5.4S, v0.4S // gap(s) to follow - mul v0.4S, v28.4S, v2.4S // gap(s) to follow - mls v4.4S, v17.4S, v8.S[0] // gap(s) to follow - sqrdmulh v1.4S, v22.4S, v10.4S // gap(s) to follow - mul v17.4S, v22.4S, v3.4S // gap(s) to follow - sqrdmulh v23.4S, v28.4S, v23.4S // gap(s) to follow - sub v10.4S, v20.4S, v4.4S // gap(s) to follow - add v20.4S, v20.4S, v4.4S // gap(s) to follow - mls v14.4S, v12.4S, v8.S[0] // gap(s) to follow - mls v0.4S, v23.4S, v8.S[0] // gap(s) to follow - sub v23.4S, v11.4S, v7.4S // gap(s) to follow - add v11.4S, v11.4S, v7.4S // gap(s) to follow - sub v24.4S, v9.4S, v14.4S // gap(s) to follow - add v25.4S, v9.4S, v14.4S // gap(s) to follow - mls v17.4S, v1.4S, v8.S[0] // gap(s) to follow - sub v1.4S, v10.4S, v0.4S // gap(s) to follow - add v10.4S, v10.4S, v0.4S // gap(s) to follow - trn1 v18.4S, v11.4S, v23.4S // gap(s) to follow - add v14.4S, v20.4S, v17.4S // gap(s) to follow - sub v20.4S, v20.4S, v17.4S // gap(s) to follow - trn2 v23.4S, v11.4S, v23.4S // gap(s) to follow - trn2 v11.4S, v10.4S, v1.4S // gap(s) to follow - trn1 v17.4S, v14.4S, v20.4S // gap(s) to follow - trn1 v15.4S, v25.4S, v24.4S // gap(s) to follow - trn2 v24.4S, v25.4S, v24.4S // gap(s) to follow - trn2 v20.4S, v14.4S, v20.4S // gap(s) to follow - 
trn1 v1.4S, v10.4S, v1.4S // gap(s) to follow - trn1 v10.2D, v15.2D, v18.2D // gap(s) to follow - trn1 v25.2D, v24.2D, v23.2D // gap(s) to follow - str_vi v10, x2, 128 // gap(s) to follow - trn2 v10.2D, v15.2D, v18.2D // gap(s) to follow - str_vo v25, x2, -112 // gap(s) to follow - trn1 v25.2D, v17.2D, v1.2D // gap(s) to follow - str_vo v10, x2, -96 // gap(s) to follow - trn2 v1.2D, v17.2D, v1.2D // gap(s) to follow - str_vi v25, x1, 128 // gap(s) to follow - trn1 v17.2D, v20.2D, v11.2D // gap(s) to follow - str_vo v1, x1, -96 // gap(s) to follow - trn2 v1.2D, v20.2D, v11.2D // gap(s) to follow - trn2 v20.2D, v24.2D, v23.2D // gap(s) to follow - str_vo v17, x1, -112 // gap(s) to follow - ldr_vo v23, x1, 0 // gap(s) to follow - str_vo v1, x1, -80 // gap(s) to follow - str_vo v20, x2, -80 // gap(s) to follow - ldr_vo v1, x2, 0 // gap(s) to follow - ldr_vi v20, x4, 64 // gap(s) to follow - ldr_vo v11, x2, 48 // gap(s) to follow - mul v17.4S, v1.4S, v20.S[0] // gap(s) to follow - sqrdmulh v1.4S, v1.4S, v20.S[1] // gap(s) to follow - mul v10.4S, v11.4S, v20.S[0] // gap(s) to follow - ldr_vo v24, x1, 32 // gap(s) to follow - mls v17.4S, v1.4S, v8.S[0] // gap(s) to follow - sqrdmulh v1.4S, v11.4S, v20.S[1] // gap(s) to follow - ldr_vo v11, x2, 32 // gap(s) to follow - sub v25.4S, v23.4S, v17.4S // gap(s) to follow - add v23.4S, v23.4S, v17.4S // gap(s) to follow - mls v10.4S, v1.4S, v8.S[0] // gap(s) to follow - mul v1.4S, v11.4S, v20.S[0] // gap(s) to follow - sqrdmulh v11.4S, v11.4S, v20.S[1] // gap(s) to follow - ldr_vo v17, x1, 48 // gap(s) to follow - ldr_vo v18, x2, 16 // gap(s) to follow - mls v1.4S, v11.4S, v8.S[0] // gap(s) to follow - add v11.4S, v17.4S, v10.4S // gap(s) to follow - mul v14.4S, v18.4S, v20.S[0] // gap(s) to follow - sqrdmulh v18.4S, v18.4S, v20.S[1] // gap(s) to follow - add v15.4S, v24.4S, v1.4S // gap(s) to follow - mul v9.4S, v11.4S, v20.S[2] // gap(s) to follow - sqrdmulh v11.4S, v11.4S, v20.S[3] // gap(s) to follow - mul v16.4S, v15.4S, 
v20.S[2] // gap(s) to follow - sqrdmulh v20.4S, v15.4S, v20.S[3] // gap(s) to follow - sub v1.4S, v24.4S, v1.4S // gap(s) to follow - ldr_vo v24, x4, -48 // gap(s) to follow - mls v16.4S, v20.4S, v8.S[0] // gap(s) to follow - sub v20.4S, v17.4S, v10.4S // gap(s) to follow - mul v17.4S, v1.4S, v24.S[0] // gap(s) to follow - sqrdmulh v1.4S, v1.4S, v24.S[1] // gap(s) to follow - sub v10.4S, v23.4S, v16.4S // gap(s) to follow - add v23.4S, v23.4S, v16.4S // gap(s) to follow - mls v14.4S, v18.4S, v8.S[0] // gap(s) to follow - mls v17.4S, v1.4S, v8.S[0] // gap(s) to follow - mls v9.4S, v11.4S, v8.S[0] // gap(s) to follow - mul v1.4S, v20.4S, v24.S[0] // gap(s) to follow - sqrdmulh v20.4S, v20.4S, v24.S[1] // gap(s) to follow - sub v11.4S, v25.4S, v17.4S // gap(s) to follow - add v17.4S, v25.4S, v17.4S // gap(s) to follow - ldr_vo v25, x1, 16 // gap(s) to follow - mls v1.4S, v20.4S, v8.S[0] // gap(s) to follow - ldr_vo v20, x4, -32 // gap(s) to follow - add v18.4S, v25.4S, v14.4S // gap(s) to follow - sub v25.4S, v25.4S, v14.4S // gap(s) to follow - ldr_vo v14, x4, -16 // gap(s) to follow - add v15.4S, v18.4S, v9.4S // gap(s) to follow - sub v18.4S, v18.4S, v9.4S // gap(s) to follow - sub v9.4S, v25.4S, v1.4S // gap(s) to follow - mul v16.4S, v15.4S, v24.S[2] // gap(s) to follow - sqrdmulh v24.4S, v15.4S, v24.S[3] // gap(s) to follow - mul v15.4S, v18.4S, v20.S[0] // gap(s) to follow - sqrdmulh v18.4S, v18.4S, v20.S[1] // gap(s) to follow - add v1.4S, v25.4S, v1.4S // gap(s) to follow - mls v16.4S, v24.4S, v8.S[0] // gap(s) to follow - mul v25.4S, v9.4S, v14.S[0] // gap(s) to follow - mls v15.4S, v18.4S, v8.S[0] // gap(s) to follow - sqrdmulh v24.4S, v9.4S, v14.S[1] // gap(s) to follow - sub v18.4S, v23.4S, v16.4S // gap(s) to follow - add v23.4S, v23.4S, v16.4S // gap(s) to follow - sub v14.4S, v10.4S, v15.4S // gap(s) to follow - add v10.4S, v10.4S, v15.4S // gap(s) to follow - mul v15.4S, v1.4S, v20.S[2] // gap(s) to follow - sqrdmulh v1.4S, v1.4S, v20.S[3] // gap(s) 
to follow - trn1 v20.4S, v23.4S, v18.4S // gap(s) to follow - mls v25.4S, v24.4S, v8.S[0] // gap(s) to follow - trn2 v9.4S, v23.4S, v18.4S // gap(s) to follow - mls v15.4S, v1.4S, v8.S[0] // gap(s) to follow - trn1 v1.4S, v10.4S, v14.4S // gap(s) to follow - sub v24.4S, v11.4S, v25.4S // gap(s) to follow - add v23.4S, v11.4S, v25.4S // gap(s) to follow - sub v11.4S, v17.4S, v15.4S // gap(s) to follow - add v17.4S, v17.4S, v15.4S // gap(s) to follow - trn2 v10.4S, v10.4S, v14.4S // gap(s) to follow - trn2 v18.2D, v20.2D, v1.2D // gap(s) to follow - trn1 v14.4S, v17.4S, v11.4S // gap(s) to follow - trn2 v15.2D, v9.2D, v10.2D // gap(s) to follow - trn1 v20.2D, v20.2D, v1.2D // gap(s) to follow - trn1 v1.2D, v9.2D, v10.2D // gap(s) to follow - trn2 v11.4S, v17.4S, v11.4S // gap(s) to follow - subs count, count, #1 + // Instructions: 164 + // Expected cycles: 188 + // Expected IPC: 0.87 + // + // Wall time: 66.08s + // User time: 66.08s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + ldr q6, [x5, #-128] // ............................................................................................*....................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sub v18.4S, v19.4S, v18.4S // ...............*.................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v16.4S, v22.4S, v20.4S // ....................*............................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v6.4S, v31.4S, v6.4S // ..............................................................................................................*..................................................... + // gap // .................................................................................................................................................................... + sub v22.4S, v18.4S, v3.4S // .............................................*...................................................................................................................... + // gap // .................................................................................................................................................................... + add v18.4S, v18.4S, v3.4S // ..............................................*..................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sub v19.4S, v16.4S, v15.4S // ..................................................*................................................................................................................. + // gap // .................................................................................................................................................................... + add v16.4S, v16.4S, v15.4S // ...................................................*................................................................................................................ + // gap // .................................................................................................................................................................... + mls v6.4S, v0.4S, v8.S[0] // ...............................................................................................................*.................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v20.4S, v19.4S, v4.S[1] // ...................................................................*................................................................................................ + // gap // .................................................................................................................................................................... + mul v19.4S, v19.4S, v4.S[0] // ....................................................................*............................................................................................... 
+ // gap // .................................................................................................................................................................... + sqrdmulh v31.4S, v16.4S, v21.S[3] // ..............................................................*..................................................................................................... + // gap // .................................................................................................................................................................... + mul v16.4S, v16.4S, v21.S[2] // ...............................................................*.................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v3.4S, v27.4S, v12.4S // ........................................................................*........................................................................................... + // gap // .................................................................................................................................................................... + mls v19.4S, v20.4S, v8.S[0] // .....................................................................*.............................................................................................. + // gap // .................................................................................................................................................................... + trn1 v20.4S, v9.4S, v2.4S // ..........................................................................*......................................................................................... 
+ // gap // .................................................................................................................................................................... + mls v16.4S, v31.4S, v8.S[0] // ................................................................*................................................................................................... + // gap // .................................................................................................................................................................... + add v31.4S, v25.4S, v14.4S // .......................................................................................................*............................................................ + // gap // .................................................................................................................................................................... + sub v12.4S, v22.4S, v19.4S // ......................................................................*............................................................................................. + // gap // .................................................................................................................................................................... + add v22.4S, v22.4S, v19.4S // .......................................................................*............................................................................................ + // gap // .................................................................................................................................................................... + sub v19.4S, v18.4S, v16.4S // .................................................................*.................................................................................................. 
+ // gap // .................................................................................................................................................................... + add v18.4S, v18.4S, v16.4S // ..................................................................*................................................................................................. + // gap // .................................................................................................................................................................... + trn2 v16.2D, v3.2D, v20.2D // ............................................................................*....................................................................................... + // gap // .................................................................................................................................................................... + trn1 v20.2D, v3.2D, v20.2D // ..............................................................................*..................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v3.4S, v16.4S, v26.4S // ..............................................................................................*..................................................................... + // gap // .................................................................................................................................................................... + mul v16.4S, v16.4S, v7.4S // ...............................................................................................*.................................................................... 
+ // gap // .................................................................................................................................................................... + trn1 v9.4S, v18.4S, v19.4S // ................................................................................*................................................................................... + // gap // .................................................................................................................................................................... + trn2 v18.4S, v18.4S, v19.4S // .................................................................................*.................................................................................. + // gap // .................................................................................................................................................................... + trn1 v19.4S, v22.4S, v12.4S // ..................................................................................*................................................................................. + // gap // .................................................................................................................................................................... + mls v16.4S, v3.4S, v8.S[0] // ................................................................................................*................................................................... + // gap // .................................................................................................................................................................... + trn2 v22.4S, v22.4S, v12.4S // ...................................................................................*................................................................................ 
+ // gap // .................................................................................................................................................................... + trn2 v3.2D, v9.2D, v19.2D // ....................................................................................*............................................................................... + // gap // .................................................................................................................................................................... + trn1 v19.2D, v9.2D, v19.2D // ......................................................................................*............................................................................. + // gap // .................................................................................................................................................................... + sub v12.4S, v20.4S, v16.4S // .................................................................................................*.................................................................. + // gap // .................................................................................................................................................................... + trn2 v9.2D, v18.2D, v22.2D // .....................................................................................*.............................................................................. + // gap // .................................................................................................................................................................... + trn1 v18.2D, v18.2D, v22.2D // .......................................................................................*............................................................................ 
+ // gap // .................................................................................................................................................................... + sub v22.4S, v12.4S, v6.4S // ................................................................................................................*................................................... + // gap // .................................................................................................................................................................... + add v6.4S, v12.4S, v6.4S // .................................................................................................................*.................................................. + // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v31.4S, v17.4S // ........................................................................................................*........................................................... + // gap // .................................................................................................................................................................... + mul v31.4S, v31.4S, v30.4S // .........................................................................................................*.......................................................... + // gap // .................................................................................................................................................................... + add v16.4S, v20.4S, v16.4S // ..................................................................................................*................................................................. 
+ // gap // .................................................................................................................................................................... + sqrdmulh v20.4S, v3.4S, v5.4S // ........................................................................................................................*........................................... + // gap // .................................................................................................................................................................... + mul v3.4S, v3.4S, v11.4S // .........................................................................................................................*.......................................... + // gap // .................................................................................................................................................................... + mls v31.4S, v12.4S, v8.S[0] // ..........................................................................................................*......................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v9.4S, v5.4S // .............................................................................................................................*...................................... + // gap // .................................................................................................................................................................... + mul v9.4S, v9.4S, v11.4S // ..............................................................................................................................*..................................... 
+ // gap // .................................................................................................................................................................... + mls v3.4S, v20.4S, v8.S[0] // ..........................................................................................................................*......................................... + // gap // .................................................................................................................................................................... + sub v20.4S, v16.4S, v31.4S // ...........................................................................................................*........................................................ + // gap // .................................................................................................................................................................... + add v16.4S, v16.4S, v31.4S // ............................................................................................................*....................................................... + // gap // .................................................................................................................................................................... + mls v9.4S, v12.4S, v8.S[0] // ...............................................................................................................................*.................................... + // gap // .................................................................................................................................................................... + sub v31.4S, v19.4S, v3.4S // ...........................................................................................................................*........................................ 
+ // gap // .................................................................................................................................................................... + add v19.4S, v19.4S, v3.4S // ............................................................................................................................*....................................... + // gap // .................................................................................................................................................................... + trn1 v3.4S, v16.4S, v20.4S // ............................................................................................................................................*....................... + // gap // .................................................................................................................................................................... + sub v12.4S, v18.4S, v9.4S // ................................................................................................................................*................................... + // gap // .................................................................................................................................................................... + add v18.4S, v18.4S, v9.4S // .................................................................................................................................*.................................. + // gap // .................................................................................................................................................................... + trn2 v16.4S, v16.4S, v20.4S // .............................................................................................................................................*...................... 
+ // gap // .................................................................................................................................................................... + sqrdmulh v20.4S, v12.4S, v29.4S // .......................................................................................................................................*............................ + // gap // .................................................................................................................................................................... + sqrdmulh v9.4S, v18.4S, v1.4S // ..................................................................................................................................*................................. + // gap // .................................................................................................................................................................... + mul v18.4S, v18.4S, v23.4S // ...................................................................................................................................*................................ + // gap // .................................................................................................................................................................... + mul v12.4S, v12.4S, v28.4S // ........................................................................................................................................*........................... + // gap // .................................................................................................................................................................... + trn1 v15.4S, v6.4S, v22.4S // ..............................................................................................................................................*..................... 
+ // gap // .................................................................................................................................................................... + trn2 v6.4S, v6.4S, v22.4S // ...............................................................................................................................................*.................... + // gap // .................................................................................................................................................................... + mls v18.4S, v9.4S, v8.S[0] // ....................................................................................................................................*............................... + // gap // .................................................................................................................................................................... + mls v12.4S, v20.4S, v8.S[0] // .........................................................................................................................................*.......................... + // gap // .................................................................................................................................................................... + trn2 v22.2D, v3.2D, v15.2D // ................................................................................................................................................*................... + // gap // .................................................................................................................................................................... + trn2 v20.2D, v16.2D, v6.2D // .................................................................................................................................................*.................. 
+ // gap // .................................................................................................................................................................... + sub v9.4S, v19.4S, v18.4S // .....................................................................................................................................*.............................. + // gap // .................................................................................................................................................................... + add v18.4S, v19.4S, v18.4S // ......................................................................................................................................*............................. + // gap // .................................................................................................................................................................... + sub v19.4S, v31.4S, v12.4S // ..........................................................................................................................................*......................... + // gap // .................................................................................................................................................................... + add v31.4S, v31.4S, v12.4S // ...........................................................................................................................................*........................ + // gap // .................................................................................................................................................................... + trn1 v3.2D, v3.2D, v15.2D // ..................................................................................................................................................*................. 
+ // gap // .................................................................................................................................................................... + trn1 v6.2D, v16.2D, v6.2D // ...................................................................................................................................................*................ + // gap // .................................................................................................................................................................... + trn1 v16.4S, v18.4S, v9.4S // ....................................................................................................................................................*............... + // gap // .................................................................................................................................................................... + trn2 v18.4S, v18.4S, v9.4S // .....................................................................................................................................................*.............. + // gap // .................................................................................................................................................................... + trn1 v12.4S, v31.4S, v19.4S // ......................................................................................................................................................*............. + // gap // .................................................................................................................................................................... + trn2 v19.4S, v31.4S, v19.4S // .......................................................................................................................................................*............ 
+ // gap // .................................................................................................................................................................... + str q3, [x1], #128 // ............................................................................................................................................................*....... + // gap // .................................................................................................................................................................... + trn2 v31.2D, v16.2D, v12.2D // ........................................................................................................................................................*........... + // gap // .................................................................................................................................................................... + trn2 v3.2D, v18.2D, v19.2D // .........................................................................................................................................................*.......... + // gap // .................................................................................................................................................................... + trn1 v16.2D, v16.2D, v12.2D // ..........................................................................................................................................................*......... + // gap // .................................................................................................................................................................... + trn1 v18.2D, v18.2D, v19.2D // ...........................................................................................................................................................*........ 
+ // gap // .................................................................................................................................................................... + str q6, [x1, #-112] // .............................................................................................................................................................*...... + // gap // .................................................................................................................................................................... + ldr q19, [x1, #0] // e................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q22, [x1, #-96] // ..............................................................................................................................................................*..... + // gap // .................................................................................................................................................................... + ldr q22, [x1, #16] // .e.................................................................................................................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q20, [x1, #-80] // ...............................................................................................................................................................*.... + // gap // .................................................................................................................................................................... + ldr q6, [x1, #32] // ..e................................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q16, [x2], #128 // ................................................................................................................................................................*... + // gap // .................................................................................................................................................................... + ldr q16, [x1, #48] // ...e................................................................................................................................................................ 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q18, [x2, #-112] // .................................................................................................................................................................*.. + // gap // .................................................................................................................................................................... + ldr q18, [x2, #0] // ....e............................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q31, [x2, #-96] // ..................................................................................................................................................................*. + // gap // .................................................................................................................................................................... 
+ ldr q20, [x2, #16] // .....e.............................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q3, [x2, #-80] // ...................................................................................................................................................................* + // gap // .................................................................................................................................................................... + ldr q31, [x2, #32] // ......e............................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q3, [x2, #48] // .......e............................................................................................................................................................ 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q12, [x4], #64 // ........e........................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q9, [x4, #-48] // .........e.......................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sqrdmulh v15.4S, v18.4S, v12.S[1] // ............e....................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v18.4S, v18.4S, v12.S[0] // .............e...................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v4.4S, v20.4S, v12.S[1] // .................e.................................................................................................................................................. + // gap // .................................................................................................................................................................... + mul v20.4S, v20.4S, v12.S[0] // ..................e................................................................................................................................................. + // gap // .................................................................................................................................................................... + sqrdmulh v26.4S, v31.4S, v12.S[1] // ......................e............................................................................................................................................. + // gap // .................................................................................................................................................................... 
+ mls v18.4S, v15.4S, v8.S[0] // ..............e..................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v31.4S, v31.4S, v12.S[0] // .......................e............................................................................................................................................ + // gap // .................................................................................................................................................................... + mls v20.4S, v4.4S, v8.S[0] // ...................e................................................................................................................................................ + // gap // .................................................................................................................................................................... + sqrdmulh v15.4S, v3.4S, v12.S[1] // ...........................e........................................................................................................................................ + // gap // .................................................................................................................................................................... + add v4.4S, v19.4S, v18.4S // ................e................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v31.4S, v26.4S, v8.S[0] // ........................e........................................................................................................................................... + // gap // .................................................................................................................................................................... + add v26.4S, v22.4S, v20.4S // .....................e.............................................................................................................................................. + // gap // .................................................................................................................................................................... + mul v3.4S, v3.4S, v12.S[0] // ............................e....................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q21, [x4, #-32] // ..........e......................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v7.4S, v6.4S, v31.4S // .........................e.......................................................................................................................................... + // gap // .................................................................................................................................................................... + add v6.4S, v6.4S, v31.4S // ..........................e......................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v3.4S, v15.4S, v8.S[0] // .............................e...................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v31.4S, v7.4S, v9.S[1] // ..........................................e......................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v15.4S, v6.4S, v12.S[3] // ................................e................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mul v6.4S, v6.4S, v12.S[2] // .................................e.................................................................................................................................. + // gap // .................................................................................................................................................................... + sub v27.4S, v16.4S, v3.4S // ..............................e..................................................................................................................................... + // gap // .................................................................................................................................................................... + add v16.4S, v16.4S, v3.4S // ...............................e.................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v3.4S, v7.4S, v9.S[0] // ...........................................e........................................................................................................................ + // gap // .................................................................................................................................................................... + mls v6.4S, v15.4S, v8.S[0] // ..................................e................................................................................................................................. + // gap // .................................................................................................................................................................... 
+ sqrdmulh v15.4S, v16.4S, v12.S[3] // .....................................e.............................................................................................................................. + // gap // .................................................................................................................................................................... + mul v16.4S, v16.4S, v12.S[2] // ......................................e............................................................................................................................. + // gap // .................................................................................................................................................................... + mls v3.4S, v31.4S, v8.S[0] // ............................................e....................................................................................................................... + // gap // .................................................................................................................................................................... + sub v31.4S, v4.4S, v6.4S // ...................................e................................................................................................................................ + // gap // .................................................................................................................................................................... + add v6.4S, v4.4S, v6.4S // ....................................e............................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v16.4S, v15.4S, v8.S[0] // .......................................e............................................................................................................................ + // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v27.4S, v9.S[1] // ...............................................e.................................................................................................................... + // gap // .................................................................................................................................................................... + mul v15.4S, v27.4S, v9.S[0] // ................................................e................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q4, [x4, #-16] // ...........e........................................................................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v7.4S, v26.4S, v16.4S // ........................................e........................................................................................................................... + // gap // .................................................................................................................................................................... + add v16.4S, v26.4S, v16.4S // .........................................e.......................................................................................................................... + // gap // .................................................................................................................................................................... + mls v15.4S, v12.4S, v8.S[0] // .................................................e.................................................................................................................. + // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v7.4S, v21.S[1] // .........................................................e.......................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v26.4S, v16.4S, v9.S[3] // ....................................................e............................................................................................................... + // gap // .................................................................................................................................................................... 
+ mul v16.4S, v16.4S, v9.S[2] // .....................................................e.............................................................................................................. + // gap // .................................................................................................................................................................... + mul v9.4S, v7.4S, v21.S[0] // ..........................................................e......................................................................................................... + // gap // .................................................................................................................................................................... + ldr q7, [x5], #(12*16) // ........................................................................................e........................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v16.4S, v26.4S, v8.S[0] // ......................................................e............................................................................................................. + // gap // .................................................................................................................................................................... 
+ mls v9.4S, v12.4S, v8.S[0] // ...........................................................e........................................................................................................ + // gap // .................................................................................................................................................................... + ldr q26, [x5, #-176] // .........................................................................................e.......................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v12.4S, v6.4S, v16.4S // .......................................................e............................................................................................................ + // gap // .................................................................................................................................................................... + add v27.4S, v6.4S, v16.4S // ........................................................e........................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v2.4S, v31.4S, v9.4S // ............................................................e....................................................................................................... + // gap // .................................................................................................................................................................... + add v9.4S, v31.4S, v9.4S // .............................................................e...................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v6.4S, v27.4S, v12.4S // .........................................................................e.......................................................................................... + // gap // .................................................................................................................................................................... + ldr q30, [x5, #-160] // ..........................................................................................e......................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ trn2 v16.4S, v9.4S, v2.4S // ...........................................................................e........................................................................................ + // gap // .................................................................................................................................................................... + ldr q17, [x5, #-144] // ...........................................................................................e........................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v31.2D, v6.2D, v16.2D // .............................................................................e...................................................................................... + // gap // .................................................................................................................................................................... + trn1 v25.2D, v6.2D, v16.2D // ...............................................................................e.................................................................................... + // gap // .................................................................................................................................................................... 
+ sqrdmulh v6.4S, v31.4S, v26.4S // ...................................................................................................e................................................................ + // gap // .................................................................................................................................................................... + mul v14.4S, v31.4S, v7.4S // ....................................................................................................e............................................................... + // gap // .................................................................................................................................................................... + ldr q16, [x5, #-112] // .............................................................................................e...................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q11, [x5, #-96] // ..................................................................................................................e................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v14.4S, v6.4S, v8.S[0] // .....................................................................................................e.............................................................. + // gap // .................................................................................................................................................................... + ldr q5, [x5, #-80] // ...................................................................................................................e................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q23, [x5, #-64] // ....................................................................................................................e............................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sub v31.4S, v25.4S, v14.4S // ......................................................................................................e............................................................. + // gap // .................................................................................................................................................................... + ldr q1, [x5, #-48] // .....................................................................................................................e.............................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v0.4S, v31.4S, v16.4S // .............................................................................................................e...................................................... + // gap // .................................................................................................................................................................... + ldr q28, [x5, #-32] // ......................................................................................................................e............................................. 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q29, [x5, #-16] // .......................................................................................................................e............................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + + // -------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q9, [x1, #(16*0)] // e.................................................................................'.................................................................................~........... 
+ // ldr q10, [x1, #(16*1)] // ..e...............................................................................'...................................................................................~......... + // ldr q11, [x1, #(16*2)] // ....e.............................................................................'.....................................................................................~....... + // ldr q12, [x1, #(16*3)] // ......e...........................................................................'.......................................................................................~..... + // ldr q13, [x2, #(16*0)] // ........e.........................................................................'.........................................................................................~... + // ldr q14, [x2, #(16*1)] // ..........e.......................................................................'...........................................................................................~. + // ldr q15, [x2, #(16*2)] // ............e.....................................................................'............................................................................................. + // ldr q16, [x2, #(16*3)] // .............e....................................................................'............................................................................................. + // ldr q0, [x4], #64 // ..............e...................................................................'............................................................................................. + // ldr q1, [x4, #(-64 + 16)] // ...............e..................................................................'............................................................................................. 
+ // ldr q2, [x4, #(-64 + 32)] // .............................e....................................................'............................................................................................. + // ldr q3, [x4, #(-64 + 48)] // ................................................e.................................'............................................................................................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ................e.................................................................'............................................................................................. + // mul v24.4s, v13.4s, v0.s[0] // .................e................................................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .....................e............................................................'............................................................................................. + // sub v13.4s, v9.4s, v24.4s // ..................................................................................'*............................................................................................ + // add v9.4s, v9.4s, v24.4s // .........................e........................................................'............................................................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..................e...............................................................'............................................................................................. + // mul v24.4s, v14.4s, v0.s[0] // ...................e..............................................................'............................................................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // .......................e..........................................................'............................................................................................. + // sub v14.4s, v10.4s, v24.4s // ..................................................................................'.*........................................................................................... + // add v10.4s, v10.4s, v24.4s // ...........................e......................................................'............................................................................................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ....................e.............................................................'............................................................................................. + // mul v24.4s, v15.4s, v0.s[0] // ......................e...........................................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................e.......................................................'............................................................................................. + // sub v15.4s, v11.4s, v24.4s // ..............................e...................................................'............................................................................................. + // add v11.4s, v11.4s, v24.4s // ...............................e..................................................'............................................................................................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ........................e.........................................................'............................................................................................. 
+ // mul v24.4s, v16.4s, v0.s[0] // ............................e.....................................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................................e.................................................'............................................................................................. + // sub v16.4s, v12.4s, v24.4s // ....................................e.............................................'............................................................................................. + // add v12.4s, v12.4s, v24.4s // .....................................e............................................'............................................................................................. + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ..................................e...............................................'............................................................................................. + // mul v24.4s, v11.4s, v0.s[2] // ...................................e..............................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................................e..........................................'............................................................................................. + // sub v11.4s, v9.4s, v24.4s // ...........................................e......................................'............................................................................................. + // add v9.4s, v9.4s, v24.4s // ............................................e.....................................'............................................................................................. 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // ........................................e.........................................'............................................................................................. + // mul v24.4s, v12.4s, v0.s[2] // .........................................e........................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .............................................e....................................'............................................................................................. + // sub v12.4s, v10.4s, v24.4s // .................................................e................................'............................................................................................. + // add v10.4s, v10.4s, v24.4s // ..................................................e...............................'............................................................................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .................................e................................................'............................................................................................. + // mul v24.4s, v15.4s, v1.s[0] // ......................................e...........................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................................e.......................................'............................................................................................. + // sub v15.4s, v13.4s, v24.4s // ..................................................................................'...*......................................................................................... 
+ // add v13.4s, v13.4s, v24.4s // ..................................................................................'....*........................................................................................ + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ..............................................e...................................'............................................................................................. + // mul v24.4s, v16.4s, v1.s[0] // ...............................................e..................................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...................................................e..............................'............................................................................................. + // sub v16.4s, v14.4s, v24.4s // ..................................................................................'.....*....................................................................................... + // add v14.4s, v14.4s, v24.4s // ..................................................................................'......*...................................................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .....................................................e............................'............................................................................................. + // mul v24.4s, v10.4s, v1.s[2] // ......................................................e...........................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .........................................................e........................'............................................................................................. 
+ // sub v10.4s, v9.4s, v24.4s // ............................................................e.....................'............................................................................................. + // add v9.4s, v9.4s, v24.4s // .............................................................e....................'............................................................................................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ....................................................e.............................'............................................................................................. + // mul v24.4s, v12.4s, v2.s[0] // .......................................................e..........................'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................e.......................'............................................................................................. + // sub v12.4s, v11.4s, v24.4s // ..............................................................e...................'............................................................................................. + // add v11.4s, v11.4s, v24.4s // ...............................................................e..................'............................................................................................. + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ..................................................................................'..........*.................................................................................. + // mul v24.4s, v14.4s, v2.s[2] // ..................................................................................'...........*................................................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'...............*............................................................................. + // sub v14.4s, v13.4s, v24.4s // ..................................................................................'...................*......................................................................... + // add v13.4s, v13.4s, v24.4s // ..................................................................................'....................*........................................................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..................................................................................'........*.................................................................................... + // mul v24.4s, v16.4s, v3.s[0] // ..................................................................................'.........*................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'.............*............................................................................... + // sub v16.4s, v15.4s, v24.4s // ..................................................................................'.................*........................................................................... + // add v15.4s, v15.4s, v24.4s // ..................................................................................'..................*.......................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ..................................................................................'............*................................................................................ 
+ // trn2 v26.4s, v9.4s, v10.4s // ................................................................e.................'............................................................................................. + // trn1 v27.4s, v11.4s, v12.4s // ..................................................................................'..............*.............................................................................. + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................e...............'............................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................'.....................*....................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ....................................................................e.............'............................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................'......................*...................................................................... + // trn1 v10.2d, v26.2d, v28.2d // .....................................................................e............'............................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ..................................................................................'.........................*................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................................'..........................*.................................................................. 
+ // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................'...........................*................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ..................................................................................'.............................*............................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................'..............................*.............................................................. + // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................'.................................*........................................................... + // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................'...............................*............................................................. + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................'..................................*.......................................................... + // ldr q0, [ x5], #(12*16) // ........................................................e.........................'............................................................................................. + // ldr q4, [x5, #(-12*16 + 1*16)] // ...........................................................e......................'............................................................................................. + // ldr q1, [ x5, #(-12*16 + 2*16)] // .................................................................e................'............................................................................................. 
+ // ldr q5, [x5, #(-12*16 + 3*16)] // ...................................................................e..............'............................................................................................. + // ldr q2, [ x5, #(-12*16 + 4*16)] // ..................................................................................*............................................................................................. + // ldr q6, [x5, #(-12*16 + 5*16)] // ........................................................................e.........'............................................................................................. + // sqrdmulh v27.4s, v11.4s, v4.4s // ..................................................................................'.......................*..................................................................... + // mul v24.4s, v11.4s, v0.4s // ..................................................................................'........................*.................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'............................*................................................................ + // sub v11.4s, v9.4s, v24.4s // ..................................................................................'................................*............................................................ + // add v9.4s, v9.4s, v24.4s // ..................................................................................'.......................................*..................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ......................................................................e...........'............................................................................................. 
+ // mul v24.4s, v12.4s, v0.4s // .......................................................................e..........'............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................e.......'............................................................................................. + // sub v12.4s, v10.4s, v24.4s // .............................................................................e....'............................................................................................. + // add v10.4s, v10.4s, v24.4s // ..................................................................................'................*............................................................................ + // sqrdmulh v27.4s, v10.4s, v5.4s // ..................................................................................'.....................................*....................................................... + // mul v24.4s, v10.4s, v1.4s // ..................................................................................'......................................*...................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'..........................................*.................................................. + // sub v10.4s, v9.4s, v24.4s // ..................................................................................'..............................................*.............................................. + // add v9.4s, v9.4s, v24.4s // ..................................................................................'...............................................*............................................. 
+ // sqrdmulh v27.4s, v12.4s, v6.4s // ...............................................................................e..'............................................................................................. + // mul v24.4s, v12.4s, v2.4s // ..................................................................................'..*.......................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'.......*..................................................................................... + // sub v12.4s, v11.4s, v24.4s // ..................................................................................'...................................*......................................................... + // add v11.4s, v11.4s, v24.4s // ..................................................................................'....................................*........................................................ + // ldr q0, [ x5, #(-12*16 + 6*16)] // .........................................................................e........'............................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ...........................................................................e......'............................................................................................. + // ldr q1, [ x5, #(-12*16 + 8*16)] // ............................................................................e.....'............................................................................................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................................................e...'............................................................................................. 
+ // ldr q2, [ x5, #(-12*16 + 10*16)] // ................................................................................e.'............................................................................................. + // ldr q6, [x5, #(-12*16 + 11*16)] // .................................................................................e'............................................................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ..................................................................................'........................................*.................................................... + // mul v24.4s, v15.4s, v0.4s // ..................................................................................'.........................................*................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'.............................................*............................................... + // sub v15.4s, v13.4s, v24.4s // ..................................................................................'.................................................*........................................... + // add v13.4s, v13.4s, v24.4s // ..................................................................................'..................................................*.......................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ..................................................................................'...........................................*................................................. + // mul v24.4s, v16.4s, v0.4s // ..................................................................................'............................................*................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'................................................*............................................ + // sub v16.4s, v14.4s, v24.4s // ..................................................................................'....................................................*........................................ + // add v14.4s, v14.4s, v24.4s // ..................................................................................'.....................................................*....................................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ..................................................................................'........................................................*.................................... + // mul v24.4s, v14.4s, v1.4s // ..................................................................................'.........................................................*................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'.............................................................*............................... + // sub v14.4s, v13.4s, v24.4s // ..................................................................................'.................................................................*........................... + // add v13.4s, v13.4s, v24.4s // ..................................................................................'..................................................................*.......................... + // sqrdmulh v27.4s, v16.4s, v6.4s // ..................................................................................'.......................................................*..................................... 
+ // mul v24.4s, v16.4s, v2.4s // ..................................................................................'..........................................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................'..............................................................*.............................. + // sub v16.4s, v15.4s, v24.4s // ..................................................................................'...................................................................*......................... + // add v15.4s, v15.4s, v24.4s // ..................................................................................'....................................................................*........................ + // trn1 v25.4s, v9.4s, v10.4s // ..................................................................................'...................................................*......................................... + // trn2 v26.4s, v9.4s, v10.4s // ..................................................................................'......................................................*...................................... + // trn1 v27.4s, v11.4s, v12.4s // ..................................................................................'...........................................................*................................. + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................'............................................................*................................ + // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................'...............................................................*............................. 
+ // trn2 v12.2d, v26.2d, v28.2d // ..................................................................................'................................................................*............................ + // trn1 v9.2d, v25.2d, v27.2d // ..................................................................................'.....................................................................*....................... + // trn1 v10.2d, v26.2d, v28.2d // ..................................................................................'......................................................................*...................... + // trn1 v25.4s, v13.4s, v14.4s // ..................................................................................'.......................................................................*..................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................................'........................................................................*.................... + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................'.........................................................................*................... + // trn2 v28.4s, v15.4s, v16.4s // ..................................................................................'..........................................................................*.................. + // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................'............................................................................*................ + // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................'.............................................................................*............... 
+ // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................'..............................................................................*.............. + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................'...............................................................................*............. + // str q9, [x1], #128 // ..................................................................................'...........................................................................*................. + // str q10, [x1, #(-(128) + 16*1)] // ..................................................................................'................................................................................*............ + // str q11, [x1, #(-(128) + 16*2)] // .~................................................................................'..................................................................................*.......... + // str q12, [x1, #(-(128) + 16*3)] // ...~..............................................................................'....................................................................................*........ + // str q13, [x2], #128 // .....~............................................................................'......................................................................................*...... + // str q14, [x2, #(-(128) + 16*1)] // .......~..........................................................................'........................................................................................*.... + // str q15, [x2, #(-(128) + 16*2)] // .........~........................................................................'..........................................................................................*.. 
+ // str q16, [x2, #(-(128) + 16*3)] // ...........~......................................................................'............................................................................................* + + sub count, count, #1 cbnz count, layer45678_start - ldr_vi v25, x5, 192 - ldr_vo v26, x5, -176 - trn1 v16.4S, v23.4S, v24.4S - mul v30.4S, v15.4S, v25.4S - sqrdmulh v3.4S, v15.4S, v26.4S - ldr_vo v6, x5, -144 - sqrdmulh v21.4S, v18.4S, v26.4S - mls v30.4S, v3.4S, v8.S[0] - mul v9.4S, v18.4S, v25.4S - ldr_vo v27, x5, -160 - add v29.4S, v1.4S, v30.4S - ldr_vo v7, x5, -96 - sqrdmulh v19.4S, v29.4S, v6.4S - mul v31.4S, v29.4S, v27.4S - mls v9.4S, v21.4S, v8.S[0] - ldr_vo v25, x5, -80 - mls v31.4S, v19.4S, v8.S[0] - add v29.4S, v20.4S, v9.4S - trn2 v5.4S, v23.4S, v24.4S - trn2 v27.2D, v14.2D, v16.2D - add v21.4S, v29.4S, v31.4S - sqrdmulh v12.4S, v27.4S, v25.4S - trn2 v13.2D, v11.2D, v5.2D - trn1 v5.2D, v11.2D, v5.2D - sqrdmulh v17.4S, v13.4S, v25.4S - mul v11.4S, v13.4S, v7.4S - ldr_vo v2, x5, -32 - mul v3.4S, v27.4S, v7.4S - mls v11.4S, v17.4S, v8.S[0] - ldr_vo v26, x5, -16 - sub v15.4S, v1.4S, v30.4S - sub v10.4S, v5.4S, v11.4S - ldr_vo v13, x5, -128 - mul v27.4S, v10.4S, v2.4S - sqrdmulh v26.4S, v10.4S, v26.4S - mls v3.4S, v12.4S, v8.S[0] - mul v1.4S, v15.4S, v13.4S - trn1 v4.2D, v14.2D, v16.2D - mls v27.4S, v26.4S, v8.S[0] - sub v12.4S, v4.4S, v3.4S - ldr_vo v0, x5, -112 - add v28.4S, v12.4S, v27.4S - sub v6.4S, v12.4S, v27.4S - sqrdmulh v25.4S, v15.4S, v0.4S - ldr_vo v19, x5, -64 - sub v0.4S, v20.4S, v9.4S - mls v1.4S, v25.4S, v8.S[0] - sub v30.4S, v29.4S, v31.4S - ldr_vo v26, x5, -48 - sub v13.4S, v0.4S, v1.4S - add v0.4S, v0.4S, v1.4S - add v10.4S, v5.4S, v11.4S - trn2 v1.4S, v21.4S, v30.4S - trn2 v27.4S, v0.4S, v13.4S - mul v17.4S, v10.4S, v19.4S - sqrdmulh v22.4S, v10.4S, v26.4S - trn1 v23.2D, v1.2D, v27.2D - trn2 v1.2D, v1.2D, v27.2D - trn1 v20.4S, v28.4S, v6.4S - mls v17.4S, v22.4S, v8.S[0] - add v7.4S, v4.4S, v3.4S - trn2 v18.4S, v28.4S, 
v6.4S - str_vo v1, x1, 48 - sub v14.4S, v7.4S, v17.4S - add v22.4S, v7.4S, v17.4S - str_vo v23, x1, 16 - trn1 v24.4S, v21.4S, v30.4S - trn1 v17.4S, v22.4S, v14.4S - trn2 v11.4S, v22.4S, v14.4S - trn1 v9.4S, v0.4S, v13.4S - trn1 v14.2D, v17.2D, v20.2D - trn1 v2.2D, v11.2D, v18.2D - str_vi v14, x2, 128 - trn2 v20.2D, v17.2D, v20.2D - str_vo v2, x2, -112 - trn1 v23.2D, v24.2D, v9.2D - str_vo v20, x2, -96 - trn2 v15.2D, v24.2D, v9.2D - str_vi v23, x1, 128 - trn2 v23.2D, v11.2D, v18.2D - str_vo v15, x1, -96 - str_vo v23, x2, -80 + // Instructions: 88 + // Expected cycles: 90 + // Expected IPC: 0.98 + // + // Wall time: 30.31s + // User time: 30.31s + // + // ---------------------------------- original position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + sub v24.4S, v22.4S, v20.4S // ..*..................................................................................... + // gap // ........................................................................................ + sub v10.4S, v19.4S, v18.4S // .*...................................................................................... + // gap // ........................................................................................ + trn1 v19.4S, v9.4S, v2.4S // ...............*........................................................................ + // gap // ........................................................................................ + sub v6.4S, v24.4S, v15.4S // ......*................................................................................. + // gap // ........................................................................................ + add v2.4S, v10.4S, v3.4S // .....*.................................................................................. + // gap // ........................................................................................ 
+ add v15.4S, v24.4S, v15.4S // .......*................................................................................ + // gap // ........................................................................................ + mul v9.4S, v6.4S, v4.S[0] // ..........*............................................................................. + // gap // ........................................................................................ + sqrdmulh v6.4S, v6.4S, v4.S[1] // .........*.............................................................................. + // gap // ........................................................................................ + mul v4.4S, v15.4S, v21.S[2] // ............*........................................................................... + // gap // ........................................................................................ + sqrdmulh v13.4S, v15.4S, v21.S[3] // ...........*............................................................................ + // gap // ........................................................................................ + trn1 v15.4S, v27.4S, v12.4S // .............*.......................................................................... + // gap // ........................................................................................ + mls v9.4S, v6.4S, v8.S[0] // ..............*......................................................................... + // gap // ........................................................................................ + sub v20.4S, v10.4S, v3.4S // ....*................................................................................... + // gap // ........................................................................................ + mls v4.4S, v13.4S, v8.S[0] // ................*....................................................................... 
+ // gap // ........................................................................................ + trn2 v10.2D, v15.2D, v19.2D // ......................*................................................................. + // gap // ........................................................................................ + add v22.4S, v20.4S, v9.4S // ...................*.................................................................... + // gap // ........................................................................................ + sub v6.4S, v20.4S, v9.4S // ..................*..................................................................... + // gap // ........................................................................................ + add v16.4S, v2.4S, v4.4S // .....................*.................................................................. + // gap // ........................................................................................ + sub v18.4S, v2.4S, v4.4S // ....................*................................................................... + // gap // ........................................................................................ + trn2 v3.4S, v22.4S, v6.4S // ..............................*......................................................... + // gap // ........................................................................................ + trn1 v20.4S, v22.4S, v6.4S // ............................*........................................................... + // gap // ........................................................................................ + trn1 v6.4S, v16.4S, v18.4S // ..........................*............................................................. + // gap // ........................................................................................ + sqrdmulh v12.4S, v10.4S, v26.4S // ........................*............................................................... 
+ // gap // ........................................................................................ + trn2 v16.4S, v16.4S, v18.4S // ...........................*............................................................ + // gap // ........................................................................................ + trn2 v4.2D, v6.2D, v20.2D // ...............................*........................................................ + // gap // ........................................................................................ + trn1 v22.2D, v6.2D, v20.2D // ................................*....................................................... + // gap // ........................................................................................ + trn2 v18.2D, v16.2D, v3.2D // ..................................*..................................................... + // gap // ........................................................................................ + add v25.4S, v25.4S, v14.4S // .................*...................................................................... + // gap // ........................................................................................ + mul v6.4S, v18.4S, v11.4S // .............................................*.......................................... + // gap // ........................................................................................ + sqrdmulh v18.4S, v18.4S, v5.4S // ............................................*........................................... + // gap // ........................................................................................ + mul v24.4S, v25.4S, v30.4S // .......................................*................................................ + // gap // ........................................................................................ + mul v21.4S, v10.4S, v7.4S // .........................*.............................................................. 
+ // gap // ........................................................................................ + trn1 v10.2D, v16.2D, v3.2D // ...................................*.................................................... + // gap // ........................................................................................ + mls v6.4S, v18.4S, v8.S[0] // .................................................*...................................... + // gap // ........................................................................................ + sqrdmulh v2.4S, v25.4S, v17.4S // ......................................*................................................. + // gap // ........................................................................................ + mls v21.4S, v12.4S, v8.S[0] // .............................*.......................................................... + // gap // ........................................................................................ + trn1 v26.2D, v15.2D, v19.2D // .......................*................................................................ + // gap // ........................................................................................ + sub v12.4S, v10.4S, v6.4S // .....................................................*.................................. + // gap // ........................................................................................ + mls v24.4S, v2.4S, v8.S[0] // ...........................................*............................................ + // gap // ........................................................................................ + add v17.4S, v26.4S, v21.4S // ........................................*............................................... + // gap // ........................................................................................ 
+ mul v18.4S, v12.4S, v28.4S // ...........................................................*............................ + // gap // ........................................................................................ + sqrdmulh v25.4S, v12.4S, v29.4S // ........................................................*............................... + // gap // ........................................................................................ + add v19.4S, v17.4S, v24.4S // ................................................*....................................... + // gap // ........................................................................................ + sub v3.4S, v17.4S, v24.4S // ...............................................*........................................ + // gap // ........................................................................................ + mul v20.4S, v4.4S, v11.4S // ..........................................*............................................. + // gap // ........................................................................................ + mls v18.4S, v25.4S, v8.S[0] // ...............................................................*........................ + // gap // ........................................................................................ + trn1 v25.4S, v19.4S, v3.4S // ....................................................*................................... + // gap // ........................................................................................ + sqrdmulh v16.4S, v4.4S, v5.4S // .........................................*.............................................. + // gap // ........................................................................................ + add v6.4S, v10.4S, v6.4S // ......................................................*................................. 
+ // gap // ........................................................................................ + ldr q11, [x5, #-128] // *....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.4S, v16.4S, v8.S[0] // ..............................................*......................................... + // gap // ........................................................................................ + sqrdmulh v16.4S, v6.4S, v1.4S // .........................................................*.............................. + // gap // ........................................................................................ + mul v6.4S, v6.4S, v23.4S // ..........................................................*............................. + // gap // ........................................................................................ + mul v15.4S, v31.4S, v11.4S // ...*.................................................................................... + // gap // ........................................................................................ + sub v31.4S, v22.4S, v20.4S // ..................................................*..................................... + // gap // ........................................................................................ + add v9.4S, v22.4S, v20.4S // ...................................................*.................................... + // gap // ........................................................................................ + mls v6.4S, v16.4S, v8.S[0] // ..............................................................*......................... 
+ // gap // ........................................................................................ + add v22.4S, v31.4S, v18.4S // .....................................................................*.................. + // gap // ........................................................................................ + sub v31.4S, v31.4S, v18.4S // ....................................................................*................... + // gap // ........................................................................................ + mls v15.4S, v0.4S, v8.S[0] // ........*............................................................................... + // gap // ........................................................................................ + add v20.4S, v9.4S, v6.4S // ...................................................................*.................... + // gap // ........................................................................................ + trn2 v13.4S, v22.4S, v31.4S // ...........................................................................*............ + // gap // ........................................................................................ + sub v18.4S, v9.4S, v6.4S // ..................................................................*..................... + // gap // ........................................................................................ + sub v6.4S, v26.4S, v21.4S // .................................*...................................................... + // gap // ........................................................................................ + trn2 v12.4S, v19.4S, v3.4S // .......................................................*................................ + // gap // ........................................................................................ + trn2 v14.4S, v20.4S, v18.4S // .........................................................................*.............. 
+ // gap // ........................................................................................ + add v30.4S, v6.4S, v15.4S // .....................................*.................................................. + // gap // ........................................................................................ + sub v9.4S, v6.4S, v15.4S // ....................................*................................................... + // gap // ........................................................................................ + trn1 v6.2D, v14.2D, v13.2D // ................................................................................*....... + // gap // ........................................................................................ + trn1 v1.4S, v22.4S, v31.4S // ..........................................................................*............. + // gap // ........................................................................................ + str q6, [x2, #16] // .....................................................................................*.. + // gap // ........................................................................................ + trn1 v6.4S, v30.4S, v9.4S // ............................................................*........................... + // gap // ........................................................................................ + trn2 v19.4S, v30.4S, v9.4S // .............................................................*.......................... + // gap // ........................................................................................ + trn1 v17.4S, v20.4S, v18.4S // ........................................................................*............... + // gap // ........................................................................................ + trn2 v29.2D, v25.2D, v6.2D // ................................................................*....................... 
+ // gap // ........................................................................................ + trn1 v2.2D, v12.2D, v19.2D // .......................................................................*................ + // gap // ........................................................................................ + str q29, [x1, #32] // ..................................................................................*..... + // gap // ........................................................................................ + trn1 v22.2D, v25.2D, v6.2D // ......................................................................*................. + // gap // ........................................................................................ + str q2, [x1, #16] // .................................................................................*...... + // gap // ........................................................................................ + trn2 v26.2D, v12.2D, v19.2D // .................................................................*...................... + // gap // ........................................................................................ + str q22, [x1], #128 // ............................................................................*........... + // gap // ........................................................................................ + trn1 v6.2D, v17.2D, v1.2D // ...............................................................................*........ + // gap // ........................................................................................ + str q26, [x1, #-80] // ...................................................................................*.... + // gap // ........................................................................................ + trn2 v18.2D, v14.2D, v13.2D // ..............................................................................*......... 
+ // gap // ........................................................................................ + str q6, [x2], #128 // ....................................................................................*... + // gap // ........................................................................................ + trn2 v20.2D, v17.2D, v1.2D // .............................................................................*.......... + // gap // ........................................................................................ + str q18, [x2, #-80] // .......................................................................................* + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q20, [x2, #-96] // ......................................................................................*. + // gap // ........................................................................................ + + // ------------------------------------ new position -------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------ + // ldr q6, [x5, #-128] // .................................................*...................................... + // sub v18.4S, v19.4S, v18.4S // .*...................................................................................... + // sub v16.4S, v22.4S, v20.4S // *....................................................................................... + // mul v6.4S, v31.4S, v6.4S // .....................................................*.................................. + // sub v22.4S, v18.4S, v3.4S // ............*........................................................................... 
+ // add v18.4S, v18.4S, v3.4S // ....*................................................................................... + // sub v19.4S, v16.4S, v15.4S // ...*.................................................................................... + // add v16.4S, v16.4S, v15.4S // .....*.................................................................................. + // mls v6.4S, v0.4S, v8.S[0] // ...........................................................*............................ + // sqrdmulh v20.4S, v19.4S, v4.S[1] // .......*................................................................................ + // mul v19.4S, v19.4S, v4.S[0] // ......*................................................................................. + // sqrdmulh v31.4S, v16.4S, v21.S[3] // .........*.............................................................................. + // mul v16.4S, v16.4S, v21.S[2] // ........*............................................................................... + // trn1 v3.4S, v27.4S, v12.4S // ..........*............................................................................. + // mls v19.4S, v20.4S, v8.S[0] // ...........*............................................................................ + // trn1 v20.4S, v9.4S, v2.4S // ..*..................................................................................... + // mls v16.4S, v31.4S, v8.S[0] // .............*.......................................................................... + // add v31.4S, v25.4S, v14.4S // ...........................*............................................................ + // sub v12.4S, v22.4S, v19.4S // ................*....................................................................... + // add v22.4S, v22.4S, v19.4S // ...............*........................................................................ + // sub v19.4S, v18.4S, v16.4S // ..................*..................................................................... 
+ // add v18.4S, v18.4S, v16.4S // .................*...................................................................... + // trn2 v16.2D, v3.2D, v20.2D // ..............*......................................................................... + // trn1 v20.2D, v3.2D, v20.2D // ....................................*................................................... + // sqrdmulh v3.4S, v16.4S, v26.4S // ......................*................................................................. + // mul v16.4S, v16.4S, v7.4S // ...............................*........................................................ + // trn1 v9.4S, v18.4S, v19.4S // .....................*.................................................................. + // trn2 v18.4S, v18.4S, v19.4S // .......................*................................................................ + // trn1 v19.4S, v22.4S, v12.4S // ....................*................................................................... + // mls v16.4S, v3.4S, v8.S[0] // ...................................*.................................................... + // trn2 v22.4S, v22.4S, v12.4S // ...................*.................................................................... + // trn2 v3.2D, v9.2D, v19.2D // ........................*............................................................... + // trn1 v19.2D, v9.2D, v19.2D // .........................*.............................................................. + // sub v12.4S, v20.4S, v16.4S // ...............................................................*........................ + // trn2 v9.2D, v18.2D, v22.2D // ..........................*............................................................. + // trn1 v18.2D, v18.2D, v22.2D // ................................*....................................................... + // sub v22.4S, v12.4S, v6.4S // ...................................................................*.................... 
+ // add v6.4S, v12.4S, v6.4S // ..................................................................*..................... + // sqrdmulh v12.4S, v31.4S, v17.4S // ..................................*..................................................... + // mul v31.4S, v31.4S, v30.4S // ..............................*......................................................... + // add v16.4S, v20.4S, v16.4S // .......................................*................................................ + // sqrdmulh v20.4S, v3.4S, v5.4S // ...............................................*........................................ + // mul v3.4S, v3.4S, v11.4S // ............................................*........................................... + // mls v31.4S, v12.4S, v8.S[0] // ......................................*................................................. + // sqrdmulh v12.4S, v9.4S, v5.4S // .............................*.......................................................... + // mul v9.4S, v9.4S, v11.4S // ............................*........................................................... + // mls v3.4S, v20.4S, v8.S[0] // ..................................................*..................................... + // sub v20.4S, v16.4S, v31.4S // ...........................................*............................................ + // add v16.4S, v16.4S, v31.4S // ..........................................*............................................. + // mls v9.4S, v12.4S, v8.S[0] // .................................*...................................................... + // sub v31.4S, v19.4S, v3.4S // ......................................................*................................. + // add v19.4S, v19.4S, v3.4S // .......................................................*................................ + // trn1 v3.4S, v16.4S, v20.4S // ..............................................*......................................... 
+ // sub v12.4S, v18.4S, v9.4S // .....................................*.................................................. + // add v18.4S, v18.4S, v9.4S // ................................................*....................................... + // trn2 v16.4S, v16.4S, v20.4S // ................................................................*....................... + // sqrdmulh v20.4S, v12.4S, v29.4S // .........................................*.............................................. + // sqrdmulh v9.4S, v18.4S, v1.4S // ...................................................*.................................... + // mul v18.4S, v18.4S, v23.4S // ....................................................*................................... + // mul v12.4S, v12.4S, v28.4S // ........................................*............................................... + // trn1 v15.4S, v6.4S, v22.4S // .......................................................................*................ + // trn2 v6.4S, v6.4S, v22.4S // ........................................................................*............... + // mls v18.4S, v9.4S, v8.S[0] // ........................................................*............................... + // mls v12.4S, v20.4S, v8.S[0] // .............................................*.......................................... + // trn2 v22.2D, v3.2D, v15.2D // ..........................................................................*............. + // trn2 v20.2D, v16.2D, v6.2D // ...............................................................................*........ + // sub v9.4S, v19.4S, v18.4S // ..............................................................*......................... + // add v18.4S, v19.4S, v18.4S // ............................................................*........................... + // sub v19.4S, v31.4S, v12.4S // ..........................................................*............................. 
+ // add v31.4S, v31.4S, v12.4S // .........................................................*.............................. + // trn1 v3.2D, v3.2D, v15.2D // .............................................................................*.......... + // trn1 v6.2D, v16.2D, v6.2D // ...........................................................................*............ + // trn1 v16.4S, v18.4S, v9.4S // .........................................................................*.............. + // trn2 v18.4S, v18.4S, v9.4S // .................................................................*...................... + // trn1 v12.4S, v31.4S, v19.4S // .....................................................................*.................. + // trn2 v19.4S, v31.4S, v19.4S // .............................................................*.......................... + // str q3, [x1], #128 // ................................................................................*....... + // trn2 v31.2D, v16.2D, v12.2D // .....................................................................................*.. + // trn2 v3.2D, v18.2D, v19.2D // ...................................................................................*.... + // trn1 v16.2D, v16.2D, v12.2D // .................................................................................*...... + // trn1 v18.2D, v18.2D, v19.2D // ....................................................................*................... + // str q6, [x1, #-112] // ..............................................................................*......... + // str q22, [x1, #-96] // ............................................................................*........... + // str q20, [x1, #-80] // ..................................................................................*..... + // str q16, [x2], #128 // ....................................................................................*... 
+ // str q18, [x2, #-112] // ......................................................................*................. + // str q31, [x2, #-96] // .......................................................................................* + // str q3, [x2, #-80] // ......................................................................................*. + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a72.s b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a72.s index eda9e5f1..8ef20b17 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a72.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_a72.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr 
qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -131,40 +103,40 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 
7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -251,7 
+223,7 @@ roots: .text .global ntt_dilithium_123_45678_manual_st4_opt_a72 - .global _ntt_dilithium_123_45678_manual_st4_opt_a72 + .global _ntt_dilithium_123_45678_manual_st4 .p2align 4 const_addr: .word 8380417 @@ -375,643 +347,678 @@ _ntt_dilithium_123_45678_manual_st4_opt_a72: load_roots_123 .p2align 2 - ldr_vo v6, x0, 384 // ..*......... - ldr_vo v28, x0, 896 // .*.......... - // gap // ............ - ldr_vo v7, x0, 512 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v18.4S, v28.4S, v0.S[1] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v28.4S, v28.4S, v0.S[0] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v23.4S, v7.4S, v0.S[1] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v28.4S, v18.4S, v8.S[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v22.4S, v7.4S, v0.S[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v22.4S, v23.4S, v8.S[0] // ...........* - // gap // ............ - // gap // ............ - add v23.4S, v6.4S, v28.4S // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v26.4S, v23.4S, v0.S[3] // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ 
- // gap // ............ - mul v9.4S, v23.4S, v0.S[2] // ..........*. - // gap // ............ - // gap // ............ - - // original source code - // ldr_vo v20, x0, 512 // ..*......... || .*................... - // ldr_vo v5, x0, 896 // .*.......... || *.................... - // ldr_vo v6, x0, 384 // *........... || *.................... - // mul v28.4S, v5.4S, v0.S[0] // ....*....... || ......*.............. - // sqrdmulh v26.4S, v5.4S, v0.S[1] // ...*........ || ....*................ - // mls v28.4S, v26.4S, v8.S[0] // ......*..... || ..........*.......... - // mul v22.4S, v20.4S, v0.S[0] // .......*.... || ............*........ - // add v27.4S, v6.4S, v28.4S // .........*.. || ...............*..... - // sqrdmulh v25.4S, v20.4S, v0.S[1] // .....*...... || ........*............ - // sqrdmulh v26.4S, v27.4S, v0.S[3] // ..........*. || ..................*.. - // mul v9.4S, v27.4S, v0.S[2] // ...........* || ....................* - // mls v22.4S, v25.4S, v8.S[0] // ........*... || ..............*...... - + // Instructions: 12 + // Expected cycles: 21 + // Expected IPC: 0.57 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #896] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q19, [x0, #512] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v18.4S, v0.S[1] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mul v13.4S, v18.4S, v0.S[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v13.4S, v20.4S, v8.S[0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.4S, v19.4S, v0.S[1] // ......*....................... + ldr q12, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v26.4S, v19.4S, v0.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v20.4S, v12.4S, v13.4S // .......*...................... + // gap // .............................. + // gap // .............................. + mls v26.4S, v18.4S, v8.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v6.4S, v20.4S, v0.S[3] // .........*.................... + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mul v30.4S, v20.4S, v0.S[2] // ...........*.................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0, #512] // .*............................. + // ldr q17, [x0, #896] // *.............................. + // ldr q12, [x0, #384] // ......*........................ + // sqrdmulh v11.4S, v17.4S, v0.S[1] // ..*............................ + // mul v13.4S, v17.4S, v0.S[0] // ...*........................... + // mls v13.4S, v11.4S, v8.S[0] // ....*.......................... + // sqrdmulh v19.4S, v6.4S, v0.S[1] // .....*......................... + // add v20.4S, v12.4S, v13.4S // ........*...................... + // mul v26.4S, v6.4S, v0.S[0] // .......*....................... + // sqrdmulh v6.4S, v20.4S, v0.S[3] // ..........*.................... + // mls v26.4S, v19.4S, v8.S[0] // .........*..................... + // mul v30.4S, v20.4S, v0.S[2] // ...........*................... + sub count, count, #1 -.p2align 2 layer123_start: - ldr_vo v23, x0, 0 // *........................................................................... - sub v28.4S, v6.4S, v28.4S // ..........................*................................................. - ldr_vo v10, x0, 256 // ..*......................................................................... - ldr_vo v27, x0, 768 // ......*..................................................................... - ldr_vo v25, x0, 128 // .*.......................................................................... - mls v9.4S, v26.4S, v8.S[0] // ...................................*........................................ - ldr_vo v20, x0, 528 // ....e....................................................................... 
- ldr_vo v26, x0, 640 // .....*...................................................................... - // gap // ............................................................................ - mul v29.4S, v28.4S, v1.S[0] // ...........................................*................................ - ldr_vo v5, x0, 912 // .......e.................................................................... - ldr_vo v6, x0, 400 // ...e........................................................................ - sub v17.4S, v23.4S, v22.4S // ...........*................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v23.4S, v23.4S, v22.4S // ............*............................................................... - sqrdmulh v28.4S, v28.4S, v1.S[1] // ............................................*............................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.4S, v27.4S, v0.S[1] // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- mul v27.4S, v27.4S, v0.S[0] // ..................*......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v18.4S, v26.4S, v0.S[1] // ..............*............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v27.4S, v22.4S, v8.S[0] // ....................*....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v26.4S, v26.4S, v0.S[0] // .............*.............................................................. - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v26.4S, v18.4S, v8.S[0] // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.4S, v10.4S, v27.4S // .....................*...................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v10.4S, v10.4S, v27.4S // ......................*..................................................... - mls v29.4S, v28.4S, v8.S[0] // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v28.4S, v5.4S, v0.S[0] // .......................e.................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v27.4S, v25.4S, v26.4S // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- sub v25.4S, v25.4S, v26.4S // ................*........................................................... - sqrdmulh v26.4S, v5.4S, v0.S[1] // ........................e................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v5.4S, v22.4S, v1.S[1] // .......................................*.................................... - sub v18.4S, v27.4S, v9.4S // ....................................*....................................... - // gap // ............................................................................ - add v27.4S, v27.4S, v9.4S // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v9.4S, v10.4S, v0.S[3] // .............................*.............................................. - sub v11.4S, v25.4S, v29.4S // ..............................................*............................. - // gap // ............................................................................ - add v25.4S, v25.4S, v29.4S // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v29.4S, v22.4S, v1.S[0] // ......................................*..................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v29.4S, v5.4S, v8.S[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v10.4S, v10.4S, v0.S[2] // ............................*............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v10.4S, v9.4S, v8.S[0] // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v9.4S, v17.4S, v29.4S // .........................................*.................................. - // gap // ............................................................................ 
- // gap // ............................................................................ - add v29.4S, v17.4S, v29.4S // ..........................................*................................. - mul v5.4S, v27.4S, v1.S[2] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v27.4S, v27.4S, v1.S[3] // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v17.4S, v23.4S, v10.4S // ...............................*............................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v23.4S, v23.4S, v10.4S // ................................*........................................... - mul v10.4S, v18.4S, v2.S[0] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.4S, v18.4S, v2.S[1] // ......................................................*..................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.4S, v27.4S, v8.S[0] // ..................................................*......................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v27.4S, v25.4S, v2.S[2] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v10.4S, v22.4S, v8.S[0] // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.4S, v23.4S, v5.4S // ...................................................*........................ 
- // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.4S, v25.4S, v2.S[3] // ...........................................................*................ - add v23.4S, v23.4S, v5.4S // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v22, x0, 128 // .....................................................................*...... - mul v5.4S, v11.4S, v3.S[0] // ...............................................................*............ - // gap // ............................................................................ - str_vi v23, x0, 16 // ....................................................................*....... - sub v23.4S, v17.4S, v10.4S // ........................................................*................... - // gap // ............................................................................ - sqrdmulh v22.4S, v11.4S, v3.S[1] // ................................................................*........... - add v10.4S, v17.4S, v10.4S // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- mls v27.4S, v25.4S, v8.S[0] // ............................................................*............... - str_vo v23, x0, 368 // .......................................................................*.... - // gap // ............................................................................ - str_vo v10, x0, 240 // ......................................................................*..... - // gap // ............................................................................ - // gap // ............................................................................ - mls v28.4S, v26.4S, v8.S[0] // .........................e.................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.4S, v22.4S, v8.S[0] // .................................................................*.......... - // gap // ............................................................................ - // gap // ............................................................................ - sub v23.4S, v29.4S, v27.4S // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v10.4S, v29.4S, v27.4S // ..............................................................*............. - mul v22.4S, v20.4S, v0.S[0] // ........e................................................................... 
- // gap // ............................................................................ - add v27.4S, v6.4S, v28.4S // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.4S, v20.4S, v0.S[1] // .........e.................................................................. - str_vo v23, x0, 624 // .........................................................................*.. - // gap // ............................................................................ - str_vo v10, x0, 496 // ........................................................................*... - sub v23.4S, v9.4S, v5.4S // ..................................................................*......... - // gap // ............................................................................ - add v10.4S, v9.4S, v5.4S // ...................................................................*........ - sqrdmulh v26.4S, v27.4S, v0.S[3] // ..................................e......................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v9.4S, v27.4S, v0.S[2] // .................................e.......................................... - str_vo v23, x0, 880 // ...........................................................................* - // gap // ............................................................................ - str_vo v10, x0, 752 // ..........................................................................*. 
- // gap // ............................................................................ - // gap // ............................................................................ - mls v22.4S, v25.4S, v8.S[0] // ..........e................................................................. - // gap // ............................................................................ - // gap // ............................................................................ - - // original source code - // ldr_vo v9, x0, 0 // ......................................................................*.......................................................................... || ......................................................................*...................................................................... - // ldr_vo v10, x0, 128 // ..........................................................................*...................................................................... || .......................................................................*..................................................................... - // ldr_vo v11, x0, 256 // ........................................................................*........................................................................ || ......................................................................*...................................................................... - // ldr_vo v12, x0, 384 // ....e............................................................................................................................................ || .e........................................................................................................................................... - // ldr_vo v13, x0, 512 // e................................................................................................................................................ 
|| e............................................................................................................................................ - // ldr_vo v14, x0, 640 // .............................................................................*................................................................... || ........................................................................*.................................................................... - // ldr_vo v15, x0, 768 // .........................................................................*....................................................................... || .......................................................................*..................................................................... - // ldr_vo v16, x0, 896 // ...e............................................................................................................................................. || .e........................................................................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // ..........................................................e...................................................................................... || .............................................................e............................................................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ............................................................e.................................................................................... || ...............................................................e............................................................................. - // mls v24.4S, v13.4S, v8.S[0] // .....................................................................e........................................................................... 
|| .....................................................................e....................................................................... - // sub v13.4S, v9.4S, v24.4S // .................................................................................*............................................................... || ..........................................................................*.................................................................. - // add v9.4S, v9.4S, v24.4S // ..................................................................................*.............................................................. || ...........................................................................*................................................................. - // mul v24.4S, v14.4S, v0.S[0] // ........................................................................................*........................................................ || .....................................................................................*....................................................... - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ......................................................................................*.......................................................... || .................................................................................*........................................................... - // mls v24.4S, v14.4S, v8.S[0] // .........................................................................................*....................................................... || .......................................................................................*..................................................... - // sub v14.4S, v10.4S, v24.4S // ...............................................................................................*................................................. 
|| .............................................................................................*............................................... - // add v10.4S, v10.4S, v24.4S // ..............................................................................................*.................................................. || ............................................................................................*................................................ - // mul v24.4S, v15.4S, v0.S[0] // .....................................................................................*........................................................... || ...............................................................................*............................................................. - // sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................................................*............................................................ || .............................................................................*............................................................... - // mls v24.4S, v15.4S, v8.S[0] // .......................................................................................*......................................................... || ...................................................................................*......................................................... - // sub v15.4S, v11.4S, v24.4S // ..........................................................................................*...................................................... || ........................................................................................*.................................................... - // add v11.4S, v11.4S, v24.4S // ...........................................................................................*..................................................... 
|| .........................................................................................*................................................... - // mul v24.4S, v16.4S, v0.S[0] // .................e............................................................................................................................... || ...................e......................................................................................................................... - // sqrdmulh v16.4S, v16.4S, v0.S[1] // ....................e............................................................................................................................ || .....................e....................................................................................................................... - // mls v24.4S, v16.4S, v8.S[0] // ......................................................e.......................................................................................... || .........................................................e................................................................................... - // sub v16.4S, v12.4S, v24.4S // .......................................................................*......................................................................... || ......................................................................*...................................................................... - // add v12.4S, v12.4S, v24.4S // ...........................................................e..................................................................................... || ..............................................................e.............................................................................. - // mul v24.4S, v11.4S, v0.S[2] // .........................................................................................................*....................................... 
|| .......................................................................................................*..................................... - // sqrdmulh v11.4S, v11.4S, v0.S[3] // ....................................................................................................*............................................ || .................................................................................................*........................................... - // mls v24.4S, v11.4S, v8.S[0] // ..........................................................................................................*...................................... || .........................................................................................................*................................... - // sub v11.4S, v9.4S, v24.4S // ...............................................................................................................*................................. || ..............................................................................................................*.............................. - // add v9.4S, v9.4S, v24.4S // ................................................................................................................*................................ || ...............................................................................................................*............................. - // mul v24.4S, v12.4S, v0.S[2] // ..................................................................e.............................................................................. || ...................................................................e......................................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // .................................................................e............................................................................... 
|| .................................................................e........................................................................... - // mls v24.4S, v12.4S, v8.S[0] // ...........................................................................*..................................................................... || .......................................................................*..................................................................... - // sub v12.4S, v10.4S, v24.4S // ..................................................................................................*.............................................. || ...............................................................................................*............................................. - // add v10.4S, v10.4S, v24.4S // ...................................................................................................*............................................. || ................................................................................................*............................................ - // mul v24.4S, v15.4S, v1.S[0] // .......................................................................................................*......................................... || ...................................................................................................*......................................... - // sqrdmulh v15.4S, v15.4S, v1.S[1] // .................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.4S, v15.4S, v8.S[0] // ........................................................................................................*........................................ 
|| .....................................................................................................*....................................... - // sub v15.4S, v13.4S, v24.4S // ...........................................................................................................*..................................... || ..........................................................................................................*.................................. - // add v13.4S, v13.4S, v24.4S // ............................................................................................................*.................................... || ...........................................................................................................*................................. - // mul v24.4S, v16.4S, v1.S[0] // ..............................................................................*.................................................................. || .........................................................................*................................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[1] // ...................................................................................*............................................................. || ...........................................................................*................................................................. - // mls v24.4S, v16.4S, v8.S[0] // ............................................................................................*.................................................... || .........................................................................................*................................................... - // sub v16.4S, v14.4S, v24.4S // .....................................................................................................*........................................... 
|| .................................................................................................*........................................... - // add v14.4S, v14.4S, v24.4S // ......................................................................................................*.......................................... || ..................................................................................................*.......................................... - // mul v24.4S, v10.4S, v1.S[2] // .............................................................................................................*................................... || ...........................................................................................................*................................. - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ..............................................................................................................*.................................. || .............................................................................................................*............................... - // mls v24.4S, v10.4S, v8.S[0] // ...................................................................................................................*............................. || ...................................................................................................................*......................... - // sub v10.4S, v9.4S, v24.4S // ......................................................................................................................*.......................... || ........................................................................................................................*.................... - // add v9.4S, v9.4S, v24.4S // ........................................................................................................................*........................ 
|| .........................................................................................................................*................... - // mul v24.4S, v12.4S, v2.S[0] // .................................................................................................................*............................... || ...............................................................................................................*............................. - // sqrdmulh v12.4S, v12.4S, v2.S[1] // ..................................................................................................................*.............................. || .................................................................................................................*........................... - // mls v24.4S, v12.4S, v8.S[0] // .....................................................................................................................*........................... || .......................................................................................................................*..................... - // sub v12.4S, v11.4S, v24.4S // ............................................................................................................................*.................... || ............................................................................................................................*................ - // add v11.4S, v11.4S, v24.4S // ..............................................................................................................................*.................. || .............................................................................................................................*............... - // mul v24.4S, v14.4S, v2.S[2] // ....................................................................................................................*............................ 
|| .....................................................................................................................*....................... - // sqrdmulh v14.4S, v14.4S, v2.S[3] // .......................................................................................................................*......................... || .........................................................................................................................*................... - // mls v24.4S, v14.4S, v8.S[0] // ...............................................................................................................................*................. || ...............................................................................................................................*............. - // sub v14.4S, v13.4S, v24.4S // ....................................................................................................................................*............ || ....................................................................................................................................*........ - // add v13.4S, v13.4S, v24.4S // .....................................................................................................................................*........... || .....................................................................................................................................*....... - // mul v24.4S, v16.4S, v3.S[0] // ..........................................................................................................................*...................... || ...........................................................................................................................*................. - // sqrdmulh v16.4S, v16.4S, v3.S[1] // .............................................................................................................................*................... 
|| .............................................................................................................................*............... - // mls v24.4S, v16.4S, v8.S[0] // ...................................................................................................................................*............. || ...................................................................................................................................*......... - // sub v16.4S, v15.4S, v24.4S // ...........................................................................................................................................*..... || ........................................................................................................................................*.... - // add v15.4S, v15.4S, v24.4S // ............................................................................................................................................*.... || .........................................................................................................................................*... - // str_vi v9, x0, 16 // ...........................................................................................................................*..................... || ............................................................................................................................*................ - // str_vo v10, x0, 112 // .........................................................................................................................*....................... || ...........................................................................................................................*................. - // str_vo v11, x0, 240 // .................................................................................................................................*............... 
|| ................................................................................................................................*............ - // str_vo v12, x0, 368 // ................................................................................................................................*................ || ...............................................................................................................................*............. - // str_vo v13, x0, 496 // ..........................................................................................................................................*...... || ........................................................................................................................................*.... - // str_vo v14, x0, 624 // .........................................................................................................................................*....... || .......................................................................................................................................*..... - // str_vo v15, x0, 752 // ................................................................................................................................................* || ............................................................................................................................................* - // str_vo v16, x0, 880 // ...............................................................................................................................................*. || ...........................................................................................................................................*. 
- - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Wall time: 6.56s + // User time: 6.56s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q18, [x0, #0] // *........................................................................... + sub v20.4S, v12.4S, v13.4S // ..........................*................................................. + ldr q19, [x0, #256] // ..*......................................................................... + ldr q11, [x0, #768] // ......*..................................................................... + ldr q22, [x0, #128] // .*.......................................................................... + mls v30.4S, v6.4S, v8.S[0] // ...................................*........................................ + ldr q6, [x0, #528] // ....e....................................................................... + ldr q13, [x0, #640] // .....*...................................................................... + // gap // ............................................................................ + sqrdmulh v27.4S, v20.4S, v1.S[1] // ...........................................*................................ + ldr q17, [x0, #912] // .......e.................................................................... + ldr q12, [x0, #400] // ...e........................................................................ + sub v14.4S, v18.4S, v26.4S // ...........*................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v26.4S // ............*............................................................... 
+ mul v20.4S, v20.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v26.4S, v11.4S, v0.S[1] // ..................*......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v11.4S, v11.4S, v0.S[0] // ...................*........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v13.4S, v0.S[1] // .............*.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mls v11.4S, v26.4S, v8.S[0] // ....................*....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.4S, v13.4S, v0.S[0] // ..............*............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.4S, v31.4S, v8.S[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.4S, v19.4S, v11.4S // .....................*...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v19.4S, v19.4S, v11.4S // ......................*..................................................... 
+ mls v20.4S, v27.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v11.4S, v17.4S, v0.S[1] // .......................e.................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v27.4S, v22.4S, v13.4S // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v22.4S, v22.4S, v13.4S // ................*........................................................... + mul v13.4S, v17.4S, v0.S[0] // ........................e................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v17.4S, v26.4S, v1.S[1] // ......................................*..................................... + sub v31.4S, v27.4S, v30.4S // ....................................*....................................... + // gap // ............................................................................ 
+ add v30.4S, v27.4S, v30.4S // .....................................*...................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.4S, v19.4S, v0.S[3] // ............................*............................................... + sub v25.4S, v22.4S, v20.4S // ..............................................*............................. + // gap // ............................................................................ + add v20.4S, v22.4S, v20.4S // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v26.4S, v1.S[0] // .......................................*.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v22.4S, v17.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mul v19.4S, v19.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.4S, v27.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v27.4S, v14.4S, v22.4S // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v22.4S, v14.4S, v22.4S // ..........................................*................................. + sqrdmulh v17.4S, v30.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v30.4S, v30.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v14.4S, v18.4S, v19.4S // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v19.4S // ................................*........................................... + sqrdmulh v19.4S, v31.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v30.4S, v17.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v17.4S, v31.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v17.4S, v19.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v18.4S, v30.4S // ...................................................*........................ + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v30.4S // ....................................................*....................... + sqrdmulh v30.4S, v20.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v20.4S, v20.4S, v2.S[2] // ...........................................................*................ + str q19, [x0, #128] // .....................................................................*...... + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + sub v18.4S, v14.4S, v17.4S // ........................................................*................... + // gap // ............................................................................ + add v19.4S, v14.4S, v17.4S // .........................................................*.................. + sqrdmulh v17.4S, v25.4S, v3.S[1] // ...............................................................*............ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v20.4S, v30.4S, v8.S[0] // ............................................................*............... + str q18, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + str q19, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + mul v18.4S, v25.4S, v3.S[0] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.4S, v11.4S, v8.S[0] // .........................e.................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v22.4S, v20.4S // .............................................................*.............. + // gap // ............................................................................ 
+ // gap // ............................................................................ + add v20.4S, v22.4S, v20.4S // ..............................................................*............. + mls v18.4S, v17.4S, v8.S[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #624] // .........................................................................*.. + sqrdmulh v19.4S, v6.4S, v0.S[1] // ........e................................................................... + // gap // ............................................................................ + str q20, [x0, #496] // ........................................................................*... + add v20.4S, v12.4S, v13.4S // ...........................e................................................ + // gap // ............................................................................ + mul v26.4S, v6.4S, v0.S[0] // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v11.4S, v27.4S, v18.4S // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v27.4S, v18.4S // ...................................................................*........ 
+ sqrdmulh v6.4S, v20.4S, v0.S[3] // .................................e.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.4S, v19.4S, v8.S[0] // ..........e................................................................. + str q11, [x0, #880] // ...........................................................................* + // gap // ............................................................................ + str q18, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + mul v30.4S, v20.4S, v0.S[2] // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + + // ----------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x0, #0] // ......................................................................*.......................................................................... 
+ // ldr q10, [x0, #(1*(1024/8))] // ......................................................................'...*...................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ......................................................................'.*........................................................................ + // ldr q12, [x0, #(3*(1024/8))] // ....e.................................................................'.........~................................................................ + // ldr q13, [x0, #(4*(1024/8))] // e.....................................................................'.....~.................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .~....................................................................'......*................................................................... + // ldr q15, [x0, #(6*(1024/8))] // ......................................................................'..*....................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ...e..................................................................'........~................................................................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................................................e..........'................................................................~......... + // mul v24.4s, v13.4s, v0.s[0] // ..............................................................e.......'...................................................................~...... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................e...'.......................................................................~.. 
+ // sub v13.4s, v9.4s, v24.4s // .....~................................................................'..........*............................................................... + // add v9.4s, v9.4s, v24.4s // ......~...............................................................'...........*.............................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..........~...........................................................'...............*.......................................................... + // mul v24.4s, v14.4s, v0.s[0] // ............~.........................................................'.................*........................................................ + // mls v24.4s, v27.4s, v8.s[0] // .............~........................................................'..................*....................................................... + // sub v14.4s, v10.4s, v24.4s // ...................~..................................................'........................*................................................. + // add v10.4s, v10.4s, v24.4s // ..................~...................................................'.......................*.................................................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ........~.............................................................'.............*............................................................ + // mul v24.4s, v15.4s, v0.s[0] // .........~............................................................'..............*........................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........~..........................................................'................*......................................................... 
+ // sub v15.4s, v11.4s, v24.4s // ..............~.......................................................'...................*...................................................... + // add v11.4s, v11.4s, v24.4s // ...............~......................................................'....................*..................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................e....................................................'......................~................................................... + // mul v24.4s, v16.4s, v0.s[0] // ....................e.................................................'.........................~................................................ + // mls v24.4s, v27.4s, v8.s[0] // ......................................................e...............'...........................................................~.............. + // sub v16.4s, v12.4s, v24.4s // ......................................................................'*......................................................................... + // add v12.4s, v12.4s, v24.4s // .............................................................e........'..................................................................~....... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ........................~.............................................'.............................*............................................ + // mul v24.4s, v11.4s, v0.s[2] // .............................~........................................'..................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................~.......................................'...................................*...................................... 
+ // sub v11.4s, v9.4s, v24.4s // ...................................~..................................'........................................*................................. + // add v9.4s, v9.4s, v24.4s // ....................................~.................................'.........................................*................................ + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .................................................................e....'......................................................................~... + // mul v24.4s, v12.4s, v0.s[2] // .....................................................................e'.......................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................'....*..................................................................... + // sub v12.4s, v10.4s, v24.4s // ......................~...............................................'...........................*.............................................. + // add v10.4s, v10.4s, v24.4s // .......................~..............................................'............................*............................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .....................~................................................'..........................*............................................... + // mul v24.4s, v15.4s, v1.s[0] // ...........................~..........................................'................................*......................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................~.........................................'.................................*........................................ 
+ // sub v15.4s, v13.4s, v24.4s // ...............................~......................................'....................................*..................................... + // add v13.4s, v13.4s, v24.4s // ................................~.....................................'.....................................*.................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ..~...................................................................'.......*.................................................................. + // mul v24.4s, v16.4s, v1.s[0] // .......~..............................................................'............*............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................~.....................................................'.....................*.................................................... + // sub v16.4s, v14.4s, v24.4s // .........................~............................................'..............................*........................................... + // add v14.4s, v14.4s, v24.4s // ..........................~...........................................'...............................*.......................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................................~....................................'......................................*................................... + // mul v24.4s, v10.4s, v1.s[2] // ..................................~...................................'.......................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................~...............................'...........................................*.............................. 
+ // sub v10.4s, v9.4s, v24.4s // .........................................~............................'..............................................*........................... + // add v9.4s, v9.4s, v24.4s // ..........................................~...........................'...............................................*.......................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .....................................~................................'..........................................*............................... + // mul v24.4s, v12.4s, v2.s[0] // .......................................~..............................'............................................*............................. + // mls v24.4s, v27.4s, v8.s[0] // ........................................~.............................'.............................................*............................ + // sub v12.4s, v11.4s, v24.4s // ...............................................~......................'....................................................*..................... + // add v11.4s, v11.4s, v24.4s // ................................................~.....................'.....................................................*.................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...........................................~..........................'................................................*......................... + // mul v24.4s, v14.4s, v2.s[2] // ............................................~.........................'.................................................*........................ + // mls v24.4s, v27.4s, v8.s[0] // ..................................................~...................'.......................................................*.................. 
+ // sub v14.4s, v13.4s, v24.4s // .......................................................~..............'............................................................*............. + // add v13.4s, v13.4s, v24.4s // ........................................................~.............'.............................................................*............ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................................................~....................'......................................................*................... + // mul v24.4s, v16.4s, v3.s[0] // .....................................................~................'..........................................................*............... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................~............'..............................................................*........... + // sub v16.4s, v15.4s, v24.4s // ...............................................................~......'....................................................................*..... + // add v15.4s, v15.4s, v24.4s // ................................................................~.....'.....................................................................*.... + // str q9, [x0], #(16) // ..............................................~.......................'...................................................*...................... + // str q10, [x0, #(-16 + 1*(1024/8))] // .............................................~........................'..................................................*....................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ....................................................~.................'.........................................................*................ 
+ // str q12, [x0, #(-16 + 3*(1024/8))] // ...................................................~..................'........................................................*................. + // str q13, [x0, #(-16 + 4*(1024/8))] // ............................................................~.........'.................................................................*........ + // str q14, [x0, #(-16 + 5*(1024/8))] // ..........................................................~...........'...............................................................*.......... + // str q15, [x0, #(-16 + 6*(1024/8))] // ....................................................................~.'.........................................................................* + // str q16, [x0, #(-16 + 7*(1024/8))] // ...................................................................~..'........................................................................*. + + sub count, count, #1 cbnz count, layer123_start - ldr_vo v4, x0, 640 // ......*......................................................... - ldr_vo v27, x0, 0 // *............................................................... - sub v28.4S, v6.4S, v28.4S // .*.............................................................. - ldr_vo v24, x0, 128 // ....*........................................................... - mls v9.4S, v26.4S, v8.S[0] // .....*.......................................................... - ldr_vo v31, x0, 768 // ...*............................................................ - ldr_vo v7, x0, 256 // ..*............................................................. - // gap // ................................................................ - // gap // ................................................................ - mul v18.4S, v28.4S, v1.S[0] // .......*........................................................ - // gap // ................................................................ 
- // gap // ................................................................ - sub v13.4S, v27.4S, v22.4S // ........*....................................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v19.4S, v28.4S, v1.S[1] // ..........*..................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v12.4S, v31.4S, v0.S[1] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v5.4S, v4.4S, v0.S[1] // .............*.................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v21.4S, v31.4S, v0.S[0] // ............*................................................... - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v21.4S, v12.4S, v8.S[0] // ..............*................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v15.4S, v4.4S, v0.S[0] // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v15.4S, v5.4S, v8.S[0] // ................*............................................... - // gap // ................................................................ - // gap // ................................................................ - add v12.4S, v7.4S, v21.4S // ..................*............................................. - // gap // ................................................................ - // gap // ................................................................ - mls v18.4S, v19.4S, v8.S[0] // ...................*............................................ - sub v31.4S, v7.4S, v21.4S // .................*.............................................. 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v4.4S, v12.4S, v0.S[3] // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - add v28.4S, v24.4S, v15.4S // ....................*........................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v19.4S, v31.4S, v1.S[1] // ......................*......................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v16.4S, v31.4S, v1.S[0] // ............................*................................... - sub v30.4S, v28.4S, v9.4S // .......................*........................................ - // gap // ................................................................ - add v31.4S, v28.4S, v9.4S // ........................*....................................... - // gap // ................................................................ - // gap // ................................................................ - sub v28.4S, v24.4S, v15.4S // .....................*.......................................... 
- mul v25.4S, v12.4S, v0.S[2] // ..............................*................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v25.4S, v4.4S, v8.S[0] // ...............................*................................ - // gap // ................................................................ - // gap // ................................................................ - add v10.4S, v28.4S, v18.4S // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ - sub v26.4S, v28.4S, v18.4S // ..........................*..................................... - sqrdmulh v12.4S, v30.4S, v2.S[1] // .......................................*........................ - // gap // ................................................................ - add v28.4S, v27.4S, v22.4S // .........*...................................................... - // gap // ................................................................ - // gap // ................................................................ - mul v17.4S, v30.4S, v2.S[0] // ......................................*......................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- mul v30.4S, v31.4S, v1.S[2] // ..................................*............................. - sub v24.4S, v28.4S, v25.4S // ....................................*........................... - // gap // ................................................................ - add v15.4S, v28.4S, v25.4S // .....................................*.......................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v28.4S, v31.4S, v1.S[3] // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v16.4S, v19.4S, v8.S[0] // .............................*.................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v4.4S, v10.4S, v2.S[2] // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- mls v30.4S, v28.4S, v8.S[0] // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v14.4S, v10.4S, v2.S[3] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v27.4S, v26.4S, v3.S[1] // ..................................................*............. - // gap // ................................................................ - // gap // ................................................................ - sub v28.4S, v15.4S, v30.4S // ...........................................*.................... - // gap // ................................................................ - // gap // ................................................................ - mls v17.4S, v12.4S, v8.S[0] // ..........................................*..................... - add v15.4S, v15.4S, v30.4S // .............................................*.................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- mls v4.4S, v14.4S, v8.S[0] // ....................................................*........... - str_vo v28, x0, 128 // ..............................................*................. - add v28.4S, v13.4S, v16.4S // .................................*.............................. - str_vi v15, x0, 16 // ................................................*............... - // gap // ................................................................ - // gap // ................................................................ - mul v15.4S, v26.4S, v3.S[0] // ...............................................*................ - sub v26.4S, v13.4S, v16.4S // ................................*............................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v15.4S, v27.4S, v8.S[0] // .......................................................*........ - // gap // ................................................................ - // gap // ................................................................ - sub v5.4S, v28.4S, v4.4S // ........................................................*....... - // gap // ................................................................ - // gap // ................................................................ - add v27.4S, v28.4S, v4.4S // .........................................................*...... - sub v12.4S, v24.4S, v17.4S // .................................................*.............. - // gap // ................................................................ - add v28.4S, v24.4S, v17.4S // ...................................................*............ - // gap // ................................................................ 
- // gap // ................................................................ - str_vo v5, x0, 624 // ..........................................................*..... - // gap // ................................................................ - // gap // ................................................................ - sub v18.4S, v26.4S, v15.4S // ............................................................*... - add v16.4S, v26.4S, v15.4S // .............................................................*.. - str_vo v12, x0, 368 // .....................................................*.......... - str_vo v28, x0, 240 // ......................................................*......... - // gap // ................................................................ - // gap // ................................................................ - str_vo v27, x0, 496 // ...........................................................*.... - // gap // ................................................................ - // gap // ................................................................ - str_vo v18, x0, 880 // ..............................................................*. - str_vo v16, x0, 752 // ...............................................................* - // gap // ................................................................ - - // original source code - // ldr_vo v23, x0, 0 // .*.............................................................. || *............................................................... - // sub v28.4S, v6.4S, v28.4S // ..*............................................................. || *............................................................... - // ldr_vo v10, x0, 256 // ......*......................................................... || ..*............................................................. - // ldr_vo v27, x0, 768 // .....*.......................................................... 
|| .*.............................................................. - // ldr_vo v25, x0, 128 // ...*............................................................ || .*.............................................................. - // mls v9.4S, v26.4S, v8.S[0] // ....*........................................................... || .*.............................................................. - // ldr_vo v26, x0, 640 // *............................................................... || *............................................................... - // mul v29.4S, v28.4S, v1.S[0] // .......*........................................................ || ...*............................................................ - // sub v17.4S, v23.4S, v22.4S // ........*....................................................... || ....*........................................................... - // add v23.4S, v23.4S, v22.4S // ...............................*................................ || ................................*............................... - // sqrdmulh v28.4S, v28.4S, v1.S[1] // .........*...................................................... || .....*.......................................................... - // sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........*..................................................... || .......*........................................................ - // mul v27.4S, v27.4S, v0.S[0] // ............*................................................... || ...........*.................................................... - // sqrdmulh v18.4S, v26.4S, v0.S[1] // ...........*.................................................... || .........*...................................................... - // mls v27.4S, v22.4S, v8.S[0] // .............*.................................................. || .............*.................................................. 
- // mul v26.4S, v26.4S, v0.S[0] // ..............*................................................. || ...............*................................................ - // mls v26.4S, v18.4S, v8.S[0] // ...............*................................................ || .................*.............................................. - // sub v22.4S, v10.4S, v27.4S // ..................*............................................. || ...................*............................................ - // add v10.4S, v10.4S, v27.4S // ................*............................................... || ..................*............................................. - // mls v29.4S, v28.4S, v8.S[0] // .................*.............................................. || ...................*............................................ - // add v27.4S, v25.4S, v26.4S // ....................*........................................... || ......................*......................................... - // sub v25.4S, v25.4S, v26.4S // .........................*...................................... || ...........................*.................................... - // sqrdmulh v5.4S, v22.4S, v1.S[1] // .....................*.......................................... || .......................*........................................ - // sub v18.4S, v27.4S, v9.4S // .......................*........................................ || .........................*...................................... - // add v27.4S, v27.4S, v9.4S // ........................*....................................... || ..........................*..................................... - // sqrdmulh v9.4S, v10.4S, v0.S[3] // ...................*............................................ || .....................*.......................................... - // sub v11.4S, v25.4S, v29.4S // .............................*.................................. 
|| ...............................*................................ - // add v25.4S, v25.4S, v29.4S // ............................*................................... || ..............................*................................. - // mul v29.4S, v22.4S, v1.S[0] // ......................*......................................... || .........................*...................................... - // mls v29.4S, v5.4S, v8.S[0] // .....................................*.......................... || .......................................*........................ - // mul v10.4S, v10.4S, v0.S[2] // ..........................*..................................... || ...........................*.................................... - // mls v10.4S, v9.4S, v8.S[0] // ...........................*.................................... || .............................*.................................. - // sub v9.4S, v17.4S, v29.4S // ..................................................*............. || .....................................................*.......... - // add v29.4S, v17.4S, v29.4S // ...............................................*................ || ...................................................*............ - // mul v5.4S, v27.4S, v1.S[2] // .................................*.............................. || ...................................*............................ - // sqrdmulh v27.4S, v27.4S, v1.S[3] // ....................................*........................... || .....................................*.......................... - // sub v17.4S, v23.4S, v10.4S // ..................................*............................. || ...................................*............................ - // add v23.4S, v23.4S, v10.4S // ...................................*............................ || ....................................*........................... 
- // mul v10.4S, v18.4S, v2.S[0] // ................................*............................... || .................................*.............................. - // sqrdmulh v22.4S, v18.4S, v2.S[1] // ..............................*................................. || ...............................*................................ - // mls v5.4S, v27.4S, v8.S[0] // .......................................*........................ || ...........................................*.................... - // mul v27.4S, v25.4S, v2.S[2] // ......................................*......................... || .........................................*...................... - // mls v10.4S, v22.4S, v8.S[0] // ...........................................*.................... || .................................................*.............. - // sub v22.4S, v23.4S, v5.4S // ..........................................*..................... || ................................................*............... - // sqrdmulh v25.4S, v25.4S, v2.S[3] // ........................................*....................... || .............................................*.................. - // add v23.4S, v23.4S, v5.4S // ............................................*................... || .................................................*.............. - // str_vo v22, x0, 128 // ..............................................*................. || ...................................................*............ - // mul v5.4S, v11.4S, v3.S[0] // .................................................*.............. || .....................................................*.......... - // str_vi v23, x0, 16 // ................................................*............... || ....................................................*........... - // sub v23.4S, v17.4S, v10.4S // ......................................................*......... 
|| .........................................................*...... - // sqrdmulh v22.4S, v11.4S, v3.S[1] // .........................................*...................... || ...............................................*................ - // add v10.4S, v17.4S, v10.4S // .......................................................*........ || ..........................................................*..... - // mls v27.4S, v25.4S, v8.S[0] // .............................................*.................. || ...................................................*............ - // str_vo v23, x0, 368 // ...........................................................*.... || ............................................................*... - // str_vo v10, x0, 240 // ............................................................*... || .............................................................*.. - // mls v5.4S, v22.4S, v8.S[0] // ...................................................*............ || .......................................................*........ - // sub v23.4S, v29.4S, v27.4S // ....................................................*........... || ........................................................*....... - // add v10.4S, v29.4S, v27.4S // .....................................................*.......... || .........................................................*...... - // str_vo v23, x0, 624 // ........................................................*....... || ...........................................................*.... - // str_vo v10, x0, 496 // .............................................................*.. || ..............................................................*. - // sub v23.4S, v9.4S, v5.4S // .........................................................*...... || ............................................................*... - // add v10.4S, v9.4S, v5.4S // ..........................................................*..... 
|| ............................................................*... - // str_vo v23, x0, 880 // ..............................................................*. || ...............................................................* - // str_vo v10, x0, 752 // ...............................................................* || ...............................................................* - + // Instructions: 64 + // Expected cycles: 64 + // Expected IPC: 1.00 + // + // Wall time: 1.74s + // User time: 1.74s + // + // ---------------------- original position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + ldr q19, [x0, #768] // ...*............................................................ + sub v20.4S, v12.4S, v13.4S // .*.............................................................. + mls v30.4S, v6.4S, v8.S[0] // .....*.......................................................... + ldr q18, [x0, #0] // *............................................................... + ldr q11, [x0, #256] // ..*............................................................. + // gap // ................................................................ + ldr q22, [x0, #640] // ......*......................................................... + ldr q13, [x0, #128] // ....*........................................................... + // gap // ................................................................ + sqrdmulh v6.4S, v20.4S, v1.S[1] // .......*........................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ sub v17.4S, v18.4S, v26.4S // ........*....................................................... + sqrdmulh v27.4S, v19.4S, v0.S[1] // ...........*.................................................... + // gap // ................................................................ + add v18.4S, v18.4S, v26.4S // .........*...................................................... + // gap // ................................................................ + // gap // ................................................................ + mul v19.4S, v19.4S, v0.S[0] // ............*................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v20.4S, v20.4S, v1.S[0] // ..........*..................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v12.4S, v22.4S, v0.S[1] // .............*.................................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ mls v19.4S, v27.4S, v8.S[0] // ..............*................................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v22.4S, v22.4S, v0.S[0] // ...............*................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v22.4S, v12.4S, v8.S[0] // ................*............................................... + // gap // ................................................................ + // gap // ................................................................ + sub v27.4S, v11.4S, v19.4S // .................*.............................................. + // gap // ................................................................ + // gap // ................................................................ + add v19.4S, v11.4S, v19.4S // ..................*............................................. + mls v20.4S, v6.4S, v8.S[0] // ...................*............................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ sqrdmulh v11.4S, v27.4S, v1.S[1] // ......................*......................................... + // gap // ................................................................ + // gap // ................................................................ + add v6.4S, v13.4S, v22.4S // ....................*........................................... + // gap // ................................................................ + // gap // ................................................................ + sub v22.4S, v13.4S, v22.4S // .....................*.......................................... + sqrdmulh v13.4S, v19.4S, v0.S[3] // .........................*...................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v27.4S, v27.4S, v1.S[0] // ............................*................................... + sub v12.4S, v6.4S, v30.4S // .......................*........................................ + // gap // ................................................................ + add v30.4S, v6.4S, v30.4S // ........................*....................................... + // gap // ................................................................ + // gap // ................................................................ + mls v27.4S, v11.4S, v8.S[0] // .............................*.................................. + sub v11.4S, v22.4S, v20.4S // ..........................*..................................... + // gap // ................................................................ + add v20.4S, v22.4S, v20.4S // ...........................*.................................... + // gap // ................................................................ 
+ // gap // ................................................................ + mul v19.4S, v19.4S, v0.S[2] // ..............................*................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v19.4S, v13.4S, v8.S[0] // ...............................*................................ + // gap // ................................................................ + // gap // ................................................................ + sub v22.4S, v17.4S, v27.4S // ................................*............................... + // gap // ................................................................ + // gap // ................................................................ + add v13.4S, v17.4S, v27.4S // .................................*.............................. + sqrdmulh v6.4S, v30.4S, v1.S[3] // ..................................*............................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v30.4S, v30.4S, v1.S[2] // ...................................*............................ + // gap // ................................................................ + // gap // ................................................................ + sub v27.4S, v18.4S, v19.4S // ....................................*........................... + // gap // ................................................................ 
+ // gap // ................................................................ + add v18.4S, v18.4S, v19.4S // .....................................*.......................... + sqrdmulh v19.4S, v12.4S, v2.S[1] // ......................................*......................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v30.4S, v6.4S, v8.S[0] // .......................................*........................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v6.4S, v12.4S, v2.S[0] // ........................................*....................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v6.4S, v19.4S, v8.S[0] // .........................................*...................... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v18.4S, v30.4S // ..........................................*..................... + // gap // ................................................................ 
+ // gap // ................................................................ + add v18.4S, v18.4S, v30.4S // ...........................................*.................... + sqrdmulh v30.4S, v20.4S, v2.S[3] // ............................................*................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v20.4S, v20.4S, v2.S[2] // .............................................*.................. + str q19, [x0, #128] // ..............................................*................. + // gap // ................................................................ + str q18, [x0], #(16) // ...............................................*................ + add v18.4S, v27.4S, v6.4S // .................................................*.............. + // gap // ................................................................ + sqrdmulh v19.4S, v11.4S, v3.S[1] // ..................................................*............. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v20.4S, v30.4S, v8.S[0] // ...................................................*............ + str q18, [x0, #240] // .....................................................*.......... + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + mul v18.4S, v11.4S, v3.S[0] // ......................................................*......... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v18.4S, v19.4S, v8.S[0] // .........................................................*...... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v13.4S, v20.4S // .......................................................*........ + // gap // ................................................................ + // gap // ................................................................ + add v20.4S, v13.4S, v20.4S // ........................................................*....... + sub v11.4S, v27.4S, v6.4S // ................................................*............... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q19, [x0, #624] // ..........................................................*..... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v22.4S, v18.4S // ............................................................*... 
+ add v18.4S, v22.4S, v18.4S // .............................................................*.. + str q11, [x0, #368] // ....................................................*........... + str q20, [x0, #496] // ...........................................................*.... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q19, [x0, #880] // ..............................................................*. + str q18, [x0, #752] // ...............................................................* + // gap // ................................................................ + + // ------------------------ new position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + // ldr q18, [x0, #0] // ...*............................................................ + // sub v20.4S, v12.4S, v13.4S // .*.............................................................. + // ldr q19, [x0, #256] // ....*........................................................... + // ldr q11, [x0, #768] // *............................................................... + // ldr q22, [x0, #128] // ......*......................................................... + // mls v30.4S, v6.4S, v8.S[0] // ..*............................................................. + // ldr q13, [x0, #640] // .....*.......................................................... + // sqrdmulh v27.4S, v20.4S, v1.S[1] // .......*........................................................ + // sub v14.4S, v18.4S, v26.4S // ........*....................................................... 
+ // add v18.4S, v18.4S, v26.4S // ..........*..................................................... + // mul v20.4S, v20.4S, v1.S[0] // ............*................................................... + // sqrdmulh v26.4S, v11.4S, v0.S[1] // .........*...................................................... + // mul v11.4S, v11.4S, v0.S[0] // ...........*.................................................... + // sqrdmulh v31.4S, v13.4S, v0.S[1] // .............*.................................................. + // mls v11.4S, v26.4S, v8.S[0] // ..............*................................................. + // mul v13.4S, v13.4S, v0.S[0] // ...............*................................................ + // mls v13.4S, v31.4S, v8.S[0] // ................*............................................... + // sub v26.4S, v19.4S, v11.4S // .................*.............................................. + // add v19.4S, v19.4S, v11.4S // ..................*............................................. + // mls v20.4S, v27.4S, v8.S[0] // ...................*............................................ + // add v27.4S, v22.4S, v13.4S // .....................*.......................................... + // sub v22.4S, v22.4S, v13.4S // ......................*......................................... + // sqrdmulh v17.4S, v26.4S, v1.S[1] // ....................*........................................... + // sub v31.4S, v27.4S, v30.4S // .........................*...................................... + // add v30.4S, v27.4S, v30.4S // ..........................*..................................... + // sqrdmulh v27.4S, v19.4S, v0.S[3] // .......................*........................................ + // sub v25.4S, v22.4S, v20.4S // ............................*................................... + // add v20.4S, v22.4S, v20.4S // .............................*.................................. 
+ // mul v22.4S, v26.4S, v1.S[0] // ........................*....................................... + // mls v22.4S, v17.4S, v8.S[0] // ...........................*.................................... + // mul v19.4S, v19.4S, v0.S[2] // ..............................*................................. + // mls v19.4S, v27.4S, v8.S[0] // ...............................*................................ + // sub v27.4S, v14.4S, v22.4S // ................................*............................... + // add v22.4S, v14.4S, v22.4S // .................................*.............................. + // sqrdmulh v17.4S, v30.4S, v1.S[3] // ..................................*............................. + // mul v30.4S, v30.4S, v1.S[2] // ...................................*............................ + // sub v14.4S, v18.4S, v19.4S // ....................................*........................... + // add v18.4S, v18.4S, v19.4S // .....................................*.......................... + // sqrdmulh v19.4S, v31.4S, v2.S[1] // ......................................*......................... + // mls v30.4S, v17.4S, v8.S[0] // .......................................*........................ + // mul v17.4S, v31.4S, v2.S[0] // ........................................*....................... + // mls v17.4S, v19.4S, v8.S[0] // .........................................*...................... + // sub v19.4S, v18.4S, v30.4S // ..........................................*..................... + // add v18.4S, v18.4S, v30.4S // ...........................................*.................... + // sqrdmulh v30.4S, v20.4S, v2.S[3] // ............................................*................... + // mul v20.4S, v20.4S, v2.S[2] // .............................................*.................. + // str q19, [x0, #128] // ..............................................*................. 
+ // str q18, [x0], #(16) // ...............................................*................ + // sub v18.4S, v14.4S, v17.4S // ........................................................*....... + // add v19.4S, v14.4S, v17.4S // ................................................*............... + // sqrdmulh v17.4S, v25.4S, v3.S[1] // .................................................*.............. + // mls v20.4S, v30.4S, v8.S[0] // ..................................................*............. + // str q18, [x0, #368] // ............................................................*... + // str q19, [x0, #240] // ...................................................*............ + // mul v18.4S, v25.4S, v3.S[0] // ....................................................*........... + // sub v19.4S, v22.4S, v20.4S // ......................................................*......... + // add v20.4S, v22.4S, v20.4S // .......................................................*........ + // mls v18.4S, v17.4S, v8.S[0] // .....................................................*.......... + // str q19, [x0, #624] // .........................................................*...... + // str q20, [x0, #496] // .............................................................*.. + // sub v11.4S, v27.4S, v18.4S // ..........................................................*..... + // add v18.4S, v27.4S, v18.4S // ...........................................................*.... + // str q11, [x0, #880] // ..............................................................*. 
+ // str q18, [x0, #752] // ...............................................................* + restore inp, STACK0 add inpp, inp, #64 @@ -1027,339 +1034,1199 @@ layer123_start: qform_root3_tw .req q7 .p2align 2 - ldr_vo v7, x2, 0 - ldr_vi v19, x4, 64 - ldr_vo v25, x4, -48 - ldr_vo v9, x1, 16 - ldr_vo v17, x1, 48 - ldr_vo v26, x2, 48 - ldr_vo v5, x1, 0 - sqrdmulh v3.4S, v7.4S, v19.S[1] - ldr_vo v23, x2, 32 - ldr_vo v29, x1, 32 - ldr_vo v13, x2, 16 - mul v30.4S, v7.4S, v19.S[0] - sqrdmulh v20.4S, v26.4S, v19.S[1] - mul v1.4S, v26.4S, v19.S[0] - sqrdmulh v15.4S, v23.4S, v19.S[1] - mls v1.4S, v20.4S, v8.S[0] - mul v27.4S, v23.4S, v19.S[0] - mls v27.4S, v15.4S, v8.S[0] - add v20.4S, v17.4S, v1.4S - mul v18.4S, v13.4S, v19.S[0] - sqrdmulh v28.4S, v13.4S, v19.S[1] - add v24.4S, v29.4S, v27.4S - mul v7.4S, v20.4S, v19.S[2] - mul v26.4S, v24.4S, v19.S[2] - sqrdmulh v14.4S, v24.4S, v19.S[3] - sqrdmulh v19.4S, v20.4S, v19.S[3] - mls v18.4S, v28.4S, v8.S[0] - mls v26.4S, v14.4S, v8.S[0] - mls v7.4S, v19.4S, v8.S[0] - sub v19.4S, v17.4S, v1.4S - sub v15.4S, v9.4S, v18.4S - mls v30.4S, v3.4S, v8.S[0] - add v16.4S, v9.4S, v18.4S - ldr_vo v17, x4, -32 - mul v18.4S, v19.4S, v25.S[0] - add v4.4S, v16.4S, v7.4S - sub v23.4S, v16.4S, v7.4S - sqrdmulh v21.4S, v19.4S, v25.S[1] - sub v22.4S, v5.4S, v30.4S - add v20.4S, v5.4S, v30.4S - mul v19.4S, v4.4S, v25.S[2] - sub v7.4S, v29.4S, v27.4S - sqrdmulh v27.4S, v4.4S, v25.S[3] - ldr_vo v5, x4, -16 - add v2.4S, v20.4S, v26.4S - ldr_vo v10, x5, 16 - ldr_vi v6, x5, 192 - sub v20.4S, v20.4S, v26.4S - sqrdmulh v31.4S, v23.4S, v17.S[1] - ldr_vo v11, x5, -160 - ldr_vo v30, x5, -144 - ldr_vo v9, x5, -128 - mul v23.4S, v23.4S, v17.S[0] - ldr_vo v13, x5, -112 - ldr_vo v0, x5, -96 - ldr_vo v4, x5, -80 - mls v18.4S, v21.4S, v8.S[0] - mls v19.4S, v27.4S, v8.S[0] - mls v23.4S, v31.4S, v8.S[0] - add v27.4S, v15.4S, v18.4S - sub v31.4S, v15.4S, v18.4S - sqrdmulh v18.4S, v7.4S, v25.S[1] - sub v14.4S, v2.4S, v19.4S - add v19.4S, v2.4S, v19.4S - sqrdmulh 
v15.4S, v27.4S, v17.S[3] - sub v2.4S, v20.4S, v23.4S - mul v27.4S, v27.4S, v17.S[2] - add v20.4S, v20.4S, v23.4S - trn1 v23.4S, v19.4S, v14.4S - mul v7.4S, v7.4S, v25.S[0] - trn2 v19.4S, v19.4S, v14.4S - trn1 v14.4S, v20.4S, v2.4S - mls v7.4S, v18.4S, v8.S[0] - trn2 v2.4S, v20.4S, v2.4S - trn2 v20.2D, v23.2D, v14.2D - sqrdmulh v18.4S, v31.4S, v5.S[1] - trn1 v29.2D, v19.2D, v2.2D - mul v31.4S, v31.4S, v5.S[0] - trn2 v19.2D, v19.2D, v2.2D - sub v21.4S, v22.4S, v7.4S - add v7.4S, v22.4S, v7.4S - sqrdmulh v22.4S, v20.4S, v10.4S + // Instructions: 118 + // Expected cycles: 107 + // Expected IPC: 1.10 + // + // Wall time: 112.51s + // User time: 112.51s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ldr q17, [x2, #0] // *..................................................................................................................... + ldr q3, [x4], #64 // .*.................................................................................................................... + // gap // ...................................................................................................................... + ldr q19, [x5, #128] // ........................................................................*............................................. + ldr q25, [x5, #176] // ......................................................................................*............................... + // gap // ...................................................................................................................... + ldr q2, [x2, #32] // .........*............................................................................................................ 
+ ldr q10, [x2, #16] // ....*................................................................................................................. + // gap // ...................................................................................................................... + ldr q15, [x2, #48] // ......*............................................................................................................... + ldr q20, [x4, #-16] // ........*............................................................................................................. + // gap // ...................................................................................................................... + ldr q31, [x1, #0] // ..................................*................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v18.4S, v17.4S, v3.S[1] // ............................*......................................................................................... + ldr q26, [x1, #16] // ....................*................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v27.4S, v17.4S, v3.S[0] // .......*.............................................................................................................. + ldr q24, [x1, #32] // .....................*................................................................................................ + // gap // ...................................................................................................................... 
+ ldr q5, [x1, #48] // .....*................................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v22.4S, v2.4S, v3.S[1] // .............*........................................................................................................ + ldr q29, [x4, #-32] // .....................................*................................................................................ + // gap // ...................................................................................................................... + ldr q7, [x4, #-48] // ...*.................................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v27.4S, v18.4S, v8.S[0] // ................................*..................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqrdmulh v21.4S, v15.4S, v3.S[1] // ...........*.......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v0.4S, v2.4S, v3.S[0] // ................*..................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v4.4S, v31.4S, v27.4S // ..............................................*....................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v0.4S, v22.4S, v8.S[0] // ...................*.................................................................................................. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v1.4S, v15.4S, v3.S[0] // ............*......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v1.4S, v21.4S, v8.S[0] // ..............*....................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ sub v14.4S, v24.4S, v0.4S // .........................*............................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.4S, v10.4S, v3.S[1] // ..........*........................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v12.4S, v10.4S, v3.S[0] // ...............*...................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v17.4S, v5.4S, v1.4S // .................*.................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + add v16.4S, v5.4S, v1.4S // ..................*................................................................................................... + sqrdmulh v5.4S, v14.4S, v7.S[1] // ..............................*....................................................................................... + // gap // ...................................................................................................................... + add v21.4S, v24.4S, v0.4S // ...........................*.......................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.4S, v17.4S, v7.S[1] // .......................*.............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v12.4S, v6.4S, v8.S[0] // ......................*............................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v22.4S, v17.4S, v7.S[0] // ........................*............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v28.4S, v16.4S, v3.S[2] // ...................................................*.................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ sub v0.4S, v26.4S, v12.4S // ..........................*........................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v12.4S, v26.4S, v12.4S // ...............................*...................................................................................... + mul v24.4S, v14.4S, v7.S[0] // ....................................*................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v14.4S, v16.4S, v3.S[3] // ........................................................*............................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mls v22.4S, v15.4S, v8.S[0] // .............................*........................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v24.4S, v5.4S, v8.S[0] // ......................................*............................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v17.4S, v21.4S, v3.S[2] // ...........................................................*.......................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v1.4S, v0.4S, v22.4S // .................................*.................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v28.4S, v14.4S, v8.S[0] // ................................................................*..................................................... + add v6.4S, v0.4S, v22.4S // ...................................*.................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v11.4S, v1.4S, v20.S[1] // ........................................*............................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.4S, v6.4S, v29.S[3] // .........................................*............................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v13.4S, v12.4S, v28.4S // ......................................................................*............................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v10.4S, v12.4S, v28.4S // .......................................................................*.............................................. + mul v30.4S, v1.4S, v20.S[0] // ............................................*......................................................................... + // gap // ...................................................................................................................... + sub v12.4S, v31.4S, v27.4S // .......................................*.............................................................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mls v30.4S, v11.4S, v8.S[0] // ..................................................*................................................................... + ldr q1, [x5, #160] // ..*................................................................................................................... + // gap // ...................................................................................................................... + ldr q11, [x5, #112] // ................................................*..................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v5.4S, v6.4S, v29.S[2] // ...........................................*.......................................................................... + sub v9.4S, v12.4S, v24.4S // ..........................................*........................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v5.4S, v15.4S, v8.S[0] // .................................................*.................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v16.4S, v9.4S, v30.4S // ......................................................*............................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v20.4S, v12.4S, v24.4S // .............................................*........................................................................ + sqrdmulh v24.4S, v13.4S, v29.S[1] // .............................................................................*........................................ + // gap // ...................................................................................................................... + sub v15.4S, v9.4S, v30.4S // .......................................................*.............................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v23.4S, v21.4S, v3.S[3] // .............................................................*........................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ add v22.4S, v20.4S, v5.4S // ....................................................*................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v2.4S, v20.4S, v5.4S // .....................................................*................................................................ + sqrdmulh v14.4S, v10.4S, v7.S[3] // ...........................................................................*.......................................... + // gap // ...................................................................................................................... + trn1 v6.4S, v16.4S, v15.4S // ............................................................*......................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v18.4S, v10.4S, v7.S[2] // ...................................................................................*.................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v21.4S, v22.4S, v2.4S // .........................................................*............................................................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mls v17.4S, v23.4S, v8.S[0] // .................................................................................*.................................... + trn2 v23.4S, v16.4S, v15.4S // ..........................................................*........................................................... + // gap // ...................................................................................................................... + trn1 v26.4S, v22.4S, v2.4S // ..............................................................*....................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v0.4S, v13.4S, v29.S[0] // ..........................................................................*........................................... + ldr q15, [x5, #96] // ...............................................*...................................................................... + // gap // ...................................................................................................................... + trn2 v20.2D, v21.2D, v23.2D // ...............................................................*...................................................... + ldr q13, [x5, #16] // ..................................................................................*................................... + // gap // ...................................................................................................................... + mls v0.4S, v24.4S, v8.S[0] // ................................................................................*..................................... 
+ trn1 v5.2D, v21.2D, v23.2D // .................................................................*.................................................... + // gap // ...................................................................................................................... + sub v9.4S, v4.4S, v17.4S // ........................................................................................*............................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v22.4S, v4.4S, v17.4S // .......................................................................................*.............................. + sqrdmulh v12.4S, v20.4S, v11.4S // ....................................................................*................................................. + // gap // ...................................................................................................................... + trn2 v29.2D, v26.2D, v6.2D // ...................................................................*.................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v2.4S, v20.4S, v15.4S // .....................................................................*................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v24.4S, v29.4S, v11.4S // .....................................................................................*................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v2.4S, v12.4S, v8.S[0] // .........................................................................*............................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mul v11.4S, v29.4S, v15.4S // ...............................................................................*...................................... + ldr q29, [x5, #144] // ....................................................................................*................................. + add v28.4S, v9.4S, v0.4S // .............................................................................................*........................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v18.4S, v14.4S, v8.S[0] // ..........................................................................................*........................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v15.4S, v5.4S, v2.4S // ............................................................................*......................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v11.4S, v24.4S, v8.S[0] // ............................................................................................*......................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v20.4S, v5.4S, v2.4S // .........................................................................................*............................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v29.4S, v15.4S, v29.4S // .................................................................................................*.................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v21.4S, v22.4S, v18.4S // ..............................................................................................*....................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v31.4S, v22.4S, v18.4S // ...............................................................................................*...................... + sqrdmulh v5.4S, v20.4S, v25.4S // .....................................................................................................*................ + // gap // ...................................................................................................................... 
+ trn1 v22.2D, v26.2D, v6.2D // ..................................................................*................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v10.4S, v20.4S, v1.4S // ..........................................................................................................*........... + sub v20.4S, v9.4S, v0.4S // ...........................................................................................*.......................... + // gap // ...................................................................................................................... + trn2 v4.4S, v21.4S, v31.4S // ....................................................................................................*................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v0.4S, v15.4S, v19.4S // ..............................................................................*....................................... + add v18.4S, v22.4S, v11.4S // ..................................................................................................*................... + // gap // ...................................................................................................................... + trn1 v15.4S, v21.4S, v31.4S // ...................................................................................................*.................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mls v10.4S, v5.4S, v8.S[0] // ...........................................................................................................*.......... + trn2 v27.4S, v28.4S, v20.4S // ......................................................................................................*............... + ldr q5, [x5], #(12*16) // .........................................................................................................*............ + trn1 v9.4S, v28.4S, v20.4S // .......................................................................................................*.............. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mls v0.4S, v29.4S, v8.S[0] // ........................................................................................................*............. + sub v19.4S, v22.4S, v11.4S // ................................................................................................*..................... + // gap // ...................................................................................................................... + trn2 v30.2D, v4.2D, v27.2D // ............................................................................................................*......... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v1.2D, v15.2D, v9.2D // ....................................................................................................................*. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v7.4S, v19.4S, v10.4S // ................................................................................................................*..... + add v19.4S, v19.4S, v10.4S // .................................................................................................................*.... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v6.4S, v18.4S, v0.4S // .............................................................................................................*........ + add v28.4S, v18.4S, v0.4S // ...............................................................................................................*...... + // gap // ...................................................................................................................... + mul v3.4S, v30.4S, v5.4S // ..............................................................................................................*....... + trn1 v25.4S, v19.4S, v7.4S // ...................................................................................................................*.. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v0.4S, v30.4S, v13.4S // .....................................................................................................................* + trn1 v12.4S, v28.4S, v6.4S // ..................................................................................................................*... + // gap // ...................................................................................................................... + + // --------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q10, [x2, #0] // *..................................................................................................................... + // ldr q2, [x4], #64 // .*.................................................................................................................... + // ldr q30, [x5, #160] // ....................................................*................................................................. + // ldr q24, [x4, #-48] // ................*..................................................................................................... + // ldr q17, [x2, #16] // .....*................................................................................................................ + // ldr q20, [x1, #48] // .............*........................................................................................................ 
+ // ldr q6, [x2, #48] // ......*............................................................................................................... + // mul v11.4S, v10.4S, v2.S[0] // ...........*.......................................................................................................... + // ldr q5, [x4, #-16] // .......*.............................................................................................................. + // ldr q21, [x2, #32] // ....*................................................................................................................. + // sqrdmulh v18.4S, v17.4S, v2.S[1] // .........................*............................................................................................ + // sqrdmulh v26.4S, v6.4S, v2.S[1] // ..................*................................................................................................... + // mul v28.4S, v6.4S, v2.S[0] // ......................*............................................................................................... + // sqrdmulh v15.4S, v21.4S, v2.S[1] // ..............*....................................................................................................... + // mls v28.4S, v26.4S, v8.S[0] // .......................*.............................................................................................. + // mul v17.4S, v17.4S, v2.S[0] // ..........................*........................................................................................... + // mul v14.4S, v21.4S, v2.S[0] // ...................*.................................................................................................. + // sub v16.4S, v20.4S, v28.4S // ...........................*.......................................................................................... + // add v26.4S, v20.4S, v28.4S // ............................*......................................................................................... 
+ // mls v14.4S, v15.4S, v8.S[0] // .....................*................................................................................................ + // ldr q28, [x1, #16] // ..........*........................................................................................................... + // ldr q21, [x1, #32] // ............*......................................................................................................... + // mls v17.4S, v18.4S, v8.S[0] // ................................*..................................................................................... + // sqrdmulh v15.4S, v16.4S, v24.S[1] // ...............................*...................................................................................... + // mul v18.4S, v16.4S, v24.S[0] // .................................*.................................................................................... + // sub v20.4S, v21.4S, v14.4S // ........................*............................................................................................. + // sub v22.4S, v28.4S, v17.4S // ...................................*.................................................................................. + // add v31.4S, v21.4S, v14.4S // ..............................*....................................................................................... + // sqrdmulh v10.4S, v10.4S, v2.S[1] // .........*............................................................................................................ + // mls v18.4S, v15.4S, v8.S[0] // .......................................*.............................................................................. + // sqrdmulh v14.4S, v20.4S, v24.S[1] // .............................*........................................................................................ 
+ // add v17.4S, v28.4S, v17.4S // ....................................*................................................................................. + // mls v11.4S, v10.4S, v8.S[0] // .................*.................................................................................................... + // sub v28.4S, v22.4S, v18.4S // ..........................................*........................................................................... + // ldr q7, [x1, #0] // ........*............................................................................................................. + // add v19.4S, v22.4S, v18.4S // ............................................*......................................................................... + // mul v15.4S, v20.4S, v24.S[0] // .....................................*................................................................................ + // ldr q22, [x4, #-32] // ...............*...................................................................................................... + // mls v15.4S, v14.4S, v8.S[0] // ........................................*............................................................................. + // sub v18.4S, v7.4S, v11.4S // ..................................................*................................................................... + // sqrdmulh v4.4S, v28.4S, v5.S[1] // .............................................*........................................................................ + // sqrdmulh v16.4S, v19.4S, v22.S[3] // ..............................................*....................................................................... + // sub v10.4S, v18.4S, v15.4S // .......................................................*.............................................................. 
+ // mul v27.4S, v19.4S, v22.S[2] // ......................................................*............................................................... + // mul v19.4S, v28.4S, v5.S[0] // .................................................*.................................................................... + // add v28.4S, v18.4S, v15.4S // ..........................................................*........................................................... + // add v5.4S, v7.4S, v11.4S // ....................*................................................................................................. + // ldr q7, [x5, #96] // ........................................................................*............................................. + // ldr q11, [x5, #112] // .....................................................*................................................................ + // mls v27.4S, v16.4S, v8.S[0] // ........................................................*............................................................. + // mls v19.4S, v4.4S, v8.S[0] // ...................................................*.................................................................. + // mul v16.4S, v26.4S, v2.S[2] // ..................................*................................................................................... + // add v18.4S, v28.4S, v27.4S // ..............................................................*....................................................... + // sub v27.4S, v28.4S, v27.4S // ...............................................................*...................................................... + // add v6.4S, v10.4S, v19.4S // .........................................................*............................................................ + // sub v10.4S, v10.4S, v19.4S // ............................................................*......................................................... 
+ // sqrdmulh v14.4S, v26.4S, v2.S[3] // ......................................*............................................................................... + // trn2 v26.4S, v18.4S, v27.4S // ...................................................................*.................................................. + // trn2 v28.4S, v6.4S, v10.4S // .....................................................................*................................................ + // mul v4.4S, v31.4S, v2.S[2] // .........................................*............................................................................ + // trn1 v15.4S, v6.4S, v10.4S // .................................................................*.................................................... + // sqrdmulh v10.4S, v31.4S, v2.S[3] // .............................................................*........................................................ + // trn1 v6.4S, v18.4S, v27.4S // ......................................................................*............................................... + // trn2 v27.2D, v26.2D, v28.2D // .........................................................................*............................................ + // mls v16.4S, v14.4S, v8.S[0] // ...........................................*.......................................................................... + // trn1 v19.2D, v26.2D, v28.2D // ............................................................................*......................................... + // trn1 v18.2D, v6.2D, v15.2D // ...............................................................................................*...................... + // trn2 v31.2D, v6.2D, v15.2D // ................................................................................*..................................... 
+ // sqrdmulh v15.4S, v27.4S, v11.4S // ...............................................................................*...................................... + // mul v6.4S, v27.4S, v7.4S // .................................................................................*.................................... + // sub v14.4S, v17.4S, v16.4S // ...............................................*...................................................................... + // add v17.4S, v17.4S, v16.4S // ................................................*..................................................................... + // ldr q16, [x5, #128] // ..*................................................................................................................... + // mls v6.4S, v15.4S, v8.S[0] // ...................................................................................*.................................. + // mul v27.4S, v14.4S, v22.S[0] // .......................................................................*.............................................. + // sqrdmulh v15.4S, v17.4S, v24.S[3] // ................................................................*..................................................... + // add v28.4S, v19.4S, v6.4S // ........................................................................................*............................. + // sqrdmulh v14.4S, v14.4S, v22.S[1] // ...........................................................*.......................................................... + // mul v22.4S, v28.4S, v16.4S // ...................................................................................................*.................. + // mul v16.4S, v31.4S, v7.4S // ....................................................................................*................................. 
+ // mls v27.4S, v14.4S, v8.S[0] // ...........................................................................*.......................................... + // mls v4.4S, v10.4S, v8.S[0] // ....................................................................*................................................. + // ldr q13, [x5, #16] // ..........................................................................*........................................... + // mul v24.4S, v17.4S, v24.S[2] // ..................................................................*................................................... + // ldr q1, [x5, #144] // .....................................................................................*................................ + // sqrdmulh v11.4S, v31.4S, v11.4S // ..................................................................................*................................... + // ldr q12, [x5, #176] // ...*.................................................................................................................. + // add v17.4S, v5.4S, v4.4S // ..............................................................................*....................................... + // sub v4.4S, v5.4S, v4.4S // .............................................................................*........................................ + // sub v6.4S, v19.4S, v6.4S // ..........................................................................................*........................... + // mls v24.4S, v15.4S, v8.S[0] // .......................................................................................*.............................. + // sub v23.4S, v4.4S, v27.4S // .................................................................................................*.................... + // mls v16.4S, v11.4S, v8.S[0] // .........................................................................................*............................ 
+ // add v7.4S, v4.4S, v27.4S // ......................................................................................*............................... + // add v3.4S, v17.4S, v24.4S // ............................................................................................*......................... + // sub v29.4S, v17.4S, v24.4S // .............................................................................................*........................ + // sub v26.4S, v18.4S, v16.4S // ...........................................................................................................*.......... + // sqrdmulh v5.4S, v28.4S, v1.4S // ...........................................................................................*.......................... + // add v17.4S, v18.4S, v16.4S // ....................................................................................................*................. + // trn1 v15.4S, v3.4S, v29.4S // .....................................................................................................*................ + // trn2 v4.4S, v3.4S, v29.4S // ..................................................................................................*................... + // sqrdmulh v12.4S, v6.4S, v12.4S // ..............................................................................................*....................... + // trn2 v27.4S, v7.4S, v23.4S // .......................................................................................................*.............. + // trn1 v9.4S, v7.4S, v23.4S // .........................................................................................................*............ + // mls v22.4S, v5.4S, v8.S[0] // ..........................................................................................................*........... + // ldr q5, [x5], #(12*16) // ........................................................................................................*............. 
+ // mul v11.4S, v6.4S, v30.4S // ................................................................................................*..................... + // mls v11.4S, v12.4S, v8.S[0] // ......................................................................................................*............... + // trn2 v29.2D, v4.2D, v27.2D // ............................................................................................................*......... + // sub v6.4S, v17.4S, v22.4S // ................................................................................................................*..... + // mul v3.4S, v29.4S, v5.4S // ..................................................................................................................*... + // add v28.4S, v17.4S, v22.4S // .................................................................................................................*.... + // sub v7.4S, v26.4S, v11.4S // ..............................................................................................................*....... + // add v19.4S, v26.4S, v11.4S // ...............................................................................................................*...... + // trn1 v12.4S, v28.4S, v6.4S // .....................................................................................................................* + // trn1 v25.4S, v19.4S, v7.4S // ...................................................................................................................*.. + // trn2 v1.2D, v15.2D, v9.2D // .............................................................................................................*........ + // sqrdmulh v0.4S, v29.4S, v13.4S // ....................................................................................................................*. 
+ sub count, count, #1 -.p2align 2 layer45678_start: - ldr_vo v26, x5, -16 - sqrdmulh v12.4S, v19.4S, v10.4S - trn1 v3.2D, v23.2D, v14.2D // gap(s) to follow - mul v5.4S, v19.4S, v6.4S // gap(s) to follow - mls v31.4S, v18.4S, v8.S[0] // gap(s) to follow - mls v5.4S, v12.4S, v8.S[0] // gap(s) to follow - mls v27.4S, v15.4S, v8.S[0] // gap(s) to follow - sub v17.4S, v21.4S, v31.4S // gap(s) to follow - add v12.4S, v21.4S, v31.4S - mul v31.4S, v20.4S, v6.4S // gap(s) to follow - sub v18.4S, v29.4S, v5.4S // gap(s) to follow - mls v31.4S, v22.4S, v8.S[0] - add v6.4S, v29.4S, v5.4S // gap(s) to follow - sub v28.4S, v7.4S, v27.4S // gap(s) to follow - add v16.4S, v7.4S, v27.4S - mul v20.4S, v18.4S, v9.4S // gap(s) to follow - trn2 v24.4S, v12.4S, v17.4S // gap(s) to follow - sqrdmulh v2.4S, v6.4S, v30.4S // gap(s) to follow - trn2 v29.4S, v16.4S, v28.4S // gap(s) to follow - sqrdmulh v1.4S, v18.4S, v13.4S - trn1 v7.4S, v12.4S, v17.4S // gap(s) to follow - add v19.4S, v3.4S, v31.4S // gap(s) to follow - mul v10.4S, v6.4S, v11.4S - trn2 v6.2D, v29.2D, v24.2D // gap(s) to follow - trn1 v5.4S, v16.4S, v28.4S - mls v10.4S, v2.4S, v8.S[0] // gap(s) to follow - sqrdmulh v16.4S, v6.4S, v4.4S // gap(s) to follow - trn2 v17.2D, v5.2D, v7.2D // gap(s) to follow - mul v25.4S, v6.4S, v0.4S // gap(s) to follow - add v27.4S, v19.4S, v10.4S // gap(s) to follow - sqrdmulh v12.4S, v17.4S, v4.4S // gap(s) to follow - mls v25.4S, v16.4S, v8.S[0] - trn1 v16.2D, v29.2D, v24.2D // gap(s) to follow - ldr_vo v28, x5, -48 // gap(s) to follow - mul v15.4S, v17.4S, v0.4S - trn1 v17.2D, v5.2D, v7.2D // gap(s) to follow - ldr_vo v4, x5, -32 // gap(s) to follow - mls v15.4S, v12.4S, v8.S[0] - sub v12.4S, v3.4S, v31.4S // gap(s) to follow - add v24.4S, v16.4S, v25.4S // gap(s) to follow - mls v20.4S, v1.4S, v8.S[0] - sub v7.4S, v16.4S, v25.4S - sqrdmulh v5.4S, v24.4S, v28.4S - ldr_vo v2, x5, -64 - sub v19.4S, v19.4S, v10.4S - ldr_vo v10, x2, 128 - ldr_vi v6, x4, 64 - sub v31.4S, v17.4S, v15.4S - add 
v22.4S, v17.4S, v15.4S - ldr_vo v25, x4, -48 - sqrdmulh v23.4S, v7.4S, v26.4S - sub v11.4S, v12.4S, v20.4S - ldr_vo v30, x1, 144 - add v20.4S, v12.4S, v20.4S - ldr_vo v9, x2, 176 - ldr_vo v18, x1, 176 - mul v2.4S, v24.4S, v2.4S - ldr_vo v0, x1, 128 - trn2 v13.4S, v27.4S, v19.4S - trn1 v19.4S, v27.4S, v19.4S - ldr_vo v27, x2, 160 - ldr_vo v29, x1, 160 - mul v7.4S, v7.4S, v4.4S - trn2 v4.4S, v20.4S, v11.4S - ldr_vo v14, x2, 144 - trn1 v20.4S, v20.4S, v11.4S - ldr_vo v17, x4, -32 // gap(s) to follow - mls v2.4S, v5.4S, v8.S[0] // gap(s) to follow - trn1 v5.2D, v13.2D, v4.2D // gap(s) to follow - trn2 v4.2D, v13.2D, v4.2D - sqrdmulh v11.4S, v10.4S, v6.S[1] // gap(s) to follow - trn2 v13.2D, v19.2D, v20.2D // gap(s) to follow - mul v10.4S, v10.4S, v6.S[0] - trn1 v19.2D, v19.2D, v20.2D - str_vo v5, x1, 16 - str_vo v4, x1, 48 - sub v5.4S, v22.4S, v2.4S // gap(s) to follow - add v2.4S, v22.4S, v2.4S - mls v7.4S, v23.4S, v8.S[0] - str_vo v13, x1, 32 - str_vi v19, x1, 128 // gap(s) to follow - sqrdmulh v19.4S, v9.4S, v6.S[1] // gap(s) to follow - mul v4.4S, v9.4S, v6.S[0] - trn1 v20.4S, v2.4S, v5.4S - ldr_vo v22, x4, -16 - trn2 v5.4S, v2.4S, v5.4S - ldr_vo v9, x5, 64 - ldr_vo v13, x5, 80 - mls v4.4S, v19.4S, v8.S[0] - sub v19.4S, v31.4S, v7.4S // gap(s) to follow - add v7.4S, v31.4S, v7.4S // gap(s) to follow - mls v10.4S, v11.4S, v8.S[0] - ldr_vo v11, x5, 32 // gap(s) to follow - trn1 v2.4S, v7.4S, v19.4S - sqrdmulh v31.4S, v27.4S, v6.S[1] // gap(s) to follow - add v23.4S, v18.4S, v4.4S // gap(s) to follow - sub v18.4S, v18.4S, v4.4S - mul v27.4S, v27.4S, v6.S[0] - ldr_vo v4, x5, 112 - sub v1.4S, v0.4S, v10.4S // gap(s) to follow - add v15.4S, v0.4S, v10.4S - sqrdmulh v21.4S, v14.4S, v6.S[1] - ldr_vo v10, x5, 16 - trn2 v24.2D, v20.2D, v2.2D - ldr_vo v0, x5, 96 // gap(s) to follow - mul v14.4S, v14.4S, v6.S[0] - trn1 v2.2D, v20.2D, v2.2D // gap(s) to follow - trn2 v19.4S, v7.4S, v19.4S // gap(s) to follow - mls v27.4S, v31.4S, v8.S[0] - str_vo v24, x2, 32 // gap(s) to follow 
- str_vi v2, x2, 128 // gap(s) to follow - mls v14.4S, v21.4S, v8.S[0] - trn1 v7.2D, v5.2D, v19.2D // gap(s) to follow - trn2 v19.2D, v5.2D, v19.2D // gap(s) to follow - mul v5.4S, v23.4S, v6.S[2] // gap(s) to follow - str_vo v7, x2, -112 - add v7.4S, v29.4S, v27.4S // gap(s) to follow - sub v27.4S, v29.4S, v27.4S - sqrdmulh v2.4S, v23.4S, v6.S[3] - str_vo v19, x2, -80 - sub v19.4S, v30.4S, v14.4S // gap(s) to follow - add v20.4S, v30.4S, v14.4S - mul v31.4S, v7.4S, v6.S[2] - ldr_vo v30, x5, 48 // gap(s) to follow - sqrdmulh v7.4S, v7.4S, v6.S[3] - ldr_vi v6, x5, 192 // gap(s) to follow - mls v5.4S, v2.4S, v8.S[0] // gap(s) to follow - mul v2.4S, v18.4S, v25.S[0] // gap(s) to follow - sqrdmulh v23.4S, v18.4S, v25.S[1] // gap(s) to follow - add v18.4S, v20.4S, v5.4S // gap(s) to follow - sub v5.4S, v20.4S, v5.4S - mls v31.4S, v7.4S, v8.S[0] // gap(s) to follow - mul v7.4S, v18.4S, v25.S[2] // gap(s) to follow - sqrdmulh v20.4S, v18.4S, v25.S[3] // gap(s) to follow - add v18.4S, v15.4S, v31.4S // gap(s) to follow - sub v31.4S, v15.4S, v31.4S - sqrdmulh v14.4S, v5.4S, v17.S[1] // gap(s) to follow - mul v5.4S, v5.4S, v17.S[0] // gap(s) to follow - mls v2.4S, v23.4S, v8.S[0] // gap(s) to follow - mls v7.4S, v20.4S, v8.S[0] // gap(s) to follow - mls v5.4S, v14.4S, v8.S[0] // gap(s) to follow - add v20.4S, v19.4S, v2.4S // gap(s) to follow - sub v19.4S, v19.4S, v2.4S - sqrdmulh v2.4S, v27.4S, v25.S[1] // gap(s) to follow - sub v14.4S, v18.4S, v7.4S // gap(s) to follow - mul v24.4S, v27.4S, v25.S[0] - add v7.4S, v18.4S, v7.4S // gap(s) to follow - sub v18.4S, v31.4S, v5.4S // gap(s) to follow - add v5.4S, v31.4S, v5.4S - sqrdmulh v15.4S, v20.4S, v17.S[3] // gap(s) to follow - trn1 v23.4S, v7.4S, v14.4S // gap(s) to follow - mul v27.4S, v20.4S, v17.S[2] - trn2 v7.4S, v7.4S, v14.4S // gap(s) to follow - trn1 v14.4S, v5.4S, v18.4S // gap(s) to follow - mls v24.4S, v2.4S, v8.S[0] - trn2 v5.4S, v5.4S, v18.4S // gap(s) to follow - trn2 v20.2D, v23.2D, v14.2D - sqrdmulh v18.4S, 
v19.4S, v22.S[1] // gap(s) to follow - trn1 v29.2D, v7.2D, v5.2D // gap(s) to follow - mul v31.4S, v19.4S, v22.S[0] - trn2 v19.2D, v7.2D, v5.2D // gap(s) to follow - sub v21.4S, v1.4S, v24.4S // gap(s) to follow - add v7.4S, v1.4S, v24.4S - sqrdmulh v22.4S, v20.4S, v10.4S // gap(s) to follow - subs count, count, #1 + // Instructions: 164 + // Expected cycles: 130 + // Expected IPC: 1.26 + // + // Wall time: 3346.23s + // User time: 3346.23s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + trn2 v23.2D, v12.2D, v25.2D // ........................................................................................................................................................*........... + ldr q10, [x2, #128] // ....e............................................................................................................................................................... + ldr q2, [x4], #64 // ........e........................................................................................................................................................... + ldr q30, [x5, #160] // ......................................................................................................................e............................................. + ldr q24, [x4, #-48] // .........e.......................................................................................................................................................... + mul v29.4S, v1.4S, v5.4S // ...............................................................................................*.................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q17, [x2, #144] // .....e.............................................................................................................................................................. + ldr q20, [x1, #176] // ...e................................................................................................................................................................ + str q23, [x2, #32] // ..................................................................................................................................................................*. + trn1 v23.2D, v15.2D, v9.2D // ..............................................................................*..................................................................................... + trn2 v9.4S, v28.4S, v6.4S // .....................................................................................................................................................*.............. + ldr q6, [x2, #176] // .......e............................................................................................................................................................ + mul v11.4S, v10.4S, v2.S[0] // .............e...................................................................................................................................................... + ldr q5, [x4, #-16] // ...........e........................................................................................................................................................ 
+ ldr q21, [x2, #160] // ......e............................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v18.4S, v17.4S, v2.S[1] // .................e.................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v26.4S, v6.4S, v2.S[1] // ...........................e........................................................................................................................................ + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v28.4S, v6.4S, v2.S[0] // ............................e....................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v15.4S, v21.4S, v2.S[1] // ......................e............................................................................................................................................. 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v28.4S, v26.4S, v8.S[0] // .............................e...................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mul v17.4S, v17.4S, v2.S[0] // ..................e................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v14.4S, v21.4S, v2.S[0] // .......................e............................................................................................................................................ + // gap // .................................................................................................................................................................... + sub v16.4S, v20.4S, v28.4S // ..............................e..................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v26.4S, v20.4S, v28.4S // ...............................e.................................................................................................................................... + mls v14.4S, v15.4S, v8.S[0] // ........................e........................................................................................................................................... + ldr q28, [x1, #144] // .e.................................................................................................................................................................. + ldr q21, [x1, #160] // ..e................................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v17.4S, v18.4S, v8.S[0] // ...................e................................................................................................................................................ + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v15.4S, v16.4S, v24.S[1] // ...............................................e.................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mul v18.4S, v16.4S, v24.S[0] // ................................................e................................................................................................................... + sub v20.4S, v21.4S, v14.4S // .........................e.......................................................................................................................................... + sub v22.4S, v28.4S, v17.4S // ....................e............................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v31.4S, v21.4S, v14.4S // ..........................e......................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v2.S[1] // ............e....................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + mls v18.4S, v15.4S, v8.S[0] // .................................................e.................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v21.4S, v19.4S, v7.4S // .......................................................................................................................................................*............ + // gap // .................................................................................................................................................................... + sqrdmulh v14.4S, v20.4S, v24.S[1] // ..........................................e......................................................................................................................... 
+ add v17.4S, v28.4S, v17.4S // .....................e.............................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v11.4S, v10.4S, v8.S[0] // ..............e..................................................................................................................................................... + sub v28.4S, v22.4S, v18.4S // ..................................................e................................................................................................................. + ldr q7, [x1, #128] // e................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v19.4S, v22.4S, v18.4S // ...................................................e................................................................................................................ 
+ mul v15.4S, v20.4S, v24.S[0] // ...........................................e........................................................................................................................ + ldr q22, [x4, #-32] // ..........e......................................................................................................................................................... + trn1 v20.2D, v4.2D, v27.2D // ...............................................................................*.................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v15.4S, v14.4S, v8.S[0] // ............................................e....................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v18.4S, v7.4S, v11.4S // ...............e.................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sqrdmulh v4.4S, v28.4S, v5.S[1] // ...................................................................e................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v16.4S, v19.4S, v22.S[3] // ..............................................................e..................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v10.4S, v18.4S, v15.4S // .............................................e...................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v27.4S, v19.4S, v22.S[2] // ...............................................................e.................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v19.4S, v28.4S, v5.S[0] // ....................................................................e............................................................................................... 
+ add v28.4S, v18.4S, v15.4S // ..............................................e..................................................................................................................... + // gap // .................................................................................................................................................................... + add v5.4S, v7.4S, v11.4S // ................e................................................................................................................................................... + ldr q7, [x5, #96] // ..................................................................................................................e................................................. + ldr q11, [x5, #112] // ...................................................................................................................e................................................ + mls v27.4S, v16.4S, v8.S[0] // ................................................................e................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + mls v19.4S, v4.4S, v8.S[0] // .....................................................................e.............................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v16.4S, v26.4S, v2.S[2] // ......................................e............................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ add v18.4S, v28.4S, v27.4S // ..................................................................e................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v27.4S, v28.4S, v27.4S // .................................................................e.................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v6.4S, v10.4S, v19.4S // .......................................................................e............................................................................................ + sub v10.4S, v10.4S, v19.4S // ......................................................................e............................................................................................. + // gap // .................................................................................................................................................................... + sqrdmulh v14.4S, v26.4S, v2.S[3] // .....................................e.............................................................................................................................. 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v26.4S, v18.4S, v27.4S // .................................................................................e.................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v28.4S, v6.4S, v10.4S // ...................................................................................e................................................................................ + mul v4.4S, v31.4S, v2.S[2] // .................................e.................................................................................................................................. + // gap // .................................................................................................................................................................... + trn1 v15.4S, v6.4S, v10.4S // ..................................................................................e................................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sqrdmulh v10.4S, v31.4S, v2.S[3] // ................................e................................................................................................................................... + trn1 v6.4S, v18.4S, v27.4S // ................................................................................e................................................................................... + // gap // .................................................................................................................................................................... + trn2 v27.2D, v26.2D, v28.2D // .....................................................................................e.............................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v16.4S, v14.4S, v8.S[0] // .......................................e............................................................................................................................ + trn1 v19.2D, v26.2D, v28.2D // .......................................................................................e............................................................................ + // gap // .................................................................................................................................................................... 
+ trn1 v18.2D, v6.2D, v15.2D // ......................................................................................e............................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v31.2D, v6.2D, v15.2D // ....................................................................................e............................................................................... + sqrdmulh v15.4S, v27.4S, v11.4S // .............................................................................................................................e...................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v6.4S, v27.4S, v7.4S // ..............................................................................................................................e..................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v14.4S, v17.4S, v16.4S // ........................................e........................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v17.4S, v17.4S, v16.4S // .........................................e.......................................................................................................................... + ldr q16, [x5, #128] // ....................................................................................................................e............................................... + // gap // .................................................................................................................................................................... + mls v6.4S, v15.4S, v8.S[0] // ...............................................................................................................................e.................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v27.4S, v14.4S, v22.S[0] // ..........................................................e......................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v15.4S, v17.4S, v24.S[3] // ....................................................e............................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v28.4S, v19.4S, v6.4S // .................................................................................................................................e.................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v22.S[1] // .........................................................e.......................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mul v22.4S, v28.4S, v16.4S // ...................................................................................................................................e................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v16.4S, v31.4S, v7.4S // .........................................................................................................................e.......................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + mls v27.4S, v14.4S, v8.S[0] // ...........................................................e........................................................................................................ + trn1 v14.2D, v9.2D, v21.2D // ...........................................................................................................................................................*........ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v4.4S, v10.4S, v8.S[0] // ..................................e................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v3.4S, v0.4S, v8.S[0] // .....................................................................................................*.............................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v0.4S, v1.4S, v13.4S // ..............................................................................................*..................................................................... 
+ ldr q13, [x5, #16] // .........................................................................................e.......................................................................... + // gap // .................................................................................................................................................................... + ldr q7, [x5, #-128] // ............................................................................................*....................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v24.4S, v17.4S, v24.S[2] // .....................................................e.............................................................................................................. + trn1 v17.2D, v12.2D, v25.2D // ..........................................................................................................................................................*......... + ldr q1, [x5, #144] // .....................................................................................................................e.............................................. + ldr q10, [x5, #-112] // .............................................................................................*...................................................................... + sub v25.4S, v20.4S, v3.4S // ......................................................................................................*............................................................. 
+ // gap // .................................................................................................................................................................... + sqrdmulh v11.4S, v31.4S, v11.4S // ........................................................................................................................e........................................... + ldr q12, [x5, #176] // .......................................................................................................................e............................................ + // gap // .................................................................................................................................................................... + str q17, [x2], #128 // ................................................................................................................................................................*... + add v17.4S, v5.4S, v4.4S // ....................................e............................................................................................................................... + // gap // .................................................................................................................................................................... + mls v29.4S, v0.4S, v8.S[0] // ................................................................................................*................................................................... + str q14, [x2, #-112] // .................................................................................................................................................................*.. + // gap // .................................................................................................................................................................... 
+ sub v4.4S, v5.4S, v4.4S // ...................................e................................................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v6.4S, v19.4S, v6.4S // ................................................................................................................................e................................... + sqrdmulh v26.4S, v25.4S, v10.4S // .............................................................................................................*...................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v10.4S, v25.4S, v7.4S // ..............................................................................................................*..................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v19.4S, v23.4S, v29.4S // .................................................................................................*.................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v24.4S, v15.4S, v8.S[0] // ......................................................e............................................................................................................. + add v25.4S, v23.4S, v29.4S // ..................................................................................................*................................................................. + ldr q0, [x5, #-160] // ..........................................................................................*......................................................................... + ldr q29, [x5, #-144] // ...........................................................................................*........................................................................ + // gap // .................................................................................................................................................................... 
+ sub v23.4S, v4.4S, v27.4S // ............................................................e....................................................................................................... + mls v16.4S, v11.4S, v8.S[0] // ..........................................................................................................................e......................................... + add v14.4S, v20.4S, v3.4S // .......................................................................................................*............................................................ + // gap // .................................................................................................................................................................... + add v7.4S, v4.4S, v27.4S // .............................................................e...................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v10.4S, v26.4S, v8.S[0] // ...............................................................................................................*.................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ add v3.4S, v17.4S, v24.4S // ........................................................e........................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v2.4S, v14.4S, v29.4S // ........................................................................................................*........................................................... + sub v29.4S, v17.4S, v24.4S // .......................................................e............................................................................................................ + // gap // .................................................................................................................................................................... + sub v26.4S, v18.4S, v16.4S // ...........................................................................................................................e........................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v5.4S, v28.4S, v1.4S // ..................................................................................................................................e................................. 
+ add v17.4S, v18.4S, v16.4S // ............................................................................................................................e....................................... + // gap // .................................................................................................................................................................... + trn2 v18.2D, v9.2D, v21.2D // .........................................................................................................................................................*.......... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v31.4S, v14.4S, v0.4S // .........................................................................................................*.......................................................... + sub v28.4S, v19.4S, v10.4S // ................................................................................................................*................................................... + // gap // .................................................................................................................................................................... + trn1 v15.4S, v3.4S, v29.4S // ........................................................................e........................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn2 v4.4S, v3.4S, v29.4S // .........................................................................e.......................................................................................... + mls v31.4S, v2.4S, v8.S[0] // ..........................................................................................................*......................................................... + str q18, [x2, #-80] // ...................................................................................................................................................................* + add v24.4S, v19.4S, v10.4S // .................................................................................................................*.................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v6.4S, v12.4S // .......................................................................................................................................e............................ + trn2 v27.4S, v7.4S, v23.4S // ...........................................................................e........................................................................................ + // gap // .................................................................................................................................................................... 
+ trn1 v9.4S, v7.4S, v23.4S // ..........................................................................e......................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v22.4S, v5.4S, v8.S[0] // ....................................................................................................................................e............................... + trn1 v18.4S, v24.4S, v28.4S // ..............................................................................................................................................*..................... + ldr q5, [x5], #(12*16) // ........................................................................................e........................................................................... + sub v2.4S, v25.4S, v31.4S // ...........................................................................................................*........................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v11.4S, v6.4S, v30.4S // ........................................................................................................................................e........................... 
+ add v23.4S, v25.4S, v31.4S // ............................................................................................................*....................................................... + // gap // .................................................................................................................................................................... + trn2 v24.4S, v24.4S, v28.4S // ...............................................................................................................................................*.................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v11.4S, v12.4S, v8.S[0] // .........................................................................................................................................e.......................... + trn2 v29.2D, v4.2D, v27.2D // .............................................................................e...................................................................................... + // gap // .................................................................................................................................................................... + trn1 v0.4S, v23.4S, v2.4S // ............................................................................................................................................*....................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn2 v14.4S, v23.4S, v2.4S // .............................................................................................................................................*...................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v6.4S, v17.4S, v22.4S // .....................................................................................................................................e.............................. + mul v3.4S, v29.4S, v5.4S // ....................................................................................................e............................................................... + // gap // .................................................................................................................................................................... + add v28.4S, v17.4S, v22.4S // ......................................................................................................................................e............................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v7.4S, v26.4S, v11.4S // ..........................................................................................................................................e......................... + add v19.4S, v26.4S, v11.4S // ...........................................................................................................................................e........................ + // gap // .................................................................................................................................................................... + trn2 v23.2D, v14.2D, v24.2D // .................................................................................................................................................*.................. + trn1 v1.2D, v0.2D, v18.2D // ..................................................................................................................................................*................. + // gap // .................................................................................................................................................................... + trn1 v11.2D, v14.2D, v24.2D // ...................................................................................................................................................*................ + trn2 v31.2D, v0.2D, v18.2D // ................................................................................................................................................*................... + // gap // .................................................................................................................................................................... + trn1 v12.4S, v28.4S, v6.4S // ....................................................................................................................................................e............... 
+ trn1 v25.4S, v19.4S, v7.4S // ......................................................................................................................................................e............. + // gap // .................................................................................................................................................................... + str q23, [x1, #48] // ...............................................................................................................................................................*.... + str q1, [x1], #128 // ............................................................................................................................................................*....... + trn2 v1.2D, v15.2D, v9.2D // ............................................................................e....................................................................................... + sqrdmulh v0.4S, v29.4S, v13.4S // ...................................................................................................e................................................................ + str q11, [x1, #-112] // .............................................................................................................................................................*...... + str q31, [x1, #-96] // ..............................................................................................................................................................*..... 
+ + // ------------------------------------------------------------------------------------------------------------------------------------------------------------ new position ------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q9, [x1, #(16*0)] // .......................................e...........................................................................................................................'.......................................~........................................................................................................................... + // ldr q10, [x1, #(16*1)] // ........................e..........................................................................................................................................'........................~.......................................................................................................................................... + // ldr q11, [x1, #(16*2)] // .........................e.........................................................................................................................................'.........................~......................................................................................................................................... 
+ // ldr q12, [x1, #(16*3)] // ......e............................................................................................................................................................'......~............................................................................................................................................................ + // ldr q13, [x2, #(16*0)] // e..................................................................................................................................................................'~.................................................................................................................................................................. + // ldr q14, [x2, #(16*1)] // .....e.............................................................................................................................................................'.....~............................................................................................................................................................. + // ldr q15, [x2, #(16*2)] // .............e.....................................................................................................................................................'.............~..................................................................................................................................................... + // ldr q16, [x2, #(16*3)] // ..........e........................................................................................................................................................'..........~........................................................................................................................................................ 
+ // ldr q0, [x4], #64 // .e.................................................................................................................................................................'.~................................................................................................................................................................. + // ldr q1, [x4, #(-64 + 16)] // ...e...............................................................................................................................................................'...~............................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ..........................................e........................................................................................................................'..........................................~........................................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ............e......................................................................................................................................................'............~...................................................................................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ................................e..................................................................................................................................'................................~.................................................................................................................................. 
+ // mul v24.4s, v13.4s, v0.s[0] // ...........e.......................................................................................................................................................'...........~....................................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................e.............................................................................................................................'.....................................~............................................................................................................................. + // sub v13.4s, v9.4s, v24.4s // .............................................e.....................................................................................................................'.............................................~..................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ....................................................e..............................................................................................................'....................................................~.............................................................................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..............e....................................................................................................................................................'..............~.................................................................................................................................................... 
+ // mul v24.4s, v14.4s, v0.s[0] // ...................e...............................................................................................................................................'...................~............................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................e........................................................................................................................................'..........................~........................................................................................................................................ + // sub v14.4s, v10.4s, v24.4s // ..............................e....................................................................................................................................'..............................~.................................................................................................................................... + // add v10.4s, v10.4s, v24.4s // ....................................e..............................................................................................................................'....................................~.............................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .................e.................................................................................................................................................'.................~................................................................................................................................................. 
+ // mul v24.4s, v15.4s, v0.s[0] // ....................e..............................................................................................................................................'....................~.............................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................e...........................................................................................................................................'.......................~........................................................................................................................................... + // sub v15.4s, v11.4s, v24.4s // .............................e.....................................................................................................................................'.............................~..................................................................................................................................... + // add v11.4s, v11.4s, v24.4s // ...............................e...................................................................................................................................'...............................~................................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...............e...................................................................................................................................................'...............~................................................................................................................................................... 
+ // mul v24.4s, v16.4s, v0.s[0] // ................e..................................................................................................................................................'................~.................................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..................e................................................................................................................................................'..................~................................................................................................................................................ + // sub v16.4s, v12.4s, v24.4s // .....................e.............................................................................................................................................'.....................~............................................................................................................................................. + // add v12.4s, v12.4s, v24.4s // ......................e............................................................................................................................................'......................~............................................................................................................................................ + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...................................................................e...............................................................................................'...................................................................~............................................................................................... 
+ // mul v24.4s, v11.4s, v0.s[2] // .................................................................e.................................................................................................'.................................................................~................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................e..........................................................................'........................................................................................~.......................................................................... + // sub v11.4s, v9.4s, v24.4s // ........................................................................................................e..........................................................'........................................................................................................~.......................................................... + // add v9.4s, v9.4s, v24.4s // .....................................................................................................e.............................................................'.....................................................................................................~............................................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ..............................................................e....................................................................................................'..............................................................~.................................................................................................... 
+ // mul v24.4s, v12.4s, v0.s[2] // .........................................................e.........................................................................................................'.........................................................~......................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................e............................................................................................'......................................................................~............................................................................................ + // sub v12.4s, v10.4s, v24.4s // ............................................................................e......................................................................................'............................................................................~...................................................................................... + // add v10.4s, v10.4s, v24.4s // .............................................................................e.....................................................................................'.............................................................................~..................................................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...................................e...............................................................................................................................'...................................~............................................................................................................................... 
+ // mul v24.4s, v15.4s, v1.s[0] // .........................................e.........................................................................................................................'.........................................~......................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................e......................................................................................................................'............................................~...................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ................................................e..................................................................................................................'................................................~.................................................................................................................. + // add v13.4s, v13.4s, v24.4s // ...................................................e...............................................................................................................'...................................................~............................................................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...........................e.......................................................................................................................................'...........................~....................................................................................................................................... 
+ // mul v24.4s, v16.4s, v1.s[0] // ............................e......................................................................................................................................'............................~...................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................e.................................................................................................................................'.................................~................................................................................................................................. + // sub v16.4s, v14.4s, v24.4s // ......................................e............................................................................................................................'......................................~............................................................................................................................ + // add v14.4s, v14.4s, v24.4s // ........................................e..........................................................................................................................'........................................~.......................................................................................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................................................................................e.................................................................................'.................................................................................~................................................................................. 
+ // mul v24.4s, v10.4s, v1.s[2] // .............................................................................................e.....................................................................'.............................................................................................~..................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................e.....................................................'.............................................................................................................~..................................................... + // sub v10.4s, v9.4s, v24.4s // ........................................................................................................................e..........................................'........................................................................................................................~.......................................... + // add v9.4s, v9.4s, v24.4s // ......................................................................................................................e............................................'......................................................................................................................~............................................ + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...................................................................................e...............................................................................'...................................................................................~............................................................................... 
+ // mul v24.4s, v12.4s, v2.s[0] // ................................................................................e..................................................................................'................................................................................~.................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................e............................................................................'......................................................................................~............................................................................ + // sub v12.4s, v11.4s, v24.4s // .................................................................................................................e.................................................'.................................................................................................................~................................................. + // add v11.4s, v11.4s, v24.4s // ....................................................................................................................e..............................................'....................................................................................................................~.............................................. + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...............................................e...................................................................................................................'...............................................~................................................................................................................... 
+ // mul v24.4s, v14.4s, v2.s[2] // .................................................e.................................................................................................................'.................................................~................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................................................e...........................................................................................................'.......................................................~........................................................................................................... + // sub v14.4s, v13.4s, v24.4s // ...........................................................e.......................................................................................................'...........................................................~....................................................................................................... + // add v13.4s, v13.4s, v24.4s // ..........................................................e........................................................................................................'..........................................................~........................................................................................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..............................................e....................................................................................................................'..............................................~.................................................................................................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // ..................................................e................................................................................................................'..................................................~................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ........................................................e..........................................................................................................'........................................................~.......................................................................................................... + // sub v16.4s, v15.4s, v24.4s // .............................................................e.....................................................................................................'.............................................................~..................................................................................................... + // add v15.4s, v15.4s, v24.4s // ............................................................e......................................................................................................'............................................................~...................................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ...............................................................................................................................e...................................'...............................................................................................................................~................................... 
+ // trn2 v26.4s, v9.4s, v10.4s // ................................................................................................................................e..................................'................................................................................................................................~.................................. + // trn1 v27.4s, v11.4s, v12.4s // ......................................................................................................................................e............................'......................................................................................................................................~............................ + // trn2 v28.4s, v11.4s, v12.4s // .....................................................................................................................................e.............................'.....................................................................................................................................~............................. + // trn2 v11.2d, v25.2d, v27.2d // ...............................................................................................................................................................e...'...............................................................................................................................................................~... + // trn2 v12.2d, v26.2d, v28.2d // ...............................................................................................................................................e...................'...............................................................................................................................................~................... 
+ // trn1 v9.2d, v25.2d, v27.2d // ........~..........................................................................................................................................................'........*.......................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...........................................~.......................................................................................................................'...........................................*....................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ....................................................................e..............................................................................................'....................................................................~.............................................................................................. + // trn2 v26.4s, v13.4s, v14.4s // ...............................................................e...................................................................................................'...............................................................~................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................e................................................................................................'..................................................................~................................................................................................ 
+ // trn2 v28.4s, v15.4s, v16.4s // ................................................................e..................................................................................................'................................................................~.................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // .........................................................................e.........................................................................................'.........................................................................~......................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // .....................................................................e.............................................................................................'.....................................................................~............................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ........................................................................e..........................................................................................'........................................................................~.......................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // .......................................................................e...........................................................................................'.......................................................................~........................................................................................... 
+ // ldr q0, [ x5], #(12*16) // .........................................................................................................................................e.........................'.........................................................................................................................................~......................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ...........................................................................................e.......................................................................'...........................................................................................~....................................................................... + // ldr q1, [ x5, #(-12*16 + 2*16)] // ...............................................................................................................~...................................................'...............................................................................................................*................................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ................................................................................................................~..................................................'................................................................................................................*.................................................. + // ldr q2, [ x5, #(-12*16 + 4*16)] // ............................................................................................~......................................................................'............................................................................................*...................................................................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ................................................................................................~..................................................................'................................................................................................*.................................................................. + // sqrdmulh v27.4s, v11.4s, v4.4s // ..........................................................................................~........................................................................'..........................................................................................*........................................................................ + // mul v24.4s, v11.4s, v0.4s // ....~..............................................................................................................................................................'....*.............................................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................................~............................................................'......................................................................................................*............................................................ + // sub v11.4s, v9.4s, v24.4s // ............................................................................................................~......................................................'............................................................................................................*...................................................... 
+ // add v9.4s, v9.4s, v24.4s // ..............................................................................................................~....................................................'..............................................................................................................*.................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ................................................................................................................................................................e..'................................................................................................................................................................~.. + // mul v24.4s, v12.4s, v0.4s // ...................................................................................................................................................e...............'...................................................................................................................................................~............... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................................................~.........................................................................'.........................................................................................*......................................................................... + // sub v12.4s, v10.4s, v24.4s // .................................................................................................~.................................................................'.................................................................................................*................................................................. 
+ // add v10.4s, v10.4s, v24.4s // ...................................................................................................................~...............................................'...................................................................................................................*............................................... + // sqrdmulh v27.4s, v10.4s, v5.4s // .......................................................................................................................~...........................................'.......................................................................................................................*........................................... + // mul v24.4s, v10.4s, v1.4s // .............................................................................................................................~.....................................'.............................................................................................................................*..................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................................................................................................~.................................'.................................................................................................................................*................................. + // sub v10.4s, v9.4s, v24.4s // ..........................................................................................................................................~........................'..........................................................................................................................................*........................ 
+ // add v9.4s, v9.4s, v24.4s // ............................................................................................................................................~......................'............................................................................................................................................*...................... + // sqrdmulh v27.4s, v12.4s, v6.4s // ..........................................................................................................~........................................................'..........................................................................................................*........................................................ + // mul v24.4s, v12.4s, v2.4s // ...........................................................................................................~.......................................................'...........................................................................................................*....................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................................................................~.............................................'.....................................................................................................................*............................................. + // sub v12.4s, v11.4s, v24.4s // ..............................................................................................................................~....................................'..............................................................................................................................*.................................... 
+ // add v11.4s, v11.4s, v24.4s // ...................................................................................................................................~...............................'...................................................................................................................................*............................... + // ldr q0, [ x5, #(-12*16 + 6*16)] // .....................................................e.............................................................................................................'.....................................................~............................................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ......................................................e............................................................................................................'......................................................~............................................................................................................ + // ldr q1, [ x5, #(-12*16 + 8*16)] // ..............................................................................e....................................................................................'..............................................................................~.................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...............................................................................................e...................................................................'...............................................................................................~................................................................... 
+ // ldr q2, [ x5, #(-12*16 + 10*16)] // ..e................................................................................................................................................................'..~................................................................................................................................................................ + // ldr q6, [x5, #(-12*16 + 11*16)] // ...................................................................................................e...............................................................'...................................................................................................~............................................................... + // sqrdmulh v27.4s, v15.4s, v4.4s // ..................................................................................................e................................................................'..................................................................................................~................................................................ + // mul v24.4s, v15.4s, v0.4s // .....................................................................................e.............................................................................'.....................................................................................~............................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................................................e................................................'..................................................................................................................~................................................ 
+ // sub v15.4s, v13.4s, v24.4s // .........................................................................................................................e.........................................'.........................................................................................................................~......................................... + // add v13.4s, v13.4s, v24.4s // ...........................................................................................................................e.......................................'...........................................................................................................................~....................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ..........................................................................e........................................................................................'..........................................................................~........................................................................................ + // mul v24.4s, v16.4s, v0.4s // ...........................................................................e.......................................................................................'...........................................................................~....................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................................e...................................................................................'...............................................................................~................................................................................... 
+ // sub v16.4s, v14.4s, v24.4s // .........................................................................................................e.........................................................'.........................................................................................................~......................................................... + // add v14.4s, v14.4s, v24.4s // ..................................................................................e................................................................................'..................................................................................~................................................................................ + // sqrdmulh v27.4s, v14.4s, v5.4s // ..........................................................................................................................e........................................'..........................................................................................................................~........................................ + // mul v24.4s, v14.4s, v1.4s // ....................................................................................e..............................................................................'....................................................................................~.............................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................................................................e...........................'.......................................................................................................................................~........................... 
+ // sub v14.4s, v13.4s, v24.4s // ..................................................................................................................................................e................'..................................................................................................................................................~................ + // add v13.4s, v13.4s, v24.4s // ....................................................................................................................................................e..............'....................................................................................................................................................~.............. + // sqrdmulh v27.4s, v16.4s, v6.4s // ....................................................................................................................................e..............................'....................................................................................................................................~.............................. + // mul v24.4s, v16.4s, v2.4s // ...........................................................................................................................................e.......................'...........................................................................................................................................~....................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................................e....................'..............................................................................................................................................~.................... 
+ // sub v16.4s, v15.4s, v24.4s // .....................................................................................................................................................e.............'.....................................................................................................................................................~............. + // add v15.4s, v15.4s, v24.4s // ......................................................................................................................................................e............'......................................................................................................................................................~............ + // trn1 v25.4s, v9.4s, v10.4s // ................................................................................................................................................~..................'................................................................................................................................................*.................. + // trn2 v26.4s, v9.4s, v10.4s // .................................................................................................................................................~.................'.................................................................................................................................................*................. + // trn1 v27.4s, v11.4s, v12.4s // ........................................................................................................................................~..........................'........................................................................................................................................*.......................... 
+ // trn2 v28.4s, v11.4s, v12.4s // .............................................................................................................................................~.....................'.............................................................................................................................................*..................... + // trn2 v11.2d, v25.2d, v27.2d // ..........................................................................................................................................................~........'..........................................................................................................................................................*........ + // trn2 v12.2d, v26.2d, v28.2d // .......................................................................................................................................................~...........'.......................................................................................................................................................*........... + // trn1 v9.2d, v25.2d, v27.2d // ........................................................................................................................................................~..........'........................................................................................................................................................*.......... + // trn1 v10.2d, v26.2d, v28.2d // .........................................................................................................................................................~.........'.........................................................................................................................................................*......... 
+ // trn1 v25.4s, v13.4s, v14.4s // ...........................................................................................................................................................e.......'...........................................................................................................................................................~....... + // trn2 v26.4s, v13.4s, v14.4s // .........~.........................................................................................................................................................'.........*......................................................................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ............................................................................................................................................................e......'............................................................................................................................................................~...... + // trn2 v28.4s, v15.4s, v16.4s // ..................................~................................................................................................................................'..................................*................................................................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................................................................................................................................................................*................................................................................................................................................................... 
+ // trn2 v16.2d, v26.2d, v28.2d // ............................................................................................................................~......................................'............................................................................................................................*...................................... + // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................~....................................................................'..............................................................................................*.................................................................... + // trn1 v14.2d, v26.2d, v28.2d // .......................................................................................~...........................................................................'.......................................................................................*........................................................................... + // str q9, [x1], #128 // ..............................................................................................................................................................~....'..............................................................................................................................................................*.... + // str q10, [x1, #(-(128) + 16*1)] // .................................................................................................................................................................~.'.................................................................................................................................................................*. 
+ // str q11, [x1, #(-(128) + 16*2)] // ..................................................................................................................................................................~'..................................................................................................................................................................* + // str q12, [x1, #(-(128) + 16*3)] // .............................................................................................................................................................~.....'.............................................................................................................................................................*..... + // str q13, [x2], #128 // ....................................................................................................~..............................................................'....................................................................................................*.............................................................. + // str q14, [x2, #(-(128) + 16*1)] // .......................................................................................................~...........................................................'.......................................................................................................*........................................................... + // str q15, [x2, #(-(128) + 16*2)] // .......~...........................................................................................................................................................'.......*........................................................................................................................................................... 
+ // str q16, [x2, #(-(128) + 16*3)] // ..................................................................................................................................~................................'..................................................................................................................................*................................ + + sub count, count, #1 cbnz count, layer45678_start - ldr_vo v2, x5, -32 - trn1 v5.2D, v23.2D, v14.2D - sqrdmulh v26.4S, v19.4S, v10.4S - mul v25.4S, v19.4S, v6.4S - mls v27.4S, v15.4S, v8.S[0] - mls v25.4S, v26.4S, v8.S[0] - mls v31.4S, v18.4S, v8.S[0] - sub v1.4S, v7.4S, v27.4S - mul v10.4S, v20.4S, v6.4S - add v16.4S, v7.4S, v27.4S - sub v14.4S, v29.4S, v25.4S - mls v10.4S, v22.4S, v8.S[0] - add v15.4S, v29.4S, v25.4S - sub v28.4S, v21.4S, v31.4S - sqrdmulh v17.4S, v14.4S, v13.4S - add v26.4S, v21.4S, v31.4S - trn1 v24.4S, v16.4S, v1.4S - trn2 v16.4S, v16.4S, v1.4S - sqrdmulh v23.4S, v15.4S, v30.4S - trn2 v3.4S, v26.4S, v28.4S - trn1 v25.4S, v26.4S, v28.4S - mul v18.4S, v14.4S, v9.4S - sub v19.4S, v5.4S, v10.4S - mls v18.4S, v17.4S, v8.S[0] - trn2 v28.2D, v16.2D, v3.2D - mul v1.4S, v15.4S, v11.4S - trn1 v3.2D, v16.2D, v3.2D - trn2 v16.2D, v24.2D, v25.2D - ldr_vo v26, x5, -48 - sqrdmulh v17.4S, v28.4S, v4.4S - add v27.4S, v19.4S, v18.4S - trn1 v29.2D, v24.2D, v25.2D - sqrdmulh v21.4S, v16.4S, v4.4S - mul v24.4S, v28.4S, v0.4S - ldr_vo v11, x5, -16 - mls v24.4S, v17.4S, v8.S[0] - mul v25.4S, v16.4S, v0.4S - mls v25.4S, v21.4S, v8.S[0] - add v16.4S, v3.4S, v24.4S - sub v12.4S, v3.4S, v24.4S - mls v1.4S, v23.4S, v8.S[0] - ldr_vo v24, x5, -64 - sqrdmulh v3.4S, v16.4S, v26.4S - add v28.4S, v29.4S, v25.4S - add v14.4S, v5.4S, v10.4S - mul v26.4S, v12.4S, v2.4S - sqrdmulh v17.4S, v12.4S, v11.4S - sub v21.4S, v29.4S, v25.4S - sub v15.4S, v14.4S, v1.4S - mul v16.4S, v16.4S, v24.4S - add v12.4S, v14.4S, v1.4S - mls v16.4S, v3.4S, v8.S[0] - trn1 v7.4S, v12.4S, v15.4S - mls v26.4S, v17.4S, v8.S[0] - sub v25.4S, 
v19.4S, v18.4S - trn2 v24.4S, v12.4S, v15.4S - add v29.4S, v28.4S, v16.4S - sub v3.4S, v28.4S, v16.4S - trn1 v16.4S, v27.4S, v25.4S - trn2 v28.4S, v27.4S, v25.4S - sub v12.4S, v21.4S, v26.4S - add v18.4S, v21.4S, v26.4S - trn1 v17.4S, v29.4S, v3.4S - trn2 v19.4S, v29.4S, v3.4S - trn1 v3.2D, v7.2D, v16.2D - trn1 v25.2D, v24.2D, v28.2D - trn1 v26.4S, v18.4S, v12.4S - trn2 v12.4S, v18.4S, v12.4S - trn2 v7.2D, v7.2D, v16.2D - str_vi v3, x1, 128 - trn2 v3.2D, v24.2D, v28.2D - str_vo v25, x1, -112 - trn2 v16.2D, v17.2D, v26.2D - trn1 v25.2D, v19.2D, v12.2D - trn1 v28.2D, v17.2D, v26.2D - trn2 v17.2D, v19.2D, v12.2D - str_vo v7, x1, -96 - str_vo v3, x1, -80 - str_vo v16, x2, 32 - str_vo v25, x2, 16 - str_vi v28, x2, 128 - str_vo v17, x2, -80 + // Instructions: 46 + // Expected cycles: 35 + // Expected IPC: 1.31 + // + // Wall time: 0.97s + // User time: 0.97s + // + // ------------- original position -------------> + // 0 25 + // |------------------------|-------------------- + trn2 v6.4S, v28.4S, v6.4S // ....*......................................... + mls v3.4S, v0.4S, v8.S[0] // ........*..................................... + ldr q18, [x5, #-128] // ..........*................................... + trn2 v16.4S, v19.4S, v7.4S // .....*........................................ + ldr q31, [x5, #-160] // .....................*........................ + // gap // .............................................. + sqrdmulh v14.4S, v1.4S, v13.4S // .........*.................................... + trn1 v24.2D, v12.2D, v25.2D // ...........*.................................. + ldr q19, [x5, #-144] // ......................*....................... + trn1 v7.2D, v4.2D, v27.2D // ......*....................................... + // gap // .............................................. + // gap // .............................................. + trn2 v13.2D, v12.2D, v25.2D // *............................................. 
+ ldr q25, [x5, #-112] // ............*................................. + // gap // .............................................. + str q24, [x2], #128 // ..............*............................... + trn1 v24.2D, v15.2D, v9.2D // ...*.......................................... + mul v9.4S, v1.4S, v5.4S // .*............................................ + add v20.4S, v7.4S, v3.4S // .......................*...................... + // gap // .............................................. + // gap // .............................................. + sub v26.4S, v7.4S, v3.4S // .............*................................ + mls v9.4S, v14.4S, v8.S[0] // ...............*.............................. + str q13, [x2, #-96] // ..*........................................... + trn1 v29.2D, v6.2D, v16.2D // .......*...................................... + // gap // .............................................. + // gap // .............................................. + trn2 v7.2D, v6.2D, v16.2D // ..........................*................... + sqrdmulh v15.4S, v20.4S, v19.4S // .........................*.................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v30.4S, v20.4S, v31.4S // ...........................*.................. + str q29, [x2, #-112] // ................*............................. + // gap // .............................................. + sub v19.4S, v24.4S, v9.4S // ...................*.......................... + str q7, [x2, #-80] // ..............................*............... + // gap // .............................................. + sqrdmulh v14.4S, v26.4S, v25.4S // .................*............................ + add v17.4S, v24.4S, v9.4S // ....................*......................... 
+ // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v30.4S, v15.4S, v8.S[0] // .............................*................ + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mul v4.4S, v26.4S, v18.4S // ..................*........................... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + mls v4.4S, v14.4S, v8.S[0] // ........................*..................... + // gap // .............................................. + // gap // .............................................. + sub v6.4S, v17.4S, v30.4S // .................................*............ + // gap // .............................................. + // gap // .............................................. + add v14.4S, v17.4S, v30.4S // ..................................*........... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. 
+ add v5.4S, v19.4S, v4.4S // ...............................*.............. + sub v1.4S, v19.4S, v4.4S // ............................*................. + // gap // .............................................. + trn1 v0.4S, v14.4S, v6.4S // ....................................*......... + // gap // .............................................. + // gap // .............................................. + trn2 v31.4S, v14.4S, v6.4S // .....................................*........ + // gap // .............................................. + // gap // .............................................. + trn1 v2.4S, v5.4S, v1.4S // ................................*............. + trn2 v16.4S, v5.4S, v1.4S // ...................................*.......... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + trn2 v19.2D, v31.2D, v16.2D // ......................................*....... + trn1 v27.2D, v0.2D, v2.2D // .......................................*...... + // gap // .............................................. + trn2 v15.2D, v0.2D, v2.2D // .........................................*.... + trn1 v1.2D, v31.2D, v16.2D // ........................................*..... + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + // gap // .............................................. + str q19, [x1, #48] // ..........................................*... + str q27, [x1], #128 // ...........................................*.. + // gap // .............................................. 
+ str q15, [x1, #-96] // .............................................* + str q1, [x1, #-112] // ............................................*. + // gap // .............................................. + + // --------------- new position ----------------> + // 0 25 + // |------------------------|-------------------- + // trn2 v23.2D, v12.2D, v25.2D // .........*.................................... + // mul v29.4S, v1.4S, v5.4S // .............*................................ + // str q23, [x2, #32] // .................*............................ + // trn1 v23.2D, v15.2D, v9.2D // ............*................................. + // trn2 v9.4S, v28.4S, v6.4S // *............................................. + // trn2 v21.4S, v19.4S, v7.4S // ...*.......................................... + // trn1 v20.2D, v4.2D, v27.2D // ........*..................................... + // trn1 v14.2D, v9.2D, v21.2D // ..................*........................... + // mls v3.4S, v0.4S, v8.S[0] // .*............................................ + // sqrdmulh v0.4S, v1.4S, v13.4S // .....*........................................ + // ldr q7, [x5, #-128] // ..*........................................... + // trn1 v17.2D, v12.2D, v25.2D // ......*....................................... + // ldr q10, [x5, #-112] // ..........*................................... + // sub v25.4S, v20.4S, v3.4S // ...............*.............................. + // str q17, [x2], #128 // ...........*.................................. + // mls v29.4S, v0.4S, v8.S[0] // ................*............................. + // str q14, [x2, #-112] // ......................*....................... + // sqrdmulh v26.4S, v25.4S, v10.4S // .........................*.................... + // mul v10.4S, v25.4S, v7.4S // ............................*................. + // sub v19.4S, v23.4S, v29.4S // .......................*...................... 
+ // add v25.4S, v23.4S, v29.4S // ..........................*................... + // ldr q0, [x5, #-160] // ....*......................................... + // ldr q29, [x5, #-144] // .......*...................................... + // add v14.4S, v20.4S, v3.4S // ..............*............................... + // mls v10.4S, v26.4S, v8.S[0] // .............................*................ + // sqrdmulh v2.4S, v14.4S, v29.4S // ....................*......................... + // trn2 v18.2D, v9.2D, v21.2D // ...................*.......................... + // mul v31.4S, v14.4S, v0.4S // .....................*........................ + // sub v28.4S, v19.4S, v10.4S // .................................*............ + // mls v31.4S, v2.4S, v8.S[0] // ...........................*.................. + // str q18, [x2, #-80] // ........................*..................... + // add v24.4S, v19.4S, v10.4S // ................................*............. + // trn1 v18.4S, v24.4S, v28.4S // ....................................*......... + // sub v2.4S, v25.4S, v31.4S // ..............................*............... + // add v23.4S, v25.4S, v31.4S // ...............................*.............. + // trn2 v24.4S, v24.4S, v28.4S // .....................................*........ + // trn1 v0.4S, v23.4S, v2.4S // ..................................*........... + // trn2 v14.4S, v23.4S, v2.4S // ...................................*.......... + // trn2 v23.2D, v14.2D, v24.2D // ......................................*....... + // trn1 v1.2D, v0.2D, v18.2D // .......................................*...... + // trn1 v11.2D, v14.2D, v24.2D // .........................................*.... + // trn2 v31.2D, v0.2D, v18.2D // ........................................*..... + // str q23, [x1, #48] // ..........................................*... + // str q1, [x1], #128 // ...........................................*.. 
+ // str q11, [x1, #-112] // .............................................* + // str q31, [x1, #-96] // ............................................*. + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm.s index f82b9cba..d129aeef 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + 
(\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, 
[r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -251,7 +223,7 @@ roots: .text .global ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm - .global _ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm + .global _ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm .p2align 4 const_addr: .word 8380417 @@ -375,554 
+347,590 @@ _ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr q6, [x0, #768] // ...*......................... - // gap // ............................. - ldr q16, [x0, #896] // .*........................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q24, [x0, #512] // *............................ - // gap // ............................. - ldr q21, [x0, #640] // ..*.......................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q23, [x0, #384] // .......*..................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - mul v25.4S, v6.4S, v0.S[0] // ........*.................... - sqrdmulh v18.4S, v6.4S, v0.S[1] // ..........*.................. - sqrdmulh v26.4S, v16.4S, v0.S[1] // .....*....................... - mul v12.4S, v16.4S, v0.S[0] // ......*...................... - ldr q22, [x0, #256] // .............*............... - // gap // ............................. - // gap // ............................. - // gap // ............................. 
- sqrdmulh v10.4S, v24.4S, v0.S[1] // ...............*............. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - mls v25.4S, v18.4S, v8.S[0] // ..............*.............. - mls v12.4S, v26.4S, v8.S[0] // ............*................ - sqrdmulh v4.4S, v21.4S, v0.S[1] // .........*................... - mul v9.4S, v21.4S, v0.S[0] // ...........*................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - add v31.4S, v23.4S, v12.4S // .................*........... - sub v19.4S, v23.4S, v12.4S // ....................*........ - mls v9.4S, v4.4S, v8.S[0] // ................*............ 
- ldr q4, [x0, #128] // ....*........................ - mul v12.4S, v24.4S, v0.S[0] // .....................*....... - // gap // ............................. - // gap // ............................. - // gap // ............................. - add v17.4S, v22.4S, v25.4S // ......................*...... - sub v15.4S, v22.4S, v25.4S // ............................* - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - mul v22.4S, v31.4S, v0.S[2] // ..................*.......... - sqrdmulh v16.4S, v31.4S, v0.S[3] // ...................*......... - sqrdmulh v30.4S, v19.4S, v1.S[1] // .......................*..... - mul v18.4S, v19.4S, v1.S[0] // ........................*.... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - add v6.4S, v4.4S, v9.4S // ..........................*.. - mul v7.4S, v17.4S, v0.S[2] // ...........................*. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - mls v22.4S, v16.4S, v8.S[0] // .........................*... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. 
- // gap // ............................. - // gap // ............................. - // gap // ............................. - - // original source code - // ldr q16, [x0, #512] // ..*.......................... - // ldr q25, [x0, #896] // .*........................... - // ldr q19, [x0, #640] // ...*......................... - // ldr q28, [x0, #768] // *............................ - // ldr q4, [x0, #128] // ..................*.......... - // sqrdmulh v26.4S, v25.4S, v0.S[1] // .......*..................... - // mul v20.4S, v25.4S, v0.S[0] // ........*.................... - // ldr q31, [x0, #384] // ....*........................ - // mul v11.4S, v28.4S, v0.S[0] // .....*....................... - // sqrdmulh v17.4S, v19.4S, v0.S[1] // .............*............... - // sqrdmulh v25.4S, v28.4S, v0.S[1] // ......*...................... - // mul v9.4S, v19.4S, v0.S[0] // ..............*.............. - // mls v20.4S, v26.4S, v8.S[0] // ............*................ - // ldr q21, [x0, #256] // .........*................... - // mls v11.4S, v25.4S, v8.S[0] // ...........*................. - // sqrdmulh v10.4S, v16.4S, v0.S[1] // ..........*.................. - // mls v9.4S, v17.4S, v8.S[0] // .................*........... - // add v14.4S, v31.4S, v20.4S // ...............*............. - // mul v22.4S, v14.4S, v0.S[2] // ......................*...... - // sqrdmulh v25.4S, v14.4S, v0.S[3] // .......................*..... - // sub v13.4S, v31.4S, v20.4S // ................*............ - // mul v12.4S, v16.4S, v0.S[0] // ...................*......... - // add v17.4S, v21.4S, v11.4S // ....................*........ - // sqrdmulh v30.4S, v13.4S, v1.S[1] // ........................*.... - // mul v18.4S, v13.4S, v1.S[0] // .........................*... - // mls v22.4S, v25.4S, v8.S[0] // ............................* - // add v6.4S, v4.4S, v9.4S // ..........................*.. - // mul v7.4S, v17.4S, v0.S[2] // ...........................*. 
- // sub v15.4S, v21.4S, v11.4S // .....................*....... + // Instructions: 38 + // Expected cycles: 16 + // Expected IPC: 2.38 + // + // Wall time: 0.63s + // User time: 0.63s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + ldr q6, [x0, #768] // .*.................................... + ldr q19, [x0, #896] // ..*................................... + ldr q26, [x0, #0] // *..................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + ldr q28, [x0, #640] // ...*.................................. + ldr q22, [x0, #512] // ....*................................. + ldr q29, [x0, #128] // .....*................................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + ldr q4, [x0, #256] // ......*............................... + ldr q10, [x0, #384] // .......*.............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + mul v14.4S, v6.4S, v0.S[0] // ........*............................. + sqrdmulh v6.4S, v6.4S, v0.S[1] // .........*............................ + mul v15.4S, v19.4S, v0.S[0] // ..........*........................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // ...........*.......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v13.4S, v28.4S, v0.S[1] // .................*.................... + mul v28.4S, v28.4S, v0.S[0] // ..................*................... + sqrdmulh v11.4S, v22.4S, v0.S[1] // ............*......................... + mul v22.4S, v22.4S, v0.S[0] // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v14.4S, v6.4S, v8.S[0] // ..............*....................... + mls v15.4S, v19.4S, v8.S[0] // ...............*...................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v22.4S, v11.4S, v8.S[0] // ................*..................... 
+ mls v28.4S, v13.4S, v8.S[0] // ......................*............... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v6.4S, v4.4S, v14.4S // ....................*................. + add v14.4S, v4.4S, v14.4S // ...................*.................. + add v13.4S, v10.4S, v15.4S // .......................*.............. + sub v4.4S, v10.4S, v15.4S // .....................*................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v31.4S, v26.4S, v22.4S // .................................*.... + sub v19.4S, v26.4S, v22.4S // ....................................*. + sub v18.4S, v29.4S, v28.4S // ..................................*... + add v24.4S, v29.4S, v28.4S // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v26.4S, v6.4S, v1.S[0] // ..........................*........... + sqrdmulh v6.4S, v6.4S, v1.S[1] // ...........................*.......... + sqrdmulh v29.4S, v4.4S, v1.S[1] // .........................*............ 
+ mul v4.4S, v4.4S, v1.S[0] // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v17.4S, v14.4S, v0.S[3] // ............................*......... + mul v28.4S, v14.4S, v0.S[2] // .....................................* + mul v22.4S, v13.4S, v0.S[2] // .............................*........ + sqrdmulh v13.4S, v13.4S, v0.S[3] // ..............................*....... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v26.4S, v6.4S, v8.S[0] // ................................*..... + mls v4.4S, v29.4S, v8.S[0] // ...............................*...... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // ldr q11, [x0, #0] // ..*................................... + // ldr q12, [x0, #768] // *..................................... + // ldr q10, [x0, #896] // .*.................................... + // ldr q7, [x0, #640] // ...*.................................. 
+ // ldr q13, [x0, #512] // ....*................................. + // ldr q21, [x0, #128] // .....*................................ + // ldr q14, [x0, #256] // ......*............................... + // ldr q15, [x0, #384] // .......*.............................. + // mul v19.4S, v12.4S, v0.S[0] // ........*............................. + // sqrdmulh v12.4S, v12.4S, v0.S[1] // .........*............................ + // mul v26.4S, v10.4S, v0.S[0] // ..........*........................... + // sqrdmulh v27.4S, v10.4S, v0.S[1] // ...........*.......................... + // sqrdmulh v17.4S, v13.4S, v0.S[1] // ..............*....................... + // mul v16.4S, v13.4S, v0.S[0] // ...............*...................... + // mls v19.4S, v12.4S, v8.S[0] // ................*..................... + // mls v26.4S, v27.4S, v8.S[0] // .................*.................... + // mls v16.4S, v17.4S, v8.S[0] // ..................*................... + // sqrdmulh v17.4S, v7.4S, v0.S[1] // ............*......................... + // mul v6.4S, v7.4S, v0.S[0] // .............*........................ + // add v23.4S, v14.4S, v19.4S // .....................*................ + // sub v14.4S, v14.4S, v19.4S // ....................*................. + // sub v28.4S, v15.4S, v26.4S // .......................*.............. + // mls v6.4S, v17.4S, v8.S[0] // ...................*.................. + // add v7.4S, v15.4S, v26.4S // ......................*............... + // mul v4.4S, v28.4S, v1.S[0] // ...............................*...... + // sqrdmulh v15.4S, v28.4S, v1.S[1] // ..............................*....... + // mul v26.4S, v14.4S, v1.S[0] // ............................*......... + // sqrdmulh v30.4S, v14.4S, v1.S[1] // .............................*........ + // sqrdmulh v17.4S, v23.4S, v0.S[3] // ................................*..... + // mul v22.4S, v7.4S, v0.S[2] // ..................................*... 
+ // sqrdmulh v13.4S, v7.4S, v0.S[3] // ...................................*.. + // mls v4.4S, v15.4S, v8.S[0] // .....................................* + // mls v26.4S, v30.4S, v8.S[0] // ....................................*. + // add v31.4S, v11.4S, v16.4S // ........................*............. + // sub v18.4S, v21.4S, v6.4S // ..........................*........... + // add v24.4S, v21.4S, v6.4S // ...........................*.......... + // sub v19.4S, v11.4S, v16.4S // .........................*............ + // mul v28.4S, v23.4S, v0.S[2] // .................................*.... sub count, count, #1 layer123_start: - ldr q5, [x0, #0] // *........................................................................... - ldr q16, [x0, #528] // ....e....................................................................... - ldr q25, [x0, #912] // .......e.................................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Wall time: 32.57s + // User time: 32.57s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q11, [x0, #16] // e........................................................................... + ldr q12, [x0, #784] // ......e..................................................................... + ldr q10, [x0, #912] // .......e.................................................................... + mls v22.4S, v13.4S, v8.S[0] // ...................................*........................................ // gap // ............................................................................ // gap // ............................................................................ - mls v12.4S, v10.4S, v8.S[0] // ..........*................................................................. 
- sqrdmulh v11.4S, v17.4S, v0.S[3] // .............................*.............................................. - mls v18.4S, v30.4S, v8.S[0] // .............................................*.............................. // gap // ............................................................................ // gap // ............................................................................ - sub v21.4S, v4.4S, v9.4S // ................*........................................................... - ldr q19, [x0, #656] // .....e...................................................................... - ldr q28, [x0, #784] // ......e..................................................................... + add v25.4S, v19.4S, v26.4S // ..........................................*................................. + ldr q7, [x0, #656] // .....e...................................................................... + ldr q13, [x0, #528] // ....e....................................................................... + ldr q21, [x0, #144] // .e.......................................................................... + add v6.4S, v18.4S, v4.4S // ...............................................*............................ + sub v4.4S, v18.4S, v4.4S // ..............................................*............................. // gap // ............................................................................ - add v31.4S, v6.4S, v22.4S // .....................................*...................................... // gap // ............................................................................ - sub v9.4S, v6.4S, v22.4S // ....................................*....................................... - ldr q4, [x0, #144] // .e.......................................................................... + ldr q14, [x0, #272] // ..e......................................................................... 
+ sub v9.4S, v19.4S, v26.4S // .........................................*.................................. + mls v28.4S, v17.4S, v8.S[0] // ..............................*............................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v24.4S, v15.4S, v1.S[1] // .......................................*.................................... - mul v27.4S, v15.4S, v1.S[0] // ......................................*..................................... // gap // ............................................................................ + mul v29.4S, v6.4S, v2.S[2] // ...........................................................*................ + sqrdmulh v30.4S, v6.4S, v2.S[3] // ..........................................................*................. + add v6.4S, v24.4S, v22.4S // .....................................*...................................... + ldr q15, [x0, #400] // ...e........................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v6.4S, v21.4S, v18.4S // ..............................................*............................. - sqrdmulh v23.4S, v31.4S, v1.S[3] // .................................................*.......................... - mul v30.4S, v31.4S, v1.S[2] // ................................................*........................... 
- mls v7.4S, v11.4S, v8.S[0] // ..............................*............................................. - mul v29.4S, v9.4S, v2.S[0] // .....................................................*...................... - sqrdmulh v26.4S, v25.4S, v0.S[1] // ........................e................................................... + sqrdmulh v23.4S, v4.4S, v3.S[1] // ...............................................................*............ + mul v19.4S, v12.4S, v0.S[0] // ...................e........................................................ + sqrdmulh v12.4S, v12.4S, v0.S[1] // ..................e......................................................... + mul v26.4S, v10.4S, v0.S[0] // ........................e................................................... + sqrdmulh v27.4S, v10.4S, v0.S[1] // .......................e.................................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v15.4S, v9.4S, v2.S[1] // ......................................................*..................... // gap // ............................................................................ // gap // ............................................................................ - mul v20.4S, v25.4S, v0.S[0] // .......................e.................................................... + sqrdmulh v17.4S, v13.4S, v0.S[1] // ........e................................................................... + mul v16.4S, v13.4S, v0.S[0] // .........e.................................................................. + mul v10.4S, v4.4S, v3.S[0] // ................................................................*........... + sub v24.4S, v24.4S, v22.4S // ....................................*....................................... // gap // ............................................................................ 
- ldr q31, [x0, #400] // ...e........................................................................ // gap // ............................................................................ - mul v11.4S, v28.4S, v0.S[0] // ..................e......................................................... - sqrdmulh v17.4S, v19.4S, v0.S[1] // ..............e............................................................. - add v22.4S, v21.4S, v18.4S // ...............................................*............................ // gap // ............................................................................ - add v10.4S, v5.4S, v12.4S // ............*............................................................... // gap // ............................................................................ + mul v13.4S, v6.4S, v1.S[2] // .................................................*.......................... + mls v29.4S, v30.4S, v8.S[0] // ............................................................*............... + sqrdmulh v4.4S, v6.4S, v1.S[3] // ................................................*........................... + add v18.4S, v31.4S, v28.4S // ................................*........................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v13.4S, v6.4S, v3.S[1] // ................................................................*........... - sqrdmulh v25.4S, v28.4S, v0.S[1] // ...................e........................................................ // gap // ............................................................................ - mls v27.4S, v24.4S, v8.S[0] // ........................................*................................... - mul v28.4S, v6.4S, v3.S[0] // ...............................................................*............ 
// gap // ............................................................................ - mul v9.4S, v19.4S, v0.S[0] // .............e.............................................................. + mls v19.4S, v12.4S, v8.S[0] // ....................e....................................................... + mul v22.4S, v24.4S, v2.S[0] // ......................................................*..................... + sqrdmulh v30.4S, v24.4S, v2.S[1] // .....................................................*...................... + mls v26.4S, v27.4S, v8.S[0] // .........................e.................................................. // gap // ............................................................................ - mul v18.4S, v22.4S, v2.S[2] // ..........................................................*................. - mls v20.4S, v26.4S, v8.S[0] // .........................e.................................................. - ldr q21, [x0, #272] // ..e......................................................................... // gap // ............................................................................ - sqrdmulh v6.4S, v22.4S, v2.S[3] // ...........................................................*................ - mls v30.4S, v23.4S, v8.S[0] // ..................................................*......................... - mls v29.4S, v15.4S, v8.S[0] // .......................................................*.................... // gap // ............................................................................ // gap // ............................................................................ + mls v16.4S, v17.4S, v8.S[0] // ..........e................................................................. + sqrdmulh v17.4S, v7.4S, v0.S[1] // .............e.............................................................. + mls v10.4S, v23.4S, v8.S[0] // .................................................................*.......... 
+ mul v6.4S, v7.4S, v0.S[0] // ..............e............................................................. // gap // ............................................................................ // gap // ............................................................................ - sub v12.4S, v5.4S, v12.4S // ...........*................................................................ - sub v22.4S, v10.4S, v7.4S // ...............................*............................................ // gap // ............................................................................ - mls v11.4S, v25.4S, v8.S[0] // ....................e....................................................... // gap // ............................................................................ - add v24.4S, v10.4S, v7.4S // ................................*........................................... + mls v13.4S, v4.4S, v8.S[0] // ..................................................*......................... + add v7.4S, v25.4S, v29.4S // ..............................................................*............. + sub v27.4S, v25.4S, v29.4S // .............................................................*.............. + sub v12.4S, v31.4S, v28.4S // ...............................*............................................ // gap // ............................................................................ // gap // ............................................................................ - mls v28.4S, v13.4S, v8.S[0] // .................................................................*.......... - sqrdmulh v10.4S, v16.4S, v0.S[1] // .........e.................................................................. - mls v9.4S, v17.4S, v8.S[0] // ...............e............................................................ // gap // ............................................................................ 
// gap // ............................................................................ - add v14.4S, v31.4S, v20.4S // ...........................e................................................ + add v23.4S, v14.4S, v19.4S // ......................e..................................................... + sub v14.4S, v14.4S, v19.4S // .....................e...................................................... + mls v22.4S, v30.4S, v8.S[0] // .......................................................*.................... + sub v28.4S, v15.4S, v26.4S // ..........................e................................................. // gap // ............................................................................ // gap // ............................................................................ - sub v5.4S, v12.4S, v27.4S // .........................................*.................................. - mls v18.4S, v6.4S, v8.S[0] // ............................................................*............... - sub v15.4S, v24.4S, v30.4S // ...................................................*........................ // gap // ............................................................................ - add v19.4S, v22.4S, v29.4S // .........................................................*.................. - sub v7.4S, v22.4S, v29.4S // ........................................................*................... // gap // ............................................................................ + str q7, [x0, #512] // ........................................................................*... + mls v6.4S, v17.4S, v8.S[0] // ...............e............................................................ + str q27, [x0, #640] // .........................................................................*.. + add v19.4S, v9.4S, v10.4S // ...................................................................*........ 
+ add v7.4S, v15.4S, v26.4S // ...........................e................................................ // gap // ............................................................................ // gap // ............................................................................ - add v27.4S, v12.4S, v27.4S // ..........................................*................................. + sub v29.4S, v9.4S, v10.4S // ..................................................................*......... + mul v4.4S, v28.4S, v1.S[0] // ............................................e............................... + sqrdmulh v15.4S, v28.4S, v1.S[1] // ...........................................e................................ + mul v26.4S, v14.4S, v1.S[0] // .......................................e.................................... + sqrdmulh v30.4S, v14.4S, v1.S[1] // ......................................e..................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v22.4S, v14.4S, v0.S[2] // .................................e.......................................... - sqrdmulh v25.4S, v14.4S, v0.S[3] // ..................................e......................................... - sub v23.4S, v5.4S, v28.4S // ..................................................................*......... - sub v13.4S, v31.4S, v20.4S // ..........................e................................................. - mul v12.4S, v16.4S, v0.S[0] // ........e................................................................... - str q7, [x0, #384] // .......................................................................*.... 
- add v7.4S, v27.4S, v18.4S // ..............................................................*............. + sqrdmulh v17.4S, v23.4S, v0.S[3] // ............................e............................................... + str q29, [x0, #896] // ...........................................................................* + add v28.4S, v12.4S, v22.4S // .........................................................*.................. + sub v29.4S, v12.4S, v22.4S // ........................................................*................... + sub v14.4S, v18.4S, v13.4S // ...................................................*........................ // gap // ............................................................................ // gap // ............................................................................ - str q19, [x0, #256] // ......................................................................*..... - add v17.4S, v21.4S, v11.4S // ......................e..................................................... - add v20.4S, v5.4S, v28.4S // ...................................................................*........ - str q15, [x0, #128] // .....................................................................*...... // gap // ............................................................................ + str q19, [x0, #768] // ..........................................................................*. + add v10.4S, v18.4S, v13.4S // ....................................................*....................... + mul v22.4S, v7.4S, v0.S[2] // ..................................e......................................... + sqrdmulh v13.4S, v7.4S, v0.S[3] // .................................e.......................................... // gap // ............................................................................ - sub v27.4S, v27.4S, v18.4S // .............................................................*.............. 
- str q23, [x0, #896] // ...........................................................................* - add v23.4S, v24.4S, v30.4S // ....................................................*....................... - sqrdmulh v30.4S, v13.4S, v1.S[1] // ............................................e............................... - mul v18.4S, v13.4S, v1.S[0] // ...........................................e................................ - mls v22.4S, v25.4S, v8.S[0] // ...................................e........................................ - str q7, [x0, #512] // ........................................................................*... - str q20, [x0, #768] // ..........................................................................*. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v4.4S, v15.4S, v8.S[0] // .............................................e.............................. + mls v26.4S, v30.4S, v8.S[0] // ........................................e................................... + str q29, [x0, #384] // .......................................................................*.... + str q14, [x0, #128] // .....................................................................*...... + add v31.4S, v11.4S, v16.4S // ............e............................................................... + sub v18.4S, v21.4S, v6.4S // ................e........................................................... // gap // ............................................................................ // gap // ............................................................................ - add v6.4S, v4.4S, v9.4S // .................e.......................................................... 
- mul v7.4S, v17.4S, v0.S[2] // ............................e............................................... - str q27, [x0, #640] // .........................................................................*.. + add v24.4S, v21.4S, v6.4S // .................e.......................................................... + sub v19.4S, v11.4S, v16.4S // ...........e................................................................ + str q28, [x0, #256] // ......................................................................*..... + str q10, [x0], #(16) // ....................................................................*....... + mul v28.4S, v23.4S, v0.S[2] // .............................e.............................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v15.4S, v21.4S, v11.4S // .....................e...................................................... - str q23, [x0], #(16) // ....................................................................*....... - - // original source code - // ldr q9, [x0, #0] // ...........................................................................*........................................................................... - // ldr q10, [x0, #(1*(1024/8))] // ..........e................................................................|..........e................................................................ - // ldr q11, [x0, #(2*(1024/8))] // .................................e.........................................|.................................e......................................... - // ldr q12, [x0, #(3*(1024/8))] // .....................e.....................................................|.....................e..................................................... 
- // ldr q13, [x0, #(4*(1024/8))] // e..........................................................................|e.......................................................................... - // ldr q14, [x0, #(5*(1024/8))] // ......e....................................................................|......e.................................................................... - // ldr q15, [x0, #(6*(1024/8))] // .......e...................................................................|.......e................................................................... - // ldr q16, [x0, #(7*(1024/8))] // .e.........................................................................|.e......................................................................... - // mul v24.4s, v13.4s, v0.s[0] // .......................................................e...................|.......................................................e................... - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ..........................................e................................|..........................................e................................ - // mls v24.4s, v13.4s, v8.s[0] // ..*........................................................................|..*........................................................................ - // sub v13.4s, v9.4s, v24.4s // .....................................*.....................................|.....................................*..................................... - // add v9.4s, v9.4s, v24.4s // .........................*.................................................|.........................*................................................. - // mul v24.4s, v14.4s, v0.s[0] // ..............................e............................................|..............................e............................................ 
- // sqrdmulh v14.4s, v14.4s, v0.s[1] // .......................e...................................................|.......................e................................................... - // mls v24.4s, v14.4s, v8.s[0] // ...........................................e...............................|...........................................e............................... - // sub v14.4s, v10.4s, v24.4s // .....*.....................................................................|.....*..................................................................... - // add v10.4s, v10.4s, v24.4s // ......................................................................e....|......................................................................e.... - // mul v24.4s, v15.4s, v0.s[0] // ......................e....................................................|......................e.................................................... - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ...........................e...............................................|...........................e............................................... - // mls v24.4s, v15.4s, v8.s[0] // .......................................e...................................|.......................................e................................... - // sub v15.4s, v11.4s, v24.4s // .........................................................................e.|.........................................................................e. - // add v11.4s, v11.4s, v24.4s // ...........................................................e...............|...........................................................e............... - // mul v24.4s, v16.4s, v0.s[0] // ....................e......................................................|....................e...................................................... 
- // sqrdmulh v16.4s, v16.4s, v0.s[1] // ..................e........................................................|..................e........................................................ - // mls v24.4s, v16.4s, v8.s[0] // ................................e..........................................|................................e.......................................... - // sub v16.4s, v12.4s, v24.4s // ......................................................e....................|......................................................e.................... - // add v12.4s, v12.4s, v24.4s // ............................................e..............................|............................................e.............................. - // mul v24.4s, v11.4s, v0.s[2] // .......................................................................e...|.......................................................................e... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ...*.......................................................................|...*....................................................................... - // mls v24.4s, v11.4s, v8.s[0] // ................*..........................................................|................*.......................................................... - // sub v11.4s, v9.4s, v24.4s // ......................................*....................................|......................................*.................................... - // add v9.4s, v9.4s, v24.4s // ........................................*..................................|........................................*.................................. - // mul v24.4s, v12.4s, v0.s[2] // ...................................................e.......................|...................................................e....................... 
- // sqrdmulh v12.4s, v12.4s, v0.s[3] // ....................................................e......................|....................................................e...................... - // mls v24.4s, v12.4s, v8.s[0] // ...................................................................e.......|...................................................................e....... - // sub v12.4s, v10.4s, v24.4s // .........*.................................................................|.........*................................................................. - // add v10.4s, v10.4s, v24.4s // ........*..................................................................|........*.................................................................. - // mul v24.4s, v15.4s, v1.s[0] // ............*..............................................................|............*.............................................................. - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ...........*...............................................................|...........*............................................................... - // mls v24.4s, v15.4s, v8.s[0] // ............................*..............................................|............................*.............................................. - // sub v15.4s, v13.4s, v24.4s // .............................................*.............................|.............................................*............................. - // add v13.4s, v13.4s, v24.4s // ..................................................*........................|..................................................*........................ - // mul v24.4s, v16.4s, v1.s[0] // ..................................................................e........|..................................................................e........ 
- // sqrdmulh v16.4s, v16.4s, v1.s[1] // .................................................................e.........|.................................................................e......... - // mls v24.4s, v16.4s, v8.s[0] // ....*......................................................................|....*...................................................................... - // sub v16.4s, v14.4s, v24.4s // .............*.............................................................|.............*............................................................. - // add v14.4s, v14.4s, v24.4s // ........................*..................................................|........................*.................................................. - // mul v24.4s, v10.4s, v1.s[2] // ...............*...........................................................|...............*........................................................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ..............*............................................................|..............*............................................................ - // mls v24.4s, v10.4s, v8.s[0] // ...................................*.......................................|...................................*....................................... - // sub v10.4s, v9.4s, v24.4s // ...............................................*...........................|...............................................*........................... - // add v9.4s, v9.4s, v24.4s // ................................................................*..........|................................................................*.......... - // mul v24.4s, v12.4s, v2.s[0] // .................*.........................................................|.................*......................................................... 
- // sqrdmulh v12.4s, v12.4s, v2.s[1] // ...................*.......................................................|...................*....................................................... - // mls v24.4s, v12.4s, v8.s[0] // ....................................*......................................|....................................*...................................... - // sub v12.4s, v11.4s, v24.4s // .................................................*.........................|.................................................*......................... - // add v11.4s, v11.4s, v24.4s // ................................................*..........................|................................................*.......................... - // mul v24.4s, v14.4s, v2.s[2] // ...............................*...........................................|...............................*........................................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ..................................*........................................|..................................*........................................ - // mls v24.4s, v14.4s, v8.s[0] // ..............................................*............................|..............................................*............................ - // sub v14.4s, v13.4s, v24.4s // ..............................................................*............|..............................................................*............ - // add v13.4s, v13.4s, v24.4s // .........................................................*.................|.........................................................*................. - // mul v24.4s, v16.4s, v3.s[0] // .............................*.............................................|.............................*............................................. 
- // sqrdmulh v16.4s, v16.4s, v3.s[1] // ..........................*................................................|..........................*................................................ - // mls v24.4s, v16.4s, v8.s[0] // .........................................*.................................|.........................................*................................. - // sub v16.4s, v15.4s, v24.4s // .....................................................*.....................|.....................................................*..................... - // add v15.4s, v15.4s, v24.4s // ............................................................*..............|............................................................*.............. - // str q9, [x0], #(16) // ..........................................................................*|..........................................................................* - // str q10, [x0, #(-16 + 1*(1024/8))] // .............................................................*.............|.............................................................*............. - // str q11, [x0, #(-16 + 2*(1024/8))] // ..........................................................*................|..........................................................*................ - // str q12, [x0, #(-16 + 3*(1024/8))] // ........................................................*..................|........................................................*.................. - // str q13, [x0, #(-16 + 4*(1024/8))] // ....................................................................*......|....................................................................*...... - // str q14, [x0, #(-16 + 5*(1024/8))] // ........................................................................*..|........................................................................*.. 
- // str q15, [x0, #(-16 + 6*(1024/8))] // .....................................................................*.....|.....................................................................*..... - // str q16, [x0, #(-16 + 7*(1024/8))] // ...............................................................*...........|...............................................................*........... + + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q9, [x0, #0] // e...........................................................................~.......................................................................... + // ldr q10, [x0, #(1*(1024/8))] // .......e....................................................................'......~................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..........e.................................................................'.........~................................................................ + // ldr q12, [x0, #(3*(1024/8))] // ................e...........................................................'...............~.......................................................... + // ldr q13, [x0, #(4*(1024/8))] // ......e.....................................................................'.....~.................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .....e......................................................................'....~..................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // .e..........................................................................'~......................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ..e.........................................................................'.~........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ......................e.....................................................'.....................~.................................................... + // mul v24.4s, v13.4s, v0.s[0] // .......................e....................................................'......................~................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................e.........................................'.................................~........................................ + // sub v13.4s, v9.4s, v24.4s // ........................................................................e...'.......................................................................~.. + // add v9.4s, v9.4s, v24.4s // .....................................................................e......'....................................................................~..... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...................................e........................................'..................................~....................................... + // mul v24.4s, v14.4s, v0.s[0] // .....................................e......................................'....................................~..................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................e............................'..............................................~........................... 
+ // sub v14.4s, v10.4s, v24.4s // ......................................................................e.....'.....................................................................~.... + // add v10.4s, v10.4s, v24.4s // .......................................................................e....'......................................................................~... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...................e........................................................'..................~....................................................... + // mul v24.4s, v15.4s, v0.s[0] // ..................e.........................................................'.................~........................................................ + // mls v24.4s, v27.4s, v8.s[0] // ..............................e.............................................'.............................~............................................ + // sub v15.4s, v11.4s, v24.4s // ...........................................e................................'..........................................~............................... + // add v11.4s, v11.4s, v24.4s // ..........................................e.................................'.........................................~................................ + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .....................e......................................................'....................~..................................................... + // mul v24.4s, v16.4s, v0.s[0] // ....................e.......................................................'...................~...................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................e..........................................'................................~......................................... 
+ // sub v16.4s, v12.4s, v24.4s // .............................................e..............................'............................................~............................. + // add v12.4s, v12.4s, v24.4s // ..................................................e.........................'.................................................~........................ + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ........................................................e...................'.......................................................~.................. + // mul v24.4s, v11.4s, v0.s[2] // ...........................................................................e'.......................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............~...............................................................'...........*.............................................................. + // sub v11.4s, v9.4s, v24.4s // .........................................~..................................'........................................*................................. + // add v9.4s, v9.4s, v24.4s // .............................~..............................................'............................*............................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ................................................................e...........'...............................................................~.......... + // mul v24.4s, v12.4s, v0.s[2] // ...............................................................e............'..............................................................~........... + // mls v24.4s, v27.4s, v8.s[0] // ...~........................................................................'..*....................................................................... 
+ // sub v12.4s, v10.4s, v24.4s // .........................~..................................................'........................*................................................. + // add v10.4s, v10.4s, v24.4s // ...............~............................................................'..............*........................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .......................................................e....................'......................................................~................... + // mul v24.4s, v15.4s, v1.s[0] // ......................................................e.....................'.....................................................~.................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................e.........'.................................................................~........ + // sub v15.4s, v13.4s, v24.4s // ...........~................................................................'..........*............................................................... + // add v13.4s, v13.4s, v24.4s // ....~.......................................................................'...*...................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .....................................................e......................'....................................................~..................... + // mul v24.4s, v16.4s, v1.s[0] // ....................................................e.......................'...................................................~...................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................................e..........'................................................................~......... 
+ // sub v16.4s, v14.4s, v24.4s // .........~..................................................................'........*................................................................. + // add v14.4s, v14.4s, v24.4s // ........~...................................................................'.......*.................................................................. + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ............................~...............................................'...........................*.............................................. + // mul v24.4s, v10.4s, v1.s[2] // ..........................~.................................................'.........................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // ......................................~.....................................'.....................................*.................................... + // sub v10.4s, v9.4s, v24.4s // ............................................................~...............'...........................................................*.............. + // add v9.4s, v9.4s, v24.4s // ..............................................................~.............'.............................................................*............ + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ................................~...........................................'...............................*.......................................... + // mul v24.4s, v12.4s, v2.s[0] // ...............................~............................................'..............................*........................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................~...............................'...........................................*.............................. 
+ // sub v12.4s, v11.4s, v24.4s // ...........................................................~................'..........................................................*............... + // add v11.4s, v11.4s, v24.4s // ..........................................................~.................'.........................................................*................ + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ..............~.............................................................'.............*............................................................ + // mul v24.4s, v14.4s, v2.s[2] // .............~..............................................................'............*............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................~................................................'..........................*............................................... + // sub v14.4s, v13.4s, v24.4s // ........................................~...................................'.......................................*.................................. + // add v13.4s, v13.4s, v24.4s // .......................................~....................................'......................................*................................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................~..........................................................'................*......................................................... + // mul v24.4s, v16.4s, v3.s[0] // ........................~...................................................'.......................*.................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................~.......................................'...................................*...................................... 
+ // sub v16.4s, v15.4s, v24.4s // ...................................................~........................'..................................................*....................... + // add v15.4s, v15.4s, v24.4s // .................................................~..........................'................................................*......................... + // str q9, [x0], #(16) // ..........................................................................~.'.........................................................................* + // str q10, [x0, #(-16 + 1*(1024/8))] // ....................................................................~.......'...................................................................*...... + // str q11, [x0, #(-16 + 2*(1024/8))] // .........................................................................~..'........................................................................*. + // str q12, [x0, #(-16 + 3*(1024/8))] // ...................................................................~........'..................................................................*....... + // str q13, [x0, #(-16 + 4*(1024/8))] // ..............................................~.............................'.............................................*............................ + // str q14, [x0, #(-16 + 5*(1024/8))] // ................................................~...........................'...............................................*.......................... + // str q15, [x0, #(-16 + 6*(1024/8))] // .............................................................~..............'............................................................*............. + // str q16, [x0, #(-16 + 7*(1024/8))] // .........................................................~..................'........................................................*................. 
sub count, count, #1 cbnz count, layer123_start - mls v18.4S, v30.4S, v8.S[0] // ...*........................................... - sub v27.4S, v4.4S, v9.4S // ....*.......................................... - add v5.4S, v6.4S, v22.4S // .....*......................................... - sub v29.4S, v6.4S, v22.4S // ......*........................................ - ldr q11, [x0, #0] // *.............................................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v12.4S, v10.4S, v8.S[0] // .*............................................. - sqrdmulh v9.4S, v17.4S, v0.S[3] // ..*............................................ - mul v28.4S, v15.4S, v1.S[0] // ........*...................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v14.4S, v15.4S, v1.S[1] // .......*....................................... - sqrdmulh v31.4S, v5.4S, v1.S[3] // ..........*.................................... - sqrdmulh v25.4S, v29.4S, v2.S[1] // ..............*................................ - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v23.4S, v27.4S, v18.4S // .........*..................................... - add v18.4S, v27.4S, v18.4S // ...............*............................... - mul v16.4S, v5.4S, v1.S[2] // ...........*................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v7.4S, v9.4S, v8.S[0] // ............*.................................. - mul v20.4S, v29.4S, v2.S[0] // .............*................................. - add v10.4S, v11.4S, v12.4S // ................*.............................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v19.4S, v23.4S, v3.S[0] // ...................*........................... - sqrdmulh v23.4S, v23.4S, v3.S[1] // .................*............................. - mul v24.4S, v18.4S, v2.S[2] // ....................*.......................... - sqrdmulh v29.4S, v18.4S, v2.S[3] // .....................*......................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v28.4S, v14.4S, v8.S[0] // ..................*............................ - mls v16.4S, v31.4S, v8.S[0] // ......................*........................ - sub v27.4S, v11.4S, v12.4S // ........................*...................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- mls v20.4S, v25.4S, v8.S[0] // .......................*....................... - add v30.4S, v10.4S, v7.4S // ..........................*.................... - sub v17.4S, v10.4S, v7.4S // .........................*..................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v19.4S, v23.4S, v8.S[0] // ...........................*................... - mls v24.4S, v29.4S, v8.S[0] // .............................*................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v25.4S, v27.4S, v28.4S // ............................*.................. - sub v21.4S, v30.4S, v16.4S // ..............................*................ - add v22.4S, v30.4S, v16.4S // ..........................................*.... - add v4.4S, v27.4S, v28.4S // .................................*............. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - add v5.4S, v17.4S, v20.4S // ...............................*............... - sub v23.4S, v17.4S, v20.4S // ................................*.............. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v18.4S, v25.4S, v19.4S // ..................................*............ - add v7.4S, v25.4S, v19.4S // ......................................*........ - str q21, [x0, #128] // .......................................*....... - str q22, [x0], #(16) // ..............................................* - add v10.4S, v4.4S, v24.4S // ....................................*.......... - sub v9.4S, v4.4S, v24.4S // ........................................*...... - // gap // ............................................... - // gap // ............................................... - str q5, [x0, #240] // .....................................*......... - str q23, [x0, #368] // ...................................*........... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - str q18, [x0, #880] // .........................................*..... - str q7, [x0, #752] // ............................................*.. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - str q10, [x0, #496] // ...........................................*... - str q9, [x0, #624] // .............................................*. - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - - // original source code - // ldr q5, [x0, #0] // ....*.......................................... - // mls v12.4S, v10.4S, v8.S[0] // .....*......................................... - // sqrdmulh v11.4S, v17.4S, v0.S[3] // ......*........................................ - // mls v18.4S, v30.4S, v8.S[0] // *.............................................. - // sub v21.4S, v4.4S, v9.4S // .*............................................. - // add v31.4S, v6.4S, v22.4S // ..*............................................ - // sub v9.4S, v6.4S, v22.4S // ...*........................................... - // sqrdmulh v24.4S, v15.4S, v1.S[1] // ........*...................................... - // mul v27.4S, v15.4S, v1.S[0] // .......*....................................... - // sub v6.4S, v21.4S, v18.4S // ...........*................................... - // sqrdmulh v23.4S, v31.4S, v1.S[3] // .........*..................................... - // mul v30.4S, v31.4S, v1.S[2] // .............*................................. - // mls v7.4S, v11.4S, v8.S[0] // ..............*................................ - // mul v29.4S, v9.4S, v2.S[0] // ...............*............................... - // sqrdmulh v15.4S, v9.4S, v2.S[1] // ..........*.................................... - // add v22.4S, v21.4S, v18.4S // ............*.................................. - // add v10.4S, v5.4S, v12.4S // ................*.............................. - // sqrdmulh v13.4S, v6.4S, v3.S[1] // ..................*............................ - // mls v27.4S, v24.4S, v8.S[0] // .....................*......................... 
- // mul v28.4S, v6.4S, v3.S[0] // .................*............................. - // mul v18.4S, v22.4S, v2.S[2] // ...................*........................... - // sqrdmulh v6.4S, v22.4S, v2.S[3] // ....................*.......................... - // mls v30.4S, v23.4S, v8.S[0] // ......................*........................ - // mls v29.4S, v15.4S, v8.S[0] // ........................*...................... - // sub v12.4S, v5.4S, v12.4S // .......................*....................... - // sub v22.4S, v10.4S, v7.4S // ..........................*.................... - // add v24.4S, v10.4S, v7.4S // .........................*..................... - // mls v28.4S, v13.4S, v8.S[0] // ...........................*................... - // sub v5.4S, v12.4S, v27.4S // .............................*................. - // mls v18.4S, v6.4S, v8.S[0] // ............................*.................. - // sub v15.4S, v24.4S, v30.4S // ..............................*................ - // add v19.4S, v22.4S, v29.4S // .................................*............. - // sub v7.4S, v22.4S, v29.4S // ..................................*............ - // add v27.4S, v12.4S, v27.4S // ................................*.............. - // sub v23.4S, v5.4S, v28.4S // ...................................*........... - // str q7, [x0, #384] // ..........................................*.... - // add v7.4S, v27.4S, v18.4S // .......................................*....... - // str q19, [x0, #256] // .........................................*..... - // add v20.4S, v5.4S, v28.4S // ....................................*.......... - // str q15, [x0, #128] // .....................................*......... - // sub v27.4S, v27.4S, v18.4S // ........................................*...... - // str q23, [x0, #896] // ...........................................*... - // add v23.4S, v24.4S, v30.4S // ...............................*............... 
- // str q7, [x0, #512] // .............................................*. - // str q20, [x0, #768] // ............................................*.. - // str q27, [x0, #640] // ..............................................* - // str q23, [x0], #(16) // ......................................*........ + // Instructions: 38 + // Expected cycles: 15 + // Expected IPC: 2.53 + // + // Wall time: 0.52s + // User time: 0.52s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + mls v22.4S, v13.4S, v8.S[0] // *..................................... + add v13.4S, v19.4S, v26.4S // .*.................................... + add v14.4S, v18.4S, v4.4S // ..*................................... + sub v15.4S, v18.4S, v4.4S // ...*.................................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v29.4S, v19.4S, v26.4S // ....*................................. + mls v28.4S, v17.4S, v8.S[0] // .....*................................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v18.4S, v15.4S, v3.S[1] // .........*............................ + mul v21.4S, v15.4S, v3.S[0] // ..........*........................... + sqrdmulh v19.4S, v14.4S, v2.S[3] // .......*.............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ sub v16.4S, v24.4S, v22.4S // ...........*.......................... + add v25.4S, v24.4S, v22.4S // ........*............................. + mul v30.4S, v14.4S, v2.S[2] // ......*............................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v11.4S, v31.4S, v28.4S // ...............*...................... + sub v31.4S, v31.4S, v28.4S // ......................*............... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v26.4S, v16.4S, v2.S[1] // .................*.................... + mls v21.4S, v18.4S, v8.S[0] // ..................*................... + sqrdmulh v28.4S, v25.4S, v1.S[3] // ..............*....................... + mul v23.4S, v25.4S, v1.S[2] // ............*......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v30.4S, v19.4S, v8.S[0] // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v16.4S, v16.4S, v2.S[0] // ................*..................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v25.4S, v29.4S, v21.4S // ...........................*.......... + add v27.4S, v29.4S, v21.4S // ..........................*........... + mls v23.4S, v28.4S, v8.S[0] // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v16.4S, v26.4S, v8.S[0] // .......................*.............. + add v20.4S, v13.4S, v30.4S // ....................*................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v12.4S, v13.4S, v30.4S // .....................*................ + str q27, [x0, #768] // ................................*..... + str q25, [x0, #896] // ............................*......... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v19.4S, v11.4S, v23.4S // ...............................*...... + add v14.4S, v11.4S, v23.4S // .................................*.... + str q20, [x0, #512] // ........................*............. + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v5.4S, v31.4S, v16.4S // .............................*........ + sub v10.4S, v31.4S, v16.4S // ..............................*....... + str q12, [x0, #640] // .........................*............ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q19, [x0, #128] // ...................................*.. + str q14, [x0], #(16) // .....................................* + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q5, [x0, #240] // ....................................*. + str q10, [x0, #368] // ..................................*... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // mls v22.4S, v13.4S, v8.S[0] // *..................................... + // add v25.4S, v19.4S, v26.4S // .*.................................... + // add v6.4S, v18.4S, v4.4S // ..*................................... + // sub v4.4S, v18.4S, v4.4S // ...*.................................. + // sub v9.4S, v19.4S, v26.4S // ....*................................. 
+ // mls v28.4S, v17.4S, v8.S[0] // .....*................................ + // mul v29.4S, v6.4S, v2.S[2] // ...........*.......................... + // sqrdmulh v30.4S, v6.4S, v2.S[3] // ........*............................. + // add v6.4S, v24.4S, v22.4S // ..........*........................... + // sqrdmulh v23.4S, v4.4S, v3.S[1] // ......*............................... + // mul v10.4S, v4.4S, v3.S[0] // .......*.............................. + // sub v24.4S, v24.4S, v22.4S // .........*............................ + // mul v13.4S, v6.4S, v1.S[2] // .................*.................... + // mls v29.4S, v30.4S, v8.S[0] // ..................*................... + // sqrdmulh v4.4S, v6.4S, v1.S[3] // ................*..................... + // add v18.4S, v31.4S, v28.4S // ............*......................... + // mul v22.4S, v24.4S, v2.S[0] // ...................*.................. + // sqrdmulh v30.4S, v24.4S, v2.S[1] // ..............*....................... + // mls v10.4S, v23.4S, v8.S[0] // ...............*...................... + // mls v13.4S, v4.4S, v8.S[0] // ......................*............... + // add v7.4S, v25.4S, v29.4S // ........................*............. + // sub v27.4S, v25.4S, v29.4S // .........................*............ + // sub v12.4S, v31.4S, v28.4S // .............*........................ + // mls v22.4S, v30.4S, v8.S[0] // .......................*.............. + // str q7, [x0, #512] // ..............................*....... + // str q27, [x0, #640] // .................................*.... + // add v19.4S, v9.4S, v10.4S // .....................*................ + // sub v29.4S, v9.4S, v10.4S // ....................*................. + // str q29, [x0, #896] // ...........................*.......... + // add v28.4S, v12.4S, v22.4S // ...............................*...... + // sub v29.4S, v12.4S, v22.4S // ................................*..... + // sub v14.4S, v18.4S, v13.4S // ............................*......... 
+ // str q19, [x0, #768] // ..........................*........... + // add v10.4S, v18.4S, v13.4S // .............................*........ + // str q29, [x0, #384] // .....................................* + // str q14, [x0, #128] // ..................................*... + // str q28, [x0, #256] // ....................................*. + // str q10, [x0], #(16) // ...................................*.. restore inp, STACK0 @@ -939,1074 +947,1110 @@ layer123_start: qform_root3_tw .req q7 .p2align 2 - // gap // .............................................................................................. - ldr q1, [x4], #64 // ..*........................................................................................... - // gap // .............................................................................................. - ldr q11, [x2, #48] // .*............................................................................................ - ldr q5, [x2, #32] // ....*......................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q22, [x2, #16] // .....*........................................................................................ - ldr q15, [x2, #0] // *............................................................................................. - ldr q16, [x5, #48] // ...........*.................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q17, [x1, #32] // ...*.......................................................................................... - ldr q3, [x4, #-48] // ...............*.............................................................................. - ldr q6, [x5, #96] // ....................................*......................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - ldr q28, [x1, #48] // ........*..................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v7.4S, v11.4S, v1.S[1] // ......*....................................................................................... - mul v12.4S, v11.4S, v1.S[0] // .......*...................................................................................... - ldr q24, [x4, #-32] // ......................................*....................................................... - // gap // .............................................................................................. - sqrdmulh v27.4S, v5.4S, v1.S[1] // ................*............................................................................. - mul v2.4S, v5.4S, v1.S[0] // ..................*........................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - sqrdmulh v11.4S, v22.4S, v1.S[1] // ..............*............................................................................... - mul v22.4S, v22.4S, v1.S[0] // .............*................................................................................ - ldr q30, [x1, #0] // ....................*......................................................................... - sqrdmulh v13.4S, v15.4S, v1.S[1] // .........*.................................................................................... - mul v18.4S, v15.4S, v1.S[0] // ..........*................................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v12.4S, v7.4S, v8.S[0] // .................*............................................................................ - mls v2.4S, v27.4S, v8.S[0] // ........................*..................................................................... - ldr q15, [x1, #16] // ............*................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v22.4S, v11.4S, v8.S[0] // .....................*........................................................................ 
- mls v18.4S, v13.4S, v8.S[0] // ...................*.......................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v9.4S, v17.4S, v2.4S // .................................*............................................................ - add v20.4S, v28.4S, v12.4S // ......................*....................................................................... 
- sub v31.4S, v17.4S, v2.4S // ................................*............................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v19.4S, v15.4S, v22.4S // .........................*.................................................................... - sub v11.4S, v28.4S, v12.4S // .......................*...................................................................... - add v29.4S, v15.4S, v22.4S // ..........................*................................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v27.4S, v20.4S, v1.S[2] // ...........................*.................................................................. - sqrdmulh v4.4S, v20.4S, v1.S[3] // ............................*................................................................. - mul v0.4S, v9.4S, v1.S[2] // ..........................................*................................................... - sqrdmulh v9.4S, v9.4S, v1.S[3] // .........................................*.................................................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v12.4S, v11.4S, v3.S[0] // .............................*................................................................ - sqrdmulh v20.4S, v11.4S, v3.S[1] // ...............................*.............................................................. - add v5.4S, v30.4S, v18.4S // ...................................*.......................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v28.4S, v30.4S, v18.4S // .......................................*...................................................... - ldr q22, [x4, #-16] // ..............................*............................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v27.4S, v4.4S, v8.S[0] // ..................................*........................................................... - mls v0.4S, v9.4S, v8.S[0] // ....................................................*......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v12.4S, v20.4S, v8.S[0] // .....................................*........................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - sqrdmulh v9.4S, v31.4S, v3.S[1] // ........................................*..................................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v20.4S, v5.4S, v0.4S // .............................................................*................................ - sub v7.4S, v29.4S, v27.4S // ...........................................*.................................................. - add v21.4S, v29.4S, v27.4S // ............................................*................................................. - mul v29.4S, v31.4S, v3.S[0] // .............................................*................................................ - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- add v2.4S, v19.4S, v12.4S // ...............................................*.............................................. - sub v11.4S, v19.4S, v12.4S // ..............................................*............................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v25.4S, v21.4S, v3.S[2] // ................................................*............................................. - sqrdmulh v10.4S, v21.4S, v3.S[3] // .................................................*............................................ - mul v27.4S, v7.4S, v24.S[0] // ..................................................*........................................... - sqrdmulh v7.4S, v7.4S, v24.S[1] // ...................................................*.......................................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v26.4S, v11.4S, v22.S[0] // ......................................................*....................................... 
- sqrdmulh v12.4S, v11.4S, v22.S[1] // .......................................................*...................................... - mls v29.4S, v9.4S, v8.S[0] // .....................................................*........................................ - sqrdmulh v23.4S, v2.4S, v24.S[3] // .........................................................*.................................... - ldr q9, [x5, #16] // ...........................................................................*.................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mul v22.4S, v2.4S, v24.S[2] // ........................................................*..................................... - sub v2.4S, v5.4S, v0.4S // ............................................................*................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v27.4S, v7.4S, v8.S[0] // ...........................................................*.................................. - mls v25.4S, v10.4S, v8.S[0] // ..........................................................*................................... 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - sub v11.4S, v28.4S, v29.4S // ...............................................................*.............................. - add v7.4S, v28.4S, v29.4S // .................................................................*............................ - mls v26.4S, v12.4S, v8.S[0] // ..............................................................*............................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - mls v22.4S, v23.4S, v8.S[0] // ................................................................*............................. - ldr q28, [x5, #64] // ....................................................................................*......... - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v4.4S, v2.4S, v27.4S // ....................................................................*......................... - sub v27.4S, v2.4S, v27.4S // ...................................................................*.......................... - add v18.4S, v20.4S, v25.4S // ..................................................................*........................... - sub v12.4S, v20.4S, v25.4S // .....................................................................*........................ - ldr q20, [x5], #(12*16) // .........................................................................*.................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v2.4S, v11.4S, v26.4S // .............................................................................*................ - sub v26.4S, v11.4S, v26.4S // ......................................................................*....................... - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - add v31.4S, v7.4S, v22.4S // .......................................................................*...................... - sub v19.4S, v7.4S, v22.4S // ...............................................................................*.............. - trn2 v10.4S, v4.4S, v27.4S // ..........................................................................*................... - trn2 v15.4S, v18.4S, v12.4S // ........................................................................*..................... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - trn1 v23.4S, v18.4S, v12.4S // ..............................................................................*............... - trn1 v0.4S, v4.4S, v27.4S // ...................................................................................*.......... - trn1 v22.4S, v2.4S, v26.4S // .................................................................................*............ - ldr q27, [x5, #-64] // .............................................................................................* - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - trn2 v11.2D, v15.2D, v10.2D // ................................................................................*............. - trn2 v24.4S, v31.4S, v19.4S // .....................................................................................*........ - trn1 v31.4S, v31.4S, v19.4S // ........................................................................................*..... - trn2 v7.4S, v2.4S, v26.4S // ..................................................................................*........... - ldr q26, [x5, #-80] // ............................................................................*................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - trn2 v13.2D, v23.2D, v0.2D // .........................................................................................*.... - trn1 v25.2D, v23.2D, v0.2D // ..........................................................................................*... - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. 
- // gap // .............................................................................................. - // gap // .............................................................................................. - mul v21.4S, v11.4S, v20.4S // ......................................................................................*....... - sqrdmulh v23.4S, v11.4S, v9.4S // .......................................................................................*...... - trn2 v0.2D, v24.2D, v7.2D // ...........................................................................................*.. - trn2 v12.2D, v31.2D, v22.2D // ............................................................................................*. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - // gap // .............................................................................................. - - // original source code - // ldr q30, [x2, #0] // ....*......................................................................................... - // ldr q19, [x2, #48] // .*............................................................................................ - // ldr q3, [x4], #64 // *............................................................................................. - // ldr q10, [x1, #32] // ......*....................................................................................... - // ldr q17, [x2, #32] // ..*........................................................................................... - // ldr q1, [x2, #16] // ...*.......................................................................................... 
- // sqrdmulh v24.4S, v19.4S, v3.S[1] // ..........*................................................................................... - // mul v20.4S, v19.4S, v3.S[0] // ...........*.................................................................................. - // ldr q22, [x1, #48] // .........*.................................................................................... - // sqrdmulh v21.4S, v30.4S, v3.S[1] // ..................*........................................................................... - // mul v0.4S, v30.4S, v3.S[0] // ...................*.......................................................................... - // ldr q16, [x5, #48] // .....*........................................................................................ - // ldr q19, [x1, #16] // ......................*....................................................................... - // mul v9.4S, v1.4S, v3.S[0] // ................*............................................................................. - // sqrdmulh v30.4S, v1.4S, v3.S[1] // ...............*.............................................................................. - // ldr q1, [x4, #-48] // .......*...................................................................................... - // sqrdmulh v13.4S, v17.4S, v3.S[1] // .............*................................................................................ - // mls v20.4S, v24.4S, v8.S[0] // ....................*......................................................................... - // mul v11.4S, v17.4S, v3.S[0] // ..............*............................................................................... - // mls v0.4S, v21.4S, v8.S[0] // ........................*..................................................................... - // ldr q28, [x1, #0] // .................*............................................................................ 
- // mls v9.4S, v30.4S, v8.S[0] // .......................*...................................................................... - // add v17.4S, v22.4S, v20.4S // ..........................*................................................................... - // sub v26.4S, v22.4S, v20.4S // .............................*................................................................ - // mls v11.4S, v13.4S, v8.S[0] // .....................*........................................................................ - // sub v25.4S, v19.4S, v9.4S // ............................*................................................................. - // add v23.4S, v19.4S, v9.4S // ..............................*............................................................... - // mul v2.4S, v17.4S, v3.S[2] // ...............................*.............................................................. - // sqrdmulh v4.4S, v17.4S, v3.S[3] // ................................*............................................................. - // mul v7.4S, v26.4S, v1.S[0] // ...................................*.......................................................... - // ldr q14, [x4, #-16] // .......................................*...................................................... - // sqrdmulh v17.4S, v26.4S, v1.S[1] // ....................................*......................................................... - // sub v5.4S, v10.4S, v11.4S // ...........................*.................................................................. - // add v11.4S, v10.4S, v11.4S // .........................*.................................................................... - // mls v2.4S, v4.4S, v8.S[0] // ........................................*..................................................... - // add v10.4S, v28.4S, v0.4S // .....................................*........................................................ 
- // ldr q6, [x5, #96] // ........*..................................................................................... - // mls v7.4S, v17.4S, v8.S[0] // ..........................................*................................................... - // ldr q29, [x4, #-32] // ............*................................................................................. - // sub v15.4S, v28.4S, v0.4S // ......................................*....................................................... - // sqrdmulh v0.4S, v5.4S, v1.S[1] // ...........................................*.................................................. - // sqrdmulh v27.4S, v11.4S, v3.S[3] // ..................................*........................................................... - // mul v11.4S, v11.4S, v3.S[2] // .................................*............................................................ - // sub v3.4S, v23.4S, v2.4S // .............................................*................................................ - // add v23.4S, v23.4S, v2.4S // ..............................................*............................................... - // mul v2.4S, v5.4S, v1.S[0] // ...............................................*.............................................. - // sub v31.4S, v25.4S, v7.4S // .................................................*............................................ - // add v25.4S, v25.4S, v7.4S // ................................................*............................................. - // mul v7.4S, v23.4S, v1.S[2] // ..................................................*........................................... - // sqrdmulh v1.4S, v23.4S, v1.S[3] // ...................................................*.......................................... - // mul v30.4S, v3.4S, v29.S[0] // ....................................................*......................................... 
- // sqrdmulh v3.4S, v3.4S, v29.S[1] // .....................................................*........................................ - // mls v11.4S, v27.4S, v8.S[0] // .........................................*.................................................... - // mls v2.4S, v0.4S, v8.S[0] // ........................................................*..................................... - // mul v28.4S, v31.4S, v14.S[0] // ......................................................*....................................... - // sqrdmulh v27.4S, v31.4S, v14.S[1] // .......................................................*...................................... - // mul v0.4S, v25.4S, v29.S[2] // ...........................................................*.................................. - // sqrdmulh v29.4S, v25.4S, v29.S[3] // .........................................................*.................................... - // mls v7.4S, v1.4S, v8.S[0] // ..............................................................*............................... - // mls v30.4S, v3.4S, v8.S[0] // .............................................................*................................ - // sub v14.4S, v10.4S, v11.4S // ............................................................*................................. - // add v4.4S, v10.4S, v11.4S // ............................................*................................................. - // mls v28.4S, v27.4S, v8.S[0] // .................................................................*............................ - // sub v11.4S, v15.4S, v2.4S // ...............................................................*.............................. - // mls v0.4S, v29.4S, v8.S[0] // ..................................................................*........................... - // add v21.4S, v15.4S, v2.4S // ................................................................*............................. 
- // add v19.4S, v4.4S, v7.4S // ......................................................................*....................... - // sub v1.4S, v14.4S, v30.4S // .....................................................................*........................ - // add v14.4S, v14.4S, v30.4S // ....................................................................*......................... - // sub v2.4S, v4.4S, v7.4S // .......................................................................*...................... - // sub v27.4S, v11.4S, v28.4S // ..........................................................................*................... - // add v30.4S, v21.4S, v0.4S // ...........................................................................*.................. - // trn2 v15.4S, v19.4S, v2.4S // ..............................................................................*............... - // ldr q20, [x5], #(12*16) // ........................................................................*..................... - // trn2 v10.4S, v14.4S, v1.4S // .............................................................................*................ - // ldr q9, [x5, #-176] // ..........................................................*................................... - // ldr q26, [x5, #-80] // .......................................................................................*...... - // add v11.4S, v11.4S, v28.4S // .........................................................................*.................... - // trn1 v29.4S, v19.4S, v2.4S // ...............................................................................*.............. - // sub v0.4S, v21.4S, v0.4S // ............................................................................*................. - // trn2 v2.2D, v15.2D, v10.2D // ...................................................................................*.......... 
- // trn1 v22.4S, v11.4S, v27.4S // .................................................................................*............ - // trn2 v7.4S, v11.4S, v27.4S // ......................................................................................*....... - // trn1 v18.4S, v14.4S, v1.4S // ................................................................................*............. - // ldr q28, [x5, #-128] // ...................................................................*.......................... - // trn2 v24.4S, v30.4S, v0.4S // ....................................................................................*......... - // mul v21.4S, v2.4S, v20.4S // ..........................................................................................*... - // sqrdmulh v23.4S, v2.4S, v9.4S // ...........................................................................................*.. - // trn1 v31.4S, v30.4S, v0.4S // .....................................................................................*........ - // trn2 v13.2D, v29.2D, v18.2D // ........................................................................................*..... - // trn1 v25.2D, v29.2D, v18.2D // .........................................................................................*.... - // trn2 v0.2D, v24.2D, v7.2D // ............................................................................................*. - // trn2 v12.2D, v31.2D, v22.2D // .............................................................................................* - // ldr q27, [x5, #-64] // ..................................................................................*........... 
+ // Instructions: 98 + // Expected cycles: 34 + // Expected IPC: 2.88 + // + // Wall time: 16.32s + // User time: 16.32s + // + // --------------------------------------- original position ---------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|---------------------- + ldr q5, [x2, #48] // ..*............................................................................................... + ldr q31, [x4], #64 // *................................................................................................. + ldr q28, [x2, #32] // .*................................................................................................ + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + ldr q20, [x2, #16] // ....*............................................................................................. + ldr q6, [x5, #96] // ............................................*..................................................... + ldr q16, [x2, #0] // ...*.............................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + ldr q7, [x1, #0] // ....................*............................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ ldr q1, [x1, #16] // .......*.......................................................................................... + sqrdmulh v24.4S, v28.4S, v31.S[1] // ........*......................................................................................... + mul v25.4S, v28.4S, v31.S[0] // ...........*...................................................................................... + sqrdmulh v17.4S, v5.4S, v31.S[1] // .........*........................................................................................ + mul v14.4S, v5.4S, v31.S[0] // ..........*....................................................................................... + ldr q10, [x5], #(12*16) // .....*............................................................................................ + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mul v27.4S, v20.4S, v31.S[0] // .................*................................................................................ + sqrdmulh v23.4S, v20.4S, v31.S[1] // ................*................................................................................. + ldr q18, [x1, #48] // ......*........................................................................................... + mul v3.4S, v16.4S, v31.S[0] // .............*.................................................................................... + sqrdmulh v15.4S, v16.4S, v31.S[1] // ..............*................................................................................... + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + ldr q2, [x1, #32] // ............*..................................................................................... + mls v25.4S, v24.4S, v8.S[0] // ...................*.............................................................................. + mls v14.4S, v17.4S, v8.S[0] // ...............*.................................................................................. + ldr q21, [x4, #-48] // ..................*............................................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + mls v27.4S, v23.4S, v8.S[0] // ........................*......................................................................... + mls v3.4S, v15.4S, v8.S[0] // .....................*............................................................................ + ldr q5, [x5, #-160] // .............................................................................*.................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ add v20.4S, v2.4S, v25.4S // ..........................*....................................................................... + sub v16.4S, v2.4S, v25.4S // .........................*........................................................................ + add v25.4S, v18.4S, v14.4S // ......................*........................................................................... + sub v18.4S, v18.4S, v14.4S // .......................*.......................................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + sub v2.4S, v7.4S, v3.4S // ...............................*.................................................................. + sub v26.4S, v1.4S, v27.4S // .....................................*............................................................ + add v27.4S, v1.4S, v27.4S // ..................................*............................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ mul v1.4S, v16.4S, v21.S[0] // ....................................*............................................................. + sqrdmulh v4.4S, v16.4S, v21.S[1] // ..............................*................................................................... + sqrdmulh v22.4S, v25.4S, v31.S[3] // ...................................*.............................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mul v25.4S, v25.4S, v31.S[2] // ............................*..................................................................... + sqrdmulh v9.4S, v20.4S, v31.S[3] // ................................*................................................................. + mul v28.4S, v20.4S, v31.S[2] // .................................*................................................................ + sqrdmulh v31.4S, v18.4S, v21.S[1] // .............................*.................................................................... + mul v23.4S, v18.4S, v21.S[0] // ...........................*...................................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ add v16.4S, v7.4S, v3.4S // .......................................*.......................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mls v1.4S, v4.4S, v8.S[0] // ..............................................*................................................... + mls v25.4S, v22.4S, v8.S[0] // .............................................*.................................................... + ldr q22, [x4, #-32] // .........................................*........................................................ + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mls v28.4S, v9.4S, v8.S[0] // ..........................................*....................................................... 
+ mls v23.4S, v31.4S, v8.S[0] // ......................................*........................................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + ldr q7, [x4, #-16] // ...........................................*...................................................... + add v15.4S, v27.4S, v25.4S // ....................................................*............................................. + sub v27.4S, v27.4S, v25.4S // .....................................................*............................................ 
+ add v25.4S, v2.4S, v1.4S // ..............................................................*................................... + sub v1.4S, v2.4S, v1.4S // ............................................................*..................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + sub v17.4S, v26.4S, v23.4S // ...............................................*.................................................. + add v31.4S, v26.4S, v23.4S // ................................................*................................................. + sub v20.4S, v16.4S, v28.4S // .................................................................*................................ + add v30.4S, v16.4S, v28.4S // .............................................................*.................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mul v23.4S, v15.4S, v21.S[2] // ........................................................*......................................... + sqrdmulh v2.4S, v15.4S, v21.S[3] // ..........................................................*....................................... 
+ mul v24.4S, v27.4S, v22.S[0] // .........................................................*........................................ + sqrdmulh v27.4S, v27.4S, v22.S[1] // ...........................................................*...................................... + ldr q21, [x5, #-80] // .................................................................................*................ + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mul v9.4S, v17.4S, v7.S[0] // ...................................................*.............................................. + sqrdmulh v17.4S, v17.4S, v7.S[1] // ......................................................*........................................... + mul v7.4S, v31.4S, v22.S[2] // .................................................*................................................ + sqrdmulh v31.4S, v31.4S, v22.S[3] // .......................................................*.......................................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mls v23.4S, v2.4S, v8.S[0] // ...................................................................*.............................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + mls v24.4S, v27.4S, v8.S[0] // ..................................................................*............................... + mls v7.4S, v31.4S, v8.S[0] // ................................................................*................................. + mls v9.4S, v17.4S, v8.S[0] // ...............................................................*.................................. + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + sub v16.4S, v20.4S, v24.4S // ...........................................................................*...................... + add v12.4S, v30.4S, v23.4S // ..........................................................................*....................... + add v2.4S, v20.4S, v24.4S // ........................................................................*......................... + sub v27.4S, v30.4S, v23.4S // .........................................................................*........................ 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + ldr q20, [x5, #-144] // .....................................................................................*............ + ldr q30, [x5, #-176] // ..................................................*............................................... + add v18.4S, v1.4S, v9.4S // ......................................................................*........................... + add v23.4S, v25.4S, v7.4S // .....................................................................*............................ + sub v31.4S, v25.4S, v7.4S // ....................................................................*............................. + sub v17.4S, v1.4S, v9.4S // .......................................................................*.......................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + trn1 v9.4S, v2.4S, v16.4S // ........................................................................................*......... + trn2 v3.4S, v2.4S, v16.4S // ..................................................................................*............... + trn2 v19.4S, v12.4S, v27.4S // ...................................................................................*.............. + trn1 v24.4S, v12.4S, v27.4S // ......................................................................................*........... 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + trn2 v1.4S, v18.4S, v17.4S // ...............................................................................*.................. + trn1 v29.4S, v18.4S, v17.4S // ................................................................................*................. + trn2 v27.4S, v23.4S, v31.4S // ............................................................................*..................... + trn1 v22.4S, v23.4S, v31.4S // ..............................................................................*................... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + trn2 v16.2D, v19.2D, v3.2D // .........................................................................................*........ + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + trn1 v31.2D, v27.2D, v1.2D // ................................................................................................*. + trn2 v1.2D, v27.2D, v1.2D // ....................................................................................*............. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + trn2 v0.2D, v22.2D, v29.2D // .......................................................................................*.......... + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. + // gap // .................................................................................................. 
+ trn2 v17.2D, v24.2D, v9.2D // ..............................................................................................*... + mul v13.4S, v16.4S, v10.4S // .............................................................................................*.... + mul v15.4S, v0.4S, v6.4S // ...............................................................................................*.. + mul v26.4S, v1.4S, v6.4S // ..........................................................................................*....... + sqrdmulh v12.4S, v16.4S, v30.4S // ............................................................................................*..... + sqrdmulh v11.4S, v1.4S, v21.4S // ...........................................................................................*...... + ldr q16, [x5, #-48] // ........................................*......................................................... + ldr q6, [x5, #-64] // .................................................................................................* + // gap // .................................................................................................. + // gap // .................................................................................................. + + // ----------------------------------------- new position ------------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|---------------------- + // ldr q28, [x4], #64 // .*................................................................................................ + // ldr q7, [x2, #32] // ..*............................................................................................... + // ldr q18, [x2, #48] // *................................................................................................. + // ldr q17, [x2, #0] // .....*............................................................................................ 
+ // ldr q25, [x2, #16] // ...*.............................................................................................. + // ldr q10, [x5], #(12*16) // ............*..................................................................................... + // ldr q0, [x1, #48] // ...............*.................................................................................. + // ldr q21, [x1, #16] // .......*.......................................................................................... + // sqrdmulh v27.4S, v7.4S, v28.S[1] // ........*......................................................................................... + // sqrdmulh v23.4S, v18.4S, v28.S[1] // ..........*....................................................................................... + // mul v26.4S, v18.4S, v28.S[0] // ...........*...................................................................................... + // mul v19.4S, v7.4S, v28.S[0] // .........*........................................................................................ + // ldr q22, [x1, #32] // ..................*............................................................................... + // mul v16.4S, v17.4S, v28.S[0] // ................*................................................................................. + // sqrdmulh v3.4S, v17.4S, v28.S[1] // .................*................................................................................ + // mls v26.4S, v23.4S, v8.S[0] // ....................*............................................................................. + // sqrdmulh v5.4S, v25.4S, v28.S[1] // ..............*................................................................................... + // mul v6.4S, v25.4S, v28.S[0] // .............*.................................................................................... + // ldr q25, [x4, #-48] // .....................*............................................................................ 
+ // mls v19.4S, v27.4S, v8.S[0] // ...................*.............................................................................. + // ldr q7, [x1, #0] // ......*........................................................................................... + // mls v16.4S, v3.4S, v8.S[0] // .......................*.......................................................................... + // add v20.4S, v0.4S, v26.4S // ...........................*...................................................................... + // sub v17.4S, v0.4S, v26.4S // ............................*..................................................................... + // mls v6.4S, v5.4S, v8.S[0] // ......................*........................................................................... + // sub v5.4S, v22.4S, v19.4S // ..........................*....................................................................... + // add v11.4S, v22.4S, v19.4S // .........................*........................................................................ + // mul v29.4S, v17.4S, v25.S[0] // .......................................*.......................................................... + // mul v19.4S, v20.4S, v28.S[2] // ...................................*.............................................................. + // sqrdmulh v31.4S, v17.4S, v25.S[1] // ......................................*........................................................... + // sqrdmulh v0.4S, v5.4S, v25.S[1] // .................................*................................................................ + // sub v3.4S, v7.4S, v16.4S // .............................*.................................................................... + // sqrdmulh v22.4S, v11.4S, v28.S[3] // ....................................*............................................................. 
+ // mul v26.4S, v11.4S, v28.S[2] // .....................................*............................................................ + // add v27.4S, v21.4S, v6.4S // ...............................*.................................................................. + // sqrdmulh v11.4S, v20.4S, v28.S[3] // ..................................*............................................................... + // mul v28.4S, v5.4S, v25.S[0] // ................................*................................................................. + // sub v20.4S, v21.4S, v6.4S // ..............................*................................................................... + // mls v29.4S, v31.4S, v8.S[0] // .............................................*.................................................... + // add v9.4S, v7.4S, v16.4S // ........................................*......................................................... + // ldr q16, [x5, #-48] // ................................................................................................*. + // ldr q21, [x4, #-32] // ...........................................*...................................................... + // mls v26.4S, v22.4S, v8.S[0] // ............................................*..................................................... + // ldr q17, [x4, #-16] // ..............................................*................................................... + // ldr q31, [x5, #-96] // ....*............................................................................................. + // mls v19.4S, v11.4S, v8.S[0] // ..........................................*....................................................... + // mls v28.4S, v0.4S, v8.S[0] // .........................................*........................................................ + // sub v24.4S, v20.4S, v29.4S // ...................................................*.............................................. 
+ // add v22.4S, v20.4S, v29.4S // ....................................................*............................................. + // mul v6.4S, v22.4S, v21.S[2] // ..............................................................*................................... + // ldr q30, [x5, #-176] // .........................................................................*........................ + // mul v14.4S, v24.4S, v17.S[0] // ............................................................*..................................... + // add v20.4S, v27.4S, v19.4S // ...............................................*.................................................. + // sub v7.4S, v27.4S, v19.4S // ................................................*................................................. + // sqrdmulh v1.4S, v24.4S, v17.S[1] // .............................................................*.................................... + // sqrdmulh v24.4S, v22.4S, v21.S[3] // ...............................................................*.................................. + // mul v29.4S, v20.4S, v25.S[2] // .......................................................*.......................................... + // mul v22.4S, v7.4S, v21.S[0] // .........................................................*........................................ + // sqrdmulh v25.4S, v20.4S, v25.S[3] // ........................................................*......................................... + // sqrdmulh v21.4S, v7.4S, v21.S[1] // ..........................................................*....................................... + // sub v15.4S, v3.4S, v28.4S // ..................................................*............................................... + // add v27.4S, v9.4S, v26.4S // ......................................................*........................................... 
+ // add v20.4S, v3.4S, v28.4S // .................................................*................................................ + // mls v14.4S, v1.4S, v8.S[0] // ...................................................................*.............................. + // mls v6.4S, v24.4S, v8.S[0] // ..................................................................*............................... + // sub v28.4S, v9.4S, v26.4S // .....................................................*............................................ + // mls v22.4S, v21.4S, v8.S[0] // .................................................................*................................ + // mls v29.4S, v25.4S, v8.S[0] // ................................................................*................................. + // sub v26.4S, v20.4S, v6.4S // ............................................................................*..................... + // add v17.4S, v20.4S, v6.4S // ...........................................................................*...................... + // add v19.4S, v15.4S, v14.4S // ..........................................................................*....................... + // sub v21.4S, v15.4S, v14.4S // .............................................................................*.................... + // add v15.4S, v28.4S, v22.4S // ......................................................................*........................... + // sub v4.4S, v27.4S, v29.4S // .......................................................................*.......................... + // add v24.4S, v27.4S, v29.4S // .....................................................................*............................ + // sub v27.4S, v28.4S, v22.4S // ....................................................................*............................. 
+ // trn2 v14.4S, v17.4S, v26.4S // ....................................................................................*............. + // ldr q5, [x5, #-160] // ........................*......................................................................... + // trn1 v22.4S, v17.4S, v26.4S // .....................................................................................*............ + // trn2 v6.4S, v19.4S, v21.4S // ..................................................................................*............... + // trn1 v29.4S, v19.4S, v21.4S // ...................................................................................*.............. + // ldr q21, [x5, #-80] // ...........................................................*...................................... + // trn2 v3.4S, v15.4S, v27.4S // ...............................................................................*.................. + // trn2 v19.4S, v24.4S, v4.4S // ................................................................................*................. + // trn2 v12.2D, v14.2D, v6.2D // ........................................................................................*......... + // ldr q20, [x5, #-144] // ........................................................................*......................... + // trn1 v24.4S, v24.4S, v4.4S // .................................................................................*................ + // trn2 v0.2D, v22.2D, v29.2D // .........................................................................................*........ + // trn1 v9.4S, v15.4S, v27.4S // ..............................................................................*................... + // trn2 v27.2D, v19.2D, v3.2D // ......................................................................................*........... + // mul v26.4S, v12.4S, v31.4S // .............................................................................................*.... 
+ // sqrdmulh v11.4S, v12.4S, v21.4S // ...............................................................................................*.. + // sqrdmulh v12.4S, v27.4S, v30.4S // ..............................................................................................*... + // mul v13.4S, v27.4S, v10.4S // ...........................................................................................*...... + // trn2 v17.2D, v24.2D, v9.2D // ..........................................................................................*....... + // mul v15.4S, v0.4S, v31.4S // ............................................................................................*..... + // trn1 v31.2D, v14.2D, v6.2D // .......................................................................................*.......... + // ldr q6, [x5, #-64] // .................................................................................................* sub count, count, #1 layer45678_start: + // Instructions: 164 + // Expected cycles: 36 + // Expected IPC: 4.56 + // + // Wall time: 1532.60s + // User time: 1532.60s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + ldr q28, [x4], #64 // ........e........................................................................................................................................................... + mls v26.4S, v11.4S, v8.S[0] // ...............................................................................................................................*.................................... 
+ ldr q7, [x2, #160] // ......e............................................................................................................................................................. + sqrdmulh v11.4S, v0.4S, v21.4S // ........................................................................................................................*........................................... + ldr q18, [x2, #176] // .......e............................................................................................................................................................ + // gap // .................................................................................................................................................................... + sqrdmulh v23.4S, v17.4S, v30.4S // ..............................................................................................*..................................................................... + mul v14.4S, v17.4S, v10.4S // ...............................................................................................*.................................................................... + trn1 v1.2D, v24.2D, v9.2D // ..............................................................................*..................................................................................... + trn1 v4.2D, v19.2D, v3.2D // ...............................................................................*.................................................................................... + ldr q9, [x5, #-32] // ......................................................................................................................*............................................. + trn1 v2.2D, v22.2D, v29.2D // ......................................................................................*............................................................................. 
+ ldr q17, [x2, #128] // ....e............................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v13.4S, v12.4S, v8.S[0] // .....................................................................................................*.............................................................. + // gap // .................................................................................................................................................................... + ldr q25, [x2, #144] // .....e.............................................................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ ldr q10, [x5], #(12*16) // ........................................................................................e........................................................................... + ldr q0, [x1, #176] // ...e................................................................................................................................................................ + sub v29.4S, v31.4S, v26.4S // ................................................................................................................................*................................... + ldr q21, [x1, #144] // .e.................................................................................................................................................................. + mls v14.4S, v23.4S, v8.S[0] // ................................................................................................*................................................................... + mls v15.4S, v11.4S, v8.S[0] // ..........................................................................................................................*......................................... + ldr q12, [x5, #-208] // .......................................................................................................................*............................................ + // gap // .................................................................................................................................................................... + add v22.4S, v31.4S, v26.4S // .................................................................................................................................*.................................. + // gap // .................................................................................................................................................................... 
+ sqrdmulh v27.4S, v7.4S, v28.S[1] // ......................e............................................................................................................................................. + sqrdmulh v23.4S, v18.4S, v28.S[1] // ...........................e........................................................................................................................................ + mul v26.4S, v18.4S, v28.S[0] // ............................e....................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - ldr q30, [x2, #128] // ....e............................................................................................................................................................... - ldr q19, [x2, #176] // .......e............................................................................................................................................................ - ldr q3, [x4], #64 // ........e........................................................................................................................................................... 
- mls v21.4S, v23.4S, v8.S[0] // .....................................................................................................*.............................................................. - sqrdmulh v4.4S, v0.4S, v26.4S // ..............................................................................................................................*..................................... - mul v11.4S, v0.4S, v6.4S // .............................................................................................................................*...................................... - ldr q14, [x5, #-160] // ..........................................................................................*......................................................................... + add v3.4S, v4.4S, v13.4S // .......................................................................................................*............................................................ + sub v11.4S, v4.4S, v13.4S // ......................................................................................................*............................................................. + mul v4.4S, v22.4S, v6.4S // ...................................................................................................................................*................................ + sqrdmulh v24.4S, v22.4S, v16.4S // ..................................................................................................................................*................................. + mul v19.4S, v7.4S, v28.S[0] // .......................e............................................................................................................................................ // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... + ldr q22, [x1, #160] // ..e................................................................................................................................................................. + ldr q31, [x5, #-320] // ............................................................................................*....................................................................... + mul v16.4S, v17.4S, v28.S[0] // .............e...................................................................................................................................................... + mul v30.4S, v29.4S, v9.4S // ........................................................................................................................................*........................... // gap // .................................................................................................................................................................... - mul v2.4S, v12.4S, v6.4S // ........................................................................................................................*........................................... - ldr q6, [x5, #-112] // .............................................................................................*...................................................................... - trn1 v0.2D, v15.2D, v10.2D // ...............................................................................*.................................................................................... - mul v5.4S, v13.4S, v20.4S // ..............................................................................................*..................................................................... 
- ldr q10, [x1, #160] // ..e................................................................................................................................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - ldr q17, [x2, #160] // ......e............................................................................................................................................................. - sqrdmulh v13.4S, v13.4S, v9.4S // ...............................................................................................*.................................................................... - sqrdmulh v29.4S, v12.4S, v26.4S // .........................................................................................................................*.......................................... - trn1 v23.2D, v24.2D, v7.2D // .......................................................................................*............................................................................ - ldr q1, [x2, #144] // .....e.............................................................................................................................................................. + ldr q18, [x5, #-304] // .............................................................................................*...................................................................... + mul v13.4S, v3.4S, v5.4S // .........................................................................................................*.......................................................... 
+ sqrdmulh v20.4S, v3.4S, v20.4S // ........................................................................................................*........................................................... + sqrdmulh v3.4S, v17.4S, v28.S[1] // ............e....................................................................................................................................................... + mls v26.4S, v23.4S, v8.S[0] // .............................e...................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - sub v18.4S, v0.4S, v21.4S // ......................................................................................................*............................................................. - add v12.4S, v0.4S, v21.4S // .......................................................................................................*............................................................ - mls v11.4S, v4.4S, v8.S[0] // ...............................................................................................................................*.................................... - trn1 v26.2D, v31.2D, v22.2D // ......................................................................................*............................................................................. - ldr q31, [x5, #-48] // .....................................................................................................................*.............................................. 
// gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + add v9.4S, v2.4S, v15.4S // ............................................................................................................................*....................................... + sqrdmulh v5.4S, v25.4S, v28.S[1] // .................e.................................................................................................................................................. + mls v4.4S, v24.4S, v8.S[0] // ....................................................................................................................................*............................... + mul v6.4S, v25.4S, v28.S[0] // ..................e................................................................................................................................................. + ldr q25, [x4, #-48] // .........e.......................................................................................................................................................... + add v24.4S, v1.4S, v14.4S // ..................................................................................................*................................................................. // gap // .................................................................................................................................................................... - ldr q15, [x5, #-16] // .......................................................................................................................*............................................ 
- sqrdmulh v24.4S, v19.4S, v3.S[1] // ............................e....................................................................................................................................... - mul v20.4S, v19.4S, v3.S[0] // ...........................e........................................................................................................................................ - ldr q22, [x1, #176] // ...e................................................................................................................................................................ - sqrdmulh v21.4S, v30.4S, v3.S[1] // .............e...................................................................................................................................................... - mul v0.4S, v30.4S, v3.S[0] // ............e....................................................................................................................................................... - sqrdmulh v7.4S, v12.4S, v16.4S // .........................................................................................................*.......................................................... - ldr q16, [x5, #48] // ...........................................................................................e........................................................................ - mul v4.4S, v18.4S, v28.4S // .............................................................................................................*...................................................... // gap // .................................................................................................................................................................... - ldr q19, [x1, #144] // .e.................................................................................................................................................................. 
- sqrdmulh v6.4S, v18.4S, v6.4S // ..............................................................................................................*..................................................... + mls v19.4S, v27.4S, v8.S[0] // ........................e........................................................................................................................................... // gap // .................................................................................................................................................................... - mls v2.4S, v29.4S, v8.S[0] // ..........................................................................................................................*......................................... - mul v9.4S, v1.4S, v3.S[0] // .................e.................................................................................................................................................. + sqrdmulh v29.4S, v29.4S, v12.4S // .......................................................................................................................................*............................ // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + ldr q7, [x1, #128] // e................................................................................................................................................................... 
+ sub v12.4S, v2.4S, v15.4S // ...........................................................................................................................*........................................ + mul v15.4S, v11.4S, v31.4S // ..............................................................................................................*..................................................... + mls v13.4S, v20.4S, v8.S[0] // ..........................................................................................................*......................................................... + mls v16.4S, v3.4S, v8.S[0] // ..............e..................................................................................................................................................... + add v20.4S, v0.4S, v26.4S // ...............................e.................................................................................................................................... // gap // .................................................................................................................................................................... - mls v5.4S, v13.4S, v8.S[0] // ................................................................................................*................................................................... - sub v29.4S, v23.4S, v11.4S // ................................................................................................................................*................................... - sqrdmulh v30.4S, v1.4S, v3.S[1] // ..................e................................................................................................................................................. // gap // .................................................................................................................................................................... 
+ sub v17.4S, v0.4S, v26.4S // ..............................e..................................................................................................................................... // gap // .................................................................................................................................................................... - ldr q1, [x4, #-48] // .........e.......................................................................................................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v13.4S, v17.4S, v3.S[1] // .......................e............................................................................................................................................ - mls v20.4S, v24.4S, v8.S[0] // .............................e...................................................................................................................................... - add v23.4S, v23.4S, v11.4S // .................................................................................................................................*.................................. - mul v18.4S, v12.4S, v14.4S // ........................................................................................................*........................................................... + sqrdmulh v23.4S, v11.4S, v18.4S // .............................................................................................................*...................................................... + mls v6.4S, v5.4S, v8.S[0] // ...................e................................................................................................................................................ 
+ add v2.4S, v9.4S, v4.4S // ......................................................................................................................................*............................. // gap // .................................................................................................................................................................... - mul v11.4S, v17.4S, v3.S[0] // ......................e............................................................................................................................................. + sub v5.4S, v22.4S, v19.4S // .........................e.......................................................................................................................................... + add v11.4S, v22.4S, v19.4S // ..........................e......................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mls v4.4S, v6.4S, v8.S[0] // ...............................................................................................................*.................................................... - sub v12.4S, v26.4S, v2.4S // ...........................................................................................................................*........................................ 
- mls v0.4S, v21.4S, v8.S[0] // ..............e..................................................................................................................................................... - mul v21.4S, v23.4S, v27.4S // ..................................................................................................................................*................................. + mls v30.4S, v29.4S, v8.S[0] // .........................................................................................................................................*.......................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - ldr q28, [x1, #128] // e................................................................................................................................................................... - sub v6.4S, v25.4S, v5.4S // .................................................................................................*.................................................................. - add v24.4S, v26.4S, v2.4S // ............................................................................................................................*....................................... - mls v9.4S, v30.4S, v8.S[0] // ...................e................................................................................................................................................ 
// gap // .................................................................................................................................................................... + mul v29.4S, v17.4S, v25.S[0] // ................................................e................................................................................................................... + mul v19.4S, v20.4S, v28.S[2] // ......................................e............................................................................................................................. + sqrdmulh v31.4S, v17.4S, v25.S[1] // ...............................................e.................................................................................................................... + sqrdmulh v0.4S, v5.4S, v25.S[1] // ..........................................e......................................................................................................................... + sub v3.4S, v7.4S, v16.4S // ...............e.................................................................................................................................................... + sqrdmulh v22.4S, v11.4S, v28.S[3] // ................................e................................................................................................................................... + mul v26.4S, v11.4S, v28.S[2] // .................................e.................................................................................................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... 
- add v17.4S, v22.4S, v20.4S // ...............................e.................................................................................................................................... - sub v26.4S, v22.4S, v20.4S // ..............................e..................................................................................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v27.4S, v23.4S, v31.4S // ...................................................................................................................................*................................ - add v30.4S, v25.4S, v5.4S // ..................................................................................................*................................................................. - add v20.4S, v6.4S, v4.4S // .................................................................................................................*.................................................. // gap // .................................................................................................................................................................... - mls v11.4S, v13.4S, v8.S[0] // ........................e........................................................................................................................................... + add v27.4S, v21.4S, v6.4S // .....................e.............................................................................................................................................. // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... - sub v22.4S, v6.4S, v4.4S // ................................................................................................................*................................................... - mls v18.4S, v7.4S, v8.S[0] // ..........................................................................................................*......................................................... - ldr q6, [x5, #-32] // ......................................................................................................................*............................................. - sub v25.4S, v19.4S, v9.4S // ....................e............................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + sqrdmulh v11.4S, v20.4S, v28.S[3] // .....................................e.............................................................................................................................. + mul v28.4S, v5.4S, v25.S[0] // ...........................................e........................................................................................................................ + sub v18.4S, v24.4S, v13.4S // ...........................................................................................................*........................................................ 
+ sub v20.4S, v21.4S, v6.4S // ....................e............................................................................................................................................... + mls v29.4S, v31.4S, v8.S[0] // .................................................e.................................................................................................................. + sub v5.4S, v9.4S, v4.4S // .....................................................................................................................................*.............................. // gap // .................................................................................................................................................................... + add v9.4S, v7.4S, v16.4S // ................e................................................................................................................................................... + ldr q16, [x5, #-48] // .....................................................................................................................e.............................................. // gap // .................................................................................................................................................................... - add v23.4S, v19.4S, v9.4S // .....................e.............................................................................................................................................. - mul v2.4S, v17.4S, v3.S[2] // .....................................e.............................................................................................................................. - sqrdmulh v4.4S, v17.4S, v3.S[3] // ......................................e............................................................................................................................. 
- mul v7.4S, v26.4S, v1.S[0] // ...............................................e.................................................................................................................... + ldr q21, [x4, #-32] // ..........e......................................................................................................................................................... + mls v26.4S, v22.4S, v8.S[0] // ..................................e................................................................................................................................. + sub v4.4S, v12.4S, v30.4S // ..........................................................................................................................................*......................... // gap // .................................................................................................................................................................... + ldr q17, [x4, #-16] // ...........e........................................................................................................................................................ + ldr q31, [x5, #-96] // ..................................................................................................................e................................................. // gap // .................................................................................................................................................................... - trn1 v9.4S, v20.4S, v22.4S // ..............................................................................................................................................*..................... - ldr q14, [x4, #-16] // ...........e........................................................................................................................................................ 
- sqrdmulh v17.4S, v26.4S, v1.S[1] // ................................................e................................................................................................................... - mls v21.4S, v27.4S, v8.S[0] // ....................................................................................................................................*............................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + mls v19.4S, v11.4S, v8.S[0] // .......................................e............................................................................................................................ + trn2 v11.4S, v2.4S, v5.4S // .....................................................................................................................................................*.............. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - sub v5.4S, v10.4S, v11.4S // .........................e.......................................................................................................................................... 
- sub v19.4S, v30.4S, v18.4S // ...........................................................................................................*........................................................ - add v31.4S, v30.4S, v18.4S // ............................................................................................................*....................................................... - sqrdmulh v26.4S, v29.4S, v15.4S // ........................................................................................................................................*........................... - add v11.4S, v10.4S, v11.4S // ..........................e......................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + add v13.4S, v24.4S, v13.4S // ............................................................................................................*....................................................... + mls v28.4S, v0.4S, v8.S[0] // ............................................e....................................................................................................................... + sub v24.4S, v20.4S, v29.4S // ..................................................e................................................................................................................. + add v22.4S, v20.4S, v29.4S // ...................................................e................................................................................................................ 
// gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mls v2.4S, v4.4S, v8.S[0] // .......................................e............................................................................................................................ - add v10.4S, v28.4S, v0.4S // ................e................................................................................................................................................... - mul v4.4S, v29.4S, v6.4S // .......................................................................................................................................*............................ - ldr q6, [x5, #96] // ..................................................................................................................e................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mls v7.4S, v17.4S, v8.S[0] // .................................................e.................................................................................................................. - ldr q29, [x4, #-32] // ..........e......................................................................................................................................................... 
- trn1 v17.4S, v31.4S, v19.4S // ............................................................................................................................................*....................... - sub v15.4S, v28.4S, v0.4S // ...............e.................................................................................................................................................... - sqrdmulh v0.4S, v5.4S, v1.S[1] // ...........................................e........................................................................................................................ + mls v15.4S, v23.4S, v8.S[0] // ...............................................................................................................*.................................................... // gap // .................................................................................................................................................................... + trn1 v2.4S, v2.4S, v5.4S // ....................................................................................................................................................*............... + add v5.4S, v12.4S, v30.4S // ...........................................................................................................................................*........................ // gap // .................................................................................................................................................................... - sqrdmulh v27.4S, v11.4S, v3.S[3] // .................................e.................................................................................................................................. // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... - sub v13.4S, v24.4S, v21.4S // .....................................................................................................................................*.............................. - trn2 v18.4S, v31.4S, v19.4S // .............................................................................................................................................*...................... - mul v11.4S, v11.4S, v3.S[2] // ................................e................................................................................................................................... + trn2 v0.4S, v13.4S, v18.4S // .............................................................................................................................................*...................... + sub v29.4S, v1.4S, v14.4S // .................................................................................................*.................................................................. + mul v6.4S, v22.4S, v21.S[2] // ...............................................................e.................................................................................................... + ldr q30, [x5, #-176] // .........................................................................................e.......................................................................... // gap // .................................................................................................................................................................... + mul v14.4S, v24.4S, v17.S[0] // ....................................................................e............................................................................................... 
// gap // .................................................................................................................................................................... - sub v3.4S, v23.4S, v2.4S // ........................................e........................................................................................................................... // gap // .................................................................................................................................................................... - add v23.4S, v23.4S, v2.4S // .........................................e.......................................................................................................................... - mul v2.4S, v5.4S, v1.S[0] // ..........................................e......................................................................................................................... + add v20.4S, v27.4S, v19.4S // .........................................e.......................................................................................................................... + sub v7.4S, v27.4S, v19.4S // ........................................e........................................................................................................................... + sqrdmulh v1.4S, v24.4S, v17.S[1] // ...................................................................e................................................................................................ + sqrdmulh v24.4S, v22.4S, v21.S[3] // ..............................................................e..................................................................................................... // gap // .................................................................................................................................................................... 
- trn1 v5.2D, v17.2D, v9.2D // ..................................................................................................................................................*................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + sub v12.4S, v29.4S, v15.4S // ................................................................................................................*................................................... + add v23.4S, v29.4S, v15.4S // .................................................................................................................*.................................................. + mul v29.4S, v20.4S, v25.S[2] // .....................................................e.............................................................................................................. // gap // .................................................................................................................................................................... - sub v31.4S, v25.4S, v7.4S // ..................................................e................................................................................................................. - add v25.4S, v25.4S, v7.4S // ...................................................e................................................................................................................ 
- add v24.4S, v24.4S, v21.4S // ......................................................................................................................................*............................. - mls v4.4S, v26.4S, v8.S[0] // .........................................................................................................................................*.......................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + mul v22.4S, v7.4S, v21.S[0] // ..........................................................e......................................................................................................... + sqrdmulh v25.4S, v20.4S, v25.S[3] // ....................................................e............................................................................................................... + sqrdmulh v21.4S, v7.4S, v21.S[1] // .........................................................e.......................................................................................................... + sub v15.4S, v3.4S, v28.4S // .............................................e...................................................................................................................... + add v27.4S, v9.4S, v26.4S // ....................................e............................................................................................................................... 
+ trn2 v7.4S, v5.4S, v4.4S // .......................................................................................................................................................*............ // gap // .................................................................................................................................................................... - mul v7.4S, v23.4S, v1.S[2] // ....................................................e............................................................................................................... - sqrdmulh v1.4S, v23.4S, v1.S[3] // .....................................................e.............................................................................................................. - mul v30.4S, v3.4S, v29.S[0] // .........................................................e.......................................................................................................... - sqrdmulh v3.4S, v3.4S, v29.S[1] // ..........................................................e......................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + trn2 v19.4S, v23.4S, v12.4S // ...............................................................................................................................................*.................... 
+ add v20.4S, v3.4S, v28.4S // ..............................................e..................................................................................................................... + mls v14.4S, v1.4S, v8.S[0] // .....................................................................e.............................................................................................. + mls v6.4S, v24.4S, v8.S[0] // ................................................................e................................................................................................... // gap // .................................................................................................................................................................... - mls v11.4S, v27.4S, v8.S[0] // ..................................e................................................................................................................................. - mls v2.4S, v0.4S, v8.S[0] // ............................................e....................................................................................................................... - mul v28.4S, v31.4S, v14.S[0] // ...................................................................e................................................................................................ - sqrdmulh v27.4S, v31.4S, v14.S[1] // ....................................................................e............................................................................................... + sub v28.4S, v9.4S, v26.4S // ...................................e................................................................................................................................ // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mul v0.4S, v25.4S, v29.S[2] // ..............................................................e..................................................................................................... - sqrdmulh v29.4S, v25.4S, v29.S[3] // ...............................................................e.................................................................................................... - sub v23.4S, v12.4S, v4.4S // ..........................................................................................................................................*......................... - add v12.4S, v12.4S, v4.4S // ...........................................................................................................................................*........................ // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... - mls v7.4S, v1.4S, v8.S[0] // ......................................................e............................................................................................................. + mls v22.4S, v21.4S, v8.S[0] // ...........................................................e........................................................................................................ + trn1 v9.2D, v0.2D, v19.2D // ...................................................................................................................................................*................ + mls v29.4S, v25.4S, v8.S[0] // ......................................................e............................................................................................................. // gap // .................................................................................................................................................................... - mls v30.4S, v3.4S, v8.S[0] // ...........................................................e........................................................................................................ - trn1 v25.4S, v24.4S, v13.4S // ....................................................................................................................................................*............... - trn1 v31.4S, v12.4S, v23.4S // ......................................................................................................................................................*............. + trn2 v25.2D, v11.2D, v7.2D // .........................................................................................................................................................*.......... 
// gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - sub v14.4S, v10.4S, v11.4S // ...................................e................................................................................................................................ - add v4.4S, v10.4S, v11.4S // ....................................e............................................................................................................................... - mls v28.4S, v27.4S, v8.S[0] // .....................................................................e.............................................................................................. - trn2 v3.2D, v17.2D, v9.2D // ................................................................................................................................................*................... + trn1 v1.4S, v5.4S, v4.4S // ......................................................................................................................................................*............. + trn2 v4.2D, v0.2D, v19.2D // .................................................................................................................................................*.................. 
+ sub v26.4S, v20.4S, v6.4S // .................................................................e.................................................................................................. + add v17.4S, v20.4S, v6.4S // ..................................................................e................................................................................................. // gap // .................................................................................................................................................................... + add v19.4S, v15.4S, v14.4S // .......................................................................e............................................................................................ + sub v21.4S, v15.4S, v14.4S // ......................................................................e............................................................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - sub v11.4S, v15.4S, v2.4S // .............................................e...................................................................................................................... - mls v0.4S, v29.4S, v8.S[0] // ................................................................e................................................................................................... 
- add v21.4S, v15.4S, v2.4S // ..............................................e..................................................................................................................... + str q4, [x1, #48] // ...............................................................................................................................................................*.... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + add v15.4S, v28.4S, v22.4S // .............................................................e...................................................................................................... + sub v4.4S, v27.4S, v29.4S // .......................................................e............................................................................................................ + add v24.4S, v27.4S, v29.4S // ........................................................e........................................................................................................... + sub v27.4S, v28.4S, v22.4S // ............................................................e....................................................................................................... + trn2 v14.4S, v17.4S, v26.4S // .................................................................................e.................................................................................. 
+ ldr q5, [x5, #-160] // ..........................................................................................e......................................................................... + trn1 v22.4S, v17.4S, v26.4S // ................................................................................e................................................................................... // gap // .................................................................................................................................................................... - add v19.4S, v4.4S, v7.4S // ........................................................e........................................................................................................... - sub v1.4S, v14.4S, v30.4S // ............................................................e....................................................................................................... - add v14.4S, v14.4S, v30.4S // .............................................................e...................................................................................................... - sub v2.4S, v4.4S, v7.4S // .......................................................e............................................................................................................ // gap // .................................................................................................................................................................... + trn2 v6.4S, v19.4S, v21.4S // ...................................................................................e................................................................................ + trn1 v29.4S, v19.4S, v21.4S // ..................................................................................e................................................................................. 
+ ldr q21, [x5, #-80] // ...................................................................................................................e................................................ // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - trn2 v7.4S, v20.4S, v22.4S // ...............................................................................................................................................*.................... - trn2 v23.4S, v12.4S, v23.4S // .......................................................................................................................................................*............ - trn1 v17.2D, v25.2D, v31.2D // ..........................................................................................................................................................*......... - sub v27.4S, v11.4S, v28.4S // ......................................................................e............................................................................................. // gap // .................................................................................................................................................................... - add v30.4S, v21.4S, v0.4S // ..................................................................e................................................................................................. 
- trn2 v15.4S, v19.4S, v2.4S // .........................................................................e.......................................................................................... - ldr q20, [x5], #(12*16) // ........................................................................................e........................................................................... - trn2 v10.4S, v14.4S, v1.4S // ...........................................................................e........................................................................................ - ldr q9, [x5, #-176] // .........................................................................................e.......................................................................... - ldr q26, [x5, #-80] // ...................................................................................................................e................................................ - add v11.4S, v11.4S, v28.4S // .......................................................................e............................................................................................ - trn1 v29.4S, v19.4S, v2.4S // ........................................................................e........................................................................................... - trn2 v24.4S, v24.4S, v13.4S // .....................................................................................................................................................*.............. - trn2 v4.2D, v18.2D, v7.2D // .................................................................................................................................................*.................. + trn1 v28.2D, v11.2D, v7.2D // ...........................................................................................................................................................*........ 
+ trn2 v17.2D, v2.2D, v1.2D // ........................................................................................................................................................*........... + trn2 v3.4S, v15.4S, v27.4S // ...........................................................................e........................................................................................ + trn2 v19.4S, v24.4S, v4.4S // .........................................................................e.......................................................................................... + trn1 v26.4S, v23.4S, v12.4S // ..............................................................................................................................................*..................... + trn2 v12.2D, v14.2D, v6.2D // .....................................................................................e.............................................................................. // gap // .................................................................................................................................................................... + ldr q20, [x5, #-144] // ...........................................................................................e........................................................................ // gap // .................................................................................................................................................................... + trn1 v24.4S, v24.4S, v4.4S // ........................................................................e........................................................................................... // gap // .................................................................................................................................................................... 
+ trn1 v4.4S, v13.4S, v18.4S // ............................................................................................................................................*....................... + str q17, [x2, #32] // ..................................................................................................................................................................*. + trn1 v18.2D, v2.2D, v1.2D // ..........................................................................................................................................................*......... // gap // .................................................................................................................................................................... - sub v0.4S, v21.4S, v0.4S // .................................................................e.................................................................................................. - trn2 v2.2D, v15.2D, v10.2D // .............................................................................e...................................................................................... + trn2 v0.2D, v22.2D, v29.2D // ....................................................................................e............................................................................... // gap // .................................................................................................................................................................... + str q9, [x1, #16] // .............................................................................................................................................................*...... + trn1 v9.4S, v15.4S, v27.4S // ..........................................................................e......................................................................................... 
+ trn2 v27.2D, v19.2D, v3.2D // .............................................................................e...................................................................................... + trn2 v23.2D, v4.2D, v26.2D // ................................................................................................................................................*................... + trn1 v7.2D, v4.2D, v26.2D // ..................................................................................................................................................*................. // gap // .................................................................................................................................................................... - str q5, [x1], #128 // ............................................................................................................................................................*....... + str q28, [x2, #16] // .................................................................................................................................................................*.. + mul v26.4S, v12.4S, v31.4S // ..............................................................................................................................e..................................... + sqrdmulh v11.4S, v12.4S, v21.4S // .............................................................................................................................e...................................... // gap // .................................................................................................................................................................... - trn1 v13.2D, v18.2D, v7.2D // ...................................................................................................................................................*................ 
- trn1 v22.4S, v11.4S, v27.4S // ..................................................................................e................................................................................. - trn2 v7.4S, v11.4S, v27.4S // ...................................................................................e................................................................................ - trn1 v11.2D, v24.2D, v23.2D // ...........................................................................................................................................................*........ - trn2 v19.2D, v24.2D, v23.2D // .........................................................................................................................................................*.......... - trn1 v18.4S, v14.4S, v1.4S // ..........................................................................e......................................................................................... // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v27.4S, v30.4S // ...................................................................................................e................................................................ + mul v13.4S, v27.4S, v10.4S // ....................................................................................................e............................................................... + str q18, [x2], #128 // ................................................................................................................................................................*... // gap // .................................................................................................................................................................... 
- str q3, [x1, #-96] // ..............................................................................................................................................................*..... - ldr q28, [x5, #-128] // ............................................................................................e....................................................................... - trn2 v24.4S, v30.4S, v0.4S // .................................................................................e.................................................................................. - trn2 v12.2D, v25.2D, v31.2D // ........................................................................................................................................................*........... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - str q13, [x1, #-112] // .............................................................................................................................................................*...... - mul v21.4S, v2.4S, v20.4S // ...................................................................................................e................................................................ - sqrdmulh v23.4S, v2.4S, v9.4S // ....................................................................................................e............................................................... - trn1 v31.4S, v30.4S, v0.4S // ................................................................................e................................................................................... 
- str q4, [x1, #-80] // ...............................................................................................................................................................*.... - // gap // .................................................................................................................................................................... - trn2 v13.2D, v29.2D, v18.2D // ............................................................................e....................................................................................... - str q11, [x2, #16] // .................................................................................................................................................................*.. - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - str q19, [x2, #48] // ...................................................................................................................................................................* - trn1 v25.2D, v29.2D, v18.2D // ..............................................................................e..................................................................................... - trn2 v0.2D, v24.2D, v7.2D // .....................................................................................e.............................................................................. - // gap // .................................................................................................................................................................... 
- // gap // .................................................................................................................................................................... - str q17, [x2], #128 // ................................................................................................................................................................*... - // gap // .................................................................................................................................................................... - str q12, [x2, #-96] // ..................................................................................................................................................................*. - // gap // .................................................................................................................................................................... - trn2 v12.2D, v31.2D, v22.2D // ....................................................................................e............................................................................... - ldr q27, [x5, #-64] // ....................................................................................................................e............................................... - - // original source code - // ldr q9, [x1, #(16*0)] // ................................................e...................................................................................................................|...............................................e................................................................................................................. 
- // ldr q10, [x1, #(16*1)] // ...............................e....................................................................................................................................|..............................e.................................................................................................................................. - // ldr q11, [x1, #(16*2)] // ...........e........................................................................................................................................................|..........e...................................................................................................................................................... - // ldr q12, [x1, #(16*3)] // .........................e..........................................................................................................................................|........................e........................................................................................................................................ - // ldr q13, [x2, #(16*0)] // e...................................................................................................................................................................e................................................................................................................................................................. - // ldr q14, [x2, #(16*1)] // ................e...................................................................................................................................................|...............e................................................................................................................................................. 
- // ldr q15, [x2, #(16*2)] // ............e.......................................................................................................................................................|...........e..................................................................................................................................................... - // ldr q16, [x2, #(16*3)] // .e..................................................................................................................................................................|e................................................................................................................................................................ - // ldr q0, [x4], #64 // ..e.................................................................................................................................................................|.e............................................................................................................................................................... - // ldr q1, [x4, #(-64 + 16)] // ......................................e.............................................................................................................................|.....................................e........................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // ................................................................................e...................................................................................|...............................................................................e................................................................................. 
- // ldr q3, [x4, #(-64 + 48)] // ...................................................................e................................................................................................|..................................................................e.............................................................................................. - // mul v24.4s, v13.4s, v0.s[0] // ...........................e........................................................................................................................................|..........................e...................................................................................................................................... - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ..........................e.........................................................................................................................................|.........................e....................................................................................................................................... - // mls v24.4s, v13.4s, v8.s[0] // ..............................................e.....................................................................................................................|.............................................e................................................................................................................... - // sub v13.4s, v9.4s, v24.4s // ..................................................................................e.................................................................................|.................................................................................e............................................................................... 
- // add v9.4s, v9.4s, v24.4s // ............................................................................e.......................................................................................|...........................................................................e..................................................................................... - // mul v24.4s, v14.4s, v0.s[0] // ..................................e.................................................................................................................................|.................................e............................................................................................................................... - // sqrdmulh v14.4s, v14.4s, v0.s[1] // .....................................e..............................................................................................................................|....................................e............................................................................................................................ - // mls v24.4s, v14.4s, v8.s[0] // ...................................................e................................................................................................................|..................................................e.............................................................................................................. - // sub v14.4s, v10.4s, v24.4s // .............................................................e......................................................................................................|............................................................e.................................................................................................... 
- // add v10.4s, v10.4s, v24.4s // ..............................................................e.....................................................................................................|.............................................................e................................................................................................... - // mul v24.4s, v15.4s, v0.s[0] // ...........................................e........................................................................................................................|..........................................e...................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v0.s[1] // .......................................e............................................................................................................................|......................................e.......................................................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // .........................................................e..........................................................................................................|........................................................e........................................................................................................ - // sub v15.4s, v11.4s, v24.4s // ......................................................................e.............................................................................................|.....................................................................e........................................................................................... 
- // add v11.4s, v11.4s, v24.4s // ..........................................................................e.........................................................................................|.........................................................................e....................................................................................... - // mul v24.4s, v16.4s, v0.s[0] // ........................e...........................................................................................................................................|.......................e......................................................................................................................................... - // sqrdmulh v16.4s, v16.4s, v0.s[1] // .......................e............................................................................................................................................|......................e.......................................................................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ........................................e...........................................................................................................................|.......................................e......................................................................................................................... - // sub v16.4s, v12.4s, v24.4s // .....................................................e..............................................................................................................|....................................................e............................................................................................................ 
- // add v12.4s, v12.4s, v24.4s // ....................................................e...............................................................................................................|...................................................e............................................................................................................. - // mul v24.4s, v11.4s, v0.s[2] // .......................................................................................e............................................................................|......................................................................................e.......................................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ....................................................................................e...............................................................................|...................................................................................e............................................................................. - // mls v24.4s, v11.4s, v8.s[0] // ....................................................................................................e...............................................................|...................................................................................................e............................................................. - // sub v11.4s, v9.4s, v24.4s // ................................................................................................................e...................................................|...............................................................................................................e................................................. 
- // add v9.4s, v9.4s, v24.4s // .................................................................................................................e..................................................|................................................................................................................e................................................ - // mul v24.4s, v12.4s, v0.s[2] // ...............................................................e....................................................................................................|..............................................................e.................................................................................................. - // sqrdmulh v12.4s, v12.4s, v0.s[3] // ................................................................e...................................................................................................|...............................................................e................................................................................................. - // mls v24.4s, v12.4s, v8.s[0] // ...........................................................................e........................................................................................|..........................................................................e...................................................................................... - // sub v12.4s, v10.4s, v24.4s // ........................................................................................e...........................................................................|.......................................................................................e......................................................................... 
- // add v10.4s, v10.4s, v24.4s // .........................................................................................e..........................................................................|........................................................................................e........................................................................ - // mul v24.4s, v15.4s, v1.s[0] // ..........................................................................................e.........................................................................|.........................................................................................e....................................................................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ...................................................................................e................................................................................|..................................................................................e.............................................................................. - // mls v24.4s, v15.4s, v8.s[0] // .....................................................................................................e..............................................................|....................................................................................................e............................................................ - // sub v15.4s, v13.4s, v24.4s // ....................................................................................................................e...............................................|...................................................................................................................e............................................. 
- // add v13.4s, v13.4s, v24.4s // ......................................................................................................................e.............................................|.....................................................................................................................e........................................... - // mul v24.4s, v16.4s, v1.s[0] // .................................................................e..................................................................................................|................................................................e................................................................................................ - // sqrdmulh v16.4s, v16.4s, v1.s[1] // ....................................................................e...............................................................................................|...................................................................e............................................................................................. - // mls v24.4s, v16.4s, v8.s[0] // ...............................................................................e....................................................................................|..............................................................................e.................................................................................. - // sub v16.4s, v14.4s, v24.4s // ............................................................................................e.......................................................................|...........................................................................................e..................................................................... 
- // add v14.4s, v14.4s, v24.4s // .............................................................................................e......................................................................|............................................................................................e.................................................................... - // mul v24.4s, v10.4s, v1.s[2] // ................................................................................................e...................................................................|...............................................................................................e................................................................. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .................................................................................................e..................................................................|................................................................................................e................................................................ - // mls v24.4s, v10.4s, v8.s[0] // ............................................................................................................e.......................................................|...........................................................................................................e..................................................... - // sub v10.4s, v9.4s, v24.4s // ..........................................................................................................................e.........................................|.........................................................................................................................e....................................... 
- // add v9.4s, v9.4s, v24.4s // .......................................................................................................................e............................................|......................................................................................................................e.......................................... - // mul v24.4s, v12.4s, v2.s[0] // ..................................................................................................e.................................................................|.................................................................................................e............................................................... - // sqrdmulh v12.4s, v12.4s, v2.s[1] // ...................................................................................................e................................................................|..................................................................................................e.............................................................. - // mls v24.4s, v12.4s, v8.s[0] // .............................................................................................................e......................................................|............................................................................................................e.................................................... - // sub v12.4s, v11.4s, v24.4s // ........................................................................................................................e...........................................|.......................................................................................................................e......................................... 
- // add v11.4s, v11.4s, v24.4s // .........................................................................................................................e..........................................|........................................................................................................................e........................................ - // mul v24.4s, v14.4s, v2.s[2] // ........................................................................................................e...........................................................|.......................................................................................................e......................................................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // .........................................................................................................e..........................................................|........................................................................................................e........................................................ - // mls v24.4s, v14.4s, v8.s[0] // .....................................................................................................................e..............................................|....................................................................................................................e............................................ - // sub v14.4s, v13.4s, v24.4s // .........................................................................................................................................e..........................|........................................................................................................................................e........................ 
- // add v13.4s, v13.4s, v24.4s // ...............................................................................................................................e....................................|..............................................................................................................................e.................................. - // mul v24.4s, v16.4s, v3.s[0] // ......................................................................................................e.............................................................|.....................................................................................................e........................................................... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // .......................................................................................................e............................................................|......................................................................................................e.......................................................... - // mls v24.4s, v16.4s, v8.s[0] // ..................................................................................................................e.................................................|.................................................................................................................e............................................... - // sub v16.4s, v15.4s, v24.4s // ..............................................................................................................................e.....................................|.............................................................................................................................e................................... 
- // add v15.4s, v15.4s, v24.4s // .....................................................................................................................................e..............................|....................................................................................................................................e............................ - // trn1 v25.4s, v9.4s, v10.4s // ......................................................................................................................................e.............................|.....................................................................................................................................e........................... - // trn2 v26.4s, v9.4s, v10.4s // ................................................................................................................................e...................................|...............................................................................................................................e................................. - // trn1 v27.4s, v11.4s, v12.4s // .................................................................................................................................................e..................|................................................................................................................................................e................ - // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................e.................................|.................................................................................................................................e............................... 
- // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................................................................................................e........|..........................................................................................................................................................e...... - // trn2 v12.2d, v26.2d, v28.2d // ..........................................................................................................................................e.........................|.........................................................................................................................................e....................... - // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................................................................................................e.....|.............................................................................................................................................................e... - // trn1 v10.2d, v26.2d, v28.2d // .........*..........................................................................................................................................................|........*........................................................................................................................................................ - // trn1 v25.4s, v13.4s, v14.4s // .........................................................................................................................................................e..........|........................................................................................................................................................e........ 
- // trn2 v26.4s, v13.4s, v14.4s // ....................................................................................................................................................e...............|...................................................................................................................................................e............. - // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................................................................e......................|............................................................................................................................................e.................... - // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................................e.....................|.............................................................................................................................................e................... - // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................................................................................................e.|................................................................................................................................................................. - // trn2 v16.2d, v26.2d, v28.2d // ...............................................................................................................................................................e....|..............................................................................................................................................................e.. 
- // trn1 v13.2d, v25.2d, v27.2d // ....................*...............................................................................................................................................|...................*............................................................................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ...............*....................................................................................................................................................|..............*.................................................................................................................................................. - // ldr q0, [x5], #(12*16) // .................................................................................................................................e..................................|................................................................................................................................e................................ - // ldr q4, [x5, #(-12*16 + 1*16)] // ...................................................................................................................................e................................|..................................................................................................................................e.............................. - // ldr q1, [x5, #(-12*16 + 2*16)] // ......*.............................................................................................................................................................|.....*........................................................................................................................................................... 
- // ldr q5, [x5, #(-12*16 + 3*16)] // .............................e......................................................................................................................................|............................e.................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ...................................................................................................................................................e................|..................................................................................................................................................e.............. - // ldr q6, [x5, #(-12*16 + 5*16)] // ........*...........................................................................................................................................................|.......*......................................................................................................................................................... - // mul v24.4s, v11.4s, v0.4s // ..........*.........................................................................................................................................................|.........*....................................................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v4.4s // .............*......................................................................................................................................................|............*.................................................................................................................................................... 
- // mls v24.4s, v11.4s, v8.s[0] // ...................................*................................................................................................................................|..................................*.............................................................................................................................. - // sub v11.4s, v9.4s, v24.4s // .................................................*..................................................................................................................|................................................*................................................................................................................ - // add v9.4s, v9.4s, v24.4s // .......................................................*............................................................................................................|......................................................*.......................................................................................................... - // mul v24.4s, v12.4s, v0.4s // .......................................................................................................................................................e............|......................................................................................................................................................e.......... - // sqrdmulh v12.4s, v12.4s, v4.4s // ........................................................................................................................................................e...........|.......................................................................................................................................................e......... 
- // mls v24.4s, v12.4s, v8.s[0] // ...*................................................................................................................................................................|..*.............................................................................................................................................................. - // sub v12.4s, v10.4s, v24.4s // .................*..................................................................................................................................................|................*................................................................................................................................................ - // add v10.4s, v10.4s, v24.4s // ..................*.................................................................................................................................................|.................*............................................................................................................................................... - // mul v24.4s, v10.4s, v1.4s // ..........................................*.........................................................................................................................|.........................................*....................................................................................................................... - // sqrdmulh v10.4s, v10.4s, v5.4s // ............................*.......................................................................................................................................|...........................*..................................................................................................................................... 
- // mls v24.4s, v10.4s, v8.s[0] // ...........................................................*........................................................................................................|..........................................................*...................................................................................................... - // sub v10.4s, v9.4s, v24.4s // .......................................................................*............................................................................................|......................................................................*.......................................................................................... - // add v9.4s, v9.4s, v24.4s // ........................................................................*...........................................................................................|.......................................................................*......................................................................................... - // mul v24.4s, v12.4s, v2.4s // ..............................*.....................................................................................................................................|.............................*................................................................................................................................... - // sqrdmulh v12.4s, v12.4s, v6.4s // ................................*...................................................................................................................................|...............................*................................................................................................................................. 
- // mls v24.4s, v12.4s, v8.s[0] // ............................................*.......................................................................................................................|...........................................*..................................................................................................................... - // sub v12.4s, v11.4s, v24.4s // ..........................................................*.........................................................................................................|.........................................................*....................................................................................................... - // add v11.4s, v11.4s, v24.4s // ........................................................*...........................................................................................................|.......................................................*......................................................................................................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ..............................................................................e.....................................................................................|.............................................................................e................................................................................... - // ldr q4, [x5, #(-12*16 + 7*16)] // ....................................................................................................................................e...............................|...................................................................................................................................e............................. 
- // ldr q1, [x5, #(-12*16 + 8*16)] // ...................................................................................................................................................................e|................................................................................................................................................................. - // ldr q5, [x5, #(-12*16 + 9*16)] // .....................*..............................................................................................................................................|....................*............................................................................................................................................ - // ldr q2, [x5, #(-12*16 + 10*16)] // ............................................................*.......................................................................................................|...........................................................*..................................................................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ......................*.............................................................................................................................................|.....................*........................................................................................................................................... - // mul v24.4s, v15.4s, v0.4s // .......*............................................................................................................................................................|......*.......................................................................................................................................................... 
- // sqrdmulh v15.4s, v15.4s, v4.4s // ..............*.....................................................................................................................................................|.............*................................................................................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // .................................*..................................................................................................................................|................................*................................................................................................................................ - // sub v15.4s, v13.4s, v24.4s // .............................................*......................................................................................................................|............................................*.................................................................................................................... - // add v13.4s, v13.4s, v24.4s // ..................................................*.................................................................................................................|.................................................*............................................................................................................... - // mul v24.4s, v16.4s, v0.4s // .....*..............................................................................................................................................................|....*............................................................................................................................................................ 
- // sqrdmulh v16.4s, v16.4s, v4.4s // ....*...............................................................................................................................................................|...*............................................................................................................................................................. - // mls v24.4s, v16.4s, v8.s[0] // ...................*................................................................................................................................................|..................*.............................................................................................................................................. - // sub v16.4s, v14.4s, v24.4s // ....................................*...............................................................................................................................|...................................*............................................................................................................................. - // add v14.4s, v14.4s, v24.4s // .........................................*..........................................................................................................................|........................................*........................................................................................................................ - // mul v24.4s, v14.4s, v1.4s // ...............................................*....................................................................................................................|..............................................*.................................................................................................................. 
- // sqrdmulh v14.4s, v14.4s, v5.4s // ......................................................*.............................................................................................................|.....................................................*........................................................................................................... - // mls v24.4s, v14.4s, v8.s[0] // .....................................................................*..............................................................................................|....................................................................*............................................................................................ - // sub v14.4s, v13.4s, v24.4s // .....................................................................................*..............................................................................|....................................................................................*............................................................................ - // add v13.4s, v13.4s, v24.4s // ..............................................................................................*.....................................................................|.............................................................................................*................................................................... - // mul v24.4s, v16.4s, v2.4s // .............................................................................*......................................................................................|............................................................................*.................................................................................... 
- // sqrdmulh v16.4s, v16.4s, v6.4s // .........................................................................*..........................................................................................|........................................................................*........................................................................................ - // mls v24.4s, v16.4s, v8.s[0] // ...............................................................................................*....................................................................|..............................................................................................*.................................................................. - // sub v16.4s, v15.4s, v24.4s // ..........................................................................................................*.........................................................|.........................................................................................................*....................................................... - // add v15.4s, v15.4s, v24.4s // ...........................................................................................................*........................................................|..........................................................................................................*...................................................... - // trn1 v25.4s, v9.4s, v10.4s // .................................................................................*..................................................................................|................................................................................*................................................................................ 
- // trn2 v26.4s, v9.4s, v10.4s // ......................................................................................*.............................................................................|.....................................................................................*........................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ..................................................................*.................................................................................................|.................................................................*............................................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ...........................................................................................................................*........................................|..........................................................................................................................*...................................... - // trn2 v11.2d, v25.2d, v27.2d // ...................................................................................................................*................................................|..................................................................................................................*.............................................. - // trn2 v12.2d, v26.2d, v28.2d // ........................................................................................................................................*...........................|.......................................................................................................................................*......................... 
- // trn1 v9.2d, v25.2d, v27.2d // ...........................................................................................*........................................................................|..........................................................................................*...................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ............................................................................................................................................*.......................|...........................................................................................................................................*..................... - // trn1 v25.4s, v13.4s, v14.4s // ..............................................................................................................*.....................................................|.............................................................................................................*................................................... - // trn2 v26.4s, v13.4s, v14.4s // .......................................................................................................................................*............................|......................................................................................................................................*.......................... - // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................................................*....................................................|..............................................................................................................*.................................................. 
- // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................................................*.......................................|...........................................................................................................................*..................................... - // trn2 v15.2d, v25.2d, v27.2d // .....................................................................................................................................................*..............|....................................................................................................................................................*............ - // trn2 v16.2d, v26.2d, v28.2d // ................................................................................................................................................*...................|...............................................................................................................................................*................. - // trn1 v13.2d, v25.2d, v27.2d // .............................................................................................................................*......................................|............................................................................................................................*.................................... - // trn1 v14.2d, v26.2d, v28.2d // ...............................................................................................................................................*....................|..............................................................................................................................................*.................. 
- // str q9, [x1], #128 // ...........................................................................................................................................*........................|..........................................................................................................................................*...................... - // str q10, [x1, #(-(128) + 16*1)] // ......................................................................................................................................................*.............|.....................................................................................................................................................*........... - // str q11, [x1, #(-(128) + 16*2)] // ..................................................................................................................................................*.................|.................................................................................................................................................*............... - // str q12, [x1, #(-(128) + 16*3)] // ..........................................................................................................................................................*.........|.........................................................................................................................................................*....... - // str q13, [x2], #128 // ................................................................................................................................................................*...|...............................................................................................................................................................*. 
- // str q14, [x2, #(-(128) + 16*1)] // ............................................................................................................................................................*.......|...........................................................................................................................................................*..... - // str q15, [x2, #(-(128) + 16*2)] // .................................................................................................................................................................*..|................................................................................................................................................................* - // str q16, [x2, #(-(128) + 16*3)] // .............................................................................................................................................................*......|............................................................................................................................................................*.... + str q25, [x2, #-80] // ...................................................................................................................................................................* + trn2 v17.2D, v24.2D, v9.2D // ............................................................................e....................................................................................... + mul v15.4S, v0.4S, v31.4S // .........................................................................................................................e.......................................... + // gap // .................................................................................................................................................................... 
+ str q23, [x1, #32] // ..............................................................................................................................................................*..... + trn1 v31.2D, v14.2D, v6.2D // .......................................................................................e............................................................................ + // gap // .................................................................................................................................................................... + str q7, [x1], #128 // ............................................................................................................................................................*....... + // gap // .................................................................................................................................................................... + ldr q6, [x5, #-64] // ....................................................................................................................e............................................... 
+ + // ------------------------------------------------------------------------------------------------------------------------------------------------------------ new position ------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q9, [x1, #(16*0)] // ...............................................e....................................................................................................................'..............................................~................................................................................................................... + // ldr q10, [x1, #(16*1)] // .................e..................................................................................................................................................'................~................................................................................................................................................. + // ldr q11, [x1, #(16*2)] // ..............................e.....................................................................................................................................'.............................~.................................................................................................................................... 
+ // ldr q12, [x1, #(16*3)] // ...............e....................................................................................................................................................'..............~................................................................................................................................................... + // ldr q13, [x2, #(16*0)] // ...........e........................................................................................................................................................'..........~....................................................................................................................................................... + // ldr q14, [x2, #(16*1)] // .............e......................................................................................................................................................'............~..................................................................................................................................................... + // ldr q15, [x2, #(16*2)] // ..e.................................................................................................................................................................'.~................................................................................................................................................................ + // ldr q16, [x2, #(16*3)] // ....e...............................................................................................................................................................'...~.............................................................................................................................................................. 
+ // ldr q0, [x4], #64 // e...................................................................................................................................................................~.................................................................................................................................................................. + // ldr q1, [x4, #(-64 + 16)] // ...........................................e........................................................................................................................'..........................................~....................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ............................................................................e.......................................................................................'...........................................................................~...................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ...............................................................................e....................................................................................'..............................................................................~................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .....................................e..............................................................................................................................'....................................~............................................................................................................................. 
+ // mul v24.4s, v13.4s, v0.s[0] // ................................e...................................................................................................................................'...............................~.................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...................................................e................................................................................................................'..................................................~............................................................................................................... + // sub v13.4s, v9.4s, v24.4s // ................................................................e...................................................................................................'...............................................................~.................................................................................................. + // add v9.4s, v9.4s, v24.4s // ..........................................................................e.........................................................................................'.........................................................................~........................................................................................ + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ........................................e...........................................................................................................................'.......................................~.......................................................................................................................... 
+ // mul v24.4s, v14.4s, v0.s[0] // ..........................................e.........................................................................................................................'.........................................~........................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // .......................................................e............................................................................................................'......................................................~........................................................................................................... + // sub v14.4s, v10.4s, v24.4s // .......................................................................e............................................................................................'......................................................................~........................................................................................... + // add v10.4s, v10.4s, v24.4s // ...................................................................e................................................................................................'..................................................................~............................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ......................e.............................................................................................................................................'.....................~............................................................................................................................................ 
+ // mul v24.4s, v15.4s, v0.s[0] // .............................e......................................................................................................................................'............................~..................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................e......................................................................................................................'............................................~..................................................................................................................... + // sub v15.4s, v11.4s, v24.4s // .........................................................e..........................................................................................................'........................................................~......................................................................................................... + // add v11.4s, v11.4s, v24.4s // ..........................................................e.........................................................................................................'.........................................................~........................................................................................................ + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .......................e............................................................................................................................................'......................~........................................................................................................................................... 
+ // mul v24.4s, v16.4s, v0.s[0] // ........................e...........................................................................................................................................'.......................~.......................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................e.............................................................................................................................'.....................................~............................................................................................................................ + // sub v16.4s, v12.4s, v24.4s // .....................................................e..............................................................................................................'....................................................~............................................................................................................. + // add v12.4s, v12.4s, v24.4s // ....................................................e...............................................................................................................'...................................................~.............................................................................................................. + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................................................................e..................................................................................................'................................................................~................................................................................................. 
+ // mul v24.4s, v11.4s, v0.s[2] // ..................................................................e.................................................................................................'.................................................................~................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................e......................................................................................'............................................................................~..................................................................................... + // sub v11.4s, v9.4s, v24.4s // ................................................................................................................e...................................................'...............................................................................................................~.................................................. + // add v9.4s, v9.4s, v24.4s // ..........................................................................................................e.........................................................'.........................................................................................................~........................................................ + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ....................................................................e...............................................................................................'...................................................................~.............................................................................................. 
+ // mul v24.4s, v12.4s, v0.s[2] // .............................................................e......................................................................................................'............................................................~..................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................................................e..................................................................................'................................................................................~................................................................................. + // sub v12.4s, v10.4s, v24.4s // ................................................................................................e...................................................................'...............................................................................................~.................................................................. + // add v10.4s, v10.4s, v24.4s // ...............................................................................................e....................................................................'..............................................................................................~................................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...............................................................e....................................................................................................'..............................................................~................................................................................................... 
+ // mul v24.4s, v15.4s, v1.s[0] // .....................................................................e..............................................................................................'....................................................................~............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................................e...............................................................................'...................................................................................~.............................................................................. + // sub v15.4s, v13.4s, v24.4s // .........................................................................................................e..........................................................'........................................................................................................~......................................................... + // add v13.4s, v13.4s, v24.4s // .............................................................................................................e......................................................'............................................................................................................~..................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ..............................................................e.....................................................................................................'.............................................................~.................................................................................................... 
+ // mul v24.4s, v16.4s, v1.s[0] // ............................................................e.......................................................................................................'...........................................................~...................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................e...........................................................................................'.......................................................................~.......................................................................................... + // sub v16.4s, v14.4s, v24.4s // .....................................................................................e..............................................................................'....................................................................................~............................................................................. + // add v14.4s, v14.4s, v24.4s // ......................................................................................e.............................................................................'.....................................................................................~............................................................................ + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .......................................................................................................e............................................................'......................................................................................................~........................................................... 
+ // mul v24.4s, v10.4s, v1.s[2] // .....................................................................................................e..............................................................'....................................................................................................~............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................................................................e................................................'..................................................................................................................~............................................... + // sub v10.4s, v9.4s, v24.4s // .............................................................................................................................e......................................'............................................................................................................................~..................................... + // add v9.4s, v9.4s, v24.4s // ..............................................................................................................................e.....................................'.............................................................................................................................~.................................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ........................................................................................................e...........................................................'.......................................................................................................~.......................................................... 
+ // mul v24.4s, v12.4s, v2.s[0] // ......................................................................................................e.............................................................'.....................................................................................................~............................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................................................................................................................e..................................................'................................................................................................................~................................................. + // sub v12.4s, v11.4s, v24.4s // ...............................................................................................................................e....................................'..............................................................................................................................~................................... + // add v11.4s, v11.4s, v24.4s // ............................................................................................................................e.......................................'...........................................................................................................................~...................................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ..................................................................................................e.................................................................'.................................................................................................~................................................................ 
+ // mul v24.4s, v14.4s, v2.s[2] // ............................................................................................e.......................................................................'...........................................................................................~...................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................................................................e....................................................'..............................................................................................................~................................................... + // sub v14.4s, v13.4s, v24.4s // .......................................................................................................................e............................................'......................................................................................................................~........................................... + // add v13.4s, v13.4s, v24.4s // ........................................................................................................................e...........................................'.......................................................................................................................~.......................................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................................................................................................e..................................................................'................................................................................................~................................................................. 
+ // mul v24.4s, v16.4s, v3.s[0] // ..............................................................................................e.....................................................................'.............................................................................................~.................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................e.....................................................'.............................................................................................................~.................................................... + // sub v16.4s, v15.4s, v24.4s // ..........................................................................................................................e.........................................'.........................................................................................................................~........................................ + // add v15.4s, v15.4s, v24.4s // .........................................................................................................................e..........................................'........................................................................................................................~......................................... + // trn1 v25.4s, v9.4s, v10.4s // .............................................................................................................................................e......................'............................................................................................................................................~..................... 
+ // trn2 v26.4s, v9.4s, v10.4s // .........................................................................................................................................e..........................'........................................................................................................................................~......................... + // trn1 v27.4s, v11.4s, v12.4s // ...................................................................................................................................................e................'..................................................................................................................................................~............... + // trn2 v28.4s, v11.4s, v12.4s // ........................................................................................................................................e...........................'.......................................................................................................................................~.......................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................................................................................................................................................e.....'.............................................................................................................................................................~.... + // trn2 v12.2d, v26.2d, v28.2d // ....................................................................................................................................................e...............'...................................................................................................................................................~.............. 
+ // trn1 v9.2d, v25.2d, v27.2d // .......~............................................................................................................................................................'......*........................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ........~...........................................................................................................................................................'.......*.......................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..................................................................................................................................e.................................'.................................................................................................................................~................................ + // trn2 v26.4s, v13.4s, v14.4s // ................................................................................................................................e...................................'...............................................................................................................................~.................................. + // trn1 v27.4s, v15.4s, v16.4s // ....................................................................................................................................e...............................'...................................................................................................................................~.............................. 
+ // trn2 v28.4s, v15.4s, v16.4s // ...................................................................................................................................e................................'..................................................................................................................................~............................... + // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................................................................e..................'................................................................................................................................................~................. + // trn2 v16.2d, v26.2d, v28.2d // ...........................................................................................................................................e........................'..........................................................................................................................................~....................... + // trn1 v13.2d, v25.2d, v27.2d // ..........~.........................................................................................................................................................'.........*........................................................................................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // .................................................................................................................................................................e..'................................................................................................................................................................~. 
+ // ldr q0, [ x5], #(12*16) // ..............e.....................................................................................................................................................'.............~.................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // .............................................................................................e......................................................................'............................................................................................~..................................................................... + // ldr q1, [ x5, #(-12*16 + 2*16)] // .................................................................................................................................e..................................'................................................................................................................................~................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ............................................................................................................................................e.......................'...........................................................................................................................................~...................... + // ldr q2, [ x5, #(-12*16 + 4*16)] // ...............................~....................................................................................................................................'..............................*................................................................................................................................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ..................................~.................................................................................................................................'.................................*................................................................................................................................ + // sqrdmulh v27.4s, v11.4s, v4.4s // .....~..............................................................................................................................................................'....*............................................................................................................................................................. + // mul v24.4s, v11.4s, v0.4s // ......~.............................................................................................................................................................'.....*............................................................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ..................~.................................................................................................................................................'.................*................................................................................................................................................ + // sub v11.4s, v9.4s, v24.4s // ...........................................................................................~........................................................................'..........................................................................................*....................................................................... 
+ // add v9.4s, v9.4s, v24.4s // ............................................~.......................................................................................................................'...........................................*...................................................................................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ..........................................................................................................................................................e.........'.........................................................................................................................................................~........ + // mul v24.4s, v12.4s, v0.4s // ...........................................................................................................................................................e........'..........................................................................................................................................................~....... + // mls v24.4s, v27.4s, v8.s[0] // ............~.......................................................................................................................................................'...........*...................................................................................................................................................... + // sub v12.4s, v10.4s, v24.4s // ..........................~.........................................................................................................................................'.........................*........................................................................................................................................ 
+ // add v10.4s, v10.4s, v24.4s // .........................~..........................................................................................................................................'........................*......................................................................................................................................... + // sqrdmulh v27.4s, v10.4s, v5.4s // ....................................~...............................................................................................................................'...................................*.............................................................................................................................. + // mul v24.4s, v10.4s, v1.4s // ...................................~................................................................................................................................'..................................*............................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................~.................................................................................................................'.................................................*................................................................................................................ + // sub v10.4s, v9.4s, v24.4s // ......................................................................~.............................................................................................'.....................................................................*............................................................................................ 
+ // add v9.4s, v9.4s, v24.4s // ...................................................................................~................................................................................'..................................................................................*............................................................................... + // sqrdmulh v27.4s, v12.4s, v6.4s // ......................................................~.............................................................................................................'.....................................................*............................................................................................................ + // mul v24.4s, v12.4s, v2.4s // .................................................~..................................................................................................................'................................................*................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................~............................................................................'......................................................................................*........................................................................... + // sub v12.4s, v11.4s, v24.4s // ...................................................................................................~................................................................'..................................................................................................*............................................................... 
+ // add v11.4s, v11.4s, v24.4s // ....................................................................................................~...............................................................'...................................................................................................*.............................................................. + // ldr q0, [ x5, #(-12*16 + 6*16)] // ................................................................................e...................................................................................'...............................................................................~.................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // .....................................................................................................................................e..............................'....................................................................................................................................~............................. + // ldr q1, [ x5, #(-12*16 + 8*16)] // ...................................................................................................................................................................e'.................................................................................................................................................................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................................................................e........................................................................................'..........................................................................~....................................................................................... 
+ // ldr q2, [ x5, #(-12*16 + 10*16)] // .........~..........................................................................................................................................................'........*......................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ....................~...............................................................................................................................................'...................*.............................................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ...~................................................................................................................................................................'..*............................................................................................................................................................... + // mul v24.4s, v15.4s, v0.4s // ...............................................................................................................................................................e....'..............................................................................................................................................................~... + // mls v24.4s, v27.4s, v8.s[0] // ...................~................................................................................................................................................'..................*............................................................................................................................................... 
+ // sub v15.4s, v13.4s, v24.4s // ................................................~...................................................................................................................'...............................................*.................................................................................................................. + // add v13.4s, v13.4s, v24.4s // .......................................~............................................................................................................................'......................................*........................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // .........................................................................................................................................................e..........'........................................................................................................................................................~......... + // mul v24.4s, v16.4s, v0.4s // ........................................................................................................................................................e...........'.......................................................................................................................................................~.......... + // mls v24.4s, v27.4s, v8.s[0] // .~..................................................................................................................................................................'*................................................................................................................................................................. 
+ // sub v16.4s, v14.4s, v24.4s // ................~...................................................................................................................................................'...............*.................................................................................................................................................. + // add v14.4s, v14.4s, v24.4s // .....................~..............................................................................................................................................'....................*............................................................................................................................................. + // sqrdmulh v27.4s, v14.4s, v5.4s // ............................~.......................................................................................................................................'...........................*...................................................................................................................................... + // mul v24.4s, v14.4s, v1.4s // ...........................~........................................................................................................................................'..........................*....................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .........................................~..........................................................................................................................'........................................*......................................................................................................................... 
+ // sub v14.4s, v13.4s, v24.4s // .........................................................................~..........................................................................................'........................................................................*......................................................................................... + // add v13.4s, v13.4s, v24.4s // ........................................................~...........................................................................................................'.......................................................*.......................................................................................................... + // sqrdmulh v27.4s, v16.4s, v6.4s // ..............................................~.....................................................................................................................'.............................................*.................................................................................................................... + // mul v24.4s, v16.4s, v2.4s // .................................~..................................................................................................................................'................................*................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................~........................................................................................................'..........................................................*....................................................................................................... 
+ // sub v16.4s, v15.4s, v24.4s // ..............................................................................~.....................................................................................'.............................................................................*.................................................................................... + // add v15.4s, v15.4s, v24.4s // .........................................................................................~..........................................................................'........................................................................................*......................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ..............................................................................................................................................~.....................'.............................................................................................................................................*.................... + // trn2 v26.4s, v9.4s, v10.4s // ..........................................................................................~.........................................................................'.........................................................................................*........................................................................ + // trn1 v27.4s, v11.4s, v12.4s // ..........................................................................................................................................~.........................'.........................................................................................................................................*........................ 
+ // trn2 v28.4s, v11.4s, v12.4s // ............................................................................................................~.......................................................'...........................................................................................................*...................................................... + // trn2 v11.2d, v25.2d, v27.2d // .....................................................................................................................................................~..............'....................................................................................................................................................*............. + // trn2 v12.2d, v26.2d, v28.2d // ......................................................................................................................~.............................................'.....................................................................................................................*............................................ + // trn1 v9.2d, v25.2d, v27.2d // ......................................................................................................................................................~.............'.....................................................................................................................................................*............ + // trn1 v10.2d, v26.2d, v28.2d // ..................................................................................................................~.................................................'.................................................................................................................*................................................ 
+ // trn1 v25.4s, v13.4s, v14.4s // ........................................................................................~...........................................................................'.......................................................................................*.......................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................................~.................................................................................'.................................................................................*................................................................................ + // trn1 v27.4s, v15.4s, v16.4s // .....................................................................................................................~..............................................'....................................................................................................................*............................................. + // trn2 v28.4s, v15.4s, v16.4s // ...........................................................................................................~........................................................'..........................................................................................................*....................................................... + // trn2 v15.2d, v25.2d, v27.2d // .......................................................................................................................................~............................'......................................................................................................................................*........................... 
+ // trn2 v16.2d, v26.2d, v28.2d // ....................................................................................................................~...............................................'...................................................................................................................*.............................................. + // trn1 v13.2d, v25.2d, v27.2d // ................................................................................................................................................~...................'...............................................................................................................................................*.................. + // trn1 v14.2d, v26.2d, v28.2d // ......................................................................................................................................~.............................'.....................................................................................................................................*............................ + // str q9, [x1], #128 // ..................................................................................................................................................................~.'.................................................................................................................................................................* + // str q10, [x1, #(-(128) + 16*1)] // ..................................................................................................................................................~.................'.................................................................................................................................................*................ 
+ // str q11, [x1, #(-(128) + 16*2)] // ................................................................................................................................................................~...'...............................................................................................................................................................*.. + // str q12, [x1, #(-(128) + 16*3)] // ...........................................................................................................................~........................................'..........................................................................................................................*....................................... + // str q13, [x2], #128 // ............................................................................................................................................................~.......'...........................................................................................................................................................*...... + // str q14, [x2, #(-(128) + 16*1)] // .......................................................................................................................................................~............'......................................................................................................................................................*........... + // str q15, [x2, #(-(128) + 16*2)] // ...............................................................................................................................................~....................'..............................................................................................................................................*................... 
+ // str q16, [x2, #(-(128) + 16*3)] // .............................................................................................................................................................~......'............................................................................................................................................................*..... sub count, count, #1 cbnz count, layer45678_start - sqrdmulh v2.4S, v0.4S, v26.4S // .*.................................................................... - mul v14.4S, v0.4S, v6.4S // ..*................................................................... - trn1 v11.2D, v15.2D, v10.2D // ......*............................................................... - ldr q4, [x5, #-32] // ...................................*.................................. - ldr q1, [x5, #-112] // .....*................................................................ - ldr q3, [x5, #-48] // ...............*...................................................... - mls v21.4S, v23.4S, v8.S[0] // *..................................................................... - // gap // ...................................................................... - sqrdmulh v0.4S, v12.4S, v26.4S // .........*............................................................ - mul v30.4S, v12.4S, v6.4S // ....*................................................................. - mul v5.4S, v13.4S, v20.4S // .......*.............................................................. - ldr q29, [x5, #-160] // ...*.................................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- trn1 v31.2D, v31.2D, v22.2D // ..............*....................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v6.4S, v11.4S, v21.4S // ...........*.......................................................... - add v11.4S, v11.4S, v21.4S // ............*......................................................... - mls v14.4S, v2.4S, v8.S[0] // .............*........................................................ - sqrdmulh v9.4S, v13.4S, v9.4S // ........*............................................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v30.4S, v0.4S, v8.S[0] // ....................*................................................. - trn1 v17.2D, v24.2D, v7.2D // ..........*........................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - mul v22.4S, v11.4S, v29.4S // ........................*............................................. - sqrdmulh v24.4S, v6.4S, v1.4S // ...................*.................................................. - mul v18.4S, v6.4S, v28.4S // ..................*................................................... - sqrdmulh v7.4S, v11.4S, v16.4S // .................*.................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - ldr q28, [x5, #-16] // ................*..................................................... - sub v26.4S, v17.4S, v14.4S // ......................*............................................... - add v11.4S, v17.4S, v14.4S // .......................*.............................................. - mls v5.4S, v9.4S, v8.S[0] // .....................*................................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - add v10.4S, v31.4S, v30.4S // .............................*........................................ - sub v1.4S, v31.4S, v30.4S // ..........................*........................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v22.4S, v7.4S, v8.S[0] // ..................................*................................... - mul v2.4S, v11.4S, v27.4S // ...........................*.......................................... - sqrdmulh v12.4S, v11.4S, v3.4S // ..............................*....................................... - mls v18.4S, v24.4S, v8.S[0] // .........................*............................................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mul v11.4S, v26.4S, v4.4S // .........................................*............................ - sub v20.4S, v25.4S, v5.4S // ............................*......................................... - add v5.4S, v25.4S, v5.4S // ...............................*...................................... - sqrdmulh v9.4S, v26.4S, v28.4S // ........................................*............................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v2.4S, v12.4S, v8.S[0] // .....................................*................................ - add v0.4S, v20.4S, v18.4S // ................................*..................................... - sub v3.4S, v20.4S, v18.4S // .................................*.................................... - sub v31.4S, v5.4S, v22.4S // ......................................*............................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - mls v11.4S, v9.4S, v8.S[0] // ...............................................*...................... - add v7.4S, v5.4S, v22.4S // .......................................*.............................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - trn2 v27.4S, v0.4S, v3.4S // .....................................................*................ - trn1 v25.4S, v0.4S, v3.4S // ....................................*................................. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v28.4S, v10.4S, v2.4S // ...........................................*.......................... - add v10.4S, v10.4S, v2.4S // ..............................................*....................... - trn2 v12.4S, v7.4S, v31.4S // ............................................*......................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - sub v20.4S, v1.4S, v11.4S // ................................................*..................... - add v29.4S, v1.4S, v11.4S // .................................................*.................... - trn1 v26.4S, v7.4S, v31.4S // ..........................................*........................... - // gap // ...................................................................... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - trn1 v23.2D, v12.2D, v27.2D // ...........................................................*.......... - trn2 v12.2D, v12.2D, v27.2D // .........................................................*............ - trn2 v11.4S, v10.4S, v28.4S // ........................................................*............. - trn1 v18.4S, v10.4S, v28.4S // ..................................................*................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - trn2 v7.4S, v29.4S, v20.4S // ......................................................*............... - trn1 v5.4S, v29.4S, v20.4S // ...................................................*.................. - trn2 v22.2D, v26.2D, v25.2D // ....................................................*................. - trn1 v0.2D, v26.2D, v25.2D // .............................................*........................ - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q23, [x1, #16] // ................................................................*..... - // gap // ...................................................................... 
- // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q0, [x1], #128 // ..........................................................*........... - trn1 v1.2D, v11.2D, v7.2D // ............................................................*......... - trn2 v4.2D, v11.2D, v7.2D // .............................................................*........ - str q22, [x1, #-96] // ..............................................................*....... - trn1 v9.2D, v18.2D, v5.2D // .......................................................*.............. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - trn2 v30.2D, v18.2D, v5.2D // ...............................................................*...... - str q12, [x1, #-80] // .................................................................*.... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... 
- str q1, [x2, #16] // ..................................................................*... - str q4, [x2, #48] // ...................................................................*.. - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - str q9, [x2], #128 // ....................................................................*. - str q30, [x2, #-96] // .....................................................................* - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - // gap // ...................................................................... - - // original source code - // mls v21.4S, v23.4S, v8.S[0] // ......*............................................................... - // sqrdmulh v4.4S, v0.4S, v26.4S // *..................................................................... - // mul v11.4S, v0.4S, v6.4S // .*.................................................................... - // ldr q14, [x5, #-160] // ..........*........................................................... - // mul v2.4S, v12.4S, v6.4S // ........*............................................................. 
- // ldr q6, [x5, #-112] // ....*................................................................. - // trn1 v0.2D, v15.2D, v10.2D // ..*................................................................... - // mul v5.4S, v13.4S, v20.4S // .........*............................................................ - // sqrdmulh v13.4S, v13.4S, v9.4S // ...............*...................................................... - // sqrdmulh v29.4S, v12.4S, v26.4S // .......*.............................................................. - // trn1 v23.2D, v24.2D, v7.2D // .................*.................................................... - // sub v18.4S, v0.4S, v21.4S // ............*......................................................... - // add v12.4S, v0.4S, v21.4S // .............*........................................................ - // mls v11.4S, v4.4S, v8.S[0] // ..............*....................................................... - // trn1 v26.2D, v31.2D, v22.2D // ...........*.......................................................... - // ldr q31, [x5, #-48] // .....*................................................................ - // ldr q15, [x5, #-16] // ......................*............................................... - // sqrdmulh v7.4S, v12.4S, v16.4S // .....................*................................................ - // mul v4.4S, v18.4S, v28.4S // ....................*................................................. - // sqrdmulh v6.4S, v18.4S, v6.4S // ...................*.................................................. - // mls v2.4S, v29.4S, v8.S[0] // ................*..................................................... - // mls v5.4S, v13.4S, v8.S[0] // .........................*............................................ - // sub v29.4S, v23.4S, v11.4S // .......................*.............................................. 
- // add v23.4S, v23.4S, v11.4S // ........................*............................................. - // mul v18.4S, v12.4S, v14.4S // ..................*................................................... - // mls v4.4S, v6.4S, v8.S[0] // ...............................*...................................... - // sub v12.4S, v26.4S, v2.4S // ...........................*.......................................... - // mul v21.4S, v23.4S, v27.4S // .............................*........................................ - // sub v6.4S, v25.4S, v5.4S // .................................*.................................... - // add v24.4S, v26.4S, v2.4S // ..........................*........................................... - // sqrdmulh v27.4S, v23.4S, v31.4S // ..............................*....................................... - // add v30.4S, v25.4S, v5.4S // ..................................*................................... - // add v20.4S, v6.4S, v4.4S // .....................................*................................ - // sub v22.4S, v6.4S, v4.4S // ......................................*............................... - // mls v18.4S, v7.4S, v8.S[0] // ............................*......................................... - // ldr q6, [x5, #-32] // ...*.................................................................. - // trn1 v9.4S, v20.4S, v22.4S // ...........................................*.......................... - // mls v21.4S, v27.4S, v8.S[0] // ....................................*................................. - // sub v19.4S, v30.4S, v18.4S // .......................................*.............................. - // add v31.4S, v30.4S, v18.4S // .........................................*............................ - // sqrdmulh v26.4S, v29.4S, v15.4S // ...................................*.................................. 
- // mul v4.4S, v29.4S, v6.4S // ................................*..................................... - // trn1 v17.4S, v31.4S, v19.4S // .................................................*.................... - // sub v13.4S, v24.4S, v21.4S // ............................................*......................... - // trn2 v18.4S, v31.4S, v19.4S // ..............................................*....................... - // trn1 v5.2D, v17.2D, v9.2D // .........................................................*............ - // add v24.4S, v24.4S, v21.4S // .............................................*........................ - // mls v4.4S, v26.4S, v8.S[0] // ........................................*............................. - // sub v23.4S, v12.4S, v4.4S // ...............................................*...................... - // add v12.4S, v12.4S, v4.4S // ................................................*..................... - // trn1 v25.4S, v24.4S, v13.4S // .....................................................*................ - // trn1 v31.4S, v12.4S, v23.4S // .......................................................*.............. - // trn2 v3.2D, v17.2D, v9.2D // ........................................................*............. - // trn2 v7.4S, v20.4S, v22.4S // ..........................................*........................... - // trn2 v23.4S, v12.4S, v23.4S // ......................................................*............... - // trn1 v17.2D, v25.2D, v31.2D // ...............................................................*...... - // trn2 v24.4S, v24.4S, v13.4S // ....................................................*................. - // trn2 v4.2D, v18.2D, v7.2D // ...................................................*.................. - // str q5, [x1], #128 // ...........................................................*.......... 
- // trn1 v13.2D, v18.2D, v7.2D // ..................................................*................... - // trn1 v11.2D, v24.2D, v23.2D // ............................................................*......... - // trn2 v19.2D, v24.2D, v23.2D // .............................................................*........ - // str q3, [x1, #-96] // ..............................................................*....... - // trn2 v12.2D, v25.2D, v31.2D // ................................................................*..... - // str q13, [x1, #-112] // ..........................................................*........... - // str q4, [x1, #-80] // .................................................................*.... - // str q11, [x2, #16] // ..................................................................*... - // str q19, [x2, #48] // ...................................................................*.. - // str q17, [x2], #128 // ....................................................................*. - // str q12, [x2, #-96] // .....................................................................* + // Instructions: 66 + // Expected cycles: 21 + // Expected IPC: 3.14 + // + // Wall time: 5.81s + // User time: 5.81s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + trn1 v4.2D, v19.2D, v3.2D // .....*............................................................ + ldr q25, [x5, #-32] // ......*........................................................... + mls v26.4S, v11.4S, v8.S[0] // *................................................................. + mls v13.4S, v12.4S, v8.S[0] // ........*......................................................... + ldr q28, [x5, #-128] // ..................*............................................... + ldr q1, [x5, #-16] // ............*..................................................... 
+ sqrdmulh v2.4S, v17.4S, v30.4S // ..*............................................................... + // gap // .................................................................. + trn1 v7.2D, v24.2D, v9.2D // ....*............................................................. + sqrdmulh v21.4S, v0.4S, v21.4S // .*................................................................ + mul v10.4S, v17.4S, v10.4S // ...*.............................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v18.2D, v22.2D, v29.2D // .......*.......................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + ldr q0, [x5, #-112] // ....................*............................................. + sub v14.4S, v31.4S, v26.4S // .........*........................................................ + add v19.4S, v31.4S, v26.4S // .............*.................................................... + sub v23.4S, v4.4S, v13.4S // ...............*.................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v10.4S, v2.4S, v8.S[0] // ..........*....................................................... + mls v15.4S, v21.4S, v8.S[0] // ...........*...................................................... + // gap // .................................................................. + add v2.4S, v4.4S, v13.4S // ..............*................................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v26.4S, v19.4S, v6.4S // ................*................................................. + sqrdmulh v6.4S, v19.4S, v16.4S // .................*................................................ + sqrdmulh v9.4S, v14.4S, v1.4S // ..........................*....................................... + mul v13.4S, v14.4S, v25.4S // ...................*.............................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mul v3.4S, v23.4S, v28.4S // ............................*..................................... + sqrdmulh v16.4S, v23.4S, v0.4S // ..............................*................................... + sqrdmulh v25.4S, v2.4S, v20.4S // ......................*........................................... 
+ mul v31.4S, v2.4S, v5.4S // .....................*............................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v2.4S, v7.4S, v10.4S // .........................*........................................ + sub v19.4S, v7.4S, v10.4S // ..........................................*....................... + add v1.4S, v18.4S, v15.4S // .......................*.......................................... + sub v11.4S, v18.4S, v15.4S // ...........................*...................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v26.4S, v6.4S, v8.S[0] // ........................*......................................... + mls v13.4S, v9.4S, v8.S[0] // ................................*................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + mls v3.4S, v16.4S, v8.S[0] // ......................................*........................... + // gap // .................................................................. 
+ // gap // .................................................................. + mls v31.4S, v25.4S, v8.S[0] // .............................*.................................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v4.4S, v1.4S, v26.4S // ...............................*.................................. + sub v28.4S, v11.4S, v13.4S // ...................................*.............................. + sub v29.4S, v1.4S, v26.4S // ..................................*............................... + add v26.4S, v11.4S, v13.4S // ........................................*......................... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + add v22.4S, v2.4S, v31.4S // .....................................*............................ 
+ sub v6.4S, v19.4S, v3.4S // ...........................................*...................... + add v9.4S, v19.4S, v3.4S // ............................................*..................... + sub v19.4S, v2.4S, v31.4S // .................................*................................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v0.4S, v4.4S, v29.4S // ....................................*............................. + trn1 v29.4S, v4.4S, v29.4S // .......................................*.......................... + trn2 v4.4S, v26.4S, v28.4S // .............................................*.................... + trn1 v28.4S, v26.4S, v28.4S // .................................................*................ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn2 v7.4S, v9.4S, v6.4S // ..............................................*................... + trn2 v13.4S, v22.4S, v19.4S // .........................................*........................ + trn1 v19.4S, v22.4S, v19.4S // .......................................................*.......... + trn1 v26.4S, v9.4S, v6.4S // ......................................................*........... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + trn2 v22.2D, v0.2D, v4.2D // ................................................*................. + trn1 v6.2D, v29.2D, v28.2D // .........................................................*........ + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + trn1 v14.2D, v19.2D, v26.2D // ............................................................*..... + trn2 v15.2D, v13.2D, v7.2D // ..................................................*............... + trn2 v16.2D, v19.2D, v26.2D // ...........................................................*...... + trn1 v26.2D, v0.2D, v4.2D // ....................................................*............. + str q22, [x2, #48] // ...............................................................*.. + trn1 v22.2D, v13.2D, v7.2D // ...............................................*.................. + str q6, [x2], #128 // ..............................................................*... + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + trn2 v19.2D, v29.2D, v28.2D // .....................................................*............ + str q15, [x1, #48] // ...................................................*.............. + str q14, [x1], #128 // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q16, [x1, #-96] // ................................................................*. + str q22, [x1, #-112] // ..........................................................*....... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q26, [x2, #-112] // .............................................................*.... + str q19, [x2, #-96] // ........................................................*......... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ // gap // .................................................................. + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // mls v26.4S, v11.4S, v8.S[0] // ..*............................................................... + // sqrdmulh v11.4S, v0.4S, v21.4S // ........*......................................................... + // sqrdmulh v23.4S, v17.4S, v30.4S // ......*........................................................... + // mul v14.4S, v17.4S, v10.4S // .........*........................................................ + // trn1 v1.2D, v24.2D, v9.2D // .......*.......................................................... + // trn1 v4.2D, v19.2D, v3.2D // *................................................................. + // ldr q9, [x5, #-32] // .*................................................................ + // trn1 v2.2D, v22.2D, v29.2D // ..........*....................................................... + // mls v13.4S, v12.4S, v8.S[0] // ...*.............................................................. + // sub v29.4S, v31.4S, v26.4S // ............*..................................................... + // mls v14.4S, v23.4S, v8.S[0] // ...............*.................................................. + // mls v15.4S, v11.4S, v8.S[0] // ................*................................................. + // ldr q12, [x5, #-16] // .....*............................................................ + // add v22.4S, v31.4S, v26.4S // .............*.................................................... + // add v3.4S, v4.4S, v13.4S // .................*................................................ + // sub v11.4S, v4.4S, v13.4S // ..............*................................................... 
+ // mul v4.4S, v22.4S, v6.4S // ..................*............................................... + // sqrdmulh v24.4S, v22.4S, v16.4S // ...................*.............................................. + // ldr q31, [x5, #-128] // ....*............................................................. + // mul v30.4S, v29.4S, v9.4S // .....................*............................................ + // ldr q18, [x5, #-112] // ...........*...................................................... + // mul v13.4S, v3.4S, v5.4S // .........................*........................................ + // sqrdmulh v20.4S, v3.4S, v20.4S // ........................*......................................... + // add v9.4S, v2.4S, v15.4S // ............................*..................................... + // mls v4.4S, v24.4S, v8.S[0] // ..............................*................................... + // add v24.4S, v1.4S, v14.4S // ..........................*....................................... + // sqrdmulh v29.4S, v29.4S, v12.4S // ....................*............................................. + // sub v12.4S, v2.4S, v15.4S // .............................*.................................... + // mul v15.4S, v11.4S, v31.4S // ......................*........................................... + // mls v13.4S, v20.4S, v8.S[0] // .................................*................................ + // sqrdmulh v23.4S, v11.4S, v18.4S // .......................*.......................................... + // add v2.4S, v9.4S, v4.4S // ..................................*............................... + // mls v30.4S, v29.4S, v8.S[0] // ...............................*.................................. + // sub v18.4S, v24.4S, v13.4S // .........................................*........................ + // sub v5.4S, v9.4S, v4.4S // ....................................*............................. 
+ // sub v4.4S, v12.4S, v30.4S // ...................................*.............................. + // trn2 v11.4S, v2.4S, v5.4S // ..........................................*....................... + // add v13.4S, v24.4S, v13.4S // ......................................*........................... + // mls v15.4S, v23.4S, v8.S[0] // ................................*................................. + // trn1 v2.4S, v2.4S, v5.4S // ...........................................*...................... + // add v5.4S, v12.4S, v30.4S // .....................................*............................ + // trn2 v0.4S, v13.4S, v18.4S // ...............................................*.................. + // sub v29.4S, v1.4S, v14.4S // ...........................*...................................... + // sub v12.4S, v29.4S, v15.4S // .......................................*.......................... + // add v23.4S, v29.4S, v15.4S // ........................................*......................... + // trn2 v7.4S, v5.4S, v4.4S // ............................................*..................... + // trn2 v19.4S, v23.4S, v12.4S // ..............................................*................... + // trn1 v9.2D, v0.2D, v19.2D // .........................................................*........ + // trn2 v25.2D, v11.2D, v7.2D // ..................................................*............... + // trn1 v1.4S, v5.4S, v4.4S // .............................................*.................... + // trn2 v4.2D, v0.2D, v19.2D // .....................................................*............ + // str q4, [x1, #48] // ............................................................*..... + // trn1 v28.2D, v11.2D, v7.2D // .......................................................*.......... + // trn2 v17.2D, v2.2D, v1.2D // ...........................................................*...... 
+ // trn1 v26.4S, v23.4S, v12.4S // .................................................*................ + // trn1 v4.4S, v13.4S, v18.4S // ................................................*................. + // str q17, [x2, #32] // .................................................................* + // trn1 v18.2D, v2.2D, v1.2D // ...................................................*.............. + // str q9, [x1, #16] // ...............................................................*.. + // trn2 v23.2D, v4.2D, v26.2D // ......................................................*........... + // trn1 v7.2D, v4.2D, v26.2D // ....................................................*............. + // str q28, [x2, #16] // ................................................................*. + // str q18, [x2], #128 // ..........................................................*....... + // str q25, [x2, #-80] // ........................................................*......... + // str q23, [x1, #32] // ..............................................................*... + // str q7, [x1], #128 // .............................................................*.... pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm.s index 7231387c..47499d07 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. 
- -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + 
ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 
10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -375,466 +347,490 @@ _ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr q16, [x0, #384] // 
.*........................................... - // gap // ............................................. - ldr q21, [x0, #896] // ..*.......................................... - // gap // ............................................. - ldr q10, [x0, #0] // ...........*................................. - ldr q29, [x0, #768] // ...*......................................... - // gap // ............................................. - // gap // ............................................. - ldr q25, [x0, #640] // ......*...................................... - // gap // ............................................. - // gap // ............................................. - // gap // ............................................. - ldr q7, [x0, #512] // *............................................ - // gap // ............................................. - // gap // ............................................. - // gap // ............................................. - mul v4.4S, v21.4S, v0.S[0] // .......*..................................... - sqrdmulh v13.4S, v21.4S, v0.S[1] // ........*.................................... - ldr q9, [x0, #256] // ..............*.............................. - // gap // ............................................. - sqrdmulh v24.4S, v29.4S, v0.S[1] // .........*................................... - mul v6.4S, v29.4S, v0.S[0] // ................*............................ - ldr q29, [x0, #128] // ....................*........................ - // gap // ............................................. - sqrdmulh v21.4S, v25.4S, v0.S[1] // .............*............................... - mul v30.4S, v25.4S, v0.S[0] // ...............*............................. - // gap // ............................................. - // gap // ............................................. - mls v4.4S, v13.4S, v8.S[0] // ............*................................ - sqrdmulh v18.4S, v7.4S, v0.S[1] // .....*....................................... 
- // gap // ............................................. - // gap // ............................................. - mul v7.4S, v7.4S, v0.S[0] // ....*........................................ - mls v6.4S, v24.4S, v8.S[0] // .....................*....................... - // gap // ............................................. - // gap // ............................................. - mls v30.4S, v21.4S, v8.S[0] // ...................*......................... - // gap // ............................................. - // gap // ............................................. - // gap // ............................................. - sub v12.4S, v16.4S, v4.4S // .....................................*....... - add v4.4S, v16.4S, v4.4S // .................*........................... - // gap // ............................................. - // gap // ............................................. - mls v7.4S, v18.4S, v8.S[0] // ..........*.................................. - add v25.4S, v9.4S, v6.4S // ..........................*.................. - // gap // ............................................. - // gap // ............................................. - mul v15.4S, v4.4S, v0.S[2] // ......................*...................... - sqrdmulh v21.4S, v4.4S, v0.S[3] // .......................*..................... - // gap // ............................................. - // gap // ............................................. - mul v4.4S, v25.4S, v0.S[2] // ..............................*.............. - sub v5.4S, v29.4S, v30.4S // ...........................................*. - // gap // ............................................. - // gap // ............................................. - sqrdmulh v16.4S, v25.4S, v0.S[3] // .................................*........... - sub v19.4S, v10.4S, v7.4S // ..................*.......................... - // gap // ............................................. 
- // gap // ............................................. - add v17.4S, v10.4S, v7.4S // .........................................*... - mls v15.4S, v21.4S, v8.S[0] // ...........................*................. - // gap // ............................................. - // gap // ............................................. - add v10.4S, v29.4S, v30.4S // ........................*.................... - sub v29.4S, v9.4S, v6.4S // .........................*................... - // gap // ............................................. - // gap // ............................................. - mls v4.4S, v16.4S, v8.S[0] // .......................................*..... - // gap // ............................................. - // gap // ............................................. - // gap // ............................................. - sub v16.4S, v10.4S, v15.4S // ................................*............ - mul v9.4S, v29.4S, v1.S[0] // ............................*................ - // gap // ............................................. - // gap // ............................................. - sqrdmulh v14.4S, v29.4S, v1.S[1] // .............................*............... - add v29.4S, v10.4S, v15.4S // ...............................*............. - // gap // ............................................. - // gap // ............................................. - mul v21.4S, v16.4S, v2.S[0] // ....................................*........ - sqrdmulh v16.4S, v16.4S, v2.S[1] // ...................................*......... - // gap // ............................................. - // gap // ............................................. - mul v30.4S, v29.4S, v1.S[2] // ......................................*...... - // gap // ............................................. - // gap // ............................................. - // gap // ............................................. 
- mls v9.4S, v14.4S, v8.S[0] // ..................................*.......... - sqrdmulh v20.4S, v29.4S, v1.S[3] // ........................................*.... - // gap // ............................................. - // gap // ............................................. - mls v21.4S, v16.4S, v8.S[0] // ..........................................*.. - mul v16.4S, v12.4S, v1.S[0] // ............................................* - // gap // ............................................. - // gap // ............................................. - - // original source code - // ldr q20, [x0, #512] // .....*....................................... - // ldr q24, [x0, #384] // *............................................ - // ldr q12, [x0, #896] // .*........................................... - // ldr q4, [x0, #768] // ...*......................................... - // mul v10.4S, v20.4S, v0.S[0] // ................*............................ - // sqrdmulh v9.4S, v20.4S, v0.S[1] // ...............*............................. - // ldr q17, [x0, #640] // ....*........................................ - // mul v29.4S, v12.4S, v0.S[0] // ......*...................................... - // sqrdmulh v26.4S, v12.4S, v0.S[1] // .......*..................................... - // sqrdmulh v6.4S, v4.4S, v0.S[1] // .........*................................... - // mls v10.4S, v9.4S, v8.S[0] // .....................*....................... - // ldr q27, [x0, #0] // ..*.......................................... - // mls v29.4S, v26.4S, v8.S[0] // ..............*.............................. - // sqrdmulh v9.4S, v17.4S, v0.S[1] // ............*................................ - // ldr q23, [x0, #256] // ........*.................................... - // mul v16.4S, v17.4S, v0.S[0] // .............*............................... - // mul v4.4S, v4.4S, v0.S[0] // ..........*.................................. 
- // add v26.4S, v24.4S, v29.4S // ....................*........................ - // sub v19.4S, v27.4S, v10.4S // ............................*................ - // mls v16.4S, v9.4S, v8.S[0] // ..................*.......................... - // ldr q28, [x0, #128] // ...........*................................. - // mls v4.4S, v6.4S, v8.S[0] // .................*........................... - // mul v15.4S, v26.4S, v0.S[2] // .......................*..................... - // sqrdmulh v12.4S, v26.4S, v0.S[3] // ........................*.................... - // add v14.4S, v28.4S, v16.4S // ...............................*............. - // sub v30.4S, v23.4S, v4.4S // ................................*............ - // add v23.4S, v23.4S, v4.4S // ......................*...................... - // mls v15.4S, v12.4S, v8.S[0] // ..............................*.............. - // mul v9.4S, v30.4S, v1.S[0] // ...................................*......... - // sqrdmulh v12.4S, v30.4S, v1.S[1] // ....................................*........ - // mul v4.4S, v23.4S, v0.S[2] // .........................*................... - // add v18.4S, v14.4S, v15.4S // .....................................*....... - // sub v30.4S, v14.4S, v15.4S // ..................................*.......... - // sqrdmulh v20.4S, v23.4S, v0.S[3] // ...........................*................. - // mls v9.4S, v12.4S, v8.S[0] // .........................................*... - // sqrdmulh v11.4S, v30.4S, v2.S[1] // .......................................*..... - // mul v21.4S, v30.4S, v2.S[0] // ......................................*...... - // sub v12.4S, v24.4S, v29.4S // ...................*......................... - // mul v30.4S, v18.4S, v1.S[2] // ........................................*.... - // mls v4.4S, v20.4S, v8.S[0] // .................................*........... - // sqrdmulh v20.4S, v18.4S, v1.S[3] // ..........................................*.. 
- // add v17.4S, v27.4S, v10.4S // .............................*............... - // mls v21.4S, v11.4S, v8.S[0] // ...........................................*. - // sub v5.4S, v28.4S, v16.4S // ..........................*.................. - // mul v16.4S, v12.4S, v1.S[0] // ............................................* + // Instructions: 43 + // Expected cycles: 24 + // Expected IPC: 1.79 + // + // Wall time: 0.84s + // User time: 0.84s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + ldr q17, [x0, #896] // *.......................................... + ldr q9, [x0, #768] // .*......................................... + // gap // ........................................... + // gap // ........................................... + ldr q18, [x0, #256] // .....*..................................... + ldr q10, [x0, #512] // .......*................................... + // gap // ........................................... + // gap // ........................................... + ldr q25, [x0, #128] // ....*...................................... + ldr q26, [x0, #640] // ......*.................................... + // gap // ........................................... + // gap // ........................................... + ldr q31, [x0, #0] // ...*....................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v21.4S, v17.4S, v0.S[0] // .........*................................. + sqrdmulh v5.4S, v17.4S, v0.S[1] // ..........*................................ + // gap // ........................................... + // gap // ........................................... + sqrdmulh v13.4S, v10.4S, v0.S[1] // ...........*............................... + mul v20.4S, v10.4S, v0.S[0] // ...................*....................... 
+ // gap // ........................................... + // gap // ........................................... + sqrdmulh v27.4S, v26.4S, v0.S[1] // ...............*........................... + mul v17.4S, v26.4S, v0.S[0] // .............*............................. + // gap // ........................................... + ldr q14, [x0, #384] // ..*........................................ + mls v21.4S, v5.4S, v8.S[0] // ............*.............................. + sqrdmulh v22.4S, v9.4S, v0.S[1] // ........*.................................. + // gap // ........................................... + // gap // ........................................... + mls v20.4S, v13.4S, v8.S[0] // ...........................*............... + mul v9.4S, v9.4S, v0.S[0] // ................*.......................... + // gap // ........................................... + // gap // ........................................... + mls v17.4S, v27.4S, v8.S[0] // ....................*...................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v27.4S, v14.4S, v21.4S // ..............*............................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v28.4S, v14.4S, v21.4S // ......................*.................... + mls v9.4S, v22.4S, v8.S[0] // .....................*..................... + // gap // ........................................... + // gap // ........................................... + mul v24.4S, v27.4S, v0.S[2] // .................*......................... + sqrdmulh v13.4S, v27.4S, v0.S[3] // ..................*........................ + // gap // ........................................... + // gap // ........................................... 
+ add v5.4S, v25.4S, v17.4S // .........................*................. + // gap // ........................................... + // gap // ........................................... + sqrdmulh v26.4S, v28.4S, v1.S[1] // ..................................*........ + add v10.4S, v18.4S, v9.4S // ..........................*................ + sub v19.4S, v18.4S, v9.4S // .......................................*... + // gap // ........................................... + // gap // ........................................... + mls v24.4S, v13.4S, v8.S[0] // .......................*................... + add v11.4S, v31.4S, v20.4S // ....................................*...... + // gap // ........................................... + // gap // ........................................... + mul v27.4S, v10.4S, v0.S[2] // ..............................*............ + sqrdmulh v12.4S, v10.4S, v0.S[3] // .............................*............. + // gap // ........................................... + // gap // ........................................... + mul v10.4S, v28.4S, v1.S[0] // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v13.4S, v5.4S, v24.4S // ............................*.............. + sub v29.4S, v5.4S, v24.4S // ..........................................* + // gap // ........................................... + // gap // ........................................... + mls v27.4S, v12.4S, v8.S[0] // ...................................*....... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v9.4S, v13.4S, v1.S[3] // ................................*.......... + mul v23.4S, v13.4S, v1.S[2] // .................................*......... 
+ // gap // ........................................... + // gap // ........................................... + sub v15.4S, v25.4S, v17.4S // ........................*.................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v24.4S, v11.4S, v27.4S // .........................................*. + add v4.4S, v11.4S, v27.4S // ........................................*.. + // gap // ........................................... + // gap // ........................................... + mls v23.4S, v9.4S, v8.S[0] // .....................................*..... + sub v9.4S, v31.4S, v20.4S // ...............................*........... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // ldr q31, [x0, #896] // *.......................................... + // ldr q27, [x0, #768] // .*......................................... + // ldr q30, [x0, #384] // .............*............................. + // ldr q14, [x0, #0] // ......*.................................... + // ldr q26, [x0, #128] // ....*...................................... + // ldr q17, [x0, #256] // ..*........................................ + // ldr q29, [x0, #640] // .....*..................................... + // ldr q16, [x0, #512] // ...*....................................... + // sqrdmulh v20.4S, v27.4S, v0.S[1] // ...............*........................... + // mul v13.4S, v31.4S, v0.S[0] // .......*................................... + // sqrdmulh v5.4S, v31.4S, v0.S[1] // ........*.................................. + // sqrdmulh v6.4S, v16.4S, v0.S[1] // .........*................................. + // mls v13.4S, v5.4S, v8.S[0] // ..............*............................ 
+ // mul v9.4S, v29.4S, v0.S[0] // ............*.............................. + // add v19.4S, v30.4S, v13.4S // ...................*....................... + // sqrdmulh v12.4S, v29.4S, v0.S[1] // ...........*............................... + // mul v27.4S, v27.4S, v0.S[0] // .................*......................... + // mul v22.4S, v19.4S, v0.S[2] // ......................*.................... + // sqrdmulh v29.4S, v19.4S, v0.S[3] // .......................*................... + // mul v5.4S, v16.4S, v0.S[0] // ..........*................................ + // mls v9.4S, v12.4S, v8.S[0] // ..................*........................ + // mls v27.4S, v20.4S, v8.S[0] // .....................*..................... + // sub v30.4S, v30.4S, v13.4S // ....................*...................... + // mls v22.4S, v29.4S, v8.S[0] // ............................*.............. + // sub v15.4S, v26.4S, v9.4S // ......................................*.... + // add v11.4S, v26.4S, v9.4S // ........................*.................. + // add v26.4S, v17.4S, v27.4S // ..........................*................ + // mls v5.4S, v6.4S, v8.S[0] // ................*.......................... + // add v29.4S, v11.4S, v22.4S // .................................*......... + // sqrdmulh v12.4S, v26.4S, v0.S[3] // ...............................*........... + // mul v13.4S, v26.4S, v0.S[2] // ..............................*............ + // sub v9.4S, v14.4S, v5.4S // ..........................................* + // sqrdmulh v25.4S, v29.4S, v1.S[3] // ....................................*...... + // mul v23.4S, v29.4S, v1.S[2] // .....................................*..... + // sqrdmulh v26.4S, v30.4S, v1.S[1] // .........................*................. + // mls v13.4S, v12.4S, v8.S[0] // ...................................*....... + // add v24.4S, v14.4S, v5.4S // .............................*............. 
+ // mls v23.4S, v25.4S, v8.S[0] // .........................................*. + // mul v10.4S, v30.4S, v1.S[0] // ................................*.......... + // sub v19.4S, v17.4S, v27.4S // ...........................*............... + // add v4.4S, v24.4S, v13.4S // ........................................*.. + // sub v24.4S, v24.4S, v13.4S // .......................................*... + // sub v29.4S, v11.4S, v22.4S // ..................................*........ sub count, count, #1 layer123_start: - mls v30.4S, v20.4S, v8.S[0] // ..................................................*......................... - ldr q20, [x0, #528] // ....e....................................................................... - ldr q24, [x0, #400] // ...e........................................................................ - sqrdmulh v28.4S, v12.4S, v1.S[1] // ............................................*............................... - add v18.4S, v19.4S, v9.4S // ..........................................*................................. + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Wall time: 29.37s + // User time: 29.37s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sub v28.4S, v4.4S, v23.4S // ...................................................*........................ + ldr q31, [x0, #912] // .......e.................................................................... + mls v10.4S, v26.4S, v8.S[0] // .............................................*.............................. + ldr q27, [x0, #784] // ......e..................................................................... + ldr q30, [x0, #400] // ...e........................................................................ + ldr q14, [x0, #16] // e........................................................................... 
+ sqrdmulh v5.4S, v19.4S, v1.S[1] // ......................................*..................................... + mul v21.4S, v29.4S, v2.S[0] // ......................................................*..................... + ldr q26, [x0, #144] // .e.......................................................................... + mul v22.4S, v19.4S, v1.S[0] // .......................................*.................................... + ldr q17, [x0, #272] // ..e......................................................................... + sqrdmulh v11.4S, v29.4S, v2.S[1] // .....................................................*...................... + ldr q29, [x0, #656] // .....e...................................................................... + ldr q16, [x0, #528] // ....e....................................................................... + sub v25.4S, v15.4S, v10.4S // ..............................................*............................. + add v7.4S, v15.4S, v10.4S // ...............................................*............................ + sqrdmulh v20.4S, v27.4S, v0.S[1] // ..................e......................................................... + mul v13.4S, v31.4S, v0.S[0] // ........................e................................................... // gap // ............................................................................ - add v11.4S, v17.4S, v4.4S // ................................*........................................... - ldr q12, [x0, #912] // .......e.................................................................... - sub v31.4S, v17.4S, v4.4S // ...............................*............................................ // gap // ............................................................................ + sqrdmulh v6.4S, v7.4S, v2.S[3] // ..........................................................*................. 
+ mul v15.4S, v7.4S, v2.S[2] // ...........................................................*................ // gap // ............................................................................ - sub v7.4S, v19.4S, v9.4S // .........................................*.................................. - mls v16.4S, v28.4S, v8.S[0] // .............................................*.............................. - sub v22.4S, v11.4S, v30.4S // ...................................................*........................ // gap // ............................................................................ - ldr q4, [x0, #784] // ......e..................................................................... - mul v10.4S, v20.4S, v0.S[0] // ........e................................................................... - sqrdmulh v9.4S, v20.4S, v0.S[1] // .........e.................................................................. + mls v22.4S, v5.4S, v8.S[0] // ........................................*................................... + sqrdmulh v5.4S, v31.4S, v0.S[1] // .......................e.................................................... // gap // ............................................................................ // gap // ............................................................................ + sqrdmulh v7.4S, v25.4S, v3.S[1] // ...............................................................*............ // gap // ............................................................................ - ldr q17, [x0, #656] // .....e...................................................................... - mul v29.4S, v12.4S, v0.S[0] // .......................e.................................................... - sqrdmulh v26.4S, v12.4S, v0.S[1] // ........................e................................................... + mul v10.4S, v25.4S, v3.S[0] // ................................................................*........... 
// gap // ............................................................................ + mls v15.4S, v6.4S, v8.S[0] // ............................................................*............... + sqrdmulh v6.4S, v16.4S, v0.S[1] // ........e................................................................... // gap // ............................................................................ - sub v20.4S, v5.4S, v16.4S // ..............................................*............................. - add v5.4S, v5.4S, v16.4S // ...............................................*............................ + str q28, [x0, #128] // .....................................................................*...... + mls v13.4S, v5.4S, v8.S[0] // .........................e.................................................. + add v19.4S, v9.4S, v22.4S // ..........................................*................................. // gap // ............................................................................ - sqrdmulh v6.4S, v4.4S, v0.S[1] // ...................e........................................................ - mls v10.4S, v9.4S, v8.S[0] // ..........e................................................................. - ldr q27, [x0, #16] // e........................................................................... // gap // ............................................................................ - sqrdmulh v28.4S, v20.4S, v3.S[1] // ................................................................*........... - mul v15.4S, v20.4S, v3.S[0] // ...............................................................*............ + sub v18.4S, v9.4S, v22.4S // .........................................*.................................. + mls v10.4S, v7.4S, v8.S[0] // .................................................................*.......... // gap // ............................................................................ 
- mls v29.4S, v26.4S, v8.S[0] // .........................e.................................................. // gap // ............................................................................ - sqrdmulh v9.4S, v17.4S, v0.S[1] // ..............e............................................................. - ldr q23, [x0, #272] // ..e......................................................................... - mul v20.4S, v5.4S, v2.S[2] // ..........................................................*................. + add v28.4S, v19.4S, v15.4S // ..............................................................*............. + sub v7.4S, v19.4S, v15.4S // .............................................................*.............. // gap // ............................................................................ - mul v16.4S, v17.4S, v0.S[0] // .............e.............................................................. // gap // ............................................................................ + mul v9.4S, v29.4S, v0.S[0] // ..............e............................................................. // gap // ............................................................................ + add v19.4S, v30.4S, v13.4S // ...........................e................................................ // gap // ............................................................................ - mls v15.4S, v28.4S, v8.S[0] // .................................................................*.......... - mul v4.4S, v4.4S, v0.S[0] // ..................e......................................................... + str q7, [x0, #640] // .........................................................................*.. + add v5.4S, v18.4S, v10.4S // ...................................................................*........ + sqrdmulh v12.4S, v29.4S, v0.S[1] // .............e.............................................................. 
// gap // ............................................................................ - add v26.4S, v24.4S, v29.4S // ...........................e................................................ + mul v27.4S, v27.4S, v0.S[0] // ...................e........................................................ + mul v22.4S, v19.4S, v0.S[2] // ..................................e......................................... // gap // ............................................................................ - sub v19.4S, v27.4S, v10.4S // ...........e................................................................ // gap // ............................................................................ - mls v16.4S, v9.4S, v8.S[0] // ...............e............................................................ - sqrdmulh v13.4S, v5.4S, v2.S[3] // ...........................................................*................ - ldr q28, [x0, #144] // .e.......................................................................... - sub v17.4S, v7.4S, v15.4S // ..................................................................*......... - add v9.4S, v7.4S, v15.4S // ...................................................................*........ + sub v7.4S, v18.4S, v10.4S // ..................................................................*......... + str q5, [x0, #768] // ..........................................................................*. // gap // ............................................................................ + sqrdmulh v29.4S, v19.4S, v0.S[3] // .................................e.......................................... + mul v5.4S, v16.4S, v0.S[0] // .........e.................................................................. + mls v9.4S, v12.4S, v8.S[0] // ...............e............................................................ // gap // ............................................................................ 
// gap // ............................................................................ + str q7, [x0, #896] // ...........................................................................* + mls v27.4S, v20.4S, v8.S[0] // ....................e....................................................... // gap // ............................................................................ - mls v4.4S, v6.4S, v8.S[0] // ....................e....................................................... - mul v15.4S, v26.4S, v0.S[2] // .................................e.......................................... - mls v20.4S, v13.4S, v8.S[0] // ............................................................*............... - str q9, [x0, #768] // ..........................................................................*. + sub v30.4S, v30.4S, v13.4S // ..........................e................................................. + mls v22.4S, v29.4S, v8.S[0] // ...................................e........................................ + mls v21.4S, v11.4S, v8.S[0] // .......................................................*.................... // gap // ............................................................................ - sqrdmulh v12.4S, v26.4S, v0.S[3] // ..................................e......................................... - add v7.4S, v11.4S, v30.4S // ....................................................*....................... // gap // ............................................................................ + sub v15.4S, v26.4S, v9.4S // ................e........................................................... // gap // ............................................................................ - add v14.4S, v28.4S, v16.4S // .................e.......................................................... - sub v30.4S, v23.4S, v4.4S // .....................e...................................................... 
// gap // ............................................................................ + add v11.4S, v26.4S, v9.4S // .................e.......................................................... + add v26.4S, v17.4S, v27.4S // ......................e..................................................... // gap // ............................................................................ - add v23.4S, v23.4S, v4.4S // ......................e..................................................... - add v13.4S, v18.4S, v20.4S // ..............................................................*............. - mls v15.4S, v12.4S, v8.S[0] // ...................................e........................................ // gap // ............................................................................ + mls v5.4S, v6.4S, v8.S[0] // ..........e................................................................. // gap // ............................................................................ - mul v9.4S, v30.4S, v1.S[0] // ......................................e..................................... - sqrdmulh v12.4S, v30.4S, v1.S[1] // .......................................e.................................... // gap // ............................................................................ + sub v31.4S, v24.4S, v21.4S // ........................................................*................... + add v29.4S, v11.4S, v22.4S // .....................................e...................................... + sqrdmulh v12.4S, v26.4S, v0.S[3] // ............................e............................................... // gap // ............................................................................ // gap // ............................................................................ - sub v11.4S, v18.4S, v20.4S // .............................................................*.............. 
+ mul v13.4S, v26.4S, v0.S[2] // .............................e.............................................. + sub v9.4S, v14.4S, v5.4S // ...........e................................................................ + sqrdmulh v25.4S, v29.4S, v1.S[3] // ................................................e........................... // gap // ............................................................................ - mul v4.4S, v23.4S, v0.S[2] // ............................e............................................... // gap // ............................................................................ // gap // ............................................................................ - add v18.4S, v14.4S, v15.4S // .....................................e...................................... - sub v30.4S, v14.4S, v15.4S // ....................................e....................................... - sqrdmulh v20.4S, v23.4S, v0.S[3] // .............................e.............................................. - mls v9.4S, v12.4S, v8.S[0] // ........................................e................................... + add v6.4S, v4.4S, v23.4S // ....................................................*....................... + mul v23.4S, v29.4S, v1.S[2] // .................................................e.......................... // gap // ............................................................................ - str q11, [x0, #640] // .........................................................................*.. - sqrdmulh v11.4S, v30.4S, v2.S[1] // ......................................................e..................... + sqrdmulh v26.4S, v30.4S, v1.S[1] // ...........................................e................................ + mls v13.4S, v12.4S, v8.S[0] // ..............................e............................................. + str q28, [x0, #512] // ........................................................................*... 
// gap // ............................................................................ - sub v23.4S, v31.4S, v21.4S // ........................................................*................... - str q13, [x0, #512] // ........................................................................*... + add v7.4S, v24.4S, v21.4S // .........................................................*.................. + add v24.4S, v14.4S, v5.4S // ............e............................................................... // gap // ............................................................................ - str q17, [x0, #896] // ...........................................................................* - add v15.4S, v31.4S, v21.4S // .........................................................*.................. - mul v21.4S, v30.4S, v2.S[0] // .....................................................e...................... - sub v12.4S, v24.4S, v29.4S // ..........................e................................................. - mul v30.4S, v18.4S, v1.S[2] // ................................................e........................... // gap // ............................................................................ - str q7, [x0], #(16) // ....................................................................*....... - mls v4.4S, v20.4S, v8.S[0] // ..............................e............................................. + mls v23.4S, v25.4S, v8.S[0] // ..................................................e......................... + str q31, [x0, #384] // .......................................................................*.... // gap // ............................................................................ - sqrdmulh v20.4S, v18.4S, v1.S[3] // .................................................e.......................... - str q22, [x0, #112] // .....................................................................*...... 
- add v17.4S, v27.4S, v10.4S // ............e............................................................... + mul v10.4S, v30.4S, v1.S[0] // ............................................e............................... + sub v19.4S, v17.4S, v27.4S // .....................e...................................................... + str q7, [x0, #256] // ......................................................................*..... + add v4.4S, v24.4S, v13.4S // ................................e........................................... // gap // ............................................................................ - mls v21.4S, v11.4S, v8.S[0] // .......................................................e.................... - str q15, [x0, #240] // ......................................................................*..... + str q6, [x0], #(16) // ....................................................................*....... + sub v24.4S, v24.4S, v13.4S // ...............................e............................................ // gap // ............................................................................ - sub v5.4S, v28.4S, v16.4S // ................e........................................................... - str q23, [x0, #368] // .......................................................................*.... - mul v16.4S, v12.4S, v1.S[0] // ...........................................e................................ - - // original source code - // ldr q9, [x0, #0] // ....................e......................................................|....................e..................................................... - // ldr q10, [x0, #(1*(1024/8))] // ..................................e........................................|..................................e....................................... 
- // ldr q11, [x0, #(2*(1024/8))] // .........................e.................................................|.........................e................................................ - // ldr q12, [x0, #(3*(1024/8))] // .e.........................................................................|.e........................................................................ - // ldr q13, [x0, #(4*(1024/8))] // e..........................................................................|e......................................................................... - // ldr q14, [x0, #(5*(1024/8))] // .............e.............................................................|.............e............................................................ - // ldr q15, [x0, #(6*(1024/8))] // ..........e................................................................|..........e............................................................... - // ldr q16, [x0, #(7*(1024/8))] // .....e.....................................................................|.....e.................................................................... - // mul v24.4s, v13.4s, v0.s[0] // ...........e...............................................................|...........e.............................................................. - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ............e..............................................................|............e............................................................. - // mls v24.4s, v13.4s, v8.s[0] // ...................e.......................................................|...................e...................................................... - // sub v13.4s, v9.4s, v24.4s // ...............................e...........................................|...............................e.......................................... 
- // add v9.4s, v9.4s, v24.4s // .....................................................................e.....|.....................................................................e.... - // mul v24.4s, v14.4s, v0.s[0] // ...........................e...............................................|...........................e.............................................. - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ........................e..................................................|........................e................................................. - // mls v24.4s, v14.4s, v8.s[0] // ................................e..........................................|................................e......................................... - // sub v14.4s, v10.4s, v24.4s // ........................................................................e..|........................................................................e. - // add v10.4s, v10.4s, v24.4s // ...........................................e...............................|...........................................e.............................. - // mul v24.4s, v15.4s, v0.s[0] // .............................e.............................................|.............................e............................................ - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ..................e........................................................|..................e....................................................... - // mls v24.4s, v15.4s, v8.s[0] // .....................................e.....................................|.....................................e.................................... - // sub v15.4s, v11.4s, v24.4s // ............................................e..............................|............................................e............................. 
- // add v11.4s, v11.4s, v24.4s // .............................................e.............................|.............................................e............................ - // mul v24.4s, v16.4s, v0.s[0] // ..............e............................................................|..............e........................................................... - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ...............e...........................................................|...............e.......................................................... - // mls v24.4s, v16.4s, v8.s[0] // .......................e...................................................|.......................e.................................................. - // sub v16.4s, v12.4s, v24.4s // ...............................................................e...........|...............................................................e.......... - // add v12.4s, v12.4s, v24.4s // ..............................e............................................|..............................e........................................... - // mul v24.4s, v11.4s, v0.s[2] // ...................................................e.......................|...................................................e...................... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ......................................................e....................|......................................................e................... - // mls v24.4s, v11.4s, v8.s[0] // ..................................................................e........|..................................................................e....... - // sub v11.4s, v9.4s, v24.4s // ......*....................................................................|......*................................................................... 
- // add v9.4s, v9.4s, v24.4s // ....*......................................................................|....*..................................................................... - // mul v24.4s, v12.4s, v0.s[2] // ......................................e....................................|......................................e................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // .........................................e.................................|.........................................e................................ - // mls v24.4s, v12.4s, v8.s[0] // ...............................................e...........................|...............................................e.......................... - // sub v12.4s, v10.4s, v24.4s // .....................................................e.....................|.....................................................e.................... - // add v10.4s, v10.4s, v24.4s // ....................................................e......................|....................................................e..................... - // mul v24.4s, v15.4s, v1.s[0] // ................................................e..........................|................................................e......................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // .................................................e.........................|.................................................e........................ - // mls v24.4s, v15.4s, v8.s[0] // .......................................................e...................|.......................................................e.................. - // sub v15.4s, v13.4s, v24.4s // .......*...................................................................|.......*.................................................................. 
- // add v13.4s, v13.4s, v24.4s // ...*.......................................................................|...*...................................................................... - // mul v24.4s, v16.4s, v1.s[0] // ..........................................................................e|.......................................................................... - // sqrdmulh v16.4s, v16.4s, v1.s[1] // ..*........................................................................|..*....................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ........*..................................................................|........*................................................................. - // sub v16.4s, v14.4s, v24.4s // ................*..........................................................|................*......................................................... - // add v14.4s, v14.4s, v24.4s // .................*.........................................................|.................*........................................................ - // mul v24.4s, v10.4s, v1.s[2] // ................................................................e..........|................................................................e......... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ...................................................................e.......|...................................................................e...... - // mls v24.4s, v10.4s, v8.s[0] // ...........................................................................*.......................................................................... - // sub v10.4s, v9.4s, v24.4s // .........*.................................................................|.........*................................................................ 
- // add v9.4s, v9.4s, v24.4s // ..........................................*................................|..........................................*............................... - // mul v24.4s, v12.4s, v2.s[0] // ..............................................................e............|..............................................................e........... - // sqrdmulh v12.4s, v12.4s, v2.s[1] // .........................................................e.................|.........................................................e................ - // mls v24.4s, v12.4s, v8.s[0] // ......................................................................e....|......................................................................e... - // sub v12.4s, v11.4s, v24.4s // ..........................................................*................|..........................................................*............... - // add v11.4s, v11.4s, v24.4s // .............................................................*.............|.............................................................*............ - // mul v24.4s, v14.4s, v2.s[2] // ..........................*................................................|..........................*............................................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // .................................*.........................................|.................................*........................................ - // mls v24.4s, v14.4s, v8.s[0] // .......................................*...................................|.......................................*.................................. - // sub v14.4s, v13.4s, v24.4s // ..................................................*........................|..................................................*....................... 
- // add v13.4s, v13.4s, v24.4s // ..............................................*............................|..............................................*........................... - // mul v24.4s, v16.4s, v3.s[0] // ......................*....................................................|......................*................................................... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // .....................*.....................................................|.....................*.................................................... - // mls v24.4s, v16.4s, v8.s[0] // ............................*..............................................|............................*............................................. - // sub v16.4s, v15.4s, v24.4s // ...................................*.......................................|...................................*...................................... - // add v15.4s, v15.4s, v24.4s // ....................................*......................................|....................................*..................................... - // str q9, [x0], #(16) // .................................................................*.........|.................................................................*........ - // str q10, [x0, #(-16 + 1*(1024/8))] // ....................................................................*......|....................................................................*..... - // str q11, [x0, #(-16 + 2*(1024/8))] // .......................................................................*...|.......................................................................*.. 
- // str q12, [x0, #(-16 + 3*(1024/8))] // .........................................................................*.|.........................................................................* - // str q13, [x0, #(-16 + 4*(1024/8))] // ...........................................................*...............|...........................................................*.............. - // str q14, [x0, #(-16 + 5*(1024/8))] // ........................................................*..................|........................................................*................. - // str q15, [x0, #(-16 + 6*(1024/8))] // ........................................*..................................|........................................*................................. - // str q16, [x0, #(-16 + 7*(1024/8))] // ............................................................*..............|............................................................*............. + sub v29.4S, v11.4S, v22.4S // ....................................e....................................... + + // ------------------------------------------------------------------- new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q9, [x0, #0] // ....e......................................................................'....~.................................................................... + // ldr q10, [x0, #(1*(1024/8))] // .......e...................................................................'.......~................................................................. + // ldr q11, [x0, #(2*(1024/8))] // .........e.................................................................'.........~............................................................... 
+ // ldr q12, [x0, #(3*(1024/8))] // ...e.......................................................................'...~..................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ............e..............................................................'............~............................................................ + // ldr q14, [x0, #(5*(1024/8))] // ...........e...............................................................'...........~............................................................. + // ldr q15, [x0, #(6*(1024/8))] // ..e........................................................................'..~...................................................................... + // ldr q16, [x0, #(7*(1024/8))] // e..........................................................................'~........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ........................e..................................................'........................~................................................ + // mul v24.4s, v13.4s, v0.s[0] // ..........................................e................................'..........................................~.............................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................................e......................'....................................................~.................... + // sub v13.4s, v9.4s, v24.4s // .........................................................e.................'.........................................................~............... + // add v9.4s, v9.4s, v24.4s // .................................................................e.........'.................................................................~....... 
+ // sqrdmulh v27.4s, v14.4s, v0.s[1] // ....................................e......................................'....................................~.................................... + // mul v24.4s, v14.4s, v0.s[0] // ................................e..........................................'................................~........................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................................e...............................'...........................................~............................. + // sub v14.4s, v10.4s, v24.4s // .................................................e.........................'.................................................~....................... + // add v10.4s, v10.4s, v24.4s // ..................................................e........................'..................................................~...................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...............e...........................................................'...............~......................................................... + // mul v24.4s, v15.4s, v0.s[0] // .....................................e.....................................'.....................................~................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................e.............................'.............................................~........................... + // sub v15.4s, v11.4s, v24.4s // .....................................................................e.....'.....................................................................~... + // add v11.4s, v11.4s, v24.4s // ...................................................e.......................'...................................................~..................... 
+ // sqrdmulh v27.4s, v16.4s, v0.s[1] // ....................e......................................................'....................~.................................................... + // mul v24.4s, v16.4s, v0.s[0] // ................e..........................................................'................~........................................................ + // mls v24.4s, v27.4s, v8.s[0] // ..........................e................................................'..........................~.............................................. + // sub v16.4s, v12.4s, v24.4s // ..............................................e............................'..............................................~.......................... + // add v12.4s, v12.4s, v24.4s // .................................e.........................................'.................................~....................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .......................................................e...................'.......................................................~................. + // mul v24.4s, v11.4s, v0.s[2] // ........................................................e..................'........................................................~................ + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................e............'..............................................................~.......... + // sub v11.4s, v9.4s, v24.4s // .........................................................................e.'......................................................................... + // add v9.4s, v9.4s, v24.4s // .......................................................................e...'.......................................................................~. 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // .........................................e.................................'.........................................~............................... + // mul v24.4s, v12.4s, v0.s[2] // ......................................e....................................'......................................~.................................. + // mls v24.4s, v27.4s, v8.s[0] // ...............................................e...........................'...............................................~......................... + // sub v12.4s, v10.4s, v24.4s // ..........................................................................e'......................................................................... + // add v10.4s, v10.4s, v24.4s // ......................................................e....................'......................................................~.................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .....~.....................................................................'.....*................................................................... + // mul v24.4s, v15.4s, v1.s[0] // ........~..................................................................'........*................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...................~.......................................................'...................*..................................................... + // sub v15.4s, v13.4s, v24.4s // ............................~..............................................'............................*............................................ + // add v13.4s, v13.4s, v24.4s // ...........................~...............................................'...........................*............................................. 
+ // sqrdmulh v27.4s, v16.4s, v1.s[1] // .............................................................e.............'.............................................................~........... + // mul v24.4s, v16.4s, v1.s[0] // ....................................................................e......'....................................................................~.... + // mls v24.4s, v27.4s, v8.s[0] // .~.........................................................................'.*....................................................................... + // sub v16.4s, v14.4s, v24.4s // .............~.............................................................'.............*........................................................... + // add v14.4s, v14.4s, v24.4s // ..............~............................................................'..............*.......................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ..........................................................e................'..........................................................~.............. + // mul v24.4s, v10.4s, v1.s[2] // ............................................................e..............'............................................................~............ + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................e........'..................................................................~...... + // sub v10.4s, v9.4s, v24.4s // ...........................................................................*......................................................................... + // add v9.4s, v9.4s, v24.4s // ...........................................................~...............'...........................................................*............. 
+ // sqrdmulh v27.4s, v12.4s, v2.s[1] // ..........~................................................................'..........*.............................................................. + // mul v24.4s, v12.4s, v2.s[0] // ......~....................................................................'......*.................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................................................~..........................'................................................*........................ + // sub v12.4s, v11.4s, v24.4s // .....................................................~.....................'.....................................................*................... + // add v11.4s, v11.4s, v24.4s // ................................................................~..........'................................................................*........ + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .................~.........................................................'.................*....................................................... + // mul v24.4s, v14.4s, v2.s[2] // ..................~........................................................'..................*...................................................... + // mls v24.4s, v27.4s, v8.s[0] // .......................~...................................................'.......................*................................................. + // sub v14.4s, v13.4s, v24.4s // ...............................~...........................................'...............................*......................................... + // add v13.4s, v13.4s, v24.4s // ..............................~............................................'..............................*.......................................... 
+ // sqrdmulh v27.4s, v16.4s, v3.s[1] // .....................~.....................................................'.....................*................................................... + // mul v24.4s, v16.4s, v3.s[0] // ......................~....................................................'......................*.................................................. + // mls v24.4s, v27.4s, v8.s[0] // .............................~.............................................'.............................*........................................... + // sub v16.4s, v15.4s, v24.4s // .......................................~...................................'.......................................*................................. + // add v15.4s, v15.4s, v24.4s // ...................................~.......................................'...................................*..................................... + // str q9, [x0], #(16) // ........................................................................~..'........................................................................* + // str q10, [x0, #(-16 + 1*(1024/8))] // .........................~.................................................'.........................*............................................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ......................................................................~....'......................................................................*.. + // str q12, [x0, #(-16 + 3*(1024/8))] // ...................................................................~.......'...................................................................*..... + // str q13, [x0, #(-16 + 4*(1024/8))] // ...............................................................~...........'...............................................................*......... 
+ // str q14, [x0, #(-16 + 5*(1024/8))] // ..................................~........................................'..................................*...................................... + // str q15, [x0, #(-16 + 6*(1024/8))] // ........................................~..................................'........................................*................................ + // str q16, [x0, #(-16 + 7*(1024/8))] // ............................................~..............................'............................................*............................ sub count, count, #1 cbnz count, layer123_start - sub v6.4S, v17.4S, v4.4S // ....*.......................... - sqrdmulh v28.4S, v12.4S, v1.S[1] // .*............................. - // gap // ............................... - // gap // ............................... - // gap // ............................... - mls v30.4S, v20.4S, v8.S[0] // *.............................. - // gap // ............................... - // gap // ............................... - add v11.4S, v6.4S, v21.4S // ..........................*.... - sub v10.4S, v6.4S, v21.4S // .......................*....... - // gap // ............................... - // gap // ............................... - mls v16.4S, v28.4S, v8.S[0] // ......*........................ - add v6.4S, v17.4S, v4.4S // ...*........................... - // gap // ............................... - // gap // ............................... - add v28.4S, v19.4S, v9.4S // ..*............................ - str q11, [x0, #256] // .............................*. - // gap // ............................... - // gap // ............................... - str q10, [x0, #384] // ..............................* - add v22.4S, v6.4S, v30.4S // ...................*........... - // gap // ............................... - // gap // ............................... - sub v14.4S, v5.4S, v16.4S // ........*...................... 
- add v16.4S, v5.4S, v16.4S // .........*..................... - // gap // ............................... - // gap // ............................... - sub v5.4S, v19.4S, v9.4S // .....*......................... - sub v30.4S, v6.4S, v30.4S // .......*....................... - str q22, [x0], #(16) // ...........................*... - // gap // ............................... - sqrdmulh v19.4S, v14.4S, v3.S[1] // ..........*.................... - mul v6.4S, v14.4S, v3.S[0] // ...........*................... - // gap // ............................... - // gap // ............................... - str q30, [x0, #112] // ............................*.. - mul v10.4S, v16.4S, v2.S[2] // ............*.................. - // gap // ............................... - // gap // ............................... - sqrdmulh v30.4S, v16.4S, v2.S[3] // ..............*................ - // gap // ............................... - // gap // ............................... - // gap // ............................... - mls v6.4S, v19.4S, v8.S[0] // .............*................. - // gap // ............................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - mls v10.4S, v30.4S, v8.S[0] // .................*............. - // gap // ............................... - // gap // ............................... - // gap // ............................... - sub v27.4S, v5.4S, v6.4S // ...............*............... - // gap // ............................... - // gap // ............................... - // gap // ............................... - add v5.4S, v5.4S, v6.4S // ................*.............. - // gap // ............................... - // gap // ............................... - // gap // ............................... 
- sub v16.4S, v28.4S, v10.4S // .....................*......... - add v9.4S, v28.4S, v10.4S // ....................*.......... - str q27, [x0, #880] // .........................*..... - // gap // ............................... - str q5, [x0, #752] // ..................*............ - // gap // ............................... - // gap // ............................... - // gap // ............................... - str q16, [x0, #624] // ......................*........ - // gap // ............................... - // gap // ............................... - // gap // ............................... - str q9, [x0, #496] // ........................*...... - // gap // ............................... - // gap // ............................... - // gap // ............................... - - // original source code - // mls v30.4S, v20.4S, v8.S[0] // ..*............................ - // sqrdmulh v28.4S, v12.4S, v1.S[1] // .*............................. - // add v18.4S, v19.4S, v9.4S // .......*....................... - // add v11.4S, v17.4S, v4.4S // ......*........................ - // sub v31.4S, v17.4S, v4.4S // *.............................. - // sub v7.4S, v19.4S, v9.4S // .............*................. - // mls v16.4S, v28.4S, v8.S[0] // .....*......................... - // sub v22.4S, v11.4S, v30.4S // ..............*................ - // sub v20.4S, v5.4S, v16.4S // ...........*................... - // add v5.4S, v5.4S, v16.4S // ............*.................. - // sqrdmulh v28.4S, v20.4S, v3.S[1] // ................*.............. - // mul v15.4S, v20.4S, v3.S[0] // .................*............. - // mul v20.4S, v5.4S, v2.S[2] // ...................*........... - // mls v15.4S, v28.4S, v8.S[0] // .....................*......... - // sqrdmulh v13.4S, v5.4S, v2.S[3] // ....................*.......... - // sub v17.4S, v7.4S, v15.4S // .......................*....... - // add v9.4S, v7.4S, v15.4S // ........................*...... 
- // mls v20.4S, v13.4S, v8.S[0] // ......................*........ - // str q9, [x0, #768] // ............................*.. - // add v7.4S, v11.4S, v30.4S // ..........*.................... - // add v13.4S, v18.4S, v20.4S // ..........................*.... - // sub v11.4S, v18.4S, v20.4S // .........................*..... - // str q11, [x0, #640] // .............................*. - // sub v23.4S, v31.4S, v21.4S // ....*.......................... - // str q13, [x0, #512] // ..............................* - // str q17, [x0, #896] // ...........................*... - // add v15.4S, v31.4S, v21.4S // ...*........................... - // str q7, [x0], #(16) // ...............*............... - // str q22, [x0, #112] // ..................*............ - // str q15, [x0, #240] // ........*...................... - // str q23, [x0, #368] // .........*..................... + // Instructions: 33 + // Expected cycles: 17 + // Expected IPC: 1.94 + // + // Wall time: 0.34s + // User time: 0.34s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + mls v10.4S, v26.4S, v8.S[0] // .*............................... + // gap // ................................. + sqrdmulh v20.4S, v29.4S, v2.S[1] // .....*........................... + // gap // ................................. + mul v28.4S, v29.4S, v2.S[0] // ...*............................. + sub v13.4S, v4.4S, v23.4S // *................................ + // gap // ................................. + // gap // ................................. + add v21.4S, v4.4S, v23.4S // ...........................*..... + mul v7.4S, v19.4S, v1.S[0] // ....*............................ + // gap // ................................. + // gap // ................................. + str q13, [x0, #128] // ..............*.................. + sqrdmulh v22.4S, v19.4S, v1.S[1] // ..*.............................. + sub v19.4S, v15.4S, v10.4S // ......*.......................... 
+ // gap // ................................. + add v25.4S, v15.4S, v10.4S // .......*......................... + mls v28.4S, v20.4S, v8.S[0] // .........................*....... + str q21, [x0], #(16) // ................................* + // gap // ................................. + sqrdmulh v16.4S, v19.4S, v3.S[1] // ...........*..................... + mul v12.4S, v19.4S, v3.S[0] // ............*.................... + // gap // ................................. + // gap // ................................. + mls v7.4S, v22.4S, v8.S[0] // ..........*...................... + sqrdmulh v17.4S, v25.4S, v2.S[3] // ........*........................ + // gap // ................................. + // gap // ................................. + mul v5.4S, v25.4S, v2.S[2] // .........*....................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v12.4S, v16.4S, v8.S[0] // .................*............... + sub v31.4S, v24.4S, v28.4S // ..........................*...... + // gap // ................................. + // gap // ................................. + add v22.4S, v24.4S, v28.4S // .............................*... + sub v30.4S, v9.4S, v7.4S // ................*................ + // gap // ................................. + // gap // ................................. + add v20.4S, v9.4S, v7.4S // ...............*................. + mls v5.4S, v17.4S, v8.S[0] // .............*................... + str q31, [x0, #368] // ..............................*.. + // gap // ................................. + str q22, [x0, #240] // ...............................*. + add v16.4S, v30.4S, v12.4S // .....................*........... + // gap // ................................. + // gap // ................................. + sub v15.4S, v30.4S, v12.4S // ......................*.......... + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + str q16, [x0, #752] // .......................*......... + add v18.4S, v20.4S, v5.4S // ..................*.............. + // gap // ................................. + // gap // ................................. + sub v21.4S, v20.4S, v5.4S // ...................*............. + str q15, [x0, #880] // ........................*........ + // gap // ................................. + // gap // ................................. + str q18, [x0, #496] // ............................*.... + // gap // ................................. + // gap // ................................. + // gap // ................................. + str q21, [x0, #624] // ....................*............ + // gap // ................................. + // gap // ................................. + // gap // ................................. + + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // sub v28.4S, v4.4S, v23.4S // ...*............................. + // mls v10.4S, v26.4S, v8.S[0] // *................................ + // sqrdmulh v5.4S, v19.4S, v1.S[1] // .......*......................... + // mul v21.4S, v29.4S, v2.S[0] // ..*.............................. + // mul v22.4S, v19.4S, v1.S[0] // .....*........................... + // sqrdmulh v11.4S, v29.4S, v2.S[1] // .*............................... + // sub v25.4S, v15.4S, v10.4S // ........*........................ + // add v7.4S, v15.4S, v10.4S // .........*....................... + // sqrdmulh v6.4S, v7.4S, v2.S[3] // ...............*................. + // mul v15.4S, v7.4S, v2.S[2] // ................*................ + // mls v22.4S, v5.4S, v8.S[0] // ..............*.................. + // sqrdmulh v7.4S, v25.4S, v3.S[1] // ............*.................... + // mul v10.4S, v25.4S, v3.S[0] // .............*................... + // mls v15.4S, v6.4S, v8.S[0] // ......................*.......... 
+ // str q28, [x0, #128] // ......*.......................... + // add v19.4S, v9.4S, v22.4S // .....................*........... + // sub v18.4S, v9.4S, v22.4S // ....................*............ + // mls v10.4S, v7.4S, v8.S[0] // .................*............... + // add v28.4S, v19.4S, v15.4S // ............................*.... + // sub v7.4S, v19.4S, v15.4S // .............................*... + // str q7, [x0, #640] // ................................* + // add v5.4S, v18.4S, v10.4S // .........................*....... + // sub v7.4S, v18.4S, v10.4S // ..........................*...... + // str q5, [x0, #768] // ...........................*..... + // str q7, [x0, #896] // ..............................*.. + // mls v21.4S, v11.4S, v8.S[0] // ..........*...................... + // sub v31.4S, v24.4S, v21.4S // ..................*.............. + // add v6.4S, v4.4S, v23.4S // ....*............................ + // str q28, [x0, #512] // ...............................*. + // add v7.4S, v24.4S, v21.4S // ...................*............. + // str q31, [x0, #384] // .......................*......... + // str q7, [x0, #256] // ........................*........ + // str q6, [x0], #(16) // ...........*..................... restore inp, STACK0 @@ -851,954 +847,942 @@ layer123_start: qform_root3_tw .req q7 .p2align 2 - // gap // ................................................................................................................ - // gap // ................................................................................................................ - ldr q12, [x4], #64 // *............................................................................................................... - ldr q4, [x2, #48] // .*.............................................................................................................. - ldr q3, [x2, #32] // ..*............................................................................................................. 
- // gap // ................................................................................................................ - // gap // ................................................................................................................ - ldr q21, [x2, #16] // ....*........................................................................................................... - ldr q27, [x5, #160] // ...........................................................................*.................................... - ldr q14, [x2, #0] // ...*............................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - ldr q5, [x5, #128] // .......................................................................*........................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - ldr q25, [x1, #32] // ...........*.................................................................................................... - ldr q13, [x5, #176] // ...................................................................*............................................ - sqrdmulh v16.4S, v4.4S, v12.S[1] // ......*......................................................................................................... - mul v15.4S, v4.4S, v12.S[0] // .......*........................................................................................................ 
- ldr q24, [x1, #48] // .............*.................................................................................................. - ldr q11, [x1, #0] // ................*............................................................................................... - mul v10.4S, v3.4S, v12.S[0] // .........*...................................................................................................... - sqrdmulh v1.4S, v3.4S, v12.S[1] // ..........*..................................................................................................... - ldr q0, [x5, #112] // .................................................*.............................................................. - ldr q7, [x5, #96] // .....................*.......................................................................................... - sqrdmulh v3.4S, v14.4S, v12.S[1] // .................*.............................................................................................. - ldr q4, [x1, #16] // ..................*............................................................................................. - mul v19.4S, v14.4S, v12.S[0] // ......................*......................................................................................... - sqrdmulh v29.4S, v21.4S, v12.S[1] // ..............*................................................................................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mls v15.4S, v16.4S, v8.S[0] // ...............*................................................................................................ - mls v10.4S, v1.4S, v8.S[0] // ...................*............................................................................................ 
- // gap // ................................................................................................................ - mul v6.4S, v21.4S, v12.S[0] // ............*................................................................................................... - ldr q26, [x4, #-48] // ........*....................................................................................................... - mls v19.4S, v3.4S, v8.S[0] // ..............................*................................................................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sub v16.4S, v24.4S, v15.4S // .......................*........................................................................................ - sub v3.4S, v25.4S, v10.4S // ........................*....................................................................................... - mls v6.4S, v29.4S, v8.S[0] // ....................*........................................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ 
- sqrdmulh v20.4S, v16.4S, v26.S[1] // ...........................*.................................................................................... - mul v23.4S, v16.4S, v26.S[0] // ..........................*..................................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v18.4S, v3.4S, v26.S[1] // ................................*............................................................................... - mul v21.4S, v3.4S, v26.S[0] // ...............................*................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - add v3.4S, v24.4S, v15.4S // ................................................*............................................................... - mls v23.4S, v20.4S, v8.S[0] // .................................*.............................................................................. - ldr q2, [x5, #16] // ........................................................................*....................................... 
- // gap // ................................................................................................................ - sub v17.4S, v4.4S, v6.4S // ....................................*........................................................................... - mls v21.4S, v18.4S, v8.S[0] // .......................................*........................................................................ - add v30.4S, v4.4S, v6.4S // ....................................................................*........................................... - ldr q18, [x4, #-32] // .....*.......................................................................................................... - // gap // ................................................................................................................ - add v6.4S, v25.4S, v10.4S // .........................*...................................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - ldr q28, [x4, #-16] // ...................................*............................................................................ - add v29.4S, v17.4S, v23.4S // ........................................*....................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mul v16.4S, v3.4S, v12.S[2] // .....................................................................................*.......................... 
- sub v9.4S, v11.4S, v19.4S // ......................................*......................................................................... - sub v4.4S, v17.4S, v23.4S // ..........................................*..................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v15.4S, v29.4S, v18.S[3] // ...........................................*.................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mul v20.4S, v29.4S, v18.S[2] // ............................................*................................................................... - mul v29.4S, v4.4S, v28.S[0] // .............................................*.................................................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v4.4S, v4.4S, v28.S[1] // ..............................................*................................................................. - sqrdmulh v22.4S, v6.4S, v12.S[3] // .............................*.................................................................................. - // gap // ................................................................................................................ 
- // gap // ................................................................................................................ - sqrdmulh v24.4S, v3.4S, v12.S[3] // ..............................................................*................................................. - mul v23.4S, v6.4S, v12.S[2] // ............................*................................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mls v20.4S, v15.4S, v8.S[0] // ..................................................*............................................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - add v3.4S, v9.4S, v21.4S // ...............................................*................................................................ - mls v29.4S, v4.4S, v8.S[0] // ....................................................*........................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sub v9.4S, v9.4S, v21.4S // ...................................................*............................................................ - mls v16.4S, v24.4S, v8.S[0] // ..............................................................................................*................. 
- // gap // ................................................................................................................ - sub v15.4S, v3.4S, v20.4S // ......................................................*......................................................... - // gap // ................................................................................................................ - add v21.4S, v3.4S, v20.4S // .....................................................*.......................................................... - // gap // ................................................................................................................ - sub v3.4S, v9.4S, v29.4S // ........................................................*....................................................... - add v29.4S, v9.4S, v29.4S // .......................................................*........................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mls v23.4S, v22.4S, v8.S[0] // ..................................*............................................................................. - // gap // ................................................................................................................ - sub v4.4S, v30.4S, v16.4S // ..................................................................................................*............. - trn2 v24.4S, v21.4S, v15.4S // .........................................................*...................................................... - trn2 v28.4S, v29.4S, v3.4S // ...........................................................*.................................................... 
- // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn1 v1.4S, v21.4S, v15.4S // ..........................................................*..................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn1 v14.4S, v29.4S, v3.4S // ............................................................*................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v3.4S, v4.4S, v18.S[1] // ....................................................................................................*........... - trn2 v29.2D, v24.2D, v28.2D // .............................................................*.................................................. - add v17.4S, v11.4S, v19.4S // .....................................*.......................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn1 v21.2D, v1.2D, v14.2D // ...................................................................................*............................ 
- sqrdmulh v20.4S, v29.4S, v0.4S // .................................................................*.............................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mul v9.4S, v29.4S, v7.4S // ................................................................*............................................... - mul v29.4S, v4.4S, v18.S[0] // ...................................................................................................*............ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn2 v4.2D, v1.2D, v14.2D // ...............................................................*................................................ - trn1 v15.2D, v24.2D, v28.2D // ..........................................................................*..................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ 
- mls v9.4S, v20.4S, v8.S[0] // ......................................................................*......................................... - // gap // ................................................................................................................ - ldr q20, [x5, #144] // .........................................................................*...................................... - sqrdmulh v0.4S, v4.4S, v0.4S // .....................................................................*.......................................... - mul v4.4S, v4.4S, v7.4S // ..................................................................*............................................. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - add v16.4S, v30.4S, v16.4S // .................................................................................................*.............. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sub v30.4S, v15.4S, v9.4S // ..............................................................................*................................. - add v15.4S, v15.4S, v9.4S // .............................................................................*.................................. - // gap // ................................................................................................................ 
- mul v9.4S, v16.4S, v26.S[2] // .......................................................................................................*........ - // gap // ................................................................................................................ - mls v4.4S, v0.4S, v8.S[0] // ............................................................................*................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v20.4S, v15.4S, v20.4S // ................................................................................*............................... - mul v0.4S, v15.4S, v5.4S // ...............................................................................*................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v15.4S, v30.4S, v13.4S // ..................................................................................*............................. - mul v30.4S, v30.4S, v27.4S // .................................................................................*.............................. - mls v29.4S, v3.4S, v8.S[0] // ........................................................................................................*....... - // gap // ................................................................................................................ - // gap // ................................................................................................................ 
- sub v3.4S, v21.4S, v4.4S // ........................................................................................*....................... - sub v27.4S, v17.4S, v23.4S // .........................................*...................................................................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mls v0.4S, v20.4S, v8.S[0] // ....................................................................................*........................... - add v4.4S, v21.4S, v4.4S // .......................................................................................*........................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - mls v30.4S, v15.4S, v8.S[0] // ......................................................................................*......................... - sub v15.4S, v27.4S, v29.4S // ..............................................................................................................*. - add v13.4S, v17.4S, v23.4S // ............................................................................................................*... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ 
- add v17.4S, v4.4S, v0.4S // ..........................................................................................*..................... - // gap // ................................................................................................................ - sub v23.4S, v4.4S, v0.4S // .........................................................................................*...................... - sub v0.4S, v3.4S, v30.4S // ...........................................................................................*.................... - add v20.4S, v3.4S, v30.4S // ............................................................................................*................... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - sqrdmulh v16.4S, v16.4S, v26.S[3] // ......................................................................................................*......... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn2 v21.4S, v20.4S, v0.4S // ...............................................................................................*................ 
- trn2 v30.4S, v17.4S, v23.4S // .............................................................................................*.................. - add v3.4S, v27.4S, v29.4S // .............................................................................................................*.. - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - trn1 v4.2D, v30.2D, v21.2D // ................................................................................................*............... - trn2 v6.2D, v30.2D, v21.2D // .........................................................................................................*...... - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - // gap // ................................................................................................................ - str q4, [x2, #16] // .....................................................................................................*.......... 
- ldr q19, [x5, #32] // ...............................................................................................................* - mls v9.4S, v16.4S, v8.S[0] // ..........................................................................................................*..... - trn1 v1.4S, v20.4S, v0.4S // ...........................................................................................................*.... - - // original source code - // ldr q11, [x4], #64 // *............................................................................................................... - // ldr q12, [x2, #48] // .*.............................................................................................................. - // ldr q14, [x2, #32] // ..*............................................................................................................. - // ldr q29, [x2, #0] // .....*.......................................................................................................... - // ldr q28, [x2, #16] // ...*............................................................................................................ - // ldr q6, [x4, #-32] // .......................................*........................................................................ - // sqrdmulh v21.4S, v12.4S, v11.S[1] // .........*...................................................................................................... - // mul v4.4S, v12.4S, v11.S[0] // ..........*..................................................................................................... - // ldr q31, [x4, #-48] // ........................*....................................................................................... - // mul v16.4S, v14.4S, v11.S[0] // .............*.................................................................................................. 
- // sqrdmulh v22.4S, v14.4S, v11.S[1] // ..............*................................................................................................. - // ldr q15, [x1, #32] // .......*........................................................................................................ - // mul v3.4S, v28.4S, v11.S[0] // .......................*........................................................................................ - // ldr q27, [x1, #48] // ...........*.................................................................................................... - // sqrdmulh v1.4S, v28.4S, v11.S[1] // ....................*........................................................................................... - // mls v4.4S, v21.4S, v8.S[0] // .....................*.......................................................................................... - // ldr q28, [x1, #0] // ............*................................................................................................... - // sqrdmulh v26.4S, v29.4S, v11.S[1] // .................*.............................................................................................. - // ldr q21, [x1, #16] // ..................*............................................................................................. - // mls v16.4S, v22.4S, v8.S[0] // ......................*......................................................................................... - // mls v3.4S, v1.4S, v8.S[0] // ............................*................................................................................... - // ldr q20, [x5, #96] // ................*............................................................................................... - // mul v10.4S, v29.4S, v11.S[0] // ...................*............................................................................................ 
- // sub v13.4S, v27.4S, v4.4S // ..........................*..................................................................................... - // sub v14.4S, v15.4S, v16.4S // ...........................*.................................................................................... - // add v23.4S, v15.4S, v16.4S // ........................................*....................................................................... - // mul v9.4S, v13.4S, v31.S[0] // ..............................*................................................................................. - // sqrdmulh v22.4S, v13.4S, v31.S[1] // .............................*.................................................................................. - // mul v29.4S, v23.4S, v11.S[2] // ....................................................*........................................................... - // sqrdmulh v7.4S, v23.4S, v11.S[3] // ..................................................*............................................................. - // mls v10.4S, v26.4S, v8.S[0] // .........................*...................................................................................... - // mul v15.4S, v14.4S, v31.S[0] // ................................*............................................................................... - // sqrdmulh v26.4S, v14.4S, v31.S[1] // ...............................*................................................................................ - // mls v9.4S, v22.4S, v8.S[0] // ..................................*............................................................................. - // mls v29.4S, v7.4S, v8.S[0] // ..............................................................*................................................. - // ldr q14, [x4, #-16] // .........................................*...................................................................... 
- // sub v7.4S, v21.4S, v3.4S // ....................................*........................................................................... - // add v13.4S, v28.4S, v10.4S // ......................................................................*......................................... - // sub v5.4S, v28.4S, v10.4S // ............................................*................................................................... - // mls v15.4S, v26.4S, v8.S[0] // .....................................*.......................................................................... - // add v16.4S, v7.4S, v9.4S // ..........................................*..................................................................... - // sub v22.4S, v13.4S, v29.4S // ............................................................................................*................... - // sub v28.4S, v7.4S, v9.4S // .............................................*.................................................................. - // sqrdmulh v23.4S, v16.4S, v6.S[3] // ..............................................*................................................................. - // mul v10.4S, v16.4S, v6.S[2] // ...............................................*................................................................ - // mul v16.4S, v28.4S, v14.S[0] // ................................................*............................................................... - // sqrdmulh v28.4S, v28.4S, v14.S[1] // .................................................*.............................................................. - // add v9.4S, v5.4S, v15.4S // ......................................................*......................................................... - // add v27.4S, v27.4S, v4.4S // .................................*.............................................................................. 
- // ldr q26, [x5, #112] // ...............*................................................................................................ - // mls v10.4S, v23.4S, v8.S[0] // .....................................................*.......................................................... - // sub v14.4S, v5.4S, v15.4S // ........................................................*....................................................... - // mls v16.4S, v28.4S, v8.S[0] // .......................................................*........................................................ - // add v28.4S, v9.4S, v10.4S // ...........................................................*.................................................... - // sub v10.4S, v9.4S, v10.4S // ..........................................................*..................................................... - // add v9.4S, v14.4S, v16.4S // .............................................................*.................................................. - // sub v23.4S, v14.4S, v16.4S // ............................................................*................................................... - // trn2 v17.4S, v28.4S, v10.4S // ................................................................*............................................... - // trn1 v10.4S, v28.4S, v10.4S // ..................................................................*............................................. - // trn2 v25.4S, v9.4S, v23.4S // .................................................................*.............................................. - // trn1 v14.4S, v9.4S, v23.4S // ...................................................................*............................................ - // trn2 v2.2D, v17.2D, v25.2D // .....................................................................*.......................................... 
- // sqrdmulh v9.4S, v27.4S, v11.S[3] // ...................................................*............................................................ - // trn2 v18.2D, v10.2D, v14.2D // ...........................................................................*.................................... - // mul v16.4S, v2.4S, v20.4S // .........................................................................*...................................... - // sqrdmulh v2.4S, v2.4S, v26.4S // ........................................................................*....................................... - // mul v23.4S, v18.4S, v20.4S // ................................................................................*............................... - // ldr q20, [x5, #176] // ........*....................................................................................................... - // add v3.4S, v21.4S, v3.4S // ......................................*......................................................................... - // sqrdmulh v18.4S, v18.4S, v26.4S // ...............................................................................*................................ - // mls v16.4S, v2.4S, v8.S[0] // .............................................................................*.................................. - // ldr q21, [x5, #128] // ......*......................................................................................................... - // ldr q2, [x5, #16] // ...................................*............................................................................ - // ldr q0, [x5, #144] // ..............................................................................*................................. - // trn1 v26.2D, v17.2D, v25.2D // ............................................................................*................................... 
- // ldr q12, [x5, #160] // ....*........................................................................................................... - // mls v23.4S, v18.4S, v8.S[0] // .....................................................................................*.......................... - // add v7.4S, v26.4S, v16.4S // ...................................................................................*............................ - // sub v24.4S, v26.4S, v16.4S // ..................................................................................*............................. - // mul v17.4S, v7.4S, v21.4S // .......................................................................................*........................ - // sqrdmulh v0.4S, v7.4S, v0.4S // ......................................................................................*......................... - // mul v7.4S, v24.4S, v12.4S // .........................................................................................*...................... - // sqrdmulh v16.4S, v24.4S, v20.4S // ........................................................................................*....................... - // trn1 v20.2D, v10.2D, v14.2D // .......................................................................*........................................ - // mls v17.4S, v0.4S, v8.S[0] // .............................................................................................*.................. - // mul v10.4S, v27.4S, v11.S[2] // ...........................................*.................................................................... - // mls v7.4S, v16.4S, v8.S[0] // ...............................................................................................*................ - // add v16.4S, v20.4S, v23.4S // ..............................................................................................*................. 
- // sub v24.4S, v20.4S, v23.4S // ...........................................................................................*.................... - // sub v23.4S, v16.4S, v17.4S // ...................................................................................................*............ - // add v17.4S, v16.4S, v17.4S // ..................................................................................................*............. - // sub v26.4S, v24.4S, v7.4S // ....................................................................................................*........... - // add v11.4S, v24.4S, v7.4S // .....................................................................................................*.......... - // trn2 v16.4S, v17.4S, v23.4S // ........................................................................................................*....... - // mls v10.4S, v9.4S, v8.S[0] // .........................................................*...................................................... - // trn2 v27.4S, v11.4S, v26.4S // .......................................................................................................*........ - // trn1 v9.2D, v16.2D, v27.2D // ..........................................................................................................*..... - // add v18.4S, v3.4S, v10.4S // .................................................................................*.............................. - // sub v7.4S, v3.4S, v10.4S // ...............................................................*................................................ - // mul v15.4S, v7.4S, v6.S[0] // ..........................................................................*..................................... - // sqrdmulh v7.4S, v7.4S, v6.S[1] // ....................................................................*........................................... 
- // str q9, [x2, #16] // ............................................................................................................*... - // sqrdmulh v28.4S, v18.4S, v31.S[3] // ......................................................................................................*......... - // mul v9.4S, v18.4S, v31.S[2] // ....................................................................................*........................... - // mls v15.4S, v7.4S, v8.S[0] // ..........................................................................................*..................... - // trn2 v6.2D, v16.2D, v27.2D // ...........................................................................................................*.... - // mls v9.4S, v28.4S, v8.S[0] // ..............................................................................................................*. - // trn1 v1.4S, v11.4S, v26.4S // ...............................................................................................................* - // add v13.4S, v13.4S, v29.4S // .................................................................................................*.............. - // add v3.4S, v22.4S, v15.4S // .........................................................................................................*...... - // sub v15.4S, v22.4S, v15.4S // ................................................................................................*............... - // ldr q19, [x5, #32] // .............................................................................................................*.. + // Instructions: 35 + // Expected cycles: 13 + // Expected IPC: 2.69 + // + // Wall time: 0.72s + // User time: 0.72s + // + // ------- original position --------> + // 0 25 + // |------------------------|--------- + ldr q9, [x2, #48] // ............*...................... + ldr q10, [x4], #64 // ..*................................ 
+ // gap // ................................... + // gap // ................................... + ldr q7, [x2, #16] // *.................................. + // gap // ................................... + // gap // ................................... + ldr q27, [x2, #0] // ......*............................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q15, [x2, #32] // .*................................. + ldr q23, [x1, #48] // ........*.......................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + ldr q28, [x5, #48] // .........................*......... + sqrdmulh v13.4S, v9.4S, v10.S[1] // ...............*................... + // gap // ................................... + mul v3.4S, v9.4S, v10.S[0] // ..............*.................... + // gap // ................................... + // gap // ................................... + mul v1.4S, v27.4S, v10.S[0] // ..........*........................ + sqrdmulh v22.4S, v27.4S, v10.S[1] // .........*......................... + sqrdmulh v27.4S, v15.4S, v10.S[1] // .......................*........... + ldr q24, [x5, #112] // ......................*............ + mul v0.4S, v15.4S, v10.S[0] // ........................*.......... + // gap // ................................... + mul v9.4S, v7.4S, v10.S[0] // ...*............................... + ldr q30, [x1, #0] // ....*.............................. + ldr q25, [x5, #80] // ..................*................ + mls v3.4S, v13.4S, v8.S[0] // .................*................. + // gap // ................................... + mls v1.4S, v22.4S, v8.S[0] // .............*..................... + ldr q11, [x4, #-48] // .....*............................. + sqrdmulh v26.4S, v7.4S, v10.S[1] // .......*........................... 
+ mls v0.4S, v27.4S, v8.S[0] // ..............................*.... + ldr q20, [x5, #16] // ...........*....................... + ldr q13, [x1, #16] // ................*.................. + // gap // ................................... + sub v4.4S, v23.4S, v3.4S // ..........................*........ + add v5.4S, v23.4S, v3.4S // ...................*............... + ldr q2, [x5, #176] // ....................*.............. + ldr q18, [x4, #-16] // .....................*............. + add v7.4S, v30.4S, v1.4S // .................................*. + mls v9.4S, v26.4S, v8.S[0] // ............................*...... + ldr q31, [x1, #32] // ...........................*....... + ldr q17, [x5], #(12*16) // .............................*..... + mul v3.4S, v4.4S, v11.S[0] // ...............................*... + sqrdmulh v15.4S, v4.4S, v11.S[1] // ..................................* + ldr q12, [x5, #-96] // ................................*.. + // gap // ................................... + + // ---------- new position ----------> + // 0 25 + // |------------------------|--------- + // ldr q23, [x2, #16] // ..*................................ + // ldr q22, [x2, #32] // ....*.............................. + // ldr q10, [x4], #64 // .*................................. + // mul v9.4S, v23.4S, v10.S[0] // ..............*.................... + // ldr q30, [x1, #0] // ...............*................... + // ldr q11, [x4, #-48] // ...................*............... + // ldr q21, [x2, #0] // ...*............................... + // sqrdmulh v14.4S, v23.4S, v10.S[1] // ....................*.............. + // ldr q31, [x1, #48] // .....*............................. + // sqrdmulh v5.4S, v21.4S, v10.S[1] // ..........*........................ + // mul v1.4S, v21.4S, v10.S[0] // .........*......................... + // ldr q20, [x5, #16] // ......................*............ + // ldr q12, [x2, #48] // *.................................. 
+ // mls v1.4S, v5.4S, v8.S[0] // ..................*................ + // mul v27.4S, v12.4S, v10.S[0] // ........*.......................... + // sqrdmulh v12.4S, v12.4S, v10.S[1] // .......*........................... + // ldr q13, [x1, #16] // .......................*........... + // mls v27.4S, v12.4S, v8.S[0] // .................*................. + // ldr q25, [x5, #80] // ................*.................. + // add v5.4S, v31.4S, v27.4S // .........................*......... + // ldr q2, [x5, #176] // ..........................*........ + // ldr q18, [x4, #-16] // ...........................*....... + // ldr q24, [x5, #112] // ............*...................... + // sqrdmulh v15.4S, v22.4S, v10.S[1] // ...........*....................... + // mul v0.4S, v22.4S, v10.S[0] // .............*..................... + // ldr q28, [x5, #48] // ......*............................ + // sub v23.4S, v31.4S, v27.4S // ........................*.......... + // ldr q31, [x1, #32] // ..............................*.... + // mls v9.4S, v14.4S, v8.S[0] // .............................*..... + // ldr q17, [x5], #(12*16) // ...............................*... + // mls v0.4S, v15.4S, v8.S[0] // .....................*............. + // mul v3.4S, v23.4S, v11.S[0] // ................................*.. + // ldr q12, [x5, #-96] // ..................................* + // add v7.4S, v30.4S, v1.4S // ............................*...... + // sqrdmulh v15.4S, v23.4S, v11.S[1] // .................................*. sub count, count, #1 layer45678_start: - add v24.4S, v13.4S, v9.4S // ........................................................*........................................................................................................... - ldr q11, [x4], #64 // ........e........................................................................................................................................................... 
- trn1 v16.4S, v17.4S, v23.4S // ....................................................................................................................................................*............... - ldr q12, [x2, #176] // .......e............................................................................................................................................................ - // gap // .................................................................................................................................................................... - sub v7.4S, v13.4S, v9.4S // .......................................................*............................................................................................................ - ldr q14, [x2, #160] // ......e............................................................................................................................................................. - trn1 v18.4S, v3.4S, v15.4S // ..........................................................................*......................................................................................... - ldr q29, [x2, #128] // ....e............................................................................................................................................................... - trn1 v9.2D, v16.2D, v1.2D // ..........................................................................................................................................................*......... - ldr q28, [x2, #144] // .....e.............................................................................................................................................................. - trn2 v0.4S, v3.4S, v15.4S // ...........................................................................*........................................................................................ 
- ldr q30, [x5, #48] // ...........................................................................................*........................................................................ - trn2 v3.2D, v16.2D, v1.2D // ........................................................................................................................................................*........... - trn2 v25.4S, v24.4S, v7.4S // .........................................................................*.......................................................................................... - str q6, [x2, #48] // ...................................................................................................................................................................* + // Instructions: 164 + // Expected cycles: 66 + // Expected IPC: 2.48 + // + // Wall time: 1005.34s + // User time: 1005.34s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- // gap // .................................................................................................................................................................... - ldr q6, [x4, #-32] // ..........e......................................................................................................................................................... - sqrdmulh v21.4S, v12.4S, v11.S[1] // ............................e....................................................................................................................................... 
- mul v4.4S, v12.4S, v11.S[0] // ...........................e........................................................................................................................................ - str q3, [x2, #32] // ..................................................................................................................................................................*. - ldr q31, [x4, #-48] // .........e.......................................................................................................................................................... - mul v16.4S, v14.4S, v11.S[0] // ......................e............................................................................................................................................. - sqrdmulh v22.4S, v14.4S, v11.S[1] // .......................e............................................................................................................................................ - ldr q15, [x1, #160] // ..e................................................................................................................................................................. - mul v3.4S, v28.4S, v11.S[0] // .................e.................................................................................................................................................. - ldr q27, [x1, #176] // ...e................................................................................................................................................................ - sqrdmulh v1.4S, v28.4S, v11.S[1] // ..................e................................................................................................................................................. - mls v4.4S, v21.4S, v8.S[0] // .............................e...................................................................................................................................... 
+ ldr q23, [x2, #144] // .....e.............................................................................................................................................................. + mul v27.4S, v5.4S, v10.S[2] // ......................................*............................................................................................................................. + sub v26.4S, v13.4S, v9.4S // ....................*............................................................................................................................................... + sqrdmulh v14.4S, v5.4S, v10.S[3] // .....................................*.............................................................................................................................. // gap // .................................................................................................................................................................... - ldr q28, [x1, #128] // e................................................................................................................................................................... - sqrdmulh v26.4S, v29.4S, v11.S[1] // .............e...................................................................................................................................................... - ldr q21, [x1, #144] // .e.................................................................................................................................................................. - trn1 v17.4S, v24.4S, v7.4S // ........................................................................*........................................................................................... - mls v16.4S, v22.4S, v8.S[0] // ........................e........................................................................................................................................... 
// gap // .................................................................................................................................................................... - mls v3.4S, v1.4S, v8.S[0] // ...................e................................................................................................................................................ + add v29.4S, v31.4S, v0.4S // ..........................*......................................................................................................................................... + mls v3.4S, v15.4S, v8.S[0] // .................................................*.................................................................................................................. + ldr q22, [x2, #160] // ......e............................................................................................................................................................. // gap // .................................................................................................................................................................... - ldr q20, [x5, #288] // ..................................................................................................................e................................................. - mul v10.4S, v29.4S, v11.S[0] // ............e....................................................................................................................................................... - trn2 v1.2D, v17.2D, v18.2D // ............................................................................*....................................................................................... - sub v13.4S, v27.4S, v4.4S // ..............................e..................................................................................................................................... 
+ sub v1.4S, v30.4S, v1.4S // ...............*.................................................................................................................................................... + ldr q30, [x4, #-32] // ..........*......................................................................................................................................................... // gap // .................................................................................................................................................................... - str q9, [x2], #128 // ................................................................................................................................................................*... - sub v14.4S, v15.4S, v16.4S // .........................e.......................................................................................................................................... + sub v31.4S, v31.4S, v0.4S // .........................*.......................................................................................................................................... + sqrdmulh v16.4S, v29.4S, v10.S[3] // ................................*................................................................................................................................... + mul v0.4S, v29.4S, v10.S[2] // .................................*.................................................................................................................................. + ldr q10, [x4], #64 // ........e........................................................................................................................................................... + mls v27.4S, v14.4S, v8.S[0] // .......................................*............................................................................................................................ 
// gap // .................................................................................................................................................................... + add v15.4S, v13.4S, v9.4S // .....................*.............................................................................................................................................. // gap // .................................................................................................................................................................... - add v23.4S, v15.4S, v16.4S // ..........................e......................................................................................................................................... - ldr q12, [x5], #(12*16) // ........................................................................................*........................................................................... - mul v9.4S, v13.4S, v31.S[0] // ...............................................e.................................................................................................................... - sqrdmulh v22.4S, v13.4S, v31.S[1] // ................................................e................................................................................................................... // gap // .................................................................................................................................................................... - mul v29.4S, v23.4S, v11.S[2] // ................................e................................................................................................................................... - sqrdmulh v7.4S, v23.4S, v11.S[3] // .................................e.................................................................................................................................. 
+ sub v4.4S, v26.4S, v3.4S // ..................................................*................................................................................................................. + add v29.4S, v26.4S, v3.4S // ...................................................*................................................................................................................ // gap // .................................................................................................................................................................... + mul v13.4S, v31.4S, v11.S[0] // ...........................................*........................................................................................................................ // gap // .................................................................................................................................................................... + sqrdmulh v21.4S, v31.4S, v11.S[1] // ..........................................*......................................................................................................................... // gap // .................................................................................................................................................................... - mls v10.4S, v26.4S, v8.S[0] // ..............e..................................................................................................................................................... // gap // .................................................................................................................................................................... - mul v15.4S, v14.4S, v31.S[0] // ..........................................e......................................................................................................................... 
- sqrdmulh v26.4S, v14.4S, v31.S[1] // ...........................................e........................................................................................................................ + add v14.4S, v15.4S, v27.4S // .........................................*.......................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mls v9.4S, v22.4S, v8.S[0] // .................................................e.................................................................................................................. - ldr q24, [x5, #-112] // .............................................................................................*...................................................................... - mls v29.4S, v7.4S, v8.S[0] // ..................................e................................................................................................................................. - ldr q14, [x4, #-16] // ...........e........................................................................................................................................................ - sub v7.4S, v21.4S, v3.4S // ....................e............................................................................................................................................... - add v13.4S, v28.4S, v10.4S // ................e................................................................................................................................................... 
- sub v5.4S, v28.4S, v10.4S // ...............e.................................................................................................................................................... + mul v26.4S, v4.4S, v18.S[0] // ....................................................................*............................................................................................... + mul v9.4S, v23.4S, v10.S[0] // ..................e................................................................................................................................................. // gap // .................................................................................................................................................................... + sqrdmulh v5.4S, v14.4S, v11.S[3] // ....................................................*............................................................................................................... // gap // .................................................................................................................................................................... - mls v15.4S, v26.4S, v8.S[0] // ............................................e....................................................................................................................... - add v16.4S, v7.4S, v9.4S // ...................................................e................................................................................................................ + sub v31.4S, v15.4S, v27.4S // ........................................*........................................................................................................................... + mls v0.4S, v16.4S, v8.S[0] // ..................................*................................................................................................................................. 
// gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - sub v22.4S, v13.4S, v29.4S // ...................................e................................................................................................................................ + mul v3.4S, v14.4S, v11.S[2] // .....................................................*.............................................................................................................. + mls v13.4S, v21.4S, v8.S[0] // ............................................*....................................................................................................................... // gap // .................................................................................................................................................................... + sqrdmulh v14.4S, v31.4S, v30.S[1] // .........................................................*.......................................................................................................... // gap // .................................................................................................................................................................... - sub v28.4S, v7.4S, v9.4S // ..................................................e................................................................................................................. + sqrdmulh v21.4S, v4.4S, v18.S[1] // ...................................................................*................................................................................................ 
+ sqrdmulh v15.4S, v29.4S, v30.S[3] // ..............................................................*..................................................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v23.4S, v16.4S, v6.S[3] // ...............................................................e.................................................................................................... - mul v10.4S, v16.4S, v6.S[2] // ..............................................................e..................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + mul v27.4S, v31.4S, v30.S[0] // ..........................................................*......................................................................................................... + sub v11.4S, v7.4S, v0.4S // ...................................*................................................................................................................................ // gap // .................................................................................................................................................................... - mul v16.4S, v28.4S, v14.S[0] // ...................................................................e................................................................................................ 
- sqrdmulh v28.4S, v28.4S, v14.S[1] // ....................................................................e............................................................................................... + mls v3.4S, v5.4S, v8.S[0] // ......................................................*............................................................................................................. // gap // .................................................................................................................................................................... - add v9.4S, v5.4S, v15.4S // ..............................................e..................................................................................................................... - add v27.4S, v27.4S, v4.4S // ...............................e.................................................................................................................................... - ldr q26, [x5, #112] // ...................................................................................................................e................................................ // gap // .................................................................................................................................................................... - mls v10.4S, v23.4S, v8.S[0] // ................................................................e................................................................................................... + add v31.4S, v7.4S, v0.4S // ....................................*............................................................................................................................... + mls v26.4S, v21.4S, v8.S[0] // .....................................................................*.............................................................................................. 
// gap // .................................................................................................................................................................... - sub v14.4S, v5.4S, v15.4S // .............................................e...................................................................................................................... - trn1 v7.2D, v25.2D, v0.2D // ...............................................................................*.................................................................................... // gap // .................................................................................................................................................................... + sub v16.4S, v1.4S, v13.4S // .............................................*...................................................................................................................... + mls v27.4S, v14.4S, v8.S[0] // ...........................................................*........................................................................................................ // gap // .................................................................................................................................................................... - mls v16.4S, v28.4S, v8.S[0] // .....................................................................e.............................................................................................. // gap // .................................................................................................................................................................... - trn2 v4.2D, v25.2D, v0.2D // .............................................................................*...................................................................................... 
+ mul v0.4S, v29.4S, v30.S[2] // ...............................................................*.................................................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v0.4S, v1.4S, v2.4S // ...............................................................................................*.................................................................... - add v28.4S, v9.4S, v10.4S // ..................................................................e................................................................................................. + add v21.4S, v31.4S, v3.4S // ........................................................*........................................................................................................... + sub v3.4S, v31.4S, v3.4S // .......................................................*............................................................................................................ // gap // .................................................................................................................................................................... + sub v5.4S, v16.4S, v26.4S // ......................................................................*............................................................................................. + add v7.4S, v16.4S, v26.4S // .......................................................................*............................................................................................ // gap // .................................................................................................................................................................... 
- sub v10.4S, v9.4S, v10.4S // .................................................................e.................................................................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - add v9.4S, v14.4S, v16.4S // .......................................................................e............................................................................................ - sub v23.4S, v14.4S, v16.4S // ......................................................................e............................................................................................. - trn1 v15.2D, v17.2D, v18.2D // ..............................................................................*..................................................................................... - trn2 v17.4S, v28.4S, v10.4S // .................................................................................e.................................................................................. + mls v0.4S, v15.4S, v8.S[0] // ................................................................*................................................................................................... // gap // .................................................................................................................................................................... + trn2 v6.4S, v21.4S, v3.4S // .........................................................................*.......................................................................................... 
+ add v14.4S, v11.4S, v27.4S // .............................................................*...................................................................................................... + add v31.4S, v1.4S, v13.4S // ..............................................*..................................................................................................................... // gap // .................................................................................................................................................................... - trn1 v10.4S, v28.4S, v10.4S // ................................................................................e................................................................................... // gap // .................................................................................................................................................................... + sub v26.4S, v11.4S, v27.4S // ............................................................*....................................................................................................... + ldr q30, [x1, #128] // e................................................................................................................................................................... + ldr q29, [x5, #-160] // ..........................................................................................*......................................................................... + trn2 v18.4S, v7.4S, v5.4S // ...................................................................................*................................................................................ // gap // .................................................................................................................................................................... 
- trn2 v25.4S, v9.4S, v23.4S // ...................................................................................e................................................................................ - trn1 v14.4S, v9.4S, v23.4S // ..................................................................................e................................................................................. // gap // .................................................................................................................................................................... + add v27.4S, v31.4S, v0.4S // ..................................................................*................................................................................................. + sub v16.4S, v31.4S, v0.4S // .................................................................*.................................................................................................. + trn1 v4.4S, v14.4S, v26.4S // ..........................................................................*......................................................................................... + trn2 v0.4S, v14.4S, v26.4S // ...........................................................................*........................................................................................ // gap // .................................................................................................................................................................... - sqrdmulh v5.4S, v4.4S, v2.4S // ....................................................................................................*............................................................... + ldr q11, [x4, #-48] // .........e.......................................................................................................................................................... 
+ trn1 v1.4S, v7.4S, v5.4S // ..................................................................................*................................................................................. // gap // .................................................................................................................................................................... + trn2 v26.4S, v27.4S, v16.4S // .................................................................................*.................................................................................. + ldr q7, [x5, #-64] // ....................................................................................................................*............................................... + trn2 v31.2D, v6.2D, v0.2D // .............................................................................*...................................................................................... + trn1 v19.2D, v6.2D, v0.2D // ...............................................................................*.................................................................................... // gap // .................................................................................................................................................................... - trn2 v2.2D, v17.2D, v25.2D // .....................................................................................e.............................................................................. - sqrdmulh v9.4S, v27.4S, v11.S[3] // ......................................e............................................................................................................................. - mul v28.4S, v4.4S, v12.4S // ...................................................................................................*................................................................ 
- trn2 v18.2D, v10.2D, v14.2D // ....................................................................................e............................................................................... // gap // .................................................................................................................................................................... + trn2 v14.2D, v26.2D, v18.2D // .....................................................................................*.............................................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + trn1 v5.4S, v21.4S, v3.4S // ........................................................................*........................................................................................... + mul v15.4S, v31.4S, v17.4S // ....................................................................................................*............................................................... + ldr q21, [x2, #128] // ....e............................................................................................................................................................... + sqrdmulh v3.4S, v31.4S, v20.4S // ...................................................................................................*................................................................ // gap // .................................................................................................................................................................... 
- mul v16.4S, v2.4S, v20.4S // .............................................................................................................................e...................................... - sqrdmulh v2.4S, v2.4S, v26.4S // ..............................................................................................................................e..................................... - mul v23.4S, v18.4S, v20.4S // ........................................................................................................................e........................................... - mul v4.4S, v1.4S, v12.4S // ..............................................................................................*..................................................................... - ldr q20, [x5, #176] // .......................................................................................................................e............................................ + sqrdmulh v6.4S, v14.4S, v24.4S // .............................................................................................................................*...................................... + mul v0.4S, v14.4S, v12.4S // ..............................................................................................................................*..................................... // gap // .................................................................................................................................................................... - ldr q1, [x5, #-128] // ............................................................................................*....................................................................... // gap // .................................................................................................................................................................... 
- mls v28.4S, v5.4S, v8.S[0] // .....................................................................................................*.............................................................. - add v3.4S, v21.4S, v3.4S // .....................e.............................................................................................................................................. - sqrdmulh v18.4S, v18.4S, v26.4S // .........................................................................................................................e.......................................... // gap // .................................................................................................................................................................... - mls v16.4S, v2.4S, v8.S[0] // ...............................................................................................................................e.................................... - ldr q21, [x5, #128] // ....................................................................................................................e............................................... - mls v4.4S, v0.4S, v8.S[0] // ................................................................................................*................................................................... - ldr q2, [x5, #16] // .........................................................................................e.......................................................................... - ldr q0, [x5, #144] // .....................................................................................................................e.............................................. - trn1 v26.2D, v17.2D, v25.2D // .......................................................................................e............................................................................ 
- add v5.4S, v7.4S, v28.4S // .......................................................................................................*............................................................ // gap // .................................................................................................................................................................... - ldr q12, [x5, #160] // ......................................................................................................................e............................................. - sub v25.4S, v7.4S, v28.4S // ......................................................................................................*............................................................. - mls v23.4S, v18.4S, v8.S[0] // ..........................................................................................................................e......................................... + sqrdmulh v14.4S, v23.4S, v10.S[1] // .................e.................................................................................................................................................. + trn2 v31.2D, v5.2D, v4.2D // ............................................................................*....................................................................................... + mls v15.4S, v3.4S, v8.S[0] // .....................................................................................................*.............................................................. + trn1 v16.4S, v27.4S, v16.4S // ................................................................................*................................................................................... // gap // .................................................................................................................................................................... 
- add v7.4S, v26.4S, v16.4S // .................................................................................................................................e.................................. + ldr q13, [x5, #-32] // ......................................................................................................................*............................................. + mls v0.4S, v6.4S, v8.S[0] // ...............................................................................................................................*.................................... + trn1 v3.2D, v5.2D, v4.2D // ..............................................................................*..................................................................................... + ldr q5, [x5, #-128] // ............................................................................................*....................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v28.4S, v25.4S, v24.4S // ..............................................................................................................*..................................................... - sub v24.4S, v26.4S, v16.4S // ................................................................................................................................e................................... + trn1 v23.2D, v26.2D, v18.2D // .......................................................................................*............................................................................ // gap // .................................................................................................................................................................... 
// gap // .................................................................................................................................................................... - mul v17.4S, v7.4S, v21.4S // ..................................................................................................................................e................................. - sqrdmulh v0.4S, v7.4S, v0.4S // ...................................................................................................................................e................................ + sqrdmulh v6.4S, v31.4S, v20.4S // ..............................................................................................*..................................................................... + sub v27.4S, v19.4S, v15.4S // ......................................................................................................*............................................................. // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... + mul v17.4S, v31.4S, v17.4S // ...............................................................................................*.................................................................... + sub v18.4S, v23.4S, v0.4S // ................................................................................................................................*................................... + trn1 v4.2D, v16.2D, v1.2D // ......................................................................................*............................................................................. 
+ ldr q31, [x1, #176] // ...e................................................................................................................................................................ // gap // .................................................................................................................................................................... - mul v7.4S, v24.4S, v12.4S // .......................................................................................................................................e............................ - sqrdmulh v16.4S, v24.4S, v20.4S // ........................................................................................................................................e........................... + trn2 v16.2D, v16.2D, v1.2D // ....................................................................................*............................................................................... // gap // .................................................................................................................................................................... - sqrdmulh v21.4S, v5.4S, v30.4S // .........................................................................................................*.......................................................... // gap // .................................................................................................................................................................... + mul v26.4S, v27.4S, v5.4S // ..............................................................................................................*..................................................... // gap // .................................................................................................................................................................... 
- trn1 v20.2D, v10.2D, v14.2D // ......................................................................................e............................................................................. - mls v17.4S, v0.4S, v8.S[0] // ....................................................................................................................................e............................... - mul v10.4S, v27.4S, v11.S[2] // .....................................e.............................................................................................................................. + sqrdmulh v5.4S, v21.4S, v10.S[1] // ............e....................................................................................................................................................... // gap // .................................................................................................................................................................... + mul v1.4S, v21.4S, v10.S[0] // .............e...................................................................................................................................................... + sqrdmulh v25.4S, v27.4S, v25.4S // .............................................................................................................*...................................................... + sqrdmulh v24.4S, v16.4S, v24.4S // ........................................................................................................................*........................................... + ldr q20, [x5, #16] // .........................................................................................e.......................................................................... + ldr q27, [x5, #-48] // .....................................................................................................................*.............................................. 
+ mul v21.4S, v16.4S, v12.4S // .........................................................................................................................*.......................................... // gap // .................................................................................................................................................................... + mls v17.4S, v6.4S, v8.S[0] // ................................................................................................*................................................................... + ldr q12, [x2, #176] // .......e............................................................................................................................................................ + sqrdmulh v2.4S, v18.4S, v2.4S // .......................................................................................................................................*............................ + mls v1.4S, v5.4S, v8.S[0] // ..............e..................................................................................................................................................... // gap // .................................................................................................................................................................... // gap // .................................................................................................................................................................... - mls v7.4S, v16.4S, v8.S[0] // .........................................................................................................................................e.......................... - add v16.4S, v20.4S, v23.4S // ............................................................................................................................e....................................... 
- sub v24.4S, v20.4S, v23.4S // ...........................................................................................................................e........................................ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mul v20.4S, v25.4S, v1.4S // .............................................................................................................*...................................................... - // gap // .................................................................................................................................................................... - sub v23.4S, v16.4S, v17.4S // .....................................................................................................................................e.............................. - add v17.4S, v16.4S, v17.4S // ......................................................................................................................................e............................. - // gap // .................................................................................................................................................................... - sub v26.4S, v24.4S, v7.4S // ..........................................................................................................................................e......................... - add v11.4S, v24.4S, v7.4S // ...........................................................................................................................................e........................ 
- // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - trn2 v16.4S, v17.4S, v23.4S // .....................................................................................................................................................e.............. - mul v30.4S, v5.4S, v19.4S // ........................................................................................................*........................................................... - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... - mls v10.4S, v9.4S, v8.S[0] // .......................................e............................................................................................................................ - trn2 v27.4S, v11.4S, v26.4S // .......................................................................................................................................................e............ - // gap // .................................................................................................................................................................... - // gap // .................................................................................................................................................................... 
- sub v14.4S, v15.4S, v4.4S // .................................................................................................*.................................................................. + add v16.4S, v23.4S, v0.4S // .................................................................................................................................*.................................. // gap // .................................................................................................................................................................... + mls v26.4S, v25.4S, v8.S[0] // ...............................................................................................................*.................................................... // gap // .................................................................................................................................................................... - mls v20.4S, v28.4S, v8.S[0] // ...............................................................................................................*.................................................... - trn1 v9.2D, v16.2D, v27.2D // ...........................................................................................................................................................e........ - mls v30.4S, v21.4S, v8.S[0] // ..........................................................................................................*......................................................... + mls v21.4S, v24.4S, v8.S[0] // ..........................................................................................................................*......................................... // gap // .................................................................................................................................................................... 
+ sub v5.4S, v3.4S, v17.4S // .................................................................................................*.................................................................. // gap // .................................................................................................................................................................... - add v18.4S, v3.4S, v10.4S // .........................................e.......................................................................................................................... + mul v23.4S, v16.4S, v7.4S // ...................................................................................................................................*................................ + sqrdmulh v16.4S, v16.4S, v27.4S // ..................................................................................................................................*................................. // gap // .................................................................................................................................................................... - add v28.4S, v15.4S, v4.4S // ..................................................................................................*................................................................. // gap // .................................................................................................................................................................... - sub v7.4S, v3.4S, v10.4S // ........................................e........................................................................................................................... + mul v27.4S, v12.4S, v10.S[0] // ............................e....................................................................................................................................... 
// gap // .................................................................................................................................................................... - sub v25.4S, v14.4S, v20.4S // ................................................................................................................*................................................... // gap // .................................................................................................................................................................... - sub v24.4S, v28.4S, v30.4S // ...........................................................................................................*........................................................ + add v7.4S, v5.4S, v26.4S // .................................................................................................................*.................................................. + sub v24.4S, v4.4S, v21.4S // ...........................................................................................................................*........................................ // gap // .................................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v10.S[1] // ...........................e........................................................................................................................................ // gap // .................................................................................................................................................................... - add v20.4S, v14.4S, v20.4S // .................................................................................................................*.................................................. 
- mul v15.4S, v7.4S, v6.S[0] // .........................................................e.......................................................................................................... + mul v0.4S, v18.4S, v13.4S // ........................................................................................................................................*........................... + ldr q13, [x1, #144] // .e.................................................................................................................................................................. + mls v23.4S, v16.4S, v8.S[0] // ....................................................................................................................................*............................... // gap // .................................................................................................................................................................... - add v0.4S, v28.4S, v30.4S // ............................................................................................................*....................................................... + add v16.4S, v4.4S, v21.4S // ............................................................................................................................*....................................... // gap // .................................................................................................................................................................... - sqrdmulh v7.4S, v7.4S, v6.S[1] // ..........................................................e......................................................................................................... // gap // .................................................................................................................................................................... 
+ sub v4.4S, v5.4S, v26.4S // ................................................................................................................*................................................... + mls v27.4S, v12.4S, v8.S[0] // .............................e...................................................................................................................................... + add v25.4S, v19.4S, v15.4S // .......................................................................................................*............................................................ // gap // .................................................................................................................................................................... - trn1 v14.4S, v20.4S, v25.4S // ..............................................................................................................................................*..................... - trn2 v5.4S, v20.4S, v25.4S // ...............................................................................................................................................*.................... - trn2 v4.4S, v0.4S, v24.4S // .............................................................................................................................................*...................... // gap // .................................................................................................................................................................... + mls v0.4S, v2.4S, v8.S[0] // .........................................................................................................................................*.......................... // gap // .................................................................................................................................................................... 
- str q9, [x2, #16] // .................................................................................................................................................................e.. - sqrdmulh v28.4S, v18.4S, v31.S[3] // .....................................................e.............................................................................................................. // gap // .................................................................................................................................................................... - trn1 v24.4S, v0.4S, v24.4S // ............................................................................................................................................*....................... + add v26.4S, v16.4S, v23.4S // ......................................................................................................................................*............................. + mul v29.4S, v25.4S, v29.4S // .........................................................................................................*.......................................................... + sqrdmulh v18.4S, v25.4S, v28.4S // ........................................................................................................*........................................................... + ldr q25, [x5, #80] // .............................................................................................e...................................................................... // gap // .................................................................................................................................................................... + add v5.4S, v31.4S, v27.4S // ...............................e.................................................................................................................................... 
+ sub v16.4S, v16.4S, v23.4S // .....................................................................................................................................*.............................. + ldr q2, [x5, #176] // .......................................................................................................................e............................................ // gap // .................................................................................................................................................................... - trn2 v21.2D, v4.2D, v5.2D // .................................................................................................................................................*.................. - mul v9.4S, v18.4S, v31.S[2] // ....................................................e............................................................................................................... - mls v15.4S, v7.4S, v8.S[0] // ...........................................................e........................................................................................................ + add v12.4S, v3.4S, v17.4S // ..................................................................................................*................................................................. + add v6.4S, v24.4S, v0.4S // ...........................................................................................................................................*........................ // gap // .................................................................................................................................................................... - trn1 v31.2D, v24.2D, v14.2D // ..................................................................................................................................................*................. 
// gap // .................................................................................................................................................................... - trn2 v6.2D, v16.2D, v27.2D // .........................................................................................................................................................e.......... - trn1 v0.2D, v4.2D, v5.2D // ...................................................................................................................................................*................ + sub v21.4S, v24.4S, v0.4S // ..........................................................................................................................................*......................... // gap // .................................................................................................................................................................... - str q21, [x1, #48] // ...............................................................................................................................................................*.... - mls v9.4S, v28.4S, v8.S[0] // ......................................................e............................................................................................................. // gap // .................................................................................................................................................................... - str q31, [x1], #128 // ............................................................................................................................................................*....... - trn2 v16.2D, v24.2D, v14.2D // ................................................................................................................................................*................... 
- // gap // .................................................................................................................................................................... - str q0, [x1, #-112] // .............................................................................................................................................................*...... - trn1 v1.4S, v11.4S, v26.4S // ......................................................................................................................................................e............. - add v13.4S, v13.4S, v29.4S // ....................................e............................................................................................................................... - add v3.4S, v22.4S, v15.4S // .............................................................e...................................................................................................... - sub v15.4S, v22.4S, v15.4S // ............................................................e....................................................................................................... - str q16, [x1, #-96] // ..............................................................................................................................................................*..... - ldr q19, [x5, #32] // ..........................................................................................e......................................................................... - - // original source code - // ldr q9, [x1, #(16*0)] // ..........................e........................................................................................................................................|..........................e....................................................................................................................................... 
- // ldr q10, [x1, #(16*1)] // ............................e......................................................................................................................................|............................e..................................................................................................................................... - // ldr q11, [x1, #(16*2)] // .....................e.............................................................................................................................................|.....................e............................................................................................................................................ - // ldr q12, [x1, #(16*3)] // .......................e...........................................................................................................................................|.......................e.......................................................................................................................................... - // ldr q13, [x2, #(16*0)] // ......e............................................................................................................................................................|......e........................................................................................................................................................... - // ldr q14, [x2, #(16*1)] // ........e..........................................................................................................................................................|........e......................................................................................................................................................... 
- // ldr q15, [x2, #(16*2)] // ....e..............................................................................................................................................................|....e............................................................................................................................................................. - // ldr q16, [x2, #(16*3)] // ..e................................................................................................................................................................|..e............................................................................................................................................................... - // ldr q0, [x4], #64 // e..................................................................................................................................................................|e................................................................................................................................................................. - // ldr q1, [x4, #(-64 + 16)] // ..................e................................................................................................................................................|..................e............................................................................................................................................... - // ldr q2, [x4, #(-64 + 32)] // ..............e....................................................................................................................................................|..............e................................................................................................................................................... 
- // ldr q3, [x4, #(-64 + 48)] // ..................................................e................................................................................................................|..................................................e............................................................................................................... - // mul v24.4s, v13.4s, v0.s[0] // .................................e.................................................................................................................................|.................................e................................................................................................................................ - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ...........................e.......................................................................................................................................|...........................e...................................................................................................................................... - // mls v24.4s, v13.4s, v8.s[0] // ............................................e......................................................................................................................|............................................e..................................................................................................................... - // sub v13.4s, v9.4s, v24.4s // .....................................................e.............................................................................................................|.....................................................e............................................................................................................ 
- // add v9.4s, v9.4s, v24.4s // ....................................................e..............................................................................................................|....................................................e............................................................................................................. - // mul v24.4s, v14.4s, v0.s[0] // ......................e............................................................................................................................................|......................e........................................................................................................................................... - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ........................e..........................................................................................................................................|........................e......................................................................................................................................... - // mls v24.4s, v14.4s, v8.s[0] // ...............................e...................................................................................................................................|...............................e.................................................................................................................................. - // sub v14.4s, v10.4s, v24.4s // ...................................................e...............................................................................................................|...................................................e.............................................................................................................. 
- // add v10.4s, v10.4s, v24.4s // ............................................................................................e......................................................................|............................................................................................e..................................................................... - // mul v24.4s, v15.4s, v0.s[0] // ...................e...............................................................................................................................................|...................e.............................................................................................................................................. - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ....................e..............................................................................................................................................|....................e............................................................................................................................................. - // mls v24.4s, v15.4s, v8.s[0] // ..............................e....................................................................................................................................|..............................e................................................................................................................................... - // sub v15.4s, v11.4s, v24.4s // .....................................e.............................................................................................................................|.....................................e............................................................................................................................ 
- // add v11.4s, v11.4s, v24.4s // ......................................e............................................................................................................................|......................................e........................................................................................................................... - // mul v24.4s, v16.4s, v0.s[0] // ................e..................................................................................................................................................|................e................................................................................................................................................. - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ...............e...................................................................................................................................................|...............e.................................................................................................................................................. - // mls v24.4s, v16.4s, v8.s[0] // .........................e.........................................................................................................................................|.........................e........................................................................................................................................ - // sub v16.4s, v12.4s, v24.4s // ...................................e...............................................................................................................................|...................................e.............................................................................................................................. 
- // add v12.4s, v12.4s, v24.4s // ...............................................................e...................................................................................................|...............................................................e.................................................................................................. - // mul v24.4s, v11.4s, v0.s[2] // ..........................................e........................................................................................................................|..........................................e....................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ...........................................e.......................................................................................................................|...........................................e...................................................................................................................... - // mls v24.4s, v11.4s, v8.s[0] // .................................................e.................................................................................................................|.................................................e................................................................................................................ - // sub v11.4s, v9.4s, v24.4s // ........................................................e..........................................................................................................|........................................................e......................................................................................................... 
- // add v9.4s, v9.4s, v24.4s // ..............................................................................................................................................................e....|..............................................................................................................................................................e... - // mul v24.4s, v12.4s, v0.s[2] // ..................................................................................................................e................................................|..................................................................................................................e............................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // ..................................................................................e................................................................................|..................................................................................e............................................................................... - // mls v24.4s, v12.4s, v8.s[0] // .............................................................................................................................e.....................................|.............................................................................................................................e.................................... - // sub v12.4s, v10.4s, v24.4s // .....................................................................................................................................e.............................|.....................................................................................................................................e............................ 
- // add v10.4s, v10.4s, v24.4s // ...................................................................................................................................e...............................|...................................................................................................................................e.............................. - // mul v24.4s, v15.4s, v1.s[0] // .............................................e.....................................................................................................................|.............................................e.................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ..............................................e....................................................................................................................|..............................................e................................................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // ......................................................e............................................................................................................|......................................................e........................................................................................................... - // sub v15.4s, v13.4s, v24.4s // ..................................................................e................................................................................................|..................................................................e............................................................................................... 
- // add v13.4s, v13.4s, v24.4s // ..............................................................e....................................................................................................|..............................................................e................................................................................................... - // mul v24.4s, v16.4s, v1.s[0] // ........................................e..........................................................................................................................|........................................e......................................................................................................................... - // sqrdmulh v16.4s, v16.4s, v1.s[1] // .........................................e.........................................................................................................................|.........................................e........................................................................................................................ - // mls v24.4s, v16.4s, v8.s[0] // ...............................................e...................................................................................................................|...............................................e.................................................................................................................. - // sub v16.4s, v14.4s, v24.4s // .........................................................e.........................................................................................................|.........................................................e........................................................................................................ 
- // add v14.4s, v14.4s, v24.4s // .......................................................e...........................................................................................................|.......................................................e.......................................................................................................... - // mul v24.4s, v10.4s, v1.s[2] // ...................................................................................................................................................e...............|...................................................................................................................................................e.............. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ................................................................................................................................................e..................|................................................................................................................................................e................. - // mls v24.4s, v10.4s, v8.s[0] // .........................................................................................................................................................e.........|.........................................................................................................................................................e........ - // sub v10.4s, v9.4s, v24.4s // ...*...............................................................................................................................................................|...*.............................................................................................................................................................. 
- // add v9.4s, v9.4s, v24.4s // ...................................................................................................................................................................*.................................................................................................................................................................. - // mul v24.4s, v12.4s, v2.s[0] // .........................................................................................................................................e.........................|.........................................................................................................................................e........................ - // sqrdmulh v12.4s, v12.4s, v2.s[1] // ...........................................................................................................................................e.......................|...........................................................................................................................................e...................... - // mls v24.4s, v12.4s, v8.s[0] // ....................................................................................................................................................e..............|....................................................................................................................................................e............. - // sub v12.4s, v11.4s, v24.4s // ................................................................................................................................................................e..|................................................................................................................................................................e. 
- // add v11.4s, v11.4s, v24.4s // ...............................................................................................................................................................e...|...............................................................................................................................................................e.. - // mul v24.4s, v14.4s, v2.s[2] // ...........................................................e.......................................................................................................|...........................................................e...................................................................................................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ..........................................................e........................................................................................................|..........................................................e....................................................................................................... - // mls v24.4s, v14.4s, v8.s[0] // .................................................................e.................................................................................................|.................................................................e................................................................................................ - // sub v14.4s, v13.4s, v24.4s // ........................................................................e..........................................................................................|........................................................................e......................................................................................... 
- // add v13.4s, v13.4s, v24.4s // .......................................................................e...........................................................................................|.......................................................................e.......................................................................................... - // mul v24.4s, v16.4s, v3.s[0] // ............................................................e......................................................................................................|............................................................e..................................................................................................... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // .............................................................e.....................................................................................................|.............................................................e.................................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ....................................................................e..............................................................................................|....................................................................e............................................................................................. - // sub v16.4s, v15.4s, v24.4s // ..........................................................................e........................................................................................|..........................................................................e....................................................................................... 
- // add v15.4s, v15.4s, v24.4s // .........................................................................e.........................................................................................|.........................................................................e........................................................................................ - // trn1 v25.4s, v9.4s, v10.4s // .............................*.....................................................................................................................................|.............................*.................................................................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ............*......................................................................................................................................................|............*..................................................................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // .....*.............................................................................................................................................................|.....*............................................................................................................................................................ - // trn2 v28.4s, v11.4s, v12.4s // .........*.........................................................................................................................................................|.........*........................................................................................................................................................ 
- // trn2 v11.2d, v25.2d, v27.2d // ..................................*................................................................................................................................|..................................*............................................................................................................................... - // trn2 v12.2d, v26.2d, v28.2d // .....................................................................*.............................................................................................|.....................................................................*............................................................................................ - // trn1 v9.2d, v25.2d, v27.2d // ...........................................................................*.......................................................................................|...........................................................................*...................................................................................... - // trn1 v10.2d, v26.2d, v28.2d // ...................................................................*...............................................................................................|...................................................................*.............................................................................................. - // trn1 v25.4s, v13.4s, v14.4s // .............................................................................e.....................................................................................|.............................................................................e.................................................................................... 
- // trn2 v26.4s, v13.4s, v14.4s // ............................................................................e......................................................................................|............................................................................e..................................................................................... - // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................e...................................................................................|...............................................................................e.................................................................................. - // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................e....................................................................................|..............................................................................e................................................................................... - // trn2 v15.2d, v25.2d, v27.2d // ....................................................................................e..............................................................................|....................................................................................e............................................................................. - // trn2 v16.2d, v26.2d, v28.2d // .................................................................................e.................................................................................|.................................................................................e................................................................................ 
- // trn1 v13.2d, v25.2d, v27.2d // ................................................................................................................e..................................................|................................................................................................................e................................................. - // trn1 v14.2d, v26.2d, v28.2d // ...................................................................................................e...............................................................|...................................................................................................e.............................................................. - // ldr q0, [x5], #(12*16) // .......................................*...........................................................................................................................|.......................................*.......................................................................................................................... - // ldr q4, [x5, #(-12*16 + 1*16)] // .................................................................................................e.................................................................|.................................................................................................e................................................................ - // ldr q1, [x5, #(-12*16 + 2*16)] // ..................................................................................................................................................................e|.................................................................................................................................................................. 
- // ldr q5, [x5, #(-12*16 + 3*16)] // ..........*........................................................................................................................................................|..........*....................................................................................................................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ..........................................................................................*........................................................................|..........................................................................................*....................................................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ................................................*..................................................................................................................|................................................*................................................................................................................. - // mul v24.4s, v11.4s, v0.4s // ........................................................................................*..........................................................................|........................................................................................*......................................................................... - // sqrdmulh v11.4s, v11.4s, v4.4s // ......................................................................*............................................................................................|......................................................................*........................................................................................... 
- // mls v24.4s, v11.4s, v8.s[0] // ................................................................................................*..................................................................|................................................................................................*................................................................. - // sub v11.4s, v9.4s, v24.4s // ...............................................................................................................................*...................................|...............................................................................................................................*.................................. - // add v9.4s, v9.4s, v24.4s // ....................................................................................................................................*..............................|....................................................................................................................................*............................. - // mul v24.4s, v12.4s, v0.4s // ...................................................................................*...............................................................................|...................................................................................*.............................................................................. - // sqrdmulh v12.4s, v12.4s, v4.4s // ................................................................................*..................................................................................|................................................................................*................................................................................. 
- // mls v24.4s, v12.4s, v8.s[0] // ...........................................................................................*.......................................................................|...........................................................................................*...................................................................... - // sub v12.4s, v10.4s, v24.4s // ......................................................................................................*............................................................|......................................................................................................*........................................................... - // add v10.4s, v10.4s, v24.4s // ....................................................................................................*..............................................................|....................................................................................................*............................................................. - // mul v24.4s, v10.4s, v1.4s // ............................................................................................................................*......................................|............................................................................................................................*..................................... - // sqrdmulh v10.4s, v10.4s, v5.4s // ...............................................................................................................*...................................................|...............................................................................................................*.................................................. 
- // mls v24.4s, v10.4s, v8.s[0] // ..................................................................................................................................*................................|..................................................................................................................................*............................... - // sub v10.4s, v9.4s, v24.4s // .......................................................................................................................................*...........................|.......................................................................................................................................*.......................... - // add v9.4s, v9.4s, v24.4s // ..........................................................................................................................................*........................|..........................................................................................................................................*....................... - // mul v24.4s, v12.4s, v2.4s // ......................................................................................................................*............................................|......................................................................................................................*........................................... - // sqrdmulh v12.4s, v12.4s, v6.4s // .........................................................................................................*.........................................................|.........................................................................................................*........................................................ 
- // mls v24.4s, v12.4s, v8.s[0] // ................................................................................................................................*..................................|................................................................................................................................*................................. - // sub v12.4s, v11.4s, v24.4s // ......................................................................................................................................*............................|......................................................................................................................................*........................... - // add v11.4s, v11.4s, v24.4s // ........................................................................................................................................*..........................|........................................................................................................................................*......................... - // ldr q0, [x5, #(-12*16 + 6*16)] // ................................e..................................................................................................................................|................................e................................................................................................................................. - // ldr q4, [x5, #(-12*16 + 7*16)] // ................................................................e..................................................................................................|................................................................e................................................................................................. 
- // ldr q1, [x5, #(-12*16 + 8*16)] // ...............................................................................................e...................................................................|...............................................................................................e.................................................................. - // ldr q5, [x5, #(-12*16 + 9*16)] // ..................................................................................................e................................................................|..................................................................................................e............................................................... - // ldr q2, [x5, #(-12*16 + 10*16)] // .....................................................................................................e.............................................................|.....................................................................................................e............................................................ - // ldr q6, [x5, #(-12*16 + 11*16)] // .........................................................................................e.........................................................................|.........................................................................................e........................................................................ - // mul v24.4s, v15.4s, v0.4s // .......................................................................................e...........................................................................|.......................................................................................e.......................................................................... 
- // sqrdmulh v15.4s, v15.4s, v4.4s // .............................................................................................e.....................................................................|.............................................................................................e.................................................................... - // mls v24.4s, v15.4s, v8.s[0] // .......................................................................................................e...........................................................|.......................................................................................................e.......................................................... - // sub v15.4s, v13.4s, v24.4s // .....................................................................................................................e.............................................|.....................................................................................................................e............................................ - // add v13.4s, v13.4s, v24.4s // ....................................................................................................................e..............................................|....................................................................................................................e............................................. - // mul v24.4s, v16.4s, v0.4s // .....................................................................................e.............................................................................|.....................................................................................e............................................................................ 
- // sqrdmulh v16.4s, v16.4s, v4.4s // ......................................................................................e............................................................................|......................................................................................e........................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ..............................................................................................e....................................................................|..............................................................................................e................................................................... - // sub v16.4s, v14.4s, v24.4s // ..........................................................................................................e........................................................|..........................................................................................................e....................................................... - // add v14.4s, v14.4s, v24.4s // ........................................................................................................e..........................................................|........................................................................................................e......................................................... - // mul v24.4s, v14.4s, v1.4s // ...........................................................................................................e.......................................................|...........................................................................................................e...................................................... 
- // sqrdmulh v14.4s, v14.4s, v5.4s // ............................................................................................................e......................................................|............................................................................................................e..................................................... - // mls v24.4s, v14.4s, v8.s[0] // .................................................................................................................e.................................................|.................................................................................................................e................................................ - // sub v14.4s, v13.4s, v24.4s // .......................................................................................................................e...........................................|.......................................................................................................................e.......................................... - // add v13.4s, v13.4s, v24.4s // ........................................................................................................................e..........................................|........................................................................................................................e......................................... - // mul v24.4s, v16.4s, v2.4s // .............................................................................................................e.....................................................|.............................................................................................................e.................................................... 
- // sqrdmulh v16.4s, v16.4s, v6.4s // ..............................................................................................................e....................................................|..............................................................................................................e................................................... - // mls v24.4s, v16.4s, v8.s[0] // ...................................................................................................................e...............................................|...................................................................................................................e.............................................. - // sub v16.4s, v15.4s, v24.4s // .........................................................................................................................e.........................................|.........................................................................................................................e........................................ - // add v15.4s, v15.4s, v24.4s // ..........................................................................................................................e........................................|..........................................................................................................................e....................................... - // trn1 v25.4s, v9.4s, v10.4s // .................................................................................................................................................*.................|.................................................................................................................................................*................ 
- // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................................*....................|..............................................................................................................................................*................... - // trn1 v27.4s, v11.4s, v12.4s // ............................................................................................................................................*......................|............................................................................................................................................*..................... - // trn2 v28.4s, v11.4s, v12.4s // .............................................................................................................................................*.....................|.............................................................................................................................................*.................... - // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................................................................................................*.......|...........................................................................................................................................................*...... - // trn2 v12.2d, v26.2d, v28.2d // ..................................................................................................................................................*................|..................................................................................................................................................*............... 
- // trn1 v9.2d, v25.2d, v27.2d // .....................................................................................................................................................*.............|.....................................................................................................................................................*............ - // trn1 v10.2d, v26.2d, v28.2d // .......................................................................................................................................................*...........|.......................................................................................................................................................*.......... - // trn1 v25.4s, v13.4s, v14.4s // .*.................................................................................................................................................................|.*................................................................................................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // ...........................................................................................................................e.......................................|...........................................................................................................................e...................................... - // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................................................................................e.....|.............................................................................................................................................................e.... 
- // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................e....................................|..............................................................................................................................e................................... - // trn2 v15.2d, v25.2d, v27.2d // ...........*.......................................................................................................................................................|...........*...................................................................................................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ......................................................................................................................................................e............|......................................................................................................................................................e........... - // trn1 v13.2d, v25.2d, v27.2d // .......*...........................................................................................................................................................|.......*.......................................................................................................................................................... - // trn1 v14.2d, v26.2d, v28.2d // .................................................................................................................................e.................................|.................................................................................................................................e................................ 
- // str q9, [x1], #128 // ..........................................................................................................................................................*........|..........................................................................................................................................................*....... - // str q10, [x1, #(-(128) + 16*1)] // ............................................................................................................................................................*......|............................................................................................................................................................*..... - // str q11, [x1, #(-(128) + 16*2)] // .................................................................................................................................................................*.|.................................................................................................................................................................* - // str q12, [x1, #(-(128) + 16*3)] // ........................................................................................................................................................*..........|........................................................................................................................................................*......... - // str q13, [x2], #128 // ....................................*..............................................................................................................................|....................................*............................................................................................................................. 
- // str q14, [x2, #(-(128) + 16*1)] // ...............................................................................................................................................e...................|...............................................................................................................................................e.................. - // str q15, [x2, #(-(128) + 16*2)] // .................*.................................................................................................................................................|.................*................................................................................................................................................ - // str q16, [x2, #(-(128) + 16*3)] // .............*.....................................................................................................................................................|.............*.................................................................................................................................................... + mls v29.4S, v18.4S, v8.S[0] // ..........................................................................................................*......................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v23.4S, v26.4S, v16.4S // .....................................................................................................................................................*.............. 
+ trn1 v3.4S, v26.4S, v16.4S // ....................................................................................................................................................*............... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v26.4S, v6.4S, v21.4S // .......................................................................................................................................................*............ + trn1 v28.4S, v6.4S, v21.4S // ......................................................................................................................................................*............. + // gap // .................................................................................................................................................................... + sub v6.4S, v12.4S, v29.4S // ...........................................................................................................*........................................................ + // gap // .................................................................................................................................................................... + add v15.4S, v12.4S, v29.4S // ............................................................................................................*....................................................... + trn2 v29.4S, v7.4S, v4.4S // ...............................................................................................................................................*.................... 
+ // gap // .................................................................................................................................................................... + ldr q18, [x4, #-16] // ...........e........................................................................................................................................................ + trn1 v21.2D, v3.2D, v28.2D // ..........................................................................................................................................................*......... + // gap // .................................................................................................................................................................... + trn2 v0.2D, v3.2D, v28.2D // ........................................................................................................................................................*........... + // gap // .................................................................................................................................................................... + trn1 v3.4S, v7.4S, v4.4S // ..............................................................................................................................................*..................... + trn2 v17.4S, v15.4S, v6.4S // .............................................................................................................................................*...................... + // gap // .................................................................................................................................................................... + str q21, [x2], #128 // ................................................................................................................................................................*... 
+ trn1 v12.2D, v23.2D, v26.2D // ...........................................................................................................................................................*........ + str q0, [x2, #-96] // ..................................................................................................................................................................*. + ldr q24, [x5, #112] // ...................................................................................................................e................................................ + trn1 v7.4S, v15.4S, v6.4S // ............................................................................................................................................*....................... + trn2 v21.2D, v23.2D, v26.2D // .........................................................................................................................................................*.......... + sqrdmulh v15.4S, v22.4S, v10.S[1] // ......................e............................................................................................................................................. + str q12, [x2, #-112] // .................................................................................................................................................................*.. + // gap // .................................................................................................................................................................... + trn2 v4.2D, v17.2D, v29.2D // .................................................................................................................................................*.................. 
+ str q21, [x2, #-80] // ...................................................................................................................................................................* + // gap // .................................................................................................................................................................... + mul v0.4S, v22.4S, v10.S[0] // .......................e............................................................................................................................................ + trn1 v16.2D, v17.2D, v29.2D // ...................................................................................................................................................*................ + str q4, [x1, #48] // ...............................................................................................................................................................*.... + ldr q28, [x5, #48] // ...........................................................................................e........................................................................ + trn1 v29.2D, v7.2D, v3.2D // ..................................................................................................................................................*................. + sub v23.4S, v31.4S, v27.4S // ..............................e..................................................................................................................................... + ldr q31, [x1, #160] // ..e................................................................................................................................................................. + str q16, [x1, #16] // .............................................................................................................................................................*...... 
+ trn2 v6.2D, v7.2D, v3.2D // ................................................................................................................................................*................... + mls v9.4S, v14.4S, v8.S[0] // ...................e................................................................................................................................................ + str q29, [x1], #128 // ............................................................................................................................................................*....... + ldr q17, [x5], #(12*16) // ........................................................................................e........................................................................... + mls v0.4S, v15.4S, v8.S[0] // ........................e........................................................................................................................................... + mul v3.4S, v23.4S, v11.S[0] // ................................................e................................................................................................................... + str q6, [x1, #-96] // ..............................................................................................................................................................*..... + ldr q12, [x5, #-96] // ..................................................................................................................e................................................. + add v7.4S, v30.4S, v1.4S // ................e................................................................................................................................................... + sqrdmulh v15.4S, v23.4S, v11.S[1] // ...............................................e.................................................................................................................... 
+ + // ----------------------------------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q9, [x1, #(16*0)] // ...............................................e....................................................................................................................'..............................................~................................................................................................................. + // ldr q10, [x1, #(16*1)] // ..........................................................................................................e.........................................................'.........................................................................................................~...................................................... + // ldr q11, [x1, #(16*2)] // ........................................................................................................................................................e...........'.......................................................................................................................................................~........ 
+ // ldr q12, [x1, #(16*3)] // .................................................................................e..................................................................................'................................................................................~............................................................................... + // ldr q13, [x2, #(16*0)] // ...............................................................e....................................................................................................'..............................................................~................................................................................................. + // ldr q14, [x2, #(16*1)] // e...................................................................................................................................................................~................................................................................................................................................................ + // ldr q15, [x2, #(16*2)] // ......e.............................................................................................................................................................'.....~.......................................................................................................................................................... + // ldr q16, [x2, #(16*3)] // ............................................................................................e.......................................................................'...........................................................................................~.................................................................... 
+ // ldr q0, [x4], #64 // ............e.......................................................................................................................................................'...........~.................................................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // ......................................................e.............................................................................................................'.....................................................~.......................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ........~...........................................................................................................................................................'.......*........................................................................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ...................................................................................................................................e................................'..................................................................................................................................~............................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ....................................................................................e...............................................................................'...................................................................................~............................................................................ 
+ // mul v24.4s, v13.4s, v0.s[0] // .....................................................................................e..............................................................................'....................................................................................~........................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................e.....................................................................'.............................................................................................~.................................................................. + // sub v13.4s, v9.4s, v24.4s // .......~............................................................................................................................................................'......*......................................................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ..................................................................................................................................................................e.'................................................................................................................................................................ + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...................................................................e................................................................................................'..................................................................~............................................................................................. 
+ // mul v24.4s, v14.4s, v0.s[0] // .....................e..............................................................................................................................................'....................~........................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................................................................................................e........'..........................................................................................................................................................~..... + // sub v14.4s, v10.4s, v24.4s // ..~.................................................................................................................................................................'.*.............................................................................................................................................................. + // add v10.4s, v10.4s, v24.4s // ..............~.....................................................................................................................................................'.............*.................................................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ..............................................................................................................................................e.....................'.............................................................................................................................................~.................. 
+ // mul v24.4s, v15.4s, v0.s[0] // ..................................................................................................................................................e.................'.................................................................................................................................................~.............. + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................................................e.....'.............................................................................................................................................................~.. + // sub v15.4s, v11.4s, v24.4s // .........~..........................................................................................................................................................'........*....................................................................................................................................................... + // add v11.4s, v11.4s, v24.4s // ....~...............................................................................................................................................................'...*............................................................................................................................................................ + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ........................................................................................................e...........................................................'.......................................................................................................~........................................................ 
+ // mul v24.4s, v16.4s, v0.s[0] // .....................................................................................................e..............................................................'....................................................................................................~........................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................e.....................................................'.............................................................................................................~.................................................. + // sub v16.4s, v12.4s, v24.4s // .......................................................................................................................................................e............'......................................................................................................................................................~......... + // add v12.4s, v12.4s, v24.4s // .....................................................................................................................e..............................................'....................................................................................................................~........................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ..........~.........................................................................................................................................................'.........*...................................................................................................................................................... 
+ // mul v24.4s, v11.4s, v0.s[2] // ...........~........................................................................................................................................................'..........*..................................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................~...........................................................................................................................................'.......................*........................................................................................................................................ + // sub v11.4s, v9.4s, v24.4s // ...............................~....................................................................................................................................'..............................*................................................................................................................................. + // add v9.4s, v9.4s, v24.4s // .................................~..................................................................................................................................'................................*............................................................................................................................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...~................................................................................................................................................................'..*............................................................................................................................................................. 
+ // mul v24.4s, v12.4s, v0.s[2] // .~..................................................................................................................................................................'*............................................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............~......................................................................................................................................................'............*................................................................................................................................................... + // sub v12.4s, v10.4s, v24.4s // .......................~............................................................................................................................................'......................*......................................................................................................................................... + // add v10.4s, v10.4s, v24.4s // ...................~................................................................................................................................................'..................*............................................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ..................~.................................................................................................................................................'.................*.............................................................................................................................................. 
+ // mul v24.4s, v15.4s, v1.s[0] // .................~..................................................................................................................................................'................*............................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................~.........................................................................................................................................'.........................*...................................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ...................................~................................................................................................................................'..................................*............................................................................................................................. + // add v13.4s, v13.4s, v24.4s // .............................................~......................................................................................................................'............................................*................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...................................................................................................................................................................e'................................................................................................................................................................ 
+ // mul v24.4s, v16.4s, v1.s[0] // ...............................................................................................................................................................e....'..............................................................................................................................................................~. + // mls v24.4s, v27.4s, v8.s[0] // .....~..............................................................................................................................................................'....*........................................................................................................................................................... + // sub v16.4s, v14.4s, v24.4s // ...............~....................................................................................................................................................'..............*................................................................................................................................................. + // add v14.4s, v14.4s, v24.4s // ................~...................................................................................................................................................'...............*................................................................................................................................................ + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ......................~.............................................................................................................................................'.....................*.......................................................................................................................................... 
+ // mul v24.4s, v10.4s, v1.s[2] // .........................~..........................................................................................................................................'........................*....................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................~...................................................................................................................................'...............................*................................................................................................................................ + // sub v10.4s, v9.4s, v24.4s // .......................................~............................................................................................................................'......................................*......................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ......................................~.............................................................................................................................'.....................................*.......................................................................................................................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...........................~........................................................................................................................................'..........................*..................................................................................................................................... 
+ // mul v24.4s, v12.4s, v2.s[0] // ..............................~.....................................................................................................................................'.............................*.................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................~...............................................................................................................................'...................................*............................................................................................................................ + // sub v12.4s, v11.4s, v24.4s // ..............................................~.....................................................................................................................'.............................................*.................................................................................................................. + // add v11.4s, v11.4s, v24.4s // ............................................~.......................................................................................................................'...........................................*.................................................................................................................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .............................~......................................................................................................................................'............................*................................................................................................................................... 
+ // mul v24.4s, v14.4s, v2.s[2] // .....................................~..............................................................................................................................'....................................*........................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................~.........................................................................................................................'.........................................*...................................................................................................................... + // sub v14.4s, v13.4s, v24.4s // ...................................................~................................................................................................................'..................................................*............................................................................................................. + // add v13.4s, v13.4s, v24.4s // ..................................................~.................................................................................................................'.................................................*.............................................................................................................. + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ............................~.......................................................................................................................................'...........................*.................................................................................................................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // ....................~...............................................................................................................................................'...................*............................................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ..................................~.................................................................................................................................'.................................*.............................................................................................................................. + // sub v16.4s, v15.4s, v24.4s // ........................................~...........................................................................................................................'.......................................*........................................................................................................................ + // add v15.4s, v15.4s, v24.4s // .........................................~..........................................................................................................................'........................................*....................................................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // .............................................................~......................................................................................................'............................................................*................................................................................................... 
+ // trn2 v26.4s, v9.4s, v10.4s // ...........................................~........................................................................................................................'..........................................*..................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ....................................................~...............................................................................................................'...................................................*............................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // .....................................................~..............................................................................................................'....................................................*........................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ....................................................................~...............................................................................................'...................................................................*............................................................................................ + // trn2 v12.2d, v26.2d, v28.2d // ..........................................................~.........................................................................................................'.........................................................*...................................................................................................... 
+ // trn1 v9.2d, v25.2d, v27.2d // .........................................................................~..........................................................................................'........................................................................*....................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...........................................................~........................................................................................................'..........................................................*..................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ......................................................................~.............................................................................................'.....................................................................*.......................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ........................................................~...........................................................................................................'.......................................................*........................................................................................................ + // trn1 v27.4s, v15.4s, v16.4s // .......................................................~............................................................................................................'......................................................*......................................................................................................... 
+ // trn2 v28.4s, v15.4s, v16.4s // .................................................~..................................................................................................................'................................................*............................................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................~.................................................................................'.................................................................................*.............................................................................. + // trn2 v16.2d, v26.2d, v28.2d // ............................................................~.......................................................................................................'...........................................................*.................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ................................................................................~...................................................................................'...............................................................................*................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // ...........................................................................~........................................................................................'..........................................................................*..................................................................................... 
+ // ldr q0, [ x5], #(12*16) // .............................................................................................................................................................e......'............................................................................................................................................................~... + // ldr q4, [x5, #(-12*16 + 1*16)] // ........................................................................................e...........................................................................'.......................................................................................~........................................................................ + // ldr q1, [ x5, #(-12*16 + 2*16)] // ................................................~...................................................................................................................'...............................................*................................................................................................................ + // ldr q5, [x5, #(-12*16 + 3*16)] // .....................................................................................................................................................e..............'....................................................................................................................................................~........... + // ldr q2, [ x5, #(-12*16 + 4*16)] // ..........................................................................~.........................................................................................'.........................................................................*...................................................................................... 
+ // ldr q6, [x5, #(-12*16 + 5*16)] // ....................................................................................................................e...............................................'...................................................................................................................~............................................ + // sqrdmulh v27.4s, v11.4s, v4.4s // ............................................................................~.......................................................................................'...........................................................................*.................................................................................... + // mul v24.4s, v11.4s, v0.4s // ..............................................................................~.....................................................................................'.............................................................................*.................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................................~........................................................................'..........................................................................................*..................................................................... + // sub v11.4s, v9.4s, v24.4s // ..................................................................................................~.................................................................'.................................................................................................*.............................................................. 
+ // add v9.4s, v9.4s, v24.4s // ........................................................................................................................~...........................................'.......................................................................................................................*........................................ + // sqrdmulh v27.4s, v12.4s, v4.4s // ................................................................~...................................................................................................'...............................................................*................................................................................................ + // mul v24.4s, v12.4s, v0.4s // ..............................................................~.....................................................................................................'.............................................................*.................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................~..............................................................................................'....................................................................*........................................................................................... + // sub v12.4s, v10.4s, v24.4s // .............................................................................~......................................................................................'............................................................................*................................................................................... 
+ // add v10.4s, v10.4s, v24.4s // ...............................................................................................................~....................................................'..............................................................................................................*................................................. + // sqrdmulh v27.4s, v10.4s, v5.4s // ...................................................................................................................~................................................'..................................................................................................................*............................................. + // mul v24.4s, v10.4s, v1.4s // ..................................................................................................................~.................................................'.................................................................................................................*.............................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................................................................~........................................'..........................................................................................................................*..................................... + // sub v10.4s, v9.4s, v24.4s // ................................................................................................................................~...................................'...............................................................................................................................*................................ 
+ // add v9.4s, v9.4s, v24.4s // .................................................................................................................................~..................................'................................................................................................................................*............................... + // sqrdmulh v27.4s, v12.4s, v6.4s // ......................................................................................~.............................................................................'.....................................................................................*.......................................................................... + // mul v24.4s, v12.4s, v2.4s // ...................................................................................~................................................................................'..................................................................................*............................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................................................................................................~...................................................................'...............................................................................................*................................................................ + // sub v12.4s, v11.4s, v24.4s // .............................................................................................................~......................................................'............................................................................................................*................................................... 
+ // add v11.4s, v11.4s, v24.4s // ......................................................................................................~.............................................................'.....................................................................................................*.......................................................... + // ldr q0, [ x5, #(-12*16 + 6*16)] // .................................................................................................................................................................e..'................................................................................................................................................................ + // ldr q4, [x5, #(-12*16 + 7*16)] // ...........................................................................................................................................e........................'..........................................................................................................................................~..................... + // ldr q1, [ x5, #(-12*16 + 8*16)] // .........................................................~..........................................................................................................'........................................................*....................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // .........................................................................................~..........................................................................'........................................................................................*....................................................................... 
+ // ldr q2, [ x5, #(-12*16 + 10*16)] // .......................................................................~............................................................................................'......................................................................*......................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .......................................................................................................................e............................................'......................................................................................................................~......................................... + // sqrdmulh v27.4s, v15.4s, v4.4s // .......................................................................................~............................................................................'......................................................................................*......................................................................... + // mul v24.4s, v15.4s, v0.4s // ..........................................................................................~.........................................................................'.........................................................................................*...................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................................................................~..................................................................'................................................................................................*............................................................... 
+ // sub v15.4s, v13.4s, v24.4s // .......................................................................................................~............................................................'......................................................................................................*......................................................... + // add v13.4s, v13.4s, v24.4s // ............................................................................................................~.......................................................'...........................................................................................................*.................................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // .................................................................~..................................................................................................'................................................................*............................................................................................... + // mul v24.4s, v16.4s, v0.4s // ..................................................................~.................................................................................................'.................................................................*.............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................~...........................................................................................'.......................................................................*........................................................................................ 
+ // sub v16.4s, v14.4s, v24.4s // ...............................................................................~....................................................................................'..............................................................................*................................................................................. + // add v14.4s, v14.4s, v24.4s // ...............................................................................................~....................................................................'..............................................................................................*................................................................. + // sqrdmulh v27.4s, v14.4s, v5.4s // ....................................................................................................~...............................................................'...................................................................................................*............................................................ + // mul v24.4s, v14.4s, v1.4s // ...................................................................................................~................................................................'..................................................................................................*............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................................................~........................................................'..........................................................................................................*..................................................... 
+ // sub v14.4s, v13.4s, v24.4s // ......................................................................................................................~.............................................'.....................................................................................................................*.......................................... + // add v13.4s, v13.4s, v24.4s // .................................................................................................................~..................................................'................................................................................................................*............................................... + // sqrdmulh v27.4s, v16.4s, v6.4s // .............................................................................................~......................................................................'............................................................................................*................................................................... + // mul v24.4s, v16.4s, v2.4s // .........................................................................................................~..........................................................'........................................................................................................*....................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................................................................................................~...................................................'...............................................................................................................*................................................ 
+ // sub v16.4s, v15.4s, v24.4s // ..........................................................................................................................~.........................................'.........................................................................................................................*...................................... + // add v15.4s, v15.4s, v24.4s // .........................................................................................................................~..........................................'........................................................................................................................*....................................... + // trn1 v25.4s, v9.4s, v10.4s // ............................................................................................................................................~.......................'...........................................................................................................................................*.................... + // trn2 v26.4s, v9.4s, v10.4s // .......................................................................................................................................~............................'......................................................................................................................................*......................... + // trn1 v27.4s, v11.4s, v12.4s // ......................................................................................................................................~.............................'.....................................................................................................................................*.......................... 
+ // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................~.................................'.................................................................................................................................*.............................. + // trn2 v11.2d, v25.2d, v27.2d // ..........................................................................................................................................................~.........'.........................................................................................................................................................*...... + // trn2 v12.2d, v26.2d, v28.2d // ................................................................................................................................................~...................'...............................................................................................................................................*................ + // trn1 v9.2d, v25.2d, v27.2d // ......................................................................................................................................................~.............'.....................................................................................................................................................*.......... + // trn1 v10.2d, v26.2d, v28.2d // ...................................................................................................................................................~................'..................................................................................................................................................*............. 
+ // trn1 v25.4s, v13.4s, v14.4s // .............................................................................................................................~......................................'............................................................................................................................*................................... + // trn2 v26.4s, v13.4s, v14.4s // ............................................................................................................................~.......................................'...........................................................................................................................*.................................... + // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................................................................~....................................'..............................................................................................................................*................................. + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................~.....................................'.............................................................................................................................*.................................. + // trn2 v15.2d, v25.2d, v27.2d // .....................................................................................................................................~..............................'....................................................................................................................................*........................... 
+ // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................................................................~......................'............................................................................................................................................*................... + // trn1 v13.2d, v25.2d, v27.2d // ....................................................................................................................................~...............................'...................................................................................................................................*............................ + // trn1 v14.2d, v26.2d, v28.2d // .........................................................................................................................................~..........................'........................................................................................................................................*....................... + // str q9, [x1], #128 // ............................................................................................................................................................~.......'...........................................................................................................................................................*.... + // str q10, [x1, #(-(128) + 16*1)] // .........................................................................................................................................................~..........'........................................................................................................................................................*....... 
+ // str q11, [x1, #(-(128) + 16*2)] // ................................................................................................................................................................~...'...............................................................................................................................................................* + // str q12, [x1, #(-(128) + 16*3)] // ....................................................................................................................................................~...............'...................................................................................................................................................*............ + // str q13, [x2], #128 // ........................................................................................................................................~...........................'.......................................................................................................................................*........................ + // str q14, [x2, #(-(128) + 16*1)] // ...............................................................................................................................................~....................'..............................................................................................................................................*................. + // str q15, [x2, #(-(128) + 16*2)] // ..........................................................................................................................................~.........................'.........................................................................................................................................*...................... 
+ // str q16, [x2, #(-(128) + 16*3)] // .................................................................................................................................................~..................'................................................................................................................................................*............... sub count, count, #1 cbnz count, layer45678_start - // gap // .................................................... - // gap // .................................................... - add v0.4S, v13.4S, v9.4S // *................................................... - sub v12.4S, v13.4S, v9.4S // ..*................................................. - // gap // .................................................... - // gap // .................................................... - trn2 v21.4S, v3.4S, v15.4S // .....*.............................................. - // gap // .................................................... - str q6, [x2, #48] // .........*.......................................... - ldr q4, [x5], #(12*16) // ..............*..................................... - trn1 v9.4S, v17.4S, v23.4S // .*.................................................. - trn2 v17.4S, v0.4S, v12.4S // ........*........................................... - trn1 v6.4S, v0.4S, v12.4S // ...........*........................................ - trn1 v29.4S, v3.4S, v15.4S // ...*................................................ - ldr q30, [x5, #-144] // ......*............................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn2 v24.2D, v17.2D, v21.2D // .................*.................................. - // gap // .................................................... - // gap // .................................................... 
- // gap // .................................................... - trn2 v16.2D, v6.2D, v29.2D // ............*....................................... - // gap // .................................................... - // gap // .................................................... - mul v11.4S, v24.4S, v4.4S // .....................*.............................. - sqrdmulh v12.4S, v24.4S, v2.4S // ....................*............................... - // gap // .................................................... - sqrdmulh v24.4S, v16.4S, v2.4S // ..................*................................. - ldr q3, [x5, #-112] // ...............*.................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn1 v20.2D, v17.2D, v21.2D // ................*................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v11.4S, v12.4S, v8.S[0] // ........................*........................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mul v17.4S, v16.4S, v4.4S // ......................*............................. - // gap // .................................................... - // gap // .................................................... - trn1 v16.2D, v6.2D, v29.2D // ...................*................................ - ldr q4, [x5, #-128] // .......................*............................ - // gap // .................................................... - // gap // .................................................... 
- add v10.4S, v20.4S, v11.4S // ..........................*......................... - // gap // .................................................... - sub v11.4S, v20.4S, v11.4S // ...........................*........................ - // gap // .................................................... - // gap // .................................................... - mls v17.4S, v24.4S, v8.S[0] // .........................*.......................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v18.4S, v10.4S, v30.4S // .............................*...................... - mul v27.4S, v10.4S, v19.4S // ...............................*.................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v13.4S, v11.4S, v3.4S // ............................*....................... - mul v6.4S, v11.4S, v4.4S // ..............................*..................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v26.4S, v16.4S, v17.4S // ...................................*................ - // gap // .................................................... - // gap // .................................................... - mls v27.4S, v18.4S, v8.S[0] // ..................................*................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v6.4S, v13.4S, v8.S[0] // .................................*.................. - // gap // .................................................... 
- // gap // .................................................... - sub v29.4S, v16.4S, v17.4S // ................................*................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v30.4S, v26.4S, v27.4S // .....................................*.............. - add v3.4S, v26.4S, v27.4S // .......................................*............ - // gap // .................................................... - // gap // .................................................... - sub v26.4S, v29.4S, v6.4S // ....................................*............... - add v15.4S, v29.4S, v6.4S // ......................................*............. - // gap // .................................................... - // gap // .................................................... - trn1 v24.2D, v9.2D, v1.2D // ....*............................................... - trn2 v1.2D, v9.2D, v1.2D // .......*............................................ - // gap // .................................................... - // gap // .................................................... - trn1 v21.4S, v15.4S, v26.4S // ........................................*........... - trn1 v23.4S, v3.4S, v30.4S // ...........................................*........ - // gap // .................................................... - // gap // .................................................... - trn2 v26.4S, v15.4S, v26.4S // .........................................*.......... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn2 v15.4S, v3.4S, v30.4S // ..........................................*......... - str q1, [x2, #32] // ..........*......................................... 
- trn2 v29.2D, v23.2D, v21.2D // .................................................*.. - // gap // .................................................... - str q24, [x2], #128 // .............*...................................... - trn1 v27.2D, v23.2D, v21.2D // .............................................*...... - // gap // .................................................... - // gap // .................................................... - str q29, [x1, #32] // ...................................................* - trn2 v16.2D, v15.2D, v26.2D // ............................................*....... - // gap // .................................................... - // gap // .................................................... - trn1 v13.2D, v15.2D, v26.2D // ..............................................*..... - str q27, [x1], #128 // ................................................*... - // gap // .................................................... - // gap // .................................................... - str q16, [x1, #-80] // ...............................................*.... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - str q13, [x1, #-112] // ..................................................*. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - - // original source code - // add v24.4S, v13.4S, v9.4S // *................................................... - // trn1 v16.4S, v17.4S, v23.4S // .....*.............................................. - // sub v7.4S, v13.4S, v9.4S // .*.................................................. - // trn1 v18.4S, v3.4S, v15.4S // ........*........................................... 
- // trn1 v9.2D, v16.2D, v1.2D // ....................................*............... - // trn2 v0.4S, v3.4S, v15.4S // ..*................................................. - // ldr q30, [x5, #48] // .........*.......................................... - // trn2 v3.2D, v16.2D, v1.2D // .....................................*.............. - // trn2 v25.4S, v24.4S, v7.4S // ......*............................................. - // str q6, [x2, #48] // ...*................................................ - // str q3, [x2, #32] // ..........................................*......... - // trn1 v17.4S, v24.4S, v7.4S // .......*............................................ - // trn2 v1.2D, v17.2D, v18.2D // ...........*........................................ - // str q9, [x2], #128 // ............................................*....... - // ldr q12, [x5], #(12*16) // ....*............................................... - // ldr q24, [x5, #-112] // ...............*.................................... - // trn1 v7.2D, v25.2D, v0.2D // ................*................................... - // trn2 v4.2D, v25.2D, v0.2D // ..........*......................................... - // sqrdmulh v0.4S, v1.4S, v2.4S // ..............*..................................... - // trn1 v15.2D, v17.2D, v18.2D // ...................*................................ - // sqrdmulh v5.4S, v4.4S, v2.4S // .............*...................................... - // mul v28.4S, v4.4S, v12.4S // ............*....................................... - // mul v4.4S, v1.4S, v12.4S // ..................*................................. - // ldr q1, [x5, #-128] // ....................*............................... - // mls v28.4S, v5.4S, v8.S[0] // .................*.................................. - // mls v4.4S, v0.4S, v8.S[0] // .......................*............................ - // add v5.4S, v7.4S, v28.4S // .....................*.............................. 
- // sub v25.4S, v7.4S, v28.4S // ......................*............................. - // sqrdmulh v28.4S, v25.4S, v24.4S // ..........................*......................... - // sqrdmulh v21.4S, v5.4S, v30.4S // ........................*........................... - // mul v20.4S, v25.4S, v1.4S // ...........................*........................ - // mul v30.4S, v5.4S, v19.4S // .........................*.......................... - // sub v14.4S, v15.4S, v4.4S // ...............................*.................... - // mls v20.4S, v28.4S, v8.S[0] // ..............................*..................... - // mls v30.4S, v21.4S, v8.S[0] // .............................*...................... - // add v28.4S, v15.4S, v4.4S // ............................*....................... - // sub v25.4S, v14.4S, v20.4S // ..................................*................. - // sub v24.4S, v28.4S, v30.4S // ................................*................... - // add v20.4S, v14.4S, v20.4S // ...................................*................ - // add v0.4S, v28.4S, v30.4S // .................................*.................. - // trn1 v14.4S, v20.4S, v25.4S // ......................................*............. - // trn2 v5.4S, v20.4S, v25.4S // ........................................*........... - // trn2 v4.4S, v0.4S, v24.4S // .........................................*.......... - // trn1 v24.4S, v0.4S, v24.4S // .......................................*............ - // trn2 v21.2D, v4.2D, v5.2D // ...............................................*.... - // trn1 v31.2D, v24.2D, v14.2D // .............................................*...... - // trn1 v0.2D, v4.2D, v5.2D // ................................................*... - // str q21, [x1, #48] // ..................................................*. - // str q31, [x1], #128 // .................................................*.. 
- // trn2 v16.2D, v24.2D, v14.2D // ...........................................*........ - // str q0, [x1, #-112] // ...................................................* - // str q16, [x1, #-96] // ..............................................*..... + // Instructions: 129 + // Expected cycles: 61 + // Expected IPC: 2.11 + // + // Wall time: 113.39s + // User time: 113.39s + // + // ------------------------------------------------------ original position -------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--- + add v16.4S, v31.4S, v0.4S // ...*............................................................................................................................. + mls v3.4S, v15.4S, v8.S[0] // ....*............................................................................................................................ + ldr q26, [x5, #-48] // .............................................................................*................................................... + ldr q22, [x5, #-64] // ...................................................*............................................................................. + sub v19.4S, v30.4S, v1.4S // .....*........................................................................................................................... + // gap // ................................................................................................................................. + sub v14.4S, v13.4S, v9.4S // .*............................................................................................................................... + ldr q23, [x5, #-32] // ...............................................................*................................................................. 
+ mul v30.4S, v16.4S, v10.S[2] // .........*....................................................................................................................... + sub v29.4S, v31.4S, v0.4S // .......*......................................................................................................................... + ldr q31, [x4, #-32] // ......*.......................................................................................................................... + // gap // ................................................................................................................................. + sub v27.4S, v14.4S, v3.4S // ............*.................................................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + add v15.4S, v14.4S, v3.4S // .............*................................................................................................................... + ldr q3, [x5, #-128] // ..................................................................*.............................................................. + // gap // ................................................................................................................................. + sqrdmulh v4.4S, v29.4S, v11.S[1] // ...............*................................................................................................................. + mul v6.4S, v29.4S, v11.S[0] // ..............*.................................................................................................................. 
+ mul v0.4S, v27.4S, v18.S[0] // .................*............................................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sqrdmulh v29.4S, v27.4S, v18.S[1] // ........................*........................................................................................................ + mul v21.4S, v15.4S, v31.S[2] // .................................*............................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + add v27.4S, v13.4S, v9.4S // ...........*..................................................................................................................... + sqrdmulh v9.4S, v15.4S, v31.S[3] // .........................*....................................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mls v6.4S, v4.4S, v8.S[0] // ......................*.......................................................................................................... + mls v0.4S, v29.4S, v8.S[0] // ..............................*.................................................................................................. 
+ // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sqrdmulh v14.4S, v16.4S, v10.S[3] // ........*........................................................................................................................ + sqrdmulh v4.4S, v5.4S, v10.S[3] // ..*.............................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v15.4S, v5.4S, v10.S[2] // *................................................................................................................................ + mls v21.4S, v9.4S, v8.S[0] // ......................................*.......................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v10.4S, v19.4S, v6.4S // ...............................*................................................................................................. + mls v30.4S, v14.4S, v8.S[0] // ....................*............................................................................................................ + // gap // ................................................................................................................................. 
+ // gap // ................................................................................................................................. + add v5.4S, v19.4S, v6.4S // .........................................*....................................................................................... + sub v18.4S, v10.4S, v0.4S // ....................................*............................................................................................ + add v14.4S, v10.4S, v0.4S // .....................................*........................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v1.4S, v5.4S, v21.4S // ..............................................*.................................................................................. + add v16.4S, v5.4S, v21.4S // .............................................*................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v5.4S, v14.4S, v18.4S // ............................................*.................................................................................... + // gap // ................................................................................................................................. + mls v15.4S, v4.4S, v8.S[0] // ..........*...................................................................................................................... 
+ // gap // ................................................................................................................................. + trn1 v10.4S, v16.4S, v1.4S // ..............................................................*.................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v21.4S, v16.4S, v1.4S // ..................................................*.............................................................................. + sub v13.4S, v7.4S, v30.4S // ...........................*..................................................................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn1 v19.4S, v14.4S, v18.4S // .................................................*............................................................................... + trn1 v16.2D, v21.2D, v5.2D // ...................................................................*............................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v1.2D, v21.2D, v5.2D // ......................................................*.......................................................................... 
+ // gap // ................................................................................................................................. + add v5.4S, v27.4S, v15.4S // ................*................................................................................................................ + // gap // ................................................................................................................................. + trn2 v18.2D, v10.2D, v19.2D // .........................................................................*....................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sqrdmulh v4.4S, v1.4S, v24.4S // ..........................................................*...................................................................... + mul v14.4S, v1.4S, v12.4S // ...........................................................*..................................................................... + sqrdmulh v29.4S, v18.4S, v24.4S // ............................................................................*.................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v6.4S, v27.4S, v15.4S // ...................*............................................................................................................. + // gap // ................................................................................................................................. 
+ // gap // ................................................................................................................................. + mul v0.4S, v5.4S, v11.S[2] // .....................*........................................................................................................... + sqrdmulh v1.4S, v5.4S, v11.S[3] // ..................*.............................................................................................................. + mls v14.4S, v4.4S, v8.S[0] // ................................................................*................................................................ + mul v5.4S, v18.4S, v12.4S // ..............................................................................*.................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v27.4S, v6.4S, v31.S[0] // ..........................*...................................................................................................... + sqrdmulh v4.4S, v6.4S, v31.S[1] // .......................*......................................................................................................... + trn1 v6.2D, v10.2D, v19.2D // ........................................................................*........................................................ + // gap // ................................................................................................................................. 
+ // gap // ................................................................................................................................. + mls v0.4S, v1.4S, v8.S[0] // ............................*.................................................................................................... + sub v1.4S, v16.4S, v14.4S // .......................................................................*......................................................... + mls v5.4S, v29.4S, v8.S[0] // ...................................................................................*............................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mls v27.4S, v4.4S, v8.S[0] // ................................*................................................................................................ + add v7.4S, v7.4S, v30.4S // .............................*................................................................................................... + mul v24.4S, v1.4S, v23.4S // .........................................................................................*....................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. 
+ sqrdmulh v18.4S, v1.4S, v2.4S // ................................................................................*................................................ + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v1.4S, v7.4S, v0.4S // ...................................*............................................................................................. + add v7.4S, v7.4S, v0.4S // ..................................*.............................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v31.4S, v13.4S, v27.4S // ..........................................*...................................................................................... + add v19.4S, v13.4S, v27.4S // ........................................*........................................................................................ + mls v24.4S, v18.4S, v8.S[0] // ..............................................................................................*.................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + add v15.4S, v16.4S, v14.4S // .................................................................................*............................................... 
+ // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v11.4S, v19.4S, v31.4S // ................................................*................................................................................ + trn2 v29.4S, v7.4S, v1.4S // .......................................*......................................................................................... + trn1 v4.4S, v7.4S, v1.4S // .......................................................*......................................................................... + // gap // ................................................................................................................................. + trn1 v16.4S, v19.4S, v31.4S // ...............................................*................................................................................. + // gap // ................................................................................................................................. + add v2.4S, v6.4S, v5.4S // ...........................................................................................*..................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v0.2D, v29.2D, v11.2D // ....................................................*............................................................................ + trn1 v9.2D, v29.2D, v11.2D // .....................................................*........................................................................... 
+ sub v1.4S, v6.4S, v5.4S // ........................................................................................*........................................ + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v29.4S, v0.4S, v17.4S // ........................................................*........................................................................ + sqrdmulh v31.4S, v0.4S, v20.4S // .........................................................*....................................................................... + add v18.4S, v1.4S, v24.4S // ....................................................................................................*............................ + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn2 v19.2D, v4.2D, v16.2D // ............................................................*.................................................................... + sub v7.4S, v1.4S, v24.4S // .....................................................................................................*........................... + sqrdmulh v1.4S, v15.4S, v26.4S // ......................................................................................*.......................................... 
+ // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v0.4S, v19.4S, v17.4S // ......................................................................*.......................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mls v29.4S, v31.4S, v8.S[0] // .............................................................*................................................................... + // gap // ................................................................................................................................. + sqrdmulh v21.4S, v19.4S, v20.4S // ....................................................................*............................................................ + mul v24.4S, v15.4S, v22.4S // .....................................................................................*........................................... + ldr q27, [x5, #-160] // ...........................................*..................................................................................... + trn1 v5.4S, v18.4S, v7.4S // ..........................................................................................................*...................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. 
+ trn2 v31.4S, v18.4S, v7.4S // .........................................................................................................*....................... + sub v6.4S, v9.4S, v29.4S // .....................................................................*........................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + add v7.4S, v9.4S, v29.4S // .............................................................................................*................................... + mls v0.4S, v21.4S, v8.S[0] // ...............................................................................*................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mls v24.4S, v1.4S, v8.S[0] // ..........................................................................................*...................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v26.4S, v7.4S, v27.4S // ................................................................................................*................................ + sqrdmulh v18.4S, v7.4S, v28.4S // .................................................................................................*............................... 
+ trn1 v12.2D, v4.2D, v16.2D // .................................................................*............................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mul v20.4S, v6.4S, v3.4S // ..........................................................................*...................................................... + sqrdmulh v30.4S, v6.4S, v25.4S // ...........................................................................*..................................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + sub v3.4S, v2.4S, v24.4S // ..................................................................................................*.............................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + mls v26.4S, v18.4S, v8.S[0] // ......................................................................................................*.......................... + add v18.4S, v2.4S, v24.4S // ...............................................................................................*................................. + // gap // ................................................................................................................................. 
+ // gap // ................................................................................................................................. + sub v28.4S, v12.4S, v0.4S // ....................................................................................*............................................ + add v6.4S, v12.4S, v0.4S // ...................................................................................................*............................. + mls v20.4S, v30.4S, v8.S[0] // ..................................................................................*.............................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn1 v27.4S, v18.4S, v3.4S // ........................................................................................................*........................ + trn2 v25.4S, v18.4S, v3.4S // .......................................................................................................*......................... + // gap // ................................................................................................................................. + sub v1.4S, v6.4S, v26.4S // ...........................................................................................................*..................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. 
+ add v26.4S, v6.4S, v26.4S // ............................................................................................................*.................... + trn2 v18.2D, v27.2D, v5.2D // ...............................................................................................................*................. + sub v7.4S, v28.4S, v20.4S // ............................................................................................*.................................... + add v9.4S, v28.4S, v20.4S // .......................................................................................*......................................... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn1 v20.2D, v27.2D, v5.2D // ..............................................................................................................*.................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn1 v12.2D, v25.2D, v31.2D // ...................................................................................................................*............. + // gap // ................................................................................................................................. + trn2 v0.4S, v9.4S, v7.4S // .............................................................................................................*................... 
+ trn2 v27.4S, v26.4S, v1.4S // .................................................................................................................*............... + str q18, [x2, #32] // ....................................................................................................................*............ + str q12, [x2, #16] // .......................................................................................................................*......... + trn2 v25.2D, v25.2D, v31.2D // ......................................................................................................................*.......... + trn1 v15.4S, v9.4S, v7.4S // ................................................................................................................*................ + // gap // ................................................................................................................................. + str q20, [x2], #128 // ..................................................................................................................*.............. + trn1 v26.4S, v26.4S, v1.4S // .....................................................................................................................*........... + // gap // ................................................................................................................................. + trn2 v5.2D, v27.2D, v0.2D // ........................................................................................................................*........ + trn1 v1.2D, v27.2D, v0.2D // ..........................................................................................................................*...... + str q25, [x2, #-80] // .........................................................................................................................*....... 
+ // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + trn1 v22.2D, v26.2D, v15.2D // ............................................................................................................................*.... + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + str q5, [x1, #48] // ...........................................................................................................................*..... + trn2 v15.2D, v26.2D, v15.2D // ..............................................................................................................................*.. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + str q1, [x1, #16] // .............................................................................................................................*... + // gap // ................................................................................................................................. + str q22, [x1], #128 // ...............................................................................................................................*. + // gap // ................................................................................................................................. 
+ // gap // ................................................................................................................................. + str q15, [x1, #-96] // ................................................................................................................................* + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + // gap // ................................................................................................................................. + + // --------------------------------------------------------- new position ---------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--- + // mul v27.4S, v5.4S, v10.S[2] // ........................*........................................................................................................ + // sub v26.4S, v13.4S, v9.4S // .....*........................................................................................................................... + // sqrdmulh v14.4S, v5.4S, v10.S[3] // .......................*......................................................................................................... + // add v29.4S, v31.4S, v0.4S // *................................................................................................................................ + // mls v3.4S, v15.4S, v8.S[0] // .*............................................................................................................................... 
+ // sub v1.4S, v30.4S, v1.4S // ....*............................................................................................................................ + // ldr q30, [x4, #-32] // .........*....................................................................................................................... + // sub v31.4S, v31.4S, v0.4S // ........*........................................................................................................................ + // sqrdmulh v16.4S, v29.4S, v10.S[3] // ......................*.......................................................................................................... + // mul v0.4S, v29.4S, v10.S[2] // .......*......................................................................................................................... + // mls v27.4S, v14.4S, v8.S[0] // ..................................*.............................................................................................. + // add v15.4S, v13.4S, v9.4S // ..................*.............................................................................................................. + // sub v4.4S, v26.4S, v3.4S // ..........*...................................................................................................................... + // add v29.4S, v26.4S, v3.4S // ...........*..................................................................................................................... + // mul v13.4S, v31.4S, v11.S[0] // ..............*.................................................................................................................. + // sqrdmulh v21.4S, v31.4S, v11.S[1] // .............*................................................................................................................... + // add v14.4S, v15.4S, v27.4S // .........................................*....................................................................................... 
+ // mul v26.4S, v4.4S, v18.S[0] // ...............*................................................................................................................. + // sqrdmulh v5.4S, v14.4S, v11.S[3] // ................................................*................................................................................ + // sub v31.4S, v15.4S, v27.4S // ..............................................*.................................................................................. + // mls v0.4S, v16.4S, v8.S[0] // ...........................*..................................................................................................... + // mul v3.4S, v14.4S, v11.S[2] // ...............................................*................................................................................. + // mls v13.4S, v21.4S, v8.S[0] // ....................*............................................................................................................ + // sqrdmulh v14.4S, v31.4S, v30.S[1] // ....................................................*............................................................................ + // sqrdmulh v21.4S, v4.4S, v18.S[1] // ................*................................................................................................................ + // sqrdmulh v15.4S, v29.4S, v30.S[3] // ...................*............................................................................................................. + // mul v27.4S, v31.4S, v30.S[0] // ...................................................*............................................................................. + // sub v11.4S, v7.4S, v0.4S // .....................................*........................................................................................... 
+ // mls v3.4S, v5.4S, v8.S[0] // ......................................................*.......................................................................... + // add v31.4S, v7.4S, v0.4S // ..........................................................*...................................................................... + // mls v26.4S, v21.4S, v8.S[0] // .....................*........................................................................................................... + // sub v16.4S, v1.4S, v13.4S // ..........................*...................................................................................................... + // mls v27.4S, v14.4S, v8.S[0] // .........................................................*....................................................................... + // mul v0.4S, v29.4S, v30.S[2] // .................*............................................................................................................... + // add v21.4S, v31.4S, v3.4S // ..............................................................*.................................................................. + // sub v3.4S, v31.4S, v3.4S // .............................................................*................................................................... + // sub v5.4S, v16.4S, v26.4S // .............................*................................................................................................... + // add v7.4S, v16.4S, v26.4S // ..............................*.................................................................................................. + // mls v0.4S, v15.4S, v8.S[0] // .........................*....................................................................................................... + // trn2 v6.4S, v21.4S, v3.4S // ....................................................................*............................................................ 
+ // add v14.4S, v11.4S, v27.4S // ................................................................*................................................................ + // add v31.4S, v1.4S, v13.4S // ............................*.................................................................................................... + // sub v26.4S, v11.4S, v27.4S // ...............................................................*................................................................. + // ldr q29, [x5, #-160] // .....................................................................................*........................................... + // trn2 v18.4S, v7.4S, v5.4S // .................................*............................................................................................... + // add v27.4S, v31.4S, v0.4S // ................................*................................................................................................ + // sub v16.4S, v31.4S, v0.4S // ...............................*................................................................................................. + // trn1 v4.4S, v14.4S, v26.4S // ......................................................................*.......................................................... + // trn2 v0.4S, v14.4S, v26.4S // ...................................................................*............................................................. + // trn1 v1.4S, v7.4S, v5.4S // ......................................*.......................................................................................... + // trn2 v26.4S, v27.4S, v16.4S // ....................................*............................................................................................ + // ldr q7, [x5, #-64] // ...*............................................................................................................................. 
+ // trn2 v31.2D, v6.2D, v0.2D // ........................................................................*........................................................ + // trn1 v19.2D, v6.2D, v0.2D // .........................................................................*....................................................... + // trn2 v14.2D, v26.2D, v18.2D // ........................................*........................................................................................ + // trn1 v5.4S, v21.4S, v3.4S // .....................................................................*........................................................... + // mul v15.4S, v31.4S, v17.4S // ...........................................................................*..................................................... + // sqrdmulh v3.4S, v31.4S, v20.4S // ............................................................................*.................................................... + // sqrdmulh v6.4S, v14.4S, v24.4S // ...........................................*..................................................................................... + // mul v0.4S, v14.4S, v12.4S // ............................................*.................................................................................... + // trn2 v31.2D, v5.2D, v4.2D // ..............................................................................*.................................................. + // mls v15.4S, v3.4S, v8.S[0] // ..................................................................................*.............................................. + // trn1 v16.4S, v27.4S, v16.4S // ...................................*............................................................................................. + // ldr q13, [x5, #-32] // ......*.......................................................................................................................... 
+ // mls v0.4S, v6.4S, v8.S[0] // .................................................*............................................................................... + // trn1 v3.2D, v5.2D, v4.2D // ..............................................................................................*.................................. + // ldr q5, [x5, #-128] // ............*.................................................................................................................... + // trn1 v23.2D, v26.2D, v18.2D // .......................................*......................................................................................... + // sqrdmulh v6.4S, v31.4S, v20.4S // ...................................................................................*............................................. + // sub v27.4S, v19.4S, v15.4S // ........................................................................................*........................................ + // mul v17.4S, v31.4S, v17.4S // .................................................................................*............................................... + // sub v18.4S, v23.4S, v0.4S // .......................................................*......................................................................... + // trn1 v4.2D, v16.2D, v1.2D // .....................................................*........................................................................... + // trn2 v16.2D, v16.2D, v1.2D // ..........................................*...................................................................................... + // mul v26.4S, v27.4S, v5.4S // ...............................................................................................*................................. + // sqrdmulh v25.4S, v27.4S, v25.4S // ................................................................................................*................................ 
+ // sqrdmulh v24.4S, v16.4S, v24.4S // .............................................*................................................................................... + // ldr q27, [x5, #-48] // ..*.............................................................................................................................. + // mul v21.4S, v16.4S, v12.4S // ..................................................*.............................................................................. + // mls v17.4S, v6.4S, v8.S[0] // ..........................................................................................*...................................... + // sqrdmulh v2.4S, v18.4S, v2.4S // ............................................................*.................................................................... + // add v16.4S, v23.4S, v0.4S // ..................................................................*.............................................................. + // mls v26.4S, v25.4S, v8.S[0] // ......................................................................................................*.......................... + // mls v21.4S, v24.4S, v8.S[0] // ........................................................*........................................................................ + // sub v5.4S, v3.4S, v17.4S // ....................................................................................................*............................ + // mul v23.4S, v16.4S, v7.4S // ....................................................................................*............................................ + // sqrdmulh v16.4S, v16.4S, v27.4S // ................................................................................*................................................ + // add v7.4S, v5.4S, v26.4S // .............................................................................................................*................... 
+ // sub v24.4S, v4.4S, v21.4S // ..........................................................................*...................................................... + // mul v0.4S, v18.4S, v13.4S // ...........................................................*..................................................................... + // mls v23.4S, v16.4S, v8.S[0] // ...........................................................................................*..................................... + // add v16.4S, v4.4S, v21.4S // .......................................................................*......................................................... + // sub v4.4S, v5.4S, v26.4S // ............................................................................................................*.................... + // add v25.4S, v19.4S, v15.4S // .........................................................................................*....................................... + // mls v0.4S, v2.4S, v8.S[0] // .................................................................*............................................................... + // add v26.4S, v16.4S, v23.4S // ...................................................................................................*............................. + // mul v29.4S, v25.4S, v29.4S // ............................................................................................*.................................... + // sqrdmulh v18.4S, v25.4S, v28.4S // .............................................................................................*................................... + // sub v16.4S, v16.4S, v23.4S // .................................................................................................*............................... + // add v12.4S, v3.4S, v17.4S // .....................................................................................................*........................... 
+ // add v6.4S, v24.4S, v0.4S // .............................................................................*................................................... + // sub v21.4S, v24.4S, v0.4S // ...............................................................................*................................................. + // mls v29.4S, v18.4S, v8.S[0] // ..................................................................................................*.............................. + // trn2 v23.4S, v26.4S, v16.4S // ........................................................................................................*........................ + // trn1 v3.4S, v26.4S, v16.4S // .......................................................................................................*......................... + // trn2 v26.4S, v6.4S, v21.4S // .......................................................................................*......................................... + // trn1 v28.4S, v6.4S, v21.4S // ......................................................................................*.......................................... + // sub v6.4S, v12.4S, v29.4S // .........................................................................................................*....................... + // add v15.4S, v12.4S, v29.4S // ..........................................................................................................*...................... + // trn2 v29.4S, v7.4S, v4.4S // ................................................................................................................*................ + // trn1 v21.2D, v3.2D, v28.2D // ..............................................................................................................*.................. + // trn2 v0.2D, v3.2D, v28.2D // ...........................................................................................................*..................... 
+ // trn1 v3.4S, v7.4S, v4.4S // .....................................................................................................................*........... + // trn2 v17.4S, v15.4S, v6.4S // .................................................................................................................*............... + // str q21, [x2], #128 // ......................................................................................................................*.......... + // trn1 v12.2D, v23.2D, v26.2D // ...............................................................................................................*................. + // str q0, [x2, #-96] // ..................................................................................................................*.............. + // trn1 v7.4S, v15.4S, v6.4S // .......................................................................................................................*......... + // trn2 v21.2D, v23.2D, v26.2D // ....................................................................................................................*............ + // str q12, [x2, #-112] // ...................................................................................................................*............. + // trn2 v4.2D, v17.2D, v29.2D // ........................................................................................................................*........ + // str q21, [x2, #-80] // ..........................................................................................................................*...... + // trn1 v16.2D, v17.2D, v29.2D // .........................................................................................................................*....... + // str q4, [x1, #48] // ............................................................................................................................*.... 
+ // trn1 v29.2D, v7.2D, v3.2D // ...........................................................................................................................*..... + // str q16, [x1, #16] // ..............................................................................................................................*.. + // trn2 v6.2D, v7.2D, v3.2D // .............................................................................................................................*... + // str q29, [x1], #128 // ...............................................................................................................................*. + // str q6, [x1, #-96] // ................................................................................................................................* pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a55.s b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a55.s index 85d49dbf..1a50ab66 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a55.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a55.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr 
qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -131,40 +103,40 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 
7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -251,7 
+223,7 @@ roots: .text .global ntt_dilithium_123_45678_opt_a55 - .global _ntt_dilithium_123_45678_opt_a55 + .global _ntt_dilithium_123_45678_opt_a55 // must name the underscore-prefixed entry label defined in this file .p2align 4 const_addr: .word 8380417 @@ -375,508 +347,543 @@ _ntt_dilithium_123_45678_opt_a55: load_roots_123 .p2align 2 - ldr_vo v31, x0, 0 // *......... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v21, x0, 128 // .*........ - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v17, x0, 256 // ..*....... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v29, x0, 384 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v9, x0, 512 // ....*..... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v12, x0, 896 // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v14.4S, v9.4S, v0.S[1] // ......*... - // gap // .......... - ldr_vo v6, x0, 640 // .....*.... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v18.4S, v12.4S, v0.S[1] // .........* - // gap // .......... - ldr_vo v27, x0, 768 // ........*. - // gap // .......... - - // original source code - // ldr_vo v31, x0, 0 // *......... || *................ - // ldr_vo v21, x0, 128 // .*........ || ..*.............. - // ldr_vo v17, x0, 256 // ..*....... || ....*............ - // ldr_vo v29, x0, 384 // ...*...... || ......*.......... - // ldr_vo v9, x0, 512 // ....*..... || ........*........ - // ldr_vo v6, x0, 640 // .......*.. || .............*... - // sqrdmulh v14.4S, v9.4S, v0.S[1] // ......*... || ............*.... - // ldr_vo v12, x0, 896 // .....*.... || ..........*...... - // ldr_vo v27, x0, 768 // .........* || ................* - // sqrdmulh v18.4S, v12.4S, v0.S[1] // ........*. || ...............*.
- + // Instructions: 10 + // Expected cycles: 17 + // Expected IPC: 0.59 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x0, #512] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #896] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q9, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #256] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q26, [x0, #384] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #640] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v12.4S, v20.4S, v0.S[0] // .........*.................... + // gap // .............................. + mul v19.4S, v6.4S, v0.S[0] // ......*....................... + // gap // .............................. + ldr q31, [x0, #768] // ........*..................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q9, [x0, #0] // ..*............................ 
+ // ldr q15, [x0, #128] // ...*........................... + // ldr q4, [x0, #256] // ....*.......................... + // ldr q26, [x0, #384] // .....*......................... + // ldr q6, [x0, #512] // *.............................. + // ldr q22, [x0, #640] // ......*........................ + // mul v19.4S, v6.4S, v0.S[0] // ........*...................... + // ldr q20, [x0, #896] // .*............................. + // ldr q31, [x0, #768] // .........*..................... + // mul v12.4S, v20.4S, v0.S[0] // .......*....................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v9.4S, v9.4S, v0.S[0] // ........*................................................................... - // gap // ............................................................................ - mul v30.4S, v6.4S, v0.S[0] // .............*.............................................................. - // gap // ............................................................................ - sqrdmulh v20.4S, v6.4S, v0.S[1] // ..............*............................................................. - // gap // ............................................................................ - mul v25.4S, v27.4S, v0.S[0] // ..................*......................................................... - // gap // ............................................................................ - mls v9.4S, v14.4S, v8.S[0] // ..........*................................................................. - // gap // ............................................................................ - sqrdmulh v6.4S, v27.4S, v0.S[1] // ...................*........................................................ - // gap // ............................................................................ - mls v30.4S, v20.4S, v8.S[0] // ...............*............................................................ 
- // gap // ............................................................................ - mul v20.4S, v12.4S, v0.S[0] // .......................*.................................................... - // gap // ............................................................................ - sub v14.4S, v31.4S, v9.4S // ...........*................................................................ - // gap // ............................................................................ - mls v25.4S, v6.4S, v8.S[0] // ....................*....................................................... - // gap // ............................................................................ - add v9.4S, v31.4S, v9.4S // ............*............................................................... - // gap // ............................................................................ - sub v6.4S, v21.4S, v30.4S // ................*........................................................... - // gap // ............................................................................ - add v30.4S, v21.4S, v30.4S // .................*.......................................................... - // gap // ............................................................................ - sub v12.4S, v17.4S, v25.4S // .....................*...................................................... - // gap // ............................................................................ - add v25.4S, v17.4S, v25.4S // ......................*..................................................... - // gap // ............................................................................ - mls v20.4S, v18.4S, v8.S[0] // .........................*.................................................. - // gap // ............................................................................ - mul v17.4S, v12.4S, v1.S[0] // ......................................*..................................... 
- // gap // ............................................................................ - sqrdmulh v12.4S, v12.4S, v1.S[1] // .......................................*.................................... - // gap // ............................................................................ - mul v31.4S, v25.4S, v0.S[2] // ............................*............................................... - // gap // ............................................................................ - sub v21.4S, v29.4S, v20.4S // ..........................*................................................. - // gap // ............................................................................ - add v20.4S, v29.4S, v20.4S // ...........................*................................................ - // gap // ............................................................................ - mls v17.4S, v12.4S, v8.S[0] // ........................................*................................... - // gap // ............................................................................ - sqrdmulh v25.4S, v25.4S, v0.S[3] // .............................*.............................................. - // gap // ............................................................................ - mul v12.4S, v21.4S, v1.S[0] // ...........................................*................................ - // gap // ............................................................................ - sqrdmulh v21.4S, v21.4S, v1.S[1] // ............................................*............................... - // gap // ............................................................................ - sub v29.4S, v14.4S, v17.4S // .........................................*.................................. - // gap // ............................................................................ 
- add v14.4S, v14.4S, v17.4S // ..........................................*................................. - // gap // ............................................................................ - mls v31.4S, v25.4S, v8.S[0] // ..............................*............................................. - // gap // ............................................................................ - mul v25.4S, v20.4S, v0.S[2] // .................................*.......................................... - // gap // ............................................................................ - mls v12.4S, v21.4S, v8.S[0] // .............................................*.............................. - // gap // ............................................................................ - sqrdmulh v20.4S, v20.4S, v0.S[3] // ..................................*......................................... - // gap // ............................................................................ - sub v17.4S, v9.4S, v31.4S // ...............................*............................................ - // gap // ............................................................................ - add v9.4S, v9.4S, v31.4S // ................................*........................................... - // gap // ............................................................................ - sub v31.4S, v6.4S, v12.4S // ..............................................*............................. - // gap // ............................................................................ - add v6.4S, v6.4S, v12.4S // ...............................................*............................ - // gap // ............................................................................ - mls v25.4S, v20.4S, v8.S[0] // ...................................*........................................ - // gap // ............................................................................ 
- mul v20.4S, v31.4S, v3.S[0] // ...............................................................*............ - // gap // ............................................................................ - mul v12.4S, v6.4S, v2.S[2] // ..........................................................*................. - // gap // ............................................................................ - sqrdmulh v6.4S, v6.4S, v2.S[3] // ...........................................................*................ - // gap // ............................................................................ - sub v21.4S, v30.4S, v25.4S // ....................................*....................................... - // gap // ............................................................................ - add v30.4S, v30.4S, v25.4S // .....................................*...................................... - // gap // ............................................................................ - sqrdmulh v25.4S, v31.4S, v3.S[1] // ................................................................*........... - // gap // ............................................................................ - mul v31.4S, v21.4S, v2.S[0] // .....................................................*...................... - // gap // ............................................................................ - mul v27.4S, v30.4S, v1.S[2] // ................................................*........................... - // gap // ............................................................................ - sqrdmulh v30.4S, v30.4S, v1.S[3] // .................................................*.......................... - // gap // ............................................................................ - sqrdmulh v21.4S, v21.4S, v2.S[1] // ......................................................*..................... 
- // gap // ............................................................................ - mls v12.4S, v6.4S, v8.S[0] // ............................................................*............... - // gap // ............................................................................ - mls v20.4S, v25.4S, v8.S[0] // .................................................................*.......... - // gap // ............................................................................ - mls v27.4S, v30.4S, v8.S[0] // ..................................................*......................... - // gap // ............................................................................ - mls v31.4S, v21.4S, v8.S[0] // .......................................................*.................... - // gap // ............................................................................ - sub v30.4S, v14.4S, v12.4S // .............................................................*.............. - // gap // ............................................................................ - sub v25.4S, v29.4S, v20.4S // ..................................................................*......... - // gap // ............................................................................ - add v20.4S, v29.4S, v20.4S // ...................................................................*........ - // gap // ............................................................................ - add v6.4S, v14.4S, v12.4S // ..............................................................*............. - // gap // ............................................................................ - sub v14.4S, v9.4S, v27.4S // ...................................................*........................ - // gap // ............................................................................ - add v9.4S, v9.4S, v27.4S // ....................................................*....................... 
- // gap // ............................................................................ - sub v12.4S, v17.4S, v31.4S // ........................................................*................... - // gap // ............................................................................ - add v17.4S, v17.4S, v31.4S // .........................................................*.................. - // gap // ............................................................................ - str_vi v9, x0, 16 // ....................................................................*....... - // gap // ............................................................................ - ldr_vo v31, x0, 0 // e........................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v14, x0, 112 // .....................................................................*...... - // gap // ............................................................................ - ldr_vo v21, x0, 128 // .e.......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v17, x0, 240 // ......................................................................*..... - // gap // ............................................................................ - ldr_vo v17, x0, 256 // ..e......................................................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - str_vo v12, x0, 368 // .......................................................................*.... - // gap // ............................................................................ - ldr_vo v29, x0, 384 // ...e........................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v6, x0, 496 // ........................................................................*... - // gap // ............................................................................ - ldr_vo v9, x0, 512 // ....e....................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v30, x0, 624 // .........................................................................*.. - // gap // ............................................................................ - ldr_vo v6, x0, 640 // .....e...................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v20, x0, 752 // ..........................................................................*. 
- // gap // ............................................................................ - sqrdmulh v14.4S, v9.4S, v0.S[1] // .........e.................................................................. - // gap // ............................................................................ - str_vo v25, x0, 880 // ...........................................................................* - // gap // ............................................................................ - ldr_vo v12, x0, 896 // .......e.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr_vo v27, x0, 768 // ......e..................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v18.4S, v12.4S, v0.S[1] // ........................e................................................... - // gap // ............................................................................ - - // original source code - // ldr_vo v9, x0, 0 // e......................................................................................... || e....................................................................................................... - // ldr_vo v10, x0, 128 // ..e....................................................................................... || ...e.................................................................................................... 
- // ldr_vo v11, x0, 256 // ....e..................................................................................... || ......e................................................................................................. - // ldr_vo v12, x0, 384 // ......e................................................................................... || .........e.............................................................................................. - // ldr_vo v13, x0, 512 // ........e................................................................................. || ............e........................................................................................... - // ldr_vo v14, x0, 640 // ..........e............................................................................... || ...............e........................................................................................ - // ldr_vo v15, x0, 768 // ...............e.......................................................................... || ......................e................................................................................. - // ldr_vo v16, x0, 896 // ..............e........................................................................... || ....................e................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // .................*........................................................................ || .........................*.............................................................................. - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ............e............................................................................. || ..................e..................................................................................... - // mls v24.4S, v13.4S, v8.S[0] // .....................*.................................................................... 
|| .............................*.......................................................................... - // sub v13.4S, v9.4S, v24.4S // .........................*................................................................ || .................................*...................................................................... - // add v9.4S, v9.4S, v24.4S // ...........................*.............................................................. || ...................................*.................................................................... - // mul v24.4S, v14.4S, v0.S[0] // ..................*....................................................................... || ..........................*............................................................................. - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ...................*...................................................................... || ...........................*............................................................................ - // mls v24.4S, v14.4S, v8.S[0] // .......................*.................................................................. || ...............................*........................................................................ - // sub v14.4S, v10.4S, v24.4S // ............................*............................................................. || ....................................*................................................................... - // add v10.4S, v10.4S, v24.4S // .............................*............................................................ || .....................................*.................................................................. - // mul v24.4S, v15.4S, v0.S[0] // ....................*..................................................................... || ............................*........................................................................... 
- // sqrdmulh v15.4S, v15.4S, v0.S[1] // ......................*................................................................... || ..............................*......................................................................... - // mls v24.4S, v15.4S, v8.S[0] // ..........................*............................................................... || ..................................*..................................................................... - // sub v15.4S, v11.4S, v24.4S // ..............................*........................................................... || ......................................*................................................................. - // add v11.4S, v11.4S, v24.4S // ...............................*.......................................................... || .......................................*................................................................ - // mul v24.4S, v16.4S, v0.S[0] // ........................*................................................................. || ................................*....................................................................... - // sqrdmulh v16.4S, v16.4S, v0.S[1] // ................e......................................................................... || ........................e............................................................................... - // mls v24.4S, v16.4S, v8.S[0] // ................................*......................................................... || ........................................*............................................................... - // sub v16.4S, v12.4S, v24.4S // ....................................*..................................................... || ............................................*........................................................... 
- // add v12.4S, v12.4S, v24.4S // .....................................*.................................................... || .............................................*.......................................................... - // mul v24.4S, v11.4S, v0.S[2] // ...................................*...................................................... || ...........................................*............................................................ - // sqrdmulh v11.4S, v11.4S, v0.S[3] // .......................................*.................................................. || ...............................................*........................................................ - // mls v24.4S, v11.4S, v8.S[0] // ............................................*............................................. || ....................................................*................................................... - // sub v11.4S, v9.4S, v24.4S // ................................................*......................................... || ........................................................*............................................... - // add v9.4S, v9.4S, v24.4S // .................................................*........................................ || .........................................................*.............................................. - // mul v24.4S, v12.4S, v0.S[2] // .............................................*............................................ || .....................................................*.................................................. - // sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................................*.......................................... || .......................................................*................................................ 
- // mls v24.4S, v12.4S, v8.S[0] // ....................................................*..................................... || ............................................................*........................................... - // sub v12.4S, v10.4S, v24.4S // ........................................................*................................. || ................................................................*....................................... - // add v10.4S, v10.4S, v24.4S // .........................................................*................................ || .................................................................*...................................... - // mul v24.4S, v15.4S, v1.S[0] // .................................*........................................................ || .........................................*.............................................................. - // sqrdmulh v15.4S, v15.4S, v1.S[1] // ..................................*....................................................... || ..........................................*............................................................. - // mls v24.4S, v15.4S, v8.S[0] // ......................................*................................................... || ..............................................*......................................................... - // sub v15.4S, v13.4S, v24.4S // ..........................................*............................................... || ..................................................*..................................................... - // add v13.4S, v13.4S, v24.4S // ...........................................*.............................................. || ...................................................*.................................................... 
- // mul v24.4S, v16.4S, v1.S[0] // ........................................*................................................. || ................................................*....................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[1] // .........................................*................................................ || .................................................*...................................................... - // mls v24.4S, v16.4S, v8.S[0] // ..............................................*........................................... || ......................................................*................................................. - // sub v16.4S, v14.4S, v24.4S // ..................................................*....................................... || ..........................................................*............................................. - // add v14.4S, v14.4S, v24.4S // ...................................................*...................................... || ...........................................................*............................................ - // mul v24.4S, v10.4S, v1.S[2] // ............................................................*............................. || ....................................................................*................................... - // sqrdmulh v10.4S, v10.4S, v1.S[3] // .............................................................*............................ || .....................................................................*.................................. - // mls v24.4S, v10.4S, v8.S[0] // .................................................................*........................ || .........................................................................*.............................. 
- // sub v10.4S, v9.4S, v24.4S // .......................................................................*.................. || ...............................................................................*........................ - // add v9.4S, v9.4S, v24.4S // ........................................................................*................. || ................................................................................*....................... - // mul v24.4S, v12.4S, v2.S[0] // ...........................................................*.............................. || ...................................................................*.................................... - // sqrdmulh v12.4S, v12.4S, v2.S[1] // ..............................................................*........................... || ......................................................................*................................. - // mls v24.4S, v12.4S, v8.S[0] // ..................................................................*....................... || ..........................................................................*............................. - // sub v12.4S, v11.4S, v24.4S // .........................................................................*................ || .................................................................................*...................... - // add v11.4S, v11.4S, v24.4S // ..........................................................................*............... || ..................................................................................*..................... - // mul v24.4S, v14.4S, v2.S[2] // ......................................................*................................... || ..............................................................*......................................... 
- // sqrdmulh v14.4S, v14.4S, v2.S[3] // .......................................................*.................................. || ...............................................................*........................................ - // mls v24.4S, v14.4S, v8.S[0] // ...............................................................*.......................... || .......................................................................*................................ - // sub v14.4S, v13.4S, v24.4S // ...................................................................*...................... || ...........................................................................*............................ - // add v13.4S, v13.4S, v24.4S // ......................................................................*................... || ..............................................................................*......................... - // mul v24.4S, v16.4S, v3.S[0] // .....................................................*.................................... || .............................................................*.......................................... - // sqrdmulh v16.4S, v16.4S, v3.S[1] // ..........................................................*............................... || ..................................................................*..................................... - // mls v24.4S, v16.4S, v8.S[0] // ................................................................*......................... || ........................................................................*............................... - // sub v16.4S, v15.4S, v24.4S // ....................................................................*..................... || ............................................................................*........................... 
- // add v15.4S, v15.4S, v24.4S // .....................................................................*.................... || .............................................................................*.......................... - // str_vi v9, x0, 16 // ...........................................................................*.............. || ...................................................................................*.................... - // str_vo v10, x0, 112 // .............................................................................*............ || ......................................................................................*................. - // str_vo v11, x0, 240 // ...............................................................................*.......... || .........................................................................................*.............. - // str_vo v12, x0, 368 // .................................................................................*........ || ............................................................................................*........... - // str_vo v13, x0, 496 // ...................................................................................*...... || ...............................................................................................*........ - // str_vo v14, x0, 624 // .....................................................................................*.... || ..................................................................................................*..... - // str_vo v15, x0, 752 // .......................................................................................*.. || .....................................................................................................*.. 
- // str_vo v16, x0, 880 // .........................................................................................* || .......................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Wall time: 4.55s + // User time: 4.55s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v6.4S, v6.4S, v0.S[1] // ........*................................................................... + // gap // ............................................................................ + sqrdmulh v18.4S, v22.4S, v0.S[1] // .............*.............................................................. + // gap // ............................................................................ + mul v16.4S, v22.4S, v0.S[0] // ..............*............................................................. + // gap // ............................................................................ + sqrdmulh v22.4S, v31.4S, v0.S[1] // ..................*......................................................... + // gap // ............................................................................ + mls v19.4S, v6.4S, v8.S[0] // ..........*................................................................. + // gap // ............................................................................ + mul v6.4S, v31.4S, v0.S[0] // ...................*........................................................ + // gap // ............................................................................ + mls v16.4S, v18.4S, v8.S[0] // ...............*............................................................ + // gap // ............................................................................ 
+ sqrdmulh v18.4S, v20.4S, v0.S[1] // .......................*.................................................... + // gap // ............................................................................ + sub v20.4S, v9.4S, v19.4S // ...........*................................................................ + // gap // ............................................................................ + mls v6.4S, v22.4S, v8.S[0] // ....................*....................................................... + // gap // ............................................................................ + add v22.4S, v9.4S, v19.4S // ............*............................................................... + // gap // ............................................................................ + sub v19.4S, v15.4S, v16.4S // ................*........................................................... + // gap // ............................................................................ + add v16.4S, v15.4S, v16.4S // .................*.......................................................... + // gap // ............................................................................ + sub v31.4S, v4.4S, v6.4S // .....................*...................................................... + // gap // ............................................................................ + add v6.4S, v4.4S, v6.4S // ......................*..................................................... + // gap // ............................................................................ + mls v12.4S, v18.4S, v8.S[0] // .........................*.................................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v31.4S, v1.S[1] // ......................................*..................................... + // gap // ............................................................................ 
+ mul v31.4S, v31.4S, v1.S[0] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v9.4S, v6.4S, v0.S[3] // ............................*............................................... + // gap // ............................................................................ + sub v15.4S, v26.4S, v12.4S // ..........................*................................................. + // gap // ............................................................................ + add v12.4S, v26.4S, v12.4S // ...........................*................................................ + // gap // ............................................................................ + mls v31.4S, v18.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + mul v6.4S, v6.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v15.4S, v1.S[1] // ...........................................*................................ + // gap // ............................................................................ + mul v15.4S, v15.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + sub v4.4S, v20.4S, v31.4S // .........................................*.................................. + // gap // ............................................................................ + add v20.4S, v20.4S, v31.4S // ..........................................*................................. + // gap // ............................................................................ 
+ mls v6.4S, v9.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v31.4S, v12.4S, v0.S[3] // .................................*.......................................... + // gap // ............................................................................ + mls v15.4S, v18.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + mul v18.4S, v12.4S, v0.S[2] // ..................................*......................................... + // gap // ............................................................................ + sub v12.4S, v22.4S, v6.4S // ...............................*............................................ + // gap // ............................................................................ + add v6.4S, v22.4S, v6.4S // ................................*........................................... + // gap // ............................................................................ + sub v22.4S, v19.4S, v15.4S // ..............................................*............................. + // gap // ............................................................................ + add v19.4S, v19.4S, v15.4S // ...............................................*............................ + // gap // ............................................................................ + mls v18.4S, v31.4S, v8.S[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v22.4S, v3.S[1] // ...............................................................*............ + // gap // ............................................................................ 
+ sqrdmulh v9.4S, v19.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + mul v19.4S, v19.4S, v2.S[2] // ...........................................................*................ + // gap // ............................................................................ + sub v15.4S, v16.4S, v18.4S // ....................................*....................................... + // gap // ............................................................................ + add v18.4S, v16.4S, v18.4S // .....................................*...................................... + // gap // ............................................................................ + mul v16.4S, v22.4S, v3.S[0] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v22.4S, v15.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v26.4S, v18.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + mul v18.4S, v18.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ + mul v15.4S, v15.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ + mls v19.4S, v9.4S, v8.S[0] // ............................................................*............... 
+ // gap // ............................................................................ + mls v16.4S, v31.4S, v8.S[0] // .................................................................*.......... + // gap // ............................................................................ + mls v18.4S, v26.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + mls v15.4S, v22.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + sub v22.4S, v20.4S, v19.4S // .............................................................*.............. + // gap // ............................................................................ + sub v31.4S, v4.4S, v16.4S // ..................................................................*......... + // gap // ............................................................................ + add v16.4S, v4.4S, v16.4S // ...................................................................*........ + // gap // ............................................................................ + add v19.4S, v20.4S, v19.4S // ..............................................................*............. + // gap // ............................................................................ + sub v20.4S, v6.4S, v18.4S // ...................................................*........................ + // gap // ............................................................................ + add v6.4S, v6.4S, v18.4S // ....................................................*....................... + // gap // ............................................................................ + sub v18.4S, v12.4S, v15.4S // ........................................................*................... 
+ // gap // ............................................................................ + add v12.4S, v12.4S, v15.4S // .........................................................*.................. + // gap // ............................................................................ + str q6, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + ldr q9, [x0, #0] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q20, [x0, #112] // .....................................................................*...... + // gap // ............................................................................ + ldr q15, [x0, #128] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q12, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + ldr q4, [x0, #256] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q18, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + ldr q26, [x0, #384] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #496] // ........................................................................*... + // gap // ............................................................................ + ldr q6, [x0, #512] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q22, [x0, #624] // .........................................................................*.. + // gap // ............................................................................ + ldr q22, [x0, #640] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ + mul v19.4S, v6.4S, v0.S[0] // .........e.................................................................. 
+ // gap // ............................................................................ + str q31, [x0, #880] // ...........................................................................* + // gap // ............................................................................ + ldr q20, [x0, #896] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q31, [x0, #768] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v12.4S, v20.4S, v0.S[0] // ........................e................................................... + // gap // ............................................................................ + + // ------------------------------------- new position --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------------- + // ldr q9, [x0, #0] // e................'..........................................................~............. + // ldr q10, [x0, #(1*(1024/8))] // ..e..............'............................................................~........... + // ldr q11, [x0, #(2*(1024/8))] // ....e............'..............................................................~......... + // ldr q12, [x0, #(3*(1024/8))] // ......e..........'................................................................~....... 
+ // ldr q13, [x0, #(4*(1024/8))] // ........e........'..................................................................~..... + // ldr q14, [x0, #(5*(1024/8))] // ..........e......'....................................................................~... + // ldr q15, [x0, #(6*(1024/8))] // ...............e.'........................................................................ + // ldr q16, [x0, #(7*(1024/8))] // ..............e..'........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .................*........................................................................ + // mul v24.4s, v13.4s, v0.s[0] // ............e....'......................................................................~. + // mls v24.4s, v27.4s, v8.s[0] // .................'...*.................................................................... + // sub v13.4s, v9.4s, v24.4s // .................'.......*................................................................ + // add v9.4s, v9.4s, v24.4s // .................'.........*.............................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .................'*....................................................................... + // mul v24.4s, v14.4s, v0.s[0] // .................'.*...................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'.....*.................................................................. + // sub v14.4s, v10.4s, v24.4s // .................'..........*............................................................. + // add v10.4s, v10.4s, v24.4s // .................'...........*............................................................ + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .................'..*..................................................................... 
+ // mul v24.4s, v15.4s, v0.s[0] // .................'....*................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'........*............................................................... + // sub v15.4s, v11.4s, v24.4s // .................'............*........................................................... + // add v11.4s, v11.4s, v24.4s // .................'.............*.......................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................'......*................................................................. + // mul v24.4s, v16.4s, v0.s[0] // ................e'........................................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................'..............*......................................................... + // sub v16.4s, v12.4s, v24.4s // .................'..................*..................................................... + // add v12.4s, v12.4s, v24.4s // .................'...................*.................................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................'.................*...................................................... + // mul v24.4s, v11.4s, v0.s[2] // .................'.....................*.................................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'..........................*............................................. + // sub v11.4s, v9.4s, v24.4s // .................'..............................*......................................... + // add v9.4s, v9.4s, v24.4s // .................'...............................*........................................ + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .................'...........................*............................................ 
+ // mul v24.4s, v12.4s, v0.s[2] // .................'.............................*.......................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..................................*..................................... + // sub v12.4s, v10.4s, v24.4s // .................'......................................*................................. + // add v10.4s, v10.4s, v24.4s // .................'.......................................*................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .................'...............*........................................................ + // mul v24.4s, v15.4s, v1.s[0] // .................'................*....................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'....................*................................................... + // sub v15.4s, v13.4s, v24.4s // .................'........................*............................................... + // add v13.4s, v13.4s, v24.4s // .................'.........................*.............................................. + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .................'......................*................................................. + // mul v24.4s, v16.4s, v1.s[0] // .................'.......................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................'............................*........................................... + // sub v16.4s, v14.4s, v24.4s // .................'................................*....................................... + // add v14.4s, v14.4s, v24.4s // .................'.................................*...................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................'..........................................*............................. 
+ // mul v24.4s, v10.4s, v1.s[2] // .................'...........................................*............................ + // mls v24.4s, v27.4s, v8.s[0] // .................'...............................................*........................ + // sub v10.4s, v9.4s, v24.4s // .................'.....................................................*.................. + // add v9.4s, v9.4s, v24.4s // .................'......................................................*................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .................'.........................................*.............................. + // mul v24.4s, v12.4s, v2.s[0] // .................'............................................*........................... + // mls v24.4s, v27.4s, v8.s[0] // .................'................................................*....................... + // sub v12.4s, v11.4s, v24.4s // .................'.......................................................*................ + // add v11.4s, v11.4s, v24.4s // .................'........................................................*............... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .................'....................................*................................... + // mul v24.4s, v14.4s, v2.s[2] // .................'.....................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'.............................................*.......................... + // sub v14.4s, v13.4s, v24.4s // .................'.................................................*...................... + // add v13.4s, v13.4s, v24.4s // .................'....................................................*................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................'...................................*.................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // .................'........................................*............................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..............................................*......................... + // sub v16.4s, v15.4s, v24.4s // .................'..................................................*..................... + // add v15.4s, v15.4s, v24.4s // .................'...................................................*.................... + // str q9, [x0], #(16) // .................'.........................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // .~...............'...........................................................*............ + // str q11, [x0, #(-16 + 2*(1024/8))] // ...~.............'.............................................................*.......... + // str q12, [x0, #(-16 + 3*(1024/8))] // .....~...........'...............................................................*........ + // str q13, [x0, #(-16 + 4*(1024/8))] // .......~.........'.................................................................*...... + // str q14, [x0, #(-16 + 5*(1024/8))] // .........~.......'...................................................................*.... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...........~.....'.....................................................................*.. + // str q16, [x0, #(-16 + 7*(1024/8))] // .............~...'.......................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v19.4S, v12.4S, v0.S[0] // .......*.......................................................... - // gap // .................................................................. - mul v23.4S, v27.4S, v0.S[0] // ...*.............................................................. - // gap // .................................................................. 
- mul v12.4S, v6.4S, v0.S[0] // .*................................................................ - // gap // .................................................................. - sqrdmulh v24.4S, v6.4S, v0.S[1] // ..*............................................................... - // gap // .................................................................. - mls v19.4S, v18.4S, v8.S[0] // ...............*.................................................. - // gap // .................................................................. - sqrdmulh v4.4S, v27.4S, v0.S[1] // .....*............................................................ - // gap // .................................................................. - mul v13.4S, v9.4S, v0.S[0] // *................................................................. - // gap // .................................................................. - mls v12.4S, v24.4S, v8.S[0] // ......*........................................................... - // gap // .................................................................. - add v6.4S, v29.4S, v19.4S // ....................*............................................. - // gap // .................................................................. - sub v9.4S, v29.4S, v19.4S // ...................*.............................................. - // gap // .................................................................. - mls v13.4S, v14.4S, v8.S[0] // ....*............................................................. - // gap // .................................................................. - mul v20.4S, v6.4S, v0.S[2] // ............................*..................................... - // gap // .................................................................. - sqrdmulh v22.4S, v9.4S, v1.S[1] // ........................*......................................... - // gap // .................................................................. 
- sub v24.4S, v21.4S, v12.4S // ...........*...................................................... - // gap // .................................................................. - mul v15.4S, v9.4S, v1.S[0] // .......................*.......................................... - // gap // .................................................................. - sqrdmulh v28.4S, v6.4S, v0.S[3] // ..............................*................................... - // gap // .................................................................. - add v11.4S, v31.4S, v13.4S // ..........*....................................................... - // gap // .................................................................. - mls v23.4S, v4.4S, v8.S[0] // .........*........................................................ - // gap // .................................................................. - mls v15.4S, v22.4S, v8.S[0] // .............................*.................................... - // gap // .................................................................. - mls v20.4S, v28.4S, v8.S[0] // ...................................*.............................. - // gap // .................................................................. - add v12.4S, v21.4S, v12.4S // ............*..................................................... - // gap // .................................................................. - add v18.4S, v17.4S, v23.4S // ..............*................................................... - // gap // .................................................................. - add v10.4S, v24.4S, v15.4S // ..................................*............................... - // gap // .................................................................. - add v4.4S, v12.4S, v20.4S // ........................................*......................... - // gap // .................................................................. 
- sub v29.4S, v12.4S, v20.4S // .......................................*.......................... - // gap // .................................................................. - mul v28.4S, v10.4S, v2.S[2] // .....................................*............................ - // gap // .................................................................. - mul v16.4S, v4.4S, v1.S[2] // ...........................................*...................... - // gap // .................................................................. - mul v22.4S, v29.4S, v2.S[0] // ..........................................*....................... - // gap // .................................................................. - mul v27.4S, v18.4S, v0.S[2] // ..................*............................................... - // gap // .................................................................. - sqrdmulh v5.4S, v18.4S, v0.S[3] // ......................*........................................... - // gap // .................................................................. - sub v20.4S, v17.4S, v23.4S // .............*.................................................... - // gap // .................................................................. - sqrdmulh v12.4S, v4.4S, v1.S[3] // ............................................*..................... - // gap // .................................................................. - sqrdmulh v25.4S, v29.4S, v2.S[1] // .............................................*.................... - // gap // .................................................................. - mls v27.4S, v5.4S, v8.S[0] // ...........................*...................................... - // gap // .................................................................. - mul v30.4S, v20.4S, v1.S[0] // ................*................................................. - // gap // .................................................................. 
- mls v16.4S, v12.4S, v8.S[0] // ................................................*................. - // gap // .................................................................. - mls v22.4S, v25.4S, v8.S[0] // .................................................*................ - // gap // .................................................................. - add v9.4S, v11.4S, v27.4S // ................................*................................. - // gap // .................................................................. - sub v27.4S, v11.4S, v27.4S // ...............................*.................................. - // gap // .................................................................. - sub v26.4S, v24.4S, v15.4S // .................................*................................ - // gap // .................................................................. - sub v5.4S, v9.4S, v16.4S // ......................................................*........... - // gap // .................................................................. - add v12.4S, v27.4S, v22.4S // .........................................................*........ - // gap // .................................................................. - sqrdmulh v0.4S, v20.4S, v1.S[1] // .................*................................................ - // gap // .................................................................. - mul v15.4S, v26.4S, v3.S[0] // ....................................*............................. - // gap // .................................................................. - str_vo v12, x0, 256 // ............................................................*..... - // gap // .................................................................. - sqrdmulh v26.4S, v26.4S, v3.S[1] // .........................................*........................ - // gap // .................................................................. 
- str_vo v5, x0, 128 // ...........................................................*...... - // gap // .................................................................. - mls v30.4S, v0.4S, v8.S[0] // .....................*............................................ - // gap // .................................................................. - sub v17.4S, v31.4S, v13.4S // ........*......................................................... - // gap // .................................................................. - add v4.4S, v9.4S, v16.4S // .......................................................*.......... - // gap // .................................................................. - mls v15.4S, v26.4S, v8.S[0] // ...............................................*.................. - // gap // .................................................................. - sub v23.4S, v17.4S, v30.4S // .........................*........................................ - // gap // .................................................................. - str_vi v4, x0, 16 // ..........................................................*....... - // gap // .................................................................. - sqrdmulh v6.4S, v10.4S, v2.S[3] // ......................................*........................... - // gap // .................................................................. - sub v21.4S, v23.4S, v15.4S // ...................................................*.............. - // gap // .................................................................. - add v7.4S, v23.4S, v15.4S // ....................................................*............. - // gap // .................................................................. - add v20.4S, v17.4S, v30.4S // ..........................*....................................... - // gap // .................................................................. 
- mls v28.4S, v6.4S, v8.S[0] // ..............................................*................... - // gap // .................................................................. - str_vo v7, x0, 752 // ................................................................*. - // gap // .................................................................. - sub v1.4S, v27.4S, v22.4S // ........................................................*......... - // gap // .................................................................. - str_vo v21, x0, 880 // .................................................................* - // gap // .................................................................. - add v19.4S, v20.4S, v28.4S // .....................................................*............ - // gap // .................................................................. - str_vo v1, x0, 368 // .............................................................*.... - // gap // .................................................................. - sub v14.4S, v20.4S, v28.4S // ..................................................*............... - // gap // .................................................................. - str_vo v19, x0, 496 // ..............................................................*... - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str_vo v14, x0, 624 // ...............................................................*.. - // gap // .................................................................. - - // original source code - // mul v9.4S, v9.4S, v0.S[0] // ......*........................................................... || ......*............................................................ 
- // mul v30.4S, v6.4S, v0.S[0] // ..*............................................................... || ..*................................................................ - // sqrdmulh v20.4S, v6.4S, v0.S[1] // ...*.............................................................. || ...*............................................................... - // mul v25.4S, v27.4S, v0.S[0] // .*................................................................ || .*................................................................. - // mls v9.4S, v14.4S, v8.S[0] // ..........*....................................................... || ..........*........................................................ - // sqrdmulh v6.4S, v27.4S, v0.S[1] // .....*............................................................ || .....*............................................................. - // mls v30.4S, v20.4S, v8.S[0] // .......*.......................................................... || .......*........................................................... - // mul v20.4S, v12.4S, v0.S[0] // *................................................................. || *.................................................................. - // sub v14.4S, v31.4S, v9.4S // ................................................*................. || ................................................*.................. - // mls v25.4S, v6.4S, v8.S[0] // .................*................................................ || .................*................................................. - // add v9.4S, v31.4S, v9.4S // ................*................................................. || ................*.................................................. - // sub v6.4S, v21.4S, v30.4S // .............*.................................................... || .............*..................................................... 
- // add v30.4S, v21.4S, v30.4S // ....................*............................................. || ....................*.............................................. - // sub v12.4S, v17.4S, v25.4S // ..............................*................................... || ..............................*.................................... - // add v25.4S, v17.4S, v25.4S // .....................*............................................ || .....................*............................................. - // mls v20.4S, v18.4S, v8.S[0] // ....*............................................................. || ....*.............................................................. - // mul v17.4S, v12.4S, v1.S[0] // ..................................*............................... || ..................................*................................ - // sqrdmulh v12.4S, v12.4S, v1.S[1] // ..........................................*....................... || ..........................................*........................ - // mul v31.4S, v25.4S, v0.S[2] // ............................*..................................... || ............................*...................................... - // sub v21.4S, v29.4S, v20.4S // .........*........................................................ || .........*......................................................... - // add v20.4S, v29.4S, v20.4S // ........*......................................................... || ........*.......................................................... - // mls v17.4S, v12.4S, v8.S[0] // ...............................................*.................. || ...............................................*................... - // sqrdmulh v25.4S, v25.4S, v0.S[3] // .............................*.................................... || .............................*..................................... 
- // mul v12.4S, v21.4S, v1.S[0] // ..............*................................................... || ..............*.................................................... - // sqrdmulh v21.4S, v21.4S, v1.S[1] // ............*..................................................... || ............*...................................................... - // sub v29.4S, v14.4S, v17.4S // ...................................................*.............. || ...................................................*............... - // add v14.4S, v14.4S, v17.4S // ........................................................*......... || ........................................................*.......... - // mls v31.4S, v25.4S, v8.S[0] // .................................*................................ || .................................*................................. - // mul v25.4S, v20.4S, v0.S[2] // ...........*...................................................... || ...........*....................................................... - // mls v12.4S, v21.4S, v8.S[0] // ..................*............................................... || ..................*................................................ - // sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............*.................................................. || ...............*................................................... - // sub v17.4S, v9.4S, v31.4S // ......................................*........................... || ......................................*............................ - // add v9.4S, v9.4S, v31.4S // .....................................*............................ || .....................................*............................. - // sub v31.4S, v6.4S, v12.4S // .......................................*.......................... || .......................................*........................... 
- // add v6.4S, v6.4S, v12.4S // ......................*........................................... || ......................*............................................ - // mls v25.4S, v20.4S, v8.S[0] // ...................*.............................................. || ...................*............................................... - // mul v20.4S, v31.4S, v3.S[0] // ...........................................*...................... || ...........................................*....................... - // mul v12.4S, v6.4S, v2.S[2] // .........................*........................................ || .........................*......................................... - // sqrdmulh v6.4S, v6.4S, v2.S[3] // .....................................................*............ || .....................................................*............. - // sub v21.4S, v30.4S, v25.4S // ........................*......................................... || ........................*.......................................... - // add v30.4S, v30.4S, v25.4S // .......................*.......................................... || .......................*........................................... - // sqrdmulh v25.4S, v31.4S, v3.S[1] // .............................................*.................... || .............................................*..................... - // mul v31.4S, v21.4S, v2.S[0] // ...........................*...................................... || ...........................*....................................... - // mul v27.4S, v30.4S, v1.S[2] // ..........................*....................................... || ..........................*........................................ - // sqrdmulh v30.4S, v30.4S, v1.S[3] // ...............................*.................................. || ...............................*................................... 
- // sqrdmulh v21.4S, v21.4S, v2.S[1] // ................................*................................. || ................................*.................................. - // mls v12.4S, v6.4S, v8.S[0] // .........................................................*........ || .........................................................*......... - // mls v20.4S, v25.4S, v8.S[0] // ..................................................*............... || ..................................................*................ - // mls v27.4S, v30.4S, v8.S[0] // ...................................*.............................. || ...................................*............................... - // mls v31.4S, v21.4S, v8.S[0] // ....................................*............................. || ....................................*.............................. - // sub v30.4S, v14.4S, v12.4S // ...............................................................*.. || ...............................................................*... - // sub v25.4S, v29.4S, v20.4S // ......................................................*........... || ......................................................*............ - // add v20.4S, v29.4S, v20.4S // .......................................................*.......... || .......................................................*........... - // add v6.4S, v14.4S, v12.4S // .............................................................*.... || .............................................................*..... - // sub v14.4S, v9.4S, v27.4S // ........................................*......................... || ........................................*.......................... - // add v9.4S, v9.4S, v27.4S // .................................................*................ || .................................................*................. 
- // sub v12.4S, v17.4S, v31.4S // ...........................................................*...... || ...........................................................*....... - // add v17.4S, v17.4S, v31.4S // .........................................*........................ || .........................................*......................... - // str_vi v9, x0, 16 // ....................................................*............. || ....................................................*.............. - // str_vo v14, x0, 112 // ..............................................*................... || ..............................................*.................... - // str_vo v17, x0, 240 // ............................................*..................... || ............................................*...................... - // str_vo v12, x0, 368 // ..............................................................*... || ..............................................................*.... - // str_vo v6, x0, 496 // ................................................................*. || ................................................................*.. - // str_vo v30, x0, 624 // .................................................................* || ..................................................................* - // str_vo v20, x0, 752 // ..........................................................*....... || ..........................................................*........ - // str_vo v25, x0, 880 // ............................................................*..... || ............................................................*...... 
- + // Instructions: 66 + // Expected cycles: 67 + // Expected IPC: 0.99 + // + // Wall time: 6.68s + // User time: 6.68s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + sqrdmulh v23.4S, v20.4S, v0.S[1] // .......*.......................................................... + // gap // .................................................................. + mul v24.4S, v31.4S, v0.S[0] // .....*............................................................ + // gap // .................................................................. + sqrdmulh v31.4S, v31.4S, v0.S[1] // ...*.............................................................. + // gap // .................................................................. + sqrdmulh v18.4S, v22.4S, v0.S[1] // .*................................................................ + // gap // .................................................................. + mls v12.4S, v23.4S, v8.S[0] // ...............*.................................................. + // gap // .................................................................. + mul v22.4S, v22.4S, v0.S[0] // ..*............................................................... + // gap // .................................................................. + mls v24.4S, v31.4S, v8.S[0] // .........*........................................................ + // gap // .................................................................. + sqrdmulh v21.4S, v6.4S, v0.S[1] // *................................................................. + // gap // .................................................................. + add v16.4S, v26.4S, v12.4S // ....................*............................................. + // gap // .................................................................. 
+ sub v11.4S, v26.4S, v12.4S // ...................*.............................................. + // gap // .................................................................. + mls v22.4S, v18.4S, v8.S[0] // ......*........................................................... + // gap // .................................................................. + mul v27.4S, v16.4S, v0.S[2] // ..............................*................................... + // gap // .................................................................. + sqrdmulh v31.4S, v16.4S, v0.S[3] // ............................*..................................... + // gap // .................................................................. + mul v20.4S, v11.4S, v1.S[0] // ........................*......................................... + // gap // .................................................................. + add v7.4S, v4.4S, v24.4S // ..............*................................................... + // gap // .................................................................. + add v23.4S, v15.4S, v22.4S // ............*..................................................... + // gap // .................................................................. + mls v27.4S, v31.4S, v8.S[0] // ...................................*.............................. + // gap // .................................................................. + sqrdmulh v31.4S, v7.4S, v0.S[3] // ..................*............................................... + // gap // .................................................................. + mul v6.4S, v7.4S, v0.S[2] // ......................*........................................... + // gap // .................................................................. + mls v19.4S, v21.4S, v8.S[0] // ....*............................................................. + // gap // .................................................................. 
+ sub v18.4S, v23.4S, v27.4S // .......................................*.......................... + // gap // .................................................................. + add v21.4S, v23.4S, v27.4S // ........................................*......................... + // gap // .................................................................. + mls v6.4S, v31.4S, v8.S[0] // ...........................*...................................... + // gap // .................................................................. + sqrdmulh v17.4S, v18.4S, v2.S[1] // ..........................................*....................... + // gap // .................................................................. + mul v30.4S, v18.4S, v2.S[0] // .............................................*.................... + // gap // .................................................................. + sqrdmulh v31.4S, v21.4S, v1.S[3] // ...........................................*...................... + // gap // .................................................................. + add v23.4S, v9.4S, v19.4S // ..........*....................................................... + // gap // .................................................................. + mul v16.4S, v21.4S, v1.S[2] // ............................................*..................... + // gap // .................................................................. + mls v30.4S, v17.4S, v8.S[0] // .................................................*................ + // gap // .................................................................. + sub v29.4S, v23.4S, v6.4S // ...............................*.................................. + // gap // .................................................................. + add v25.4S, v23.4S, v6.4S // ................................*................................. + // gap // .................................................................. 
+ mls v16.4S, v31.4S, v8.S[0] // ................................................*................. + // gap // .................................................................. + sub v23.4S, v29.4S, v30.4S // ........................................................*......... + // gap // .................................................................. + sqrdmulh v7.4S, v11.4S, v1.S[1] // .......................*.......................................... + // gap // .................................................................. + sub v17.4S, v15.4S, v22.4S // ...........*...................................................... + // gap // .................................................................. + add v18.4S, v25.4S, v16.4S // .......................................................*.......... + // gap // .................................................................. + str q23, [x0, #384] // .............................................................*.... + // gap // .................................................................. + mls v20.4S, v7.4S, v8.S[0] // .............................*.................................... + // gap // .................................................................. + str q18, [x0], #(16) // ..........................................................*....... + // gap // .................................................................. + sub v6.4S, v4.4S, v24.4S // .............*.................................................... + // gap // .................................................................. + sub v22.4S, v25.4S, v16.4S // ......................................................*........... + // gap // .................................................................. + sub v27.4S, v17.4S, v20.4S // .................................*................................ + // gap // .................................................................. 
+ mul v26.4S, v6.4S, v1.S[0] // .................*................................................ + // gap // .................................................................. + add v15.4S, v17.4S, v20.4S // ..................................*............................... + // gap // .................................................................. + sqrdmulh v31.4S, v6.4S, v1.S[1] // ................*................................................. + // gap // .................................................................. + mul v6.4S, v27.4S, v3.S[0] // .........................................*........................ + // gap // .................................................................. + mul v18.4S, v15.4S, v2.S[2] // ......................................*........................... + // gap // .................................................................. + sqrdmulh v12.4S, v15.4S, v2.S[3] // .....................................*............................ + // gap // .................................................................. + sqrdmulh v16.4S, v27.4S, v3.S[1] // ....................................*............................. + // gap // .................................................................. + mls v26.4S, v31.4S, v8.S[0] // .....................*............................................ + // gap // .................................................................. + sub v20.4S, v9.4S, v19.4S // ........*......................................................... + // gap // .................................................................. + add v27.4S, v29.4S, v30.4S // .........................................................*........ + // gap // .................................................................. + mls v6.4S, v16.4S, v8.S[0] // ...............................................*.................. + // gap // .................................................................. 
+ sub v19.4S, v20.4S, v26.4S // .........................*........................................ + // gap // .................................................................. + str q27, [x0, #240] // ............................................................*..... + // gap // .................................................................. + mls v18.4S, v12.4S, v8.S[0] // ..............................................*................... + // gap // .................................................................. + add v17.4S, v20.4S, v26.4S // ..........................*....................................... + // gap // .................................................................. + add v16.4S, v19.4S, v6.4S // ....................................................*............. + // gap // .................................................................. + str q22, [x0, #112] // ...........................................................*...... + // gap // .................................................................. + add v20.4S, v17.4S, v18.4S // .....................................................*............ + // gap // .................................................................. + str q16, [x0, #752] // ................................................................*. + // gap // .................................................................. + sub v6.4S, v19.4S, v6.4S // ...................................................*.............. + // gap // .................................................................. + str q20, [x0, #496] // ..............................................................*... + // gap // .................................................................. + sub v18.4S, v17.4S, v18.4S // ..................................................*............... + // gap // .................................................................. 
+ str q6, [x0, #880] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q18, [x0, #624] // ...............................................................*.. + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // sqrdmulh v6.4S, v6.4S, v0.S[1] // .......*.......................................................... + // sqrdmulh v18.4S, v22.4S, v0.S[1] // ...*.............................................................. + // mul v16.4S, v22.4S, v0.S[0] // .....*............................................................ + // sqrdmulh v22.4S, v31.4S, v0.S[1] // ..*............................................................... + // mls v19.4S, v6.4S, v8.S[0] // ...................*.............................................. + // mul v6.4S, v31.4S, v0.S[0] // .*................................................................ + // mls v16.4S, v18.4S, v8.S[0] // ..........*....................................................... + // sqrdmulh v18.4S, v20.4S, v0.S[1] // *................................................................. + // sub v20.4S, v9.4S, v19.4S // ..................................................*............... + // mls v6.4S, v22.4S, v8.S[0] // ......*........................................................... + // add v22.4S, v9.4S, v19.4S // ..........................*....................................... + // sub v19.4S, v15.4S, v16.4S // ..................................*............................... 
+ // add v16.4S, v15.4S, v16.4S // ...............*.................................................. + // sub v31.4S, v4.4S, v6.4S // .......................................*.......................... + // add v6.4S, v4.4S, v6.4S // ..............*................................................... + // mls v12.4S, v18.4S, v8.S[0] // ....*............................................................. + // sqrdmulh v18.4S, v31.4S, v1.S[1] // ............................................*..................... + // mul v31.4S, v31.4S, v1.S[0] // ..........................................*....................... + // sqrdmulh v9.4S, v6.4S, v0.S[3] // .................*................................................ + // sub v15.4S, v26.4S, v12.4S // .........*........................................................ + // add v12.4S, v26.4S, v12.4S // ........*......................................................... + // mls v31.4S, v18.4S, v8.S[0] // .................................................*................ + // mul v6.4S, v6.4S, v0.S[2] // ..................*............................................... + // sqrdmulh v18.4S, v15.4S, v1.S[1] // .................................*................................ + // mul v15.4S, v15.4S, v1.S[0] // .............*.................................................... + // sub v4.4S, v20.4S, v31.4S // .....................................................*............ + // add v20.4S, v20.4S, v31.4S // ........................................................*......... + // mls v6.4S, v9.4S, v8.S[0] // ......................*........................................... + // sqrdmulh v31.4S, v12.4S, v0.S[3] // ............*..................................................... + // mls v15.4S, v18.4S, v8.S[0] // .....................................*............................ + // mul v18.4S, v12.4S, v0.S[2] // ...........*...................................................... 
+ // sub v12.4S, v22.4S, v6.4S // .............................*.................................... + // add v6.4S, v22.4S, v6.4S // ..............................*................................... + // sub v22.4S, v19.4S, v15.4S // .........................................*........................ + // add v19.4S, v19.4S, v15.4S // ...........................................*...................... + // mls v18.4S, v31.4S, v8.S[0] // ................*................................................. + // sqrdmulh v31.4S, v22.4S, v3.S[1] // ................................................*................. + // sqrdmulh v9.4S, v19.4S, v2.S[3] // ...............................................*.................. + // mul v19.4S, v19.4S, v2.S[2] // ..............................................*................... + // sub v15.4S, v16.4S, v18.4S // ....................*............................................. + // add v18.4S, v16.4S, v18.4S // .....................*............................................ + // mul v16.4S, v22.4S, v3.S[0] // .............................................*.................... + // sqrdmulh v22.4S, v15.4S, v2.S[1] // .......................*.......................................... + // sqrdmulh v26.4S, v18.4S, v1.S[3] // .........................*........................................ + // mul v18.4S, v18.4S, v1.S[2] // ...........................*...................................... + // mul v15.4S, v15.4S, v2.S[0] // ........................*......................................... + // mls v19.4S, v9.4S, v8.S[0] // .......................................................*.......... + // mls v16.4S, v31.4S, v8.S[0] // ....................................................*............. + // mls v18.4S, v26.4S, v8.S[0] // ...............................*.................................. + // mls v15.4S, v22.4S, v8.S[0] // ............................*..................................... 
+ // sub v22.4S, v20.4S, v19.4S // ...............................................................*.. + // sub v31.4S, v4.4S, v16.4S // .............................................................*.... + // add v16.4S, v4.4S, v16.4S // .........................................................*........ + // add v19.4S, v20.4S, v19.4S // ...........................................................*...... + // sub v20.4S, v6.4S, v18.4S // ........................................*......................... + // add v6.4S, v6.4S, v18.4S // ...................................*.............................. + // sub v18.4S, v12.4S, v15.4S // ................................*................................. + // add v12.4S, v12.4S, v15.4S // ...................................................*.............. + // str q6, [x0], #(16) // ......................................*........................... + // str q20, [x0, #112] // ..........................................................*....... + // str q12, [x0, #240] // ......................................................*........... + // str q18, [x0, #368] // ....................................*............................. + // str q19, [x0, #496] // ..............................................................*... + // str q22, [x0, #624] // .................................................................* + // str q16, [x0, #752] // ............................................................*..... + // str q31, [x0, #880] // ................................................................*. + restore inp, STACK0 add inpp, inp, #64 @@ -895,1000 +902,1035 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - ldr_vi v6, x4, 64 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v31, x4, -48 // .*.......... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v13, x4, -32 // ..*......... - // gap // ............ - // gap // ............ 
- // gap // ............ - ldr_vo v26, x4, -16 // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vi v4, x5, 192 // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v15, x5, -176 // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v19, x5, -160 // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v10, x5, -144 // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v27, x5, -128 // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v17, x5, -112 // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v18, x5, -96 // ..........*. - // gap // ............ - // gap // ............ - // gap // ............ - ldr_vo v7, x5, -80 // ...........* - // gap // ............ - - // original source code - // ldr_vi v6, x4, 64 // *........... || *...................... - // ldr_vo v31, x4, -48 // .*.......... || ..*.................... - // ldr_vo v13, x4, -32 // ..*......... || ....*.................. - // ldr_vo v26, x4, -16 // ...*........ || ......*................ - // ldr_vi v4, x5, 192 // ....*....... || ........*.............. - // ldr_vo v15, x5, -176 // .....*...... || ..........*............ - // ldr_vo v19, x5, -160 // ......*..... || ............*.......... - // ldr_vo v10, x5, -144 // .......*.... || ..............*........ - // ldr_vo v27, x5, -128 // ........*... || ................*...... - // ldr_vo v17, x5, -112 // .........*.. || ..................*.... - // ldr_vo v18, x5, -96 // ..........*. || ....................*.. 
- // ldr_vo v7, x5, -80 // ...........* || ......................* - + // Instructions: 19 + // Expected cycles: 37 + // Expected IPC: 0.51 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q17, [x5, #176] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q31, [x1, #64] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x1, #112] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q3, [x1, #80] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x1, #96] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x2, #64] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x2, #80] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q9, [x2, #96] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x2, #112] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q21, [x4], #64 // ........*..................... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x4, #-48] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q2, [x5, #16] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q30, [x5, #32] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q14, [x5, #48] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x5, #64] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q5, [x5, #80] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q23, [x5, #96] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q0, [x5, #160] // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q1, [x5, #112] // ................*............. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q31, [x1, #64] // .*............................. + // ldr q3, [x1, #80] // ...*........................... + // ldr q12, [x1, #96] // ....*.......................... 
+ // ldr q15, [x1, #112] // ..*............................ + // ldr q16, [x2, #64] // .....*......................... + // ldr q22, [x2, #80] // ......*........................ + // ldr q9, [x2, #96] // .......*....................... + // ldr q4, [x2, #112] // ........*...................... + // ldr q21, [x4], #64 // .........*..................... + // ldr q7, [x4, #-48] // ..........*.................... + // ldr q2, [x5, #16] // ...........*................... + // ldr q30, [x5, #32] // ............*.................. + // ldr q14, [x5, #48] // .............*................. + // ldr q11, [x5, #64] // ..............*................ + // ldr q5, [x5, #80] // ...............*............... + // ldr q23, [x5, #96] // ................*.............. + // ldr q1, [x5, #112] // ..................*............ + // ldr q0, [x5, #160] // .................*............. + // ldr q17, [x5, #176] // *.............................. + sub count, count, #1 -.p2align 2 layer45678_start: - ldr_vo v29, x2, 64 // ....*........................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v12, x2, 80 // .....*.......................................................................................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v1, x2, 96 // ......*......................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v2, x2, 112 // .......*........................................................................................................................................ - add x2, x2, #64 // .........*...................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v28, x1, 64 // *............................................................................................................................................... - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v30, x1, 80 // .*.............................................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v14, x1, 96 // ..*............................................................................................................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v5, x1, 112 // ...*............................................................................................................................................ - add x1, x1, #64 // ........*....................................................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v11.4S, v29.4S, v6.S[0] // ..............*................................................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v29.4S, v29.4S, v6.S[1] // ...............*................................................................................................................................ - // gap // ................................................................................................................................................ - mul v3.4S, v12.4S, v6.S[0] // ...................*............................................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v12.4S, v12.4S, v6.S[1] // ....................*........................................................................................................................... - // gap // ................................................................................................................................................ - mul v22.4S, v1.4S, v6.S[0] // ........................*....................................................................................................................... - // gap // ................................................................................................................................................ 
- sqrdmulh v1.4S, v1.4S, v6.S[1] // .........................*...................................................................................................................... - // gap // ................................................................................................................................................ - mul v20.4S, v2.4S, v6.S[0] // .............................*.................................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v2.4S, v2.4S, v6.S[1] // ..............................*................................................................................................................. - // gap // ................................................................................................................................................ - mls v11.4S, v29.4S, v8.S[0] // ................*............................................................................................................................... - // gap // ................................................................................................................................................ - mls v3.4S, v12.4S, v8.S[0] // .....................*.......................................................................................................................... - // gap // ................................................................................................................................................ - mls v22.4S, v1.4S, v8.S[0] // ..........................*..................................................................................................................... 
- // gap // ................................................................................................................................................ - mls v20.4S, v2.4S, v8.S[0] // ...............................*................................................................................................................ - // gap // ................................................................................................................................................ - sub v29.4S, v28.4S, v11.4S // .................*.............................................................................................................................. - // gap // ................................................................................................................................................ - add v12.4S, v28.4S, v11.4S // ..................*............................................................................................................................. - // gap // ................................................................................................................................................ - sub v1.4S, v30.4S, v3.4S // ......................*......................................................................................................................... - // gap // ................................................................................................................................................ - add v2.4S, v30.4S, v3.4S // .......................*........................................................................................................................ - // gap // ................................................................................................................................................ - sub v28.4S, v14.4S, v22.4S // ...........................*.................................................................................................................... 
- // gap // ................................................................................................................................................ - add v30.4S, v14.4S, v22.4S // ............................*................................................................................................................... - // gap // ................................................................................................................................................ - sub v14.4S, v5.4S, v20.4S // ................................*............................................................................................................... - // gap // ................................................................................................................................................ - add v5.4S, v5.4S, v20.4S // .................................*.............................................................................................................. - // gap // ................................................................................................................................................ - mul v11.4S, v30.4S, v6.S[2] // ..................................*............................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v30.4S, v30.4S, v6.S[3] // ...................................*............................................................................................................ - // gap // ................................................................................................................................................ 
- mul v3.4S, v5.4S, v6.S[2] // .......................................*........................................................................................................ - // gap // ................................................................................................................................................ - sqrdmulh v5.4S, v5.4S, v6.S[3] // ........................................*....................................................................................................... - // gap // ................................................................................................................................................ - mul v22.4S, v28.4S, v31.S[0] // ............................................*................................................................................................... - // gap // ................................................................................................................................................ - mls v11.4S, v30.4S, v8.S[0] // ....................................*........................................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v28.4S, v31.S[1] // .............................................*.................................................................................................. - // gap // ................................................................................................................................................ - mls v3.4S, v5.4S, v8.S[0] // .........................................*...................................................................................................... 
- // gap // ................................................................................................................................................ - mul v30.4S, v14.4S, v31.S[0] // .................................................*.............................................................................................. - // gap // ................................................................................................................................................ - sub v5.4S, v12.4S, v11.4S // .....................................*.......................................................................................................... - // gap // ................................................................................................................................................ - add v12.4S, v12.4S, v11.4S // ......................................*......................................................................................................... - // gap // ................................................................................................................................................ - sub v11.4S, v2.4S, v3.4S // ..........................................*..................................................................................................... - // gap // ................................................................................................................................................ - add v2.4S, v2.4S, v3.4S // ...........................................*.................................................................................................... - // gap // ................................................................................................................................................ 
- mls v22.4S, v28.4S, v8.S[0] // ..............................................*................................................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v14.4S, v31.S[1] // ..................................................*............................................................................................. - // gap // ................................................................................................................................................ - mul v14.4S, v2.4S, v31.S[2] // ......................................................*......................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v2.4S, v2.4S, v31.S[3] // .......................................................*........................................................................................ - // gap // ................................................................................................................................................ - sub v3.4S, v29.4S, v22.4S // ...............................................*................................................................................................ - // gap // ................................................................................................................................................ - add v29.4S, v29.4S, v22.4S // ................................................*............................................................................................... 
- // gap // ................................................................................................................................................ - mls v30.4S, v28.4S, v8.S[0] // ...................................................*............................................................................................ - // gap // ................................................................................................................................................ - mls v14.4S, v2.4S, v8.S[0] // ........................................................*....................................................................................... - // gap // ................................................................................................................................................ - mul v2.4S, v11.4S, v13.S[0] // ...........................................................*.................................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v11.4S, v13.S[1] // ............................................................*................................................................................... - // gap // ................................................................................................................................................ - sub v11.4S, v1.4S, v30.4S // ....................................................*........................................................................................... - // gap // ................................................................................................................................................ 
- add v1.4S, v1.4S, v30.4S // .....................................................*.......................................................................................... - // gap // ................................................................................................................................................ - sub v30.4S, v12.4S, v14.4S // .........................................................*...................................................................................... - // gap // ................................................................................................................................................ - add v12.4S, v12.4S, v14.4S // ..........................................................*..................................................................................... - // gap // ................................................................................................................................................ - mls v2.4S, v28.4S, v8.S[0] // .............................................................*.................................................................................. - // gap // ................................................................................................................................................ - mul v28.4S, v1.4S, v13.S[2] // ................................................................*............................................................................... - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v1.4S, v13.S[3] // .................................................................*.............................................................................. 
- // gap // ................................................................................................................................................ - mul v14.4S, v11.4S, v26.S[0] // .....................................................................*.......................................................................... - // gap // ................................................................................................................................................ - sub v22.4S, v5.4S, v2.4S // ..............................................................*................................................................................. - // gap // ................................................................................................................................................ - add v2.4S, v5.4S, v2.4S // ...............................................................*................................................................................ - // gap // ................................................................................................................................................ - mls v28.4S, v1.4S, v8.S[0] // ..................................................................*............................................................................. - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v11.4S, v26.S[1] // ......................................................................*......................................................................... - // gap // ................................................................................................................................................ 
- trn1 v5.4S, v12.4S, v30.4S // ..........................................................................*..................................................................... - // gap // ................................................................................................................................................ - trn2 v12.4S, v12.4S, v30.4S // ...........................................................................*.................................................................... - // gap // ................................................................................................................................................ - sub v30.4S, v29.4S, v28.4S // ...................................................................*............................................................................ - // gap // ................................................................................................................................................ - add v29.4S, v29.4S, v28.4S // ....................................................................*........................................................................... - // gap // ................................................................................................................................................ - mls v14.4S, v1.4S, v8.S[0] // .......................................................................*........................................................................ - // gap // ................................................................................................................................................ - trn1 v1.4S, v2.4S, v22.4S // ............................................................................*................................................................... 
- // gap // ................................................................................................................................................ - trn2 v2.4S, v2.4S, v22.4S // .............................................................................*.................................................................. - // gap // ................................................................................................................................................ - trn1 v28.4S, v29.4S, v30.4S // ..................................................................................*............................................................. - // gap // ................................................................................................................................................ - sub v11.4S, v3.4S, v14.4S // ........................................................................*....................................................................... - // gap // ................................................................................................................................................ - add v14.4S, v3.4S, v14.4S // .........................................................................*...................................................................... - // gap // ................................................................................................................................................ - trn2 v3.2D, v5.2D, v1.2D // ..............................................................................*................................................................. - // gap // ................................................................................................................................................ - trn2 v22.2D, v12.2D, v2.2D // ...............................................................................*................................................................ 
- // gap // ................................................................................................................................................ - trn1 v1.2D, v5.2D, v1.2D // ................................................................................*............................................................... - // gap // ................................................................................................................................................ - trn1 v12.2D, v12.2D, v2.2D // .................................................................................*.............................................................. - // gap // ................................................................................................................................................ - trn2 v29.4S, v29.4S, v30.4S // ...................................................................................*............................................................ - // gap // ................................................................................................................................................ - trn1 v2.4S, v14.4S, v11.4S // ....................................................................................*........................................................... - // gap // ................................................................................................................................................ - trn2 v30.4S, v14.4S, v11.4S // .....................................................................................*.......................................................... - // gap // ................................................................................................................................................ 
- mul v14.4S, v3.4S, v4.4S // ................................................................................................*............................................... - // gap // ................................................................................................................................................ - trn2 v5.2D, v28.2D, v2.2D // ......................................................................................*......................................................... - // gap // ................................................................................................................................................ - trn2 v11.2D, v29.2D, v30.2D // .......................................................................................*........................................................ - // gap // ................................................................................................................................................ - trn1 v2.2D, v28.2D, v2.2D // ........................................................................................*....................................................... - // gap // ................................................................................................................................................ - trn1 v29.2D, v29.2D, v30.2D // .........................................................................................*...................................................... - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v3.4S, v15.4S // .................................................................................................*.............................................. 
- // gap // ................................................................................................................................................ - mul v30.4S, v22.4S, v4.4S // .....................................................................................................*.......................................... - // gap // ................................................................................................................................................ - sqrdmulh v3.4S, v22.4S, v15.4S // ......................................................................................................*......................................... - // gap // ................................................................................................................................................ - mul v22.4S, v5.4S, v18.4S // ..........................................................................................................................*..................... - // gap // ................................................................................................................................................ - mls v14.4S, v28.4S, v8.S[0] // ..................................................................................................*............................................. - // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v5.4S, v7.4S // ...........................................................................................................................*.................... - // gap // ................................................................................................................................................ 
- mls v30.4S, v3.4S, v8.S[0] // .......................................................................................................*........................................ - // gap // ................................................................................................................................................ - mul v5.4S, v11.4S, v18.4S // ...............................................................................................................................*................ - // gap // ................................................................................................................................................ - sub v3.4S, v1.4S, v14.4S // ...................................................................................................*............................................ - // gap // ................................................................................................................................................ - add v1.4S, v1.4S, v14.4S // ....................................................................................................*........................................... - // gap // ................................................................................................................................................ - sub v14.4S, v12.4S, v30.4S // ........................................................................................................*....................................... - // gap // ................................................................................................................................................ - add v12.4S, v12.4S, v30.4S // .........................................................................................................*...................................... - // gap // ................................................................................................................................................ 
- mls v22.4S, v28.4S, v8.S[0] // ............................................................................................................................*................... - // gap // ................................................................................................................................................ - mul v28.4S, v14.4S, v27.4S // ...............................................................................................................*................................ - // gap // ................................................................................................................................................ - mul v30.4S, v12.4S, v19.4S // ..........................................................................................................*..................................... - // gap // ................................................................................................................................................ - sqrdmulh v12.4S, v12.4S, v10.4S // ...........................................................................................................*.................................... - // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v14.4S, v17.4S // ................................................................................................................*............................... - // gap // ................................................................................................................................................ - sub v20.4S, v2.4S, v22.4S // .............................................................................................................................*.................. 
- // gap // ................................................................................................................................................ - add v2.4S, v2.4S, v22.4S // ..............................................................................................................................*................. - // gap // ................................................................................................................................................ - mls v30.4S, v12.4S, v8.S[0] // ............................................................................................................*................................... - // gap // ................................................................................................................................................ - mls v28.4S, v14.4S, v8.S[0] // .................................................................................................................*.............................. - // gap // ................................................................................................................................................ - sqrdmulh v12.4S, v11.4S, v7.4S // ................................................................................................................................*............... - // gap // ................................................................................................................................................ - ldr_vo v14, x5, -64 // ......................................................................................................................*......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - sub v22.4S, v1.4S, v30.4S // .............................................................................................................*.................................. - // gap // ................................................................................................................................................ - add v21.4S, v1.4S, v30.4S // ..............................................................................................................*................................. - // gap // ................................................................................................................................................ - sub v24.4S, v3.4S, v28.4S // ..................................................................................................................*............................. - // gap // ................................................................................................................................................ - add v23.4S, v3.4S, v28.4S // ...................................................................................................................*............................ - // gap // ................................................................................................................................................ - mls v5.4S, v12.4S, v8.S[0] // .................................................................................................................................*.............. - // gap // ................................................................................................................................................ - ldr_vo v12, x5, -48 // .......................................................................................................................*........................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v1, x5, -32 // ........................................................................................................................*....................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v28.4S, v29.4S, v5.4S // ..................................................................................................................................*............. - // gap // ................................................................................................................................................ - add v29.4S, v29.4S, v5.4S // ...................................................................................................................................*............ - // gap // ................................................................................................................................................ - ldr_vo v30, x5, -16 // .........................................................................................................................*...................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v14.4S, v29.4S, v14.4S // ....................................................................................................................................*........... - // gap // ................................................................................................................................................ - sqrdmulh v29.4S, v29.4S, v12.4S // .....................................................................................................................................*.......... - // gap // ................................................................................................................................................ - mul v12.4S, v28.4S, v1.4S // .........................................................................................................................................*...... - // gap // ................................................................................................................................................ - sqrdmulh v1.4S, v28.4S, v30.4S // ..........................................................................................................................................*..... - // gap // ................................................................................................................................................ - ldr_vi v6, x4, 64 // ..........e..................................................................................................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mls v14.4S, v29.4S, v8.S[0] // ......................................................................................................................................*......... - // gap // ................................................................................................................................................ - mls v12.4S, v1.4S, v8.S[0] // ...........................................................................................................................................*.... - // gap // ................................................................................................................................................ - ldr_vo v31, x4, -48 // ...........e.................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - sub v1.4S, v2.4S, v14.4S // .......................................................................................................................................*........ 
- // gap // ................................................................................................................................................ - add v0.4S, v2.4S, v14.4S // ........................................................................................................................................*....... - // gap // ................................................................................................................................................ - sub v3.4S, v20.4S, v12.4S // ............................................................................................................................................*... - // gap // ................................................................................................................................................ - add v2.4S, v20.4S, v12.4S // .............................................................................................................................................*.. - // gap // ................................................................................................................................................ - ldr_vo v13, x4, -32 // ............e................................................................................................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v26, x4, -16 // .............e.................................................................................................................................. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vi v4, x5, 192 // ..........................................................................................e..................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v15, x5, -176 // ...........................................................................................e.................................................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v19, x5, -160 // ............................................................................................e................................................... 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v10, x5, -144 // .............................................................................................e.................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v27, x5, -128 // ..............................................................................................e................................................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v17, x5, -112 // ...............................................................................................e................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v18, x5, -96 // ....................................................................................................................e........................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - ldr_vo v7, x5, -80 // .....................................................................................................................e.......................... - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - st4 {v21.4S,v22.4S,v23.4S,v24.4S}, [x1], #64 // ..............................................................................................................................................*. 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - st4 {v0.4S,v1.4S,v2.4S,v3.4S}, [x2], #64 // ...............................................................................................................................................* - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ 
- // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - - // original source code - // ldr_vo v9, x1, 64 // .........................*.......................................................................................................................................... || ................................................*................................................................................................................................................................. - // ldr_vo v10, x1, 80 // ..........................*......................................................................................................................................... || ..................................................*............................................................................................................................................................... 
- // ldr_vo v11, x1, 96 // ...........................*........................................................................................................................................ || ....................................................*............................................................................................................................................................. - // ldr_vo v12, x1, 112 // ............................*....................................................................................................................................... || ......................................................*........................................................................................................................................................... - // ldr_vo v13, x2, 64 // ....................*............................................................................................................................................... || ........................................*......................................................................................................................................................................... - // ldr_vo v14, x2, 80 // .....................*.............................................................................................................................................. || ..........................................*....................................................................................................................................................................... - // ldr_vo v15, x2, 96 // ......................*............................................................................................................................................. 
|| ............................................*..................................................................................................................................................................... - // ldr_vo v16, x2, 112 // .......................*............................................................................................................................................ || ..............................................*................................................................................................................................................................... - // add x1, x1, #64 // .............................*...................................................................................................................................... || ......................................................*........................................................................................................................................................... - // add x2, x2, #64 // ........................*........................................................................................................................................... || ..............................................*................................................................................................................................................................... - // ldr_vi v0, x4, 64 // e................................................................................................................................................................... || e................................................................................................................................................................................................................. 
- // ldr_vo v1, x4, -48 // ...e................................................................................................................................................................ || ....e............................................................................................................................................................................................................. - // ldr_vo v2, x4, -32 // ........e........................................................................................................................................................... || ..........e....................................................................................................................................................................................................... - // ldr_vo v3, x4, -16 // .........e.......................................................................................................................................................... || ............e..................................................................................................................................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // ..............................*..................................................................................................................................... || ........................................................*......................................................................................................................................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ...............................*.................................................................................................................................... 
|| .........................................................*........................................................................................................................................................ - // mls v24.4S, v13.4S, v8.S[0] // ......................................*............................................................................................................................. || ................................................................*................................................................................................................................................. - // sub v13.4S, v9.4S, v24.4S // ..........................................*......................................................................................................................... || ....................................................................*............................................................................................................................................. - // add v9.4S, v9.4S, v24.4S // ...........................................*........................................................................................................................ || .....................................................................*............................................................................................................................................ - // mul v24.4S, v14.4S, v0.S[0] // ................................*................................................................................................................................... || ..........................................................*....................................................................................................................................................... 
- // sqrdmulh v14.4S, v14.4S, v0.S[1] // .................................*.................................................................................................................................. || ...........................................................*...................................................................................................................................................... - // mls v24.4S, v14.4S, v8.S[0] // .......................................*............................................................................................................................ || .................................................................*................................................................................................................................................ - // sub v14.4S, v10.4S, v24.4S // ............................................*....................................................................................................................... || ......................................................................*........................................................................................................................................... - // add v10.4S, v10.4S, v24.4S // .............................................*...................................................................................................................... || .......................................................................*.......................................................................................................................................... - // mul v24.4S, v15.4S, v0.S[0] // ..................................*................................................................................................................................. 
|| ............................................................*..................................................................................................................................................... - // sqrdmulh v15.4S, v15.4S, v0.S[1] // ...................................*................................................................................................................................ || .............................................................*.................................................................................................................................................... - // mls v24.4S, v15.4S, v8.S[0] // ........................................*........................................................................................................................... || ..................................................................*............................................................................................................................................... - // sub v15.4S, v11.4S, v24.4S // ..............................................*..................................................................................................................... || ........................................................................*......................................................................................................................................... - // add v11.4S, v11.4S, v24.4S // ...............................................*.................................................................................................................... || .........................................................................*........................................................................................................................................ 
- // mul v24.4S, v16.4S, v0.S[0] // ....................................*............................................................................................................................... || ..............................................................*................................................................................................................................................... - // sqrdmulh v16.4S, v16.4S, v0.S[1] // .....................................*.............................................................................................................................. || ...............................................................*.................................................................................................................................................. - // mls v24.4S, v16.4S, v8.S[0] // .........................................*.......................................................................................................................... || ...................................................................*.............................................................................................................................................. - // sub v16.4S, v12.4S, v24.4S // ................................................*................................................................................................................... || ..........................................................................*....................................................................................................................................... - // add v12.4S, v12.4S, v24.4S // .................................................*.................................................................................................................. 
|| ...........................................................................*...................................................................................................................................... - // mul v24.4S, v11.4S, v0.S[2] // ..................................................*................................................................................................................. || ............................................................................*..................................................................................................................................... - // sqrdmulh v11.4S, v11.4S, v0.S[3] // ...................................................*................................................................................................................ || .............................................................................*.................................................................................................................................... - // mls v24.4S, v11.4S, v8.S[0] // .......................................................*............................................................................................................ || .................................................................................*................................................................................................................................ - // sub v11.4S, v9.4S, v24.4S // ...........................................................*........................................................................................................ || .....................................................................................*............................................................................................................................ 
- // add v9.4S, v9.4S, v24.4S // ............................................................*....................................................................................................... || ......................................................................................*........................................................................................................................... - // mul v24.4S, v12.4S, v0.S[2] // ....................................................*............................................................................................................... || ..............................................................................*................................................................................................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // .....................................................*.............................................................................................................. || ...............................................................................*.................................................................................................................................. - // mls v24.4S, v12.4S, v8.S[0] // .........................................................*.......................................................................................................... || ...................................................................................*.............................................................................................................................. - // sub v12.4S, v10.4S, v24.4S // .............................................................*...................................................................................................... 
|| .......................................................................................*.......................................................................................................................... - // add v10.4S, v10.4S, v24.4S // ..............................................................*..................................................................................................... || ........................................................................................*......................................................................................................................... - // mul v24.4S, v15.4S, v1.S[0] // ......................................................*............................................................................................................. || ................................................................................*................................................................................................................................. - // sqrdmulh v15.4S, v15.4S, v1.S[1] // ........................................................*........................................................................................................... || ..................................................................................*............................................................................................................................... - // mls v24.4S, v15.4S, v8.S[0] // ...............................................................*.................................................................................................... || .........................................................................................*........................................................................................................................ 
- // sub v15.4S, v13.4S, v24.4S // ...................................................................*................................................................................................ || .............................................................................................*.................................................................................................................... - // add v13.4S, v13.4S, v24.4S // ....................................................................*............................................................................................... || ..............................................................................................*................................................................................................................... - // mul v24.4S, v16.4S, v1.S[0] // ..........................................................*......................................................................................................... || ....................................................................................*............................................................................................................................. - // sqrdmulh v16.4S, v16.4S, v1.S[1] // ................................................................*................................................................................................... || ..........................................................................................*....................................................................................................................... - // mls v24.4S, v16.4S, v8.S[0] // .....................................................................*.............................................................................................. 
|| ...............................................................................................*.................................................................................................................. - // sub v16.4S, v14.4S, v24.4S // .........................................................................*.......................................................................................... || ...................................................................................................*.............................................................................................................. - // add v14.4S, v14.4S, v24.4S // ..........................................................................*......................................................................................... || ....................................................................................................*............................................................................................................. - // mul v24.4S, v10.4S, v1.S[2] // .................................................................*.................................................................................................. || ...........................................................................................*...................................................................................................................... - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ..................................................................*................................................................................................. || ............................................................................................*..................................................................................................................... 
- // mls v24.4S, v10.4S, v8.S[0] // ......................................................................*............................................................................................. || ................................................................................................*................................................................................................................. - // sub v10.4S, v9.4S, v24.4S // ...........................................................................*........................................................................................ || .....................................................................................................*............................................................................................................ - // add v9.4S, v9.4S, v24.4S // ............................................................................*....................................................................................... || ......................................................................................................*........................................................................................................... - // mul v24.4S, v12.4S, v2.S[0] // .......................................................................*............................................................................................ || .................................................................................................*................................................................................................................ - // sqrdmulh v12.4S, v12.4S, v2.S[1] // ........................................................................*........................................................................................... 
|| ..................................................................................................*............................................................................................................... - // mls v24.4S, v12.4S, v8.S[0] // .............................................................................*...................................................................................... || .......................................................................................................*.......................................................................................................... - // sub v12.4S, v11.4S, v24.4S // .................................................................................*.................................................................................. || ...........................................................................................................*...................................................................................................... - // add v11.4S, v11.4S, v24.4S // ..................................................................................*................................................................................. || ............................................................................................................*..................................................................................................... - // mul v24.4S, v14.4S, v2.S[2] // ..............................................................................*..................................................................................... || ........................................................................................................*......................................................................................................... 
- // sqrdmulh v14.4S, v14.4S, v2.S[3] // ...............................................................................*.................................................................................... || .........................................................................................................*........................................................................................................ - // mls v24.4S, v14.4S, v8.S[0] // ...................................................................................*................................................................................ || .............................................................................................................*.................................................................................................... - // sub v14.4S, v13.4S, v24.4S // .......................................................................................*............................................................................ || .................................................................................................................*................................................................................................ - // add v13.4S, v13.4S, v24.4S // ........................................................................................*........................................................................... || ..................................................................................................................*............................................................................................... - // mul v24.4S, v16.4S, v3.S[0] // ................................................................................*................................................................................... 
|| ..........................................................................................................*....................................................................................................... - // sqrdmulh v16.4S, v16.4S, v3.S[1] // ....................................................................................*............................................................................... || ..............................................................................................................*................................................................................................... - // mls v24.4S, v16.4S, v8.S[0] // .........................................................................................*.......................................................................... || ...................................................................................................................*.............................................................................................. - // sub v16.4S, v15.4S, v24.4S // .............................................................................................*...................................................................... || .......................................................................................................................*.......................................................................................... - // add v15.4S, v15.4S, v24.4S // ..............................................................................................*..................................................................... || ........................................................................................................................*......................................................................................... 
- // trn1 v25.4S, v9.4S, v10.4S // .....................................................................................*.............................................................................. || ...............................................................................................................*.................................................................................................. - // trn2 v26.4S, v9.4S, v10.4S // ......................................................................................*............................................................................. || ................................................................................................................*................................................................................................. - // trn1 v27.4S, v11.4S, v12.4S // ..........................................................................................*......................................................................... || ....................................................................................................................*............................................................................................. - // trn2 v28.4S, v11.4S, v12.4S // ...........................................................................................*........................................................................ || .....................................................................................................................*............................................................................................ - // trn2 v11.2D, v25.2D, v27.2D // ...............................................................................................*.................................................................... 
|| .........................................................................................................................*........................................................................................ - // trn2 v12.2D, v26.2D, v28.2D // ................................................................................................*................................................................... || ..........................................................................................................................*....................................................................................... - // trn1 v9.2D, v25.2D, v27.2D // .................................................................................................*.................................................................. || ...........................................................................................................................*...................................................................................... - // trn1 v10.2D, v26.2D, v28.2D // ..................................................................................................*................................................................. || ............................................................................................................................*..................................................................................... - // trn1 v25.4S, v13.4S, v14.4S // ............................................................................................*....................................................................... || ......................................................................................................................*........................................................................................... 
- // trn2 v26.4S, v13.4S, v14.4S // ...................................................................................................*................................................................ || .............................................................................................................................*.................................................................................... - // trn1 v27.4S, v15.4S, v16.4S // ....................................................................................................*............................................................... || ..............................................................................................................................*................................................................................... - // trn2 v28.4S, v15.4S, v16.4S // .....................................................................................................*.............................................................. || ...............................................................................................................................*.................................................................................. - // trn2 v15.2D, v25.2D, v27.2D // .......................................................................................................*............................................................ || .................................................................................................................................*................................................................................ - // trn2 v16.2D, v26.2D, v28.2D // ........................................................................................................*........................................................... 
|| ..................................................................................................................................*............................................................................... - // trn1 v13.2D, v25.2D, v27.2D // .........................................................................................................*.......................................................... || ...................................................................................................................................*.............................................................................. - // trn1 v14.2D, v26.2D, v28.2D // ..........................................................................................................*......................................................... || ....................................................................................................................................*............................................................................. - // ldr_vi v0, x5, 192 // ..........e......................................................................................................................................................... || ..............e................................................................................................................................................................................................... - // ldr_vo v4, x5, -176 // ...........e........................................................................................................................................................ || ................e................................................................................................................................................................................................. 
- // ldr_vo v1, x5, -160 // ............e....................................................................................................................................................... || ..................e............................................................................................................................................................................................... - // ldr_vo v5, x5, -144 // .............e...................................................................................................................................................... || ....................e............................................................................................................................................................................................. - // ldr_vo v2, x5, -128 // ..............e..................................................................................................................................................... || ......................e........................................................................................................................................................................................... - // ldr_vo v6, x5, -112 // ...............e.................................................................................................................................................... || ........................e......................................................................................................................................................................................... - // mul v24.4S, v11.4S, v0.4S // ......................................................................................................*............................................................. 
|| ................................................................................................................................*................................................................................. - // sqrdmulh v11.4S, v11.4S, v4.4S // ...........................................................................................................*........................................................ || .....................................................................................................................................*............................................................................ - // mls v24.4S, v11.4S, v8.S[0] // ...............................................................................................................*.................................................... || .........................................................................................................................................*........................................................................ - // sub v11.4S, v9.4S, v24.4S // ...................................................................................................................*................................................ || .............................................................................................................................................*.................................................................... - // add v9.4S, v9.4S, v24.4S // ....................................................................................................................*............................................... || ..............................................................................................................................................*................................................................... 
- // mul v24.4S, v12.4S, v0.4S // ............................................................................................................*....................................................... || ......................................................................................................................................*........................................................................... - // sqrdmulh v12.4S, v12.4S, v4.4S // .............................................................................................................*...................................................... || .......................................................................................................................................*.......................................................................... - // mls v24.4S, v12.4S, v8.S[0] // .................................................................................................................*.................................................. || ...........................................................................................................................................*...................................................................... - // sub v12.4S, v10.4S, v24.4S // .....................................................................................................................*.............................................. || ...............................................................................................................................................*.................................................................. - // add v10.4S, v10.4S, v24.4S // ......................................................................................................................*............................................. 
|| ................................................................................................................................................*................................................................. - // mul v24.4S, v10.4S, v1.4S // .........................................................................................................................*.......................................... || ...................................................................................................................................................*.............................................................. - // sqrdmulh v10.4S, v10.4S, v5.4S // ..........................................................................................................................*......................................... || ....................................................................................................................................................*............................................................. - // mls v24.4S, v10.4S, v8.S[0] // ..............................................................................................................................*..................................... || ........................................................................................................................................................*......................................................... - // sub v10.4S, v9.4S, v24.4S // ..................................................................................................................................*................................. || .............................................................................................................................................................*.................................................... 
- // add v9.4S, v9.4S, v24.4S // ...................................................................................................................................*................................ || ..............................................................................................................................................................*................................................... - // mul v24.4S, v12.4S, v2.4S // ........................................................................................................................*........................................... || ..................................................................................................................................................*............................................................... - // sqrdmulh v12.4S, v12.4S, v6.4S // ...........................................................................................................................*........................................ || .....................................................................................................................................................*............................................................ - // mls v24.4S, v12.4S, v8.S[0] // ...............................................................................................................................*.................................... || .........................................................................................................................................................*........................................................ - // sub v12.4S, v11.4S, v24.4S // ....................................................................................................................................*............................... 
|| ...............................................................................................................................................................*.................................................. - // add v11.4S, v11.4S, v24.4S // .....................................................................................................................................*.............................. || ................................................................................................................................................................*................................................. - // ldr_vo v0, x5, -96 // ................e................................................................................................................................................... || ..........................e....................................................................................................................................................................................... - // ldr_vo v4, x5, -80 // .................e.................................................................................................................................................. || ............................e..................................................................................................................................................................................... - // ldr_vo v1, x5, -64 // .................................................................................................................................*.................................. || ...........................................................................................................................................................*...................................................... 
- // ldr_vo v5, x5, -48 // .......................................................................................................................................*............................ || ..................................................................................................................................................................*............................................... - // ldr_vo v2, x5, -32 // ........................................................................................................................................*........................... || ....................................................................................................................................................................*............................................. - // ldr_vo v6, x5, -16 // ...........................................................................................................................................*........................ || ........................................................................................................................................................................*......................................... - // mul v24.4S, v15.4S, v0.4S // ..............................................................................................................*..................................................... || ........................................................................................................................................*......................................................................... - // sqrdmulh v15.4S, v15.4S, v4.4S // ................................................................................................................*................................................... 
|| ..........................................................................................................................................*....................................................................... - // mls v24.4S, v15.4S, v8.S[0] // .......................................................................................................................*............................................ || .................................................................................................................................................*................................................................ - // sub v15.4S, v13.4S, v24.4S // ............................................................................................................................*....................................... || ......................................................................................................................................................*........................................................... - // add v13.4S, v13.4S, v24.4S // .............................................................................................................................*...................................... || .......................................................................................................................................................*.......................................................... - // mul v24.4S, v16.4S, v0.4S // ..................................................................................................................*................................................. || ............................................................................................................................................*..................................................................... 
- // sqrdmulh v16.4S, v16.4S, v4.4S // ................................................................................................................................*................................... || ..........................................................................................................................................................*....................................................... - // mls v24.4S, v16.4S, v8.S[0] // ......................................................................................................................................*............................. || .................................................................................................................................................................*................................................ - // sub v16.4S, v14.4S, v24.4S // .........................................................................................................................................*.......................... || ......................................................................................................................................................................*........................................... - // add v14.4S, v14.4S, v24.4S // ..........................................................................................................................................*......................... || .......................................................................................................................................................................*.......................................... - // mul v24.4S, v14.4S, v1.4S // ............................................................................................................................................*....................... 
|| ..........................................................................................................................................................................*....................................... - // sqrdmulh v14.4S, v14.4S, v5.4S // .............................................................................................................................................*...................... || ...........................................................................................................................................................................*...................................... - // mls v24.4S, v14.4S, v8.S[0] // .................................................................................................................................................*.................. || ................................................................................................................................................................................*................................. - // sub v14.4S, v13.4S, v24.4S // ....................................................................................................................................................*............... || ....................................................................................................................................................................................*............................. - // add v13.4S, v13.4S, v24.4S // .....................................................................................................................................................*.............. || .....................................................................................................................................................................................*............................ 
- // mul v24.4S, v16.4S, v2.4S // ..............................................................................................................................................*..................... || ............................................................................................................................................................................*..................................... - // sqrdmulh v16.4S, v16.4S, v6.4S // ...............................................................................................................................................*.................... || .............................................................................................................................................................................*.................................... - // mls v24.4S, v16.4S, v8.S[0] // ..................................................................................................................................................*................. || .................................................................................................................................................................................*................................ - // sub v16.4S, v15.4S, v24.4S // ......................................................................................................................................................*............. || ......................................................................................................................................................................................*........................... - // add v15.4S, v15.4S, v24.4S // .......................................................................................................................................................*............ 
|| .......................................................................................................................................................................................*.......................... - // st4 {v9.4S,v10.4S,v11.4S,v12.4S}, [x1], #64 // ..................................................................................................................................................................*. || ............................................................................................................................................................................................................*..... - // st4 {v13.4S,v14.4S,v15.4S,v16.4S}, [x2], #64 // ...................................................................................................................................................................* || .................................................................................................................................................................................................................* - - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 174 + // Expected IPC: 0.83 + // + // Wall time: 45.34s + // User time: 45.34s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + add x1, x1, #64 // ........*....................................................................................................................................... + add x2, x2, #64 // .........*...................................................................................................................................... 
+ sqrdmulh v6.4S, v16.4S, v21.S[1] // ..............*................................................................................................................................. + // gap // ................................................................................................................................................ + mul v18.4S, v16.4S, v21.S[0] // ...............*................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v22.4S, v21.S[1] // ...................*............................................................................................................................ + // gap // ................................................................................................................................................ + mul v22.4S, v22.4S, v21.S[0] // ....................*........................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v9.4S, v21.S[1] // ........................*....................................................................................................................... + // gap // ................................................................................................................................................ + mls v18.4S, v6.4S, v8.S[0] // ................*............................................................................................................................... 
+ // gap // ................................................................................................................................................ + mul v6.4S, v9.4S, v21.S[0] // .........................*...................................................................................................................... + // gap // ................................................................................................................................................ + mls v22.4S, v16.4S, v8.S[0] // .....................*.......................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v4.4S, v21.S[1] // .............................*.................................................................................................................. + // gap // ................................................................................................................................................ + sub v20.4S, v31.4S, v18.4S // .................*.............................................................................................................................. + // gap // ................................................................................................................................................ + mls v6.4S, v19.4S, v8.S[0] // ..........................*..................................................................................................................... + // gap // ................................................................................................................................................ 
+ add v18.4S, v31.4S, v18.4S // ..................*............................................................................................................................. + // gap // ................................................................................................................................................ + sub v19.4S, v3.4S, v22.4S // ......................*......................................................................................................................... + // gap // ................................................................................................................................................ + add v22.4S, v3.4S, v22.4S // .......................*........................................................................................................................ + // gap // ................................................................................................................................................ + sub v31.4S, v12.4S, v6.4S // ...........................*.................................................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v12.4S, v6.4S // ............................*................................................................................................................... + // gap // ................................................................................................................................................ + mul v3.4S, v4.4S, v21.S[0] // ..............................*................................................................................................................. + // gap // ................................................................................................................................................ 
+ sqrdmulh v12.4S, v31.4S, v7.S[1] // ............................................*................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v9.4S, v6.4S, v21.S[3] // ..................................*............................................................................................................. + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v21.S[2] // ...................................*............................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v16.4S, v8.S[0] // ...............................*................................................................................................................ + // gap // ................................................................................................................................................ + mul v16.4S, v31.4S, v7.S[0] // .............................................*.................................................................................................. + // gap // ................................................................................................................................................ + ldr q31, [x4, #-32] // ............*................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v9.4S, v8.S[0] // ....................................*........................................................................................................... + // gap // ................................................................................................................................................ + mls v16.4S, v12.4S, v8.S[0] // ..............................................*................................................................................................. + // gap // ................................................................................................................................................ + add v12.4S, v15.4S, v3.4S // .................................*.............................................................................................................. + // gap // ................................................................................................................................................ + sub v3.4S, v15.4S, v3.4S // ................................*............................................................................................................... + // gap // ................................................................................................................................................ + sub v9.4S, v18.4S, v6.4S // .....................................*.......................................................................................................... 
+ // gap // ................................................................................................................................................ + sub v15.4S, v20.4S, v16.4S // ...............................................*................................................................................................ + // gap // ................................................................................................................................................ + add v16.4S, v20.4S, v16.4S // ................................................*............................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v18.4S, v6.4S // ......................................*......................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v18.4S, v3.4S, v7.S[1] // .................................................*.............................................................................................. + // gap // ................................................................................................................................................ + mul v20.4S, v3.4S, v7.S[0] // ..................................................*............................................................................................. + // gap // ................................................................................................................................................ 
+ sqrdmulh v3.4S, v12.4S, v21.S[3] // .......................................*........................................................................................................ + // gap // ................................................................................................................................................ + ldr q4, [x4, #-16] // .............*.................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v20.4S, v18.4S, v8.S[0] // ...................................................*............................................................................................ + // gap // ................................................................................................................................................ + mul v18.4S, v12.4S, v21.S[2] // ........................................*....................................................................................................... + // gap // ................................................................................................................................................ + ldr q12, [x5], #(12*16) // ..........................................................................................*..................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v19.4S, v20.4S // ....................................................*........................................................................................... + // gap // ................................................................................................................................................ + add v19.4S, v19.4S, v20.4S // .....................................................*.......................................................................................... + // gap // ................................................................................................................................................ + mls v18.4S, v3.4S, v8.S[0] // .........................................*...................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v26.4S, v4.S[1] // .....................................................................*.......................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v19.4S, v31.S[3] // ................................................................*............................................................................... + // gap // ................................................................................................................................................ 
+ mul v19.4S, v19.4S, v31.S[2] // .................................................................*.............................................................................. + // gap // ................................................................................................................................................ + sub v21.4S, v22.4S, v18.4S // ..........................................*..................................................................................................... + // gap // ................................................................................................................................................ + add v18.4S, v22.4S, v18.4S // ...........................................*.................................................................................................... + // gap // ................................................................................................................................................ + mul v22.4S, v26.4S, v4.S[0] // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v21.4S, v31.S[1] // ...........................................................*.................................................................................... + // gap // ................................................................................................................................................ + mul v31.4S, v21.4S, v31.S[0] // ............................................................*................................................................................... 
+ // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v18.4S, v7.S[3] // ......................................................*......................................................................................... + // gap // ................................................................................................................................................ + mul v18.4S, v18.4S, v7.S[2] // .......................................................*........................................................................................ + // gap // ................................................................................................................................................ + mls v19.4S, v3.4S, v8.S[0] // ..................................................................*............................................................................. + // gap // ................................................................................................................................................ + mls v31.4S, v4.4S, v8.S[0] // .............................................................*.................................................................................. + // gap // ................................................................................................................................................ + mls v22.4S, v20.4S, v8.S[0] // .......................................................................*........................................................................ + // gap // ................................................................................................................................................ 
+ mls v18.4S, v26.4S, v8.S[0] // ........................................................*....................................................................................... + // gap // ................................................................................................................................................ + sub v20.4S, v16.4S, v19.4S // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + sub v3.4S, v9.4S, v31.4S // ..............................................................*................................................................................. + // gap // ................................................................................................................................................ + add v31.4S, v9.4S, v31.4S // ...............................................................*................................................................................ + // gap // ................................................................................................................................................ + sub v9.4S, v15.4S, v22.4S // ........................................................................*....................................................................... + // gap // ................................................................................................................................................ + add v22.4S, v15.4S, v22.4S // .........................................................................*...................................................................... 
+ // gap // ................................................................................................................................................ + add v16.4S, v16.4S, v19.4S // ....................................................................*........................................................................... + // gap // ................................................................................................................................................ + sub v19.4S, v6.4S, v18.4S // .........................................................*...................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v6.4S, v18.4S // ..........................................................*..................................................................................... + // gap // ................................................................................................................................................ + trn1 v18.4S, v31.4S, v3.4S // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + trn2 v31.4S, v31.4S, v3.4S // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + trn1 v3.4S, v6.4S, v19.4S // ..........................................................................*..................................................................... 
+ // gap // ................................................................................................................................................ + trn2 v6.4S, v6.4S, v19.4S // ...........................................................................*.................................................................... + // gap // ................................................................................................................................................ + trn1 v19.4S, v16.4S, v20.4S // ..................................................................................*............................................................. + // gap // ................................................................................................................................................ + trn2 v15.2D, v3.2D, v18.2D // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + trn2 v4.2D, v6.2D, v31.2D // ...............................................................................*................................................................ + // gap // ................................................................................................................................................ + trn1 v18.2D, v3.2D, v18.2D // ................................................................................*............................................................... + // gap // ................................................................................................................................................ 
+ trn1 v6.2D, v6.2D, v31.2D // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + trn2 v16.4S, v16.4S, v20.4S // ...................................................................................*............................................................ + // gap // ................................................................................................................................................ + trn1 v20.4S, v22.4S, v9.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ + trn2 v22.4S, v22.4S, v9.4S // .....................................................................................*.......................................................... + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v15.4S, v2.4S // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + trn2 v3.2D, v19.2D, v20.2D // ......................................................................................*......................................................... 
+ // gap // ................................................................................................................................................ + mul v9.4S, v15.4S, v12.4S // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + mul v12.4S, v4.4S, v12.4S // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + trn2 v15.2D, v16.2D, v22.2D // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + trn1 v19.2D, v19.2D, v20.2D // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + trn1 v16.2D, v16.2D, v22.2D // .........................................................................................*...................................................... + // gap // ................................................................................................................................................ 
+ mls v9.4S, v31.4S, v8.S[0] // ..................................................................................................*............................................. + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v4.4S, v2.4S // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v3.4S, v1.4S // ..........................................................................................................................*..................... + // gap // ................................................................................................................................................ + mul v31.4S, v3.4S, v23.4S // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + sub v3.4S, v18.4S, v9.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ + add v18.4S, v18.4S, v9.4S // ....................................................................................................*........................................... 
+ // gap // ................................................................................................................................................ + mls v12.4S, v22.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + mls v31.4S, v20.4S, v8.S[0] // ............................................................................................................................*................... + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v15.4S, v1.4S // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ + mul v20.4S, v15.4S, v23.4S // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + sub v9.4S, v6.4S, v12.4S // ........................................................................................................*....................................... + // gap // ................................................................................................................................................ 
+ add v6.4S, v6.4S, v12.4S // .........................................................................................................*...................................... + // gap // ................................................................................................................................................ + sub v12.4S, v19.4S, v31.4S // .............................................................................................................................*.................. + // gap // ................................................................................................................................................ + sqrdmulh v15.4S, v9.4S, v5.4S // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v6.4S, v14.4S // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v30.4S // ...........................................................................................................*.................................... + // gap // ................................................................................................................................................ + mul v9.4S, v9.4S, v11.4S // ................................................................................................................*............................... 
+ // gap // ................................................................................................................................................ + add v19.4S, v19.4S, v31.4S // ..............................................................................................................................*................. + // gap // ................................................................................................................................................ + mls v20.4S, v22.4S, v8.S[0] // .................................................................................................................................*.............. + // gap // ................................................................................................................................................ + mls v6.4S, v4.4S, v8.S[0] // ............................................................................................................*................................... + // gap // ................................................................................................................................................ + mls v9.4S, v15.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ + ldr q22, [x5, #-64] // ......................................................................................................................*......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v26.4S, v18.4S, v6.4S // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + add v25.4S, v18.4S, v6.4S // ..............................................................................................................*................................. + // gap // ................................................................................................................................................ + sub v28.4S, v3.4S, v9.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + add v27.4S, v3.4S, v9.4S // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + sub v6.4S, v16.4S, v20.4S // ..................................................................................................................................*............. + // gap // ................................................................................................................................................ + add v18.4S, v16.4S, v20.4S // ...................................................................................................................................*............ 
+ // gap // ................................................................................................................................................ + ldr q16, [x5, #-48] // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v22.4S, v18.4S, v22.4S // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v6.4S, v17.4S // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ + sqrdmulh v18.4S, v18.4S, v16.4S // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v0.4S // ..........................................................................................................................................*..... 
+ // gap // ................................................................................................................................................ + ldr q31, [x1, #128] // e............................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v18.4S, v8.S[0] // ......................................................................................................................................*......... + // gap // ................................................................................................................................................ + mls v6.4S, v20.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + ldr q3, [x1, #144] // .e.............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v18.4S, v19.4S, v22.4S // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + add v17.4S, v19.4S, v22.4S // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + sub v20.4S, v12.4S, v6.4S // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + add v19.4S, v12.4S, v6.4S // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + ldr q12, [x1, #160] // ..e............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q15, [x1, #176] // ...e............................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q16, [x2, #128] // ....e........................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q22, [x2, #144] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q9, [x2, #160] // ......e......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q4, [x2, #176] // .......e........................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q21, [x4], #64 // ..........e..................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q7, [x4, #-48] // ...........e.................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q2, [x5, #16] // ...........................................................................................e.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q30, [x5, #32] // ............................................................................................e................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q14, [x5, #48] // .............................................................................................e.................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q11, [x5, #64] // ..............................................................................................e................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q5, [x5, #80] // ...............................................................................................e................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q23, [x5, #96] // ....................................................................................................................e........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q1, [x5, #112] // .....................................................................................................................e.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q0, [x5, #160] // ........................................................................................................................e....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ..............................................................................................................................................*. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x2], #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q17, [x5, #176] // .........................................................................................................................e...................... + // gap // ................................................................................................................................................ 
+ + // ----------------------------------------------------------------------------- new position ------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x1, #(16*0 + (64))] // e..........................'....................................................................................................................~......................... + // ldr q10, [x1, #(16*1 + (64))] // ...e.......................'.......................................................................................................................~...................... + // ldr q11, [x1, #(16*2 + (64))] // ........e..................'............................................................................................................................~................. + // ldr q12, [x1, #(16*3 + (64))] // .........e.................'.............................................................................................................................~................ + // ldr q13, [x2, #(16*0 + (64))] // ..........e................'..............................................................................................................................~............... + // ldr q14, [x2, #(16*1 + (64))] // ...........e...............'...............................................................................................................................~.............. + // ldr q15, [x2, #(16*2 + (64))] // ............e..............'................................................................................................................................~............. 
+ // ldr q16, [x2, #(16*3 + (64))] // .............e.............'.................................................................................................................................~............ + // add x1, x1, #64 // ...........................*.............................................................................................................................................. + // add x2, x2, #64 // ...........................'*............................................................................................................................................. + // ldr q0, [x4], #64 // ..............e............'..................................................................................................................................~........... + // ldr q1, [x4, #(-64 + 16)] // ...............e...........'...................................................................................................................................~.......... + // ldr q2, [x4, #(-64 + 32)] // ...........................'.......................*...................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ...........................'...................................*.......................................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................'.*............................................................................................................................................ + // mul v24.4s, v13.4s, v0.s[0] // ...........................'..*........................................................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'......*....................................................................................................................................... + // sub v13.4s, v9.4s, v24.4s // ...........................'..........*................................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'............*................................................................................................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...........................'...*.......................................................................................................................................... + // mul v24.4s, v14.4s, v0.s[0] // ...........................'....*......................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'........*..................................................................................................................................... + // sub v14.4s, v10.4s, v24.4s // ...........................'.............*................................................................................................................................ + // add v10.4s, v10.4s, v24.4s // ...........................'..............*............................................................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...........................'.....*........................................................................................................................................ 
+ // mul v24.4s, v15.4s, v0.s[0] // ...........................'.......*...................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'...........*.................................................................................................................................. + // sub v15.4s, v11.4s, v24.4s // ...........................'...............*.............................................................................................................................. + // add v11.4s, v11.4s, v24.4s // ...........................'................*............................................................................................................................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...........................'.........*.................................................................................................................................... + // mul v24.4s, v16.4s, v0.s[0] // ...........................'.................*............................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................*........................................................................................................................ + // sub v16.4s, v12.4s, v24.4s // ...........................'...........................*.................................................................................................................. + // add v12.4s, v12.4s, v24.4s // ...........................'..........................*................................................................................................................... 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...........................'...................*.......................................................................................................................... + // mul v24.4s, v11.4s, v0.s[2] // ...........................'....................*......................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'........................*..................................................................................................................... + // sub v11.4s, v9.4s, v24.4s // ...........................'............................*................................................................................................................. + // add v9.4s, v9.4s, v24.4s // ...........................'...............................*.............................................................................................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...........................'..................................*........................................................................................................... + // mul v24.4s, v12.4s, v0.s[2] // ...........................'.....................................*........................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................................*.................................................................................................... + // sub v12.4s, v10.4s, v24.4s // ...........................'.............................................*................................................................................................ 
+ // add v10.4s, v10.4s, v24.4s // ...........................'..............................................*............................................................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...........................'..................*........................................................................................................................... + // mul v24.4s, v15.4s, v1.s[0] // ...........................'......................*....................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................*.................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ...........................'.............................*................................................................................................................ + // add v13.4s, v13.4s, v24.4s // ...........................'..............................*............................................................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...........................'................................*............................................................................................................. + // mul v24.4s, v16.4s, v1.s[0] // ...........................'.................................*............................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'....................................*......................................................................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ...........................'.......................................*...................................................................................................... + // add v14.4s, v14.4s, v24.4s // ...........................'........................................*..................................................................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...........................'..................................................*........................................................................................... + // mul v24.4s, v10.4s, v1.s[2] // ...........................'...................................................*.......................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.......................................................*...................................................................................... + // sub v10.4s, v9.4s, v24.4s // ...........................'..............................................................*............................................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'...............................................................*.............................................................................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...........................'................................................*............................................................................................. + // mul v24.4s, v12.4s, v2.s[0] // ...........................'.................................................*............................................................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................................................*........................................................................................ + // sub v12.4s, v11.4s, v24.4s // ...........................'.........................................................*.................................................................................... + // add v11.4s, v11.4s, v24.4s // ...........................'..........................................................*................................................................................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...........................'...........................................*.................................................................................................. + // mul v24.4s, v14.4s, v2.s[2] // ...........................'............................................*................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'....................................................*......................................................................................... + // sub v14.4s, v13.4s, v24.4s // ...........................'........................................................*..................................................................................... + // add v13.4s, v13.4s, v24.4s // ...........................'.............................................................*................................................................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ...........................'..........................................*................................................................................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // ...........................'...............................................*.............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'......................................................*....................................................................................... + // sub v16.4s, v15.4s, v24.4s // ...........................'...........................................................*.................................................................................. + // add v15.4s, v15.4s, v24.4s // ...........................'............................................................*................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ...........................'..................................................................*........................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...........................'...................................................................*.......................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ...........................'................................................................*............................................................................. + // trn2 v28.4s, v11.4s, v12.4s // ...........................'.................................................................*............................................................................ + // trn2 v11.2d, v25.2d, v27.2d // ...........................'.....................................................................*........................................................................ 
+ // trn2 v12.2d, v26.2d, v28.2d // ...........................'......................................................................*....................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ...........................'.......................................................................*...................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...........................'........................................................................*..................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...........................'....................................................................*......................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ...........................'.........................................................................*.................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...........................'..........................................................................*................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...........................'...........................................................................*.................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ...........................'.............................................................................*................................................................ + // trn2 v16.2d, v26.2d, v28.2d // ...........................'................................................................................*............................................................. 
+ // trn1 v13.2d, v25.2d, v27.2d // ...........................'.................................................................................*............................................................ + // trn1 v14.2d, v26.2d, v28.2d // ...........................'..................................................................................*........................................................... + // ldr q0, [ x5], #(12*16) // ...........................'......................................*....................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ................e..........'....................................................................................................................................~......... + // ldr q1, [ x5, #(-12*16 + 2*16)] // .................e.........'.....................................................................................................................................~........ + // ldr q5, [x5, #(-12*16 + 3*16)] // ..................e........'......................................................................................................................................~....... + // ldr q2, [ x5, #(-12*16 + 4*16)] // ...................e.......'.......................................................................................................................................~...... + // ldr q6, [x5, #(-12*16 + 5*16)] // ....................e......'........................................................................................................................................~..... + // sqrdmulh v27.4s, v11.4s, v4.4s // ...........................'............................................................................*................................................................. 
+ // mul v24.4s, v11.4s, v0.4s // ...........................'..............................................................................*............................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'...................................................................................*.......................................................... + // sub v11.4s, v9.4s, v24.4s // ...........................'.......................................................................................*...................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'........................................................................................*..................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ...........................'....................................................................................*......................................................... + // mul v24.4s, v12.4s, v0.4s // ...........................'...............................................................................*.............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................................................................................*.................................................... + // sub v12.4s, v10.4s, v24.4s // ...........................'.............................................................................................*................................................ + // add v10.4s, v10.4s, v24.4s // ...........................'..............................................................................................*............................................... 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // ...........................'.................................................................................................*............................................ + // mul v24.4s, v10.4s, v1.4s // ...........................'..................................................................................................*........................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'......................................................................................................*....................................... + // sub v10.4s, v9.4s, v24.4s // ...........................'.........................................................................................................*.................................... + // add v9.4s, v9.4s, v24.4s // ...........................'..........................................................................................................*................................... + // sqrdmulh v27.4s, v12.4s, v6.4s // ...........................'................................................................................................*............................................. + // mul v24.4s, v12.4s, v2.4s // ...........................'...................................................................................................*.......................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.......................................................................................................*...................................... + // sub v12.4s, v11.4s, v24.4s // ...........................'...........................................................................................................*.................................. 
+ // add v11.4s, v11.4s, v24.4s // ...........................'............................................................................................................*................................. + // ldr q0, [ x5, #(-12*16 + 6*16)] // .....................e.....'.........................................................................................................................................~.... + // ldr q4, [x5, #(-12*16 + 7*16)] // ......................e....'..........................................................................................................................................~... + // ldr q1, [ x5, #(-12*16 + 8*16)] // ...........................'........................................................................................................*..................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................'...............................................................................................................*.............................. + // ldr q2, [ x5, #(-12*16 + 10*16)] // .......................e...'...........................................................................................................................................~.. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..........................e'.............................................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ...........................'.....................................................................................*........................................................ + // mul v24.4s, v15.4s, v0.4s // ...........................'......................................................................................*....................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'..........................................................................................*................................................... + // sub v15.4s, v13.4s, v24.4s // ...........................'...............................................................................................*.............................................. + // add v13.4s, v13.4s, v24.4s // ...........................'....................................................................................................*......................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ...........................'...........................................................................................*.................................................. + // mul v24.4s, v16.4s, v0.4s // ...........................'............................................................................................*................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................................................................................................*........................................ + // sub v16.4s, v14.4s, v24.4s // ...........................'.............................................................................................................*................................ + // add v14.4s, v14.4s, v24.4s // ...........................'..............................................................................................................*............................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ...........................'..................................................................................................................*........................... 
+ // mul v24.4s, v14.4s, v1.4s // ...........................'................................................................................................................*............................. + // mls v24.4s, v27.4s, v8.s[0] // .~.........................'.....................................................................................................................*........................ + // sub v14.4s, v13.4s, v24.4s // ....~......................'........................................................................................................................*..................... + // add v13.4s, v13.4s, v24.4s // .....~.....................'.........................................................................................................................*.................... + // sqrdmulh v27.4s, v16.4s, v6.4s // ...........................'.................................................................................................................*............................ + // mul v24.4s, v16.4s, v2.4s // ...........................'...................................................................................................................*.......................... + // mls v24.4s, v27.4s, v8.s[0] // ..~........................'......................................................................................................................*....................... + // sub v16.4s, v15.4s, v24.4s // ......~....................'..........................................................................................................................*................... + // add v15.4s, v15.4s, v24.4s // .......~...................'...........................................................................................................................*.................. 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ........................~..'............................................................................................................................................*. + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // .........................~.'.............................................................................................................................................* + + sub count, count, #1 cbnz count, layer45678_start - ldr_vo v29, x5, -64 // .............................................................................................................*...................... - add x8, x2, #64 // ....*............................................................................................................................... - add x28, x1, #64 // .........*.......................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v12, x5, -48 // ...................................................................................................................*................ - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v1, x5, -32 // ....................................................................................................................*............... 
- // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v2, x5, -16 // .......................................................................................................................*............ - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v28, x2, 64 // *................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v30, x2, 80 // .*.................................................................................................................................. - // gap // .................................................................................................................................... 
- // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v14, x2, 96 // ..*................................................................................................................................. - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v5, x2, 112 // ...*................................................................................................................................ - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v11, x1, 64 // .....*.............................................................................................................................. - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... 
- // gap // .................................................................................................................................... - ldr_vo v3, x1, 80 // ......*............................................................................................................................. - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v22, x1, 96 // .......*............................................................................................................................ - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - ldr_vo v20, x1, 112 // ........*........................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... 
- mul v24.4S, v14.4S, v6.S[0] // ..............*..................................................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v6.S[1] // ...............*.................................................................................................................... - // gap // .................................................................................................................................... - mul v23.4S, v5.4S, v6.S[0] // ................*................................................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v5.4S, v5.4S, v6.S[1] // .................*.................................................................................................................. - // gap // .................................................................................................................................... - mul v21.4S, v28.4S, v6.S[0] // ..........*......................................................................................................................... - // gap // .................................................................................................................................... - mls v24.4S, v14.4S, v8.S[0] // ....................*............................................................................................................... - // gap // .................................................................................................................................... 
- sqrdmulh v28.4S, v28.4S, v6.S[1] // ...........*........................................................................................................................ - // gap // .................................................................................................................................... - mls v23.4S, v5.4S, v8.S[0] // .....................*.............................................................................................................. - // gap // .................................................................................................................................... - mul v14.4S, v30.4S, v6.S[0] // ............*....................................................................................................................... - // gap // .................................................................................................................................... - add v5.4S, v22.4S, v24.4S // ...........................*........................................................................................................ - // gap // .................................................................................................................................... - sqrdmulh v30.4S, v30.4S, v6.S[1] // .............*...................................................................................................................... - // gap // .................................................................................................................................... - add v0.4S, v20.4S, v23.4S // .............................*...................................................................................................... - // gap // .................................................................................................................................... 
- mul v16.4S, v5.4S, v6.S[2] // ..............................*..................................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v5.4S, v5.4S, v6.S[3] // ...............................*.................................................................................................... - // gap // .................................................................................................................................... - mul v25.4S, v0.4S, v6.S[2] // ................................*................................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v6.4S, v0.4S, v6.S[3] // .................................*.................................................................................................. - // gap // .................................................................................................................................... - mls v14.4S, v30.4S, v8.S[0] // ...................*................................................................................................................ - // gap // .................................................................................................................................... - sub v30.4S, v22.4S, v24.4S // ..........................*......................................................................................................... - // gap // .................................................................................................................................... 
- sub v22.4S, v20.4S, v23.4S // ............................*....................................................................................................... - // gap // .................................................................................................................................... - mls v25.4S, v6.4S, v8.S[0] // .....................................*.............................................................................................. - // gap // .................................................................................................................................... - add v20.4S, v3.4S, v14.4S // .........................*.......................................................................................................... - // gap // .................................................................................................................................... - mul v6.4S, v30.4S, v31.S[0] // ..................................*................................................................................................. - // gap // .................................................................................................................................... - mul v24.4S, v22.4S, v31.S[0] // ......................................*............................................................................................. - // gap // .................................................................................................................................... - add v23.4S, v20.4S, v25.4S // ..........................................*......................................................................................... - // gap // .................................................................................................................................... 
- sqrdmulh v22.4S, v22.4S, v31.S[1] // ............................................*....................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v30.4S, v30.4S, v31.S[1] // ....................................*............................................................................................... - // gap // .................................................................................................................................... - mul v0.4S, v23.4S, v31.S[2] // .............................................*...................................................................................... - // gap // .................................................................................................................................... - sqrdmulh v31.4S, v23.4S, v31.S[3] // ..............................................*..................................................................................... - // gap // .................................................................................................................................... - mls v24.4S, v22.4S, v8.S[0] // .................................................*.................................................................................. - // gap // .................................................................................................................................... - sub v14.4S, v3.4S, v14.4S // ........................*........................................................................................................... - // gap // .................................................................................................................................... 
- sub v3.4S, v20.4S, v25.4S // .........................................*.......................................................................................... - // gap // .................................................................................................................................... - mls v21.4S, v28.4S, v8.S[0] // ..................*................................................................................................................. - // gap // .................................................................................................................................... - add v28.4S, v14.4S, v24.4S // ......................................................*............................................................................. - // gap // .................................................................................................................................... - mul v22.4S, v3.4S, v13.S[0] // ...................................................*................................................................................ - // gap // .................................................................................................................................... - sqrdmulh v3.4S, v3.4S, v13.S[1] // ....................................................*............................................................................... - // gap // .................................................................................................................................... - mul v20.4S, v28.4S, v13.S[2] // ..........................................................*......................................................................... - // gap // .................................................................................................................................... 
- sqrdmulh v28.4S, v28.4S, v13.S[3] // ...........................................................*........................................................................ - // gap // .................................................................................................................................... - sub v14.4S, v14.4S, v24.4S // .....................................................*.............................................................................. - // gap // .................................................................................................................................... - add v24.4S, v11.4S, v21.4S // .......................*............................................................................................................ - // gap // .................................................................................................................................... - mls v16.4S, v5.4S, v8.S[0] // ...................................*................................................................................................ - // gap // .................................................................................................................................... - mul v5.4S, v14.4S, v26.S[0] // ............................................................*....................................................................... - // gap // .................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v26.S[1] // ................................................................*................................................................... - // gap // .................................................................................................................................... 
- mls v22.4S, v3.4S, v8.S[0] // .........................................................*.......................................................................... - // gap // .................................................................................................................................... - sub v3.4S, v24.4S, v16.4S // .......................................*............................................................................................ - // gap // .................................................................................................................................... - mls v0.4S, v31.4S, v8.S[0] // ..................................................*................................................................................. - // gap // .................................................................................................................................... - add v24.4S, v24.4S, v16.4S // ........................................*........................................................................................... - // gap // .................................................................................................................................... - sub v31.4S, v3.4S, v22.4S // .............................................................*...................................................................... - // gap // .................................................................................................................................... - add v3.4S, v3.4S, v22.4S // ..............................................................*..................................................................... - // gap // .................................................................................................................................... 
- sub v22.4S, v24.4S, v0.4S // .......................................................*............................................................................ - // gap // .................................................................................................................................... - add v24.4S, v24.4S, v0.4S // ........................................................*........................................................................... - // gap // .................................................................................................................................... - trn1 v13.4S, v3.4S, v31.4S // ......................................................................*............................................................. - // gap // .................................................................................................................................... - trn2 v3.4S, v3.4S, v31.4S // .......................................................................*............................................................ - // gap // .................................................................................................................................... - trn1 v31.4S, v24.4S, v22.4S // .................................................................*.................................................................. - // gap // .................................................................................................................................... - trn2 v22.4S, v24.4S, v22.4S // ..................................................................*................................................................. - // gap // .................................................................................................................................... 
- sub v11.4S, v11.4S, v21.4S // ......................*............................................................................................................. - // gap // .................................................................................................................................... - trn2 v24.2D, v31.2D, v13.2D // ...........................................................................*........................................................ - // gap // .................................................................................................................................... - trn2 v26.2D, v22.2D, v3.2D // ............................................................................*....................................................... - // gap // .................................................................................................................................... - mul v23.4S, v24.4S, v4.4S // ..................................................................................*................................................. - // gap // .................................................................................................................................... - mul v4.4S, v26.4S, v4.4S // ........................................................................................*........................................... - // gap // .................................................................................................................................... - sqrdmulh v26.4S, v26.4S, v15.4S // .........................................................................................*.......................................... - // gap // .................................................................................................................................... 
- sqrdmulh v24.4S, v24.4S, v15.4S // .......................................................................................*............................................ - // gap // .................................................................................................................................... - mls v6.4S, v30.4S, v8.S[0] // ...........................................*........................................................................................ - // gap // .................................................................................................................................... - mls v20.4S, v28.4S, v8.S[0] // ...............................................................*.................................................................... - // gap // .................................................................................................................................... - mls v4.4S, v26.4S, v8.S[0] // .............................................................................................*...................................... - // gap // .................................................................................................................................... - mls v5.4S, v14.4S, v8.S[0] // .....................................................................*.............................................................. - // gap // .................................................................................................................................... - trn1 v28.2D, v22.2D, v3.2D // ..............................................................................*..................................................... - // gap // .................................................................................................................................... 
- sub v30.4S, v11.4S, v6.4S // ...............................................*.................................................................................... - // gap // .................................................................................................................................... - sub v14.4S, v28.4S, v4.4S // .................................................................................................*.................................. - // gap // .................................................................................................................................... - add v28.4S, v28.4S, v4.4S // ..................................................................................................*................................. - // gap // .................................................................................................................................... - add v11.4S, v11.4S, v6.4S // ................................................*................................................................................... - // gap // .................................................................................................................................... - mul v3.4S, v14.4S, v27.4S // ....................................................................................................*............................... - // gap // .................................................................................................................................... - mul v22.4S, v28.4S, v19.4S // .....................................................................................................*.............................. - // gap // .................................................................................................................................... 
- sqrdmulh v28.4S, v28.4S, v10.4S // ......................................................................................................*............................. - // gap // .................................................................................................................................... - sqrdmulh v14.4S, v14.4S, v17.4S // .......................................................................................................*............................ - // gap // .................................................................................................................................... - add v6.4S, v11.4S, v20.4S // ....................................................................*............................................................... - // gap // .................................................................................................................................... - sub v11.4S, v11.4S, v20.4S // ...................................................................*................................................................ - // gap // .................................................................................................................................... - sub v20.4S, v30.4S, v5.4S // .........................................................................*.......................................................... - // gap // .................................................................................................................................... - add v30.4S, v30.4S, v5.4S // ..........................................................................*......................................................... - // gap // .................................................................................................................................... 
- trn1 v5.4S, v6.4S, v11.4S // ........................................................................*........................................................... - // gap // .................................................................................................................................... - trn2 v11.4S, v6.4S, v11.4S // ...............................................................................*.................................................... - // gap // .................................................................................................................................... - trn1 v6.4S, v30.4S, v20.4S // ................................................................................*................................................... - // gap // .................................................................................................................................... - trn2 v30.4S, v30.4S, v20.4S // .................................................................................*.................................................. - // gap // .................................................................................................................................... - trn1 v20.2D, v31.2D, v13.2D // .............................................................................*...................................................... - // gap // .................................................................................................................................... - trn2 v31.2D, v5.2D, v6.2D // ...................................................................................*................................................ - // gap // .................................................................................................................................... 
- trn2 v13.2D, v11.2D, v30.2D // ....................................................................................*............................................... - // gap // .................................................................................................................................... - mul v26.4S, v31.4S, v18.4S // ..........................................................................................*......................................... - // gap // .................................................................................................................................... - mul v4.4S, v13.4S, v18.4S // ..............................................................................................*..................................... - // gap // .................................................................................................................................... - sqrdmulh v13.4S, v13.4S, v7.4S // ............................................................................................................*....................... - // gap // .................................................................................................................................... - sqrdmulh v31.4S, v31.4S, v7.4S // ............................................................................................*....................................... - // gap // .................................................................................................................................... - trn1 v5.2D, v5.2D, v6.2D // .....................................................................................*.............................................. - // gap // .................................................................................................................................... 
- trn1 v30.2D, v11.2D, v30.2D // ......................................................................................*............................................. - // gap // .................................................................................................................................... - mls v4.4S, v13.4S, v8.S[0] // ..................................................................................................................*................. - // gap // .................................................................................................................................... - mls v23.4S, v24.4S, v8.S[0] // ...........................................................................................*........................................ - // gap // .................................................................................................................................... - mls v26.4S, v31.4S, v8.S[0] // ...................................................................................................*................................ - // gap // .................................................................................................................................... - mls v22.4S, v28.4S, v8.S[0] // ..........................................................................................................*......................... - // gap // .................................................................................................................................... - add v28.4S, v30.4S, v4.4S // ......................................................................................................................*............. - // gap // .................................................................................................................................... 
- sub v11.4S, v20.4S, v23.4S // ...............................................................................................*.................................... - // gap // .................................................................................................................................... - sub v30.4S, v30.4S, v4.4S // .....................................................................................................................*.............. - // gap // .................................................................................................................................... - mul v29.4S, v28.4S, v29.4S // ........................................................................................................................*........... - // gap // .................................................................................................................................... - sqrdmulh v12.4S, v28.4S, v12.4S // .........................................................................................................................*.......... - // gap // .................................................................................................................................... - mul v1.4S, v30.4S, v1.4S // ..........................................................................................................................*......... - // gap // .................................................................................................................................... - sqrdmulh v2.4S, v30.4S, v2.4S // ...........................................................................................................................*........ - // gap // .................................................................................................................................... 
- add v28.4S, v20.4S, v23.4S // ................................................................................................*................................... - // gap // .................................................................................................................................... - mls v3.4S, v14.4S, v8.S[0] // ...........................................................................................................*........................ - // gap // .................................................................................................................................... - mls v29.4S, v12.4S, v8.S[0] // ............................................................................................................................*....... - // gap // .................................................................................................................................... - mls v1.4S, v2.4S, v8.S[0] // .............................................................................................................................*...... - // gap // .................................................................................................................................... - sub v12.4S, v5.4S, v26.4S // ........................................................................................................*........................... - // gap // .................................................................................................................................... - add v2.4S, v5.4S, v26.4S // .........................................................................................................*.......................... - // gap // .................................................................................................................................... 
- sub v14.4S, v28.4S, v22.4S // ..............................................................................................................*..................... - // gap // .................................................................................................................................... - add v13.4S, v28.4S, v22.4S // ...............................................................................................................*.................... - // gap // .................................................................................................................................... - sub v28.4S, v2.4S, v29.4S // ..............................................................................................................................*..... - // gap // .................................................................................................................................... - add v27.4S, v2.4S, v29.4S // ...............................................................................................................................*.... - // gap // .................................................................................................................................... - sub v30.4S, v12.4S, v1.4S // ................................................................................................................................*... - // gap // .................................................................................................................................... - add v29.4S, v12.4S, v1.4S // .................................................................................................................................*.. - // gap // .................................................................................................................................... 
- sub v16.4S, v11.4S, v3.4S // ................................................................................................................*................... - // gap // .................................................................................................................................... - add v15.4S, v11.4S, v3.4S // .................................................................................................................*.................. - // gap // .................................................................................................................................... - st4 {v27.4S,v28.4S,v29.4S,v30.4S}, [x8], #64 // ...................................................................................................................................* - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... 
- // gap // .................................................................................................................................... - st4 {v13.4S,v14.4S,v15.4S,v16.4S}, [x28], #64 // ..................................................................................................................................*. - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - // gap // .................................................................................................................................... - - // original source code - // ldr_vo v29, x2, 64 // ......*............................................................................................................................. || ........*......................................................................................................................................... - // ldr_vo v12, x2, 80 // .......*............................................................................................................................ || ..........*....................................................................................................................................... 
- // ldr_vo v1, x2, 96 // ........*........................................................................................................................... || ............*..................................................................................................................................... - // ldr_vo v2, x2, 112 // .........*.......................................................................................................................... || ..............*................................................................................................................................... - // add x2, x2, #64 // .*.................................................................................................................................. || *................................................................................................................................................. - // ldr_vo v28, x1, 64 // ..........*......................................................................................................................... || ................*................................................................................................................................. - // ldr_vo v30, x1, 80 // ...........*........................................................................................................................ || ..................*............................................................................................................................... - // ldr_vo v14, x1, 96 // ............*....................................................................................................................... || ....................*............................................................................................................................. 
- // ldr_vo v5, x1, 112 // .............*...................................................................................................................... || ......................*........................................................................................................................... - // add x1, x1, #64 // ..*................................................................................................................................. || .*................................................................................................................................................ - // mul v11.4S, v29.4S, v6.S[0] // ..................*................................................................................................................. || ............................*..................................................................................................................... - // sqrdmulh v29.4S, v29.4S, v6.S[1] // ....................*............................................................................................................... || ..............................*................................................................................................................... - // mul v3.4S, v12.4S, v6.S[0] // ......................*............................................................................................................. || ................................*................................................................................................................. - // sqrdmulh v12.4S, v12.4S, v6.S[1] // ........................*........................................................................................................... || ..................................*............................................................................................................... 
- // mul v22.4S, v1.4S, v6.S[0] // ..............*..................................................................................................................... || ........................*......................................................................................................................... - // sqrdmulh v1.4S, v1.4S, v6.S[1] // ...............*.................................................................................................................... || .........................*........................................................................................................................ - // mul v20.4S, v2.4S, v6.S[0] // ................*................................................................................................................... || ..........................*....................................................................................................................... - // sqrdmulh v2.4S, v2.4S, v6.S[1] // .................*.................................................................................................................. || ...........................*...................................................................................................................... - // mls v11.4S, v29.4S, v8.S[0] // .............................................*...................................................................................... || .......................................................*.......................................................................................... - // mls v3.4S, v12.4S, v8.S[0] // ..............................*..................................................................................................... || ........................................*......................................................................................................... 
- // mls v22.4S, v1.4S, v8.S[0] // ...................*................................................................................................................ || .............................*.................................................................................................................... - // mls v20.4S, v2.4S, v8.S[0] // .....................*.............................................................................................................. || ...............................*.................................................................................................................. - // sub v29.4S, v28.4S, v11.4S // ....................................................................*............................................................... || ..............................................................................*................................................................... - // add v12.4S, v28.4S, v11.4S // ....................................................*............................................................................... || ..............................................................*................................................................................... - // sub v1.4S, v30.4S, v3.4S // ...........................................*........................................................................................ || .....................................................*............................................................................................ - // add v2.4S, v30.4S, v3.4S // ..................................*................................................................................................. || ............................................*..................................................................................................... 
- // sub v28.4S, v14.4S, v22.4S // ...............................*.................................................................................................... || .........................................*........................................................................................................ - // add v30.4S, v14.4S, v22.4S // .......................*............................................................................................................ || .................................*................................................................................................................ - // sub v14.4S, v5.4S, v20.4S // ................................*................................................................................................... || ..........................................*....................................................................................................... - // add v5.4S, v5.4S, v20.4S // .........................*.......................................................................................................... || ...................................*.............................................................................................................. - // mul v11.4S, v30.4S, v6.S[2] // ..........................*......................................................................................................... || ....................................*............................................................................................................. - // sqrdmulh v30.4S, v30.4S, v6.S[3] // ...........................*........................................................................................................ || .....................................*............................................................................................................ 
- // mul v3.4S, v5.4S, v6.S[2] // ............................*....................................................................................................... || ......................................*........................................................................................................... - // sqrdmulh v5.4S, v5.4S, v6.S[3] // .............................*...................................................................................................... || .......................................*.......................................................................................................... - // mul v22.4S, v28.4S, v31.S[0] // ...................................*................................................................................................ || .............................................*.................................................................................................... - // mls v11.4S, v30.4S, v8.S[0] // .....................................................*.............................................................................. || ...............................................................*.................................................................................. - // sqrdmulh v28.4S, v28.4S, v31.S[1] // .......................................*............................................................................................ || .................................................*................................................................................................ - // mls v3.4S, v5.4S, v8.S[0] // .................................*.................................................................................................. || ...........................................*...................................................................................................... 
- // mul v30.4S, v14.4S, v31.S[0] // ....................................*............................................................................................... || ..............................................*................................................................................................... - // sub v5.4S, v12.4S, v11.4S // .........................................................*.......................................................................... || ...................................................................*.............................................................................. - // add v12.4S, v12.4S, v11.4S // ...........................................................*........................................................................ || .....................................................................*............................................................................ - // sub v11.4S, v2.4S, v3.4S // ............................................*....................................................................................... || ......................................................*........................................................................................... - // add v2.4S, v2.4S, v3.4S // .....................................*.............................................................................................. || ...............................................*.................................................................................................. - // mls v22.4S, v28.4S, v8.S[0] // ...........................................................................*........................................................ || .....................................................................................*............................................................ 
- // sqrdmulh v28.4S, v14.4S, v31.S[1] // ......................................*............................................................................................. || ................................................*................................................................................................. - // mul v14.4S, v2.4S, v31.S[2] // ........................................*........................................................................................... || ..................................................*............................................................................................... - // sqrdmulh v2.4S, v2.4S, v31.S[3] // .........................................*.......................................................................................... || ...................................................*.............................................................................................. - // sub v3.4S, v29.4S, v22.4S // ................................................................................*................................................... || ..........................................................................................*....................................................... - // add v29.4S, v29.4S, v22.4S // ...................................................................................*................................................ || .............................................................................................*.................................................... - // mls v30.4S, v28.4S, v8.S[0] // ..........................................*......................................................................................... || ....................................................*............................................................................................. 
- // mls v14.4S, v2.4S, v8.S[0] // ..........................................................*......................................................................... || ....................................................................*............................................................................. - // mul v2.4S, v11.4S, v13.S[0] // ...............................................*.................................................................................... || .........................................................*........................................................................................ - // sqrdmulh v28.4S, v11.4S, v13.S[1] // ................................................*................................................................................... || ..........................................................*....................................................................................... - // sub v11.4S, v1.4S, v30.4S // ...................................................*................................................................................ || .............................................................*.................................................................................... - // add v1.4S, v1.4S, v30.4S // ..............................................*..................................................................................... || ........................................................*......................................................................................... - // sub v30.4S, v12.4S, v14.4S // ..............................................................*..................................................................... || ........................................................................*......................................................................... 
- // add v12.4S, v12.4S, v14.4S // ...............................................................*.................................................................... || .........................................................................*........................................................................ - // mls v2.4S, v28.4S, v8.S[0] // ........................................................*........................................................................... || ..................................................................*............................................................................... - // mul v28.4S, v1.4S, v13.S[2] // .................................................*.................................................................................. || ...........................................................*...................................................................................... - // sqrdmulh v1.4S, v1.4S, v13.S[3] // ..................................................*................................................................................. || ............................................................*..................................................................................... - // mul v14.4S, v11.4S, v26.S[0] // ......................................................*............................................................................. || ................................................................*................................................................................. - // sub v22.4S, v5.4S, v2.4S // ............................................................*....................................................................... || ......................................................................*........................................................................... 
- // add v2.4S, v5.4S, v2.4S // .............................................................*...................................................................... || .......................................................................*.......................................................................... - // mls v28.4S, v1.4S, v8.S[0] // ............................................................................*....................................................... || ......................................................................................*........................................................... - // sqrdmulh v1.4S, v11.4S, v26.S[1] // .......................................................*............................................................................ || .................................................................*................................................................................ - // trn1 v5.4S, v12.4S, v30.4S // ..................................................................*................................................................. || ............................................................................*..................................................................... - // trn2 v12.4S, v12.4S, v30.4S // ...................................................................*................................................................ || .............................................................................*.................................................................... - // sub v30.4S, v29.4S, v28.4S // .........................................................................................*.......................................... || ...................................................................................................*.............................................. 
- // add v29.4S, v29.4S, v28.4S // ........................................................................................*........................................... || ..................................................................................................*............................................... - // mls v14.4S, v1.4S, v8.S[0] // ..............................................................................*..................................................... || ........................................................................................*......................................................... - // trn1 v1.4S, v2.4S, v22.4S // ................................................................*................................................................... || ..........................................................................*....................................................................... - // trn2 v2.4S, v2.4S, v22.4S // .................................................................*.................................................................. || ...........................................................................*...................................................................... - // trn1 v28.4S, v29.4S, v30.4S // ............................................................................................*....................................... || ......................................................................................................*........................................... - // sub v11.4S, v3.4S, v14.4S // ..........................................................................................*......................................... || ....................................................................................................*............................................. 
- // add v14.4S, v3.4S, v14.4S // ...........................................................................................*........................................ || .....................................................................................................*............................................ - // trn2 v3.2D, v5.2D, v1.2D // .....................................................................*.............................................................. || ...............................................................................*.................................................................. - // trn2 v22.2D, v12.2D, v2.2D // ......................................................................*............................................................. || ................................................................................*................................................................. - // trn1 v1.2D, v5.2D, v1.2D // ................................................................................................*................................... || ..........................................................................................................*....................................... - // trn1 v12.2D, v12.2D, v2.2D // ...............................................................................*.................................................... || .........................................................................................*........................................................ - // trn2 v29.4S, v29.4S, v30.4S // .............................................................................................*...................................... || .......................................................................................................*.......................................... 
- // trn1 v2.4S, v14.4S, v11.4S // ..............................................................................................*..................................... || ........................................................................................................*......................................... - // trn2 v30.4S, v14.4S, v11.4S // ...............................................................................................*.................................... || .........................................................................................................*........................................ - // mul v14.4S, v3.4S, v4.4S // .......................................................................*............................................................ || .................................................................................*................................................................ - // trn2 v5.2D, v28.2D, v2.2D // .................................................................................................*.................................. || ...........................................................................................................*...................................... - // trn2 v11.2D, v29.2D, v30.2D // ..................................................................................................*................................. || ............................................................................................................*..................................... - // trn1 v2.2D, v28.2D, v2.2D // .......................................................................................................*............................ || .................................................................................................................*................................ 
- // trn1 v29.2D, v29.2D, v30.2D // ........................................................................................................*........................... || ..................................................................................................................*............................... - // sqrdmulh v28.4S, v3.4S, v15.4S // ..........................................................................*......................................................... || ....................................................................................*............................................................. - // mul v30.4S, v22.4S, v4.4S // ........................................................................*........................................................... || ..................................................................................*............................................................... - // sqrdmulh v3.4S, v22.4S, v15.4S // .........................................................................*.......................................................... || ...................................................................................*.............................................................. - // mul v22.4S, v5.4S, v18.4S // ...................................................................................................*................................ || .............................................................................................................*.................................... - // mls v14.4S, v28.4S, v8.S[0] // ..........................................................................................................*......................... || ....................................................................................................................*............................. 
- // sqrdmulh v28.4S, v5.4S, v7.4S // ......................................................................................................*............................. || ................................................................................................................*................................. - // mls v30.4S, v3.4S, v8.S[0] // .............................................................................*...................................................... || .......................................................................................*.......................................................... - // mul v5.4S, v11.4S, v18.4S // ....................................................................................................*............................... || ..............................................................................................................*................................... - // sub v3.4S, v1.4S, v14.4S // ..............................................................................................................*..................... || ........................................................................................................................*......................... - // add v1.4S, v1.4S, v14.4S // ....................................................................................................................*............... || ..............................................................................................................................*................... - // sub v14.4S, v12.4S, v30.4S // .................................................................................*.................................................. || ...........................................................................................*...................................................... 
- // add v12.4S, v12.4S, v30.4S // ..................................................................................*................................................. || ............................................................................................*..................................................... - // mls v22.4S, v28.4S, v8.S[0] // ...........................................................................................................*........................ || .....................................................................................................................*............................ - // mul v28.4S, v14.4S, v27.4S // ....................................................................................*............................................... || ..............................................................................................*................................................... - // mul v30.4S, v12.4S, v19.4S // .....................................................................................*.............................................. || ...............................................................................................*.................................................. - // sqrdmulh v12.4S, v12.4S, v10.4S // ......................................................................................*............................................. || ................................................................................................*................................................. - // sqrdmulh v14.4S, v14.4S, v17.4S // .......................................................................................*............................................ || .................................................................................................*................................................ 
- // sub v20.4S, v2.4S, v22.4S // ........................................................................................................................*........... || ..................................................................................................................................*............... - // add v2.4S, v2.4S, v22.4S // .........................................................................................................................*.......... || ...................................................................................................................................*.............. - // mls v30.4S, v12.4S, v8.S[0] // ............................................................................................................*....................... || ......................................................................................................................*........................... - // mls v28.4S, v14.4S, v8.S[0] // .....................................................................................................................*.............. || ...............................................................................................................................*.................. - // sqrdmulh v12.4S, v11.4S, v7.4S // .....................................................................................................*.............................. || ...............................................................................................................*.................................. - // ldr_vo v14, x5, -64 // *................................................................................................................................... || *................................................................................................................................................. 
- // sub v22.4S, v1.4S, v30.4S // ..........................................................................................................................*......... || ....................................................................................................................................*............. - // add v21.4S, v1.4S, v30.4S // ...........................................................................................................................*........ || .....................................................................................................................................*............ - // sub v24.4S, v3.4S, v28.4S // ................................................................................................................................*... || ..........................................................................................................................................*....... - // add v23.4S, v3.4S, v28.4S // .................................................................................................................................*.. || ...........................................................................................................................................*...... - // mls v5.4S, v12.4S, v8.S[0] // .........................................................................................................*.......................... || ...................................................................................................................*.............................. - // ldr_vo v12, x5, -48 // ...*................................................................................................................................ || ..*............................................................................................................................................... 
- // ldr_vo v1, x5, -32 // ....*............................................................................................................................... || ....*............................................................................................................................................. - // sub v28.4S, v29.4S, v5.4S // ...............................................................................................................*.................... || .........................................................................................................................*........................ - // add v29.4S, v29.4S, v5.4S // .............................................................................................................*...................... || .......................................................................................................................*.......................... - // ldr_vo v30, x5, -16 // .....*.............................................................................................................................. || ......*........................................................................................................................................... - // mul v14.4S, v29.4S, v14.4S // ................................................................................................................*................... || ..........................................................................................................................*....................... - // sqrdmulh v29.4S, v29.4S, v12.4S // .................................................................................................................*.................. || ...........................................................................................................................*...................... 
- // mul v12.4S, v28.4S, v1.4S // ..................................................................................................................*................. || ............................................................................................................................*..................... - // sqrdmulh v1.4S, v28.4S, v30.4S // ...................................................................................................................*................ || .............................................................................................................................*.................... - // mls v14.4S, v29.4S, v8.S[0] // ......................................................................................................................*............. || ................................................................................................................................*................. - // mls v12.4S, v1.4S, v8.S[0] // .......................................................................................................................*............ || .................................................................................................................................*................ - // sub v1.4S, v2.4S, v14.4S // ............................................................................................................................*....... || ......................................................................................................................................*........... - // add v0.4S, v2.4S, v14.4S // .............................................................................................................................*...... || .......................................................................................................................................*.......... 
- // sub v3.4S, v20.4S, v12.4S // ..............................................................................................................................*..... || ........................................................................................................................................*......... - // add v2.4S, v20.4S, v12.4S // ...............................................................................................................................*.... || .........................................................................................................................................*........ - // st4 {v21.4S,v22.4S,v23.4S,v24.4S}, [x1], #64 // ...................................................................................................................................* || .................................................................................................................................................* - // st4 {v0.4S,v1.4S,v2.4S,v3.4S}, [x2], #64 // ..................................................................................................................................*. || ............................................................................................................................................*..... - + // Instructions: 125 + // Expected cycles: 135 + // Expected IPC: 0.93 + // + // Wall time: 36.49s + // User time: 36.49s + // + // ---------------------------------------------------- original position -----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------| + sqrdmulh v18.4S, v22.4S, v21.S[1] // ....*........................................................................................................................ 
+ add x2, x2, #64 // .*........................................................................................................................... + mul v24.4S, v22.4S, v21.S[0] // .....*....................................................................................................................... + add x1, x1, #64 // *............................................................................................................................ + sqrdmulh v20.4S, v4.4S, v21.S[1] // ..........*.................................................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v27.4S, v16.4S, v21.S[1] // ..*.......................................................................................................................... + // gap // ............................................................................................................................. + mul v4.4S, v4.4S, v21.S[0] // ..................*.......................................................................................................... + // gap // ............................................................................................................................. + mls v24.4S, v18.4S, v8.S[0] // .........*................................................................................................................... + // gap // ............................................................................................................................. + mul v28.4S, v16.4S, v21.S[0] // ...*......................................................................................................................... + // gap // ............................................................................................................................. 
+ sqrdmulh v22.4S, v9.4S, v21.S[1] // ......*...................................................................................................................... + // gap // ............................................................................................................................. + mul v29.4S, v9.4S, v21.S[0] // ........*.................................................................................................................... + // gap // ............................................................................................................................. + mls v4.4S, v20.4S, v8.S[0] // ......................*...................................................................................................... + // gap // ............................................................................................................................. + sub v26.4S, v3.4S, v24.4S // ..............*.............................................................................................................. + // gap // ............................................................................................................................. + mls v28.4S, v27.4S, v8.S[0] // .......*..................................................................................................................... + // gap // ............................................................................................................................. + mls v29.4S, v22.4S, v8.S[0] // ............*................................................................................................................ + // gap // ............................................................................................................................. + sub v27.4S, v15.4S, v4.4S // ............................*................................................................................................ 
+ // gap // ............................................................................................................................. + ldr q22, [x4, #-32] // ........................*.................................................................................................... + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v6.4S, v27.4S, v7.S[1] // .................................*........................................................................................... + // gap // ............................................................................................................................. + mul v9.4S, v27.4S, v7.S[0] // ..................................*.......................................................................................... + // gap // ............................................................................................................................. + sub v16.4S, v12.4S, v29.4S // ................*............................................................................................................ + // gap // ............................................................................................................................. + ldr q27, [x4, #-16] // ....................................*........................................................................................ + // gap // ............................................................................................................................. 
+ // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v9.4S, v6.4S, v8.S[0] // .....................................*....................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v10.4S, v16.4S, v7.S[1] // ...................*......................................................................................................... + // gap // ............................................................................................................................. + mul v16.4S, v16.4S, v7.S[0] // .......................*..................................................................................................... + // gap // ............................................................................................................................. + sub v13.4S, v31.4S, v28.4S // ...........*................................................................................................................. + // gap // ............................................................................................................................. + sub v6.4S, v26.4S, v9.4S // ........................................*.................................................................................... + // gap // ............................................................................................................................. + add v25.4S, v26.4S, v9.4S // .........................................*................................................................................... 
+ // gap // ............................................................................................................................. + mls v16.4S, v10.4S, v8.S[0] // ..........................*.................................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v20.4S, v6.4S, v27.S[1] // ...........................................*................................................................................. + // gap // ............................................................................................................................. + mul v9.4S, v6.4S, v27.S[0] // ................................................*............................................................................ + // gap // ............................................................................................................................. + sqrdmulh v10.4S, v25.4S, v22.S[3] // ............................................*................................................................................ + // gap // ............................................................................................................................. + mul v19.4S, v25.4S, v22.S[2] // .............................................*............................................................................... + // gap // ............................................................................................................................. + add v27.4S, v3.4S, v24.4S // ...............*............................................................................................................. + // gap // ............................................................................................................................. 
+ mls v9.4S, v20.4S, v8.S[0] // .......................................................*..................................................................... + // gap // ............................................................................................................................. + sub v24.4S, v13.4S, v16.4S // ..............................*.............................................................................................. + // gap // ............................................................................................................................. + mls v19.4S, v10.4S, v8.S[0] // .....................................................*....................................................................... + // gap // ............................................................................................................................. + add v18.4S, v13.4S, v16.4S // ...............................*............................................................................................. + // gap // ............................................................................................................................. + sub v13.4S, v24.4S, v9.4S // ............................................................*................................................................ + // gap // ............................................................................................................................. + add v26.4S, v24.4S, v9.4S // .............................................................*............................................................... + // gap // ............................................................................................................................. + sub v6.4S, v18.4S, v19.4S // .........................................................*................................................................... 
+ // gap // ............................................................................................................................. + add v3.4S, v18.4S, v19.4S // ..............................................................*.............................................................. + // gap // ............................................................................................................................. + trn1 v19.4S, v26.4S, v13.4S // ...........................................................................*................................................. + // gap // ............................................................................................................................. + trn2 v10.4S, v26.4S, v13.4S // ............................................................................*................................................ + // gap // ............................................................................................................................. + trn1 v25.4S, v3.4S, v6.4S // .....................................................................*....................................................... + // gap // ............................................................................................................................. + trn2 v24.4S, v3.4S, v6.4S // ..........................................................................*.................................................. + // gap // ............................................................................................................................. + add v4.4S, v15.4S, v4.4S // ...........................*................................................................................................. + // gap // ............................................................................................................................. 
+ trn2 v15.2D, v25.2D, v19.2D // ..............................................................................*.............................................. + // gap // ............................................................................................................................. + trn2 v20.2D, v24.2D, v10.2D // .................................................................................*........................................... + // gap // ............................................................................................................................. + mul v9.4S, v15.4S, v23.4S // .......................................................................................*..................................... + // gap // ............................................................................................................................. + mul v6.4S, v20.4S, v23.4S // .............................................................................................*............................... + // gap // ............................................................................................................................. + sqrdmulh v20.4S, v20.4S, v1.4S // ............................................................................................*................................ + // gap // ............................................................................................................................. + sqrdmulh v3.4S, v4.4S, v21.S[3] // ...................................*......................................................................................... + // gap // ............................................................................................................................. + mul v26.4S, v4.4S, v21.S[2] // ......................................*...................................................................................... 
+ // gap // ............................................................................................................................. + add v18.4S, v12.4S, v29.4S // .................*........................................................................................................... + // gap // ............................................................................................................................. + mls v6.4S, v20.4S, v8.S[0] // ......................................................................................................*...................... + // gap // ............................................................................................................................. + trn1 v16.2D, v24.2D, v10.2D // ...................................................................................*......................................... + // gap // ............................................................................................................................. + mls v26.4S, v3.4S, v8.S[0] // ..........................................*.................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v12.4S, v18.4S, v21.S[3] // ....................*........................................................................................................ + // gap // ............................................................................................................................. + sub v20.4S, v16.4S, v6.4S // ..............................................................................................................*.............. + // gap // ............................................................................................................................. 
+ mul v21.4S, v18.4S, v21.S[2] // .....................*....................................................................................................... + // gap // ............................................................................................................................. + sub v18.4S, v27.4S, v26.4S // ..............................................*.............................................................................. + // gap // ............................................................................................................................. + sqrdmulh v24.4S, v20.4S, v17.4S // ..................................................................................................................*.......... + // gap // ............................................................................................................................. + mul v17.4S, v20.4S, v0.4S // ....................................................................................................................*........ + // gap // ............................................................................................................................. + sqrdmulh v10.4S, v18.4S, v22.S[1] // .................................................*........................................................................... + // gap // ............................................................................................................................. + mul v23.4S, v18.4S, v22.S[0] // ..................................................*.......................................................................... + // gap // ............................................................................................................................. + add v18.4S, v27.4S, v26.4S // ...............................................*............................................................................. 
+ // gap // ............................................................................................................................. + add v0.4S, v31.4S, v28.4S // .............*............................................................................................................... + // gap // ............................................................................................................................. + mls v21.4S, v12.4S, v8.S[0] // .........................*................................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v27.4S, v18.4S, v7.S[3] // ...................................................*......................................................................... + // gap // ............................................................................................................................. + mul v28.4S, v18.4S, v7.S[2] // ....................................................*........................................................................ + // gap // ............................................................................................................................. + sqrdmulh v29.4S, v15.4S, v1.4S // ......................................................................................*...................................... + // gap // ............................................................................................................................. + mls v23.4S, v10.4S, v8.S[0] // ......................................................*...................................................................... + // gap // ............................................................................................................................. 
+ sub v18.4S, v0.4S, v21.4S // .............................*............................................................................................... + // gap // ............................................................................................................................. + mls v28.4S, v27.4S, v8.S[0] // ........................................................*.................................................................... + // gap // ............................................................................................................................. + add v1.4S, v0.4S, v21.4S // ................................*............................................................................................ + // gap // ............................................................................................................................. + sub v22.4S, v18.4S, v23.4S // ..........................................................*.................................................................. + // gap // ............................................................................................................................. + add v10.4S, v18.4S, v23.4S // ...........................................................*................................................................. + // gap // ............................................................................................................................. + sub v26.4S, v1.4S, v28.4S // ...............................................................*............................................................. + // gap // ............................................................................................................................. + add v4.4S, v1.4S, v28.4S // ................................................................*............................................................ 
+ // gap // ............................................................................................................................. + trn1 v18.4S, v10.4S, v22.4S // .................................................................*........................................................... + // gap // ............................................................................................................................. + trn2 v20.4S, v10.4S, v22.4S // ..................................................................*.......................................................... + // gap // ............................................................................................................................. + trn1 v22.4S, v4.4S, v26.4S // ...................................................................*......................................................... + // gap // ............................................................................................................................. + trn2 v4.4S, v4.4S, v26.4S // ....................................................................*........................................................ + // gap // ............................................................................................................................. + add v21.4S, v16.4S, v6.4S // ...............................................................................................................*............. + // gap // ............................................................................................................................. + trn2 v15.2D, v22.2D, v18.2D // ......................................................................*...................................................... + // gap // ............................................................................................................................. 
+ trn2 v26.2D, v4.2D, v20.2D // .......................................................................*..................................................... + // gap // ............................................................................................................................. + sqrdmulh v7.4S, v15.4S, v2.4S // .............................................................................*............................................... + // gap // ............................................................................................................................. + sqrdmulh v6.4S, v26.4S, v2.4S // .....................................................................................*....................................... + // gap // ............................................................................................................................. + ldr q10, [x5], #(12*16) // .......................................*..................................................................................... + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v9.4S, v29.4S, v8.S[0] // ...........................................................................................*................................. + // gap // ............................................................................................................................. + trn1 v12.2D, v22.2D, v18.2D // ........................................................................*.................................................... 
+ // gap // ............................................................................................................................. + mul v0.4S, v26.4S, v10.4S // ................................................................................*............................................ + // gap // ............................................................................................................................. + mul v27.4S, v15.4S, v10.4S // ...............................................................................*............................................. + // gap // ............................................................................................................................. + ldr q23, [x5, #-48] // ................................................................................................................*............ + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v0.4S, v6.4S, v8.S[0] // ..........................................................................................*.................................. + // gap // ............................................................................................................................. + mls v27.4S, v7.4S, v8.S[0] // ....................................................................................*........................................ + // gap // ............................................................................................................................. 
+ trn1 v6.2D, v4.2D, v20.2D // .........................................................................*................................................... + // gap // ............................................................................................................................. + sqrdmulh v7.4S, v21.4S, v23.4S // ...................................................................................................................*......... + // gap // ............................................................................................................................. + sub v29.4S, v6.4S, v0.4S // ..............................................................................................*.............................. + // gap // ............................................................................................................................. + add v22.4S, v6.4S, v0.4S // ...............................................................................................*............................. + // gap // ............................................................................................................................. + mls v17.4S, v24.4S, v8.S[0] // ......................................................................................................................*...... + // gap // ............................................................................................................................. + mul v3.4S, v29.4S, v11.4S // ....................................................................................................*........................ + // gap // ............................................................................................................................. + ldr q11, [x5, #-64] // .........................................................................................................*................... 
+ // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mul v31.4S, v22.4S, v30.4S // ...................................................................................................*......................... + // gap // ............................................................................................................................. + sqrdmulh v6.4S, v22.4S, v14.4S // ..................................................................................................*.......................... + // gap // ............................................................................................................................. + mul v28.4S, v21.4S, v11.4S // .................................................................................................................*........... + // gap // ............................................................................................................................. + sqrdmulh v21.4S, v29.4S, v5.4S // .................................................................................................*........................... + // gap // ............................................................................................................................. + sub v5.4S, v12.4S, v27.4S // ........................................................................................*.................................... + // gap // ............................................................................................................................. 
+ mls v31.4S, v6.4S, v8.S[0] // .......................................................................................................*..................... + // gap // ............................................................................................................................. + mls v28.4S, v7.4S, v8.S[0] // .....................................................................................................................*....... + // gap // ............................................................................................................................. + trn1 v7.2D, v25.2D, v19.2D // ..................................................................................*.......................................... + // gap // ............................................................................................................................. + mls v3.4S, v21.4S, v8.S[0] // ........................................................................................................*.................... + // gap // ............................................................................................................................. + sub v2.4S, v7.4S, v9.4S // ................................................................................................*............................ + // gap // ............................................................................................................................. + add v21.4S, v7.4S, v9.4S // .....................................................................................................*....................... + // gap // ............................................................................................................................. + add v7.4S, v12.4S, v27.4S // .........................................................................................*................................... 
+ // gap // ............................................................................................................................. + sub v19.4S, v5.4S, v3.4S // ............................................................................................................*................ + // gap // ............................................................................................................................. + add v18.4S, v5.4S, v3.4S // .............................................................................................................*............... + // gap // ............................................................................................................................. + add v27.4S, v21.4S, v28.4S // ........................................................................................................................*.... + // gap // ............................................................................................................................. + sub v28.4S, v21.4S, v28.4S // .......................................................................................................................*..... + // gap // ............................................................................................................................. + sub v30.4S, v2.4S, v17.4S // .........................................................................................................................*... + // gap // ............................................................................................................................. + add v29.4S, v2.4S, v17.4S // ..........................................................................................................................*.. + // gap // ............................................................................................................................. 
+ sub v17.4S, v7.4S, v31.4S // ..........................................................................................................*.................. + // gap // ............................................................................................................................. + add v16.4S, v7.4S, v31.4S // ...........................................................................................................*................. + // gap // ............................................................................................................................. + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x2], #64 // ............................................................................................................................* + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. 
+ // gap // ............................................................................................................................. + st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1], #64 // ...........................................................................................................................*. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + + // ------------------------------------------------------- new position -------------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------------ + // add x1, x1, #64 // ...*......................................................................................................................... + // add x2, x2, #64 // .*........................................................................................................................... 
+ // sqrdmulh v6.4S, v16.4S, v21.S[1] // .....*....................................................................................................................... + // mul v18.4S, v16.4S, v21.S[0] // ........*.................................................................................................................... + // sqrdmulh v16.4S, v22.4S, v21.S[1] // *............................................................................................................................ + // mul v22.4S, v22.4S, v21.S[0] // ..*.......................................................................................................................... + // sqrdmulh v19.4S, v9.4S, v21.S[1] // .........*................................................................................................................... + // mls v18.4S, v6.4S, v8.S[0] // .............*............................................................................................................... + // mul v6.4S, v9.4S, v21.S[0] // ..........*.................................................................................................................. + // mls v22.4S, v16.4S, v8.S[0] // .......*..................................................................................................................... + // sqrdmulh v16.4S, v4.4S, v21.S[1] // ....*........................................................................................................................ + // sub v20.4S, v31.4S, v18.4S // ........................*.................................................................................................... + // mls v6.4S, v19.4S, v8.S[0] // ..............*.............................................................................................................. + // add v18.4S, v31.4S, v18.4S // ..................................................................*.......................................................... 
+ // sub v19.4S, v3.4S, v22.4S // ............*................................................................................................................ + // add v22.4S, v3.4S, v22.4S // ................................*............................................................................................ + // sub v31.4S, v12.4S, v6.4S // ...................*......................................................................................................... + // add v6.4S, v12.4S, v6.4S // .....................................................*....................................................................... + // mul v3.4S, v4.4S, v21.S[0] // ......*...................................................................................................................... + // sqrdmulh v12.4S, v31.4S, v7.S[1] // ......................*...................................................................................................... + // sqrdmulh v9.4S, v6.4S, v21.S[3] // .........................................................*................................................................... + // mul v6.4S, v6.4S, v21.S[2] // ...........................................................*................................................................. + // mls v3.4S, v16.4S, v8.S[0] // ...........*................................................................................................................. + // mul v16.4S, v31.4S, v7.S[0] // .......................*..................................................................................................... + // ldr q31, [x4, #-32] // ................*............................................................................................................ + // mls v6.4S, v9.4S, v8.S[0] // ...................................................................*......................................................... 
+ // mls v16.4S, v12.4S, v8.S[0] // ...........................*................................................................................................. + // add v12.4S, v15.4S, v3.4S // .............................................*............................................................................... + // sub v3.4S, v15.4S, v3.4S // ...............*............................................................................................................. + // sub v9.4S, v18.4S, v6.4S // ........................................................................*.................................................... + // sub v15.4S, v20.4S, v16.4S // ..................................*.......................................................................................... + // add v16.4S, v20.4S, v16.4S // ....................................*........................................................................................ + // add v6.4S, v18.4S, v6.4S // ..........................................................................*.................................................. + // sqrdmulh v18.4S, v3.4S, v7.S[1] // .................*........................................................................................................... + // mul v20.4S, v3.4S, v7.S[0] // ..................*.......................................................................................................... + // sqrdmulh v3.4S, v12.4S, v21.S[3] // ...................................................*......................................................................... + // ldr q4, [x4, #-16] // ....................*........................................................................................................ + // mls v20.4S, v18.4S, v8.S[0] // .....................*....................................................................................................... 
+ // mul v18.4S, v12.4S, v21.S[2] // ....................................................*........................................................................ + // ldr q12, [x5], #(12*16) // ........................................................................................*.................................... + // sub v26.4S, v19.4S, v20.4S // .........................*................................................................................................... + // add v19.4S, v19.4S, v20.4S // ..........................*.................................................................................................. + // mls v18.4S, v3.4S, v8.S[0] // ........................................................*.................................................................... + // sqrdmulh v20.4S, v26.4S, v4.S[1] // ............................*................................................................................................ + // sqrdmulh v3.4S, v19.4S, v31.S[3] // ..............................*.............................................................................................. + // mul v19.4S, v19.4S, v31.S[2] // ...............................*............................................................................................. + // sub v21.4S, v22.4S, v18.4S // ............................................................*................................................................ + // add v18.4S, v22.4S, v18.4S // .................................................................*........................................................... + // mul v22.4S, v26.4S, v4.S[0] // .............................*............................................................................................... + // sqrdmulh v4.4S, v21.4S, v31.S[1] // ...............................................................*............................................................. 
+ // mul v31.4S, v21.4S, v31.S[0] // ................................................................*............................................................ + // sqrdmulh v26.4S, v18.4S, v7.S[3] // ....................................................................*........................................................ + // mul v18.4S, v18.4S, v7.S[2] // .....................................................................*....................................................... + // mls v19.4S, v3.4S, v8.S[0] // ...................................*......................................................................................... + // mls v31.4S, v4.4S, v8.S[0] // .......................................................................*..................................................... + // mls v22.4S, v20.4S, v8.S[0] // .................................*........................................................................................... + // mls v18.4S, v26.4S, v8.S[0] // .........................................................................*................................................... + // sub v20.4S, v16.4S, v19.4S // .......................................*..................................................................................... + // sub v3.4S, v9.4S, v31.4S // ...........................................................................*................................................. + // add v31.4S, v9.4S, v31.4S // ............................................................................*................................................ + // sub v9.4S, v15.4S, v22.4S // .....................................*....................................................................................... + // add v22.4S, v15.4S, v22.4S // ......................................*...................................................................................... 
+ // add v16.4S, v16.4S, v19.4S // ........................................*.................................................................................... + // sub v19.4S, v6.4S, v18.4S // .............................................................................*............................................... + // add v6.4S, v6.4S, v18.4S // ..............................................................................*.............................................. + // trn1 v18.4S, v31.4S, v3.4S // ...............................................................................*............................................. + // trn2 v31.4S, v31.4S, v3.4S // ................................................................................*............................................ + // trn1 v3.4S, v6.4S, v19.4S // .................................................................................*........................................... + // trn2 v6.4S, v6.4S, v19.4S // ..................................................................................*.......................................... + // trn1 v19.4S, v16.4S, v20.4S // ...........................................*................................................................................. + // trn2 v15.2D, v3.2D, v18.2D // ....................................................................................*........................................ + // trn2 v4.2D, v6.2D, v31.2D // .....................................................................................*....................................... + // trn1 v18.2D, v3.2D, v18.2D // ..........................................................................................*.................................. + // trn1 v6.2D, v6.2D, v31.2D // ................................................................................................*............................ 
+ // trn2 v16.4S, v16.4S, v20.4S // ............................................*................................................................................ + // trn1 v20.4S, v22.4S, v9.4S // .........................................*................................................................................... + // trn2 v22.4S, v22.4S, v9.4S // ..........................................*.................................................................................. + // sqrdmulh v31.4S, v15.4S, v2.4S // ......................................................................................*...................................... + // trn2 v3.2D, v19.2D, v20.2D // ..............................................*.............................................................................. + // mul v9.4S, v15.4S, v12.4S // ............................................................................................*................................ + // mul v12.4S, v4.4S, v12.4S // ...........................................................................................*................................. + // trn2 v15.2D, v16.2D, v22.2D // ...............................................*............................................................................. + // trn1 v19.2D, v19.2D, v20.2D // ..............................................................................................................*.............. + // trn1 v16.2D, v16.2D, v22.2D // .......................................................*..................................................................... + // mls v9.4S, v31.4S, v8.S[0] // ...............................................................................................*............................. + // sqrdmulh v22.4S, v4.4S, v2.4S // .......................................................................................*..................................... 
+ // sqrdmulh v20.4S, v3.4S, v1.4S // ......................................................................*...................................................... + // mul v31.4S, v3.4S, v23.4S // ................................................*............................................................................ + // sub v3.4S, v18.4S, v9.4S // ...........................................................................................................*................. + // add v18.4S, v18.4S, v9.4S // ..................................................................................................................*.......... + // mls v12.4S, v22.4S, v8.S[0] // ..............................................................................................*.............................. + // mls v31.4S, v20.4S, v8.S[0] // .........................................................................................*................................... + // sqrdmulh v22.4S, v15.4S, v1.4S // ..................................................*.......................................................................... + // mul v20.4S, v15.4S, v23.4S // .................................................*........................................................................... + // sub v9.4S, v6.4S, v12.4S // ..................................................................................................*.......................... + // add v6.4S, v6.4S, v12.4S // ...................................................................................................*......................... + // sub v12.4S, v19.4S, v31.4S // ................................................................................................................*............ + // sqrdmulh v15.4S, v9.4S, v5.4S // ..........................................................................................................*.................. 
+ // sqrdmulh v4.4S, v6.4S, v14.4S // ........................................................................................................*.................... + // mul v6.4S, v6.4S, v30.4S // .......................................................................................................*..................... + // mul v9.4S, v9.4S, v11.4S // .....................................................................................................*....................... + // add v19.4S, v19.4S, v31.4S // .................................................................................................................*........... + // mls v20.4S, v22.4S, v8.S[0] // ......................................................*...................................................................... + // mls v6.4S, v4.4S, v8.S[0] // ............................................................................................................*................ + // mls v9.4S, v15.4S, v8.S[0] // ...............................................................................................................*............. + // ldr q22, [x5, #-64] // ......................................................................................................*...................... + // sub v26.4S, v18.4S, v6.4S // .........................................................................................................................*... + // add v25.4S, v18.4S, v6.4S // ..........................................................................................................................*.. + // sub v28.4S, v3.4S, v9.4S // ...................................................................................................................*......... + // add v27.4S, v3.4S, v9.4S // ....................................................................................................................*........ 
+ // sub v6.4S, v16.4S, v20.4S // ..........................................................*.................................................................. + // add v18.4S, v16.4S, v20.4S // ...................................................................................*......................................... + // ldr q16, [x5, #-48] // .............................................................................................*............................... + // mul v22.4S, v18.4S, v22.4S // .........................................................................................................*................... + // sqrdmulh v20.4S, v6.4S, v17.4S // .............................................................*............................................................... + // sqrdmulh v18.4S, v18.4S, v16.4S // .................................................................................................*........................... + // mul v6.4S, v6.4S, v0.4S // ..............................................................*.............................................................. + // mls v22.4S, v18.4S, v8.S[0] // .............................................................................................................*............... + // mls v6.4S, v20.4S, v8.S[0] // ....................................................................................................*........................ + // sub v18.4S, v19.4S, v22.4S // ......................................................................................................................*...... + // add v17.4S, v19.4S, v22.4S // .....................................................................................................................*....... + // sub v20.4S, v12.4S, v6.4S // .......................................................................................................................*..... 
+ // add v19.4S, v12.4S, v6.4S // ........................................................................................................................*.... + // st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ............................................................................................................................* + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x2], #64 // ...........................................................................................................................*. + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a72.s b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a72.s index 027d63aa..cb60e85f 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a72.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_a72.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr 
qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -131,40 +103,40 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 
7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -251,7 
+223,7 @@ roots: .text .global ntt_dilithium_123_45678_opt_a72 - .global _ntt_dilithium_123_45678_opt_a72 + .global _ntt_dilithium_123_45678 .p2align 4 const_addr: .word 8380417 @@ -375,643 +347,672 @@ _ntt_dilithium_123_45678_opt_a72: load_roots_123 .p2align 2 - ldr_vo v19, x0, 384 // ..*......... - ldr_vo v20, x0, 896 // .*.......... - // gap // ............ - ldr_vo v4, x0, 512 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v30.4S, v20.4S, v0.S[1] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v20.4S, v20.4S, v0.S[0] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v12.4S, v4.4S, v0.S[1] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v20.4S, v30.4S, v8.S[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v22.4S, v4.4S, v0.S[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v22.4S, v12.4S, v8.S[0] // ...........* - // gap // ............ - // gap // ............ - add v12.4S, v19.4S, v20.4S // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v13.4S, v12.4S, v0.S[3] // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ 
- mul v24.4S, v12.4S, v0.S[2] // ..........*. - // gap // ............ - // gap // ............ - - // original source code - // ldr_vo v29, x0, 512 // ..*......... || .*................... - // ldr_vo v17, x0, 896 // .*.......... || *.................... - // ldr_vo v19, x0, 384 // *........... || *.................... - // mul v20.4S, v17.4S, v0.S[0] // ....*....... || ......*.............. - // sqrdmulh v13.4S, v17.4S, v0.S[1] // ...*........ || ....*................ - // mls v20.4S, v13.4S, v8.S[0] // ......*..... || ..........*.......... - // mul v22.4S, v29.4S, v0.S[0] // .......*.... || ............*........ - // add v14.4S, v19.4S, v20.4S // .........*.. || ...............*..... - // sqrdmulh v25.4S, v29.4S, v0.S[1] // .....*...... || ........*............ - // sqrdmulh v13.4S, v14.4S, v0.S[3] // ..........*. || ..................*.. - // mul v24.4S, v14.4S, v0.S[2] // ...........* || ....................* - // mls v22.4S, v25.4S, v8.S[0] // ........*... || ..............*...... - + // Instructions: 11 + // Expected cycles: 17 + // Expected IPC: 0.65 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x0, #640] // *............................. + ldr q20, [x0, #896] // ...*.......................... + // gap // .............................. + ldr q18, [x0, #768] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q27, [x0, #256] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v5.4S, v20.4S, v0.S[1] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mul v25.4S, v20.4S, v0.S[0] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v20.4S, v18.4S, v0.S[1] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v17.4S, v18.4S, v0.S[0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v25.4S, v5.4S, v8.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v17.4S, v20.4S, v8.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v31.4S, v6.4S, v0.S[0] // .........*.................... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0, #640] // *.............................. + // ldr q27, [x0, #256] // ...*........................... + // ldr q12, [x0, #768] // ..*............................ + // ldr q14, [x0, #896] // .*............................. 
+ // sqrdmulh v30.4S, v12.4S, v0.S[1] // ......*........................ + // mul v17.4S, v12.4S, v0.S[0] // .......*....................... + // sqrdmulh v19.4S, v14.4S, v0.S[1] // ....*.......................... + // mul v25.4S, v14.4S, v0.S[0] // .....*......................... + // mls v17.4S, v30.4S, v8.S[0] // .........*..................... + // mul v31.4S, v6.4S, v0.S[0] // ..........*.................... + // mls v25.4S, v19.4S, v8.S[0] // ........*...................... + sub count, count, #1 -.p2align 2 layer123_start: - ldr_vo v12, x0, 0 // *........................................................................... - sub v20.4S, v19.4S, v20.4S // ..........................*................................................. - ldr_vo v18, x0, 256 // ..*......................................................................... - ldr_vo v14, x0, 768 // ......*..................................................................... - ldr_vo v25, x0, 128 // .*.......................................................................... - mls v24.4S, v13.4S, v8.S[0] // ...................................*........................................ - ldr_vo v29, x0, 528 // ....e....................................................................... - ldr_vo v13, x0, 640 // .....*...................................................................... - // gap // ............................................................................ - mul v10.4S, v20.4S, v1.S[0] // ...........................................*................................ - ldr_vo v17, x0, 912 // .......e.................................................................... - ldr_vo v19, x0, 400 // ...e........................................................................ - sub v21.4S, v12.4S, v22.4S // ...........*................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - add v12.4S, v12.4S, v22.4S // ............*............................................................... - sqrdmulh v20.4S, v20.4S, v1.S[1] // ............................................*............................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.4S, v14.4S, v0.S[1] // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v14.4S, v14.4S, v0.S[0] // ..................*......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v30.4S, v13.4S, v0.S[1] // ..............*............................................................. - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v14.4S, v22.4S, v8.S[0] // ....................*....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v13.4S, v13.4S, v0.S[0] // .............*.............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v13.4S, v30.4S, v8.S[0] // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.4S, v18.4S, v14.4S // .....................*...................................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - add v18.4S, v18.4S, v14.4S // ......................*..................................................... - mls v10.4S, v20.4S, v8.S[0] // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v20.4S, v17.4S, v0.S[0] // .......................e.................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v14.4S, v25.4S, v13.4S // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v25.4S, v25.4S, v13.4S // ................*........................................................... - sqrdmulh v13.4S, v17.4S, v0.S[1] // ........................e................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v17.4S, v22.4S, v1.S[1] // .......................................*.................................... 
- sub v30.4S, v14.4S, v24.4S // ....................................*....................................... - // gap // ............................................................................ - add v14.4S, v14.4S, v24.4S // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.4S, v18.4S, v0.S[3] // .............................*.............................................. - sub v31.4S, v25.4S, v10.4S // ..............................................*............................. - // gap // ............................................................................ - add v25.4S, v25.4S, v10.4S // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v10.4S, v22.4S, v1.S[0] // ......................................*..................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v10.4S, v17.4S, v8.S[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v18.4S, v18.4S, v0.S[2] // ............................*............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v18.4S, v24.4S, v8.S[0] // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v24.4S, v21.4S, v10.4S // .........................................*.................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v10.4S, v21.4S, v10.4S // ..........................................*................................. - mul v17.4S, v14.4S, v1.S[2] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- sqrdmulh v14.4S, v14.4S, v1.S[3] // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v21.4S, v12.4S, v18.4S // ...............................*............................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v12.4S, v12.4S, v18.4S // ................................*........................................... - mul v18.4S, v30.4S, v2.S[0] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.4S, v30.4S, v2.S[1] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v17.4S, v14.4S, v8.S[0] // ..................................................*......................... - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v14.4S, v25.4S, v2.S[2] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v18.4S, v22.4S, v8.S[0] // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.4S, v12.4S, v17.4S // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.4S, v25.4S, v2.S[3] // ...........................................................*................ - add v12.4S, v12.4S, v17.4S // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - str_vo v22, x0, 128 // .....................................................................*...... - mul v17.4S, v31.4S, v3.S[0] // ...............................................................*............ - // gap // ............................................................................ - str_vi v12, x0, 16 // ....................................................................*....... - sub v12.4S, v21.4S, v18.4S // ........................................................*................... - // gap // ............................................................................ - sqrdmulh v22.4S, v31.4S, v3.S[1] // ................................................................*........... - add v18.4S, v21.4S, v18.4S // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v14.4S, v25.4S, v8.S[0] // ............................................................*............... - str_vo v12, x0, 368 // .......................................................................*.... - // gap // ............................................................................ - str_vo v18, x0, 240 // ......................................................................*..... - // gap // ............................................................................ - // gap // ............................................................................ - mls v20.4S, v13.4S, v8.S[0] // .........................e.................................................. 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v17.4S, v22.4S, v8.S[0] // .................................................................*.......... - // gap // ............................................................................ - // gap // ............................................................................ - sub v12.4S, v10.4S, v14.4S // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v18.4S, v10.4S, v14.4S // ..............................................................*............. - mul v22.4S, v29.4S, v0.S[0] // ........e................................................................... - // gap // ............................................................................ - add v14.4S, v19.4S, v20.4S // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.4S, v29.4S, v0.S[1] // .........e.................................................................. - str_vo v12, x0, 624 // .........................................................................*.. - // gap // ............................................................................ 
- str_vo v18, x0, 496 // ........................................................................*... - sub v12.4S, v24.4S, v17.4S // ..................................................................*......... - // gap // ............................................................................ - add v18.4S, v24.4S, v17.4S // ...................................................................*........ - sqrdmulh v13.4S, v14.4S, v0.S[3] // ..................................e......................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v24.4S, v14.4S, v0.S[2] // .................................e.......................................... - str_vo v12, x0, 880 // ...........................................................................* - // gap // ............................................................................ - str_vo v18, x0, 752 // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - mls v22.4S, v25.4S, v8.S[0] // ..........e................................................................. - // gap // ............................................................................ - // gap // ............................................................................ - - // original source code - // ldr_vo v9, x0, 0 // ......................................................................*.......................................................................... 
|| ......................................................................*...................................................................... - // ldr_vo v10, x0, 128 // ..........................................................................*...................................................................... || .......................................................................*..................................................................... - // ldr_vo v11, x0, 256 // ........................................................................*........................................................................ || ......................................................................*...................................................................... - // ldr_vo v12, x0, 384 // ....e............................................................................................................................................ || .e........................................................................................................................................... - // ldr_vo v13, x0, 512 // e................................................................................................................................................ || e............................................................................................................................................ - // ldr_vo v14, x0, 640 // .............................................................................*................................................................... || ........................................................................*.................................................................... - // ldr_vo v15, x0, 768 // .........................................................................*....................................................................... 
|| .......................................................................*..................................................................... - // ldr_vo v16, x0, 896 // ...e............................................................................................................................................. || .e........................................................................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // ..........................................................e...................................................................................... || .............................................................e............................................................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ............................................................e.................................................................................... || ...............................................................e............................................................................. - // mls v24.4S, v13.4S, v8.S[0] // .....................................................................e........................................................................... || .....................................................................e....................................................................... - // sub v13.4S, v9.4S, v24.4S // .................................................................................*............................................................... || ..........................................................................*.................................................................. - // add v9.4S, v9.4S, v24.4S // ..................................................................................*.............................................................. 
|| ...........................................................................*................................................................. - // mul v24.4S, v14.4S, v0.S[0] // ........................................................................................*........................................................ || .....................................................................................*....................................................... - // sqrdmulh v14.4S, v14.4S, v0.S[1] // ......................................................................................*.......................................................... || .................................................................................*........................................................... - // mls v24.4S, v14.4S, v8.S[0] // .........................................................................................*....................................................... || .......................................................................................*..................................................... - // sub v14.4S, v10.4S, v24.4S // ...............................................................................................*................................................. || .............................................................................................*............................................... - // add v10.4S, v10.4S, v24.4S // ..............................................................................................*.................................................. || ............................................................................................*................................................ - // mul v24.4S, v15.4S, v0.S[0] // .....................................................................................*........................................................... 
|| ...............................................................................*............................................................. - // sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................................................*............................................................ || .............................................................................*............................................................... - // mls v24.4S, v15.4S, v8.S[0] // .......................................................................................*......................................................... || ...................................................................................*......................................................... - // sub v15.4S, v11.4S, v24.4S // ..........................................................................................*...................................................... || ........................................................................................*.................................................... - // add v11.4S, v11.4S, v24.4S // ...........................................................................................*..................................................... || .........................................................................................*................................................... - // mul v24.4S, v16.4S, v0.S[0] // .................e............................................................................................................................... || ...................e......................................................................................................................... - // sqrdmulh v16.4S, v16.4S, v0.S[1] // ....................e............................................................................................................................ 
|| .....................e....................................................................................................................... - // mls v24.4S, v16.4S, v8.S[0] // ......................................................e.......................................................................................... || .........................................................e................................................................................... - // sub v16.4S, v12.4S, v24.4S // .......................................................................*......................................................................... || ......................................................................*...................................................................... - // add v12.4S, v12.4S, v24.4S // ...........................................................e..................................................................................... || ..............................................................e.............................................................................. - // mul v24.4S, v11.4S, v0.S[2] // .........................................................................................................*....................................... || .......................................................................................................*..................................... - // sqrdmulh v11.4S, v11.4S, v0.S[3] // ....................................................................................................*............................................ || .................................................................................................*........................................... - // mls v24.4S, v11.4S, v8.S[0] // ..........................................................................................................*...................................... 
|| .........................................................................................................*................................... - // sub v11.4S, v9.4S, v24.4S // ...............................................................................................................*................................. || ..............................................................................................................*.............................. - // add v9.4S, v9.4S, v24.4S // ................................................................................................................*................................ || ...............................................................................................................*............................. - // mul v24.4S, v12.4S, v0.S[2] // ..................................................................e.............................................................................. || ...................................................................e......................................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // .................................................................e............................................................................... || .................................................................e........................................................................... - // mls v24.4S, v12.4S, v8.S[0] // ...........................................................................*..................................................................... || .......................................................................*..................................................................... - // sub v12.4S, v10.4S, v24.4S // ..................................................................................................*.............................................. 
|| ...............................................................................................*............................................. - // add v10.4S, v10.4S, v24.4S // ...................................................................................................*............................................. || ................................................................................................*............................................ - // mul v24.4S, v15.4S, v1.S[0] // .......................................................................................................*......................................... || ...................................................................................................*......................................... - // sqrdmulh v15.4S, v15.4S, v1.S[1] // .................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.4S, v15.4S, v8.S[0] // ........................................................................................................*........................................ || .....................................................................................................*....................................... - // sub v15.4S, v13.4S, v24.4S // ...........................................................................................................*..................................... || ..........................................................................................................*.................................. - // add v13.4S, v13.4S, v24.4S // ............................................................................................................*.................................... 
|| ...........................................................................................................*................................. - // mul v24.4S, v16.4S, v1.S[0] // ..............................................................................*.................................................................. || .........................................................................*................................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[1] // ...................................................................................*............................................................. || ...........................................................................*................................................................. - // mls v24.4S, v16.4S, v8.S[0] // ............................................................................................*.................................................... || .........................................................................................*................................................... - // sub v16.4S, v14.4S, v24.4S // .....................................................................................................*........................................... || .................................................................................................*........................................... - // add v14.4S, v14.4S, v24.4S // ......................................................................................................*.......................................... || ..................................................................................................*.......................................... - // mul v24.4S, v10.4S, v1.S[2] // .............................................................................................................*................................... 
|| ...........................................................................................................*................................. - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ..............................................................................................................*.................................. || .............................................................................................................*............................... - // mls v24.4S, v10.4S, v8.S[0] // ...................................................................................................................*............................. || ...................................................................................................................*......................... - // sub v10.4S, v9.4S, v24.4S // ......................................................................................................................*.......................... || ........................................................................................................................*.................... - // add v9.4S, v9.4S, v24.4S // ........................................................................................................................*........................ || .........................................................................................................................*................... - // mul v24.4S, v12.4S, v2.S[0] // .................................................................................................................*............................... || ...............................................................................................................*............................. - // sqrdmulh v12.4S, v12.4S, v2.S[1] // ..................................................................................................................*.............................. 
|| .................................................................................................................*........................... - // mls v24.4S, v12.4S, v8.S[0] // .....................................................................................................................*........................... || .......................................................................................................................*..................... - // sub v12.4S, v11.4S, v24.4S // ............................................................................................................................*.................... || ............................................................................................................................*................ - // add v11.4S, v11.4S, v24.4S // ..............................................................................................................................*.................. || .............................................................................................................................*............... - // mul v24.4S, v14.4S, v2.S[2] // ....................................................................................................................*............................ || .....................................................................................................................*....................... - // sqrdmulh v14.4S, v14.4S, v2.S[3] // .......................................................................................................................*......................... || .........................................................................................................................*................... - // mls v24.4S, v14.4S, v8.S[0] // ...............................................................................................................................*................. 
|| ...............................................................................................................................*............. - // sub v14.4S, v13.4S, v24.4S // ....................................................................................................................................*............ || ....................................................................................................................................*........ - // add v13.4S, v13.4S, v24.4S // .....................................................................................................................................*........... || .....................................................................................................................................*....... - // mul v24.4S, v16.4S, v3.S[0] // ..........................................................................................................................*...................... || ...........................................................................................................................*................. - // sqrdmulh v16.4S, v16.4S, v3.S[1] // .............................................................................................................................*................... || .............................................................................................................................*............... - // mls v24.4S, v16.4S, v8.S[0] // ...................................................................................................................................*............. || ...................................................................................................................................*......... - // sub v16.4S, v15.4S, v24.4S // ...........................................................................................................................................*..... 
|| ........................................................................................................................................*.... - // add v15.4S, v15.4S, v24.4S // ............................................................................................................................................*.... || .........................................................................................................................................*... - // str_vi v9, x0, 16 // ...........................................................................................................................*..................... || ............................................................................................................................*................ - // str_vo v10, x0, 112 // .........................................................................................................................*....................... || ...........................................................................................................................*................. - // str_vo v11, x0, 240 // .................................................................................................................................*............... || ................................................................................................................................*............ - // str_vo v12, x0, 368 // ................................................................................................................................*................ || ...............................................................................................................................*............. - // str_vo v13, x0, 496 // ..........................................................................................................................................*...... 
|| ........................................................................................................................................*.... - // str_vo v14, x0, 624 // .........................................................................................................................................*....... || .......................................................................................................................................*..... - // str_vo v15, x0, 752 // ................................................................................................................................................* || ............................................................................................................................................* - // str_vo v16, x0, 880 // ...............................................................................................................................................*. || ...........................................................................................................................................*. - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Wall time: 6.51s + // User time: 6.51s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sub v20.4S, v27.4S, v17.4S // .....................*...................................................... + ldr q18, [x0, #128] // .*.......................................................................... + ldr q19, [x0, #0] // *........................................................................... + ldr q11, [x0, #384] // ...*........................................................................ + sqrdmulh v22.4S, v6.4S, v0.S[1] // .............*.............................................................. 
+ ldr q30, [x0, #512] // ....*....................................................................... + add v13.4S, v27.4S, v17.4S // ......................*..................................................... + ldr q6, [x0, #656] // .....e...................................................................... + ldr q27, [x0, #272] // ..e......................................................................... + sqrdmulh v17.4S, v20.4S, v1.S[1] // ......................................*..................................... + ldr q12, [x0, #784] // ......e..................................................................... + ldr q14, [x0, #912] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v20.4S, v20.4S, v1.S[0] // .......................................*.................................... + sub v26.4S, v11.4S, v25.4S // ..........................*................................................. + // gap // ............................................................................ + add v11.4S, v11.4S, v25.4S // ...........................*................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v31.4S, v22.4S, v8.S[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v22.4S, v30.4S, v0.S[1] // ........*................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v30.4S, v30.4S, v0.S[0] // .........*.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v25.4S, v18.4S, v31.4S // ................*........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v31.4S // .................*.......................................................... + sqrdmulh v31.4S, v13.4S, v0.S[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v30.4S, v22.4S, v8.S[0] // ..........*................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v13.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v20.4S, v17.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v13.4S, v19.4S, v30.4S // ...........*................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v19.4S, v19.4S, v30.4S // ............*............................................................... + sqrdmulh v30.4S, v12.4S, v0.S[1] // ..................e......................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v17.4S, v12.4S, v0.S[0] // ...................e........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v12.4S, v13.4S, v20.4S // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v20.4S, v13.4S, v20.4S // ..........................................*................................. + sqrdmulh v13.4S, v26.4S, v1.S[1] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v26.4S, v26.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v16.4S, v11.4S, v0.S[3] // .................................*.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.4S, v13.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v22.4S, v31.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v11.4S, v11.4S, v0.S[2] // ..................................*......................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v13.4S, v25.4S, v26.4S // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + add v26.4S, v25.4S, v26.4S // ...............................................*............................ + mls v11.4S, v16.4S, v8.S[0] // ...................................*........................................ + // gap // ............................................................................ + sub v31.4S, v19.4S, v22.4S // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v19.4S, v19.4S, v22.4S // ................................*........................................... + sqrdmulh v22.4S, v13.4S, v3.S[1] // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v25.4S, v26.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.4S, v18.4S, v11.4S // ....................................*....................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + add v18.4S, v18.4S, v11.4S // .....................................*...................................... + mul v11.4S, v26.4S, v2.S[2] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v26.4S, v16.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v10.4S, v18.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v18.4S, v18.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v16.4S, v16.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v18.4S, v10.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.4S, v26.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v11.4S, v25.4S, v8.S[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.4S, v19.4S, v18.4S // ...................................................*........................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.4S, v13.4S, v3.S[0] // ................................................................*........... + add v18.4S, v19.4S, v18.4S // ....................................................*....................... + // gap // ............................................................................ + sub v19.4S, v31.4S, v16.4S // ........................................................*................... + // gap // ............................................................................ + // gap // ............................................................................ + add v31.4S, v31.4S, v16.4S // .........................................................*.................. + mls v13.4S, v22.4S, v8.S[0] // .................................................................*.......... + str q26, [x0, #128] // .....................................................................*...... + str q18, [x0], #(16) // ....................................................................*....... + sub v18.4S, v20.4S, v11.4S // .............................................................*.............. + // gap // ............................................................................ + add v20.4S, v20.4S, v11.4S // ..............................................................*............. 
+ str q19, [x0, #368] // .......................................................................*.... + sqrdmulh v19.4S, v14.4S, v0.S[1] // .......................e.................................................... + str q31, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + mul v25.4S, v14.4S, v0.S[0] // ........................e................................................... + str q18, [x0, #624] // .........................................................................*.. + // gap // ............................................................................ + str q20, [x0, #496] // ........................................................................*... + sub v20.4S, v12.4S, v13.4S // ..................................................................*......... + // gap // ............................................................................ + mls v17.4S, v30.4S, v8.S[0] // ....................e....................................................... + add v18.4S, v12.4S, v13.4S // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q20, [x0, #880] // ...........................................................................* + mul v31.4S, v6.4S, v0.S[0] // ..............e............................................................. + // gap // ............................................................................ 
+ str q18, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + mls v25.4S, v19.4S, v8.S[0] // .........................e.................................................. + // gap // ............................................................................ + // gap // ............................................................................ + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q9, [x0, #0] // .....................................................................'.*........................................................................ + // ldr q10, [x0, #(1*(1024/8))] // .....................................................................'*......................................................................... + // ldr q11, [x0, #(2*(1024/8))] // .e...................................................................'.......~.................................................................. + // ldr q12, [x0, #(3*(1024/8))] // .....................................................................'..*....................................................................... + // ldr q13, [x0, #(4*(1024/8))] // .....................................................................'....*..................................................................... + // ldr q14, [x0, #(5*(1024/8))] // e....................................................................'......~................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // ...e.................................................................'.........~................................................................ + // ldr q16, [x0, #(7*(1024/8))] // ....e................................................................'..........~............................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .........~...........................................................'...............*.......................................................... + // mul v24.4s, v13.4s, v0.s[0] // ..........~..........................................................'................*......................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............~......................................................'....................*..................................................... + // sub v13.4s, v9.4s, v24.4s // .................~...................................................'.......................*.................................................. + // add v9.4s, v9.4s, v24.4s // ..................~..................................................'........................*................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .....................................................................'...*...................................................................... + // mul v24.4s, v14.4s, v0.s[0] // ..................................................................e..'........................................................................~. + // mls v24.4s, v27.4s, v8.s[0] // ........~............................................................'..............*........................................................... + // sub v14.4s, v10.4s, v24.4s // ...........~.........................................................'.................*........................................................ 
+ // add v10.4s, v10.4s, v24.4s // ............~........................................................'..................*....................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...................e.................................................'.........................~................................................ + // mul v24.4s, v15.4s, v0.s[0] // ....................e................................................'..........................~............................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................e.....'.....................................................................~.... + // sub v15.4s, v11.4s, v24.4s // .....................................................................*.......................................................................... + // add v11.4s, v11.4s, v24.4s // .....................................................................'.....*.................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .........................................................e...........'...............................................................~.......... + // mul v24.4s, v16.4s, v0.s[0] // ...........................................................e.........'.................................................................~........ + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................e'.......................................................................... + // sub v16.4s, v12.4s, v24.4s // ......~..............................................................'............*............................................................. + // add v12.4s, v12.4s, v24.4s // .......~.............................................................'.............*............................................................ 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // .............~.......................................................'...................*...................................................... + // mul v24.4s, v11.4s, v0.s[2] // ...............~.....................................................'.....................*.................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................~.........................................'.................................*........................................ + // sub v11.4s, v9.4s, v24.4s // ................................~....................................'......................................*................................... + // add v9.4s, v9.4s, v24.4s // .................................~...................................'.......................................*.................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .........................~...........................................'...............................*.......................................... + // mul v24.4s, v12.4s, v0.s[2] // ............................~........................................'..................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................~.....................................'.....................................*.................................... + // sub v12.4s, v10.4s, v24.4s // ....................................~................................'..........................................*............................... + // add v10.4s, v10.4s, v24.4s // .....................................~...............................'...........................................*.............................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ..~..................................................................'........*................................................................. 
+ // mul v24.4s, v15.4s, v1.s[0] // .....~...............................................................'...........*.............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................~....................................................'......................*................................................... + // sub v15.4s, v13.4s, v24.4s // .....................~...............................................'...........................*.............................................. + // add v13.4s, v13.4s, v24.4s // ......................~..............................................'............................*............................................. + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .......................~.............................................'.............................*............................................ + // mul v24.4s, v16.4s, v1.s[0] // ........................~............................................'..............................*........................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................~..........................................'................................*......................................... + // sub v16.4s, v14.4s, v24.4s // .............................~.......................................'...................................*...................................... + // add v14.4s, v14.4s, v24.4s // ..............................~......................................'....................................*..................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ........................................~............................'..............................................*........................... + // mul v24.4s, v10.4s, v1.s[2] // .........................................~...........................'...............................................*.......................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................................~.........................'.................................................*........................ + // sub v10.4s, v9.4s, v24.4s // ..............................................~......................'....................................................*..................... + // add v9.4s, v9.4s, v24.4s // ................................................~....................'......................................................*................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .......................................~.............................'.............................................*............................ + // mul v24.4s, v12.4s, v2.s[0] // ..........................................~..........................'................................................*......................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................~........................'..................................................*....................... + // sub v12.4s, v11.4s, v24.4s // .................................................~...................'.......................................................*.................. + // add v11.4s, v11.4s, v24.4s // ..................................................~..................'........................................................*................. + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...................................~.................................'.........................................*................................ + // mul v24.4s, v14.4s, v2.s[2] // ......................................~..............................'............................................*............................. + // mls v24.4s, v27.4s, v8.s[0] // .............................................~.......................'...................................................*...................... 
+ // sub v14.4s, v13.4s, v24.4s // ......................................................~..............'............................................................*............. + // add v13.4s, v13.4s, v24.4s // .......................................................~.............'.............................................................*............ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..................................~..................................'........................................*................................. + // mul v24.4s, v16.4s, v3.s[0] // ...............................................~.....................'.....................................................*.................... + // mls v24.4s, v27.4s, v8.s[0] // ...................................................~.................'.........................................................*................ + // sub v16.4s, v15.4s, v24.4s // ..............................................................~......'....................................................................*..... + // add v15.4s, v15.4s, v24.4s // ................................................................~....'......................................................................*... + // str q9, [x0], #(16) // .....................................................~...............'...........................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // ....................................................~................'..........................................................*............... + // str q11, [x0, #(-16 + 2*(1024/8))] // ..........................................................~..........'................................................................*......... 
+ // str q12, [x0, #(-16 + 3*(1024/8))] // ........................................................~............'..............................................................*........... + // str q13, [x0, #(-16 + 4*(1024/8))] // .............................................................~.......'...................................................................*...... + // str q14, [x0, #(-16 + 5*(1024/8))] // ............................................................~........'..................................................................*....... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...................................................................~.'.........................................................................* + // str q16, [x0, #(-16 + 7*(1024/8))] // .................................................................~...'.......................................................................*.. + + sub count, count, #1 cbnz count, layer123_start - mls v24.4S, v13.4S, v8.S[0] // .....*.......................................................... - sub v29.4S, v19.4S, v20.4S // .*.............................................................. - ldr_vo v17, x0, 0 // *............................................................... - ldr_vo v25, x0, 768 // ...*............................................................ - ldr_vo v27, x0, 640 // ......*......................................................... - // gap // ................................................................ - // gap // ................................................................ - ldr_vo v28, x0, 128 // ....*........................................................... - // gap // ................................................................ - ldr_vo v5, x0, 256 // ..*............................................................. - mul v15.4S, v29.4S, v1.S[0] // .......*........................................................ 
- // gap // ................................................................ - add v19.4S, v17.4S, v22.4S // .........*...................................................... - // gap // ................................................................ - // gap // ................................................................ - sub v16.4S, v17.4S, v22.4S // ........*....................................................... - sqrdmulh v11.4S, v29.4S, v1.S[1] // ..........*..................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v21.4S, v25.4S, v0.S[1] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v4.4S, v25.4S, v0.S[0] // ............*................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v20.4S, v27.4S, v0.S[1] // .............*.................................................. - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v4.4S, v21.4S, v8.S[0] // ..............*................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v26.4S, v27.4S, v0.S[0] // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v26.4S, v20.4S, v8.S[0] // ................*............................................... - // gap // ................................................................ - // gap // ................................................................ - add v10.4S, v5.4S, v4.4S // ..................*............................................. - // gap // ................................................................ - // gap // ................................................................ - sub v30.4S, v5.4S, v4.4S // .................*.............................................. - mls v15.4S, v11.4S, v8.S[0] // ...................*............................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v11.4S, v10.4S, v0.S[3] // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - add v9.4S, v28.4S, v26.4S // ....................*........................................... - // gap // ................................................................ - // gap // ................................................................ - sub v4.4S, v28.4S, v26.4S // .....................*.......................................... - mul v22.4S, v10.4S, v0.S[2] // ..............................*................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v20.4S, v30.4S, v1.S[1] // ......................*......................................... - sub v7.4S, v9.4S, v24.4S // .......................*........................................ - // gap // ................................................................ - add v26.4S, v9.4S, v24.4S // ........................*....................................... - // gap // ................................................................ - // gap // ................................................................ - mls v22.4S, v11.4S, v8.S[0] // ...............................*................................ 
- // gap // ................................................................ - // gap // ................................................................ - add v23.4S, v4.4S, v15.4S // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v28.4S, v26.4S, v1.S[3] // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v5.4S, v23.4S, v2.S[2] // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v17.4S, v30.4S, v1.S[0] // ............................*................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v31.4S, v7.4S, v2.S[0] // ......................................*......................... 
- add v21.4S, v19.4S, v22.4S // .....................................*.......................... - // gap // ................................................................ - sub v15.4S, v4.4S, v15.4S // ..........................*..................................... - // gap // ................................................................ - // gap // ................................................................ - mul v27.4S, v26.4S, v1.S[2] // ..................................*............................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v17.4S, v20.4S, v8.S[0] // .............................*.................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v4.4S, v15.4S, v3.S[0] // ...............................................*................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v12.4S, v15.4S, v3.S[1] // ..................................................*............. 
- // gap // ................................................................ - // gap // ................................................................ - sub v6.4S, v16.4S, v17.4S // ................................*............................... - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v15.4S, v7.4S, v2.S[1] // .......................................*........................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v27.4S, v28.4S, v8.S[0] // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v30.4S, v23.4S, v2.S[3] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v31.4S, v15.4S, v8.S[0] // ..........................................*..................... 
- sub v15.4S, v19.4S, v22.4S // ....................................*........................... - // gap // ................................................................ - sub v7.4S, v21.4S, v27.4S // ...........................................*.................... - // gap // ................................................................ - // gap // ................................................................ - add v20.4S, v16.4S, v17.4S // .................................*.............................. - mls v4.4S, v12.4S, v8.S[0] // .......................................................*........ - // gap // ................................................................ - add v12.4S, v21.4S, v27.4S // .............................................*.................. - // gap // ................................................................ - // gap // ................................................................ - mls v5.4S, v30.4S, v8.S[0] // ....................................................*........... - str_vo v7, x0, 128 // ..............................................*................. - // gap // ................................................................ - sub v16.4S, v15.4S, v31.4S // .................................................*.............. - // gap // ................................................................ - // gap // ................................................................ - add v30.4S, v15.4S, v31.4S // ...................................................*............ - str_vi v12, x0, 16 // ................................................*............... - // gap // ................................................................ - sub v27.4S, v6.4S, v4.4S // ............................................................*... - // gap // ................................................................ - // gap // ................................................................ 
- str_vo v16, x0, 368 // .....................................................*.......... - add v16.4S, v6.4S, v4.4S // .............................................................*.. - // gap // ................................................................ - sub v23.4S, v20.4S, v5.4S // ........................................................*....... - add v28.4S, v20.4S, v5.4S // .........................................................*...... - str_vo v30, x0, 240 // ......................................................*......... - str_vo v27, x0, 880 // ..............................................................*. - // gap // ................................................................ - // gap // ................................................................ - str_vo v16, x0, 752 // ...............................................................* - // gap // ................................................................ - // gap // ................................................................ - str_vo v23, x0, 624 // ..........................................................*..... - str_vo v28, x0, 496 // ...........................................................*.... - // gap // ................................................................ - - // original source code - // ldr_vo v12, x0, 0 // ..*............................................................. || *............................................................... - // sub v20.4S, v19.4S, v20.4S // .*.............................................................. || *............................................................... - // ldr_vo v18, x0, 256 // ......*......................................................... || ...*............................................................ - // ldr_vo v14, x0, 768 // ...*............................................................ || .*.............................................................. 
- // ldr_vo v25, x0, 128 // .....*.......................................................... || ..*............................................................. - // mls v24.4S, v13.4S, v8.S[0] // *............................................................... || *............................................................... - // ldr_vo v13, x0, 640 // ....*........................................................... || .*.............................................................. - // mul v10.4S, v20.4S, v1.S[0] // .......*........................................................ || ...*............................................................ - // sub v21.4S, v12.4S, v22.4S // .........*...................................................... || .....*.......................................................... - // add v12.4S, v12.4S, v22.4S // ........*....................................................... || ....*........................................................... - // sqrdmulh v20.4S, v20.4S, v1.S[1] // ..........*..................................................... || .....*.......................................................... - // sqrdmulh v22.4S, v14.4S, v0.S[1] // ...........*.................................................... || .......*........................................................ - // mul v14.4S, v14.4S, v0.S[0] // ............*................................................... || .........*...................................................... - // sqrdmulh v30.4S, v13.4S, v0.S[1] // .............*.................................................. || ...........*.................................................... - // mls v14.4S, v22.4S, v8.S[0] // ..............*................................................. || .............*.................................................. - // mul v13.4S, v13.4S, v0.S[0] // ...............*................................................ 
|| ...............*................................................ - // mls v13.4S, v30.4S, v8.S[0] // ................*............................................... || .................*.............................................. - // sub v22.4S, v18.4S, v14.4S // ..................*............................................. || ...................*............................................ - // add v18.4S, v18.4S, v14.4S // .................*.............................................. || ..................*............................................. - // mls v10.4S, v20.4S, v8.S[0] // ...................*............................................ || ...................*............................................ - // add v14.4S, v25.4S, v13.4S // .....................*.......................................... || ......................*......................................... - // sub v25.4S, v25.4S, v13.4S // ......................*......................................... || .......................*........................................ - // sqrdmulh v17.4S, v22.4S, v1.S[1] // ........................*....................................... || .........................*...................................... - // sub v30.4S, v14.4S, v24.4S // .........................*...................................... || .........................*...................................... - // add v14.4S, v14.4S, v24.4S // ..........................*..................................... || ..........................*..................................... - // sqrdmulh v24.4S, v18.4S, v0.S[3] // ....................*........................................... || .....................*.......................................... - // sub v31.4S, v25.4S, v10.4S // ..................................*............................. || ....................................*........................... 
- // add v25.4S, v25.4S, v10.4S // ............................*................................... || ............................*................................... - // mul v10.4S, v22.4S, v1.S[0] // ...............................*................................ || .................................*.............................. - // mls v10.4S, v17.4S, v8.S[0] // ....................................*........................... || .......................................*........................ - // mul v18.4S, v18.4S, v0.S[2] // .......................*........................................ || .......................*........................................ - // mls v18.4S, v24.4S, v8.S[0] // ...........................*.................................... || ...........................*.................................... - // sub v24.4S, v21.4S, v10.4S // .......................................*........................ || ............................................*................... - // add v10.4S, v21.4S, v10.4S // ..............................................*................. || .....................................................*.......... - // mul v17.4S, v14.4S, v1.S[2] // ...................................*............................ || .....................................*.......................... - // sqrdmulh v14.4S, v14.4S, v1.S[3] // .............................*.................................. || .............................*.................................. - // sub v21.4S, v12.4S, v18.4S // ............................................*................... || ...................................................*............ - // add v12.4S, v12.4S, v18.4S // .................................*.............................. || ...................................*............................ - // mul v18.4S, v30.4S, v2.S[0] // ................................*............................... 
|| ...................................*............................ - // sqrdmulh v22.4S, v30.4S, v2.S[1] // ........................................*....................... || .............................................*.................. - // mls v17.4S, v14.4S, v8.S[0] // .........................................*...................... || ...............................................*................ - // mul v14.4S, v25.4S, v2.S[2] // ..............................*................................. || ...............................*................................ - // mls v18.4S, v22.4S, v8.S[0] // ...........................................*.................... || ...................................................*............ - // sub v22.4S, v12.4S, v17.4S // .............................................*.................. || ....................................................*........... - // sqrdmulh v25.4S, v25.4S, v2.S[3] // ..........................................*..................... || .................................................*.............. - // add v12.4S, v12.4S, v17.4S // ................................................*............... || ......................................................*......... - // str_vo v22, x0, 128 // ..................................................*............. || .......................................................*........ - // mul v17.4S, v31.4S, v3.S[0] // .....................................*.......................... || .........................................*...................... - // str_vi v12, x0, 16 // .....................................................*.......... || .........................................................*...... - // sub v12.4S, v21.4S, v18.4S // ...................................................*............ || ........................................................*....... 
- // sqrdmulh v22.4S, v31.4S, v3.S[1] // ......................................*......................... || ...........................................*.................... - // add v18.4S, v21.4S, v18.4S // ....................................................*........... || .........................................................*...... - // mls v14.4S, v25.4S, v8.S[0] // .................................................*.............. || .......................................................*........ - // str_vo v12, x0, 368 // .......................................................*........ || ...........................................................*.... - // str_vo v18, x0, 240 // ...........................................................*.... || ............................................................*... - // mls v17.4S, v22.4S, v8.S[0] // ...............................................*................ || .....................................................*.......... - // sub v12.4S, v10.4S, v14.4S // .........................................................*...... || ............................................................*... - // add v18.4S, v10.4S, v14.4S // ..........................................................*..... || ............................................................*... - // str_vo v12, x0, 624 // ..............................................................*. || ...............................................................* - // str_vo v18, x0, 496 // ...............................................................* || ...............................................................* - // sub v12.4S, v24.4S, v17.4S // ......................................................*......... || ..........................................................*..... - // add v18.4S, v24.4S, v17.4S // ........................................................*....... || ...........................................................*.... 
- // str_vo v12, x0, 880 // ............................................................*... || .............................................................*.. - // str_vo v18, x0, 752 // .............................................................*.. || ..............................................................*. - + // Instructions: 65 + // Expected cycles: 66 + // Expected IPC: 0.98 + // + // Wall time: 1.63s + // User time: 1.63s + // + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + sub v20.4S, v27.4S, v17.4S // *................................................................ + ldr q22, [x0, #128] // .*............................................................... + add v18.4S, v27.4S, v17.4S // ......*.......................................................... + ldr q11, [x0, #384] // ...*............................................................. + sqrdmulh v19.4S, v6.4S, v0.S[1] // ....*............................................................ + ldr q30, [x0, #0] // ..*.............................................................. + ldr q13, [x0, #512] // .....*........................................................... + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v6.4S, v20.4S, v1.S[1] // .......*......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ mul v20.4S, v20.4S, v1.S[0] // ........*........................................................ + sub v27.4S, v11.4S, v25.4S // .........*....................................................... + // gap // ................................................................. + add v11.4S, v11.4S, v25.4S // ..........*...................................................... + // gap // ................................................................. + // gap // ................................................................. + mls v31.4S, v19.4S, v8.S[0] // ...........*..................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v19.4S, v18.4S, v0.S[3] // ................*................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v18.4S, v0.S[2] // ..................*.............................................. + // gap // ................................................................. + // gap // ................................................................. + sub v17.4S, v22.4S, v31.4S // ..............*.................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + add v22.4S, v22.4S, v31.4S // ...............*................................................. + sqrdmulh v12.4S, v13.4S, v0.S[1] // ............*.................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v13.4S, v13.4S, v0.S[0] // .............*................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v20.4S, v6.4S, v8.S[0] // ...................*............................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v13.4S, v12.4S, v8.S[0] // .................*............................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + sqrdmulh v6.4S, v27.4S, v1.S[1] // ........................*........................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v27.4S, v27.4S, v1.S[0] // .........................*....................................... + // gap // ................................................................. + // gap // ................................................................. + sub v12.4S, v30.4S, v13.4S // ....................*............................................ + // gap // ................................................................. + // gap // ................................................................. + add v30.4S, v30.4S, v13.4S // .....................*........................................... + sqrdmulh v13.4S, v11.4S, v0.S[3] // ..........................*...................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v27.4S, v6.4S, v8.S[0] // ...........................*..................................... + sub v6.4S, v12.4S, v20.4S // ......................*.......................................... + // gap // ................................................................. + add v20.4S, v12.4S, v20.4S // .......................*......................................... 
+ // gap // ................................................................. + // gap // ................................................................. + mul v11.4S, v11.4S, v0.S[2] // .............................*................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v19.4S, v8.S[0] // ............................*.................................... + // gap // ................................................................. + // gap // ................................................................. + sub v19.4S, v17.4S, v27.4S // ..............................*.................................. + // gap // ................................................................. + // gap // ................................................................. + add v27.4S, v17.4S, v27.4S // ...............................*................................. + mls v11.4S, v13.4S, v8.S[0] // ................................*................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v13.4S, v19.4S, v3.S[1] // ...................................*............................. + // gap // ................................................................. + // gap // ................................................................. 
+ sub v17.4S, v30.4S, v18.4S // .................................*............................... + // gap // ................................................................. + // gap // ................................................................. + add v18.4S, v30.4S, v18.4S // ..................................*.............................. + sqrdmulh v30.4S, v27.4S, v2.S[3] // ....................................*............................ + // gap // ................................................................. + sub v12.4S, v22.4S, v11.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + add v11.4S, v22.4S, v11.4S // ......................................*.......................... + mul v22.4S, v27.4S, v2.S[2] // .......................................*......................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v27.4S, v12.4S, v2.S[1] // ........................................*........................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v14.4S, v11.4S, v1.S[3] // .........................................*....................... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v11.4S, v11.4S, v1.S[2] // ..........................................*...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v12.4S, v12.4S, v2.S[0] // ...........................................*..................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v11.4S, v14.4S, v8.S[0] // ............................................*.................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v12.4S, v27.4S, v8.S[0] // .............................................*................... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v22.4S, v30.4S, v8.S[0] // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + add v30.4S, v18.4S, v11.4S // .................................................*............... + // gap // ................................................................. + // gap // ................................................................. + mul v19.4S, v19.4S, v3.S[0] // ................................................*................ + // gap // ................................................................. + // gap // ................................................................. + sub v27.4S, v17.4S, v12.4S // ..................................................*.............. + // gap // ................................................................. + // gap // ................................................................. + mls v19.4S, v13.4S, v8.S[0] // ....................................................*............ + add v13.4S, v17.4S, v12.4S // ...................................................*............. + str q30, [x0], #(16) // ......................................................*.......... + sub v30.4S, v20.4S, v22.4S // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + add v20.4S, v20.4S, v22.4S // ........................................................*........ 
+ sub v18.4S, v18.4S, v11.4S // ...............................................*................. + str q27, [x0, #368] // .........................................................*....... + str q13, [x0, #240] // ..........................................................*...... + // gap // ................................................................. + // gap // ................................................................. + str q30, [x0, #624] // ...........................................................*..... + // gap // ................................................................. + // gap // ................................................................. + sub v11.4S, v6.4S, v19.4S // .............................................................*... + add v19.4S, v6.4S, v19.4S // ..............................................................*.. + str q18, [x0, #112] // .....................................................*........... + str q20, [x0, #496] // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q11, [x0, #880] // ...............................................................*. + str q19, [x0, #752] // ................................................................* + // gap // ................................................................. + + // ------------------------- new position -------------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + // sub v20.4S, v27.4S, v17.4S // *................................................................ 
+ // ldr q18, [x0, #128] // .*............................................................... + // ldr q19, [x0, #0] // .....*........................................................... + // ldr q11, [x0, #384] // ...*............................................................. + // sqrdmulh v22.4S, v6.4S, v0.S[1] // ....*............................................................ + // ldr q30, [x0, #512] // ......*.......................................................... + // add v13.4S, v27.4S, v17.4S // ..*.............................................................. + // sqrdmulh v17.4S, v20.4S, v1.S[1] // .......*......................................................... + // mul v20.4S, v20.4S, v1.S[0] // ........*........................................................ + // sub v26.4S, v11.4S, v25.4S // .........*....................................................... + // add v11.4S, v11.4S, v25.4S // ..........*...................................................... + // mls v31.4S, v22.4S, v8.S[0] // ...........*..................................................... + // sqrdmulh v22.4S, v30.4S, v0.S[1] // ................*................................................ + // mul v30.4S, v30.4S, v0.S[0] // .................*............................................... + // sub v25.4S, v18.4S, v31.4S // ..............*.................................................. + // add v18.4S, v18.4S, v31.4S // ...............*................................................. + // sqrdmulh v31.4S, v13.4S, v0.S[3] // ............*.................................................... + // mls v30.4S, v22.4S, v8.S[0] // ...................*............................................. + // mul v22.4S, v13.4S, v0.S[2] // .............*................................................... + // mls v20.4S, v17.4S, v8.S[0] // ..................*.............................................. 
+ // sub v13.4S, v19.4S, v30.4S // ......................*.......................................... + // add v19.4S, v19.4S, v30.4S // .......................*......................................... + // sub v12.4S, v13.4S, v20.4S // ..........................*...................................... + // add v20.4S, v13.4S, v20.4S // ...........................*..................................... + // sqrdmulh v13.4S, v26.4S, v1.S[1] // ....................*............................................ + // mul v26.4S, v26.4S, v1.S[0] // .....................*........................................... + // sqrdmulh v16.4S, v11.4S, v0.S[3] // ........................*........................................ + // mls v26.4S, v13.4S, v8.S[0] // .........................*....................................... + // mls v22.4S, v31.4S, v8.S[0] // .............................*................................... + // mul v11.4S, v11.4S, v0.S[2] // ............................*.................................... + // sub v13.4S, v25.4S, v26.4S // ..............................*.................................. + // add v26.4S, v25.4S, v26.4S // ...............................*................................. + // mls v11.4S, v16.4S, v8.S[0] // ................................*................................ + // sub v31.4S, v19.4S, v22.4S // ..................................*.............................. + // add v19.4S, v19.4S, v22.4S // ...................................*............................. + // sqrdmulh v22.4S, v13.4S, v3.S[1] // .................................*............................... + // sqrdmulh v25.4S, v26.4S, v2.S[3] // ....................................*............................ + // sub v16.4S, v18.4S, v11.4S // .....................................*........................... + // add v18.4S, v18.4S, v11.4S // ......................................*.......................... 
+ // mul v11.4S, v26.4S, v2.S[2] // .......................................*......................... + // sqrdmulh v26.4S, v16.4S, v2.S[1] // ........................................*........................ + // sqrdmulh v10.4S, v18.4S, v1.S[3] // .........................................*....................... + // mul v18.4S, v18.4S, v1.S[2] // ..........................................*...................... + // mul v16.4S, v16.4S, v2.S[0] // ...........................................*..................... + // mls v18.4S, v10.4S, v8.S[0] // ............................................*.................... + // mls v16.4S, v26.4S, v8.S[0] // .............................................*................... + // mls v11.4S, v25.4S, v8.S[0] // ..............................................*.................. + // sub v26.4S, v19.4S, v18.4S // .......................................................*......... + // mul v13.4S, v13.4S, v3.S[0] // ................................................*................ + // add v18.4S, v19.4S, v18.4S // ...............................................*................. + // sub v19.4S, v31.4S, v16.4S // .................................................*............... + // add v31.4S, v31.4S, v16.4S // ...................................................*............. + // mls v13.4S, v22.4S, v8.S[0] // ..................................................*.............. + // str q26, [x0, #128] // .............................................................*... + // str q18, [x0], #(16) // ....................................................*............ + // sub v18.4S, v20.4S, v11.4S // .....................................................*........... + // add v20.4S, v20.4S, v11.4S // ......................................................*.......... + // str q19, [x0, #368] // ........................................................*........ 
+ // str q31, [x0, #240] // .........................................................*....... + // str q18, [x0, #624] // ..........................................................*...... + // str q20, [x0, #496] // ..............................................................*.. + // sub v20.4S, v12.4S, v13.4S // ...........................................................*..... + // add v18.4S, v12.4S, v13.4S // ............................................................*.... + // str q20, [x0, #880] // ...............................................................*. + // str q18, [x0, #752] // ................................................................* + restore inp, STACK0 add inpp, inp, #64 @@ -1030,299 +1031,1381 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - ldr_vo v27, x2, 80 - ldr_vi v12, x4, 64 - ldr_vo v1, x1, 112 - ldr_vo v30, x1, 64 - ldr_vo v6, x1, 96 - ldr_vo v7, x1, 80 - add x1, x1, #64 - ldr_vo v29, x2, 112 - ldr_vo v3, x2, 64 - ldr_vo v13, x2, 96 - add x2, x2, #64 - mul v0.4S, v27.4S, v12.S[0] - ldr_vo v19, x4, -48 - ldr_vo v31, x4, -32 - sqrdmulh v27.4S, v27.4S, v12.S[1] - ldr_vo v4, x4, -16 - sqrdmulh v11.4S, v29.4S, v12.S[1] - mul v29.4S, v29.4S, v12.S[0] - sqrdmulh v16.4S, v13.4S, v12.S[1] - mls v29.4S, v11.4S, v8.S[0] - mul v13.4S, v13.4S, v12.S[0] - mls v13.4S, v16.4S, v8.S[0] - sub v11.4S, v1.4S, v29.4S - add v1.4S, v1.4S, v29.4S - mul v29.4S, v3.4S, v12.S[0] - sqrdmulh v3.4S, v3.4S, v12.S[1] - sub v16.4S, v6.4S, v13.4S - add v6.4S, v6.4S, v13.4S - mul v13.4S, v1.4S, v12.S[2] - sqrdmulh v1.4S, v1.4S, v12.S[3] - mul v17.4S, v6.4S, v12.S[2] - sqrdmulh v12.4S, v6.4S, v12.S[3] - mls v29.4S, v3.4S, v8.S[0] - mls v0.4S, v27.4S, v8.S[0] - mls v13.4S, v1.4S, v8.S[0] - sub v27.4S, v30.4S, v29.4S - add v1.4S, v30.4S, v29.4S - mls v17.4S, v12.4S, v8.S[0] - ldr_vi v12, x5, 192 - add v30.4S, v7.4S, v0.4S - ldr_vo v29, x5, -176 - ldr_vo v3, x5, -160 - sub v7.4S, v7.4S, v0.4S - mul v0.4S, v16.4S, v19.S[0] - ldr_vo v22, x5, -144 - ldr_vo v2, 
x5, -128 - ldr_vo v6, x5, -112 - sqrdmulh v16.4S, v16.4S, v19.S[1] - add v15.4S, v30.4S, v13.4S - sub v30.4S, v30.4S, v13.4S - mul v13.4S, v11.4S, v19.S[0] - sub v14.4S, v1.4S, v17.4S - add v1.4S, v1.4S, v17.4S - sqrdmulh v11.4S, v11.4S, v19.S[1] - mul v17.4S, v15.4S, v19.S[2] - sqrdmulh v19.4S, v15.4S, v19.S[3] - mls v13.4S, v11.4S, v8.S[0] - sqrdmulh v11.4S, v30.4S, v31.S[1] - mul v30.4S, v30.4S, v31.S[0] - add v15.4S, v7.4S, v13.4S - sub v7.4S, v7.4S, v13.4S - mls v17.4S, v19.4S, v8.S[0] - mls v30.4S, v11.4S, v8.S[0] - mul v13.4S, v15.4S, v31.S[2] - sub v19.4S, v1.4S, v17.4S - sqrdmulh v31.4S, v15.4S, v31.S[3] - add v1.4S, v1.4S, v17.4S - sub v11.4S, v14.4S, v30.4S - add v30.4S, v14.4S, v30.4S - sqrdmulh v17.4S, v7.4S, v4.S[1] - trn1 v15.4S, v1.4S, v19.4S - mul v7.4S, v7.4S, v4.S[0] + // Instructions: 72 + // Expected cycles: 72 + // Expected IPC: 1.00 + // + // Wall time: 8.05s + // User time: 8.05s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q18, [x4], #64 // .....*.................................................................. + ldr q12, [x2, #112] // ....*................................................................... + // gap // ........................................................................ + ldr q23, [x1, #64] // .*...................................................................... + ldr q9, [x1, #112] // *....................................................................... + // gap // ........................................................................ + ldr q5, [x1, #80] // .......................*................................................ + ldr q10, [x1, #96] // ......*................................................................. + add x1, x1, #64 // ...................................................................*.... 
+ ldr q24, [x4, #-32] // ..........................................*............................. + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v3.4S, v12.4S, v18.S[1] // ........*............................................................... + ldr q22, [x2, #96] // ...*.................................................................... + // gap // ........................................................................ + ldr q11, [x4, #-48] // ..................................*..................................... + // gap // ........................................................................ + // gap // ........................................................................ + ldr q4, [x2, #80] // ..*..................................................................... + // gap // ........................................................................ + mul v12.4S, v12.4S, v18.S[0] // ...........*............................................................ + ldr q2, [x2, #64] // .................*...................................................... + add x2, x2, #64 // .................................*...................................... + // gap // ........................................................................ + sqrdmulh v17.4S, v22.4S, v18.S[1] // .......*................................................................ + ldr q7, [x5, #16] // .........................................................*.............. + // gap // ........................................................................ + ldr q20, [x5], #(12*16) // ................................................................*....... + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v12.4S, v3.4S, v8.S[0] // ...............*........................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.4S, v22.4S, v18.S[0] // .........*.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v14.4S, v17.4S, v8.S[0] // ..........*............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v22.4S, v9.4S, v12.4S // ...........................*............................................ + add v21.4S, v9.4S, v12.4S // ............................*........................................... + // gap // ........................................................................ + sqrdmulh v25.4S, v2.4S, v18.S[1] // ..................*..................................................... + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mul v16.4S, v2.4S, v18.S[0] // ...................*.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v13.4S, v10.4S, v14.4S // .............*.......................................................... + sub v17.4S, v10.4S, v14.4S // ................*....................................................... + mul v15.4S, v4.4S, v18.S[0] // ............*........................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v1.4S, v13.4S, v18.S[2] // ..............*......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v4.4S, v4.4S, v18.S[1] // .....................*.................................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v0.4S, v13.4S, v18.S[3] // ..........................*............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v26.4S, v21.4S, v18.S[3] // .............................*.......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v13.4S, v21.4S, v18.S[2] // ..............................*......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v15.4S, v4.4S, v8.S[0] // ......................*................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v16.4S, v25.4S, v8.S[0] // ....................*................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v13.4S, v26.4S, v8.S[0] // ...............................*........................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v28.4S, v5.4S, v15.4S // ........................*............................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v25.4S, v5.4S, v15.4S // .........................*.............................................. + mul v5.4S, v22.4S, v11.S[0] // ........................................*............................... + // gap // ........................................................................ 
+ add v3.4S, v23.4S, v16.4S // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v4.4S, v23.4S, v16.4S // ......................................................*................. + mls v1.4S, v0.4S, v8.S[0] // ................................*....................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v21.4S, v25.4S, v13.4S // ....................................*................................... + sqrdmulh v26.4S, v22.4S, v11.S[1] // .........................................*.............................. + sub v31.4S, v25.4S, v13.4S // ............................................*........................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v16.4S, v21.4S, v11.S[2] // .......................................*................................ + add v22.4S, v3.4S, v1.4S // .....................................*.................................. + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v29.4S, v3.4S, v1.4S // ......................................*................................. + // gap // ........................................................................ + sqrdmulh v10.4S, v31.4S, v24.S[1] // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v30.4S, v21.4S, v11.S[3] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v15.4S, v31.4S, v24.S[0] // ..............................................*......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.4S, v10.4S, v8.S[0] // ....................................................*................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mls v16.4S, v30.4S, v8.S[0] // .....................................................*.................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v5.4S, v26.4S, v8.S[0] // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v2.4S, v29.4S, v15.4S // .......................................................*................ + add v15.4S, v29.4S, v15.4S // ........................................................*............... + ldr q29, [x4, #-16] // ......................................................................*. + sqrdmulh v30.4S, v17.4S, v11.S[1] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + add v21.4S, v22.4S, v16.4S // ..........................................................*............. + mul v19.4S, v17.4S, v11.S[0] // ...............................................................*........ 
+ sub v9.4S, v22.4S, v16.4S // ...........................................................*............ + // gap // ........................................................................ + trn2 v14.4S, v15.4S, v2.4S // .............................................................*.......... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v13.4S, v28.4S, v5.4S // ...............................................*........................ + mls v19.4S, v30.4S, v8.S[0] // ..................................................................*..... + // gap // ........................................................................ + trn2 v6.4S, v21.4S, v9.4S // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v31.4S, v13.4S, v24.S[2] // ..................................................*..................... + trn2 v25.2D, v6.2D, v14.2D // .................................................................*...... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v12.2D, v6.2D, v14.2D // .....................................................................*.. 
+ sqrdmulh v13.4S, v13.4S, v24.S[3] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v22.4S, v25.4S, v20.4S // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v1.4S, v28.4S, v5.4S // ................................................*....................... + sqrdmulh v10.4S, v25.4S, v7.4S // ....................................................................*... + // gap // ........................................................................ + + // ---------------------------- new position -----------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + // ldr q28, [x1, #112] // ...*.................................................................... + // ldr q5, [x1, #64] // ..*..................................................................... + // ldr q3, [x2, #80] // ...........*............................................................ + // ldr q0, [x2, #96] // .........*.............................................................. + // ldr q18, [x2, #112] // .*...................................................................... 
+ // ldr q16, [x4], #64 // *....................................................................... + // ldr q22, [x1, #96] // .....*.................................................................. + // sqrdmulh v21.4S, v0.4S, v16.S[1] // ...............*........................................................ + // sqrdmulh v6.4S, v18.4S, v16.S[1] // ........*............................................................... + // mul v30.4S, v0.4S, v16.S[0] // ...................*.................................................... + // mls v30.4S, v21.4S, v8.S[0] // ....................*................................................... + // mul v26.4S, v18.4S, v16.S[0] // ............*........................................................... + // mul v27.4S, v3.4S, v16.S[0] // ...........................*............................................ + // add v9.4S, v22.4S, v30.4S // .........................*.............................................. + // mul v12.4S, v9.4S, v16.S[2] // ............................*........................................... + // mls v26.4S, v6.4S, v8.S[0] // ..................*..................................................... + // sub v0.4S, v22.4S, v30.4S // ..........................*............................................. + // ldr q18, [x2, #64] // .............*.......................................................... + // sqrdmulh v22.4S, v18.4S, v16.S[1] // .......................*................................................ + // mul v18.4S, v18.4S, v16.S[0] // ........................*............................................... + // mls v18.4S, v22.4S, v8.S[0] // ..................................*..................................... + // sqrdmulh v4.4S, v3.4S, v16.S[1] // .............................*.......................................... + // mls v27.4S, v4.4S, v8.S[0] // .................................*...................................... 
+ // ldr q14, [x1, #80] // ....*................................................................... + // sub v29.4S, v14.4S, v27.4S // ....................................*................................... + // add v6.4S, v14.4S, v27.4S // .....................................*.................................. + // sqrdmulh v17.4S, v9.4S, v16.S[3] // ..............................*......................................... + // sub v23.4S, v28.4S, v26.4S // .....................*.................................................. + // add v28.4S, v28.4S, v26.4S // ......................*................................................. + // sqrdmulh v31.4S, v28.4S, v16.S[3] // ...............................*........................................ + // mul v26.4S, v28.4S, v16.S[2] // ................................*....................................... + // mls v26.4S, v31.4S, v8.S[0] // ...................................*.................................... + // mls v12.4S, v17.4S, v8.S[0] // .........................................*.............................. + // add x2, x2, #64 // ..............*......................................................... + // ldr q20, [x4, #-48] // ..........*............................................................. + // add v14.4S, v5.4S, v18.4S // .......................................*................................ + // add v21.4S, v6.4S, v26.4S // ..........................................*............................. + // add v9.4S, v14.4S, v12.4S // ..............................................*......................... + // sub v7.4S, v14.4S, v12.4S // ...............................................*........................ + // mul v16.4S, v21.4S, v20.S[2] // .............................................*.......................... + // mul v15.4S, v23.4S, v20.S[0] // ......................................*................................. 
+ // sqrdmulh v28.4S, v23.4S, v20.S[1] // ...........................................*............................ + // ldr q27, [x4, #-32] // .......*................................................................ + // mls v15.4S, v28.4S, v8.S[0] // .....................................................*.................. + // sub v13.4S, v6.4S, v26.4S // ............................................*........................... + // sqrdmulh v6.4S, v13.4S, v27.S[1] // ................................................*....................... + // mul v17.4S, v13.4S, v27.S[0] // ..................................................*..................... + // add v31.4S, v29.4S, v15.4S // ..............................................................*......... + // sub v1.4S, v29.4S, v15.4S // ......................................................................*. + // sqrdmulh v13.4S, v31.4S, v27.S[3] // ....................................................................*... + // mul v31.4S, v31.4S, v27.S[2] // .................................................................*...... + // sqrdmulh v27.4S, v21.4S, v20.S[3] // .................................................*...................... + // mls v17.4S, v6.4S, v8.S[0] // ...................................................*.................... + // mls v16.4S, v27.4S, v8.S[0] // ....................................................*................... + // sub v4.4S, v5.4S, v18.4S // ........................................*............................... + // sub v2.4S, v7.4S, v17.4S // ......................................................*................. + // add v15.4S, v7.4S, v17.4S // .......................................................*................ + // ldr q7, [x5, #16] // ................*....................................................... + // add v21.4S, v9.4S, v16.4S // ..........................................................*............. 
+ // sub v9.4S, v9.4S, v16.4S // ............................................................*........... + // sqrdmulh v11.4S, v0.4S, v20.S[1] // .........................................................*.............. + // trn2 v17.4S, v15.4S, v2.4S // .............................................................*.......... + // trn2 v29.4S, v21.4S, v9.4S // ................................................................*....... + // mul v19.4S, v0.4S, v20.S[0] // ...........................................................*............ + // ldr q20, [x5], #(12*16) // .................*...................................................... + // trn2 v26.2D, v29.2D, v17.2D // ..................................................................*..... + // mls v19.4S, v11.4S, v8.S[0] // ...............................................................*........ + // add x1, x1, #64 // ......*................................................................. + // sqrdmulh v10.4S, v26.4S, v7.4S // .......................................................................* + // trn1 v12.2D, v29.2D, v17.2D // ...................................................................*.... + // ldr q29, [x4, #-16] // ........................................................*............... + // mul v22.4S, v26.4S, v20.4S // .....................................................................*.. 
+ sub count, count, #1 -.p2align 2 layer45678_start: - mls v0.4S, v16.4S, v8.S[0] - trn1 v4.4S, v30.4S, v11.4S - ldr_vo v16, x5, -16 - trn2 v10.4S, v1.4S, v19.4S - ldr_vo v1, x5, -80 - ldr_vo v24, x5, -96 - mls v13.4S, v31.4S, v8.S[0] - trn2 v30.4S, v30.4S, v11.4S // gap(s) to follow - trn2 v5.2D, v15.2D, v4.2D // gap(s) to follow - trn1 v21.2D, v15.2D, v4.2D - mls v7.4S, v17.4S, v8.S[0] // gap(s) to follow - add v18.4S, v27.4S, v0.4S // gap(s) to follow - sub v25.4S, v27.4S, v0.4S - sqrdmulh v27.4S, v5.4S, v29.4S // gap(s) to follow - trn2 v20.2D, v10.2D, v30.2D // gap(s) to follow - sub v23.4S, v18.4S, v13.4S - add v28.4S, v18.4S, v13.4S // gap(s) to follow - mul v31.4S, v5.4S, v12.4S - sub v19.4S, v25.4S, v7.4S // gap(s) to follow - add v18.4S, v25.4S, v7.4S // gap(s) to follow - trn1 v9.2D, v10.2D, v30.2D - sqrdmulh v5.4S, v20.4S, v29.4S // gap(s) to follow - trn2 v10.4S, v28.4S, v23.4S // gap(s) to follow - mls v31.4S, v27.4S, v8.S[0] - trn2 v25.4S, v18.4S, v19.4S // gap(s) to follow - trn1 v15.4S, v28.4S, v23.4S // gap(s) to follow - mul v27.4S, v20.4S, v12.4S - trn1 v23.4S, v18.4S, v19.4S // gap(s) to follow - trn2 v20.2D, v10.2D, v25.2D // gap(s) to follow - mls v27.4S, v5.4S, v8.S[0] - trn1 v28.2D, v10.2D, v25.2D // gap(s) to follow - trn2 v18.2D, v15.2D, v23.2D // gap(s) to follow - trn1 v29.2D, v15.2D, v23.2D - mul v5.4S, v20.4S, v24.4S // gap(s) to follow - mul v13.4S, v18.4S, v24.4S // gap(s) to follow - sub v30.4S, v9.4S, v27.4S // gap(s) to follow - add v27.4S, v9.4S, v27.4S - sqrdmulh v12.4S, v20.4S, v1.4S - ldr_vo v7, x5, -64 - add v11.4S, v21.4S, v31.4S - ldr_vo v19, x5, -32 - ldr_vo v4, x5, -48 - sub v0.4S, v21.4S, v31.4S - sqrdmulh v1.4S, v18.4S, v1.4S - ldr_vo v17, x2, 144 - ldr_vo v15, x1, 176 - ldr_vi v14, x4, 64 // gap(s) to follow - mul v21.4S, v30.4S, v2.4S - ldr_vo v9, x2, 176 - ldr_vo v26, x1, 160 - ldr_vo v20, x2, 160 - ldr_vo v24, x2, 128 // gap(s) to follow - sqrdmulh v6.4S, v30.4S, v6.4S - ldr_vo v30, x1, 128 - ldr_vo v31, x4, -32 // 
gap(s) to follow - mul v3.4S, v27.4S, v3.4S // gap(s) to follow - mls v5.4S, v12.4S, v8.S[0] // gap(s) to follow - sqrdmulh v27.4S, v27.4S, v22.4S // gap(s) to follow - mls v13.4S, v1.4S, v8.S[0] // gap(s) to follow - sub v1.4S, v28.4S, v5.4S // gap(s) to follow - add v22.4S, v28.4S, v5.4S - mls v21.4S, v6.4S, v8.S[0] // gap(s) to follow - sqrdmulh v6.4S, v1.4S, v16.4S // gap(s) to follow - sub v16.4S, v29.4S, v13.4S // gap(s) to follow - add v29.4S, v29.4S, v13.4S - mls v3.4S, v27.4S, v8.S[0] // gap(s) to follow - sub v13.4S, v0.4S, v21.4S // gap(s) to follow - mul v27.4S, v22.4S, v7.4S - add v12.4S, v0.4S, v21.4S - ldr_vo v7, x1, 144 // gap(s) to follow - mul v1.4S, v1.4S, v19.4S - ldr_vo v19, x4, -48 // gap(s) to follow - sqrdmulh v0.4S, v22.4S, v4.4S - add v10.4S, v11.4S, v3.4S - ldr_vo v4, x4, -16 - sub v11.4S, v11.4S, v3.4S - ldr_vo v3, x5, 32 - ldr_vo v22, x5, 48 - mls v1.4S, v6.4S, v8.S[0] - ldr_vo v2, x5, 64 - ldr_vo v6, x5, 80 // gap(s) to follow - st4 {v10.4S,v11.4S,v12.4S,v13.4S}, [x1], #64 - add x1, x1, #64 - mul v13.4S, v17.4S, v14.S[0] // gap(s) to follow - sqrdmulh v17.4S, v17.4S, v14.S[1] // gap(s) to follow - mls v27.4S, v0.4S, v8.S[0] // gap(s) to follow - sqrdmulh v0.4S, v9.4S, v14.S[1] // gap(s) to follow - mul v21.4S, v9.4S, v14.S[0] - sub v12.4S, v16.4S, v1.4S // gap(s) to follow - add v11.4S, v16.4S, v1.4S // gap(s) to follow - add v9.4S, v29.4S, v27.4S - sqrdmulh v1.4S, v20.4S, v14.S[1] // gap(s) to follow - sub v10.4S, v29.4S, v27.4S - ldr_vo v29, x5, 16 // gap(s) to follow - mul v16.4S, v20.4S, v14.S[0] // gap(s) to follow - st4 {v9.4S,v10.4S,v11.4S,v12.4S}, [x2], #64 - add x2, x2, #64 - sqrdmulh v27.4S, v24.4S, v14.S[1] - ldr_vi v12, x5, 192 // gap(s) to follow - mul v11.4S, v24.4S, v14.S[0] // gap(s) to follow - mls v21.4S, v0.4S, v8.S[0] // gap(s) to follow - mls v11.4S, v27.4S, v8.S[0] // gap(s) to follow - mls v16.4S, v1.4S, v8.S[0] // gap(s) to follow - add v1.4S, v15.4S, v21.4S // gap(s) to follow - sub v15.4S, v15.4S, v21.4S - mls 
v13.4S, v17.4S, v8.S[0] // gap(s) to follow - sub v27.4S, v30.4S, v11.4S // gap(s) to follow - add v30.4S, v30.4S, v11.4S - mul v11.4S, v1.4S, v14.S[2] // gap(s) to follow - add v0.4S, v26.4S, v16.4S // gap(s) to follow - sub v16.4S, v26.4S, v16.4S - sqrdmulh v1.4S, v1.4S, v14.S[3] // gap(s) to follow - add v17.4S, v7.4S, v13.4S // gap(s) to follow - sub v7.4S, v7.4S, v13.4S - mul v13.4S, v0.4S, v14.S[2] // gap(s) to follow - sqrdmulh v14.4S, v0.4S, v14.S[3] // gap(s) to follow - mls v11.4S, v1.4S, v8.S[0] // gap(s) to follow - mul v0.4S, v16.4S, v19.S[0] // gap(s) to follow - sqrdmulh v16.4S, v16.4S, v19.S[1] // gap(s) to follow - add v1.4S, v17.4S, v11.4S // gap(s) to follow - sub v11.4S, v17.4S, v11.4S - mul v17.4S, v15.4S, v19.S[0] // gap(s) to follow - sqrdmulh v15.4S, v15.4S, v19.S[1] // gap(s) to follow - mul v21.4S, v1.4S, v19.S[2] // gap(s) to follow - sqrdmulh v1.4S, v1.4S, v19.S[3] // gap(s) to follow - mls v13.4S, v14.4S, v8.S[0] // gap(s) to follow - mls v17.4S, v15.4S, v8.S[0] // gap(s) to follow - sqrdmulh v19.4S, v11.4S, v31.S[1] // gap(s) to follow - sub v15.4S, v30.4S, v13.4S // gap(s) to follow - add v30.4S, v30.4S, v13.4S - mul v14.4S, v11.4S, v31.S[0] // gap(s) to follow - add v11.4S, v7.4S, v17.4S // gap(s) to follow - sub v7.4S, v7.4S, v17.4S - mls v21.4S, v1.4S, v8.S[0] // gap(s) to follow - mls v14.4S, v19.4S, v8.S[0] // gap(s) to follow - mul v13.4S, v11.4S, v31.S[2] // gap(s) to follow - sub v19.4S, v30.4S, v21.4S // gap(s) to follow - sqrdmulh v31.4S, v11.4S, v31.S[3] - add v1.4S, v30.4S, v21.4S // gap(s) to follow - sub v11.4S, v15.4S, v14.4S // gap(s) to follow - add v30.4S, v15.4S, v14.4S - sqrdmulh v17.4S, v7.4S, v4.S[1] // gap(s) to follow - trn1 v15.4S, v1.4S, v19.4S // gap(s) to follow - mul v7.4S, v7.4S, v4.S[0] // gap(s) to follow - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 209 + // Expected IPC: 0.69 + // + // Wall time: 2938.52s + // User time: 2938.52s + // + // 
-------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + ldr q28, [x1, #176] // ...e............................................................................................................................................ + ldr q5, [x1, #128] // e............................................................................................................................................... + mls v22.4S, v10.4S, v8.S[0] // .......................................................................................................*........................................ + ldr q3, [x2, #144] // .....e.......................................................................................................................................... + ldr q0, [x2, #160] // ......e......................................................................................................................................... + trn1 v17.4S, v21.4S, v9.4S // ..........................................................................*..................................................................... + ldr q18, [x2, #176] // .......e........................................................................................................................................ + ldr q16, [x4], #64 // ..........e..................................................................................................................................... + mls v31.4S, v13.4S, v8.S[0] // ..................................................................*............................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v1.4S, v29.S[1] // .....................................................................*.......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v12.4S, v22.4S // ........................................................................................................*....................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v11.4S, v12.4S, v22.4S // .........................................................................................................*...................................... + ldr q22, [x1, #160] // ..e............................................................................................................................................. + sqrdmulh v21.4S, v0.4S, v16.S[1] // ........................e....................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v6.4S, v18.4S, v16.S[1] // .............................e.................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v30.4S, v0.4S, v16.S[0] // .........................e...................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v30.4S, v21.4S, v8.S[0] // ..........................e..................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v26.4S, v18.4S, v16.S[0] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v27.4S, v3.4S, v16.S[0] // ....................e........................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v9.4S, v22.4S, v30.4S // ............................e................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v23.4S, v1.4S, v29.S[0] // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v12.4S, v9.4S, v16.S[2] // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v21.4S, v4.4S, v19.4S // ...............................................*................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v23.4S, v24.4S, v8.S[0] // .......................................................................*........................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v26.4S, v6.4S, v8.S[0] // ...............................e................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v4.4S, v4.4S, v19.4S // ................................................*............................................................................................... + trn1 v19.4S, v15.4S, v2.4S // ............................................................................*................................................................... + ldr q15, [x5, #-112] // ...............................................................................................*................................................ + sub v25.4S, v21.4S, v23.4S // ........................................................................*....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v2.4S, v21.4S, v23.4S // .........................................................................*...................................................................... + ldr q21, [x5, #-128] // ..............................................................................................*................................................. + // gap // ................................................................................................................................................ 
+ sub v29.4S, v4.4S, v31.4S // ...................................................................*............................................................................ + add v0.4S, v4.4S, v31.4S // ....................................................................*........................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v10.4S, v14.4S, v15.4S // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v15.4S, v0.4S, v29.4S // ..................................................................................*............................................................. + trn1 v23.4S, v2.4S, v25.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ 
+ trn2 v6.4S, v0.4S, v29.4S // ...................................................................................*............................................................ + trn1 v13.2D, v17.2D, v19.2D // ................................................................................*............................................................... + // gap // ................................................................................................................................................ + mul v24.4S, v14.4S, v21.4S // ................................................................................................................*............................... + sub v0.4S, v22.4S, v30.4S // ...........................e.................................................................................................................... + // gap // ................................................................................................................................................ + trn1 v21.2D, v15.2D, v23.2D // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v24.4S, v10.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q18, [x2, #128] // ....e........................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v18.4S, v16.S[1] // ..............e................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mul v18.4S, v18.4S, v16.S[0] // ...............e................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v18.4S, v22.4S, v8.S[0] // ................e............................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q22, [x5, #-32] // ........................................................................................................................*....................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q31, [x5, #-96] // ....................................................................................................................*........................... + trn2 v4.2D, v15.2D, v23.2D // ......................................................................................*......................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q1, [x5, #-80] // .....................................................................................................................*.......................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn2 v10.4S, v2.4S, v25.4S // .....................................................................................*.......................................................... + trn2 v25.2D, v17.2D, v19.2D // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + mul v30.4S, v4.4S, v31.4S // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v2.2D, v6.2D, v10.2D // .......................................................................................*........................................................ + trn1 v10.2D, v6.2D, v10.2D // .........................................................................................*...................................................... 
+ // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v4.4S, v1.4S // ..........................................................................................................................*..................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v2.4S, v1.4S // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v3.4S, v16.S[1] // ...................e............................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v3.4S, v2.4S, v31.4S // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v3.4S, v17.4S, v8.S[0] // .................................................................................................................................*.............. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v27.4S, v4.4S, v8.S[0] // .....................e.......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v23.4S, v10.4S, v3.4S // ...................................................................................................................................*............ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v30.4S, v19.4S, v8.S[0] // ............................................................................................................................*................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v31.4S, v21.4S, v30.4S // ..............................................................................................................................*................. + sub v30.4S, v21.4S, v30.4S // .............................................................................................................................*.................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v19.4S, v10.4S, v3.4S // ..................................................................................................................................*............. + ldr q10, [x5, #-48] // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ + ldr q2, [x5, #-64] // ......................................................................................................................*......................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v23.4S, v10.4S // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v10.4S, v19.4S, v22.4S // ..........................................................................................................................................*..... + ldr q14, [x1, #144] // .e.............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v23.4S, v23.4S, v2.4S // .....................................................................................................................................*.......... + sub v29.4S, v14.4S, v27.4S // ......................e......................................................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v14.4S, v27.4S // .......................e........................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v23.4S, v17.4S, v8.S[0] // ......................................................................................................................................*......... + ldr q21, [x5, #-16] // .........................................................................................................................*...................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v17.4S, v9.4S, v16.S[3] // ..................................e............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v19.4S, v21.4S // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v2.4S, v31.4S, v23.4S // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v1.4S, v31.4S, v23.4S // ........................................................................................................................................*....... + sub v23.4S, v28.4S, v26.4S // ................................e............................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ add v28.4S, v28.4S, v26.4S // .................................e.............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v10.4S, v3.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v28.4S, v16.S[3] // .......................................e........................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v26.4S, v28.4S, v16.S[2] // ........................................e....................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v4.4S, v30.4S, v10.4S // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v3.4S, v30.4S, v10.4S // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v26.4S, v31.4S, v8.S[0] // .........................................e...................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v12.4S, v17.4S, v8.S[0] // ....................................e........................................................................................................... + st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2], #64 // ...............................................................................................................................................* + add x2, x2, #64 // .........e...................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v25.4S, v7.4S // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mul v25.4S, v25.4S, v20.4S // .................................................................................................*.............................................. + ldr q20, [x4, #-48] // ...........e.................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v25.4S, v14.4S, v8.S[0] // ..................................................................................................*............................................. + add v14.4S, v5.4S, v18.4S // ..................e............................................................................................................................. + ldr q4, [x5, #-160] // ............................................................................................*................................................... + add v21.4S, v6.4S, v26.4S // ...........................................e.................................................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v9.4S, v14.4S, v12.4S // ......................................e......................................................................................................... + sub v7.4S, v14.4S, v12.4S // .....................................e.......................................................................................................... + // gap // ................................................................................................................................................ + ldr q3, [x5, #-144] // .............................................................................................*.................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v22.4S, v13.4S, v25.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ 
+ mul v16.4S, v21.4S, v20.S[2] // .......................................................e........................................................................................ + add v14.4S, v13.4S, v25.4S // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v15.4S, v23.4S, v20.S[0] // ..................................................e............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v25.4S, v22.4S, v24.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v24.4S, v22.4S, v24.4S // ...................................................................................................................*............................ 
+ sqrdmulh v28.4S, v23.4S, v20.S[1] // .................................................e.............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q27, [x4, #-32] // ............e................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v15.4S, v28.4S, v8.S[0] // ...................................................e............................................................................................ + // gap // ................................................................................................................................................ + sub v13.4S, v6.4S, v26.4S // ..........................................e..................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v6.4S, v13.4S, v27.S[1] // ...........................................................e.................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v17.4S, v13.4S, v27.S[0] // ............................................................e................................................................................... + add v31.4S, v29.4S, v15.4S // .....................................................e.......................................................................................... + // gap // ................................................................................................................................................ + sub v1.4S, v29.4S, v15.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v13.4S, v31.4S, v27.S[3] // ................................................................e............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v31.4S, v31.4S, v27.S[2] // .................................................................e.............................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v27.4S, v21.4S, v20.S[3] // ......................................................e......................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v12.4S, v11.4S, v3.4S // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v17.4S, v6.4S, v8.S[0] // .............................................................e.................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v16.4S, v27.4S, v8.S[0] // ........................................................e....................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v23.4S, v11.4S, v4.4S // ...........................................................................................................*.................................... + sub v4.4S, v5.4S, v18.4S // .................e.............................................................................................................................. + // gap // ................................................................................................................................................ + sub v2.4S, v7.4S, v17.4S // ..............................................................e................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v15.4S, v7.4S, v17.4S // ...............................................................e................................................................................ + mls v23.4S, v12.4S, v8.S[0] // ............................................................................................................*................................... + ldr q7, [x5, #16] // ...........................................................................................e.................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v9.4S, v16.4S // ..........................................................e..................................................................................... + sub v9.4S, v9.4S, v16.4S // .........................................................e...................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v11.4S, v0.4S, v20.S[1] // ............................................e................................................................................................... + trn2 v17.4S, v15.4S, v2.4S // .............................................................................e.................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v22.4S, v14.4S, v23.4S // ..............................................................................................................*................................. 
+ trn2 v29.4S, v21.4S, v9.4S // ...........................................................................e.................................................................... + // gap // ................................................................................................................................................ + mul v19.4S, v0.4S, v20.S[0] // .............................................e.................................................................................................. + sub v23.4S, v14.4S, v23.4S // .............................................................................................................*.................................. + ldr q20, [x5], #(12*16) // ..........................................................................................e..................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v26.2D, v29.2D, v17.2D // ...............................................................................e................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ..............................................................................................................................................*. + mls v19.4S, v11.4S, v8.S[0] // ..............................................e................................................................................................. + add x1, x1, #64 // ........e....................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v10.4S, v26.4S, v7.4S // .....................................................................................................e.......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn1 v12.2D, v29.2D, v17.2D // .................................................................................e.............................................................. + ldr q29, [x4, #-16] // .............e.................................................................................................................................. + // gap // ................................................................................................................................................ + mul v22.4S, v26.4S, v20.4S // ......................................................................................................e......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ + // ------------------------------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q9, [x1, #(16*0 + (64))] // .e..............................................................................................................................................'~........................................................................................................................................ + // ldr q10, [x1, #(16*1 + (64))] // ......................................................................e.........................................................................'.....................................................................~................................................................... + // ldr q11, [x1, #(16*2 + (64))] // ............e...................................................................................................................................'...........~............................................................................................................................. + // ldr q12, [x1, #(16*3 + (64))] // e...............................................................................................................................................~......................................................................................................................................... 
+ // ldr q13, [x2, #(16*0 + (64))] // ..........................................e.....................................................................................................'.........................................~............................................................................................... + // ldr q14, [x2, #(16*1 + (64))] // ...e............................................................................................................................................'..~...................................................................................................................................... + // ldr q15, [x2, #(16*2 + (64))] // ....e...........................................................................................................................................'...~..................................................................................................................................... + // ldr q16, [x2, #(16*3 + (64))] // ......e.........................................................................................................................................'.....~................................................................................................................................... + // add x1, x1, #64 // ...........................................................................................................................................e....'......................................................................................................................................... + // add x2, x2, #64 // ..........................................................................................e.....................................................'.........................................................................................~............................................... 
+ // ldr q0, [x4], #64 // .......e........................................................................................................................................'......~.................................................................................................................................. + // ldr q1, [x4, #(-64 + 16)] // .............................................................................................e..................................................'............................................................................................~............................................ + // ldr q2, [x4, #(-64 + 32)] // ............................................................................................................e...................................'...........................................................................................................~............................. + // ldr q3, [x4, #(-64 + 48)] // ..............................................................................................................................................e.'......................................................................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................................e....................................................................................................'..........................................~.............................................................................................. + // mul v24.4s, v13.4s, v0.s[0] // ............................................e...................................................................................................'...........................................~............................................................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // .............................................e..................................................................................................'............................................~............................................................................................ + // sub v13.4s, v9.4s, v24.4s // ..........................................................................................................................e.....................'.........................................................................................................................~............... + // add v9.4s, v9.4s, v24.4s // ...............................................................................................e................................................'..............................................................................................~.......................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .........................................................e......................................................................................'........................................................~................................................................................ + // mul v24.4s, v14.4s, v0.s[0] // ..................e.............................................................................................................................'.................~....................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................e...................................................................................'...........................................................~............................................................................. 
+ // sub v14.4s, v10.4s, v24.4s // ........................................................................e.......................................................................'.......................................................................~................................................................. + // add v10.4s, v10.4s, v24.4s // .........................................................................e......................................................................'........................................................................~................................................................ + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .............e..................................................................................................................................'............~............................................................................................................................ + // mul v24.4s, v15.4s, v0.s[0] // ...............e................................................................................................................................'..............~.......................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................e...............................................................................................................................'...............~......................................................................................................................... + // sub v15.4s, v11.4s, v24.4s // .......................................e........................................................................................................'......................................~.................................................................................................. 
+ // add v11.4s, v11.4s, v24.4s // ...................e............................................................................................................................'..................~...................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ..............e.................................................................................................................................'.............~........................................................................................................................... + // mul v24.4s, v16.4s, v0.s[0] // .................e..............................................................................................................................'................~........................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ........................e.......................................................................................................................'.......................~................................................................................................................. + // sub v16.4s, v12.4s, v24.4s // ................................................................................e...............................................................'...............................................................................~......................................................... + // add v12.4s, v12.4s, v24.4s // .................................................................................e..............................................................'................................................................................~........................................................ 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // ............................................................................e...................................................................'...........................................................................~............................................................. + // mul v24.4s, v11.4s, v0.s[2] // .....................e..........................................................................................................................'....................~.................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................e.......................................................'.......................................................................................~................................................. + // sub v11.4s, v9.4s, v24.4s // ...................................................................................................e............................................'..................................................................................................~...................................... + // add v9.4s, v9.4s, v24.4s // ..................................................................................................e.............................................'.................................................................................................~....................................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...................................................................................e............................................................'..................................................................................~...................................................... 
+ // mul v24.4s, v12.4s, v0.s[2] // ....................................................................................e...........................................................'...................................................................................~..................................................... + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................e........................................................'......................................................................................~.................................................. + // sub v12.4s, v10.4s, v24.4s // ..............................................................................................................e.................................'.............................................................................................................~........................... + // add v10.4s, v10.4s, v24.4s // .................................................................................................e..............................................'................................................................................................~........................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .................................................................................................................................e..............'................................................................................................................................~........ + // mul v24.4s, v15.4s, v1.s[0] // .....................................................................................................................................e..........'....................................................................................................................................~.... 
+ // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................................................................................e.....'......................................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ......................~.........................................................................................................................'.....................*................................................................................................................... + // add v13.4s, v13.4s, v24.4s // .........................~......................................................................................................................'........................*................................................................................................................ + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...........................................................................................................e....................................'..........................................................................................................~.............................. + // mul v24.4s, v16.4s, v1.s[0] // ........................................................................................................e.......................................'.......................................................................................................~................................. + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................e..................................'............................................................................................................~............................ 
+ // sub v16.4s, v14.4s, v24.4s // ..................................................................................................................e.............................'.................................................................................................................~....................... + // add v14.4s, v14.4s, v24.4s // .................................................................................................................e..............................'................................................................................................................~........................ + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .....................................................................................................................e..........................'....................................................................................................................~.................... + // mul v24.4s, v10.4s, v1.s[2] // ......................................................................................................e.........................................'.....................................................................................................~................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................................................e.......................'.......................................................................................................................~................. + // sub v10.4s, v9.4s, v24.4s // ................................................................................................................................e...............'...............................................................................................................................~......... 
+ // add v9.4s, v9.4s, v24.4s // ...............................................................................................................................e................'..............................................................................................................................~.......... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...............................................................................................................e................................'..............................................................................................................~.......................... + // mul v24.4s, v12.4s, v2.s[0] // ................................................................................................................e...............................'...............................................................................................................~......................... + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................................................e........................'......................................................................................................................~.................. + // sub v12.4s, v11.4s, v24.4s // ...........................................................................................................................e....................'..........................................................................................................................~.............. + // add v11.4s, v11.4s, v24.4s // ............................................................................................................................e...................'...........................................................................................................................~............. 
+ // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...................................................................................................................e............................'..................................................................................................................~...................... + // mul v24.4s, v14.4s, v2.s[2] // ....................................................................................................................e...........................'...................................................................................................................~..................... + // mls v24.4s, v27.4s, v8.s[0] // ........~.......................................................................................................................................'.......*................................................................................................................................. + // sub v14.4s, v13.4s, v24.4s // ...............................~................................................................................................................'..............................*.......................................................................................................... + // add v13.4s, v13.4s, v24.4s // ................................~...............................................................................................................'...............................*......................................................................................................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .........~......................................................................................................................................'........*................................................................................................................................ 
+ // mul v24.4s, v16.4s, v3.s[0] // ....................~...........................................................................................................................'...................*..................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .......................~........................................................................................................................'......................*.................................................................................................................. + // sub v16.4s, v15.4s, v24.4s // ............................~...................................................................................................................'...........................*............................................................................................................. + // add v15.4s, v15.4s, v24.4s // .............................~..................................................................................................................'............................*............................................................................................................ + // trn1 v25.4s, v9.4s, v10.4s // .....~..........................................................................................................................................'....*.................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ....................................................................................................................................e...........'...................................................................................................................................~..... 
+ // trn1 v27.4s, v11.4s, v12.4s // ..........................~.....................................................................................................................'.........................*............................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ..................................................................................................................................e.............'.................................................................................................................................~....... + // trn2 v11.2d, v25.2d, v27.2d // ...................................................~............................................................................................'..................................................*...................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ........................................................................................................................................e.......'.......................................................................................................................................~. + // trn1 v9.2d, v25.2d, v27.2d // .....................................~..........................................................................................................'....................................*.................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // .............................................................................................................................................e..'......................................................................................................................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ..................................~.............................................................................................................'.................................*....................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ....................................~...........................................................................................................'...................................*..................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................~............................................................................................................'..................................*...................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..................................................~.............................................................................................'.................................................*....................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ................................................~...............................................................................................'...............................................*......................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // .....................................................~..........................................................................................'....................................................*.................................................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ........................................~.......................................................................................................'.......................................*................................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // ......................................................~.........................................................................................'.....................................................*................................................................................... + // ldr q0, [ x5], #(12*16) // .......................................................................................................................................e........'......................................................................................................................................~.. + // ldr q4, [x5, #(-12*16 + 1*16)] // ..............................................................................................................................e.................'.............................................................................................................................~........... + // ldr q1, [ x5, #(-12*16 + 2*16)] // ................................................................................................~...............................................'...............................................................................................*......................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ....................................................................................................~...........................................'...................................................................................................*..................................... 
+ // ldr q2, [ x5, #(-12*16 + 4*16)] // ..............................~.................................................................................................................'.............................*........................................................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ...........................~....................................................................................................................'..........................*.............................................................................................................. + // sqrdmulh v27.4s, v11.4s, v4.4s // ...........................................................................................~....................................................'..........................................................................................*.............................................. + // mul v24.4s, v11.4s, v0.4s // ............................................................................................~...................................................'...........................................................................................*............................................. + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................~.................................................'.............................................................................................*........................................... + // sub v11.4s, v9.4s, v24.4s // .....................................................................................................~..........................................'....................................................................................................*.................................... 
+ // add v9.4s, v9.4s, v24.4s // .......................................................................................................~........................................'......................................................................................................*.................................. + // sqrdmulh v27.4s, v12.4s, v4.4s // ............................................................................................................................................e...'......................................................................................................................................... + // mul v24.4s, v12.4s, v0.4s // ...............................................................................................................................................e'......................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..~.............................................................................................................................................'.*....................................................................................................................................... + // sub v12.4s, v10.4s, v24.4s // ..........~.....................................................................................................................................'.........*............................................................................................................................... + // add v10.4s, v10.4s, v24.4s // ...........~....................................................................................................................................'..........*.............................................................................................................................. 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // ......................................................................................................................~.........................'.....................................................................................................................*................... + // mul v24.4s, v10.4s, v1.4s // .........................................................................................................................~......................'........................................................................................................................*................ + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................................~..................'............................................................................................................................*............ + // sub v10.4s, v9.4s, v24.4s // ......................................................................................................................................~.........'.....................................................................................................................................*... + // add v9.4s, v9.4s, v24.4s // ...................................................................................................................................~............'..................................................................................................................................*...... + // sqrdmulh v27.4s, v12.4s, v6.4s // .................................~..............................................................................................................'................................*........................................................................................................ 
+ // mul v24.4s, v12.4s, v2.4s // ......................................~.........................................................................................................'.....................................*................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .........................................~......................................................................................................'........................................*................................................................................................ + // sub v12.4s, v11.4s, v24.4s // .........................................................................................................~......................................'........................................................................................................*................................ + // add v11.4s, v11.4s, v24.4s // ..........................................................................................................~.....................................'.........................................................................................................*............................... + // ldr q0, [ x5, #(-12*16 + 6*16)] // ...............................................~................................................................................................'..............................................*.......................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // .................................................~..............................................................................................'................................................*........................................................................................ 
+ // ldr q1, [ x5, #(-12*16 + 8*16)] // ...................................................................~............................................................................'..................................................................*...................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ..................................................................~.............................................................................'.................................................................*....................................................................... + // ldr q2, [ x5, #(-12*16 + 10*16)] // ..............................................~.................................................................................................'.............................................*........................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...........................................................................~....................................................................'..........................................................................*.............................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // .......................................................~........................................................................................'......................................................*.................................................................................. + // mul v24.4s, v15.4s, v0.4s // ....................................................~...........................................................................................'...................................................*..................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ..............................................................~.................................................................................'.............................................................*........................................................................... + // sub v15.4s, v13.4s, v24.4s // ................................................................~...............................................................................'...............................................................*......................................................................... + // add v13.4s, v13.4s, v24.4s // ...............................................................~................................................................................'..............................................................*.......................................................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ........................................................~.......................................................................................'.......................................................*................................................................................. + // mul v24.4s, v16.4s, v0.4s // ..........................................................~.....................................................................................'.........................................................*............................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................~....................................................................................'..........................................................*.............................................................................. 
+ // sub v16.4s, v14.4s, v24.4s // .................................................................~..............................................................................'................................................................*........................................................................ + // add v14.4s, v14.4s, v24.4s // .............................................................~..................................................................................'............................................................*............................................................................ + // sqrdmulh v27.4s, v14.4s, v5.4s // ....................................................................~...........................................................................'...................................................................*..................................................................... + // mul v24.4s, v14.4s, v1.4s // .......................................................................~........................................................................'......................................................................*.................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................~.....................................................................'.........................................................................*............................................................... + // sub v14.4s, v13.4s, v24.4s // ..............................................................................~.................................................................'.............................................................................*........................................................... 
+ // add v13.4s, v13.4s, v24.4s // ...............................................................................~................................................................'..............................................................................*.......................................................... + // sqrdmulh v27.4s, v16.4s, v6.4s // .............................................................................~..................................................................'............................................................................*............................................................ + // mul v24.4s, v16.4s, v2.4s // .....................................................................~..........................................................................'....................................................................*.................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................~.............................................................'.................................................................................*....................................................... + // sub v16.4s, v15.4s, v24.4s // .....................................................................................~..........................................................'....................................................................................*.................................................... + // add v15.4s, v15.4s, v24.4s // ......................................................................................~.........................................................'.....................................................................................*................................................... 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .........................................................................................................................................~......'........................................................................................................................................* + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // .........................................................................................~......................................................'........................................................................................*................................................ + + sub count, count, #1 cbnz count, layer45678_start - trn1 v4.4S, v30.4S, v11.4S - mls v0.4S, v16.4S, v8.S[0] - trn2 v26.4S, v1.4S, v19.4S - trn2 v5.4S, v30.4S, v11.4S - mls v13.4S, v31.4S, v8.S[0] - trn2 v1.2D, v15.2D, v4.2D - mls v7.4S, v17.4S, v8.S[0] - ldr_vo v17, x5, -48 - add v21.4S, v27.4S, v0.4S - sub v10.4S, v27.4S, v0.4S - sqrdmulh v31.4S, v1.4S, v29.4S - trn2 v14.2D, v26.2D, v5.2D - mul v23.4S, v1.4S, v12.4S - sub v18.4S, v21.4S, v13.4S - add v0.4S, v21.4S, v13.4S - sub v20.4S, v10.4S, v7.4S - add v24.4S, v10.4S, v7.4S - ldr_vo v7, x5, -96 - sqrdmulh v11.4S, v14.4S, v29.4S - ldr_vo v1, x5, -80 - trn2 v16.4S, v0.4S, v18.4S - mls v23.4S, v31.4S, v8.S[0] - trn2 v9.4S, v24.4S, v20.4S - trn1 v21.4S, v0.4S, v18.4S - mul v25.4S, v14.4S, v12.4S - trn1 v29.4S, v24.4S, v20.4S - ldr_vo v12, x5, -16 - trn2 v24.2D, v16.2D, v9.2D - mls v25.4S, v11.4S, v8.S[0] - trn2 v19.2D, v21.2D, v29.2D - trn1 v27.2D, v26.2D, v5.2D - mul v11.4S, v24.4S, v7.4S - trn1 v5.2D, v16.2D, v9.2D - trn1 v9.2D, v21.2D, v29.2D - mul v13.4S, v19.4S, v7.4S - sub v28.4S, v27.4S, v25.4S - add v0.4S, v27.4S, v25.4S - sqrdmulh v27.4S, v24.4S, v1.4S - mul v18.4S, v28.4S, v2.4S - sqrdmulh v16.4S, v19.4S, v1.4S - mls v11.4S, v27.4S, v8.S[0] - sqrdmulh v25.4S, v28.4S, v6.4S - ldr_vo v30, x5, -32 - sqrdmulh v31.4S, v0.4S, v22.4S - sub 
v24.4S, v5.4S, v11.4S - add v28.4S, v5.4S, v11.4S - mls v13.4S, v16.4S, v8.S[0] - ldr_vo v1, x5, -64 - sqrdmulh v2.4S, v24.4S, v12.4S - mul v27.4S, v24.4S, v30.4S - add v14.4S, v9.4S, v13.4S - sqrdmulh v26.4S, v28.4S, v17.4S - mls v27.4S, v2.4S, v8.S[0] - mul v5.4S, v28.4S, v1.4S - sub v28.4S, v9.4S, v13.4S - mul v11.4S, v0.4S, v3.4S - add v6.4S, v28.4S, v27.4S - mls v5.4S, v26.4S, v8.S[0] - sub v7.4S, v28.4S, v27.4S - trn1 v27.2D, v15.2D, v4.2D - mls v11.4S, v31.4S, v8.S[0] - mls v18.4S, v25.4S, v8.S[0] - add v24.4S, v27.4S, v23.4S - sub v27.4S, v27.4S, v23.4S - add v4.4S, v14.4S, v5.4S - sub v5.4S, v14.4S, v5.4S - sub v25.4S, v24.4S, v11.4S - add v24.4S, v24.4S, v11.4S - add v26.4S, v27.4S, v18.4S - sub v27.4S, v27.4S, v18.4S - st4 {v4.4S,v5.4S,v6.4S,v7.4S}, [x2], #64 - st4 {v24.4S,v25.4S,v26.4S,v27.4S}, [x1], #64 + // Instructions: 72 + // Expected cycles: 65 + // Expected IPC: 1.11 + // + // Wall time: 4.52s + // User time: 4.52s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + mls v22.4S, v10.4S, v8.S[0] // *....................................................................... + trn1 v18.4S, v21.4S, v9.4S // .*...................................................................... + ldr q11, [x5, #-112] // ...........*............................................................ + trn1 v0.4S, v15.4S, v2.4S // ..........*............................................................. + ldr q30, [x5, #-128] // ..............*......................................................... + ldr q6, [x5, #-32] // .........................*.............................................. + mls v31.4S, v13.4S, v8.S[0] // ..*..................................................................... + sub v13.4S, v4.4S, v19.4S // .......*................................................................ 
+ ldr q27, [x5, #-96] // ..........................*............................................. + add v19.4S, v4.4S, v19.4S // .........*.............................................................. + ldr q17, [x5, #-80] // ............................*........................................... + ldr q14, [x5, #-48] // ...........................................*............................ + trn1 v25.2D, v18.2D, v0.2D // .....................*.................................................. + sqrdmulh v26.4S, v1.4S, v29.S[1] // ...*.................................................................... + ldr q16, [x5, #-64] // ............................................*........................... + sub v10.4S, v12.4S, v22.4S // ....*................................................................... + ldr q23, [x5, #-160] // ............................................................*........... + ldr q28, [x5, #-16] // .................................................*...................... + mul v1.4S, v1.4S, v29.S[0] // ......*................................................................. + add v22.4S, v12.4S, v22.4S // .....*.................................................................. + ldr q12, [x5, #-144] // .............................................................*.......... + trn2 v18.2D, v18.2D, v0.2D // ..............................*......................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v11.4S, v10.4S, v11.4S // .................*...................................................... + sub v0.4S, v19.4S, v31.4S // ...............*........................................................ + // gap // ........................................................................ 
+ add v19.4S, v19.4S, v31.4S // ................*....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v1.4S, v26.4S, v8.S[0] // ........*............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v30.4S, v10.4S, v30.4S // ......................*................................................. + trn1 v26.4S, v19.4S, v0.4S // ..................*..................................................... + // gap // ........................................................................ + trn2 v19.4S, v19.4S, v0.4S // ....................*................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v30.4S, v11.4S, v8.S[0] // ........................*............................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v11.4S, v13.4S, v1.4S // ............*........................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqrdmulh v0.4S, v18.4S, v7.4S // .........................................................*.............. + add v13.4S, v13.4S, v1.4S // .............*.......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v20.4S, v18.4S, v20.4S // ..........................................................*............. + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v18.4S, v13.4S, v11.4S // ...................*.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mul v1.4S, v22.4S, v23.4S // ...................................................................*.... + trn2 v11.4S, v13.4S, v11.4S // .............................*.......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v20.4S, v0.4S, v8.S[0] // ...........................................................*............ + trn1 v0.2D, v26.2D, v18.2D // .......................*................................................ + // gap // ........................................................................ 
+ trn2 v13.2D, v19.2D, v11.2D // ................................*....................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v22.4S, v22.4S, v12.4S // ..................................................................*..... + trn2 v18.2D, v26.2D, v18.2D // ...........................*............................................ + // gap // ........................................................................ + trn1 v19.2D, v19.2D, v11.2D // .................................*...................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v11.4S, v13.4S, v17.4S // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v12.4S, v25.4S, v20.4S // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v17.4S, v18.4S, v17.4S // ..................................*..................................... + add v20.4S, v25.4S, v20.4S // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mul v13.4S, v13.4S, v27.4S // ....................................*................................... + sub v26.4S, v12.4S, v30.4S // ................................................................*....... + // gap // ........................................................................ + add v25.4S, v12.4S, v30.4S // .................................................................*...... + // gap // ........................................................................ + // gap // ........................................................................ + mls v13.4S, v11.4S, v8.S[0] // .....................................*.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v18.4S, v18.4S, v27.4S // ...............................*........................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v18.4S, v17.4S, v8.S[0] // .......................................*................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v11.4S, v19.4S, v13.4S // ..........................................*............................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v1.4S, v22.4S, v8.S[0] // ....................................................................*... + add v19.4S, v19.4S, v13.4S // ......................................*................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v22.4S, v11.4S, v6.4S // ..............................................*......................... + // gap // ........................................................................ + // gap // ........................................................................ + add v30.4S, v0.4S, v18.4S // ........................................*............................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v13.4S, v19.4S, v14.4S // .............................................*.......................... + sub v18.4S, v0.4S, v18.4S // .........................................*.............................. + // gap // ........................................................................ + add v23.4S, v20.4S, v1.4S // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqrdmulh v11.4S, v11.4S, v28.4S // ..................................................*..................... + sub v24.4S, v20.4S, v1.4S // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v20.4S, v19.4S, v16.4S // ...............................................*........................ + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + mls v20.4S, v13.4S, v8.S[0] // ................................................*....................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v22.4S, v11.4S, v8.S[0] // .....................................................*.................. + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v11.4S, v30.4S, v20.4S // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + add v10.4S, v30.4S, v20.4S // ....................................................*................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v13.4S, v18.4S, v22.4S // ......................................................*................. + add v12.4S, v18.4S, v22.4S // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2], #64 // ........................................................*............... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + + // ---------------------------- new position -----------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + // mls v22.4S, v10.4S, v8.S[0] // *....................................................................... 
+ // trn1 v17.4S, v21.4S, v9.4S // .*...................................................................... + // mls v31.4S, v13.4S, v8.S[0] // ......*................................................................. + // sqrdmulh v24.4S, v1.4S, v29.S[1] // .............*.......................................................... + // sub v14.4S, v12.4S, v22.4S // ...............*........................................................ + // add v11.4S, v12.4S, v22.4S // ...................*.................................................... + // mul v23.4S, v1.4S, v29.S[0] // ..................*..................................................... + // sub v21.4S, v4.4S, v19.4S // .......*................................................................ + // mls v23.4S, v24.4S, v8.S[0] // .........................*.............................................. + // add v4.4S, v4.4S, v19.4S // .........*.............................................................. + // trn1 v19.4S, v15.4S, v2.4S // ...*.................................................................... + // ldr q15, [x5, #-112] // ..*..................................................................... + // sub v25.4S, v21.4S, v23.4S // ..............................*......................................... + // add v2.4S, v21.4S, v23.4S // ................................*....................................... + // ldr q21, [x5, #-128] // ....*................................................................... + // sub v29.4S, v4.4S, v31.4S // .......................*................................................ + // add v0.4S, v4.4S, v31.4S // ........................*............................................... + // sqrdmulh v10.4S, v14.4S, v15.4S // ......................*................................................. + // trn1 v15.4S, v0.4S, v29.4S // ...........................*............................................ 
+ // trn1 v23.4S, v2.4S, v25.4S // ..................................*..................................... + // trn2 v6.4S, v0.4S, v29.4S // ............................*........................................... + // trn1 v13.2D, v17.2D, v19.2D // ............*........................................................... + // mul v24.4S, v14.4S, v21.4S // ..........................*............................................. + // trn1 v21.2D, v15.2D, v23.2D // ......................................*................................. + // mls v24.4S, v10.4S, v8.S[0] // .............................*.......................................... + // ldr q22, [x5, #-32] // .....*.................................................................. + // ldr q31, [x5, #-96] // ........*............................................................... + // trn2 v4.2D, v15.2D, v23.2D // .........................................*.............................. + // ldr q1, [x5, #-80] // ..........*............................................................. + // trn2 v10.4S, v2.4S, v25.4S // ....................................*................................... + // trn2 v25.2D, v17.2D, v19.2D // .....................*.................................................. + // mul v30.4S, v4.4S, v31.4S // ...................................................*.................... + // trn2 v2.2D, v6.2D, v10.2D // .......................................*................................ + // trn1 v10.2D, v6.2D, v10.2D // ..........................................*............................. + // sqrdmulh v19.4S, v4.4S, v1.4S // .............................................*.......................... + // sqrdmulh v17.4S, v2.4S, v1.4S // ...........................................*............................ + // mul v3.4S, v2.4S, v31.4S // ...............................................*........................ 
+ // mls v3.4S, v17.4S, v8.S[0] // ..................................................*..................... + // add v23.4S, v10.4S, v3.4S // .......................................................*................ + // mls v30.4S, v19.4S, v8.S[0] // ....................................................*................... + // add v31.4S, v21.4S, v30.4S // .........................................................*.............. + // sub v30.4S, v21.4S, v30.4S // ...........................................................*............ + // sub v19.4S, v10.4S, v3.4S // .....................................................*.................. + // ldr q10, [x5, #-48] // ...........*............................................................ + // ldr q2, [x5, #-64] // ..............*......................................................... + // sqrdmulh v17.4S, v23.4S, v10.4S // ..........................................................*............. + // mul v10.4S, v19.4S, v22.4S // ........................................................*............... + // mul v23.4S, v23.4S, v2.4S // ...............................................................*........ + // mls v23.4S, v17.4S, v8.S[0] // .................................................................*...... + // ldr q21, [x5, #-16] // .................*...................................................... + // sqrdmulh v3.4S, v19.4S, v21.4S // .............................................................*.......... + // sub v2.4S, v31.4S, v23.4S // ...................................................................*.... + // add v1.4S, v31.4S, v23.4S // ....................................................................*... + // mls v10.4S, v3.4S, v8.S[0] // ..................................................................*..... + // sub v4.4S, v30.4S, v10.4S // .....................................................................*.. 
+ // add v3.4S, v30.4S, v10.4S // ......................................................................*. + // st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2], #64 // .......................................................................* + // sqrdmulh v14.4S, v25.4S, v7.4S // ...............................*........................................ + // mul v25.4S, v25.4S, v20.4S // .................................*...................................... + // mls v25.4S, v14.4S, v8.S[0] // .....................................*.................................. + // ldr q4, [x5, #-160] // ................*....................................................... + // ldr q3, [x5, #-144] // ....................*................................................... + // sub v22.4S, v13.4S, v25.4S // ............................................*........................... + // add v14.4S, v13.4S, v25.4S // ..............................................*......................... + // sub v25.4S, v22.4S, v24.4S // ................................................*....................... + // add v24.4S, v22.4S, v24.4S // .................................................*...................... + // sqrdmulh v12.4S, v11.4S, v3.4S // ........................................*............................... + // mul v23.4S, v11.4S, v4.4S // ...................................*.................................... + // mls v23.4S, v12.4S, v8.S[0] // ......................................................*................. + // add v22.4S, v14.4S, v23.4S // ............................................................*........... + // sub v23.4S, v14.4S, v23.4S // ..............................................................*......... + // st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ................................................................*....... 
+ pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_firestorm.s index ea1d6a37..e0fde38c 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_firestorm.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, 
consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi 
root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] 
ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -375,554 +347,582 @@ _ntt_dilithium_123_45678_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr q21, [x0, #640] // ..*..................................... - ldr q11, [x0, #768] // .*...................................... - ldr q14, [x0, #896] // *....................................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - ldr q17, [x0, #512] // ...*.................................... - ldr q15, [x0, #0] // .....*.................................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - ldr q12, [x0, #256] // ........*............................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v25.4S, v14.4S, v0.S[0] // ....*................................... - sqrdmulh v4.4S, v14.4S, v0.S[1] // ......*................................. - sqrdmulh v31.4S, v11.4S, v0.S[1] // .......*................................ - mul v13.4S, v11.4S, v0.S[0] // .........*.............................. - ldr q14, [x0, #384] // ............*........................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqrdmulh v18.4S, v21.4S, v0.S[1] // ..........*............................. - mul v5.4S, v21.4S, v0.S[0] // ...........*............................ - mul v23.4S, v17.4S, v0.S[0] // .............*.......................... - sqrdmulh v9.4S, v17.4S, v0.S[1] // ..............*......................... - ldr q21, [x0, #128] // ...............*........................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- mls v13.4S, v31.4S, v8.S[0] // .................*...................... - mls v25.4S, v4.4S, v8.S[0] // ................*....................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v5.4S, v18.4S, v8.S[0] // ..................*..................... - mls v23.4S, v9.4S, v8.S[0] // .....................*.................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v9.4S, v12.4S, v13.4S // ......................*................. - add v29.4S, v12.4S, v13.4S // .......................*................ - add v18.4S, v14.4S, v25.4S // ...................*.................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v17.4S, v14.4S, v25.4S // ....................*................... - add v6.4S, v15.4S, v23.4S // .....................................*.. 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v7.4S, v29.4S, v0.S[2] // ...............................*........ - sqrdmulh v29.4S, v29.4S, v0.S[3] // ..............................*......... - sqrdmulh v24.4S, v18.4S, v0.S[3] // ..........................*............. - mul v11.4S, v18.4S, v0.S[2] // ...........................*............ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v20.4S, v9.4S, v1.S[0] // ............................*........... - sqrdmulh v26.4S, v9.4S, v1.S[1] // .............................*.......... - sqrdmulh v31.4S, v17.4S, v1.S[1] // ........................*............... - mul v27.4S, v17.4S, v1.S[0] // .........................*.............. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v10.4S, v15.4S, v23.4S // .................................*...... - sub v15.4S, v21.4S, v5.4S // ................................*....... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v7.4S, v29.4S, v8.S[0] // .......................................* - mls v11.4S, v24.4S, v8.S[0] // ....................................*... 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v9.4S, v21.4S, v5.4S // ...................................*.... - mls v20.4S, v26.4S, v8.S[0] // ......................................*. - mls v27.4S, v31.4S, v8.S[0] // ..................................*..... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - - // original source code - // ldr q21, [x0, #896] // ..*..................................... - // ldr q19, [x0, #768] // .*...................................... - // ldr q9, [x0, #640] // *....................................... - // ldr q4, [x0, #512] // ...*.................................... - // mul v20.4S, v21.4S, v0.S[0] // ......*................................. - // ldr q28, [x0, #0] // ....*................................... - // sqrdmulh v14.4S, v21.4S, v0.S[1] // .......*................................ - // sqrdmulh v24.4S, v19.4S, v0.S[1] // ........*............................... - // ldr q21, [x0, #256] // .....*.................................. - // mul v11.4S, v19.4S, v0.S[0] // .........*.............................. - // sqrdmulh v19.4S, v9.4S, v0.S[1] // ...........*............................ - // mul v9.4S, v9.4S, v0.S[0] // ............*........................... - // ldr q27, [x0, #384] // ..........*............................. - // mul v12.4S, v4.4S, v0.S[0] // .............*.......................... - // sqrdmulh v15.4S, v4.4S, v0.S[1] // ..............*......................... 
- // ldr q4, [x0, #128] // ...............*........................ - // mls v20.4S, v14.4S, v8.S[0] // .................*...................... - // mls v11.4S, v24.4S, v8.S[0] // ................*....................... - // mls v9.4S, v19.4S, v8.S[0] // ..................*..................... - // add v14.4S, v27.4S, v20.4S // ......................*................. - // sub v22.4S, v27.4S, v20.4S // .......................*................ - // mls v12.4S, v15.4S, v8.S[0] // ...................*.................... - // sub v7.4S, v21.4S, v11.4S // ....................*................... - // add v6.4S, v21.4S, v11.4S // .....................*.................. - // sqrdmulh v24.4S, v22.4S, v1.S[1] // ...............................*........ - // mul v27.4S, v22.4S, v1.S[0] // ................................*....... - // sqrdmulh v21.4S, v14.4S, v0.S[3] // ...........................*............ - // mul v11.4S, v14.4S, v0.S[2] // ............................*........... - // mul v20.4S, v7.4S, v1.S[0] // .............................*.......... - // sqrdmulh v17.4S, v7.4S, v1.S[1] // ..............................*......... - // sqrdmulh v22.4S, v6.4S, v0.S[3] // ..........................*............. - // mul v7.4S, v6.4S, v0.S[2] // .........................*.............. - // sub v15.4S, v4.4S, v9.4S // ..................................*..... - // sub v10.4S, v28.4S, v12.4S // .................................*...... - // mls v27.4S, v24.4S, v8.S[0] // .......................................* - // add v9.4S, v4.4S, v9.4S // .....................................*.. - // mls v11.4S, v21.4S, v8.S[0] // ....................................*... - // add v6.4S, v28.4S, v12.4S // ........................*............... - // mls v20.4S, v17.4S, v8.S[0] // ......................................*. - // mls v7.4S, v22.4S, v8.S[0] // ...................................*.... 
+ // Instructions: 33 + // Expected cycles: 16 + // Expected IPC: 2.06 + // + // Wall time: 0.31s + // User time: 0.31s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + ldr q24, [x0, #896] // ..*.............................. + ldr q16, [x0, #640] // *................................ + ldr q7, [x0, #768] // ...*............................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q4, [x0, #512] // .*............................... + ldr q17, [x0, #384] // ..........*...................... + ldr q10, [x0, #0] // ....*............................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q29, [x0, #128] // ..............*.................. + ldr q30, [x0, #256] // .....*........................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v18.4S, v24.4S, v0.S[1] // ........*........................ + mul v5.4S, v24.4S, v0.S[0] // .........*....................... + mul v15.4S, v16.4S, v0.S[0] // .......*......................... 
+ sqrdmulh v12.4S, v7.4S, v0.S[1] // .............*................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v22.4S, v16.4S, v0.S[1] // ...........*..................... + mul v19.4S, v7.4S, v0.S[0] // ............*.................... + sqrdmulh v26.4S, v4.4S, v0.S[1] // ..........................*...... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v16.4S, v4.4S, v0.S[0] // ......*.......................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v5.4S, v18.4S, v8.S[0] // ...............*................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v15.4S, v22.4S, v8.S[0] // ...................*............. + mls v19.4S, v12.4S, v8.S[0] // ................*................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v16.4S, v26.4S, v8.S[0] // ................................* + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v20.4S, v17.4S, v5.4S // .................*............... + sub v24.4S, v17.4S, v5.4S // ..................*.............. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v7.4S, v30.4S, v19.4S // ....................*............ + sub v31.4S, v29.4S, v15.4S // ...........................*..... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v22.4S, v20.4S, v0.S[2] // ......................*.......... + sqrdmulh v23.4S, v20.4S, v0.S[3] // ........................*........ + mul v28.4S, v24.4S, v1.S[0] // .....................*........... + sqrdmulh v6.4S, v24.4S, v1.S[1] // .......................*......... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v20.4S, v29.4S, v15.4S // ...............................*. + sqrdmulh v12.4S, v7.4S, v1.S[1] // .........................*....... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ mul v29.4S, v7.4S, v1.S[0] // ............................*.... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v22.4S, v23.4S, v8.S[0] // .............................*... + mls v28.4S, v6.4S, v8.S[0] // ..............................*.. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // ldr q9, [x0, #640] // .*............................... + // ldr q7, [x0, #512] // ...*............................. + // ldr q26, [x0, #896] // *................................ + // ldr q15, [x0, #768] // ..*.............................. + // ldr q10, [x0, #0] // .....*........................... + // ldr q30, [x0, #256] // .......*......................... + // mul v16.4S, v7.4S, v0.S[0] // ...............*................. + // mul v6.4S, v9.4S, v0.S[0] // ..........*...................... + // sqrdmulh v13.4S, v26.4S, v0.S[1] // ........*........................ + // mul v26.4S, v26.4S, v0.S[0] // .........*....................... + // ldr q12, [x0, #384] // ....*............................ + // sqrdmulh v9.4S, v9.4S, v0.S[1] // ............*.................... + // mul v19.4S, v15.4S, v0.S[0] // .............*................... + // sqrdmulh v15.4S, v15.4S, v0.S[1] // ...........*..................... + // ldr q21, [x0, #128] // ......*.......................... + // mls v26.4S, v13.4S, v8.S[0] // ................*................ 
+ // mls v19.4S, v15.4S, v8.S[0] // ..................*.............. + // add v25.4S, v12.4S, v26.4S // ....................*............ + // sub v24.4S, v12.4S, v26.4S // .....................*........... + // mls v6.4S, v9.4S, v8.S[0] // .................*............... + // sub v14.4S, v30.4S, v19.4S // ......................*.......... + // mul v28.4S, v24.4S, v1.S[0] // ..........................*...... + // mul v22.4S, v25.4S, v0.S[2] // ........................*........ + // sqrdmulh v26.4S, v24.4S, v1.S[1] // ...........................*..... + // sqrdmulh v23.4S, v25.4S, v0.S[3] // .........................*....... + // sqrdmulh v12.4S, v14.4S, v1.S[1] // .............................*... + // sqrdmulh v7.4S, v7.4S, v0.S[1] // ..............*.................. + // sub v31.4S, v21.4S, v6.4S // .......................*......... + // mul v29.4S, v14.4S, v1.S[0] // ..............................*.. + // mls v22.4S, v23.4S, v8.S[0] // ...............................*. + // mls v28.4S, v26.4S, v8.S[0] // ................................* + // add v20.4S, v21.4S, v6.4S // ............................*.... + // mls v16.4S, v7.4S, v8.S[0] // ...................*............. sub count, count, #1 layer123_start: - ldr q21, [x0, #912] // .......e.................................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Wall time: 41.62s + // User time: 41.62s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + add v4.4S, v30.4S, v19.4S // ......................*..................................................... + ldr q9, [x0, #656] // .....e...................................................................... + ldr q7, [x0, #528] // ....e....................................................................... 
+ ldr q26, [x0, #912] // .......e.................................................................... + mls v29.4S, v12.4S, v8.S[0] // ........................................*................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + add v27.4S, v20.4S, v22.4S // .....................................*...................................... + sub v5.4S, v20.4S, v22.4S // ....................................*....................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + ldr q15, [x0, #784] // ......e..................................................................... + add v6.4S, v31.4S, v28.4S // ...............................................*............................ + sub v23.4S, v31.4S, v28.4S // ..............................................*............................. + mul v28.4S, v4.4S, v0.S[2] // .............................*.............................................. + sub v24.4S, v10.4S, v16.4S // ...........*................................................................ + add v18.4S, v10.4S, v16.4S // ............*............................................................... + ldr q10, [x0, #16] // e........................................................................... // gap // ............................................................................ - add v17.4S, v15.4S, v27.4S // ...............................................*............................ 
- add v23.4S, v9.4S, v11.4S // .....................................*...................................... - ldr q19, [x0, #784] // ......e..................................................................... - sub v22.4S, v9.4S, v11.4S // ....................................*....................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v12.4S, v15.4S, v27.4S // ..............................................*............................. - add v26.4S, v6.4S, v7.4S // ................................*........................................... - ldr q9, [x0, #656] // .....e...................................................................... - add v16.4S, v10.4S, v20.4S // ..........................................*................................. - sub v10.4S, v10.4S, v20.4S // .........................................*.................................. + mul v22.4S, v23.4S, v3.S[0] // ................................................................*........... + sqrdmulh v11.4S, v6.4S, v2.S[3] // ..........................................................*................. + mul v14.4S, v6.4S, v2.S[2] // ...........................................................*................ + sqrdmulh v25.4S, v23.4S, v3.S[1] // ...............................................................*............ + ldr q30, [x0, #272] // ..e......................................................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ 
+ mul v16.4S, v7.4S, v0.S[0] // .........e.................................................................. + mul v6.4S, v9.4S, v0.S[0] // ..............e............................................................. + sqrdmulh v13.4S, v26.4S, v0.S[1] // .......................e.................................................... + mul v26.4S, v26.4S, v0.S[0] // ........................e................................................... + ldr q12, [x0, #400] // ...e........................................................................ // gap // ............................................................................ - ldr q4, [x0, #528] // ....e....................................................................... - mul v5.4S, v12.4S, v3.S[0] // ...............................................................*............ - sqrdmulh v31.4S, v12.4S, v3.S[1] // ................................................................*........... - mul v29.4S, v22.4S, v2.S[0] // .....................................................*...................... // gap // ............................................................................ // gap // ............................................................................ + sqrdmulh v9.4S, v9.4S, v0.S[1] // .............e.............................................................. + mul v19.4S, v15.4S, v0.S[0] // ...................e........................................................ + sqrdmulh v15.4S, v15.4S, v0.S[1] // ..................e......................................................... + sqrdmulh v20.4S, v4.4S, v0.S[3] // ............................*............................................... // gap // ............................................................................ - sqrdmulh v22.4S, v22.4S, v2.S[1] // ......................................................*..................... 
- sqrdmulh v13.4S, v23.4S, v1.S[3] // .................................................*.......................... - sqrdmulh v25.4S, v17.4S, v2.S[3] // ...........................................................*................ - mul v20.4S, v21.4S, v0.S[0] // .......................e.................................................... - ldr q28, [x0, #16] // e........................................................................... - sqrdmulh v14.4S, v21.4S, v0.S[1] // ........................e................................................... + ldr q21, [x0, #144] // .e.......................................................................... // gap // ............................................................................ // gap // ............................................................................ + mls v14.4S, v11.4S, v8.S[0] // ............................................................*............... + add v23.4S, v24.4S, v29.4S // ..........................................*................................. + sqrdmulh v17.4S, v5.4S, v2.S[1] // .....................................................*...................... // gap // ............................................................................ - mul v18.4S, v23.4S, v1.S[2] // ................................................*........................... - sqrdmulh v24.4S, v19.4S, v0.S[1] // ...................e........................................................ - ldr q21, [x0, #272] // ..e......................................................................... - mul v30.4S, v17.4S, v2.S[2] // ..........................................................*................. - mul v11.4S, v19.4S, v0.S[0] // ..................e......................................................... // gap // ............................................................................ // gap // ............................................................................ 
// gap // ............................................................................ - sqrdmulh v19.4S, v9.4S, v0.S[1] // ..............e............................................................. - mul v9.4S, v9.4S, v0.S[0] // .............e.............................................................. - ldr q27, [x0, #400] // ...e........................................................................ - mls v29.4S, v22.4S, v8.S[0] // .......................................................*.................... + mls v22.4S, v25.4S, v8.S[0] // .................................................................*.......... + mls v26.4S, v13.4S, v8.S[0] // .........................e.................................................. // gap // ............................................................................ + sub v25.4S, v24.4S, v29.4S // .........................................*.................................. // gap // ............................................................................ // gap // ............................................................................ - mls v5.4S, v31.4S, v8.S[0] // .................................................................*.......... - mul v12.4S, v4.4S, v0.S[0] // ........e................................................................... - sqrdmulh v15.4S, v4.4S, v0.S[1] // .........e.................................................................. - ldr q4, [x0, #144] // .e.......................................................................... - mls v20.4S, v14.4S, v8.S[0] // .........................e.................................................. // gap // ............................................................................ // gap // ............................................................................ + mul v13.4S, v5.4S, v2.S[0] // ......................................................*..................... 
+ sqrdmulh v31.4S, v27.4S, v1.S[3] // ................................................*........................... // gap // ............................................................................ - sub v31.4S, v6.4S, v7.4S // ...............................*............................................ - mls v18.4S, v13.4S, v8.S[0] // ..................................................*......................... - mls v30.4S, v25.4S, v8.S[0] // ............................................................*............... - mls v11.4S, v24.4S, v8.S[0] // ....................e....................................................... + mul v4.4S, v27.4S, v1.S[2] // .................................................*.......................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v19.4S, v15.4S, v8.S[0] // ....................e....................................................... + mls v28.4S, v20.4S, v8.S[0] // ..............................*............................................. + add v29.4S, v23.4S, v14.4S // ..............................................................*............. + sub v5.4S, v23.4S, v14.4S // .............................................................*.............. + add v11.4S, v25.4S, v22.4S // ...................................................................*........ + sub v27.4S, v25.4S, v22.4S // ..................................................................*......... // gap // ............................................................................ // gap // ............................................................................ - sub v24.4S, v10.4S, v5.4S // ..................................................................*......... 
- sub v7.4S, v31.4S, v29.4S // ........................................................*................... - add v29.4S, v31.4S, v29.4S // .........................................................*.................. - mls v9.4S, v19.4S, v8.S[0] // ...............e............................................................ // gap // ............................................................................ // gap // ............................................................................ + add v25.4S, v12.4S, v26.4S // ...........................e................................................ + sub v24.4S, v12.4S, v26.4S // ..........................e................................................. + mls v6.4S, v9.4S, v8.S[0] // ...............e............................................................ // gap // ............................................................................ // gap // ............................................................................ - add v31.4S, v10.4S, v5.4S // ...................................................................*........ - add v14.4S, v27.4S, v20.4S // ...........................e................................................ - sub v22.4S, v27.4S, v20.4S // ..........................e................................................. - mls v12.4S, v15.4S, v8.S[0] // ..........e................................................................. // gap // ............................................................................ // gap // ............................................................................ + mls v13.4S, v17.4S, v8.S[0] // .......................................................*.................... + str q11, [x0, #768] // ..........................................................................*. + str q5, [x0, #640] // .........................................................................*.. 
+ sub v9.4S, v18.4S, v28.4S // ...............................*............................................ + add v5.4S, v18.4S, v28.4S // ................................*........................................... + sub v14.4S, v30.4S, v19.4S // .....................e...................................................... // gap // ............................................................................ // gap // ............................................................................ - sub v19.4S, v26.4S, v18.4S // ...................................................*........................ - str q7, [x0, #384] // .......................................................................*.... - sub v7.4S, v21.4S, v11.4S // .....................e...................................................... - add v6.4S, v21.4S, v11.4S // ......................e..................................................... - str q24, [x0, #896] // ...........................................................................* + mls v4.4S, v31.4S, v8.S[0] // ..................................................*......................... + mul v28.4S, v24.4S, v1.S[0] // ............................................e............................... + mul v22.4S, v25.4S, v0.S[2] // ..................................e......................................... + sqrdmulh v26.4S, v24.4S, v1.S[1] // ...........................................e................................ + str q29, [x0, #512] // ........................................................................*... // gap // ............................................................................ + sqrdmulh v23.4S, v25.4S, v0.S[3] // .................................e.......................................... // gap // ............................................................................ - add v10.4S, v16.4S, v30.4S // ..............................................................*............. 
- str q29, [x0, #256] // ......................................................................*..... - sqrdmulh v24.4S, v22.4S, v1.S[1] // ............................................e............................... - mul v27.4S, v22.4S, v1.S[0] // ...........................................e................................ - sqrdmulh v21.4S, v14.4S, v0.S[3] // ..................................e......................................... - mul v11.4S, v14.4S, v0.S[2] // .................................e.......................................... // gap // ............................................................................ + sqrdmulh v12.4S, v14.4S, v1.S[1] // ......................................e..................................... + sub v15.4S, v9.4S, v13.4S // ........................................................*................... + str q27, [x0, #896] // ...........................................................................* // gap // ............................................................................ // gap // ............................................................................ - mul v20.4S, v7.4S, v1.S[0] // ......................................e..................................... - sqrdmulh v17.4S, v7.4S, v1.S[1] // .......................................e.................................... - sqrdmulh v22.4S, v6.4S, v0.S[3] // .............................e.............................................. - str q10, [x0, #512] // ........................................................................*... - mul v7.4S, v6.4S, v0.S[2] // ............................e............................................... // gap // ............................................................................ + add v18.4S, v9.4S, v13.4S // .........................................................*.................. + sqrdmulh v7.4S, v7.4S, v0.S[1] // ........e................................................................... 
+ add v11.4S, v5.4S, v4.4S // ....................................................*....................... + sub v9.4S, v5.4S, v4.4S // ...................................................*........................ + sub v31.4S, v21.4S, v6.4S // ................e........................................................... + mul v29.4S, v14.4S, v1.S[0] // .......................................e.................................... // gap // ............................................................................ // gap // ............................................................................ - sub v15.4S, v4.4S, v9.4S // ................e........................................................... - add v29.4S, v26.4S, v18.4S // ....................................................*....................... - sub v10.4S, v28.4S, v12.4S // ...........e................................................................ - sub v5.4S, v16.4S, v30.4S // .............................................................*.............. - str q31, [x0, #768] // ..........................................................................*. // gap // ............................................................................ // gap // ............................................................................ + mls v22.4S, v23.4S, v8.S[0] // ...................................e........................................ + mls v28.4S, v26.4S, v8.S[0] // .............................................e.............................. + add v20.4S, v21.4S, v6.4S // .................e.......................................................... + str q18, [x0, #256] // ......................................................................*..... // gap // ............................................................................ - mls v27.4S, v24.4S, v8.S[0] // .............................................e.............................. 
- add v9.4S, v4.4S, v9.4S // .................e.......................................................... - str q19, [x0, #128] // .....................................................................*...... - mls v11.4S, v21.4S, v8.S[0] // ...................................e........................................ + str q15, [x0, #384] // .......................................................................*.... // gap // ............................................................................ // gap // ............................................................................ + str q9, [x0, #128] // .....................................................................*...... // gap // ............................................................................ // gap // ............................................................................ - str q29, [x0], #(16) // ....................................................................*....... - add v6.4S, v28.4S, v12.4S // ............e............................................................... - str q5, [x0, #624] // .........................................................................*.. - mls v20.4S, v17.4S, v8.S[0] // ........................................e................................... - mls v7.4S, v22.4S, v8.S[0] // ..............................e............................................. + str q11, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - - // original source code - // ldr q9, [x0, #0] // ..................e.........................................................|.................e....................................................... 
- // ldr q10, [x0, #(1*(1024/8))] // ................................e...........................................|...............................e......................................... - // ldr q11, [x0, #(2*(1024/8))] // ......................e.....................................................|.....................e................................................... - // ldr q12, [x0, #(3*(1024/8))] // ...........................e................................................|..........................e.............................................. - // ldr q13, [x0, #(4*(1024/8))] // ..........e.................................................................|.........e............................................................... - // ldr q14, [x0, #(5*(1024/8))] // .......e....................................................................|......e.................................................................. - // ldr q15, [x0, #(6*(1024/8))] // ...e........................................................................|..e...................................................................... - // ldr q16, [x0, #(7*(1024/8))] // e...........................................................................e......................................................................... - // mul v24.4s, v13.4s, v0.s[0] // ..............................e.............................................|.............................e........................................... - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ...............................e............................................|..............................e.......................................... - // mls v24.4s, v13.4s, v8.s[0] // .............................................e..............................|............................................e............................ 
- // sub v13.4s, v9.4s, v24.4s // ................................................................e...........|...............................................................e......... - // add v9.4s, v9.4s, v24.4s // ........................................................................e...|.......................................................................e. - // mul v24.4s, v14.4s, v0.s[0] // ..........................e.................................................|.........................e............................................... - // sqrdmulh v14.4s, v14.4s, v0.s[1] // .........................e..................................................|........................e................................................ - // mls v24.4s, v14.4s, v8.s[0] // .........................................e..................................|........................................e................................ - // sub v14.4s, v10.4s, v24.4s // ..............................................................e.............|.............................................................e........... - // add v10.4s, v10.4s, v24.4s // ....................................................................e.......|...................................................................e..... - // mul v24.4s, v15.4s, v0.s[0] // ........................e...................................................|.......................e................................................. - // sqrdmulh v15.4s, v15.4s, v0.s[1] // .....................e......................................................|....................e.................................................... - // mls v24.4s, v15.4s, v8.s[0] // .....................................e......................................|....................................e.................................... 
- // sub v15.4s, v11.4s, v24.4s // ................................................e...........................|...............................................e......................... - // add v11.4s, v11.4s, v24.4s // .................................................e..........................|................................................e........................ - // mul v24.4s, v16.4s, v0.s[0] // .................e..........................................................|................e........................................................ - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ...................e........................................................|..................e...................................................... - // mls v24.4s, v16.4s, v8.s[0] // .................................e..........................................|................................e........................................ - // sub v16.4s, v12.4s, v24.4s // ............................................e...............................|...........................................e............................. - // add v12.4s, v12.4s, v24.4s // ...........................................e................................|..........................................e.............................. - // mul v24.4s, v11.4s, v0.s[2] // .............................................................e..............|............................................................e............ - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ...........................................................e................|..........................................................e.............. - // mls v24.4s, v11.4s, v8.s[0] // ...........................................................................e|......................................................................... 
- // sub v11.4s, v9.4s, v24.4s // ..................................*.........................................|.................................*....................................... - // add v9.4s, v9.4s, v24.4s // ......*.....................................................................|.....*................................................................... - // mul v24.4s, v12.4s, v0.s[2] // ........................................................e...................|.......................................................e................. - // sqrdmulh v12.4s, v12.4s, v0.s[3] // .......................................................e....................|......................................................e.................. - // mls v24.4s, v12.4s, v8.s[0] // ......................................................................e.....|.....................................................................e... - // sub v12.4s, v10.4s, v24.4s // ....*.......................................................................|...*..................................................................... - // add v10.4s, v10.4s, v24.4s // ..*.........................................................................|.*....................................................................... - // mul v24.4s, v15.4s, v1.s[0] // .........................................................e..................|........................................................e................ - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ..........................................................e.................|.........................................................e............... - // mls v24.4s, v15.4s, v8.s[0] // ..........................................................................e.|......................................................................... 
- // sub v15.4s, v13.4s, v24.4s // .........*..................................................................|........*................................................................ - // add v13.4s, v13.4s, v24.4s // ........*...................................................................|.......*................................................................. - // mul v24.4s, v16.4s, v1.s[0] // ......................................................e.....................|.....................................................e................... - // sqrdmulh v16.4s, v16.4s, v1.s[1] // .....................................................e......................|....................................................e.................... - // mls v24.4s, v16.4s, v8.s[0] // ...................................................................e........|..................................................................e...... - // sub v16.4s, v14.4s, v24.4s // .....*......................................................................|....*.................................................................... - // add v14.4s, v14.4s, v24.4s // .*..........................................................................|*........................................................................ - // mul v24.4s, v10.4s, v1.s[2] // ....................*.......................................................|...................*..................................................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ...............*............................................................|..............*.......................................................... - // mls v24.4s, v10.4s, v8.s[0] // ...................................*........................................|..................................*...................................... 
- // sub v10.4s, v9.4s, v24.4s // ..............................................*.............................|.............................................*........................... - // add v9.4s, v9.4s, v24.4s // ...............................................................*............|..............................................................*.......... - // mul v24.4s, v12.4s, v2.s[0] // .............*..............................................................|............*............................................................ - // sqrdmulh v12.4s, v12.4s, v2.s[1] // ..............*.............................................................|.............*........................................................... - // mls v24.4s, v12.4s, v8.s[0] // ............................*...............................................|...........................*............................................. - // sub v12.4s, v11.4s, v24.4s // .......................................*....................................|......................................*.................................. - // add v11.4s, v11.4s, v24.4s // ........................................*...................................|.......................................*................................. - // mul v24.4s, v14.4s, v2.s[2] // .......................*....................................................|......................*.................................................. - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ................*...........................................................|...............*......................................................... - // mls v24.4s, v14.4s, v8.s[0] // ....................................*.......................................|...................................*..................................... 
- // sub v14.4s, v13.4s, v24.4s // .................................................................*..........|................................................................*........ - // add v13.4s, v13.4s, v24.4s // ...................................................*........................|..................................................*...................... - // mul v24.4s, v16.4s, v3.s[0] // ...........*................................................................|..........*.............................................................. - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ............*...............................................................|...........*............................................................. - // mls v24.4s, v16.4s, v8.s[0] // .............................*..............................................|............................*............................................ - // sub v16.4s, v15.4s, v24.4s // ......................................*.....................................|.....................................*................................... - // add v15.4s, v15.4s, v24.4s // ..........................................*.................................|.........................................*............................... - // str q9, [x0], #(16) // .......................................................................*....|......................................................................*.. - // str q10, [x0, #(-16 + 1*(1024/8))] // .....................................................................*......|....................................................................*.... - // str q11, [x0, #(-16 + 2*(1024/8))] // ....................................................*.......................|...................................................*..................... 
- // str q12, [x0, #(-16 + 3*(1024/8))] // ...............................................*............................|..............................................*.......................... - // str q13, [x0, #(-16 + 4*(1024/8))] // ............................................................*...............|...........................................................*............. - // str q14, [x0, #(-16 + 5*(1024/8))] // .........................................................................*..|........................................................................* - // str q15, [x0, #(-16 + 6*(1024/8))] // ..................................................................*.........|.................................................................*....... - // str q16, [x0, #(-16 + 7*(1024/8))] // ..................................................*.........................|.................................................*....................... + mls v16.4S, v7.4S, v8.S[0] // ..........e................................................................. + + // ------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q9, [x0, #0] // ............e..............................................................'............~............................................................. + // ldr q10, [x0, #(1*(1024/8))] // ...........................e...............................................'...........................~.............................................. + // ldr q11, [x0, #(2*(1024/8))] // .................e.........................................................'.................~........................................................ 
+ // ldr q12, [x0, #(3*(1024/8))] // ......................e....................................................'......................~................................................... + // ldr q13, [x0, #(4*(1024/8))] // .e.........................................................................'.~........................................................................ + // ldr q14, [x0, #(5*(1024/8))] // e..........................................................................'~......................................................................... + // ldr q15, [x0, #(6*(1024/8))] // ......e....................................................................'......~................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ..e........................................................................'..~....................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ..............................................................e............'..............................................................~........... + // mul v24.4s, v13.4s, v0.s[0] // ..................e........................................................'..................~....................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................e'.......................................................................... + // sub v13.4s, v9.4s, v24.4s // ..........~................................................................'..........*............................................................... + // add v9.4s, v9.4s, v24.4s // ...........~...............................................................'...........*.............................................................. 
+ // sqrdmulh v27.4s, v14.4s, v0.s[1] // .......................e...................................................'.......................~.................................................. + // mul v24.4s, v14.4s, v0.s[0] // ...................e.......................................................'...................~...................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................e.............................'.............................................~............................ + // sub v14.4s, v10.4s, v24.4s // .................................................................e.........'.................................................................~........ + // add v10.4s, v10.4s, v24.4s // .....................................................................e.....'.....................................................................~.... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .........................e.................................................'.........................~................................................ + // mul v24.4s, v15.4s, v0.s[0] // ........................e..................................................'........................~................................................. + // mls v24.4s, v27.4s, v8.s[0] // .....................................e.....................................'.....................................~.................................... + // sub v15.4s, v11.4s, v24.4s // ...................................................e.......................'...................................................~...................... + // add v11.4s, v11.4s, v24.4s // ...........................................................................*.......................................................................... 
+ // sqrdmulh v27.4s, v16.4s, v0.s[1] // ....................e......................................................'....................~..................................................... + // mul v24.4s, v16.4s, v0.s[0] // .....................e.....................................................'.....................~.................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................e..........................................'................................~......................................... + // sub v16.4s, v12.4s, v24.4s // ............................................e..............................'............................................~............................. + // add v12.4s, v12.4s, v24.4s // ...........................................e...............................'...........................................~.............................. + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ..........................~................................................'..........................*............................................... + // mul v24.4s, v11.4s, v0.s[2] // .........~.................................................................'.........*................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ......................................~....................................'......................................*................................... + // sub v11.4s, v9.4s, v24.4s // .................................................~.........................'.................................................*........................ + // add v9.4s, v9.4s, v24.4s // ..................................................~........................'..................................................*....................... 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // .........................................................e.................'.........................................................~................ + // mul v24.4s, v12.4s, v0.s[2] // ......................................................e....................'......................................................~................... + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................e.......'...................................................................~...... + // sub v12.4s, v10.4s, v24.4s // .....~.....................................................................'.....*.................................................................... + // add v10.4s, v10.4s, v24.4s // ....~......................................................................'....*..................................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ..........................................................e................'..........................................................~............... + // mul v24.4s, v15.4s, v1.s[0] // ..................................................................e........'..................................................................~....... + // mls v24.4s, v27.4s, v8.s[0] // ...~.......................................................................'...*...................................................................... + // sub v15.4s, v13.4s, v24.4s // .................................~.........................................'.................................*........................................ + // add v13.4s, v13.4s, v24.4s // .............................~.............................................'.............................*............................................ 
+ // sqrdmulh v27.4s, v16.4s, v1.s[1] // .......................................................e...................'.......................................................~.................. + // mul v24.4s, v16.4s, v1.s[0] // .....................................................e.....................'.....................................................~.................... + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................e......'....................................................................~..... + // sub v16.4s, v14.4s, v24.4s // ........~..................................................................'........*................................................................. + // add v14.4s, v14.4s, v24.4s // .......~...................................................................'.......*.................................................................. + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...................................~.......................................'...................................*...................................... + // mul v24.4s, v10.4s, v1.s[2] // ....................................~......................................'....................................*..................................... + // mls v24.4s, v27.4s, v8.s[0] // ....................................................~......................'....................................................*..................... + // sub v10.4s, v9.4s, v24.4s // ................................................................~..........'................................................................*......... + // add v9.4s, v9.4s, v24.4s // ...............................................................~...........'...............................................................*.......... 
+ // sqrdmulh v27.4s, v12.4s, v2.s[1] // ..............................~............................................'..............................*........................................... + // mul v24.4s, v12.4s, v2.s[0] // ..................................~........................................'..................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................~............................'..............................................*........................... + // sub v12.4s, v11.4s, v24.4s // ...........................................................~...............'...........................................................*.............. + // add v11.4s, v11.4s, v24.4s // .............................................................~.............'.............................................................*............ + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ..............~............................................................'..............*........................................................... + // mul v24.4s, v14.4s, v2.s[2] // ...............~...........................................................'...............*.......................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................~..............................................'............................*............................................. + // sub v14.4s, v13.4s, v24.4s // ........................................~..................................'........................................*................................. + // add v13.4s, v13.4s, v24.4s // .......................................~...................................'.......................................*.................................. 
+ // sqrdmulh v27.4s, v16.4s, v3.s[1] // ................~..........................................................'................*......................................................... + // mul v24.4s, v16.4s, v3.s[0] // .............~.............................................................'.............*............................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...............................~...........................................'...............................*.......................................... + // sub v16.4s, v15.4s, v24.4s // ..........................................~................................'..........................................*............................... + // add v15.4s, v15.4s, v24.4s // .........................................~.................................'.........................................*................................ + // str q9, [x0], #(16) // .........................................................................~.'.........................................................................* + // str q10, [x0, #(-16 + 1*(1024/8))] // ........................................................................~..'........................................................................*. + // str q11, [x0, #(-16 + 2*(1024/8))] // ......................................................................~....'......................................................................*... + // str q12, [x0, #(-16 + 3*(1024/8))] // .......................................................................~...'.......................................................................*.. + // str q13, [x0, #(-16 + 4*(1024/8))] // ........................................................~..................'........................................................*................. 
+ // str q14, [x0, #(-16 + 5*(1024/8))] // ................................................~..........................'................................................*......................... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...............................................~...........................'...............................................*.......................... + // str q16, [x0, #(-16 + 7*(1024/8))] // ............................................................~..............'............................................................*............. sub count, count, #1 cbnz count, layer123_start - sub v29.4S, v6.4S, v7.4S // .................*.................. - sub v13.4S, v9.4S, v11.4S // ..*................................. - sub v31.4S, v10.4S, v20.4S // ......*............................. - add v22.4S, v9.4S, v11.4S // .*.................................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v26.4S, v10.4S, v20.4S // .....*.............................. - sub v14.4S, v15.4S, v27.4S // ...*................................ - add v24.4S, v15.4S, v27.4S // *................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v15.4S, v13.4S, v2.S[0] // .........*.......................... - sqrdmulh v23.4S, v13.4S, v2.S[1] // ..........*......................... - mul v20.4S, v22.4S, v1.S[2] // .............*...................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - sqrdmulh v4.4S, v22.4S, v1.S[3] // ...........*........................ - sqrdmulh v17.4S, v14.4S, v3.S[1] // ........*........................... - mul v14.4S, v14.4S, v3.S[0] // .......*............................ - sqrdmulh v25.4S, v24.4S, v2.S[3] // ............*....................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v13.4S, v24.4S, v2.S[2] // ..............*..................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v15.4S, v23.4S, v8.S[0] // ...............*.................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v14.4S, v17.4S, v8.S[0] // ................*................... - add v17.4S, v6.4S, v7.4S // ....*............................... - mls v20.4S, v4.4S, v8.S[0] // ..................*................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v13.4S, v25.4S, v8.S[0] // ...................*................ - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v22.4S, v29.4S, v15.4S // .....................*.............. - add v16.4S, v29.4S, v15.4S // ......................*............. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v7.4S, v31.4S, v14.4S // ....................*............... - add v18.4S, v31.4S, v14.4S // .......................*............ - add v19.4S, v17.4S, v20.4S // ..............................*..... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v6.4S, v26.4S, v13.4S // ...........................*........ - sub v23.4S, v26.4S, v13.4S // ...............................*.... - str q22, [x0, #384] // .........................*.......... - str q16, [x0, #256] // ............................*....... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v26.4S, v17.4S, v20.4S // ........................*........... - str q7, [x0, #896] // ..........................*......... - str q19, [x0], #(16) // ..................................*. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - str q18, [x0, #752] // ................................*... - str q6, [x0, #496] // .............................*...... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str q23, [x0, #624] // ...................................* - str q26, [x0, #112] // .................................*.. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - - // original source code - // add v17.4S, v15.4S, v27.4S // ......*............................. - // add v23.4S, v9.4S, v11.4S // ...*................................ - // sub v22.4S, v9.4S, v11.4S // .*.................................. - // sub v12.4S, v15.4S, v27.4S // .....*.............................. - // add v26.4S, v6.4S, v7.4S // .................*.................. - // add v16.4S, v10.4S, v20.4S // ....*............................... - // sub v10.4S, v10.4S, v20.4S // ..*................................. - // mul v5.4S, v12.4S, v3.S[0] // ............*....................... - // sqrdmulh v31.4S, v12.4S, v3.S[1] // ...........*........................ - // mul v29.4S, v22.4S, v2.S[0] // .......*............................ - // sqrdmulh v22.4S, v22.4S, v2.S[1] // ........*........................... - // sqrdmulh v13.4S, v23.4S, v1.S[3] // ..........*......................... - // sqrdmulh v25.4S, v17.4S, v2.S[3] // .............*...................... - // mul v18.4S, v23.4S, v1.S[2] // .........*.......................... 
- // mul v30.4S, v17.4S, v2.S[2] // ..............*..................... - // mls v29.4S, v22.4S, v8.S[0] // ...............*.................... - // mls v5.4S, v31.4S, v8.S[0] // ................*................... - // sub v31.4S, v6.4S, v7.4S // *................................... - // mls v18.4S, v13.4S, v8.S[0] // ..................*................. - // mls v30.4S, v25.4S, v8.S[0] // ...................*................ - // sub v24.4S, v10.4S, v5.4S // ......................*............. - // sub v7.4S, v31.4S, v29.4S // ....................*............... - // add v29.4S, v31.4S, v29.4S // .....................*.............. - // add v31.4S, v10.4S, v5.4S // .......................*............ - // sub v19.4S, v26.4S, v18.4S // .............................*...... - // str q7, [x0, #384] // ...........................*........ - // str q24, [x0, #896] // ..............................*..... - // add v10.4S, v16.4S, v30.4S // .........................*.......... - // str q29, [x0, #256] // ............................*....... - // str q10, [x0, #512] // .................................*.. - // add v29.4S, v26.4S, v18.4S // ........................*........... - // sub v5.4S, v16.4S, v30.4S // ..........................*......... - // str q31, [x0, #768] // ................................*... - // str q19, [x0, #128] // ...................................* - // str q29, [x0], #(16) // ...............................*.... - // str q5, [x0, #624] // ..................................*. + // Instructions: 43 + // Expected cycles: 14 + // Expected IPC: 3.07 + // + // Wall time: 0.85s + // User time: 0.85s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + sub v14.4S, v31.4S, v28.4S // .....*..................................... + add v17.4S, v30.4S, v19.4S // *.......................................... + mls v29.4S, v12.4S, v8.S[0] // .*......................................... 
+ add v27.4S, v31.4S, v28.4S // ....*...................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v23.4S, v20.4S, v22.4S // ..*........................................ + sub v7.4S, v20.4S, v22.4S // ...*....................................... + sub v11.4S, v10.4S, v16.4S // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v25.4S, v17.4S, v0.S[2] // ......*.................................... + sqrdmulh v6.4S, v17.4S, v0.S[3] // .............*............................. + mul v5.4S, v14.4S, v3.S[0] // .........*................................. + sqrdmulh v22.4S, v14.4S, v3.S[1] // ............*.............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v24.4S, v11.4S, v29.4S // ...............*........................... + sub v29.4S, v11.4S, v29.4S // ..................*........................ + sqrdmulh v17.4S, v27.4S, v2.S[3] // ..........*................................ + mul v20.4S, v27.4S, v2.S[2] // ...........*............................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v12.4S, v23.4S, v1.S[3] // ....................*...................... 
+ mul v14.4S, v7.4S, v2.S[0] // ...................*....................... + sqrdmulh v26.4S, v7.4S, v2.S[1] // ................*.......................... + mul v23.4S, v23.4S, v1.S[2] // .....................*..................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v25.4S, v6.4S, v8.S[0] // ......................*.................... + mls v5.4S, v22.4S, v8.S[0] // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v18.4S, v10.4S, v16.4S // ........*.................................. + mls v20.4S, v17.4S, v8.S[0] // ..............*............................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v23.4S, v12.4S, v8.S[0] // ................................*.......... + mls v14.4S, v26.4S, v8.S[0] // ...........................*............... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v6.4S, v18.4S, v25.4S // ...............................*........... 
+ sub v19.4S, v18.4S, v25.4S // ..............................*............ + add v12.4S, v29.4S, v5.4S // .........................*................. + sub v26.4S, v29.4S, v5.4S // ..........................*................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v18.4S, v24.4S, v20.4S // ........................*.................. + // gap // ........................................... + add v24.4S, v24.4S, v20.4S // .......................*................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v5.4S, v6.4S, v23.4S // .....................................*..... + sub v10.4S, v6.4S, v23.4S // ......................................*.... + str q12, [x0, #768] // ............................*.............. + str q26, [x0, #896] // ...................................*....... + sub v13.4S, v19.4S, v14.4S // ..................................*........ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v14.4S, v19.4S, v14.4S // ....................................*...... + str q18, [x0, #640] // .............................*............. + str q24, [x0, #512] // .................................*......... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ str q5, [x0], #(16) // ..........................................* + str q10, [x0, #112] // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q13, [x0, #368] // ........................................*.. + str q14, [x0, #240] // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // add v4.4S, v30.4S, v19.4S // .*......................................... + // mls v29.4S, v12.4S, v8.S[0] // ..*........................................ + // add v27.4S, v20.4S, v22.4S // ....*...................................... + // sub v5.4S, v20.4S, v22.4S // .....*..................................... + // add v6.4S, v31.4S, v28.4S // ...*....................................... + // sub v23.4S, v31.4S, v28.4S // *.......................................... + // mul v28.4S, v4.4S, v0.S[2] // .......*................................... + // sub v24.4S, v10.4S, v16.4S // ......*.................................... + // add v18.4S, v10.4S, v16.4S // .....................*..................... + // mul v22.4S, v23.4S, v3.S[0] // .........*................................. + // sqrdmulh v11.4S, v6.4S, v2.S[3] // .............*............................. 
+ // mul v14.4S, v6.4S, v2.S[2] // ..............*............................ + // sqrdmulh v25.4S, v23.4S, v3.S[1] // ..........*................................ + // sqrdmulh v20.4S, v4.4S, v0.S[3] // ........*.................................. + // mls v14.4S, v11.4S, v8.S[0] // ......................*.................... + // add v23.4S, v24.4S, v29.4S // ...........*............................... + // sqrdmulh v17.4S, v5.4S, v2.S[1] // .................*......................... + // mls v22.4S, v25.4S, v8.S[0] // ....................*...................... + // sub v25.4S, v24.4S, v29.4S // ............*.............................. + // mul v13.4S, v5.4S, v2.S[0] // ................*.......................... + // sqrdmulh v31.4S, v27.4S, v1.S[3] // ...............*........................... + // mul v4.4S, v27.4S, v1.S[2] // ..................*........................ + // mls v28.4S, v20.4S, v8.S[0] // ...................*....................... + // add v29.4S, v23.4S, v14.4S // ..............................*............ + // sub v5.4S, v23.4S, v14.4S // .............................*............. + // add v11.4S, v25.4S, v22.4S // ...........................*............... + // sub v27.4S, v25.4S, v22.4S // ............................*.............. + // mls v13.4S, v17.4S, v8.S[0] // ........................*.................. + // str q11, [x0, #768] // .................................*......... + // str q5, [x0, #640] // .....................................*..... + // sub v9.4S, v18.4S, v28.4S // ..........................*................ + // add v5.4S, v18.4S, v28.4S // .........................*................. + // mls v4.4S, v31.4S, v8.S[0] // .......................*................... + // str q29, [x0, #512] // ......................................*.... + // sub v15.4S, v9.4S, v13.4S // ...................................*....... + // str q27, [x0, #896] // ..................................*........ 
+ // add v18.4S, v9.4S, v13.4S // ....................................*...... + // add v11.4S, v5.4S, v4.4S // ...............................*........... + // sub v9.4S, v5.4S, v4.4S // ................................*.......... + // str q18, [x0, #256] // ..........................................* + // str q15, [x0, #384] // .........................................*. + // str q9, [x0, #128] // ........................................*.. + // str q11, [x0], #(16) // .......................................*... restore inp, STACK0 @@ -942,986 +942,1022 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - // gap // ............................................................ - ldr q24, [x4], #64 // *........................................................... - ldr q25, [x2, #64] // .*.......................................................... - // gap // ............................................................ - // gap // ............................................................ - ldr q21, [x2, #112] // ....*....................................................... - // gap // ............................................................ - // gap // ............................................................ - ldr q5, [x4, #-32] // ..................................*......................... - ldr q23, [x2, #96] // ...*........................................................ - ldr q13, [x1, #96] // ............*............................................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - ldr q4, [x1, #112] // .................*.......................................... 
- ldr q22, [x2, #80] // ..*......................................................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - ldr q11, [x4, #-48] // .....................*...................................... - ldr q7, [x1, #80] // ........*................................................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mul v26.4S, v21.4S, v24.S[0] // .............*.............................................. - sqrdmulh v6.4S, v21.4S, v24.S[1] // ..............*............................................. - // gap // ............................................................ - sqrdmulh v19.4S, v25.4S, v24.S[1] // .....*...................................................... - ldr q21, [x5, #176] // .........................................................*.. - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sqrdmulh v16.4S, v23.4S, v24.S[1] // ......*..................................................... - mul v28.4S, v23.4S, v24.S[0] // .......*.................................................... 
- mul v18.4S, v25.4S, v24.S[0] // .........*.................................................. - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sqrdmulh v31.4S, v22.4S, v24.S[1] // ...........*................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mul v29.4S, v22.4S, v24.S[0] // ..........*................................................. - mls v26.4S, v6.4S, v8.S[0] // ...................*........................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mls v28.4S, v16.4S, v8.S[0] // ...............*............................................ - mls v18.4S, v19.4S, v8.S[0] // ..................*......................................... - // gap // ............................................................ 
- // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sub v10.4S, v4.4S, v26.4S // ......................*..................................... - add v4.4S, v4.4S, v26.4S // .......................*.................................... - mls v29.4S, v31.4S, v8.S[0] // ................*........................................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - add v12.4S, v13.4S, v28.4S // ....................*....................................... - sub v27.4S, v13.4S, v28.4S // ........................*................................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ 
- // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sqrdmulh v16.4S, v10.4S, v11.S[1] // ..........................*................................. - mul v19.4S, v10.4S, v11.S[0] // ...........................*................................ - sqrdmulh v22.4S, v4.4S, v24.S[3] // .........................*.................................. - mul v3.4S, v4.4S, v24.S[2] // ..............................*............................. - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sqrdmulh v17.4S, v12.4S, v24.S[3] // .........................................*.................. - mul v14.4S, v12.4S, v24.S[2] // ...........................................*................ - sqrdmulh v9.4S, v27.4S, v11.S[1] // .............................*.............................. - sub v6.4S, v7.4S, v29.4S // .................................*.......................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - add v26.4S, v7.4S, v29.4S // ....................................*....................... - mul v12.4S, v27.4S, v11.S[0] // ...............................*............................ - ldr q29, [x4, #-16] // .....................................*...................... - // gap // ............................................................ - // gap // ............................................................ 
- // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mls v19.4S, v16.4S, v8.S[0] // ................................*........................... - ldr q16, [x1, #64] // ........................................*................... - mls v3.4S, v22.4S, v8.S[0] // ...................................*........................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mls v14.4S, v17.4S, v8.S[0] // .....................................................*...... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mls v12.4S, v9.4S, v8.S[0] // ......................................*..................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ 
- // gap // ............................................................ - sub v31.4S, v6.4S, v19.4S // .......................................*.................... - add v28.4S, v6.4S, v19.4S // ............................................*............... - add v25.4S, v26.4S, v3.4S // ...............................................*............ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - sub v0.4S, v16.4S, v18.4S // ....................................................*....... - add v22.4S, v16.4S, v18.4S // ........................................................*... - sub v16.4S, v26.4S, v3.4S // ..........................................*................. - ldr q3, [x5, #112] // ............................*............................... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mul v20.4S, v31.4S, v29.S[0] // .............................................*.............. - sqrdmulh v7.4S, v31.4S, v29.S[1] // ..............................................*............. - mul v15.4S, v25.4S, v11.S[2] // ......................................................*..... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ 
- // gap // ............................................................ - sqrdmulh v9.4S, v25.4S, v11.S[3] // .......................................................*.... - mul v1.4S, v16.4S, v5.S[0] // ................................................*........... - sqrdmulh v16.4S, v16.4S, v5.S[1] // .................................................*.......... - ldr q11, [x5, #32] // ..........................................................*. - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mul v29.4S, v28.4S, v5.S[2] // ...................................................*........ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - mls v20.4S, v7.4S, v8.S[0] // ...........................................................* - sqrdmulh v7.4S, v28.4S, v5.S[3] // ..................................................*......... - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ - // gap // ............................................................ 
- - // original source code - // ldr q13, [x4], #64 // *........................................................... - // ldr q27, [x2, #64] // .*.......................................................... - // ldr q12, [x2, #80] // .......*.................................................... - // ldr q4, [x2, #96] // ....*....................................................... - // ldr q19, [x2, #112] // ..*......................................................... - // sqrdmulh v24.4S, v27.4S, v13.S[1] // ............*............................................... - // sqrdmulh v7.4S, v4.4S, v13.S[1] // ..............*............................................. - // mul v16.4S, v4.4S, v13.S[0] // ...............*............................................ - // ldr q0, [x1, #80] // .........*.................................................. - // mul v22.4S, v27.4S, v13.S[0] // ................*........................................... - // mul v26.4S, v12.4S, v13.S[0] // ..................*......................................... - // sqrdmulh v14.4S, v12.4S, v13.S[1] // .................*.......................................... - // ldr q23, [x1, #96] // .....*...................................................... - // mul v12.4S, v19.4S, v13.S[0] // ..........*................................................. - // sqrdmulh v17.4S, v19.4S, v13.S[1] // ...........*................................................ - // mls v16.4S, v7.4S, v8.S[0] // ....................*....................................... - // mls v26.4S, v14.4S, v8.S[0] // ........................*................................... - // ldr q6, [x1, #112] // ......*..................................................... - // mls v22.4S, v24.4S, v8.S[0] // .....................*...................................... - // mls v12.4S, v17.4S, v8.S[0] // ...................*........................................ 
- // add v19.4S, v23.4S, v16.4S // .........................*.................................. - // ldr q24, [x4, #-48] // ........*................................................... - // sub v29.4S, v6.4S, v12.4S // ......................*..................................... - // add v18.4S, v6.4S, v12.4S // .......................*.................................... - // sub v23.4S, v23.4S, v16.4S // ..........................*................................. - // sqrdmulh v16.4S, v18.4S, v13.S[3] // .............................*.............................. - // sqrdmulh v6.4S, v29.4S, v24.S[1] // ...........................*................................ - // mul v25.4S, v29.4S, v24.S[0] // ............................*............................... - // ldr q3, [x5, #112] // .................................................*.......... - // sqrdmulh v9.4S, v23.4S, v24.S[1] // .................................*.......................... - // mul v10.4S, v18.4S, v13.S[2] // ..............................*............................. - // mul v12.4S, v23.4S, v24.S[0] // ....................................*....................... - // mls v25.4S, v6.4S, v8.S[0] // ......................................*..................... - // sub v1.4S, v0.4S, v26.4S // ..................................*......................... - // ldr q28, [x4, #-32] // ...*........................................................ - // mls v10.4S, v16.4S, v8.S[0] // ........................................*................... - // add v26.4S, v0.4S, v26.4S // ...................................*........................ - // ldr q0, [x4, #-16] // .....................................*...................... - // mls v12.4S, v9.4S, v8.S[0] // ..........................................*................. - // sub v11.4S, v1.4S, v25.4S // ...........................................*................ - // ldr q31, [x1, #64] // .......................................*.................... 
- // sqrdmulh v9.4S, v19.4S, v13.S[3] // ...............................*............................ - // sub v21.4S, v26.4S, v10.4S // ................................................*........... - // mul v14.4S, v19.4S, v13.S[2] // ................................*........................... - // add v13.4S, v1.4S, v25.4S // ............................................*............... - // mul v20.4S, v11.4S, v0.S[0] // ..................................................*......... - // sqrdmulh v17.4S, v11.4S, v0.S[1] // ...................................................*........ - // add v18.4S, v26.4S, v10.4S // .............................................*.............. - // mul v1.4S, v21.4S, v28.S[0] // ......................................................*..... - // sqrdmulh v16.4S, v21.4S, v28.S[1] // .......................................................*.... - // sqrdmulh v7.4S, v13.4S, v28.S[3] // ...........................................................* - // mul v29.4S, v13.4S, v28.S[2] // .........................................................*.. - // sub v0.4S, v31.4S, v22.4S // ..............................................*............. - // mls v14.4S, v9.4S, v8.S[0] // .........................................*.................. - // mul v15.4S, v18.4S, v24.S[2] // ....................................................*....... - // sqrdmulh v9.4S, v18.4S, v24.S[3] // .....................................................*...... - // add v22.4S, v31.4S, v22.4S // ...............................................*............ - // ldr q21, [x5, #176] // .............*.............................................. - // ldr q11, [x5, #32] // ........................................................*... - // mls v20.4S, v17.4S, v8.S[0] // ..........................................................*. 
+ // Instructions: 93 + // Expected cycles: 32 + // Expected IPC: 2.91 + // + // Wall time: 15.65s + // User time: 15.65s + // + // ------------------------------------ original position -------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|----------------- + // gap // ............................................................................................. + ldr q28, [x2, #112] // .*........................................................................................... + ldr q15, [x4], #64 // *............................................................................................ + ldr q5, [x2, #96] // ....*........................................................................................ + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + ldr q4, [x4, #-32] // .....................................*....................................................... + ldr q12, [x1, #64] // .......*..................................................................................... + ldr q18, [x5, #96] // ........................................................................*.................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + ldr q21, [x2, #80] // ..*.......................................................................................... + ldr q0, [x1, #96] // ...............*............................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + ldr q13, [x4, #-48] // .......................*..................................................................... + ldr q27, [x5, #112] // ...................................................................*......................... + ldr q10, [x2, #64] // ...*......................................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ mul v29.4S, v28.4S, v15.S[0] // ...........*................................................................................. + sqrdmulh v28.4S, v28.4S, v15.S[1] // ..........*.................................................................................. + ldr q9, [x1, #80] // .....*....................................................................................... + add x2, x2, #64 // ...........................................................................................*. + sqrdmulh v26.4S, v5.4S, v15.S[1] // ............*................................................................................ + mul v16.4S, v5.4S, v15.S[0] // .............*............................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + ldr q19, [x1, #112] // ................*............................................................................ + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sqrdmulh v30.4S, v21.4S, v15.S[1] // ........*.................................................................................... 
+ mul v1.4S, v21.4S, v15.S[0] // .........*................................................................................... + ldr q14, [x4, #-16] // ......*...................................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + add x1, x1, #64 // ............................................................................................* + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v29.4S, v28.4S, v8.S[0] // .................*........................................................................... + mul v11.4S, v10.4S, v15.S[0] // ..............*.............................................................................. + sqrdmulh v6.4S, v10.4S, v15.S[1] // ...................*......................................................................... + mls v16.4S, v26.4S, v8.S[0] // .....................*....................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v1.4S, v30.4S, v8.S[0] // ..................*.......................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sub v25.4S, v19.4S, v29.4S // ..........................*.................................................................. + add v19.4S, v19.4S, v29.4S // ...........................*................................................................. + mls v11.4S, v6.4S, v8.S[0] // .........................*................................................................... 
+ add v5.4S, v0.4S, v16.4S // ..................................*.......................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sub v7.4S, v0.4S, v16.4S // ................................*............................................................ + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mul v29.4S, v19.4S, v15.S[2] // ............................*................................................................ + sqrdmulh v3.4S, v19.4S, v15.S[3] // .............................*............................................................... + mul v22.4S, v5.4S, v15.S[2] // .........................................*................................................... + sqrdmulh v30.4S, v5.4S, v15.S[3] // ..........................................*.................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mul v28.4S, v25.4S, v13.S[0] // ..............................*.............................................................. + sub v17.4S, v9.4S, v1.4S // .......................................*..................................................... + sqrdmulh v16.4S, v25.4S, v13.S[1] // ...............................*............................................................. + sqrdmulh v31.4S, v7.4S, v13.S[1] // ...................................*......................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + add v23.4S, v12.4S, v11.4S // .................................*........................................................... + sub v20.4S, v12.4S, v11.4S // ...........................................................*................................. + mul v10.4S, v7.4S, v13.S[0] // ........................................*.................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v29.4S, v3.4S, v8.S[0] // ....................................*........................................................ + mls v22.4S, v30.4S, v8.S[0] // .................................................*........................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v28.4S, v16.4S, v8.S[0] // ......................................*...................................................... + add v16.4S, v9.4S, v1.4S // ........................*.................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + add v5.4S, v23.4S, v22.4S // ............................................................*................................ + sub v21.4S, v16.4S, v29.4S // ............................................*................................................ + sub v22.4S, v23.4S, v22.4S // ..........................................................*.................................. + add v30.4S, v16.4S, v29.4S // .............................................*............................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ sub v6.4S, v17.4S, v28.4S // ...............................................*............................................. + mls v10.4S, v31.4S, v8.S[0] // ................................................*............................................ + add v2.4S, v17.4S, v28.4S // ..............................................*.............................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sqrdmulh v25.4S, v30.4S, v13.S[3] // ......................................................*...................................... + mul v29.4S, v30.4S, v13.S[2] // .......................................................*..................................... + mul v13.4S, v21.4S, v4.S[0] // ........................................................*.................................... + sqrdmulh v1.4S, v21.4S, v4.S[1] // .........................................................*................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mul v0.4S, v6.4S, v14.S[0] // ...................................................*......................................... 
+ sqrdmulh v28.4S, v6.4S, v14.S[1] // .....................................................*....................................... + mul v14.4S, v2.4S, v4.S[2] // ..................................................*.......................................... + sqrdmulh v4.4S, v2.4S, v4.S[3] // ....................................................*........................................ + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sub v19.4S, v20.4S, v10.4S // ...............................................................*............................. + add v24.4S, v20.4S, v10.4S // ..................................................................*.......................... + ldr q10, [x5, #176] // ......................*...................................................................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v29.4S, v25.4S, v8.S[0] // ................................................................*............................ + mls v13.4S, v1.4S, v8.S[0] // .................................................................*........................... 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + mls v0.4S, v28.4S, v8.S[0] // ..............................................................*.............................. + mls v14.4S, v4.4S, v8.S[0] // .............................................................*............................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + sub v26.4S, v5.4S, v29.4S // ..........................................................................*.................. + sub v6.4S, v22.4S, v13.4S // ............................................................................*................ + add v15.4S, v22.4S, v13.4S // ...........................................................................*................. + add v28.4S, v5.4S, v29.4S // .........................................................................*................... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + add v16.4S, v19.4S, v0.4S // ......................................................................*...................... + add v22.4S, v24.4S, v14.4S // .......................................................................*..................... + sub v4.4S, v19.4S, v0.4S // .....................................................................*....................... + sub v13.4S, v24.4S, v14.4S // ....................................................................*........................ + ldr q14, [x5, #160] // ........................................................................................*.... 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + trn2 v24.4S, v28.4S, v26.4S // ..................................................................................*.......... + trn1 v30.4S, v28.4S, v26.4S // ....................................................................................*........ + trn2 v20.4S, v15.4S, v6.4S // ...................................................................................*......... + trn1 v29.4S, v15.4S, v6.4S // .....................................................................................*....... + ldr q28, [x5, #16] // ...........................................*................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + trn2 v3.4S, v16.4S, v4.4S // .............................................................................*............... + trn1 v16.4S, v16.4S, v4.4S // ................................................................................*............ + trn1 v31.4S, v22.4S, v13.4S // ..............................................................................*.............. + trn2 v11.4S, v22.4S, v13.4S // ...............................................................................*............. + ldr q22, [x5, #144] // ....................*........................................................................ + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + // gap // ............................................................................................. + ldr q26, [x5], #(12*16) // .................................................................................*........... + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. + trn2 v4.2D, v31.2D, v16.2D // ......................................................................................*...... + trn2 v13.2D, v11.2D, v3.2D // .......................................................................................*..... + trn2 v15.2D, v24.2D, v20.2D // .........................................................................................*... + trn2 v21.2D, v30.2D, v29.2D // ..........................................................................................*.. + // gap // ............................................................................................. + // gap // ............................................................................................. + // gap // ............................................................................................. 
+ // gap // ............................................................................................. + + // --------------------------------------- new position ---------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|----------------- + // ldr q23, [x4], #64 // .*........................................................................................... + // ldr q5, [x2, #112] // *............................................................................................ + // ldr q7, [x2, #80] // ......*...................................................................................... + // ldr q12, [x2, #64] // ..........*.................................................................................. + // ldr q17, [x2, #96] // ..*.......................................................................................... + // ldr q0, [x1, #80] // .............*............................................................................... + // ldr q31, [x4, #-16] // ....................*........................................................................ + // ldr q21, [x1, #64] // ....*........................................................................................ + // sqrdmulh v20.4S, v7.4S, v23.S[1] // ..................*.......................................................................... + // mul v16.4S, v7.4S, v23.S[0] // ...................*......................................................................... + // sqrdmulh v27.4S, v5.4S, v23.S[1] // ............*................................................................................ + // mul v5.4S, v5.4S, v23.S[0] // ...........*................................................................................. + // sqrdmulh v26.4S, v17.4S, v23.S[1] // ...............*............................................................................. 
+ // mul v19.4S, v17.4S, v23.S[0] // ................*............................................................................ + // mul v11.4S, v12.4S, v23.S[0] // .......................*..................................................................... + // ldr q25, [x1, #96] // .......*..................................................................................... + // ldr q6, [x1, #112] // .................*........................................................................... + // mls v5.4S, v27.4S, v8.S[0] // ......................*...................................................................... + // mls v16.4S, v20.4S, v8.S[0] // ..........................*.................................................................. + // sqrdmulh v30.4S, v12.4S, v23.S[1] // ........................*.................................................................... + // ldr q22, [x5, #144] // .......................................................................................*..... + // mls v19.4S, v26.4S, v8.S[0] // .........................*................................................................... + // ldr q10, [x5, #176] // ................................................................*............................ + // ldr q17, [x4, #-48] // ........*.................................................................................... + // add v15.4S, v0.4S, v16.4S // ..............................................*.............................................. + // mls v11.4S, v30.4S, v8.S[0] // .............................*............................................................... + // sub v3.4S, v6.4S, v5.4S // ...........................*................................................................. + // add v5.4S, v6.4S, v5.4S // ............................*................................................................ 
+ // mul v30.4S, v5.4S, v23.S[2] // ................................*............................................................ + // sqrdmulh v5.4S, v5.4S, v23.S[3] // .................................*........................................................... + // mul v12.4S, v3.4S, v17.S[0] // ....................................*........................................................ + // sqrdmulh v24.4S, v3.4S, v17.S[1] // ......................................*...................................................... + // sub v13.4S, v25.4S, v19.4S // ...............................*............................................................. + // add v20.4S, v21.4S, v11.4S // ........................................*.................................................... + // add v26.4S, v25.4S, v19.4S // ..............................*.............................................................. + // sqrdmulh v19.4S, v13.4S, v17.S[1] // .......................................*..................................................... + // mls v30.4S, v5.4S, v8.S[0] // ...........................................*................................................. + // ldr q5, [x4, #-32] // ...*......................................................................................... + // mls v12.4S, v24.4S, v8.S[0] // .............................................*............................................... + // sub v25.4S, v0.4S, v16.4S // .....................................*....................................................... + // mul v13.4S, v13.4S, v17.S[0] // ..........................................*.................................................. + // mul v0.4S, v26.4S, v23.S[2] // ..................................*.......................................................... + // sqrdmulh v26.4S, v26.4S, v23.S[3] // ...................................*......................................................... 
+ // ldr q28, [x5, #16] // ..................................................................................*.......... + // sub v6.4S, v15.4S, v30.4S // ................................................*............................................ + // add v30.4S, v15.4S, v30.4S // ..................................................*.......................................... + // add v2.4S, v25.4S, v12.4S // .....................................................*....................................... + // sub v25.4S, v25.4S, v12.4S // ...................................................*......................................... + // mls v13.4S, v19.4S, v8.S[0] // ....................................................*........................................ + // mls v0.4S, v26.4S, v8.S[0] // ............................................*................................................ + // mul v12.4S, v2.4S, v5.S[2] // ............................................................*................................ + // mul v23.4S, v25.4S, v31.S[0] // ..........................................................*.................................. + // sqrdmulh v26.4S, v2.4S, v5.S[3] // .............................................................*............................... + // sqrdmulh v15.4S, v25.4S, v31.S[1] // ...........................................................*................................. + // sqrdmulh v2.4S, v30.4S, v17.S[3] // ......................................................*...................................... + // mul v30.4S, v30.4S, v17.S[2] // .......................................................*..................................... + // mul v19.4S, v6.4S, v5.S[0] // ........................................................*.................................... + // sqrdmulh v31.4S, v6.4S, v5.S[1] // .........................................................*................................... 
+ // sub v25.4S, v20.4S, v0.4S // .................................................*........................................... + // sub v27.4S, v21.4S, v11.4S // .........................................*................................................... + // add v5.4S, v20.4S, v0.4S // ...............................................*............................................. + // mls v12.4S, v26.4S, v8.S[0] // ....................................................................*........................ + // mls v23.4S, v15.4S, v8.S[0] // ...................................................................*......................... + // sub v0.4S, v27.4S, v13.4S // ..............................................................*.............................. + // mls v30.4S, v2.4S, v8.S[0] // .................................................................*........................... + // mls v19.4S, v31.4S, v8.S[0] // ..................................................................*.......................... + // add v31.4S, v27.4S, v13.4S // ...............................................................*............................. + // ldr q27, [x5, #112] // .........*................................................................................... + // sub v29.4S, v31.4S, v12.4S // ............................................................................*................ + // sub v13.4S, v0.4S, v23.4S // ...........................................................................*................. + // add v7.4S, v0.4S, v23.4S // .........................................................................*................... + // add v26.4S, v31.4S, v12.4S // ..........................................................................*.................. + // ldr q18, [x5, #96] // .....*....................................................................................... 
+ // add v9.4S, v5.4S, v30.4S // ........................................................................*.................... + // sub v6.4S, v5.4S, v30.4S // .....................................................................*....................... + // add v21.4S, v25.4S, v19.4S // .......................................................................*..................... + // sub v25.4S, v25.4S, v19.4S // ......................................................................*...................... + // trn2 v3.4S, v7.4S, v13.4S // ...................................................................................*......... + // trn1 v31.4S, v26.4S, v29.4S // .....................................................................................*....... + // trn2 v11.4S, v26.4S, v29.4S // ......................................................................................*...... + // trn1 v16.4S, v7.4S, v13.4S // ....................................................................................*........ + // ldr q26, [x5], #(12*16) // ........................................................................................*.... + // trn2 v24.4S, v9.4S, v6.4S // ..............................................................................*.............. + // trn2 v20.4S, v21.4S, v25.4S // ................................................................................*............ + // trn1 v30.4S, v9.4S, v6.4S // ...............................................................................*............. + // trn1 v29.4S, v21.4S, v25.4S // .................................................................................*........... + // trn2 v4.2D, v31.2D, v16.2D // .........................................................................................*... + // trn2 v13.2D, v11.2D, v3.2D // ..........................................................................................*.. 
+ // ldr q14, [x5, #-32] // .............................................................................*............... + // trn2 v15.2D, v24.2D, v20.2D // ...........................................................................................*. + // trn2 v21.2D, v30.2D, v29.2D // ............................................................................................* + // add x2, x2, #64 // ..............*.............................................................................. + // add x1, x1, #64 // .....................*....................................................................... sub count, count, #1 layer45678_start: - add x2, x2, #64 // .........*...................................................................................................................................... + // Instructions: 144 + // Expected cycles: 32 + // Expected IPC: 4.50 + // + // Wall time: 2191.91s + // User time: 2191.91s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + sqrdmulh v25.4S, v13.4S, v27.4S // ...............................................................................................................................*................ + ldr q23, [x4], #64 // ..........e..................................................................................................................................... + ldr q5, [x2, #176] // .......e........................................................................................................................................ + mul v6.4S, v13.4S, v18.4S // ................................................................................................................................*............... 
+ ldr q7, [x2, #144] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + mul v2.4S, v4.4S, v18.4S // ...........................................................................................................................*.................... + sqrdmulh v19.4S, v4.4S, v27.4S // ..........................................................................................................................*..................... + sqrdmulh v9.4S, v21.4S, v28.4S // ................................................................................................*............................................... + sqrdmulh v1.4S, v15.4S, v28.4S // .....................................................................................................*.......................................... + mul v13.4S, v15.4S, v26.4S // ......................................................................................................*......................................... + ldr q12, [x2, #128] // ....e........................................................................................................................................... + ldr q17, [x2, #160] // ......e......................................................................................................................................... + ldr q0, [x1, #144] // .e.............................................................................................................................................. + trn1 v15.2D, v31.2D, v16.2D // ........................................................................................*....................................................... 
// gap // ................................................................................................................................................ - sub v18.4S, v0.4S, v12.4S // ...............................................*................................................................................................ + ldr q4, [x5, #-144] // .............................................................................................*.................................................. + trn1 v3.2D, v11.2D, v3.2D // .........................................................................................*...................................................... + ldr q31, [x4, #-16] // .............e.................................................................................................................................. + mul v18.4S, v21.4S, v26.4S // .................................................................................................*.............................................. // gap // ................................................................................................................................................ - add v28.4S, v0.4S, v12.4S // ................................................*............................................................................................... - mls v1.4S, v16.4S, v8.S[0] // .............................................................*.................................................................................. - mls v29.4S, v7.4S, v8.S[0] // ..................................................................*............................................................................. - ldr q13, [x4], #64 // ..........e..................................................................................................................................... 
// gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - ldr q27, [x2, #128] // ....e........................................................................................................................................... - ldr q12, [x2, #144] // .....e.......................................................................................................................................... - add v2.4S, v22.4S, v14.4S // ......................................*......................................................................................................... - sub v17.4S, v22.4S, v14.4S // .....................................*.......................................................................................................... - mls v15.4S, v9.4S, v8.S[0] // ........................................................*....................................................................................... - sub v7.4S, v18.4S, v20.4S // ........................................................................*....................................................................... - add v26.4S, v18.4S, v20.4S // .........................................................................*...................................................................... // gap // ................................................................................................................................................ 
+ ldr q21, [x1, #128] // e............................................................................................................................................... + mls v2.4S, v19.4S, v8.S[0] // ............................................................................................................................*................... + mls v6.4S, v25.4S, v8.S[0] // .................................................................................................................................*.............. + trn1 v24.2D, v24.2D, v20.2D // .................................................................................*.............................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - ldr q4, [x2, #160] // ......e......................................................................................................................................... + sqrdmulh v20.4S, v7.4S, v23.S[1] // ...................e............................................................................................................................ + mul v16.4S, v7.4S, v23.S[0] // ....................e........................................................................................................................... + ldr q7, [x5, #-64] // ......................................................................................................................*......................... 
// gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + sqrdmulh v27.4S, v5.4S, v23.S[1] // .............................e.................................................................................................................. + mul v5.4S, v5.4S, v23.S[0] // ..............................e................................................................................................................. // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v17.4S, v23.S[1] // ........................e....................................................................................................................... + mul v19.4S, v17.4S, v23.S[0] // .........................e...................................................................................................................... + mul v11.4S, v12.4S, v23.S[0] // ...............e................................................................................................................................ + ldr q25, [x1, #160] // ..e............................................................................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ 
- sub v31.4S, v17.4S, v1.4S // ..............................................................*................................................................................. - add v6.4S, v17.4S, v1.4S // ...............................................................*................................................................................ - add v23.4S, v28.4S, v29.4S // ....................................................................*........................................................................... - sub v0.4S, v28.4S, v29.4S // ...................................................................*............................................................................ // gap // ................................................................................................................................................ + trn1 v29.2D, v30.2D, v29.2D // ................................................................................*............................................................... + sub v28.4S, v15.4S, v2.4S // .............................................................................................................................*.................. + sub v17.4S, v3.4S, v6.4S // ..................................................................................................................................*............. + mls v13.4S, v1.4S, v8.S[0] // .......................................................................................................*........................................ // gap // ................................................................................................................................................ - sub v17.4S, v2.4S, v15.4S // .........................................................*...................................................................................... 
- trn1 v9.4S, v26.4S, v7.4S // ....................................................................................*........................................................... - trn2 v25.4S, v26.4S, v7.4S // .....................................................................................*.......................................................... - add v18.4S, v2.4S, v15.4S // ..........................................................*..................................................................................... + add v1.4S, v3.4S, v6.4S // ...................................................................................................................................*............ + ldr q6, [x1, #176] // ...e............................................................................................................................................ // gap // ................................................................................................................................................ - ldr q19, [x2, #176] // .......e........................................................................................................................................ - trn1 v10.4S, v23.4S, v0.4S // ..................................................................................*............................................................. // gap // ................................................................................................................................................ + mls v5.4S, v27.4S, v8.S[0] // ...............................e................................................................................................................ + mls v18.4S, v9.4S, v8.S[0] // ..................................................................................................*............................................. 
+ mls v16.4S, v20.4S, v8.S[0] // .....................e.......................................................................................................................... + sqrdmulh v30.4S, v12.4S, v23.S[1] // ..............e................................................................................................................................. // gap // ................................................................................................................................................ - add x1, x1, #64 // ........*....................................................................................................................................... - sqrdmulh v24.4S, v27.4S, v13.S[1] // ...............e................................................................................................................................ - trn2 v2.4S, v6.4S, v31.4S // .............................................................................*.................................................................. - ldr q5, [x5, #96] // ....................................................................................................................*........................... - trn2 v29.4S, v23.4S, v0.4S // ...................................................................................*............................................................ - trn2 v30.4S, v18.4S, v17.4S // ...........................................................................*.................................................................... - sqrdmulh v7.4S, v4.4S, v13.S[1] // .........................e...................................................................................................................... - mul v16.4S, v4.4S, v13.S[0] // ........................e....................................................................................................................... 
- ldr q0, [x1, #144] // .e.............................................................................................................................................. - mul v22.4S, v27.4S, v13.S[0] // ..............e................................................................................................................................. // gap // ................................................................................................................................................ - ldr q20, [x5], #(12*16) // ..........................................................................................*..................................................... - ldr q4, [x5, #-176] // ...........................................................................................*.................................................... - trn1 v28.2D, v10.2D, v9.2D // ........................................................................................*....................................................... - trn2 v1.2D, v29.2D, v25.2D // .......................................................................................*........................................................ // gap // ................................................................................................................................................ - mul v26.4S, v12.4S, v13.S[0] // ...................e............................................................................................................................ + ldr q9, [x5, #-160] // ............................................................................................*................................................... + sqrdmulh v12.4S, v17.4S, v10.4S // .........................................................................................................................................*...... 
+ sqrdmulh v27.4S, v1.4S, v22.4S // ....................................................................................................................................*........... + ldr q22, [x5, #144] // .......................................................................................................................e........................ + mls v19.4S, v26.4S, v8.S[0] // ..........................e..................................................................................................................... + ldr q10, [x5, #176] // .........................................................................................................................e...................... + mul v26.4S, v17.4S, v14.4S // ..........................................................................................................................................*..... + ldr q17, [x4, #-48] // ...........e.................................................................................................................................... // gap // ................................................................................................................................................ + mul v1.4S, v1.4S, v7.4S // .....................................................................................................................................*.......... + ldr q7, [x5, #-128] // ..............................................................................................*................................................. + add v14.4S, v15.4S, v2.4S // ..............................................................................................................................*................. // gap // ................................................................................................................................................ 
+ add v2.4S, v24.4S, v13.4S // .........................................................................................................*...................................... // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v12.4S, v13.S[1] // ....................e........................................................................................................................... - trn2 v15.2D, v30.2D, v2.2D // ...............................................................................*................................................................ // gap // ................................................................................................................................................ - ldr q23, [x1, #160] // ..e............................................................................................................................................. - mul v12.4S, v19.4S, v13.S[0] // .............................e.................................................................................................................. + ldr q20, [x5, #-112] // ...............................................................................................*................................................ + add v15.4S, v0.4S, v16.4S // .......................e........................................................................................................................ + mls v11.4S, v30.4S, v8.S[0] // ................e............................................................................................................................... + sub v3.4S, v6.4S, v5.4S // ................................e............................................................................................................... 
+ add v5.4S, v6.4S, v5.4S // .................................e.............................................................................................................. // gap // ................................................................................................................................................ - trn1 v27.4S, v18.4S, v17.4S // ..........................................................................*..................................................................... - sqrdmulh v17.4S, v19.4S, v13.S[1] // ..............................e................................................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + mul v9.4S, v2.4S, v9.4S // ...........................................................................................................*.................................... + sqrdmulh v6.4S, v2.4S, v4.4S // ..........................................................................................................*..................................... + mls v26.4S, v12.4S, v8.S[0] // ...........................................................................................................................................*.... + sub v2.4S, v24.4S, v13.4S // ........................................................................................................*....................................... 
// gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - mls v16.4S, v7.4S, v8.S[0] // ..........................e..................................................................................................................... - mul v7.4S, v1.4S, v5.4S // ...............................................................................................................................*................ - sqrdmulh v19.4S, v1.4S, v3.4S // ................................................................................................................................*............... - trn1 v18.4S, v6.4S, v31.4S // ............................................................................*................................................................... - mls v26.4S, v14.4S, v8.S[0] // .....................e.......................................................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - ldr q6, [x1, #176] // ...e............................................................................................................................................ + mul v30.4S, v5.4S, v23.S[2] // ........................................e....................................................................................................... 
+ sqrdmulh v5.4S, v5.4S, v23.S[3] // .......................................e........................................................................................................ + mul v12.4S, v3.4S, v17.S[0] // ..................................................e............................................................................................. + sqrdmulh v24.4S, v3.4S, v17.S[1] // .................................................e.............................................................................................. // gap // ................................................................................................................................................ - trn2 v14.2D, v10.2D, v9.2D // ......................................................................................*......................................................... - mul v9.4S, v15.4S, v20.4S // .....................................................................................................*.......................................... - sqrdmulh v10.4S, v15.4S, v4.4S // ......................................................................................................*......................................... - mls v22.4S, v24.4S, v8.S[0] // ................e............................................................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ 
+ sub v13.4S, v25.4S, v19.4S // ...........................e.................................................................................................................... // gap // ................................................................................................................................................ - mls v12.4S, v17.4S, v8.S[0] // ...............................e................................................................................................................ - trn2 v15.2D, v27.2D, v18.2D // ..............................................................................*................................................................. - trn1 v31.2D, v27.2D, v18.2D // ................................................................................*............................................................... - mls v7.4S, v19.4S, v8.S[0] // .................................................................................................................................*.............. - add v19.4S, v23.4S, v16.4S // ............................e................................................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - ldr q24, [x4, #-48] // ...........e.................................................................................................................................... - mul v5.4S, v14.4S, v5.4S // ..........................................................................................................................*..................... 
- sqrdmulh v17.4S, v14.4S, v3.4S // ...........................................................................................................................*.................... + mul v7.4S, v2.4S, v7.4S // ................................................................................................................*............................... + sqrdmulh v2.4S, v2.4S, v20.4S // ...............................................................................................................*................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - trn1 v25.2D, v29.2D, v25.2D // .........................................................................................*...................................................... + sub v4.4S, v28.4S, v26.4S // ............................................................................................................................................*... + add v3.4S, v28.4S, v26.4S // .............................................................................................................................................*.. + add v20.4S, v21.4S, v11.4S // ..................e............................................................................................................................. // gap // ................................................................................................................................................ - mul v14.4S, v15.4S, v20.4S // ................................................................................................*............................................... 
- sqrdmulh v15.4S, v15.4S, v4.4S // .................................................................................................*.............................................. - mls v9.4S, v10.4S, v8.S[0] // .......................................................................................................*........................................ - ldr q27, [x5, #-32] // ........................................................................................................................*....................... + add v26.4S, v25.4S, v19.4S // ............................e................................................................................................................... // gap // ................................................................................................................................................ - sub v29.4S, v6.4S, v12.4S // ................................e............................................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v13.4S, v17.S[1] // ............................................e................................................................................................... + mls v30.4S, v5.4S, v8.S[0] // .........................................e...................................................................................................... + ldr q5, [x4, #-32] // ............e................................................................................................................................... 
+ mls v12.4S, v24.4S, v8.S[0] // ...................................................e............................................................................................ + sub v25.4S, v0.4S, v16.4S // ......................e......................................................................................................................... // gap // ................................................................................................................................................ - add v18.4S, v6.4S, v12.4S // .................................e.............................................................................................................. - trn1 v12.2D, v30.2D, v2.2D // .................................................................................*.............................................................. - ldr q30, [x5, #-128] // ..............................................................................................*................................................. - ldr q20, [x5, #-112] // ...............................................................................................*................................................ - mls v5.4S, v17.4S, v8.S[0] // ............................................................................................................................*................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v23.4S, v23.4S, v16.4S // ...........................e.................................................................................................................... 
- sub v1.4S, v25.4S, v7.4S // ..................................................................................................................................*............. - ldr q10, [x5, #-144] // .............................................................................................*.................................................. - ldr q2, [x5, #-64] // ......................................................................................................................*......................... - ldr q17, [x5, #-48] // .......................................................................................................................*........................ + mul v13.4S, v13.4S, v17.S[0] // .............................................e.................................................................................................. + mul v0.4S, v26.4S, v23.S[2] // ...................................e............................................................................................................ + sqrdmulh v26.4S, v26.4S, v23.S[3] // ..................................e............................................................................................................. + mls v7.4S, v2.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q28, [x5, #16] // ...........................................................................................e.................................................... + sub v24.4S, v29.4S, v18.4S // ...................................................................................................*............................................ + mls v9.4S, v6.4S, v8.S[0] // ............................................................................................................*................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v6.4S, v15.4S, v30.4S // ..........................................e..................................................................................................... + // gap // ................................................................................................................................................ + add v30.4S, v15.4S, v30.4S // ...........................................e.................................................................................................... 
+ // gap // ................................................................................................................................................ + add v2.4S, v25.4S, v12.4S // .....................................................e.......................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + sub v25.4S, v25.4S, v12.4S // ....................................................e........................................................................................... + add v16.4S, v24.4S, v7.4S // ...................................................................................................................*............................ + mls v13.4S, v19.4S, v8.S[0] // ..............................................e................................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sqrdmulh v16.4S, v18.4S, v13.S[3] // ........................................e....................................................................................................... - sub v4.4S, v12.4S, v9.4S // ........................................................................................................*....................................... 
- mls v14.4S, v15.4S, v8.S[0] // ..................................................................................................*............................................. - sqrdmulh v6.4S, v29.4S, v24.S[1] // ..................................................e............................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + mls v0.4S, v26.4S, v8.S[0] // ....................................e........................................................................................................... + mul v12.4S, v2.4S, v5.S[2] // .................................................................e.............................................................................. + mul v23.4S, v25.4S, v31.S[0] // ......................................................................e......................................................................... + sqrdmulh v26.4S, v2.4S, v5.S[3] // ................................................................e............................................................................... // gap // ................................................................................................................................................ - mul v27.4S, v1.4S, v27.4S // .........................................................................................................................................*...... 
- sqrdmulh v21.4S, v1.4S, v21.4S // ..........................................................................................................................................*..... - add v1.4S, v12.4S, v9.4S // .........................................................................................................*...................................... - add v3.4S, v25.4S, v7.4S // ...................................................................................................................................*............ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + sqrdmulh v15.4S, v25.4S, v31.S[1] // .....................................................................e.......................................................................... + sqrdmulh v2.4S, v30.4S, v17.S[3] // ......................................................e......................................................................................... + mul v30.4S, v30.4S, v17.S[2] // .......................................................e........................................................................................ + mul v19.4S, v6.4S, v5.S[0] // ............................................................e................................................................................... + sqrdmulh v31.4S, v6.4S, v5.S[1] // ...........................................................e.................................................................................... 
// gap // ................................................................................................................................................ - mul v25.4S, v29.4S, v24.S[0] // .................................................e.............................................................................................. - mul v29.4S, v4.4S, v30.4S // ...............................................................................................................*................................ - sqrdmulh v20.4S, v4.4S, v20.4S // ................................................................................................................*............................... - sub v7.4S, v28.4S, v5.4S // .............................................................................................................................*.................. - sqrdmulh v17.4S, v3.4S, v17.4S // .....................................................................................................................................*.......... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - mul v15.4S, v1.4S, v11.4S // ..........................................................................................................*..................................... - mul v2.4S, v3.4S, v2.4S // ....................................................................................................................................*........... - ldr q3, [x5, #112] // .....................................................................................................................e.......................... 
- sqrdmulh v11.4S, v1.4S, v10.4S // ...........................................................................................................*.................................... // gap // ................................................................................................................................................ - sqrdmulh v9.4S, v23.4S, v24.S[1] // .............................................e.................................................................................................. + sub v25.4S, v20.4S, v0.4S // .....................................e.......................................................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - mul v10.4S, v18.4S, v13.S[2] // .......................................e........................................................................................................ + mls v1.4S, v27.4S, v8.S[0] // ......................................................................................................................................*......... + sub v27.4S, v21.4S, v11.4S // .................e.............................................................................................................................. // gap // ................................................................................................................................................ 
- mls v27.4S, v21.4S, v8.S[0] // ...........................................................................................................................................*.... - mul v12.4S, v23.4S, v24.S[0] // ............................................e................................................................................................... + add v5.4S, v20.4S, v0.4S // ......................................e......................................................................................................... + mls v12.4S, v26.4S, v8.S[0] // ..................................................................e............................................................................. + mls v23.4S, v15.4S, v8.S[0] // .......................................................................e........................................................................ + add v26.4S, v29.4S, v18.4S // ....................................................................................................*........................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v23.4S, v31.4S, v14.4S // ...................................................................................................*............................................ 
- mls v25.4S, v6.4S, v8.S[0] // ...................................................e............................................................................................ - add v14.4S, v31.4S, v14.4S // ....................................................................................................*........................................... - mls v29.4S, v20.4S, v8.S[0] // .................................................................................................................*.............................. + sub v17.4S, v24.4S, v7.4S // ..................................................................................................................*............................. + sub v0.4S, v27.4S, v13.4S // ...............................................e................................................................................................ + mls v30.4S, v2.4S, v8.S[0] // ........................................................e....................................................................................... + mls v19.4S, v31.4S, v8.S[0] // .............................................................e.................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v1.4S, v0.4S, v26.4S // ......................e......................................................................................................................... 
+ add v31.4S, v27.4S, v13.4S // ................................................e............................................................................................... + ldr q27, [x5, #112] // .....................................................................................................................e.......................... + sub v15.4S, v26.4S, v9.4S // .............................................................................................................*.................................. + sub v2.4S, v14.4S, v1.4S // .......................................................................................................................................*........ // gap // ................................................................................................................................................ - mls v15.4S, v11.4S, v8.S[0] // ............................................................................................................*................................... - add v20.4S, v28.4S, v5.4S // ..............................................................................................................................*................. - mls v2.4S, v17.4S, v8.S[0] // ......................................................................................................................................*......... // gap // ................................................................................................................................................ - ldr q28, [x4, #-32] // ............e................................................................................................................................... + add v1.4S, v14.4S, v1.4S // ........................................................................................................................................*....... 
// gap // ................................................................................................................................................ - sub v18.4S, v7.4S, v27.4S // ............................................................................................................................................*... - mls v10.4S, v16.4S, v8.S[0] // .........................................e...................................................................................................... - add v26.4S, v0.4S, v26.4S // .......................e........................................................................................................................ - add v17.4S, v7.4S, v27.4S // .............................................................................................................................................*.. - ldr q0, [x4, #-16] // .............e.................................................................................................................................. - mls v12.4S, v9.4S, v8.S[0] // ..............................................e................................................................................................. - sub v11.4S, v1.4S, v25.4S // ....................................................e........................................................................................... + add v14.4S, v26.4S, v9.4S // ..............................................................................................................*................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ 
+ sub v29.4S, v31.4S, v12.4S // ...................................................................e............................................................................ + sub v13.4S, v0.4S, v23.4S // ........................................................................e....................................................................... + add v7.4S, v0.4S, v23.4S // .........................................................................e...................................................................... + add v26.4S, v31.4S, v12.4S // ....................................................................e........................................................................... + ldr q18, [x5, #96] // ....................................................................................................................e........................... // gap // ................................................................................................................................................ - sub v7.4S, v23.4S, v29.4S // ..................................................................................................................*............................. - add v6.4S, v23.4S, v29.4S // ...................................................................................................................*............................ // gap // ................................................................................................................................................ + add v9.4S, v5.4S, v30.4S // ..........................................................e..................................................................................... + sub v6.4S, v5.4S, v30.4S // .........................................................e...................................................................................... 
+ st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2], #64 // ...............................................................................................................................................* + add v21.4S, v25.4S, v19.4S // ...............................................................e................................................................................ + sub v25.4S, v25.4S, v19.4S // ..............................................................e................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v16.4S, v20.4S, v2.4S // .......................................................................................................................................*........ // gap // ................................................................................................................................................ - sub v5.4S, v14.4S, v15.4S // .............................................................................................................*.................................. - add v4.4S, v14.4S, v15.4S // ..............................................................................................................*................................. - ldr q31, [x1, #128] // e............................................................................................................................................... - add v15.4S, v20.4S, v2.4S // ........................................................................................................................................*....... 
- sqrdmulh v9.4S, v19.4S, v13.S[3] // ...................................e............................................................................................................ // gap // ................................................................................................................................................ + trn2 v3.4S, v7.4S, v13.4S // .....................................................................................e.......................................................... + trn1 v31.4S, v26.4S, v29.4S // ..................................................................................e............................................................. + trn2 v11.4S, v26.4S, v29.4S // ...................................................................................e............................................................ // gap // ................................................................................................................................................ + st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // ..............................................................................................................................................*. + trn1 v16.4S, v7.4S, v13.4S // ....................................................................................e........................................................... + ldr q26, [x5], #(12*16) // ..........................................................................................e..................................................... + trn2 v24.4S, v9.4S, v6.4S // ...........................................................................e.................................................................... + trn2 v20.4S, v21.4S, v25.4S // .............................................................................e.................................................................. 
// gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v21.4S, v26.4S, v10.4S // ..........................................e..................................................................................................... - mul v14.4S, v19.4S, v13.S[2] // ..................................e............................................................................................................. - add v13.4S, v1.4S, v25.4S // .....................................................e.......................................................................................... - mul v20.4S, v11.4S, v0.S[0] // .....................................................................e.......................................................................... - st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x2], #64 // ...............................................................................................................................................* - sqrdmulh v17.4S, v11.4S, v0.S[1] // ......................................................................e......................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + trn1 v30.4S, v9.4S, v6.4S // ..........................................................................e..................................................................... 
+ trn1 v29.4S, v21.4S, v25.4S // ............................................................................e................................................................... // gap // ................................................................................................................................................ - st4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x1], #64 // ..............................................................................................................................................*. - add v18.4S, v26.4S, v10.4S // ...........................................e.................................................................................................... + trn2 v4.2D, v31.2D, v16.2D // ......................................................................................e......................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ + trn2 v13.2D, v11.2D, v3.2D // .......................................................................................e........................................................ + ldr q14, [x5, #-32] // ........................................................................................................................e....................... // gap // ................................................................................................................................................ - mul v1.4S, v21.4S, v28.S[0] // ...........................................................e.................................................................................... 
- sqrdmulh v16.4S, v21.4S, v28.S[1] // ............................................................e................................................................................... // gap // ................................................................................................................................................ - sqrdmulh v7.4S, v13.4S, v28.S[3] // .................................................................e.............................................................................. - mul v29.4S, v13.4S, v28.S[2] // ................................................................e............................................................................... + trn2 v15.2D, v24.2D, v20.2D // ...............................................................................e................................................................ + trn2 v21.2D, v30.2D, v29.2D // ..............................................................................e................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v0.4S, v31.4S, v22.4S // .................e.............................................................................................................................. 
- mls v14.4S, v9.4S, v8.S[0] // ....................................e........................................................................................................... - mul v15.4S, v18.4S, v24.S[2] // ......................................................e......................................................................................... - sqrdmulh v9.4S, v18.4S, v24.S[3] // .......................................................e........................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - add v22.4S, v31.4S, v22.4S // ..................e............................................................................................................................. - ldr q21, [x5, #176] // .........................................................................................................................e...................... - ldr q11, [x5, #32] // ............................................................................................e................................................... - mls v20.4S, v17.4S, v8.S[0] // .......................................................................e........................................................................ 
- - // original source code - // ldr q9, [x1, #(16*0 + (64))] // ....................................................................................................................e......................|........................................................................................................................e......... - // ldr q10, [x1, #(16*1 + (64))] // ...........................e...............................................................................................................|...............................e.................................................................................................. - // ldr q11, [x1, #(16*2 + (64))] // ....................................e......................................................................................................|........................................e......................................................................................... - // ldr q12, [x1, #(16*3 + (64))] // .............................................e.............................................................................................|.................................................e................................................................................ - // ldr q13, [x2, #(16*0 + (64))] // .e.........................................................................................................................................|.....e............................................................................................................................ - // ldr q14, [x2, #(16*1 + (64))] // ..e........................................................................................................................................|......e........................................................................................................................... 
- // ldr q15, [x2, #(16*2 + (64))] // ........e..................................................................................................................................|............e..................................................................................................................... - // ldr q16, [x2, #(16*3 + (64))] // .................e.........................................................................................................................|.....................e............................................................................................................ - // add x1, x1, #64 // ...................*.......................................................................................................................|.......................*.......................................................................................................... - // add x2, x2, #64 // ...........................................................................................................................................*.................................................................................................................................. - // ldr q0, [x4], #64 // e..........................................................................................................................................|....e............................................................................................................................. - // ldr q1, [x4, #(-64 + 16)] // .......................................................e...................................................................................|...........................................................e...................................................................... 
- // ldr q2, [x4, #(-64 + 32)] // .......................................................................................................e...................................|...........................................................................................................e...................... - // ldr q3, [x4, #(-64 + 48)] // ............................................................................................................e..............................|................................................................................................................e................. - // mul v24.4s, v13.4s, v0.s[0] // ............................e..............................................................................................................|................................e................................................................................................. - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ....................e......................................................................................................................|........................e......................................................................................................... - // mls v24.4s, v13.4s, v8.s[0] // .................................................e.........................................................................................|.....................................................e............................................................................ - // sub v13.4s, v9.4s, v24.4s // ...................................................................................................................................e.......|.................................................................................................................................. 
- // add v9.4s, v9.4s, v24.4s // .......................................................................................................................................e...|.................................................................................................................................. - // mul v24.4s, v14.4s, v0.s[0] // .................................e.........................................................................................................|.....................................e............................................................................................ - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ..................................e........................................................................................................|......................................e........................................................................................... - // mls v24.4s, v14.4s, v8.s[0] // ............................................e..............................................................................................|................................................e................................................................................. - // sub v14.4s, v10.4s, v24.4s // ...................................................................................................e.......................................|.......................................................................................................e.......................... - // add v10.4s, v10.4s, v24.4s // ..........................................................................................................e................................|..............................................................................................................e................... 
- // mul v24.4s, v15.4s, v0.s[0] // ..........................e................................................................................................................|..............................e................................................................................................... - // sqrdmulh v15.4s, v15.4s, v0.s[1] // .........................e.................................................................................................................|.............................e.................................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // ........................................e..................................................................................................|............................................e..................................................................................... - // sub v15.4s, v11.4s, v24.4s // .....................................................................e.....................................................................|.........................................................................e........................................................ - // add v11.4s, v11.4s, v24.4s // ......................................................e....................................................................................|..........................................................e....................................................................... - // mul v24.4s, v16.4s, v0.s[0] // .....................................e.....................................................................................................|.........................................e........................................................................................ 
- // sqrdmulh v16.4s, v16.4s, v0.s[1] // .......................................e...................................................................................................|...........................................e...................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ..................................................e........................................................................................|......................................................e........................................................................... - // sub v16.4s, v12.4s, v24.4s // ...............................................................e...........................................................................|...................................................................e.............................................................. - // add v12.4s, v12.4s, v24.4s // ................................................................e..........................................................................|....................................................................e............................................................. - // mul v24.4s, v11.4s, v0.s[2] // ........................................................................................................................e..................|............................................................................................................................e..... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ......................................................................................................................e....................|..........................................................................................................................e....... 
- // mls v24.4s, v11.4s, v8.s[0] // ....................................................................................................................................e......|.................................................................................................................................. - // sub v11.4s, v9.4s, v24.4s // ....*......................................................................................................................................|........*......................................................................................................................... - // add v9.4s, v9.4s, v24.4s // ...*.......................................................................................................................................|.......*.......................................................................................................................... - // mul v24.4s, v12.4s, v0.s[2] // ............................................................................................e..............................................|................................................................................................e................................. - // sqrdmulh v12.4s, v12.4s, v0.s[3] // ..........................................................................e................................................................|..............................................................................e................................................... - // mls v24.4s, v12.4s, v8.s[0] // .........................................................................................................e.................................|.............................................................................................................e.................... 
- // sub v12.4s, v10.4s, v24.4s // .......................................................................................................................e...................|...........................................................................................................................e...... - // add v10.4s, v10.4s, v24.4s // ..............................................................................................................................e............|.................................................................................................................................. - // mul v24.4s, v15.4s, v1.s[0] // ..............................................................................................e............................................|..................................................................................................e............................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ...........................................................................................e...............................................|...............................................................................................e.................................. - // mls v24.4s, v15.4s, v8.s[0] // .............................................................................................................e.............................|.................................................................................................................e................ - // sub v15.4s, v13.4s, v24.4s // ...........................................................................................................................................|*................................................................................................................................. 
- // add v13.4s, v13.4s, v24.4s // ...........................................................................................................................................|.*................................................................................................................................ - // mul v24.4s, v16.4s, v1.s[0] // ..................................................................................e........................................................|......................................................................................e........................................... - // sqrdmulh v16.4s, v16.4s, v1.s[1] // .............................................................................e.............................................................|.................................................................................e................................................ - // mls v24.4s, v16.4s, v8.s[0] // ................................................................................................e..........................................|....................................................................................................e............................. - // sub v16.4s, v14.4s, v24.4s // ..............................................................................................................e............................|..................................................................................................................e............... - // add v14.4s, v14.4s, v24.4s // .........................................................................................................................e.................|.............................................................................................................................e.... 
- // mul v24.4s, v10.4s, v1.s[2] // .....................................................................................................................................e.....|.................................................................................................................................. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ......................................................................................................................................e....|.................................................................................................................................. - // mls v24.4s, v10.4s, v8.s[0] // .....*.....................................................................................................................................|.........*........................................................................................................................ - // sub v10.4s, v9.4s, v24.4s // .............*.............................................................................................................................|.................*................................................................................................................ - // add v9.4s, v9.4s, v24.4s // ................*..........................................................................................................................|....................*............................................................................................................. - // mul v24.4s, v12.4s, v2.s[0] // ...............................................................................................................................e...........|.................................................................................................................................. 
- // sqrdmulh v12.4s, v12.4s, v2.s[1] // ................................................................................................................................e..........|.................................................................................................................................. - // mls v24.4s, v12.4s, v8.s[0] // ...........................................................................................................................................|..*............................................................................................................................... - // sub v12.4s, v11.4s, v24.4s // .........*.................................................................................................................................|.............*.................................................................................................................... - // add v11.4s, v11.4s, v24.4s // ..........*................................................................................................................................|..............*................................................................................................................... - // mul v24.4s, v14.4s, v2.s[2] // ..................................................................................................................................e........|.................................................................................................................................. - // sqrdmulh v14.4s, v14.4s, v2.s[3] // .................................................................................................................................e.........|.................................................................................................................................. 
- // mls v24.4s, v14.4s, v8.s[0] // ...........................................................................................................................................|...*.............................................................................................................................. - // sub v14.4s, v13.4s, v24.4s // ............*..............................................................................................................................|................*................................................................................................................. - // add v13.4s, v13.4s, v24.4s // ...........*...............................................................................................................................|...............*.................................................................................................................. - // mul v24.4s, v16.4s, v3.s[0] // ..........................................................................................................................e................|..............................................................................................................................e... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ............................................................................................................................e..............|................................................................................................................................e. - // mls v24.4s, v16.4s, v8.s[0] // ..........................................................................................................................................e|.................................................................................................................................. 
- // sub v16.4s, v15.4s, v24.4s // ......*....................................................................................................................................|..........*....................................................................................................................... - // add v15.4s, v15.4s, v24.4s // .......*...................................................................................................................................|...........*...................................................................................................................... - // trn1 v25.4s, v9.4s, v10.4s // ......................................*....................................................................................................|..........................................*....................................................................................... - // trn2 v26.4s, v9.4s, v10.4s // ........................*..................................................................................................................|............................*..................................................................................................... - // trn1 v27.4s, v11.4s, v12.4s // ...........................................*...............................................................................................|...............................................*.................................................................................. - // trn2 v28.4s, v11.4s, v12.4s // .....................*.....................................................................................................................|.........................*........................................................................................................ 
- // trn2 v11.2d, v25.2d, v27.2d // ...................................................*.......................................................................................|.......................................................*.......................................................................... - // trn2 v12.2d, v26.2d, v28.2d // ...................................*.......................................................................................................|.......................................*.......................................................................................... - // trn1 v9.2d, v25.2d, v27.2d // ....................................................*......................................................................................|........................................................*......................................................................... - // trn1 v10.2d, v26.2d, v28.2d // .................................................................*.........................................................................|.....................................................................*............................................................ - // trn1 v25.4s, v13.4s, v14.4s // ..................*........................................................................................................................|......................*........................................................................................................... - // trn2 v26.4s, v13.4s, v14.4s // .......................*...................................................................................................................|...........................*...................................................................................................... 
- // trn1 v27.4s, v15.4s, v16.4s // ..............*............................................................................................................................|..................*............................................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ...............*...........................................................................................................................|...................*.............................................................................................................. - // trn2 v15.2d, v25.2d, v27.2d // ..............................................*............................................................................................|..................................................*............................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ................................*..........................................................................................................|....................................*............................................................................................. - // trn1 v13.2d, v25.2d, v27.2d // ...............................*...........................................................................................................|...................................*.............................................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ..........................................................*................................................................................|..............................................................*................................................................... 
- // ldr q0, [x5], #(12*16) // .............................*.............................................................................................................|.................................*................................................................................................ - // ldr q4, [x5, #(-12*16 + 1*16)] // ..............................*............................................................................................................|..................................*............................................................................................... - // ldr q1, [x5, #(-12*16 + 2*16)] // .........................................................................................................................................e.|.................................................................................................................................. - // ldr q5, [x5, #(-12*16 + 3*16)] // .......................................................................*...................................................................|...........................................................................*...................................................... - // ldr q2, [x5, #(-12*16 + 4*16)] // ..................................................................*........................................................................|......................................................................*........................................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ...................................................................*.......................................................................|.......................................................................*.......................................................... 
- // mul v24.4s, v11.4s, v0.4s // ...........................................................*...............................................................................|...............................................................*.................................................................. - // sqrdmulh v11.4s, v11.4s, v4.4s // ............................................................*..............................................................................|................................................................*................................................................. - // mls v24.4s, v11.4s, v8.s[0] // ............................................................................*..............................................................|................................................................................*................................................. - // sub v11.4s, v9.4s, v24.4s // ...............................................................................................*...........................................|...................................................................................................*.............................. - // add v9.4s, v9.4s, v24.4s // .................................................................................................*.........................................|.....................................................................................................*............................ - // mul v24.4s, v12.4s, v0.4s // ...............................................*...........................................................................................|...................................................*.............................................................................. 
- // sqrdmulh v12.4s, v12.4s, v4.4s // ................................................*..........................................................................................|....................................................*............................................................................. - // mls v24.4s, v12.4s, v8.s[0] // .............................................................*.............................................................................|.................................................................*................................................................ - // sub v12.4s, v10.4s, v24.4s // ...........................................................................*...............................................................|...............................................................................*.................................................. - // add v10.4s, v10.4s, v24.4s // ................................................................................*..........................................................|....................................................................................*............................................. - // mul v24.4s, v10.4s, v1.4s // .......................................................................................*...................................................|...........................................................................................*...................................... - // sqrdmulh v10.4s, v10.4s, v5.4s // ..........................................................................................*................................................|..............................................................................................*................................... 
- // mls v24.4s, v10.4s, v8.s[0] // ....................................................................................................*......................................|........................................................................................................*......................... - // sub v10.4s, v9.4s, v24.4s // ..................................................................................................................*........................|......................................................................................................................*........... - // add v9.4s, v9.4s, v24.4s // ...................................................................................................................*.......................|.......................................................................................................................*.......... - // mul v24.4s, v12.4s, v2.4s // ...................................................................................*.......................................................|.......................................................................................*.......................................... - // sqrdmulh v12.4s, v12.4s, v6.4s // ....................................................................................*......................................................|........................................................................................*......................................... - // mls v24.4s, v12.4s, v8.s[0] // ..................................................................................................*........................................|......................................................................................................*........................... 
- // sub v12.4s, v11.4s, v24.4s // ...............................................................................................................*...........................|...................................................................................................................*.............. - // add v11.4s, v11.4s, v24.4s // ................................................................................................................*..........................|....................................................................................................................*............. - // ldr q0, [x5, #(-12*16 + 6*16)] // ......................*....................................................................................................................|..........................*....................................................................................................... - // ldr q4, [x5, #(-12*16 + 7*16)] // .........................................................................................e.................................................|.............................................................................................e.................................... - // ldr q1, [x5, #(-12*16 + 8*16)] // ........................................................................*..................................................................|............................................................................*..................................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // .........................................................................*.................................................................|.............................................................................*.................................................... 
- // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................................*............................................................................|..................................................................*............................................................... - // ldr q6, [x5, #(-12*16 + 11*16)] // ........................................................................................................................................e..|.................................................................................................................................. - // mul v24.4s, v15.4s, v0.4s // ........................................................*..................................................................................|............................................................*..................................................................... - // sqrdmulh v15.4s, v15.4s, v4.4s // .........................................................*.................................................................................|.............................................................*.................................................................... - // mls v24.4s, v15.4s, v8.s[0] // ....................................................................*......................................................................|........................................................................*......................................................... - // sub v15.4s, v13.4s, v24.4s // .....................................................................................*.....................................................|.........................................................................................*........................................ 
- // add v13.4s, v13.4s, v24.4s // .....................................................................................................*.....................................|.........................................................................................................*........................ - // mul v24.4s, v16.4s, v0.4s // .........................................*.................................................................................................|.............................................*.................................................................................... - // sqrdmulh v16.4s, v16.4s, v4.4s // ..........................................*................................................................................................|..............................................*................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // .....................................................*.....................................................................................|.........................................................*........................................................................ - // sub v16.4s, v14.4s, v24.4s // ......................................................................*....................................................................|..........................................................................*....................................................... - // add v14.4s, v14.4s, v24.4s // .................................................................................*.........................................................|.....................................................................................*............................................ 
- // mul v24.4s, v14.4s, v1.4s // ........................................................................................*..................................................|............................................................................................*..................................... - // sqrdmulh v14.4s, v14.4s, v5.4s // ......................................................................................*....................................................|..........................................................................................*....................................... - // mls v24.4s, v14.4s, v8.s[0] // ......................................................................................................*....................................|..........................................................................................................*....................... - // sub v14.4s, v13.4s, v24.4s // .................................................................................................................*.........................|.....................................................................................................................*............ - // add v13.4s, v13.4s, v24.4s // .....................................................................................................................*.....................|.........................................................................................................................*........ - // mul v24.4s, v16.4s, v2.4s // ..............................................................................*............................................................|..................................................................................*............................................... 
- // sqrdmulh v16.4s, v16.4s, v6.4s // ...............................................................................*...........................................................|...................................................................................*.............................................. - // mls v24.4s, v16.4s, v8.s[0] // .............................................................................................*.............................................|.................................................................................................*................................ - // sub v16.4s, v15.4s, v24.4s // ........................................................................................................*..................................|............................................................................................................*..................... - // add v15.4s, v15.4s, v24.4s // ...........................................................................................................*...............................|...............................................................................................................*.................. - // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .............................................................................................................................*.............|.................................................................................................................................* - // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ...........................................................................................................................*...............|...............................................................................................................................*.. 
+ add x2, x2, #64 // .........e...................................................................................................................................... + add x1, x1, #64 // ........e....................................................................................................................................... + + // --------------------------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q9, [x1, #(16*0 + (64))] // .................e.............................................................................................................................'.................~................................................................................................................ + // ldr q10, [x1, #(16*1 + (64))] // ...........e...................................................................................................................................'...........~...................................................................................................................... + // ldr q11, [x1, #(16*2 + (64))] // .............................e.................................................................................................................'.............................~.................................................................................................... 
+ // ldr q12, [x1, #(16*3 + (64))] // ...................................e...........................................................................................................'...................................~.............................................................................................. + // ldr q13, [x2, #(16*0 + (64))] // .........e.....................................................................................................................................'.........~........................................................................................................................ + // ldr q14, [x2, #(16*1 + (64))] // ...e...........................................................................................................................................'...~.............................................................................................................................. + // ldr q15, [x2, #(16*2 + (64))] // ..........e....................................................................................................................................'..........~....................................................................................................................... + // ldr q16, [x2, #(16*3 + (64))] // .e.............................................................................................................................................'.~................................................................................................................................ + // add x1, x1, #64 // ..............................................................................................................................................e'.................................................................................................................................. 
+ // add x2, x2, #64 // .............................................................................................................................................e.'.................................................................................................................................. + // ldr q0, [x4], #64 // e..............................................................................................................................................'~................................................................................................................................. + // ldr q1, [x4, #(-64 + 16)] // ...............................................e...............................................................................................'...............................................~.................................................................................. + // ldr q2, [x4, #(-64 + 32)] // ..........................................................................e....................................................................'..........................................................................~....................................................... + // ldr q3, [x4, #(-64 + 48)] // ...............e...............................................................................................................................'...............~.................................................................................................................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .......................................e.......................................................................................................'.......................................~.......................................................................................... 
+ // mul v24.4s, v13.4s, v0.s[0] // ............................e..................................................................................................................'............................~..................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................e........................................................................................'......................................................~........................................................................... + // sub v13.4s, v9.4s, v24.4s // .....................................................................................................e.........................................'.....................................................................................................~............................ + // add v9.4s, v9.4s, v24.4s // ......................................................................e........................................................................'......................................................................~........................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .....................e.........................................................................................................................'.....................~............................................................................................................ + // mul v24.4s, v14.4s, v0.s[0] // ......................e........................................................................................................................'......................~........................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ......................................e........................................................................................................'......................................~........................................................................................... + // sub v14.4s, v10.4s, v24.4s // ............................................................................e..................................................................'............................................................................~..................................................... + // add v10.4s, v10.4s, v24.4s // .....................................................e.........................................................................................'.....................................................~............................................................................ + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ..........................e....................................................................................................................'..........................~....................................................................................................... + // mul v24.4s, v15.4s, v0.s[0] // ...........................e...................................................................................................................'...........................~...................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................e..................................................................................................'............................................~..................................................................................... 
+ // sub v15.4s, v11.4s, v24.4s // .................................................................e.............................................................................'.................................................................~................................................................ + // add v11.4s, v11.4s, v24.4s // .......................................................................e.......................................................................'.......................................................................~.......................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ........................e......................................................................................................................'........................~......................................................................................................... + // mul v24.4s, v16.4s, v0.s[0] // .........................e.....................................................................................................................'.........................~........................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ....................................e..........................................................................................................'....................................~............................................................................................. + // sub v16.4s, v12.4s, v24.4s // .......................................................e.......................................................................................'.......................................................~.......................................................................... 
+ // add v12.4s, v12.4s, v24.4s // ........................................................e......................................................................................'........................................................~......................................................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...............................................................................e...............................................................'...............................................................................~.................................................. + // mul v24.4s, v11.4s, v0.s[2] // ..............................................................................e................................................................'..............................................................................~................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................................e....................................................'..........................................................................................~....................................... + // sub v11.4s, v9.4s, v24.4s // ...................................................................................................e...........................................'...................................................................................................~.............................. + // add v9.4s, v9.4s, v24.4s // ......................................................................................................e........................................'......................................................................................................~........................... 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // ..............................................................e................................................................................'..............................................................~................................................................... + // mul v24.4s, v12.4s, v0.s[2] // .............................................................e.................................................................................'.............................................................~.................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................................e.....................................................................'.........................................................................~........................................................ + // sub v12.4s, v10.4s, v24.4s // ....................................................................................e..........................................................'....................................................................................~............................................. + // add v10.4s, v10.4s, v24.4s // .....................................................................................e.........................................................'.....................................................................................~............................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ........................................................................e......................................................................'........................................................................~......................................................... 
+ // mul v24.4s, v15.4s, v1.s[0] // .............................................................................e.................................................................'.............................................................................~.................................................... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................................................e.....................................................'.........................................................................................~........................................ + // sub v15.4s, v13.4s, v24.4s // ...........................................................................................................e...................................'...........................................................................................................~...................... + // add v13.4s, v13.4s, v24.4s // ..............................................................................................................e................................'..............................................................................................................~................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ................................................................e..............................................................................'................................................................~................................................................. + // mul v24.4s, v16.4s, v1.s[0] // ...............................................................e...............................................................................'...............................................................~.................................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................e...................................................................'...........................................................................~...................................................... + // sub v16.4s, v14.4s, v24.4s // .......................................................................................e.......................................................'.......................................................................................~.......................................... + // add v14.4s, v14.4s, v24.4s // ......................................................................................e........................................................'......................................................................................~........................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...............................................................................................e...............................................'...............................................................................................~.................................. + // mul v24.4s, v10.4s, v1.s[2] // ................................................................................................e..............................................'................................................................................................~................................. + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................................e..................................'............................................................................................................~..................... 
+ // sub v10.4s, v9.4s, v24.4s // ..........................................................................................................................e....................'..........................................................................................................................~....... + // add v9.4s, v9.4s, v24.4s // .........................................................................................................................e.....................'.........................................................................................................................~........ + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ..................................................................................................e............................................'..................................................................................................~............................... + // mul v24.4s, v12.4s, v2.s[0] // .................................................................................................e.............................................'.................................................................................................~................................ + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................e.................................'.............................................................................................................~.................... + // sub v12.4s, v11.4s, v24.4s // .............................................................................................................................e.................'.............................................................................................................................~.... 
+ // add v11.4s, v11.4s, v24.4s // ............................................................................................................................e..................'............................................................................................................................~..... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .............................................................................................e.................................................'.............................................................................................~.................................... + // mul v24.4s, v14.4s, v2.s[2] // ...........................................................................................e...................................................'...........................................................................................~...................................... + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................................e.......................................'.......................................................................................................~.......................... + // sub v14.4s, v13.4s, v24.4s // ....................................................................................................................e..........................'....................................................................................................................~............. + // add v13.4s, v13.4s, v24.4s // .......................................................................................................................e.......................'.......................................................................................................................~.......... 
+ // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..............................................................................................e................................................'..............................................................................................~................................... + // mul v24.4s, v16.4s, v3.s[0] // ............................................................................................e..................................................'............................................................................................~..................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................................e......................................'........................................................................................................~......................... + // sub v16.4s, v15.4s, v24.4s // .....................................................................................................................e.........................'.....................................................................................................................~............ + // add v15.4s, v15.4s, v24.4s // ......................................................................................................................e........................'......................................................................................................................~........... + // trn1 v25.4s, v9.4s, v10.4s // ......................................................................................................................................e........'.................................................................................................................................. 
+ // trn2 v26.4s, v9.4s, v10.4s // ....................................................................................................................................e..........'.................................................................................................................................. + // trn1 v27.4s, v11.4s, v12.4s // .......................................................................................................................................e.......'.................................................................................................................................. + // trn2 v28.4s, v11.4s, v12.4s // .....................................................................................................................................e.........'.................................................................................................................................. + // trn2 v11.2d, v25.2d, v27.2d // ............................................................................................................................................e..'.................................................................................................................................. + // trn2 v12.2d, v26.2d, v28.2d // ...........................................................................................................................................e...'.................................................................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ..............................~................................................................................................................'..............................*................................................................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ....................~..........................................................................................................................'....................*............................................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ...............................................................................................................................e...............'...............................................................................................................................~.. + // trn2 v26.4s, v13.4s, v14.4s // ................................................................................................................................e..............'................................................................................................................................~. + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................................................................e............'.................................................................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................e................'..............................................................................................................................~... + // trn2 v15.2d, v25.2d, v27.2d // ........................................................................................................................................e......'.................................................................................................................................. 
+ // trn2 v16.2d, v26.2d, v28.2d // .........................................................................................................................................e.....'.................................................................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ............~..................................................................................................................................'............*..................................................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ..............~................................................................................................................................'..............*................................................................................................................... + // ldr q0, [ x5], #(12*16) // ...................................................................................................................................e...........'.................................................................................................................................. + // ldr q4, [x5, #(-12*16 + 1*16)] // .................................................................................e.............................................................'.................................................................................~................................................ + // ldr q1, [ x5, #(-12*16 + 2*16)] // ........................................~......................................................................................................'........................................*......................................................................................... 
+ // ldr q5, [x5, #(-12*16 + 3*16)] // .............~.................................................................................................................................'.............*.................................................................................................................... + // ldr q2, [ x5, #(-12*16 + 4*16)] // .................................................~.............................................................................................'.................................................*................................................................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // ....................................................~..........................................................................................'....................................................*............................................................................. + // sqrdmulh v27.4s, v11.4s, v4.4s // ......~........................................................................................................................................'......*........................................................................................................................... + // mul v24.4s, v11.4s, v0.4s // ................~..............................................................................................................................'................*................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .....................................~.........................................................................................................'.....................................*............................................................................................ 
+ // sub v11.4s, v9.4s, v24.4s // ..................................................................................~............................................................'..................................................................................*............................................... + // add v9.4s, v9.4s, v24.4s // .........................................................................................................~.....................................'.........................................................................................................*........................ + // sqrdmulh v27.4s, v12.4s, v4.4s // .......~.......................................................................................................................................'.......*.......................................................................................................................... + // mul v24.4s, v12.4s, v0.4s // ........~......................................................................................................................................'........*......................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................~.............................................................................................................'.................................*................................................................................................ + // sub v12.4s, v10.4s, v24.4s // ............................................................~..................................................................................'............................................................*..................................................................... 
+ // add v10.4s, v10.4s, v24.4s // ...................................................~...........................................................................................'...................................................*.............................................................................. + // sqrdmulh v27.4s, v10.4s, v5.4s // ..........................................................~....................................................................................'..........................................................*....................................................................... + // mul v24.4s, v10.4s, v1.4s // .........................................................~.....................................................................................'.........................................................*........................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................................~...........................................................'...................................................................................*.............................................. + // sub v10.4s, v9.4s, v24.4s // ................................................................................................................~..............................'................................................................................................................*................. + // add v9.4s, v9.4s, v24.4s // ...................................................................................................................~...........................'...................................................................................................................*.............. 
+ // sqrdmulh v27.4s, v12.4s, v6.4s // ...................................................................~...........................................................................'...................................................................*.............................................................. + // mul v24.4s, v12.4s, v2.4s // ..................................................................~............................................................................'..................................................................*............................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................................................................~..............................................................'................................................................................*................................................. + // sub v12.4s, v11.4s, v24.4s // ..........................................................................................................~....................................'..........................................................................................................*....................... + // add v11.4s, v11.4s, v24.4s // ........................................................................................~......................................................'........................................................................................*......................................... + // ldr q0, [ x5, #(-12*16 + 6*16)] // ........................................................................................................................e......................'........................................................................................................................~......... 
+ // ldr q4, [x5, #(-12*16 + 7*16)] // ...............................................................................................................e...............................'...............................................................................................................~.................. + // ldr q1, [ x5, #(-12*16 + 8*16)] // .......................~.......................................................................................................................'.......................*.......................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................................e...................................................................................................'...........................................~...................................................................................... + // ldr q2, [ x5, #(-12*16 + 10*16)] // ..........................................................................................................................................e....'.................................................................................................................................. + // ldr q6, [x5, #(-12*16 + 11*16)] // .............................................e.................................................................................................'.............................................~.................................................................................... + // sqrdmulh v27.4s, v15.4s, v4.4s // .....~.........................................................................................................................................'.....*............................................................................................................................ 
+ // mul v24.4s, v15.4s, v0.4s // ....~..........................................................................................................................................'....*............................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..................~............................................................................................................................'..................*............................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ...............................~...............................................................................................................'...............................*.................................................................................................. + // add v13.4s, v13.4s, v24.4s // ..................................................~............................................................................................'..................................................*............................................................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ...............................................................................................................................................*.................................................................................................................................. + // mul v24.4s, v16.4s, v0.4s // ..~............................................................................................................................................'..*............................................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...................~...........................................................................................................................'...................*.............................................................................................................. + // sub v16.4s, v14.4s, v24.4s // ................................~..............................................................................................................'................................*................................................................................................. + // add v14.4s, v14.4s, v24.4s // ..................................~............................................................................................................'..................................*............................................................................................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ..........................................~....................................................................................................'..........................................*....................................................................................... + // mul v24.4s, v14.4s, v1.4s // ................................................~..............................................................................................'................................................*................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................................................~..........................................'....................................................................................................*............................. 
+ // sub v14.4s, v13.4s, v24.4s // .................................................................................................................~.............................'.................................................................................................................*................ + // add v13.4s, v13.4s, v24.4s // ..................................................................................................................~............................'..................................................................................................................*............... + // sqrdmulh v27.4s, v16.4s, v6.4s // .........................................~.....................................................................................................'.........................................*........................................................................................ + // mul v24.4s, v16.4s, v2.4s // ..............................................~................................................................................................'..............................................*................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................~...................................................................................'...........................................................*...................................................................... + // sub v16.4s, v15.4s, v24.4s // ....................................................................~..........................................................................'....................................................................*............................................................. 
+ // add v15.4s, v15.4s, v24.4s // .....................................................................~.........................................................................'.....................................................................*............................................................ + // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .................................................................................................................................~.............'.................................................................................................................................* + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ...........................................................................................................................~...................'...........................................................................................................................*...... sub count, count, #1 cbnz count, layer45678_start - mls v15.4S, v9.4S, v8.S[0] // .......*............................................................................ - add v31.4S, v22.4S, v14.4S // .....*.............................................................................. - ldr q13, [x5, #64] // ...............................................*.................................... - add x1, x1, #64 // ...................*................................................................ - add x2, x2, #64 // *................................................................................... - sub v23.4S, v0.4S, v12.4S // .*.................................................................................. - mls v29.4S, v7.4S, v8.S[0] // ....*............................................................................... - ldr q5, [x5, #160] // .............................................*...................................... 
- mls v1.4S, v16.4S, v8.S[0] // ...*................................................................................ - sub v26.4S, v22.4S, v14.4S // ......*............................................................................. - add v2.4S, v0.4S, v12.4S // ..*................................................................................. - ldr q14, [x5], #(12*16) // ........................*........................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sub v27.4S, v23.4S, v20.4S // ........*........................................................................... - add v24.4S, v23.4S, v20.4S // .........*.......................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sub v12.4S, v31.4S, v15.4S // ..............*..................................................................... - add v9.4S, v31.4S, v15.4S // .................*.................................................................. 
- sub v7.4S, v2.4S, v29.4S // .............*...................................................................... - add v2.4S, v2.4S, v29.4S // ............*....................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - add v22.4S, v26.4S, v1.4S // ...........*........................................................................ - sub v18.4S, v26.4S, v1.4S // ..........*......................................................................... - ldr q26, [x5, #-64] // ....................................................*............................... - trn1 v31.4S, v24.4S, v27.4S // ...............*.................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - trn2 v23.4S, v2.4S, v7.4S // ......................*............................................................. - trn1 v1.4S, v2.4S, v7.4S // ..................*................................................................. - trn2 v17.4S, v24.4S, v27.4S // ................*................................................................... - ldr q24, [x5, #-96] // .....................*.............................................................. 
- trn1 v19.4S, v9.4S, v12.4S // .............................*...................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - trn2 v0.4S, v9.4S, v12.4S // .......................*............................................................ - trn2 v2.4S, v22.4S, v18.4S // ....................*............................................................... - trn1 v12.4S, v22.4S, v18.4S // ................................*................................................... - ldr q22, [x5, #-176] // .........................*.......................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - trn2 v20.2D, v1.2D, v31.2D // .................................*.................................................. - trn2 v7.2D, v23.2D, v17.2D // ...........................*........................................................ - trn1 v29.2D, v1.2D, v31.2D // ..........................*......................................................... - ldr q31, [x5, #-48] // .....................................................*.............................. - // gap // .................................................................................... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - trn1 v27.2D, v0.2D, v2.2D // ..............................................*..................................... - trn2 v25.2D, v0.2D, v2.2D // ............................*....................................................... - trn2 v9.2D, v19.2D, v12.2D // ....................................*............................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mul v1.4S, v20.4S, v24.4S // .......................................*............................................ - sqrdmulh v20.4S, v20.4S, v3.4S // ........................................*........................................... - sqrdmulh v2.4S, v7.4S, v3.4S // ...............................*.................................................... - mul v18.4S, v7.4S, v24.4S // ..............................*..................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- mul v3.4S, v25.4S, v14.4S // ..................................*................................................. - mul v4.4S, v9.4S, v14.4S // ..........................................*......................................... - sqrdmulh v0.4S, v25.4S, v22.4S // ...................................*................................................ - sqrdmulh v9.4S, v9.4S, v22.4S // ...........................................*........................................ - ldr q14, [x5, #-144] // ...................................................*................................ - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mls v1.4S, v20.4S, v8.S[0] // .................................................*.................................. - mls v18.4S, v2.4S, v8.S[0] // ......................................*............................................. - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mls v3.4S, v0.4S, v8.S[0] // ............................................*....................................... - mls v4.4S, v9.4S, v8.S[0] // .......................................................*............................ - trn1 v0.2D, v23.2D, v17.2D // .........................................*.......................................... - ldr q23, [x5, #-112] // ................................................*................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - trn1 v9.2D, v19.2D, v12.2D // .....................................*.............................................. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - sub v24.4S, v0.4S, v18.4S // ..................................................*................................. - add v20.4S, v0.4S, v18.4S // ...........................................................*........................ - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - add v16.4S, v9.4S, v4.4S // .....................................................................*.............. - sub v7.4S, v9.4S, v4.4S // ....................................................................*............... - sub v12.4S, v27.4S, v3.4S // ......................................................*............................. - add v0.4S, v27.4S, v3.4S // ..........................................................*......................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sqrdmulh v30.4S, v24.4S, v21.4S // .........................................................*.......................... 
- mul v22.4S, v20.4S, v26.4S // .................................................................*.................. - mul v26.4S, v24.4S, v5.4S // ........................................................*........................... - sqrdmulh v27.4S, v20.4S, v31.4S // ...............................................................*.................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mul v2.4S, v0.4S, v11.4S // ................................................................*................... - sqrdmulh v20.4S, v12.4S, v23.4S // .............................................................*...................... - mul v12.4S, v12.4S, v13.4S // ............................................................*....................... - sqrdmulh v9.4S, v0.4S, v14.4S // ..................................................................*................. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sub v0.4S, v29.4S, v1.4S // ..............................................................*..................... - add v18.4S, v29.4S, v1.4S // ........................................................................*........... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mls v26.4S, v30.4S, v8.S[0] // ...................................................................*................ - mls v22.4S, v27.4S, v8.S[0] // .........................................................................*.......... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - mls v2.4S, v9.4S, v8.S[0] // .......................................................................*............ - mls v12.4S, v20.4S, v8.S[0] // ......................................................................*............. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sub v29.4S, v0.4S, v26.4S // ..........................................................................*......... - add v28.4S, v0.4S, v26.4S // ...........................................................................*........ - sub v27.4S, v18.4S, v22.4S // ..............................................................................*..... - add v26.4S, v18.4S, v22.4S // .................................................................................*.. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - sub v20.4S, v16.4S, v2.4S // ...............................................................................*.... 
- add v19.4S, v16.4S, v2.4S // ................................................................................*... - sub v22.4S, v7.4S, v12.4S // ............................................................................*....... - add v21.4S, v7.4S, v12.4S // .............................................................................*...... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - st4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x2], #64 // ..................................................................................*. - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // ...................................................................................* - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... 
- // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - // gap // .................................................................................... - - // original source code - // add x2, x2, #64 // ....*............................................................................... - // sub v18.4S, v0.4S, v12.4S // .....*.............................................................................. - // add v28.4S, v0.4S, v12.4S // ..........*......................................................................... - // mls v1.4S, v16.4S, v8.S[0] // ........*........................................................................... - // mls v29.4S, v7.4S, v8.S[0] // ......*............................................................................. - // add v2.4S, v22.4S, v14.4S // .*.................................................................................. - // sub v17.4S, v22.4S, v14.4S // .........*.......................................................................... - // mls v15.4S, v9.4S, v8.S[0] // *................................................................................... - // sub v7.4S, v18.4S, v20.4S // ............*....................................................................... - // add v26.4S, v18.4S, v20.4S // .............*...................................................................... 
- // sub v31.4S, v17.4S, v1.4S // ...................*................................................................ - // add v6.4S, v17.4S, v1.4S // ..................*................................................................. - // add v23.4S, v28.4S, v29.4S // .................*.................................................................. - // sub v0.4S, v28.4S, v29.4S // ................*................................................................... - // sub v17.4S, v2.4S, v15.4S // ..............*..................................................................... - // trn1 v9.4S, v26.4S, v7.4S // .....................*.............................................................. - // trn2 v25.4S, v26.4S, v7.4S // ........................*........................................................... - // add v18.4S, v2.4S, v15.4S // ...............*.................................................................... - // trn1 v10.4S, v23.4S, v0.4S // .......................*............................................................ - // add x1, x1, #64 // ...*................................................................................ - // trn2 v2.4S, v6.4S, v31.4S // ............................*....................................................... - // ldr q5, [x5, #96] // .........................*.......................................................... - // trn2 v29.4S, v23.4S, v0.4S // ......................*............................................................. - // trn2 v30.4S, v18.4S, v17.4S // ...........................*........................................................ - // ldr q20, [x5], #(12*16) // ...........*........................................................................ - // ldr q4, [x5, #-176] // ..............................*..................................................... 
- // trn1 v28.2D, v10.2D, v9.2D // .................................*.................................................. - // trn2 v1.2D, v29.2D, v25.2D // ................................*................................................... - // trn2 v15.2D, v30.2D, v2.2D // ....................................*............................................... - // trn1 v27.4S, v18.4S, v17.4S // ..........................*......................................................... - // mul v7.4S, v1.4S, v5.4S // .........................................*.......................................... - // sqrdmulh v19.4S, v1.4S, v3.4S // ........................................*........................................... - // trn1 v18.4S, v6.4S, v31.4S // .............................*...................................................... - // trn2 v14.2D, v10.2D, v9.2D // ...............................*.................................................... - // mul v9.4S, v15.4S, v20.4S // ..........................................*......................................... - // sqrdmulh v10.4S, v15.4S, v4.4S // ............................................*....................................... - // trn2 v15.2D, v27.2D, v18.2D // .....................................*.............................................. - // trn1 v31.2D, v27.2D, v18.2D // .....................................................*.............................. - // mls v7.4S, v19.4S, v8.S[0] // ................................................*................................... - // mul v5.4S, v14.4S, v5.4S // ......................................*............................................. - // sqrdmulh v17.4S, v14.4S, v3.4S // .......................................*............................................ - // trn1 v25.2D, v29.2D, v25.2D // ...................................................*................................ 
- // mul v14.4S, v15.4S, v20.4S // ...........................................*........................................ - // sqrdmulh v15.4S, v15.4S, v4.4S // .............................................*...................................... - // mls v9.4S, v10.4S, v8.S[0] // .................................................*.................................. - // ldr q27, [x5, #-32] // .......*............................................................................ - // trn1 v12.2D, v30.2D, v2.2D // ...................................*................................................ - // ldr q30, [x5, #-128] // ..*................................................................................. - // ldr q20, [x5, #-112] // ....................................................*............................... - // mls v5.4S, v17.4S, v8.S[0] // ...............................................*.................................... - // sub v1.4S, v25.4S, v7.4S // ......................................................*............................. - // ldr q10, [x5, #-144] // ..............................................*..................................... - // ldr q2, [x5, #-64] // ....................*............................................................... - // ldr q17, [x5, #-48] // ..................................*................................................. - // sub v4.4S, v12.4S, v9.4S // ..........................................................*......................... - // mls v14.4S, v15.4S, v8.S[0] // ..................................................*................................. - // mul v27.4S, v1.4S, v27.4S // ..............................................................*..................... - // sqrdmulh v21.4S, v1.4S, v21.4S // ............................................................*....................... 
- // add v1.4S, v12.4S, v9.4S // ...........................................................*........................ - // add v3.4S, v25.4S, v7.4S // .......................................................*............................ - // mul v29.4S, v4.4S, v30.4S // ..................................................................*................. - // sqrdmulh v20.4S, v4.4S, v20.4S // .................................................................*.................. - // sub v7.4S, v28.4S, v5.4S // ....................................................................*............... - // sqrdmulh v17.4S, v3.4S, v17.4S // ...............................................................*.................... - // mul v15.4S, v1.4S, v11.4S // ................................................................*................... - // mul v2.4S, v3.4S, v2.4S // .............................................................*...................... - // sqrdmulh v11.4S, v1.4S, v10.4S // ...................................................................*................ - // mls v27.4S, v21.4S, v8.S[0] // ......................................................................*............. - // sub v23.4S, v31.4S, v14.4S // .........................................................*.......................... - // add v14.4S, v31.4S, v14.4S // ........................................................*........................... - // mls v29.4S, v20.4S, v8.S[0] // .........................................................................*.......... - // mls v15.4S, v11.4S, v8.S[0] // ........................................................................*........... - // add v20.4S, v28.4S, v5.4S // .....................................................................*.............. - // mls v2.4S, v17.4S, v8.S[0] // .......................................................................*............ 
- // sub v18.4S, v7.4S, v27.4S // ..........................................................................*......... - // add v17.4S, v7.4S, v27.4S // ...........................................................................*........ - // sub v7.4S, v23.4S, v29.4S // ................................................................................*... - // add v6.4S, v23.4S, v29.4S // .................................................................................*.. - // sub v16.4S, v20.4S, v2.4S // ............................................................................*....... - // sub v5.4S, v14.4S, v15.4S // ..............................................................................*..... - // add v4.4S, v14.4S, v15.4S // ...............................................................................*.... - // add v15.4S, v20.4S, v2.4S // .............................................................................*...... - // st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x2], #64 // ..................................................................................*. - // st4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x1], #64 // ...................................................................................* + // Instructions: 51 + // Expected cycles: 21 + // Expected IPC: 2.43 + // + // Wall time: 1.55s + // User time: 1.55s + // + // --------------- original position ----------------> + // 0 25 50 + // |------------------------|------------------------| + mul v12.4S, v15.4S, v26.4S // ......*............................................ + mul v25.4S, v21.4S, v26.4S // ..........*........................................ + ldr q2, [x5, #-112] // .............................*..................... + sqrdmulh v15.4S, v15.4S, v28.4S // .....*............................................. + // gap // ................................................... + // gap // ................................................... 
+ sqrdmulh v28.4S, v21.4S, v28.4S // ....*.............................................. + // gap // ................................................... + mul v23.4S, v4.4S, v18.4S // ..*................................................ + mul v17.4S, v13.4S, v18.4S // .*................................................. + sqrdmulh v6.4S, v13.4S, v27.4S // *.................................................. + ldr q13, [x5, #-144] // ........*.......................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn1 v9.2D, v11.2D, v3.2D // .........*......................................... + trn1 v3.2D, v30.2D, v29.2D // ...............*................................... + ldr q11, [x5, #-64] // ..............*.................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v12.4S, v15.4S, v8.S[0] // ..................*................................ + mls v25.4S, v28.4S, v8.S[0] // ....................*.............................. + ldr q21, [x5, #-128] // ..........................*........................ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v17.4S, v6.4S, v8.S[0] // ............*...................................... 
+ trn1 v6.2D, v24.2D, v20.2D // .............*..................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + ldr q20, [x5, #-160] // .....................*............................. + sqrdmulh v5.4S, v4.4S, v27.4S // ...*............................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + add v18.4S, v3.4S, v25.4S // ...........................................*....... + add v0.4S, v6.4S, v12.4S // ............................*...................... + sub v12.4S, v6.4S, v12.4S // .................................*................. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + add v19.4S, v9.4S, v17.4S // ...................*............................... + sub v9.4S, v9.4S, v17.4S // .................*................................. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v24.4S, v12.4S, v2.4S // ...................................*............... + sqrdmulh v13.4S, v0.4S, v13.4S // ...............................*................... + mul v21.4S, v12.4S, v21.4S // ..................................*................ + mul v26.4S, v0.4S, v20.4S // ..............................*.................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v20.4S, v19.4S, v22.4S // .......................*........................... + mul v4.4S, v19.4S, v11.4S // .........................*......................... + sqrdmulh v19.4S, v9.4S, v10.4S // ......................*............................ + mul v1.4S, v9.4S, v14.4S // ........................*.......................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn1 v6.2D, v31.2D, v16.2D // .......*........................................... + mls v23.4S, v5.4S, v8.S[0] // ...........*....................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ mls v21.4S, v24.4S, v8.S[0] // ......................................*............ + mls v26.4S, v13.4S, v8.S[0] // ........................................*.......... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v4.4S, v20.4S, v8.S[0] // ..........................................*........ + mls v1.4S, v19.4S, v8.S[0] // ................................*.................. + sub v28.4S, v3.4S, v25.4S // .......................................*........... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + add v19.4S, v6.4S, v23.4S // ...........................*....................... + sub v6.4S, v6.4S, v23.4S // ................*.................................. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sub v16.4S, v28.4S, v21.4S // ............................................*...... + add v15.4S, v28.4S, v21.4S // .........................................*......... + add v13.4S, v18.4S, v26.4S // ................................................*.. 
+ sub v14.4S, v18.4S, v26.4S // .............................................*..... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + add v3.4S, v19.4S, v4.4S // ...............................................*... + sub v4.4S, v19.4S, v4.4S // ..............................................*.... + add v5.4S, v6.4S, v1.4S // .....................................*............. + sub v6.4S, v6.4S, v1.4S // ....................................*.............. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1], #64 // ..................................................* + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x2], #64 // .................................................*. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + + // ------------------ new position ------------------> + // 0 25 + // |------------------------|------------------------- + // sqrdmulh v25.4S, v13.4S, v27.4S // .......*........................................... + // mul v6.4S, v13.4S, v18.4S // ......*............................................ + // mul v2.4S, v4.4S, v18.4S // .....*............................................. 
+ // sqrdmulh v19.4S, v4.4S, v27.4S // ..................*................................ + // sqrdmulh v9.4S, v21.4S, v28.4S // ....*.............................................. + // sqrdmulh v1.4S, v15.4S, v28.4S // ...*............................................... + // mul v13.4S, v15.4S, v26.4S // *.................................................. + // trn1 v15.2D, v31.2D, v16.2D // ................................*.................. + // ldr q4, [x5, #-144] // ........*.......................................... + // trn1 v3.2D, v11.2D, v3.2D // .........*......................................... + // mul v18.4S, v21.4S, v26.4S // .*................................................. + // mls v2.4S, v19.4S, v8.S[0] // .................................*................. + // mls v6.4S, v25.4S, v8.S[0] // ...............*................................... + // trn1 v24.2D, v24.2D, v20.2D // ................*.................................. + // ldr q7, [x5, #-64] // ...........*....................................... + // trn1 v29.2D, v30.2D, v29.2D // ..........*........................................ + // sub v28.4S, v15.4S, v2.4S // ........................................*.......... + // sub v17.4S, v3.4S, v6.4S // .......................*........................... + // mls v13.4S, v1.4S, v8.S[0] // ............*...................................... + // add v1.4S, v3.4S, v6.4S // ......................*............................ + // mls v18.4S, v9.4S, v8.S[0] // .............*..................................... + // ldr q9, [x5, #-160] // .................*................................. + // sqrdmulh v12.4S, v17.4S, v10.4S // ..............................*.................... + // sqrdmulh v27.4S, v1.4S, v22.4S // ............................*...................... + // mul v26.4S, v17.4S, v14.4S // ...............................*................... + // mul v1.4S, v1.4S, v7.4S // .............................*..................... 
+ // ldr q7, [x5, #-128] // ..............*.................................... + // add v14.4S, v15.4S, v2.4S // .......................................*........... + // add v2.4S, v24.4S, v13.4S // ....................*.............................. + // ldr q20, [x5, #-112] // ..*................................................ + // mul v9.4S, v2.4S, v9.4S // ...........................*....................... + // sqrdmulh v6.4S, v2.4S, v4.4S // .........................*......................... + // mls v26.4S, v12.4S, v8.S[0] // .....................................*............. + // sub v2.4S, v24.4S, v13.4S // .....................*............................. + // mul v7.4S, v2.4S, v7.4S // ..........................*........................ + // sqrdmulh v2.4S, v2.4S, v20.4S // ........................*.......................... + // sub v4.4S, v28.4S, v26.4S // ................................................*.. + // add v3.4S, v28.4S, v26.4S // ...............................................*... + // mls v7.4S, v2.4S, v8.S[0] // ..................................*................ + // sub v24.4S, v29.4S, v18.4S // ......................................*............ + // mls v9.4S, v6.4S, v8.S[0] // ...................................*............... + // add v16.4S, v24.4S, v7.4S // ..........................................*........ + // mls v1.4S, v27.4S, v8.S[0] // ....................................*.............. + // add v26.4S, v29.4S, v18.4S // ...................*............................... + // sub v17.4S, v24.4S, v7.4S // .........................................*......... + // sub v15.4S, v26.4S, v9.4S // ............................................*...... + // sub v2.4S, v14.4S, v1.4S // ..............................................*.... + // add v1.4S, v14.4S, v1.4S // .............................................*..... + // add v14.4S, v26.4S, v9.4S // ...........................................*....... 
+ // st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2], #64 // ..................................................* + // st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // .................................................*. pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_icestorm.s index d956ae36..8dd09b01 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_opt_m1_icestorm.s @@ -2,31 +2,9 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -47,15 +25,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +42,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -89,24 +61,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + 
(\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -136,35 +108,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, 
[r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -186,7 +158,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -197,7 +169,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -207,7 +179,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -215,7 +187,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -226,19 +198,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -375,454 +347,502 @@ _ntt_dilithium_123_45678_opt_m1_icestorm: load_roots_123 .p2align 2 - // gap // .......................................... - ldr q22, [x0, #768] // .*........................................ 
- ldr q27, [x0, #896] // ....*..................................... - // gap // .......................................... - ldr q12, [x0, #640] // *......................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - ldr q13, [x0, #512] // ..*....................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - ldr q18, [x0, #128] // ...*...................................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sqrdmulh v25.4S, v27.4S, v0.S[1] // ............*............................. - mul v10.4S, v27.4S, v0.S[0] // ..............*........................... - ldr q17, [x0, #256] // ........*................................. - // gap // .......................................... - mul v4.4S, v22.4S, v0.S[0] // ......*................................... - sqrdmulh v9.4S, v22.4S, v0.S[1] // ..........*............................... - ldr q22, [x0, #384] // ...............*.......................... - // gap // .......................................... - sqrdmulh v11.4S, v12.4S, v0.S[1] // .......*.................................. - mul v29.4S, v12.4S, v0.S[0] // .........*................................ - ldr q12, [x0, #0] // .....*.................................... - // gap // .......................................... - mls v10.4S, v25.4S, v8.S[0] // .................*........................ - mul v24.4S, v13.4S, v0.S[0] // ...........*.............................. - // gap // .......................................... - // gap // .......................................... - mls v4.4S, v9.4S, v8.S[0] // .............*............................ 
- sqrdmulh v9.4S, v13.4S, v0.S[1] // ............................*............. - // gap // .......................................... - // gap // .......................................... - mls v29.4S, v11.4S, v8.S[0] // ....................*..................... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - add v11.4S, v22.4S, v10.4S // .....................*.................... - sub v28.4S, v22.4S, v10.4S // ......................*................... - // gap // .......................................... - // gap // .......................................... - add v16.4S, v17.4S, v4.4S // ................*......................... - mls v24.4S, v9.4S, v8.S[0] // ...............................*.......... - // gap // .......................................... - // gap // .......................................... - sqrdmulh v5.4S, v11.4S, v0.S[3] // .......................*.................. - mul v15.4S, v11.4S, v0.S[2] // ........................*................. - // gap // .......................................... - // gap // .......................................... - add v31.4S, v18.4S, v29.4S // .............................*............ - sub v14.4S, v18.4S, v29.4S // .....................................*.... - // gap // .......................................... - // gap // .......................................... - sqrdmulh v10.4S, v16.4S, v0.S[3] // ..................*....................... - mul v19.4S, v16.4S, v0.S[2] // ...................*...................... - // gap // .......................................... - // gap // .......................................... - mls v15.4S, v5.4S, v8.S[0] // ...........................*.............. - mul v30.4S, v28.4S, v1.S[0] // .........................*................ - // gap // .......................................... 
- // gap // .......................................... - sqrdmulh v5.4S, v28.4S, v1.S[1] // ..........................*............... - add v26.4S, v12.4S, v24.4S // ....................................*..... - // gap // .......................................... - // gap // .......................................... - mls v19.4S, v10.4S, v8.S[0] // ...................................*...... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - sub v13.4S, v31.4S, v15.4S // ................................*......... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v30.4S, v5.4S, v8.S[0] // ..............................*........... - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mul v27.4S, v13.4S, v2.S[0] // .................................*........ - sqrdmulh v29.4S, v13.4S, v2.S[1] // ..................................*....... - // gap // .......................................... - // gap // .......................................... - sub v22.4S, v12.4S, v24.4S // ......................................*... - sub v24.4S, v26.4S, v19.4S // ........................................*. - // gap // .......................................... - // gap // .......................................... - add v19.4S, v26.4S, v19.4S // .........................................* - // gap // .......................................... - // gap // .......................................... - // gap // .......................................... - mls v27.4S, v29.4S, v8.S[0] // .......................................*.. - // gap // .......................................... - // gap // .......................................... 
- // gap // .......................................... - - // original source code - // ldr q28, [x0, #640] // ..*....................................... - // ldr q29, [x0, #768] // *......................................... - // ldr q7, [x0, #512] // ...*...................................... - // ldr q11, [x0, #128] // ....*..................................... - // ldr q12, [x0, #896] // .*........................................ - // ldr q20, [x0, #0] // .............*............................ - // mul v4.4S, v29.4S, v0.S[0] // ........*................................. - // sqrdmulh v13.4S, v28.4S, v0.S[1] // ...........*.............................. - // ldr q17, [x0, #256] // .......*.................................. - // mul v16.4S, v28.4S, v0.S[0] // ............*............................. - // sqrdmulh v29.4S, v29.4S, v0.S[1] // .........*................................ - // mul v10.4S, v7.4S, v0.S[0] // ...............*.......................... - // sqrdmulh v18.4S, v12.4S, v0.S[1] // .....*.................................... - // mls v4.4S, v29.4S, v8.S[0] // ................*......................... - // mul v29.4S, v12.4S, v0.S[0] // ......*................................... - // ldr q30, [x0, #384] // ..........*............................... - // add v21.4S, v17.4S, v4.4S // .....................*.................... - // mls v29.4S, v18.4S, v8.S[0] // ..............*........................... - // sqrdmulh v6.4S, v21.4S, v0.S[3] // ...........................*.............. - // mul v21.4S, v21.4S, v0.S[2] // ............................*............. - // mls v16.4S, v13.4S, v8.S[0] // ..................*....................... - // add v13.4S, v30.4S, v29.4S // ...................*...................... - // sub v26.4S, v30.4S, v29.4S // ....................*..................... - // sqrdmulh v25.4S, v13.4S, v0.S[3] // .......................*.................. 
- // mul v15.4S, v13.4S, v0.S[2] // ........................*................. - // mul v30.4S, v26.4S, v1.S[0] // ..............................*........... - // sqrdmulh v27.4S, v26.4S, v1.S[1] // ...............................*.......... - // mls v15.4S, v25.4S, v8.S[0] // .............................*............ - // sqrdmulh v7.4S, v7.4S, v0.S[1] // .................*........................ - // add v31.4S, v11.4S, v16.4S // .........................*................ - // mls v30.4S, v27.4S, v8.S[0] // ...................................*...... - // mls v10.4S, v7.4S, v8.S[0] // ......................*................... - // sub v7.4S, v31.4S, v15.4S // ..................................*....... - // mul v27.4S, v7.4S, v2.S[0] // ....................................*..... - // sqrdmulh v28.4S, v7.4S, v2.S[1] // .....................................*.... - // mls v21.4S, v6.4S, v8.S[0] // .................................*........ - // add v7.4S, v20.4S, v10.4S // ................................*......... - // sub v14.4S, v11.4S, v16.4S // ..........................*............... - // sub v22.4S, v20.4S, v10.4S // ......................................*... - // mls v27.4S, v28.4S, v8.S[0] // .........................................* - // sub v24.4S, v7.4S, v21.4S // .......................................*.. - // add v19.4S, v7.4S, v21.4S // ........................................*. + // Instructions: 29 + // Expected cycles: 21 + // Expected IPC: 1.38 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q9, [x0, #896] // ...*.......................... + ldr q17, [x0, #640] // *............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #768] // .*............................ + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + ldr q14, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x0, #512] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v13.4S, v9.4S, v0.S[0] // .........*.................... + sqrdmulh v27.4S, v9.4S, v0.S[1] // ...........*.................. + ldr q30, [x0, #384] // ..........*................... + // gap // .............................. + sqrdmulh v18.4S, v25.4S, v0.S[1] // ......*....................... + mul v22.4S, v25.4S, v0.S[0] // .......*...................... + // gap // .............................. + // gap // .............................. + sqrdmulh v23.4S, v17.4S, v0.S[1] // ....*......................... + mul v9.4S, v17.4S, v0.S[0] // .....*........................ + // gap // .............................. + // gap // .............................. + mls v13.4S, v27.4S, v8.S[0] // ..............*............... + ldr q27, [x0, #256] // ...................*.......... + // gap // .............................. + // gap // .............................. + mls v22.4S, v18.4S, v8.S[0] // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v29.4S, v16.4S, v0.S[1] // .............*................ + mls v9.4S, v23.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // .............................. + sub v10.4S, v30.4S, v13.4S // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v11.4S, v27.4S, v22.4S // ......................*....... + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + sqrdmulh v7.4S, v10.4S, v1.S[1] // ..................*........... + mul v10.4S, v10.4S, v1.S[0] // .................*............ + // gap // .............................. + // gap // .............................. + sub v27.4S, v27.4S, v22.4S // .......................*...... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v10.4S, v7.4S, v8.S[0] // .....................*........ + mul v17.4S, v27.4S, v1.S[0] // .........................*.... + // gap // .............................. + // gap // .............................. + sqrdmulh v12.4S, v27.4S, v1.S[1] // ...........................*.. + sub v27.4S, v14.4S, v9.4S // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v31.4S, v27.4S, v10.4S // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v22.4S, v27.4S, v10.4S // ..........................*... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v19.4S, v31.4S, v3.S[1] // ............................*. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q4, [x0, #640] // .*............................. 
+ // ldr q26, [x0, #768] // ..*............................ + // ldr q14, [x0, #128] // ...*........................... + // ldr q23, [x0, #896] // *.............................. + // sqrdmulh v20.4S, v4.4S, v0.S[1] // ..........*.................... + // mul v9.4S, v4.4S, v0.S[0] // ...........*................... + // sqrdmulh v18.4S, v26.4S, v0.S[1] // ........*...................... + // mul v27.4S, v26.4S, v0.S[0] // .........*..................... + // ldr q16, [x0, #512] // ....*.......................... + // mul v13.4S, v23.4S, v0.S[0] // .....*......................... + // ldr q30, [x0, #384] // .......*....................... + // sqrdmulh v25.4S, v23.4S, v0.S[1] // ......*........................ + // mls v9.4S, v20.4S, v8.S[0] // ................*.............. + // sqrdmulh v29.4S, v16.4S, v0.S[1] // ...............*............... + // mls v13.4S, v25.4S, v8.S[0] // ............*.................. + // sub v6.4S, v14.4S, v9.4S // .........................*..... + // sub v25.4S, v30.4S, v13.4S // .................*............. + // mul v5.4S, v25.4S, v1.S[0] // ....................*.......... + // sqrdmulh v31.4S, v25.4S, v1.S[1] // ...................*........... + // ldr q15, [x0, #256] // .............*................. + // mls v27.4S, v18.4S, v8.S[0] // ..............*................ + // mls v5.4S, v31.4S, v8.S[0] // ......................*........ + // add v11.4S, v15.4S, v27.4S // ..................*............ + // sub v25.4S, v15.4S, v27.4S // .....................*......... + // sub v31.4S, v6.4S, v5.4S // ..........................*.... + // mul v17.4S, v25.4S, v1.S[0] // .......................*....... + // add v22.4S, v6.4S, v5.4S // ...........................*... + // sqrdmulh v12.4S, v25.4S, v1.S[1] // ........................*...... + // sqrdmulh v19.4S, v31.4S, v3.S[1] // ............................*.. 
sub count, count, #1 layer123_start: - ldr q28, [x0, #656] // .....e...................................................................... - sub v16.4S, v17.4S, v4.4S // .....................*...................................................... - sub v23.4S, v14.4S, v30.4S // ..............................................*............................. - ldr q29, [x0, #784] // ......e..................................................................... - ldr q7, [x0, #528] // ....e....................................................................... - add v4.4S, v24.4S, v27.4S // .........................................................*.................. - ldr q11, [x0, #144] // .e.......................................................................... - add v10.4S, v31.4S, v15.4S // .....................................*...................................... - sqrdmulh v21.4S, v23.4S, v3.S[1] // ................................................................*........... - sqrdmulh v18.4S, v16.4S, v1.S[1] // .......................................*.................................... + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Wall time: 26.98s + // User time: 26.98s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + mul v21.4S, v22.4S, v2.S[2] // ...........................................................*................ + sqrdmulh v27.4S, v22.4S, v2.S[3] // ..........................................................*................. // gap // ............................................................................ - ldr q12, [x0, #912] // .......e.................................................................... - mul v15.4S, v10.4S, v1.S[2] // ................................................*........................... 
- str q4, [x0, #256] // ......................................................................*..... - ldr q20, [x0, #16] // e........................................................................... - mul v9.4S, v16.4S, v1.S[0] // ......................................*..................................... - mul v4.4S, v29.4S, v0.S[0] // ..................e......................................................... - sqrdmulh v13.4S, v28.4S, v0.S[1] // ..............e............................................................. - ldr q17, [x0, #272] // ..e......................................................................... + ldr q4, [x0, #656] // .....e...................................................................... + mul v5.4S, v16.4S, v0.S[0] // .........*.................................................................. // gap // ............................................................................ - mul v16.4S, v28.4S, v0.S[0] // .............e.............................................................. // gap // ............................................................................ + sqrdmulh v28.4S, v11.4S, v0.S[3] // ............................*............................................... + mls v17.4S, v12.4S, v8.S[0] // ........................................*................................... + add v12.4S, v14.4S, v9.4S // .................*.......................................................... + ldr q26, [x0, #784] // ......e..................................................................... + ldr q14, [x0, #144] // .e.......................................................................... + mls v21.4S, v27.4S, v8.S[0] // ............................................................*............... + add v27.4S, v30.4S, v13.4S // ...........................*................................................ + ldr q13, [x0, #0] // *........................................................................... 
+ ldr q23, [x0, #912] // .......e.................................................................... // gap // ............................................................................ - sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................e........................................................ - mls v9.4S, v18.4S, v8.S[0] // ........................................*................................... - sqrdmulh v28.4S, v10.4S, v1.S[3] // .................................................*.......................... // gap // ............................................................................ + sqrdmulh v20.4S, v4.4S, v0.S[1] // .............e.............................................................. + mls v5.4S, v29.4S, v8.S[0] // ..........*................................................................. + sqrdmulh v7.4S, v27.4S, v0.S[3] // .................................*.......................................... // gap // ............................................................................ - mul v10.4S, v7.4S, v0.S[0] // ........e................................................................... + mul v9.4S, v4.4S, v0.S[0] // ..............e............................................................. // gap // ............................................................................ // gap // ............................................................................ - mul v23.4S, v23.4S, v3.S[0] // ...............................................................*............ - sqrdmulh v18.4S, v12.4S, v0.S[1] // ........................e................................................... + sqrdmulh v18.4S, v26.4S, v0.S[1] // ..................e......................................................... // gap // ............................................................................ + mul v10.4S, v27.4S, v0.S[2] // ..................................*......................................... 
// gap // ............................................................................ - mls v4.4S, v29.4S, v8.S[0] // ....................e....................................................... - mul v29.4S, v12.4S, v0.S[0] // .......................e.................................................... - mls v15.4S, v28.4S, v8.S[0] // ..................................................*......................... + sub v22.4S, v13.4S, v5.4S // ...........*................................................................ + mul v27.4S, v26.4S, v0.S[0] // ...................e........................................................ + ldr q16, [x0, #528] // ....e....................................................................... + add v15.4S, v13.4S, v5.4S // ............*............................................................... + mul v13.4S, v23.4S, v0.S[0] // ........................e................................................... // gap // ............................................................................ // gap // ............................................................................ - add v14.4S, v14.4S, v30.4S // ...............................................*............................ + add v24.4S, v22.4S, v17.4S // ..........................................*................................. + mls v10.4S, v7.4S, v8.S[0] // ...................................*........................................ + ldr q30, [x0, #400] // ...e........................................................................ // gap // ............................................................................ + sqrdmulh v25.4S, v23.4S, v0.S[1] // .......................e.................................................... + mls v9.4S, v20.4S, v8.S[0] // ...............e............................................................ // gap // ............................................................................ 
- mls v23.4S, v21.4S, v8.S[0] // .................................................................*.......... - sub v5.4S, v22.4S, v9.4S // .........................................*.................................. // gap // ............................................................................ - ldr q30, [x0, #400] // ...e........................................................................ - add v21.4S, v17.4S, v4.4S // ......................e..................................................... - mls v29.4S, v18.4S, v8.S[0] // .........................e.................................................. - sub v28.4S, v19.4S, v15.4S // ...................................................*........................ + add v4.4S, v24.4S, v21.4S // ..............................................................*............. + sqrdmulh v29.4S, v16.4S, v0.S[1] // ........e................................................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v6.4S, v21.4S, v0.S[3] // .............................e.............................................. - add v18.4S, v5.4S, v23.4S // ...................................................................*........ // gap // ............................................................................ // gap // ............................................................................ - str q28, [x0, #128] // .....................................................................*...... - mul v21.4S, v21.4S, v0.S[2] // ............................e............................................... - mls v16.4S, v13.4S, v8.S[0] // ...............e............................................................ + mul v5.4S, v11.4S, v0.S[2] // .............................*.............................................. 
+ add v26.4S, v12.4S, v10.4S // .....................................*...................................... // gap // ............................................................................ - add v12.4S, v19.4S, v15.4S // ....................................................*....................... - add v13.4S, v30.4S, v29.4S // ...........................e................................................ + sub v20.4S, v12.4S, v10.4S // ....................................*....................................... + mls v13.4S, v25.4S, v8.S[0] // .........................e.................................................. // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v19.4S, v14.4S, v2.S[3] // ...........................................................*................ // gap // ............................................................................ + sqrdmulh v12.4S, v26.4S, v1.S[3] // ................................................*........................... + mul v23.4S, v26.4S, v1.S[2] // .................................................*.......................... + sub v6.4S, v14.4S, v9.4S // ................e........................................................... // gap // ............................................................................ - sub v26.4S, v30.4S, v29.4S // ..........................e................................................. - str q12, [x0], #(16) // ....................................................................*....... - sqrdmulh v25.4S, v13.4S, v0.S[3] // ..................................e......................................... - mul v15.4S, v13.4S, v0.S[2] // .................................e.......................................... + mls v5.4S, v28.4S, v8.S[0] // ..............................*............................................. 
// gap // ............................................................................ - mul v28.4S, v14.4S, v2.S[2] // ..........................................................*................. - mul v30.4S, v26.4S, v1.S[0] // ...........................................e................................ // gap // ............................................................................ // gap // ............................................................................ - sub v29.4S, v24.4S, v27.4S // ........................................................*................... + sqrdmulh v10.4S, v20.4S, v2.S[1] // .....................................................*...................... + mul v26.4S, v20.4S, v2.S[0] // ......................................................*..................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v27.4S, v26.4S, v1.S[1] // ............................................e............................... - mls v15.4S, v25.4S, v8.S[0] // ...................................e........................................ - sqrdmulh v7.4S, v7.4S, v0.S[1] // .........e.................................................................. + sub v25.4S, v30.4S, v13.4S // ..........................e................................................. + mls v23.4S, v12.4S, v8.S[0] // ..................................................*......................... + sub v11.4S, v15.4S, v5.4S // ...............................*............................................ + add v28.4S, v15.4S, v5.4S // ................................*........................................... // gap // ............................................................................ // gap // ............................................................................ 
- add v31.4S, v11.4S, v16.4S // .................e.......................................................... - mls v28.4S, v19.4S, v8.S[0] // ............................................................*............... // gap // ............................................................................ // gap // ............................................................................ - mls v30.4S, v27.4S, v8.S[0] // .............................................e.............................. + mul v5.4S, v25.4S, v1.S[0] // ............................................e............................... + mls v26.4S, v10.4S, v8.S[0] // .......................................................*.................... // gap // ............................................................................ + add v12.4S, v28.4S, v23.4S // ....................................................*....................... // gap // ............................................................................ - add v27.4S, v22.4S, v9.4S // ..........................................*................................. - mls v10.4S, v7.4S, v8.S[0] // ..........e................................................................. + mul v10.4S, v31.4S, v3.S[0] // ................................................................*........... + sub v20.4S, v28.4S, v23.4S // ...................................................*........................ + sqrdmulh v31.4S, v25.4S, v1.S[1] // ...........................................e................................ + ldr q15, [x0, #272] // ..e......................................................................... // gap // ............................................................................ + mls v27.4S, v18.4S, v8.S[0] // ....................e....................................................... // gap // ............................................................................ 
- sub v7.4S, v31.4S, v15.4S // ....................................e....................................... - sub v9.4S, v27.4S, v28.4S // .............................................................*.............. - add v25.4S, v27.4S, v28.4S // ..............................................................*............. + add v23.4S, v11.4S, v26.4S // .........................................................*.................. + str q12, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ + sub v26.4S, v11.4S, v26.4S // ........................................................*................... + mls v10.4S, v19.4S, v8.S[0] // .................................................................*.......... + str q20, [x0, #112] // .....................................................................*...... + sub v28.4S, v22.4S, v17.4S // .........................................*.................................. + mls v5.4S, v31.4S, v8.S[0] // .............................................e.............................. // gap // ............................................................................ - mul v27.4S, v7.4S, v2.S[0] // .....................................................e...................... - sqrdmulh v28.4S, v7.4S, v2.S[1] // ......................................................e..................... + str q23, [x0, #240] // ......................................................................*..... + add v11.4S, v15.4S, v27.4S // ......................e..................................................... // gap // ............................................................................ - str q29, [x0, #368] // .......................................................................*.... - mls v21.4S, v6.4S, v8.S[0] // ..............................e............................................. 
+ sub v17.4S, v24.4S, v21.4S // .............................................................*.............. + str q26, [x0, #368] // .......................................................................*.... // gap // ............................................................................ - add v7.4S, v20.4S, v10.4S // ............e............................................................... - str q25, [x0, #496] // ........................................................................*... + sub v25.4S, v15.4S, v27.4S // .....................e...................................................... + add v19.4S, v28.4S, v10.4S // ...................................................................*........ + str q4, [x0, #496] // ........................................................................*... + sub v31.4S, v6.4S, v5.4S // ..............................................e............................. // gap // ............................................................................ - sub v14.4S, v11.4S, v16.4S // ................e........................................................... - sub v5.4S, v5.4S, v23.4S // ..................................................................*......... - str q9, [x0, #624] // .........................................................................*.. - sub v22.4S, v20.4S, v10.4S // ...........e................................................................ - mls v27.4S, v28.4S, v8.S[0] // .......................................................e.................... + sub v27.4S, v28.4S, v10.4S // ..................................................................*......... + str q17, [x0, #624] // .........................................................................*.. // gap // ............................................................................ - str q18, [x0, #752] // ..........................................................................*. 
- sub v24.4S, v7.4S, v21.4S // ...............................e............................................ + mul v17.4S, v25.4S, v1.S[0] // .......................................e.................................... + add v22.4S, v6.4S, v5.4S // ...............................................e............................ + str q19, [x0, #752] // ..........................................................................*. // gap // ............................................................................ - str q5, [x0, #880] // ...........................................................................* - add v19.4S, v7.4S, v21.4S // ................................e........................................... - - // original source code - // ldr q9, [x0, #0] // .............e..............................................................|............e............................................................. - // ldr q10, [x0, #(1*(1024/8))] // ......e.....................................................................|.....e.................................................................... - // ldr q11, [x0, #(2*(1024/8))] // .................e..........................................................|................e......................................................... - // ldr q12, [x0, #(3*(1024/8))] // ...............................e............................................|..............................e........................................... - // ldr q13, [x0, #(4*(1024/8))] // ....e.......................................................................|...e...................................................................... - // ldr q14, [x0, #(5*(1024/8))] // e...........................................................................e.......................................................................... 
- // ldr q15, [x0, #(6*(1024/8))] // ...e........................................................................|..e....................................................................... - // ldr q16, [x0, #(7*(1024/8))] // ..........e.................................................................|.........e................................................................ - // mul v24.4s, v13.4s, v0.s[0] // ......................e.....................................................|.....................e.................................................... - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ....................................................e.......................|...................................................e...................... - // mls v24.4s, v13.4s, v8.s[0] // .........................................................e..................|........................................................e................. - // sub v13.4s, v9.4s, v24.4s // ......................................................................e.....|.....................................................................e.... - // add v9.4s, v9.4s, v24.4s // .................................................................e..........|................................................................e......... - // mul v24.4s, v14.4s, v0.s[0] // ..................e.........................................................|.................e........................................................ - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ................e...........................................................|...............e.......................................................... - // mls v24.4s, v14.4s, v8.s[0] // .......................................e....................................|......................................e................................... 
- // sub v14.4s, v10.4s, v24.4s // ...................................................................e........|..................................................................e....... - // add v10.4s, v10.4s, v24.4s // .....................................................e......................|....................................................e..................... - // mul v24.4s, v15.4s, v0.s[0] // ...............e............................................................|..............e........................................................... - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ...................e........................................................|..................e....................................................... - // mls v24.4s, v15.4s, v8.s[0] // .........................e..................................................|........................e................................................. - // sub v15.4s, v11.4s, v24.4s // .*..........................................................................|*......................................................................... - // add v11.4s, v11.4s, v24.4s // ................................e...........................................|...............................e.......................................... - // mul v24.4s, v16.4s, v0.s[0] // ..........................e.................................................|.........................e................................................ - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ........................e...................................................|.......................e.................................................. - // mls v24.4s, v16.4s, v8.s[0] // .................................e..........................................|................................e......................................... 
- // sub v16.4s, v12.4s, v24.4s // ...........................................e................................|..........................................e............................... - // add v12.4s, v12.4s, v24.4s // .........................................e..................................|........................................e................................. - // mul v24.4s, v11.4s, v0.s[2] // ......................................e.....................................|.....................................e.................................... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ...................................e........................................|..................................e....................................... - // mls v24.4s, v11.4s, v8.s[0] // ................................................................e...........|...............................................................e.......... - // sub v11.4s, v9.4s, v24.4s // .........................................................................e..|........................................................................e. - // add v9.4s, v9.4s, v24.4s // ...........................................................................e|.......................................................................... - // mul v24.4s, v12.4s, v0.s[2] // ..............................................e.............................|.............................................e............................ - // sqrdmulh v12.4s, v12.4s, v0.s[3] // .............................................e..............................|............................................e............................. - // mls v24.4s, v12.4s, v8.s[0] // ...................................................e........................|..................................................e....................... 
- // sub v12.4s, v10.4s, v24.4s // ..........................................................e.................|.........................................................e................ - // add v10.4s, v10.4s, v24.4s // .......*....................................................................|......*................................................................... - // mul v24.4s, v15.4s, v1.s[0] // ..............*.............................................................|.............*............................................................ - // sqrdmulh v15.4s, v15.4s, v1.s[1] // .........*..................................................................|........*................................................................. - // mls v24.4s, v15.4s, v8.s[0] // ....................*.......................................................|...................*...................................................... - // sub v15.4s, v13.4s, v24.4s // ..............................*.............................................|.............................*............................................ - // add v13.4s, v13.4s, v24.4s // ........................................................*...................|.......................................................*.................. - // mul v24.4s, v16.4s, v1.s[0] // ................................................e...........................|...............................................e.......................... - // sqrdmulh v16.4s, v16.4s, v1.s[1] // ..................................................e.........................|.................................................e........................ - // mls v24.4s, v16.4s, v8.s[0] // .......................................................e....................|......................................................e................... 
- // sub v16.4s, v14.4s, v24.4s // ..*.........................................................................|.*........................................................................ - // add v14.4s, v14.4s, v24.4s // ............................*...............................................|...........................*.............................................. - // mul v24.4s, v10.4s, v1.s[2] // ...........*................................................................|..........*............................................................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .....................*......................................................|....................*..................................................... - // mls v24.4s, v10.4s, v8.s[0] // ...........................*................................................|..........................*............................................... - // sub v10.4s, v9.4s, v24.4s // ..................................*.........................................|.................................*........................................ - // add v9.4s, v9.4s, v24.4s // ........................................*...................................|.......................................*.................................. - // mul v24.4s, v12.4s, v2.s[0] // .............................................................e..............|............................................................e............. - // sqrdmulh v12.4s, v12.4s, v2.s[1] // ..............................................................e.............|.............................................................e............ - // mls v24.4s, v12.4s, v8.s[0] // .......................................................................e....|......................................................................e... 
- // sub v12.4s, v11.4s, v24.4s // .................................................*..........................|................................................*......................... - // add v11.4s, v11.4s, v24.4s // .....*......................................................................|....*..................................................................... - // mul v24.4s, v14.4s, v2.s[2] // ...............................................*............................|..............................................*........................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ..........................................*.................................|.........................................*................................ - // mls v24.4s, v14.4s, v8.s[0] // ......................................................*.....................|.....................................................*.................... - // sub v14.4s, v13.4s, v24.4s // ...........................................................*................|..........................................................*............... - // add v13.4s, v13.4s, v24.4s // ............................................................*...............|...........................................................*.............. - // mul v24.4s, v16.4s, v3.s[0] // .......................*....................................................|......................*................................................... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ........*...................................................................|.......*.................................................................. - // mls v24.4s, v16.4s, v8.s[0] // .............................*..............................................|............................*............................................. 
- // sub v16.4s, v15.4s, v24.4s // ....................................................................*.......|...................................................................*...... - // add v15.4s, v15.4s, v24.4s // ....................................*.......................................|...................................*...................................... - // str q9, [x0], #(16) // ............................................*...............................|...........................................*.............................. - // str q10, [x0, #(-16 + 1*(1024/8))] // .....................................*......................................|....................................*..................................... - // str q11, [x0, #(-16 + 2*(1024/8))] // ............*...............................................................|...........*.............................................................. - // str q12, [x0, #(-16 + 3*(1024/8))] // ...............................................................*............|..............................................................*........... - // str q13, [x0, #(-16 + 4*(1024/8))] // ..................................................................*.........|.................................................................*........ - // str q14, [x0, #(-16 + 5*(1024/8))] // .....................................................................*......|....................................................................*..... - // str q15, [x0, #(-16 + 6*(1024/8))] // ........................................................................*...|.......................................................................*.. 
- // str q16, [x0, #(-16 + 7*(1024/8))] // ..........................................................................*.|.........................................................................* + sqrdmulh v12.4S, v25.4S, v1.S[1] // ......................................e..................................... + sqrdmulh v19.4S, v31.4S, v3.S[1] // ...............................................................e............ + str q27, [x0, #880] // ...........................................................................* + + // ------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q9, [x0, #0] // .........~................................................................'..........*................................................................ + // ldr q10, [x0, #(1*(1024/8))] // ......e...................................................................'.......~................................................................... + // ldr q11, [x0, #(2*(1024/8))] // .................................................e........................'..................................................~........................ + // ldr q12, [x0, #(3*(1024/8))] // ........................e.................................................'.........................~................................................. + // ldr q13, [x0, #(4*(1024/8))] // ...................e......................................................'....................~...................................................... + // ldr q14, [x0, #(5*(1024/8))] // e.........................................................................'.~......................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // .....e....................................................................'......~.................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ..........e...............................................................'...........~............................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ............................e.............................................'.............................~............................................. + // mul v24.4s, v13.4s, v0.s[0] // .~........................................................................'..*........................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ............~.............................................................'.............*............................................................. + // sub v13.4s, v9.4s, v24.4s // .................~........................................................'..................*........................................................ + // add v9.4s, v9.4s, v24.4s // ....................~.....................................................'.....................*..................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...........e..............................................................'............~.............................................................. + // mul v24.4s, v14.4s, v0.s[0] // ..............e...........................................................'...............~........................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................e...............................................'...........................~............................................... 
+ // sub v14.4s, v10.4s, v24.4s // ...................................e......................................'....................................~...................................... + // add v10.4s, v10.4s, v24.4s // ....~.....................................................................'.....*..................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...............e..........................................................'................~.......................................................... + // mul v24.4s, v15.4s, v0.s[0] // ..................e.......................................................'...................~....................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................e.......................'...................................................~....................... + // sub v15.4s, v11.4s, v24.4s // ..............................................................e...........'...............................................................~........... + // add v11.4s, v11.4s, v24.4s // ...........................................................e..............'............................................................~.............. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .........................e................................................'..........................~................................................ + // mul v24.4s, v16.4s, v0.s[0] // .....................e....................................................'......................~.................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................e.........................................'.................................~......................................... 
+ // sub v16.4s, v12.4s, v24.4s // .......................................e..................................'........................................~.................................. + // add v12.4s, v12.4s, v24.4s // ........~.................................................................'.........*................................................................. + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ..~.......................................................................'...*....................................................................... + // mul v24.4s, v11.4s, v0.s[2] // .............................~............................................'..............................*............................................ + // mls v24.4s, v27.4s, v8.s[0] // ....................................~.....................................'.....................................*..................................... + // sub v11.4s, v9.4s, v24.4s // .........................................~................................'..........................................*................................ + // add v9.4s, v9.4s, v24.4s // ..........................................~...............................'...........................................*............................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .............~............................................................'..............*............................................................ + // mul v24.4s, v12.4s, v0.s[2] // ................~.........................................................'.................*......................................................... + // mls v24.4s, v27.4s, v8.s[0] // .......................~..................................................'........................*.................................................. 
+ // sub v12.4s, v10.4s, v24.4s // ...............................~..........................................'................................*.......................................... + // add v10.4s, v10.4s, v24.4s // ..............................~...........................................'...............................*........................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .......................................................................e..'........................................................................~.. + // mul v24.4s, v15.4s, v1.s[0] // ....................................................................e.....'.....................................................................~..... + // mls v24.4s, v27.4s, v8.s[0] // ...~......................................................................'....*...................................................................... + // sub v15.4s, v13.4s, v24.4s // ........................................................~.................'.........................................................*................. + // add v13.4s, v13.4s, v24.4s // ......................~...................................................'.......................*................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ................................................e.........................'.................................................~......................... + // mul v24.4s, v16.4s, v1.s[0] // ...........................................e..............................'............................................~.............................. + // mls v24.4s, v27.4s, v8.s[0] // .........................................................e................'..........................................................~................ 
+ // sub v16.4s, v14.4s, v24.4s // .................................................................e........'..................................................................~........ + // add v14.4s, v14.4s, v24.4s // .....................................................................e....'......................................................................~.... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................................~........................................'..................................*........................................ + // mul v24.4s, v10.4s, v1.s[2] // ..................................~.......................................'...................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................~.................................'.........................................*................................. + // sub v10.4s, v9.4s, v24.4s // ...............................................~..........................'................................................*.......................... + // add v9.4s, v9.4s, v24.4s // .............................................~............................'..............................................*............................ + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .....................................~....................................'......................................*.................................... + // mul v24.4s, v12.4s, v2.s[0] // ......................................~...................................'.......................................*................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................~.............................'.............................................*............................. 
+ // sub v12.4s, v11.4s, v24.4s // .....................................................~....................'......................................................*.................... + // add v11.4s, v11.4s, v24.4s // ...................................................~......................'....................................................*...................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ..........................................................................'*.......................................................................... + // mul v24.4s, v14.4s, v2.s[2] // ..........................................................................*........................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .......~..................................................................'........*.................................................................. + // sub v14.4s, v13.4s, v24.4s // ............................................................~.............'.............................................................*............. + // add v13.4s, v13.4s, v24.4s // ...........................~..............................................'............................*.............................................. + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ........................................................................e.'.........................................................................~. + // mul v24.4s, v16.4s, v3.s[0] // ..............................................~...........................'...............................................*........................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................~...................'.......................................................*................... 
+ // sub v16.4s, v15.4s, v24.4s // ..................................................................~.......'...................................................................*....... + // add v15.4s, v15.4s, v24.4s // ...............................................................~..........'................................................................*.......... + // str q9, [x0], #(16) // ....................................................~.....................'.....................................................*..................... + // str q10, [x0, #(-16 + 1*(1024/8))] // .......................................................~..................'........................................................*.................. + // str q11, [x0, #(-16 + 2*(1024/8))] // ..........................................................~...............'...........................................................*............... + // str q12, [x0, #(-16 + 3*(1024/8))] // .............................................................~............'..............................................................*............ + // str q13, [x0, #(-16 + 4*(1024/8))] // ................................................................~.........'.................................................................*......... + // str q14, [x0, #(-16 + 5*(1024/8))] // ...................................................................~......'....................................................................*...... + // str q15, [x0, #(-16 + 6*(1024/8))] // ......................................................................~...'.......................................................................*... 
+ // str q16, [x0, #(-16 + 7*(1024/8))] // .........................................................................~'..........................................................................* sub count, count, #1 cbnz count, layer123_start - sub v4.4S, v17.4S, v4.4S // *................................. - // gap // .................................. - add v20.4S, v31.4S, v15.4S // ...*.............................. - // gap // .................................. - // gap // .................................. - sub v17.4S, v14.4S, v30.4S // .*................................ - // gap // .................................. - add v7.4S, v14.4S, v30.4S // .............*.................... - sqrdmulh v29.4S, v4.4S, v1.S[1] // .....*............................ - mul v15.4S, v4.4S, v1.S[0] // ........*......................... - // gap // .................................. - // gap // .................................. - mul v14.4S, v20.4S, v1.S[2] // ......*........................... - sqrdmulh v18.4S, v20.4S, v1.S[3] // ..........*....................... - // gap // .................................. - // gap // .................................. - sqrdmulh v28.4S, v17.4S, v3.S[1] // ....*............................. - mul v30.4S, v17.4S, v3.S[0] // ...........*...................... - // gap // .................................. - // gap // .................................. - mul v4.4S, v7.4S, v2.S[2] // ......................*........... - // gap // .................................. - // gap // .................................. - mls v15.4S, v29.4S, v8.S[0] // .........*........................ - sqrdmulh v16.4S, v7.4S, v2.S[3] // ....................*............. - // gap // .................................. - // gap // .................................. - mls v14.4S, v18.4S, v8.S[0] // ............*..................... - mls v30.4S, v28.4S, v8.S[0] // ..............*................... - sub v26.4S, v24.4S, v27.4S // .......................*.......... 
- // gap // .................................. - // gap // .................................. - sub v11.4S, v22.4S, v15.4S // ...............*.................. - // gap // .................................. - // gap // .................................. - add v28.4S, v24.4S, v27.4S // ..*............................... - add v21.4S, v19.4S, v14.4S // ...................*.............. - mls v4.4S, v16.4S, v8.S[0] // ........................*......... - // gap // .................................. - str q26, [x0, #384] // ............................*..... - sub v16.4S, v11.4S, v30.4S // ..............................*... - // gap // .................................. - add v12.4S, v22.4S, v15.4S // .........................*........ - str q28, [x0, #256] // .......*.......................... - str q21, [x0], #(16) // .....................*............ - sub v20.4S, v19.4S, v14.4S // ................*................. - // gap // .................................. - // gap // .................................. - sub v29.4S, v12.4S, v4.4S // ..........................*....... - str q16, [x0, #880] // .................................* - add v4.4S, v12.4S, v4.4S // ...........................*...... - // gap // .................................. - // gap // .................................. - str q20, [x0, #112] // ..................*............... - // gap // .................................. - add v31.4S, v11.4S, v30.4S // .................*................ - str q29, [x0, #624] // ...............................*.. - // gap // .................................. - // gap // .................................. - // gap // .................................. - str q31, [x0, #752] // ................................*. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. 
- // gap // .................................. - str q4, [x0, #496] // .............................*.... - - // original source code - // sub v16.4S, v17.4S, v4.4S // *................................. - // sub v23.4S, v14.4S, v30.4S // ..*............................... - // add v4.4S, v24.4S, v27.4S // .................*................ - // add v10.4S, v31.4S, v15.4S // .*................................ - // sqrdmulh v21.4S, v23.4S, v3.S[1] // ........*......................... - // sqrdmulh v18.4S, v16.4S, v1.S[1] // ....*............................. - // mul v15.4S, v10.4S, v1.S[2] // ......*........................... - // str q4, [x0, #256] // .......................*.......... - // mul v9.4S, v16.4S, v1.S[0] // .....*............................ - // mls v9.4S, v18.4S, v8.S[0] // ...........*...................... - // sqrdmulh v28.4S, v10.4S, v1.S[3] // .......*.......................... - // mul v23.4S, v23.4S, v3.S[0] // .........*........................ - // mls v15.4S, v28.4S, v8.S[0] // .............*.................... - // add v14.4S, v14.4S, v30.4S // ...*.............................. - // mls v23.4S, v21.4S, v8.S[0] // ..............*................... - // sub v5.4S, v22.4S, v9.4S // ................*................. - // sub v28.4S, v19.4S, v15.4S // .........................*........ - // add v18.4S, v5.4S, v23.4S // ..............................*... - // str q28, [x0, #128] // .............................*.... - // add v12.4S, v19.4S, v15.4S // ..................*............... - // sqrdmulh v19.4S, v14.4S, v2.S[3] // ............*..................... - // str q12, [x0], #(16) // ........................*......... - // mul v28.4S, v14.4S, v2.S[2] // ..........*....................... - // sub v29.4S, v24.4S, v27.4S // ...............*.................. - // mls v28.4S, v19.4S, v8.S[0] // ...................*.............. - // add v27.4S, v22.4S, v9.4S // ......................*........... 
- // sub v9.4S, v27.4S, v28.4S // ..........................*....... - // add v25.4S, v27.4S, v28.4S // ............................*..... - // str q29, [x0, #368] // ....................*............. - // str q25, [x0, #496] // .................................* - // sub v5.4S, v5.4S, v23.4S // .....................*............ - // str q9, [x0, #624] // ...............................*.. - // str q18, [x0, #752] // ................................*. - // str q5, [x0, #880] // ...........................*...... + // Instructions: 47 + // Expected cycles: 23 + // Expected IPC: 2.04 + // + // Wall time: 3.25s + // User time: 3.25s + // + // ------------- original position --------------> + // 0 25 + // |------------------------|--------------------- + mul v10.4S, v31.4S, v3.S[0] // ..............................*................ + add v27.4S, v30.4S, v13.4S // .......*....................................... + ldr q26, [x0, #0] // ........*...................................... + // gap // ............................................... + mul v31.4S, v11.4S, v0.S[2] // .................*............................. + mul v30.4S, v16.4S, v0.S[0] // ..*............................................ + // gap // ............................................... + // gap // ............................................... + mul v16.4S, v27.4S, v0.S[2] // ...........*................................... + sqrdmulh v27.4S, v27.4S, v0.S[3] // ..........*.................................... + // gap // ............................................... + // gap // ............................................... + mls v10.4S, v19.4S, v8.S[0] // ...................................*........... + sqrdmulh v4.4S, v11.4S, v0.S[3] // ...*........................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v15.4S, v22.4S, v2.S[3] // .*............................................. 
+ // gap // ............................................... + // gap // ............................................... + mls v30.4S, v29.4S, v8.S[0] // .........*..................................... + mls v17.4S, v12.4S, v8.S[0] // ....*.......................................... + mls v16.4S, v27.4S, v8.S[0] // ...............*............................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v31.4S, v4.4S, v8.S[0] // ......................*........................ + // gap // ............................................... + add v24.4S, v14.4S, v9.4S // .....*......................................... + sub v28.4S, v26.4S, v30.4S // ............*.................................. + add v30.4S, v26.4S, v30.4S // .............*................................. + // gap // ............................................... + // gap // ............................................... + mul v22.4S, v22.4S, v2.S[2] // *.............................................. + add v19.4S, v24.4S, v16.4S // ..................*............................ + // gap // ............................................... + // gap // ............................................... + sub v7.4S, v30.4S, v31.4S // ..........................*.................... + // gap // ............................................... + // gap // ............................................... + sub v13.4S, v28.4S, v17.4S // .....................................*......... + sqrdmulh v4.4S, v19.4S, v1.S[3] // ....................*.......................... + sub v14.4S, v24.4S, v16.4S // ...................*........................... + // gap // ............................................... + // gap // ............................................... + add v5.4S, v13.4S, v10.4S // .........................................*..... 
+ mul v29.4S, v19.4S, v1.S[2] // .....................*......................... + // gap // ............................................... + // gap // ............................................... + mul v21.4S, v14.4S, v2.S[0] // ........................*...................... + // gap // ............................................... + // gap // ............................................... + mls v22.4S, v15.4S, v8.S[0] // ......*........................................ + add v19.4S, v28.4S, v17.4S // ..............*................................ + str q5, [x0, #768] // .............................................*. + // gap // ............................................... + sqrdmulh v16.4S, v14.4S, v2.S[1] // .......................*....................... + sub v18.4S, v13.4S, v10.4S // ...........................................*... + mls v29.4S, v4.4S, v8.S[0] // .........................*..................... + // gap // ............................................... + // gap // ............................................... + add v9.4S, v19.4S, v22.4S // ................*.............................. + add v26.4S, v30.4S, v31.4S // ...........................*................... + // gap // ............................................... + // gap // ............................................... + str q18, [x0, #896] // ..............................................* + mls v21.4S, v16.4S, v8.S[0] // ............................*.................. + // gap // ............................................... + sub v24.4S, v19.4S, v22.4S // .......................................*....... + sub v31.4S, v26.4S, v29.4S // ...............................*............... + str q9, [x0, #512] // ..........................................*.... + add v30.4S, v26.4S, v29.4S // .............................*................. + // gap // ............................................... + // gap // ............................................... 
+ // gap // ............................................... + // gap // ............................................... + str q24, [x0, #640] // ............................................*.. + str q31, [x0, #128] // ....................................*.......... + sub v6.4S, v7.4S, v21.4S // ..................................*............ + add v7.4S, v7.4S, v21.4S // ................................*.............. + // gap // ............................................... + str q30, [x0], #(16) // .................................*............. + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + str q6, [x0, #368] // ........................................*...... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + str q7, [x0, #240] // ......................................*........ + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + + // ---------------- new position ----------------> + // 0 25 + // |------------------------|--------------------- + // mul v21.4S, v22.4S, v2.S[2] // .................*............................. + // sqrdmulh v27.4S, v22.4S, v2.S[3] // .........*..................................... + // mul v5.4S, v16.4S, v0.S[0] // ....*.......................................... + // sqrdmulh v28.4S, v11.4S, v0.S[3] // ........*...................................... + // mls v17.4S, v12.4S, v8.S[0] // ...........*................................... + // add v12.4S, v14.4S, v9.4S // ..............*................................ + // mls v21.4S, v27.4S, v8.S[0] // ..........................*.................... 
+ // add v27.4S, v30.4S, v13.4S // .*............................................. + // ldr q13, [x0, #0] // ..*............................................ + // mls v5.4S, v29.4S, v8.S[0] // ..........*.................................... + // sqrdmulh v7.4S, v27.4S, v0.S[3] // ......*........................................ + // mul v10.4S, v27.4S, v0.S[2] // .....*......................................... + // sub v22.4S, v13.4S, v5.4S // ...............*............................... + // add v15.4S, v13.4S, v5.4S // ................*.............................. + // add v24.4S, v22.4S, v17.4S // ...........................*................... + // mls v10.4S, v7.4S, v8.S[0] // ............*.................................. + // add v4.4S, v24.4S, v21.4S // ................................*.............. + // mul v5.4S, v11.4S, v0.S[2] // ...*........................................... + // add v26.4S, v12.4S, v10.4S // ..................*............................ + // sub v20.4S, v12.4S, v10.4S // ......................*........................ + // sqrdmulh v12.4S, v26.4S, v1.S[3] // .....................*......................... + // mul v23.4S, v26.4S, v1.S[2] // ........................*...................... + // mls v5.4S, v28.4S, v8.S[0] // .............*................................. + // sqrdmulh v10.4S, v20.4S, v2.S[1] // .............................*................. + // mul v26.4S, v20.4S, v2.S[0] // .........................*..................... + // mls v23.4S, v12.4S, v8.S[0] // ...............................*............... + // sub v11.4S, v15.4S, v5.4S // ...................*........................... + // add v28.4S, v15.4S, v5.4S // .................................*............. + // mls v26.4S, v10.4S, v8.S[0] // ...................................*........... + // add v12.4S, v28.4S, v23.4S // .......................................*....... 
+ // mul v10.4S, v31.4S, v3.S[0] // *.............................................. + // sub v20.4S, v28.4S, v23.4S // .....................................*......... + // add v23.4S, v11.4S, v26.4S // ...........................................*... + // str q12, [x0], #(16) // ............................................*.. + // sub v26.4S, v11.4S, v26.4S // ..........................................*.... + // mls v10.4S, v19.4S, v8.S[0] // .......*....................................... + // str q20, [x0, #112] // .........................................*..... + // sub v28.4S, v22.4S, v17.4S // ....................*.......................... + // str q23, [x0, #240] // ..............................................* + // sub v17.4S, v24.4S, v21.4S // ....................................*.......... + // str q26, [x0, #368] // .............................................*. + // add v19.4S, v28.4S, v10.4S // .......................*....................... + // str q4, [x0, #496] // ......................................*........ + // sub v27.4S, v28.4S, v10.4S // ..............................*................ + // str q17, [x0, #624] // ........................................*...... + // str q19, [x0, #752] // ............................*.................. + // str q27, [x0, #880] // ..................................*............ restore inp, STACK0 @@ -842,870 +862,846 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - ldr q16, [x2, #112] // .*.......................................................................................... - ldr q22, [x4], #64 // *........................................................................................... - // gap // ............................................................................................ - // gap // ............................................................................................ 
- ldr q24, [x5, #112] // ......................................................*..................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - ldr q17, [x2, #96] // ..*......................................................................................... - // gap // ............................................................................................ - ldr q3, [x2, #80] // .........*.................................................................................. - ldr q4, [x2, #64] // ...*........................................................................................ - // gap // ............................................................................................ - ldr q11, [x4, #-16] // .....*...................................................................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v27.4S, v16.4S, v22.S[0] // .......*.................................................................................... - sqrdmulh v30.4S, v16.4S, v22.S[1] // ........*................................................................................... - ldr q2, [x4, #-32] // ...........*................................................................................ - ldr q13, [x4, #-48] // ............*............................................................................... - ldr q10, [x1, #112] // ...............*............................................................................ 
- mul v9.4S, v17.4S, v22.S[0] // ..........*................................................................................. - sqrdmulh v21.4S, v17.4S, v22.S[1] // ..............*............................................................................. - // gap // ............................................................................................ - ldr q26, [x1, #64] // .................................*.......................................................... - mul v28.4S, v4.4S, v22.S[0] // .............*.............................................................................. - mul v1.4S, v3.4S, v22.S[0] // ....................*....................................................................... - // gap // ............................................................................................ - mls v27.4S, v30.4S, v8.S[0] // ................*........................................................................... - sqrdmulh v4.4S, v4.4S, v22.S[1] // .................*.......................................................................... - ldr q30, [x1, #96] // ....*....................................................................................... - // gap // ............................................................................................ - sqrdmulh v16.4S, v3.4S, v22.S[1] // ..................*......................................................................... - ldr q18, [x5, #48] // ................................................................................*........... - mls v9.4S, v21.4S, v8.S[0] // ...................*........................................................................ - // gap // ............................................................................................ - ldr q6, [x5, #128] // ...................................................................*........................ 
- // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - sub v31.4S, v10.4S, v27.4S // ......................*..................................................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v28.4S, v4.4S, v8.S[0] // ..............................*............................................................. - // gap // ............................................................................................ - mls v1.4S, v16.4S, v8.S[0] // ...........................*................................................................ - sub v16.4S, v30.4S, v9.4S // ............................*............................................................... - // gap // ............................................................................................ - sqrdmulh v29.4S, v31.4S, v13.S[1] // .........................*.................................................................. - mul v4.4S, v31.4S, v13.S[0] // ..........................*................................................................. - ldr q31, [x1, #80] // .....................*...................................................................... - // gap // ............................................................................................ - mul v15.4S, v16.4S, v13.S[0] // ...............................*............................................................ - sqrdmulh v16.4S, v16.4S, v13.S[1] // ....................................*....................................................... 
- // gap // ............................................................................................ - // gap // ............................................................................................ - sub v21.4S, v26.4S, v28.4S // ........................................*................................................... - add v12.4S, v26.4S, v28.4S // .........................................................................................*.. - // gap // ............................................................................................ - // gap // ............................................................................................ - add v3.4S, v30.4S, v9.4S // .......................................*.................................................... - mls v4.4S, v29.4S, v8.S[0] // ................................*........................................................... - ldr q9, [x5, #96] // ......*..................................................................................... - // gap // ............................................................................................ - mls v15.4S, v16.4S, v8.S[0] // .............................................*.............................................. - sub v5.4S, v31.4S, v1.4S // ...................................*........................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - sqrdmulh v28.4S, v3.4S, v22.S[3] // ...................................................................................*........ - mul v23.4S, v3.4S, v22.S[2] // ......................................................................................*..... - // gap // ............................................................................................ 
- // gap // ............................................................................................ - add v26.4S, v5.4S, v4.4S // ......................................*..................................................... - sub v14.4S, v5.4S, v4.4S // ..........................................*................................................. - // gap // ............................................................................................ - // gap // ............................................................................................ - sub v3.4S, v21.4S, v15.4S // .........................................................*.................................. - add v20.4S, v10.4S, v27.4S // .......................*.................................................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v25.4S, v26.4S, v2.S[2] // ...........................................*................................................ - sqrdmulh v16.4S, v26.4S, v2.S[3] // ............................................*............................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v27.4S, v14.4S, v11.S[0] // ...............................................*............................................ - sqrdmulh v10.4S, v14.4S, v11.S[1] // ................................................*........................................... - // gap // ............................................................................................ - // gap // ............................................................................................ 
- mul v19.4S, v20.4S, v22.S[2] // .............................*.............................................................. - add v30.4S, v21.4S, v15.4S // ..................................................*......................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v25.4S, v16.4S, v8.S[0] // .................................................*.......................................... - sqrdmulh v16.4S, v20.4S, v22.S[3] // ........................*................................................................... - ldr q22, [x5], #(12*16) // ...........................................................................*................ - // gap // ............................................................................................ - mls v27.4S, v10.4S, v8.S[0] // ....................................................*....................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - add v29.4S, v30.4S, v25.4S // .....................................................*...................................... 
- sub v4.4S, v30.4S, v25.4S // .......................................................*.................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - add v10.4S, v3.4S, v27.4S // ............................................................*............................... - sub v25.4S, v3.4S, v27.4S // ...........................................................*................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - trn1 v3.4S, v29.4S, v4.4S // ...............................................................*............................ - trn2 v15.4S, v29.4S, v4.4S // ........................................................*................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - trn2 v29.4S, v10.4S, v25.4S // ..............................................................*............................. - trn1 v10.4S, v10.4S, v25.4S // .............................................................*.............................. - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v19.4S, v16.4S, v8.S[0] // .....................................*...................................................... - // gap // ............................................................................................ 
- // gap // ............................................................................................ - // gap // ............................................................................................ - trn2 v30.2D, v15.2D, v29.2D // ................................................................*........................... - trn2 v4.2D, v3.2D, v10.2D // .................................................................*.......................... - // gap // ............................................................................................ - // gap // ............................................................................................ - trn1 v27.2D, v3.2D, v10.2D // ..................................................................................*......... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v16.4S, v30.4S, v9.4S // ......................................................................*..................... - sqrdmulh v3.4S, v30.4S, v24.4S // .......................................................................*.................... - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v21.4S, v4.4S, v9.4S // ..........................................................................*................. - ldr q9, [x5, #-32] // .........................................*.................................................. - sqrdmulh v10.4S, v4.4S, v24.4S // .....................................................................*...................... 
- // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v16.4S, v3.4S, v8.S[0] // ........................................................................*................... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - trn1 v15.2D, v15.2D, v29.2D // ..................................................................*......................... - ldr q0, [x5, #-48] // ....................................................................*....................... - mls v21.4S, v10.4S, v8.S[0] // ..............................................................................*............. - // gap // ............................................................................................ - add v29.4S, v31.4S, v1.4S // ..................................*......................................................... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ 
- add v3.4S, v15.4S, v16.4S // .............................................................................*.............. - sub v4.4S, v15.4S, v16.4S // ............................................................................*............... - ldr q15, [x5, #-16] // ...................................................*........................................ - // gap // ............................................................................................ - add v25.4S, v27.4S, v21.4S // .......................................................................................*.... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v16.4S, v3.4S, v6.4S // ...............................................................................*............ - sqrdmulh v3.4S, v3.4S, v0.4S // .................................................................................*.......... - // gap // ............................................................................................ - // gap // ............................................................................................ - add v6.4S, v29.4S, v19.4S // ..............................................*............................................. - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ 
- // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v16.4S, v3.4S, v8.S[0] // .....................................................................................*...... - sqrdmulh v14.4S, v6.4S, v13.S[3] // ..........................................................*................................. - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v20.4S, v6.4S, v13.S[2] // .........................................................................*.................. - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - sqrdmulh v3.4S, v4.4S, v15.4S // ....................................................................................*....... - mul v4.4S, v4.4S, v9.4S // ........................................................................................*... - // gap // ............................................................................................ - // gap // ............................................................................................ - add v9.4S, v25.4S, v16.4S // ..........................................................................................*. 
- sub v10.4S, v25.4S, v16.4S // ...........................................................................................* - // gap // ............................................................................................ - // gap // ............................................................................................ - - // original source code - // ldr q13, [x4], #64 // .*.......................................................................................... - // ldr q30, [x2, #112] // *........................................................................................... - // ldr q0, [x2, #96] // ...*........................................................................................ - // ldr q7, [x2, #64] // .....*...................................................................................... - // ldr q14, [x1, #96] // ...................*........................................................................ - // ldr q26, [x4, #-16] // ......*..................................................................................... - // ldr q1, [x5, #96] // .....................................*...................................................... - // mul v21.4S, v30.4S, v13.S[0] // .......*.................................................................................... - // sqrdmulh v29.4S, v30.4S, v13.S[1] // ........*................................................................................... - // ldr q27, [x2, #80] // ....*....................................................................................... - // mul v31.4S, v0.4S, v13.S[0] // ............*............................................................................... - // ldr q2, [x4, #-32] // .........*.................................................................................. - // ldr q28, [x4, #-48] // ..........*................................................................................. 
- // mul v30.4S, v7.4S, v13.S[0] // ...............*............................................................................ - // sqrdmulh v15.4S, v0.4S, v13.S[1] // .............*.............................................................................. - // ldr q0, [x1, #112] // ...........*................................................................................ - // mls v21.4S, v29.4S, v8.S[0] // .................*.......................................................................... - // sqrdmulh v29.4S, v7.4S, v13.S[1] // ..................*......................................................................... - // sqrdmulh v25.4S, v27.4S, v13.S[1] // ....................*....................................................................... - // mls v31.4S, v15.4S, v8.S[0] // ......................*..................................................................... - // mul v7.4S, v27.4S, v13.S[0] // ................*........................................................................... - // ldr q24, [x1, #80] // ..............................*............................................................. - // sub v15.4S, v0.4S, v21.4S // ........................*................................................................... - // add v23.4S, v0.4S, v21.4S // .............................................*.............................................. - // sqrdmulh v3.4S, v23.4S, v13.S[3] // .....................................................*...................................... - // sqrdmulh v6.4S, v15.4S, v28.S[1] // ............................*............................................................... - // mul v16.4S, v15.4S, v28.S[0] // .............................*.............................................................. - // mls v7.4S, v25.4S, v8.S[0] // ..........................*................................................................. 
- // sub v9.4S, v14.4S, v31.4S // ...........................*................................................................ - // mul v19.4S, v23.4S, v13.S[2] // ..................................................*......................................... - // mls v30.4S, v29.4S, v8.S[0] // .........................*.................................................................. - // mul v12.4S, v9.4S, v28.S[0] // ...............................*............................................................ - // mls v16.4S, v6.4S, v8.S[0] // ....................................*....................................................... - // ldr q10, [x1, #64] // ..............*............................................................................. - // add v29.4S, v24.4S, v7.4S // .............................................................................*.............. - // sub v24.4S, v24.4S, v7.4S // .......................................*.................................................... - // sqrdmulh v4.4S, v9.4S, v28.S[1] // ................................*........................................................... - // mls v19.4S, v3.4S, v8.S[0] // ................................................................*........................... - // add v21.4S, v24.4S, v16.4S // ..........................................*................................................. - // add v11.4S, v14.4S, v31.4S // ...................................*........................................................ - // sub v14.4S, v10.4S, v30.4S // .................................*.......................................................... - // ldr q15, [x5, #160] // .......................................................................*.................... - // sub v7.4S, v24.4S, v16.4S // ...........................................*................................................ 
- // mul v24.4S, v21.4S, v2.S[2] // ..............................................*............................................. - // sqrdmulh v3.4S, v21.4S, v2.S[3] // ...............................................*............................................ - // mls v12.4S, v4.4S, v8.S[0] // ......................................*..................................................... - // add v21.4S, v29.4S, v19.4S // ....................................................................................*....... - // mul v4.4S, v7.4S, v26.S[0] // ................................................*........................................... - // sqrdmulh v16.4S, v7.4S, v26.S[1] // .................................................*.......................................... - // mls v24.4S, v3.4S, v8.S[0] // ....................................................*....................................... - // add v0.4S, v14.4S, v12.4S // ...................................................*........................................ - // ldr q9, [x5, #176] // ................................................................................*........... - // mls v4.4S, v16.4S, v8.S[0] // .......................................................*.................................... - // add v25.4S, v0.4S, v24.4S // ........................................................*................................... - // ldr q3, [x5, #112] // ..*......................................................................................... - // sub v7.4S, v0.4S, v24.4S // .........................................................*.................................. - // trn2 v23.4S, v25.4S, v7.4S // .............................................................*.............................. - // sub v26.4S, v14.4S, v12.4S // ............................................*............................................... 
- // sqrdmulh v14.4S, v21.4S, v28.S[3] // ......................................................................................*..... - // sub v27.4S, v26.4S, v4.4S // ...........................................................*................................ - // add v16.4S, v26.4S, v4.4S // ..........................................................*................................. - // trn1 v12.4S, v16.4S, v27.4S // ...............................................................*............................ - // trn2 v16.4S, v16.4S, v27.4S // ..............................................................*............................. - // trn1 v27.4S, v25.4S, v7.4S // ............................................................*............................... - // trn2 v26.2D, v23.2D, v16.2D // .................................................................*.......................... - // trn2 v17.2D, v27.2D, v12.2D // ..................................................................*......................... - // trn1 v4.2D, v23.2D, v16.2D // ..........................................................................*................. - // ldr q16, [x5, #128] // .......................*.................................................................... - // ldr q31, [x5, #144] // ...........................................................................*................ - // sqrdmulh v23.4S, v17.4S, v3.4S // ........................................................................*................... - // mul v5.4S, v26.4S, v1.4S // ....................................................................*....................... - // sqrdmulh v22.4S, v26.4S, v3.4S // .....................................................................*...................... - // mls v5.4S, v22.4S, v8.S[0] // .........................................................................*.................. 
- // mul v20.4S, v21.4S, v28.S[2] // .......................................................................................*.... - // mul v21.4S, v17.4S, v1.4S // ......................................................................*..................... - // ldr q22, [x5], #(12*16) // ......................................................*..................................... - // sub v0.4S, v4.4S, v5.4S // ...............................................................................*............ - // add v3.4S, v4.4S, v5.4S // ..............................................................................*............. - // mls v21.4S, v23.4S, v8.S[0] // ............................................................................*............... - // mul v7.4S, v3.4S, v16.4S // ..................................................................................*......... - // ldr q18, [x5, #-144] // .....................*...................................................................... - // sqrdmulh v4.4S, v3.4S, v31.4S // ...................................................................................*........ - // trn1 v27.2D, v27.2D, v12.2D // ...................................................................*........................ - // sqrdmulh v28.4S, v11.4S, v13.S[3] // ........................................*................................................... - // sqrdmulh v3.4S, v0.4S, v9.4S // ........................................................................................*... - // mls v7.4S, v4.4S, v8.S[0] // .....................................................................................*...... - // mul v23.4S, v11.4S, v13.S[2] // .........................................*.................................................. - // add v16.4S, v27.4S, v21.4S // .................................................................................*.......... 
- // mul v4.4S, v0.4S, v15.4S // .........................................................................................*.. - // add v12.4S, v10.4S, v30.4S // ..................................*......................................................... - // add v9.4S, v16.4S, v7.4S // ..........................................................................................*. - // sub v10.4S, v16.4S, v7.4S // ...........................................................................................* + // Instructions: 124 + // Expected cycles: 54 + // Expected IPC: 2.30 + // + // Wall time: 167.95s + // User time: 167.95s + // + // ---------------------------------------------------- original position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q5, [x2, #112] // *........................................................................................................................... + ldr q1, [x4], #64 // .*.......................................................................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + ldr q22, [x1, #96] // .........*.................................................................................................................. + ldr q4, [x2, #96] // ..*......................................................................................................................... + // gap // ............................................................................................................................ 
+ // gap // ............................................................................................................................ + ldr q14, [x5, #96] // ................................................................*........................................................... + // gap // ............................................................................................................................ + ldr q20, [x2, #64] // ...*........................................................................................................................ + // gap // ............................................................................................................................ + ldr q12, [x5, #144] // ................................................................................................*........................... + ldr q21, [x1, #80] // ......*..................................................................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sqrdmulh v17.4S, v5.4S, v1.S[1] // .......*.................................................................................................................... + mul v5.4S, v5.4S, v1.S[0] // ........*................................................................................................................... + // gap // ............................................................................................................................ + ldr q7, [x2, #80] // .....*...................................................................................................................... 
+ ldr q13, [x5], #(12*16) // ...........................................................*................................................................ + // gap // ............................................................................................................................ + sqrdmulh v16.4S, v4.4S, v1.S[1] // ..........*................................................................................................................. + mul v23.4S, v4.4S, v1.S[0] // ...........*................................................................................................................ + ldr q28, [x1, #112] // ....*....................................................................................................................... + // gap // ............................................................................................................................ + sqrdmulh v18.4S, v20.4S, v1.S[1] // ............*............................................................................................................... + mul v10.4S, v20.4S, v1.S[0] // .............*.............................................................................................................. + ldr q31, [x4, #-48] // ..............*............................................................................................................. + mls v5.4S, v17.4S, v8.S[0] // ...............*............................................................................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ // gap // ............................................................................................................................ + mls v23.4S, v16.4S, v8.S[0] // ...................*........................................................................................................ + sqrdmulh v25.4S, v7.4S, v1.S[1] // .................*.......................................................................................................... + ldr q19, [x5, #-32] // ..................................................*......................................................................... + ldr q0, [x5, #-176] // ..............................................................*............................................................. + mls v10.4S, v18.4S, v8.S[0] // ....................*....................................................................................................... + mul v15.4S, v7.4S, v1.S[0] // ................*........................................................................................................... + ldr q4, [x4, #-32] // ..................*......................................................................................................... + // gap // ............................................................................................................................ + sub v2.4S, v28.4S, v5.4S // .......................*.................................................................................................... + add v26.4S, v28.4S, v5.4S // ........................*................................................................................................... + sub v7.4S, v22.4S, v23.4S // ...................................*........................................................................................ + // gap // ............................................................................................................................ 
+ add v27.4S, v22.4S, v23.4S // ..........................*................................................................................................. + ldr q30, [x5, #-64] // ................................................................................*........................................... + sqrdmulh v24.4S, v26.4S, v1.S[3] // ...........................*................................................................................................ + mul v17.4S, v26.4S, v1.S[2] // ............................*............................................................................................... + ldr q20, [x1, #64] // ......................*..................................................................................................... + // gap // ............................................................................................................................ + mls v15.4S, v25.4S, v8.S[0] // .........................*.................................................................................................. + // gap // ............................................................................................................................ + ldr q16, [x5, #-80] // .................................................................*.......................................................... + sqrdmulh v6.4S, v27.4S, v1.S[3] // ...............................*............................................................................................ + mul v28.4S, v27.4S, v1.S[2] // ................................*........................................................................................... + ldr q18, [x4, #-16] // .....................................*...................................................................................... + sqrdmulh v27.4S, v7.4S, v31.S[1] // ...........................................*................................................................................ 
+ ldr q22, [x5, #-144] // ..........................................................................................................................*. + mls v17.4S, v24.4S, v8.S[0] // ..................................*......................................................................................... + sqrdmulh v3.4S, v2.4S, v31.S[1] // ..............................*............................................................................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + add v29.4S, v21.4S, v15.4S // .................................*.......................................................................................... + mul v1.4S, v2.4S, v31.S[0] // .............................*.............................................................................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + add v25.4S, v20.4S, v10.4S // .................................................*.......................................................................... + sub v15.4S, v21.4S, v15.4S // ......................................*..................................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ sub v11.4S, v29.4S, v17.4S // .........................................*.................................................................................. + add v5.4S, v29.4S, v17.4S // ...............................................................*............................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mls v28.4S, v6.4S, v8.S[0] // .......................................*.................................................................................... + mls v1.4S, v3.4S, v8.S[0] // ....................................*....................................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mul v17.4S, v5.4S, v31.S[2] // ....................................................................*....................................................... + sqrdmulh v2.4S, v5.4S, v31.S[3] // .....................................................................*...................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mul v26.4S, v11.4S, v4.S[0] // .............................................*.............................................................................. 
+ sqrdmulh v6.4S, v11.4S, v4.S[1] // ............................................*............................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mul v29.4S, v7.4S, v31.S[0] // ........................................*................................................................................... + sub v5.4S, v15.4S, v1.4S // ..........................................*................................................................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + add v23.4S, v25.4S, v28.4S // .......................................................................*.................................................... + mls v17.4S, v2.4S, v8.S[0] // .........................................................................*.................................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mls v26.4S, v6.4S, v8.S[0] // ....................................................*....................................................................... + sqrdmulh v24.4S, v5.4S, v18.S[1] // ...............................................*............................................................................ 
+ // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sub v7.4S, v25.4S, v28.4S // ......................................................*..................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + add v25.4S, v15.4S, v1.4S // ............................................................*............................................................... + sub v21.4S, v23.4S, v17.4S // .................................................................................*.......................................... + // gap // ............................................................................................................................ + add v1.4S, v23.4S, v17.4S // ..............................................................................*............................................. + // gap // ............................................................................................................................ + sub v3.4S, v7.4S, v26.4S // ........................................................*................................................................... + add v7.4S, v7.4S, v26.4S // .........................................................*.................................................................. + // gap // ............................................................................................................................ 
+ // gap // ............................................................................................................................ + trn1 v11.4S, v1.4S, v21.4S // ....................................................................................*....................................... + // gap // ............................................................................................................................ + mul v2.4S, v5.4S, v18.S[0] // ..............................................*............................................................................. + ldr q9, [x5, #-16] // ....................................................................................................*....................... + trn1 v18.4S, v7.4S, v3.4S // .............................................................*.............................................................. + sqrdmulh v5.4S, v25.4S, v4.S[3] // ...................................................................*........................................................ + ldr q17, [x5, #-128] // ..........................................................................................................*................. + ldr q15, [x5, #-112] // .....................................................................................................................*...... + mul v6.4S, v25.4S, v4.S[2] // ..................................................................*......................................................... + mls v29.4S, v27.4S, v8.S[0] // ................................................*........................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ trn2 v31.4S, v1.4S, v21.4S // .....................................................................................*...................................... + sub v1.4S, v20.4S, v10.4S // ...................................................*........................................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn2 v3.4S, v7.4S, v3.4S // ..............................................................................................*............................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mls v2.4S, v24.4S, v8.S[0] // .....................................................*...................................................................... + mls v6.4S, v5.4S, v8.S[0] // ........................................................................*................................................... + sub v23.4S, v1.4S, v29.4S // .......................................................*.................................................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ add v29.4S, v1.4S, v29.4S // ..........................................................................*................................................. + // gap // ............................................................................................................................ + trn2 v28.2D, v11.2D, v18.2D // ........................................................................................*................................... + sub v20.4S, v23.4S, v2.4S // ......................................................................*..................................................... + add v2.4S, v23.4S, v2.4S // ..........................................................*................................................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sub v1.4S, v29.4S, v6.4S // ...............................................................................*............................................ + add v24.4S, v29.4S, v6.4S // .............................................................................*.............................................. + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn1 v18.2D, v11.2D, v18.2D // ...........................................................................................................*................ 
+ // gap // ............................................................................................................................ + mul v5.4S, v28.4S, v13.4S // .............................................................................................*.............................. + trn2 v6.4S, v24.4S, v1.4S // ..................................................................................*......................................... + trn2 v7.4S, v2.4S, v20.4S // ...........................................................................*................................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn2 v25.2D, v31.2D, v3.2D // ..................................................................................................*......................... + sqrdmulh v28.4S, v28.4S, v0.4S // ...........................................................................................*................................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn1 v4.4S, v24.4S, v1.4S // ...................................................................................*........................................ + trn2 v1.2D, v6.2D, v7.2D // ......................................................................................*..................................... + // gap // ............................................................................................................................ 
+ // gap // ............................................................................................................................ + trn1 v24.4S, v2.4S, v20.4S // ............................................................................*............................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sqrdmulh v10.4S, v25.4S, v0.4S // ........................................................................................................*................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sqrdmulh v21.4S, v1.4S, v16.4S // .........................................................................................*.................................. + mul v0.4S, v1.4S, v14.4S // ..........................................................................................*................................. + mul v11.4S, v25.4S, v13.4S // ......................................................................................................*..................... + trn2 v1.2D, v4.2D, v24.2D // .......................................................................................*.................................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ mls v5.4S, v28.4S, v8.S[0] // ...................................................................................................*........................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn1 v6.2D, v6.2D, v7.2D // .................................................................................................*.......................... + mul v13.4S, v1.4S, v14.4S // .....................................................................................................*...................... + mls v0.4S, v21.4S, v8.S[0] // ...............................................................................................*............................ + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn1 v23.2D, v4.2D, v24.2D // ...................................................................................................................*........ + sqrdmulh v1.4S, v1.4S, v16.4S // ............................................................................................*............................... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + trn1 v7.2D, v31.2D, v3.2D // ..................................................................................................................*......... 
+ // gap // ............................................................................................................................ + sub v28.4S, v18.4S, v5.4S // .............................................................................................................*.............. + // gap // ............................................................................................................................ + mls v11.4S, v10.4S, v8.S[0] // ................................................................................................................*........... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + add v20.4S, v6.4S, v0.4S // .......................................................................................................*.................... + sub v14.4S, v6.4S, v0.4S // ......................................................................................................................*..... + mls v13.4S, v1.4S, v8.S[0] // ............................................................................................................*............... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + sqrdmulh v25.4S, v20.4S, v12.4S // ..............................................................................................................*............. + mul v16.4S, v20.4S, v30.4S // .........................................................................................................*.................. 
+ // gap // ............................................................................................................................ + // gap // ............................................................................................................................ + mul v26.4S, v14.4S, v19.4S // ...........................................................................................................................* + add v27.4S, v7.4S, v11.4S // ........................................................................................................................*... + // gap // ............................................................................................................................ + add x2, x2, #64 // .................................................................................................................*.......... + add v0.4S, v23.4S, v13.4S // .......................................................................................................................*.... + ldr q1, [x5, #-160] // .....................*...................................................................................................... + sub v10.4S, v7.4S, v11.4S // .........................................................................................................................*.. + // gap // ............................................................................................................................ + mls v16.4S, v25.4S, v8.S[0] // ....................................................................................................................*....... + // gap // ............................................................................................................................ + // gap // ............................................................................................................................ 
+ add v3.4S, v18.4S, v5.4S // ...............................................................................................................*............ + + // ------------------------------------------------------ new position -------------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q29, [x2, #112] // *........................................................................................................................... + // ldr q30, [x4], #64 // .*.......................................................................................................................... + // ldr q2, [x2, #96] // ...*........................................................................................................................ + // ldr q4, [x2, #64] // .....*...................................................................................................................... + // ldr q15, [x1, #112] // ..............*............................................................................................................. + // ldr q17, [x2, #80] // ..........*................................................................................................................. + // ldr q19, [x1, #80] // .......*.................................................................................................................... + // sqrdmulh v0.4S, v29.4S, v30.S[1] // ........*................................................................................................................... + // mul v9.4S, v29.4S, v30.S[0] // .........*.................................................................................................................. + // ldr q16, [x1, #96] // ..*......................................................................................................................... 
+ // sqrdmulh v22.4S, v2.4S, v30.S[1] // ............*............................................................................................................... + // mul v14.4S, v2.4S, v30.S[0] // .............*.............................................................................................................. + // sqrdmulh v27.4S, v4.4S, v30.S[1] // ...............*............................................................................................................ + // mul v24.4S, v4.4S, v30.S[0] // ................*........................................................................................................... + // ldr q12, [x4, #-48] // .................*.......................................................................................................... + // mls v9.4S, v0.4S, v8.S[0] // ..................*......................................................................................................... + // mul v29.4S, v17.4S, v30.S[0] // ........................*................................................................................................... + // sqrdmulh v0.4S, v17.4S, v30.S[1] // ....................*....................................................................................................... + // ldr q7, [x4, #-32] // .........................*.................................................................................................. + // mls v14.4S, v22.4S, v8.S[0] // ...................*........................................................................................................ + // mls v24.4S, v27.4S, v8.S[0] // .......................*.................................................................................................... + // ldr q1, [x5, #32] // ........................................................................................................................*... 
+ // ldr q2, [x1, #64] // .................................*.......................................................................................... + // sub v20.4S, v15.4S, v9.4S // ..........................*................................................................................................. + // add v9.4S, v15.4S, v9.4S // ...........................*................................................................................................ + // mls v29.4S, v0.4S, v8.S[0] // ..................................*......................................................................................... + // add v22.4S, v16.4S, v14.4S // .............................*.............................................................................................. + // sqrdmulh v0.4S, v9.4S, v30.S[3] // ...............................*............................................................................................ + // mul v15.4S, v9.4S, v30.S[2] // ................................*........................................................................................... + // mul v17.4S, v20.4S, v12.S[0] // ............................................*............................................................................... + // sqrdmulh v9.4S, v20.4S, v12.S[1] // ..........................................*................................................................................. + // sqrdmulh v27.4S, v22.4S, v30.S[3] // ....................................*....................................................................................... + // mul v4.4S, v22.4S, v30.S[2] // .....................................*...................................................................................... + // add v25.4S, v19.4S, v29.4S // ...........................................*................................................................................ 
+ // mls v15.4S, v0.4S, v8.S[0] // .........................................*.................................................................................. + // sub v0.4S, v16.4S, v14.4S // ............................*............................................................................................... + // mls v17.4S, v9.4S, v8.S[0] // ..................................................*......................................................................... + // ldr q16, [x4, #-16] // ......................................*..................................................................................... + // sub v19.4S, v19.4S, v29.4S // ..............................................*............................................................................. + // mls v4.4S, v27.4S, v8.S[0] // .................................................*.......................................................................... + // mul v30.4S, v0.4S, v12.S[0] // .......................................................*.................................................................... + // sub v9.4S, v25.4S, v15.4S // ...............................................*............................................................................ + // sub v22.4S, v19.4S, v17.4S // ........................................................*................................................................... + // sqrdmulh v14.4S, v0.4S, v12.S[1] // .......................................*.................................................................................... + // sqrdmulh v27.4S, v9.4S, v7.S[1] // ......................................................*..................................................................... + // mul v9.4S, v9.4S, v7.S[0] // .....................................................*...................................................................... 
+ // mul v0.4S, v22.4S, v16.S[0] // ....................................................................*....................................................... + // sqrdmulh v31.4S, v22.4S, v16.S[1] // ............................................................*............................................................... + // mls v30.4S, v14.4S, v8.S[0] // ...........................................................................*................................................ + // add v29.4S, v2.4S, v24.4S // .............................................*.............................................................................. + // ldr q20, [x5, #160] // .....................*...................................................................................................... + // sub v2.4S, v2.4S, v24.4S // .............................................................................*.............................................. + // mls v9.4S, v27.4S, v8.S[0] // ...........................................................*................................................................ + // mls v0.4S, v31.4S, v8.S[0] // ...............................................................................*............................................ + // sub v27.4S, v29.4S, v4.4S // .............................................................*.............................................................. + // sub v21.4S, v2.4S, v30.4S // .................................................................................*.......................................... + // sub v24.4S, v27.4S, v9.4S // .................................................................*.......................................................... + // add v22.4S, v27.4S, v9.4S // ..................................................................*......................................................... 
+ // add v9.4S, v21.4S, v0.4S // .....................................................................................*...................................... + // ldr q14, [x5], #(12*16) // ...........*................................................................................................................ + // add v19.4S, v19.4S, v17.4S // ..............................................................*............................................................. + // trn1 v18.4S, v22.4S, v24.4S // ......................................................................*..................................................... + // ldr q16, [x5, #-176] // ......................*..................................................................................................... + // add v27.4S, v25.4S, v15.4S // ................................................*........................................................................... + // ldr q25, [x5, #-96] // ....*....................................................................................................................... + // ldr q15, [x5, #-80] // ...................................*........................................................................................ + // mul v17.4S, v19.4S, v7.S[2] // ..........................................................................*................................................. + // sqrdmulh v7.4S, v19.4S, v7.S[3] // .......................................................................*.................................................... + // mul v19.4S, v27.4S, v12.S[2] // ...................................................*........................................................................ + // sqrdmulh v12.4S, v27.4S, v12.S[3] // ....................................................*....................................................................... 
+ // sub v31.4S, v21.4S, v0.4S // ....................................................................................*....................................... + // add v29.4S, v29.4S, v4.4S // .........................................................*.................................................................. + // mls v17.4S, v7.4S, v8.S[0] // ................................................................................*........................................... + // mls v19.4S, v12.4S, v8.S[0] // ..........................................................*................................................................. + // add v7.4S, v2.4S, v30.4S // ..................................................................................*......................................... + // trn2 v2.4S, v9.4S, v31.4S // ...........................................................................................*................................ + // trn1 v21.4S, v9.4S, v31.4S // ................................................................................................*........................... + // add v30.4S, v7.4S, v17.4S // .......................................................................................*.................................... + // add v4.4S, v29.4S, v19.4S // ................................................................*........................................................... + // sub v0.4S, v7.4S, v17.4S // ......................................................................................*..................................... + // ldr q17, [x5, #-64] // ..............................*............................................................................................. + // sub v26.4S, v29.4S, v19.4S // ...............................................................*............................................................ 
+ // trn2 v9.4S, v30.4S, v0.4S // ..........................................................................................*................................. + // trn1 v7.4S, v30.4S, v0.4S // ..............................................................................................*............................. + // trn1 v31.4S, v4.4S, v26.4S // ...................................................................*........................................................ + // trn2 v19.4S, v4.4S, v26.4S // ............................................................................*............................................... + // trn2 v30.2D, v9.2D, v2.2D // ...............................................................................................*............................ + // trn2 v26.2D, v7.2D, v21.2D // .....................................................................................................*...................... + // trn2 v29.2D, v31.2D, v18.2D // ...................................................................................*........................................ + // sqrdmulh v0.4S, v30.4S, v15.4S // ..................................................................................................*......................... + // mul v30.4S, v30.4S, v25.4S // ...................................................................................................*........................ + // sqrdmulh v4.4S, v29.4S, v16.4S // .............................................................................................*.............................. + // sqrdmulh v5.4S, v26.4S, v15.4S // ...........................................................................................................*................ + // mul v12.4S, v29.4S, v14.4S // .........................................................................................*.................................. 
+ // trn2 v15.4S, v22.4S, v24.4S // ..............................................................................*............................................. + // mls v30.4S, v0.4S, v8.S[0] // .........................................................................................................*.................. + // ldr q22, [x5, #-48] // ......*..................................................................................................................... + // trn1 v0.2D, v9.2D, v2.2D // .......................................................................................................*.................... + // trn2 v29.2D, v19.2D, v15.2D // ............................................................................................*............................... + // mls v12.4S, v4.4S, v8.S[0] // ......................................................................................................*..................... + // ldr q9, [x5, #-16] // .....................................................................*...................................................... + // mul v13.4S, v26.4S, v25.4S // ........................................................................................................*................... + // mul v10.4S, v29.4S, v14.4S // ....................................................................................................*....................... + // add v2.4S, v0.4S, v30.4S // ...............................................................................................................*............ + // sqrdmulh v23.4S, v29.4S, v16.4S // .................................................................................................*.......................... + // mul v16.4S, v2.4S, v17.4S // ...................................................................................................................*........ 
+ // ldr q17, [x5, #-128] // ........................................................................*................................................... + // trn1 v29.2D, v31.2D, v18.2D // ........................................................................................*................................... + // mls v13.4S, v5.4S, v8.S[0] // .................................................................................................................*.......... + // sub v28.4S, v29.4S, v12.4S // .............................................................................................................*.............. + // sqrdmulh v11.4S, v2.4S, v22.4S // ..................................................................................................................*......... + // add v3.4S, v29.4S, v12.4S // ...........................................................................................................................* + // mls v10.4S, v23.4S, v8.S[0] // ..............................................................................................................*............. + // add x2, x2, #64 // ......................................................................................................................*..... + // trn1 v5.2D, v19.2D, v15.2D // ............................................................................................................*............... + // trn1 v23.2D, v7.2D, v21.2D // ..........................................................................................................*................. + // mls v16.4S, v11.4S, v8.S[0] // ..........................................................................................................................*. + // ldr q15, [x5, #-112] // .........................................................................*.................................................. 
+ // sub v14.4S, v0.4S, v30.4S // ................................................................................................................*........... + // add v0.4S, v23.4S, v13.4S // .......................................................................................................................*.... + // add v27.4S, v5.4S, v10.4S // .....................................................................................................................*...... + // sub v10.4S, v5.4S, v10.4S // .........................................................................................................................*.. + // ldr q22, [x5, #-144] // ........................................*................................................................................... + // mul v26.4S, v14.4S, v20.4S // ....................................................................................................................*....... sub count, count, #1 layer45678_start: - sub v24.4S, v29.4S, v19.4S // ..........................................*..................................................................................................... - mls v23.4S, v28.4S, v8.S[0] // ....................................*........................................................................................................... - ldr q13, [x4], #64 // ..........e..................................................................................................................................... - add x2, x2, #64 // .........*...................................................................................................................................... - ldr q30, [x2, #176] // .......e........................................................................................................................................ 
- mls v20.4S, v14.4S, v8.S[0] // ........................................................*....................................................................................... + // Instructions: 144 + // Expected cycles: 58 + // Expected IPC: 2.48 + // + // Wall time: 2106.64s + // User time: 2106.64s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + sub v11.4S, v0.4S, v16.4S // .......................................................................................................................................*........ + mul v6.4S, v27.4S, v1.4S // ...........................................................................................................*.................................... + ldr q29, [x2, #176] // .......e........................................................................................................................................ + ldr q30, [x4], #64 // ..........e..................................................................................................................................... + sub v13.4S, v23.4S, v13.4S // .............................................................................................................................*.................. + ldr q2, [x2, #160] // ......e......................................................................................................................................... add x1, x1, #64 // ........*....................................................................................................................................... 
- mls v4.4S, v3.4S, v8.S[0] // ...........................................................................................................................................*.... - ldr q0, [x2, #160] // ......e......................................................................................................................................... - sqrdmulh v3.4S, v24.4S, v2.S[1] // ............................................................*................................................................................... - sub v31.4S, v27.4S, v21.4S // .............................................................................................................................*.................. - ldr q7, [x2, #128] // ....e........................................................................................................................................... - add v17.4S, v12.4S, v23.4S // ......................................*......................................................................................................... - sub v16.4S, v12.4S, v23.4S // .....................................*.......................................................................................................... - ldr q14, [x1, #160] // ..e............................................................................................................................................. - ldr q26, [x4, #-16] // .............e.................................................................................................................................. - // gap // ................................................................................................................................................ + mul v23.4S, v10.4S, v17.4S // ................................................................................................................*............................... 
// gap // ................................................................................................................................................ - add v11.4S, v31.4S, v4.4S // .............................................................................................................................................*.. - sub v12.4S, v31.4S, v4.4S // ............................................................................................................................................*... - ldr q1, [x5, #96] // ....................................................................................................................e........................... - mul v21.4S, v30.4S, v13.S[0] // .............................e.................................................................................................................. - sqrdmulh v29.4S, v30.4S, v13.S[1] // ..............................e................................................................................................................. - ldr q27, [x2, #144] // .....e.......................................................................................................................................... - mul v31.4S, v0.4S, v13.S[0] // ........................e....................................................................................................................... - mul v4.4S, v24.4S, v2.S[0] // ...........................................................*.................................................................................... - ldr q2, [x4, #-32] // ............e................................................................................................................................... - ldr q28, [x4, #-48] // ...........e.................................................................................................................................... 
- mul v30.4S, v7.4S, v13.S[0] // ..............e................................................................................................................................. - sqrdmulh v15.4S, v0.4S, v13.S[1] // .........................e...................................................................................................................... + ldr q4, [x2, #128] // ....e........................................................................................................................................... + sqrdmulh v20.4S, v10.4S, v15.4S // ...............................................................................................................*................................ + sqrdmulh v21.4S, v14.4S, v9.4S // .........................................................................................................................................*...... + sqrdmulh v5.4S, v27.4S, v22.4S // ..........................................................................................................*..................................... + ldr q15, [x1, #176] // ...e............................................................................................................................................ + ldr q17, [x2, #144] // .....e.......................................................................................................................................... + add v10.4S, v0.4S, v16.4S // ........................................................................................................................................*....... + ldr q19, [x1, #144] // .e.............................................................................................................................................. + sqrdmulh v0.4S, v29.4S, v30.S[1] // .............................e.................................................................................................................. 
+ mul v9.4S, v29.4S, v30.S[0] // ..............................e................................................................................................................. + ldr q16, [x1, #160] // ..e............................................................................................................................................. + sqrdmulh v22.4S, v2.4S, v30.S[1] // ........................e....................................................................................................................... + mul v14.4S, v2.4S, v30.S[0] // .........................e...................................................................................................................... // gap // ................................................................................................................................................ - ldr q0, [x1, #176] // ...e............................................................................................................................................ // gap // ................................................................................................................................................ + sqrdmulh v27.4S, v4.4S, v30.S[1] // ..............e................................................................................................................................. + mul v24.4S, v4.4S, v30.S[0] // ...............e................................................................................................................................ // gap // ................................................................................................................................................ - mls v21.4S, v29.4S, v8.S[0] // ...............................e................................................................................................................ 
- sqrdmulh v29.4S, v7.4S, v13.S[1] // ...............e................................................................................................................................ - mls v4.4S, v3.4S, v8.S[0] // .............................................................*.................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sqrdmulh v25.4S, v27.4S, v13.S[1] // ....................e........................................................................................................................... - mls v31.4S, v15.4S, v8.S[0] // ..........................e..................................................................................................................... - mul v7.4S, v27.4S, v13.S[0] // ...................e............................................................................................................................ + ldr q12, [x4, #-48] // ...........e.................................................................................................................................... + mls v9.4S, v0.4S, v8.S[0] // ...............................e................................................................................................................ + mul v29.4S, v17.4S, v30.S[0] // ....................e........................................................................................................................... + sqrdmulh v0.4S, v17.4S, v30.S[1] // ...................e............................................................................................................................ 
// gap // ................................................................................................................................................ + ldr q7, [x4, #-32] // ............e................................................................................................................................... + mls v14.4S, v22.4S, v8.S[0] // ..........................e..................................................................................................................... // gap // ................................................................................................................................................ + mls v24.4S, v27.4S, v8.S[0] // ................e............................................................................................................................... + ldr q1, [x5, #32] // ............................................................................................e................................................... + mls v23.4S, v20.4S, v8.S[0] // .................................................................................................................*.............................. + ldr q2, [x1, #128] // e............................................................................................................................................... + sub v20.4S, v15.4S, v9.4S // ................................e............................................................................................................... + add v9.4S, v15.4S, v9.4S // .................................e.............................................................................................................. // gap // ................................................................................................................................................ 
- ldr q24, [x1, #144] // .e.............................................................................................................................................. - sub v15.4S, v0.4S, v21.4S // ................................e............................................................................................................... - add v23.4S, v0.4S, v21.4S // .................................e.............................................................................................................. - add v0.4S, v16.4S, v4.4S // ...............................................................*................................................................................ - sub v27.4S, v16.4S, v4.4S // ..............................................................*................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sqrdmulh v3.4S, v23.4S, v13.S[3] // ........................................e....................................................................................................... + mls v29.4S, v0.4S, v8.S[0] // .....................e.......................................................................................................................... + add v22.4S, v16.4S, v14.4S // ............................e................................................................................................................... // gap // ................................................................................................................................................ 
// gap // ................................................................................................................................................ - sqrdmulh v6.4S, v15.4S, v28.S[1] // ..................................................e............................................................................................. - trn2 v5.4S, v0.4S, v27.4S // .............................................................................*.................................................................. - mul v16.4S, v15.4S, v28.S[0] // .................................................e.............................................................................................. + sqrdmulh v0.4S, v9.4S, v30.S[3] // .......................................e........................................................................................................ + mul v15.4S, v9.4S, v30.S[2] // ........................................e....................................................................................................... + mul v17.4S, v20.4S, v12.S[0] // ..................................................e............................................................................................. + sqrdmulh v9.4S, v20.4S, v12.S[1] // .................................................e.............................................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ 
- st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2], #64 // ...............................................................................................................................................* - mls v7.4S, v25.4S, v8.S[0] // .....................e.......................................................................................................................... // gap // ................................................................................................................................................ - sub v9.4S, v14.4S, v31.4S // ...........................e.................................................................................................................... - mul v19.4S, v23.4S, v13.S[2] // .......................................e........................................................................................................ - mls v30.4S, v29.4S, v8.S[0] // ................e............................................................................................................................... // gap // ................................................................................................................................................ + sqrdmulh v27.4S, v22.4S, v30.S[3] // ..................................e............................................................................................................. + mul v4.4S, v22.4S, v30.S[2] // ...................................e............................................................................................................ + add v25.4S, v19.4S, v29.4S // .......................e........................................................................................................................ // gap // ................................................................................................................................................ 
- mul v12.4S, v9.4S, v28.S[0] // ............................................e................................................................................................... - mls v16.4S, v6.4S, v8.S[0] // ...................................................e............................................................................................ + mls v15.4S, v0.4S, v8.S[0] // .........................................e...................................................................................................... // gap // ................................................................................................................................................ - ldr q10, [x1, #128] // e............................................................................................................................................... - add v29.4S, v24.4S, v7.4S // .......................e........................................................................................................................ - sub v24.4S, v24.4S, v7.4S // ......................e......................................................................................................................... + sub v0.4S, v16.4S, v14.4S // ...........................e.................................................................................................................... // gap // ................................................................................................................................................ + mls v17.4S, v9.4S, v8.S[0] // ...................................................e............................................................................................ + ldr q16, [x4, #-16] // .............e.................................................................................................................................. 
+ sub v19.4S, v19.4S, v29.4S // ......................e......................................................................................................................... // gap // ................................................................................................................................................ - sqrdmulh v4.4S, v9.4S, v28.S[1] // .............................................e.................................................................................................. + mls v4.4S, v27.4S, v8.S[0] // ....................................e........................................................................................................... // gap // ................................................................................................................................................ + mul v30.4S, v0.4S, v12.S[0] // .............................................e.................................................................................................. // gap // ................................................................................................................................................ - mls v19.4S, v3.4S, v8.S[0] // .........................................e...................................................................................................... // gap // ................................................................................................................................................ - add v21.4S, v24.4S, v16.4S // .....................................................e.......................................................................................... + sub v9.4S, v25.4S, v15.4S // ..........................................e..................................................................................................... 
+ sub v22.4S, v19.4S, v17.4S // ....................................................e........................................................................................... // gap // ................................................................................................................................................ - add v11.4S, v14.4S, v31.4S // ............................e................................................................................................................... - sub v14.4S, v10.4S, v30.4S // .................e.............................................................................................................................. - ldr q15, [x5, #160] // ........................................................................................................................e....................... // gap // ................................................................................................................................................ - sub v7.4S, v24.4S, v16.4S // ....................................................e........................................................................................... - mul v24.4S, v21.4S, v2.S[2] // ................................................................e............................................................................... - sqrdmulh v3.4S, v21.4S, v2.S[3] // .................................................................e.............................................................................. + sqrdmulh v14.4S, v0.4S, v12.S[1] // ............................................e................................................................................................... // gap // ................................................................................................................................................ 
// gap // ................................................................................................................................................ + sqrdmulh v27.4S, v9.4S, v7.S[1] // ...........................................................e.................................................................................... + mul v9.4S, v9.4S, v7.S[0] // ............................................................e................................................................................... + mul v0.4S, v22.4S, v16.S[0] // ......................................................................e......................................................................... // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v22.4S, v16.S[1] // .....................................................................e.......................................................................... // gap // ................................................................................................................................................ - mls v12.4S, v4.4S, v8.S[0] // ..............................................e................................................................................................. - add v21.4S, v29.4S, v19.4S // ...........................................e.................................................................................................... + mls v30.4S, v14.4S, v8.S[0] // ..............................................e................................................................................................. // gap // ................................................................................................................................................ 
+ add v29.4S, v2.4S, v24.4S // ..................e............................................................................................................................. + ldr q20, [x5, #160] // ........................................................................................................................e....................... + sub v2.4S, v2.4S, v24.4S // .................e.............................................................................................................................. // gap // ................................................................................................................................................ - mul v4.4S, v7.4S, v26.S[0] // .....................................................................e.......................................................................... - sqrdmulh v16.4S, v7.4S, v26.S[1] // ......................................................................e......................................................................... // gap // ................................................................................................................................................ - mls v24.4S, v3.4S, v8.S[0] // ..................................................................e............................................................................. - trn1 v31.4S, v0.4S, v27.4S // ............................................................................*................................................................... + mls v9.4S, v27.4S, v8.S[0] // .............................................................e.................................................................................. + mls v0.4S, v31.4S, v8.S[0] // .......................................................................e........................................................................ 
// gap // ................................................................................................................................................ - sub v26.4S, v17.4S, v20.4S // .........................................................*...................................................................................... - add v0.4S, v14.4S, v12.4S // ................................................e............................................................................................... // gap // ................................................................................................................................................ - ldr q9, [x5, #176] // .........................................................................................................................e...................... - mls v4.4S, v16.4S, v8.S[0] // .......................................................................e........................................................................ - add v16.4S, v17.4S, v20.4S // ..........................................................*..................................................................................... + sub v27.4S, v29.4S, v4.4S // .....................................e.......................................................................................................... + mls v26.4S, v21.4S, v8.S[0] // ...........................................................................................................................................*.... + sub v21.4S, v2.4S, v30.4S // ...............................................e................................................................................................ // gap // ................................................................................................................................................ 
// gap // ................................................................................................................................................ - add v25.4S, v0.4S, v24.4S // ....................................................................e........................................................................... + sub v24.4S, v27.4S, v9.4S // ..............................................................e................................................................................. + add v22.4S, v27.4S, v9.4S // ...............................................................e................................................................................ // gap // ................................................................................................................................................ - ldr q3, [x5, #112] // .....................................................................................................................e.......................... - sub v7.4S, v0.4S, v24.4S // ...................................................................e............................................................................ - trn2 v6.4S, v16.4S, v26.4S // ...........................................................................*.................................................................... - trn1 v17.4S, v16.4S, v26.4S // ..........................................................................*..................................................................... // gap // ................................................................................................................................................ + add v9.4S, v21.4S, v0.4S // .........................................................................e...................................................................... 
// gap // ................................................................................................................................................ + ldr q14, [x5], #(12*16) // ..........................................................................................e..................................................... + add v19.4S, v19.4S, v17.4S // .....................................................e.......................................................................................... + trn1 v18.4S, v22.4S, v24.4S // ............................................................................e................................................................... // gap // ................................................................................................................................................ - trn2 v23.4S, v25.4S, v7.4S // ...................................................................................e............................................................ - sub v26.4S, v14.4S, v12.4S // ...............................................e................................................................................................ + ldr q16, [x5, #-176] // ...........................................................................................e.................................................... + add v27.4S, v25.4S, v15.4S // ...........................................e.................................................................................................... + ldr q25, [x5, #-96] // ....................................................................................................................e........................... + ldr q15, [x5, #-80] // .....................................................................................................................e.......................... 
+ mul v17.4S, v19.4S, v7.S[2] // .................................................................e.............................................................................. + sqrdmulh v7.4S, v19.4S, v7.S[3] // ................................................................e............................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sqrdmulh v14.4S, v21.4S, v28.S[3] // .......................................................e........................................................................................ - ldr q20, [x5, #-176] // ...........................................................................................*.................................................... - trn2 v0.2D, v17.2D, v31.2D // ..............................................................................*................................................................. + mul v19.4S, v27.4S, v12.S[2] // .......................................................e........................................................................................ + sqrdmulh v12.4S, v27.4S, v12.S[3] // ......................................................e......................................................................................... // gap // ................................................................................................................................................ - sub v27.4S, v26.4S, v4.4S // ........................................................................e....................................................................... 
// gap // ................................................................................................................................................ - add v16.4S, v26.4S, v4.4S // .........................................................................e...................................................................... + sub v27.4S, v28.4S, v23.4S // ..................................................................................................................*............................. + sub v31.4S, v21.4S, v0.4S // ........................................................................e....................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - trn2 v4.2D, v6.2D, v5.2D // ...............................................................................*................................................................ - mul v24.4S, v0.4S, v22.4S // ................................................................................................*............................................... - trn1 v12.4S, v16.4S, v27.4S // ....................................................................................e........................................................... - trn2 v16.4S, v16.4S, v27.4S // .....................................................................................e.......................................................... + add v29.4S, v29.4S, v4.4S // ......................................e......................................................................................................... 
+ mls v17.4S, v7.4S, v8.S[0] // ..................................................................e............................................................................. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - mul v22.4S, v4.4S, v22.4S // .....................................................................................................*.......................................... + mls v19.4S, v12.4S, v8.S[0] // ........................................................e....................................................................................... + add v7.4S, v2.4S, v30.4S // ................................................e............................................................................................... + add v12.4S, v13.4S, v26.4S // .............................................................................................................................................*.. // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sqrdmulh v4.4S, v4.4S, v20.4S // ......................................................................................................*......................................... + trn2 v2.4S, v9.4S, v31.4S // .....................................................................................e.......................................................... 
// gap // ................................................................................................................................................ + trn1 v21.4S, v9.4S, v31.4S // ....................................................................................e........................................................... // gap // ................................................................................................................................................ - trn1 v27.4S, v25.4S, v7.4S // ..................................................................................e............................................................. - sqrdmulh v7.4S, v0.4S, v20.4S // .................................................................................................*.............................................. - trn1 v0.2D, v17.2D, v31.2D // ................................................................................*............................................................... - trn2 v26.2D, v23.2D, v16.2D // .......................................................................................e........................................................ - ldr q20, [x5, #-128] // ..............................................................................................*................................................. + add v30.4S, v7.4S, v17.4S // ....................................................................e........................................................................... // gap // ................................................................................................................................................ - mls v22.4S, v4.4S, v8.S[0] // .......................................................................................................*........................................ 
- ldr q25, [x5, #-160] // ............................................................................................*................................................... // gap // ................................................................................................................................................ - trn2 v17.2D, v27.2D, v12.2D // ......................................................................................e......................................................... - trn1 v4.2D, v23.2D, v16.2D // .........................................................................................e...................................................... - ldr q16, [x5, #128] // ......................................................................................................................e......................... - ldr q31, [x5, #144] // .......................................................................................................................e........................ - trn1 v6.2D, v6.2D, v5.2D // .................................................................................*.............................................................. - sqrdmulh v23.4S, v17.4S, v3.4S // ...........................................................................................................................e.................... + add v4.4S, v29.4S, v19.4S // ..........................................................e..................................................................................... + sub v0.4S, v7.4S, v17.4S // ...................................................................e............................................................................ + sub v13.4S, v13.4S, v26.4S // ............................................................................................................................................*... 
+ ldr q17, [x5, #-64] // ......................................................................................................................e......................... // gap // ................................................................................................................................................ + sub v26.4S, v29.4S, v19.4S // .........................................................e...................................................................................... + trn2 v9.4S, v30.4S, v0.4S // ...................................................................................e............................................................ // gap // ................................................................................................................................................ - mls v24.4S, v7.4S, v8.S[0] // ..................................................................................................*............................................. - sub v7.4S, v6.4S, v22.4S // ........................................................................................................*....................................... // gap // ................................................................................................................................................ + trn1 v7.4S, v30.4S, v0.4S // ..................................................................................e............................................................. + trn1 v31.4S, v4.4S, v26.4S // ..........................................................................e..................................................................... // gap // ................................................................................................................................................ 
- add v6.4S, v6.4S, v22.4S // .........................................................................................................*...................................... // gap // ................................................................................................................................................ - mul v5.4S, v26.4S, v1.4S // ...............................................................................................................................e................ - sqrdmulh v22.4S, v26.4S, v3.4S // ................................................................................................................................e............... - ldr q3, [x5, #-112] // ...............................................................................................*................................................ + trn2 v19.4S, v4.4S, v26.4S // ...........................................................................e.................................................................... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - sub v26.4S, v0.4S, v24.4S // ...................................................................................................*............................................ - mul v25.4S, v6.4S, v25.4S // ..........................................................................................................*..................................... + trn2 v30.2D, v9.2D, v2.2D // .......................................................................................e........................................................ 
+ trn2 v26.2D, v7.2D, v21.2D // ......................................................................................e......................................................... + mls v6.4S, v5.4S, v8.S[0] // ............................................................................................................*................................... // gap // ................................................................................................................................................ + trn2 v29.2D, v31.2D, v18.2D // ..............................................................................e................................................................. // gap // ................................................................................................................................................ - sqrdmulh v18.4S, v6.4S, v18.4S // ...........................................................................................................*.................................... - mul v6.4S, v7.4S, v20.4S // ...............................................................................................................*................................ + sqrdmulh v0.4S, v30.4S, v15.4S // ...............................................................................................................................e................ + mul v30.4S, v30.4S, v25.4S // ................................................................................................................................e............... // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ 
- mls v5.4S, v22.4S, v8.S[0] // .................................................................................................................................e.............. - mul v20.4S, v21.4S, v28.S[2] // ......................................................e......................................................................................... - mul v21.4S, v17.4S, v1.4S // ..........................................................................................................................e..................... - ldr q22, [x5], #(12*16) // ..........................................................................................e..................................................... // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v7.4S, v3.4S // ................................................................................................................*............................... - add v1.4S, v0.4S, v24.4S // ....................................................................................................*........................................... // gap // ................................................................................................................................................ - mls v25.4S, v18.4S, v8.S[0] // ............................................................................................................*................................... + sqrdmulh v4.4S, v29.4S, v16.4S // ................................................................................................e............................................... + sqrdmulh v5.4S, v26.4S, v15.4S // ..........................................................................................................................e..................... 
+ st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2], #64 // ...............................................................................................................................................* // gap // ................................................................................................................................................ + mul v12.4S, v29.4S, v14.4S // .................................................................................................e.............................................. + trn2 v15.4S, v22.4S, v24.4S // .............................................................................e.................................................................. + mls v30.4S, v0.4S, v8.S[0] // .................................................................................................................................e.............. + ldr q22, [x5, #-48] // .......................................................................................................................e........................ // gap // ................................................................................................................................................ - sub v0.4S, v4.4S, v5.4S // ..................................................................................................................................e............. - add v3.4S, v4.4S, v5.4S // ...................................................................................................................................e............ + trn1 v0.2D, v9.2D, v2.2D // .........................................................................................e...................................................... // gap // ................................................................................................................................................ 
- mls v21.4S, v23.4S, v8.S[0] // ............................................................................................................................e................... // gap // ................................................................................................................................................ + add v24.4S, v3.4S, v6.4S // ..............................................................................................................*................................. + trn2 v29.2D, v19.2D, v15.2D // ...............................................................................e................................................................ // gap // ................................................................................................................................................ - mls v6.4S, v28.4S, v8.S[0] // .................................................................................................................*.............................. - mul v7.4S, v3.4S, v16.4S // ....................................................................................................................................e........... - ldr q18, [x5, #-144] // .............................................................................................e.................................................. - sub v24.4S, v1.4S, v25.4S // .............................................................................................................*.................................. + mls v12.4S, v4.4S, v8.S[0] // ..................................................................................................e............................................. + ldr q9, [x5, #-16] // .........................................................................................................................e...................... 
+ mul v13.4S, v26.4S, v25.4S // ...........................................................................................................................e.................... // gap // ................................................................................................................................................ - sqrdmulh v4.4S, v3.4S, v31.4S // .....................................................................................................................................e.......... // gap // ................................................................................................................................................ - add v23.4S, v1.4S, v25.4S // ..............................................................................................................*................................. + mul v10.4S, v29.4S, v14.4S // ......................................................................................................e......................................... + add v2.4S, v0.4S, v30.4S // ...................................................................................................................................e............ + add v26.4S, v28.4S, v23.4S // ...................................................................................................................*............................ // gap // ................................................................................................................................................ // gap // ................................................................................................................................................ - add v25.4S, v26.4S, v6.4S // ...................................................................................................................*............................ 
+ sqrdmulh v23.4S, v29.4S, v16.4S // .....................................................................................................e.......................................... + mul v16.4S, v2.4S, v17.4S // .....................................................................................................................................e.......... // gap // ................................................................................................................................................ - sub v26.4S, v26.4S, v6.4S // ..................................................................................................................*............................. - trn1 v27.2D, v27.2D, v12.2D // ........................................................................................e....................................................... + ldr q17, [x5, #-128] // ..............................................................................................e................................................. + trn1 v29.2D, v31.2D, v18.2D // ................................................................................e............................................................... // gap // ................................................................................................................................................ + sub v25.4S, v3.4S, v6.4S // .............................................................................................................*.................................. // gap // ................................................................................................................................................ - sqrdmulh v28.4S, v11.4S, v13.S[3] // ...................................e............................................................................................................ 
- sqrdmulh v3.4S, v0.4S, v9.4S // ..........................................................................................................................................e..... - mls v7.4S, v4.4S, v8.S[0] // ......................................................................................................................................e......... - st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x1], #64 // ..............................................................................................................................................*. - // gap // ................................................................................................................................................ - mul v23.4S, v11.4S, v13.S[2] // ..................................e............................................................................................................. - add v16.4S, v27.4S, v21.4S // ..............................................................................................................................e................. - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - mul v4.4S, v0.4S, v15.4S // .........................................................................................................................................e...... 
- add v12.4S, v10.4S, v30.4S // ..................e............................................................................................................................. - add v9.4S, v16.4S, v7.4S // ........................................................................................................................................e....... - sub v10.4S, v16.4S, v7.4S // .......................................................................................................................................e........ - // gap // ................................................................................................................................................ - // gap // ................................................................................................................................................ - - // original source code - // ldr q9, [x1, #(16*0 + (64))] // .................................................e............................................................................................|..................................................e...................................................................................... - // ldr q10, [x1, #(16*1 + (64))] // .................................e............................................................................................................|..................................e...................................................................................................... - // ldr q11, [x1, #(16*2 + (64))] // ............e.................................................................................................................................|.............e........................................................................................................................... 
- // ldr q12, [x1, #(16*3 + (64))] // ..........................e...................................................................................................................|...........................e............................................................................................................. - // ldr q13, [x2, #(16*0 + (64))] // .........e....................................................................................................................................|..........e.............................................................................................................................. - // ldr q14, [x2, #(16*1 + (64))] // ...................e..........................................................................................................................|....................e.................................................................................................................... - // ldr q15, [x2, #(16*2 + (64))] // ......e.......................................................................................................................................|.......e................................................................................................................................. - // ldr q16, [x2, #(16*3 + (64))] // ..e...........................................................................................................................................|...e..................................................................................................................................... - // add x1, x1, #64 // ....*.........................................................................................................................................|.....*................................................................................................................................... 
- // add x2, x2, #64 // .*............................................................................................................................................|..*...................................................................................................................................... - // ldr q0, [x4], #64 // e.............................................................................................................................................|.e....................................................................................................................................... - // ldr q1, [x4, #(-64 + 16)] // .......................e......................................................................................................................|........................e................................................................................................................ - // ldr q2, [x4, #(-64 + 32)] // ......................e.......................................................................................................................|.......................e................................................................................................................. - // ldr q3, [x4, #(-64 + 48)] // .............e................................................................................................................................|..............e.......................................................................................................................... - // mul v24.4s, v13.4s, v0.s[0] // ........................e.....................................................................................................................|.........................e............................................................................................................... 
- // sqrdmulh v13.4s, v13.4s, v0.s[1] // ............................e.................................................................................................................|.............................e........................................................................................................... - // mls v24.4s, v13.4s, v8.s[0] // ..............................................e...............................................................................................|...............................................e......................................................................................... - // sub v13.4s, v9.4s, v24.4s // ........................................................e.....................................................................................|.........................................................e............................................................................... - // add v9.4s, v9.4s, v24.4s // ...........................................................................................................................................e..|......................................................................................................................................... - // mul v24.4s, v14.4s, v0.s[0] // ................................e.............................................................................................................|.................................e....................................................................................................... - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ..............................e...............................................................................................................|...............................e......................................................................................................... 
- // mls v24.4s, v14.4s, v8.s[0] // ...........................................e..................................................................................................|............................................e............................................................................................ - // sub v14.4s, v10.4s, v24.4s // ...................................................e..........................................................................................|....................................................e.................................................................................... - // add v10.4s, v10.4s, v24.4s // ..................................................e...........................................................................................|...................................................e..................................................................................... - // mul v24.4s, v15.4s, v0.s[0] // ....................e.........................................................................................................................|.....................e................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v0.s[1] // .........................e....................................................................................................................|..........................e.............................................................................................................. - // mls v24.4s, v15.4s, v8.s[0] // ...............................e..............................................................................................................|................................e........................................................................................................ 
- // sub v15.4s, v11.4s, v24.4s // ............................................e.................................................................................................|.............................................e........................................................................................... - // add v11.4s, v11.4s, v24.4s // .......................................................e......................................................................................|........................................................e................................................................................ - // mul v24.4s, v16.4s, v0.s[0] // .................e............................................................................................................................|..................e...................................................................................................................... - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ..................e...........................................................................................................................|...................e..................................................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ...........................e..................................................................................................................|............................e............................................................................................................ - // sub v16.4s, v12.4s, v24.4s // ..................................e...........................................................................................................|...................................e..................................................................................................... 
- // add v12.4s, v12.4s, v24.4s // ...................................e..........................................................................................................|....................................e.................................................................................................... - // mul v24.4s, v11.4s, v0.s[2] // ........................................................................................................................................e.....|......................................................................................................................................... - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ....................................................................................................................................e.........|.....................................................................................................................................e... - // mls v24.4s, v11.4s, v8.s[0] // ..............................................................................................................................................|*........................................................................................................................................ - // sub v11.4s, v9.4s, v24.4s // ...........*..................................................................................................................................|............*............................................................................................................................ - // add v9.4s, v9.4s, v24.4s // ..........*...................................................................................................................................|...........*............................................................................................................................. 
- // mul v24.4s, v12.4s, v0.s[2] // .............................................e................................................................................................|..............................................e.......................................................................................... - // sqrdmulh v12.4s, v12.4s, v0.s[3] // ......................................e.......................................................................................................|.......................................e................................................................................................. - // mls v24.4s, v12.4s, v8.s[0] // .....................................................e........................................................................................|......................................................e.................................................................................. - // sub v12.4s, v10.4s, v24.4s // ..............................................................................................................................................*......................................................................................................................................... - // add v10.4s, v10.4s, v24.4s // ..............................................................e...............................................................................|...............................................................e......................................................................... - // mul v24.4s, v15.4s, v1.s[0] // ...............................................e..............................................................................................|................................................e........................................................................................ 
- // sqrdmulh v15.4s, v15.4s, v1.s[1] // ....................................................e.........................................................................................|.....................................................e................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // .............................................................e................................................................................|..............................................................e.......................................................................... - // sub v15.4s, v13.4s, v24.4s // ..............................................................................e...............................................................|...............................................................................e......................................................... - // add v13.4s, v13.4s, v24.4s // ....................................................................e.........................................................................|.....................................................................e................................................................... - // mul v24.4s, v16.4s, v1.s[0] // .........................................e....................................................................................................|..........................................e.............................................................................................. - // sqrdmulh v16.4s, v16.4s, v1.s[1] // .......................................e......................................................................................................|........................................e................................................................................................ 
- // mls v24.4s, v16.4s, v8.s[0] // ................................................e.............................................................................................|.................................................e....................................................................................... - // sub v16.4s, v14.4s, v24.4s // ..........................................................e...................................................................................|...........................................................e............................................................................. - // add v14.4s, v14.4s, v24.4s // ......................................................e.......................................................................................|.......................................................e................................................................................. - // mul v24.4s, v10.4s, v1.s[2] // ..................................................................................................................e...........................|...................................................................................................................e..................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // ...............................................................................e..............................................................|................................................................................e........................................................ - // mls v24.4s, v10.4s, v8.s[0] // ...*..........................................................................................................................................|....*.................................................................................................................................... 
- // sub v10.4s, v9.4s, v24.4s // ...................................................................*..........................................................................|....................................................................*.................................................................... - // add v9.4s, v9.4s, v24.4s // .......................................................................*......................................................................|........................................................................*................................................................ - // mul v24.4s, v12.4s, v2.s[0] // .....................*........................................................................................................................|......................*.................................................................................................................. - // sqrdmulh v12.4s, v12.4s, v2.s[1] // .......*......................................................................................................................................|........*................................................................................................................................ - // mls v24.4s, v12.4s, v8.s[0] // .............................*................................................................................................................|..............................*.......................................................................................................... - // sub v12.4s, v11.4s, v24.4s // .....................................*........................................................................................................|......................................*.................................................................................................. 
- // add v11.4s, v11.4s, v24.4s // ....................................*.........................................................................................................|.....................................*................................................................................................... - // mul v24.4s, v14.4s, v2.s[2] // ...........................................................e..................................................................................|............................................................e............................................................................ - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ............................................................e.................................................................................|.............................................................e........................................................................... - // mls v24.4s, v14.4s, v8.s[0] // .................................................................e............................................................................|..................................................................e...................................................................... - // sub v14.4s, v13.4s, v24.4s // ..........................................................................e...................................................................|...........................................................................e............................................................. - // add v13.4s, v13.4s, v24.4s // ........................................................................e.....................................................................|.........................................................................e............................................................... 
- // mul v24.4s, v16.4s, v3.s[0] // ...............................................................e..............................................................................|................................................................e........................................................................ - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ................................................................e.............................................................................|.................................................................e....................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ......................................................................e.......................................................................|.......................................................................e................................................................. - // sub v16.4s, v15.4s, v24.4s // ..................................................................................e...........................................................|...................................................................................e..................................................... - // add v15.4s, v15.4s, v24.4s // ...................................................................................e..........................................................|....................................................................................e.................................................... - // trn1 v25.4s, v9.4s, v10.4s // ............................................................................*.................................................................|.............................................................................*........................................................... 
- // trn2 v26.4s, v9.4s, v10.4s // ...........................................................................*..................................................................|............................................................................*............................................................ - // trn1 v27.4s, v11.4s, v12.4s // ..................................................................*...........................................................................|...................................................................*..................................................................... - // trn2 v28.4s, v11.4s, v12.4s // ........................................*.....................................................................................................|.........................................*............................................................................................... - // trn2 v11.2d, v25.2d, v27.2d // .................................................................................*............................................................|..................................................................................*...................................................... - // trn2 v12.2d, v26.2d, v28.2d // ....................................................................................*.........................................................|.....................................................................................*................................................... - // trn1 v9.2d, v25.2d, v27.2d // ............................................................................................*.................................................|.............................................................................................*........................................... 
- // trn1 v10.2d, v26.2d, v28.2d // .....................................................................................................*........................................|......................................................................................................*.................................. - // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................................e...................................................|...........................................................................................e............................................. - // trn2 v26.4s, v13.4s, v14.4s // .............................................................................e................................................................|..............................................................................e.......................................................... - // trn1 v27.4s, v15.4s, v16.4s // ......................................................................................e.......................................................|.......................................................................................e................................................. - // trn2 v28.4s, v15.4s, v16.4s // .......................................................................................e......................................................|........................................................................................e................................................ - // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................e............................................|..................................................................................................e...................................... 
- // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................e................................................|..............................................................................................e.......................................... - // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................................................e..........|....................................................................................................................................e.... - // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................e...........................................|...................................................................................................e..................................... - // ldr q0, [x5], #(12*16) // ....................................................................................................................e.........................|.....................................................................................................................e................... - // ldr q4, [x5, #(-12*16 + 1*16)] // ................................................................................*.............................................................|.................................................................................*....................................................... - // ldr q1, [x5, #(-12*16 + 2*16)] // ................................................................................................*.............................................|.................................................................................................*....................................... 
- // ldr q5, [x5, #(-12*16 + 3*16)] // .............................................................................................................................e................|..............................................................................................................................e.......... - // ldr q2, [x5, #(-12*16 + 4*16)] // ..............................................................................................*...............................................|...............................................................................................*......................................... - // ldr q6, [x5, #(-12*16 + 5*16)] // ............................................................................................................*.................................|.............................................................................................................*........................... - // mul v24.4s, v11.4s, v0.4s // .....................................................................................*........................................................|......................................................................................*.................................................. - // sqrdmulh v11.4s, v11.4s, v4.4s // ...........................................................................................*..................................................|............................................................................................*............................................ - // mls v24.4s, v11.4s, v8.s[0] // .......................................................................................................*......................................|........................................................................................................*................................ 
- // sub v11.4s, v9.4s, v24.4s // .............................................................................................................*................................|..............................................................................................................*.......................... - // add v9.4s, v9.4s, v24.4s // ......................................................................................................................*.......................|.......................................................................................................................*................. - // mul v24.4s, v12.4s, v0.4s // ........................................................................................*.....................................................|.........................................................................................*............................................... - // sqrdmulh v12.4s, v12.4s, v4.4s // .........................................................................................*....................................................|..........................................................................................*.............................................. - // mls v24.4s, v12.4s, v8.s[0] // ...............................................................................................*..............................................|................................................................................................*........................................ - // sub v12.4s, v10.4s, v24.4s // ........................................................................................................*.....................................|.........................................................................................................*............................... 
- // add v10.4s, v10.4s, v24.4s // .........................................................................................................*....................................|..........................................................................................................*.............................. - // mul v24.4s, v10.4s, v1.4s // ..............................................................................................................*...............................|...............................................................................................................*......................... - // sqrdmulh v10.4s, v10.4s, v5.4s // ...............................................................................................................*..............................|................................................................................................................*........................ - // mls v24.4s, v10.4s, v8.s[0] // .......................................................................................................................*......................|........................................................................................................................*................ - // sub v10.4s, v9.4s, v24.4s // ..............................................................................................................................*...............|...............................................................................................................................*......... - // add v9.4s, v9.4s, v24.4s // ................................................................................................................................*.............|.................................................................................................................................*....... 
- // mul v24.4s, v12.4s, v2.4s // ................................................................................................................*.............................|.................................................................................................................*....................... - // sqrdmulh v12.4s, v12.4s, v6.4s // .....................................................................................................................*........................|......................................................................................................................*.................. - // mls v24.4s, v12.4s, v8.s[0] // ...........................................................................................................................*..................|............................................................................................................................*............ - // sub v12.4s, v11.4s, v24.4s // ..................................................................................................................................*...........|...................................................................................................................................*..... - // add v11.4s, v11.4s, v24.4s // .................................................................................................................................*............|..................................................................................................................................*...... - // ldr q0, [x5, #(-12*16 + 6*16)] // ................e.............................................................................................................................|.................e....................................................................................................................... 
- // ldr q4, [x5, #(-12*16 + 7*16)] // .........................................................................e....................................................................|..........................................................................e.............................................................. - // ldr q1, [x5, #(-12*16 + 8*16)] // ...................................................................................................e..........................................|....................................................................................................e.................................... - // ldr q5, [x5, #(-12*16 + 9*16)] // ....................................................................................................e.........................................|.....................................................................................................e................................... - // ldr q2, [x5, #(-12*16 + 10*16)] // .........................................................e....................................................................................|..........................................................e.............................................................................. - // ldr q6, [x5, #(-12*16 + 11*16)] // .....................................................................e........................................................................|......................................................................e.................................................................. - // mul v24.4s, v15.4s, v0.4s // ...................................................................................................................e..........................|....................................................................................................................e.................... 
- // sqrdmulh v15.4s, v15.4s, v4.4s // ......................................................................................................e.......................................|.......................................................................................................e................................. - // mls v24.4s, v15.4s, v8.s[0] // ..........................................................................................................................e...................|...........................................................................................................................e............. - // sub v15.4s, v13.4s, v24.4s // ........*.....................................................................................................................................|.........*............................................................................................................................... - // add v13.4s, v13.4s, v24.4s // .........................................................................................................................................e....|......................................................................................................................................... - // mul v24.4s, v16.4s, v0.4s // ..........................................................................................................e...................................|...........................................................................................................e............................. - // sqrdmulh v16.4s, v16.4s, v4.4s // ...........................................................................................................e..................................|............................................................................................................e............................ 
- // mls v24.4s, v16.4s, v8.s[0] // .................................................................................................................e............................|..................................................................................................................e...................... - // sub v16.4s, v14.4s, v24.4s // ........................................................................................................................e.....................|.........................................................................................................................e............... - // add v14.4s, v14.4s, v24.4s // .........................................................................................................................e....................|..........................................................................................................................e.............. - // mul v24.4s, v14.4s, v1.4s // ............................................................................................................................e.................|.............................................................................................................................e........... - // sqrdmulh v14.4s, v14.4s, v5.4s // ...............................................................................................................................e..............|................................................................................................................................e........ - // mls v24.4s, v14.4s, v8.s[0] // ......................................................................................................................................e.......|.......................................................................................................................................e. 
- // sub v14.4s, v13.4s, v24.4s // .............................................................................................................................................e|......................................................................................................................................... - // add v13.4s, v13.4s, v24.4s // ............................................................................................................................................e.|......................................................................................................................................... - // mul v24.4s, v16.4s, v2.4s // ..........................................................................................................................................e...|......................................................................................................................................... - // sqrdmulh v16.4s, v16.4s, v6.4s // .....................................................................................................................................e........|......................................................................................................................................e.. - // mls v24.4s, v16.4s, v8.s[0] // .....*........................................................................................................................................|......*.................................................................................................................................. - // sub v16.4s, v15.4s, v24.4s // ...............*..............................................................................................................................|................*........................................................................................................................ 
- // add v15.4s, v15.4s, v24.4s // ..............*...............................................................................................................................|...............*......................................................................................................................... - // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .......................................................................................................................................*......|........................................................................................................................................* - // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ..........................................*...................................................................................................|...........................................*............................................................................................. + mls v13.4S, v5.4S, v8.S[0] // ............................................................................................................................e................... + sub v28.4S, v29.4S, v12.4S // ...................................................................................................e............................................ + sqrdmulh v11.4S, v2.4S, v22.4S // ....................................................................................................................................e........... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // ..............................................................................................................................................*. + add v3.4S, v29.4S, v12.4S // ....................................................................................................e........................................... + mls v10.4S, v23.4S, v8.S[0] // .......................................................................................................e........................................ + // gap // ................................................................................................................................................ + add x2, x2, #64 // .........e...................................................................................................................................... + trn1 v5.2D, v19.2D, v15.2D // .................................................................................e.............................................................. + trn1 v23.2D, v7.2D, v21.2D // ........................................................................................e....................................................... + mls v16.4S, v11.4S, v8.S[0] // ......................................................................................................................................e......... + // gap // ................................................................................................................................................ + ldr q15, [x5, #-112] // ...............................................................................................e................................................ + sub v14.4S, v0.4S, v30.4S // ..................................................................................................................................e............. 
+ add v0.4S, v23.4S, v13.4S // ..............................................................................................................................e................. + add v27.4S, v5.4S, v10.4S // .........................................................................................................e...................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v10.4S, v5.4S, v10.4S // ........................................................................................................e....................................... + ldr q22, [x5, #-144] // .............................................................................................e.................................................. + // gap // ................................................................................................................................................ + mul v26.4S, v14.4S, v20.4S // ..........................................................................................................................................e..... 
+ + // --------------------------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q9, [x1, #(16*0 + (64))] // ..............................e...............................................................................................................'...............................~.................................................................................................. + // ldr q10, [x1, #(16*1 + (64))] // .............e................................................................................................................................'..............~................................................................................................................... + // ldr q11, [x1, #(16*2 + (64))] // ................e.............................................................................................................................'.................~................................................................................................................ + // ldr q12, [x1, #(16*3 + (64))] // ..........e...................................................................................................................................'...........~...................................................................................................................... 
+ // ldr q13, [x2, #(16*0 + (64))] // ......e.......................................................................................................................................'.......~.......................................................................................................................... + // ldr q14, [x2, #(16*1 + (64))] // ...........e..................................................................................................................................'............~..................................................................................................................... + // ldr q15, [x2, #(16*2 + (64))] // ...e..........................................................................................................................................'....~............................................................................................................................. + // ldr q16, [x2, #(16*3 + (64))] // e.............................................................................................................................................'.~................................................................................................................................ + // add x1, x1, #64 // ....~.........................................................................................................................................'.....*............................................................................................................................ + // add x2, x2, #64 // ...................................................................................................................................e..........'.................................................................................................................................. 
+ // ldr q0, [x4], #64 // .e............................................................................................................................................'..~............................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // .....................e........................................................................................................................'......................~........................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .........................e....................................................................................................................'..........................~....................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // .............................................e................................................................................................'..............................................~................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...................e..........................................................................................................................'....................~............................................................................................................. + // mul v24.4s, v13.4s, v0.s[0] // ....................e.........................................................................................................................'.....................~............................................................................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................e..................................................................................................................'............................~..................................................................................................... + // sub v13.4s, v9.4s, v24.4s // ...........................................................e..................................................................................'............................................................~..................................................................... + // add v9.4s, v9.4s, v24.4s // .........................................................e....................................................................................'..........................................................~....................................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ........................e.....................................................................................................................'.........................~........................................................................................................ + // mul v24.4s, v14.4s, v0.s[0] // .......................e......................................................................................................................'........................~......................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................e............................................................................................................'..................................~............................................................................................... 
+ // sub v14.4s, v10.4s, v24.4s // ..............................................e...............................................................................................'...............................................~.................................................................................. + // add v10.4s, v10.4s, v24.4s // .........................................e....................................................................................................'..........................................~....................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .................e............................................................................................................................'..................~............................................................................................................... + // mul v24.4s, v15.4s, v0.s[0] // ..................e...........................................................................................................................'...................~.............................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ..........................e...................................................................................................................'...........................~...................................................................................................... + // sub v15.4s, v11.4s, v24.4s // ...........................................e..................................................................................................'............................................~..................................................................................... 
+ // add v11.4s, v11.4s, v24.4s // ..................................e...........................................................................................................'...................................~.............................................................................................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ..............e...............................................................................................................................'...............~.................................................................................................................. + // mul v24.4s, v16.4s, v0.s[0] // ...............e..............................................................................................................................'................~................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................e.......................................................................................................................'.......................~.......................................................................................................... + // sub v16.4s, v12.4s, v24.4s // ...............................e..............................................................................................................'................................~................................................................................................. + // add v12.4s, v12.4s, v24.4s // ................................e.............................................................................................................'.................................~................................................................................................ 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // .......................................e......................................................................................................'........................................~......................................................................................... + // mul v24.4s, v11.4s, v0.s[2] // ........................................e.....................................................................................................'.........................................~........................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...............................................e..............................................................................................'................................................~................................................................................. + // sub v11.4s, v9.4s, v24.4s // ..............................................................e...............................................................................'...............................................................~.................................................................. + // add v9.4s, v9.4s, v24.4s // .................................................................................e............................................................'..................................................................................~............................................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...................................e..........................................................................................................'....................................~............................................................................................. 
+ // mul v24.4s, v12.4s, v0.s[2] // ....................................e.........................................................................................................'.....................................~............................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ..........................................e...................................................................................................'...........................................~...................................................................................... + // sub v12.4s, v10.4s, v24.4s // .................................................e............................................................................................'..................................................~............................................................................... + // add v10.4s, v10.4s, v24.4s // ........................................................................e.....................................................................'.........................................................................~........................................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...................................................e..........................................................................................'....................................................~............................................................................. + // mul v24.4s, v15.4s, v1.s[0] // ................................................e.............................................................................................'.................................................~................................................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // ........................................................e.....................................................................................'.........................................................~........................................................................ + // sub v15.4s, v13.4s, v24.4s // ................................................................e.............................................................................'.................................................................~................................................................ + // add v13.4s, v13.4s, v24.4s // ....................................................................................e.........................................................'.....................................................................................~............................................ + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ......................................e.......................................................................................................'.......................................~.......................................................................................... + // mul v24.4s, v16.4s, v1.s[0] // .....................................e........................................................................................................'......................................~........................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................e.................................................................................................'.............................................~.................................................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ..................................................e...........................................................................................'...................................................~.............................................................................. + // add v14.4s, v14.4s, v24.4s // .....................................................................e........................................................................'......................................................................~........................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ..............................................................................e...............................................................'...............................................................................~.................................................. + // mul v24.4s, v10.4s, v1.s[2] // .............................................................................e................................................................'..............................................................................~................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................................e..........................................................'....................................................................................~............................................. + // sub v10.4s, v9.4s, v24.4s // .............................................................................................e................................................'..............................................................................................~................................... 
+ // add v9.4s, v9.4s, v24.4s // .........................................................................................e....................................................'..........................................................................................~....................................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ....................................................e.........................................................................................'.....................................................~............................................................................ + // mul v24.4s, v12.4s, v2.s[0] // .....................................................e........................................................................................'......................................................~........................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................e.................................................................................'.............................................................~.................................................................... + // sub v12.4s, v11.4s, v24.4s // .................................................................e............................................................................'..................................................................~............................................................... + // add v11.4s, v11.4s, v24.4s // ..................................................................e...........................................................................'...................................................................~.............................................................. 
+ // sqrdmulh v27.4s, v14.4s, v2.s[3] // ............................................................................e.................................................................'.............................................................................~.................................................... + // mul v24.4s, v14.4s, v2.s[2] // ...........................................................................e..................................................................'............................................................................~..................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................e...........................................................'...................................................................................~.............................................. + // sub v14.4s, v13.4s, v24.4s // ..........................................................................................e...................................................'...........................................................................................~...................................... + // add v13.4s, v13.4s, v24.4s // ........................................................................................e.....................................................'.........................................................................................~........................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .......................................................e......................................................................................'........................................................~......................................................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // ......................................................e.......................................................................................'.......................................................~.......................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................................e................................................................................'..............................................................~................................................................... + // sub v16.4s, v15.4s, v24.4s // ................................................................................e.............................................................'.................................................................................~................................................ + // add v15.4s, v15.4s, v24.4s // ...................................................................e..........................................................................'....................................................................~............................................................. + // trn1 v25.4s, v9.4s, v10.4s // ................................................................................................e.............................................'.................................................................................................~................................ + // trn2 v26.4s, v9.4s, v10.4s // .................................................................................................e............................................'..................................................................................................~............................... 
+ // trn1 v27.4s, v11.4s, v12.4s // ......................................................................e.......................................................................'.......................................................................~.......................................................... + // trn2 v28.4s, v11.4s, v12.4s // ............................................................................................................e.................................'.............................................................................................................~.................... + // trn2 v11.2d, v25.2d, v27.2d // .....................................................................................................e........................................'......................................................................................................~........................... + // trn2 v12.2d, v26.2d, v28.2d // .................................................................................................................e............................'..................................................................................................................~............... + // trn1 v9.2d, v25.2d, v27.2d // ...........................................................................................................................e..................'............................................................................................................................~..... + // trn1 v10.2d, v26.2d, v28.2d // ....................................................................................................................................e.........'.................................................................................................................................. 
+ // trn1 v25.4s, v13.4s, v14.4s // ...............................................................................................e..............................................'................................................................................................~................................. + // trn2 v26.4s, v13.4s, v14.4s // ..............................................................................................e...............................................'...............................................................................................~.................................. + // trn1 v27.4s, v15.4s, v16.4s // .......................................................................................e......................................................'........................................................................................~......................................... + // trn2 v28.4s, v15.4s, v16.4s // ......................................................................................e.......................................................'.......................................................................................~.......................................... + // trn2 v15.2d, v25.2d, v27.2d // ...................................................................................................e..........................................'....................................................................................................~............................. + // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................................e...........................................'...................................................................................................~.............................. 
+ // trn1 v13.2d, v25.2d, v27.2d // .....................................................................................................................................e........'.................................................................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // ...............................................................................................................e..............................'................................................................................................................~................. + // ldr q0, [ x5], #(12*16) // ....................................................................e.........................................................................'.....................................................................~............................................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // .......................................................................e......................................................................'........................................................................~......................................................... + // ldr q1, [ x5, #(-12*16 + 2*16)] // ............................e.................................................................................................................'.............................~.................................................................................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ............................................................................................................................................e.'.................................................................................................................................. 
+ // ldr q2, [ x5, #(-12*16 + 4*16)] // ..........................................................................................................................e...................'...........................................................................................................................~...... + // ldr q6, [x5, #(-12*16 + 5*16)] // .......................................................................................................................................e......'.................................................................................................................................. + // sqrdmulh v27.4s, v11.4s, v4.4s // ........................................................................................................e.....................................'.........................................................................................................~........................ + // mul v24.4s, v11.4s, v0.4s // ...........................................................................................................e..................................'............................................................................................................~..................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................................................e...........................'...................................................................................................................~.............. + // sub v11.4s, v9.4s, v24.4s // ..............................................................................................................................e...............'...............................................................................................................................~.. 
+ // add v9.4s, v9.4s, v24.4s // .................................................................................................................................e............'.................................................................................................................................. + // sqrdmulh v27.4s, v12.4s, v4.4s // ........................................................................................................................e.....................'.........................................................................................................................~........ + // mul v24.4s, v12.4s, v0.4s // .....................................................................................................................e........................'......................................................................................................................~........... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................................................................e...........'.................................................................................................................................. + // sub v12.4s, v10.4s, v24.4s // ...........................................................................................................................................e..'.................................................................................................................................. + // add v10.4s, v10.4s, v24.4s // ..........................................................................................................................................e...'.................................................................................................................................. 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // .........~....................................................................................................................................'..........*....................................................................................................................... + // mul v24.4s, v10.4s, v1.4s // ..............................................................................................................................................'*................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................................................~.........................................'.....................................................................................................*............................ + // sub v10.4s, v9.4s, v24.4s // ............................................................................................................................~.................'.............................................................................................................................*.... + // add v9.4s, v9.4s, v24.4s // ................................................................................................................~.............................'.................................................................................................................*................ + // sqrdmulh v27.4s, v12.4s, v6.4s // .......~......................................................................................................................................'........*......................................................................................................................... 
+ // mul v24.4s, v12.4s, v2.4s // .....~........................................................................................................................................'......*........................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................~................................................................................................................'..............................*................................................................................................... + // sub v12.4s, v11.4s, v24.4s // ...............................................................................~..............................................................'................................................................................*................................................. + // add v11.4s, v11.4s, v24.4s // .......................................................................................................................~......................'........................................................................................................................*......... + // ldr q0, [ x5, #(-12*16 + 6*16)] // .........................................................................e....................................................................'..........................................................................~....................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................................................................e...................................................................'...........................................................................~...................................................... 
+ // ldr q1, [ x5, #(-12*16 + 8*16)] // ............................................................................................e.................................................'.............................................................................................~.................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................................................................................e...............................'...............................................................................................................~.................. + // ldr q2, [ x5, #(-12*16 + 10*16)] // ..........................................................e...................................................................................'...........................................................~...................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...................................................................................................................e..........................'....................................................................................................................~............. + // sqrdmulh v27.4s, v15.4s, v4.4s // .........................................................................................................e....................................'..........................................................................................................~....................... + // mul v24.4s, v15.4s, v0.4s // ....................................................................................................................e.........................'.....................................................................................................................~............ 
+ // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................................e................'..............................................................................................................................~... + // sub v15.4s, v13.4s, v24.4s // ..~...........................................................................................................................................'...*.............................................................................................................................. + // add v13.4s, v13.4s, v24.4s // .........................................................................................................................................e....'.................................................................................................................................. + // sqrdmulh v27.4s, v16.4s, v4.4s // ......................................................................................................e.......................................'.......................................................................................................~.......................... + // mul v24.4s, v16.4s, v0.4s // .......................................................................................................e......................................'........................................................................................................~......................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................e................................'..............................................................................................................~................... 
+ // sub v16.4s, v14.4s, v24.4s // ........................................................................................................................................e.....'.................................................................................................................................. + // add v14.4s, v14.4s, v24.4s // ......................................................................................................................e.......................'.......................................................................................................................~.......... + // sqrdmulh v27.4s, v14.4s, v5.4s // ...............................................................................................................................e..............'................................................................................................................................~. + // mul v24.4s, v14.4s, v1.4s // .........................................................................................................................e....................'..........................................................................................................................~....... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................................................................e.......'.................................................................................................................................. + // sub v14.4s, v13.4s, v24.4s // ..............................................................................................................................................*.................................................................................................................................. 
+ // add v13.4s, v13.4s, v24.4s // ............~.................................................................................................................................'.............*.................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v6.4s // ........~.....................................................................................................................................'.........*........................................................................................................................ + // mul v24.4s, v16.4s, v2.4s // .............................................................................................................................................e'.................................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................~..............................................................................'................................................................*................................................................. + // sub v16.4s, v15.4s, v24.4s // ...........................................................................................~..................................................'............................................................................................*..................................... + // add v15.4s, v15.4s, v24.4s // .....................................................................................~........................................................'......................................................................................*........................................... 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ................................................................................................................................~.............'.................................................................................................................................* + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ..........................................................................................................~...................................'...........................................................................................................*...................... sub count, count, #1 cbnz count, layer45678_start - sub v25.4S, v29.4S, v19.4S // *................................................... - ldr q15, [x5, #-176] // .......................*............................ - mls v23.4S, v28.4S, v8.S[0] // .*.................................................. - add x2, x2, #64 // ..*................................................. - mls v20.4S, v14.4S, v8.S[0] // ...*................................................ - add x1, x1, #64 // ....*............................................... - ldr q31, [x5, #-128] // ...............................*.................... - mls v4.4S, v3.4S, v8.S[0] // .....*.............................................. - sqrdmulh v17.4S, v25.4S, v2.S[1] // ......*............................................. - mul v25.4S, v25.4S, v2.S[0] // ............*....................................... - ldr q11, [x5, #-160] // .................................*.................. - // gap // .................................................... - sub v0.4S, v12.4S, v23.4S // .........*.......................................... - ldr q3, [x5, #-112] // ......................................*............. - sub v6.4S, v27.4S, v21.4S // .......*............................................ - // gap // .................................................... 
- add v1.4S, v12.4S, v23.4S // ........*........................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v25.4S, v17.4S, v8.S[0] // .............*...................................... - sub v12.4S, v6.4S, v4.4S // ...........*........................................ - // gap // .................................................... - // gap // .................................................... - sub v16.4S, v1.4S, v20.4S // ...................*................................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v1.4S, v1.4S, v20.4S // ....................*............................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v20.4S, v0.4S, v25.4S // ...............*.................................... - add v19.4S, v0.4S, v25.4S // ..............*..................................... - // gap // .................................................... - // gap // .................................................... - trn2 v5.4S, v1.4S, v16.4S // .....................*.............................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn1 v25.4S, v19.4S, v20.4S // ..................*................................. - trn2 v13.4S, v19.4S, v20.4S // ................*................................... - // gap // .................................................... 
- // gap // .................................................... - trn1 v17.4S, v1.4S, v16.4S // ......................*............................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn2 v2.2D, v5.2D, v13.2D // .........................*.......................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - trn2 v0.2D, v17.2D, v25.2D // ........................*........................... - trn1 v26.2D, v17.2D, v25.2D // ..............................*..................... - // gap // .................................................... - // gap // .................................................... - mul v28.4S, v2.4S, v22.4S // ...........................*........................ - sqrdmulh v7.4S, v2.4S, v15.4S // ............................*....................... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v1.4S, v0.4S, v15.4S // .............................*...................... - mul v15.4S, v0.4S, v22.4S // ..........................*......................... - // gap // .................................................... - // gap // .................................................... - trn1 v20.2D, v5.2D, v13.2D // ..................................*................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v28.4S, v7.4S, v8.S[0] // ................................*................... - // gap // .................................................... 
- // gap // .................................................... - // gap // .................................................... - mls v15.4S, v1.4S, v8.S[0] // ...................................*................ - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - add v1.4S, v20.4S, v28.4S // .....................................*.............. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v21.4S, v26.4S, v15.4S // .......................................*............ - sub v28.4S, v20.4S, v28.4S // ....................................*............... - // gap // .................................................... - // gap // .................................................... - sqrdmulh v2.4S, v1.4S, v18.4S // .........................................*.......... - mul v18.4S, v1.4S, v11.4S // ........................................*........... - // gap // .................................................... - // gap // .................................................... - mul v31.4S, v28.4S, v31.4S // ..........................................*......... - sqrdmulh v5.4S, v28.4S, v3.4S // ...........................................*........ - // gap // .................................................... - // gap // .................................................... - add v11.4S, v6.4S, v4.4S // ..........*......................................... 
- // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - mls v18.4S, v2.4S, v8.S[0] // .............................................*...... - add v14.4S, v26.4S, v15.4S // ............................................*....... - // gap // .................................................... - // gap // .................................................... - mls v31.4S, v5.4S, v8.S[0] // ..............................................*..... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - sub v2.4S, v14.4S, v18.4S // ...............................................*.... - add v1.4S, v14.4S, v18.4S // ................................................*... - // gap // .................................................... - // gap // .................................................... - add v3.4S, v21.4S, v31.4S // .................................................*.. - sub v4.4S, v21.4S, v31.4S // ..................................................*. - // gap // .................................................... - // gap // .................................................... - st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2], #64 // .................*.................................. - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... 
- st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x1], #64 // ...................................................* - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - // gap // .................................................... - - // original source code - // sub v24.4S, v29.4S, v19.4S // *................................................... - // mls v23.4S, v28.4S, v8.S[0] // ..*................................................. - // add x2, x2, #64 // ...*................................................ - // mls v20.4S, v14.4S, v8.S[0] // ....*............................................... - // add x1, x1, #64 // .....*.............................................. - // mls v4.4S, v3.4S, v8.S[0] // .......*............................................ - // sqrdmulh v3.4S, v24.4S, v2.S[1] // ........*........................................... - // sub v31.4S, v27.4S, v21.4S // .............*...................................... - // add v17.4S, v12.4S, v23.4S // ..............*..................................... - // sub v16.4S, v12.4S, v23.4S // ...........*........................................ 
- // add v11.4S, v31.4S, v4.4S // ..........................................*......... - // sub v12.4S, v31.4S, v4.4S // ................*................................... - // mul v4.4S, v24.4S, v2.S[0] // .........*.......................................... - // mls v4.4S, v3.4S, v8.S[0] // ...............*.................................... - // add v0.4S, v16.4S, v4.4S // ....................*............................... - // sub v27.4S, v16.4S, v4.4S // ...................*................................ - // trn2 v5.4S, v0.4S, v27.4S // .......................*............................ - // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2], #64 // ..................................................*. - // trn1 v31.4S, v0.4S, v27.4S // ......................*............................. - // sub v26.4S, v17.4S, v20.4S // .................*.................................. - // add v16.4S, v17.4S, v20.4S // ..................*................................. - // trn2 v6.4S, v16.4S, v26.4S // .....................*.............................. - // trn1 v17.4S, v16.4S, v26.4S // ........................*........................... - // ldr q20, [x5, #-176] // .*.................................................. - // trn2 v0.2D, v17.2D, v31.2D // ..........................*......................... - // trn2 v4.2D, v6.2D, v5.2D // .........................*.......................... - // mul v24.4S, v0.4S, v22.4S // ...............................*.................... - // mul v22.4S, v4.4S, v22.4S // ............................*....................... - // sqrdmulh v4.4S, v4.4S, v20.4S // .............................*...................... - // sqrdmulh v7.4S, v0.4S, v20.4S // ..............................*..................... - // trn1 v0.2D, v17.2D, v31.2D // ...........................*........................ - // ldr q20, [x5, #-128] // ......*............................................. 
- // mls v22.4S, v4.4S, v8.S[0] // .................................*.................. - // ldr q25, [x5, #-160] // ..........*......................................... - // trn1 v6.2D, v6.2D, v5.2D // ................................*................... - // mls v24.4S, v7.4S, v8.S[0] // ..................................*................. - // sub v7.4S, v6.4S, v22.4S // .....................................*.............. - // add v6.4S, v6.4S, v22.4S // ...................................*................ - // ldr q3, [x5, #-112] // ............*....................................... - // sub v26.4S, v0.4S, v24.4S // ....................................*............... - // mul v25.4S, v6.4S, v25.4S // .......................................*............ - // sqrdmulh v18.4S, v6.4S, v18.4S // ......................................*............. - // mul v6.4S, v7.4S, v20.4S // ........................................*........... - // sqrdmulh v28.4S, v7.4S, v3.4S // .........................................*.......... - // add v1.4S, v0.4S, v24.4S // ............................................*....... - // mls v25.4S, v18.4S, v8.S[0] // ...........................................*........ - // mls v6.4S, v28.4S, v8.S[0] // .............................................*...... - // sub v24.4S, v1.4S, v25.4S // ..............................................*..... - // add v23.4S, v1.4S, v25.4S // ...............................................*.... - // add v25.4S, v26.4S, v6.4S // ................................................*... - // sub v26.4S, v26.4S, v6.4S // .................................................*.. 
- // st4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x1], #64 // ...................................................* + // Instructions: 20 + // Expected cycles: 14 + // Expected IPC: 1.43 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v19.4S, v27.4S, v1.4S // .*............................ + sqrdmulh v1.4S, v27.4S, v22.4S // .......*...................... + add x1, x1, #64 // ...*.......................... + // gap // .............................. + mul v21.4S, v10.4S, v17.4S // ....*......................... + sqrdmulh v17.4S, v10.4S, v15.4S // .....*........................ + // gap // .............................. + // gap // .............................. + sqrdmulh v10.4S, v14.4S, v9.4S // ......*....................... + add v11.4S, v0.4S, v16.4S // ........*..................... + // gap // .............................. + // gap // .............................. + mls v19.4S, v1.4S, v8.S[0] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v21.4S, v17.4S, v8.S[0] // .........*.................... + sub v12.4S, v0.4S, v16.4S // *............................. + // gap // .............................. + // gap // .............................. + mls v26.4S, v10.4S, v8.S[0] // ..........*................... + sub v10.4S, v23.4S, v13.4S // ..*........................... + // gap // .............................. + // gap // .............................. + add v2.4S, v3.4S, v19.4S // ................*............. + sub v3.4S, v3.4S, v19.4S // ..................*........... + // gap // .............................. + // gap // .............................. + sub v5.4S, v28.4S, v21.4S // ...........*.................. + add v4.4S, v28.4S, v21.4S // .................*............ + // gap // .............................. 
+ // gap // .............................. + add v13.4S, v10.4S, v26.4S // ............*................. + sub v14.4S, v10.4S, v26.4S // .............*................ + // gap // .............................. + // gap // .............................. + st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + st4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x2], #64 // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v11.4S, v0.4S, v16.4S // .........*..................... + // mul v6.4S, v27.4S, v1.4S // *.............................. + // sub v13.4S, v23.4S, v13.4S // ...........*................... + // add x1, x1, #64 // ..*............................ + // mul v23.4S, v10.4S, v17.4S // ...*........................... + // sqrdmulh v20.4S, v10.4S, v15.4S // ....*.......................... + // sqrdmulh v21.4S, v14.4S, v9.4S // .....*......................... + // sqrdmulh v5.4S, v27.4S, v22.4S // .*............................. + // add v10.4S, v0.4S, v16.4S // ......*........................ + // mls v23.4S, v20.4S, v8.S[0] // ........*...................... 
+ // mls v26.4S, v21.4S, v8.S[0] // ..........*.................... + // sub v27.4S, v28.4S, v23.4S // ..............*................ + // add v12.4S, v13.4S, v26.4S // ................*.............. + // sub v13.4S, v13.4S, v26.4S // .................*............. + // mls v6.4S, v5.4S, v8.S[0] // .......*....................... + // st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2], #64 // ...................*........... + // add v24.4S, v3.4S, v6.4S // ............*.................. + // add v26.4S, v28.4S, v23.4S // ...............*............... + // sub v25.4S, v3.4S, v6.4S // .............*................. + // st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // ..................*............ pop_stack diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a55.s b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a55.s index 32d7441d..c23486f4 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a55.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a55.s @@ -2,41 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -57,15 +29,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -74,12 +46,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -99,24 +65,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, 
(16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -141,40 +107,40 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr 
qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -196,7 +162,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -207,7 +173,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -217,7 +183,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -225,7 +191,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -236,19 +202,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // 
slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -261,7 +227,7 @@ roots: .text .global ntt_dilithium_123_45678_w_scalar_opt_a55 - .global _ntt_dilithium_123_45678_w_scalar_opt_a55 + .global _ntt_dilithium_123_45678_w_scalar .p2align 4 const_addr: .word 8380417 @@ -385,512 +351,543 @@ _ntt_dilithium_123_45678_w_scalar_opt_a55: load_roots_123 .p2align 2 - ldr x16, [x0, #896] // ..................*.................... - // gap // ....................................... - ldr x10, [x0, #640] // .............*......................... - // gap // ....................................... - ldr x29, [x0, #904] // .....................*................. - // gap // ....................................... - ldr x25, [x0, #648] // ..............*........................ - // gap // ....................................... - vins v30, x16, 0 // .......................*............... - ldr x16, [x0, #768] // ...............*....................... - vins v22, x10, 0 // .................*..................... - ldr x18, [x0, #256] // ..*.................................... - vins v30, x29, 1 // ........................*.............. - ldr x29, [x0, #776] // ................*...................... - vins v22, x25, 1 // ...................*................... - ldr x10, [x0, #384] // ....*.................................. - mul v17.4S, v30.4S, v0.S[0] // .........................*............. - vins v31, x16, 0 // ....................*.................. - sqrdmulh v20.4S, v30.4S, v0.S[1] // ..........................*............ - vins v15, x18, 0 // ........*.............................. - vins v31, x29, 1 // ......................*................ - ldr x16, [x0, #128] // *...................................... - mul v21.4S, v22.4S, v0.S[0] // ..............................*........ 
- ldr x29, [x0, #392] // .....*................................. - sqrdmulh v30.4S, v22.4S, v0.S[1] // ...............................*....... - ldr x25, [x0, #136] // .*..................................... - mls v17.4S, v20.4S, v8.S[0] // ...........................*........... - vins v20, x10, 0 // .........*............................. - mul v6.4S, v31.4S, v0.S[0] // .................................*..... - ldr x13, [x0, #264] // ...*................................... - sqrdmulh v19.4S, v31.4S, v0.S[1] // ..................................*.... - vins v20, x29, 1 // ...........*........................... - mls v21.4S, v30.4S, v8.S[0] // ................................*...... - ldr x21, [x0, #512] // ......*................................ - add v30.4S, v20.4S, v17.4S // .............................*......... - ldr x29, [x0, #520] // .......*............................... - sub v12.4S, v20.4S, v17.4S // ............................*.......... - // gap // ....................................... - mls v6.4S, v19.4S, v8.S[0] // ....................................*.. - // gap // ....................................... - mul v22.4S, v30.4S, v0.S[2] // ...................................*... - vins v10, x21, 0 // ..........*............................ - sqrdmulh v27.4S, v30.4S, v0.S[3] // .....................................*. - // gap // ....................................... - sqrdmulh v9.4S, v12.4S, v1.S[1] // ......................................* - vins v10, x29, 1 // ............*.......................... - - // original source code - // ldr x16, [x0, #128] // .................*..................... || ..........*............ - // ldr x25, [x0, #136] // .....................*................. || ............*.......... - // ldr x10, [x0, #256] // .......*............................... || .....*................. - // ldr x13, [x0, #264] // .........................*............. || ..............*........ 
- // ldr x29, [x0, #384] // ...........*........................... || .......*............... - // ldr x18, [x0, #392] // ...................*................... || ...........*........... - // ldr x8, [x0, #512] // .............................*......... || ................*...... - // ldr x11, [x0, #520] // ...............................*....... || .................*..... - // vins v15, x10, 0 // ...............*....................... || .........*............. - // vins v4, x29, 0 // .......................*............... || .............*......... - // vins v10, x8, 0 // ...................................*... || ....................*.. - // vins v4, x18, 1 // ...........................*........... || ...............*....... - // vins v10, x11, 1 // ......................................* || ......................* - // ldr x29, [x0, #640] // .*..................................... || .*..................... - // ldr x10, [x0, #648] // ...*................................... || ...*................... - // ldr x18, [x0, #768] // .....*................................. || ....*.................. - // ldr x8, [x0, #776] // .........*............................. || ......*................ - // vins v17, x29, 0 // ......*................................ || .....*................. - // ldr x29, [x0, #896] // *...................................... || *...................... - // vins v17, x10, 1 // ..........*............................ || .......*............... - // vins v26, x18, 0 // .............*......................... || ........*.............. - // ldr x10, [x0, #904] // ..*.................................... || ..*.................... - // vins v26, x8, 1 // ................*...................... || ..........*............ - // vins v16, x29, 0 // ....*.................................. || ....*.................. - // vins v16, x10, 1 // ........*.............................. || ......*................ 
- // mul v20.4S, v16.4S, v0.S[0] // ............*.......................... || ........*.............. - // sqrdmulh v27.4S, v16.4S, v0.S[1] // ..............*........................ || .........*............. - // mls v20.4S, v27.4S, v8.S[0] // ......................*................ || .............*......... - // sub v12.4S, v4.4S, v20.4S // ................................*...... || ..................*.... - // add v20.4S, v4.4S, v20.4S // ..............................*........ || .................*..... - // mul v21.4S, v17.4S, v0.S[0] // ..................*.................... || ...........*........... - // sqrdmulh v17.4S, v17.4S, v0.S[1] // ....................*.................. || ............*.......... - // mls v21.4S, v17.4S, v8.S[0] // ............................*.......... || ................*...... - // mul v6.4S, v26.4S, v0.S[0] // ........................*.............. || ..............*........ - // sqrdmulh v30.4S, v26.4S, v0.S[1] // ..........................*............ || ...............*....... - // mul v22.4S, v20.4S, v0.S[2] // ..................................*.... || ....................*.. - // mls v6.4S, v30.4S, v8.S[0] // .................................*..... || ...................*... - // sqrdmulh v27.4S, v20.4S, v0.S[3] // ....................................*.. || .....................*. - // sqrdmulh v9.4S, v12.4S, v1.S[1] // .....................................*. || ......................* - + // Instructions: 10 + // Expected cycles: 17 + // Expected IPC: 0.59 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q6, [x0, #512] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #896] // .......*...................... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + ldr q9, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #256] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q26, [x0, #384] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #640] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v12.4S, v20.4S, v0.S[0] // .........*.................... + // gap // .............................. + mul v19.4S, v6.4S, v0.S[0] // ......*....................... + // gap // .............................. + ldr q31, [x0, #768] // ........*..................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q9, [x0, #0] // ..*............................ + // ldr q15, [x0, #128] // ...*........................... + // ldr q4, [x0, #256] // ....*.......................... + // ldr q26, [x0, #384] // .....*......................... + // ldr q6, [x0, #512] // *.............................. + // ldr q22, [x0, #640] // ......*........................ + // mul v19.4S, v6.4S, v0.S[0] // ........*...................... + // ldr q20, [x0, #896] // .*............................. + // ldr q31, [x0, #768] // .........*..................... + // mul v12.4S, v20.4S, v0.S[0] // .......*....................... 
+ sub count, count, #1 -.p2align 2 layer123_start: - mul v30.4S, v10.4S, v0.S[0] // ................................*................................................................... - ldr x29, [x0, #0] // *................................................................................................... - mul v20.4S, v12.4S, v1.S[0] // ...................................................................*................................ - vins v17, x16, 0 // ......*............................................................................................. - sqrdmulh v31.4S, v10.4S, v0.S[1] // .................................*.................................................................. - vins v15, x13, 1 // ...........*........................................................................................ - mls v22.4S, v27.4S, v8.S[0] // ...........................................................*........................................ - vins v17, x25, 1 // .......*............................................................................................ - add v27.4S, v15.4S, v6.4S // ..............................................*..................................................... - vins v25, x29, 0 // ..*................................................................................................. - add v12.4S, v17.4S, v21.4S // .........................................*.......................................................... - ldr x29, [x0, #8] // .*.................................................................................................. - mls v30.4S, v31.4S, v8.S[0] // ..................................*................................................................. - ldr x16, [x0, #144] // ....e............................................................................................... - mul v31.4S, v27.4S, v0.S[2] // ....................................................*............................................... 
- ldr x25, [x0, #152] // .....e.............................................................................................. - add v4.4S, v12.4S, v22.4S // .............................................................*...................................... - ldr x10, [x0, #272] // ........e........................................................................................... - sqrdmulh v27.4S, v27.4S, v0.S[3] // .....................................................*.............................................. - vins v25, x29, 1 // ...*................................................................................................ - mls v20.4S, v9.4S, v8.S[0] // .....................................................................*.............................. - ldr x13, [x0, #280] // .........e.......................................................................................... - mul v9.4S, v4.4S, v1.S[2] // ........................................................................*........................... - ldr x29, [x0, #400] // ............e....................................................................................... - sqrdmulh v4.4S, v4.4S, v1.S[3] // .........................................................................*.......................... - ldr x18, [x0, #408] // .............e...................................................................................... - mls v31.4S, v27.4S, v8.S[0] // ......................................................*............................................. - ldr x8, [x0, #528] // ................e................................................................................... - add v27.4S, v25.4S, v30.4S // ....................................*............................................................... - ldr x11, [x0, #536] // .................e.................................................................................. 
- sub v26.4S, v15.4S, v6.4S // .............................................*...................................................... - vins v15, x10, 0 // ..........e......................................................................................... - mls v9.4S, v4.4S, v8.S[0] // ..........................................................................*......................... - vins v4, x29, 0 // ..............e..................................................................................... - add v6.4S, v27.4S, v31.4S // ........................................................*........................................... - vins v10, x8, 0 // ..................e................................................................................. - sub v22.4S, v12.4S, v22.4S // ............................................................*....................................... - vins v4, x18, 1 // ...............e.................................................................................... - sub v17.4S, v17.4S, v21.4S // ........................................*........................................................... - vins v10, x11, 1 // ...................e................................................................................ - sub v12.4S, v6.4S, v9.4S // ...........................................................................*........................ - ldr x29, [x0, #656] // ....................e............................................................................... - sub v30.4S, v25.4S, v30.4S // ...................................*................................................................ - ldr x10, [x0, #664] // .....................e.............................................................................. - sub v25.4S, v17.4S, v20.4S // ......................................................................*............................. 
- ldr x18, [x0, #784] // ........................e........................................................................... - str_vo v12, x0, 128 // .............................................................................................*...... - ldr x8, [x0, #792] // .........................e.......................................................................... - add v20.4S, v17.4S, v20.4S // .......................................................................*............................ - vins v17, x29, 0 // ......................e............................................................................. - sub v31.4S, v27.4S, v31.4S // .......................................................*............................................ - ldr x29, [x0, #912] // ............................e....................................................................... - mul v27.4S, v26.4S, v1.S[0] // ..............................................................*..................................... - vins v17, x10, 1 // .......................e............................................................................ - sqrdmulh v12.4S, v26.4S, v1.S[1] // ...............................................................*.................................... - vins v26, x18, 0 // ..........................e......................................................................... - add v9.4S, v6.4S, v9.4S // ............................................................................*....................... - ldr x10, [x0, #920] // .............................e...................................................................... - mul v6.4S, v22.4S, v2.S[0] // .............................................................................*...................... - vins v26, x8, 1 // ...........................e........................................................................ 
- sqrdmulh v22.4S, v22.4S, v2.S[1] // ..............................................................................*..................... - vins v16, x29, 0 // ..............................e..................................................................... - mls v27.4S, v12.4S, v8.S[0] // ................................................................*................................... - // gap // .................................................................................................... - mul v12.4S, v20.4S, v2.S[2] // ..................................................................................*................. - vins v16, x10, 1 // ...............................e.................................................................... - sqrdmulh v20.4S, v20.4S, v2.S[3] // ...................................................................................*................ - // gap // .................................................................................................... - mls v6.4S, v22.4S, v8.S[0] // ...............................................................................*.................... - // gap // .................................................................................................... - sub v22.4S, v30.4S, v27.4S // .................................................................*.................................. - // gap // .................................................................................................... - add v30.4S, v30.4S, v27.4S // ..................................................................*................................. - // gap // .................................................................................................... - mls v12.4S, v20.4S, v8.S[0] // ....................................................................................*............... 
- // gap // .................................................................................................... - mul v20.4S, v16.4S, v0.S[0] // ...............................................e.................................................... - // gap // .................................................................................................... - sqrdmulh v27.4S, v16.4S, v0.S[1] // ................................................e................................................... - // gap // .................................................................................................... - sub v16.4S, v31.4S, v6.4S // ................................................................................*................... - // gap // .................................................................................................... - add v31.4S, v31.4S, v6.4S // .................................................................................*.................. - // gap // .................................................................................................... - sub v6.4S, v30.4S, v12.4S // .....................................................................................*.............. - // gap // .................................................................................................... - mls v20.4S, v27.4S, v8.S[0] // .................................................e.................................................. - // gap // .................................................................................................... - add v30.4S, v30.4S, v12.4S // ......................................................................................*............. - // gap // .................................................................................................... - mul v27.4S, v25.4S, v3.S[0] // .......................................................................................*............ 
- // gap // .................................................................................................... - sqrdmulh v25.4S, v25.4S, v3.S[1] // ........................................................................................*........... - // gap // .................................................................................................... - sub v12.4S, v4.4S, v20.4S // ..................................................e................................................. - // gap // .................................................................................................... - add v20.4S, v4.4S, v20.4S // ...................................................e................................................ - // gap // .................................................................................................... - str_vi v9, x0, 16 // ............................................................................................*....... - // gap // .................................................................................................... - mls v27.4S, v25.4S, v8.S[0] // .........................................................................................*.......... - // gap // .................................................................................................... - str_vo v31, x0, 240 // ..............................................................................................*..... - // gap // .................................................................................................... - mul v21.4S, v17.4S, v0.S[0] // .....................................e.............................................................. - // gap // .................................................................................................... - str_vo v16, x0, 368 // ...............................................................................................*.... 
- // gap // .................................................................................................... - sqrdmulh v17.4S, v17.4S, v0.S[1] // ......................................e............................................................. - // gap // .................................................................................................... - sub v31.4S, v22.4S, v27.4S // ..........................................................................................*......... - // gap // .................................................................................................... - add v22.4S, v22.4S, v27.4S // ...........................................................................................*........ - // gap // .................................................................................................... - str_vo v30, x0, 496 // ................................................................................................*... - // gap // .................................................................................................... - mls v21.4S, v17.4S, v8.S[0] // .......................................e............................................................ - // gap // .................................................................................................... - str_vo v6, x0, 624 // .................................................................................................*.. - // gap // .................................................................................................... - mul v6.4S, v26.4S, v0.S[0] // ..........................................e......................................................... - // gap // .................................................................................................... - sqrdmulh v30.4S, v26.4S, v0.S[1] // ...........................................e........................................................ 
- // gap // .................................................................................................... - str_vo v22, x0, 752 // ..................................................................................................*. - // gap // .................................................................................................... - mul v22.4S, v20.4S, v0.S[2] // .........................................................e.......................................... - // gap // .................................................................................................... - str_vo v31, x0, 880 // ...................................................................................................* - // gap // .................................................................................................... - mls v6.4S, v30.4S, v8.S[0] // ............................................e....................................................... - // gap // .................................................................................................... - sqrdmulh v27.4S, v20.4S, v0.S[3] // ..........................................................e......................................... - // gap // .................................................................................................... - sqrdmulh v9.4S, v12.4S, v1.S[1] // ....................................................................e............................... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ........................................................................................*............................................................................................... || ..............................................................*................................................................ 
- // ldr x11, [x0, #8] // ..................................................................................................*..................................................................................... || ...................................................................*........................................................... - // vins v9, x10, 0 // ................................................................................................*....................................................................................... || ..................................................................*............................................................ - // vins v9, x11, 1 // ..........................................................................................................*............................................................................. || .......................................................................*....................................................... - // ldr x10, [x0, #128] // e....................................................................................................................................................................................... || e.............................................................................................................................. - // ldr x11, [x0, #136] // ..e..................................................................................................................................................................................... || .e............................................................................................................................. - // vins v10, x10, 0 // ..........................................................................................*............................................................................................. 
|| ...............................................................*............................................................... - // vins v10, x11, 1 // ..............................................................................................*......................................................................................... || .................................................................*............................................................. - // ldr x10, [x0, #256] // ....e................................................................................................................................................................................... || ..e............................................................................................................................ - // ldr x11, [x0, #264] // ........e............................................................................................................................................................................... || ....e.......................................................................................................................... - // vins v11, x10, 0 // ..................e..................................................................................................................................................................... || .........e..................................................................................................................... - // vins v11, x11, 1 // ............................................................................................*........................................................................................... || ................................................................*.............................................................. 
- // ldr x10, [x0, #384] // ..........e............................................................................................................................................................................. || .....e......................................................................................................................... - // ldr x11, [x0, #392] // ............e........................................................................................................................................................................... || ......e........................................................................................................................ - // vins v12, x10, 0 // ....................e................................................................................................................................................................... || ..........e.................................................................................................................... - // vins v12, x11, 1 // ........................e............................................................................................................................................................... || ............e.................................................................................................................. - // ldr x10, [x0, #512] // ..............e......................................................................................................................................................................... || .......e....................................................................................................................... - // ldr x11, [x0, #520] // ................e....................................................................................................................................................................... 
|| ........e...................................................................................................................... - // vins v13, x10, 0 // ......................e................................................................................................................................................................. || ...........e................................................................................................................... - // vins v13, x11, 1 // ..........................e............................................................................................................................................................. || .............e................................................................................................................. - // ldr x10, [x0, #640] // ............................e........................................................................................................................................................... || ..............e................................................................................................................ - // ldr x11, [x0, #648] // ..............................e......................................................................................................................................................... || ...............e............................................................................................................... - // vins v14, x10, 0 // ....................................e................................................................................................................................................... || ..................e............................................................................................................ 
- // vins v14, x11, 1 // ........................................e............................................................................................................................................... || ....................e.......................................................................................................... - // ldr x10, [x0, #768] // ................................e....................................................................................................................................................... || ................e.............................................................................................................. - // ldr x11, [x0, #776] // ..................................e..................................................................................................................................................... || .................e............................................................................................................. - // vins v15, x10, 0 // ..........................................e............................................................................................................................................. || .....................e......................................................................................................... - // vins v15, x11, 1 // ..............................................e......................................................................................................................................... || .......................e....................................................................................................... - // ldr x10, [x0, #896] // ......................................e................................................................................................................................................. 
|| ...................e........................................................................................................... - // ldr x11, [x0, #904] // ............................................e........................................................................................................................................... || ......................e........................................................................................................ - // vins v16, x10, 0 // ................................................e....................................................................................................................................... || ........................e...................................................................................................... - // vins v16, x11, 1 // ...................................................e.................................................................................................................................... || ..........................e.................................................................................................... - // mul v24.4S, v13.4S, v0.S[0] // .......................................................................................*................................................................................................ || ..............................................................*................................................................ - // sqrdmulh v13.4S, v13.4S, v0.S[1] // ...........................................................................................*............................................................................................ || ................................................................*.............................................................. 
- // mls v24.4S, v13.4S, v8.S[0] // ...................................................................................................*.................................................................................... || ....................................................................*.......................................................... - // sub v13.4S, v9.4S, v24.4S // .................................................................................................................................*...................................................... || ...................................................................................*........................................... - // add v9.4S, v9.4S, v24.4S // ...................................................................................................................*.................................................................... || ............................................................................*.................................................. - // mul v24.4S, v14.4S, v0.S[0] // .......................................................................e................................................................................................................ || ..............................................e................................................................................ - // sqrdmulh v14.4S, v14.4S, v0.S[1] // .........................................................................e.............................................................................................................. || ................................................e.............................................................................. - // mls v24.4S, v14.4S, v8.S[0] // .............................................................................e.......................................................................................................... 
|| ....................................................e.......................................................................... - // sub v14.4S, v10.4S, v24.4S // .............................................................................................................................*.......................................................... || .................................................................................*............................................. - // add v10.4S, v10.4S, v24.4S // .................................................................................................*...................................................................................... || ...................................................................*........................................................... - // mul v24.4S, v15.4S, v0.S[0] // ...............................................................................e........................................................................................................ || ......................................................e........................................................................ - // sqrdmulh v15.4S, v15.4S, v0.S[1] // ................................................................................e....................................................................................................... || .......................................................e....................................................................... - // mls v24.4S, v15.4S, v8.S[0] // ....................................................................................e................................................................................................... || ...........................................................e................................................................... 
- // sub v15.4S, v11.4S, v24.4S // .....................................................................................................................*.................................................................. || .............................................................................*................................................. - // add v11.4S, v11.4S, v24.4S // ...............................................................................................*........................................................................................ || ..................................................................*............................................................ - // mul v24.4S, v16.4S, v0.S[0] // .........................................................e.............................................................................................................................. || ................................e.............................................................................................. - // sqrdmulh v16.4S, v16.4S, v0.S[1] // ..........................................................e............................................................................................................................. || .................................e............................................................................................. - // mls v24.4S, v16.4S, v8.S[0] // ..............................................................e......................................................................................................................... || .....................................e......................................................................................... - // sub v16.4S, v12.4S, v24.4S // ..................................................................e..................................................................................................................... 
|| .........................................e..................................................................................... - // add v12.4S, v12.4S, v24.4S // ...................................................................e.................................................................................................................... || ..........................................e.................................................................................... - // mul v24.4S, v11.4S, v0.S[2] // .....................................................................................................*.................................................................................. || .....................................................................*......................................................... - // sqrdmulh v11.4S, v11.4S, v0.S[3] // .........................................................................................................*.............................................................................. || .......................................................................*....................................................... - // mls v24.4S, v11.4S, v8.S[0] // .................................................................................................................*...................................................................... || ...........................................................................*................................................... - // sub v11.4S, v9.4S, v24.4S // .........................................................................................................................................*.............................................. || .......................................................................................*....................................... 
- // add v9.4S, v9.4S, v24.4S // .........................................................................................................................*.............................................................. || ...............................................................................*............................................... - // mul v24.4S, v12.4S, v0.S[2] // ..................................................................................e..................................................................................................... || .........................................................e..................................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // .....................................................................................e.................................................................................................. || ............................................................e.................................................................. - // mls v24.4S, v12.4S, v8.S[0] // .............................................................................................*.......................................................................................... || .................................................................*............................................................. - // sub v12.4S, v10.4S, v24.4S // ...........................................................................................................................*............................................................ || ................................................................................*.............................................. - // add v10.4S, v10.4S, v24.4S // .......................................................................................................*................................................................................ 
|| ......................................................................*........................................................ - // mul v24.4S, v15.4S, v1.S[0] // ...........................................................................................................................................*............................................ || ........................................................................................*...................................... - // sqrdmulh v15.4S, v15.4S, v1.S[1] // .............................................................................................................................................*.......................................... || .........................................................................................*..................................... - // mls v24.4S, v15.4S, v8.S[0] // .....................................................................................................................................................*.................................. || .............................................................................................*................................. - // sub v15.4S, v13.4S, v24.4S // ..........................................................................................................................................................*............................. || .................................................................................................*............................. - // add v13.4S, v13.4S, v24.4S // ...........................................................................................................................................................*............................ || ..................................................................................................*............................ 
- // mul v24.4S, v16.4S, v1.S[0] // .........................................................................................*.............................................................................................. || ...............................................................*............................................................... - // sqrdmulh v16.4S, v16.4S, v1.S[1] // ......................................................................................e................................................................................................. || .............................................................e................................................................. - // mls v24.4S, v16.4S, v8.S[0] // ...........................................................................................................*............................................................................ || ........................................................................*...................................................... - // sub v16.4S, v14.4S, v24.4S // ...................................................................................................................................*.................................................... || ....................................................................................*.......................................... - // add v14.4S, v14.4S, v24.4S // .......................................................................................................................................*................................................ || ......................................................................................*........................................ - // mul v24.4S, v10.4S, v1.S[2] // .............................................................................................................*.......................................................................... 
|| .........................................................................*..................................................... - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ...............................................................................................................*........................................................................ || ..........................................................................*.................................................... - // mls v24.4S, v10.4S, v8.S[0] // .......................................................................................................................*................................................................ || ..............................................................................*................................................ - // sub v10.4S, v9.4S, v24.4S // ...............................................................................................................................*........................................................ || ..................................................................................*............................................ - // add v9.4S, v9.4S, v24.4S // ...............................................................................................................................................*........................................ || ..........................................................................................*.................................... - // mul v24.4S, v12.4S, v2.S[0] // .................................................................................................................................................*...................................... || ...........................................................................................*................................... 
- // sqrdmulh v12.4S, v12.4S, v2.S[1] // ...................................................................................................................................................*.................................... || ............................................................................................*.................................. - // mls v24.4S, v12.4S, v8.S[0] // .........................................................................................................................................................*.............................. || ................................................................................................*.............................. - // sub v12.4S, v11.4S, v24.4S // ...............................................................................................................................................................*........................ || ......................................................................................................*........................ - // add v11.4S, v11.4S, v24.4S // ................................................................................................................................................................*....................... || .......................................................................................................*....................... - // mul v24.4S, v14.4S, v2.S[2] // ......................................................................................................................................................*................................. || ..............................................................................................*................................ - // sqrdmulh v14.4S, v14.4S, v2.S[3] // ........................................................................................................................................................*............................... 
|| ...............................................................................................*............................... - // mls v24.4S, v14.4S, v8.S[0] // ............................................................................................................................................................*........................... || ...................................................................................................*........................... - // sub v14.4S, v13.4S, v24.4S // .................................................................................................................................................................*...................... || ........................................................................................................*...................... - // add v13.4S, v13.4S, v24.4S // ...................................................................................................................................................................*.................... || ..........................................................................................................*.................... - // mul v24.4S, v16.4S, v3.S[0] // ....................................................................................................................................................................*................... || ...........................................................................................................*................... - // sqrdmulh v16.4S, v16.4S, v3.S[1] // .....................................................................................................................................................................*.................. || ............................................................................................................*.................. 
- // mls v24.4S, v16.4S, v8.S[0] // .........................................................................................................................................................................*.............. || ................................................................................................................*.............. - // sub v16.4S, v15.4S, v24.4S // ..............................................................................................................................................................................*......... || .....................................................................................................................*......... - // add v15.4S, v15.4S, v24.4S // ...............................................................................................................................................................................*........ || ......................................................................................................................*........ - // str_vi v9, x0, 16 // ........................................................................................................................................................................*............... || ...............................................................................................................*............... - // str_vo v10, x0, 112 // .....................................................................................................................................*.................................................. || .....................................................................................*......................................... - // str_vo v11, x0, 240 // ..........................................................................................................................................................................*............. 
|| .................................................................................................................*............. - // str_vo v12, x0, 368 // ............................................................................................................................................................................*........... || ...................................................................................................................*........... - // str_vo v13, x0, 496 // ................................................................................................................................................................................*....... || .......................................................................................................................*....... - // str_vo v14, x0, 624 // ..................................................................................................................................................................................*..... || .........................................................................................................................*..... - // str_vo v15, x0, 752 // .....................................................................................................................................................................................*.. || ............................................................................................................................*.. 
- // str_vo v16, x0, 880 // .......................................................................................................................................................................................* || ..............................................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Wall time: 4.14s + // User time: 4.14s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v6.4S, v6.4S, v0.S[1] // ........*................................................................... + // gap // ............................................................................ + sqrdmulh v18.4S, v22.4S, v0.S[1] // .............*.............................................................. + // gap // ............................................................................ + mul v16.4S, v22.4S, v0.S[0] // ..............*............................................................. + // gap // ............................................................................ + sqrdmulh v22.4S, v31.4S, v0.S[1] // ..................*......................................................... + // gap // ............................................................................ + mls v19.4S, v6.4S, v8.S[0] // ..........*................................................................. + // gap // ............................................................................ + mul v6.4S, v31.4S, v0.S[0] // ...................*........................................................ + // gap // ............................................................................ + mls v16.4S, v18.4S, v8.S[0] // ...............*............................................................ 
+ // gap // ............................................................................ + sqrdmulh v18.4S, v20.4S, v0.S[1] // .......................*.................................................... + // gap // ............................................................................ + sub v20.4S, v9.4S, v19.4S // ...........*................................................................ + // gap // ............................................................................ + mls v6.4S, v22.4S, v8.S[0] // ....................*....................................................... + // gap // ............................................................................ + add v22.4S, v9.4S, v19.4S // ............*............................................................... + // gap // ............................................................................ + sub v19.4S, v15.4S, v16.4S // ................*........................................................... + // gap // ............................................................................ + add v16.4S, v15.4S, v16.4S // .................*.......................................................... + // gap // ............................................................................ + sub v31.4S, v4.4S, v6.4S // .....................*...................................................... + // gap // ............................................................................ + add v6.4S, v4.4S, v6.4S // ......................*..................................................... + // gap // ............................................................................ + mls v12.4S, v18.4S, v8.S[0] // .........................*.................................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v31.4S, v1.S[1] // ......................................*..................................... 
+ // gap // ............................................................................ + mul v31.4S, v31.4S, v1.S[0] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v9.4S, v6.4S, v0.S[3] // ............................*............................................... + // gap // ............................................................................ + sub v15.4S, v26.4S, v12.4S // ..........................*................................................. + // gap // ............................................................................ + add v12.4S, v26.4S, v12.4S // ...........................*................................................ + // gap // ............................................................................ + mls v31.4S, v18.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + mul v6.4S, v6.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + sqrdmulh v18.4S, v15.4S, v1.S[1] // ...........................................*................................ + // gap // ............................................................................ + mul v15.4S, v15.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + sub v4.4S, v20.4S, v31.4S // .........................................*.................................. + // gap // ............................................................................ + add v20.4S, v20.4S, v31.4S // ..........................................*................................. 
+ // gap // ............................................................................ + mls v6.4S, v9.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v31.4S, v12.4S, v0.S[3] // .................................*.......................................... + // gap // ............................................................................ + mls v15.4S, v18.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + mul v18.4S, v12.4S, v0.S[2] // ..................................*......................................... + // gap // ............................................................................ + sub v12.4S, v22.4S, v6.4S // ...............................*............................................ + // gap // ............................................................................ + add v6.4S, v22.4S, v6.4S // ................................*........................................... + // gap // ............................................................................ + sub v22.4S, v19.4S, v15.4S // ..............................................*............................. + // gap // ............................................................................ + add v19.4S, v19.4S, v15.4S // ...............................................*............................ + // gap // ............................................................................ + mls v18.4S, v31.4S, v8.S[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v22.4S, v3.S[1] // ...............................................................*............ 
+ // gap // ............................................................................ + sqrdmulh v9.4S, v19.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + mul v19.4S, v19.4S, v2.S[2] // ...........................................................*................ + // gap // ............................................................................ + sub v15.4S, v16.4S, v18.4S // ....................................*....................................... + // gap // ............................................................................ + add v18.4S, v16.4S, v18.4S // .....................................*...................................... + // gap // ............................................................................ + mul v16.4S, v22.4S, v3.S[0] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v22.4S, v15.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v26.4S, v18.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + mul v18.4S, v18.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ + mul v15.4S, v15.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ 
+ mls v19.4S, v9.4S, v8.S[0] // ............................................................*............... + // gap // ............................................................................ + mls v16.4S, v31.4S, v8.S[0] // .................................................................*.......... + // gap // ............................................................................ + mls v18.4S, v26.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + mls v15.4S, v22.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + sub v22.4S, v20.4S, v19.4S // .............................................................*.............. + // gap // ............................................................................ + sub v31.4S, v4.4S, v16.4S // ..................................................................*......... + // gap // ............................................................................ + add v16.4S, v4.4S, v16.4S // ...................................................................*........ + // gap // ............................................................................ + add v19.4S, v20.4S, v19.4S // ..............................................................*............. + // gap // ............................................................................ + sub v20.4S, v6.4S, v18.4S // ...................................................*........................ + // gap // ............................................................................ + add v6.4S, v6.4S, v18.4S // ....................................................*....................... + // gap // ............................................................................ 
+ sub v18.4S, v12.4S, v15.4S // ........................................................*................... + // gap // ............................................................................ + add v12.4S, v12.4S, v15.4S // .........................................................*.................. + // gap // ............................................................................ + str q6, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + ldr q9, [x0, #0] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q20, [x0, #112] // .....................................................................*...... + // gap // ............................................................................ + ldr q15, [x0, #128] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q12, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + ldr q4, [x0, #256] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + str q18, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + ldr q26, [x0, #384] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #496] // ........................................................................*... + // gap // ............................................................................ + ldr q6, [x0, #512] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q22, [x0, #624] // .........................................................................*.. + // gap // ............................................................................ + ldr q22, [x0, #640] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ 
+ mul v19.4S, v6.4S, v0.S[0] // .........e.................................................................. + // gap // ............................................................................ + str q31, [x0, #880] // ...........................................................................* + // gap // ............................................................................ + ldr q20, [x0, #896] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q31, [x0, #768] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v12.4S, v20.4S, v0.S[0] // ........................e................................................... + // gap // ............................................................................ + + // ------------------------------------- new position --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------------- + // ldr q9, [x0, #0] // e................'..........................................................~............. + // ldr q10, [x0, #(1*(1024/8))] // ..e..............'............................................................~........... + // ldr q11, [x0, #(2*(1024/8))] // ....e............'..............................................................~......... 
+ // ldr q12, [x0, #(3*(1024/8))] // ......e..........'................................................................~....... + // ldr q13, [x0, #(4*(1024/8))] // ........e........'..................................................................~..... + // ldr q14, [x0, #(5*(1024/8))] // ..........e......'....................................................................~... + // ldr q15, [x0, #(6*(1024/8))] // ...............e.'........................................................................ + // ldr q16, [x0, #(7*(1024/8))] // ..............e..'........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // .................*........................................................................ + // mul v24.4s, v13.4s, v0.s[0] // ............e....'......................................................................~. + // mls v24.4s, v27.4s, v8.s[0] // .................'...*.................................................................... + // sub v13.4s, v9.4s, v24.4s // .................'.......*................................................................ + // add v9.4s, v9.4s, v24.4s // .................'.........*.............................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .................'*....................................................................... + // mul v24.4s, v14.4s, v0.s[0] // .................'.*...................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'.....*.................................................................. + // sub v14.4s, v10.4s, v24.4s // .................'..........*............................................................. + // add v10.4s, v10.4s, v24.4s // .................'...........*............................................................ 
+ // sqrdmulh v27.4s, v15.4s, v0.s[1] // .................'..*..................................................................... + // mul v24.4s, v15.4s, v0.s[0] // .................'....*................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'........*............................................................... + // sub v15.4s, v11.4s, v24.4s // .................'............*........................................................... + // add v11.4s, v11.4s, v24.4s // .................'.............*.......................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................'......*................................................................. + // mul v24.4s, v16.4s, v0.s[0] // ................e'........................................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................'..............*......................................................... + // sub v16.4s, v12.4s, v24.4s // .................'..................*..................................................... + // add v12.4s, v12.4s, v24.4s // .................'...................*.................................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................'.................*...................................................... + // mul v24.4s, v11.4s, v0.s[2] // .................'.....................*.................................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'..........................*............................................. + // sub v11.4s, v9.4s, v24.4s // .................'..............................*......................................... + // add v9.4s, v9.4s, v24.4s // .................'...............................*........................................ 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // .................'...........................*............................................ + // mul v24.4s, v12.4s, v0.s[2] // .................'.............................*.......................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..................................*..................................... + // sub v12.4s, v10.4s, v24.4s // .................'......................................*................................. + // add v10.4s, v10.4s, v24.4s // .................'.......................................*................................ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .................'...............*........................................................ + // mul v24.4s, v15.4s, v1.s[0] // .................'................*....................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................'....................*................................................... + // sub v15.4s, v13.4s, v24.4s // .................'........................*............................................... + // add v13.4s, v13.4s, v24.4s // .................'.........................*.............................................. + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .................'......................*................................................. + // mul v24.4s, v16.4s, v1.s[0] // .................'.......................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // .................'............................*........................................... + // sub v16.4s, v14.4s, v24.4s // .................'................................*....................................... + // add v14.4s, v14.4s, v24.4s // .................'.................................*...................................... 
+ // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................'..........................................*............................. + // mul v24.4s, v10.4s, v1.s[2] // .................'...........................................*............................ + // mls v24.4s, v27.4s, v8.s[0] // .................'...............................................*........................ + // sub v10.4s, v9.4s, v24.4s // .................'.....................................................*.................. + // add v9.4s, v9.4s, v24.4s // .................'......................................................*................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .................'.........................................*.............................. + // mul v24.4s, v12.4s, v2.s[0] // .................'............................................*........................... + // mls v24.4s, v27.4s, v8.s[0] // .................'................................................*....................... + // sub v12.4s, v11.4s, v24.4s // .................'.......................................................*................ + // add v11.4s, v11.4s, v24.4s // .................'........................................................*............... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // .................'....................................*................................... + // mul v24.4s, v14.4s, v2.s[2] // .................'.....................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // .................'.............................................*.......................... + // sub v14.4s, v13.4s, v24.4s // .................'.................................................*...................... + // add v13.4s, v13.4s, v24.4s // .................'....................................................*................... 
+ // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................'...................................*.................................... + // mul v24.4s, v16.4s, v3.s[0] // .................'........................................*............................... + // mls v24.4s, v27.4s, v8.s[0] // .................'..............................................*......................... + // sub v16.4s, v15.4s, v24.4s // .................'..................................................*..................... + // add v15.4s, v15.4s, v24.4s // .................'...................................................*.................... + // str q9, [x0], #(16) // .................'.........................................................*.............. + // str q10, [x0, #(-16 + 1*(1024/8))] // .~...............'...........................................................*............ + // str q11, [x0, #(-16 + 2*(1024/8))] // ...~.............'.............................................................*.......... + // str q12, [x0, #(-16 + 3*(1024/8))] // .....~...........'...............................................................*........ + // str q13, [x0, #(-16 + 4*(1024/8))] // .......~.........'.................................................................*...... + // str q14, [x0, #(-16 + 5*(1024/8))] // .........~.......'...................................................................*.... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...........~.....'.....................................................................*.. + // str q16, [x0, #(-16 + 7*(1024/8))] // .............~...'.......................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v14.4S, v10.4S, v0.S[0] // *............................................................ - vins v28, x16, 0 // ...*......................................................... 
- mul v18.4S, v12.4S, v1.S[0] // ..*.......................................................... - ldr x29, [x0, #0] // .*........................................................... - mls v22.4S, v27.4S, v8.S[0] // ......*...................................................... - vins v15, x13, 1 // .....*....................................................... - sqrdmulh v29.4S, v10.4S, v0.S[1] // ....*........................................................ - ldr x9, [x0, #8] // ...........*................................................. - sub v24.4S, v15.4S, v6.4S // ......................*...................................... - vins v28, x25, 1 // .......*..................................................... - mls v18.4S, v9.4S, v8.S[0] // .................*........................................... - vins v12, x29, 0 // .........*................................................... - sub v5.4S, v28.4S, v21.4S // ..........................*.................................. - // gap // ............................................................. - mul v31.4S, v24.4S, v1.S[0] // .................................*........................... - vins v12, x9, 1 // ................*............................................ - mls v14.4S, v29.4S, v8.S[0] // ............*................................................ - // gap // ............................................................. - add v29.4S, v5.4S, v18.4S // ...............................*............................. - // gap // ............................................................. - sqrdmulh v11.4S, v24.4S, v1.S[1] // ..................................*.......................... - // gap // ............................................................. - add v19.4S, v15.4S, v6.4S // ........*.................................................... - // gap // ............................................................. 
- mul v9.4S, v29.4S, v2.S[2] // .......................................*..................... - // gap // ............................................................. - sqrdmulh v25.4S, v29.4S, v2.S[3] // ........................................*.................... - // gap // ............................................................. - mls v31.4S, v11.4S, v8.S[0] // ......................................*...................... - // gap // ............................................................. - sub v10.4S, v12.4S, v14.4S // ............................*................................ - // gap // ............................................................. - mul v26.4S, v19.4S, v0.S[2] // .............*............................................... - // gap // ............................................................. - mls v9.4S, v25.4S, v8.S[0] // ............................................*................ - // gap // ............................................................. - add v16.4S, v10.4S, v31.4S // ...........................................*................. - // gap // ............................................................. - sqrdmulh v4.4S, v19.4S, v0.S[3] // ...............*............................................. - // gap // ............................................................. - add v20.4S, v12.4S, v14.4S // .....................*....................................... - // gap // ............................................................. - add v15.4S, v16.4S, v9.4S // ................................................*............ - // gap // ............................................................. - add v19.4S, v28.4S, v21.4S // ..........*.................................................. - // gap // ............................................................. - mls v26.4S, v4.4S, v8.S[0] // ....................*........................................ 
- // gap // ............................................................. - sub v14.4S, v5.4S, v18.4S // .............................*............................... - // gap // ............................................................. - add v12.4S, v19.4S, v22.4S // ..............*.............................................. - // gap // ............................................................. - sub v13.4S, v19.4S, v22.4S // .........................*................................... - // gap // ............................................................. - add v22.4S, v20.4S, v26.4S // ........................*.................................... - // gap // ............................................................. - mul v27.4S, v12.4S, v1.S[2] // ..................*.......................................... - // gap // ............................................................. - sqrdmulh v11.4S, v12.4S, v1.S[3] // ...................*......................................... - // gap // ............................................................. - mul v7.4S, v13.4S, v2.S[0] // ....................................*........................ - // gap // ............................................................. - sqrdmulh v4.4S, v13.4S, v2.S[1] // .....................................*....................... - // gap // ............................................................. - mul v19.4S, v14.4S, v3.S[0] // .................................................*........... - // gap // ............................................................. - mls v27.4S, v11.4S, v8.S[0] // .......................*..................................... - // gap // ............................................................. - sqrdmulh v28.4S, v14.4S, v3.S[1] // ..................................................*.......... - // gap // ............................................................. 
- mls v7.4S, v4.4S, v8.S[0] // .........................................*................... - // gap // ............................................................. - sub v11.4S, v20.4S, v26.4S // ................................*............................ - // gap // ............................................................. - sub v1.4S, v22.4S, v27.4S // ...........................*................................. - // gap // ............................................................. - sub v30.4S, v10.4S, v31.4S // ..........................................*.................. - // gap // ............................................................. - sub v17.4S, v11.4S, v7.4S // .............................................*............... - // gap // ............................................................. - str_vo v1, x0, 128 // ..............................*.............................. - // gap // ............................................................. - mls v19.4S, v28.4S, v8.S[0] // ....................................................*........ - // gap // ............................................................. - str_vo v17, x0, 384 // ......................................................*...... - // gap // ............................................................. - sub v20.4S, v16.4S, v9.4S // ...............................................*............. - // gap // ............................................................. - str_vo v15, x0, 512 // .........................................................*... - // gap // ............................................................. - sub v18.4S, v30.4S, v19.4S // .......................................................*..... - // gap // ............................................................. - str_vo v20, x0, 640 // ..........................................................*.. - // gap // ............................................................. 
- add v5.4S, v30.4S, v19.4S // ........................................................*.... - // gap // ............................................................. - str_vo v18, x0, 896 // ............................................................* - // gap // ............................................................. - add v30.4S, v11.4S, v7.4S // ..............................................*.............. - // gap // ............................................................. - str_vo v5, x0, 768 // ...........................................................*. - // gap // ............................................................. - add v3.4S, v22.4S, v27.4S // ...................................*......................... - // gap // ............................................................. - str_vo v30, x0, 256 // .....................................................*....... - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - str_vi v3, x0, 16 // ...................................................*......... - // gap // ............................................................. - - // original source code - // mul v30.4S, v10.4S, v0.S[0] // *............................................................ || *...................................................... - // ldr x29, [x0, #0] // ...*......................................................... || .*..................................................... - // mul v20.4S, v12.4S, v1.S[0] // ..*.......................................................... || .*..................................................... - // vins v17, x16, 0 // .*........................................................... || *...................................................... 
- // sqrdmulh v31.4S, v10.4S, v0.S[1] // ......*...................................................... || ...*................................................... - // vins v15, x13, 1 // .....*....................................................... || ..*.................................................... - // mls v22.4S, v27.4S, v8.S[0] // ....*........................................................ || ..*.................................................... - // vins v17, x25, 1 // .........*................................................... || ....*.................................................. - // add v27.4S, v15.4S, v6.4S // ..................*.......................................... || ...........*........................................... - // vins v25, x29, 0 // ...........*................................................. || .....*................................................. - // add v12.4S, v17.4S, v21.4S // .............................*............................... || ......................*................................ - // ldr x29, [x0, #8] // .......*..................................................... || ...*................................................... - // mls v30.4S, v31.4S, v8.S[0] // ...............*............................................. || ........*.............................................. - // mul v31.4S, v27.4S, v0.S[2] // .......................*..................................... || ................*...................................... - // add v4.4S, v12.4S, v22.4S // ................................*............................ || .........................*............................. - // sqrdmulh v27.4S, v27.4S, v0.S[3] // ..........................*.................................. || ...................*................................... - // vins v25, x29, 1 // ..............*.............................................. || .......*............................................... 
- // mls v20.4S, v9.4S, v8.S[0] // ..........*.................................................. || .....*................................................. - // mul v9.4S, v4.4S, v1.S[2] // ...................................*......................... || ............................*.......................... - // sqrdmulh v4.4S, v4.4S, v1.S[3] // ....................................*........................ || .............................*......................... - // mls v31.4S, v27.4S, v8.S[0] // ..............................*.............................. || .......................*............................... - // add v27.4S, v25.4S, v30.4S // ...........................*................................. || ....................*.................................. - // sub v26.4S, v15.4S, v6.4S // ........*.................................................... || ....*.................................................. - // mls v9.4S, v4.4S, v8.S[0] // ........................................*.................... || .................................*..................... - // add v6.4S, v27.4S, v31.4S // ..................................*.......................... || ...........................*........................... - // sub v22.4S, v12.4S, v22.4S // .................................*........................... || ..........................*............................ - // sub v17.4S, v17.4S, v21.4S // ............*................................................ || ......*................................................ - // sub v12.4S, v6.4S, v9.4S // ............................................*................ || .....................................*................. - // sub v30.4S, v25.4S, v30.4S // ......................*...................................... || ...............*....................................... - // sub v25.4S, v17.4S, v20.4S // ...............................*............................. 
|| ........................*.............................. - // str_vo v12, x0, 128 // ...............................................*............. || ........................................*.............. - // add v20.4S, v17.4S, v20.4S // ................*............................................ || .........*............................................. - // sub v31.4S, v27.4S, v31.4S // ...........................................*................. || ....................................*.................. - // mul v27.4S, v26.4S, v1.S[0] // .............*............................................... || .......*............................................... - // sqrdmulh v12.4S, v26.4S, v1.S[1] // .................*........................................... || ..........*............................................ - // add v9.4S, v6.4S, v9.4S // ..........................................................*.. || ...................................................*... - // mul v6.4S, v22.4S, v2.S[0] // .....................................*....................... || ..............................*........................ - // sqrdmulh v22.4S, v22.4S, v2.S[1] // ......................................*...................... || ...............................*....................... - // mls v27.4S, v12.4S, v8.S[0] // .....................*....................................... || ..............*........................................ - // mul v12.4S, v20.4S, v2.S[2] // ...................*......................................... || ............*.......................................... - // sqrdmulh v20.4S, v20.4S, v2.S[3] // ....................*........................................ || .............*......................................... - // mls v6.4S, v22.4S, v8.S[0] // ..........................................*.................. || ...................................*................... 
- // sub v22.4S, v30.4S, v27.4S // .............................................*............... || ......................................*................ - // add v30.4S, v30.4S, v27.4S // .........................*................................... || ..................*.................................... - // mls v12.4S, v20.4S, v8.S[0] // ........................*.................................... || .................*..................................... - // sub v16.4S, v31.4S, v6.4S // ..............................................*.............. || .......................................*............... - // add v31.4S, v31.4S, v6.4S // ........................................................*.... || .................................................*..... - // sub v6.4S, v30.4S, v12.4S // ..................................................*.......... || ...........................................*........... - // add v30.4S, v30.4S, v12.4S // ............................*................................ || .....................*................................. - // mul v27.4S, v25.4S, v3.S[0] // .......................................*..................... || ................................*...................... - // sqrdmulh v25.4S, v25.4S, v3.S[1] // .........................................*................... || ..................................*.................... - // str_vi v9, x0, 16 // ............................................................* || ......................................................* - // mls v27.4S, v25.4S, v8.S[0] // ................................................*............ || .........................................*............. - // str_vo v31, x0, 240 // ...........................................................*. || ....................................................*.. - // str_vo v16, x0, 368 // .................................................*........... 
|| ..........................................*............ - // sub v31.4S, v22.4S, v27.4S // ....................................................*........ || .............................................*......... - // add v22.4S, v22.4S, v27.4S // ......................................................*...... || ...............................................*....... - // str_vo v30, x0, 496 // ...................................................*......... || ............................................*.......... - // str_vo v6, x0, 624 // .....................................................*....... || ..............................................*........ - // str_vo v22, x0, 752 // .........................................................*... || ..................................................*.... - // str_vo v31, x0, 880 // .......................................................*..... || ................................................*...... - + // Instructions: 66 + // Expected cycles: 67 + // Expected IPC: 0.99 + // + // Wall time: 6.93s + // User time: 6.93s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + sqrdmulh v21.4S, v6.4S, v0.S[1] // *................................................................. + // gap // .................................................................. + sqrdmulh v6.4S, v22.4S, v0.S[1] // .*................................................................ + // gap // .................................................................. + mul v22.4S, v22.4S, v0.S[0] // ..*............................................................... + // gap // .................................................................. + sqrdmulh v18.4S, v31.4S, v0.S[1] // ...*.............................................................. + // gap // .................................................................. 
+ mls v19.4S, v21.4S, v8.S[0] // ....*............................................................. + // gap // .................................................................. + mul v31.4S, v31.4S, v0.S[0] // .....*............................................................ + // gap // .................................................................. + mls v22.4S, v6.4S, v8.S[0] // ......*........................................................... + // gap // .................................................................. + sqrdmulh v6.4S, v20.4S, v0.S[1] // .......*.......................................................... + // gap // .................................................................. + sub v27.4S, v9.4S, v19.4S // ........*......................................................... + // gap // .................................................................. + mls v31.4S, v18.4S, v8.S[0] // .........*........................................................ + // gap // .................................................................. + add v11.4S, v9.4S, v19.4S // ..........*....................................................... + // gap // .................................................................. + sub v7.4S, v15.4S, v22.4S // ...........*...................................................... + // gap // .................................................................. + add v21.4S, v15.4S, v22.4S // ............*..................................................... + // gap // .................................................................. + sub v18.4S, v4.4S, v31.4S // .............*.................................................... + // gap // .................................................................. + add v16.4S, v4.4S, v31.4S // ..............*................................................... + // gap // .................................................................. 
+ mls v12.4S, v6.4S, v8.S[0] // ...............*.................................................. + // gap // .................................................................. + sqrdmulh v6.4S, v18.4S, v1.S[1] // ................*................................................. + // gap // .................................................................. + mul v31.4S, v18.4S, v1.S[0] // .................*................................................ + // gap // .................................................................. + sqrdmulh v20.4S, v16.4S, v0.S[3] // ..................*............................................... + // gap // .................................................................. + sub v18.4S, v26.4S, v12.4S // ...................*.............................................. + // gap // .................................................................. + add v19.4S, v26.4S, v12.4S // ....................*............................................. + // gap // .................................................................. + mls v31.4S, v6.4S, v8.S[0] // .....................*............................................ + // gap // .................................................................. + mul v22.4S, v16.4S, v0.S[2] // ......................*........................................... + // gap // .................................................................. + sqrdmulh v6.4S, v18.4S, v1.S[1] // .......................*.......................................... + // gap // .................................................................. + mul v16.4S, v18.4S, v1.S[0] // ........................*......................................... + // gap // .................................................................. + sub v4.4S, v27.4S, v31.4S // .........................*........................................ + // gap // .................................................................. 
+ mls v22.4S, v20.4S, v8.S[0] // ...........................*...................................... + // gap // .................................................................. + sqrdmulh v18.4S, v19.4S, v0.S[3] // ............................*..................................... + // gap // .................................................................. + mls v16.4S, v6.4S, v8.S[0] // .............................*.................................... + // gap // .................................................................. + mul v6.4S, v19.4S, v0.S[2] // ..............................*................................... + // gap // .................................................................. + add v15.4S, v27.4S, v31.4S // ..........................*....................................... + // gap // .................................................................. + sub v9.4S, v11.4S, v22.4S // ...............................*.................................. + // gap // .................................................................. + sub v20.4S, v7.4S, v16.4S // .................................*................................ + // gap // .................................................................. + mls v6.4S, v18.4S, v8.S[0] // ...................................*.............................. + // gap // .................................................................. + add v12.4S, v11.4S, v22.4S // ................................*................................. + // gap // .................................................................. + add v16.4S, v7.4S, v16.4S // ..................................*............................... + // gap // .................................................................. + sqrdmulh v31.4S, v20.4S, v3.S[1] // ....................................*............................. + // gap // .................................................................. 
+ add v18.4S, v21.4S, v6.4S // ........................................*......................... + // gap // .................................................................. + sqrdmulh v19.4S, v16.4S, v2.S[3] // .....................................*............................ + // gap // .................................................................. + sub v22.4S, v21.4S, v6.4S // .......................................*.......................... + // gap // .................................................................. + sqrdmulh v6.4S, v18.4S, v1.S[3] // ...........................................*...................... + // gap // .................................................................. + mul v18.4S, v18.4S, v1.S[2] // ............................................*..................... + // gap // .................................................................. + mul v26.4S, v16.4S, v2.S[2] // ......................................*........................... + // gap // .................................................................. + mul v20.4S, v20.4S, v3.S[0] // .........................................*........................ + // gap // .................................................................. + sqrdmulh v16.4S, v22.4S, v2.S[1] // ..........................................*....................... + // gap // .................................................................. + mls v18.4S, v6.4S, v8.S[0] // ................................................*................. + // gap // .................................................................. + mul v22.4S, v22.4S, v2.S[0] // .............................................*.................... + // gap // .................................................................. + mls v26.4S, v19.4S, v8.S[0] // ..............................................*................... + // gap // .................................................................. 
+ mls v20.4S, v31.4S, v8.S[0] // ...............................................*.................. + // gap // .................................................................. + add v6.4S, v12.4S, v18.4S // .......................................................*.......... + // gap // .................................................................. + mls v22.4S, v16.4S, v8.S[0] // .................................................*................ + // gap // .................................................................. + sub v16.4S, v15.4S, v26.4S // ..................................................*............... + // gap // .................................................................. + str q6, [x0], #(16) // ..........................................................*....... + // gap // .................................................................. + sub v18.4S, v12.4S, v18.4S // ......................................................*........... + // gap // .................................................................. + str q16, [x0, #624] // ...............................................................*.. + // gap // .................................................................. + add v6.4S, v9.4S, v22.4S // .........................................................*........ + // gap // .................................................................. + str q18, [x0, #112] // ...........................................................*...... + // gap // .................................................................. + sub v18.4S, v9.4S, v22.4S // ........................................................*......... + // gap // .................................................................. + str q6, [x0, #240] // ............................................................*..... + // gap // .................................................................. 
+ add v6.4S, v15.4S, v26.4S // .....................................................*............ + // gap // .................................................................. + str q18, [x0, #368] // .............................................................*.... + // gap // .................................................................. + add v18.4S, v4.4S, v20.4S // ....................................................*............. + // gap // .................................................................. + str q6, [x0, #496] // ..............................................................*... + // gap // .................................................................. + sub v6.4S, v4.4S, v20.4S // ...................................................*.............. + // gap // .................................................................. + str q18, [x0, #752] // ................................................................*. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q6, [x0, #880] // .................................................................* + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // sqrdmulh v6.4S, v6.4S, v0.S[1] // *................................................................. + // sqrdmulh v18.4S, v22.4S, v0.S[1] // .*................................................................ + // mul v16.4S, v22.4S, v0.S[0] // ..*............................................................... + // sqrdmulh v22.4S, v31.4S, v0.S[1] // ...*.............................................................. 
+ // mls v19.4S, v6.4S, v8.S[0] // ....*............................................................. + // mul v6.4S, v31.4S, v0.S[0] // .....*............................................................ + // mls v16.4S, v18.4S, v8.S[0] // ......*........................................................... + // sqrdmulh v18.4S, v20.4S, v0.S[1] // .......*.......................................................... + // sub v20.4S, v9.4S, v19.4S // ........*......................................................... + // mls v6.4S, v22.4S, v8.S[0] // .........*........................................................ + // add v22.4S, v9.4S, v19.4S // ..........*....................................................... + // sub v19.4S, v15.4S, v16.4S // ...........*...................................................... + // add v16.4S, v15.4S, v16.4S // ............*..................................................... + // sub v31.4S, v4.4S, v6.4S // .............*.................................................... + // add v6.4S, v4.4S, v6.4S // ..............*................................................... + // mls v12.4S, v18.4S, v8.S[0] // ...............*.................................................. + // sqrdmulh v18.4S, v31.4S, v1.S[1] // ................*................................................. + // mul v31.4S, v31.4S, v1.S[0] // .................*................................................ + // sqrdmulh v9.4S, v6.4S, v0.S[3] // ..................*............................................... + // sub v15.4S, v26.4S, v12.4S // ...................*.............................................. + // add v12.4S, v26.4S, v12.4S // ....................*............................................. + // mls v31.4S, v18.4S, v8.S[0] // .....................*............................................ + // mul v6.4S, v6.4S, v0.S[2] // ......................*........................................... 
+ // sqrdmulh v18.4S, v15.4S, v1.S[1] // .......................*.......................................... + // mul v15.4S, v15.4S, v1.S[0] // ........................*......................................... + // sub v4.4S, v20.4S, v31.4S // .........................*........................................ + // add v20.4S, v20.4S, v31.4S // ..............................*................................... + // mls v6.4S, v9.4S, v8.S[0] // ..........................*....................................... + // sqrdmulh v31.4S, v12.4S, v0.S[3] // ...........................*...................................... + // mls v15.4S, v18.4S, v8.S[0] // ............................*..................................... + // mul v18.4S, v12.4S, v0.S[2] // .............................*.................................... + // sub v12.4S, v22.4S, v6.4S // ...............................*.................................. + // add v6.4S, v22.4S, v6.4S // ..................................*............................... + // sub v22.4S, v19.4S, v15.4S // ................................*................................. + // add v19.4S, v19.4S, v15.4S // ...................................*.............................. + // mls v18.4S, v31.4S, v8.S[0] // .................................*................................ + // sqrdmulh v31.4S, v22.4S, v3.S[1] // ....................................*............................. + // sqrdmulh v9.4S, v19.4S, v2.S[3] // ......................................*........................... + // mul v19.4S, v19.4S, v2.S[2] // ..........................................*....................... + // sub v15.4S, v16.4S, v18.4S // .......................................*.......................... + // add v18.4S, v16.4S, v18.4S // .....................................*............................ + // mul v16.4S, v22.4S, v3.S[0] // ...........................................*...................... 
+ // sqrdmulh v22.4S, v15.4S, v2.S[1] // ............................................*..................... + // sqrdmulh v26.4S, v18.4S, v1.S[3] // ........................................*......................... + // mul v18.4S, v18.4S, v1.S[2] // .........................................*........................ + // mul v15.4S, v15.4S, v2.S[0] // ..............................................*................... + // mls v19.4S, v9.4S, v8.S[0] // ...............................................*.................. + // mls v16.4S, v31.4S, v8.S[0] // ................................................*................. + // mls v18.4S, v26.4S, v8.S[0] // .............................................*.................... + // mls v15.4S, v22.4S, v8.S[0] // ..................................................*............... + // sub v22.4S, v20.4S, v19.4S // ...................................................*.............. + // sub v31.4S, v4.4S, v16.4S // ...............................................................*.. + // add v16.4S, v4.4S, v16.4S // .............................................................*.... + // add v19.4S, v20.4S, v19.4S // ...........................................................*...... + // sub v20.4S, v6.4S, v18.4S // .....................................................*............ + // add v6.4S, v6.4S, v18.4S // .................................................*................ + // sub v18.4S, v12.4S, v15.4S // .........................................................*........ + // add v12.4S, v12.4S, v15.4S // .......................................................*.......... + // str q6, [x0], #(16) // ....................................................*............. + // str q20, [x0, #112] // ........................................................*......... + // str q12, [x0, #240] // ..........................................................*....... 
+ // str q18, [x0, #368] // ............................................................*..... + // str q19, [x0, #496] // ..............................................................*... + // str q22, [x0, #624] // ......................................................*........... + // str q16, [x0, #752] // ................................................................*. + // str q31, [x0, #880] // .................................................................* + restore inp, STACK0 add inpp, inp, #64 @@ -909,994 +906,1035 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - ldr x13, [x4] , #64 // *... - // gap // .... - ldr x10, [x4, #-56] // .*.. - // gap // .... - ldr x22, [x4, #-48] // ..*. - // gap // .... - ldr x28, [x4, #-40] // ...* - // gap // .... - - // original source code - // ldr x13, [x4] , #64 // *... || *... - // ldr x10, [x4, #-56] // .*.. || .*.. - // ldr x22, [x4, #-48] // ..*. || ..*. - // ldr x28, [x4, #-40] // ...* || ...* - + // Instructions: 19 + // Expected cycles: 37 + // Expected IPC: 0.51 + // + // Wall time: 0.15s + // User time: 0.15s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q17, [x5, #176] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q31, [x1, #64] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x1, #112] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q3, [x1, #80] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x1, #96] // ..*........................... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x2, #64] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x2, #80] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q9, [x2, #96] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q4, [x2, #112] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q21, [x4], #64 // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x4, #-48] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q2, [x5, #16] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q30, [x5, #32] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q14, [x5, #48] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x5, #64] // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ ldr q5, [x5, #80] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q23, [x5, #96] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q0, [x5, #160] // .................*............ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q1, [x5, #112] // ................*............. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q31, [x1, #64] // .*............................. + // ldr q3, [x1, #80] // ...*........................... + // ldr q12, [x1, #96] // ....*.......................... + // ldr q15, [x1, #112] // ..*............................ + // ldr q16, [x2, #64] // .....*......................... + // ldr q22, [x2, #80] // ......*........................ + // ldr q9, [x2, #96] // .......*....................... + // ldr q4, [x2, #112] // ........*...................... + // ldr q21, [x4], #64 // .........*..................... + // ldr q7, [x4, #-48] // ..........*.................... + // ldr q2, [x5, #16] // ...........*................... + // ldr q30, [x5, #32] // ............*.................. + // ldr q14, [x5, #48] // .............*................. + // ldr q11, [x5, #64] // ..............*................ + // ldr q5, [x5, #80] // ...............*............... + // ldr q23, [x5, #96] // ................*.............. + // ldr q1, [x5, #112] // ..................*............ + // ldr q0, [x5, #160] // .................*............. + // ldr q17, [x5, #176] // *.............................. 
+ sub count, count, #1 -.p2align 2 layer45678_start: - vins v11, x13, 0 // ....................................*................................................................................................................................................................................... - ldr x16, [x2, #80] // ....................*................................................................................................................................................................................................... - vins v20, x22, 0 // ........................................*............................................................................................................................................................................... - ldr x22, [x2, #112] // ............................*........................................................................................................................................................................................... - vins v11, x10, 1 // .....................................*.................................................................................................................................................................................. - ldr x13, [x2, #88] // .....................*.................................................................................................................................................................................................. - vins v20, x28, 1 // .........................................*.............................................................................................................................................................................. - ldr x10, [x2, #120] // .............................*.......................................................................................................................................................................................... 
- vins v23, x16, 0 // ......................*................................................................................................................................................................................................. - ldr x16, [x2, #64] // ................*....................................................................................................................................................................................................... - vins v25, x22, 0 // ..............................*......................................................................................................................................................................................... - ldr x22, [x2, #72] // .................*...................................................................................................................................................................................................... - vins v23, x13, 1 // .......................*................................................................................................................................................................................................ - ldr x13, [x2, #96] // ........................*............................................................................................................................................................................................... - vins v25, x10, 1 // ...............................*........................................................................................................................................................................................ - ldr x10, [x2, #104] // .........................*.............................................................................................................................................................................................. 
- mul v7.4S, v23.4S, v11.S[0] // .......................................................*................................................................................................................................................................ - vins v3, x16, 0 // ..................*..................................................................................................................................................................................................... - sqrdmulh v23.4S, v23.4S, v11.S[1] // ........................................................*............................................................................................................................................................... - add x2, x2, #64 // .................................*...................................................................................................................................................................................... - mul v18.4S, v25.4S, v11.S[0] // .................................................................*...................................................................................................................................................... - vins v3, x22, 1 // ...................*.................................................................................................................................................................................................... - sqrdmulh v25.4S, v25.4S, v11.S[1] // ..................................................................*..................................................................................................................................................... - ldr x16, [x1, #112] // ............*........................................................................................................................................................................................................... 
- mul v27.4S, v3.4S, v11.S[0] // ..................................................*..................................................................................................................................................................... - vins v10, x13, 0 // ..........................*............................................................................................................................................................................................. - sqrdmulh v3.4S, v3.4S, v11.S[1] // ...................................................*.................................................................................................................................................................... - ldr x22, [x1, #120] // .............*.......................................................................................................................................................................................................... - mls v7.4S, v23.4S, v8.S[0] // .........................................................*.............................................................................................................................................................. - vins v10, x10, 1 // ...........................*............................................................................................................................................................................................ - mls v18.4S, v25.4S, v8.S[0] // ...................................................................*.................................................................................................................................................... - vins v23, x16, 0 // ..............*......................................................................................................................................................................................................... 
- mul v25.4S, v10.4S, v11.S[0] // ............................................................*........................................................................................................................................................... - ldr x16, [x1, #88] // .....*.................................................................................................................................................................................................................. - sqrdmulh v10.4S, v10.4S, v11.S[1] // .............................................................*.......................................................................................................................................................... - vins v23, x22, 1 // ...............*........................................................................................................................................................................................................ - mls v27.4S, v3.4S, v8.S[0] // ....................................................*................................................................................................................................................................... - ldr x22, [x1, #96] // ........*............................................................................................................................................................................................................... - sub v3.4S, v23.4S, v18.4S // ....................................................................*................................................................................................................................................... - ldr x13, [x1, #64] // *....................................................................................................................................................................................................................... 
- add v23.4S, v23.4S, v18.4S // .....................................................................*.................................................................................................................................................. - ldr x10, [x1, #104] // .........*.............................................................................................................................................................................................................. - mls v25.4S, v10.4S, v8.S[0] // ..............................................................*......................................................................................................................................................... - ldr x28, [x1, #72] // .*...................................................................................................................................................................................................................... - mul v18.4S, v3.4S, v20.S[0] // .....................................................................................*.................................................................................................................................. - vins v10, x22, 0 // ..........*............................................................................................................................................................................................................. - mul v29.4S, v23.4S, v11.S[2] // ...........................................................................*............................................................................................................................................ - ldr x22, [x1, #80] // ....*................................................................................................................................................................................................................... 
- sqrdmulh v23.4S, v23.4S, v11.S[3] // ............................................................................*........................................................................................................................................... - vins v10, x10, 1 // ...........*............................................................................................................................................................................................................ - sqrdmulh v3.4S, v3.4S, v20.S[1] // ......................................................................................*................................................................................................................................. - add x1, x1, #64 // ................................*....................................................................................................................................................................................... - add v21.4S, v10.4S, v25.4S // ................................................................*....................................................................................................................................................... - vins v1, x13, 0 // ..*..................................................................................................................................................................................................................... - sub v25.4S, v10.4S, v25.4S // ...............................................................*........................................................................................................................................................ - vins v10, x22, 0 // ......*................................................................................................................................................................................................................. 
- mls v29.4S, v23.4S, v8.S[0] // .............................................................................*.......................................................................................................................................... - vins v1, x28, 1 // ...*.................................................................................................................................................................................................................... - mul v23.4S, v21.4S, v11.S[2] // ......................................................................*................................................................................................................................................. - vins v10, x16, 1 // .......*................................................................................................................................................................................................................ - sqrdmulh v11.4S, v21.4S, v11.S[3] // .......................................................................*................................................................................................................................................ - ldr x16, [x4, #-32] // ..........................................*............................................................................................................................................................................. - add v21.4S, v10.4S, v7.4S // ...........................................................*............................................................................................................................................................ - ldr x22, [x4, #-24] // ...........................................*............................................................................................................................................................................ 
- sub v7.4S, v10.4S, v7.4S // ..........................................................*............................................................................................................................................................. - ldr x13, [x4, #-16] // ..............................................*......................................................................................................................................................................... - sub v10.4S, v1.4S, v27.4S // .....................................................*.................................................................................................................................................................. - ldr x10, [x4, #-8] // ...............................................*........................................................................................................................................................................ - add v19.4S, v21.4S, v29.4S // ...............................................................................*........................................................................................................................................ - vins v2, x16, 0 // ............................................*........................................................................................................................................................................... - mul v15.4S, v25.4S, v20.S[0] // ................................................................................*....................................................................................................................................... - ldr x16, [x5] , #192 // ..............................................................................................................................*......................................................................................... 
- sqrdmulh v25.4S, v25.4S, v20.S[1] // .................................................................................*...................................................................................................................................... - vins v2, x22, 1 // .............................................*.......................................................................................................................................................................... - mul v12.4S, v19.4S, v20.S[2] // ..........................................................................................*............................................................................................................................. - vins v14, x13, 0 // ................................................*....................................................................................................................................................................... - sqrdmulh v20.4S, v19.4S, v20.S[3] // ...........................................................................................*............................................................................................................................ - ldr x22, [x5, #-184] // ...............................................................................................................................*........................................................................................ - add v27.4S, v1.4S, v27.4S // ......................................................*................................................................................................................................................................. - vins v14, x10, 1 // .................................................*...................................................................................................................................................................... 
- mls v23.4S, v11.4S, v8.S[0] // ........................................................................*............................................................................................................................................... - vins v11, x16, 0 // ................................................................................................................................*....................................................................................... - sub v29.4S, v21.4S, v29.4S // ..............................................................................*......................................................................................................................................... - ldr x16, [x5, #-176] // ..................................................................................................................................*..................................................................................... - mls v15.4S, v25.4S, v8.S[0] // ..................................................................................*..................................................................................................................................... - vins v11, x22, 1 // .................................................................................................................................*...................................................................................... - mls v18.4S, v3.4S, v8.S[0] // .......................................................................................*................................................................................................................................ - ldr x22, [x5, #-168] // ...................................................................................................................................*.................................................................................... 
- sub v25.4S, v27.4S, v23.4S // .........................................................................*.............................................................................................................................................. - ldr x13, [x5, #-160] // ......................................................................................................................................*................................................................................. - add v23.4S, v27.4S, v23.4S // ..........................................................................*............................................................................................................................................. - vins v3, x16, 0 // ....................................................................................................................................*................................................................................... - sub v27.4S, v10.4S, v15.4S // ...................................................................................*.................................................................................................................................... - ldr x16, [x5, #-152] // .......................................................................................................................................*................................................................................ - add v10.4S, v10.4S, v15.4S // ....................................................................................*................................................................................................................................... - vins v3, x22, 1 // .....................................................................................................................................*.................................................................................. 
- sub v21.4S, v7.4S, v18.4S // ........................................................................................*............................................................................................................................... - vins v1, x13, 0 // ........................................................................................................................................*............................................................................... - add v7.4S, v7.4S, v18.4S // .........................................................................................*.............................................................................................................................. - ldr x22, [x5, #-144] // ..........................................................................................................................................*............................................................................. - mls v12.4S, v20.4S, v8.S[0] // ............................................................................................*........................................................................................................................... - vins v1, x16, 1 // .........................................................................................................................................*.............................................................................. - mul v20.4S, v29.4S, v2.S[0] // ...............................................................................................*........................................................................................................................ - ldr x16, [x5, #-136] // ...........................................................................................................................................*............................................................................ 
- sqrdmulh v18.4S, v29.4S, v2.S[1] // ................................................................................................*....................................................................................................................... - ldr x13, [x5, #-128] // ..............................................................................................................................................*......................................................................... - mul v29.4S, v7.4S, v2.S[2] // ....................................................................................................*................................................................................................................... - vins v19, x22, 0 // ............................................................................................................................................*........................................................................... - sub v15.4S, v23.4S, v12.4S // .............................................................................................*.......................................................................................................................... - ldr x22, [x5, #-120] // ...............................................................................................................................................*........................................................................ - add v23.4S, v23.4S, v12.4S // ..............................................................................................*......................................................................................................................... - vins v19, x16, 1 // .............................................................................................................................................*.......................................................................... 
- mls v20.4S, v18.4S, v8.S[0] // .................................................................................................*...................................................................................................................... - vins v18, x13, 0 // ................................................................................................................................................*....................................................................... - sqrdmulh v7.4S, v7.4S, v2.S[3] // .....................................................................................................*.................................................................................................................. - ldr x16, [x5, #-112] // ..................................................................................................................................................*..................................................................... - mul v2.4S, v21.4S, v14.S[0] // .........................................................................................................*.............................................................................................................. - vins v18, x22, 1 // .................................................................................................................................................*...................................................................... - sqrdmulh v21.4S, v21.4S, v14.S[1] // ..........................................................................................................*............................................................................................................. - ldr x22, [x5, #-104] // ...................................................................................................................................................*.................................................................... 
- sub v12.4S, v25.4S, v20.4S // ..................................................................................................*..................................................................................................................... - ldr x13, [x5, #-96] // ..........................................................................................................................................................................*............................................. - add v20.4S, v25.4S, v20.4S // ...................................................................................................*.................................................................................................................... - vins v25, x16, 0 // ....................................................................................................................................................*................................................................... - mls v29.4S, v7.4S, v8.S[0] // ......................................................................................................*................................................................................................................. - ldr x16, [x5, #-88] // ...........................................................................................................................................................................*............................................ - mls v2.4S, v21.4S, v8.S[0] // ...........................................................................................................*............................................................................................................ - vins v25, x22, 1 // .....................................................................................................................................................*.................................................................. 
- trn1 v7.4S, v23.4S, v15.4S // ..............................................................................................................*......................................................................................................... - vins v21, x13, 0 // ............................................................................................................................................................................*........................................... - trn2 v23.4S, v23.4S, v15.4S // ...............................................................................................................*........................................................................................................ - ldr x22, [x5, #-80] // ..............................................................................................................................................................................*......................................... - sub v15.4S, v10.4S, v29.4S // .......................................................................................................*................................................................................................................ - vins v21, x16, 1 // .............................................................................................................................................................................*.......................................... - add v10.4S, v10.4S, v29.4S // ........................................................................................................*............................................................................................................... - ldr x16, [x5, #-72] // ...............................................................................................................................................................................*........................................ 
- sub v29.4S, v27.4S, v2.4S // ............................................................................................................*........................................................................................................... - ldr x13, [x5, #-64] // ..................................................................................................................................................................................*..................................... - add v27.4S, v27.4S, v2.4S // .............................................................................................................*.......................................................................................................... - vins v2, x22, 0 // ................................................................................................................................................................................*....................................... - trn1 v14.4S, v20.4S, v12.4S // ................................................................................................................*....................................................................................................... - ldr x22, [x5, #-56] // ...................................................................................................................................................................................*.................................... - trn2 v20.4S, v20.4S, v12.4S // .................................................................................................................*...................................................................................................... - vins v2, x16, 1 // .................................................................................................................................................................................*...................................... 
- trn1 v12.4S, v10.4S, v15.4S // ......................................................................................................................*................................................................................................. - vins v22, x13, 0 // ....................................................................................................................................................................................*................................... - trn2 v24.2D, v7.2D, v14.2D // ..................................................................................................................*..................................................................................................... - ldr x16, [x5, #-48] // ......................................................................................................................................................................................*................................. - trn2 v17.2D, v23.2D, v20.2D // ...................................................................................................................*.................................................................................................... - vins v22, x22, 1 // .....................................................................................................................................................................................*.................................. - trn1 v7.2D, v7.2D, v14.2D // ....................................................................................................................*................................................................................................... - ldr x22, [x5, #-40] // .......................................................................................................................................................................................*................................ 
- trn1 v20.2D, v23.2D, v20.2D // .....................................................................................................................*.................................................................................................. - ldr x13, [x5, #-32] // ..........................................................................................................................................................................................*............................. - trn2 v23.4S, v10.4S, v15.4S // .......................................................................................................................*................................................................................................ - vins v10, x16, 0 // ........................................................................................................................................................................................*............................... - trn1 v15.4S, v27.4S, v29.4S // ........................................................................................................................*............................................................................................... - ldr x16, [x5, #-24] // ...........................................................................................................................................................................................*............................ - trn2 v27.4S, v27.4S, v29.4S // .........................................................................................................................*.............................................................................................. - vins v10, x22, 1 // .........................................................................................................................................................................................*.............................. 
- mul v29.4S, v24.4S, v11.4S // ......................................................................................................................................................*................................................................. - vins v14, x13, 0 // ............................................................................................................................................................................................*........................... - trn2 v13.2D, v12.2D, v15.2D // ..........................................................................................................................*............................................................................................. - ldr x22, [x5, #-16] // ..............................................................................................................................................................................................*......................... - trn2 v30.2D, v23.2D, v27.2D // ...........................................................................................................................*............................................................................................ - vins v14, x16, 1 // .............................................................................................................................................................................................*.......................... - trn1 v15.2D, v12.2D, v15.2D // ............................................................................................................................*........................................................................................... - ldr x16, [x5, #-8] // ...............................................................................................................................................................................................*........................ 
- trn1 v23.2D, v23.2D, v27.2D // .............................................................................................................................*.......................................................................................... - ldr x13, [x4] , #64 // ..................................e..................................................................................................................................................................................... - sqrdmulh v27.4S, v24.4S, v3.4S // .......................................................................................................................................................*................................................................ - vins v12, x22, 0 // ................................................................................................................................................................................................*....................... - mul v11.4S, v17.4S, v11.4S // ...........................................................................................................................................................*............................................................ - ldr x10, [x4, #-56] // ...................................e.................................................................................................................................................................................... - sqrdmulh v3.4S, v17.4S, v3.4S // ............................................................................................................................................................*........................................................... - vins v12, x16, 1 // .................................................................................................................................................................................................*...................... 
- mul v24.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................*..................... - ldr x22, [x4, #-48] // ......................................e................................................................................................................................................................................. - mls v29.4S, v27.4S, v8.S[0] // ........................................................................................................................................................*............................................................... - ldr x28, [x4, #-40] // .......................................e................................................................................................................................................................................ - sqrdmulh v27.4S, v13.4S, v2.4S // ...................................................................................................................................................................................................*.................... - // gap // ........................................................................................................................................................................................................................ - mls v11.4S, v3.4S, v8.S[0] // .............................................................................................................................................................*.......................................................... - // gap // ........................................................................................................................................................................................................................ 
- mul v3.4S, v30.4S, v21.4S // .......................................................................................................................................................................................................*................ - // gap // ........................................................................................................................................................................................................................ - sub v21.4S, v7.4S, v29.4S // .........................................................................................................................................................*.............................................................. - // gap // ........................................................................................................................................................................................................................ - add v7.4S, v7.4S, v29.4S // ..........................................................................................................................................................*............................................................. - // gap // ........................................................................................................................................................................................................................ - sub v29.4S, v20.4S, v11.4S // ..............................................................................................................................................................*......................................................... - // gap // ........................................................................................................................................................................................................................ 
- add v11.4S, v20.4S, v11.4S // ...............................................................................................................................................................*........................................................ - // gap // ........................................................................................................................................................................................................................ - mls v24.4S, v27.4S, v8.S[0] // ....................................................................................................................................................................................................*................... - // gap // ........................................................................................................................................................................................................................ - mul v20.4S, v29.4S, v18.4S // .....................................................................................................................................................................*.................................................. - // gap // ........................................................................................................................................................................................................................ - mul v18.4S, v11.4S, v1.4S // ................................................................................................................................................................*....................................................... - // gap // ........................................................................................................................................................................................................................ 
- sqrdmulh v27.4S, v30.4S, v2.4S // ........................................................................................................................................................................................................*............... - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v11.4S, v11.4S, v19.4S // .................................................................................................................................................................*...................................................... - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v25.4S, v29.4S, v25.4S // ......................................................................................................................................................................*................................................. - // gap // ........................................................................................................................................................................................................................ - sub v29.4S, v15.4S, v24.4S // .....................................................................................................................................................................................................*.................. - // gap // ........................................................................................................................................................................................................................ 
- mls v3.4S, v27.4S, v8.S[0] // .........................................................................................................................................................................................................*.............. - // gap // ........................................................................................................................................................................................................................ - add v27.4S, v15.4S, v24.4S // ......................................................................................................................................................................................................*................. - // gap // ........................................................................................................................................................................................................................ - mls v18.4S, v11.4S, v8.S[0] // ..................................................................................................................................................................*..................................................... - // gap // ........................................................................................................................................................................................................................ - mls v20.4S, v25.4S, v8.S[0] // .......................................................................................................................................................................*................................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v11.4S, v23.4S, v3.4S // ..........................................................................................................................................................................................................*............. - // gap // ........................................................................................................................................................................................................................ - add v23.4S, v23.4S, v3.4S // ...........................................................................................................................................................................................................*............ - // gap // ........................................................................................................................................................................................................................ - sub v3.4S, v7.4S, v18.4S // ...................................................................................................................................................................*.................................................... - // gap // ........................................................................................................................................................................................................................ - mul v25.4S, v11.4S, v14.4S // .................................................................................................................................................................................................................*...... - // gap // ........................................................................................................................................................................................................................ 
- sqrdmulh v11.4S, v11.4S, v12.4S // ..................................................................................................................................................................................................................*..... - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v10.4S, v23.4S, v10.4S // .............................................................................................................................................................................................................*.......... - // gap // ........................................................................................................................................................................................................................ - mul v23.4S, v23.4S, v22.4S // ............................................................................................................................................................................................................*........... - // gap // ........................................................................................................................................................................................................................ - add v2.4S, v7.4S, v18.4S // ....................................................................................................................................................................*................................................... - // gap // ........................................................................................................................................................................................................................ 
- mls v25.4S, v11.4S, v8.S[0] // ...................................................................................................................................................................................................................*.... - // gap // ........................................................................................................................................................................................................................ - sub v5.4S, v21.4S, v20.4S // ........................................................................................................................................................................*............................................... - // gap // ........................................................................................................................................................................................................................ - mls v23.4S, v10.4S, v8.S[0] // ..............................................................................................................................................................................................................*......... - // gap // ........................................................................................................................................................................................................................ - add v4.4S, v21.4S, v20.4S // .........................................................................................................................................................................*.............................................. - // gap // ........................................................................................................................................................................................................................ 
- sub v20.4S, v29.4S, v25.4S // ....................................................................................................................................................................................................................*... - // gap // ........................................................................................................................................................................................................................ - add v19.4S, v29.4S, v25.4S // .....................................................................................................................................................................................................................*.. - // gap // ........................................................................................................................................................................................................................ - sub v18.4S, v27.4S, v23.4S // ...............................................................................................................................................................................................................*........ - // gap // ........................................................................................................................................................................................................................ - add v17.4S, v27.4S, v23.4S // ................................................................................................................................................................................................................*....... - // gap // ........................................................................................................................................................................................................................ 
- st4 {v2.4S,v3.4S,v4.4S,v5.4S}, [x1], #64 // ......................................................................................................................................................................................................................*. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - st4 {v17.4S,v18.4S,v19.4S,v20.4S}, [x2], #64 // .......................................................................................................................................................................................................................* - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - - // original source code - // ldr x10, [x1, #64] // ......................................................................................*................................................................................................................................................................................ || .....................................................................*.............................................................................................................. - // ldr x11, [x1, #72] // ..........................................................................................*............................................................................................................................................................................ || .......................................................................*............................................................................................................ 
- // vins v9, x10, 0 // ....................................................................................................*.................................................................................................................................................................. || ............................................................................*....................................................................................................... - // vins v9, x11, 1 // ........................................................................................................*.............................................................................................................................................................. || ..............................................................................*..................................................................................................... - // ldr x10, [x1, #80] // ..............................................................................................*........................................................................................................................................................................ || .........................................................................*.......................................................................................................... - // ldr x11, [x1, #88] // ................................................................................*...................................................................................................................................................................................... || ..................................................................*................................................................................................................. 
- // vins v10, x10, 0 // ......................................................................................................*................................................................................................................................................................ || .............................................................................*...................................................................................................... - // vins v10, x11, 1 // ..........................................................................................................*............................................................................................................................................................ || ...............................................................................*.................................................................................................... - // ldr x10, [x1, #96] // ....................................................................................*.................................................................................................................................................................................. || ....................................................................*............................................................................................................... - // ldr x11, [x1, #104] // ........................................................................................*.............................................................................................................................................................................. || ......................................................................*............................................................................................................. 
- // vins v11, x10, 0 // ............................................................................................*.......................................................................................................................................................................... || ........................................................................*........................................................................................................... - // vins v11, x11, 1 // ................................................................................................*...................................................................................................................................................................... || ..........................................................................*......................................................................................................... - // ldr x10, [x1, #112] // ......................................................................*................................................................................................................................................................................................ || .............................................................*...................................................................................................................... - // ldr x11, [x1, #120] // ..........................................................................*............................................................................................................................................................................................ || ...............................................................*.................................................................................................................... 
- // vins v12, x10, 0 // ..............................................................................*........................................................................................................................................................................................ || .................................................................*.................................................................................................................. - // vins v12, x11, 1 // ..................................................................................*.................................................................................................................................................................................... || ...................................................................*................................................................................................................ - // ldr x10, [x2, #64] // ........................................................*.............................................................................................................................................................................................................. || ......................................................*............................................................................................................................. - // ldr x11, [x2, #72] // ..........................................................*............................................................................................................................................................................................................ || .......................................................*............................................................................................................................ 
- // vins v13, x10, 0 // ................................................................*...................................................................................................................................................................................................... || ..........................................................*......................................................................................................................... - // vins v13, x11, 1 // ....................................................................*.................................................................................................................................................................................................. || ............................................................*....................................................................................................................... - // ldr x10, [x2, #80] // ................................................*...................................................................................................................................................................................................................... || ..................................................*................................................................................................................................. - // ldr x11, [x2, #88] // ....................................................*.................................................................................................................................................................................................................. || ....................................................*............................................................................................................................... 
- // vins v14, x10, 0 // .......................................................*............................................................................................................................................................................................................... || ......................................................*............................................................................................................................. - // vins v14, x11, 1 // ...........................................................*........................................................................................................................................................................................................... || ........................................................*........................................................................................................................... - // ldr x10, [x2, #96] // ............................................................*.......................................................................................................................................................................................................... || ........................................................*........................................................................................................................... - // ldr x11, [x2, #104] // ..............................................................*........................................................................................................................................................................................................ || .........................................................*.......................................................................................................................... 
- // vins v15, x10, 0 // ........................................................................*.............................................................................................................................................................................................. || ..............................................................*..................................................................................................................... - // vins v15, x11, 1 // ............................................................................*.......................................................................................................................................................................................... || ................................................................*................................................................................................................... - // ldr x10, [x2, #112] // ..................................................*.................................................................................................................................................................................................................... || ...................................................*................................................................................................................................ - // ldr x11, [x2, #120] // ......................................................*................................................................................................................................................................................................................ || .....................................................*.............................................................................................................................. 
- // vins v16, x10, 0 // .........................................................*............................................................................................................................................................................................................. || .......................................................*............................................................................................................................ - // vins v16, x11, 1 // .............................................................*......................................................................................................................................................................................................... || .........................................................*.......................................................................................................................... - // add x1, x1, #64 // ..................................................................................................*.................................................................................................................................................................... || ...........................................................................*........................................................................................................ - // add x2, x2, #64 // ..................................................................*.................................................................................................................................................................................................... || ...........................................................*........................................................................................................................ 
- // ldr x10, [x4] , #64 // e...................................................................................................................................................................................................................................................................... || e................................................................................................................................................................................... - // ldr x11, [x4, #-56] // ....e.................................................................................................................................................................................................................................................................. || ..e................................................................................................................................................................................. - // vins v0, x10, 0 // ...............................................*....................................................................................................................................................................................................................... || ..................................................*................................................................................................................................. - // vins v0, x11, 1 // ...................................................*................................................................................................................................................................................................................... || ....................................................*............................................................................................................................... 
- // ldr x10, [x4, #-48] // ........e.............................................................................................................................................................................................................................................................. || ....e............................................................................................................................................................................... - // ldr x11, [x4, #-40] // ..........e............................................................................................................................................................................................................................................................ || .....e.............................................................................................................................................................................. - // vins v1, x10, 0 // .................................................*..................................................................................................................................................................................................................... || ...................................................*................................................................................................................................ - // vins v1, x11, 1 // .....................................................*................................................................................................................................................................................................................. || .....................................................*.............................................................................................................................. 
- // ldr x10, [x4, #-32] // ............................................................................................................*.......................................................................................................................................................... || ................................................................................*................................................................................................... - // ldr x11, [x4, #-24] // ..............................................................................................................*........................................................................................................................................................ || .................................................................................*.................................................................................................. - // vins v2, x10, 0 // ....................................................................................................................*.................................................................................................................................................. || ....................................................................................*............................................................................................... - // vins v2, x11, 1 // ........................................................................................................................*.............................................................................................................................................. || ......................................................................................*............................................................................................. 
- // ldr x10, [x4, #-16] // ................................................................................................................*...................................................................................................................................................... || ..................................................................................*................................................................................................. - // ldr x11, [x4, #-8] // ..................................................................................................................*.................................................................................................................................................... || ...................................................................................*................................................................................................ - // vins v3, x10, 0 // ..........................................................................................................................*............................................................................................................................................ || .......................................................................................*............................................................................................ - // vins v3, x11, 1 // ..............................................................................................................................*........................................................................................................................................ || .........................................................................................*.......................................................................................... 
- // mul v24.4S, v13.4S, v0.S[0] // .......................................................................*............................................................................................................................................................................................... || ..............................................................*..................................................................................................................... - // sqrdmulh v13.4S, v13.4S, v0.S[1] // .........................................................................*............................................................................................................................................................................................. || ...............................................................*.................................................................................................................... - // mls v24.4S, v13.4S, v8.S[0] // ...................................................................................*................................................................................................................................................................................... || ....................................................................*............................................................................................................... - // sub v13.4S, v9.4S, v24.4S // .................................................................................................................*..................................................................................................................................................... || ...................................................................................*................................................................................................ 
- // add v9.4S, v9.4S, v24.4S // .............................................................................................................................*......................................................................................................................................... || .........................................................................................*.......................................................................................... - // mul v24.4S, v14.4S, v0.S[0] // ...............................................................*....................................................................................................................................................................................................... || ..........................................................*......................................................................................................................... - // sqrdmulh v14.4S, v14.4S, v0.S[1] // .................................................................*..................................................................................................................................................................................................... || ...........................................................*........................................................................................................................ - // mls v24.4S, v14.4S, v8.S[0] // ...........................................................................*........................................................................................................................................................................................... || ................................................................*................................................................................................................... 
- // sub v14.4S, v10.4S, v24.4S // ...............................................................................................................*....................................................................................................................................................... || ..................................................................................*................................................................................................. - // add v10.4S, v10.4S, v24.4S // .............................................................................................................*......................................................................................................................................................... || .................................................................................*.................................................................................................. - // mul v24.4S, v15.4S, v0.S[0] // ...............................................................................*....................................................................................................................................................................................... || ..................................................................*................................................................................................................. - // sqrdmulh v15.4S, v15.4S, v0.S[1] // .................................................................................*..................................................................................................................................................................................... || ...................................................................*................................................................................................................ 
- // mls v24.4S, v15.4S, v8.S[0] // .........................................................................................*............................................................................................................................................................................. || .......................................................................*............................................................................................................ - // sub v15.4S, v11.4S, v24.4S // .....................................................................................................*................................................................................................................................................................. || .............................................................................*...................................................................................................... - // add v11.4S, v11.4S, v24.4S // ...................................................................................................*................................................................................................................................................................... || ............................................................................*....................................................................................................... - // mul v24.4S, v16.4S, v0.S[0] // ...................................................................*................................................................................................................................................................................................... || ............................................................*....................................................................................................................... 
- // sqrdmulh v16.4S, v16.4S, v0.S[1] // .....................................................................*................................................................................................................................................................................................. || .............................................................*...................................................................................................................... - // mls v24.4S, v16.4S, v8.S[0] // .............................................................................*......................................................................................................................................................................................... || .................................................................*.................................................................................................................. - // sub v16.4S, v12.4S, v24.4S // .....................................................................................*................................................................................................................................................................................. || .....................................................................*.............................................................................................................. - // add v12.4S, v12.4S, v24.4S // .......................................................................................*............................................................................................................................................................................... || ......................................................................*............................................................................................................. 
- // mul v24.4S, v11.4S, v0.S[2] // .........................................................................................................*............................................................................................................................................................. || ...............................................................................*.................................................................................................... - // sqrdmulh v11.4S, v11.4S, v0.S[3] // ...........................................................................................................*........................................................................................................................................................... || ................................................................................*................................................................................................... - // mls v24.4S, v11.4S, v8.S[0] // ...............................................................................................................................*....................................................................................................................................... || ..........................................................................................*......................................................................................... - // sub v11.4S, v9.4S, v24.4S // .......................................................................................................................................*............................................................................................................................... || ..............................................................................................*..................................................................................... 
- // add v9.4S, v9.4S, v24.4S // .........................................................................................................................................*............................................................................................................................. || ...............................................................................................*.................................................................................... - // mul v24.4S, v12.4S, v0.S[2] // .............................................................................................*......................................................................................................................................................................... || .........................................................................*.......................................................................................................... - // sqrdmulh v12.4S, v12.4S, v0.S[3] // ...............................................................................................*....................................................................................................................................................................... || ..........................................................................*......................................................................................................... - // mls v24.4S, v12.4S, v8.S[0] // .......................................................................................................*............................................................................................................................................................... || ..............................................................................*..................................................................................................... 
- // sub v12.4S, v10.4S, v24.4S // .................................................................................................................................*..................................................................................................................................... || ...........................................................................................*........................................................................................ - // add v10.4S, v10.4S, v24.4S // ...................................................................................................................*................................................................................................................................................... || ....................................................................................*............................................................................................... - // mul v24.4S, v15.4S, v1.S[0] // .....................................................................................................................*................................................................................................................................................. || .....................................................................................*.............................................................................................. - // sqrdmulh v15.4S, v15.4S, v1.S[1] // .......................................................................................................................*............................................................................................................................................... || ......................................................................................*............................................................................................. 
- // mls v24.4S, v15.4S, v8.S[0] // ...................................................................................................................................*................................................................................................................................... || ............................................................................................*....................................................................................... - // sub v15.4S, v13.4S, v24.4S // ...........................................................................................................................................*........................................................................................................................... || ................................................................................................*................................................................................... - // add v13.4S, v13.4S, v24.4S // .............................................................................................................................................*......................................................................................................................... || .................................................................................................*.................................................................................. - // mul v24.4S, v16.4S, v1.S[0] // ...........................................................................................*........................................................................................................................................................................... || ........................................................................*........................................................................................................... 
- // sqrdmulh v16.4S, v16.4S, v1.S[1] // .................................................................................................*..................................................................................................................................................................... || ...........................................................................*........................................................................................................ - // mls v24.4S, v16.4S, v8.S[0] // .....................................................................................................................................*................................................................................................................................. || .............................................................................................*...................................................................................... - // sub v16.4S, v14.4S, v24.4S // ...............................................................................................................................................*....................................................................................................................... || ..................................................................................................*................................................................................. - // add v14.4S, v14.4S, v24.4S // .................................................................................................................................................*..................................................................................................................... || ...................................................................................................*................................................................................ 
- // mul v24.4S, v10.4S, v1.S[2] // .........................................................................................................................*............................................................................................................................................. || .......................................................................................*............................................................................................ - // sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........................................................................................................................*........................................................................................................................................... || ........................................................................................*........................................................................................... - // mls v24.4S, v10.4S, v8.S[0] // ...................................................................................................................................................*................................................................................................................... || ....................................................................................................*............................................................................... - // sub v10.4S, v9.4S, v24.4S // ...........................................................................................................................................................*........................................................................................................... || ........................................................................................................*........................................................................... 
- // add v9.4S, v9.4S, v24.4S // .............................................................................................................................................................*......................................................................................................... || .........................................................................................................*.......................................................................... - // mul v24.4S, v12.4S, v2.S[0] // .....................................................................................................................................................*................................................................................................................. || .....................................................................................................*.............................................................................. - // sqrdmulh v12.4S, v12.4S, v2.S[1] // .......................................................................................................................................................*............................................................................................................... || ......................................................................................................*............................................................................. - // mls v24.4S, v12.4S, v8.S[0] // ...............................................................................................................................................................*....................................................................................................... || ..........................................................................................................*......................................................................... 
- // sub v12.4S, v11.4S, v24.4S // .......................................................................................................................................................................*............................................................................................... || ..............................................................................................................*..................................................................... - // add v11.4S, v11.4S, v24.4S // .........................................................................................................................................................................*............................................................................................. || ...............................................................................................................*.................................................................... - // mul v24.4S, v14.4S, v2.S[2] // .........................................................................................................................................................*............................................................................................................. || .......................................................................................................*............................................................................ - // sqrdmulh v14.4S, v14.4S, v2.S[3] // .................................................................................................................................................................*..................................................................................................... || ...........................................................................................................*........................................................................ 
- // mls v24.4S, v14.4S, v8.S[0] // ...........................................................................................................................................................................*........................................................................................... || ................................................................................................................*................................................................... - // sub v14.4S, v13.4S, v24.4S // ...................................................................................................................................................................................*................................................................................... || ....................................................................................................................*............................................................... - // add v13.4S, v13.4S, v24.4S // .....................................................................................................................................................................................*................................................................................. || .....................................................................................................................*.............................................................. - // mul v24.4S, v16.4S, v3.S[0] // ...................................................................................................................................................................*................................................................................................... || ............................................................................................................*....................................................................... 
- // sqrdmulh v16.4S, v16.4S, v3.S[1] // .....................................................................................................................................................................*................................................................................................. || .............................................................................................................*...................................................................... - // mls v24.4S, v16.4S, v8.S[0] // .............................................................................................................................................................................*......................................................................................... || .................................................................................................................*.................................................................. - // sub v16.4S, v15.4S, v24.4S // .......................................................................................................................................................................................*............................................................................... || ......................................................................................................................*............................................................. - // add v15.4S, v15.4S, v24.4S // .........................................................................................................................................................................................*............................................................................. || .......................................................................................................................*............................................................ 
- // trn1 v25.4S, v9.4S, v10.4S // ...............................................................................................................................................................................*....................................................................................... || ..................................................................................................................*................................................................. - // trn2 v26.4S, v9.4S, v10.4S // .................................................................................................................................................................................*..................................................................................... || ...................................................................................................................*................................................................ - // trn1 v27.4S, v11.4S, v12.4S // ...........................................................................................................................................................................................*........................................................................... || ........................................................................................................................*........................................................... - // trn2 v28.4S, v11.4S, v12.4S // .............................................................................................................................................................................................*......................................................................... || .........................................................................................................................*.......................................................... 
- // trn2 v11.2D, v25.2D, v27.2D // .................................................................................................................................................................................................*..................................................................... || ...........................................................................................................................*........................................................ - // trn2 v12.2D, v26.2D, v28.2D // ...................................................................................................................................................................................................*................................................................... || ............................................................................................................................*....................................................... - // trn1 v9.2D, v25.2D, v27.2D // .....................................................................................................................................................................................................*................................................................. || .............................................................................................................................*...................................................... - // trn1 v10.2D, v26.2D, v28.2D // .......................................................................................................................................................................................................*............................................................... || ..............................................................................................................................*..................................................... 
- // trn1 v25.4S, v13.4S, v14.4S // ...............................................................................................................................................................................................*....................................................................... || ..........................................................................................................................*......................................................... - // trn2 v26.4S, v13.4S, v14.4S // .........................................................................................................................................................................................................*............................................................. || ...............................................................................................................................*.................................................... - // trn1 v27.4S, v15.4S, v16.4S // ...........................................................................................................................................................................................................*........................................................... || ................................................................................................................................*................................................... - // trn2 v28.4S, v15.4S, v16.4S // .............................................................................................................................................................................................................*......................................................... || .................................................................................................................................*.................................................. 
- // trn2 v15.2D, v25.2D, v27.2D // .................................................................................................................................................................................................................*..................................................... || ...................................................................................................................................*................................................ - // trn2 v16.2D, v26.2D, v28.2D // ...................................................................................................................................................................................................................*................................................... || ....................................................................................................................................*............................................... - // trn1 v13.2D, v25.2D, v27.2D // .....................................................................................................................................................................................................................*................................................. || .....................................................................................................................................*.............................................. - // trn1 v14.2D, v26.2D, v28.2D // .......................................................................................................................................................................................................................*............................................... || ......................................................................................................................................*............................................. 
- // ldr x10, [x5] , #192 // ......................................................................................................................*................................................................................................................................................ || .....................................................................................*.............................................................................................. - // ldr x11, [x5, #-184] // ............................................................................................................................*.......................................................................................................................................... || ........................................................................................*........................................................................................... - // vins v0, x10, 0 // ................................................................................................................................*...................................................................................................................................... || ..........................................................................................*......................................................................................... - // vins v0, x11, 1 // ....................................................................................................................................*.................................................................................................................................. || ............................................................................................*....................................................................................... 
- // ldr x10, [x5, #-176] // ..................................................................................................................................*.................................................................................................................................... || ...........................................................................................*........................................................................................ - // ldr x11, [x5, #-168] // ......................................................................................................................................*................................................................................................................................ || .............................................................................................*...................................................................................... - // vins v4, x10, 0 // ..........................................................................................................................................*............................................................................................................................ || ...............................................................................................*.................................................................................... - // vins v4, x11, 1 // ..............................................................................................................................................*........................................................................................................................ || .................................................................................................*.................................................................................. 
- // ldr x10, [x5, #-160] // ........................................................................................................................................*.............................................................................................................................. || ..............................................................................................*..................................................................................... - // ldr x11, [x5, #-152] // ............................................................................................................................................*.......................................................................................................................... || ................................................................................................*................................................................................... - // vins v1, x10, 0 // ................................................................................................................................................*...................................................................................................................... || ..................................................................................................*................................................................................. - // vins v1, x11, 1 // ....................................................................................................................................................*.................................................................................................................. || ....................................................................................................*............................................................................... 
- // ldr x10, [x5, #-144] // ..................................................................................................................................................*.................................................................................................................... || ...................................................................................................*................................................................................ - // ldr x11, [x5, #-136] // ......................................................................................................................................................*................................................................................................................ || .....................................................................................................*.............................................................................. - // vins v5, x10, 0 // ..........................................................................................................................................................*............................................................................................................ || .......................................................................................................*............................................................................ - // vins v5, x11, 1 // ..............................................................................................................................................................*........................................................................................................ || .........................................................................................................*.......................................................................... 
- // ldr x10, [x5, #-128] // ........................................................................................................................................................*.............................................................................................................. || ......................................................................................................*............................................................................. - // ldr x11, [x5, #-120] // ............................................................................................................................................................*.......................................................................................................... || ........................................................................................................*........................................................................... - // vins v2, x10, 0 // ................................................................................................................................................................*...................................................................................................... || ..........................................................................................................*......................................................................... - // vins v2, x11, 1 // ....................................................................................................................................................................*.................................................................................................. || ............................................................................................................*....................................................................... 
- // ldr x10, [x5, #-112] // ..................................................................................................................................................................*.................................................................................................... || ...........................................................................................................*........................................................................ - // ldr x11, [x5, #-104] // ......................................................................................................................................................................*................................................................................................ || .............................................................................................................*...................................................................... - // vins v6, x10, 0 // ..........................................................................................................................................................................*............................................................................................ || ...............................................................................................................*.................................................................... - // vins v6, x11, 1 // ..............................................................................................................................................................................*........................................................................................ || .................................................................................................................*.................................................................. 
- // mul v24.4S, v11.4S, v0.4S // ...............................................................................................................................................................................................................*....................................................... || ..................................................................................................................................*................................................. - // sqrdmulh v11.4S, v11.4S, v4.4S // .........................................................................................................................................................................................................................*............................................. || .......................................................................................................................................*............................................ - // mls v24.4S, v11.4S, v8.S[0] // .................................................................................................................................................................................................................................*..................................... || ...........................................................................................................................................*........................................ - // sub v11.4S, v9.4S, v24.4S // ......................................................................................................................................................................................................................................*................................ || ...............................................................................................................................................*.................................... 
- // add v9.4S, v9.4S, v24.4S // .......................................................................................................................................................................................................................................*............................... || ................................................................................................................................................*................................... - // mul v24.4S, v12.4S, v0.4S // ...........................................................................................................................................................................................................................*........................................... || ........................................................................................................................................*........................................... - // sqrdmulh v12.4S, v12.4S, v4.4S // .............................................................................................................................................................................................................................*......................................... || .........................................................................................................................................*.......................................... - // mls v24.4S, v12.4S, v8.S[0] // ....................................................................................................................................................................................................................................*.................................. || .............................................................................................................................................*...................................... 
- // sub v12.4S, v10.4S, v24.4S // ........................................................................................................................................................................................................................................*.............................. || .................................................................................................................................................*.................................. - // add v10.4S, v10.4S, v24.4S // .........................................................................................................................................................................................................................................*............................. || ..................................................................................................................................................*................................. - // mul v24.4S, v10.4S, v1.4S // ............................................................................................................................................................................................................................................*.......................... || .....................................................................................................................................................*.............................. - // sqrdmulh v10.4S, v10.4S, v5.4S // ..............................................................................................................................................................................................................................................*........................ || .......................................................................................................................................................*............................ 
- // mls v24.4S, v10.4S, v8.S[0] // ...................................................................................................................................................................................................................................................*................... || ............................................................................................................................................................*....................... - // sub v10.4S, v9.4S, v24.4S // .......................................................................................................................................................................................................................................................*............... || ................................................................................................................................................................*................... - // add v9.4S, v9.4S, v24.4S // ............................................................................................................................................................................................................................................................*.......... || .....................................................................................................................................................................*.............. - // mul v24.4S, v12.4S, v2.4S // ...........................................................................................................................................................................................................................................*........................... || ....................................................................................................................................................*............................... 
- // sqrdmulh v12.4S, v12.4S, v6.4S // ...............................................................................................................................................................................................................................................*....................... || ........................................................................................................................................................*........................... - // mls v24.4S, v12.4S, v8.S[0] // ....................................................................................................................................................................................................................................................*.................. || .............................................................................................................................................................*...................... - // sub v12.4S, v11.4S, v24.4S // ..............................................................................................................................................................................................................................................................*........ || .......................................................................................................................................................................*............ - // add v11.4S, v11.4S, v24.4S // ................................................................................................................................................................................................................................................................*...... || .........................................................................................................................................................................*.......... 
- // ldr x10, [x5, #-96] // ........................................................................................................................................................................*.............................................................................................. || ..............................................................................................................*..................................................................... - // ldr x11, [x5, #-88] // ............................................................................................................................................................................*.......................................................................................... || ................................................................................................................*................................................................... - // vins v0, x10, 0 // ................................................................................................................................................................................*...................................................................................... || ..................................................................................................................*................................................................. - // vins v0, x11, 1 // ....................................................................................................................................................................................*.................................................................................. || ....................................................................................................................*............................................................... 
- // ldr x10, [x5, #-80] // ..................................................................................................................................................................................*.................................................................................... || ...................................................................................................................*................................................................ - // ldr x11, [x5, #-72] // ......................................................................................................................................................................................*................................................................................ || .....................................................................................................................*.............................................................. - // vins v4, x10, 0 // ..........................................................................................................................................................................................*............................................................................ || .......................................................................................................................*............................................................ - // vins v4, x11, 1 // ..............................................................................................................................................................................................*........................................................................ || .........................................................................................................................*.......................................................... 
- // ldr x10, [x5, #-64] // ........................................................................................................................................................................................*.............................................................................. || ......................................................................................................................*............................................................. - // ldr x11, [x5, #-56] // ............................................................................................................................................................................................*.......................................................................... || ........................................................................................................................*........................................................... - // vins v1, x10, 0 // ................................................................................................................................................................................................*...................................................................... || ..........................................................................................................................*......................................................... - // vins v1, x11, 1 // ....................................................................................................................................................................................................*.................................................................. || ............................................................................................................................*....................................................... 
- // ldr x10, [x5, #-48] // ..................................................................................................................................................................................................*.................................................................... || ...........................................................................................................................*........................................................ - // ldr x11, [x5, #-40] // ......................................................................................................................................................................................................*................................................................ || .............................................................................................................................*...................................................... - // vins v5, x10, 0 // ..........................................................................................................................................................................................................*............................................................ || ...............................................................................................................................*.................................................... - // vins v5, x11, 1 // ..............................................................................................................................................................................................................*........................................................ || .................................................................................................................................*.................................................. 
- // ldr x10, [x5, #-32] // ........................................................................................................................................................................................................*.............................................................. || ..............................................................................................................................*..................................................... - // ldr x11, [x5, #-24] // ............................................................................................................................................................................................................*.......................................................... || ................................................................................................................................*................................................... - // vins v2, x10, 0 // ................................................................................................................................................................................................................*...................................................... || ..................................................................................................................................*................................................. - // vins v2, x11, 1 // ....................................................................................................................................................................................................................*.................................................. || ....................................................................................................................................*............................................... 
- // ldr x10, [x5, #-16] // ..................................................................................................................................................................................................................*.................................................... || ...................................................................................................................................*................................................ - // ldr x11, [x5, #-8] // ......................................................................................................................................................................................................................*................................................ || .....................................................................................................................................*.............................................. - // vins v6, x10, 0 // ..........................................................................................................................................................................................................................*............................................ || .......................................................................................................................................*............................................ - // vins v6, x11, 1 // ..............................................................................................................................................................................................................................*........................................ || .........................................................................................................................................*.......................................... 
- // mul v24.4S, v15.4S, v0.4S // ...............................................................................................................................................................................................................................*....................................... || ..........................................................................................................................................*......................................... - // sqrdmulh v15.4S, v15.4S, v4.4S // ...................................................................................................................................................................................................................................*................................... || ............................................................................................................................................*....................................... - // mls v24.4S, v15.4S, v8.S[0] // ..........................................................................................................................................................................................................................................*............................ || ...................................................................................................................................................*................................ - // sub v15.4S, v13.4S, v24.4S // ................................................................................................................................................................................................................................................*...................... || .........................................................................................................................................................*.......................... 
- // add v13.4S, v13.4S, v24.4S // ..................................................................................................................................................................................................................................................*.................... || ...........................................................................................................................................................*........................ - // mul v24.4S, v16.4S, v0.4S // .....................................................................................................................................................................................................................................*................................. || ..............................................................................................................................................*..................................... - // sqrdmulh v16.4S, v16.4S, v4.4S // .............................................................................................................................................................................................................................................*......................... || ......................................................................................................................................................*............................. - // mls v24.4S, v16.4S, v8.S[0] // .................................................................................................................................................................................................................................................*..................... || ..........................................................................................................................................................*......................... 
- // sub v16.4S, v14.4S, v24.4S // .....................................................................................................................................................................................................................................................*................. || ..............................................................................................................................................................*..................... - // add v14.4S, v14.4S, v24.4S // ......................................................................................................................................................................................................................................................*................ || ...............................................................................................................................................................*.................... - // mul v24.4S, v14.4S, v1.4S // ...........................................................................................................................................................................................................................................................*........... || ....................................................................................................................................................................*............... - // sqrdmulh v14.4S, v14.4S, v5.4S // ..........................................................................................................................................................................................................................................................*............ || ...................................................................................................................................................................*................ 
- // mls v24.4S, v14.4S, v8.S[0] // ...............................................................................................................................................................................................................................................................*....... || ........................................................................................................................................................................*........... - // sub v14.4S, v13.4S, v24.4S // ...................................................................................................................................................................................................................................................................*... || ............................................................................................................................................................................*....... - // add v13.4S, v13.4S, v24.4S // ....................................................................................................................................................................................................................................................................*.. || .............................................................................................................................................................................*...... - // mul v24.4S, v16.4S, v2.4S // ........................................................................................................................................................................................................................................................*.............. || .................................................................................................................................................................*.................. 
- // sqrdmulh v16.4S, v16.4S, v6.4S // .........................................................................................................................................................................................................................................................*............. || ..................................................................................................................................................................*................. - // mls v24.4S, v16.4S, v8.S[0] // .............................................................................................................................................................................................................................................................*......... || ......................................................................................................................................................................*............. - // sub v16.4S, v15.4S, v24.4S // .................................................................................................................................................................................................................................................................*..... || ..........................................................................................................................................................................*......... - // add v15.4S, v15.4S, v24.4S // ..................................................................................................................................................................................................................................................................*.... || ...........................................................................................................................................................................*........ 
- // st4 {v9.4S,v10.4S,v11.4S,v12.4S}, [x1], #64 // .....................................................................................................................................................................................................................................................................*. || ..............................................................................................................................................................................*..... - // st4 {v13.4S,v14.4S,v15.4S,v16.4S}, [x2], #64 // ......................................................................................................................................................................................................................................................................* || ...................................................................................................................................................................................* - - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 174 + // Expected IPC: 0.83 + // + // Wall time: 39.99s + // User time: 39.99s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + add x1, x1, #64 // ........*....................................................................................................................................... + add x2, x2, #64 // .........*...................................................................................................................................... + sqrdmulh v6.4S, v16.4S, v21.S[1] // ..............*................................................................................................................................. 
+ // gap // ................................................................................................................................................ + mul v18.4S, v16.4S, v21.S[0] // ...............*................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v22.4S, v21.S[1] // ...................*............................................................................................................................ + // gap // ................................................................................................................................................ + mul v22.4S, v22.4S, v21.S[0] // ....................*........................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v9.4S, v21.S[1] // ........................*....................................................................................................................... + // gap // ................................................................................................................................................ + mls v18.4S, v6.4S, v8.S[0] // ................*............................................................................................................................... + // gap // ................................................................................................................................................ 
+ mul v6.4S, v9.4S, v21.S[0] // .........................*...................................................................................................................... + // gap // ................................................................................................................................................ + mls v22.4S, v16.4S, v8.S[0] // .....................*.......................................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v4.4S, v21.S[1] // .............................*.................................................................................................................. + // gap // ................................................................................................................................................ + sub v20.4S, v31.4S, v18.4S // .................*.............................................................................................................................. + // gap // ................................................................................................................................................ + mls v6.4S, v19.4S, v8.S[0] // ..........................*..................................................................................................................... + // gap // ................................................................................................................................................ + add v18.4S, v31.4S, v18.4S // ..................*............................................................................................................................. 
+ // gap // ................................................................................................................................................ + sub v19.4S, v3.4S, v22.4S // ......................*......................................................................................................................... + // gap // ................................................................................................................................................ + add v22.4S, v3.4S, v22.4S // .......................*........................................................................................................................ + // gap // ................................................................................................................................................ + sub v31.4S, v12.4S, v6.4S // ...........................*.................................................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v12.4S, v6.4S // ............................*................................................................................................................... + // gap // ................................................................................................................................................ + mul v3.4S, v4.4S, v21.S[0] // ..............................*................................................................................................................. + // gap // ................................................................................................................................................ 
+ sqrdmulh v12.4S, v31.4S, v7.S[1] // ............................................*................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v9.4S, v6.4S, v21.S[3] // ..................................*............................................................................................................. + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v21.S[2] // ...................................*............................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v16.4S, v8.S[0] // ...............................*................................................................................................................ + // gap // ................................................................................................................................................ + mul v16.4S, v31.4S, v7.S[0] // .............................................*.................................................................................................. + // gap // ................................................................................................................................................ + ldr q31, [x4, #-32] // ............*................................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v6.4S, v9.4S, v8.S[0] // ....................................*........................................................................................................... + // gap // ................................................................................................................................................ + mls v16.4S, v12.4S, v8.S[0] // ..............................................*................................................................................................. + // gap // ................................................................................................................................................ + add v12.4S, v15.4S, v3.4S // .................................*.............................................................................................................. + // gap // ................................................................................................................................................ + sub v3.4S, v15.4S, v3.4S // ................................*............................................................................................................... + // gap // ................................................................................................................................................ + sub v9.4S, v18.4S, v6.4S // .....................................*.......................................................................................................... 
+ // gap // ................................................................................................................................................ + sub v15.4S, v20.4S, v16.4S // ...............................................*................................................................................................ + // gap // ................................................................................................................................................ + add v16.4S, v20.4S, v16.4S // ................................................*............................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v18.4S, v6.4S // ......................................*......................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v18.4S, v3.4S, v7.S[1] // .................................................*.............................................................................................. + // gap // ................................................................................................................................................ + mul v20.4S, v3.4S, v7.S[0] // ..................................................*............................................................................................. + // gap // ................................................................................................................................................ 
+ sqrdmulh v3.4S, v12.4S, v21.S[3] // .......................................*........................................................................................................ + // gap // ................................................................................................................................................ + ldr q4, [x4, #-16] // .............*.................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v20.4S, v18.4S, v8.S[0] // ...................................................*............................................................................................ + // gap // ................................................................................................................................................ + mul v18.4S, v12.4S, v21.S[2] // ........................................*....................................................................................................... + // gap // ................................................................................................................................................ + ldr q12, [x5], #(12*16) // ..........................................................................................*..................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v19.4S, v20.4S // ....................................................*........................................................................................... + // gap // ................................................................................................................................................ + add v19.4S, v19.4S, v20.4S // .....................................................*.......................................................................................... + // gap // ................................................................................................................................................ + mls v18.4S, v3.4S, v8.S[0] // .........................................*...................................................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v26.4S, v4.S[1] // .....................................................................*.......................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v19.4S, v31.S[3] // ................................................................*............................................................................... + // gap // ................................................................................................................................................ 
+ mul v19.4S, v19.4S, v31.S[2] // .................................................................*.............................................................................. + // gap // ................................................................................................................................................ + sub v21.4S, v22.4S, v18.4S // ..........................................*..................................................................................................... + // gap // ................................................................................................................................................ + add v18.4S, v22.4S, v18.4S // ...........................................*.................................................................................................... + // gap // ................................................................................................................................................ + mul v22.4S, v26.4S, v4.S[0] // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v21.4S, v31.S[1] // ...........................................................*.................................................................................... + // gap // ................................................................................................................................................ + mul v31.4S, v21.4S, v31.S[0] // ............................................................*................................................................................... 
+ // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v18.4S, v7.S[3] // ......................................................*......................................................................................... + // gap // ................................................................................................................................................ + mul v18.4S, v18.4S, v7.S[2] // .......................................................*........................................................................................ + // gap // ................................................................................................................................................ + mls v19.4S, v3.4S, v8.S[0] // ..................................................................*............................................................................. + // gap // ................................................................................................................................................ + mls v31.4S, v4.4S, v8.S[0] // .............................................................*.................................................................................. + // gap // ................................................................................................................................................ + mls v22.4S, v20.4S, v8.S[0] // .......................................................................*........................................................................ + // gap // ................................................................................................................................................ 
+ mls v18.4S, v26.4S, v8.S[0] // ........................................................*....................................................................................... + // gap // ................................................................................................................................................ + sub v20.4S, v16.4S, v19.4S // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + sub v3.4S, v9.4S, v31.4S // ..............................................................*................................................................................. + // gap // ................................................................................................................................................ + add v31.4S, v9.4S, v31.4S // ...............................................................*................................................................................ + // gap // ................................................................................................................................................ + sub v9.4S, v15.4S, v22.4S // ........................................................................*....................................................................... + // gap // ................................................................................................................................................ + add v22.4S, v15.4S, v22.4S // .........................................................................*...................................................................... 
+ // gap // ................................................................................................................................................ + add v16.4S, v16.4S, v19.4S // ....................................................................*........................................................................... + // gap // ................................................................................................................................................ + sub v19.4S, v6.4S, v18.4S // .........................................................*...................................................................................... + // gap // ................................................................................................................................................ + add v6.4S, v6.4S, v18.4S // ..........................................................*..................................................................................... + // gap // ................................................................................................................................................ + trn1 v18.4S, v31.4S, v3.4S // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + trn2 v31.4S, v31.4S, v3.4S // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + trn1 v3.4S, v6.4S, v19.4S // ..........................................................................*..................................................................... 
+ // gap // ................................................................................................................................................ + trn2 v6.4S, v6.4S, v19.4S // ...........................................................................*.................................................................... + // gap // ................................................................................................................................................ + trn1 v19.4S, v16.4S, v20.4S // ..................................................................................*............................................................. + // gap // ................................................................................................................................................ + trn2 v15.2D, v3.2D, v18.2D // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + trn2 v4.2D, v6.2D, v31.2D // ...............................................................................*................................................................ + // gap // ................................................................................................................................................ + trn1 v18.2D, v3.2D, v18.2D // ................................................................................*............................................................... + // gap // ................................................................................................................................................ 
+ trn1 v6.2D, v6.2D, v31.2D // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + trn2 v16.4S, v16.4S, v20.4S // ...................................................................................*............................................................ + // gap // ................................................................................................................................................ + trn1 v20.4S, v22.4S, v9.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ + trn2 v22.4S, v22.4S, v9.4S // .....................................................................................*.......................................................... + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v15.4S, v2.4S // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + trn2 v3.2D, v19.2D, v20.2D // ......................................................................................*......................................................... 
+ // gap // ................................................................................................................................................ + mul v9.4S, v15.4S, v12.4S // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + mul v12.4S, v4.4S, v12.4S // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + trn2 v15.2D, v16.2D, v22.2D // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + trn1 v19.2D, v19.2D, v20.2D // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + trn1 v16.2D, v16.2D, v22.2D // .........................................................................................*...................................................... + // gap // ................................................................................................................................................ 
+ mls v9.4S, v31.4S, v8.S[0] // ..................................................................................................*............................................. + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v4.4S, v2.4S // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v3.4S, v1.4S // ..........................................................................................................................*..................... + // gap // ................................................................................................................................................ + mul v31.4S, v3.4S, v23.4S // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ + sub v3.4S, v18.4S, v9.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ + add v18.4S, v18.4S, v9.4S // ....................................................................................................*........................................... 
+ // gap // ................................................................................................................................................ + mls v12.4S, v22.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + mls v31.4S, v20.4S, v8.S[0] // ............................................................................................................................*................... + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v15.4S, v1.4S // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ + mul v20.4S, v15.4S, v23.4S // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + sub v9.4S, v6.4S, v12.4S // ........................................................................................................*....................................... + // gap // ................................................................................................................................................ 
+ add v6.4S, v6.4S, v12.4S // .........................................................................................................*...................................... + // gap // ................................................................................................................................................ + sub v12.4S, v19.4S, v31.4S // .............................................................................................................................*.................. + // gap // ................................................................................................................................................ + sqrdmulh v15.4S, v9.4S, v5.4S // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ + sqrdmulh v4.4S, v6.4S, v14.4S // ..........................................................................................................*..................................... + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v30.4S // ...........................................................................................................*.................................... + // gap // ................................................................................................................................................ + mul v9.4S, v9.4S, v11.4S // ................................................................................................................*............................... 
+ // gap // ................................................................................................................................................ + add v19.4S, v19.4S, v31.4S // ..............................................................................................................................*................. + // gap // ................................................................................................................................................ + mls v20.4S, v22.4S, v8.S[0] // .................................................................................................................................*.............. + // gap // ................................................................................................................................................ + mls v6.4S, v4.4S, v8.S[0] // ............................................................................................................*................................... + // gap // ................................................................................................................................................ + mls v9.4S, v15.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ + ldr q22, [x5, #-64] // ......................................................................................................................*......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v26.4S, v18.4S, v6.4S // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + add v25.4S, v18.4S, v6.4S // ..............................................................................................................*................................. + // gap // ................................................................................................................................................ + sub v28.4S, v3.4S, v9.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + add v27.4S, v3.4S, v9.4S // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + sub v6.4S, v16.4S, v20.4S // ..................................................................................................................................*............. + // gap // ................................................................................................................................................ + add v18.4S, v16.4S, v20.4S // ...................................................................................................................................*............ 
+ // gap // ................................................................................................................................................ + ldr q16, [x5, #-48] // .......................................................................................................................*........................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v22.4S, v18.4S, v22.4S // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v6.4S, v17.4S // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ + sqrdmulh v18.4S, v18.4S, v16.4S // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + mul v6.4S, v6.4S, v0.4S // ..........................................................................................................................................*..... 
+ // gap // ................................................................................................................................................ + ldr q31, [x1, #128] // e............................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v18.4S, v8.S[0] // ......................................................................................................................................*......... + // gap // ................................................................................................................................................ + mls v6.4S, v20.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + ldr q3, [x1, #144] // .e.............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v18.4S, v19.4S, v22.4S // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + add v17.4S, v19.4S, v22.4S // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + sub v20.4S, v12.4S, v6.4S // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + add v19.4S, v12.4S, v6.4S // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + ldr q12, [x1, #160] // ..e............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q15, [x1, #176] // ...e............................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q16, [x2, #128] // ....e........................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q22, [x2, #144] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q9, [x2, #160] // ......e......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q4, [x2, #176] // .......e........................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q21, [x4], #64 // ..........e..................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q7, [x4, #-48] // ...........e.................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q2, [x5, #16] // ...........................................................................................e.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q30, [x5, #32] // ............................................................................................e................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q14, [x5, #48] // .............................................................................................e.................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q11, [x5, #64] // ..............................................................................................e................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q5, [x5, #80] // ...............................................................................................e................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q23, [x5, #96] // ....................................................................................................................e........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q1, [x5, #112] // .....................................................................................................................e.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q0, [x5, #160] // ........................................................................................................................e....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ..............................................................................................................................................*. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x2], #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q17, [x5, #176] // .........................................................................................................................e...................... + // gap // ................................................................................................................................................ 
+ + // ----------------------------------------------------------------------------- new position ------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x1, #(16*0 + (64))] // e..........................'....................................................................................................................~......................... + // ldr q10, [x1, #(16*1 + (64))] // ...e.......................'.......................................................................................................................~...................... + // ldr q11, [x1, #(16*2 + (64))] // ........e..................'............................................................................................................................~................. + // ldr q12, [x1, #(16*3 + (64))] // .........e.................'.............................................................................................................................~................ + // ldr q13, [x2, #(16*0 + (64))] // ..........e................'..............................................................................................................................~............... + // ldr q14, [x2, #(16*1 + (64))] // ...........e...............'...............................................................................................................................~.............. + // ldr q15, [x2, #(16*2 + (64))] // ............e..............'................................................................................................................................~............. 
+ // ldr q16, [x2, #(16*3 + (64))] // .............e.............'.................................................................................................................................~............ + // add x1, x1, #64 // ...........................*.............................................................................................................................................. + // add x2, x2, #64 // ...........................'*............................................................................................................................................. + // ldr q0, [x4], #64 // ..............e............'..................................................................................................................................~........... + // ldr q1, [x4, #(-64 + 16)] // ...............e...........'...................................................................................................................................~.......... + // ldr q2, [x4, #(-64 + 32)] // ...........................'.......................*...................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ...........................'...................................*.......................................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................'.*............................................................................................................................................ + // mul v24.4s, v13.4s, v0.s[0] // ...........................'..*........................................................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'......*....................................................................................................................................... + // sub v13.4s, v9.4s, v24.4s // ...........................'..........*................................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'............*................................................................................................................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...........................'...*.......................................................................................................................................... + // mul v24.4s, v14.4s, v0.s[0] // ...........................'....*......................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'........*..................................................................................................................................... + // sub v14.4s, v10.4s, v24.4s // ...........................'.............*................................................................................................................................ + // add v10.4s, v10.4s, v24.4s // ...........................'..............*............................................................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ...........................'.....*........................................................................................................................................ 
+ // mul v24.4s, v15.4s, v0.s[0] // ...........................'.......*...................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'...........*.................................................................................................................................. + // sub v15.4s, v11.4s, v24.4s // ...........................'...............*.............................................................................................................................. + // add v11.4s, v11.4s, v24.4s // ...........................'................*............................................................................................................................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...........................'.........*.................................................................................................................................... + // mul v24.4s, v16.4s, v0.s[0] // ...........................'.................*............................................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................*........................................................................................................................ + // sub v16.4s, v12.4s, v24.4s // ...........................'...........................*.................................................................................................................. + // add v12.4s, v12.4s, v24.4s // ...........................'..........................*................................................................................................................... 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...........................'...................*.......................................................................................................................... + // mul v24.4s, v11.4s, v0.s[2] // ...........................'....................*......................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'........................*..................................................................................................................... + // sub v11.4s, v9.4s, v24.4s // ...........................'............................*................................................................................................................. + // add v9.4s, v9.4s, v24.4s // ...........................'...............................*.............................................................................................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...........................'..................................*........................................................................................................... + // mul v24.4s, v12.4s, v0.s[2] // ...........................'.....................................*........................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................................*.................................................................................................... + // sub v12.4s, v10.4s, v24.4s // ...........................'.............................................*................................................................................................ 
+ // add v10.4s, v10.4s, v24.4s // ...........................'..............................................*............................................................................................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...........................'..................*........................................................................................................................... + // mul v24.4s, v15.4s, v1.s[0] // ...........................'......................*....................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................*.................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ...........................'.............................*................................................................................................................ + // add v13.4s, v13.4s, v24.4s // ...........................'..............................*............................................................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...........................'................................*............................................................................................................. + // mul v24.4s, v16.4s, v1.s[0] // ...........................'.................................*............................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................'....................................*......................................................................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ...........................'.......................................*...................................................................................................... + // add v14.4s, v14.4s, v24.4s // ...........................'........................................*..................................................................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...........................'..................................................*........................................................................................... + // mul v24.4s, v10.4s, v1.s[2] // ...........................'...................................................*.......................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.......................................................*...................................................................................... + // sub v10.4s, v9.4s, v24.4s // ...........................'..............................................................*............................................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'...............................................................*.............................................................................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ...........................'................................................*............................................................................................. + // mul v24.4s, v12.4s, v2.s[0] // ...........................'.................................................*............................................................................................ 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................................................*........................................................................................ + // sub v12.4s, v11.4s, v24.4s // ...........................'.........................................................*.................................................................................... + // add v11.4s, v11.4s, v24.4s // ...........................'..........................................................*................................................................................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...........................'...........................................*.................................................................................................. + // mul v24.4s, v14.4s, v2.s[2] // ...........................'............................................*................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'....................................................*......................................................................................... + // sub v14.4s, v13.4s, v24.4s // ...........................'........................................................*..................................................................................... + // add v13.4s, v13.4s, v24.4s // ...........................'.............................................................*................................................................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ...........................'..........................................*................................................................................................... 
+ // mul v24.4s, v16.4s, v3.s[0] // ...........................'...............................................*.............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'......................................................*....................................................................................... + // sub v16.4s, v15.4s, v24.4s // ...........................'...........................................................*.................................................................................. + // add v15.4s, v15.4s, v24.4s // ...........................'............................................................*................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ...........................'..................................................................*........................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...........................'...................................................................*.......................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ...........................'................................................................*............................................................................. + // trn2 v28.4s, v11.4s, v12.4s // ...........................'.................................................................*............................................................................ + // trn2 v11.2d, v25.2d, v27.2d // ...........................'.....................................................................*........................................................................ 
+ // trn2 v12.2d, v26.2d, v28.2d // ...........................'......................................................................*....................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ...........................'.......................................................................*...................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...........................'........................................................................*..................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...........................'....................................................................*......................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ...........................'.........................................................................*.................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...........................'..........................................................................*................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...........................'...........................................................................*.................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ...........................'.............................................................................*................................................................ + // trn2 v16.2d, v26.2d, v28.2d // ...........................'................................................................................*............................................................. 
+ // trn1 v13.2d, v25.2d, v27.2d // ...........................'.................................................................................*............................................................ + // trn1 v14.2d, v26.2d, v28.2d // ...........................'..................................................................................*........................................................... + // ldr q0, [ x5], #(12*16) // ...........................'......................................*....................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ................e..........'....................................................................................................................................~......... + // ldr q1, [ x5, #(-12*16 + 2*16)] // .................e.........'.....................................................................................................................................~........ + // ldr q5, [x5, #(-12*16 + 3*16)] // ..................e........'......................................................................................................................................~....... + // ldr q2, [ x5, #(-12*16 + 4*16)] // ...................e.......'.......................................................................................................................................~...... + // ldr q6, [x5, #(-12*16 + 5*16)] // ....................e......'........................................................................................................................................~..... + // sqrdmulh v27.4s, v11.4s, v4.4s // ...........................'............................................................................*................................................................. 
+ // mul v24.4s, v11.4s, v0.4s // ...........................'..............................................................................*............................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'...................................................................................*.......................................................... + // sub v11.4s, v9.4s, v24.4s // ...........................'.......................................................................................*...................................................... + // add v9.4s, v9.4s, v24.4s // ...........................'........................................................................................*..................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ...........................'....................................................................................*......................................................... + // mul v24.4s, v12.4s, v0.4s // ...........................'...............................................................................*.............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.........................................................................................*.................................................... + // sub v12.4s, v10.4s, v24.4s // ...........................'.............................................................................................*................................................ + // add v10.4s, v10.4s, v24.4s // ...........................'..............................................................................................*............................................... 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // ...........................'.................................................................................................*............................................ + // mul v24.4s, v10.4s, v1.4s // ...........................'..................................................................................................*........................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'......................................................................................................*....................................... + // sub v10.4s, v9.4s, v24.4s // ...........................'.........................................................................................................*.................................... + // add v9.4s, v9.4s, v24.4s // ...........................'..........................................................................................................*................................... + // sqrdmulh v27.4s, v12.4s, v6.4s // ...........................'................................................................................................*............................................. + // mul v24.4s, v12.4s, v2.4s // ...........................'...................................................................................................*.......................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.......................................................................................................*...................................... + // sub v12.4s, v11.4s, v24.4s // ...........................'...........................................................................................................*.................................. 
+ // add v11.4s, v11.4s, v24.4s // ...........................'............................................................................................................*................................. + // ldr q0, [ x5, #(-12*16 + 6*16)] // .....................e.....'.........................................................................................................................................~.... + // ldr q4, [x5, #(-12*16 + 7*16)] // ......................e....'..........................................................................................................................................~... + // ldr q1, [ x5, #(-12*16 + 8*16)] // ...........................'........................................................................................................*..................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................'...............................................................................................................*.............................. + // ldr q2, [ x5, #(-12*16 + 10*16)] // .......................e...'...........................................................................................................................................~.. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..........................e'.............................................................................................................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ...........................'.....................................................................................*........................................................ + // mul v24.4s, v15.4s, v0.4s // ...........................'......................................................................................*....................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................'..........................................................................................*................................................... + // sub v15.4s, v13.4s, v24.4s // ...........................'...............................................................................................*.............................................. + // add v13.4s, v13.4s, v24.4s // ...........................'....................................................................................................*......................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // ...........................'...........................................................................................*.................................................. + // mul v24.4s, v16.4s, v0.4s // ...........................'............................................................................................*................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................'.....................................................................................................*........................................ + // sub v16.4s, v14.4s, v24.4s // ...........................'.............................................................................................................*................................ + // add v14.4s, v14.4s, v24.4s // ...........................'..............................................................................................................*............................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ...........................'..................................................................................................................*........................... 
+ // mul v24.4s, v14.4s, v1.4s // ...........................'................................................................................................................*............................. + // mls v24.4s, v27.4s, v8.s[0] // .~.........................'.....................................................................................................................*........................ + // sub v14.4s, v13.4s, v24.4s // ....~......................'........................................................................................................................*..................... + // add v13.4s, v13.4s, v24.4s // .....~.....................'.........................................................................................................................*.................... + // sqrdmulh v27.4s, v16.4s, v6.4s // ...........................'.................................................................................................................*............................ + // mul v24.4s, v16.4s, v2.4s // ...........................'...................................................................................................................*.......................... + // mls v24.4s, v27.4s, v8.s[0] // ..~........................'......................................................................................................................*....................... + // sub v16.4s, v15.4s, v24.4s // ......~....................'..........................................................................................................................*................... + // add v15.4s, v15.4s, v24.4s // .......~...................'...........................................................................................................................*.................. 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ........................~..'............................................................................................................................................*. + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // .........................~.'.............................................................................................................................................* + + sub count, count, #1 cbnz count, layer45678_start - ldr x8, [x2, #96] // .............*...................................................................................................................................................................................................... - vins v29, x22, 0 // ..*................................................................................................................................................................................................................. - vins v4, x13, 0 // *................................................................................................................................................................................................................... - ldr x15, [x2, #112] // ...*................................................................................................................................................................................................................ - ldr x9, [x2, #104] // ...............*.................................................................................................................................................................................................... - vins v29, x28, 1 // ......*............................................................................................................................................................................................................. 
- vins v4, x10, 1 // ....*............................................................................................................................................................................................................... - ldr x21, [x2, #120] // .......*............................................................................................................................................................................................................ - ldr x16, [x2, #80] // .*.................................................................................................................................................................................................................. - vins v13, x8, 0 // .........................*.......................................................................................................................................................................................... - vins v3, x15, 0 // ..........*......................................................................................................................................................................................................... - ldr x13, [x2, #88] // .....*.............................................................................................................................................................................................................. - ldr x26, [x2, #64] // .........*.......................................................................................................................................................................................................... - vins v13, x9, 1 // .............................*...................................................................................................................................................................................... 
- vins v3, x21, 1 // ..............*..................................................................................................................................................................................................... - ldr x7, [x2, #72] // ...........*........................................................................................................................................................................................................ - sqrdmulh v30.4S, v13.4S, v4.S[1] // ..................................*................................................................................................................................................................................. - vins v2, x16, 0 // ........*........................................................................................................................................................................................................... - mul v24.4S, v13.4S, v4.S[0] // ................................*................................................................................................................................................................................... - ldr x6, [x1, #96] // .....................................*.............................................................................................................................................................................. - mul v23.4S, v3.4S, v4.S[0] // ....................*............................................................................................................................................................................................... - vins v2, x13, 1 // ............*....................................................................................................................................................................................................... 
- sqrdmulh v14.4S, v3.4S, v4.S[1] // ......................*............................................................................................................................................................................................. - ldr x25, [x1, #104] // .........................................*.......................................................................................................................................................................... - mul v6.4S, v2.4S, v4.S[0] // ................*................................................................................................................................................................................................... - vins v7, x26, 0 // .................*.................................................................................................................................................................................................. - mls v24.4S, v30.4S, v8.S[0] // ..........................................*......................................................................................................................................................................... - vins v16, x6, 0 // .............................................*...................................................................................................................................................................... - sqrdmulh v25.4S, v2.4S, v4.S[1] // ..................*................................................................................................................................................................................................. - vins v7, x7, 1 // .....................*.............................................................................................................................................................................................. 
- mls v23.4S, v14.4S, v8.S[0] // ..............................*..................................................................................................................................................................................... - vins v16, x25, 1 // .................................................*.................................................................................................................................................................. - mul v0.4S, v7.4S, v4.S[0] // ........................*........................................................................................................................................................................................... - ldr x18, [x1, #112] // .......................*............................................................................................................................................................................................ - add v19.4S, v16.4S, v24.4S // ....................................................*............................................................................................................................................................... - ldr x0, [x1, #64] // .......................................*............................................................................................................................................................................ - sqrdmulh v18.4S, v7.4S, v4.S[1] // ..........................*......................................................................................................................................................................................... - ldr x7, [x1, #120] // ...........................*........................................................................................................................................................................................ 
- sub v22.4S, v16.4S, v24.4S // ......................................................*............................................................................................................................................................. - ldr x12, [x1, #72] // ...........................................*........................................................................................................................................................................ - mul v2.4S, v19.4S, v4.S[2] // ..........................................................*......................................................................................................................................................... - ldr x19, [x1, #80] // ...............................................*.................................................................................................................................................................... - sqrdmulh v30.4S, v19.4S, v4.S[3] // ............................................................*....................................................................................................................................................... - vins v20, x0, 0 // .....................................................*.............................................................................................................................................................. - mls v0.4S, v18.4S, v8.S[0] // ....................................*............................................................................................................................................................................... - ldr x16, [x1, #88] // .................................*.................................................................................................................................................................................. 
- mls v6.4S, v25.4S, v8.S[0] // ............................*....................................................................................................................................................................................... - vins v20, x12, 1 // .........................................................*.......................................................................................................................................................... - sqrdmulh v26.4S, v22.4S, v29.S[1] // ........................................................................*........................................................................................................................................... - vins v11, x19, 0 // .......................................................*............................................................................................................................................................ - mls v2.4S, v30.4S, v8.S[0] // ................................................................................*................................................................................................................................... - vins v31, x18, 0 // ...............................*.................................................................................................................................................................................... - add v14.4S, v20.4S, v0.4S // ..............................................................................*..................................................................................................................................... - vins v11, x16, 1 // ...........................................................*........................................................................................................................................................ 
- mul v16.4S, v22.4S, v29.S[0] // ......................................................................*............................................................................................................................................. - vins v31, x7, 1 // ...................................*................................................................................................................................................................................ - sub v24.4S, v11.4S, v6.4S // ................................................................*................................................................................................................................................... - ldr x24, [x4, #-32] // .............................................................*...................................................................................................................................................... - add v10.4S, v14.4S, v2.4S // ..........................................................................................*......................................................................................................................... - ldr x26, [x4, #-16] // .................................................................*.................................................................................................................................................. - add v28.4S, v31.4S, v23.4S // ........................................*........................................................................................................................................................................... - ldr x16, [x4, #-8] // ...................................................................*................................................................................................................................................ 
- mls v16.4S, v26.4S, v8.S[0] // ....................................................................................*............................................................................................................................... - ldr x18, [x5, #128] // .........................................................................................................................................*.......................................................................... - sub v1.4S, v14.4S, v2.4S // ........................................................................................*........................................................................................................................... - ldr x19, [x5, #8] // .............................................................................*...................................................................................................................................... - sub v31.4S, v31.4S, v23.4S // ......................................*............................................................................................................................................................................. - vins v25, x26, 0 // ...........................................................................*........................................................................................................................................ - mul v12.4S, v28.4S, v4.S[2] // ..............................................*..................................................................................................................................................................... - add x20, x2, #64 // ...................*................................................................................................................................................................................................ 
- sqrdmulh v5.4S, v28.4S, v4.S[3] // ................................................*................................................................................................................................................................... - vins v25, x16, 1 // ...............................................................................*.................................................................................................................................... - sqrdmulh v15.4S, v31.4S, v29.S[1] // ..................................................*................................................................................................................................................................. - ldr x23, [x5, #24] // .......................................................................................*............................................................................................................................ - mul v13.4S, v31.4S, v29.S[0] // ............................................*....................................................................................................................................................................... - ldr x25, [x4, #-24] // ...............................................................*.................................................................................................................................................... - sub v27.4S, v20.4S, v0.4S // ..................................................................*................................................................................................................................................. - vins v28, x24, 0 // .....................................................................*.............................................................................................................................................. 
- add v19.4S, v11.4S, v6.4S // ..............................................................*..................................................................................................................................................... - ldr x16, [x5] , #192 // .......................................................................*............................................................................................................................................ - mls v12.4S, v5.4S, v8.S[0] // ........................................................*........................................................................................................................................................... - add x14, x1, #64 // ...................................................*................................................................................................................................................................ - mls v13.4S, v15.4S, v8.S[0] // ......................................................................................*............................................................................................................................. - ldr x6, [x5, #-176] // ...................................................................................*................................................................................................................................ - sub v23.4S, v27.4S, v16.4S // ............................................................................................*....................................................................................................................... - ldr x7, [x5, #-152] // .............................................................................................*...................................................................................................................... 
- add v4.4S, v27.4S, v16.4S // ..............................................................................................*..................................................................................................................... - vins v20, x16, 0 // .................................................................................*.................................................................................................................................. - add v6.4S, v19.4S, v12.4S // ....................................................................*............................................................................................................................................... - vins v28, x25, 1 // .........................................................................*.......................................................................................................................................... - add v31.4S, v24.4S, v13.4S // ..................................................................................................*................................................................................................................. - ldr x21, [x5, #-144] // ...................................................................................................*................................................................................................................ - sub v16.4S, v24.4S, v13.4S // ................................................................................................*................................................................................................................... - vins v30, x18, 0 // .................................................................................................................................................*.................................................................. 
- mul v0.4S, v6.4S, v29.S[2] // ..........................................................................*......................................................................................................................................... - ldr x4, [x5, #-136] // .......................................................................................................*............................................................................................................ - mul v17.4S, v31.4S, v28.S[2] // ..........................................................................................................*......................................................................................................... - ldr x18, [x5, #-96] // .........................................................................................................................*.......................................................................................... - sqrdmulh v5.4S, v6.4S, v29.S[3] // ............................................................................*....................................................................................................................................... - ldr x8, [x5, #-88] // .............................................................................................................................*...................................................................................... - sub v26.4S, v19.4S, v12.4S // ..................................................................................*................................................................................................................................. - ldr x27, [x5, #-80] // ...................................................................................................................................*................................................................................ 
- sqrdmulh v21.4S, v31.4S, v28.S[3] // ..................................................................................................................*................................................................................................. - ldr x13, [x5, #-72] // .......................................................................................................................................*............................................................................ - sqrdmulh v3.4S, v16.4S, v25.S[1] // ......................................................................................................................*............................................................................................. - ldr x15, [x5, #-56] // .............................................................................................................................................*...................................................................... - mls v0.4S, v5.4S, v8.S[0] // ....................................................................................................*............................................................................................................... - ldr x22, [x5, #-48] // ...................................................................................................................................................*................................................................ - mul v29.4S, v26.4S, v28.S[0] // ......................................................................................................*............................................................................................................. - ldr x26, [x5, #-40] // .......................................................................................................................................................*............................................................ 
- mul v2.4S, v16.4S, v25.S[0] // ....................................................................................................................*............................................................................................... - ldr x12, [x5, #-16] // ...................................................................................................................................................................*................................................ - mls v17.4S, v21.4S, v8.S[0] // ............................................................................................................................*....................................................................................... - ldr x9, [x5, #-8] // .......................................................................................................................................................................*............................................ - sub v9.4S, v10.4S, v0.4S // ............................................................................................................*....................................................................................................... - ldr x28, [x5, #-128] // .........................................................................................................*.......................................................................................................... - add v24.4S, v10.4S, v0.4S // ..............................................................................................................*..................................................................................................... - ldr x29, [x5, #-104] // .......................................................................................................................*............................................................................................ 
- sqrdmulh v11.4S, v26.4S, v28.S[1] // ........................................................................................................*........................................................................................................... - ldr x17, [x5, #-24] // .............................................................................................................................................................*...................................................... - mls v2.4S, v3.4S, v8.S[0] // ..............................................................................................................................*..................................................................................... - ldr x1, [x5, #-160] // .........................................................................................*.......................................................................................................................... - sub v31.4S, v4.4S, v17.4S // ....................................................................................................................................*............................................................................... - vins v30, x15, 1 // .....................................................................................................................................................*.............................................................. - trn1 v27.4S, v24.4S, v9.4S // ................................................................................................................................*................................................................................... - ldr x11, [x5, #-120] // .............................................................................................................*...................................................................................................... 
- mls v29.4S, v11.4S, v8.S[0] // ................................................................................................................*................................................................................................... - ldr x16, [x5, #-32] // .........................................................................................................................................................*.......................................................... - add v12.4S, v4.4S, v17.4S // ......................................................................................................................................*............................................................................. - vins v22, x1, 0 // .................................................................................................*.................................................................................................................. - trn2 v15.4S, v24.4S, v9.4S // ..................................................................................................................................*................................................................................. - vins v26, x18, 0 // .................................................................................................................................*.................................................................................. - add v3.4S, v23.4S, v2.4S // ..........................................................................................................................................*......................................................................... - vins v22, x7, 1 // .....................................................................................................*.............................................................................................................. 
- sub v18.4S, v1.4S, v29.4S // ........................................................................................................................*........................................................................................... - ldr x7, [x5, #-112] // ...................................................................................................................*................................................................................................ - add v21.4S, v1.4S, v29.4S // ..........................................................................................................................*......................................................................................... - vins v26, x8, 1 // .....................................................................................................................................*.............................................................................. - trn1 v16.4S, v12.4S, v31.4S // ................................................................................................................................................*................................................................... - vins v19, x27, 0 // ...........................................................................................................................................*........................................................................ - trn2 v9.4S, v12.4S, v31.4S // ..........................................................................................................................................................*......................................................... - vins v11, x21, 0 // ...........................................................................................................*........................................................................................................ 
- trn2 v24.4S, v21.4S, v18.4S // ..............................................................................................................................................*..................................................................... - vins v5, x7, 0 // ...........................................................................................................................*........................................................................................ - trn1 v12.4S, v21.4S, v18.4S // ............................................................................................................................................*....................................................................... - vins v11, x4, 1 // ...............................................................................................................*.................................................................................................... - sub v14.4S, v23.4S, v2.4S // ........................................................................................................................................*........................................................................... - vins v19, x13, 1 // ...............................................................................................................................................*.................................................................... - trn2 v29.2D, v15.2D, v24.2D // ....................................................................................................................................................*............................................................... - vins v5, x29, 1 // ...............................................................................................................................*.................................................................................... 
- trn2 v18.2D, v27.2D, v12.2D // ..................................................................................................................................................*................................................................. - vins v20, x19, 1 // .....................................................................................*.............................................................................................................................. - trn2 v28.4S, v3.4S, v14.4S // ..............................................................................................................................................................*..................................................... - // gap // .................................................................................................................................................................................................................... - mul v7.4S, v18.4S, v20.4S // ................................................................................................................................................................*................................................... - // gap // .................................................................................................................................................................................................................... - mul v23.4S, v29.4S, v20.4S // ...........................................................................................................................................................................*........................................ - vins v20, x6, 0 // ...........................................................................................*........................................................................................................................ 
- trn2 v4.2D, v9.2D, v28.2D // ....................................................................................................................................................................*............................................... - // gap // .................................................................................................................................................................................................................... - trn1 v0.2D, v15.2D, v24.2D // ........................................................................................................................................................*........................................................... - vins v20, x23, 1 // ...............................................................................................*.................................................................................................................... - mul v17.4S, v4.4S, v26.4S // ..................................................................................................................................................................................*................................. - // gap // .................................................................................................................................................................................................................... - sqrdmulh v25.4S, v29.4S, v20.4S // ............................................................................................................................................................................*....................................... - // gap // .................................................................................................................................................................................................................... 
- sqrdmulh v6.4S, v18.4S, v20.4S // .........................................................................................................................................................................*.......................................... - // gap // .................................................................................................................................................................................................................... - sqrdmulh v31.4S, v4.4S, v19.4S // ..........................................................................................................................................................................................*......................... - // gap // .................................................................................................................................................................................................................... - trn1 v20.2D, v9.2D, v28.2D // ........................................................................................................................................................................*........................................... - // gap // .................................................................................................................................................................................................................... - mls v23.4S, v25.4S, v8.S[0] // .................................................................................................................................................................................*.................................. - // gap // .................................................................................................................................................................................................................... 
- mls v7.4S, v6.4S, v8.S[0] // ...............................................................................................................................................................................*.................................... - // gap // .................................................................................................................................................................................................................... - mls v17.4S, v31.4S, v8.S[0] // ..............................................................................................................................................................................................*..................... - // gap // .................................................................................................................................................................................................................... - trn1 v31.4S, v3.4S, v14.4S // ............................................................................................................................................................*....................................................... - // gap // .................................................................................................................................................................................................................... - sub v21.4S, v0.4S, v23.4S // .....................................................................................................................................................................................*.............................. - // gap // .................................................................................................................................................................................................................... 
- add v2.4S, v0.4S, v23.4S // ......................................................................................................................................................................................*............................. - // gap // .................................................................................................................................................................................................................... - trn1 v0.2D, v27.2D, v12.2D // ......................................................................................................................................................*............................................................. - // gap // .................................................................................................................................................................................................................... - add v1.4S, v20.4S, v17.4S // ...................................................................................................................................................................................................*................ - // gap // .................................................................................................................................................................................................................... - sub v27.4S, v20.4S, v17.4S // ..................................................................................................................................................................................................*................. - vins v20, x28, 0 // .................................................................................................................*.................................................................................................. 
- sqrdmulh v28.4S, v2.4S, v11.4S // ...........................................................................................................................................................................................*........................ - vins v11, x22, 0 // ...........................................................................................................................................................*........................................................ - sqrdmulh v5.4S, v21.4S, v5.4S // ............................................................................................................................................................................................*....................... - vins v20, x11, 1 // .....................................................................................................................*.............................................................................................. - trn2 v17.2D, v16.2D, v31.2D // ..................................................................................................................................................................*................................................. - vins v11, x26, 1 // ...............................................................................................................................................................*.................................................... - mul v12.4S, v2.4S, v22.4S // .........................................................................................................................................................................................*.......................... - // gap // .................................................................................................................................................................................................................... 
- sqrdmulh v22.4S, v1.4S, v11.4S // .......................................................................................................................................................................................................*............ - vins v11, x12, 0 // ..........................................................................................................................................................................*......................................... - mul v18.4S, v21.4S, v20.4S // ........................................................................................................................................................................................*........................... - vins v20, x16, 0 // .................................................................................................................................................................*.................................................. - sqrdmulh v9.4S, v17.4S, v19.4S // ................................................................................................................................................................................*................................... - vins v11, x9, 1 // .............................................................................................................................................................................*...................................... - mls v12.4S, v28.4S, v8.S[0] // ................................................................................................................................................................................................*................... - vins v20, x17, 1 // .....................................................................................................................................................................*.............................................. 
- sqrdmulh v11.4S, v27.4S, v11.4S // ......................................................................................................................................................................................................*............. - // gap // .................................................................................................................................................................................................................... - mul v15.4S, v27.4S, v20.4S // .....................................................................................................................................................................................................*.............. - // gap // .................................................................................................................................................................................................................... - mul v20.4S, v17.4S, v26.4S // ..............................................................................................................................................................................*..................................... - // gap // .................................................................................................................................................................................................................... - mul v30.4S, v1.4S, v30.4S // ........................................................................................................................................................................................................*........... - // gap // .................................................................................................................................................................................................................... 
- mls v18.4S, v5.4S, v8.S[0] // .................................................................................................................................................................................................*.................. - // gap // .................................................................................................................................................................................................................... - mls v15.4S, v11.4S, v8.S[0] // ..........................................................................................................................................................................................................*......... - // gap // .................................................................................................................................................................................................................... - mls v20.4S, v9.4S, v8.S[0] // .......................................................................................................................................................................................*............................ - // gap // .................................................................................................................................................................................................................... - mls v30.4S, v22.4S, v8.S[0] // ............................................................................................................................................................................................................*....... - // gap // .................................................................................................................................................................................................................... 
- trn1 v8.2D, v16.2D, v31.2D // ......................................................................................................................................................................*............................................. - // gap // .................................................................................................................................................................................................................... - sub v31.4S, v0.4S, v7.4S // ...................................................................................................................................................................................*................................ - // gap // .................................................................................................................................................................................................................... - sub v25.4S, v8.4S, v20.4S // .............................................................................................................................................................................................*...................... - // gap // .................................................................................................................................................................................................................... - add v9.4S, v8.4S, v20.4S // ...............................................................................................................................................................................................*.................... - // gap // .................................................................................................................................................................................................................... 
- add v8.4S, v0.4S, v7.4S // ....................................................................................................................................................................................*............................... - // gap // .................................................................................................................................................................................................................... - add v16.4S, v25.4S, v15.4S // ...............................................................................................................................................................................................................*.... - // gap // .................................................................................................................................................................................................................... - sub v17.4S, v25.4S, v15.4S // ..............................................................................................................................................................................................................*..... - // gap // .................................................................................................................................................................................................................... - sub v11.4S, v8.4S, v12.4S // ....................................................................................................................................................................................................*............... - // gap // .................................................................................................................................................................................................................... 
- add v10.4S, v8.4S, v12.4S // .........................................................................................................................................................................................................*.......... - // gap // .................................................................................................................................................................................................................... - add v12.4S, v31.4S, v18.4S // .............................................................................................................................................................................................................*...... - // gap // .................................................................................................................................................................................................................... - sub v13.4S, v31.4S, v18.4S // ...........................................................................................................................................................................................................*........ - // gap // .................................................................................................................................................................................................................... - sub v15.4S, v9.4S, v30.4S // ................................................................................................................................................................................................................*... - // gap // .................................................................................................................................................................................................................... 
- add v14.4S, v9.4S, v30.4S // .................................................................................................................................................................................................................*.. - // gap // .................................................................................................................................................................................................................... - st4 {v10.4S,v11.4S,v12.4S,v13.4S}, [x14], #64 // ..................................................................................................................................................................................................................*. - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... 
- // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - st4 {v14.4S,v15.4S,v16.4S,v17.4S}, [x20], #64 // ...................................................................................................................................................................................................................* - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... 
- // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - // gap // .................................................................................................................................................................................................................... - - // original source code - // vins v11, x13, 0 // ..*................................................................................................................................................................................................................. || .*................................................................................................................................ - // ldr x16, [x2, #80] // ........*........................................................................................................................................................................................................... || ....*............................................................................................................................. - // vins v20, x22, 0 // .*.................................................................................................................................................................................................................. 
|| *................................................................................................................................. - // ldr x22, [x2, #112] // ...*................................................................................................................................................................................................................ || .*................................................................................................................................ - // vins v11, x10, 1 // ......*............................................................................................................................................................................................................. || ...*.............................................................................................................................. - // ldr x13, [x2, #88] // ...........*........................................................................................................................................................................................................ || .....*............................................................................................................................ - // vins v20, x28, 1 // .....*.............................................................................................................................................................................................................. || ..*............................................................................................................................... - // ldr x10, [x2, #120] // .......*............................................................................................................................................................................................................ 
|| ...*.............................................................................................................................. - // vins v23, x16, 0 // .................*.................................................................................................................................................................................................. || ........*......................................................................................................................... - // ldr x16, [x2, #64] // ............*....................................................................................................................................................................................................... || ......*........................................................................................................................... - // vins v25, x22, 0 // ..........*......................................................................................................................................................................................................... || .....*............................................................................................................................ - // ldr x22, [x2, #72] // ...............*.................................................................................................................................................................................................... || .......*.......................................................................................................................... - // vins v23, x13, 1 // .....................*.............................................................................................................................................................................................. || ..........*....................................................................................................................... 
- // ldr x13, [x2, #96] // *................................................................................................................................................................................................................... || *................................................................................................................................. - // vins v25, x10, 1 // ..............*..................................................................................................................................................................................................... || .......*.......................................................................................................................... - // ldr x10, [x2, #104] // ....*............................................................................................................................................................................................................... || ..*............................................................................................................................... - // mul v7.4S, v23.4S, v11.S[0] // ........................*........................................................................................................................................................................................... || ............*..................................................................................................................... - // vins v3, x16, 0 // .........................*.......................................................................................................................................................................................... || ............*..................................................................................................................... 
- // sqrdmulh v23.4S, v23.4S, v11.S[1] // ............................*....................................................................................................................................................................................... || ..............*................................................................................................................... - // add x2, x2, #64 // .....................................................................*.............................................................................................................................................. || ..................................*............................................................................................... - // mul v18.4S, v25.4S, v11.S[0] // ....................*............................................................................................................................................................................................... || ..........*....................................................................................................................... - // vins v3, x22, 1 // .............................*...................................................................................................................................................................................... || ..............*................................................................................................................... - // sqrdmulh v25.4S, v25.4S, v11.S[1] // ......................*............................................................................................................................................................................................. || ...........*...................................................................................................................... 
- // ldr x16, [x1, #112] // .................................*.................................................................................................................................................................................. || ................*................................................................................................................. - // mul v27.4S, v3.4S, v11.S[0] // ................................*................................................................................................................................................................................... || ................*................................................................................................................. - // vins v10, x13, 0 // .........*.......................................................................................................................................................................................................... || ....*............................................................................................................................. - // sqrdmulh v3.4S, v3.4S, v11.S[1] // ....................................*............................................................................................................................................................................... || ..................*............................................................................................................... - // ldr x22, [x1, #120] // .....................................*.............................................................................................................................................................................. || ..................*............................................................................................................... 
- // mls v7.4S, v23.4S, v8.S[0] // ..............................................*..................................................................................................................................................................... || .......................*.......................................................................................................... - // vins v10, x10, 1 // .............*...................................................................................................................................................................................................... || ......*........................................................................................................................... - // mls v18.4S, v25.4S, v8.S[0] // ..............................*..................................................................................................................................................................................... || ...............*.................................................................................................................. - // vins v23, x16, 0 // ...................................................*................................................................................................................................................................ || .........................*........................................................................................................ - // mul v25.4S, v10.4S, v11.S[0] // ..................*................................................................................................................................................................................................. || .........*........................................................................................................................ 
- // ldr x16, [x1, #88] // .............................................*...................................................................................................................................................................... || ......................*........................................................................................................... - // sqrdmulh v10.4S, v10.4S, v11.S[1] // ................*................................................................................................................................................................................................... || ........*......................................................................................................................... - // vins v23, x22, 1 // .......................................................*............................................................................................................................................................ || ...........................*...................................................................................................... - // mls v27.4S, v3.4S, v8.S[0] // ............................................*....................................................................................................................................................................... || ......................*........................................................................................................... - // ldr x22, [x1, #96] // ...................*................................................................................................................................................................................................ || .........*........................................................................................................................ 
- // sub v3.4S, v23.4S, v18.4S // ..................................................................*................................................................................................................................................. || .................................*................................................................................................ - // ldr x13, [x1, #64] // ...................................*................................................................................................................................................................................ || .................*................................................................................................................ - // add v23.4S, v23.4S, v18.4S // ............................................................*....................................................................................................................................................... || ..............................*................................................................................................... - // ldr x10, [x1, #104] // .......................*............................................................................................................................................................................................ || ...........*...................................................................................................................... - // mls v25.4S, v10.4S, v8.S[0] // ..........................*......................................................................................................................................................................................... || .............*.................................................................................................................... 
- // ldr x28, [x1, #72] // .......................................*............................................................................................................................................................................ || ...................*.............................................................................................................. - // mul v18.4S, v3.4S, v20.S[0] // ..........................................................................*......................................................................................................................................... || .....................................*............................................................................................ - // vins v10, x22, 0 // ...........................*........................................................................................................................................................................................ || .............*.................................................................................................................... - // mul v29.4S, v23.4S, v11.S[2] // ....................................................................*............................................................................................................................................... || ..................................*............................................................................................... - // ldr x22, [x1, #80] // .........................................*.......................................................................................................................................................................... || ....................*............................................................................................................. 
- // sqrdmulh v23.4S, v23.4S, v11.S[3] // ......................................................................*............................................................................................................................................. || ...................................*.............................................................................................. - // vins v10, x10, 1 // ...............................*.................................................................................................................................................................................... || ...............*.................................................................................................................. - // sqrdmulh v3.4S, v3.4S, v20.S[1] // ........................................................................*........................................................................................................................................... || ....................................*............................................................................................. - // add x1, x1, #64 // .................................................................................*.................................................................................................................................. || ........................................*......................................................................................... - // add v21.4S, v10.4S, v25.4S // ..................................*................................................................................................................................................................................. || .................*................................................................................................................ 
- // vins v1, x13, 0 // ...........................................*........................................................................................................................................................................ || .....................*............................................................................................................ - // sub v25.4S, v10.4S, v25.4S // ......................................*............................................................................................................................................................................. || ...................*.............................................................................................................. - // vins v10, x22, 0 // .................................................*.................................................................................................................................................................. || ........................*......................................................................................................... - // mls v29.4S, v23.4S, v8.S[0] // ................................................................................*................................................................................................................................... || ........................................*......................................................................................... - // vins v1, x28, 1 // ...............................................*.................................................................................................................................................................... || .......................*.......................................................................................................... 
- // mul v23.4S, v21.4S, v11.S[2] // ........................................*........................................................................................................................................................................... || ....................*............................................................................................................. - // vins v10, x16, 1 // .....................................................*.............................................................................................................................................................. || ..........................*....................................................................................................... - // sqrdmulh v11.4S, v21.4S, v11.S[3] // ..........................................*......................................................................................................................................................................... || .....................*............................................................................................................ - // ldr x16, [x4, #-32] // .........................................................*.......................................................................................................................................................... || ............................*..................................................................................................... - // add v21.4S, v10.4S, v7.4S // ..............................................................................*..................................................................................................................................... || .......................................*.......................................................................................... 
- // ldr x22, [x4, #-24] // ...........................................................................*........................................................................................................................................ || .....................................*............................................................................................ - // sub v7.4S, v10.4S, v7.4S // ........................................................*........................................................................................................................................................... || ............................*..................................................................................................... - // ldr x13, [x4, #-16] // ...........................................................*........................................................................................................................................................ || .............................*.................................................................................................... - // sub v10.4S, v1.4S, v27.4S // ............................................................................*....................................................................................................................................... || ......................................*........................................................................................... - // ldr x10, [x4, #-8] // .............................................................*...................................................................................................................................................... || ..............................*................................................................................................... 
- // add v19.4S, v21.4S, v29.4S // ........................................................................................*........................................................................................................................... || ............................................*..................................................................................... - // vins v2, x16, 0 // .............................................................................*...................................................................................................................................... || ......................................*........................................................................................... - // mul v15.4S, v25.4S, v20.S[0] // ......................................................*............................................................................................................................................................. || ...........................*...................................................................................................... - // ldr x16, [x5] , #192 // ...............................................................................*.................................................................................................................................... || .......................................*.......................................................................................... - // sqrdmulh v25.4S, v25.4S, v20.S[1] // ................................................*................................................................................................................................................................... || ........................*......................................................................................................... 
- // vins v2, x22, 1 // .........................................................................................*.......................................................................................................................... || ............................................*..................................................................................... - // mul v12.4S, v19.4S, v20.S[2] // ..............................................................................................*..................................................................................................................... || ...............................................*.................................................................................. - // vins v14, x13, 0 // ...................................................................*................................................................................................................................................ || .................................*................................................................................................ - // sqrdmulh v20.4S, v19.4S, v20.S[3] // ..................................................................................................*................................................................................................................. || .................................................*................................................................................ - // ldr x22, [x5, #-184] // .................................................................*.................................................................................................................................................. || ................................*................................................................................................. 
- // add v27.4S, v1.4S, v27.4S // ....................................................*............................................................................................................................................................... || ..........................*....................................................................................................... - // vins v14, x10, 1 // .......................................................................*............................................................................................................................................ || ...................................*.............................................................................................. - // mls v23.4S, v11.4S, v8.S[0] // ..................................................*................................................................................................................................................................. || .........................*........................................................................................................ - // vins v11, x16, 0 // .......................................................................................*............................................................................................................................ || ...........................................*...................................................................................... - // sub v29.4S, v21.4S, v29.4S // ....................................................................................................*............................................................................................................... || ..................................................*............................................................................... 
- // ldr x16, [x5, #-176] // ...................................................................................*................................................................................................................................ || .........................................*........................................................................................ - // mls v15.4S, v25.4S, v8.S[0] // ..............................................................*..................................................................................................................................................... || ...............................*.................................................................................................. - // vins v11, x22, 1 // .......................................................................................................................................................*............................................................ || ...........................................................................*...................................................... - // mls v18.4S, v3.4S, v8.S[0] // ..................................................................................*................................................................................................................................. || .........................................*........................................................................................ - // ldr x22, [x5, #-168] // .........................................................................*.......................................................................................................................................... || ....................................*............................................................................................. 
- // sub v25.4S, v27.4S, v23.4S // ................................................................*................................................................................................................................................... || ................................*................................................................................................. - // ldr x13, [x5, #-160] // .........................................................................................................................*.......................................................................................... || ............................................................*..................................................................... - // add v23.4S, v27.4S, v23.4S // ..........................................................*......................................................................................................................................................... || .............................*.................................................................................................... - // vins v3, x16, 0 // ...........................................................................................................................................................*........................................................ || ..............................................................................*................................................... - // sub v27.4S, v10.4S, v15.4S // ....................................................................................*............................................................................................................................... || ..........................................*....................................................................................... 
- // ldr x16, [x5, #-152] // .....................................................................................*.............................................................................................................................. || ..........................................*....................................................................................... - // add v10.4S, v10.4S, v15.4S // ......................................................................................*............................................................................................................................. || ...........................................*...................................................................................... - // vins v3, x22, 1 // ..............................................................................................................................................................*..................................................... || ................................................................................*................................................. - // sub v21.4S, v7.4S, v18.4S // ............................................................................................*....................................................................................................................... || ..............................................*................................................................................... - // vins v1, x13, 0 // .................................................................................................................................*.................................................................................. || ................................................................*................................................................. 
- // add v7.4S, v7.4S, v18.4S // ..........................................................................................*......................................................................................................................... || .............................................*.................................................................................... - // ldr x22, [x5, #-144] // ...........................................................................................*........................................................................................................................ || .............................................*.................................................................................... - // mls v12.4S, v20.4S, v8.S[0] // ..........................................................................................................*......................................................................................................... || .....................................................*............................................................................ - // vins v1, x16, 1 // .....................................................................................................................................*.............................................................................. || ..................................................................*............................................................... - // mul v20.4S, v29.4S, v2.S[0] // ............................................................................................................*....................................................................................................... || ......................................................*........................................................................... 
- // ldr x16, [x5, #-136] // ...............................................................................................*.................................................................................................................... || ...............................................*.................................................................................. - // sqrdmulh v18.4S, v29.4S, v2.S[1] // ......................................................................................................................*............................................................................................. || ...........................................................*...................................................................... - // ldr x13, [x5, #-128] // ...................................................................................................................*................................................................................................ || .........................................................*........................................................................ - // mul v29.4S, v7.4S, v2.S[2] // ................................................................................................*................................................................................................................... || ................................................*................................................................................. - // vins v19, x22, 0 // .............................................................................................................................................*...................................................................... || ......................................................................*........................................................... 
- // sub v15.4S, v23.4S, v12.4S // ..................................................................................................................*................................................................................................. || .........................................................*........................................................................ - // ldr x22, [x5, #-120] // .............................................................................................................................*...................................................................................... || ..............................................................*................................................................... - // add v23.4S, v23.4S, v12.4S // ....................................................................................................................*............................................................................................... || ..........................................................*....................................................................... - // vins v19, x16, 1 // .................................................................................................................................................*.................................................................. || ........................................................................*......................................................... - // mls v20.4S, v18.4S, v8.S[0] // ..............................................................................................................................*..................................................................................... || ...............................................................*.................................................................. 
- // vins v18, x13, 0 // .............................................................................................................................................................................*...................................... || ..............................................................................................*................................... - // sqrdmulh v7.4S, v7.4S, v2.S[3] // ......................................................................................................*............................................................................................................. || ...................................................*.............................................................................. - // ldr x16, [x5, #-112] // .......................................................................................................................................*............................................................................ || ...................................................................*.............................................................. - // mul v2.4S, v21.4S, v14.S[0] // ..............................................................................................................*..................................................................................................... || .......................................................*.......................................................................... - // vins v18, x22, 1 // .................................................................................................................................................................................*.................................. || ................................................................................................*................................. 
- // sqrdmulh v21.4S, v21.4S, v14.S[1] // ........................................................................................................*........................................................................................................... || ....................................................*............................................................................. - // ldr x22, [x5, #-104] // .....................................................................................................................*.............................................................................................. || ..........................................................*....................................................................... - // sub v12.4S, v25.4S, v20.4S // ......................................................................................................................................*............................................................................. || ...................................................................*.............................................................. - // ldr x13, [x5, #-96] // .................................................................................................*.................................................................................................................. || ................................................*................................................................................. - // add v20.4S, v25.4S, v20.4S // ........................................................................................................................................*........................................................................... || ....................................................................*............................................................. 
- // vins v25, x16, 0 // ...............................................................................................................................................*.................................................................... || .......................................................................*.......................................................... - // mls v29.4S, v7.4S, v8.S[0] // ................................................................................................................*................................................................................................... || ........................................................*......................................................................... - // ldr x16, [x5, #-88] // ...................................................................................................*................................................................................................................ || .................................................*................................................................................ - // mls v2.4S, v21.4S, v8.S[0] // ........................................................................................................................*........................................................................................... || ............................................................*..................................................................... - // vins v25, x22, 1 // .....................................................................................................................................................*.............................................................. || ..........................................................................*....................................................... 
- // trn1 v7.4S, v23.4S, v15.4S // ............................................................................................................................*....................................................................................... || ..............................................................*................................................................... - // vins v21, x13, 0 // ...................................................................................................................................*................................................................................ || .................................................................*................................................................ - // trn2 v23.4S, v23.4S, v15.4S // ..................................................................................................................................*................................................................................. || .................................................................*................................................................ - // ldr x22, [x5, #-80] // .....................................................................................................*.............................................................................................................. || ..................................................*............................................................................... - // sub v15.4S, v10.4S, v29.4S // ..........................................................................................................................*......................................................................................... || .............................................................*.................................................................... 
- // vins v21, x16, 1 // .........................................................................................................................................*.......................................................................... || ....................................................................*............................................................. - // add v10.4S, v10.4S, v29.4S // ................................................................................................................................*................................................................................... || ................................................................*................................................................. - // ldr x16, [x5, #-72] // .......................................................................................................*............................................................................................................ || ...................................................*.............................................................................. - // sub v29.4S, v27.4S, v2.4S // ..................................................................................................................................................*................................................................. || .........................................................................*........................................................ - // ldr x13, [x5, #-64] // ...............................................................*.................................................................................................................................................... || ...............................*.................................................................................................. 
- // add v27.4S, v27.4S, v2.4S // ....................................................................................................................................*............................................................................... || ..................................................................*............................................................... - // vins v2, x22, 0 // ...........................................................................................................................................*........................................................................ || .....................................................................*............................................................ - // trn1 v14.4S, v20.4S, v12.4S // ................................................................................................................................................*................................................................... || ........................................................................*......................................................... - // ldr x22, [x5, #-56] // .........................................................................................................*.......................................................................................................... || ....................................................*............................................................................. - // trn2 v20.4S, v20.4S, v12.4S // ..............................................................................................................................................*..................................................................... || .......................................................................*.......................................................... 
- // vins v2, x16, 1 // ...................................................................................................................................................*................................................................ || .........................................................................*........................................................ - // trn1 v12.4S, v10.4S, v15.4S // ..........................................................................................................................................*......................................................................... || .....................................................................*............................................................ - // vins v22, x13, 0 // .............................................................................................*...................................................................................................................... || ..............................................*................................................................................... - // trn2 v24.2D, v7.2D, v14.2D // ......................................................................................................................................................*............................................................. || ...........................................................................*...................................................... - // ldr x16, [x5, #-48] // ...........................................................................................................*........................................................................................................ || .....................................................*............................................................................ 
- // trn2 v17.2D, v23.2D, v20.2D // ....................................................................................................................................................*............................................................... || ..........................................................................*....................................................... - // vins v22, x22, 1 // ...........................................................................................................................*........................................................................................ || .............................................................*.................................................................... - // trn1 v7.2D, v7.2D, v14.2D // ..........................................................................................................................................................................*......................................... || ............................................................................................*..................................... - // ldr x22, [x5, #-40] // .............................................................................................................*...................................................................................................... || ......................................................*........................................................................... - // trn1 v20.2D, v23.2D, v20.2D // .............................................................................................................................................................*...................................................... || ................................................................................*................................................. 
- // ldr x13, [x5, #-32] // ...............................................................................................................................*.................................................................................... || ...............................................................*.................................................................. - // trn2 v23.4S, v10.4S, v15.4S // ............................................................................................................................................*....................................................................... || ......................................................................*........................................................... - // vins v10, x16, 0 // ...............................................................................................................................................................................*.................................... || ...............................................................................................*.................................. - // trn1 v15.4S, v27.4S, v29.4S // .......................................................................................................................................................................*............................................ || .........................................................................................*........................................ - // ldr x16, [x5, #-24] // .......................................................................................................................*............................................................................................ || ...........................................................*...................................................................... 
- // trn2 v27.4S, v27.4S, v29.4S // ........................................................................................................................................................*........................................................... || ............................................................................*..................................................... - // vins v10, x22, 1 // ...................................................................................................................................................................................*................................ || .................................................................................................*................................ - // mul v29.4S, v24.4S, v11.4S // .........................................................................................................................................................*.......................................................... || .............................................................................*.................................................... - // vins v14, x13, 0 // ........................................................................................................................................................................................*........................... || ....................................................................................................*............................. - // trn2 v13.2D, v12.2D, v15.2D // ..................................................................................................................................................................................*................................. || .................................................................................................*................................ 
- // ldr x22, [x5, #-16] // ...............................................................................................................*.................................................................................................... || .......................................................*.......................................................................... - // trn2 v30.2D, v23.2D, v27.2D // ............................................................................................................................................................*....................................................... || ...............................................................................*.................................................. - // vins v14, x16, 1 // ............................................................................................................................................................................................*....................... || ......................................................................................................*........................... - // trn1 v15.2D, v12.2D, v15.2D // .....................................................................................................................................................................................................*.............. || ...............................................................................................................*.................. - // ldr x16, [x5, #-8] // .................................................................................................................*.................................................................................................. || ........................................................*......................................................................... 
- // trn1 v23.2D, v23.2D, v27.2D // ...................................................................................................................................................................*................................................ || .....................................................................................*............................................ - // sqrdmulh v27.4S, v24.4S, v3.4S // .................................................................................................................................................................*.................................................. || ...................................................................................*.............................................. - // vins v12, x22, 0 // ......................................................................................................................................................................................*............................. || ...................................................................................................*.............................. - // mul v11.4S, v17.4S, v11.4S // ..........................................................................................................................................................*......................................................... || ..............................................................................*................................................... - // sqrdmulh v3.4S, v17.4S, v3.4S // ................................................................................................................................................................*................................................... || ..................................................................................*............................................... 
- // vins v12, x16, 1 // ..........................................................................................................................................................................................*......................... || .....................................................................................................*............................ - // mul v24.4S, v13.4S, v21.4S // ...............................................................................................................................................................................................*.................... || .........................................................................................................*........................ - // mls v29.4S, v27.4S, v8.S[0] // .....................................................................................................................................................................*.............................................. || .......................................................................................*.......................................... - // sqrdmulh v27.4S, v13.4S, v2.4S // .........................................................................................................................................................................................*.......................... || .....................................................................................................*............................ - // mls v11.4S, v3.4S, v8.S[0] // ....................................................................................................................................................................*............................................... || ......................................................................................*........................................... 
- // mul v3.4S, v30.4S, v21.4S // ...............................................................................................................................................................*.................................................... || .................................................................................*................................................ - // sub v21.4S, v7.4S, v29.4S // ......................................................................................................................................................................................................*............. || ................................................................................................................*................. - // add v7.4S, v7.4S, v29.4S // .........................................................................................................................................................................................................*.......... || ...................................................................................................................*.............. - // sub v29.4S, v20.4S, v11.4S // ........................................................................................................................................................................*........................................... || ..........................................................................................*....................................... - // add v11.4S, v20.4S, v11.4S // .........................................................................................................................................................................*.......................................... || ...........................................................................................*...................................... 
- // mls v24.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................................................*................ || .............................................................................................................*.................... - // mul v20.4S, v29.4S, v18.4S // .......................................................................................................................................................................................*............................ || ....................................................................................................*............................. - // mul v18.4S, v11.4S, v1.4S // ....................................................................................................................................................................................*............................... || ..................................................................................................*............................... - // sqrdmulh v27.4S, v30.4S, v2.4S // ..................................................................................................................................................................*................................................. || ....................................................................................*............................................. - // sqrdmulh v11.4S, v11.4S, v19.4S // ..............................................................................................................................................................................*..................................... || ...............................................................................................*.................................. 
- // sqrdmulh v25.4S, v29.4S, v25.4S // ................................................................................................................................................................................*................................... || ................................................................................................*................................. - // sub v29.4S, v15.4S, v24.4S // .......................................................................................................................................................................................................*............ || .................................................................................................................*................ - // mls v3.4S, v27.4S, v8.S[0] // ......................................................................................................................................................................*............................................. || ........................................................................................*......................................... - // add v27.4S, v15.4S, v24.4S // ........................................................................................................................................................................................................*........... || ..................................................................................................................*............... - // mls v18.4S, v11.4S, v8.S[0] // ...........................................................................................................................................................................................*........................ || ......................................................................................................*........................... 
- // mls v20.4S, v25.4S, v8.S[0] // .................................................................................................................................................................................................*.................. || ...........................................................................................................*...................... - // sub v11.4S, v23.4S, v3.4S // ............................................................................................................................................................................*....................................... || ..............................................................................................*................................... - // add v23.4S, v23.4S, v3.4S // ...........................................................................................................................................................................*........................................ || .............................................................................................*.................................... - // sub v3.4S, v7.4S, v18.4S // ............................................................................................................................................................................................................*....... || ......................................................................................................................*........... - // mul v25.4S, v11.4S, v14.4S // ..............................................................................................................................................................................................*..................... || ........................................................................................................*......................... 
- // sqrdmulh v11.4S, v11.4S, v12.4S // .............................................................................................................................................................................................*...................... || .......................................................................................................*.......................... - // sqrdmulh v10.4S, v23.4S, v10.4S // .....................................................................................................................................................................................*.............................. || ...................................................................................................*.............................. - // mul v23.4S, v23.4S, v22.4S // ................................................................................................................................................................................................*................... || ..........................................................................................................*....................... - // add v2.4S, v7.4S, v18.4S // .............................................................................................................................................................................................................*...... || .......................................................................................................................*.......... - // mls v25.4S, v11.4S, v8.S[0] // ..................................................................................................................................................................................................*................. || ............................................................................................................*..................... 
- // sub v5.4S, v21.4S, v20.4S // ...............................................................................................................................................................................................................*.... || .........................................................................................................................*........ - // mls v23.4S, v10.4S, v8.S[0] // ....................................................................................................................................................................................................*............... || ..............................................................................................................*................... - // add v4.4S, v21.4S, v20.4S // ..............................................................................................................................................................................................................*..... || ........................................................................................................................*......... - // sub v20.4S, v29.4S, v25.4S // ...........................................................................................................................................................................................................*........ || .....................................................................................................................*............ - // add v19.4S, v29.4S, v25.4S // ..........................................................................................................................................................................................................*......... || ....................................................................................................................*............. 
- // sub v18.4S, v27.4S, v23.4S // ................................................................................................................................................................................................................*... || ..........................................................................................................................*....... - // add v17.4S, v27.4S, v23.4S // .................................................................................................................................................................................................................*.. || ...........................................................................................................................*...... - // st4 {v2.4S,v3.4S,v4.4S,v5.4S}, [x1], #64 // ..................................................................................................................................................................................................................*. || ............................................................................................................................*..... 
- // st4 {v17.4S,v18.4S,v19.4S,v20.4S}, [x2], #64 // ...................................................................................................................................................................................................................* || .................................................................................................................................* - + // Instructions: 125 + // Expected cycles: 135 + // Expected IPC: 0.93 + // + // Wall time: 52.80s + // User time: 52.80s + // + // ---------------------------------------------------- original position -----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------| + ldr q29, [x4, #-16] // ....................................*........................................................................................ + add x1, x1, #64 // *............................................................................................................................ + add x2, x2, #64 // .*........................................................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v27.4S, v16.4S, v21.S[1] // ..*.......................................................................................................................... + // gap // ............................................................................................................................. + mul v18.4S, v16.4S, v21.S[0] // ...*......................................................................................................................... + // gap // ............................................................................................................................. 
+ mul v6.4S, v4.4S, v21.S[0] // ..................*.......................................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v20.4S, v4.4S, v21.S[1] // ..........*.................................................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v10.4S, v22.4S, v21.S[1] // ....*........................................................................................................................ + // gap // ............................................................................................................................. + mul v4.4S, v22.4S, v21.S[0] // .....*....................................................................................................................... + // gap // ............................................................................................................................. + mls v18.4S, v27.4S, v8.S[0] // .......*..................................................................................................................... + // gap // ............................................................................................................................. + mls v6.4S, v20.4S, v8.S[0] // ......................*...................................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v13.4S, v9.4S, v21.S[1] // ......*...................................................................................................................... 
+ // gap // ............................................................................................................................. + mul v9.4S, v9.4S, v21.S[0] // ........*.................................................................................................................... + // gap // ............................................................................................................................. + mls v4.4S, v10.4S, v8.S[0] // .........*................................................................................................................... + // gap // ............................................................................................................................. + add v27.4S, v15.4S, v6.4S // ...........................*................................................................................................. + // gap // ............................................................................................................................. + sub v6.4S, v15.4S, v6.4S // ............................*................................................................................................ + // gap // ............................................................................................................................. + mls v9.4S, v13.4S, v8.S[0] // ............*................................................................................................................ + // gap // ............................................................................................................................. + sub v22.4S, v3.4S, v4.4S // ..............*.............................................................................................................. + // gap // ............................................................................................................................. 
+ mul v19.4S, v6.4S, v7.S[0] // ..................................*.......................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v6.4S, v6.4S, v7.S[1] // .................................*........................................................................................... + // gap // ............................................................................................................................. + sub v20.4S, v12.4S, v9.4S // ................*............................................................................................................ + // gap // ............................................................................................................................. + ldr q16, [x4, #-32] // ........................*.................................................................................................... + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v19.4S, v6.4S, v8.S[0] // .....................................*....................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v25.4S, v20.4S, v7.S[1] // ...................*......................................................................................................... 
+ // gap // ............................................................................................................................. + mul v28.4S, v20.4S, v7.S[0] // .......................*..................................................................................................... + // gap // ............................................................................................................................. + sub v20.4S, v31.4S, v18.4S // ...........*................................................................................................................. + // gap // ............................................................................................................................. + sub v13.4S, v22.4S, v19.4S // ........................................*.................................................................................... + // gap // ............................................................................................................................. + add v10.4S, v22.4S, v19.4S // .........................................*................................................................................... + // gap // ............................................................................................................................. + mls v28.4S, v25.4S, v8.S[0] // ..........................*.................................................................................................. + // gap // ............................................................................................................................. + sqrdmulh v26.4S, v13.4S, v29.S[1] // ...........................................*................................................................................. + // gap // ............................................................................................................................. 
+ mul v24.4S, v13.4S, v29.S[0] // ................................................*............................................................................ + // gap // ............................................................................................................................. + sqrdmulh v19.4S, v10.4S, v16.S[3] // ............................................*................................................................................ + // gap // ............................................................................................................................. + mul v25.4S, v10.4S, v16.S[2] // .............................................*............................................................................... + // gap // ............................................................................................................................. + add v10.4S, v12.4S, v9.4S // .................*........................................................................................................... + // gap // ............................................................................................................................. + mls v24.4S, v26.4S, v8.S[0] // .......................................................*..................................................................... + // gap // ............................................................................................................................. + sub v13.4S, v20.4S, v28.4S // ..............................*.............................................................................................. + // gap // ............................................................................................................................. + mls v25.4S, v19.4S, v8.S[0] // .....................................................*....................................................................... 
+ // gap // ............................................................................................................................. + add v29.4S, v20.4S, v28.4S // ...............................*............................................................................................. + // gap // ............................................................................................................................. + sub v28.4S, v13.4S, v24.4S // ............................................................*................................................................ + // gap // ............................................................................................................................. + add v20.4S, v13.4S, v24.4S // .............................................................*............................................................... + // gap // ............................................................................................................................. + sub v26.4S, v29.4S, v25.4S // .........................................................*................................................................... + // gap // ............................................................................................................................. + add v19.4S, v29.4S, v25.4S // ..............................................................*.............................................................. + // gap // ............................................................................................................................. + sqrdmulh v13.4S, v10.4S, v21.S[3] // ....................*........................................................................................................ + // gap // ............................................................................................................................. 
+ trn2 v29.4S, v20.4S, v28.4S // ............................................................................*................................................ + // gap // ............................................................................................................................. + trn2 v24.4S, v19.4S, v26.4S // ..........................................................................*.................................................. + // gap // ............................................................................................................................. + mul v25.4S, v10.4S, v21.S[2] // .....................*....................................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v15.4S, v27.4S, v21.S[3] // ...................................*......................................................................................... + // gap // ............................................................................................................................. + trn2 v12.2D, v24.2D, v29.2D // .................................................................................*........................................... + // gap // ............................................................................................................................. + mul v22.4S, v27.4S, v21.S[2] // ......................................*...................................................................................... + // gap // ............................................................................................................................. + sqrdmulh v10.4S, v12.4S, v1.4S // ............................................................................................*................................ 
+ // gap // ............................................................................................................................. + mul v12.4S, v12.4S, v23.4S // .............................................................................................*............................... + // gap // ............................................................................................................................. + add v9.4S, v3.4S, v4.4S // ...............*............................................................................................................. + // gap // ............................................................................................................................. + mls v22.4S, v15.4S, v8.S[0] // ..........................................*.................................................................................. + // gap // ............................................................................................................................. + add v6.4S, v31.4S, v18.4S // .............*............................................................................................................... + // gap // ............................................................................................................................. + mls v12.4S, v10.4S, v8.S[0] // ......................................................................................................*...................... + // gap // ............................................................................................................................. + trn1 v27.2D, v24.2D, v29.2D // ...................................................................................*......................................... + // gap // ............................................................................................................................. 
+ add v31.4S, v9.4S, v22.4S // ...............................................*............................................................................. + // gap // ............................................................................................................................. + mls v25.4S, v13.4S, v8.S[0] // .........................*................................................................................................... + // gap // ............................................................................................................................. + sub v21.4S, v27.4S, v12.4S // ..............................................................................................................*.............. + // gap // ............................................................................................................................. + mul v24.4S, v31.4S, v7.S[2] // ....................................................*........................................................................ + // gap // ............................................................................................................................. + sqrdmulh v18.4S, v31.4S, v7.S[3] // ...................................................*......................................................................... + // gap // ............................................................................................................................. + mul v29.4S, v21.4S, v0.4S // ....................................................................................................................*........ + // gap // ............................................................................................................................. + sub v0.4S, v9.4S, v22.4S // ..............................................*.............................................................................. 
+ // gap // ............................................................................................................................. + sqrdmulh v15.4S, v21.4S, v17.4S // ..................................................................................................................*.......... + // gap // ............................................................................................................................. + add v17.4S, v6.4S, v25.4S // ................................*............................................................................................ + // gap // ............................................................................................................................. + sqrdmulh v31.4S, v0.4S, v16.S[1] // .................................................*........................................................................... + // gap // ............................................................................................................................. + mul v7.4S, v0.4S, v16.S[0] // ..................................................*.......................................................................... + // gap // ............................................................................................................................. + mls v24.4S, v18.4S, v8.S[0] // ........................................................*.................................................................... + // gap // ............................................................................................................................. + ldr q0, [x5], #(12*16) // .......................................*..................................................................................... + // gap // ............................................................................................................................. 
+ // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v7.4S, v31.4S, v8.S[0] // ......................................................*...................................................................... + // gap // ............................................................................................................................. + sub v10.4S, v6.4S, v25.4S // .............................*............................................................................................... + // gap // ............................................................................................................................. + sub v16.4S, v17.4S, v24.4S // ...............................................................*............................................................. + // gap // ............................................................................................................................. + add v22.4S, v17.4S, v24.4S // ................................................................*............................................................ + // gap // ............................................................................................................................. + sub v3.4S, v10.4S, v7.4S // ..........................................................*.................................................................. + // gap // ............................................................................................................................. + add v24.4S, v10.4S, v7.4S // ...........................................................*................................................................. 
+ // gap // ............................................................................................................................. + trn1 v25.4S, v22.4S, v16.4S // ...................................................................*......................................................... + // gap // ............................................................................................................................. + trn2 v7.4S, v22.4S, v16.4S // ....................................................................*........................................................ + // gap // ............................................................................................................................. + trn1 v22.4S, v24.4S, v3.4S // .................................................................*........................................................... + // gap // ............................................................................................................................. + trn2 v13.4S, v24.4S, v3.4S // ..................................................................*.......................................................... + // gap // ............................................................................................................................. + trn1 v18.4S, v20.4S, v28.4S // ...........................................................................*................................................. + // gap // ............................................................................................................................. + trn2 v3.2D, v25.2D, v22.2D // ......................................................................*...................................................... + // gap // ............................................................................................................................. 
+ trn2 v20.2D, v7.2D, v13.2D // .......................................................................*..................................................... + // gap // ............................................................................................................................. + sqrdmulh v4.4S, v3.4S, v2.4S // .............................................................................*............................................... + // gap // ............................................................................................................................. + sqrdmulh v16.4S, v20.4S, v2.4S // .....................................................................................*....................................... + // gap // ............................................................................................................................. + mul v24.4S, v20.4S, v0.4S // ................................................................................*............................................ + // gap // ............................................................................................................................. + trn1 v17.4S, v19.4S, v26.4S // .....................................................................*....................................................... + // gap // ............................................................................................................................. + mul v0.4S, v3.4S, v0.4S // ...............................................................................*............................................. + // gap // ............................................................................................................................. + trn1 v2.2D, v7.2D, v13.2D // .........................................................................*................................................... 
+ // gap // ............................................................................................................................. + mls v24.4S, v16.4S, v8.S[0] // ..........................................................................................*.................................. + // gap // ............................................................................................................................. + trn2 v21.2D, v17.2D, v18.2D // ..............................................................................*.............................................. + // gap // ............................................................................................................................. + mls v0.4S, v4.4S, v8.S[0] // ....................................................................................*........................................ + // gap // ............................................................................................................................. + mul v9.4S, v21.4S, v23.4S // .......................................................................................*..................................... + // gap // ............................................................................................................................. + sub v4.4S, v2.4S, v24.4S // ..............................................................................................*.............................. + // gap // ............................................................................................................................. + add v16.4S, v2.4S, v24.4S // ...............................................................................................*............................. + // gap // ............................................................................................................................. 
+ sqrdmulh v13.4S, v21.4S, v1.4S // ......................................................................................*...................................... + // gap // ............................................................................................................................. + mul v11.4S, v4.4S, v11.4S // ....................................................................................................*........................ + // gap // ............................................................................................................................. + sqrdmulh v31.4S, v16.4S, v14.4S // ..................................................................................................*.......................... + // gap // ............................................................................................................................. + sqrdmulh v28.4S, v4.4S, v5.4S // .................................................................................................*........................... + // gap // ............................................................................................................................. + mul v19.4S, v16.4S, v30.4S // ...................................................................................................*......................... + // gap // ............................................................................................................................. + mls v9.4S, v13.4S, v8.S[0] // ...........................................................................................*................................. + // gap // ............................................................................................................................. + trn1 v21.2D, v17.2D, v18.2D // ..................................................................................*.......................................... 
+ // gap // ............................................................................................................................. + ldr q4, [x5, #-48] // ................................................................................................................*............ + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + sub v24.4S, v21.4S, v9.4S // ................................................................................................*............................ + // gap // ............................................................................................................................. + add v5.4S, v21.4S, v9.4S // .....................................................................................................*....................... + // gap // ............................................................................................................................. + trn1 v9.2D, v25.2D, v22.2D // ........................................................................*.................................................... + // gap // ............................................................................................................................. + add v16.4S, v27.4S, v12.4S // ...............................................................................................................*............. + // gap // ............................................................................................................................. 
+ sub v21.4S, v9.4S, v0.4S // ........................................................................................*.................................... + // gap // ............................................................................................................................. + add v6.4S, v9.4S, v0.4S // .........................................................................................*................................... + // gap // ............................................................................................................................. + ldr q0, [x5, #-64] // .........................................................................................................*................... + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + mls v29.4S, v15.4S, v8.S[0] // ......................................................................................................................*...... + // gap // ............................................................................................................................. + sqrdmulh v3.4S, v16.4S, v4.4S // ...................................................................................................................*......... + // gap // ............................................................................................................................. + mul v20.4S, v16.4S, v0.4S // .................................................................................................................*........... 
+ // gap // ............................................................................................................................. + mls v19.4S, v31.4S, v8.S[0] // .......................................................................................................*..................... + // gap // ............................................................................................................................. + sub v7.4S, v24.4S, v29.4S // .........................................................................................................................*... + // gap // ............................................................................................................................. + mls v11.4S, v28.4S, v8.S[0] // ........................................................................................................*.................... + // gap // ............................................................................................................................. + mls v20.4S, v3.4S, v8.S[0] // .....................................................................................................................*....... + // gap // ............................................................................................................................. + sub v10.4S, v6.4S, v19.4S // ..........................................................................................................*.................. + // gap // ............................................................................................................................. + add v9.4S, v6.4S, v19.4S // ...........................................................................................................*................. + // gap // ............................................................................................................................. 
+ add v6.4S, v24.4S, v29.4S // ..........................................................................................................................*.. + // gap // ............................................................................................................................. + add v4.4S, v5.4S, v20.4S // ........................................................................................................................*.... + // gap // ............................................................................................................................. + sub v5.4S, v5.4S, v20.4S // .......................................................................................................................*..... + // gap // ............................................................................................................................. + sub v12.4S, v21.4S, v11.4S // ............................................................................................................*................ + // gap // ............................................................................................................................. + add v11.4S, v21.4S, v11.4S // .............................................................................................................*............... + // gap // ............................................................................................................................. + st4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2], #64 // ............................................................................................................................* + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. 
+ // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ...........................................................................................................................*. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. + // gap // ............................................................................................................................. 
+ // gap // ............................................................................................................................. + + // ------------------------------------------------------- new position -------------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------------ + // add x1, x1, #64 // .*........................................................................................................................... + // add x2, x2, #64 // ..*.......................................................................................................................... + // sqrdmulh v6.4S, v16.4S, v21.S[1] // ...*......................................................................................................................... + // mul v18.4S, v16.4S, v21.S[0] // ....*........................................................................................................................ + // sqrdmulh v16.4S, v22.4S, v21.S[1] // .......*..................................................................................................................... + // mul v22.4S, v22.4S, v21.S[0] // ........*.................................................................................................................... + // sqrdmulh v19.4S, v9.4S, v21.S[1] // ...........*................................................................................................................. + // mls v18.4S, v6.4S, v8.S[0] // .........*................................................................................................................... + // mul v6.4S, v9.4S, v21.S[0] // ............*................................................................................................................ 
+ // mls v22.4S, v16.4S, v8.S[0] // .............*............................................................................................................... + // sqrdmulh v16.4S, v4.4S, v21.S[1] // ......*...................................................................................................................... + // sub v20.4S, v31.4S, v18.4S // .........................*................................................................................................... + // mls v6.4S, v19.4S, v8.S[0] // ................*............................................................................................................ + // add v18.4S, v31.4S, v18.4S // .....................................................*....................................................................... + // sub v19.4S, v3.4S, v22.4S // .................*........................................................................................................... + // add v22.4S, v3.4S, v22.4S // ...................................................*......................................................................... + // sub v31.4S, v12.4S, v6.4S // ....................*........................................................................................................ + // add v6.4S, v12.4S, v6.4S // .................................*........................................................................................... + // mul v3.4S, v4.4S, v21.S[0] // .....*....................................................................................................................... + // sqrdmulh v12.4S, v31.4S, v7.S[1] // .......................*..................................................................................................... + // sqrdmulh v9.4S, v6.4S, v21.S[3] // ..........................................*.................................................................................. 
+ // mul v6.4S, v6.4S, v21.S[2] // .............................................*............................................................................... + // mls v3.4S, v16.4S, v8.S[0] // ..........*.................................................................................................................. + // mul v16.4S, v31.4S, v7.S[0] // ........................*.................................................................................................... + // ldr q31, [x4, #-32] // .....................*....................................................................................................... + // mls v6.4S, v9.4S, v8.S[0] // .........................................................*................................................................... + // mls v16.4S, v12.4S, v8.S[0] // ............................*................................................................................................ + // add v12.4S, v15.4S, v3.4S // ..............*.............................................................................................................. + // sub v3.4S, v15.4S, v3.4S // ...............*............................................................................................................. + // sub v9.4S, v18.4S, v6.4S // ......................................................................*...................................................... + // sub v15.4S, v20.4S, v16.4S // ...................................*......................................................................................... + // add v16.4S, v20.4S, v16.4S // .....................................*....................................................................................... + // add v6.4S, v18.4S, v6.4S // ................................................................*............................................................ 
+ // sqrdmulh v18.4S, v3.4S, v7.S[1] // ...................*......................................................................................................... + // mul v20.4S, v3.4S, v7.S[0] // ..................*.......................................................................................................... + // sqrdmulh v3.4S, v12.4S, v21.S[3] // ..............................................*.............................................................................. + // ldr q4, [x4, #-16] // *............................................................................................................................ + // mls v20.4S, v18.4S, v8.S[0] // ......................*...................................................................................................... + // mul v18.4S, v12.4S, v21.S[2] // ................................................*............................................................................ + // ldr q12, [x5], #(12*16) // ....................................................................*........................................................ + // sub v26.4S, v19.4S, v20.4S // ..........................*.................................................................................................. + // add v19.4S, v19.4S, v20.4S // ...........................*................................................................................................. + // mls v18.4S, v3.4S, v8.S[0] // ....................................................*........................................................................ + // sqrdmulh v20.4S, v26.4S, v4.S[1] // .............................*............................................................................................... + // sqrdmulh v3.4S, v19.4S, v31.S[3] // ...............................*............................................................................................. 
+ // mul v19.4S, v19.4S, v31.S[2] // ................................*............................................................................................ + // sub v21.4S, v22.4S, v18.4S // ..............................................................*.............................................................. + // add v18.4S, v22.4S, v18.4S // ........................................................*.................................................................... + // mul v22.4S, v26.4S, v4.S[0] // ..............................*.............................................................................................. + // sqrdmulh v4.4S, v21.4S, v31.S[1] // .................................................................*........................................................... + // mul v31.4S, v21.4S, v31.S[0] // ..................................................................*.......................................................... + // sqrdmulh v26.4S, v18.4S, v7.S[3] // ............................................................*................................................................ + // mul v18.4S, v18.4S, v7.S[2] // ...........................................................*................................................................. + // mls v19.4S, v3.4S, v8.S[0] // ....................................*........................................................................................ + // mls v31.4S, v4.4S, v8.S[0] // .....................................................................*....................................................... + // mls v22.4S, v20.4S, v8.S[0] // ..................................*.......................................................................................... + // mls v18.4S, v26.4S, v8.S[0] // ...................................................................*......................................................... 
+ // sub v20.4S, v16.4S, v19.4S // ........................................*.................................................................................... + // sub v3.4S, v9.4S, v31.4S // .........................................................................*................................................... + // add v31.4S, v9.4S, v31.4S // ..........................................................................*.................................................. + // sub v9.4S, v15.4S, v22.4S // ......................................*...................................................................................... + // add v22.4S, v15.4S, v22.4S // .......................................*..................................................................................... + // add v16.4S, v16.4S, v19.4S // .........................................*................................................................................... + // sub v19.4S, v6.4S, v18.4S // .......................................................................*..................................................... + // add v6.4S, v6.4S, v18.4S // ........................................................................*.................................................... + // trn1 v18.4S, v31.4S, v3.4S // .............................................................................*............................................... + // trn2 v31.4S, v31.4S, v3.4S // ..............................................................................*.............................................. + // trn1 v3.4S, v6.4S, v19.4S // ...........................................................................*................................................. + // trn2 v6.4S, v6.4S, v19.4S // ............................................................................*................................................ 
+ // trn1 v19.4S, v16.4S, v20.4S // .....................................................................................*....................................... + // trn2 v15.2D, v3.2D, v18.2D // ................................................................................*............................................ + // trn2 v4.2D, v6.2D, v31.2D // .................................................................................*........................................... + // trn1 v18.2D, v3.2D, v18.2D // ........................................................................................................*.................... + // trn1 v6.2D, v6.2D, v31.2D // .......................................................................................*..................................... + // trn2 v16.4S, v16.4S, v20.4S // ............................................*................................................................................ + // trn1 v20.4S, v22.4S, v9.4S // ...............................................................................*............................................. + // trn2 v22.4S, v22.4S, v9.4S // ...........................................*................................................................................. + // sqrdmulh v31.4S, v15.4S, v2.4S // ..................................................................................*.......................................... + // trn2 v3.2D, v19.2D, v20.2D // .........................................................................................*................................... + // mul v9.4S, v15.4S, v12.4S // ......................................................................................*...................................... + // mul v12.4S, v4.4S, v12.4S // ....................................................................................*........................................ 
+ // trn2 v15.2D, v16.2D, v22.2D // ...............................................*............................................................................. + // trn1 v19.2D, v19.2D, v20.2D // ....................................................................................................*........................ + // trn1 v16.2D, v16.2D, v22.2D // .......................................................*..................................................................... + // mls v9.4S, v31.4S, v8.S[0] // ..........................................................................................*.................................. + // sqrdmulh v22.4S, v4.4S, v2.4S // ...................................................................................*......................................... + // sqrdmulh v20.4S, v3.4S, v1.4S // ..............................................................................................*.............................. + // mul v31.4S, v3.4S, v23.4S // ...........................................................................................*................................. + // sub v3.4S, v18.4S, v9.4S // ..........................................................................................................*.................. + // add v18.4S, v18.4S, v9.4S // ...........................................................................................................*................. + // mls v12.4S, v22.4S, v8.S[0] // ........................................................................................*.................................... + // mls v31.4S, v20.4S, v8.S[0] // ...................................................................................................*......................... + // sqrdmulh v22.4S, v15.4S, v1.4S // .................................................*........................................................................... 
+ // mul v20.4S, v15.4S, v23.4S // ..................................................*.......................................................................... + // sub v9.4S, v6.4S, v12.4S // ............................................................................................*................................ + // add v6.4S, v6.4S, v12.4S // .............................................................................................*............................... + // sub v12.4S, v19.4S, v31.4S // ......................................................................................................*...................... + // sqrdmulh v15.4S, v9.4S, v5.4S // .................................................................................................*........................... + // sqrdmulh v4.4S, v6.4S, v14.4S // ................................................................................................*............................ + // mul v6.4S, v6.4S, v30.4S // ..................................................................................................*.......................... + // mul v9.4S, v9.4S, v11.4S // ...............................................................................................*............................. + // add v19.4S, v19.4S, v31.4S // .......................................................................................................*..................... + // mls v20.4S, v22.4S, v8.S[0] // ......................................................*...................................................................... + // mls v6.4S, v4.4S, v8.S[0] // ................................................................................................................*............ + // mls v9.4S, v15.4S, v8.S[0] // ..................................................................................................................*.......... 
+ // ldr q22, [x5, #-64] // ............................................................................................................*................ + // sub v26.4S, v18.4S, v6.4S // ....................................................................................................................*........ + // add v25.4S, v18.4S, v6.4S // .....................................................................................................................*....... + // sub v28.4S, v3.4S, v9.4S // .........................................................................................................................*... + // add v27.4S, v3.4S, v9.4S // ..........................................................................................................................*.. + // sub v6.4S, v16.4S, v20.4S // ..........................................................*.................................................................. + // add v18.4S, v16.4S, v20.4S // .........................................................................................................*................... + // ldr q16, [x5, #-48] // .....................................................................................................*....................... + // mul v22.4S, v18.4S, v22.4S // ...............................................................................................................*............. + // sqrdmulh v20.4S, v6.4S, v17.4S // ...............................................................*............................................................. + // sqrdmulh v18.4S, v18.4S, v16.4S // ..............................................................................................................*.............. + // mul v6.4S, v6.4S, v0.4S // .............................................................*............................................................... 
+ // mls v22.4S, v18.4S, v8.S[0] // ...................................................................................................................*......... + // mls v6.4S, v20.4S, v8.S[0] // .............................................................................................................*............... + // sub v18.4S, v19.4S, v22.4S // ........................................................................................................................*.... + // add v17.4S, v19.4S, v22.4S // .......................................................................................................................*..... + // sub v20.4S, v12.4S, v6.4S // .................................................................................................................*........... + // add v19.4S, v12.4S, v6.4S // ......................................................................................................................*...... + // st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ............................................................................................................................* + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x2], #64 // ...........................................................................................................................*. + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a72.s b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a72.s index 8e900124..2678bae2 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a72.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_a72.s @@ -2,109 +2,54 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. 
the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc -.endm -.macro vsub d,a,b // slothy:no-unfold - sub \d\().4s, \a\().4s, \b\().4s -.endm -.macro vadd d,a,b // slothy:no-unfold - add \d\().4s, \a\().4s, \b\().4s -.endm -.macro vqrdmulh d,a,b // slothy:no-unfold +.macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm -.macro vmul d,a,b // slothy:no-unfold - mul \d\().4s, \a\().4s, \b\().4s -.endm -.macro vmls d,a,b // slothy:no-unfold +.macro vmls d,a,b mls \d\().4s, \a\().4s, \b\().4s .endm -.macro vqrdmulhq d,a,b,i // slothy:no-unfold +.macro vqrdmulhq d,a,b,i sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] .endm -.macro vqdmulhq d,a,b,i // slothy:no-unfold +.macro vqdmulhq d,a,b,i sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] .endm -.macro vmulq d,a,b,i // slothy:no-unfold +.macro vmulq d,a,b,i mul \d\().4s, \a\().4s, \b\().s[\i] .endm -.macro vmlsq d,a,b,i // slothy:no-unfold +.macro vmlsq d,a,b,i mls \d\().4s, \a\().4s, \b\().s[\i] .endm -.macro trn1_d d,a,b // slothy:no-unfold - trn1 \d\().2d, \a\().2d, \b\().2d -.endm -.macro trn2_d d,a,b // slothy:no-unfold - trn2 \d\().2d, \a\().2d, \b\().2d -.endm -.macro trn1_s d,a,b // slothy:no-unfold - trn1 \d\().4s, 
\a\().4s, \b\().4s -.endm -.macro trn2_s d,a,b // slothy:no-unfold - trn2 \d\().4s, \a\().4s, \b\().4s -.endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 mulmodq tmp, \b, \root, \idx0, \idx1 - vsub \b, \a, tmp - vadd \a, \a, tmp -.endm - -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s .endm .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted - vsub \b, \a, tmp - vadd \a, \a, tmp + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s .endm .macro barrett_reduce_single a @@ -120,24 +65,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, 
(-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -162,62 +107,62 @@ xtmp1 .req x11 str \x\()t_31, [\addr, #(-\inc + 8*7)] .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ 
r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 - trn1_s t0, \data0, \data1 - trn2_s t1, \data0, \data1 - trn1_s t2, \data2, \data3 - trn2_s t3, \data2, \data3 - - trn2_d \data2, t0, t2 - trn2_d \data3, t1, t3 - trn1_d \data0, t0, t2 - trn1_d \data1, t1, t3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d .endm .macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 - trn1_s \data_out0, \data_in0, \data_in1 - trn2_s \data_out1, \data_in0, \data_in1 - trn1_s \data_out2, \data_in2, \data_in3 - trn2_s \data_out3, \data_in2, \data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -228,7 +173,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -238,7 +183,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -246,7 +191,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] 
.endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -257,19 +202,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -281,7 +226,7 @@ roots: #include "ntt_dilithium_123_456_78_twiddles.s" .text - .global ntt_dilithium_123_45678_w_scalar + .global ntt_dilithium_123_45678_w_scalar_opt_a72 .global _ntt_dilithium_123_45678_w_scalar .p2align 4 @@ -290,8 +235,8 @@ const_addr: .word 8380417 .word 0 .word 0 -ntt_dilithium_123_45678_w_scalar: -_ntt_dilithium_123_45678_w_scalar: +ntt_dilithium_123_45678_w_scalar_opt_a72: +_ntt_dilithium_123_45678_w_scalar_opt_a72: push_stack in .req x0 @@ -406,43 +351,678 @@ _ntt_dilithium_123_45678_w_scalar: load_roots_123 .p2align 2 + // Instructions: 12 + // Expected cycles: 21 + // Expected IPC: 0.57 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #896] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q19, [x0, #512] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + sqrdmulh v20.4S, v18.4S, v0.S[1] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v13.4S, v18.4S, v0.S[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v13.4S, v20.4S, v8.S[0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.4S, v19.4S, v0.S[1] // ......*....................... + ldr q12, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v26.4S, v19.4S, v0.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v20.4S, v12.4S, v13.4S // .......*...................... + // gap // .............................. + // gap // .............................. + mls v26.4S, v18.4S, v8.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v6.4S, v20.4S, v0.S[3] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v30.4S, v20.4S, v0.S[2] // ...........*.................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0, #512] // .*............................. + // ldr q17, [x0, #896] // *.............................. + // ldr q12, [x0, #384] // ......*........................ + // sqrdmulh v11.4S, v17.4S, v0.S[1] // ..*............................ + // mul v13.4S, v17.4S, v0.S[0] // ...*........................... + // mls v13.4S, v11.4S, v8.S[0] // ....*.......................... + // sqrdmulh v19.4S, v6.4S, v0.S[1] // .....*......................... + // add v20.4S, v12.4S, v13.4S // ........*...................... + // mul v26.4S, v6.4S, v0.S[0] // .......*....................... + // sqrdmulh v6.4S, v20.4S, v0.S[3] // ..........*.................... + // mls v26.4S, v19.4S, v8.S[0] // .........*..................... + // mul v30.4S, v20.4S, v0.S[2] // ...........*................... + + sub count, count, #1 layer123_start: + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Wall time: 5.00s + // User time: 5.00s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q18, [x0, #0] // *........................................................................... + sub v20.4S, v12.4S, v13.4S // ..........................*................................................. 
+ ldr q19, [x0, #256] // ..*......................................................................... + ldr q11, [x0, #768] // ......*..................................................................... + ldr q22, [x0, #128] // .*.......................................................................... + mls v30.4S, v6.4S, v8.S[0] // ...................................*........................................ + ldr q6, [x0, #528] // ....e....................................................................... + ldr q13, [x0, #640] // .....*...................................................................... + // gap // ............................................................................ + sqrdmulh v27.4S, v20.4S, v1.S[1] // ...........................................*................................ + ldr q17, [x0, #912] // .......e.................................................................... + ldr q12, [x0, #400] // ...e........................................................................ + sub v14.4S, v18.4S, v26.4S // ...........*................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v26.4S // ............*............................................................... + mul v20.4S, v20.4S, v1.S[0] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v26.4S, v11.4S, v0.S[1] // ..................*......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v11.4S, v11.4S, v0.S[0] // ...................*........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v13.4S, v0.S[1] // .............*.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v11.4S, v26.4S, v8.S[0] // ....................*....................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.4S, v13.4S, v0.S[0] // ..............*............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.4S, v31.4S, v8.S[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.4S, v19.4S, v11.4S // .....................*...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v19.4S, v19.4S, v11.4S // ......................*..................................................... + mls v20.4S, v27.4S, v8.S[0] // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v11.4S, v17.4S, v0.S[1] // .......................e.................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v27.4S, v22.4S, v13.4S // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v22.4S, v22.4S, v13.4S // ................*........................................................... + mul v13.4S, v17.4S, v0.S[0] // ........................e................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v17.4S, v26.4S, v1.S[1] // ......................................*..................................... + sub v31.4S, v27.4S, v30.4S // ....................................*....................................... + // gap // ............................................................................ + add v30.4S, v27.4S, v30.4S // .....................................*...................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.4S, v19.4S, v0.S[3] // ............................*............................................... + sub v25.4S, v22.4S, v20.4S // ..............................................*............................. 
+ // gap // ............................................................................ + add v20.4S, v22.4S, v20.4S // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v26.4S, v1.S[0] // .......................................*.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v22.4S, v17.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v19.4S, v19.4S, v0.S[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v19.4S, v27.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v27.4S, v14.4S, v22.4S // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v22.4S, v14.4S, v22.4S // ..........................................*................................. + sqrdmulh v17.4S, v30.4S, v1.S[3] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v30.4S, v30.4S, v1.S[2] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v14.4S, v18.4S, v19.4S // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v19.4S // ................................*........................................... 
+ sqrdmulh v19.4S, v31.4S, v2.S[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v30.4S, v17.4S, v8.S[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v17.4S, v31.4S, v2.S[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v17.4S, v19.4S, v8.S[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v18.4S, v30.4S // ...................................................*........................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v18.4S, v30.4S // ....................................................*....................... + sqrdmulh v30.4S, v20.4S, v2.S[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v20.4S, v20.4S, v2.S[2] // ...........................................................*................ + str q19, [x0, #128] // .....................................................................*...... + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + sub v18.4S, v14.4S, v17.4S // ........................................................*................... + // gap // ............................................................................ + add v19.4S, v14.4S, v17.4S // .........................................................*.................. + sqrdmulh v17.4S, v25.4S, v3.S[1] // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v20.4S, v30.4S, v8.S[0] // ............................................................*............... + str q18, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + str q19, [x0, #240] // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + mul v18.4S, v25.4S, v3.S[0] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.4S, v11.4S, v8.S[0] // .........................e.................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v22.4S, v20.4S // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + add v20.4S, v22.4S, v20.4S // ..............................................................*............. + mls v18.4S, v17.4S, v8.S[0] // .................................................................*.......... 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #624] // .........................................................................*.. + sqrdmulh v19.4S, v6.4S, v0.S[1] // ........e................................................................... + // gap // ............................................................................ + str q20, [x0, #496] // ........................................................................*... + add v20.4S, v12.4S, v13.4S // ...........................e................................................ + // gap // ............................................................................ + mul v26.4S, v6.4S, v0.S[0] // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v11.4S, v27.4S, v18.4S // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + add v18.4S, v27.4S, v18.4S // ...................................................................*........ + sqrdmulh v6.4S, v20.4S, v0.S[3] // .................................e.......................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mls v26.4S, v19.4S, v8.S[0] // ..........e................................................................. + str q11, [x0, #880] // ...........................................................................* + // gap // ............................................................................ + str q18, [x0, #752] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + mul v30.4S, v20.4S, v0.S[2] // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + + // ----------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ldr q9, [x0, #0] // ......................................................................*.......................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ......................................................................'...*...................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ......................................................................'.*........................................................................ 
+ // ldr q12, [x0, #(3*(1024/8))] // ....e.................................................................'.........~................................................................ + // ldr q13, [x0, #(4*(1024/8))] // e.....................................................................'.....~.................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .~....................................................................'......*................................................................... + // ldr q15, [x0, #(6*(1024/8))] // ......................................................................'..*....................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ...e..................................................................'........~................................................................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................................................e..........'................................................................~......... + // mul v24.4s, v13.4s, v0.s[0] // ..............................................................e.......'...................................................................~...... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................e...'.......................................................................~.. + // sub v13.4s, v9.4s, v24.4s // .....~................................................................'..........*............................................................... + // add v9.4s, v9.4s, v24.4s // ......~...............................................................'...........*.............................................................. 
+ // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..........~...........................................................'...............*.......................................................... + // mul v24.4s, v14.4s, v0.s[0] // ............~.........................................................'.................*........................................................ + // mls v24.4s, v27.4s, v8.s[0] // .............~........................................................'..................*....................................................... + // sub v14.4s, v10.4s, v24.4s // ...................~..................................................'........................*................................................. + // add v10.4s, v10.4s, v24.4s // ..................~...................................................'.......................*.................................................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ........~.............................................................'.............*............................................................ + // mul v24.4s, v15.4s, v0.s[0] // .........~............................................................'..............*........................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........~..........................................................'................*......................................................... + // sub v15.4s, v11.4s, v24.4s // ..............~.......................................................'...................*...................................................... + // add v11.4s, v11.4s, v24.4s // ...............~......................................................'....................*..................................................... 
+ // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................e....................................................'......................~................................................... + // mul v24.4s, v16.4s, v0.s[0] // ....................e.................................................'.........................~................................................ + // mls v24.4s, v27.4s, v8.s[0] // ......................................................e...............'...........................................................~.............. + // sub v16.4s, v12.4s, v24.4s // ......................................................................'*......................................................................... + // add v12.4s, v12.4s, v24.4s // .............................................................e........'..................................................................~....... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ........................~.............................................'.............................*............................................ + // mul v24.4s, v11.4s, v0.s[2] // .............................~........................................'..................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................~.......................................'...................................*...................................... + // sub v11.4s, v9.4s, v24.4s // ...................................~..................................'........................................*................................. + // add v9.4s, v9.4s, v24.4s // ....................................~.................................'.........................................*................................ 
+ // sqrdmulh v27.4s, v12.4s, v0.s[3] // .................................................................e....'......................................................................~... + // mul v24.4s, v12.4s, v0.s[2] // .....................................................................e'.......................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................'....*..................................................................... + // sub v12.4s, v10.4s, v24.4s // ......................~...............................................'...........................*.............................................. + // add v10.4s, v10.4s, v24.4s // .......................~..............................................'............................*............................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .....................~................................................'..........................*............................................... + // mul v24.4s, v15.4s, v1.s[0] // ...........................~..........................................'................................*......................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................~.........................................'.................................*........................................ + // sub v15.4s, v13.4s, v24.4s // ...............................~......................................'....................................*..................................... + // add v13.4s, v13.4s, v24.4s // ................................~.....................................'.....................................*.................................... 
+ // sqrdmulh v27.4s, v16.4s, v1.s[1] // ..~...................................................................'.......*.................................................................. + // mul v24.4s, v16.4s, v1.s[0] // .......~..............................................................'............*............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................~.....................................................'.....................*.................................................... + // sub v16.4s, v14.4s, v24.4s // .........................~............................................'..............................*........................................... + // add v14.4s, v14.4s, v24.4s // ..........................~...........................................'...............................*.......................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .................................~....................................'......................................*................................... + // mul v24.4s, v10.4s, v1.s[2] // ..................................~...................................'.......................................*.................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................~...............................'...........................................*.............................. + // sub v10.4s, v9.4s, v24.4s // .........................................~............................'..............................................*........................... + // add v9.4s, v9.4s, v24.4s // ..........................................~...........................'...............................................*.......................... 
+ // sqrdmulh v27.4s, v12.4s, v2.s[1] // .....................................~................................'..........................................*............................... + // mul v24.4s, v12.4s, v2.s[0] // .......................................~..............................'............................................*............................. + // mls v24.4s, v27.4s, v8.s[0] // ........................................~.............................'.............................................*............................ + // sub v12.4s, v11.4s, v24.4s // ...............................................~......................'....................................................*..................... + // add v11.4s, v11.4s, v24.4s // ................................................~.....................'.....................................................*.................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...........................................~..........................'................................................*......................... + // mul v24.4s, v14.4s, v2.s[2] // ............................................~.........................'.................................................*........................ + // mls v24.4s, v27.4s, v8.s[0] // ..................................................~...................'.......................................................*.................. + // sub v14.4s, v13.4s, v24.4s // .......................................................~..............'............................................................*............. + // add v13.4s, v13.4s, v24.4s // ........................................................~.............'.............................................................*............ 
+ // sqrdmulh v27.4s, v16.4s, v3.s[1] // .................................................~....................'......................................................*................... + // mul v24.4s, v16.4s, v3.s[0] // .....................................................~................'..........................................................*............... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................~............'..............................................................*........... + // sub v16.4s, v15.4s, v24.4s // ...............................................................~......'....................................................................*..... + // add v15.4s, v15.4s, v24.4s // ................................................................~.....'.....................................................................*.... + // str q9, [x0], #(16) // ..............................................~.......................'...................................................*...................... + // str q10, [x0, #(-16 + 1*(1024/8))] // .............................................~........................'..................................................*....................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ....................................................~.................'.........................................................*................ + // str q12, [x0, #(-16 + 3*(1024/8))] // ...................................................~..................'........................................................*................. + // str q13, [x0, #(-16 + 4*(1024/8))] // ............................................................~.........'.................................................................*........ 
+ // str q14, [x0, #(-16 + 5*(1024/8))] // ..........................................................~...........'...............................................................*.......... + // str q15, [x0, #(-16 + 6*(1024/8))] // ....................................................................~.'.........................................................................* + // str q16, [x0, #(-16 + 7*(1024/8))] // ...................................................................~..'........................................................................*. - ldr_vo data0, in, 0 - ldr_vo data1, in, (1*(1024/8)) - ldr_vo data2, in, (2*(1024/8)) - ldr_vo data3, in, (3*(1024/8)) - ldr_vo data4, in, (4*(1024/8)) - ldr_vo data5, in, (5*(1024/8)) - ldr_vo data6, in, (6*(1024/8)) - ldr_vo data7, in, (7*(1024/8)) - - ct_butterfly data0, data4, root0, 0, 1 - ct_butterfly data1, data5, root0, 0, 1 - ct_butterfly data2, data6, root0, 0, 1 - ct_butterfly data3, data7, root0, 0, 1 - - ct_butterfly data0, data2, root0, 2, 3 - ct_butterfly data1, data3, root0, 2, 3 - ct_butterfly data4, data6, root1, 0, 1 - ct_butterfly data5, data7, root1, 0, 1 - - ct_butterfly data0, data1, root1, 2, 3 - ct_butterfly data2, data3, root2, 0, 1 - ct_butterfly data4, data5, root2, 2, 3 - ct_butterfly data6, data7, root3, 0, 1 - - str_vi data0, in, (16) - str_vo data1, in, (-16 + 1*(1024/8)) - str_vo data2, in, (-16 + 2*(1024/8)) - str_vo data3, in, (-16 + 3*(1024/8)) - str_vo data4, in, (-16 + 4*(1024/8)) - str_vo data5, in, (-16 + 5*(1024/8)) - str_vo data6, in, (-16 + 6*(1024/8)) - str_vo data7, in, (-16 + 7*(1024/8)) - - subs count, count, #1 + sub count, count, #1 cbnz count, layer123_start + // Instructions: 64 + // Expected cycles: 64 + // Expected IPC: 1.00 + // + // Wall time: 1.66s + // User time: 1.66s + // + // ---------------------- original position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + ldr q19, [x0, #768] // 
...*............................................................ + sub v20.4S, v12.4S, v13.4S // .*.............................................................. + mls v30.4S, v6.4S, v8.S[0] // .....*.......................................................... + ldr q18, [x0, #0] // *............................................................... + ldr q11, [x0, #256] // ..*............................................................. + // gap // ................................................................ + ldr q22, [x0, #640] // ......*......................................................... + ldr q13, [x0, #128] // ....*........................................................... + // gap // ................................................................ + sqrdmulh v6.4S, v20.4S, v1.S[1] // .......*........................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sub v17.4S, v18.4S, v26.4S // ........*....................................................... + sqrdmulh v27.4S, v19.4S, v0.S[1] // ...........*.................................................... + // gap // ................................................................ + add v18.4S, v18.4S, v26.4S // .........*...................................................... + // gap // ................................................................ + // gap // ................................................................ + mul v19.4S, v19.4S, v0.S[0] // ............*................................................... + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v20.4S, v20.4S, v1.S[0] // ..........*..................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v12.4S, v22.4S, v0.S[1] // .............*.................................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v19.4S, v27.4S, v8.S[0] // ..............*................................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v22.4S, v22.4S, v0.S[0] // ...............*................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v22.4S, v12.4S, v8.S[0] // ................*............................................... + // gap // ................................................................ + // gap // ................................................................ + sub v27.4S, v11.4S, v19.4S // .................*.............................................. + // gap // ................................................................ + // gap // ................................................................ + add v19.4S, v11.4S, v19.4S // ..................*............................................. + mls v20.4S, v6.4S, v8.S[0] // ...................*............................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v11.4S, v27.4S, v1.S[1] // ......................*......................................... + // gap // ................................................................ + // gap // ................................................................ + add v6.4S, v13.4S, v22.4S // ....................*........................................... + // gap // ................................................................ + // gap // ................................................................ + sub v22.4S, v13.4S, v22.4S // .....................*.......................................... 
+ sqrdmulh v13.4S, v19.4S, v0.S[3] // .........................*...................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v27.4S, v27.4S, v1.S[0] // ............................*................................... + sub v12.4S, v6.4S, v30.4S // .......................*........................................ + // gap // ................................................................ + add v30.4S, v6.4S, v30.4S // ........................*....................................... + // gap // ................................................................ + // gap // ................................................................ + mls v27.4S, v11.4S, v8.S[0] // .............................*.................................. + sub v11.4S, v22.4S, v20.4S // ..........................*..................................... + // gap // ................................................................ + add v20.4S, v22.4S, v20.4S // ...........................*.................................... + // gap // ................................................................ + // gap // ................................................................ + mul v19.4S, v19.4S, v0.S[2] // ..............................*................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ mls v19.4S, v13.4S, v8.S[0] // ...............................*................................ + // gap // ................................................................ + // gap // ................................................................ + sub v22.4S, v17.4S, v27.4S // ................................*............................... + // gap // ................................................................ + // gap // ................................................................ + add v13.4S, v17.4S, v27.4S // .................................*.............................. + sqrdmulh v6.4S, v30.4S, v1.S[3] // ..................................*............................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v30.4S, v30.4S, v1.S[2] // ...................................*............................ + // gap // ................................................................ + // gap // ................................................................ + sub v27.4S, v18.4S, v19.4S // ....................................*........................... + // gap // ................................................................ + // gap // ................................................................ + add v18.4S, v18.4S, v19.4S // .....................................*.......................... + sqrdmulh v19.4S, v12.4S, v2.S[1] // ......................................*......................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + mls v30.4S, v6.4S, v8.S[0] // .......................................*........................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v6.4S, v12.4S, v2.S[0] // ........................................*....................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v6.4S, v19.4S, v8.S[0] // .........................................*...................... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v18.4S, v30.4S // ..........................................*..................... + // gap // ................................................................ + // gap // ................................................................ + add v18.4S, v18.4S, v30.4S // ...........................................*.................... + sqrdmulh v30.4S, v20.4S, v2.S[3] // ............................................*................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + mul v20.4S, v20.4S, v2.S[2] // .............................................*.................. + str q19, [x0, #128] // ..............................................*................. + // gap // ................................................................ + str q18, [x0], #(16) // ...............................................*................ + add v18.4S, v27.4S, v6.4S // .................................................*.............. + // gap // ................................................................ + sqrdmulh v19.4S, v11.4S, v3.S[1] // ..................................................*............. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v20.4S, v30.4S, v8.S[0] // ...................................................*............ + str q18, [x0, #240] // .....................................................*.......... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v18.4S, v11.4S, v3.S[0] // ......................................................*......... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + mls v18.4S, v19.4S, v8.S[0] // .........................................................*...... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v13.4S, v20.4S // .......................................................*........ + // gap // ................................................................ + // gap // ................................................................ + add v20.4S, v13.4S, v20.4S // ........................................................*....... + sub v11.4S, v27.4S, v6.4S // ................................................*............... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q19, [x0, #624] // ..........................................................*..... + // gap // ................................................................ + // gap // ................................................................ + sub v19.4S, v22.4S, v18.4S // ............................................................*... + add v18.4S, v22.4S, v18.4S // .............................................................*.. + str q11, [x0, #368] // ....................................................*........... + str q20, [x0, #496] // ...........................................................*.... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + str q19, [x0, #880] // ..............................................................*. + str q18, [x0, #752] // ...............................................................* + // gap // ................................................................ + + // ------------------------ new position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + // ldr q18, [x0, #0] // ...*............................................................ + // sub v20.4S, v12.4S, v13.4S // .*.............................................................. + // ldr q19, [x0, #256] // ....*........................................................... + // ldr q11, [x0, #768] // *............................................................... + // ldr q22, [x0, #128] // ......*......................................................... + // mls v30.4S, v6.4S, v8.S[0] // ..*............................................................. + // ldr q13, [x0, #640] // .....*.......................................................... + // sqrdmulh v27.4S, v20.4S, v1.S[1] // .......*........................................................ + // sub v14.4S, v18.4S, v26.4S // ........*....................................................... + // add v18.4S, v18.4S, v26.4S // ..........*..................................................... + // mul v20.4S, v20.4S, v1.S[0] // ............*................................................... + // sqrdmulh v26.4S, v11.4S, v0.S[1] // .........*...................................................... + // mul v11.4S, v11.4S, v0.S[0] // ...........*.................................................... + // sqrdmulh v31.4S, v13.4S, v0.S[1] // .............*.................................................. 
+ // mls v11.4S, v26.4S, v8.S[0] // ..............*................................................. + // mul v13.4S, v13.4S, v0.S[0] // ...............*................................................ + // mls v13.4S, v31.4S, v8.S[0] // ................*............................................... + // sub v26.4S, v19.4S, v11.4S // .................*.............................................. + // add v19.4S, v19.4S, v11.4S // ..................*............................................. + // mls v20.4S, v27.4S, v8.S[0] // ...................*............................................ + // add v27.4S, v22.4S, v13.4S // .....................*.......................................... + // sub v22.4S, v22.4S, v13.4S // ......................*......................................... + // sqrdmulh v17.4S, v26.4S, v1.S[1] // ....................*........................................... + // sub v31.4S, v27.4S, v30.4S // .........................*...................................... + // add v30.4S, v27.4S, v30.4S // ..........................*..................................... + // sqrdmulh v27.4S, v19.4S, v0.S[3] // .......................*........................................ + // sub v25.4S, v22.4S, v20.4S // ............................*................................... + // add v20.4S, v22.4S, v20.4S // .............................*.................................. + // mul v22.4S, v26.4S, v1.S[0] // ........................*....................................... + // mls v22.4S, v17.4S, v8.S[0] // ...........................*.................................... + // mul v19.4S, v19.4S, v0.S[2] // ..............................*................................. + // mls v19.4S, v27.4S, v8.S[0] // ...............................*................................ + // sub v27.4S, v14.4S, v22.4S // ................................*............................... 
+ // add v22.4S, v14.4S, v22.4S // .................................*.............................. + // sqrdmulh v17.4S, v30.4S, v1.S[3] // ..................................*............................. + // mul v30.4S, v30.4S, v1.S[2] // ...................................*............................ + // sub v14.4S, v18.4S, v19.4S // ....................................*........................... + // add v18.4S, v18.4S, v19.4S // .....................................*.......................... + // sqrdmulh v19.4S, v31.4S, v2.S[1] // ......................................*......................... + // mls v30.4S, v17.4S, v8.S[0] // .......................................*........................ + // mul v17.4S, v31.4S, v2.S[0] // ........................................*....................... + // mls v17.4S, v19.4S, v8.S[0] // .........................................*...................... + // sub v19.4S, v18.4S, v30.4S // ..........................................*..................... + // add v18.4S, v18.4S, v30.4S // ...........................................*.................... + // sqrdmulh v30.4S, v20.4S, v2.S[3] // ............................................*................... + // mul v20.4S, v20.4S, v2.S[2] // .............................................*.................. + // str q19, [x0, #128] // ..............................................*................. + // str q18, [x0], #(16) // ...............................................*................ + // sub v18.4S, v14.4S, v17.4S // ........................................................*....... + // add v19.4S, v14.4S, v17.4S // ................................................*............... + // sqrdmulh v17.4S, v25.4S, v3.S[1] // .................................................*.............. + // mls v20.4S, v30.4S, v8.S[0] // ..................................................*............. 
+ // str q18, [x0, #368] // ............................................................*... + // str q19, [x0, #240] // ...................................................*............ + // mul v18.4S, v25.4S, v3.S[0] // ....................................................*........... + // sub v19.4S, v22.4S, v20.4S // ......................................................*......... + // add v20.4S, v22.4S, v20.4S // .......................................................*........ + // mls v18.4S, v17.4S, v8.S[0] // .....................................................*.......... + // str q19, [x0, #624] // .........................................................*...... + // str q20, [x0, #496] // .............................................................*.. + // sub v11.4S, v27.4S, v18.4S // ..........................................................*..... + // add v18.4S, v27.4S, v18.4S // ...........................................................*.... + // str q11, [x0, #880] // ..............................................................*. + // str q18, [x0, #752] // ...............................................................* + restore inp, STACK0 add inpp, inp, #64 @@ -461,1346 +1041,1132 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - ldr x19, [x4, #16] // ......................................e................................................................................................................................................................................. - ldr x14, [x4, #24] // .......................................e................................................................................................................................................................................ - ldr x16, [x5] , #192 // ..............................................................................................................................e......................................................................................... 
- ldr x25, [x4, #32] // ..........................................e............................................................................................................................................................................. - ldr x22, [x5, #-184] // ...............................................................................................................................e........................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x0, [x4] , #64 // ..................................e..................................................................................................................................................................................... - ldr x6, [x5, #-88] // ...........................................................................................................................................................................e............................................ - ldr x13, [x5, #-144] // ..........................................................................................................................................e............................................................................. - ldr x11, [x5, #-152] // .......................................................................................................................................e................................................................................ - ldr x23, [x5, #-96] // ..........................................................................................................................................................................e............................................. 
- ldr x28, [x5, #-160] // ......................................................................................................................................e................................................................................. - ldr x12, [x5, #-16] // ..............................................................................................................................................................................................e......................... - ldr x24, [x5, #-72] // ...............................................................................................................................................................................e........................................ - ldr x17, [x5, #-112] // ..................................................................................................................................................e..................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x7, [x4, #-16] // ..............................................e......................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ldr x27, [x5, #-40] // .......................................................................................................................................................................................e................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x9, [x5, #-80] // ..............................................................................................................................................................................e......................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ldr x15, [x5, #-128] // ..............................................................................................................................................e......................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vins v4, x19, 0 // ........................................e............................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v4, x14, 1 // .........................................e.............................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v21, x13, 0 // ............................................................................................................................................e........................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v6, x17, 0 // ....................................................................................................................................................e................................................................... - vins v0, x9, 0 // ................................................................................................................................................................................e....................................... - ldr x9, [x5, #-8] // ...............................................................................................................................................................................................e........................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vins v11, x25, 0 // ............................................e........................................................................................................................................................................... - ldr x25, [x5, #-104] // ...................................................................................................................................................e.................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v30, x23, 0 // ............................................................................................................................................................................e........................................... - vins v9, x0, 0 // ....................................e................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ + // Instructions: 95 + // Expected cycles: 88 + // Expected IPC: 1.08 + // + // Wall time: 7.72s + // User time: 7.72s + // + // ------------------------------------- original position --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + ldr q20, [x2, #64] // .....*......................................................................................... + ldr q18, [x4], #64 // .*............................................................................................. + // gap // ............................................................................................... + ldr q19, [x2, #112] // *.............................................................................................. 
+ ldr q11, [x2, #96] // ....*.......................................................................................... + // gap // ............................................................................................... + ldr q0, [x2, #80] // ......*........................................................................................ + add x2, x2, #64 // ..............................*................................................................ + ldr q30, [x1, #112] // .........*..................................................................................... + ldr q6, [x1, #96] // ................*.............................................................................. + ldr q27, [x1, #80] // ..............*................................................................................ + // gap // ............................................................................................... + ldr q17, [x1, #64] // ................................................*.............................................. + add x1, x1, #64 // .........................................................................................*..... + sqrdmulh v12.4S, v20.4S, v18.S[1] // .....................*......................................................................... + ldr q14, [x4, #-32] // ...*........................................................................................... + ldr q26, [x5, #96] // ....................................................*.......................................... + // gap // ............................................................................................... + mul v20.4S, v20.4S, v18.S[0] // ..........................*.................................................................... + ldr q1, [x5, #112] // ................................*.............................................................. 
+ ldr q13, [x5, #176] // ....................................................................................*.......... + ldr q31, [x4, #-16] // .........................................*..................................................... + ldr q24, [x5, #128] // ..........................................................................................*.... + // gap // ............................................................................................... + ldr q16, [x4, #-48] // ..*............................................................................................ + sqrdmulh v25.4S, v19.4S, v18.S[1] // .......*....................................................................................... + ldr q22, [x5, #144] // ............................................................................................*.. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v19.4S, v19.4S, v18.S[0] // ........*...................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v10.4S, v11.4S, v18.S[0] // ........................*...................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v11.4S, v11.4S, v18.S[1] // ...........................*................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v19.4S, v25.4S, v8.S[0] // ...........*................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ sqrdmulh v25.4S, v0.4S, v18.S[1] // ..........*.................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v10.4S, v11.4S, v8.S[0] // ...............................*............................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v11.4S, v30.4S, v19.4S // ...............*............................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v0.4S, v0.4S, v18.S[0] // ............*.................................................................................. + sub v19.4S, v30.4S, v19.4S // .............................*................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + sqrdmulh v30.4S, v11.4S, v18.S[3] // .................*............................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + add v23.4S, v6.4S, v10.4S // ...................................*........................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v6.4S, v6.4S, v10.4S // .....................................*......................................................... + mul v11.4S, v11.4S, v18.S[2] // ...................*........................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v10.4S, v23.4S, v18.S[3] // ....................................*.......................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + mul v18.4S, v23.4S, v18.S[2] // .......................................*....................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v0.4S, v25.4S, v8.S[0] // .............*................................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v20.4S, v12.4S, v8.S[0] // ............................*.................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v12.4S, v19.4S, v16.S[1] // .................................*............................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + add v25.4S, v27.4S, v0.4S // ..................*............................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v0.4S, v27.4S, v0.4S // ......................*........................................................................ + mul v19.4S, v19.4S, v16.S[0] // ..................................*............................................................ + // gap // ............................................................................................... + add v27.4S, v17.4S, v20.4S // ...................................................*........................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v20.4S, v17.4S, v20.4S // ........................................................*...................................... + sqrdmulh v17.4S, v6.4S, v16.S[1] // ..................................................*............................................ 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v19.4S, v12.4S, v8.S[0] // ......................................*........................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v11.4S, v30.4S, v8.S[0] // ....................*.......................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v30.4S, v6.4S, v16.S[0] // ..............................................*................................................ 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + sub v6.4S, v0.4S, v19.4S // ..........................................*.................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v19.4S, v0.4S, v19.4S // ...........................................*................................................... + mls v30.4S, v17.4S, v8.S[0] // .......................................................*....................................... + // gap // ............................................................................................... + sub v0.4S, v25.4S, v11.4S // .......................*....................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v11.4S, v25.4S, v11.4S // .............................................*................................................. + sqrdmulh v17.4S, v6.4S, v31.S[1] // ............................................*.................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ mul v6.4S, v6.4S, v31.S[0] // .................................................*............................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + add v12.4S, v20.4S, v30.4S // ............................................................*.................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v20.4S, v20.4S, v30.4S // .............................................................*................................. + sqrdmulh v30.4S, v19.4S, v14.S[3] // .....................................................*......................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v6.4S, v17.4S, v8.S[0] // ......................................................*........................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + mul v19.4S, v19.4S, v14.S[2] // .........................................................*..................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v19.4S, v30.4S, v8.S[0] // ..........................................................*.................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v30.4S, v20.4S, v6.4S // ...............................................................*............................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v20.4S, v20.4S, v6.4S // ................................................................*.............................. + mul v6.4S, v0.4S, v14.S[0] // .........................*..................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v0.4S, v0.4S, v14.S[1] // ...............................................*............................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v17.4S, v12.4S, v19.4S // .................................................................*............................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v19.4S, v12.4S, v19.4S // .....................................................................*......................... + sqrdmulh v12.4S, v11.4S, v16.S[3] // ..................................................................*............................ + // gap // ............................................................................................... + trn1 v2.4S, v20.4S, v30.4S // ......................................................................*........................ + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v14.4S, v11.4S, v16.S[2] // .......................................................................*....................... + trn2 v20.4S, v20.4S, v30.4S // ........................................................................*...................... + // gap // ............................................................................................... 
+ trn2 v11.4S, v17.4S, v19.4S // .........................................................................*..................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v18.4S, v10.4S, v8.S[0] // ........................................*...................................................... + trn1 v9.4S, v17.4S, v19.4S // ............................................................................*.................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v6.4S, v0.4S, v8.S[0] // ...........................................................*................................... + trn1 v19.2D, v11.2D, v20.2D // .............................................................................*................. + // gap // ............................................................................................... + trn2 v20.2D, v11.2D, v20.2D // ..............................................................................*................ + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v14.4S, v12.4S, v8.S[0] // ...........................................................................*................... + trn2 v0.2D, v9.2D, v2.2D // ..................................................................................*............ 
+ // gap // ............................................................................................... + sub v11.4S, v27.4S, v18.4S // ..............................................................*................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + add v18.4S, v27.4S, v18.4S // ..........................................................................*.................... + sqrdmulh v30.4S, v20.4S, v1.4S // .................................................................................*............. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v17.4S, v0.4S, v1.4S // ........................................................................................*...... + sub v31.4S, v11.4S, v6.4S // ...................................................................*........................... + // gap // ............................................................................................... + add v27.4S, v11.4S, v6.4S // ....................................................................*.......................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v11.4S, v20.4S, v26.4S // ...................................................................................*........... 
+ sub v4.4S, v18.4S, v14.4S // ...............................................................................*............... + // gap // ............................................................................................... + add v7.4S, v18.4S, v14.4S // ................................................................................*.............. + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v11.4S, v30.4S, v8.S[0] // .....................................................................................*......... + trn2 v30.4S, v27.4S, v31.4S // .......................................................................................*....... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v3.4S, v0.4S, v26.4S // ...........................................................................................*... + trn2 v26.4S, v7.4S, v4.4S // ......................................................................................*........ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ mls v3.4S, v17.4S, v8.S[0] // ..............................................................................................* + // gap // ............................................................................................... + // gap // ............................................................................................... + add v23.4S, v19.4S, v11.4S // .............................................................................................*. + // gap // ............................................................................................... + // gap // ............................................................................................... + + // ---------------------------------------- new position ----------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + // ldr q14, [x2, #112] // ..*............................................................................................ + // ldr q18, [x4], #64 // .*............................................................................................. + // ldr q0, [x4, #-48] // ...................*........................................................................... + // ldr q10, [x4, #-32] // ............*.................................................................................. + // ldr q28, [x2, #96] // ...*........................................................................................... + // ldr q17, [x2, #64] // *.............................................................................................. + // ldr q29, [x2, #80] // ....*.......................................................................................... + // sqrdmulh v20.4S, v14.4S, v18.S[1] // ....................*.......................................................................... 
+ // mul v22.4S, v14.4S, v18.S[0] // ......................*........................................................................ + // ldr q2, [x1, #112] // ......*........................................................................................ + // sqrdmulh v19.4S, v29.4S, v18.S[1] // ..........................*.................................................................... + // mls v22.4S, v20.4S, v8.S[0] // .........................*..................................................................... + // mul v9.4S, v29.4S, v18.S[0] // .............................*................................................................. + // mls v9.4S, v19.4S, v8.S[0] // .....................................*......................................................... + // ldr q6, [x1, #80] // ........*...................................................................................... + // add v20.4S, v2.4S, v22.4S // ............................*.................................................................. + // ldr q3, [x1, #96] // .......*....................................................................................... + // sqrdmulh v31.4S, v20.4S, v18.S[3] // ...............................*............................................................... + // add v19.4S, v6.4S, v9.4S // ........................................*...................................................... + // mul v23.4S, v20.4S, v18.S[2] // ..................................*............................................................ + // mls v23.4S, v31.4S, v8.S[0] // ...............................................*............................................... + // sqrdmulh v5.4S, v17.4S, v18.S[1] // ...........*................................................................................... + // sub v31.4S, v6.4S, v9.4S // .........................................*..................................................... 
+ // sub v20.4S, v19.4S, v23.4S // ....................................................*.......................................... + // mul v7.4S, v28.4S, v18.S[0] // .......................*....................................................................... + // mul v27.4S, v20.4S, v10.S[0] // ................................................................*.............................. + // mul v13.4S, v17.4S, v18.S[0] // ..............*................................................................................ + // sqrdmulh v28.4S, v28.4S, v18.S[1] // ........................*...................................................................... + // mls v13.4S, v5.4S, v8.S[0] // ......................................*........................................................ + // sub v25.4S, v2.4S, v22.4S // ..............................*................................................................ + // add x2, x2, #64 // .....*......................................................................................... + // mls v7.4S, v28.4S, v8.S[0] // ...........................*................................................................... + // ldr q28, [x5, #112] // ...............*............................................................................... + // sqrdmulh v14.4S, v25.4S, v0.S[1] // .......................................*....................................................... + // mul v15.4S, v25.4S, v0.S[0] // ..........................................*.................................................... + // add v16.4S, v3.4S, v7.4S // ................................*.............................................................. + // sqrdmulh v25.4S, v16.4S, v18.S[3] // ...................................*........................................................... + // sub v2.4S, v3.4S, v7.4S // .................................*............................................................. 
+ // mls v15.4S, v14.4S, v8.S[0] // ..............................................*................................................ + // mul v4.4S, v16.4S, v18.S[2] // ....................................*.......................................................... + // mls v4.4S, v25.4S, v8.S[0] // .........................................................................*..................... + // ldr q25, [x4, #-16] // .................*............................................................................. + // sub v30.4S, v31.4S, v15.4S // .................................................*............................................. + // add v31.4S, v31.4S, v15.4S // ..................................................*............................................ + // sqrdmulh v15.4S, v30.4S, v25.S[1] // ......................................................*........................................ + // add v11.4S, v19.4S, v23.4S // .....................................................*......................................... + // mul v19.4S, v2.4S, v0.S[0] // ................................................*.............................................. + // sqrdmulh v3.4S, v20.4S, v10.S[1] // .................................................................*............................. + // ldr q20, [x1, #64] // .........*..................................................................................... + // mul v29.4S, v30.4S, v25.S[0] // .......................................................*....................................... + // sqrdmulh v7.4S, v2.4S, v0.S[1] // .............................................*................................................. + // add v18.4S, v20.4S, v13.4S // ...........................................*................................................... + // ldr q14, [x5, #96] // .............*................................................................................. 
+ // sqrdmulh v25.4S, v31.4S, v10.S[3] // ..........................................................*.................................... + // mls v29.4S, v15.4S, v8.S[0] // ...........................................................*................................... + // mls v19.4S, v7.4S, v8.S[0] // ...................................................*........................................... + // sub v7.4S, v20.4S, v13.4S // ............................................*.................................................. + // mul v5.4S, v31.4S, v10.S[2] // ............................................................*.................................. + // mls v5.4S, v25.4S, v8.S[0] // .............................................................*................................. + // mls v27.4S, v3.4S, v8.S[0] // ...........................................................................*................... + // add v13.4S, v7.4S, v19.4S // ........................................................*...................................... + // sub v9.4S, v7.4S, v19.4S // .........................................................*..................................... + // sub v1.4S, v18.4S, v4.4S // ................................................................................*.............. + // sub v26.4S, v9.4S, v29.4S // ..............................................................*................................ + // add v16.4S, v9.4S, v29.4S // ...............................................................*............................... + // add v9.4S, v13.4S, v5.4S // ..................................................................*............................ + // sqrdmulh v29.4S, v11.4S, v0.S[3] // ....................................................................*.......................... + // sub v31.4S, v1.4S, v27.4S // ....................................................................................*.......... 
+ // add v27.4S, v1.4S, v27.4S // .....................................................................................*......... + // sub v21.4S, v13.4S, v5.4S // ...................................................................*........................... + // trn1 v2.4S, v16.4S, v26.4S // .....................................................................*......................... + // mul v5.4S, v11.4S, v0.S[2] // ......................................................................*........................ + // trn2 v7.4S, v16.4S, v26.4S // .......................................................................*....................... + // trn2 v13.4S, v9.4S, v21.4S // ........................................................................*...................... + // add v12.4S, v18.4S, v4.4S // .................................................................................*............. + // mls v5.4S, v29.4S, v8.S[0] // ..............................................................................*................ + // trn1 v9.4S, v9.4S, v21.4S // ..........................................................................*.................... + // trn1 v19.2D, v13.2D, v7.2D // ............................................................................*.................. + // trn2 v13.2D, v13.2D, v7.2D // .............................................................................*................. + // sub v4.4S, v12.4S, v5.4S // .......................................................................................*....... + // add v7.4S, v12.4S, v5.4S // ........................................................................................*...... + // sqrdmulh v20.4S, v13.4S, v28.4S // ..................................................................................*............ + // trn2 v0.2D, v9.2D, v2.2D // ...............................................................................*............... 
+ // mul v11.4S, v13.4S, v14.4S // ......................................................................................*........ + // ldr q13, [x5, #176] // ................*.............................................................................. + // mls v11.4S, v20.4S, v8.S[0] // .........................................................................................*..... + // trn2 v26.4S, v7.4S, v4.4S // ............................................................................................*.. + // trn2 v30.4S, v27.4S, v31.4S // ..........................................................................................*.... + // sqrdmulh v1.4S, v0.4S, v28.4S // ...................................................................................*........... + // add x1, x1, #64 // ..........*.................................................................................... + // ldr q24, [x5, #128] // ..................*............................................................................ + // mul v3.4S, v0.4S, v14.4S // ...........................................................................................*... + // ldr q22, [x5, #144] // .....................*......................................................................... + // add v23.4S, v19.4S, v11.4S // ..............................................................................................* + // mls v3.4S, v1.4S, v8.S[0] // .............................................................................................*. + sub count, count, #1 -.p2align 2 layer45678_start: - ldr x8, [x1, #80] // ....*................................................................................................................................................................................................................... 
- vins v15, x15, 0 // ................................................................................................................................................*....................................................................... - ldr x13, [x2, #112] // ............................*........................................................................................................................................................................................... - ldr x21, [x2, #80] // ....................*................................................................................................................................................................................................... - ldr x18, [x5, #-136] // ...........................................................................................................................................*............................................................................ - vins v24, x28, 0 // ........................................................................................................................................*............................................................................... - ldr x15, [x1, #112] // ............*........................................................................................................................................................................................................... - ldr x19, [x4, #16] // ......................................e................................................................................................................................................................................. - vins v30, x6, 1 // .............................................................................................................................................................................*.......................................... 
- ldr x10, [x2, #64] // ................*....................................................................................................................................................................................................... - ldr x29, [x2, #96] // ........................*............................................................................................................................................................................................... - vins v0, x24, 1 // .................................................................................................................................................................................*...................................... - vins v10, x13, 0 // ..............................*......................................................................................................................................................................................... - ldr x23, [x1, #96] // ........*............................................................................................................................................................................................................... - vins v2, x8, 0 // ......*................................................................................................................................................................................................................. - ldr x6, [x4, #-56] // ...................................*.................................................................................................................................................................................... - vins v21, x18, 1 // .............................................................................................................................................*.......................................................................... 
- ldr x0, [x1, #88] // .....*.................................................................................................................................................................................................................. - vins v19, x21, 0 // ......................*................................................................................................................................................................................................. - ldr x26, [x1, #72] // .*...................................................................................................................................................................................................................... - ldr x13, [x2, #120] // .............................*.......................................................................................................................................................................................... - vins v25, x10, 0 // ..................*..................................................................................................................................................................................................... - vins v29, x29, 0 // ..........................*............................................................................................................................................................................................. - ldr x28, [x1, #104] // .........*.............................................................................................................................................................................................................. - vins v26, x23, 0 // ..........*............................................................................................................................................................................................................. 
- ldr x21, [x2, #88] // .....................*.................................................................................................................................................................................................. - vins v24, x11, 1 // .........................................................................................................................................*.............................................................................. - ldr x14, [x4, #24] // .......................................e................................................................................................................................................................................ - vins v14, x15, 0 // ..............*......................................................................................................................................................................................................... - vins v9, x6, 1 // .....................................*.................................................................................................................................................................................. - vins v6, x25, 1 // .....................................................................................................................................................*.................................................................. - ldr x11, [x2, #104] // .........................*.............................................................................................................................................................................................. - vins v10, x13, 1 // ...............................*........................................................................................................................................................................................ 
- ldr x20, [x1, #64] // *....................................................................................................................................................................................................................... - vins v18, x16, 0 // ................................................................................................................................*....................................................................................... - ldr x16, [x5] , #192 // ..............................................................................................................................e......................................................................................... - ldr x25, [x4, #32] // ..........................................e............................................................................................................................................................................. - vins v19, x21, 1 // .......................*................................................................................................................................................................................................ - ldr x18, [x1, #120] // .............*.......................................................................................................................................................................................................... - ldr x23, [x2, #72] // .................*...................................................................................................................................................................................................... - ldr x15, [x4, #-8] // ...............................................*........................................................................................................................................................................ 
- vins v2, x0, 1 // .......*................................................................................................................................................................................................................ - vins v29, x11, 1 // ...........................*............................................................................................................................................................................................ - vins v26, x28, 1 // ...........*............................................................................................................................................................................................................ - ldr x8, [x5, #-368] // ..................................................................................................................................*..................................................................................... - vins v5, x20, 0 // ..*..................................................................................................................................................................................................................... - ldr x0, [x5, #-240] // ......................................................................................................................................................................................*................................. - vins v27, x12, 0 // ................................................................................................................................................................................................*....................... - add x2, x2, #64 // .................................*...................................................................................................................................................................................... 
- vqrdmulhq v7, v10, v9, 1 // ..................................................................*..................................................................................................................................................... - vins v14, x18, 1 // ...............*........................................................................................................................................................................................................ - vins v25, x23, 1 // ...................*.................................................................................................................................................................................................... - ldr x21, [x5, #-256] // ..................................................................................................................................................................................*..................................... - ldr x29, [x5, #-360] // ...................................................................................................................................*.................................................................................... - vins v18, x22, 1 // .................................................................................................................................*...................................................................................... - ldr x12, [x4, #-24] // ...........................................*............................................................................................................................................................................ - vmulq v20, v10, v9, 0 // .................................................................*...................................................................................................................................................... 
- vins v23, x0, 0 // ........................................................................................................................................................................................*............................... - ldr x22, [x5, #-184] // ...............................................................................................................................e........................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x0, [x4] , #64 // ..................................e..................................................................................................................................................................................... - ldr x6, [x5, #-88] // ...........................................................................................................................................................................e............................................ - vqrdmulhq v22, v29, v9, 1 // .............................................................*.......................................................................................................................................................... - vins v5, x26, 1 // ...*.................................................................................................................................................................................................................... - ldr x13, [x5, #-144] // ..........................................................................................................................................e............................................................................. 
- ldr x11, [x5, #-152] // .......................................................................................................................................e................................................................................ - vins v16, x21, 0 // ....................................................................................................................................................................................*................................... - ldr x21, [x5, #-248] // ...................................................................................................................................................................................*.................................... - vmlsq v20, v7, v8, 0 // ...................................................................*.................................................................................................................................................... - ldr x10, [x5, #-224] // ..........................................................................................................................................................................................*............................. - vins v11, x12, 1 // .............................................*.......................................................................................................................................................................... - add x1, x1, #64 // ................................*....................................................................................................................................................................................... - vmulq v13, v19, v9, 0 // .......................................................*................................................................................................................................................................ 
- ldr x23, [x5, #-96] // ..........................................................................................................................................................................e............................................. - ldr x28, [x5, #-160] // ......................................................................................................................................e................................................................................. - vins v28, x8, 0 // ....................................................................................................................................*................................................................................... - ldr x12, [x5, #-16] // ..............................................................................................................................................................................................e......................... - ldr x24, [x5, #-72] // ...............................................................................................................................................................................e........................................ - vmulq v17, v29, v9, 0 // ............................................................*........................................................................................................................................................... - vins v12, x7, 0 // ................................................*....................................................................................................................................................................... - ldr x8, [x5, #-216] // ...........................................................................................................................................................................................*............................ 
- vins v27, x9, 1 // .................................................................................................................................................................................................*...................... - ldr x17, [x5, #-112] // ..................................................................................................................................................e..................................................................... - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v31, v19, v9, 1 // ........................................................*............................................................................................................................................................... - vadd v3, v14, v20 // .....................................................................*.................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - vsub v1, v14, v20 // ....................................................................*................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vins v16, x21, 1 // .....................................................................................................................................................................................*.................................. - vmlsq v17, v22, v8, 0 // ..............................................................*......................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v12, x15, 1 // .................................................*...................................................................................................................................................................... 
- vqrdmulhq v14, v3, v9, 3 // ............................................................................*........................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - ldr x7, [x4, #-16] // ..............................................e......................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v7, v3, v9, 2 // ...........................................................................*............................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v13, v31, v8, 0 // .........................................................*.............................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v7, v14, v8, 0 // .............................................................................*.......................................................................................................................................... - vadd v22, v26, v17 // ................................................................*....................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vadd v3, v2, v13 // ...........................................................*............................................................................................................................................................ 
- vmulq v10, v25, v9, 0 // ..................................................*..................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v19, v22, v9, 2 // ......................................................................*................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v14, v22, v9, 3 // .......................................................................*................................................................................................................................................ - vadd v22, v3, v7 // ...............................................................................*........................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v29, v26, v17 // ...............................................................*........................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v25, v25, v9, 1 // ...................................................*.................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v3, v3, v7 // ..............................................................................*......................................................................................................................................... 
- vmulq v31, v22, v4, 2 // ..........................................................................................*............................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v10, v25, v8, 0 // ....................................................*................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v17, v22, v4, 3 // ...........................................................................................*............................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vadd v26, v5, v10 // ......................................................*................................................................................................................................................................. - vmlsq v19, v14, v8, 0 // ........................................................................*............................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v14, v1, v4, 0 // .....................................................................................*.................................................................................................................................. 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v7, v3, v11, 0 // ...............................................................................................*........................................................................................................................ - vsub v20, v26, v19 // .........................................................................*.............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- vins v28, x29, 1 // .....................................................................................................................................*.................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v3, v3, v11, 1 // ................................................................................................*....................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v22, v1, v4, 1 // ......................................................................................*................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v7, v3, v8, 0 // .................................................................................................*...................................................................................................................... 
- vadd v3, v26, v19 // ..........................................................................*............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v14, v22, v8, 0 // .......................................................................................*................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v23, x27, 1 // .........................................................................................................................................................................................*.............................. 
- vsub v25, v2, v13 // ..........................................................*............................................................................................................................................................. - ldr x27, [x5, #-40] // .......................................................................................................................................................................................e................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v31, v17, v8, 0 // ............................................................................................*........................................................................................................................... - vadd v22, v20, v7 // ...................................................................................................*.................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v20, v20, v7 // ..................................................................................................*..................................................................................................................... - ldr x9, [x5, #-80] // ..............................................................................................................................................................................e......................................... 
- // gap // ........................................................................................................................................................................................................................ - vsub v1, v5, v10 // .....................................................*.................................................................................................................................................................. - vmulq v2, v29, v4, 0 // ................................................................................*....................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vadd v26, v25, v14 // .........................................................................................*.............................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v10, v29, v4, 1 // .................................................................................*...................................................................................................................................... 
- vadd v5, v3, v31 // ..............................................................................................*......................................................................................................................... - ldr x15, [x5, #-128] // ..............................................................................................................................................e......................................................................... - vsub v19, v25, v14 // ........................................................................................*............................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v25, v3, v31 // .............................................................................................*.......................................................................................................................... - vqrdmulhq v13, v26, v11, 3 // .....................................................................................................*.................................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- trn1_s v7, v22, v20 // ................................................................................................................*....................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2_s v31, v22, v20 // .................................................................................................................*...................................................................................................... - vmlsq v2, v10, v8, 0 // ..................................................................................*..................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - trn2_s v22, v5, v25 // ...............................................................................................................*........................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - trn1_s v17, v5, v25 // ..............................................................................................................*......................................................................................................... - vmulq v25, v19, v12, 0 // .........................................................................................................*.............................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v5, v19, v12, 1 // ..........................................................................................................*............................................................................................................. 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2_d v10, v22, v31 // ...................................................................................................................*.................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v19, v26, v11, 2 // ....................................................................................................*................................................................................................................... - trn2_d v26, v17, v7 // ..................................................................................................................*..................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- trn1_d v14, v22, v31 // .....................................................................................................................*.................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v31, x10, 0 // ............................................................................................................................................................................................*........................... - vmul v20, v10, v18 // ...........................................................................................................................................................*............................................................ - ldr x10, [x5, #-312] // ...............................................................................................................................................*........................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vqrdmulh v9, v10, v28 // ............................................................................................................................................................*........................................................... - vadd v10, v1, v2 // ....................................................................................*................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v3, v26, v28 // .......................................................................................................................................................*................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v31, x8, 1 // .............................................................................................................................................................................................*.......................... - vmlsq v20, v9, v8, 0 // .............................................................................................................................................................*.......................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v19, v13, v8, 0 // ......................................................................................................*................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v13, v1, v2 // ...................................................................................*.................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v25, v5, v8, 0 // ...........................................................................................................*............................................................................................................ - vins v4, x19, 0 // ........................................e............................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vadd v12, v14, v20 // ...............................................................................................................................................................*........................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v28, v26, v18 // ......................................................................................................................................................*................................................................. 
- vadd v1, v10, v19 // ........................................................................................................*............................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v15, x10, 1 // .................................................................................................................................................*...................................................................... - vsub v18, v13, v25 // ............................................................................................................*........................................................................................................... - vqrdmulh v11, v12, v21 // .................................................................................................................................................................*...................................................... - // gap // ........................................................................................................................................................................................................................ 
- vsub v2, v10, v19 // .......................................................................................................*................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vadd v22, v13, v25 // .............................................................................................................*.......................................................................................................... - vmul v25, v12, v24 // ................................................................................................................................................................*....................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v26, v14, v20 // ..............................................................................................................................................................*......................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v28, v3, v8, 0 // ........................................................................................................................................................*............................................................... - trn2_s v9, v1, v2 // .......................................................................................................................*................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2_s v5, v22, v18 // .........................................................................................................................*.............................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1_s v18, v22, v18 // ........................................................................................................................*............................................................................................... 
- vmlsq v25, v11, v8, 0 // ..................................................................................................................................................................*..................................................... - // gap // ........................................................................................................................................................................................................................ - trn1_s v3, v1, v2 // ......................................................................................................................*................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v4, x14, 1 // .........................................e.............................................................................................................................................................................. - vmul v12, v26, v15 // .....................................................................................................................................................................*.................................................. - // gap // ........................................................................................................................................................................................................................ 
- trn2_d v1, v9, v5 // ...........................................................................................................................*............................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v13, v26, v6 // ......................................................................................................................................................................*................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2_d v10, v3, v18 // ..........................................................................................................................*............................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - trn1_d v22, v3, v18 // ............................................................................................................................*........................................................................................... - vmul v29, v1, v30 // .......................................................................................................................................................................................................*................ - // gap // ........................................................................................................................................................................................................................ - trn1_d v3, v9, v5 // .............................................................................................................................*.......................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v2, v1, v0 // ........................................................................................................................................................................................................*............... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v21, x13, 0 // ............................................................................................................................................e........................................................................... - vqrdmulh v26, v10, v0 // ...................................................................................................................................................................................................*.................... - // gap // ........................................................................................................................................................................................................................ 
- trn1_d v0, v17, v7 // ....................................................................................................................*................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v29, v2, v8, 0 // .........................................................................................................................................................................................................*.............. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmul v1, v10, v30 // ..................................................................................................................................................................................................*..................... - vadd v14, v0, v28 // ..........................................................................................................................................................*............................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v12, v13, v8, 0 // .......................................................................................................................................................................*................................................ 
- vsub v18, v3, v29 // ..........................................................................................................................................................................................................*............. - // gap // ........................................................................................................................................................................................................................ - vadd v13, v3, v29 // ...........................................................................................................................................................................................................*............ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v1, v26, v8, 0 // ....................................................................................................................................................................................................*................... - vsub v26, v14, v25 // ...................................................................................................................................................................*.................................................... - // gap // ........................................................................................................................................................................................................................ 
- vadd v25, v14, v25 // ....................................................................................................................................................................*................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v9, v13, v23 // .............................................................................................................................................................................................................*.......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v24, v0, v28 // .........................................................................................................................................................*.............................................................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vqrdmulh v7, v18, v27 // ..................................................................................................................................................................................................................*..... - // gap // ........................................................................................................................................................................................................................ - vins v6, x17, 0 // ....................................................................................................................................................e................................................................... - vins v0, x9, 0 // ................................................................................................................................................................................e....................................... - ldr x9, [x5, #-8] // ...............................................................................................................................................................................................e........................ - // gap // ........................................................................................................................................................................................................................ - vmul v29, v18, v31 // .................................................................................................................................................................................................................*...... 
- vadd v27, v24, v12 // .........................................................................................................................................................................*.............................................. - // gap // ........................................................................................................................................................................................................................ - vsub v23, v22, v1 // .....................................................................................................................................................................................................*.................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v30, v13, v16 // ............................................................................................................................................................................................................*........... - vadd v18, v22, v1 // ......................................................................................................................................................................................................*................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v29, v7, v8, 0 // ...................................................................................................................................................................................................................*.... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v30, v9, v8, 0 // ..............................................................................................................................................................................................................*......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v11, x25, 0 // ............................................e........................................................................................................................................................................... 
- vsub v17, v23, v29 // ....................................................................................................................................................................................................................*... - ldr x25, [x5, #-104] // ...................................................................................................................................................e.................................................................... - vsub v28, v24, v12 // ........................................................................................................................................................................*............................................... - vadd v16, v23, v29 // .....................................................................................................................................................................................................................*.. - // gap // ........................................................................................................................................................................................................................ - vsub v15, v18, v30 // ...............................................................................................................................................................................................................*........ - vadd v14, v18, v30 // ................................................................................................................................................................................................................*....... - // gap // ........................................................................................................................................................................................................................ 
- vins v30, x23, 0 // ............................................................................................................................................................................e........................................... - vins v9, x0, 0 // ....................................e................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - st4 {v25.4S,v26.4S,v27.4S,v28.4S}, [x1], #64 // ......................................................................................................................................................................................................................*. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - st4 {v14.4S,v15.4S,v16.4S,v17.4S}, [x2], #64 // .......................................................................................................................................................................................................................* - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - - // original source code - // ldr x10, [x1, #64] // ..................................................................................................................................................................................................................................................*...................................................................................................................................................................................... || .............................................................................................................................................................*.................................................................................................................................. - // ldr x11, [x1, #72] // ....................................................................................................................................................................................................................................*.................................................................................................................................................................................................... 
|| ........................................................................................................................................................*....................................................................................................................................... - // vins v9, x10, 0 // ..............................................................................................................................................................................................................................................................*.......................................................................................................................................................................... || .................................................................................................................................................................*.............................................................................................................................. - // vins v9, x11, 1 // ...............................................................................................................................................................................................................................................................................*......................................................................................................................................................... || .......................................................................................................................................................................*........................................................................................................................ 
- // ldr x10, [x1, #80] // .................................................................................................................................................................................................................*....................................................................................................................................................................................................................... || ..................................................................................................................................................*............................................................................................................................................. - // ldr x11, [x1, #88] // ..................................................................................................................................................................................................................................*...................................................................................................................................................................................................... || .......................................................................................................................................................*........................................................................................................................................ - // vins v10, x10, 0 // ...............................................................................................................................................................................................................................*......................................................................................................................................................................................................... 
|| ......................................................................................................................................................*......................................................................................................................................... - // vins v10, x11, 1 // ..........................................................................................................................................................................................................................................................*.............................................................................................................................................................................. || ...............................................................................................................................................................*................................................................................................................................ - // ldr x10, [x1, #96] // ..............................................................................................................................................................................................................................*.......................................................................................................................................................................................................... || ......................................................................................................................................................*......................................................................................................................................... 
- // ldr x11, [x1, #104] // ........................................................................................................................................................................................................................................*................................................................................................................................................................................................ || .........................................................................................................................................................*...................................................................................................................................... - // vins v11, x10, 0 // .........................................................................................................................................................................................................................................*............................................................................................................................................................................................... || ..........................................................................................................................................................*..................................................................................................................................... - // vins v11, x11, 1 // ............................................................................................................................................................................................................................................................*............................................................................................................................................................................ 
|| ................................................................................................................................................................*............................................................................................................................... - // ldr x10, [x1, #112] // .......................................................................................................................................................................................................................*................................................................................................................................................................................................................. || ....................................................................................................................................................*........................................................................................................................................... - // ldr x11, [x1, #120] // .......................................................................................................................................................................................................................................................*................................................................................................................................................................................. || ..............................................................................................................................................................*................................................................................................................................. 
- // vins v12, x10, 0 // .............................................................................................................................................................................................................................................*........................................................................................................................................................................................... || ...........................................................................................................................................................*.................................................................................................................................... - // vins v12, x11, 1 // ...................................................................................................................................................................................................................................................................*..................................................................................................................................................................... || ..................................................................................................................................................................*............................................................................................................................. - // ldr x10, [x2, #64] // ..........................................................................................................................................................................................................................*.............................................................................................................................................................................................................. 
|| .....................................................................................................................................................*.......................................................................................................................................... - // ldr x11, [x2, #72] // ........................................................................................................................................................................................................................................................*................................................................................................................................................................................ || ...............................................................................................................................................................*................................................................................................................................ - // vins v13, x10, 0 // ......................................................................................................................................................................................................................................*.................................................................................................................................................................................................. || .........................................................................................................................................................*...................................................................................................................................... 
- // vins v13, x11, 1 // ....................................................................................................................................................................................................................................................................*.................................................................................................................................................................... || ...................................................................................................................................................................*............................................................................................................................ - // ldr x10, [x2, #80] // ....................................................................................................................................................................................................................*.................................................................................................................................................................................................................... || ...................................................................................................................................................*............................................................................................................................................ - // ldr x11, [x2, #88] // ..........................................................................................................................................................................................................................................*.............................................................................................................................................................................................. 
|| ..........................................................................................................................................................*..................................................................................................................................... - // vins v14, x10, 0 // ...................................................................................................................................................................................................................................*..................................................................................................................................................................................................... || ........................................................................................................................................................*....................................................................................................................................... - // vins v14, x11, 1 // ......................................................................................................................................................................................................................................................*.................................................................................................................................................................................. || ..............................................................................................................................................................*................................................................................................................................. 
- // ldr x10, [x2, #96] // ...........................................................................................................................................................................................................................*............................................................................................................................................................................................................. || .....................................................................................................................................................*.......................................................................................................................................... - // ldr x11, [x2, #104] // ................................................................................................................................................................................................................................................*........................................................................................................................................................................................ || ............................................................................................................................................................*................................................................................................................................... - // vins v15, x10, 0 // .......................................................................................................................................................................................................................................*................................................................................................................................................................................................. 
|| .........................................................................................................................................................*...................................................................................................................................... - // vins v15, x11, 1 // ...........................................................................................................................................................................................................................................................*............................................................................................................................................................................. || ................................................................................................................................................................*............................................................................................................................... - // ldr x10, [x2, #112] // ...................................................................................................................................................................................................................*..................................................................................................................................................................................................................... || ..................................................................................................................................................*............................................................................................................................................. 
- // ldr x11, [x2, #120] // .....................................................................................................................................................................................................................................*................................................................................................................................................................................................... || ........................................................................................................................................................*....................................................................................................................................... - // vins v16, x10, 0 // .............................................................................................................................................................................................................................*........................................................................................................................................................................................................... || ......................................................................................................................................................*......................................................................................................................................... - // vins v16, x11, 1 // .................................................................................................................................................................................................................................................*....................................................................................................................................................................................... 
|| ............................................................................................................................................................*................................................................................................................................... - // add x1, x1, #64 // .......................................................................................................................................................................................................................................................................................*................................................................................................................................................. || .........................................................................................................................................................................*...................................................................................................................... - // add x2, x2, #64 // .................................................................................................................................................................................................................................................................*....................................................................................................................................................................... || ..................................................................................................................................................................*............................................................................................................................. 
- // ldr x10, [x4] , #64 // ....................................................e.................................................................................................................................................................................................................................................................................................................................................................................... || ..................e............................................................................................................................................................................................................................................................................. - // ldr x11, [x4, #-56] // ................................................................................................................................................................................................................................*........................................................................................................................................................................................................ || .......................................................................................................................................................*........................................................................................................................................ - // vins v0, x10, 0 // ..............................................................................................................................................................................................................e.......................................................................................................................................................................................................................... 
|| .........................................................................................................................................e...................................................................................................................................................... - // vins v0, x11, 1 // ..............................................................................................................................................................................................................................................*.......................................................................................................................................................................................... || ...........................................................................................................................................................*.................................................................................................................................... - // ldr x10, [x4, #-48] // e........................................................................................................................................................................................................................................................................................................................................................................................................................................ || e............................................................................................................................................................................................................................................................................................... 
- // ldr x11, [x4, #-40] // ....................e.................................................................................................................................................................................................................................................................................................................................................................................................................... || .......e........................................................................................................................................................................................................................................................................................ - // vins v1, x10, 0 // ...................................................................................................................................................e..................................................................................................................................................................................................................................................................................... || ............................................................................................e................................................................................................................................................................................................... - // vins v1, x11, 1 // ....................................................................................................................................................................e.................................................................................................................................................................................................................................................................... 
|| ........................................................................................................e....................................................................................................................................................................................... - // ldr x10, [x4, #-32] // .............................e........................................................................................................................................................................................................................................................................................................................................................................................................... || ..........e..................................................................................................................................................................................................................................................................................... - // ldr x11, [x4, #-24] // ........................................................................................................................................................................................................................................................................*................................................................................................................................................................ || ....................................................................................................................................................................*........................................................................................................................... 
- // vins v2, x10, 0 // ......................................................................................................................................................................................................e.................................................................................................................................................................................................................................. || ......................................................................................................................................e......................................................................................................................................................... - // vins v2, x11, 1 // ......................................................................................................................................................................................................................................................................................*.................................................................................................................................................. || .........................................................................................................................................................................*...................................................................................................................... - // ldr x10, [x4, #-16] // ..................................................................................e...................................................................................................................................................................................................................................................................................................................................................... 
|| ...............................e................................................................................................................................................................................................................................................................ - // ldr x11, [x4, #-8] // .........................................................................................................................................................................................................................................................*............................................................................................................................................................................... || ...............................................................................................................................................................*................................................................................................................................ - // vins v3, x10, 0 // ...............................................................................................................................................................................................................................................................................................*......................................................................................................................................... || ............................................................................................................................................................................*................................................................................................................... 
- // vins v3, x11, 1 // ........................................................................................................................................................................................................................................................................................................*................................................................................................................................ || ..................................................................................................................................................................................*............................................................................................................. - // vmulq v24, v13, v0, 0 // ................................................................................................................................................................................................................................................................................................................*........................................................................................................................ || ..........................................................................................................................................................................................*..................................................................................................... - // vqrdmulhq v13, v13, v0, 1 // .....................................................................................................................................................................................................................................................................................................................*................................................................................................................... 
|| ................................................................................................................................................................................................*............................................................................................... - // vmlsq v24, v13, v8, 0 // ........................................................................................................................................................................................................................................................................................................................*................................................................................................................ || ....................................................................................................................................................................................................*........................................................................................... - // vsub v13, v9, v24 // ............................................................................................................................................................................................................................................................................................................................................*............................................................................................ || ........................................................................................................................................................................................................................*....................................................................... 
- // vadd v9, v9, v24 // ..........................................................................................................................................................................................................................................................................................................................*.............................................................................................................. || ........................................................................................................................................................................................................*....................................................................................... - // vmulq v24, v14, v0, 0 // ........................................................................................................................................................................................................................................................................................*................................................................................................................................................ || ..........................................................................................................................................................................*..................................................................................................................... - // vqrdmulhq v14, v14, v0, 1 // ...................................................................................................................................................................................................................................................................................................*..................................................................................................................................... 
|| ..............................................................................................................................................................................*................................................................................................................. - // vmlsq v24, v14, v8, 0 // ............................................................................................................................................................................................................................................................................................................*............................................................................................................................ || ......................................................................................................................................................................................*......................................................................................................... - // vsub v14, v10, v24 // ......................................................................................................................................................................................................................................................................................................................................*.................................................................................................. || .....................................................................................................................................................................................................................*.......................................................................... 
- // vadd v10, v10, v24 // ...............................................................................................................................................................................................................................................................................................................*......................................................................................................................... || ..........................................................................................................................................................................................*..................................................................................................... - // vmulq v24, v15, v0, 0 // ..............................................................................................................................................................................................................................................................................................*.......................................................................................................................................... || ............................................................................................................................................................................*................................................................................................................... - // vqrdmulhq v15, v15, v0, 1 // ..............................................................................................................................................................................................................................................................................*.......................................................................................................................................................... 
|| ......................................................................................................................................................................*......................................................................................................................... - // vmlsq v24, v15, v8, 0 // .......................................................................................................................................................................................................................................................................................................*................................................................................................................................. || ................................................................................................................................................................................*............................................................................................................... - // vsub v15, v11, v24 // ....................................................................................................................................................................................................................................................................................................................*.................................................................................................................... || ...............................................................................................................................................................................................*................................................................................................ 
- // vadd v11, v11, v24 // ..............................................................................................................................................................................................................................................................................................................*.......................................................................................................................... || ........................................................................................................................................................................................*....................................................................................................... - // vmulq v24, v16, v0, 0 // .........................................................................................................................................................................................................................................................................*............................................................................................................................................................... || ....................................................................................................................................................................*........................................................................................................................... - // vqrdmulhq v16, v16, v0, 1 // ..................................................................................................................................................................................................................................................................*...................................................................................................................................................................... 
|| ..................................................................................................................................................................*............................................................................................................................. - // vmlsq v24, v16, v8, 0 // ....................................................................................................................................................................................................................................................................................*.................................................................................................................................................... || ........................................................................................................................................................................*....................................................................................................................... - // vsub v16, v12, v24 // .....................................................................................................................................................................................................................................................................................................*................................................................................................................................... || ...............................................................................................................................................................................*................................................................................................................ 
- // vadd v12, v12, v24 // ....................................................................................................................................................................................................................................................................................................*.................................................................................................................................... || ..............................................................................................................................................................................*................................................................................................................. - // vmulq v24, v11, v0, 2 // .................................................................................................................................................................................................................................................................................................................*....................................................................................................................... || ............................................................................................................................................................................................*................................................................................................... - // vqrdmulhq v11, v11, v0, 3 // ..................................................................................................................................................................................................................................................................................................................*...................................................................................................................... 
|| ..............................................................................................................................................................................................*................................................................................................. - // vmlsq v24, v11, v8, 0 // ...........................................................................................................................................................................................................................................................................................................................*............................................................................................................. || ........................................................................................................................................................................................................*....................................................................................... - // vsub v11, v9, v24 // ..............................................................................................................................................................................................................................................................................................................................*.......................................................................................................... || ............................................................................................................................................................................................................*................................................................................... 
- // vadd v9, v9, v24 // ...................................................................................................................................................................................................................................................................................................................................*..................................................................................................... || ..................................................................................................................................................................................................................*............................................................................. - // vmulq v24, v12, v0, 2 // ...........................................................................................................................................................................................................................................................................................................*............................................................................................................................. || ....................................................................................................................................................................................*........................................................................................................... - // vqrdmulhq v12, v12, v0, 3 // .........................................................................................................................................................................................................................................................................................................*............................................................................................................................... 
|| ..................................................................................................................................................................................*............................................................................................................. - // vmlsq v24, v12, v8, 0 // .............................................................................................................................................................................................................................................................................................................*........................................................................................................................... || ........................................................................................................................................................................................*....................................................................................................... - // vsub v12, v10, v24 // ......................................................................................................................................................................................................................................................................................................................*.................................................................................................................. || ..................................................................................................................................................................................................*............................................................................................. 
- // vadd v10, v10, v24 // ...................................................................................................................................................................................................................................................................................................................*..................................................................................................................... || ..............................................................................................................................................................................................*................................................................................................. - // vmulq v24, v15, v1, 0 // .............................................................................................................................................................................................................................................................................................................................................*........................................................................................... || ........................................................................................................................................................................................................................*....................................................................... - // vqrdmulhq v15, v15, v1, 1 // ...............................................................................................................................................................................................................................................................................................................................................*......................................................................................... 
|| ..........................................................................................................................................................................................................................*..................................................................... - // vmlsq v24, v15, v8, 0 // .......................................................................................................................................................................................................................................................................................................................................................*................................................................................. || ..............................................................................................................................................................................................................................*................................................................. - // vsub v15, v13, v24 // .........................................................................................................................................................................................................................................................................................................................................................................*............................................................... || ...............................................................................................................................................................................................................................................*................................................ 
- // vadd v13, v13, v24 // ....................................................................................................................................................................................................................................................................................................................................................................*.................................................................... || ........................................................................................................................................................................................................................................*....................................................... - // vmulq v24, v16, v1, 0 // ............................................................................................................................................................................................................................................................................................................................*............................................................................................................ || ..........................................................................................................................................................................................................*..................................................................................... - // vqrdmulhq v16, v16, v1, 1 // .................................................................................................................................................................................................................................................................................................................................*....................................................................................................... 
|| ................................................................................................................................................................................................................*............................................................................... - // vmlsq v24, v16, v8, 0 // ....................................................................................................................................................................................................................................................................................................................................*.................................................................................................... || ....................................................................................................................................................................................................................*........................................................................... - // vsub v16, v14, v24 // ..................................................................................................................................................................................................................................................................................................................................................*...................................................................................... || ...........................................................................................................................................................................................................................*.................................................................... 
- // vadd v14, v14, v24 // ..............................................................................................................................................................................................................................................................................................................................................*.......................................................................................... || .........................................................................................................................................................................................................................*...................................................................... - // vmulq v24, v10, v1, 2 // .......................................................................................................................................................................................................................................................................................................................*................................................................................................................. || ..................................................................................................................................................................................................*............................................................................................. - // vqrdmulhq v10, v10, v1, 3 // .........................................................................................................................................................................................................................................................................................................................*............................................................................................................... 
|| ......................................................................................................................................................................................................*......................................................................................... - // vmlsq v24, v10, v8, 0 // ........................................................................................................................................................................................................................................................................................................................................*................................................................................................ || ......................................................................................................................................................................................................................*......................................................................... - // vsub v10, v9, v24 // ...................................................................................................................................................................................................................................................................................................................................................*..................................................................................... || ............................................................................................................................................................................................................................*................................................................... 
- // vadd v9, v9, v24 // ................................................................................................................................................................................................................................................................................................................................................*........................................................................................ || ..........................................................................................................................................................................................................................*..................................................................... - // vmulq v24, v12, v2, 0 // .............................................................................................................................................................................................................................................................................................................................*........................................................................................................... || ............................................................................................................................................................................................................*................................................................................... - // vqrdmulhq v12, v12, v2, 1 // ................................................................................................................................................................................................................................................................................................................................*........................................................................................................ 
|| ..............................................................................................................................................................................................................*................................................................................. - // vmlsq v24, v12, v8, 0 // ..................................................................................................................................................................................................................................................................................................................................*...................................................................................................... || ..................................................................................................................................................................................................................*............................................................................. - // vsub v12, v11, v24 // ..........................................................................................................................................................................................................................................................................................................................................*.............................................................................................. || .......................................................................................................................................................................................................................*........................................................................ 
- // vadd v11, v11, v24 // .........................................................................................................................................................................................................................................................................................................................................*............................................................................................... || ......................................................................................................................................................................................................................*......................................................................... - // vmulq v24, v14, v2, 2 // .............................................................................................................................................................................................................................................................................................................................................................*........................................................................... || ....................................................................................................................................................................................................................................*........................................................... - // vqrdmulhq v14, v14, v2, 3 // ....................................................................................................................................................................................................................................................................................................................................................*.................................................................................... 
|| ............................................................................................................................................................................................................................*................................................................... - // vmlsq v24, v14, v8, 0 // ........................................................................................................................................................................................................................................................................................................................................................................*................................................................ || ..............................................................................................................................................................................................................................................*................................................. - // vsub v14, v13, v24 // ..................................................................................................................................................................................................................................................................................................................................................................................*...................................................... || .....................................................................................................................................................................................................................................................*.......................................... 
- // vadd v13, v13, v24 // ..............................................................................................................................................................................................................................................................................................................................................................................*.......................................................... || ..................................................................................................................................................................................................................................................*............................................. - // vmulq v24, v16, v3, 0 // ..........................................................................................................................................................................................................................................................................................................................................................*.............................................................................. || ................................................................................................................................................................................................................................*............................................................... - // vqrdmulhq v16, v16, v3, 1 // ...........................................................................................................................................................................................................................................................................................................................................................*............................................................................. 
|| ..................................................................................................................................................................................................................................*............................................................. - // vmlsq v24, v16, v8, 0 // ..........................................................................................................................................................................................................................................................................................................................................................................*.............................................................. || ................................................................................................................................................................................................................................................*............................................... - // vsub v16, v15, v24 // ................................................................................................................................................................................................................................................................................................................................................................................*........................................................ || ....................................................................................................................................................................................................................................................*........................................... 
- // vadd v15, v15, v24 // ...................................................................................................................................................................................................................................................................................................................................................................................*..................................................... || ......................................................................................................................................................................................................................................................*......................................... - // trn1_s v25, v9, v10 // .........................................................................................................................................................................................................................................................................................................................................................*............................................................................... || ................................................................................................................................................................................................................................*............................................................... - // trn2_s v26, v9, v10 // ........................................................................................................................................................................................................................................................................................................................................................*................................................................................ 
|| ...............................................................................................................................................................................................................................*................................................................ - // trn1_s v27, v11, v12 // .....................................................................................................................................................................................................................................................................................................................................................*................................................................................... || .............................................................................................................................................................................................................................*.................................................................. - // trn2_s v28, v11, v12 // ......................................................................................................................................................................................................................................................................................................................................................*.................................................................................. || ..............................................................................................................................................................................................................................*................................................................. 
- // trn2_d v11, v25, v27 // ..............................................................................................................................................................................................................................................................................................................................................................*.......................................................................... || ....................................................................................................................................................................................................................................*........................................................... - // trn2_d v12, v26, v28 // ............................................................................................................................................................................................................................................................................................................................................................*............................................................................ || ...................................................................................................................................................................................................................................*............................................................ - // trn1_d v9, v25, v27 // .......................................................................................................................................................................................................................................................................................................................................................................................................*................................. 
|| .....................................................................................................................................................................................................................................................................*.......................... - // trn1_d v10, v26, v28 // ...............................................................................................................................................................................................................................................................................................................................................................*......................................................................... || .....................................................................................................................................................................................................................................*.......................................................... - // trn1_s v25, v13, v14 // ...........................................................................................................................................................................................................................................................................................................................................................................................*............................................. || ...........................................................................................................................................................................................................................................................*.................................... 
- // trn2_s v26, v13, v14 // .......................................................................................................................................................................................................................................................................................................................................................................................*................................................. || ........................................................................................................................................................................................................................................................*....................................... - // trn1_s v27, v15, v16 // .........................................................................................................................................................................................................................................................................................................................................................................................*............................................... || ..........................................................................................................................................................................................................................................................*..................................... - // trn2_s v28, v15, v16 // ........................................................................................................................................................................................................................................................................................................................................................................................*................................................ 
|| .........................................................................................................................................................................................................................................................*...................................... - // trn2_d v15, v25, v27 // ................................................................................................................................................................................................................................................................................................................................................................................................*........................................ || ...............................................................................................................................................................................................................................................................*................................ - // trn2_d v16, v26, v28 // ..............................................................................................................................................................................................................................................................................................................................................................................................*.......................................... || .............................................................................................................................................................................................................................................................*.................................. 
- // trn1_d v13, v25, v27 // .................................................................................................................................................................................................................................................................................................................................................................................................*....................................... || ................................................................................................................................................................................................................................................................*............................... - // trn1_d v14, v26, v28 // ...................................................................................................................................................................................................................................................................................................................................................................................................*..................................... || .................................................................................................................................................................................................................................................................*.............................. - // ldr x10, [x5] , #192 // ............................e............................................................................................................................................................................................................................................................................................................................................................................................................ 
|| .........e...................................................................................................................................................................................................................................................................................... - // ldr x11, [x5, #-184] // ...................................................e..................................................................................................................................................................................................................................................................................................................................................................................... || .................e.............................................................................................................................................................................................................................................................................. - // vins v0, x10, 0 // ...................................................................................................................................................................................................................................................*..................................................................................................................................................................................... || .............................................................................................................................................................*.................................................................................................................................. 
- // vins v0, x11, 1 // .......................................................................................................................................................................................................................................................................*................................................................................................................................................................. || ....................................................................................................................................................................*........................................................................................................................... - // ldr x10, [x5, #-176] // .............................................................................................................................................................................................................................................................*........................................................................................................................................................................... || ................................................................................................................................................................*............................................................................................................................... - // ldr x11, [x5, #-168] // ......................................................................................................................................................................................................................................................................*.................................................................................................................................................................. 
|| ...................................................................................................................................................................*............................................................................................................................ - // vins v4, x10, 0 // ...........................................................................................................................................................................................................................................................................................*............................................................................................................................................. || ...........................................................................................................................................................................*.................................................................................................................... - // vins v4, x11, 1 // ...............................................................................................................................................................................................................................................................................................................................*......................................................................................................... || .............................................................................................................................................................................................................*.................................................................................. 
- // ldr x10, [x5, #-160] // ..................................................................e...................................................................................................................................................................................................................................................................................................................................................................... || ......................e......................................................................................................................................................................................................................................................................... - // ldr x11, [x5, #-152] // .........................................................e............................................................................................................................................................................................................................................................................................................................................................................... || ...................e............................................................................................................................................................................................................................................................................ - // vins v1, x10, 0 // ......................................................................................................................................................................................................................*.................................................................................................................................................................................................................. 
|| ...................................................................................................................................................*............................................................................................................................................ - // vins v1, x11, 1 // ...........................................................................................................................................................................................................................................*............................................................................................................................................................................................. || ..........................................................................................................................................................*..................................................................................................................................... - // ldr x10, [x5, #-144] // ........................................................e................................................................................................................................................................................................................................................................................................................................................................................ || ...................e............................................................................................................................................................................................................................................................................ 
- // ldr x11, [x5, #-136] // .....................................................................................................................................................................................................................*................................................................................................................................................................................................................... || ...................................................................................................................................................*............................................................................................................................................ - // vins v5, x10, 0 // .............................................................................................................................................................................e........................................................................................................................................................................................................................................................... || ................................................................................................................e............................................................................................................................................................................... - // vins v5, x11, 1 // .................................................................................................................................................................................................................................*....................................................................................................................................................................................................... 
|| .......................................................................................................................................................*........................................................................................................................................ - // ldr x10, [x5, #-128] // .........................................................................................................................e............................................................................................................................................................................................................................................................................................................... || ......................................................................e......................................................................................................................................................................................................................... - // ldr x11, [x5, #-120] // ..................................................................................................................................................................................................................................................................................................................................................................*...................................................................... || ......................................................................................................................................................................................................................................*......................................................... 
- // vins v2, x10, 0 // ..................................................................................................................................................................................................................*...................................................................................................................................................................................................................... || ..................................................................................................................................................*............................................................................................................................................. - // vins v2, x11, 1 // ...............................................................................................................................................................................................................................................................................................................................................................................*......................................................... || ...................................................................................................................................................................................................................................................*............................................ - // ldr x10, [x5, #-112] // ..........................................................................e.............................................................................................................................................................................................................................................................................................................................................................. 
|| .........................e...................................................................................................................................................................................................................................................................... - // ldr x11, [x5, #-104] // ........................................................................................................................................................................................................e................................................................................................................................................................................................................................ || ......................................................................................................................................e......................................................................................................................................................... - // vins v6, x10, 0 // ............................................................................................................................................................................................e............................................................................................................................................................................................................................................ || ............................................................................................................................e................................................................................................................................................................... 
- // vins v6, x11, 1 // ...............................................................................................................................................................................................................................................*......................................................................................................................................................................................... || ............................................................................................................................................................*................................................................................................................................... - // vmul v24, v11, v0 // .............................................................................................................................................................................................................................................................................................................................................................................*........................................................... || ..................................................................................................................................................................................................................................................*............................................. - // vqrdmulh v11, v11, v4 // .....................................................................................................................................................................................................................................................................................................................................................................*................................................................... 
|| ..........................................................................................................................................................................................................................................*..................................................... - // vmlsq v24, v11, v8, 0 // ......................................................................................................................................................................................................................................................................................................................................................................................*.................................................. || ........................................................................................................................................................................................................................................................*....................................... - // vsub v11, v9, v24 // ..................................................................................................................................................................................................................................................................................................................................................................................................................*...................... || ...............................................................................................................................................................................................................................................................................*................ 
- // vadd v9, v9, v24 // ..........................................................................................................................................................................................................................................................................................................................................................................................................*.............................. || ........................................................................................................................................................................................................................................................................*....................... - // vmul v24, v12, v0 // .................................................................................................................................................................................................................................................................................................................................................................*....................................................................... || ......................................................................................................................................................................................................................................*......................................................... - // vqrdmulh v12, v12, v4 // ...................................................................................................................................................................................................................................................................................................................................................................*..................................................................... 
|| ........................................................................................................................................................................................................................................*....................................................... - // vmlsq v24, v12, v8, 0 // .......................................................................................................................................................................................................................................................................................................................................................................*................................................................. || ............................................................................................................................................................................................................................................*................................................... - // vsub v12, v10, v24 // .....................................................................................................................................................................................................................................................................................................................................................................................*................................................... || .......................................................................................................................................................................................................................................................*........................................ 
- // vadd v10, v10, v24 // ............................................................................................................................................................................................................................................................................................................................................................................*............................................................ || .................................................................................................................................................................................................................................................*.............................................. - // vmul v24, v10, v1 // ....................................................................................................................................................................................................................................................................................................................................................................................*.................................................... || ......................................................................................................................................................................................................................................................*......................................... - // vqrdmulh v10, v10, v5 // .................................................................................................................................................................................................................................................................................................................................................................................*....................................................... 
|| ....................................................................................................................................................................................................................................................*........................................... - // vmlsq v24, v10, v8, 0 // ..........................................................................................................................................................................................................................................................................................................................................................................................*.............................................. || ..........................................................................................................................................................................................................................................................*..................................... - // vsub v10, v9, v24 // ...............................................................................................................................................................................................................................................................................................................................................................................................................*......................... || ............................................................................................................................................................................................................................................................................*................... 
- // vadd v9, v9, v24 // ................................................................................................................................................................................................................................................................................................................................................................................................................*........................ || .............................................................................................................................................................................................................................................................................*.................. - // vmul v24, v12, v2 // .............................................................................................................................................................................................................................................................................................................................................................................................*........................................... || ............................................................................................................................................................................................................................................................*................................... - // vqrdmulh v12, v12, v6 // ...............................................................................................................................................................................................................................................................................................................................................................................................*......................................... 
|| ..............................................................................................................................................................................................................................................................*................................. - // vmlsq v24, v12, v8, 0 // ...........................................................................................................................................................................................................................................................................................................................................................................................................*............................. || ..........................................................................................................................................................................................................................................................................*..................... - // vsub v12, v11, v24 // .................................................................................................................................................................................................................................................................................................................................................................................................................................*....... || ...........................................................................................................................................................................................................................................................................................*.... 
- // vadd v11, v11, v24 // ........................................................................................................................................................................................................................................................................................................................................................................................................................*................ || ..................................................................................................................................................................................................................................................................................*............. - // ldr x10, [x5, #-96] // .................................................................e....................................................................................................................................................................................................................................................................................................................................................................... || ......................e......................................................................................................................................................................................................................................................................... - // ldr x11, [x5, #-88] // .....................................................e................................................................................................................................................................................................................................................................................................................................................................................... 
|| ..................e............................................................................................................................................................................................................................................................................. - // vins v0, x10, 0 // .............................................................................................................................................................................................................e........................................................................................................................................................................................................................... || .........................................................................................................................................e...................................................................................................................................................... - // vins v0, x11, 1 // .........................................................................................................................................................................................................................*............................................................................................................................................................................................................... || ....................................................................................................................................................*........................................................................................................................................... 
- // ldr x10, [x5, #-80] // ...................................................................................................................e..................................................................................................................................................................................................................................................................................................................... || ...................................................................e............................................................................................................................................................................................................................ - // ldr x11, [x5, #-72] // .....................................................................e................................................................................................................................................................................................................................................................................................................................................................... || .......................e........................................................................................................................................................................................................................................................................ - // vins v4, x10, 0 // .............................................................................................................................................................................................e........................................................................................................................................................................................................................................... 
|| .............................................................................................................................e.................................................................................................................................................................. - // vins v4, x11, 1 // ............................................................................................................................................................................................................................*............................................................................................................................................................................................................ || .....................................................................................................................................................*.......................................................................................................................................... - // ldr x10, [x5, #-64] // .....................................................................................................................................................................................................................................................................*................................................................................................................................................................... || ...................................................................................................................................................................*............................................................................................................................ 
- // ldr x11, [x5, #-56] // ...................................................................................................................................................................................................................................................................................*..................................................................................................................................................... || ........................................................................................................................................................................*....................................................................................................................... - // vins v1, x10, 0 // ..................................................................................................................................................................................................................................................................................*...................................................................................................................................................... || ........................................................................................................................................................................*....................................................................................................................... - // vins v1, x11, 1 // ......................................................................................................................................................................................................................................................................................................*.................................................................................................................................. 
|| ................................................................................................................................................................................*............................................................................................................... - // ldr x10, [x5, #-48] // ...............................................................................................................................................................................................................................................................*......................................................................................................................................................................... || .................................................................................................................................................................*.............................................................................................................................. - // ldr x11, [x5, #-40] // ...............................................................................................................e......................................................................................................................................................................................................................................................................................................................... || .................................................................e.............................................................................................................................................................................................................................. 
- // vins v5, x10, 0 // ..........................................................................................................................................................................................................................................................................*.............................................................................................................................................................. || .....................................................................................................................................................................*.......................................................................................................................... - // vins v5, x11, 1 // .....................................................................................................................................................................................................................................................................................................................................*................................................................................................... || ....................................................................................................................................................................................................................*........................................................................... - // ldr x10, [x5, #-32] // .....................................................................................................................................................................................................................................................................................*................................................................................................................................................... 
|| .........................................................................................................................................................................*...................................................................................................................... - // ldr x11, [x5, #-24] // ................................................................................................................................................................................................................................................................................................*........................................................................................................................................ || ............................................................................................................................................................................*................................................................................................................... - // vins v2, x10, 0 // ................................................................................................................................................................................................................................................................................................................................................................*........................................................................ || ......................................................................................................................................................................................................................................*......................................................... 
- // vins v2, x11, 1 // ......................................................................................................................................................................................................................................................................................................................................................................*.................................................................. || ............................................................................................................................................................................................................................................*................................................... - // ldr x10, [x5, #-16] // ....................................................................e.................................................................................................................................................................................................................................................................................................................................................................... || .......................e........................................................................................................................................................................................................................................................................ - // ldr x11, [x5, #-8] // ..............................................................................................................................................................................................e.......................................................................................................................................................................................................................................... 
|| .............................................................................................................................e.................................................................................................................................................................. - // vins v6, x10, 0 // ................................................................................................................................................................................................................................................................*........................................................................................................................................................................ || .................................................................................................................................................................*.............................................................................................................................. - // vins v6, x11, 1 // .................................................................................................................................................................................................................................................................................................*....................................................................................................................................... || .............................................................................................................................................................................*.................................................................................................................. 
- // vmul v24, v15, v0 // .........................................................................................................................................................................................................................................................................................................................................................................................................*............................... || ........................................................................................................................................................................................................................................................................*....................... - // vqrdmulh v15, v15, v4 // ......................................................................................................................................................................................................................................................................................................................................................................................................*.................................. || ....................................................................................................................................................................................................................................................................*........................... - // vmlsq v24, v15, v8, 0 // ..............................................................................................................................................................................................................................................................................................................................................................................................................*.......................... 
|| ............................................................................................................................................................................................................................................................................*................... - // vsub v15, v13, v24 // .........................................................................................................................................................................................................................................................................................................................................................................................................................*............... || ...................................................................................................................................................................................................................................................................................*............ - // vadd v13, v13, v24 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................*............. || ....................................................................................................................................................................................................................................................................................*........... 
- // vmul v24, v16, v0 // ..................................................................................................................................................................................................................................................................................................................................................................................................*...................................... || ................................................................................................................................................................................................................................................................*............................... - // vqrdmulh v16, v16, v4 // ....................................................................................................................................................................................................................................................................................................................................................................................................*.................................... || ..................................................................................................................................................................................................................................................................*............................. - // vmlsq v24, v16, v8, 0 // ........................................................................................................................................................................................................................................................................................................................................................................................................*................................ 
|| ......................................................................................................................................................................................................................................................................*......................... - // vsub v16, v14, v24 // ............................................................................................................................................................................................................................................................................................................................................................................................................*............................ || ..........................................................................................................................................................................................................................................................................*..................... - // vadd v14, v14, v24 // .............................................................................................................................................................................................................................................................................................................................................................................................................*........................... || ...........................................................................................................................................................................................................................................................................*.................... 
- // vmul v24, v14, v1 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................*.............. || ....................................................................................................................................................................................................................................................................................*........... - // vqrdmulh v14, v14, v5 // .................................................................................................................................................................................................................................................................................................................................................................................................................*....................... || ..............................................................................................................................................................................................................................................................................*................. - // vmlsq v24, v14, v8, 0 // .............................................................................................................................................................................................................................................................................................................................................................................................................................*........... 
|| ........................................................................................................................................................................................................................................................................................*....... - // vsub v14, v13, v24 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................*..... || ............................................................................................................................................................................................................................................................................................*... - // vadd v13, v13, v24 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................*.... || ............................................................................................................................................................................................................................................................................................*... 
- // vmul v24, v16, v2 // .......................................................................................................................................................................................................................................................................................................................................................................................................................*................. || ..................................................................................................................................................................................................................................................................................*............. - // vqrdmulh v16, v16, v6 // ...................................................................................................................................................................................................................................................................................................................................................................................................................*..................... || ................................................................................................................................................................................................................................................................................*............... - // vmlsq v24, v16, v8, 0 // ............................................................................................................................................................................................................................................................................................................................................................................................................................*............ 
|| ......................................................................................................................................................................................................................................................................................*......... - // vsub v16, v15, v24 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................*......... || ..........................................................................................................................................................................................................................................................................................*..... - // vadd v15, v15, v24 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................*...... || ...........................................................................................................................................................................................................................................................................................*.... 
- // st4 {v9.4S,v10.4S,v11.4S,v12.4S}, [x1], #64 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................*. || ..............................................................................................................................................................................................................................................................................................*. - // st4 {v13.4S,v14.4S,v15.4S,v16.4S}, [x2], #64 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................* || ...............................................................................................................................................................................................................................................................................................* - - subs count, count, #1 + // Instructions: 144 + // Expected cycles: 126 + // Expected IPC: 1.14 + // + // Wall time: 732.98s + // User time: 732.98s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // 
|------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + ldr q14, [x2, #176] // .......e........................................................................................................................................ + ldr q18, [x4], #64 // ..........e..................................................................................................................................... + mul v25.4S, v23.4S, v24.4S // .....................................................................................................................................*.......... + ldr q0, [x4, #-48] // ...........e.................................................................................................................................... + ldr q10, [x4, #-32] // ............e................................................................................................................................... + sub v16.4S, v19.4S, v11.4S // ..................................................................................................................................*............. + sqrdmulh v12.4S, v23.4S, v22.4S // ....................................................................................................................................*........... + trn1 v11.2D, v9.2D, v2.2D // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ ldr q28, [x2, #160] // ......e......................................................................................................................................... + ldr q17, [x2, #128] // ....e........................................................................................................................................... + ldr q29, [x2, #144] // .....e.......................................................................................................................................... + sqrdmulh v20.4S, v14.4S, v18.S[1] // .............................e.................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v22.4S, v14.4S, v18.S[0] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + ldr q2, [x1, #176] // ...e............................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v29.4S, v18.S[1] // ...................e............................................................................................................................ + sub v1.4S, v11.4S, v3.4S // .............................................................................................................................*.................. + // gap // ................................................................................................................................................ + add v15.4S, v11.4S, v3.4S // ..............................................................................................................................*................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v20.4S, v8.S[0] // ...............................e................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v9.4S, v29.4S, v18.S[0] // ....................e........................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v9.4S, v19.4S, v8.S[0] // .....................e.......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q6, [x1, #144] // .e.............................................................................................................................................. 
+ add v20.4S, v2.4S, v22.4S // .................................e.............................................................................................................. + // gap // ................................................................................................................................................ + mls v25.4S, v12.4S, v8.S[0] // ......................................................................................................................................*......... + trn1 v12.4S, v27.4S, v31.4S // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + ldr q3, [x1, #160] // ..e............................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v20.4S, v18.S[3] // .......................................e........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ add v19.4S, v6.4S, v9.4S // .......................e........................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v23.4S, v20.4S, v18.S[2] // ........................................e....................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v24.2D, v26.2D, v30.2D // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mls v23.4S, v31.4S, v8.S[0] // .........................................e...................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v5.4S, v17.4S, v18.S[1] // ..............e................................................................................................................................. + sub v31.4S, v6.4S, v9.4S // ......................e......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ sqrdmulh v21.4S, v16.4S, v13.4S // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v20.4S, v19.4S, v23.4S // ..........................................e..................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v29.4S, v7.4S, v4.4S // ..........................................................................*..................................................................... + ldr q13, [x5, #160] // ........................................................................................................................*....................... + mul v7.4S, v28.4S, v18.S[0] // .........................e...................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v27.4S, v20.4S, v10.S[0] // ............................................................e................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v9.2D, v29.2D, v12.2D // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v4.4S, v16.4S, v13.4S // ..........................................................................................................................................*..... + add v14.4S, v15.4S, v25.4S // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v15.4S, v15.4S, v25.4S // .......................................................................................................................................*........ + ldr q25, [x5, #16] // ...........................................................................................*.................................................... 
+ mls v4.4S, v21.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v21.2D, v29.2D, v12.2D // ................................................................................*............................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v11.4S, v9.4S, v25.4S // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v13.4S, v17.4S, v18.S[0] // ...............e................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v17.4S, v1.4S, v4.4S // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v18.S[1] // ........................e....................................................................................................................... + // gap // ................................................................................................................................................ + add v16.4S, v1.4S, v4.4S // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v1.2D, v26.2D, v30.2D // ...............................................................................*................................................................ + ldr q6, [x5], #(12*16) // ..........................................................................................*..................................................... + // gap // ................................................................................................................................................ + mls v13.4S, v5.4S, v8.S[0] // ................e............................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v5.4S, v1.4S, v25.4S // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + sub v25.4S, v2.4S, v22.4S // ................................e............................................................................................................... + st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2], #64 // ...............................................................................................................................................* + add x2, x2, #64 // .........e...................................................................................................................................... + // gap // ................................................................................................................................................ + mul v17.4S, v1.4S, v6.4S // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q1, [x5, #-128] // ..............................................................................................*................................................. + // gap // ................................................................................................................................................ + mls v7.4S, v28.4S, v8.S[0] // ..........................e..................................................................................................................... + ldr q28, [x5, #112] // .....................................................................................................................e.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v14.4S, v25.4S, v0.S[1] // .................................................e.............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v15.4S, v25.4S, v0.S[0] // ..................................................e............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v16.4S, v3.4S, v7.4S // ............................e................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v17.4S, v5.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v25.4S, v16.4S, v18.S[3] // ..................................e............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v2.4S, v3.4S, v7.4S // ...........................e.................................................................................................................... + mls v15.4S, v14.4S, v8.S[0] // ...................................................e............................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v4.4S, v16.4S, v18.S[2] // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v4.4S, v25.4S, v8.S[0] // ....................................e........................................................................................................... + ldr q25, [x4, #-16] // .............e.................................................................................................................................. + // gap // ................................................................................................................................................ 
+ sub v30.4S, v31.4S, v15.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v14.4S, v9.4S, v6.4S // .................................................................................................*.............................................. + add v31.4S, v31.4S, v15.4S // .....................................................e.......................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v15.4S, v30.4S, v25.S[1] // .....................................................................e.......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v14.4S, v11.4S, v8.S[0] // ..................................................................................................*............................................. + add v11.4S, v19.4S, v23.4S // ...........................................e.................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v19.4S, v2.4S, v0.S[0] // .............................................e.................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v3.4S, v20.4S, v10.S[1] // ...........................................................e.................................................................................... + ldr q20, [x1, #128] // e............................................................................................................................................... + // gap // ................................................................................................................................................ + add v22.4S, v21.4S, v14.4S // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v29.4S, v30.4S, v25.S[0] // ......................................................................e......................................................................... + sub v30.4S, v21.4S, v14.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v7.4S, v2.4S, v0.S[1] // ............................................e................................................................................................... + add v18.4S, v20.4S, v13.4S // ..................e............................................................................................................................. + ldr q14, [x5, #96] // ....................................................................................................................e........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v25.4S, v31.4S, v10.S[3] // ................................................................e............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v29.4S, v15.4S, v8.S[0] // .......................................................................e........................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v19.4S, v7.4S, v8.S[0] // ..............................................e................................................................................................. + ldr q12, [x5, #-160] // ............................................................................................*................................................... + // gap // ................................................................................................................................................ 
+ sub v7.4S, v20.4S, v13.4S // .................e.............................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v5.4S, v31.4S, v10.S[2] // .................................................................e.............................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v5.4S, v25.4S, v8.S[0] // ..................................................................e............................................................................. + ldr q15, [x5, #-112] // ...............................................................................................*................................................ + // gap // ................................................................................................................................................ 
+ sub v10.4S, v24.4S, v17.4S // ........................................................................................................*....................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v24.4S, v17.4S // .........................................................................................................*...................................... + mls v27.4S, v3.4S, v8.S[0] // .............................................................e.................................................................................. + ldr q21, [x5, #-144] // .............................................................................................*.................................................. + add v13.4S, v7.4S, v19.4S // ................................................e............................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v9.4S, v7.4S, v19.4S // ...............................................e................................................................................................ + sqrdmulh v23.4S, v10.4S, v15.4S // ...............................................................................................................*................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v15.4S, v10.4S, v1.4S // ................................................................................................................*............................... + sub v1.4S, v18.4S, v4.4S // .....................................e.......................................................................................................... + // gap // ................................................................................................................................................ + sub v26.4S, v9.4S, v29.4S // ........................................................................e....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v16.4S, v9.4S, v29.4S // .........................................................................e...................................................................... + sqrdmulh v3.4S, v17.4S, v21.4S // ..........................................................................................................*..................................... 
+ // gap // ................................................................................................................................................ + add v9.4S, v13.4S, v5.4S // ....................................................................e........................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v29.4S, v11.4S, v0.S[3] // ......................................................e......................................................................................... + sub v31.4S, v1.4S, v27.4S // ..............................................................e................................................................................. + // gap // ................................................................................................................................................ + add v27.4S, v1.4S, v27.4S // ...............................................................e................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v24.4S, v17.4S, v12.4S // ...........................................................................................................*.................................... 
+ sub v21.4S, v13.4S, v5.4S // ...................................................................e............................................................................ + // gap // ................................................................................................................................................ + trn1 v2.4S, v16.4S, v26.4S // ....................................................................................e........................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v5.4S, v11.4S, v0.S[2] // .......................................................e........................................................................................ + trn2 v7.4S, v16.4S, v26.4S // .....................................................................................e.......................................................... + // gap // ................................................................................................................................................ + trn2 v13.4S, v9.4S, v21.4S // ...................................................................................e............................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ add v12.4S, v18.4S, v4.4S // ......................................e......................................................................................................... + mls v5.4S, v29.4S, v8.S[0] // ........................................................e....................................................................................... + // gap // ................................................................................................................................................ + trn1 v9.4S, v9.4S, v21.4S // ..................................................................................e............................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v24.4S, v3.4S, v8.S[0] // ............................................................................................................*................................... + trn1 v19.2D, v13.2D, v7.2D // .........................................................................................e...................................................... + // gap // ................................................................................................................................................ + trn2 v13.2D, v13.2D, v7.2D // .......................................................................................e........................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v15.4S, v23.4S, v8.S[0] // .................................................................................................................*.............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v4.4S, v12.4S, v5.4S // .........................................................e...................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v7.4S, v12.4S, v5.4S // ..........................................................e..................................................................................... + sqrdmulh v20.4S, v13.4S, v28.4S // ...............................................................................................................................e................ + // gap // ................................................................................................................................................ + sub v23.4S, v22.4S, v24.4S // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v22.4S, v22.4S, v24.4S // ..............................................................................................................*................................. + trn2 v0.2D, v9.2D, v2.2D // ......................................................................................e......................................................... + // gap // ................................................................................................................................................ + mul v11.4S, v13.4S, v14.4S // ................................................................................................................................e............... + sub v25.4S, v30.4S, v15.4S // ..................................................................................................................*............................. + ldr q13, [x5, #176] // .........................................................................................................................e...................... + add v24.4S, v30.4S, v15.4S // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v11.4S, v20.4S, v8.S[0] // .................................................................................................................................e.............. 
+ trn2 v26.4S, v7.4S, v4.4S // ...........................................................................e.................................................................... + // gap // ................................................................................................................................................ + trn2 v30.4S, v27.4S, v31.4S // .............................................................................e.................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v1.4S, v0.4S, v28.4S // ..........................................................................................................................e..................... + st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ..............................................................................................................................................*. + add x1, x1, #64 // ........e....................................................................................................................................... + ldr q24, [x5, #128] // ......................................................................................................................e......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ mul v3.4S, v0.4S, v14.4S // ...........................................................................................................................e.................... + ldr q22, [x5, #144] // .......................................................................................................................e........................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v23.4S, v19.4S, v11.4S // ...................................................................................................................................e............ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v1.4S, v8.S[0] // ............................................................................................................................e................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + + // ------------------------------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q9, [x1, #(16*0 + (64))] // ...............................................................................e................................................................'..............................................................................~.......................................................... + // ldr q10, [x1, #(16*1 + (64))] // ....................e...........................................................................................................................'...................~..................................................................................................................... 
+ // ldr q11, [x1, #(16*2 + (64))] // ........................e.......................................................................................................................'.......................~................................................................................................................. + // ldr q12, [x1, #(16*3 + (64))] // .............e..................................................................................................................................'............~............................................................................................................................ + // ldr q13, [x2, #(16*0 + (64))] // .........e......................................................................................................................................'........~................................................................................................................................ + // ldr q14, [x2, #(16*1 + (64))] // ..........e.....................................................................................................................................'.........~............................................................................................................................... + // ldr q15, [x2, #(16*2 + (64))] // ........e.......................................................................................................................................'.......~................................................................................................................................. + // ldr q16, [x2, #(16*3 + (64))] // e...............................................................................................................................................~......................................................................................................................................... 
+ // add x1, x1, #64 // ..........................................................................................................................................e.....'......................................................................................................................................... + // add x2, x2, #64 // ........................................................e.......................................................................................'.......................................................~................................................................................. + // ldr q0, [x4], #64 // .e..............................................................................................................................................'~........................................................................................................................................ + // ldr q1, [x4, #(-64 + 16)] // ...e............................................................................................................................................'..~...................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ....e...........................................................................................................................................'...~..................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ......................................................................e.........................................................................'.....................................................................~................................................................... 
+ // sqrdmulh v27.4s, v13.4s, v0.s[1] // ..............................e.................................................................................................................'.............................~........................................................................................................... + // mul v24.4s, v13.4s, v0.s[0] // ..............................................e.................................................................................................'.............................................~........................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ....................................................e...........................................................................................'...................................................~..................................................................................... + // sub v13.4s, v9.4s, v24.4s // ..........................................................................................e.....................................................'.........................................................................................~............................................... + // add v9.4s, v9.4s, v24.4s // ....................................................................................e...........................................................'...................................................................................~..................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..............e.................................................................................................................................'.............~........................................................................................................................... 
+ // mul v24.4s, v14.4s, v0.s[0] // ..................e.............................................................................................................................'.................~....................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...................e............................................................................................................................'..................~...................................................................................................................... + // sub v14.4s, v10.4s, v24.4s // ...............................e................................................................................................................'..............................~.......................................................................................................... + // add v10.4s, v10.4s, v24.4s // ..........................e.....................................................................................................................'.........................~............................................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ................................................e...............................................................................................'...............................................~......................................................................................... + // mul v24.4s, v15.4s, v0.s[0] // ....................................e...........................................................................................................'...................................~..................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................................................e....................................................................................'..........................................................~.............................................................................. + // sub v15.4s, v11.4s, v24.4s // ..................................................................e.............................................................................'.................................................................~....................................................................... + // add v11.4s, v11.4s, v24.4s // ...............................................................e................................................................................'..............................................................~.......................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...........e....................................................................................................................................'..........~.............................................................................................................................. + // mul v24.4s, v16.4s, v0.s[0] // ............e...................................................................................................................................'...........~............................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .................e..............................................................................................................................'................~........................................................................................................................ 
+ // sub v16.4s, v12.4s, v24.4s // ......................................................e.........................................................................................'.....................................................~................................................................................... + // add v12.4s, v12.4s, v24.4s // .....................e..........................................................................................................................'....................~.................................................................................................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // .................................................................e..............................................................................'................................................................~........................................................................ + // mul v24.4s, v11.4s, v0.s[2] // ....................................................................e...........................................................................'...................................................................~..................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................e..........................................................................'....................................................................~.................................................................... + // sub v11.4s, v9.4s, v24.4s // ......................................................................................................e.........................................'.....................................................................................................~................................... 
+ // add v9.4s, v9.4s, v24.4s // ....................................................................................................................e...........................'...................................................................................................................~..................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .........................e......................................................................................................................'........................~................................................................................................................ + // mul v24.4s, v12.4s, v0.s[2] // ...........................e....................................................................................................................'..........................~.............................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // .............................e..................................................................................................................'............................~............................................................................................................ + // sub v12.4s, v10.4s, v24.4s // .................................e..............................................................................................................'................................~........................................................................................................ + // add v10.4s, v10.4s, v24.4s // ............................................................................e...................................................................'...........................................................................~............................................................. 
+ // sqrdmulh v27.4s, v15.4s, v1.s[1] // ...................................................................................e............................................................'..................................................................................~...................................................... + // mul v24.4s, v15.4s, v1.s[0] // .............................................................................e..................................................................'............................................................................~............................................................ + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................................e.......................................................'.......................................................................................~................................................. + // sub v15.4s, v13.4s, v24.4s // ...................................................................................................e............................................'..................................................................................................~...................................... + // add v13.4s, v13.4s, v24.4s // ..................................................................................................e.............................................'.................................................................................................~....................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .............................................................e..................................................................................'............................................................~............................................................................ 
+ // mul v24.4s, v16.4s, v1.s[0] // ..............................................................e.................................................................................'.............................................................~........................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...................................................................e............................................................................'..................................................................~...................................................................... + // sub v16.4s, v14.4s, v24.4s // .......................................................................e........................................................................'......................................................................~.................................................................. + // add v14.4s, v14.4s, v24.4s // .........................................................................e......................................................................'........................................................................~................................................................ + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...........................................................................................................e....................................'..........................................................................................................~.............................. + // mul v24.4s, v10.4s, v1.s[2] // .................................................................................................................e..............................'................................................................................................................~........................ 
+ // mls v24.4s, v27.4s, v8.s[0] // .....................................................................................................................e..........................'....................................................................................................................~.................... + // sub v10.4s, v9.4s, v24.4s // ...........................................................................................................................e....................'..........................................................................................................................~.............. + // add v9.4s, v9.4s, v24.4s // ............................................................................................................................e...................'...........................................................................................................................~............. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ..............................................................................e.................................................................'.............................................................................~........................................................... + // mul v24.4s, v12.4s, v2.s[0] // .....................................e..........................................................................................................'....................................~.................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................................................................................e...............................................'...............................................................................................~......................................... 
+ // sub v12.4s, v11.4s, v24.4s // ............................................................................................................e...................................'...........................................................................................................~............................. + // add v11.4s, v11.4s, v24.4s // .............................................................................................................e..................................'............................................................................................................~............................ + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ......................................................................................e.........................................................'.....................................................................................~................................................... + // mul v24.4s, v14.4s, v2.s[2] // ...........................................................................................e....................................................'..........................................................................................~.............................................. + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................e...................................................'...........................................................................................~............................................. + // sub v14.4s, v13.4s, v24.4s // ...............................................................................................................e................................'..............................................................................................................~.......................... 
+ // add v13.4s, v13.4s, v24.4s // ..........................................................................................................e.....................................'.........................................................................................................~............................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..........................................................................e.....................................................................'.........................................................................~............................................................... + // mul v24.4s, v16.4s, v3.s[0] // .................................................................................e..............................................................'................................................................................~........................................................ + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................e........................................................'......................................................................................~.................................................. + // sub v16.4s, v15.4s, v24.4s // .......................................................................................................e........................................'......................................................................................................~.................................. + // add v15.4s, v15.4s, v24.4s // ........................................................................................................e.......................................'.......................................................................................................~................................. 
+ // trn1 v25.4s, v9.4s, v10.4s // ..................................~.............................................................................................................'.................................*....................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ......................................................................................................................................e.........'.....................................................................................................................................~... + // trn1 v27.4s, v11.4s, v12.4s // .......................~........................................................................................................................'......................*.................................................................................................................. + // trn2 v28.4s, v11.4s, v12.4s // .......................................................................................................................................e........'......................................................................................................................................~.. + // trn2 v11.2d, v25.2d, v27.2d // ......................................~.........................................................................................................'.....................................*................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..................................................~.............................................................................................'.................................................*....................................................................................... 
+ // trn1 v9.2d, v25.2d, v27.2d // ............................................~...................................................................................................'...........................................*............................................................................................. + // trn1 v10.2d, v26.2d, v28.2d // ............................~...................................................................................................................'...........................*............................................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ......................................................................................................................e.........................'.....................................................................................................................~................... + // trn2 v26.4s, v13.4s, v14.4s // ...................................................................................................................e............................'..................................................................................................................~...................... + // trn1 v27.4s, v15.4s, v16.4s // ................................................................................................................e...............................'...............................................................................................................~......................... + // trn2 v28.4s, v15.4s, v16.4s // ..................................................................................................................e.............................'.................................................................................................................~....................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ................................................................................................................................e...............'...............................................................................................................................~......... + // trn2 v16.2d, v26.2d, v28.2d // .........................................................................................................................e......................'........................................................................................................................~................ + // trn1 v13.2d, v25.2d, v27.2d // .......~........................................................................................................................................'......*.................................................................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // ........................................................................................................................e.......................'.......................................................................................................................~................. + // ldr q0, [ x5], #(12*16) // ...................................................~............................................................................................'..................................................*...................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..........................................~.....................................................................................................'.........................................*............................................................................................... 
+ // ldr q1, [ x5, #(-12*16 + 2*16)] // .........................................................................................~......................................................'........................................................................................*................................................ + // ldr q5, [x5, #(-12*16 + 3*16)] // .................................................................................................~..............................................'................................................................................................*........................................ + // ldr q2, [ x5, #(-12*16 + 4*16)] // ..........................................................~.....................................................................................'.........................................................*............................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // .............................................................................................~..................................................'............................................................................................*............................................ + // sqrdmulh v27.4s, v11.4s, v4.4s // .............................................~..................................................................................................'............................................*............................................................................................ + // mul v24.4s, v11.4s, v0.4s // ........................................................................~.......................................................................'.......................................................................*................................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................~....................................................................'..........................................................................*.............................................................. + // sub v11.4s, v9.4s, v24.4s // ..................................................................................~.............................................................'.................................................................................*....................................................... + // add v9.4s, v9.4s, v24.4s // ................................................................................~...............................................................'...............................................................................*......................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // .....................................................~..........................................................................................'....................................................*.................................................................................... + // mul v24.4s, v12.4s, v0.4s // .........................................................~......................................................................................'........................................................*................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ................................................................~...............................................................................'...............................................................*......................................................................... 
+ // sub v12.4s, v10.4s, v24.4s // ..............................................................................................~.................................................'.............................................................................................*........................................... + // add v10.4s, v10.4s, v24.4s // ...............................................................................................~................................................'..............................................................................................*.......................................... + // sqrdmulh v27.4s, v10.4s, v5.4s // .........................................................................................................~......................................'........................................................................................................*................................ + // mul v24.4s, v10.4s, v1.4s // ..............................................................................................................~.................................'.............................................................................................................*........................... + // mls v24.4s, v27.4s, v8.s[0] // .......................................................................................................................~........................'......................................................................................................................*.................. + // sub v10.4s, v9.4s, v24.4s // ..............................................................................................................................~.................'.............................................................................................................................*........... 
+ // add v9.4s, v9.4s, v24.4s // ...............................................................................................................................~................'..............................................................................................................................*.......... + // sqrdmulh v27.4s, v12.4s, v6.4s // ....................................................................................................~...........................................'...................................................................................................*..................................... + // mul v24.4s, v12.4s, v2.4s // .....................................................................................................~..........................................'....................................................................................................*.................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................................................................~.....................'.........................................................................................................................*............... + // sub v12.4s, v11.4s, v24.4s // ..................................................................................................................................~.............'.................................................................................................................................*....... + // add v11.4s, v11.4s, v24.4s // ....................................................................................................................................~...........'...................................................................................................................................*..... 
+ // ldr q0, [ x5, #(-12*16 + 6*16)] // .....................................................................................e..........................................................'....................................................................................~.................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ............................................................e...................................................................................'...........................................................~............................................................................. + // ldr q1, [ x5, #(-12*16 + 8*16)] // ...........................................................................................................................................e....'......................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // .............................................................................................................................................e..'......................................................................................................................................... + // ldr q2, [ x5, #(-12*16 + 10*16)] // ...................................~............................................................................................................'..................................*...................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...................................................................................................................................e............'..................................................................................................................................~...... 
+ // sqrdmulh v27.4s, v15.4s, v4.4s // ........................................................................................................................................e.......'.......................................................................................................................................~. + // mul v24.4s, v15.4s, v0.4s // ............................................................................................................................................e...'......................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................................................................................................e'......................................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ...............~................................................................................................................................'..............*.......................................................................................................................... + // add v13.4s, v13.4s, v24.4s // ................~...............................................................................................................................'...............*......................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v4.4s // .............................................................................................................................e..................'............................................................................................................................~............ 
+ // mul v24.4s, v16.4s, v0.4s // .................................................................................................................................e..............'................................................................................................................................~........ + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................................................................................e..........'....................................................................................................................................~.... + // sub v16.4s, v14.4s, v24.4s // .....~..........................................................................................................................................'....*.................................................................................................................................... + // add v14.4s, v14.4s, v24.4s // ..............................................................................................................................................e.'......................................................................................................................................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ......~.........................................................................................................................................'.....*................................................................................................................................... + // mul v24.4s, v14.4s, v1.4s // ..~.............................................................................................................................................'.*....................................................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ......................~.........................................................................................................................'.....................*................................................................................................................... + // sub v14.4s, v13.4s, v24.4s // .........................................~......................................................................................................'........................................*................................................................................................ + // add v13.4s, v13.4s, v24.4s // ........................................~.......................................................................................................'.......................................*................................................................................................. + // sqrdmulh v27.4s, v16.4s, v6.4s // ................................~...............................................................................................................'...............................*......................................................................................................... + // mul v24.4s, v16.4s, v2.4s // .......................................~........................................................................................................'......................................*.................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................................~....................................................................................................'..........................................*.............................................................................................. 
+ // sub v16.4s, v15.4s, v24.4s // ...............................................~................................................................................................'..............................................*.......................................................................................... + // add v15.4s, v15.4s, v24.4s // .................................................~..............................................................................................'................................................*........................................................................................ + // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .........................................................................................................................................~......'........................................................................................................................................* + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // .......................................................~........................................................................................'......................................................*.................................................................................. + + sub count, count, #1 cbnz count, layer45678_start - ldr x8, [x1, #80] // ....*................................................................................................................................................................................................................... - vins v15, x15, 0 // ................................................................................................................................................*....................................................................... 
- ldr x13, [x2, #112] // ............................*........................................................................................................................................................................................... - ldr x21, [x2, #80] // ....................*................................................................................................................................................................................................... - ldr x18, [x5, #-136] // ...........................................................................................................................................*............................................................................ - vins v24, x28, 0 // ........................................................................................................................................*............................................................................... - ldr x15, [x1, #112] // ............*........................................................................................................................................................................................................... - vins v30, x6, 1 // .............................................................................................................................................................................*.......................................... - ldr x10, [x2, #64] // ................*....................................................................................................................................................................................................... - ldr x29, [x2, #96] // ........................*............................................................................................................................................................................................... 
- vins v0, x24, 1 // .................................................................................................................................................................................*...................................... - vins v10, x13, 0 // ..............................*......................................................................................................................................................................................... - ldr x23, [x1, #96] // ........*............................................................................................................................................................................................................... - vins v2, x8, 0 // ......*................................................................................................................................................................................................................. - ldr x6, [x4, #-56] // ...................................*.................................................................................................................................................................................... - vins v21, x18, 1 // .............................................................................................................................................*.......................................................................... - ldr x0, [x1, #88] // .....*.................................................................................................................................................................................................................. - vins v19, x21, 0 // ......................*................................................................................................................................................................................................. 
- ldr x26, [x1, #72] // .*...................................................................................................................................................................................................................... - ldr x13, [x2, #120] // .............................*.......................................................................................................................................................................................... - vins v25, x10, 0 // ..................*..................................................................................................................................................................................................... - vins v29, x29, 0 // ..........................*............................................................................................................................................................................................. - ldr x28, [x1, #104] // .........*.............................................................................................................................................................................................................. - vins v26, x23, 0 // ..........*............................................................................................................................................................................................................. - ldr x21, [x2, #88] // .....................*.................................................................................................................................................................................................. - vins v24, x11, 1 // .........................................................................................................................................*.............................................................................. 
- vins v14, x15, 0 // ..............*......................................................................................................................................................................................................... - vins v9, x6, 1 // .....................................*.................................................................................................................................................................................. - vins v6, x25, 1 // .....................................................................................................................................................*.................................................................. - ldr x11, [x2, #104] // .........................*.............................................................................................................................................................................................. - vins v10, x13, 1 // ...............................*........................................................................................................................................................................................ - ldr x20, [x1, #64] // *....................................................................................................................................................................................................................... - vins v18, x16, 0 // ................................................................................................................................*....................................................................................... - vins v19, x21, 1 // .......................*................................................................................................................................................................................................ 
- ldr x18, [x1, #120] // .............*.......................................................................................................................................................................................................... - ldr x23, [x2, #72] // .................*...................................................................................................................................................................................................... - ldr x15, [x4, #-8] // ...............................................*........................................................................................................................................................................ - vins v2, x0, 1 // .......*................................................................................................................................................................................................................ - vins v29, x11, 1 // ...........................*............................................................................................................................................................................................ - vins v26, x28, 1 // ...........*............................................................................................................................................................................................................ - ldr x8, [x5, #-176] // ..................................................................................................................................*..................................................................................... - vins v5, x20, 0 // ..*..................................................................................................................................................................................................................... 
- ldr x0, [x5, #-48] // ......................................................................................................................................................................................*................................. - vins v27, x12, 0 // ................................................................................................................................................................................................*....................... - add x2, x2, #64 // .................................*...................................................................................................................................................................................... - vqrdmulhq v7, v10, v9, 1 // ..................................................................*..................................................................................................................................................... - vins v14, x18, 1 // ...............*........................................................................................................................................................................................................ - vins v25, x23, 1 // ...................*.................................................................................................................................................................................................... - ldr x21, [x5, #-64] // ..................................................................................................................................................................................*..................................... - ldr x29, [x5, #-168] // ...................................................................................................................................*.................................................................................... 
- vins v18, x22, 1 // .................................................................................................................................*...................................................................................... - ldr x12, [x4, #-24] // ...........................................*............................................................................................................................................................................ - vmulq v20, v10, v9, 0 // .................................................................*...................................................................................................................................................... - vins v23, x0, 0 // ........................................................................................................................................................................................*............................... - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v22, v29, v9, 1 // .............................................................*.......................................................................................................................................................... - vins v5, x26, 1 // ...*.................................................................................................................................................................................................................... - vins v16, x21, 0 // ....................................................................................................................................................................................*................................... 
- ldr x21, [x5, #-56] // ...................................................................................................................................................................................*.................................... - vmlsq v20, v7, v8, 0 // ...................................................................*.................................................................................................................................................... - ldr x10, [x5, #-32] // ..........................................................................................................................................................................................*............................. - vins v11, x12, 1 // .............................................*.......................................................................................................................................................................... - add x1, x1, #64 // ................................*....................................................................................................................................................................................... - vmulq v13, v19, v9, 0 // .......................................................*................................................................................................................................................................ - vins v28, x8, 0 // ....................................................................................................................................*................................................................................... - vmulq v17, v29, v9, 0 // ............................................................*........................................................................................................................................................... 
- vins v12, x7, 0 // ................................................*....................................................................................................................................................................... - ldr x8, [x5, #-24] // ...........................................................................................................................................................................................*............................ - vins v27, x9, 1 // .................................................................................................................................................................................................*...................... - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v31, v19, v9, 1 // ........................................................*............................................................................................................................................................... - vadd v3, v14, v20 // .....................................................................*.................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - vsub v1, v14, v20 // ....................................................................*................................................................................................................................................... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v16, x21, 1 // .....................................................................................................................................................................................*.................................. - vmlsq v17, v22, v8, 0 // ..............................................................*......................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vins v12, x15, 1 // .................................................*...................................................................................................................................................................... - vqrdmulhq v14, v3, v9, 3 // ............................................................................*........................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v7, v3, v9, 2 // ...........................................................................*............................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v13, v31, v8, 0 // .........................................................*.............................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v7, v14, v8, 0 // .............................................................................*.......................................................................................................................................... - vadd v22, v26, v17 // ................................................................*....................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vadd v3, v2, v13 // ...........................................................*............................................................................................................................................................ 
- vmulq v10, v25, v9, 0 // ..................................................*..................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v19, v22, v9, 2 // ......................................................................*................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v14, v22, v9, 3 // .......................................................................*................................................................................................................................................ - vadd v22, v3, v7 // ...............................................................................*........................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v29, v26, v17 // ...............................................................*........................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v25, v25, v9, 1 // ...................................................*.................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v3, v3, v7 // ..............................................................................*......................................................................................................................................... 
- vmulq v31, v22, v4, 2 // ..........................................................................................*............................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v10, v25, v8, 0 // ....................................................*................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v17, v22, v4, 3 // ...........................................................................................*............................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vadd v26, v5, v10 // ......................................................*................................................................................................................................................................. - vmlsq v19, v14, v8, 0 // ........................................................................*............................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v14, v1, v4, 0 // .....................................................................................*.................................................................................................................................. 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v7, v3, v11, 0 // ...............................................................................................*........................................................................................................................ - vsub v20, v26, v19 // .........................................................................*.............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- vins v28, x29, 1 // .....................................................................................................................................*.................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v3, v3, v11, 1 // ................................................................................................*....................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v22, v1, v4, 1 // ......................................................................................*................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v7, v3, v8, 0 // .................................................................................................*...................................................................................................................... 
- vadd v3, v26, v19 // ..........................................................................*............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v14, v22, v8, 0 // .......................................................................................*................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v23, x27, 1 // .........................................................................................................................................................................................*.............................. 
- vsub v25, v2, v13 // ..........................................................*............................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - vmlsq v31, v17, v8, 0 // ............................................................................................*........................................................................................................................... - vadd v22, v20, v7 // ...................................................................................................*.................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v20, v20, v7 // ..................................................................................................*..................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v1, v5, v10 // .....................................................*.................................................................................................................................................................. 
- vmulq v2, v29, v4, 0 // ................................................................................*....................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - vadd v26, v25, v14 // .........................................................................................*.............................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v10, v29, v4, 1 // .................................................................................*...................................................................................................................................... - vadd v5, v3, v31 // ..............................................................................................*......................................................................................................................... - vsub v19, v25, v14 // ........................................................................................*............................................................................................................................... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v25, v3, v31 // .............................................................................................*.......................................................................................................................... - vqrdmulhq v13, v26, v11, 3 // .....................................................................................................*.................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - trn1_s v7, v22, v20 // ................................................................................................................*....................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- trn2_s v31, v22, v20 // .................................................................................................................*...................................................................................................... - vmlsq v2, v10, v8, 0 // ..................................................................................*..................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - trn2_s v22, v5, v25 // ...............................................................................................................*........................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1_s v17, v5, v25 // ..............................................................................................................*......................................................................................................... - vmulq v25, v19, v12, 0 // .........................................................................................................*.............................................................................................................. 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulhq v5, v19, v12, 1 // ..........................................................................................................*............................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2_d v10, v22, v31 // ...................................................................................................................*.................................................................................................... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmulq v19, v26, v11, 2 // ....................................................................................................*................................................................................................................... - trn2_d v26, v17, v7 // ..................................................................................................................*..................................................................................................... - // gap // ........................................................................................................................................................................................................................ - trn1_d v14, v22, v31 // .....................................................................................................................*.................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vins v31, x10, 0 // ............................................................................................................................................................................................*........................... - vmul v20, v10, v18 // ...........................................................................................................................................................*............................................................ - ldr x10, [x5, #-120] // ...............................................................................................................................................*........................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v9, v10, v28 // ............................................................................................................................................................*........................................................... - vadd v10, v1, v2 // ....................................................................................*................................................................................................................................... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v3, v26, v28 // .......................................................................................................................................................*................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vins v31, x8, 1 // .............................................................................................................................................................................................*.......................... - vmlsq v20, v9, v8, 0 // .............................................................................................................................................................*.......................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vmlsq v19, v13, v8, 0 // ......................................................................................................*................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v13, v1, v2 // ...................................................................................*.................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v25, v5, v8, 0 // ...........................................................................................................*............................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vadd v12, v14, v20 // ...............................................................................................................................................................*........................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v28, v26, v18 // ......................................................................................................................................................*................................................................. - vadd v1, v10, v19 // ........................................................................................................*............................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vins v15, x10, 1 // .................................................................................................................................................*...................................................................... - vsub v18, v13, v25 // ............................................................................................................*........................................................................................................... - vqrdmulh v11, v12, v21 // .................................................................................................................................................................*...................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v2, v10, v19 // .......................................................................................................*................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vadd v22, v13, v25 // .............................................................................................................*.......................................................................................................... 
- vmul v25, v12, v24 // ................................................................................................................................................................*....................................................... - // gap // ........................................................................................................................................................................................................................ - vsub v26, v14, v20 // ..............................................................................................................................................................*......................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v28, v3, v8, 0 // ........................................................................................................................................................*............................................................... - trn2_s v9, v1, v2 // .......................................................................................................................*................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- trn2_s v5, v22, v18 // .........................................................................................................................*.............................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1_s v18, v22, v18 // ........................................................................................................................*............................................................................................... - vmlsq v25, v11, v8, 0 // ..................................................................................................................................................................*..................................................... - // gap // ........................................................................................................................................................................................................................ - trn1_s v3, v1, v2 // ......................................................................................................................*................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmul v12, v26, v15 // .....................................................................................................................................................................*.................................................. - // gap // ........................................................................................................................................................................................................................ - trn2_d v1, v9, v5 // ...........................................................................................................................*............................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v13, v26, v6 // ......................................................................................................................................................................*................................................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - trn2_d v10, v3, v18 // ..........................................................................................................................*............................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1_d v22, v3, v18 // ............................................................................................................................*........................................................................................... - vmul v29, v1, v30 // .......................................................................................................................................................................................................*................ - // gap // ........................................................................................................................................................................................................................ - trn1_d v3, v9, v5 // .............................................................................................................................*.......................................................................................... 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v2, v1, v0 // ........................................................................................................................................................................................................*............... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vqrdmulh v26, v10, v0 // ...................................................................................................................................................................................................*.................... - // gap // ........................................................................................................................................................................................................................ - trn1_d v0, v17, v7 // ....................................................................................................................*................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v29, v2, v8, 0 // .........................................................................................................................................................................................................*.............. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v1, v10, v30 // ..................................................................................................................................................................................................*..................... - vadd v14, v0, v28 // ..........................................................................................................................................................*............................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - vmlsq v12, v13, v8, 0 // .......................................................................................................................................................................*................................................ - vsub v18, v3, v29 // ..........................................................................................................................................................................................................*............. - // gap // ........................................................................................................................................................................................................................ - vadd v13, v3, v29 // ...........................................................................................................................................................................................................*............ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v1, v26, v8, 0 // ....................................................................................................................................................................................................*................... 
- vsub v26, v14, v25 // ...................................................................................................................................................................*.................................................... - // gap // ........................................................................................................................................................................................................................ - vadd v25, v14, v25 // ....................................................................................................................................................................*................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v9, v13, v23 // .............................................................................................................................................................................................................*.......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vsub v24, v0, v28 // .........................................................................................................................................................*.............................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vqrdmulh v7, v18, v27 // ..................................................................................................................................................................................................................*..... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v29, v18, v31 // .................................................................................................................................................................................................................*...... - vadd v27, v24, v12 // .........................................................................................................................................................................*.............................................. 
- // gap // ........................................................................................................................................................................................................................ - vsub v23, v22, v1 // .....................................................................................................................................................................................................*.................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmul v30, v13, v16 // ............................................................................................................................................................................................................*........... - vadd v18, v22, v1 // ......................................................................................................................................................................................................*................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vmlsq v29, v7, v8, 0 // ...................................................................................................................................................................................................................*.... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- vmlsq v30, v9, v8, 0 // ..............................................................................................................................................................................................................*......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - vsub v17, v23, v29 // ....................................................................................................................................................................................................................*... - vsub v28, v24, v12 // ........................................................................................................................................................................*............................................... 
- vadd v16, v23, v29 // .....................................................................................................................................................................................................................*.. - // gap // ........................................................................................................................................................................................................................ - vsub v15, v18, v30 // ...............................................................................................................................................................................................................*........ - vadd v14, v18, v30 // ................................................................................................................................................................................................................*....... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - st4 {v25.4S,v26.4S,v27.4S,v28.4S}, [x1], #64 // ......................................................................................................................................................................................................................*. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - st4 {v14.4S,v15.4S,v16.4S,v17.4S}, [x2], #64 // .......................................................................................................................................................................................................................* - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ + // Instructions: 49 + // Expected cycles: 49 + // Expected IPC: 1.00 + // + // Wall time: 1.21s + // User time: 1.21s + // + // -------------- original position ---------------> + // 0 25 + // |------------------------|----------------------- + mul v20.4S, v23.4S, v24.4S // *................................................ + sub v18.4S, v19.4S, v11.4S // .*............................................... + ldr q19, [x5, #160] // ...........*..................................... + trn1 v11.2D, v9.2D, v2.2D // ...*............................................. 
+ ldr q0, [x5, #16] // ................*................................ + ldr q6, [x5], #(12*16) // .......................*......................... + sqrdmulh v22.4S, v23.4S, v22.4S // ..*.............................................. + trn1 v27.4S, v27.4S, v31.4S // .......*......................................... + ldr q17, [x5, #-128] // ...........................*..................... + trn1 v12.4S, v7.4S, v4.4S // ..........*...................................... + ldr q14, [x5, #-112] // ..................................*.............. + ldr q1, [x5, #-160] // .................................*............... + sqrdmulh v13.4S, v18.4S, v13.4S // .........*....................................... + sub v31.4S, v11.4S, v3.4S // ....*............................................ + ldr q25, [x5, #-144] // .....................................*........... + add v11.4S, v11.4S, v3.4S // .....*........................................... + // gap // ................................................. + // gap // ................................................. + mul v18.4S, v18.4S, v19.4S // .............*................................... + trn1 v19.2D, v26.2D, v30.2D // ........*........................................ + // gap // ................................................. + trn2 v30.2D, v26.2D, v30.2D // ......................*.......................... + // gap // ................................................. + // gap // ................................................. + mls v20.4S, v22.4S, v8.S[0] // ......*.......................................... + trn2 v22.2D, v12.2D, v27.2D // ............*.................................... + // gap // ................................................. + trn1 v27.2D, v12.2D, v27.2D // ..................*.............................. + // gap // ................................................. + // gap // ................................................. 
+ sqrdmulh v12.4S, v30.4S, v0.4S // ........................*........................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v0.4S, v22.4S, v0.4S // ...................*............................. + // gap // ................................................. + // gap // ................................................. + add v10.4S, v11.4S, v20.4S // ..............*.................................. + // gap // ................................................. + // gap // ................................................. + sub v11.4S, v11.4S, v20.4S // ...............*................................. + mul v20.4S, v30.4S, v6.4S // ..........................*...................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v20.4S, v12.4S, v8.S[0] // ............................*.................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v22.4S, v22.4S, v6.4S // .............................*................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + mls v18.4S, v13.4S, v8.S[0] // .................*............................... + // gap // ................................................. + // gap // ................................................. + sub v6.4S, v19.4S, v20.4S // ...................................*............. + // gap // ................................................. + // gap // ................................................. + add v20.4S, v19.4S, v20.4S // ....................................*............ + mls v22.4S, v0.4S, v8.S[0] // ..............................*.................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v30.4S, v6.4S, v17.4S // .......................................*......... + // gap // ................................................. + // gap // ................................................. + sub v13.4S, v31.4S, v18.4S // ....................*............................ + // gap // ................................................. + // gap // ................................................. + sqrdmulh v19.4S, v6.4S, v14.4S // ......................................*.......... + add v12.4S, v31.4S, v18.4S // .....................*........................... + // gap // ................................................. + add v18.4S, v27.4S, v22.4S // ...............................*................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v0.4S, v20.4S, v25.4S // ........................................*........ + sub v22.4S, v27.4S, v22.4S // ................................*................ + // gap // ................................................. 
+ st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x2], #64 // .........................*....................... + // gap // ................................................. + // gap // ................................................. + mul v20.4S, v20.4S, v1.4S // .........................................*....... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v30.4S, v19.4S, v8.S[0] // ...........................................*..... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v20.4S, v0.4S, v8.S[0] // ..........................................*...... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v13.4S, v22.4S, v30.4S // ..............................................*.. + add v12.4S, v22.4S, v30.4S // ...............................................*. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + sub v11.4S, v18.4S, v20.4S // ............................................*.... + add v10.4S, v18.4S, v20.4S // .............................................*... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x1], #64 // ................................................* + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + + // ----------------- new position -----------------> + // 0 25 + // |------------------------|----------------------- + // mul v25.4S, v23.4S, v24.4S // *................................................ + // sub v16.4S, v19.4S, v11.4S // .*............................................... + // sqrdmulh v12.4S, v23.4S, v22.4S // ......*.......................................... + // trn1 v11.2D, v9.2D, v2.2D // ...*............................................. + // sub v1.4S, v11.4S, v3.4S // .............*................................... + // add v15.4S, v11.4S, v3.4S // ...............*................................. + // mls v25.4S, v12.4S, v8.S[0] // ...................*............................. + // trn1 v12.4S, v27.4S, v31.4S // .......*......................................... + // trn1 v24.2D, v26.2D, v30.2D // .................*............................... + // sqrdmulh v21.4S, v16.4S, v13.4S // ............*.................................... + // trn1 v29.4S, v7.4S, v4.4S // .........*....................................... + // ldr q13, [x5, #160] // ..*.............................................. + // trn2 v9.2D, v29.2D, v12.2D // ....................*............................ + // mul v4.4S, v16.4S, v13.4S // ................*................................ + // add v14.4S, v15.4S, v25.4S // ........................*........................ + // sub v15.4S, v15.4S, v25.4S // .........................*....................... + // ldr q25, [x5, #16] // ....*............................................ + // mls v4.4S, v21.4S, v8.S[0] // .............................*................... + // trn1 v21.2D, v29.2D, v12.2D // .....................*........................... + // sqrdmulh v11.4S, v9.4S, v25.4S // .......................*......................... + // sub v17.4S, v1.4S, v4.4S // ..................................*.............. 
+ // add v16.4S, v1.4S, v4.4S // ....................................*............ + // trn2 v1.2D, v26.2D, v30.2D // ..................*.............................. + // ldr q6, [x5], #(12*16) // .....*........................................... + // sqrdmulh v5.4S, v1.4S, v25.4S // ......................*.......................... + // st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2], #64 // ........................................*........ + // mul v17.4S, v1.4S, v6.4S // ..........................*...................... + // ldr q1, [x5, #-128] // ........*........................................ + // mls v17.4S, v5.4S, v8.S[0] // ...........................*..................... + // mul v14.4S, v9.4S, v6.4S // ............................*.................... + // mls v14.4S, v11.4S, v8.S[0] // ................................*................ + // add v22.4S, v21.4S, v14.4S // .....................................*........... + // sub v30.4S, v21.4S, v14.4S // .......................................*......... + // ldr q12, [x5, #-160] // ...........*..................................... + // ldr q15, [x5, #-112] // ..........*...................................... + // sub v10.4S, v24.4S, v17.4S // ..............................*.................. + // add v17.4S, v24.4S, v17.4S // ...............................*................. + // ldr q21, [x5, #-144] // ..............*.................................. + // sqrdmulh v23.4S, v10.4S, v15.4S // ...................................*............. + // mul v15.4S, v10.4S, v1.4S // .................................*............... + // sqrdmulh v3.4S, v17.4S, v21.4S // ......................................*.......... + // mul v24.4S, v17.4S, v12.4S // .........................................*....... + // mls v24.4S, v3.4S, v8.S[0] // ...........................................*..... + // mls v15.4S, v23.4S, v8.S[0] // ..........................................*...... 
+ // sub v23.4S, v22.4S, v24.4S // ..............................................*.. + // add v22.4S, v22.4S, v24.4S // ...............................................*. + // sub v25.4S, v30.4S, v15.4S // ............................................*.... + // add v24.4S, v30.4S, v15.4S // .............................................*... + // st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ................................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm.s new file mode 100644 index 00000000..a6db0e8e --- /dev/null +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm.s @@ -0,0 +1,1976 @@ + +// Needed to provide ASM_LOAD directive +#include + +.macro vins vec_out, gpr_in, lane + ins \vec_out\().d[\lane], \gpr_in +.endm + +xtmp0 .req x10 +xtmp1 .req x11 + +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlsq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmlsq \dst, t2, consts, 0 +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().4s, \a\().4s, tmp.4s + add \a\().4s, \a\().4s, tmp.4s +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().4s, \a\().4s, tmp.4s + add 
\a\().4s, \a\().4s, tmp.4s +.endm + +.macro barrett_reduce_single a // a -= (a rounding-shifted right by 23) * consts lane 0; clobbers tmp + srshr tmp.4S, \a\().4S, #23 + vmlsq \a, tmp, consts, 0 +.endm + +.macro barrett_reduce a0, a1, a2, a3 // applies barrett_reduce_single to each of the four arguments in turn + barrett_reduce_single \a0 + barrett_reduce_single \a1 + barrett_reduce_single \a2 + barrett_reduce_single \a3 +.endm + +.macro load_vectors a0, a1, a2, a3, addr // load four consecutive 16-byte vectors starting at [addr]; addr is not modified + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset // as load_vectors, with an extra byte offset added to every address + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc // store four consecutive 16-byte vectors at [addr] and leave addr advanced by inc + str qform_\a0, [\addr], #\inc // post-indexed: the stores below compensate with -(inc) + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] +.endm + +.macro vec_to_scalar_matrix out, in // extract both 64-bit lanes of the vectors named in0..in3 into the GPR aliases out_RL (R = vector index, L = lane) + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc // store the eight GPR aliases named <x>t_RL at 8-byte steps from [addr] and leave addr advanced by inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane // copy 64-bit lane number lane of vec_in into gpr_out + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 // load root0-root3 from r_ptr0; the first ldr post-increments r_ptr0 by 64 + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] +.endm + +.macro load_roots_456 // body is identical to load_roots_123 + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 +
16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] +.endm + +.macro load_roots_78_part1 // load entries 0-5 of a 12*16-byte block at r_ptr1; the first ldr post-increments r_ptr1 past the whole block + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] +.endm + +.macro load_roots_78_part2 // load entries 6-11 via negative offsets; presumably meant to follow load_roots_78_part1, which advances r_ptr1 (confirm against callers) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 // trn1/trn2 at .4s then at .2d granularity, results written back to data0-data3; clobbers t0-t3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 // only the .4s trn1/trn2 stage, written to separate output registers + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) // reserve six 16-byte slots for the pairs x19-x30 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): exact duplicate of the previous store; redundant and can be dropped when the file is regenerated + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) // release the area reserved by save_gprs +.endm + +.macro save_vregs // @slothy:no-unfold + sub
sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the area reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // scratch area on top of the saved registers, addressed through the save and restore macros +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_dilithium_123_456_78_twiddles.s" +.text + + .global ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm + .global _ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm // must name the underscore-prefixed label defined below, otherwise that entry point is not exported + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 + +ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm: +_ntt_dilithium_123_45678_w_scalar_opt_m1_firestorm: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 +
qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l012) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + + save STACK0, in + mov count, #8 + + load_roots_123 + + .p2align 2 + // Instructions: 33 + // Expected cycles: 16 + // Expected IPC: 2.06 + // + // Wall time: 0.37s + // User time: 0.37s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + ldr q20, [x0, #896] // ..*.............................. + ldr q4, [x0, #512] // *................................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q10, [x0, #768] // .*............................... + ldr q9, [x0, #128] // .......................*......... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ ldr q31, [x0, #640] // ...*............................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v11.4S, v20.4S, v0.S[0] // .....*........................... + sqrdmulh v13.4S, v20.4S, v0.S[1] // .......*......................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v7.4S, v10.4S, v0.S[0] // ......*.......................... + sqrdmulh v6.4S, v10.4S, v0.S[1] // ........*........................ + ldr q10, [x0, #384] // .........*....................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v23.4S, v31.4S, v0.S[0] // ............*.................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ mls v11.4S, v13.4S, v8.S[0] // .............*................... + ldr q13, [x0, #256] // ....*............................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v22.4S, v31.4S, v0.S[1] // ..............*.................. + mls v7.4S, v6.4S, v8.S[0] // ..........*...................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v6.4S, v10.4S, v11.4S // ................*................ + add v30.4S, v10.4S, v11.4S // .................*............... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v23.4S, v22.4S, v8.S[0] // .........................*....... + add v24.4S, v13.4S, v7.4S // ...............*................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ sqrdmulh v16.4S, v6.4S, v1.S[1] // ...................*............. + mul v26.4S, v6.4S, v1.S[0] // .....................*........... + mul v19.4S, v30.4S, v0.S[2] // ....................*............ + sqrdmulh v6.4S, v30.4S, v0.S[3] // ......................*.......... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v12.4S, v13.4S, v7.4S // ..................*.............. + mul v28.4S, v24.4S, v0.S[2] // ........................*........ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v7.4S, v9.4S, v23.4S // ..............................*.. + sub v22.4S, v9.4S, v23.4S // ...............................*. + sqrdmulh v23.4S, v24.4S, v0.S[3] // ..........................*...... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v26.4S, v16.4S, v8.S[0] // .............................*... + mls v19.4S, v6.4S, v8.S[0] // ............................*.... + ldr q6, [x0, #0] // ...........*..................... + mul v29.4S, v12.4S, v1.S[0] // ...........................*..... + mul v16.4S, v4.4S, v0.S[0] // ................................* + // gap // ................................. + // gap // ................................. + // gap // ................................. + + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // ldr q4, [x0, #512] // .*............................... + // ldr q17, [x0, #768] // ..*.............................. 
+ // ldr q31, [x0, #896] // *................................ + // ldr q11, [x0, #640] // ....*............................ + // ldr q30, [x0, #256] // ............*.................... + // mul v26.4S, v31.4S, v0.S[0] // .....*........................... + // mul v19.4S, v17.4S, v0.S[0] // .......*......................... + // sqrdmulh v20.4S, v31.4S, v0.S[1] // ......*.......................... + // sqrdmulh v7.4S, v17.4S, v0.S[1] // ........*........................ + // ldr q12, [x0, #384] // .........*....................... + // mls v19.4S, v7.4S, v8.S[0] // ..............*.................. + // ldr q6, [x0, #0] // ..............................*.. + // mul v16.4S, v11.4S, v0.S[0] // ..........*...................... + // mls v26.4S, v20.4S, v8.S[0] // ...........*..................... + // sqrdmulh v31.4S, v11.4S, v0.S[1] // .............*................... + // add v5.4S, v30.4S, v19.4S // ..................*.............. + // sub v10.4S, v12.4S, v26.4S // ...............*................. + // add v28.4S, v12.4S, v26.4S // ................*................ + // sub v12.4S, v30.4S, v19.4S // .......................*......... + // sqrdmulh v18.4S, v10.4S, v1.S[1] // ...................*............. + // mul v19.4S, v28.4S, v0.S[2] // .....................*........... + // mul v26.4S, v10.4S, v1.S[0] // ....................*............ + // sqrdmulh v25.4S, v28.4S, v0.S[3] // ......................*.......... + // ldr q11, [x0, #128] // ...*............................. + // mul v28.4S, v5.4S, v0.S[2] // ........................*........ + // mls v16.4S, v31.4S, v8.S[0] // .................*............... + // sqrdmulh v23.4S, v5.4S, v0.S[3] // ...........................*..... + // mul v29.4S, v12.4S, v1.S[0] // ...............................*. + // mls v19.4S, v25.4S, v8.S[0] // .............................*... + // mls v26.4S, v18.4S, v8.S[0] // ............................*.... + // add v7.4S, v11.4S, v16.4S // .........................*....... 
+ // sub v22.4S, v11.4S, v16.4S // ..........................*...... + // mul v16.4S, v4.4S, v0.S[0] // ................................* + + sub count, count, #1 +layer123_start: + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Wall time: 38.48s + // User time: 38.48s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v10.4S, v4.4S, v0.S[1] // ........*................................................................... + ldr q4, [x0, #528] // ....e....................................................................... + sqrdmulh v5.4S, v12.4S, v1.S[1] // ......................................*..................................... + ldr q17, [x0, #784] // ......e..................................................................... + ldr q31, [x0, #912] // .......e.................................................................... + mls v28.4S, v23.4S, v8.S[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + ldr q11, [x0, #656] // .....e...................................................................... + ldr q30, [x0, #272] // ..e......................................................................... + add v25.4S, v22.4S, v26.4S // ...............................................*............................ + add v18.4S, v7.4S, v19.4S // .....................................*...................................... + sub v12.4S, v22.4S, v26.4S // ..............................................*............................. + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v24.4S, v7.4S, v19.4S // ....................................*....................................... + sqrdmulh v21.4S, v18.4S, v1.S[3] // ................................................*........................... + mul v15.4S, v12.4S, v3.S[0] // ................................................................*........... + mls v16.4S, v10.4S, v8.S[0] // ..........*................................................................. + sqrdmulh v9.4S, v12.4S, v3.S[1] // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v26.4S, v31.4S, v0.S[0] // ........................e................................................... + mul v19.4S, v17.4S, v0.S[0] // ...................e........................................................ 
+ sqrdmulh v20.4S, v31.4S, v0.S[1] // .......................e.................................................... + sqrdmulh v7.4S, v17.4S, v0.S[1] // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.4S, v24.4S, v2.S[0] // ......................................................*..................... + sqrdmulh v27.4S, v25.4S, v2.S[3] // ..........................................................*................. + sqrdmulh v17.4S, v24.4S, v2.S[1] // .....................................................*...................... + mls v29.4S, v5.4S, v8.S[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v23.4S, v6.4S, v16.4S // ...........*................................................................ + mls v15.4S, v9.4S, v8.S[0] // .................................................................*.......... + mul v22.4S, v18.4S, v1.S[2] // .................................................*.......................... + mul v14.4S, v25.4S, v2.S[2] // ...........................................................*................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + ldr q12, [x0, #400] // ...e........................................................................ + mls v19.4S, v7.4S, v8.S[0] // ....................e....................................................... + add v24.4S, v6.4S, v16.4S // ............*............................................................... + ldr q6, [x0, #16] // e........................................................................... + mul v16.4S, v11.4S, v0.S[0] // ..............e............................................................. + mls v26.4S, v20.4S, v8.S[0] // .........................e.................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.4S, v17.4S, v8.S[0] // .......................................................*.................... + add v20.4S, v23.4S, v29.4S // ..........................................*................................. + sqrdmulh v31.4S, v11.4S, v0.S[1] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v17.4S, v23.4S, v29.4S // .........................................*.................................. + mls v22.4S, v21.4S, v8.S[0] // ..................................................*......................... 
+ add v21.4S, v24.4S, v28.4S // ................................*........................................... + sub v24.4S, v24.4S, v28.4S // ...............................*............................................ + mls v14.4S, v27.4S, v8.S[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v5.4S, v30.4S, v19.4S // ......................e..................................................... + add v9.4S, v17.4S, v15.4S // ...................................................................*........ + sub v10.4S, v12.4S, v26.4S // ..........................e................................................. + add v28.4S, v12.4S, v26.4S // ...........................e................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v17.4S, v17.4S, v15.4S // ..................................................................*......... + sub v7.4S, v24.4S, v13.4S // ........................................................*................... + sub v12.4S, v30.4S, v19.4S // .....................e...................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + add v24.4S, v24.4S, v13.4S // .........................................................*.................. + sqrdmulh v18.4S, v10.4S, v1.S[1] // ...........................................e................................ + mul v19.4S, v28.4S, v0.S[2] // ..................................e......................................... + mul v26.4S, v10.4S, v1.S[0] // ............................................e............................... + sqrdmulh v25.4S, v28.4S, v0.S[3] // .................................e.......................................... + str q9, [x0, #768] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + ldr q11, [x0, #144] // .e.......................................................................... + mul v28.4S, v5.4S, v0.S[2] // .............................e.............................................. + sub v13.4S, v21.4S, v22.4S // ...................................................*........................ + sub v27.4S, v20.4S, v14.4S // .............................................................*.............. + str q17, [x0, #896] // ...........................................................................* + str q7, [x0, #384] // .......................................................................*.... + mls v16.4S, v31.4S, v8.S[0] // ...............e............................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ add v30.4S, v20.4S, v14.4S // ..............................................................*............. + add v31.4S, v21.4S, v22.4S // ....................................................*....................... + str q24, [x0, #256] // ......................................................................*..... + sqrdmulh v23.4S, v5.4S, v0.S[3] // ............................e............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q27, [x0, #640] // .........................................................................*.. + str q13, [x0, #128] // .....................................................................*...... + mul v29.4S, v12.4S, v1.S[0] // .......................................e.................................... + mls v19.4S, v25.4S, v8.S[0] // ...................................e........................................ + mls v26.4S, v18.4S, v8.S[0] // .............................................e.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v7.4S, v11.4S, v16.4S // .................e.......................................................... + sub v22.4S, v11.4S, v16.4S // ................e........................................................... + mul v16.4S, v4.4S, v0.S[0] // .........e.................................................................. 
+ str q31, [x0], #(16) // ....................................................................*....... + str q30, [x0, #496] // ........................................................................*... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q9, [x0, #0] // ..............................e............................................'..............................~............................................ + // ldr q10, [x0, #(1*(1024/8))] // ......................................................e....................'......................................................~.................... + // ldr q11, [x0, #(2*(1024/8))] // ......e....................................................................'......~.................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...........................e...............................................'...........................~............................................... + // ldr q13, [x0, #(4*(1024/8))] // e..........................................................................'~.......................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .....e.....................................................................'.....~..................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // ..e........................................................................'..~........................................................................ + // ldr q16, [x0, #(7*(1024/8))] // ...e.......................................................................'...~....................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ...........................................................................*........................................................................... + // mul v24.4s, v13.4s, v0.s[0] // ........................................................................e..'........................................................................~.. + // mls v24.4s, v27.4s, v8.s[0] // .............~.............................................................'.............*............................................................. + // sub v13.4s, v9.4s, v24.4s // .......................~...................................................'.......................*................................................... + // add v9.4s, v9.4s, v24.4s // .............................~.............................................'.............................*............................................. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...................................e.......................................'...................................~....................................... + // mul v24.4s, v14.4s, v0.s[0] // ...............................e...........................................'...............................~........................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................e..............'............................................................~.............. 
+ // sub v14.4s, v10.4s, v24.4s // .......................................................................e...'.......................................................................~... + // add v10.4s, v10.4s, v24.4s // ......................................................................e....'......................................................................~.... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ..................e........................................................'..................~........................................................ + // mul v24.4s, v15.4s, v0.s[0] // ................e..........................................................'................~.......................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................e..............................................'............................~.............................................. + // sub v15.4s, v11.4s, v24.4s // ...............................................e...........................'...............................................~........................... + // add v11.4s, v11.4s, v24.4s // .........................................e.................................'.........................................~................................. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // .................e.........................................................'.................~......................................................... + // mul v24.4s, v16.4s, v0.s[0] // ...............e...........................................................'...............~........................................................... + // mls v24.4s, v27.4s, v8.s[0] // ................................e..........................................'................................~.......................................... 
+ // sub v16.4s, v12.4s, v24.4s // ...........................................e...............................'...........................................~............................... + // add v12.4s, v12.4s, v24.4s // ............................................e..............................'............................................~.............................. + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ................................................................e..........'................................................................~.......... + // mul v24.4s, v11.4s, v0.s[2] // .......................................................e...................'.......................................................~................... + // mls v24.4s, v27.4s, v8.s[0] // ....~......................................................................'....*...................................................................... + // sub v11.4s, v9.4s, v24.4s // .......................................~...................................'.......................................*................................... + // add v9.4s, v9.4s, v24.4s // ......................................~....................................'......................................*.................................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ....................................................e......................'....................................................~...................... + // mul v24.4s, v12.4s, v0.s[2] // ..................................................e........................'..................................................~........................ + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................e......'....................................................................~...... 
+ // sub v12.4s, v10.4s, v24.4s // ..........~................................................................'..........*................................................................ + // add v10.4s, v10.4s, v24.4s // ........~..................................................................'........*.................................................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // .~.........................................................................'.*......................................................................... + // mul v24.4s, v15.4s, v1.s[0] // ...................................................................e.......'...................................................................~....... + // mls v24.4s, v27.4s, v8.s[0] // ......................~....................................................'......................*.................................................... + // sub v15.4s, v13.4s, v24.4s // ....................................~......................................'....................................*...................................... + // add v13.4s, v13.4s, v24.4s // ..................................~........................................'..................................*........................................ + // sqrdmulh v27.4s, v16.4s, v1.s[1] // .................................................e.........................'.................................................~......................... + // mul v24.4s, v16.4s, v1.s[0] // ...................................................e.......................'...................................................~....................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................e.....'.....................................................................~..... 
+ // sub v16.4s, v14.4s, v24.4s // .........~.................................................................'.........*................................................................. + // add v14.4s, v14.4s, v24.4s // .......~...................................................................'.......*................................................................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ...........~...............................................................'...........*............................................................... + // mul v24.4s, v10.4s, v1.s[2] // .........................~.................................................'.........................*................................................. + // mls v24.4s, v27.4s, v8.s[0] // .....................................~.....................................'.....................................*..................................... + // sub v10.4s, v9.4s, v24.4s // ........................................................~..................'........................................................*.................. + // add v9.4s, v9.4s, v24.4s // ..............................................................~............'..............................................................*............ + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .....................~.....................................................'.....................*..................................................... + // mul v24.4s, v12.4s, v2.s[0] // ...................~.......................................................'...................*....................................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................~.........................................'.................................*......................................... 
+ // sub v12.4s, v11.4s, v24.4s // ..............................................~............................'..............................................*............................ + // add v11.4s, v11.4s, v24.4s // ................................................~..........................'................................................*.......................... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ....................~......................................................'....................*...................................................... + // mul v24.4s, v14.4s, v2.s[2] // ..........................~................................................'..........................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // ........................................~..................................'........................................*.................................. + // sub v14.4s, v13.4s, v24.4s // .........................................................~.................'.........................................................*................. + // add v13.4s, v13.4s, v24.4s // .............................................................~.............'.............................................................*............. + // sqrdmulh v27.4s, v16.4s, v3.s[1] // ..............~............................................................'..............*............................................................ + // mul v24.4s, v16.4s, v3.s[0] // ............~..............................................................'............*.............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ........................~..................................................'........................*.................................................. 
+ // sub v16.4s, v15.4s, v24.4s // .............................................~.............................'.............................................*............................. + // add v15.4s, v15.4s, v24.4s // ..........................................~................................'..........................................*................................ + // str q9, [x0], #(16) // .........................................................................~.'.........................................................................*. + // str q10, [x0, #(-16 + 1*(1024/8))] // ..................................................................~........'..................................................................*........ + // str q11, [x0, #(-16 + 2*(1024/8))] // ...............................................................~...........'...............................................................*........... + // str q12, [x0, #(-16 + 3*(1024/8))] // ...........................................................~...............'...........................................................*............... + // str q13, [x0, #(-16 + 4*(1024/8))] // ..........................................................................~'..........................................................................* + // str q14, [x0, #(-16 + 5*(1024/8))] // .................................................................~.........'.................................................................*......... + // str q15, [x0, #(-16 + 6*(1024/8))] // .....................................................~.....................'.....................................................*..................... + // str q16, [x0, #(-16 + 7*(1024/8))] // ..........................................................~................'..........................................................*................ 
+ + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 43 + // Expected cycles: 16 + // Expected IPC: 2.69 + // + // Wall time: 0.55s + // User time: 0.55s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + sub v14.4S, v7.4S, v19.4S // ......*.................................... + sqrdmulh v4.4S, v4.4S, v0.S[1] // *.......................................... + sqrdmulh v13.4S, v12.4S, v1.S[1] // .*......................................... + mls v28.4S, v23.4S, v8.S[0] // ..*........................................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v19.4S, v7.4S, v19.4S // ....*...................................... + sub v15.4S, v22.4S, v26.4S // .....*..................................... + add v26.4S, v22.4S, v26.4S // ...*....................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v22.4S, v14.4S, v2.S[0] // ...........*............................... + sqrdmulh v14.4S, v14.4S, v2.S[1] // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v16.4S, v4.4S, v8.S[0] // .........*................................. + mls v29.4S, v13.4S, v8.S[0] // ..............*............................ 
+ sqrdmulh v4.4S, v19.4S, v1.S[3] // .......*................................... + mul v13.4S, v15.4S, v3.S[0] // ........*.................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v19.4S, v19.4S, v1.S[2] // .................*......................... + sqrdmulh v15.4S, v15.4S, v3.S[1] // ..........*................................ + sqrdmulh v10.4S, v26.4S, v2.S[3] // ............*.............................. + mul v26.4S, v26.4S, v2.S[2] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v22.4S, v14.4S, v8.S[0] // ....................*...................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v14.4S, v6.4S, v16.4S // ...................*....................... + sub v6.4S, v6.4S, v16.4S // ...............*........................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v19.4S, v4.4S, v8.S[0] // .......................*................... 
+ mls v13.4S, v15.4S, v8.S[0] // ................*.......................... + mls v26.4S, v10.4S, v8.S[0] // ..........................*................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v4.4S, v14.4S, v28.4S // .........................*................. + add v28.4S, v14.4S, v28.4S // ........................*.................. + sub v14.4S, v6.4S, v29.4S // ......................*.................... + add v6.4S, v6.4S, v29.4S // .....................*..................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v29.4S, v4.4S, v22.4S // .............................*............. + add v22.4S, v4.4S, v22.4S // ..............................*............ + add v4.4S, v14.4S, v13.4S // ...........................*............... + sub v13.4S, v14.4S, v13.4S // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ sub v14.4S, v28.4S, v19.4S // ................................*.......... + add v19.4S, v28.4S, v19.4S // .....................................*..... + sub v28.4S, v6.4S, v26.4S // .................................*......... + add v6.4S, v6.4S, v26.4S // ....................................*...... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q4, [x0, #768] // ...............................*........... + str q13, [x0, #896] // ..................................*........ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q29, [x0, #384] // ...................................*....... + str q22, [x0, #256] // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q14, [x0, #128] // ........................................*.. + str q28, [x0, #640] // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ str q19, [x0], #(16) // .........................................*. + str q6, [x0, #496] // ..........................................* + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // sqrdmulh v10.4S, v4.4S, v0.S[1] // .*......................................... + // sqrdmulh v5.4S, v12.4S, v1.S[1] // ..*........................................ + // mls v28.4S, v23.4S, v8.S[0] // ...*....................................... + // add v25.4S, v22.4S, v26.4S // ......*.................................... + // add v18.4S, v7.4S, v19.4S // ....*...................................... + // sub v12.4S, v22.4S, v26.4S // .....*..................................... + // sub v24.4S, v7.4S, v19.4S // *.......................................... + // sqrdmulh v21.4S, v18.4S, v1.S[3] // ...........*............................... + // mul v15.4S, v12.4S, v3.S[0] // ............*.............................. + // mls v16.4S, v10.4S, v8.S[0] // .........*................................. + // sqrdmulh v9.4S, v12.4S, v3.S[1] // ..............*............................ + // mul v13.4S, v24.4S, v2.S[0] // .......*................................... + // sqrdmulh v27.4S, v25.4S, v2.S[3] // ...............*........................... + // sqrdmulh v17.4S, v24.4S, v2.S[1] // ........*.................................. + // mls v29.4S, v5.4S, v8.S[0] // ..........*................................ + // sub v23.4S, v6.4S, v16.4S // ...................*....................... + // mls v15.4S, v9.4S, v8.S[0] // .....................*..................... 
+ // mul v22.4S, v18.4S, v1.S[2] // .............*............................. + // mul v14.4S, v25.4S, v2.S[2] // ................*.......................... + // add v24.4S, v6.4S, v16.4S // ..................*........................ + // mls v13.4S, v17.4S, v8.S[0] // .................*......................... + // add v20.4S, v23.4S, v29.4S // ..........................*................ + // sub v17.4S, v23.4S, v29.4S // .........................*................. + // mls v22.4S, v21.4S, v8.S[0] // ....................*...................... + // add v21.4S, v24.4S, v28.4S // ........................*.................. + // sub v24.4S, v24.4S, v28.4S // .......................*................... + // mls v14.4S, v27.4S, v8.S[0] // ......................*.................... + // add v9.4S, v17.4S, v15.4S // .............................*............. + // sub v17.4S, v17.4S, v15.4S // ..............................*............ + // sub v7.4S, v24.4S, v13.4S // ...........................*............... + // add v24.4S, v24.4S, v13.4S // ............................*.............. + // str q9, [x0, #768] // ...................................*....... + // sub v13.4S, v21.4S, v22.4S // ...............................*........... + // sub v27.4S, v20.4S, v14.4S // .................................*......... + // str q17, [x0, #896] // ....................................*...... + // str q7, [x0, #384] // .....................................*..... + // add v30.4S, v20.4S, v14.4S // ..................................*........ + // add v31.4S, v21.4S, v22.4S // ................................*.......... + // str q24, [x0, #256] // ......................................*.... + // str q27, [x0, #640] // ........................................*.. + // str q13, [x0, #128] // .......................................*... + // str q31, [x0], #(16) // .........................................*. 
+ // str q30, [x0, #496] // ..........................................* + + + restore inp, STACK0 + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + sub inp, inp, #64 + sub inpp, inpp, #64 + + .p2align 2 + // Instructions: 62 + // Expected cycles: 24 + // Expected IPC: 2.58 + // + // Wall time: 2.75s + // User time: 2.75s + // + // --------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|----------- + ldr q14, [x2, #80] // ....*......................................................... + // gap // .............................................................. + ldr q20, [x2, #112] // *............................................................. + ldr q19, [x4], #64 // .*............................................................ + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + ldr q17, [x2, #96] // ...*.......................................................... + // gap // .............................................................. + ldr q1, [x1, #64] // .....................................*........................ + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + ldr q12, [x1, #112] // ..*........................................................... 
+ ldr q16, [x2, #64] // .........*.................................................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + ldr q28, [x4, #-48] // .............*................................................ + // gap // .............................................................. + // gap // .............................................................. + add x2, x2, #64 // .............................................................* + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sqrdmulh v6.4S, v20.4S, v19.S[1] // .....*........................................................ + mul v18.4S, v20.4S, v19.S[0] // .......*...................................................... + ldr q27, [x1, #96] // ..............*............................................... + ldr q4, [x1, #80] // .......................*...................................... + mul v31.4S, v14.4S, v19.S[0] // ..........*................................................... + sqrdmulh v21.4S, v14.4S, v19.S[1] // .................*............................................ + // gap // .............................................................. + // gap // .............................................................. + mul v9.4S, v17.4S, v19.S[0] // ......*....................................................... 
+ sqrdmulh v25.4S, v17.4S, v19.S[1] // ........*..................................................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mul v7.4S, v16.4S, v19.S[0] // .........................*.................................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mls v18.4S, v6.4S, v8.S[0] // ...........*.................................................. + mls v31.4S, v21.4S, v8.S[0] // ........................*..................................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mls v9.4S, v25.4S, v8.S[0] // ............*................................................. + // gap // .............................................................. 
+ // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + ldr q2, [x4, #-16] // .................................*............................ + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + add v30.4S, v12.4S, v18.4S // ................*............................................. + sub v13.4S, v12.4S, v18.4S // ...............*.............................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + add v20.4S, v4.4S, v31.4S // .............................*................................ + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sqrdmulh v11.4S, v16.4S, v19.S[1] // ....................*......................................... + // gap // .............................................................. 
+ // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mul v0.4S, v30.4S, v19.S[2] // ......................*....................................... + sqrdmulh v23.4S, v30.4S, v19.S[3] // .....................*........................................ + // gap // .............................................................. + ldr q15, [x4, #-32] // ................................*............................. + // gap // .............................................................. + sqrdmulh v16.4S, v13.4S, v28.S[1] // ..................*........................................... + mul v25.4S, v13.4S, v28.S[0] // ...................*.......................................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + add v21.4S, v27.4S, v9.4S // .........................................*.................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sub v13.4S, v27.4S, v9.4S // ............................*................................. + sub v30.4S, v4.4S, v31.4S // ..............................*............................... + // gap // .............................................................. 
+ // gap // .............................................................. + // gap // .............................................................. + mls v7.4S, v11.4S, v8.S[0] // ........................................*..................... + // gap // .............................................................. + // gap // .............................................................. + mls v25.4S, v16.4S, v8.S[0] // ..........................*................................... + // gap // .............................................................. + add x1, x1, #64 // ............................................................*. + // gap // .............................................................. + mls v0.4S, v23.4S, v8.S[0] // ...........................*.................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sqrdmulh v26.4S, v13.4S, v28.S[1] // ...............................*.............................. + mul v9.4S, v13.4S, v28.S[0] // ...................................*.......................... + mul v12.4S, v21.4S, v19.S[2] // ............................................*................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sub v31.4S, v1.4S, v7.4S // ..................................................*........... + add v4.4S, v1.4S, v7.4S // ......................................................*....... 
+ sqrdmulh v16.4S, v21.4S, v19.S[3] // ...........................................*.................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sub v18.4S, v30.4S, v25.4S // .......................................*...................... + add v23.4S, v30.4S, v25.4S // ....................................*......................... + sub v21.4S, v20.4S, v0.4S // ..................................*........................... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + add v29.4S, v20.4S, v0.4S // ......................................*....................... + // gap // .............................................................. + mls v9.4S, v26.4S, v8.S[0] // ..............................................*............... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mul v19.4S, v18.4S, v2.S[0] // ..........................................*................... + sqrdmulh v18.4S, v18.4S, v2.S[1] // .............................................*................ 
+ mul v11.4S, v23.4S, v15.S[2] // ...............................................*.............. + sqrdmulh v7.4S, v23.4S, v15.S[3] // ................................................*............. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + mls v12.4S, v16.4S, v8.S[0] // .......................................................*...... + mul v16.4S, v29.4S, v28.S[2] // ...................................................*.......... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + sqrdmulh v22.4S, v29.4S, v28.S[3] // ....................................................*......... + sqrdmulh v10.4S, v21.4S, v15.S[1] // .................................................*............ + mul v0.4S, v21.4S, v15.S[0] // .....................................................*........ + sub v26.4S, v31.4S, v9.4S // ..........................................................*... + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + add v14.4S, v31.4S, v9.4S // .........................................................*.... 
+ mls v19.4S, v18.4S, v8.S[0] // ........................................................*..... + mls v11.4S, v7.4S, v8.S[0] // ...........................................................*.. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + // gap // .............................................................. + + // ----------------------- new position ------------------------> + // 0 25 50 + // |------------------------|------------------------|----------- + // ldr q15, [x2, #112] // .*............................................................ + // ldr q7, [x4], #64 // ..*........................................................... + // ldr q2, [x1, #112] // .....*........................................................ + // ldr q21, [x2, #96] // ...*.......................................................... + // ldr q11, [x2, #80] // *............................................................. + // sqrdmulh v24.4S, v15.4S, v7.S[1] // .........*.................................................... + // mul v9.4S, v21.4S, v7.S[0] // ...............*.............................................. + // mul v19.4S, v15.4S, v7.S[0] // ..........*................................................... + // sqrdmulh v25.4S, v21.4S, v7.S[1] // ................*............................................. + // ldr q3, [x2, #64] // ......*....................................................... + // mul v30.4S, v11.4S, v7.S[0] // .............*................................................ + // mls v19.4S, v24.4S, v8.S[0] // ..................*........................................... + // mls v9.4S, v25.4S, v8.S[0] // ....................*......................................... 
+ // ldr q18, [x4, #-48] // .......*...................................................... + // ldr q12, [x1, #96] // ...........*.................................................. + // sub v28.4S, v2.4S, v19.4S // .......................*...................................... + // add v2.4S, v2.4S, v19.4S // ......................*....................................... + // sqrdmulh v25.4S, v11.4S, v7.S[1] // ..............*............................................... + // sqrdmulh v10.4S, v28.4S, v18.S[1] // .............................*................................ + // mul v28.4S, v28.4S, v18.S[0] // ..............................*............................... + // sqrdmulh v22.4S, v3.4S, v7.S[1] // .........................*.................................... + // sqrdmulh v26.4S, v2.4S, v7.S[3] // ...........................*.................................. + // mul v29.4S, v2.4S, v7.S[2] // ..........................*................................... + // ldr q2, [x1, #80] // ............*................................................. + // mls v30.4S, v25.4S, v8.S[0] // ...................*.......................................... + // mul v3.4S, v3.4S, v7.S[0] // .................*............................................ + // mls v28.4S, v10.4S, v8.S[0] // ...................................*.......................... + // mls v29.4S, v26.4S, v8.S[0] // .....................................*........................ + // sub v14.4S, v12.4S, v9.4S // ................................*............................. + // add v1.4S, v2.4S, v30.4S // ........................*..................................... + // sub v27.4S, v2.4S, v30.4S // .................................*............................ + // sqrdmulh v13.4S, v14.4S, v18.S[1] // ......................................*....................... + // ldr q6, [x4, #-32] // ............................*................................. 
+ // ldr q30, [x4, #-16] // .....................*........................................ + // sub v4.4S, v1.4S, v29.4S // ..............................................*............... + // mul v2.4S, v14.4S, v18.S[0] // .......................................*...................... + // add v0.4S, v27.4S, v28.4S // .............................................*................ + // ldr q14, [x1, #64] // ....*......................................................... + // add v5.4S, v1.4S, v29.4S // ...............................................*.............. + // sub v29.4S, v27.4S, v28.4S // ............................................*................. + // mls v3.4S, v22.4S, v8.S[0] // ..................................*........................... + // add v31.4S, v12.4S, v9.4S // ...............................*.............................. + // mul v19.4S, v29.4S, v30.S[0] // .................................................*............ + // sqrdmulh v24.4S, v31.4S, v7.S[3] // ...........................................*.................. + // mul v12.4S, v31.4S, v7.S[2] // ........................................*..................... + // sqrdmulh v9.4S, v29.4S, v30.S[1] // ..................................................*........... + // mls v2.4S, v13.4S, v8.S[0] // ................................................*............. + // mul v11.4S, v0.4S, v6.S[2] // ...................................................*.......... + // sqrdmulh v13.4S, v0.4S, v6.S[3] // ....................................................*......... + // sqrdmulh v10.4S, v4.4S, v6.S[1] // ........................................................*..... + // sub v28.4S, v14.4S, v3.4S // .........................................*.................... + // mul v16.4S, v5.4S, v18.S[2] // ......................................................*....... + // sqrdmulh v22.4S, v5.4S, v18.S[3] // .......................................................*...... 
+ // mul v0.4S, v4.4S, v6.S[0] // .........................................................*.... + // add v4.4S, v14.4S, v3.4S // ..........................................*................... + // mls v12.4S, v24.4S, v8.S[0] // .....................................................*........ + // mls v19.4S, v9.4S, v8.S[0] // ............................................................*. + // add v14.4S, v28.4S, v2.4S // ...........................................................*.. + // sub v26.4S, v28.4S, v2.4S // ..........................................................*... + // mls v11.4S, v13.4S, v8.S[0] // .............................................................* + // add x1, x1, #64 // ....................................*......................... + // add x2, x2, #64 // ........*..................................................... + + sub count, count, #1 +layer45678_start: + // Instructions: 144 + // Expected cycles: 31 + // Expected IPC: 4.65 + // + // Wall time: 1090.26s + // User time: 1090.26s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ ldr q15, [x2, #176] // .......e........................................................................................................................................ + ldr q7, [x4], #64 // ..........e..................................................................................................................................... + mls v16.4S, v22.4S, v8.S[0] // ........................................................*....................................................................................... + mls v0.4S, v10.4S, v8.S[0] // .............................................................*.................................................................................. + ldr q2, [x1, #176] // ...e............................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q21, [x2, #160] // ......e......................................................................................................................................... + sub v27.4S, v4.4S, v12.4S // .....................................*.......................................................................................................... + add v30.4S, v4.4S, v12.4S // ......................................*......................................................................................................... + sub v9.4S, v26.4S, v19.4S // ........................................................................*....................................................................... 
+ add v29.4S, v26.4S, v19.4S // .........................................................................*...................................................................... + add v28.4S, v14.4S, v11.4S // ....................................................................*........................................................................... + sub v22.4S, v14.4S, v11.4S // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + ldr q11, [x2, #144] // .....e.......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v30.4S, v16.4S // .........................................................*...................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn1 v23.4S, v29.4S, v9.4S // ....................................................................................*........................................................... + add v30.4S, v30.4S, v16.4S // ..........................................................*..................................................................................... + add v4.4S, v27.4S, v0.4S // ...............................................................*................................................................................ + ldr q20, [x5, #96] // ....................................................................................................................*........................... + sub v6.4S, v27.4S, v0.4S // ..............................................................*................................................................................. + sqrdmulh v24.4S, v15.4S, v7.S[1] // .............................e.................................................................................................................. + ldr q10, [x5, #112] // .....................................................................................................................*.......................... + trn2 v27.4S, v28.4S, v22.4S // ...................................................................................*............................................................ + // gap // ................................................................................................................................................ + trn2 v16.4S, v29.4S, v9.4S // .....................................................................................*.......................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v17.4S, v30.4S, v26.4S // ...........................................................................*.................................................................... + mul v9.4S, v21.4S, v7.S[0] // .........................e...................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v19.4S, v15.4S, v7.S[0] // ..............................e................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v25.4S, v21.4S, v7.S[1] // ........................e....................................................................................................................... + trn1 v5.4S, v4.4S, v6.4S // ............................................................................*................................................................... 
+ trn2 v31.4S, v4.4S, v6.4S // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q21, [x5], #(12*16) // ..........................................................................................*..................................................... + trn2 v1.2D, v27.2D, v16.2D // .......................................................................................*........................................................ + trn1 v6.4S, v30.4S, v26.4S // ..........................................................................*..................................................................... + ldr q4, [x5, #-176] // ...........................................................................................*.................................................... + trn1 v16.2D, v27.2D, v16.2D // .........................................................................................*...................................................... + ldr q3, [x2, #128] // ....e........................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v30.4S, v11.4S, v7.S[0] // ....................e........................................................................................................................... + trn1 v13.4S, v28.4S, v22.4S // ..................................................................................*............................................................. + trn2 v26.2D, v6.2D, v5.2D // ..............................................................................*................................................................. + mls v19.4S, v24.4S, v8.S[0] // ...............................e................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v1.4S, v10.4S // ...............................................................................................................................*................ + mul v15.4S, v1.4S, v20.4S // ................................................................................................................................*............... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn2 v14.2D, v13.2D, v23.2D // ......................................................................................*......................................................... + trn1 v13.2D, v13.2D, v23.2D // ........................................................................................*....................................................... + mls v9.4S, v25.4S, v8.S[0] // ..........................e..................................................................................................................... + // gap // ................................................................................................................................................ + trn2 v29.2D, v17.2D, v31.2D // ...............................................................................*................................................................ + ldr q18, [x4, #-48] // ...........e.................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v23.4S, v26.4S, v21.4S // .................................................................................................*.............................................. + trn1 v27.2D, v17.2D, v31.2D // .................................................................................*.............................................................. 
+ trn1 v0.2D, v6.2D, v5.2D // ................................................................................*............................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v31.4S, v26.4S, v4.4S // ................................................................................................*............................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v29.4S, v21.4S // ......................................................................................................*......................................... + ldr q12, [x1, #160] // ..e............................................................................................................................................. + sqrdmulh v29.4S, v29.4S, v4.4S // .....................................................................................................*.......................................... 
+ mls v15.4S, v22.4S, v8.S[0] // .................................................................................................................................*.............. + sub v28.4S, v2.4S, v19.4S // ................................e............................................................................................................... + ldr q5, [x5, #-144] // .............................................................................................*.................................................. + add v2.4S, v2.4S, v19.4S // .................................e.............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v20.4S, v14.4S, v20.4S // ...........................................................................................................................*.................... + sqrdmulh v25.4S, v11.4S, v7.S[1] // ...................e............................................................................................................................ + ldr q24, [x5, #-48] // .......................................................................................................................*........................ + sqrdmulh v11.4S, v14.4S, v10.4S // ..........................................................................................................................*..................... + ldr q14, [x5, #-64] // ......................................................................................................................*......................... 
+ sqrdmulh v10.4S, v28.4S, v18.S[1] // .................................................e.............................................................................................. + // gap // ................................................................................................................................................ + mls v23.4S, v31.4S, v8.S[0] // ..................................................................................................*............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q6, [x5, #-32] // ........................................................................................................................*....................... + mul v28.4S, v28.4S, v18.S[0] // ..................................................e............................................................................................. + sqrdmulh v22.4S, v3.4S, v7.S[1] // ..............e................................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v26.4S, v2.4S, v7.S[3] // .......................................e........................................................................................................ 
+ mls v1.4S, v29.4S, v8.S[0] // .......................................................................................................*........................................ + ldr q21, [x5, #-16] // .........................................................................................................................*...................... + mul v29.4S, v2.4S, v7.S[2] // ........................................e....................................................................................................... + add v19.4S, v16.4S, v15.4S // ...................................................................................................................................*............ + ldr q2, [x1, #144] // .e.............................................................................................................................................. + mls v20.4S, v11.4S, v8.S[0] // ............................................................................................................................*................... + ldr q11, [x5, #-160] // ............................................................................................*................................................... + // gap // ................................................................................................................................................ + mls v30.4S, v25.4S, v8.S[0] // .....................e.......................................................................................................................... + ldr q25, [x5, #-112] // ...............................................................................................*................................................ + sub v15.4S, v16.4S, v15.4S // ..................................................................................................................................*............. 
+ mul v3.4S, v3.4S, v7.S[0] // ...............e................................................................................................................................ + ldr q31, [x5, #-128] // ..............................................................................................*................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // ...................................................e............................................................................................ + sqrdmulh v4.4S, v19.4S, v24.4S // ....................................................................................................................................*........... + add v17.4S, v0.4S, v23.4S // ....................................................................................................*........................................... + mul v24.4S, v19.4S, v14.4S // .....................................................................................................................................*.......... + mul v19.4S, v15.4S, v6.4S // ..........................................................................................................................................*..... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v6.4S, v27.4S, v1.4S // ........................................................................................................*....................................... + mls v29.4S, v26.4S, v8.S[0] // .........................................e...................................................................................................... + add v16.4S, v27.4S, v1.4S // .........................................................................................................*...................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v14.4S, v12.4S, v9.4S // ...........................e.................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ sqrdmulh v26.4S, v15.4S, v21.4S // .........................................................................................................................................*...... + add v1.4S, v2.4S, v30.4S // .......................e........................................................................................................................ + add v10.4S, v13.4S, v20.4S // ..............................................................................................................................*................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v15.4S, v16.4S, v11.4S // ...........................................................................................................*.................................... + mul v11.4S, v6.4S, v31.4S // ................................................................................................................*............................... + sqrdmulh v31.4S, v6.4S, v25.4S // ...............................................................................................................*................................ + sqrdmulh v21.4S, v16.4S, v5.4S // ..........................................................................................................*..................................... 
+ // gap // ................................................................................................................................................ + sub v27.4S, v2.4S, v30.4S // ......................e......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v25.4S, v13.4S, v20.4S // .............................................................................................................................*.................. + sqrdmulh v13.4S, v14.4S, v18.S[1] // ............................................e................................................................................................... + ldr q6, [x4, #-32] // ............e................................................................................................................................... + mls v24.4S, v4.4S, v8.S[0] // ......................................................................................................................................*......... + sub v16.4S, v0.4S, v23.4S // ...................................................................................................*............................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + ldr q30, [x4, #-16] // .............e.................................................................................................................................. + sub v4.4S, v1.4S, v29.4S // ..........................................e..................................................................................................... + mul v2.4S, v14.4S, v18.S[0] // .............................................e.................................................................................................. + mls v19.4S, v26.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v0.4S, v27.4S, v28.4S // .....................................................e.......................................................................................... + // gap // ................................................................................................................................................ + ldr q14, [x1, #128] // e............................................................................................................................................... + mls v11.4S, v31.4S, v8.S[0] // .................................................................................................................*.............................. 
+ mls v15.4S, v21.4S, v8.S[0] // ............................................................................................................*................................... + add v5.4S, v1.4S, v29.4S // ...........................................e.................................................................................................... + sub v29.4S, v27.4S, v28.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v22.4S, v8.S[0] // ................e............................................................................................................................... + add v20.4S, v10.4S, v24.4S // ........................................................................................................................................*....... + sub v21.4S, v10.4S, v24.4S // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ 
+ add v22.4S, v25.4S, v19.4S // .............................................................................................................................................*.. + sub v23.4S, v25.4S, v19.4S // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v31.4S, v12.4S, v9.4S // ............................e................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v26.4S, v17.4S, v15.4S // .............................................................................................................*.................................. 
+ sub v28.4S, v16.4S, v11.4S // ..................................................................................................................*............................. + add v25.4S, v17.4S, v15.4S // ..............................................................................................................*................................. + add v27.4S, v16.4S, v11.4S // ...................................................................................................................*............................ + mul v19.4S, v29.4S, v30.S[0] // ......................................................................e......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v31.4S, v7.S[3] // ..................................e............................................................................................................. + mul v12.4S, v31.4S, v7.S[2] // ...................................e............................................................................................................ + sqrdmulh v9.4S, v29.4S, v30.S[1] // .....................................................................e.......................................................................... 
+ mls v2.4S, v13.4S, v8.S[0] // ..............................................e................................................................................................. + mul v11.4S, v0.4S, v6.S[2] // .................................................................e.............................................................................. + sqrdmulh v13.4S, v0.4S, v6.S[3] // ................................................................e............................................................................... + sqrdmulh v10.4S, v4.4S, v6.S[1] // ...........................................................e.................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ..............................................................................................................................................*. + st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x2], #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + sub v28.4S, v14.4S, v3.4S // .................e.............................................................................................................................. + // gap // ................................................................................................................................................ 
+ mul v16.4S, v5.4S, v18.S[2] // .......................................................e........................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v5.4S, v18.S[3] // ......................................................e......................................................................................... + // gap // ................................................................................................................................................ + mul v0.4S, v4.4S, v6.S[0] // ............................................................e................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v4.4S, v14.4S, v3.4S // ..................e............................................................................................................................. + mls v12.4S, v24.4S, v8.S[0] // ....................................e........................................................................................................... 
+ mls v19.4S, v9.4S, v8.S[0] // .......................................................................e........................................................................ + add v14.4S, v28.4S, v2.4S // ................................................e............................................................................................... + sub v26.4S, v28.4S, v2.4S // ...............................................e................................................................................................ + mls v11.4S, v13.4S, v8.S[0] // ..................................................................e............................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add x1, x1, #64 // ........e....................................................................................................................................... + add x2, x2, #64 // .........e...................................................................................................................................... 
+ + // ---------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q9, [x1, #(16*0 + (64))] // ...........................................................................................................e....................................'..........................................................................................................~........................ + // ldr q10, [x1, #(16*1 + (64))] // ........................................................................e.......................................................................'.......................................................................~........................................................... + // ldr q11, [x1, #(16*2 + (64))] // ...................................................e............................................................................................'..................................................~................................................................................ + // ldr q12, [x1, #(16*3 + (64))] // ....e...........................................................................................................................................'...~............................................................................................................................... 
+ // ldr q13, [x2, #(16*0 + (64))] // ..................................e.............................................................................................................'.................................~................................................................................................. + // ldr q14, [x2, #(16*1 + (64))] // ............e...................................................................................................................................'...........~....................................................................................................................... + // ldr q15, [x2, #(16*2 + (64))] // .....e..........................................................................................................................................'....~.............................................................................................................................. + // ldr q16, [x2, #(16*3 + (64))] // e...............................................................................................................................................~................................................................................................................................... + // add x1, x1, #64 // ..............................................................................................................................................e.'................................................................................................................................... + // add x2, x2, #64 // ...............................................................................................................................................e'................................................................................................................................... 
+ // ldr q0, [x4], #64 // .e..............................................................................................................................................'~.................................................................................................................................. + // ldr q1, [x4, #(-64 + 16)] // .............................................e..................................................................................................'............................................~...................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ...................................................................................................e............................................'..................................................................................................~................................ + // ldr q3, [x4, #(-64 + 48)] // ......................................................................................................e.........................................'.....................................................................................................~............................. + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ..................................................................e.............................................................................'.................................................................~................................................................. + // mul v24.4s, v13.4s, v0.s[0] // ..............................................................................e.................................................................'.............................................................................~..................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ................................................................................................................e...............................'...............................................................................................................~................... + // sub v13.4s, v9.4s, v24.4s // ....................................................................................................................................e...........'................................................................................................................................... + // add v9.4s, v9.4s, v24.4s // ........................................................................................................................................e.......'................................................................................................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ..........................................................e.....................................................................................'.........................................................~......................................................................... + // mul v24.4s, v14.4s, v0.s[0] // ...................................e............................................................................................................'..................................~................................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................e....................................................................'..........................................................................~........................................................ 
+ // sub v14.4s, v10.4s, v24.4s // ................................................................................................e...............................................'...............................................................................................~................................... + // add v10.4s, v10.4s, v24.4s // ..........................................................................................e.....................................................'.........................................................................................~......................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ..........................e.....................................................................................................................'.........................~......................................................................................................... + // mul v24.4s, v15.4s, v0.s[0] // ........................e.......................................................................................................................'.......................~........................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................e....................................................................................................'..........................................~........................................................................................ + // sub v15.4s, v11.4s, v24.4s // ........................................................................................e.......................................................'.......................................................................................~........................................... 
+ // add v11.4s, v11.4s, v24.4s // .....................................................................................................................e..........................'....................................................................................................................~.............. + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...................e............................................................................................................................'..................~................................................................................................................ + // mul v24.4s, v16.4s, v0.s[0] // .........................e......................................................................................................................'........................~.......................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ......................................e.........................................................................................................'.....................................~............................................................................................. + // sub v16.4s, v12.4s, v24.4s // ......................................................e.........................................................................................'.....................................................~............................................................................. + // add v12.4s, v12.4s, v24.4s // ........................................................e.......................................................................................'.......................................................~........................................................................... 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // ...........................................................................................................................e....................'..........................................................................................................................~........ + // mul v24.4s, v11.4s, v0.s[2] // ............................................................................................................................e...................'...........................................................................................................................~....... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................................................................................................e......'................................................................................................................................... + // sub v11.4s, v9.4s, v24.4s // ......~.........................................................................................................................................'.....*............................................................................................................................. + // add v9.4s, v9.4s, v24.4s // .......~........................................................................................................................................'......*............................................................................................................................ + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ...................................................................e............................................................................'..................................................................~................................................................ 
+ // mul v24.4s, v12.4s, v0.s[2] // ......................................................................e.........................................................................'.....................................................................~............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................................................................e.........................................................'.....................................................................................~............................................. + // sub v12.4s, v10.4s, v24.4s // .......................................................................................................e........................................'......................................................................................................~............................ + // add v10.4s, v10.4s, v24.4s // ..............................................................................................................e.................................'.............................................................................................................~..................... + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ..................................................................................................e.............................................'.................................................................................................~................................. + // mul v24.4s, v15.4s, v1.s[0] // ........................................................................................................e.......................................'.......................................................................................................~........................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................e.................'.............................................................................................................................~..... + // sub v15.4s, v13.4s, v24.4s // ............................................................................................................................................e...'................................................................................................................................... + // add v13.4s, v13.4s, v24.4s // ...........................................................................................................................................e....'................................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ..............................................................e.................................................................................'.............................................................~..................................................................... + // mul v24.4s, v16.4s, v1.s[0] // .................................................................e..............................................................................'................................................................~.................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ................................................................................e...............................................................'...............................................................................~................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ...............................................................................................................e................................'..............................................................................................................~.................... + // add v14.4s, v14.4s, v24.4s // ..........................................................................................................e.....................................'.........................................................................................................~......................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ......................................................................................................................................e.........'................................................................................................................................... + // mul v24.4s, v10.4s, v1.s[2] // .....................................................................................................................................e..........'................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..~.............................................................................................................................................'.*................................................................................................................................. + // sub v10.4s, v9.4s, v24.4s // .............~..................................................................................................................................'............*...................................................................................................................... 
+ // add v9.4s, v9.4s, v24.4s // ...............~................................................................................................................................'..............*.................................................................................................................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // .................................................................................................................................e..............'................................................................................................................................~.. + // mul v24.4s, v12.4s, v2.s[0] // .......................................................................................................................................e........'................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...~............................................................................................................................................'..*................................................................................................................................ + // sub v12.4s, v11.4s, v24.4s // ..................~.............................................................................................................................'.................*................................................................................................................. + // add v11.4s, v11.4s, v24.4s // ................~...............................................................................................................................'...............*................................................................................................................... 
+ // sqrdmulh v27.4s, v14.4s, v2.s[3] // ................................................................................................................................e...............'...............................................................................................................................~... + // mul v24.4s, v14.4s, v2.s[2] // ...............................................................................................................................e................'..............................................................................................................................~.... + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................................................e..'................................................................................................................................... + // sub v14.4s, v13.4s, v24.4s // ...........~....................................................................................................................................'..........*........................................................................................................................ + // add v13.4s, v13.4s, v24.4s // ..........~.....................................................................................................................................'.........*......................................................................................................................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .............................................................................................................................e..................'............................................................................................................................~...... 
+ // mul v24.4s, v16.4s, v3.s[0] // ..........................................................................................................................e.....................'.........................................................................................................................~......... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................................................................................................................e.....'................................................................................................................................... + // sub v16.4s, v15.4s, v24.4s // ........~.......................................................................................................................................'.......*........................................................................................................................... + // add v15.4s, v15.4s, v24.4s // .........~......................................................................................................................................'........*.......................................................................................................................... + // trn1 v25.4s, v9.4s, v10.4s // ...............................~................................................................................................................'..............................*.................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // .......................~........................................................................................................................'......................*............................................................................................................ 
+ // trn1 v27.4s, v11.4s, v12.4s // ...........................~....................................................................................................................'..........................*........................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ............................~...................................................................................................................'...........................*....................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // .....................................~..........................................................................................................'....................................*.............................................................................................. + // trn2 v12.2d, v26.2d, v28.2d // ............................................~...................................................................................................'...........................................*....................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // ................................................~...............................................................................................'...............................................*................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...............................................~................................................................................................'..............................................*.................................................................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ....................................~...........................................................................................................'...................................*............................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // .....................~..........................................................................................................................'....................*.............................................................................................................. + // trn1 v27.4s, v15.4s, v16.4s // ..............~.................................................................................................................................'.............*..................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ......................~.........................................................................................................................'.....................*............................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // .........................................~......................................................................................................'........................................*.......................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..............................~.................................................................................................................'.............................*..................................................................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ..........................................~.....................................................................................................'.........................................*......................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // .................................~..............................................................................................................'................................*.................................................................................................. + // ldr q0, [ x5], #(12*16) // .............................~..................................................................................................................'............................*...................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ................................~...............................................................................................................'...............................*................................................................................................... + // ldr q1, [ x5, #(-12*16 + 2*16)] // ..........................................................................~.....................................................................'.........................................................................*......................................................... + // ldr q5, [x5, #(-12*16 + 3*16)] // .......................................................~........................................................................................'......................................................*............................................................................ 
+ // ldr q2, [ x5, #(-12*16 + 4*16)] // ...............................................................................~................................................................'..............................................................................*.................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ............................................................................~...................................................................'...........................................................................*....................................................... + // sqrdmulh v27.4s, v11.4s, v4.4s // .................................................~..............................................................................................'................................................*.................................................................................. + // mul v24.4s, v11.4s, v0.4s // ..............................................~.................................................................................................'.............................................*..................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................~................................................................................'..............................................................*.................................................................... + // sub v11.4s, v9.4s, v24.4s // .....................................................................................................~..........................................'....................................................................................................*.............................. 
+ // add v9.4s, v9.4s, v24.4s // ..................................................................................~.............................................................'.................................................................................*................................................. + // sqrdmulh v27.4s, v12.4s, v4.4s // ....................................................~...........................................................................................'...................................................*............................................................................... + // mul v24.4s, v12.4s, v0.4s // ..................................................~.............................................................................................'.................................................*................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................~...........................................................................'...................................................................*............................................................... + // sub v12.4s, v10.4s, v24.4s // .....................................................................................~..........................................................'....................................................................................*.............................................. + // add v10.4s, v10.4s, v24.4s // .......................................................................................~........................................................'......................................................................................*............................................ 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // ...............................................................................................~................................................'..............................................................................................*.................................... + // mul v24.4s, v10.4s, v1.4s // ............................................................................................~...................................................'...........................................................................................*....................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................................................................................................~..................................'............................................................................................................*...................... + // sub v10.4s, v9.4s, v24.4s // ......................................................................................................................~.........................'.....................................................................................................................*............. + // add v9.4s, v9.4s, v24.4s // ........................................................................................................................~.......................'.......................................................................................................................*........... + // sqrdmulh v27.4s, v12.4s, v6.4s // ..............................................................................................~.................................................'.............................................................................................*..................................... 
+ // mul v24.4s, v12.4s, v2.4s // .............................................................................................~..................................................'............................................................................................*...................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................................~...................................'...........................................................................................................*....................... + // sub v12.4s, v11.4s, v24.4s // .......................................................................................................................~........................'......................................................................................................................*............ + // add v11.4s, v11.4s, v24.4s // .........................................................................................................................~......................'........................................................................................................................*.......... + // ldr q0, [ x5, #(-12*16 + 6*16)] // .................~..............................................................................................................................'................*.................................................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ....................~...........................................................................................................................'...................*............................................................................................................... 
+ // ldr q1, [ x5, #(-12*16 + 8*16)] // .............................................................~..................................................................................'............................................................*...................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................................................~....................................................................................'..........................................................*........................................................................ + // ldr q2, [ x5, #(-12*16 + 10*16)] // ................................................................~...............................................................................'...............................................................*................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .....................................................................~..........................................................................'....................................................................*.............................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ............................................................~...................................................................................'...........................................................*....................................................................... + // mul v24.4s, v15.4s, v0.4s // .........................................................~......................................................................................'........................................................*.......................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // .........................................................................~......................................................................'........................................................................*.......................................................... + // sub v15.4s, v13.4s, v24.4s // .................................................................................................~..............................................'................................................................................................*.................................. + // add v13.4s, v13.4s, v24.4s // ...........................................................................................~....................................................'..........................................................................................*........................................ + // sqrdmulh v27.4s, v16.4s, v4.4s // .......................................~........................................................................................................'......................................*............................................................................................ + // mul v24.4s, v16.4s, v0.4s // ........................................~.......................................................................................................'.......................................*........................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................................~..........................................................................................'....................................................*.............................................................................. 
+ // sub v16.4s, v14.4s, v24.4s // .............................................................................~..................................................................'............................................................................*...................................................... + // add v14.4s, v14.4s, v24.4s // .......................................................................~........................................................................'......................................................................*............................................................ + // sqrdmulh v27.4s, v14.4s, v5.4s // .................................................................................~..............................................................'................................................................................*.................................................. + // mul v24.4s, v14.4s, v1.4s // ...................................................................................~............................................................'..................................................................................*................................................ + // mls v24.4s, v27.4s, v8.s[0] // ....................................................................................................~...........................................'...................................................................................................*............................... + // sub v14.4s, v13.4s, v24.4s // ..................................................................................................................~.............................'.................................................................................................................*................. 
+ // add v13.4s, v13.4s, v24.4s // .................................................................................................................~..............................'................................................................................................................*.................. + // sqrdmulh v27.4s, v16.4s, v6.4s // .........................................................................................~......................................................'........................................................................................*.......................................... + // mul v24.4s, v16.4s, v2.4s // ....................................................................................~...........................................................'...................................................................................*............................................... + // mls v24.4s, v27.4s, v8.s[0] // .........................................................................................................~......................................'........................................................................................................*.......................... + // sub v16.4s, v15.4s, v24.4s // ....................................................................................................................~...........................'...................................................................................................................*............... + // add v15.4s, v15.4s, v24.4s // ...................................................................................................................~............................'..................................................................................................................*................ 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ..................................................................................................................................~.............'.................................................................................................................................*. + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ...................................................................................................................................~............'..................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + // Instructions: 82 + // Expected cycles: 29 + // Expected IPC: 2.83 + // + // Wall time: 5.69s + // User time: 5.69s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + mls v0.4S, v10.4S, v8.S[0] // .*................................................................................ + mls v16.4S, v22.4S, v8.S[0] // *................................................................................. + add v6.4S, v4.4S, v12.4S // ...*.............................................................................. + ldr q20, [x5, #112] // ..............*................................................................... + ldr q24, [x5, #96] // ............*..................................................................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sub v15.4S, v4.4S, v12.4S // ..*............................................................................... 
+ add v30.4S, v14.4S, v11.4S // ......*........................................................................... + sub v1.4S, v26.4S, v19.4S // ....*............................................................................. + ldr q2, [x5], #(12*16) // ....................*............................................................. + ldr q27, [x5, #-176] // .......................*.......................................................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + add v21.4S, v26.4S, v19.4S // .....*............................................................................ + sub v26.4S, v14.4S, v11.4S // .......*.......................................................................... + ldr q3, [x5, #-16] // ...............................................*.................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + add v25.4S, v15.4S, v0.4S // ...........*...................................................................... + sub v5.4S, v15.4S, v0.4S // .............*.................................................................... + add v15.4S, v6.4S, v16.4S // ..........*....................................................................... + sub v6.4S, v6.4S, v16.4S // ........*......................................................................... 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn1 v7.4S, v30.4S, v26.4S // .........................*........................................................ + trn2 v26.4S, v30.4S, v26.4S // ...............*.................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn2 v14.4S, v21.4S, v1.4S // ................*................................................................. + trn1 v31.4S, v21.4S, v1.4S // .........*........................................................................ + trn2 v30.4S, v25.4S, v5.4S // ...................*.............................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. 
+ trn2 v23.4S, v15.4S, v6.4S // .................*................................................................ + trn2 v10.2D, v26.2D, v14.2D // .....................*............................................................ + trn1 v22.4S, v15.4S, v6.4S // ......................*........................................................... + trn1 v12.4S, v25.4S, v5.4S // ..................*............................................................... + ldr q25, [x5, #-112] // ...................................................*.............................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn2 v18.2D, v23.2D, v30.2D // ...............................*.................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn2 v17.2D, v7.2D, v31.2D // .............................*.................................................... + sqrdmulh v1.4S, v10.4S, v20.4S // ...........................*...................................................... + mul v6.4S, v10.4S, v24.4S // ............................*..................................................... 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sqrdmulh v21.4S, v17.4S, v20.4S // ..........................................*....................................... + sqrdmulh v29.4S, v18.4S, v27.4S // .....................................*............................................ + mul v16.4S, v18.4S, v2.4S // ....................................*............................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn2 v4.2D, v22.2D, v12.2D // ..........................*....................................................... + mul v18.4S, v17.4S, v24.4S // ........................................*......................................... + ldr q24, [x5, #-64] // ...........................................*...................................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + mul v17.4S, v4.4S, v2.4S // ................................*................................................. + sqrdmulh v5.4S, v4.4S, v27.4S // ...................................*.............................................. + mls v6.4S, v1.4S, v8.S[0] // ......................................*........................................... + ldr q1, [x5, #-144] // .......................................*.......................................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + trn1 v13.2D, v26.2D, v14.2D // ........................*......................................................... + trn1 v14.2D, v23.2D, v30.2D // .................................*................................................ + mls v16.4S, v29.4S, v8.S[0] // ..............................................*................................... + // gap // .................................................................................. + // gap // .................................................................................. + ldr q0, [x5, #-128] // .....................................................*............................ + ldr q27, [x5, #-48] // .........................................*........................................ + ldr q30, [x5, #-32] // .............................................*.................................... 
+ mls v18.4S, v21.4S, v8.S[0] // .................................................*................................ + trn1 v29.2D, v7.2D, v31.2D // ..............................*................................................... + trn1 v7.2D, v22.2D, v12.2D // ..................................*............................................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + ldr q19, [x5, #-160] // ..................................................*............................... + mls v17.4S, v5.4S, v8.S[0] // ............................................*..................................... + sub v5.4S, v13.4S, v6.4S // ....................................................*............................. + add v6.4S, v13.4S, v6.4S // ................................................*................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sub v9.4S, v14.4S, v16.4S // ..........................................................*....................... + add v20.4S, v14.4S, v16.4S // ...........................................................*...................... + // gap // .................................................................................. 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + mul v10.4S, v5.4S, v30.4S // .........................................................*........................ + mul v30.4S, v6.4S, v24.4S // ........................................................*......................... + sqrdmulh v2.4S, v5.4S, v3.4S // ............................................................*..................... + sqrdmulh v13.4S, v6.4S, v27.4S // ......................................................*........................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sqrdmulh v16.4S, v9.4S, v25.4S // ................................................................*................. + sqrdmulh v23.4S, v20.4S, v1.4S // .................................................................*................ + mul v4.4S, v9.4S, v0.4S // ...............................................................*.................. + mul v21.4S, v20.4S, v19.4S // ..............................................................*................... + // gap // .................................................................................. + // gap // .................................................................................. 
+ // gap // .................................................................................. + // gap // .................................................................................. + add v5.4S, v29.4S, v18.4S // .............................................................*.................... + sub v15.4S, v29.4S, v18.4S // ..................................................................*............... + add v22.4S, v7.4S, v17.4S // .......................................................*.......................... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + mls v30.4S, v13.4S, v8.S[0] // ...................................................................*.............. + mls v10.4S, v2.4S, v8.S[0] // .....................................................................*............ + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + mls v4.4S, v16.4S, v8.S[0] // ......................................................................*........... 
+ mls v21.4S, v23.4S, v8.S[0] // .......................................................................*.......... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sub v2.4S, v7.4S, v17.4S // ....................................................................*............. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + sub v18.4S, v5.4S, v30.4S // .........................................................................*........ + add v17.4S, v5.4S, v30.4S // ........................................................................*......... + sub v20.4S, v15.4S, v10.4S // ...........................................................................*...... + add v19.4S, v15.4S, v10.4S // ..........................................................................*....... 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + add v12.4S, v22.4S, v21.4S // ..............................................................................*... + sub v13.4S, v22.4S, v21.4S // ............................................................................*..... + add v14.4S, v2.4S, v4.4S // ...............................................................................*.. + sub v15.4S, v2.4S, v4.4S // .............................................................................*.... + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x2], #64 // .................................................................................* + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. 
+ // gap // .................................................................................. + st4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1], #64 // ................................................................................*. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. 
+ // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + // gap // .................................................................................. + + // --------------------------------- new position ----------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + // mls v16.4S, v22.4S, v8.S[0] // .*................................................................................ + // mls v0.4S, v10.4S, v8.S[0] // *................................................................................. + // sub v27.4S, v4.4S, v12.4S // .....*............................................................................ + // add v30.4S, v4.4S, v12.4S // ..*............................................................................... 
+ // sub v9.4S, v26.4S, v19.4S // .......*.......................................................................... + // add v29.4S, v26.4S, v19.4S // ..........*....................................................................... + // add v28.4S, v14.4S, v11.4S // ......*........................................................................... + // sub v22.4S, v14.4S, v11.4S // ...........*...................................................................... + // sub v26.4S, v30.4S, v16.4S // ................*................................................................. + // trn1 v23.4S, v29.4S, v9.4S // ....................*............................................................. + // add v30.4S, v30.4S, v16.4S // ...............*.................................................................. + // add v4.4S, v27.4S, v0.4S // .............*.................................................................... + // ldr q20, [x5, #96] // ....*............................................................................. + // sub v6.4S, v27.4S, v0.4S // ..............*................................................................... + // ldr q10, [x5, #112] // ...*.............................................................................. + // trn2 v27.4S, v28.4S, v22.4S // ..................*............................................................... + // trn2 v16.4S, v29.4S, v9.4S // ...................*.............................................................. + // trn2 v17.4S, v30.4S, v26.4S // ......................*........................................................... + // trn1 v5.4S, v4.4S, v6.4S // .........................*........................................................ + // trn2 v31.4S, v4.4S, v6.4S // .....................*............................................................ + // ldr q21, [x5], #(12*16) // ........*......................................................................... 
+ // trn2 v1.2D, v27.2D, v16.2D // .......................*.......................................................... + // trn1 v6.4S, v30.4S, v26.4S // ........................*......................................................... + // ldr q4, [x5, #-176] // .........*........................................................................ + // trn1 v16.2D, v27.2D, v16.2D // .........................................*........................................ + // trn1 v13.4S, v28.4S, v22.4S // .................*................................................................ + // trn2 v26.2D, v6.2D, v5.2D // ..................................*............................................... + // sqrdmulh v22.4S, v1.4S, v10.4S // .............................*.................................................... + // mul v15.4S, v1.4S, v20.4S // ..............................*................................................... + // trn2 v14.2D, v13.2D, v23.2D // ............................*..................................................... + // trn1 v13.2D, v13.2D, v23.2D // ................................................*................................. + // trn2 v29.2D, v17.2D, v31.2D // ...........................*...................................................... + // mul v23.4S, v26.4S, v21.4S // .....................................*............................................ + // trn1 v27.2D, v17.2D, v31.2D // ..........................................*....................................... + // trn1 v0.2D, v6.2D, v5.2D // .................................................*................................ + // sqrdmulh v31.4S, v26.4S, v4.4S // ......................................*........................................... + // mul v1.4S, v29.4S, v21.4S // .................................*................................................ 
+ // sqrdmulh v29.4S, v29.4S, v4.4S // ................................*................................................. + // mls v15.4S, v22.4S, v8.S[0] // .......................................*.......................................... + // ldr q5, [x5, #-144] // ........................................*......................................... + // mul v20.4S, v14.4S, v20.4S // ...................................*.............................................. + // ldr q24, [x5, #-48] // .............................................*.................................... + // sqrdmulh v11.4S, v14.4S, v10.4S // ...............................*.................................................. + // ldr q14, [x5, #-64] // ....................................*............................................. + // mls v23.4S, v31.4S, v8.S[0] // ...................................................*.............................. + // ldr q6, [x5, #-32] // ..............................................*................................... + // mls v1.4S, v29.4S, v8.S[0] // ...........................................*...................................... + // ldr q21, [x5, #-16] // ............*..................................................................... + // add v19.4S, v16.4S, v15.4S // .....................................................*............................ + // mls v20.4S, v11.4S, v8.S[0] // ...............................................*.................................. + // ldr q11, [x5, #-160] // ..................................................*............................... + // ldr q25, [x5, #-112] // ..........................*....................................................... + // sub v15.4S, v16.4S, v15.4S // ....................................................*............................. + // ldr q31, [x5, #-128] // ............................................*..................................... 
+ // sqrdmulh v4.4S, v19.4S, v24.4S // ...........................................................*...................... + // add v17.4S, v0.4S, v23.4S // ..................................................................*............... + // mul v24.4S, v19.4S, v14.4S // .........................................................*........................ + // mul v19.4S, v15.4S, v6.4S // ........................................................*......................... + // sub v6.4S, v27.4S, v1.4S // ......................................................*........................... + // add v16.4S, v27.4S, v1.4S // .......................................................*.......................... + // sqrdmulh v26.4S, v15.4S, v21.4S // ..........................................................*....................... + // add v10.4S, v13.4S, v20.4S // ................................................................*................. + // mul v15.4S, v16.4S, v11.4S // ...............................................................*.................. + // mul v11.4S, v6.4S, v31.4S // ..............................................................*................... + // sqrdmulh v31.4S, v6.4S, v25.4S // ............................................................*..................... + // sqrdmulh v21.4S, v16.4S, v5.4S // .............................................................*.................... + // sub v25.4S, v13.4S, v20.4S // .................................................................*................ + // mls v24.4S, v4.4S, v8.S[0] // ...................................................................*.............. + // sub v16.4S, v0.4S, v23.4S // .......................................................................*.......... + // mls v19.4S, v26.4S, v8.S[0] // ....................................................................*............. 
+ // mls v11.4S, v31.4S, v8.S[0] // .....................................................................*............ + // mls v15.4S, v21.4S, v8.S[0] // ......................................................................*........... + // add v20.4S, v10.4S, v24.4S // .........................................................................*........ + // sub v21.4S, v10.4S, v24.4S // ........................................................................*......... + // add v22.4S, v25.4S, v19.4S // ...........................................................................*...... + // sub v23.4S, v25.4S, v19.4S // ..........................................................................*....... + // sub v26.4S, v17.4S, v15.4S // .............................................................................*.... + // sub v28.4S, v16.4S, v11.4S // ...............................................................................*.. + // add v25.4S, v17.4S, v15.4S // ............................................................................*..... + // add v27.4S, v16.4S, v11.4S // ..............................................................................*... + // st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // .................................................................................* + // st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x2], #64 // ................................................................................*. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm.s index 9241deb5..bb71a747 100644 --- a/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm.s @@ -2,41 +2,13 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm xtmp0 .req x10 xtmp1 .req x11 -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().4s, \a\().4s, \b\().4s .endm @@ -57,15 +29,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro 
ct_butterfly a, b, root, idx0, idx1 @@ -74,12 +46,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s @@ -99,24 +65,24 @@ xtmp1 .req x11 .endm .macro load_vectors a0, a1, a2, a3, addr - ldr_vo \a0, \addr, (16*0) - ldr_vo \a1, \addr, (16*1) - ldr_vo \a2, \addr, (16*2) - ldr_vo \a3, \addr, (16*3) + ldr qform_\a0, [\addr, #(16*0)] + ldr qform_\a1, [\addr, #(16*1)] + ldr qform_\a2, [\addr, #(16*2)] + ldr qform_\a3, [\addr, #(16*3)] .endm .macro load_vectors_with_offset a0, a1, a2, a3, addr, offset - ldr_vo \a0, \addr, (16*0 + (\offset)) - ldr_vo \a1, \addr, (16*1 + (\offset)) - ldr_vo \a2, \addr, (16*2 + (\offset)) - ldr_vo \a3, \addr, (16*3 + (\offset)) + ldr qform_\a0, [\addr, #(16*0 + (\offset))] + ldr qform_\a1, [\addr, #(16*1 + (\offset))] + ldr qform_\a2, [\addr, #(16*2 + (\offset))] + ldr qform_\a3, [\addr, #(16*3 + (\offset))] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro vec_to_scalar_matrix out, in @@ -146,35 +112,35 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_456 - ldr_vi root0, r_ptr0, 64 - ldr_vo root1, r_ptr0, (-64 + 16) - ldr_vo root2, 
r_ptr0, (-64 + 32) - ldr_vo root3, r_ptr0, (-64 + 48) + ldr qform_root0, [r_ptr0], #64 + ldr qform_root1, [r_ptr0, #(-64 + 16)] + ldr qform_root2, [r_ptr0, #(-64 + 32)] + ldr qform_root3, [r_ptr0, #(-64 + 48)] .endm .macro load_roots_78_part1 - ldr_vi root0, r_ptr1, (12*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) - ldr_vo root1, r_ptr1, (-12*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) - ldr_vo root2, r_ptr1, (-12*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(12*16) + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 5*16)] .endm .macro load_roots_78_part2 - ldr_vo root0, r_ptr1, (-12*16 + 6*16) - ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) - ldr_vo root1, r_ptr1, (-12*16 + 8*16) - ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) - ldr_vo root2, r_ptr1, (-12*16 + 10*16) - ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) + ldr qform_root0, [ r_ptr1, #(-12*16 + 6*16)] + ldr qform_root0_tw, [r_ptr1, #(-12*16 + 7*16)] + ldr qform_root1, [ r_ptr1, #(-12*16 + 8*16)] + ldr qform_root1_tw, [r_ptr1, #(-12*16 + 9*16)] + ldr qform_root2, [ r_ptr1, #(-12*16 + 10*16)] + ldr qform_root2_tw, [r_ptr1, #(-12*16 + 11*16)] .endm .macro transpose4 data0, data1, data2, data3 @@ -196,7 +162,7 @@ xtmp1 .req x11 trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -207,7 +173,7 @@ xtmp1 .req x11 stp x29, x30, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -217,7 +183,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // 
@slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -225,7 +191,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -236,19 +202,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -385,614 +351,506 @@ _ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm: load_roots_123 .p2align 2 - // gap // ...................... - ldr x25, [x0, #640] // .....*................ - ldr x27, [x0, #512] // ......*............... - // gap // ...................... - // gap // ...................... - ldr x24, [x0, #128] // .*.................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - ldr x21, [x0, #648] // .......*.............. - // gap // ...................... - // gap // ...................... - ins v14.d[0], x25 // ........*............. - ins v21.d[0], x27 // ...........*.......... - // gap // ...................... - // gap // ...................... - // gap // ...................... - ldr x29, [x0, #520] // ..........*........... - // gap // ...................... - // gap // ...................... - ins v16.d[0], x24 // ....................*. - ins v14.d[1], x21 // .............*........ - ldr x20, [x0, #8] // ..*................... - // gap // ...................... - ldr x19, [x0, #0] // ...*.................. 
- // gap // ...................... - // gap // ...................... - // gap // ...................... - ins v21.d[1], x29 // ............*......... - sqrdmulh v28.4S, v14.4S, v0.S[1] // ..................*... - ldr x28, [x0, #896] // .........*............ - // gap // ...................... - mul v20.4S, v14.4S, v0.S[0] // ................*..... - ldr x29, [x0, #136] // *..................... - // gap // ...................... - // gap // ...................... - sqrdmulh v19.4S, v21.4S, v0.S[1] // ..............*....... - mul v29.4S, v21.4S, v0.S[0] // ...............*...... - // gap // ...................... - // gap // ...................... - ins v10.d[0], x19 // ....*................. - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v20.4S, v28.4S, v8.S[0] // .....................* - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v29.4S, v19.4S, v8.S[0] // ...................*.. - ins v10.d[1], x20 // .................*.... - // gap // ...................... - // gap // ...................... - - // original source code - // ldr x29, [x0, #136] // ...............*...... - // ldr x20, [x0, #128] // ..*................... - // ldr x25, [x0, #8] // .........*............ - // ldr x23, [x0, #0] // ..........*........... - // ins v10.d[0], x23 // ..................*... - // ldr x16, [x0, #640] // *..................... - // ldr x7, [x0, #512] // .*.................... - // ldr x11, [x0, #648] // ...*.................. - // ins v15.d[0], x16 // ....*................. - // ldr x28, [x0, #896] // .............*........ - // ldr x22, [x0, #520] // ......*............... - // ins v4.d[0], x7 // .....*................ - // ins v4.d[1], x22 // ...........*.......... - // ins v15.d[1], x11 // ........*............. - // sqrdmulh v28.4S, v4.4S, v0.S[1] // ................*..... - // mul v29.4S, v4.4S, v0.S[0] // .................*.... 
- // mul v20.4S, v15.4S, v0.S[0] // ..............*....... - // ins v10.d[1], x25 // .....................* - // sqrdmulh v27.4S, v15.4S, v0.S[1] // ............*......... - // mls v29.4S, v28.4S, v8.S[0] // ....................*. - // ins v16.d[0], x20 // .......*.............. - // mls v20.4S, v27.4S, v8.S[0] // ...................*.. + // Instructions: 53 + // Expected cycles: 29 + // Expected IPC: 1.83 + // + // Wall time: 1.06s + // User time: 1.06s + // + // ---------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + ldr q11, [x0, #896] // .*................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + ldr q22, [x0, #640] // *.................................................... + // gap // ..................................................... + // gap // ..................................................... + ldr q10, [x0, #768] // ...*................................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + ldr q18, [x0, #512] // ..*.................................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v29.4S, v11.4S, v0.S[0] // ......*.............................................. + sqrdmulh v30.4S, v11.4S, v0.S[1] // .......*............................................. + ldr q31, [x0, #0] // .....*............................................... 
+ // gap // ..................................................... + mul v6.4S, v22.4S, v0.S[0] // ...............*..................................... + ldr q24, [x0, #128] // ..........*.......................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v20.4S, v10.4S, v0.S[1] // ...........*......................................... + ldr q16, [x0, #384] // ............*........................................ + // gap // ..................................................... + // gap // ..................................................... + mls v29.4S, v30.4S, v8.S[0] // ..............*...................................... + sqrdmulh v30.4S, v22.4S, v0.S[1] // .........*........................................... + // gap // ..................................................... + // gap // ..................................................... + mul v27.4S, v10.4S, v0.S[0] // .............*....................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v7.4S, v18.4S, v0.S[1] // ........*............................................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v6.4S, v30.4S, v8.S[0] // ...................*................................. + sub v15.4S, v16.4S, v29.4S // ..................*.................................. + ldr q30, [x0, #256] // ....*................................................ + // gap // ..................................................... + mul v23.4S, v18.4S, v0.S[0] // ................*.................................... 
+ mls v27.4S, v20.4S, v8.S[0] // .................*................................... + // gap // ..................................................... + // gap // ..................................................... + mul v9.4S, v15.4S, v1.S[0] // ....................*................................ + sqrdmulh v17.4S, v15.4S, v1.S[1] // .....................*............................... + // gap // ..................................................... + // gap // ..................................................... + add v18.4S, v16.4S, v29.4S // .........................*........................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sub v13.4S, v30.4S, v27.4S // ......................*.............................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v9.4S, v17.4S, v8.S[0] // ...........................*......................... + mul v15.4S, v18.4S, v0.S[2] // ................................*.................... + // gap // ..................................................... + // gap // ..................................................... + sub v17.4S, v24.4S, v6.4S // ........................*............................ + mul v22.4S, v13.4S, v1.S[0] // ..........................*.......................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v10.4S, v13.4S, v1.S[1] // .............................*....................... + add v6.4S, v24.4S, v6.4S // ..............................*...................... + // gap // ..................................................... 
+ // gap // ..................................................... + sub v26.4S, v17.4S, v9.4S // .................................*................... + sqrdmulh v25.4S, v18.4S, v0.S[3] // ...............................*..................... + // gap // ..................................................... + // gap // ..................................................... + mls v23.4S, v7.4S, v8.S[0] // .......................*............................. + add v5.4S, v17.4S, v9.4S // ...................................*................. + // gap // ..................................................... + // gap // ..................................................... + mul v29.4S, v26.4S, v3.S[0] // ....................................*................ + sqrdmulh v21.4S, v26.4S, v3.S[1] // .....................................*............... + // gap // ..................................................... + // gap // ..................................................... + mls v22.4S, v10.4S, v8.S[0] // ..................................*.................. + mls v15.4S, v25.4S, v8.S[0] // .......................................*............. + // gap // ..................................................... + // gap // ..................................................... + sub v14.4S, v31.4S, v23.4S // ............................*........................ + sqrdmulh v28.4S, v5.4S, v2.S[3] // ......................................*.............. + // gap // ..................................................... + // gap // ..................................................... + add v11.4S, v31.4S, v23.4S // ....................................................* + mls v29.4S, v21.4S, v8.S[0] // .........................................*........... + // gap // ..................................................... + // gap // ..................................................... + sub v9.4S, v14.4S, v22.4S // ..........................................*.......... 
+ sub v23.4S, v6.4S, v15.4S // ...........................................*......... + // gap // ..................................................... + // gap // ..................................................... + add v18.4S, v6.4S, v15.4S // ............................................*........ + mul v26.4S, v5.4S, v2.S[2] // ........................................*............ + // gap // ..................................................... + // gap // ..................................................... + sub v31.4S, v9.4S, v29.4S // .............................................*....... + sqrdmulh v25.4S, v23.4S, v2.S[1] // ...............................................*..... + // gap // ..................................................... + // gap // ..................................................... + add v20.4S, v9.4S, v29.4S // ..............................................*...... + mul v21.4S, v23.4S, v2.S[0] // ..................................................*.. + // gap // ..................................................... + // gap // ..................................................... + mul v17.4S, v18.4S, v1.S[2] // ...................................................*. + sqrdmulh v18.4S, v18.4S, v1.S[3] // .................................................*... + str q31, [x0, #896] // ................................................*.... + // gap // ..................................................... + + // ------------------- new position -------------------> + // 0 25 50 + // |------------------------|------------------------|-- + // ldr q12, [x0, #640] // .*................................................... + // ldr q24, [x0, #896] // *.................................................... + // ldr q19, [x0, #512] // ...*................................................. + // ldr q5, [x0, #768] // ..*.................................................. + // ldr q30, [x0, #256] // .................*................................... 
+ // ldr q29, [x0, #0] // ......*.............................................. + // mul v13.4S, v24.4S, v0.S[0] // ....*................................................ + // sqrdmulh v28.4S, v24.4S, v0.S[1] // .....*............................................... + // sqrdmulh v6.4S, v19.4S, v0.S[1] // ..............*...................................... + // sqrdmulh v31.4S, v12.4S, v0.S[1] // ............*........................................ + // ldr q15, [x0, #128] // ........*............................................ + // sqrdmulh v25.4S, v5.4S, v0.S[1] // .........*........................................... + // ldr q16, [x0, #384] // ..........*.......................................... + // mul v27.4S, v5.4S, v0.S[0] // .............*....................................... + // mls v13.4S, v28.4S, v8.S[0] // ...........*......................................... + // mul v9.4S, v12.4S, v0.S[0] // .......*............................................. + // mul v5.4S, v19.4S, v0.S[0] // ..................*.................................. + // mls v27.4S, v25.4S, v8.S[0] // ...................*................................. + // sub v14.4S, v16.4S, v13.4S // ................*.................................... + // mls v9.4S, v31.4S, v8.S[0] // ...............*..................................... + // mul v23.4S, v14.4S, v1.S[0] // ....................*................................ + // sqrdmulh v25.4S, v14.4S, v1.S[1] // .....................*............................... + // sub v19.4S, v30.4S, v27.4S // .......................*............................. + // mls v5.4S, v6.4S, v8.S[0] // ................................*.................... + // sub v28.4S, v15.4S, v9.4S // ..........................*.......................... + // add v4.4S, v16.4S, v13.4S // ......................*.............................. + // mul v22.4S, v19.4S, v1.S[0] // ...........................*......................... 
+ // mls v23.4S, v25.4S, v8.S[0] // ........................*............................ + // sub v14.4S, v29.4S, v5.4S // ......................................*.............. + // sqrdmulh v24.4S, v19.4S, v1.S[1] // ............................*........................ + // add v12.4S, v15.4S, v9.4S // .............................*....................... + // sqrdmulh v9.4S, v4.4S, v0.S[3] // ...............................*..................... + // mul v13.4S, v4.4S, v0.S[2] // .........................*........................... + // sub v18.4S, v28.4S, v23.4S // ..............................*...................... + // mls v22.4S, v24.4S, v8.S[0] // ....................................*................ + // add v25.4S, v28.4S, v23.4S // .................................*................... + // mul v15.4S, v18.4S, v3.S[0] // ..................................*.................. + // sqrdmulh v23.4S, v18.4S, v3.S[1] // ...................................*................. + // sqrdmulh v28.4S, v25.4S, v2.S[3] // .......................................*............. + // mls v13.4S, v9.4S, v8.S[0] // .....................................*............... + // mul v26.4S, v25.4S, v2.S[2] // .............................................*....... + // mls v15.4S, v23.4S, v8.S[0] // .........................................*........... + // sub v24.4S, v14.4S, v22.4S // ..........................................*.......... + // sub v7.4S, v12.4S, v13.4S // ...........................................*......... + // add v16.4S, v12.4S, v13.4S // ............................................*........ + // sub v4.4S, v24.4S, v15.4S // ..............................................*...... + // add v20.4S, v24.4S, v15.4S // ................................................*.... + // sqrdmulh v25.4S, v7.4S, v2.S[1] // ...............................................*..... 
+ // str q4, [x0, #896] // ....................................................* + // sqrdmulh v18.4S, v16.4S, v1.S[3] // ...................................................*. + // mul v21.4S, v7.4S, v2.S[0] // .................................................*... + // mul v17.4S, v16.4S, v1.S[2] // ..................................................*.. + // add v11.4S, v29.4S, v5.4S // ........................................*............ sub count, count, #1 layer123_start: - ins v16.d[1], x29 // .......*............................................................................................ - ldr x24, [x0, #904] // .............................*...................................................................... - ldr x23, [x0, #768] // ........................*........................................................................... - ins v21.d[0], x28 // ..............................*..................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ldr x29, [x0, #152] // .....e.............................................................................................. - ldr x16, [x0, #776] // .........................*.......................................................................... - ldr x19, [x0, #392] // .............*...................................................................................... - ldr x20, [x0, #144] // ....e............................................................................................... - sub v26.4S, v16.4S, v20.4S // ........................................*........................................................... - add v28.4S, v16.4S, v20.4S // .........................................*.......................................................... 
- ldr x25, [x0, #24] // .e.................................................................................................. - ins v15.d[0], x23 // ..........................*......................................................................... - ldr x7, [x0, #256] // ........*........................................................................................... - ins v21.d[1], x24 // ...............................*.................................................................... - ldr x23, [x0, #16] // e................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ldr x11, [x0, #384] // ............*....................................................................................... - ldr x22, [x0, #264] // .........*.......................................................................................... - ins v15.d[1], x16 // ...........................*........................................................................ - mul v23.4S, v21.4S, v0.S[0] // ...............................................*.................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v13.4S, v21.4S, v0.S[1] // ................................................*................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v22.4S, v15.4S, v0.S[1] // ...........................................*........................................................ - mul v9.4S, v15.4S, v0.S[0] // ..........................................*......................................................... - ins v16.d[0], x11 // ..............*..................................................................................... - ins v4.d[0], x7 // ..........*......................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v23.4S, v13.4S, v8.S[0] // .................................................*.................................................. - mls v9.4S, v22.4S, v8.S[0] // ............................................*....................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - ins v4.d[1], x22 // ...........*........................................................................................ - // gap // .................................................................................................... - ins v16.d[1], x19 // ...............*.................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v18.4S, v16.4S, v23.4S // ..................................................*................................................. - // gap // .................................................................................................... - sub v27.4S, v4.4S, v9.4S // .............................................*...................................................... - add v31.4S, v16.4S, v23.4S // ...................................................*................................................ - add v19.4S, v4.4S, v9.4S // ..............................................*..................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- sqrdmulh v30.4S, v18.4S, v1.S[1] // ....................................................................*............................... - mul v4.4S, v18.4S, v1.S[0] // ...................................................................*................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v21.4S, v31.4S, v0.S[3] // ..........................................................*......................................... - mul v17.4S, v31.4S, v0.S[2] // .........................................................*.......................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v11.4S, v27.4S, v1.S[0] // ..............................................................*..................................... - sqrdmulh v31.4S, v19.4S, v0.S[3] // .....................................................*.............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v16.4S, v19.4S, v0.S[2] // ....................................................*............................................... - mls v4.4S, v30.4S, v8.S[0] // .....................................................................*.............................. - // gap // .................................................................................................... 
- sqrdmulh v25.4S, v27.4S, v1.S[1] // ...............................................................*.................................... - mls v17.4S, v21.4S, v8.S[0] // ...........................................................*........................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v18.4S, v10.4S, v29.4S // ....................................*............................................................... - sub v29.4S, v10.4S, v29.4S // ...................................*................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v16.4S, v31.4S, v8.S[0] // ......................................................*............................................. - sub v12.4S, v26.4S, v4.4S // ......................................................................*............................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v11.4S, v25.4S, v8.S[0] // ................................................................*................................... - sub v31.4S, v28.4S, v17.4S // ............................................................*....................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- sqrdmulh v20.4S, v12.4S, v3.S[1] // ........................................................................................*........... - mul v24.4S, v12.4S, v3.S[0] // .......................................................................................*............ - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v5.4S, v31.4S, v2.S[1] // ..............................................................................*..................... - // gap // .................................................................................................... - mul v9.4S, v31.4S, v2.S[0] // .............................................................................*...................... - // gap // .................................................................................................... - add v13.4S, v28.4S, v17.4S // .............................................................*...................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v10.d[0], x23 // ..e................................................................................................. - // gap // .................................................................................................... - mls v24.4S, v20.4S, v8.S[0] // .........................................................................................*.......... - // gap // .................................................................................................... - ldr x16, [x0, #656] // ....................e............................................................................... 
- mls v9.4S, v5.4S, v8.S[0] // ...............................................................................*.................... - sub v20.4S, v29.4S, v11.4S // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v27.4S, v18.4S, v16.4S // .......................................................*............................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v22.4S, v13.4S, v1.S[3] // .........................................................................*.......................... - ldr x7, [x0, #528] // ................e................................................................................... - sub v5.4S, v20.4S, v24.4S // ..........................................................................................*......... - ldr x11, [x0, #664] // .....................e.............................................................................. - ins v15.d[0], x16 // ......................e............................................................................. - add v30.4S, v26.4S, v4.4S // .......................................................................*............................ - // gap // .................................................................................................... - // gap // .................................................................................................... - ldr x28, [x0, #912] // ............................e....................................................................... 
- str q5, [x0, #896] // ...................................................................................................* - add v14.4S, v27.4S, v9.4S // .................................................................................*.................. - mul v17.4S, v13.4S, v1.S[2] // ........................................................................*........................... - ldr x22, [x0, #536] // .................e.................................................................................. - sqrdmulh v13.4S, v30.4S, v2.S[3] // ...................................................................................*................ - // gap // .................................................................................................... - ins v4.d[0], x7 // ..................e................................................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v21.4S, v30.4S, v2.S[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v16.4S, v18.4S, v16.4S // ........................................................*........................................... - mls v17.4S, v22.4S, v8.S[0] // ..........................................................................*......................... 
- ins v4.d[1], x22 // ...................e................................................................................ - // gap // .................................................................................................... - add v18.4S, v29.4S, v11.4S // ..................................................................*................................. - // gap // .................................................................................................... - str q14, [x0, #256] // ..............................................................................................*..... - mls v21.4S, v13.4S, v8.S[0] // ....................................................................................*............... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v15.d[1], x11 // .......................e............................................................................ - sub v7.4S, v16.4S, v17.4S // ...........................................................................*........................ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v28.4S, v4.4S, v0.S[1] // .................................e.................................................................. 
- add v12.4S, v20.4S, v24.4S // ...........................................................................................*........ - // gap // .................................................................................................... - mul v29.4S, v4.4S, v0.S[0] // ................................e................................................................... - // gap // .................................................................................................... - add v5.4S, v18.4S, v21.4S // ......................................................................................*............. - mul v20.4S, v15.4S, v0.S[0] // .....................................e.............................................................. - str q7, [x0, #128] // .............................................................................................*...... - // gap // .................................................................................................... - str q12, [x0, #768] // ..................................................................................................*. - sub v4.4S, v18.4S, v21.4S // .....................................................................................*.............. - // gap // .................................................................................................... - sub v12.4S, v27.4S, v9.4S // ................................................................................*................... - ins v10.d[1], x25 // ...e................................................................................................ - str q5, [x0, #512] // ................................................................................................*... - // gap // .................................................................................................... - sqrdmulh v27.4S, v15.4S, v0.S[1] // ......................................e............................................................. 
- add v9.4S, v16.4S, v17.4S // ............................................................................*....................... - // gap // .................................................................................................... - // gap // .................................................................................................... - str q12, [x0, #384] // ...............................................................................................*.... - // gap // .................................................................................................... - mls v29.4S, v28.4S, v8.S[0] // ..................................e................................................................. - str q4, [x0, #640] // .................................................................................................*.. - ins v16.d[0], x20 // ......e............................................................................................. - // gap // .................................................................................................... - mls v20.4S, v27.4S, v8.S[0] // .......................................e............................................................ - str q9, [x0], #(16) // ............................................................................................*....... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ..........e.....................................................................................|.............e..................................................................................... - // ldr x11, [x0, #(0+8)] // ......e.........................................................................................|.........e......................................................................................... 
- // ins v9.d[0], x10 // .................................................e..............................................|....................................................e.............................................. - // ins v9.d[1], x11 // ......................................................................................e.........|.........................................................................................e......... - // ldr x10, [x0, #(1*(1024/8))] // ...e............................................................................................|......e............................................................................................ - // ldr x11, [x0, #((1*(1024/8))+8)] // e...............................................................................................|...e............................................................................................... - // ins v10.d[0], x10 // .............................................................................................e..|................................................................................................e.. - // ins v10.d[1], x11 // ................................................................................................*................................................................................................... - // ldr x10, [x0, #(2*(1024/8))] // ........*.......................................................................................|...........*....................................................................................... - // ldr x11, [x0, #((2*(1024/8))+8)] // ............*...................................................................................|...............*................................................................................... 
- // ins v11.d[0], x10 // ...................*............................................................................|......................*............................................................................ - // ins v11.d[1], x11 // ......................*.........................................................................|.........................*......................................................................... - // ldr x10, [x0, #(3*(1024/8))] // ...........*....................................................................................|..............*.................................................................................... - // ldr x11, [x0, #((3*(1024/8))+8)] // ..*.............................................................................................|.....*............................................................................................. - // ins v12.d[0], x10 // ..................*.............................................................................|.....................*............................................................................. - // ins v12.d[1], x11 // .......................*........................................................................|..........................*........................................................................ - // ldr x10, [x0, #(4*(1024/8))] // ........................................................e.......................................|...........................................................e....................................... - // ldr x11, [x0, #((4*(1024/8))+8)] // .................................................................e..............................|....................................................................e.............................. 
- // ins v13.d[0], x10 // ...................................................................e............................|......................................................................e............................ - // ins v13.d[1], x11 // .......................................................................e........................|..........................................................................e........................ - // ldr x10, [x0, #(5*(1024/8))] // ...................................................e............................................|......................................................e............................................ - // ldr x11, [x0, #((5*(1024/8))+8)] // ..........................................................e.....................................|.............................................................e..................................... - // ins v14.d[0], x10 // ...........................................................e....................................|..............................................................e.................................... - // ins v14.d[1], x11 // ...........................................................................e....................|..............................................................................e.................... - // ldr x10, [x0, #(6*(1024/8))] // ................................................................................................|.*................................................................................................. - // ldr x11, [x0, #((6*(1024/8))+8)] // .*..............................................................................................|....*.............................................................................................. 
- // ins v15.d[0], x10 // .......*........................................................................................|..........*........................................................................................ - // ins v15.d[1], x11 // .............*..................................................................................|................*.................................................................................. - // ldr x10, [x0, #(7*(1024/8))] // .............................................................e..................................|................................................................e.................................. - // ldr x11, [x0, #((7*(1024/8))+8)] // ................................................................................................|*.................................................................................................. - // ins v16.d[0], x10 // ................................................................................................|..*................................................................................................ - // ins v16.d[1], x11 // .........*......................................................................................|............*...................................................................................... - // mul v24.4s, v13.4s, v0.s[0] // ...............................................................................e................|..................................................................................e................ - // sqrdmulh v13.4s, v13.4s, v0.s[1] // .............................................................................e..................|................................................................................e.................. 
- // mls v24.4s, v13.4s, v8.s[0] // ...........................................................................................e....|..............................................................................................e.... - // sub v13.4s, v9.4s, v24.4s // .......................................*........................................................|..........................................*........................................................ - // add v9.4s, v9.4s, v24.4s // ......................................*.........................................................|.........................................*......................................................... - // mul v24.4s, v14.4s, v0.s[0] // .................................................................................e..............|....................................................................................e.............. - // sqrdmulh v14.4s, v14.4s, v0.s[1] // ........................................................................................e.......|...........................................................................................e....... - // mls v24.4s, v14.4s, v8.s[0] // ..............................................................................................e.|.................................................................................................e. - // sub v14.4s, v10.4s, v24.4s // ....*...........................................................................................|.......*........................................................................................... - // add v10.4s, v10.4s, v24.4s // .....*..........................................................................................|........*.......................................................................................... 
- // mul v24.4s, v15.4s, v0.s[0] // .................*..............................................................................|....................*.............................................................................. - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ................*...............................................................................|...................*............................................................................... - // mls v24.4s, v15.4s, v8.s[0] // .....................*..........................................................................|........................*.......................................................................... - // sub v15.4s, v11.4s, v24.4s // .........................*......................................................................|............................*...................................................................... - // add v11.4s, v11.4s, v24.4s // ...........................*....................................................................|..............................*.................................................................... - // mul v24.4s, v16.4s, v0.s[0] // ..............*.................................................................................|.................*................................................................................. - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ...............*................................................................................|..................*................................................................................ - // mls v24.4s, v16.4s, v8.s[0] // ....................*...........................................................................|.......................*........................................................................... 
- // sub v16.4s, v12.4s, v24.4s // ........................*.......................................................................|...........................*....................................................................... - // add v12.4s, v12.4s, v24.4s // ..........................*.....................................................................|.............................*..................................................................... - // mul v24.4s, v11.4s, v0.s[2] // ..................................*.............................................................|.....................................*............................................................. - // sqrdmulh v11.4s, v11.4s, v0.s[3] // .................................*..............................................................|....................................*.............................................................. - // mls v24.4s, v11.4s, v8.s[0] // ........................................*.......................................................|...........................................*....................................................... - // sub v11.4s, v9.4s, v24.4s // ......................................................*.........................................|.........................................................*......................................... - // add v9.4s, v9.4s, v24.4s // .....................................................................*..........................|........................................................................*.......................... - // mul v24.4s, v12.4s, v0.s[2] // ...............................*................................................................|..................................*................................................................ 
- // sqrdmulh v12.4s, v12.4s, v0.s[3] // ..............................*.................................................................|.................................*................................................................. - // mls v24.4s, v12.4s, v8.s[0] // .....................................*..........................................................|........................................*.......................................................... - // sub v12.4s, v10.4s, v24.4s // ...........................................*....................................................|..............................................*.................................................... - // add v10.4s, v10.4s, v24.4s // ................................................*...............................................|...................................................*............................................... - // mul v24.4s, v15.4s, v1.s[0] // ................................*...............................................................|...................................*............................................................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ....................................*...........................................................|.......................................*........................................................... - // mls v24.4s, v15.4s, v8.s[0] // ..........................................*.....................................................|.............................................*..................................................... - // sub v15.4s, v13.4s, v24.4s // .....................................................*..........................................|........................................................*.......................................... 
- // add v13.4s, v13.4s, v24.4s // ........................................................................*.......................|...........................................................................*....................... - // mul v24.4s, v16.4s, v1.s[0] // .............................*..................................................................|................................*.................................................................. - // sqrdmulh v16.4s, v16.4s, v1.s[1] // ............................*...................................................................|...............................*................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ...................................*............................................................|......................................*............................................................ - // sub v16.4s, v14.4s, v24.4s // .........................................*......................................................|............................................*...................................................... - // add v14.4s, v14.4s, v24.4s // ............................................................*...................................|...............................................................*................................... - // mul v24.4s, v10.4s, v1.s[2] // ................................................................*...............................|...................................................................*............................... - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .......................................................*........................................|..........................................................*........................................ 
- // mls v24.4s, v10.4s, v8.s[0] // ......................................................................*.........................|.........................................................................*......................... - // sub v10.4s, v9.4s, v24.4s // ............................................................................*...................|...............................................................................*................... - // add v9.4s, v9.4s, v24.4s // .........................................................................................*......|............................................................................................*...... - // mul v24.4s, v12.4s, v2.s[0] // ...............................................*................................................|..................................................*................................................ - // sqrdmulh v12.4s, v12.4s, v2.s[1] // ..............................................*.................................................|.................................................*................................................. - // mls v24.4s, v12.4s, v8.s[0] // ....................................................*...........................................|.......................................................*........................................... - // sub v12.4s, v11.4s, v24.4s // .....................................................................................*..........|........................................................................................*.......... - // add v11.4s, v11.4s, v24.4s // ...............................................................*................................|..................................................................*................................ 
- // mul v24.4s, v14.4s, v2.s[2] // ....................................................................*...........................|.......................................................................*........................... - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ..................................................................*.............................|.....................................................................*............................. - // mls v24.4s, v14.4s, v8.s[0] // ..........................................................................*.....................|.............................................................................*..................... - // sub v14.4s, v13.4s, v24.4s // ....................................................................................*...........|.......................................................................................*........... - // add v13.4s, v13.4s, v24.4s // ................................................................................*...............|...................................................................................*............... - // mul v24.4s, v16.4s, v3.s[0] // .............................................*..................................................|................................................*.................................................. - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ............................................*...................................................|...............................................*................................................... - // mls v24.4s, v16.4s, v8.s[0] // ..................................................*.............................................|.....................................................*............................................. 
- // sub v16.4s, v15.4s, v24.4s // .........................................................*......................................|............................................................*...................................... - // add v15.4s, v15.4s, v24.4s // ..............................................................................*.................|.................................................................................*................. - // str q9, [x0], #(16) // ...............................................................................................*|..................................................................................................* - // str q10, [x0, #(-16 + 1*(1024/8))] // ..................................................................................*.............|.....................................................................................*............. - // str q11, [x0, #(-16 + 2*(1024/8))] // .........................................................................*......................|............................................................................*...................... - // str q12, [x0, #(-16 + 3*(1024/8))] // ..........................................................................................*.....|.............................................................................................*..... - // str q13, [x0, #(-16 + 4*(1024/8))] // .......................................................................................*........|..........................................................................................*........ - // str q14, [x0, #(-16 + 5*(1024/8))] // ............................................................................................*...|...............................................................................................*... 
- // str q15, [x0, #(-16 + 6*(1024/8))] // ...................................................................................*............|......................................................................................*............ - // str q16, [x0, #(-16 + 7*(1024/8))] // ..............................................................*.................................|.................................................................*................................. + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Wall time: 61.56s + // User time: 61.56s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q12, [x0, #656] // .....e...................................................................... + add v9.4S, v30.4S, v27.4S // ......................*..................................................... + mls v26.4S, v28.4S, v8.S[0] // ............................................................*............... + ldr q24, [x0, #912] // .......e.................................................................... + add v4.4S, v14.4S, v22.4S // ..........................................*................................. + mls v21.4S, v25.4S, v8.S[0] // .......................................................*.................... + ldr q19, [x0, #528] // ....e....................................................................... + // gap // ............................................................................ + mls v17.4S, v18.4S, v8.S[0] // ..................................................*......................... + mul v10.4S, v9.4S, v0.S[2] // .............................*.............................................. + ldr q5, [x0, #784] // ......e..................................................................... 
+ // gap // ............................................................................ + sub v25.4S, v4.4S, v26.4S // .............................................................*.............. + ldr q30, [x0, #272] // ..e......................................................................... + sqrdmulh v22.4S, v9.4S, v0.S[3] // ............................*............................................... + // gap // ............................................................................ + ldr q29, [x0, #16] // e........................................................................... + mul v13.4S, v24.4S, v0.S[0] // ........................e................................................... + sqrdmulh v28.4S, v24.4S, v0.S[1] // .......................e.................................................... + // gap // ............................................................................ + sqrdmulh v6.4S, v19.4S, v0.S[1] // ........e................................................................... + sqrdmulh v31.4S, v12.4S, v0.S[1] // .............e.............................................................. + str q25, [x0, #640] // .........................................................................*.. + ldr q15, [x0, #144] // .e.......................................................................... + sqrdmulh v25.4S, v5.4S, v0.S[1] // ..................e......................................................... + mls v10.4S, v22.4S, v8.S[0] // ..............................*............................................. + ldr q16, [x0, #400] // ...e........................................................................ + // gap // ............................................................................ + mul v27.4S, v5.4S, v0.S[0] // ...................e........................................................ + mls v13.4S, v28.4S, v8.S[0] // .........................e.................................................. 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v9.4S, v12.4S, v0.S[0] // ..............e............................................................. + mul v5.4S, v19.4S, v0.S[0] // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v28.4S, v4.4S, v26.4S // ..............................................................*............. + add v7.4S, v11.4S, v10.4S // ................................*........................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v27.4S, v25.4S, v8.S[0] // ....................e....................................................... + sub v14.4S, v16.4S, v13.4S // ..........................e................................................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v9.4S, v31.4S, v8.S[0] // ...............e............................................................ + str q28, [x0, #512] // ........................................................................*... + sub v12.4S, v7.4S, v17.4S // ...................................................*........................ + // gap // ............................................................................ + mul v23.4S, v14.4S, v1.S[0] // ............................................e............................... + sqrdmulh v25.4S, v14.4S, v1.S[1] // ...........................................e................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v30.4S, v27.4S // .....................e...................................................... + str q12, [x0, #128] // .....................................................................*...... + mls v5.4S, v6.4S, v8.S[0] // ..........e................................................................. + // gap // ............................................................................ + sub v28.4S, v15.4S, v9.4S // ................e........................................................... + add v4.4S, v16.4S, v13.4S // ...........................e................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v19.4S, v1.S[0] // .......................................e.................................... + mls v23.4S, v25.4S, v8.S[0] // .............................................e.............................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v14.4S, v29.4S, v5.4S // ...........e................................................................ + sqrdmulh v24.4S, v19.4S, v1.S[1] // ......................................e..................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v12.4S, v15.4S, v9.4S // .................e.......................................................... + sqrdmulh v9.4S, v4.4S, v0.S[3] // .................................e.......................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v13.4S, v4.4S, v0.S[2] // ..................................e......................................... + sub v18.4S, v28.4S, v23.4S // ..............................................e............................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v22.4S, v24.4S, v8.S[0] // ........................................e................................... + add v25.4S, v28.4S, v23.4S // ...............................................e............................ + str q20, [x0, #768] // ..........................................................................*. + // gap // ............................................................................ + mul v15.4S, v18.4S, v3.S[0] // ................................................................e........... + sqrdmulh v23.4S, v18.4S, v3.S[1] // ...............................................................e............ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v28.4S, v25.4S, v2.S[3] // ..........................................................e................. + add v31.4S, v7.4S, v17.4S // ....................................................*....................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v18.4S, v11.4S, v10.4S // ...............................*............................................ 
+ mls v13.4S, v9.4S, v8.S[0] // ...................................e........................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v26.4S, v25.4S, v2.S[2] // ...........................................................e................ + mls v15.4S, v23.4S, v8.S[0] // .................................................................e.......... + // gap // ............................................................................ + // gap // ............................................................................ + add v11.4S, v18.4S, v21.4S // .........................................................*.................. + sub v24.4S, v14.4S, v22.4S // .........................................e.................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v7.4S, v12.4S, v13.4S // ....................................e....................................... + add v16.4S, v12.4S, v13.4S // .....................................e...................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v9.4S, v18.4S, v21.4S // ........................................................*................... + sub v4.4S, v24.4S, v15.4S // ..................................................................e......... + str q11, [x0, #256] // ......................................................................*..... + // gap // ............................................................................ + add v20.4S, v24.4S, v15.4S // ...................................................................e........ 
+ sqrdmulh v25.4S, v7.4S, v2.S[1] // .....................................................e...................... + str q31, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + str q4, [x0, #896] // ...........................................................................e + sqrdmulh v18.4S, v16.4S, v1.S[3] // ................................................e........................... + mul v21.4S, v7.4S, v2.S[0] // ......................................................e..................... + // gap // ............................................................................ + mul v17.4S, v16.4S, v1.S[2] // .................................................e.......................... + add v11.4S, v29.4S, v5.4S // ............e............................................................... + str q9, [x0, #368] // .......................................................................*.... + // gap // ............................................................................ + + // -------------------------------------------------------------------- new position ---------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q9, [x0, #0] // .............e..............................................................'............~.............................................................. + // ldr q10, [x0, #(1*(1024/8))] // ...................e........................................................'..................~........................................................ 
+ // ldr q11, [x0, #(2*(1024/8))] // ...........e................................................................'..........~................................................................ + // ldr q12, [x0, #(3*(1024/8))] // ......................e.....................................................'.....................~..................................................... + // ldr q13, [x0, #(4*(1024/8))] // ......e.....................................................................'.....~..................................................................... + // ldr q14, [x0, #(5*(1024/8))] // e...........................................................................~........................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .........e..................................................................'........~.................................................................. + // ldr q16, [x0, #(7*(1024/8))] // ...e........................................................................'..~........................................................................ + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ................e...........................................................'...............~........................................................... + // mul v24.4s, v13.4s, v0.s[0] // ..........................e.................................................'.........................~................................................. + // mls v24.4s, v27.4s, v8.s[0] // ......................................e.....................................'.....................................~..................................... + // sub v13.4s, v9.4s, v24.4s // ...........................................e................................'..........................................~................................ 
+ // add v9.4s, v9.4s, v24.4s // ..........................................................................e.'.........................................................................~. + // sqrdmulh v27.4s, v14.4s, v0.s[1] // .................e..........................................................'................~.......................................................... + // mul v24.4s, v14.4s, v0.s[0] // .........................e..................................................'........................~.................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...............................e............................................'..............................~............................................ + // sub v14.4s, v10.4s, v24.4s // .......................................e....................................'......................................~.................................... + // add v10.4s, v10.4s, v24.4s // .............................................e..............................'............................................~.............................. + // sqrdmulh v27.4s, v15.4s, v0.s[1] // ....................e.......................................................'...................~....................................................... + // mul v24.4s, v15.4s, v0.s[0] // .......................e....................................................'......................~.................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............................e..............................................'............................~.............................................. + // sub v15.4s, v11.4s, v24.4s // ....................................e.......................................'...................................~....................................... 
+ // add v11.4s, v11.4s, v24.4s // .~..........................................................................'*.......................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ...............e............................................................'..............~............................................................ + // mul v24.4s, v16.4s, v0.s[0] // ..............e.............................................................'.............~............................................................. + // mls v24.4s, v27.4s, v8.s[0] // ........................e...................................................'.......................~................................................... + // sub v16.4s, v12.4s, v24.4s // ..............................e.............................................'.............................~............................................. + // add v12.4s, v12.4s, v24.4s // ........................................e...................................'.......................................~................................... + // sqrdmulh v27.4s, v11.4s, v0.s[3] // ............~...............................................................'...........*............................................................... + // mul v24.4s, v11.4s, v0.s[2] // ........~...................................................................'.......*................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................~......................................................'....................*...................................................... + // sub v11.4s, v9.4s, v24.4s // ........................................................~...................'.......................................................*................... 
+ // add v9.4s, v9.4s, v24.4s // ............................~...............................................'...........................*............................................... + // sqrdmulh v27.4s, v12.4s, v0.s[3] // ..............................................e.............................'.............................................~............................. + // mul v24.4s, v12.4s, v0.s[2] // ...............................................e............................'..............................................~............................ + // mls v24.4s, v27.4s, v8.s[0] // .........................................................e..................'........................................................~.................. + // sub v12.4s, v10.4s, v24.4s // ..............................................................e.............'.............................................................~............. + // add v10.4s, v10.4s, v24.4s // ...............................................................e............'..............................................................~............ + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ............................................e...............................'...........................................~............................... + // mul v24.4s, v15.4s, v1.s[0] // .........................................e..................................'........................................~.................................. + // mls v24.4s, v27.4s, v8.s[0] // .................................................e..........................'................................................~.......................... + // sub v15.4s, v13.4s, v24.4s // .............................................................e..............'............................................................~.............. 
+ // add v13.4s, v13.4s, v24.4s // ....~.......................................................................'...*....................................................................... + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ...................................e........................................'..................................~........................................ + // mul v24.4s, v16.4s, v1.s[0] // ..................................e.........................................'.................................~......................................... + // mls v24.4s, v27.4s, v8.s[0] // ..........................................e.................................'.........................................~................................. + // sub v16.4s, v14.4s, v24.4s // ................................................e...........................'...............................................~........................... + // add v14.4s, v14.4s, v24.4s // ..................................................e.........................'.................................................~......................... + // sqrdmulh v27.4s, v10.4s, v1.s[3] // .......................................................................e....'......................................................................~.... + // mul v24.4s, v10.4s, v1.s[2] // .........................................................................e..'........................................................................~.. + // mls v24.4s, v27.4s, v8.s[0] // .......~....................................................................'......*.................................................................... + // sub v10.4s, v9.4s, v24.4s // .................................~..........................................'................................*.......................................... 
+ // add v9.4s, v9.4s, v24.4s // .......................................................~....................'......................................................*.................... + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ....................................................................e.......'...................................................................~....... + // mul v24.4s, v12.4s, v2.s[0] // ........................................................................e...'.......................................................................~... + // mls v24.4s, v27.4s, v8.s[0] // .....~......................................................................'....*...................................................................... + // sub v12.4s, v11.4s, v24.4s // ................................................................~...........'...............................................................*........... + // add v11.4s, v11.4s, v24.4s // ............................................................~...............'...........................................................*............... + // sqrdmulh v27.4s, v14.4s, v2.s[3] // ......................................................e.....................'.....................................................~..................... + // mul v24.4s, v14.4s, v2.s[2] // ..........................................................e.................'.........................................................~................. + // mls v24.4s, v27.4s, v8.s[0] // ..~.........................................................................'.*......................................................................... + // sub v14.4s, v13.4s, v24.4s // ..........~.................................................................'.........*................................................................. 
+ // add v13.4s, v13.4s, v24.4s // ...........................~................................................'..........................*................................................ + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .....................................................e......................'....................................................~...................... + // mul v24.4s, v16.4s, v3.s[0] // ....................................................e.......................'...................................................~....................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................e................'..........................................................~................ + // sub v16.4s, v15.4s, v24.4s // .................................................................e..........'................................................................~.......... + // add v15.4s, v15.4s, v24.4s // ...................................................................e........'..................................................................~........ + // str q9, [x0], #(16) // .....................................................................~......'....................................................................*...... + // str q10, [x0, #(-16 + 1*(1024/8))] // .....................................~......................................'....................................*...................................... + // str q11, [x0, #(-16 + 2*(1024/8))] // ..................................................................~.........'.................................................................*......... 
+ // str q12, [x0, #(-16 + 3*(1024/8))] // ...........................................................................~'..........................................................................* + // str q13, [x0, #(-16 + 4*(1024/8))] // ................................~...........................................'...............................*........................................... + // str q14, [x0, #(-16 + 5*(1024/8))] // ..................~.........................................................'.................*......................................................... + // str q15, [x0, #(-16 + 6*(1024/8))] // ...................................................~........................'..................................................*........................ + // str q16, [x0, #(-16 + 7*(1024/8))] // ......................................................................e.....'.....................................................................~..... sub count, count, #1 cbnz count, layer123_start - add v30.4S, v10.4S, v29.4S // ......................................*....................................... - ldr x21, [x0, #904] // .*............................................................................ - ldr x13, [x0, #768] // ..*........................................................................... - ins v16.d[1], x29 // *............................................................................. - ldr x8, [x0, #776] // ....*......................................................................... - ins v17.d[0], x28 // ...*.......................................................................... - // gap // .............................................................................. - // gap // .............................................................................. - sub v13.4S, v16.4S, v20.4S // ......*....................................................................... 
- ldr x10, [x0, #256] // .........*.................................................................... - // gap // .............................................................................. - // gap // .............................................................................. - ins v17.d[1], x21 // ..........*................................................................... - ldr x17, [x0, #264] // ............*................................................................. - ins v25.d[0], x13 // ........*..................................................................... - // gap // .............................................................................. - ldr x22, [x0, #392] // .....*........................................................................ - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mul v12.4S, v17.4S, v0.S[0] // ..............*............................................................... - ins v25.d[1], x8 // .............*................................................................ - ldr x15, [x0, #384] // ...........*.................................................................. - // gap // .............................................................................. - sqrdmulh v18.4S, v17.4S, v0.S[1] // ...............*.............................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v19.4S, v25.4S, v0.S[1] // ................*............................................................. 
- mul v21.4S, v25.4S, v0.S[0] // .................*............................................................ - // gap // .............................................................................. - // gap // .............................................................................. - ins v24.d[0], x10 // ...................*.......................................................... - ins v23.d[0], x15 // ..................*........................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v12.4S, v18.4S, v8.S[0] // ....................*......................................................... - mls v21.4S, v19.4S, v8.S[0] // .....................*........................................................ - // gap // .............................................................................. - // gap // .............................................................................. - ins v23.d[1], x22 // .......................*...................................................... - ins v24.d[1], x17 // ......................*....................................................... - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - add v9.4S, v23.4S, v12.4S // ..........................*................................................... - add v18.4S, v24.4S, v21.4S // ...........................*.................................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v6.4S, v23.4S, v12.4S // ........................*..................................................... - sub v14.4S, v24.4S, v21.4S // .........................*.................................................... - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v5.4S, v18.4S, v0.S[3] // .................................*............................................ - mul v22.4S, v18.4S, v0.S[2] // ..................................*........................................... - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v26.4S, v9.4S, v0.S[3] // ..............................*............................................... - mul v28.4S, v9.4S, v0.S[2] // ...............................*.............................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- sqrdmulh v19.4S, v6.4S, v1.S[1] // ............................*................................................. - mul v27.4S, v6.4S, v1.S[0] // .............................*................................................ - // gap // .............................................................................. - // gap // .............................................................................. - mls v22.4S, v5.4S, v8.S[0] // ........................................*..................................... - sub v11.4S, v10.4S, v29.4S // .......................................*...................................... - // gap // .............................................................................. - // gap // .............................................................................. - mls v28.4S, v26.4S, v8.S[0] // .....................................*........................................ - sqrdmulh v29.4S, v14.4S, v1.S[1] // ....................................*......................................... - // gap // .............................................................................. - // gap // .............................................................................. - mls v27.4S, v19.4S, v8.S[0] // ...................................*.......................................... - add v7.4S, v16.4S, v20.4S // .......*...................................................................... - // gap // .............................................................................. - // gap // .............................................................................. - mul v15.4S, v14.4S, v1.S[0] // ................................*............................................. - // gap // .............................................................................. - // gap // .............................................................................. 
- // gap // .............................................................................. - add v25.4S, v7.4S, v28.4S // ................................................*............................. - add v16.4S, v30.4S, v22.4S // .............................................................*................ - // gap // .............................................................................. - // gap // .............................................................................. - add v4.4S, v13.4S, v27.4S // .......................................................*...................... - sub v26.4S, v13.4S, v27.4S // .........................................*.................................... - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v18.4S, v25.4S, v1.S[3] // .....................................................*........................ - mul v27.4S, v25.4S, v1.S[2] // ..........................................................*................... - // gap // .............................................................................. - // gap // .............................................................................. - sqrdmulh v19.4S, v4.4S, v2.S[3] // ...........................................................*.................. - sqrdmulh v31.4S, v26.4S, v3.S[1] // ............................................*................................. - // gap // .............................................................................. - // gap // .............................................................................. - sub v17.4S, v7.4S, v28.4S // ...........................................*.................................. - mul v10.4S, v4.4S, v2.S[2] // ............................................................*................. 
- // gap // .............................................................................. - // gap // .............................................................................. - mls v15.4S, v29.4S, v8.S[0] // ..........................................*................................... - mls v27.4S, v18.4S, v8.S[0] // ..............................................................*............... - // gap // .............................................................................. - // gap // .............................................................................. - mul v14.4S, v26.4S, v3.S[0] // .............................................*................................ - mul v6.4S, v17.4S, v2.S[0] // ...............................................*.............................. - // gap // .............................................................................. - // gap // .............................................................................. - mls v10.4S, v19.4S, v8.S[0] // .................................................................*............ - sqrdmulh v5.4S, v17.4S, v2.S[1] // ..............................................*............................... - // gap // .............................................................................. - // gap // .............................................................................. - add v17.4S, v16.4S, v27.4S // ..........................................................................*... - add v20.4S, v11.4S, v15.4S // ...............................................................*.............. - // gap // .............................................................................. - // gap // .............................................................................. - sub v26.4S, v16.4S, v27.4S // ..................................................................*........... 
- mls v14.4S, v31.4S, v8.S[0] // .................................................*............................ - // gap // .............................................................................. - // gap // .............................................................................. - sub v25.4S, v11.4S, v15.4S // ...................................................*.......................... - str q17, [x0], #(16) // .............................................................................* - add v31.4S, v20.4S, v10.4S // ....................................................................*......... - // gap // .............................................................................. - mls v6.4S, v5.4S, v8.S[0] // ..................................................*........................... - sub v13.4S, v20.4S, v10.4S // .......................................................................*...... - str q26, [x0, #112] // .....................................................................*........ - // gap // .............................................................................. - sub v29.4S, v30.4S, v22.4S // ....................................................*......................... - str q31, [x0, #496] // .........................................................................*.... - sub v31.4S, v25.4S, v14.4S // ......................................................*....................... - // gap // .............................................................................. - add v23.4S, v25.4S, v14.4S // ...................................................................*.......... - str q13, [x0, #624] // ............................................................................*. - // gap // .............................................................................. - // gap // .............................................................................. 
- str q31, [x0, #880] // ........................................................*..................... - add v24.4S, v29.4S, v6.4S // .........................................................*.................... - // gap // .............................................................................. - // gap // .............................................................................. - sub v30.4S, v29.4S, v6.4S // ........................................................................*..... - str q23, [x0, #752] // ......................................................................*....... - // gap // .............................................................................. - // gap // .............................................................................. - str q24, [x0, #240] // ................................................................*............. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - str q30, [x0, #368] // ...........................................................................*.. - // gap // .............................................................................. - // gap // .............................................................................. - // gap // .............................................................................. - - // original source code - // ins v16.d[1], x29 // ...*.......................................................................... - // ldr x24, [x0, #904] // .*............................................................................ - // ldr x23, [x0, #768] // ..*........................................................................... - // ins v21.d[0], x28 // .....*........................................................................ 
- // ldr x16, [x0, #776] // ....*......................................................................... - // ldr x19, [x0, #392] // ...........*.................................................................. - // sub v26.4S, v16.4S, v20.4S // ......*....................................................................... - // add v28.4S, v16.4S, v20.4S // .......................................*...................................... - // ins v15.d[0], x23 // ..........*................................................................... - // ldr x7, [x0, #256] // .......*...................................................................... - // ins v21.d[1], x24 // ........*..................................................................... - // ldr x11, [x0, #384] // ..............*............................................................... - // ldr x22, [x0, #264] // .........*.................................................................... - // ins v15.d[1], x16 // .............*................................................................ - // mul v23.4S, v21.4S, v0.S[0] // ............*................................................................. - // sqrdmulh v13.4S, v21.4S, v0.S[1] // ...............*.............................................................. - // sqrdmulh v22.4S, v15.4S, v0.S[1] // ................*............................................................. - // mul v9.4S, v15.4S, v0.S[0] // .................*............................................................ - // ins v16.d[0], x11 // ...................*.......................................................... - // ins v4.d[0], x7 // ..................*........................................................... - // mls v23.4S, v13.4S, v8.S[0] // ....................*......................................................... - // mls v9.4S, v22.4S, v8.S[0] // .....................*........................................................ 
- // ins v4.d[1], x22 // .......................*...................................................... - // ins v16.d[1], x19 // ......................*....................................................... - // sub v18.4S, v16.4S, v23.4S // ..........................*................................................... - // sub v27.4S, v4.4S, v9.4S // ...........................*.................................................. - // add v31.4S, v16.4S, v23.4S // ........................*..................................................... - // add v19.4S, v4.4S, v9.4S // .........................*.................................................... - // sqrdmulh v30.4S, v18.4S, v1.S[1] // ................................*............................................. - // mul v4.4S, v18.4S, v1.S[0] // .................................*............................................ - // sqrdmulh v21.4S, v31.4S, v0.S[3] // ..............................*............................................... - // mul v17.4S, v31.4S, v0.S[2] // ...............................*.............................................. - // mul v11.4S, v27.4S, v1.S[0] // ........................................*..................................... - // sqrdmulh v31.4S, v19.4S, v0.S[3] // ............................*................................................. - // mul v16.4S, v19.4S, v0.S[2] // .............................*................................................ - // mls v4.4S, v30.4S, v8.S[0] // ......................................*....................................... - // sqrdmulh v25.4S, v27.4S, v1.S[1] // .....................................*........................................ - // mls v17.4S, v21.4S, v8.S[0] // ....................................*......................................... - // add v18.4S, v10.4S, v29.4S // *............................................................................. 
- // sub v29.4S, v10.4S, v29.4S // ...................................*.......................................... - // mls v16.4S, v31.4S, v8.S[0] // ..................................*........................................... - // sub v12.4S, v26.4S, v4.4S // ............................................*................................. - // mls v11.4S, v25.4S, v8.S[0] // ...................................................*.......................... - // sub v31.4S, v28.4S, v17.4S // .................................................*............................ - // sqrdmulh v20.4S, v12.4S, v3.S[1] // ................................................*............................. - // mul v24.4S, v12.4S, v3.S[0] // .....................................................*........................ - // sqrdmulh v5.4S, v31.4S, v2.S[1] // ........................................................*..................... - // mul v9.4S, v31.4S, v2.S[0] // ......................................................*....................... - // add v13.4S, v28.4S, v17.4S // .........................................*.................................... - // mls v24.4S, v20.4S, v8.S[0] // ............................................................*................. - // mls v9.4S, v5.4S, v8.S[0] // ................................................................*............. - // sub v20.4S, v29.4S, v11.4S // .............................................................*................ - // sub v27.4S, v18.4S, v16.4S // ...................................................................*.......... - // sqrdmulh v22.4S, v13.4S, v1.S[3] // .............................................*................................ - // sub v5.4S, v20.4S, v24.4S // .....................................................................*........ - // add v30.4S, v26.4S, v4.4S // ...........................................*.................................. 
- // str q5, [x0, #896] // ........................................................................*..... - // add v14.4S, v27.4S, v9.4S // .........................................................................*.... - // mul v17.4S, v13.4S, v1.S[2] // ..............................................*............................... - // sqrdmulh v13.4S, v30.4S, v2.S[3] // ...............................................*.............................. - // mul v21.4S, v30.4S, v2.S[2] // ..................................................*........................... - // add v16.4S, v18.4S, v16.4S // ..........................................*................................... - // mls v17.4S, v22.4S, v8.S[0] // ....................................................*......................... - // add v18.4S, v29.4S, v11.4S // ..........................................................*................... - // str q14, [x0, #256] // ............................................................................*. - // mls v21.4S, v13.4S, v8.S[0] // .......................................................*...................... - // sub v7.4S, v16.4S, v17.4S // ...........................................................*.................. - // add v12.4S, v20.4S, v24.4S // ......................................................................*....... - // add v5.4S, v18.4S, v21.4S // ...............................................................*.............. - // str q7, [x0, #128] // ..................................................................*........... - // str q12, [x0, #768] // ...........................................................................*.. - // sub v4.4S, v18.4S, v21.4S // .................................................................*............ - // sub v12.4S, v27.4S, v9.4S // ..........................................................................*... 
- // str q5, [x0, #512] // ....................................................................*......... - // add v9.4S, v16.4S, v17.4S // .........................................................*.................... - // str q12, [x0, #384] // .............................................................................* - // str q4, [x0, #640] // .......................................................................*...... - // str q9, [x0], #(16) // ..............................................................*............... + // Instructions: 23 + // Expected cycles: 16 + // Expected IPC: 1.44 + // + // Wall time: 0.12s + // User time: 0.12s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mls v17.4S, v18.4S, v8.S[0] // ....*......................... + add v9.4S, v30.4S, v27.4S // *............................. + str q20, [x0, #768] // ...............*.............. + // gap // .............................. + mls v26.4S, v28.4S, v8.S[0] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v31.4S, v9.4S, v0.S[2] // .....*........................ + sqrdmulh v12.4S, v9.4S, v0.S[3] // .......*...................... + // gap // .............................. + // gap // .............................. + mls v21.4S, v25.4S, v8.S[0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v9.4S, v14.4S, v22.4S // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v31.4S, v12.4S, v8.S[0] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ sub v15.4S, v9.4S, v26.4S // ......*....................... + add v10.4S, v9.4S, v26.4S // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v29.4S, v11.4S, v31.4S // ...........*.................. + sub v27.4S, v11.4S, v31.4S // .................*............ + str q15, [x0, #640] // ........*..................... + // gap // .............................. + str q10, [x0, #512] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v5.4S, v29.4S, v17.4S // .............*................ + add v19.4S, v29.4S, v17.4S // ................*............. + // gap // .............................. + // gap // .............................. + add v9.4S, v27.4S, v21.4S // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q5, [x0, #128] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q9, [x0, #256] // ....................*......... + sub v9.4S, v27.4S, v21.4S // ...................*.......... + // gap // .............................. + // gap // .............................. + str q19, [x0], #(16) // .....................*........ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q9, [x0, #368] // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ + // -------- new position --------> + // 0 25 + // |------------------------|----- + // add v9.4S, v30.4S, v27.4S // .*............................. + // mls v26.4S, v28.4S, v8.S[0] // ...*........................... + // add v4.4S, v14.4S, v22.4S // .......*....................... + // mls v21.4S, v25.4S, v8.S[0] // ......*........................ + // mls v17.4S, v18.4S, v8.S[0] // *.............................. + // mul v10.4S, v9.4S, v0.S[2] // ....*.......................... + // sub v25.4S, v4.4S, v26.4S // .........*..................... + // sqrdmulh v22.4S, v9.4S, v0.S[3] // .....*......................... + // str q25, [x0, #640] // .............*................. + // mls v10.4S, v22.4S, v8.S[0] // ........*...................... + // add v28.4S, v4.4S, v26.4S // ..........*.................... + // add v7.4S, v11.4S, v10.4S // ...........*................... + // str q28, [x0, #512] // ..............*................ + // sub v12.4S, v7.4S, v17.4S // ...............*............... + // str q12, [x0, #128] // ..................*............ + // str q20, [x0, #768] // ..*............................ + // add v31.4S, v7.4S, v17.4S // ................*.............. + // sub v18.4S, v11.4S, v10.4S // ............*.................. + // add v11.4S, v18.4S, v21.4S // .................*............. + // sub v9.4S, v18.4S, v21.4S // ....................*.......... + // str q11, [x0, #256] // ...................*........... + // str q31, [x0], #(16) // .....................*......... + // str q9, [x0, #368] // ......................*........ restore inp, STACK0 @@ -1012,1426 +870,854 @@ layer123_start: sub inpp, inpp, #64 .p2align 2 - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... 
- ldr x7, [x4], #64 // ...........*........................................................................................................... - ldr x20, [x2, #96] // .............................................*......................................................................... - ldr x26, [x2, #112] // ........................*.............................................................................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x29, [x4, #-56] // ..................*.................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x11, [x2, #120] // ......................*................................................................................................ - ldr x19, [x2, #104] // .................................*..................................................................................... - ldr x0, [x2, #80] // .....................*................................................................................................. - ldr x25, [x4, #-24] // ...*................................................................................................................... - ins v26.d[0], x7 // .................*..................................................................................................... - ins v21.d[0], x20 // ...............................................*....................................................................... 
- ldr x8, [x1, #104] // .*..................................................................................................................... - ldr x7, [x5, #24] // ..*.................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x6, [x1, #80] // ..............*........................................................................................................ - ldr x23, [x4, #-32] // *...................................................................................................................... - ins v26.d[1], x29 // ................................*...................................................................................... - ins v21.d[1], x19 // .....................................................*................................................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x24, [x1, #112] // ................*...................................................................................................... - ldr x21, [x2, #88] // .......................*............................................................................................... - ldr x22, [x5, #64] // ........*.............................................................................................................. - ldr x16, [x5, #16] // ..........*............................................................................................................ 
- ins v16.d[0], x0 // .........................*............................................................................................. - mul v4.4S, v21.4S, v26.S[0] // ......................................................................*................................................ - ins v23.d[0], x26 // ..................................*.................................................................................... - ldr x26, [x4, #-48] // ......*................................................................................................................ - ldr x20, [x5, #104] // .............*......................................................................................................... - // gap // ....................................................................................................................... - ldr x14, [x5, #72] // ...............*....................................................................................................... - ldr x29, [x5, #160] // ....................*.................................................................................................. - // gap // ....................................................................................................................... - ins v16.d[1], x21 // ............................*.......................................................................................... - ins v23.d[1], x11 // .......................................*............................................................................... - ldr x13, [x5, #48] // ..........................*............................................................................................ - ldr x10, [x1, #72] // ..............................*........................................................................................ - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - ldr x27, [x1, #64] // ...............................*....................................................................................... - ldr x17, [x2, #64] // ......................................*................................................................................ - mul v9.4S, v16.4S, v26.S[0] // ..............................................*........................................................................ - ldr x28, [x1, #88] // .........................................*............................................................................. - ldr x21, [x1, #96] // .............................*......................................................................................... - mul v15.4S, v23.4S, v26.S[0] // ................................................*...................................................................... - sqrdmulh v3.4S, v23.4S, v26.S[1] // ............................................*.......................................................................... - sqrdmulh v20.4S, v16.4S, v26.S[1] // ....................................................*.................................................................. - ldr x0, [x1, #120] // .........*............................................................................................................. - ldr x11, [x5, #112] // ....*.................................................................................................................. - sqrdmulh v16.4S, v21.4S, v26.S[1] // .......................................................*............................................................... - ldr x15, [x5, #96] // .....................................*................................................................................. 
- ins v19.d[0], x6 // ...................*................................................................................................... - ins v5.d[0], x26 // ...........................................................*........................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x9, [x5], #(12*16) // ....................................................................................*.................................. - ldr x19, [x4, #-40] // ..................................................*.................................................................... - mls v9.4S, v20.4S, v8.S[0] // ..........................................................*............................................................ - ldr x6, [x2, #72] // ........................................................................*.............................................. - ins v20.d[0], x17 // ........................................*.............................................................................. - ins v19.d[1], x28 // ..........................................*............................................................................ - // gap // ....................................................................................................................... - ldr x12, [x5, #-184] // ................................................................................*...................................... 
- ldr x28, [x5, #-72] // .....................................................................................*................................. - ldr x26, [x5, #-16] // ...........................................*........................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v25.d[0], x9 // .......................................................................................................*............... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v20.d[1], x6 // ...........................................................................*........................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v25.d[1], x12 // ............................................................................................................*.......... - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v18.d[0], x27 // ..................................................................*.................................................... - ins v13.d[0], x24 // ...................................*................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v27.d[0], x23 // .....*................................................................................................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v21.d[0], x11 // ................................................................................................................*...... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - mls v15.4S, v3.4S, v8.S[0] // ...................................................*................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v13.d[1], x0 // ....................................*.................................................................................. - ins v21.d[1], x28 // ....................................................................................................................*.. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - add v0.4S, v13.4S, v15.4S // ......................................................*................................................................ 
- mls v4.4S, v16.4S, v8.S[0] // .........................................................................*............................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v16.4S, v20.4S, v26.S[1] // ............................................................................*.......................................... - ins v31.d[0], x21 // .................................................*..................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - mul v23.4S, v0.4S, v26.S[2] // ........................................................*.............................................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v0.4S, v0.4S, v26.S[3] // .........................................................*............................................................. - ins v31.d[1], x8 // ............................................................*.......................................................... - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - ins v5.d[1], x19 // ..............................................................*........................................................ - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - add v3.4S, v31.4S, v4.4S // .............................................................................*......................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ldr x8, [x5, #-48] // .........................................................................................................*............. - mls v23.4S, v0.4S, v8.S[0] // .............................................................*......................................................... - add v14.4S, v19.4S, v9.4S // ...............................................................*....................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v2.4S, v3.4S, v26.S[3] // ...............................................................................*....................................... 
- mul v28.4S, v3.4S, v26.S[2] // .......................................................................................*............................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v27.d[1], x25 // .......*............................................................................................................... - mul v3.4S, v20.4S, v26.S[0] // ...................................................................................*................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sub v0.4S, v14.4S, v23.4S // .....................................................................*................................................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - add v29.4S, v14.4S, v23.4S // .................................................................*..................................................... - mls v28.4S, v2.4S, v8.S[0] // ...........................................................................................*........................... - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - sqrdmulh v20.4S, v0.4S, v27.S[1] // ..........................................................................*............................................ - mul v0.4S, v0.4S, v27.S[0] // ..............................................................................*........................................ - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v17.4S, v29.4S, v5.S[3] // ....................................................................*.................................................. - mls v3.4S, v16.4S, v8.S[0] // ........................................................................................*.............................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - mul v11.4S, v29.4S, v5.S[2] // ...................................................................*................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - ins v18.d[1], x10 // ..................................................................................*.................................... - mls v0.4S, v20.4S, v8.S[0] // .................................................................................*..................................... 
- // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - add v29.4S, v18.4S, v3.4S // .............................................................................................*......................... - // gap // ....................................................................................................................... - sub v30.4S, v19.4S, v9.4S // ................................................................*...................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - mls v11.4S, v17.4S, v8.S[0] // .......................................................................*............................................... - sub v6.4S, v13.4S, v15.4S // ......................................................................................*................................ - sub v16.4S, v29.4S, v28.4S // ................................................................................................*...................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... 
- add v15.4S, v29.4S, v28.4S // .................................................................................................*..................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v9.4S, v6.4S, v5.S[1] // ..........................................................................................*............................ - mul v29.4S, v6.4S, v5.S[0] // .........................................................................................*............................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sub v1.4S, v16.4S, v0.4S // ..................................................................................................*.................... - add v16.4S, v16.4S, v0.4S // ...................................................................................................*................... - // gap // ....................................................................................................................... - sub v0.4S, v15.4S, v11.4S // ....................................................................................................*.................. - add v20.4S, v15.4S, v11.4S // .....................................................................................................*................. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... 
- // gap // ....................................................................................................................... - mls v29.4S, v9.4S, v8.S[0] // ..............................................................................................*........................ - trn2 v9.4S, v16.4S, v1.4S // ......................................................................................................*................ - ins v23.d[0], x16 // ............................................................................................*.......................... - // gap // ....................................................................................................................... - trn2 v15.4S, v20.4S, v0.4S // ........................................................................................................*.............. - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - trn1 v10.4S, v16.4S, v1.4S // .............................................................................................................*......... - ins v23.d[1], x7 // ...............................................................................................*....................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... 
- trn2 v16.2D, v15.2D, v9.2D // ...........................................................................................................*........... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - add v26.4S, v30.4S, v29.4S // .................................................................................................................*..... - mul v11.4S, v16.4S, v25.4S // ..............................................................................................................*........ - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - sqrdmulh v16.4S, v16.4S, v23.4S // ...............................................................................................................*....... - trn1 v28.4S, v20.4S, v0.4S // ..........................................................................................................*............ - ins v17.d[0], x22 // ............*.......................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - trn1 v13.2D, v15.2D, v9.2D // .....................................................................................................................*. 
- // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - mul v14.4S, v26.4S, v27.S[2] // ......................................................................................................................* - // gap // ....................................................................................................................... - // gap // ....................................................................................................................... - mls v11.4S, v16.4S, v8.S[0] // ...................................................................................................................*... - ins v6.d[0], x15 // ..................................................................................................................*.... - // gap // ....................................................................................................................... - ins v17.d[1], x14 // ...........................*........................................................................................... - // gap // ....................................................................................................................... - - // original source code - // ldr x16, [x4, #32] // .............*......................................................................................................... - // ldr x22, [x1, #104] // ..........*............................................................................................................ - // ldr x21, [x5, #24] // ...........*........................................................................................................... 
- // ldr x19, [x4, #40] // .......*............................................................................................................... - // ldr x14, [x5, #112] // ........................................*.............................................................................. - // ins v27.d[0], x16 // ...........................................................*........................................................... - // ldr x11, [x4, #16] // .......................*............................................................................................... - // ins v27.d[1], x19 // ..............................................................................*........................................ - // ldr x20, [x5, #64] // ..................*.................................................................................................... - // ldr x24, [x1, #120] // .......................................*............................................................................... - // ldr x16, [x5, #16] // ...................*................................................................................................... - // ldr x28, [x4], #64 // *...................................................................................................................... - // ins v17.d[0], x20 // .................................................................................................................*..... - // ldr x20, [x5, #104] // ........................*.............................................................................................. - // ldr x25, [x1, #80] // ............*.......................................................................................................... - // ldr x17, [x5, #72] // .........................*............................................................................................. 
- // ldr x8, [x1, #112] // ................*...................................................................................................... - // ins v23.d[0], x28 // ........*.............................................................................................................. - // ldr x28, [x4, #-56] // ...*................................................................................................................... - // ins v20.d[0], x25 // ...........................................*........................................................................... - // ldr x29, [x5, #160] // ..........................*............................................................................................ - // ldr x6, [x2, #80] // ......*................................................................................................................ - // ldr x15, [x2, #120] // ....*.................................................................................................................. - // ldr x9, [x2, #88] // .................*..................................................................................................... - // ldr x0, [x2, #112] // ..*.................................................................................................................... - // ins v22.d[0], x6 // ....................*.................................................................................................. - // ldr x13, [x5, #48] // .............................*......................................................................................... - // ins v17.d[1], x17 // ......................................................................................................................* - // ins v22.d[1], x9 // ...........................*........................................................................................... 
- // ldr x12, [x1, #96] // ...................................*................................................................................... - // ldr x9, [x1, #72] // ..............................*........................................................................................ - // ldr x25, [x1, #64] // ...............................*....................................................................................... - // ins v23.d[1], x28 // ..............*........................................................................................................ - // ldr x28, [x2, #104] // .....*................................................................................................................. - // ins v26.d[0], x0 // ......................*................................................................................................ - // ins v13.d[0], x8 // ..........................................................*............................................................ - // ins v13.d[1], x24 // ..............................................................*........................................................ - // ldr x24, [x5, #96] // ..........................................*............................................................................ - // ldr x8, [x2, #64] // ................................*...................................................................................... - // ins v26.d[1], x15 // ............................*.......................................................................................... - // ins v25.d[0], x8 // .................................................*..................................................................... - // ldr x8, [x1, #88] // ..................................*.................................................................................... 
- // ins v20.d[1], x8 // ..................................................*.................................................................... - // ldr x26, [x5, #176] // .....................................................*................................................................. - // sqrdmulh v12.4S, v26.4S, v23.S[1] // .....................................*................................................................................. - // ldr x8, [x2, #96] // .*..................................................................................................................... - // mul v15.4S, v22.4S, v23.S[0] // .................................*..................................................................................... - // ins v11.d[0], x8 // .........*............................................................................................................. - // mul v16.4S, v26.4S, v23.S[0] // ....................................*.................................................................................. - // ins v31.d[0], x12 // ...................................................................*................................................... - // ldr x8, [x4, #-40] // ..............................................*........................................................................ - // mls v16.4S, v12.4S, v8.S[0] // .............................................................*......................................................... - // sqrdmulh v19.4S, v22.4S, v23.S[1] // ......................................*................................................................................ - // ins v11.d[1], x28 // ...............*....................................................................................................... - // add v3.4S, v13.4S, v16.4S // ................................................................*...................................................... 
- // sqrdmulh v9.4S, v11.4S, v23.S[1] // .........................................*............................................................................. - // mul v26.4S, v3.4S, v23.S[2] // ....................................................................*.................................................. - // sqrdmulh v3.4S, v3.4S, v23.S[3] // .....................................................................*................................................. - // mls v15.4S, v19.4S, v8.S[0] // ...............................................*....................................................................... - // ins v5.d[0], x11 // ............................................*.......................................................................... - // ins v31.d[1], x22 // ......................................................................*................................................ - // mls v26.4S, v3.4S, v8.S[0] // ..........................................................................*............................................ - // ins v5.d[1], x8 // .......................................................................*............................................... - // add v19.4S, v20.4S, v15.4S // ...........................................................................*........................................... - // sub v30.4S, v20.4S, v15.4S // ...........................................................................................*........................... - // add v22.4S, v19.4S, v26.4S // .................................................................................*..................................... - // ins v18.d[0], x25 // .........................................................*............................................................. - // mul v14.4S, v22.4S, v5.S[2] // .......................................................................................*............................... 
- // sqrdmulh v6.4S, v22.4S, v5.S[3] // .....................................................................................*................................. - // sub v29.4S, v19.4S, v26.4S // ................................................................................*...................................... - // mul v4.4S, v11.4S, v23.S[0] // .....................*................................................................................................. - // mls v14.4S, v6.4S, v8.S[0] // ............................................................................................*.......................... - // ldr x11, [x2, #72] // ................................................*...................................................................... - // mls v4.4S, v9.4S, v8.S[0] // .................................................................*..................................................... - // sqrdmulh v22.4S, v29.4S, v27.S[1] // ...................................................................................*................................... - // ins v25.d[1], x11 // .......................................................*............................................................... - // sqrdmulh v20.4S, v25.4S, v23.S[1] // ..................................................................*.................................................... - // add v12.4S, v31.4S, v4.4S // ........................................................................*.............................................. - // mul v0.4S, v29.4S, v27.S[0] // ....................................................................................*.................................. - // sqrdmulh v21.4S, v12.4S, v23.S[3] // ............................................................................*.......................................... 
- // ldr x0, [x5, #8] // ...................................................*................................................................... - // mls v0.4S, v22.4S, v8.S[0] // .........................................................................................*............................. - // ins v18.d[1], x9 // ........................................................................................*.............................. - // mul v3.4S, v25.4S, v23.S[0] // ...............................................................................*....................................... - // ldr x8, [x5], #(12*16) // .............................................*......................................................................... - // ldr x10, [x5, #-72] // ....................................................*.................................................................. - // sub v6.4S, v13.4S, v16.4S // .............................................................................................*......................... - // mul v28.4S, v12.4S, v23.S[2] // .............................................................................*......................................... - // mls v3.4S, v20.4S, v8.S[0] // ......................................................................................*................................ - // mul v29.4S, v6.4S, v5.S[0] // .................................................................................................*..................... - // sqrdmulh v15.4S, v6.4S, v5.S[1] // ................................................................................................*...................... - // mls v28.4S, v21.4S, v8.S[0] // ..................................................................................*.................................... - // ins v23.d[0], x16 // ........................................................................................................*.............. 
- // add v10.4S, v18.4S, v3.4S // ..........................................................................................*............................ - // mls v29.4S, v15.4S, v8.S[0] // ......................................................................................................*................ - // ins v23.d[1], x21 // ...........................................................................................................*........... - // sub v12.4S, v10.4S, v28.4S // ..............................................................................................*........................ - // add v25.4S, v10.4S, v28.4S // ...............................................................................................*....................... - // sub v22.4S, v12.4S, v0.4S // ..................................................................................................*.................... - // add v26.4S, v12.4S, v0.4S // ...................................................................................................*................... - // sub v1.4S, v25.4S, v14.4S // ....................................................................................................*.................. - // add v13.4S, v25.4S, v14.4S // .....................................................................................................*................. - // trn2 v14.4S, v26.4S, v22.4S // .......................................................................................................*............... - // ins v25.d[0], x8 // ......................................................*................................................................ - // trn2 v12.4S, v13.4S, v1.4S // .........................................................................................................*............. - // ldr x8, [x5, #-48] // .........................................................................*............................................. 
- // trn1 v28.4S, v13.4S, v1.4S // ................................................................................................................*...... - // trn2 v19.2D, v12.2D, v14.2D // ............................................................................................................*.......... - // ins v25.d[1], x0 // ........................................................*.............................................................. - // trn1 v10.4S, v26.4S, v22.4S // ..........................................................................................................*............ - // mul v11.4S, v19.4S, v25.4S // ..............................................................................................................*........ - // sqrdmulh v7.4S, v19.4S, v23.4S // ...............................................................................................................*....... - // ins v21.d[0], x14 // ............................................................*.......................................................... - // add v26.4S, v30.4S, v29.4S // .............................................................................................................*......... - // ins v6.d[0], x24 // .....................................................................................................................*. - // mls v11.4S, v7.4S, v8.S[0] // ....................................................................................................................*.. - // ins v21.d[1], x10 // ...............................................................*....................................................... - // trn1 v13.2D, v12.2D, v14.2D // ..................................................................................................................*.... - // mul v14.4S, v26.4S, v27.S[2] // ...................................................................................................................*... 
+ // Instructions: 106 + // Expected cycles: 47 + // Expected IPC: 2.26 + // + // Wall time: 55.43s + // User time: 55.43s + // + // ------------------------------------------- original position -------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----- + ldr q22, [x4], #64 // ..*....................................................................................................... + ldr q12, [x2, #112] // *......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + ldr q14, [x1, #64] // .........*................................................................................................ + // gap // .......................................................................................................... + ldr q7, [x2, #96] // ....*..................................................................................................... + // gap // .......................................................................................................... + ldr q27, [x2, #64] // .....*.................................................................................................... + ldr q13, [x2, #80] // .*........................................................................................................ + add x2, x2, #64 // ..................................................................................................*....... + // gap // .......................................................................................................... 
+ ldr q26, [x1, #112] // ......*................................................................................................... + ldr q28, [x1, #96] // ...*...................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + ldr q25, [x5], #(12*16) // ....................................................................................*..................... + sqrdmulh v4.4S, v12.4S, v22.S[1] // ........*................................................................................................. + // gap // .......................................................................................................... + mul v29.4S, v12.4S, v22.S[0] // .......*.................................................................................................. + sqrdmulh v10.4S, v7.4S, v22.S[1] // ..........*............................................................................................... + mul v16.4S, v7.4S, v22.S[0] // ................*......................................................................................... + // gap // .......................................................................................................... + ldr q0, [x4, #-48] // ................................*......................................................................... + ldr q23, [x4, #-16] // ...............................*.......................................................................... + sqrdmulh v9.4S, v13.4S, v22.S[1] // .................*........................................................................................ + mul v13.4S, v13.4S, v22.S[0] // ...................*...................................................................................... 
+ // gap // .......................................................................................................... + mul v15.4S, v27.4S, v22.S[0] // .............*............................................................................................ + ldr q1, [x1, #80] // ............*............................................................................................. + // gap // .......................................................................................................... + mls v29.4S, v4.4S, v8.S[0] // ..............*........................................................................................... + // gap // .......................................................................................................... + ldr q19, [x5, #-96] // ...................................................................*...................................... + mls v16.4S, v10.4S, v8.S[0] // ......................*................................................................................... + sqrdmulh v11.4S, v27.4S, v22.S[1] // ...............*.......................................................................................... + // gap // .......................................................................................................... + ldr q7, [x5, #-176] // ..........................................................*............................................... + add x1, x1, #64 // ..........................................................................................*............... + mls v13.4S, v9.4S, v8.S[0] // .........................*................................................................................ + sub v2.4S, v26.4S, v29.4S // .....................................*.................................................................... + // gap // .......................................................................................................... 
+ add v3.4S, v26.4S, v29.4S // ..................*....................................................................................... + // gap // .......................................................................................................... + sub v29.4S, v28.4S, v16.4S // ...........................*.............................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v26.4S, v28.4S, v16.4S // .............................*............................................................................ + sqrdmulh v9.4S, v3.4S, v22.S[3] // ........................*................................................................................. + mul v27.4S, v3.4S, v22.S[2] // .......................*.................................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v3.4S, v2.4S, v0.S[0] // ...........................................*.............................................................. + // gap // .......................................................................................................... + sqrdmulh v10.4S, v2.4S, v0.S[1] // ..............................................*........................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ // gap // .......................................................................................................... + sqrdmulh v16.4S, v26.4S, v22.S[3] // .................................*........................................................................ + mul v30.4S, v26.4S, v22.S[2] // ..................................*....................................................................... + mls v15.4S, v11.4S, v8.S[0] // .....................*.................................................................................... + mls v27.4S, v9.4S, v8.S[0] // ............................*............................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v9.4S, v1.4S, v13.4S // ..............................*........................................................................... + mls v3.4S, v10.4S, v8.S[0] // ...................................................*...................................................... + // gap // .......................................................................................................... + ldr q12, [x4, #-32] // ....................*..................................................................................... + mls v30.4S, v16.4S, v8.S[0] // .........................................*................................................................ + sub v26.4S, v1.4S, v13.4S // ...............................................*.......................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ sub v17.4S, v9.4S, v27.4S // ...................................*...................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v21.4S, v14.4S, v15.4S // ......................................*................................................................... + add v11.4S, v9.4S, v27.4S // ....................................*..................................................................... + sub v10.4S, v26.4S, v3.4S // ........................................................*................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v9.4S, v17.4S, v12.S[1] // .........................................................*................................................ + mul v27.4S, v17.4S, v12.S[0] // .....................................................*.................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v4.4S, v10.4S, v23.S[0] // ..............................................................*........................................... + sqrdmulh v22.4S, v11.4S, v0.S[3] // ........................................*................................................................. + // gap // .......................................................................................................... 
+ // gap // .......................................................................................................... + sub v13.4S, v21.4S, v30.4S // ....................................................*..................................................... + mul v16.4S, v11.4S, v0.S[2] // .......................................*.................................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v27.4S, v9.4S, v8.S[0] // ................................................................*......................................... + mul v17.4S, v29.4S, v0.S[0] // ......................................................*................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v2.4S, v29.4S, v0.S[1] // ............................................................*............................................. + add v9.4S, v26.4S, v3.4S // .......................................................*.................................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v16.4S, v22.4S, v8.S[0] // ..........................................*............................................................... + // gap // .......................................................................................................... 
+ // gap // .......................................................................................................... + add v21.4S, v21.4S, v30.4S // ............................................*............................................................. + add v5.4S, v13.4S, v27.4S // ...........................................................................*.............................. + sub v6.4S, v13.4S, v27.4S // ......................................................................*................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v13.4S, v9.4S, v12.S[2] // .............................................................*............................................ + sqrdmulh v26.4S, v9.4S, v12.S[3] // ...............................................................*.......................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + trn2 v11.4S, v5.4S, v6.4S // ...............................................................................*.......................... + add v22.4S, v21.4S, v16.4S // ................................................*......................................................... + // gap // .......................................................................................................... + ldr q1, [x5, #-32] // .....................................................................................................*.... + mls v17.4S, v2.4S, v8.S[0] // ..................................................................*....................................... 
+ ldr q31, [x5, #-128] // .........................................................................*................................ + sqrdmulh v3.4S, v10.4S, v23.S[1] // ...........................................................*.............................................. + // gap // .......................................................................................................... + sub v29.4S, v14.4S, v15.4S // ..........................*............................................................................... + ldr q27, [x5, #-144] // .................................................................................................*........ + ldr q23, [x5, #-48] // ...........*.............................................................................................. + trn1 v0.4S, v5.4S, v6.4S // ..................................................................................*....................... + mls v13.4S, v26.4S, v8.S[0] // .....................................................................*.................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v15.4S, v29.4S, v17.4S // ........................................................................*................................. + mls v4.4S, v3.4S, v8.S[0] // ....................................................................*..................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ sub v3.4S, v21.4S, v16.4S // .................................................*........................................................ + sub v29.4S, v29.4S, v17.4S // .......................................................................*.................................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v9.4S, v15.4S, v13.4S // ..............................................................................*........................... + sub v13.4S, v15.4S, v13.4S // .............................................................................*............................ + // gap // .......................................................................................................... + // gap // .......................................................................................................... + add v26.4S, v29.4S, v4.4S // ..........................................................................*............................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sub v14.4S, v29.4S, v4.4S // ............................................................................*............................. + trn1 v12.4S, v9.4S, v13.4S // ...................................................................................*...................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... 
+ trn1 v15.4S, v22.4S, v3.4S // .................................................................*........................................ + trn2 v18.4S, v26.4S, v14.4S // .................................................................................*........................ + trn2 v4.4S, v9.4S, v13.4S // .....................................................................................*.................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + trn1 v14.4S, v26.4S, v14.4S // ................................................................................*......................... + ldr q2, [x5, #-80] // .............................................*............................................................ + trn2 v21.2D, v15.2D, v0.2D // ......................................................................................*................... + trn2 v9.2D, v4.2D, v18.2D // .........................................................................................*................ + trn2 v30.4S, v22.4S, v3.4S // ..................................................*....................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v22.4S, v21.4S, v7.4S // .............................................................................................*............ + // gap // .......................................................................................................... 
+ // gap // .......................................................................................................... + trn2 v17.2D, v12.2D, v14.2D // .......................................................................................*.................. + mul v10.4S, v9.4S, v19.4S // ...............................................................................................*.......... + trn2 v13.2D, v30.2D, v11.2D // ...................................................................................................*...... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + sqrdmulh v26.4S, v17.4S, v2.4S // ...........................................................................................*.............. + sqrdmulh v3.4S, v9.4S, v2.4S // ................................................................................................*......... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mul v29.4S, v21.4S, v25.4S // ..............................................................................................*........... + // gap // .......................................................................................................... + mul v17.4S, v17.4S, v19.4S // ............................................................................................*............. + mul v9.4S, v13.4S, v25.4S // .......................................................................................................*.. 
+ trn1 v18.2D, v4.2D, v18.2D // ....................................................................................................*..... + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v10.4S, v3.4S, v8.S[0] // ........................................................................................................*. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + trn1 v16.2D, v30.2D, v11.2D // ........................................................................................*................. + // gap // .......................................................................................................... + // gap // .......................................................................................................... + mls v17.4S, v26.4S, v8.S[0] // .........................................................................................................* + mls v29.4S, v22.4S, v8.S[0] // ......................................................................................................*... + + // --------------------------------------------- new position ----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----- + // ldr q20, [x2, #112] // .*........................................................................................................ + // ldr q28, [x2, #80] // .....*.................................................................................................... 
+ // ldr q30, [x4], #64 // *......................................................................................................... + // ldr q22, [x1, #96] // ........*................................................................................................. + // ldr q24, [x2, #96] // ...*...................................................................................................... + // ldr q19, [x2, #64] // ....*..................................................................................................... + // ldr q5, [x1, #112] // .......*.................................................................................................. + // mul v3.4S, v20.4S, v30.S[0] // ...........*.............................................................................................. + // sqrdmulh v0.4S, v20.4S, v30.S[1] // ..........*............................................................................................... + // ldr q10, [x1, #64] // ..*....................................................................................................... + // sqrdmulh v1.4S, v24.4S, v30.S[1] // ............*............................................................................................. + // ldr q23, [x5, #144] // ........................................................................*................................. + // ldr q7, [x1, #80] // ...................*...................................................................................... + // mul v13.4S, v19.4S, v30.S[0] // ..................*....................................................................................... + // mls v3.4S, v0.4S, v8.S[0] // ....................*..................................................................................... + // sqrdmulh v11.4S, v19.4S, v30.S[1] // .......................*.................................................................................. 
+ // mul v0.4S, v24.4S, v30.S[0] // .............*............................................................................................ + // sqrdmulh v21.4S, v28.4S, v30.S[1] // ................*......................................................................................... + // add v15.4S, v5.4S, v3.4S // ............................*............................................................................. + // mul v18.4S, v28.4S, v30.S[0] // .................*........................................................................................ + // ldr q31, [x4, #-32] // .........................................*................................................................ + // mls v13.4S, v11.4S, v8.S[0] // .....................................*.................................................................... + // mls v0.4S, v1.4S, v8.S[0] // ......................*................................................................................... + // mul v28.4S, v15.4S, v30.S[2] // ................................*......................................................................... + // sqrdmulh v11.4S, v15.4S, v30.S[3] // ...............................*.......................................................................... + // mls v18.4S, v21.4S, v8.S[0] // ..........................*............................................................................... + // sub v9.4S, v10.4S, v13.4S // ......................................................................*................................... + // sub v15.4S, v22.4S, v0.4S // .............................*............................................................................ + // mls v28.4S, v11.4S, v8.S[0] // ......................................*................................................................... + // add v0.4S, v22.4S, v0.4S // ..............................*........................................................................... 
+ // add v22.4S, v7.4S, v18.4S // .......................................*.................................................................. + // ldr q20, [x4, #-16] // ...............*.......................................................................................... + // ldr q27, [x4, #-48] // ..............*........................................................................................... + // sqrdmulh v21.4S, v0.4S, v30.S[3] // ...................................*...................................................................... + // mul v30.4S, v0.4S, v30.S[2] // ....................................*..................................................................... + // sub v11.4S, v22.4S, v28.4S // ............................................*............................................................. + // add v28.4S, v22.4S, v28.4S // ..............................................*........................................................... + // sub v3.4S, v5.4S, v3.4S // ...........................*.............................................................................. + // add v5.4S, v10.4S, v13.4S // .............................................*............................................................ + // mul v13.4S, v28.4S, v27.S[2] // .....................................................*.................................................... + // sqrdmulh v0.4S, v28.4S, v27.S[3] // ...................................................*...................................................... + // mls v30.4S, v21.4S, v8.S[0] // ..........................................*............................................................... + // mls v13.4S, v0.4S, v8.S[0] // ..........................................................*............................................... + // mul v26.4S, v3.4S, v27.S[0] // .................................*........................................................................ 
+ // add v0.4S, v5.4S, v30.4S // ...........................................................*.............................................. + // ldr q1, [x5, #112] // ........................................................................................*................. + // sqrdmulh v21.4S, v3.4S, v27.S[1] // ..................................*....................................................................... + // sub v29.4S, v7.4S, v18.4S // ...........................................*.............................................................. + // add v3.4S, v0.4S, v13.4S // .................................................................*........................................ + // sub v0.4S, v0.4S, v13.4S // .............................................................................*............................ + // trn2 v13.4S, v3.4S, v0.4S // ...........................................................................................*.............. + // mls v26.4S, v21.4S, v8.S[0] // ........................................*................................................................. + // sub v5.4S, v5.4S, v30.4S // ....................................................*..................................................... + // mul v30.4S, v11.4S, v31.S[0] // .................................................*........................................................ + // mul v18.4S, v15.4S, v27.S[0] // .......................................................*.................................................. + // add v19.4S, v29.4S, v26.4S // .........................................................*................................................ + // sub v24.4S, v29.4S, v26.4S // ...............................................*.......................................................... + // sqrdmulh v29.4S, v11.4S, v31.S[1] // ................................................*......................................................... 
+ // ldr q7, [x5, #16] // ........................*................................................................................. + // sqrdmulh v12.4S, v24.4S, v20.S[1] // .....................................................................*.................................... + // sqrdmulh v17.4S, v15.4S, v27.S[1] // ........................................................*................................................. + // mul v26.4S, v19.4S, v31.S[2] // ..............................................................*........................................... + // mul v6.4S, v24.4S, v20.S[0] // ..................................................*....................................................... + // sqrdmulh v24.4S, v19.4S, v31.S[3] // ...............................................................*.......................................... + // mls v30.4S, v29.4S, v8.S[0] // ......................................................*................................................... + // trn1 v15.4S, v3.4S, v0.4S // ....................................................................................*..................... + // mls v18.4S, v17.4S, v8.S[0] // ...................................................................*...................................... + // ldr q3, [x5, #96] // .....................*.................................................................................... + // mls v6.4S, v12.4S, v8.S[0] // ............................................................................*............................. + // mls v26.4S, v24.4S, v8.S[0] // ..........................................................................*............................... + // sub v12.4S, v5.4S, v30.4S // .............................................................*............................................ + // sub v27.4S, v9.4S, v18.4S // ..............................................................................*........................... 
+ // add v0.4S, v9.4S, v18.4S // ...........................................................................*.............................. + // ldr q31, [x5, #64] // ....................................................................*..................................... + // add v9.4S, v27.4S, v6.4S // .................................................................................*........................ + // add v30.4S, v5.4S, v30.4S // ............................................................*............................................. + // sub v24.4S, v27.4S, v6.4S // ..................................................................................*....................... + // sub v27.4S, v0.4S, v26.4S // ................................................................................*......................... + // add v6.4S, v0.4S, v26.4S // ...............................................................................*.......................... + // trn2 v18.4S, v30.4S, v12.4S // ................................................................*......................................... + // trn1 v14.4S, v9.4S, v24.4S // .......................................................................................*.................. + // trn2 v9.4S, v9.4S, v24.4S // .....................................................................................*.................... + // trn1 v0.4S, v30.4S, v12.4S // .........................................................................*................................ + // trn1 v12.4S, v6.4S, v27.4S // ...................................................................................*...................... + // ldr q4, [x5], #(12*16) // .........*................................................................................................ + // trn2 v2.4S, v6.4S, v27.4S // ......................................................................................*................... 
+ // trn2 v30.2D, v15.2D, v0.2D // .........................................................................................*................ + // trn2 v24.2D, v12.2D, v14.2D // .............................................................................................*............ + // trn1 v16.2D, v13.2D, v18.2D // .......................................................................................................*.. + // trn2 v6.2D, v2.2D, v9.2D // ..........................................................................................*............... + // add x1, x1, #64 // .........................*................................................................................ + // sqrdmulh v19.4S, v24.4S, v1.4S // ................................................................................................*......... + // mul v17.4S, v24.4S, v3.4S // ...................................................................................................*...... + // sqrdmulh v25.4S, v30.4S, v7.4S // ............................................................................................*............. + // mul v29.4S, v30.4S, v4.4S // ..................................................................................................*....... + // mul v10.4S, v6.4S, v3.4S // ..............................................................................................*........... + // sqrdmulh v20.4S, v6.4S, v1.4S // .................................................................................................*........ + // ldr q27, [x5, #-144] // .......................................................................*.................................. + // add x2, x2, #64 // ......*................................................................................................... + // trn2 v13.2D, v13.2D, v18.2D // ...............................................................................................*.......... 
+ // trn1 v18.2D, v2.2D, v9.2D // .....................................................................................................*.... + // ldr q1, [x5, #-32] // ..................................................................*....................................... + // mls v29.4S, v25.4S, v8.S[0] // .........................................................................................................* + // mul v9.4S, v13.4S, v4.4S // ....................................................................................................*..... + // mls v10.4S, v20.4S, v8.S[0] // ......................................................................................................*... + // mls v17.4S, v19.4S, v8.S[0] // ........................................................................................................*. sub count, count, #1 layer45678_start: - ldr x16, [x4, #32] // ..........................................e............................................................................................................................................................................. - trn2 v9.2D, v28.2D, v10.2D // ..................................................................................................................*..................................................................................................... - add v12.4S, v13.4S, v11.4S // ...............................................................................................................................................................*........................................................ - add x1, x1, #64 // ................................*....................................................................................................................................................................................... 
- ldr x27, [x5, #-112] // ..................................................................................................................................................*..................................................................... - ldr x22, [x1, #168] // .........e.............................................................................................................................................................................................................. - sqrdmulh v26.4S, v26.4S, v27.S[3] // .....................................................................................................*.................................................................................................................. - sub v4.4S, v31.4S, v4.4S // ...............................................................*........................................................................................................................................................ - ldr x21, [x5, #24] // ...................................................................................................................................e.................................................................................... - ldr x19, [x4, #40] // ...........................................e............................................................................................................................................................................ - sqrdmulh v22.4S, v9.4S, v23.4S // .......................................................................................................................................................*................................................................ - ins v6.d[1], x20 // .............................................................................................................................................................................*.......................................... 
- ldr x12, [x5, #-64] // ..................................................................................................................................................................................*..................................... - ldr x14, [x5, #112] // ..............................................................................................................................................................................e......................................... - // gap // ........................................................................................................................................................................................................................ - ins v27.d[0], x16 // ............................................e........................................................................................................................................................................... - ldr x11, [x4, #16] // ......................................e................................................................................................................................................................................. - sub v15.4S, v13.4S, v11.4S // ..............................................................................................................................................................*......................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v27.d[1], x19 // .............................................e.......................................................................................................................................................................... - ldr x20, [x5, #64] // ..............................................................................................................................................e......................................................................... - sub v2.4S, v30.4S, v29.4S // ........................................................................................*............................................................................................................................... - ldr x10, [x5, #-160] // ......................................................................................................................................*................................................................................. - ldr x24, [x1, #184] // .............e.......................................................................................................................................................................................................... - mul v7.4S, v9.4S, v25.4S // ......................................................................................................................................................*................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ldr x16, [x5, #16] // ..................................................................................................................................e..................................................................................... - mul v19.4S, v15.4S, v17.4S // .....................................................................................................................................................................*.................................................. - mls v14.4S, v26.4S, v8.S[0] // ......................................................................................................*................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - ldr x28, [x4], #64 // ..................................e..................................................................................................................................................................................... - ins v17.d[0], x20 // ................................................................................................................................................e....................................................................... - trn1 v20.2D, v28.2D, v10.2D // ....................................................................................................................*................................................................................................... - ldr x20, [x5, #104] // ...........................................................................................................................................................................e............................................ 
- ldr x25, [x1, #144] // ....e................................................................................................................................................................................................................... - mul v13.4S, v4.4S, v5.S[0] // ................................................................................*....................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x17, [x5, #72] // ...............................................................................................................................................e........................................................................ - mls v7.4S, v22.4S, v8.S[0] // ........................................................................................................................................................*............................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v31.d[0], x27 // ....................................................................................................................................................*................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v10.4S, v4.4S, v5.S[1] // .................................................................................*...................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v11.d[0], x13 // ............................................................................................................................................*........................................................................... - sub v28.4S, v20.4S, v7.4S // .........................................................................................................................................................*.............................................................. - ldr x23, [x5, #-56] // ...................................................................................................................................................................................*.................................... - // gap // ........................................................................................................................................................................................................................ - ins v1.d[0], x8 // ........................................................................................................................................................................................*............................... - // gap // ........................................................................................................................................................................................................................ - ldr x8, [x1, #176] // ............e........................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ 
- ldr x19, [x5, #-40] // .......................................................................................................................................................................................*................................ - mls v13.4S, v10.4S, v8.S[0] // ..................................................................................*..................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v23.d[0], x28 // ....................................e................................................................................................................................................................................... - ldr x28, [x4, #-56] // ...................................e.................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v16.4S, v18.4S, v3.4S // .....................................................*.................................................................................................................................................................. - add v18.4S, v20.4S, v7.4S // ..........................................................................................................................................................*............................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x7, [x4, #-80] // ..............................................*......................................................................................................................................................................... - ins v20.d[0], x25 // ......e................................................................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v0.d[0], x29 // ............................................................................................................................................................................................*........................... - ldr x29, [x5, #160] // ..........................................................................................................................................................................................e............................. - add x2, x2, #64 // .................................*...................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - ldr x6, [x2, #144] // ....................e................................................................................................................................................................................................... - ldr x15, [x2, #184] // .............................e.......................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v25.d[0], x7 // ................................................*....................................................................................................................................................................... - ldr x9, [x2, #152] // .....................e.................................................................................................................................................................................................. - ldr x7, [x4, #-72] // ...............................................*........................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x0, [x2, #176] // ............................e........................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v22.d[0], x6 // ......................e................................................................................................................................................................................................. - ins v7.d[0], x12 // ....................................................................................................................................................................................*................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x13, [x5, #48] // ..........................................................................................................................................e............................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v17.d[1], x17 // .................................................................................................................................................e...................................................................... - ins v22.d[1], x9 // .......................e................................................................................................................................................................................................ - ldr x17, [x5, #-136] // ...........................................................................................................................................*............................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x12, [x1, #160] // ........e............................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v25.d[1], x7 // .................................................*...................................................................................................................................................................... - ldr x9, [x1, #136] // .e...................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mul v24.4S, v2.4S, v25.S[0] // .........................................................................................................*.............................................................................................................. - sqrdmulh v2.4S, v2.4S, v25.S[1] // ..........................................................................................................*............................................................................................................. - ldr x25, [x1, #128] // e....................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - ins v23.d[1], x28 // .....................................e.................................................................................................................................................................................. - ins v11.d[1], x17 // .............................................................................................................................................*.......................................................................... - ldr x28, [x2, #168] // .........................e.............................................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- ldr x17, [x5, #-24] // ...........................................................................................................................................................................................*............................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v26.d[0], x0 // ..............................e......................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mls v24.4S, v2.4S, v8.S[0] // ...........................................................................................................*............................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v30.4S, v16.4S, v13.4S // ....................................................................................*................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v11.4S, v12.4S, v11.4S // .................................................................................................................................................................*...................................................... - sub v2.4S, v16.4S, v13.4S // ...................................................................................*.................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v13.d[0], x8 // ..............e......................................................................................................................................................................................................... - sub v29.4S, v30.4S, v14.4S // .......................................................................................................*................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - add v16.4S, v30.4S, v14.4S // ........................................................................................................*............................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v13.d[1], x24 // ...............e........................................................................................................................................................................................................ - sub v14.4S, v2.4S, v24.4S // ............................................................................................................*........................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x24, [x5, #96] // ..........................................................................................................................................................................e............................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v2.4S, v2.4S, v24.4S // .............................................................................................................*.......................................................................................................... - trn1 v3.4S, v16.4S, v29.4S // ......................................................................................................................*................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v7.d[1], x23 // .....................................................................................................................................................................................*.................................. - ins v0.d[1], x17 // .............................................................................................................................................................................................*.......................... - ldr x23, [x5, #-8] // ...............................................................................................................................................................................................*........................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1 v24.4S, v2.4S, v14.4S // ........................................................................................................................*............................................................................................... - ins v5.d[0], x10 // ........................................................................................................................................*............................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- trn2 v30.4S, v2.4S, v14.4S // .........................................................................................................................*.............................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x8, [x2, #128] // ................e....................................................................................................................................................................................................... - ins v26.d[1], x15 // ...............................e........................................................................................................................................................................................ - trn2 v10.2D, v3.2D, v24.2D // ..........................................................................................................................*............................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- trn1 v2.2D, v3.2D, v24.2D // ............................................................................................................................*........................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2 v29.4S, v16.4S, v29.4S // .......................................................................................................................*................................................................................................ - ins v25.d[0], x8 // ..................e..................................................................................................................................................................................................... - ldr x8, [x1, #152] // .....e.................................................................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v9.4S, v10.4S, v6.4S // ..................................................................................................................................................................................................*..................... - sqrdmulh v24.4S, v10.4S, v21.4S // ...................................................................................................................................................................................................*.................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v20.d[1], x8 // .......e................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v9.4S, v24.4S, v8.S[0] // ....................................................................................................................................................................................................*................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1 v3.2D, v29.2D, v30.2D // .............................................................................................................................*.......................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v1.d[1], x19 // .........................................................................................................................................................................................*.............................. - trn2 v4.2D, v29.2D, v30.2D // ...........................................................................................................................*............................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v24.4S, v2.4S, v9.4S // .....................................................................................................................................................................................................*.................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v10.4S, v2.4S, v9.4S // ......................................................................................................................................................................................................*................. - ldr x27, [x5, #-152] // .......................................................................................................................................*................................................................................ - ins v9.d[0], x26 // ................................................................................................................................................................................................*....................... - // gap // ........................................................................................................................................................................................................................ - ldr x26, [x5, #176] // ..............................................................................................................................................................................................e......................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sqrdmulh v2.4S, v4.4S, v21.4S // ........................................................................................................................................................................................................*............... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v5.d[1], x27 // .........................................................................................................................................*.............................................................................. - mul v21.4S, v4.4S, v6.4S // .......................................................................................................................................................................................................*................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v16.4S, v12.4S, v5.4S // ................................................................................................................................................................*....................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v21.4S, v2.4S, v8.S[0] // .........................................................................................................................................................................................................*.............. - ldr x27, [x5, #-104] // ...................................................................................................................................................*.................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v12.4S, v26.4S, v23.S[1] // ..................................................................e..................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v31.d[1], x27 // .....................................................................................................................................................*.................................................................. - ldr x8, [x2, #160] // ........................e............................................................................................................................................................................................... - add v29.4S, v3.4S, v21.4S // ...........................................................................................................................................................................................................*............ - // gap // ........................................................................................................................................................................................................................ 
- mls v16.4S, v11.4S, v8.S[0] // ..................................................................................................................................................................*..................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v14.4S, v15.4S, v31.4S // ......................................................................................................................................................................*................................................. - mul v15.4S, v22.4S, v23.S[0] // .......................................................e................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v9.d[1], x23 // .................................................................................................................................................................................................*...................... - ins v11.d[0], x8 // ..........................e............................................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v5.4S, v3.4S, v21.4S // ..........................................................................................................................................................................................................*............. - sub v3.4S, v18.4S, v16.4S // ...................................................................................................................................................................*.................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - add v2.4S, v18.4S, v16.4S // ....................................................................................................................................................................*................................................... - mul v16.4S, v26.4S, v23.S[0] // .................................................................e...................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v19.4S, v14.4S, v8.S[0] // .......................................................................................................................................................................*................................................ - mul v21.4S, v5.4S, v0.4S // .................................................................................................................................................................................................................*...... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v31.d[0], x12 // ..........e............................................................................................................................................................................................................. - sqrdmulh v0.4S, v5.4S, v9.4S // ..................................................................................................................................................................................................................*..... - ldr x8, [x4, #-40] // .......................................e................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v16.4S, v12.4S, v8.S[0] // ...................................................................e.................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v5.4S, v28.4S, v19.4S // ........................................................................................................................................................................*............................................... - add v4.4S, v28.4S, v19.4S // .........................................................................................................................................................................*.............................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sqrdmulh v19.4S, v22.4S, v23.S[1] // ........................................................e............................................................................................................................................................... - ins v11.d[1], x28 // ...........................e............................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ......................................................................................................................................................................................................................*. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v3.4S, v13.4S, v16.4S // .....................................................................e.................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v9.4S, v11.4S, v23.S[1] // .............................................................e.......................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mul v26.4S, v3.4S, v23.S[2] // ...........................................................................e............................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v3.4S, v3.4S, v23.S[3] // ............................................................................e........................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v15.4S, v19.4S, v8.S[0] // .........................................................e.............................................................................................................................................................. - ins v5.d[0], x11 // ........................................e............................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v31.d[1], x22 // ...........e............................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v26.4S, v3.4S, v8.S[0] // .............................................................................e.......................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v5.d[1], x8 // .........................................e.............................................................................................................................................................................. - add v19.4S, v20.4S, v15.4S // ...........................................................e............................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v30.4S, v20.4S, v15.4S // ..........................................................e............................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - add v22.4S, v19.4S, v26.4S // ...............................................................................e........................................................................................................................................ - ins v18.d[0], x25 // ..e..................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v21.4S, v0.4S, v8.S[0] // ...................................................................................................................................................................................................................*.... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v14.4S, v22.4S, v5.S[2] // ..........................................................................................e............................................................................................................................. - sqrdmulh v6.4S, v22.4S, v5.S[3] // ...........................................................................................e............................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mul v28.4S, v29.4S, v7.4S // ............................................................................................................................................................................................................*........... - sqrdmulh v15.4S, v29.4S, v1.4S // .............................................................................................................................................................................................................*.......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v29.4S, v19.4S, v26.4S // ..............................................................................e......................................................................................................................................... - mul v4.4S, v11.4S, v23.S[0] // ............................................................e........................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v14.4S, v6.4S, v8.S[0] // ............................................................................................e........................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mls v28.4S, v15.4S, v8.S[0] // ..............................................................................................................................................................................................................*......... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ldr x11, [x2, #136] // .................e...................................................................................................................................................................................................... 
- add v2.4S, v24.4S, v21.4S // .....................................................................................................................................................................................................................*.. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mls v4.4S, v9.4S, v8.S[0] // ..............................................................e......................................................................................................................................................... - sqrdmulh v22.4S, v29.4S, v27.S[1] // ................................................................................................e....................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v25.d[1], x11 // ...................e.................................................................................................................................................................................................... - sub v1.4S, v10.4S, v28.4S // ...............................................................................................................................................................................................................*........ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v3.4S, v24.4S, v21.4S // ....................................................................................................................................................................................................................*... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v0.4S, v10.4S, v28.4S // ................................................................................................................................................................................................................*....... - sqrdmulh v20.4S, v25.4S, v23.S[1] // ...................................................e.................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - add v12.4S, v31.4S, v4.4S // ................................................................e....................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x2], #64 // .......................................................................................................................................................................................................................* - mul v0.4S, v29.4S, v27.S[0] // ...............................................................................................e........................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sqrdmulh v21.4S, v12.4S, v23.S[3] // .......................................................................e................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ldr x0, [x5, #8] // ...............................................................................................................................e........................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mls v0.4S, v22.4S, v8.S[0] // .................................................................................................e...................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v18.d[1], x9 // ...e.................................................................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v3.4S, v25.4S, v23.S[0] // ..................................................e..................................................................................................................................................................... - ldr x8, [x5], #(12*16) // ..............................................................................................................................e......................................................................................... - ldr x10, [x5, #-72] // ...............................................................................................................................................................................e........................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v6.4S, v13.4S, v16.4S // ....................................................................e................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v28.4S, v12.4S, v23.S[2] // ......................................................................e................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v3.4S, v20.4S, v8.S[0] // ....................................................e................................................................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v29.4S, v6.4S, v5.S[0] // .....................................................................................e.................................................................................................................................. - sqrdmulh v15.4S, v6.4S, v5.S[1] // ......................................................................................e................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v28.4S, v21.4S, v8.S[0] // ........................................................................e............................................................................................................................................... - ins v23.d[0], x16 // ....................................................................................................................................e................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - add v10.4S, v18.4S, v3.4S // ......................................................e................................................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mls v29.4S, v15.4S, v8.S[0] // .......................................................................................e................................................................................................................................ - ins v23.d[1], x21 // .....................................................................................................................................e.................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v12.4S, v10.4S, v28.4S // .........................................................................e.............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- add v25.4S, v10.4S, v28.4S // ..........................................................................e............................................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - sub v22.4S, v12.4S, v0.4S // ..................................................................................................e..................................................................................................................... - add v26.4S, v12.4S, v0.4S // ...................................................................................................e.................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- sub v1.4S, v25.4S, v14.4S // .............................................................................................e.......................................................................................................................... - add v13.4S, v25.4S, v14.4S // ..............................................................................................e......................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn2 v14.4S, v26.4S, v22.4S // .................................................................................................................e...................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v25.d[0], x8 // ................................................................................................................................e....................................................................................... - trn2 v12.4S, v13.4S, v1.4S // ...............................................................................................................e........................................................................................................ - ldr x8, [x5, #-48] // ......................................................................................................................................................................................e................................. - // gap // ........................................................................................................................................................................................................................ - trn1 v28.4S, v13.4S, v1.4S // ..............................................................................................................e......................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- trn2 v19.2D, v12.2D, v14.2D // ...................................................................................................................e.................................................................................................... - ins v25.d[1], x0 // .................................................................................................................................e...................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - trn1 v10.4S, v26.4S, v22.4S // ................................................................................................................e....................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- mul v11.4S, v19.4S, v25.4S // ...........................................................................................................................................................e............................................................ - sqrdmulh v7.4S, v19.4S, v23.4S // ............................................................................................................................................................e........................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - ins v21.d[0], x14 // ................................................................................................................................................................................e....................................... - add v26.4S, v30.4S, v29.4S // .........................................................................................e.............................................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v6.d[0], x24 // ............................................................................................................................................................................e........................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mls v11.4S, v7.4S, v8.S[0] // .............................................................................................................................................................e.......................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- ins v21.d[1], x10 // .................................................................................................................................................................................e...................................... - trn1 v13.2D, v12.2D, v14.2D // .....................................................................................................................e.................................................................................................. - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - mul v14.4S, v26.4S, v27.S[2] // ....................................................................................................e................................................................................................................... - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ - // gap // ........................................................................................................................................................................................................................ 
- - // original source code - // ldr x10, [x1, #(16*0 + (64))] // .....................................................................e..................................................................................................................................................|....................................................................e......................................................................................................... - // ldr x11, [x1, #((16*0 + (64))+8)] // ..................................................................e.....................................................................................................................................................|.................................................................e............................................................................................................ - // ins v9.d[0], x10 // ..........................................................................................................................................................e.............................................................|.........................................................................................................................................................e.................... - // ins v9.d[1], x11 // ...................................................................................................................................................................................e....................................|.............................................................................................................................................................................. 
- // ldr x10, [x1, #(16*1 + (64))] // ..............................e.........................................................................................................................................................................................|.............................e................................................................................................................................................ - // ldr x11, [x1, #((16*1 + (64))+8)] // ...................................................................................................e....................................................................................................................|..................................................................................................e........................................................................... - // ins v10.d[0], x10 // ................................................e.......................................................................................................................................................................|...............................................e.............................................................................................................................. - // ins v10.d[1], x11 // ......................................................................................................e.................................................................................................................|.....................................................................................................e........................................................................ 
- // ldr x10, [x1, #(16*2 + (64))] // ................................................................e.......................................................................................................................................................|...............................................................e.............................................................................................................. - // ldr x11, [x1, #((16*2 + (64))+8)] // .....e..................................................................................................................................................................................................................|....e......................................................................................................................................................................... - // ins v11.d[0], x10 // .....................................................................................................................................e..................................................................................|....................................................................................................................................e......................................... - // ins v11.d[1], x11 // ....................................................................................................................................................e...................................................................|...................................................................................................................................................e.......................... 
- // ldr x10, [x1, #(16*3 + (64))] // ........................................e...............................................................................................................................................................................|.......................................e...................................................................................................................................... - // ldr x11, [x1, #((16*3 + (64))+8)] // .....................e..................................................................................................................................................................................................|....................e......................................................................................................................................................... - // ins v12.d[0], x10 // ...............................................................................e........................................................................................................................................|..............................................................................e............................................................................................... - // ins v12.d[1], x11 // ..................................................................................e.....................................................................................................................................|.................................................................................e............................................................................................ 
- // ldr x10, [x2, #(16*0 + (64))] // .............................................................................................e..........................................................................................................................|............................................................................................e................................................................................. - // ldr x11, [x2, #((16*0 + (64))+8)] // ....................................................................................................................................................................e...................................................|...................................................................................................................................................................e.......... - // ins v13.d[0], x10 // ..................................................................................................e.....................................................................................................................|.................................................................................................e............................................................................ - // ins v13.d[1], x11 // ........................................................................................................................................................................e...............................................|.......................................................................................................................................................................e...... 
- // ldr x10, [x2, #(16*1 + (64))] // ....................................................e...................................................................................................................................................................|...................................................e.......................................................................................................................... - // ldr x11, [x2, #((16*1 + (64))+8)] // .......................................................e................................................................................................................................................................|......................................................e....................................................................................................................... - // ins v14.d[0], x10 // ..........................................................e.............................................................................................................................................................|.........................................................e.................................................................................................................... - // ins v14.d[1], x11 // ..............................................................e.........................................................................................................................................................|.............................................................e................................................................................................................ 
- // ldr x10, [x2, #(16*2 + (64))] // ........................................................................................................................e...............................................................................................|.......................................................................................................................e...................................................... - // ldr x11, [x2, #((16*2 + (64))+8)] // ........................................................................e...............................................................................................................................................|.......................................................................e...................................................................................................... - // ins v15.d[0], x10 // ..............................................................................................................................e.........................................................................................|.............................................................................................................................e................................................ - // ins v15.d[1], x11 // ............................................................................................................................................e...........................................................................|...........................................................................................................................................e.................................. 
- // ldr x10, [x2, #(16*3 + (64))] // .........................................................e..............................................................................................................................................................|........................................................e..................................................................................................................... - // ldr x11, [x2, #((16*3 + (64))+8)] // .....................................................e..................................................................................................................................................................|....................................................e......................................................................................................................... - // ins v16.d[0], x10 // ..........................................................................e.............................................................................................................................................|.........................................................................e.................................................................................................... - // ins v16.d[1], x11 // ..............................................................................................e.........................................................................................................................|.............................................................................................e................................................................................ 
- // add x1, x1, #64 // ...*....................................................................................................................................................................................................................|..*........................................................................................................................................................................... - // add x2, x2, #64 // ...................................................*....................................................................................................................................................................|..................................................*........................................................................................................................... - // ldr x10, [x4], #64 // ..........................e.............................................................................................................................................................................................|.........................e.................................................................................................................................................... - // ldr x11, [x4, #(-64+8)] // ............................................e...........................................................................................................................................................................|...........................................e.................................................................................................................................. 
- // ins v0.d[0], x10 // ...........................................e............................................................................................................................................................................|..........................................e................................................................................................................................... - // ins v0.d[1], x11 // ......................................................................e.................................................................................................................................................|.....................................................................e........................................................................................................ - // ldr x10, [x4, #(-64 + 16)] // ...............e........................................................................................................................................................................................................|..............e............................................................................................................................................................... - // ldr x11, [x4, #((-64 + 16)+8)] // .......................................................................................................................................e................................................................................|......................................................................................................................................e....................................... 
- // ins v1.d[0], x10 // ...................................................................................................................................................e....................................................................|..................................................................................................................................................e........................... - // ins v1.d[1], x11 // ......................................................................................................................................................e.................................................................|.....................................................................................................................................................e........................ - // ldr x10, [x4, #(-64 + 32)] // e.......................................................................................................................................................................................................................e.............................................................................................................................................................................. - // ldr x11, [x4, #((-64 + 32)+8)] // .........e..............................................................................................................................................................................................................|........e..................................................................................................................................................................... 
- // ins v2.d[0], x10 // ..............e.........................................................................................................................................................................................................|.............e................................................................................................................................................................ - // ins v2.d[1], x11 // .................e......................................................................................................................................................................................................|................e............................................................................................................................................................. - // ldr x10, [x4, #(-64 + 48)] // ...............................................*........................................................................................................................................................................|..............................................*............................................................................................................................... - // ldr x11, [x4, #((-64 + 48)+8)] // ........................................................*...............................................................................................................................................................|.......................................................*...................................................................................................................... 
- // ins v3.d[0], x10 // ......................................................*.................................................................................................................................................................|.....................................................*........................................................................................................................ - // ins v3.d[1], x11 // .................................................................*......................................................................................................................................................|................................................................*............................................................................................................. - // mul v24.4s, v13.4s, v0.s[0] // ....................................................................................................................................................................................e...................................|.............................................................................................................................................................................. - // sqrdmulh v13.4s, v13.4s, v0.s[1] // ............................................................................................................................................................................e...........................................|...........................................................................................................................................................................e.. 
- // mls v24.4s, v13.4s, v8.s[0] // .........................................................................................................................................................................................e..............................|.............................................................................................................................................................................. - // sub v13.4s, v9.4s, v24.4s // .............................................*..........................................................................................................................................................................|............................................*................................................................................................................................. - // add v9.4s, v9.4s, v24.4s // ..............................................................................................................................................................................................e.........................|.............................................................................................................................................................................. - // mul v24.4s, v14.4s, v0.s[0] // ............................................................................................................................e...........................................................................................|...........................................................................................................................e.................................................. 
- // sqrdmulh v14.4s, v14.4s, v0.s[1] // ...........................................................................................................................................e............................................................................|..........................................................................................................................................e................................... - // mls v24.4s, v14.4s, v8.s[0] // ..................................................................................................................................................e.....................................................................|.................................................................................................................................................e............................ - // sub v14.4s, v10.4s, v24.4s // ........................................................................................................................................................e...............................................................|.......................................................................................................................................................e...................... - // add v10.4s, v10.4s, v24.4s // .......................................................................................................................................................e................................................................|......................................................................................................................................................e....................... 
- // mul v24.4s, v15.4s, v0.s[0] // .................................................................................................................................................................e......................................................|................................................................................................................................................................e............. - // sqrdmulh v15.4s, v15.4s, v0.s[1] // ...............................................................................................................................................e........................................................................|..............................................................................................................................................e............................... - // mls v24.4s, v15.4s, v8.s[0] // ......................................................................................................................................................................e.................................................|.....................................................................................................................................................................e........ - // sub v15.4s, v11.4s, v24.4s // .......*................................................................................................................................................................................................................|......*....................................................................................................................................................................... 
- // add v11.4s, v11.4s, v24.4s // .............................................................................................................................................................................e..........................................|............................................................................................................................................................................e. - // mul v24.4s, v16.4s, v0.s[0] // ..................................................................................................................................e.....................................................................................|.................................................................................................................................e............................................ - // sqrdmulh v16.4s, v16.4s, v0.s[1] // ......................................................................................................................e.................................................................................................|.....................................................................................................................e........................................................ - // mls v24.4s, v16.4s, v8.s[0] // ........................................................................................................................................e...............................................................................|.......................................................................................................................................e...................................... 
- // sub v16.4s, v12.4s, v24.4s // .......................................................................................................................................................................................e................................|.............................................................................................................................................................................. - // add v12.4s, v12.4s, v24.4s // ..............................................................................................................................................e.........................................................................|.............................................................................................................................................e................................ - // mul v24.4s, v11.4s, v0.s[2] // ........................................................................................................................................................................................e...............................|.............................................................................................................................................................................. - // sqrdmulh v11.4s, v11.4s, v0.s[3] // ................................................................................................................................................................................e.......................................|.............................................................................................................................................................................. 
- // mls v24.4s, v11.4s, v8.s[0] // ............................................................................................................................................................................................e...........................|.............................................................................................................................................................................. - // sub v11.4s, v9.4s, v24.4s // .................................................................................................................................................................................................e......................|.............................................................................................................................................................................. - // add v9.4s, v9.4s, v24.4s // ..................................................................................................................................................................................................e.....................|.............................................................................................................................................................................. - // mul v24.4s, v12.4s, v0.s[2] // ................................................................................................................................................e.......................................................................|...............................................................................................................................................e.............................. 
- // sqrdmulh v12.4s, v12.4s, v0.s[3] // .................................................................................................................................................e......................................................................|................................................................................................................................................e............................. - // mls v24.4s, v12.4s, v8.s[0] // .....................................................................................................................................................e..................................................................|....................................................................................................................................................e......................... - // sub v12.4s, v10.4s, v24.4s // ................................................................................................................................................................e.......................................................|...............................................................................................................................................................e.............. - // add v10.4s, v10.4s, v24.4s // .........................................................................................................................................................e..............................................................|........................................................................................................................................................e..................... 
- // mul v24.4s, v15.4s, v1.s[0] // ...............................*........................................................................................................................................................................................|..............................*............................................................................................................................................... - // sqrdmulh v15.4s, v15.4s, v1.s[1] // ...................................*....................................................................................................................................................................................|..................................*........................................................................................................................................... - // mls v24.4s, v15.4s, v8.s[0] // ..........................................*.............................................................................................................................................................................|.........................................*.................................................................................................................................... - // sub v15.4s, v13.4s, v24.4s // ..............................................................................*.........................................................................................................................................|.............................................................................*................................................................................................ 
- // add v13.4s, v13.4s, v24.4s // ............................................................................*...........................................................................................................................................|...........................................................................*.................................................................................................. - // mul v24.4s, v16.4s, v1.s[0] // ..........................................................................................................................................................................................e.............................|.............................................................................................................................................................................. - // sqrdmulh v16.4s, v16.4s, v1.s[1] // ...........................................................................................................................................................................................e............................|.............................................................................................................................................................................. - // mls v24.4s, v16.4s, v8.s[0] // ...............................................................................................................................................................................................e........................|.............................................................................................................................................................................. 
- // sub v16.4s, v14.4s, v24.4s // ...................*....................................................................................................................................................................................................|..................*........................................................................................................................................................... - // add v14.4s, v14.4s, v24.4s // ..................................................................................................................................................................................................................e.....|.............................................................................................................................................................................. - // mul v24.4s, v10.4s, v1.s[2] // ............................................................................................................................................................e...........................................................|...........................................................................................................................................................e.................. - // sqrdmulh v10.4s, v10.4s, v1.s[3] // .............................................................................................................................................................e..........................................................|............................................................................................................................................................e................. 
- // mls v24.4s, v10.4s, v8.s[0] // ..................................................................................................................................................................e.....................................................|.................................................................................................................................................................e............ - // sub v10.4s, v9.4s, v24.4s // .....................................................................................................................................................................................................e..................|.............................................................................................................................................................................. - // add v9.4s, v9.4s, v24.4s // ......................................................................................................................................................................................................e.................|.............................................................................................................................................................................. - // mul v24.4s, v12.4s, v2.s[0] // ...............................................................................................................................................................................e........................................|.............................................................................................................................................................................. 
- // sqrdmulh v12.4s, v12.4s, v2.s[1] // .......................................................................................................................................................................e................................................|......................................................................................................................................................................e....... - // mls v24.4s, v12.4s, v8.s[0] // ..................................................................................................................................................................................e.....................................|.............................................................................................................................................................................. - // sub v12.4s, v11.4s, v24.4s // ...................................................................................................................................................................................................e....................|.............................................................................................................................................................................. - // add v11.4s, v11.4s, v24.4s // ....................................................................................................................................................................................................e...................|.............................................................................................................................................................................. 
- // mul v24.4s, v14.4s, v2.s[2] // .......................................................................................................................................................................................................................e|.............................................................................................................................................................................. - // sqrdmulh v14.4s, v14.4s, v2.s[3] // ......*.................................................................................................................................................................................................................|.....*........................................................................................................................................................................ - // mls v24.4s, v14.4s, v8.s[0] // .........................*..............................................................................................................................................................................................|........................*..................................................................................................................................................... - // sub v14.4s, v13.4s, v24.4s // ................................................................................*.......................................................................................................................................|...............................................................................*.............................................................................................. 
- // add v13.4s, v13.4s, v24.4s // .................................................................................*......................................................................................................................................|................................................................................*............................................................................................. - // mul v24.4s, v16.4s, v3.s[0] // ...................................................................*....................................................................................................................................................|..................................................................*........................................................................................................... - // sqrdmulh v16.4s, v16.4s, v3.s[1] // ....................................................................*...................................................................................................................................................|...................................................................*.......................................................................................................... - // mls v24.4s, v16.4s, v8.s[0] // ...........................................................................*............................................................................................................................................|..........................................................................*................................................................................................... 
- // sub v16.4s, v15.4s, v24.4s // ...................................................................................*....................................................................................................................................|..................................................................................*........................................................................................... - // add v15.4s, v15.4s, v24.4s // .....................................................................................*..................................................................................................................................|....................................................................................*......................................................................................... - // trn1 v25.4s, v9.4s, v10.4s // ...........................................................................................................................................................................................................e............|.............................................................................................................................................................................. - // trn2 v26.4s, v9.4s, v10.4s // .........................................................................................................................................................................................................e..............|.............................................................................................................................................................................. 
- // trn1 v27.4s, v11.4s, v12.4s // ..............................................................................................................................................................................................................e.........|.............................................................................................................................................................................. - // trn2 v28.4s, v11.4s, v12.4s // .......................................................................................................................................................................................................e................|.............................................................................................................................................................................. - // trn2 v11.2d, v25.2d, v27.2d // .*......................................................................................................................................................................................................................|*............................................................................................................................................................................. - // trn2 v12.2d, v26.2d, v28.2d // ............................................................................................................................................................................................................e...........|.............................................................................................................................................................................. 
- // trn1 v9.2d, v25.2d, v27.2d // ............................*...........................................................................................................................................................................................|...........................*.................................................................................................................................................. - // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................................................................................................................................................e.|.............................................................................................................................................................................. - // trn1 v25.4s, v13.4s, v14.4s // ......................................................................................*.................................................................................................................................|.....................................................................................*........................................................................................ - // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................*......................................................................................................................|................................................................................................*............................................................................. 
- // trn1 v27.4s, v15.4s, v16.4s // ..........................................................................................*.............................................................................................................................|.........................................................................................*.................................................................................... - // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................*...........................................................................................................................|...........................................................................................*.................................................................................. - // trn2 v15.2d, v25.2d, v27.2d // ...............................................................................................*........................................................................................................................|..............................................................................................*............................................................................... - // trn2 v16.2d, v26.2d, v28.2d // ..........................................................................................................*.............................................................................................................|.........................................................................................................*.................................................................... 
- // trn1 v13.2d, v25.2d, v27.2d // ................................................................................................*.......................................................................................................................|...............................................................................................*.............................................................................. - // trn1 v14.2d, v26.2d, v28.2d // ........................................................................................................*...............................................................................................................|.......................................................................................................*...................................................................... - // ldr x10, [x5], #(12*16) // .....................................................................................................................................................................................e..................................|.............................................................................................................................................................................. - // ldr x11, [x5, #(-(12*16)+8)] // .................................................................................................................................................................................e......................................|.............................................................................................................................................................................. 
- // ins v0.d[0], x10 // ........................................................................................................................................................................................................e...............|.............................................................................................................................................................................. - // ins v0.d[1], x11 // .............................................................................................................................................................................................................e..........|.............................................................................................................................................................................. - // ldr x10, [x5, #(-12*16 + 1*16)] // .......................e................................................................................................................................................................................................|......................e....................................................................................................................................................... - // ldr x11, [x5, #((-12*16 + 1*16)+8)] // ........e...............................................................................................................................................................................................................|.......e...................................................................................................................................................................... 
- // ins v4.d[0], x10 // .............................................................................................................................................................................................e..........................|.............................................................................................................................................................................. - // ins v4.d[1], x11 // ................................................................................................................................................................................................e.......................|.............................................................................................................................................................................. - // ldr x10, [x5, #(-12*16 + 2*16)] // ....................*...................................................................................................................................................................................................|...................*.......................................................................................................................................................... - // ldr x11, [x5, #((-12*16 + 2*16)+8)] // .............................................................................................................*..........................................................................................................|............................................................................................................*................................................................. 
- // ins v1.d[0], x10 // ...........................................................................................*............................................................................................................................|..........................................................................................*................................................................................... - // ins v1.d[1], x11 // .................................................................................................................*......................................................................................................|................................................................................................................*............................................................. - // ldr x10, [x5, #(-12*16 + 3*16)] // ............................................................e...........................................................................................................................................................|...........................................................e.................................................................................................................. - // ldr x11, [x5, #((-12*16 + 3*16)+8)] // ...............................................................*........................................................................................................................................................|..............................................................*............................................................................................................... 
- // ins v5.d[0], x10 // ....................................*...................................................................................................................................................................................|...................................*.......................................................................................................................................... - // ins v5.d[1], x11 // .......................................................................*................................................................................................................................................|......................................................................*....................................................................................................... - // ldr x10, [x5, #(-12*16 + 4*16)] // ..................e.....................................................................................................................................................................................................|.................e............................................................................................................................................................ - // ldr x11, [x5, #((-12*16 + 4*16)+8)] // ................................e.......................................................................................................................................................................................|...............................e.............................................................................................................................................. 
- // ins v2.d[0], x10 // ...........................e............................................................................................................................................................................................|..........................e................................................................................................................................................... - // ins v2.d[1], x11 // .............................................................e..........................................................................................................................................................|............................................................e................................................................................................................. - // ldr x10, [x5, #(-12*16 + 5*16)] // ....*...................................................................................................................................................................................................................|...*.......................................................................................................................................................................... - // ldr x11, [x5, #((-12*16 + 5*16)+8)] // .....................................................................................................................*..................................................................................................|....................................................................................................................*......................................................... 
- // ins v6.d[0], x10 // ..................................*.....................................................................................................................................................................................|.................................*............................................................................................................................................ - // ins v6.d[1], x11 // .......................................................................................................................*................................................................................................|......................................................................................................................*....................................................... - // mul v24.4s, v11.4s, v0.4s // ......................*.................................................................................................................................................................................................|.....................*........................................................................................................................................................ - // sqrdmulh v11.4s, v11.4s, v4.4s // ..........*.............................................................................................................................................................................................................|.........*.................................................................................................................................................................... 
- // mls v24.4s, v11.4s, v8.s[0] // .................................*......................................................................................................................................................................................|................................*............................................................................................................................................. - // sub v11.4s, v9.4s, v24.4s // .....................................*..................................................................................................................................................................................|....................................*......................................................................................................................................... - // add v9.4s, v9.4s, v24.4s // ..............................................*.........................................................................................................................................................................|.............................................*................................................................................................................................ - // mul v24.4s, v12.4s, v0.4s // ...............................................................................................................................................................................................................e........|.............................................................................................................................................................................. 
- // sqrdmulh v12.4s, v12.4s, v4.4s // ................................................................................................................................................................................................................e.......|.............................................................................................................................................................................. - // mls v24.4s, v12.4s, v8.s[0] // ....................................................................................................................................................................................................................e...|.............................................................................................................................................................................. - // sub v12.4s, v10.4s, v24.4s // ................*.......................................................................................................................................................................................................|...............*.............................................................................................................................................................. - // add v10.4s, v10.4s, v24.4s // ..*.....................................................................................................................................................................................................................|.*............................................................................................................................................................................ 
- // mul v24.4s, v10.4s, v1.4s // ...................................................................................................................*....................................................................................................|..................................................................................................................*........................................................... - // sqrdmulh v10.4s, v10.4s, v5.4s // .............................................................................*..........................................................................................................................................|............................................................................*................................................................................................. - // mls v24.4s, v10.4s, v8.s[0] // ..........................................................................................................................*.............................................................................................|.........................................................................................................................*.................................................... - // sub v10.4s, v9.4s, v24.4s // ................................................................................................................................*.......................................................................................|...............................................................................................................................*.............................................. 
- // add v9.4s, v9.4s, v24.4s // .................................................................................................................................*......................................................................................|................................................................................................................................*............................................. - // mul v24.4s, v12.4s, v2.4s // ........................*...............................................................................................................................................................................................|.......................*...................................................................................................................................................... - // sqrdmulh v12.4s, v12.4s, v6.4s // ...........................................................................................................................*............................................................................................|..........................................................................................................................*................................................... - // mls v24.4s, v12.4s, v8.s[0] // ...................................................................................................................................*....................................................................................|..................................................................................................................................*........................................... 
- // sub v12.4s, v11.4s, v24.4s // .........................................................................................................................................*..............................................................................|........................................................................................................................................*..................................... - // add v11.4s, v11.4s, v24.4s // ..........................................................................................................................................*.............................................................................|.........................................................................................................................................*.................................... - // ldr x10, [x5, #(-12*16 + 6*16)] // ....................................................................................e...................................................................................................................................|...................................................................................e.......................................................................................... - // ldr x11, [x5, #((-12*16 + 6*16)+8)] // .............................e..........................................................................................................................................................................................|............................e................................................................................................................................................. 
- // ins v0.d[0], x10 // ...................................................................................................................................................................................................................e....|.............................................................................................................................................................................. - // ins v0.d[1], x11 // ...........*............................................................................................................................................................................................................|..........*................................................................................................................................................................... - // ldr x10, [x5, #(-12*16 + 7*16)] // .............e..........................................................................................................................................................................................................|............e................................................................................................................................................................. - // ldr x11, [x5, #((-12*16 + 7*16)+8)] // ......................................................................................................................................................................................e.................................|.............................................................................................................................................................................. 
- // ins v4.d[0], x10 // .................................................................................................................................................................................................................e......|.............................................................................................................................................................................. - // ins v4.d[1], x11 // .....................................................................................................................................................................................................................e..|.............................................................................................................................................................................. - // ldr x10, [x5, #(-12*16 + 8*16)] // ............*...........................................................................................................................................................................................................|...........*.................................................................................................................................................................. - // ldr x11, [x5, #((-12*16 + 8*16)+8)] // ......................................*.................................................................................................................................................................................|.....................................*........................................................................................................................................ 
- // ins v1.d[0], x10 // ...........................................................*............................................................................................................................................................|..........................................................*................................................................................................................... - // ins v1.d[1], x11 // .......................................................................................*................................................................................................................................|......................................................................................*....................................................................................... - // ldr x10, [x5, #(-12*16 + 9*16)] // ..........................................................................................................................................................................................................e.............|.............................................................................................................................................................................. - // ldr x11, [x5, #((-12*16 + 9*16)+8)] // .........................................*..............................................................................................................................................................................|........................................*..................................................................................................................................... 
- // ins v5.d[0], x10 // .......................................*................................................................................................................................................................................|......................................*....................................................................................................................................... - // ins v5.d[1], x11 // .........................................................................................................*..............................................................................................................|........................................................................................................*..................................................................... - // ldr x10, [x5, #(-12*16 + 10*16)] // ..................................................e.....................................................................................................................................................................|.................................................e............................................................................................................................ - // ldr x11, [x5, #((-12*16 + 10*16)+8)] // .........................................................................*..............................................................................................................................................|........................................................................*..................................................................................................... 
- // ins v2.d[0], x10 // .................................................*......................................................................................................................................................................|................................................*............................................................................................................................. - // ins v2.d[1], x11 // ........................................................................................*...............................................................................................................................|.......................................................................................*...................................................................................... - // ldr x10, [x5, #(-12*16 + 11*16)] // ...............................................................................................................e........................................................................................................|..............................................................................................................e............................................................... - // ldr x11, [x5, #((-12*16 + 11*16)+8)] // .........................................................................................*..............................................................................................................................|........................................................................................*..................................................................................... 
- // ins v6.d[0], x10 // ..............................................................................................................*.........................................................................................................|.............................................................................................................*................................................................ - // ins v6.d[1], x11 // .............................................................................................................................*..........................................................................................|............................................................................................................................*................................................. - // mul v24.4s, v15.4s, v0.4s // ....................................................................................................*...................................................................................................................|...................................................................................................*.......................................................................... - // sqrdmulh v15.4s, v15.4s, v4.4s // .....................................................................................................*..................................................................................................................|....................................................................................................*......................................................................... 
- // mls v24.4s, v15.4s, v8.s[0] // .......................................................................................................*................................................................................................................|......................................................................................................*....................................................................... - // sub v15.4s, v13.4s, v24.4s // ...........................................................................................................*............................................................................................................|..........................................................................................................*................................................................... - // add v13.4s, v13.4s, v24.4s // ............................................................................................................*...........................................................................................................|...........................................................................................................*.................................................................. - // mul v24.4s, v16.4s, v0.4s // ..................................................................................................................*.....................................................................................................|.................................................................................................................*............................................................ 
- // sqrdmulh v16.4s, v16.4s, v4.4s // ................................................................................................................*.......................................................................................................|...............................................................................................................*.............................................................. - // mls v24.4s, v16.4s, v8.s[0] // ....................................................................................................................*...................................................................................................|...................................................................................................................*.......................................................... - // sub v16.4s, v14.4s, v24.4s // ...............................................................................................................................*........................................................................................|..............................................................................................................................*............................................... - // add v14.4s, v14.4s, v24.4s // .........................................................................................................................*..............................................................................................|........................................................................................................................*..................................................... 
- // mul v24.4s, v14.4s, v1.4s // ..............................................................................................................................................................*.........................................................|.............................................................................................................................................................*................ - // sqrdmulh v14.4s, v14.4s, v5.4s // ...............................................................................................................................................................*........................................................|..............................................................................................................................................................*............... - // mls v24.4s, v14.4s, v8.s[0] // ...................................................................................................................................................................*....................................................|..................................................................................................................................................................*........... - // sub v14.4s, v13.4s, v24.4s // .........................................................................................................................................................................*..............................................|........................................................................................................................................................................*..... 
- // add v13.4s, v13.4s, v24.4s // ...........................................................................................................................................................................*............................................|..........................................................................................................................................................................*... - // mul v24.4s, v16.4s, v2.4s // ....................................................................................................................................*...................................................................................|...................................................................................................................................*.......................................... - // sqrdmulh v16.4s, v16.4s, v6.4s // ......................................................................................................................................*.................................................................................|.....................................................................................................................................*........................................ - // mls v24.4s, v16.4s, v8.s[0] // ...........................................................................................................................................................*............................................................|..........................................................................................................................................................*................... 
- // sub v16.4s, v15.4s, v24.4s // ..........................................................................................................................................................................*.............................................|.........................................................................................................................................................................*.... - // add v15.4s, v15.4s, v24.4s // .....................................................................................................................................................................*..................................................|....................................................................................................................................................................*......... - // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // .............................................................................................................................................*..........................................................................|............................................................................................................................................*................................. 
- // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ..............................................................................................................................................................................*.........................................|.............................................................................................................................................................................* + // Instructions: 144 + // Expected cycles: 60 + // Expected IPC: 2.40 + // + // Wall time: 1953.56s + // User time: 1953.56s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + ldr q20, [x2, #176] // .......e........................................................................................................................................ + ldr q28, [x2, #144] // .....e.......................................................................................................................................... + sqrdmulh v25.4S, v13.4S, v7.4S // .....................................................................................................*.......................................... + trn1 v6.2D, v12.2D, v14.2D // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ + ldr q30, [x4], #64 // ..........e..................................................................................................................................... 
+ trn1 v26.2D, v15.2D, v0.2D // ................................................................................*............................................................... + ldr q22, [x1, #160] // ..e............................................................................................................................................. + ldr q21, [x5, #-160] // ............................................................................................*................................................... + ldr q24, [x2, #160] // ......e......................................................................................................................................... + add v14.4S, v18.4S, v10.4S // ...................................................................................................................................*............ + sub v12.4S, v18.4S, v10.4S // ..................................................................................................................................*............. + ldr q19, [x2, #128] // ....e........................................................................................................................................... + sub v2.4S, v6.4S, v17.4S // .............................................................................................................................*.................. + mls v9.4S, v25.4S, v8.S[0] // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ ldr q5, [x1, #176] // ...e............................................................................................................................................ + mul v25.4S, v12.4S, v1.4S // ..........................................................................................................................................*..... + sqrdmulh v4.4S, v14.4S, v23.4S // ....................................................................................................................................*........... + mul v3.4S, v20.4S, v30.S[0] // ..............................e................................................................................................................. + sqrdmulh v0.4S, v20.4S, v30.S[1] // .............................e.................................................................................................................. + ldr q10, [x1, #128] // e............................................................................................................................................... + // gap // ................................................................................................................................................ + add v20.4S, v16.4S, v9.4S // .........................................................................................................*...................................... + sqrdmulh v1.4S, v24.4S, v30.S[1] // ........................e....................................................................................................................... + ldr q23, [x5, #144] // .......................................................................................................................e........................ + ldr q7, [x1, #144] // .e.............................................................................................................................................. 
+ sub v9.4S, v16.4S, v9.4S // ........................................................................................................*....................................... + mul v13.4S, v19.4S, v30.S[0] // ...............e................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v16.4S, v20.4S, v21.4S // ...........................................................................................................*.................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v0.4S, v8.S[0] // ...............................e................................................................................................................ + sqrdmulh v11.4S, v19.4S, v30.S[1] // ..............e................................................................................................................................. + mul v0.4S, v24.4S, v30.S[0] // .........................e...................................................................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mul v24.4S, v9.4S, v31.4S // ................................................................................................................*............................... + sqrdmulh v21.4S, v28.4S, v30.S[1] // ...................e............................................................................................................................ + ldr q19, [x5, #-112] // ...............................................................................................*................................................ + // gap // ................................................................................................................................................ + add v15.4S, v5.4S, v3.4S // .................................e.............................................................................................................. + mul v18.4S, v28.4S, v30.S[0] // ....................e........................................................................................................................... + ldr q31, [x4, #-32] // ............e................................................................................................................................... + // gap // ................................................................................................................................................ + mls v13.4S, v11.4S, v8.S[0] // ................e............................................................................................................................... + // gap // ................................................................................................................................................ 
+ mls v0.4S, v1.4S, v8.S[0] // ..........................e..................................................................................................................... + // gap // ................................................................................................................................................ + mul v28.4S, v15.4S, v30.S[2] // ........................................e....................................................................................................... + sqrdmulh v11.4S, v15.4S, v30.S[3] // .......................................e........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v18.4S, v21.4S, v8.S[0] // .....................e.......................................................................................................................... + sqrdmulh v1.4S, v9.4S, v19.4S // ...............................................................................................................*................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v9.4S, v10.4S, v13.4S // .................e.............................................................................................................................. 
+ // gap // ................................................................................................................................................ + sub v15.4S, v22.4S, v0.4S // ...........................e.................................................................................................................... + // gap // ................................................................................................................................................ + mls v28.4S, v11.4S, v8.S[0] // .........................................e...................................................................................................... + add v0.4S, v22.4S, v0.4S // ............................e................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v19.4S, v20.4S, v27.4S // ..........................................................................................................*..................................... + add v22.4S, v7.4S, v18.4S // .......................e........................................................................................................................ + ldr q20, [x4, #-16] // .............e.................................................................................................................................. + ldr q27, [x4, #-48] // ...........e.................................................................................................................................... 
+ sqrdmulh v21.4S, v0.4S, v30.S[3] // ..................................e............................................................................................................. + mul v30.4S, v0.4S, v30.S[2] // ...................................e............................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v11.4S, v22.4S, v28.4S // ..........................................e..................................................................................................... + add v28.4S, v22.4S, v28.4S // ...........................................e.................................................................................................... + // gap // ................................................................................................................................................ + ldr q22, [x5, #-16] // .........................................................................................................................*...................... + sub v3.4S, v5.4S, v3.4S // ................................e............................................................................................................... + // gap // ................................................................................................................................................ + add v5.4S, v10.4S, v13.4S // ..................e............................................................................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v13.4S, v28.4S, v27.S[2] // .......................................................e........................................................................................ + sqrdmulh v0.4S, v28.4S, v27.S[3] // ......................................................e......................................................................................... + sub v28.4S, v26.4S, v29.4S // ...................................................................................................*............................................ + mls v30.4S, v21.4S, v8.S[0] // ....................................e........................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v12.4S, v22.4S // .........................................................................................................................................*...... 
+ add v10.4S, v26.4S, v29.4S // ....................................................................................................*........................................... + mls v13.4S, v0.4S, v8.S[0] // ........................................................e....................................................................................... + mul v26.4S, v3.4S, v27.S[0] // ..................................................e............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v0.4S, v5.4S, v30.4S // ......................................e......................................................................................................... + mls v24.4S, v1.4S, v8.S[0] // .................................................................................................................*.............................. + ldr q1, [x5, #112] // .....................................................................................................................e.......................... + sqrdmulh v21.4S, v3.4S, v27.S[1] // .................................................e.............................................................................................. + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v29.4S, v7.4S, v18.4S // ......................e......................................................................................................................... + add v3.4S, v0.4S, v13.4S // ..........................................................e..................................................................................... + sub v0.4S, v0.4S, v13.4S // .........................................................e...................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v25.4S, v22.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ + sub v22.4S, v28.4S, v24.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + trn2 v13.4S, v3.4S, v0.4S // ...........................................................................e.................................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v16.4S, v19.4S, v8.S[0] // ............................................................................................................*................................... + mls v26.4S, v21.4S, v8.S[0] // ...................................................e............................................................................................ + sub v5.4S, v5.4S, v30.4S // .....................................e.......................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v21.4S, v28.4S, v24.4S // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + add v28.4S, v2.4S, v25.4S // .............................................................................................................................................*.. + mul v30.4S, v11.4S, v31.S[0] // ............................................................e................................................................................... 
+ mul v18.4S, v15.4S, v27.S[0] // .............................................e.................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v19.4S, v29.4S, v26.4S // .....................................................e.......................................................................................... + sub v24.4S, v29.4S, v26.4S // ....................................................e........................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v29.4S, v11.4S, v31.S[1] // ...........................................................e.................................................................................... + add v11.4S, v6.4S, v17.4S // ..............................................................................................................................*................. + ldr q7, [x5, #16] // ...........................................................................................e.................................................... 
+ sqrdmulh v12.4S, v24.4S, v20.S[1] // .....................................................................e.......................................................................... + sqrdmulh v17.4S, v15.4S, v27.S[1] // ............................................e................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v26.4S, v19.4S, v31.S[2] // .................................................................e.............................................................................. + mul v6.4S, v24.4S, v20.S[0] // ......................................................................e......................................................................... + // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v19.4S, v31.S[3] // ................................................................e............................................................................... + mls v30.4S, v29.4S, v8.S[0] // .............................................................e.................................................................................. 
+ // gap // ................................................................................................................................................ + trn1 v15.4S, v3.4S, v0.4S // ..........................................................................e..................................................................... + mls v18.4S, v17.4S, v8.S[0] // ..............................................e................................................................................................. + // gap // ................................................................................................................................................ + ldr q3, [x5, #96] // ....................................................................................................................e........................... + // gap // ................................................................................................................................................ + add v19.4S, v10.4S, v16.4S // ..............................................................................................................*................................. + // gap // ................................................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // .......................................................................e........................................................................ + mls v26.4S, v24.4S, v8.S[0] // ..................................................................e............................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v12.4S, v5.4S, v30.4S // ..............................................................e................................................................................. + sub v27.4S, v9.4S, v18.4S // ...............................................e................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v0.4S, v9.4S, v18.4S // ................................................e............................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + ldr q29, [x5, #-64] // ......................................................................................................................*......................... + ldr q31, [x5, #64] // ..............................................................................................e................................................. + add v9.4S, v27.4S, v6.4S // .........................................................................e...................................................................... 
+ add v30.4S, v5.4S, v30.4S // ...............................................................e................................................................................ + // gap // ................................................................................................................................................ + sub v24.4S, v27.4S, v6.4S // ........................................................................e....................................................................... + sub v27.4S, v0.4S, v26.4S // ...................................................................e............................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v6.4S, v0.4S, v26.4S // ....................................................................e........................................................................... + // gap // ................................................................................................................................................ + trn2 v18.4S, v30.4S, v12.4S // .............................................................................e.................................................................. + mul v17.4S, v14.4S, v29.4S // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn1 v14.4S, v9.4S, v24.4S // ....................................................................................e........................................................... + trn2 v9.4S, v9.4S, v24.4S // .....................................................................................e.......................................................... + trn1 v0.4S, v30.4S, v12.4S // ............................................................................e................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v29.4S, v2.4S, v25.4S // ............................................................................................................................................*... + sub v20.4S, v10.4S, v16.4S // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v12.4S, v6.4S, v27.4S // ..................................................................................e............................................................. 
+ // gap // ................................................................................................................................................ + mls v17.4S, v4.4S, v8.S[0] // ......................................................................................................................................*......... + ldr q4, [x5], #(12*16) // ..........................................................................................e..................................................... + trn2 v2.4S, v6.4S, v27.4S // ...................................................................................e............................................................ + st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // ..............................................................................................................................................*. + trn2 v30.2D, v15.2D, v0.2D // ..............................................................................e................................................................. + // gap // ................................................................................................................................................ + trn2 v24.2D, v12.2D, v14.2D // ......................................................................................e......................................................... + // gap // ................................................................................................................................................ + trn1 v16.2D, v13.2D, v18.2D // .................................................................................e.............................................................. + // gap // ................................................................................................................................................ 
+ add v26.4S, v11.4S, v17.4S // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + trn2 v6.2D, v2.2D, v9.2D // .......................................................................................e........................................................ + add x1, x1, #64 // ........e....................................................................................................................................... + sqrdmulh v19.4S, v24.4S, v1.4S // ..........................................................................................................................e..................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v27.4S, v11.4S, v17.4S // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + mul v17.4S, v24.4S, v3.4S // ...........................................................................................................................e.................... + // gap // ................................................................................................................................................ 
+ sqrdmulh v25.4S, v30.4S, v7.4S // ................................................................................................e............................................... + // gap // ................................................................................................................................................ + st4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x2], #64 // ...............................................................................................................................................* + mul v29.4S, v30.4S, v4.4S // .................................................................................................e.............................................. + mul v10.4S, v6.4S, v3.4S // ................................................................................................................................e............... + sqrdmulh v20.4S, v6.4S, v1.4S // ...............................................................................................................................e................ + ldr q27, [x5, #-144] // .............................................................................................e.................................................. + add x2, x2, #64 // .........e...................................................................................................................................... + trn2 v13.2D, v13.2D, v18.2D // ...............................................................................e................................................................ + // gap // ................................................................................................................................................ + trn1 v18.2D, v2.2D, v9.2D // .........................................................................................e...................................................... 
+ ldr q1, [x5, #-32] // ........................................................................................................................e....................... + // gap // ................................................................................................................................................ + mls v29.4S, v25.4S, v8.S[0] // ..................................................................................................e............................................. + mul v9.4S, v13.4S, v4.4S // ......................................................................................................e......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v10.4S, v20.4S, v8.S[0] // .................................................................................................................................e.............. + // gap // ................................................................................................................................................ + mls v17.4S, v19.4S, v8.S[0] // ............................................................................................................................e................... 
+ + // ---------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q9, [x1, #(16*0 + (64))] // ...................e............................................................................................................................'..................~................................................................................................................ + // ldr q10, [x1, #(16*1 + (64))] // .......................e........................................................................................................................'......................~............................................................................................................ + // ldr q11, [x1, #(16*2 + (64))] // ......e.........................................................................................................................................'.....~............................................................................................................................. + // ldr q12, [x1, #(16*3 + (64))] // ..............e.................................................................................................................................'.............~..................................................................................................................... 
+ // ldr q13, [x2, #(16*0 + (64))] // ...........e....................................................................................................................................'..........~........................................................................................................................ + // ldr q14, [x2, #(16*1 + (64))] // .e..............................................................................................................................................'~.................................................................................................................................. + // ldr q15, [x2, #(16*2 + (64))] // ........e.......................................................................................................................................'.......~........................................................................................................................... + // ldr q16, [x2, #(16*3 + (64))] // e...............................................................................................................................................~................................................................................................................................... + // add x1, x1, #64 // ..............................................................................................................................e.................'.............................................................................................................................~..... + // add x2, x2, #64 // ........................................................................................................................................e.......'................................................................................................................................... 
+ // ldr q0, [x4], #64 // ....e...........................................................................................................................................'...~............................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // .................................................e..............................................................................................'................................................~.................................................................................. + // ldr q2, [x4, #(-64 + 32)] // ...................................e............................................................................................................'..................................~................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ................................................e...............................................................................................'...............................................~................................................................................... + // sqrdmulh v27.4s, v13.4s, v0.s[1] // ............................e...................................................................................................................'...........................~....................................................................................................... + // mul v24.4s, v13.4s, v0.s[0] // .........................e......................................................................................................................'........................~.......................................................................................................... 
+ // mls v24.4s, v27.4s, v8.s[0] // ....................................e...........................................................................................................'...................................~............................................................................................... + // sub v13.4s, v9.4s, v24.4s // ..........................................e.....................................................................................................'.........................................~......................................................................................... + // add v9.4s, v9.4s, v24.4s // ........................................................e.......................................................................................'.......................................................~........................................................................... + // sqrdmulh v27.4s, v14.4s, v0.s[1] // ...............................e................................................................................................................'..............................~.................................................................................................... + // mul v24.4s, v14.4s, v0.s[0] // ..................................e.............................................................................................................'.................................~................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ........................................e.......................................................................................................'.......................................~........................................................................................... 
+ // sub v14.4s, v10.4s, v24.4s // .....................................................................e..........................................................................'....................................................................~.............................................................. + // add v10.4s, v10.4s, v24.4s // ...............................................e................................................................................................'..............................................~.................................................................................... + // sqrdmulh v27.4s, v15.4s, v0.s[1] // .....................e..........................................................................................................................'....................~.............................................................................................................. + // mul v24.4s, v15.4s, v0.s[0] // .............................e..................................................................................................................'............................~...................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................e..........................................................................................................'....................................~.............................................................................................. + // sub v15.4s, v11.4s, v24.4s // ...........................................e....................................................................................................'..........................................~........................................................................................ 
+ // add v11.4s, v11.4s, v24.4s // .............................................e..................................................................................................'............................................~...................................................................................... + // sqrdmulh v27.4s, v16.4s, v0.s[1] // ..................e.............................................................................................................................'.................~................................................................................................................. + // mul v24.4s, v16.4s, v0.s[0] // .................e..............................................................................................................................'................~.................................................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ...........................e....................................................................................................................'..........................~........................................................................................................ + // sub v16.4s, v12.4s, v24.4s // .......................................................e........................................................................................'......................................................~............................................................................ + // add v12.4s, v12.4s, v24.4s // .................................e..............................................................................................................'................................~.................................................................................................. 
+ // sqrdmulh v27.4s, v11.4s, v0.s[3] // ..................................................e.............................................................................................'.................................................~................................................................................. + // mul v24.4s, v11.4s, v0.s[2] // ...................................................e............................................................................................'..................................................~................................................................................ + // mls v24.4s, v27.4s, v8.s[0] // ............................................................e...................................................................................'...........................................................~....................................................................... + // sub v11.4s, v9.4s, v24.4s // .............................................................................e..................................................................'............................................................................~...................................................... + // add v9.4s, v9.4s, v24.4s // .................................................................e..............................................................................'................................................................~.................................................................. + // sqrdmulh v27.4s, v12.4s, v0.s[3] // .......................................e........................................................................................................'......................................~............................................................................................ 
+ // mul v24.4s, v12.4s, v0.s[2] // ......................................e.........................................................................................................'.....................................~............................................................................................. + // mls v24.4s, v27.4s, v8.s[0] // ............................................e...................................................................................................'...........................................~....................................................................................... + // sub v12.4s, v10.4s, v24.4s // ....................................................e...........................................................................................'...................................................~............................................................................... + // add v10.4s, v10.4s, v24.4s // .....................................................e..........................................................................................'....................................................~.............................................................................. + // sqrdmulh v27.4s, v15.4s, v1.s[1] // ........................................................................................e.......................................................'.......................................................................................~........................................... + // mul v24.4s, v15.4s, v1.s[0] // .................................................................................e..............................................................'................................................................................~.................................................. 
+ // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................e.................................................'.............................................................................................~..................................... + // sub v15.4s, v13.4s, v24.4s // ....................................................................................................e...........................................'...................................................................................................~............................... + // add v13.4s, v13.4s, v24.4s // .....................................................................................................e..........................................'....................................................................................................~.............................. + // sqrdmulh v27.4s, v16.4s, v1.s[1] // ....................................................................e...........................................................................'...................................................................~............................................................... + // mul v24.4s, v16.4s, v1.s[0] // ................................................................e...............................................................................'...............................................................~................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................e...................................................................'...........................................................................~....................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ...................................................................................e............................................................'..................................................................................~................................................ + // add v14.4s, v14.4s, v24.4s // ..................................................................................e.............................................................'.................................................................................~................................................. + // sqrdmulh v27.4s, v10.4s, v1.s[3] // ..........................................................e.....................................................................................'.........................................................~......................................................................... + // mul v24.4s, v10.4s, v1.s[2] // .........................................................e......................................................................................'........................................................~.......................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...............................................................e................................................................................'..............................................................~.................................................................... + // sub v10.4s, v9.4s, v24.4s // .......................................................................e........................................................................'......................................................................~............................................................ 
+ // add v9.4s, v9.4s, v24.4s // ......................................................................e.........................................................................'.....................................................................~............................................................. + // sqrdmulh v27.4s, v12.4s, v2.s[1] // ....................................................................................e...........................................................'...................................................................................~............................................... + // mul v24.4s, v12.4s, v2.s[0] // ................................................................................e...............................................................'...............................................................................~................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................e...................................................'...........................................................................................~....................................... + // sub v12.4s, v11.4s, v24.4s // ...................................................................................................e............................................'..................................................................................................~................................ + // add v11.4s, v11.4s, v24.4s // .........................................................................................................e......................................'........................................................................................................~.......................... 
+ // sqrdmulh v27.4s, v14.4s, v2.s[3] // ...........................................................................................e....................................................'..........................................................................................~........................................ + // mul v24.4s, v14.4s, v2.s[2] // .........................................................................................e......................................................'........................................................................................~.......................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................................................e.............................................'.................................................................................................~................................. + // sub v14.4s, v13.4s, v24.4s // ...........................................................................................................e....................................'..........................................................................................................~........................ + // add v13.4s, v13.4s, v24.4s // ............................................................................................................e...................................'...........................................................................................................~....................... + // sqrdmulh v27.4s, v16.4s, v3.s[1] // .......................................................................................e........................................................'......................................................................................~............................................ 
+ // mul v24.4s, v16.4s, v3.s[0] // ..........................................................................................e.....................................................'.........................................................................................~......................................... + // mls v24.4s, v27.4s, v8.s[0] // .................................................................................................e..............................................'................................................................................................~.................................. + // sub v16.4s, v15.4s, v24.4s // ..........................................................................................................e.....................................'.........................................................................................................~......................... + // add v15.4s, v15.4s, v24.4s // ........................................................................................................e.......................................'.......................................................................................................~........................... + // trn1 v25.4s, v9.4s, v10.4s // .............................................................................................e..................................................'............................................................................................~...................................... + // trn2 v26.4s, v9.4s, v10.4s // ..........................................................................e.....................................................................'.........................................................................~......................................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // .................................................................................................................e..............................'................................................................................................................~.................. + // trn2 v28.4s, v11.4s, v12.4s // .............................................................................................................e..................................'............................................................................................................~...................... + // trn2 v11.2d, v25.2d, v27.2d // .........................................................................................................................e......................'........................................................................................................................~.......... + // trn2 v12.2d, v26.2d, v28.2d // .........................................................................................................................................e......'................................................................................................................................... + // trn1 v9.2d, v25.2d, v27.2d // .....~..........................................................................................................................................'....*.............................................................................................................................. + // trn1 v10.2d, v26.2d, v28.2d // ...........................................................................................................................e....................'..........................................................................................................................~........ 
+ // trn1 v25.4s, v13.4s, v14.4s // ....................................................................................................................e...........................'...................................................................................................................~............... + // trn2 v26.4s, v13.4s, v14.4s // .......................................................................................................................e........................'......................................................................................................................~............ + // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................................................e................................'..............................................................................................................~.................... + // trn2 v28.4s, v15.4s, v16.4s // ................................................................................................................e...............................'...............................................................................................................~................... + // trn2 v15.2d, v25.2d, v27.2d // ..........................................................................................................................e.....................'.........................................................................................................................~......... + // trn2 v16.2d, v26.2d, v28.2d // .............................................................................................................................e..................'............................................................................................................................~...... 
+ // trn1 v13.2d, v25.2d, v27.2d // ...~............................................................................................................................................'..*................................................................................................................................ + // trn1 v14.2d, v26.2d, v28.2d // ..........................................................................................................................................e.....'................................................................................................................................... + // ldr q0, [ x5], #(12*16) // ......................................................................................................................e.........................'.....................................................................................................................~............. + // ldr q4, [x5, #(-12*16 + 1*16)] // ......................................................................................e.........................................................'.....................................................................................~............................................. + // ldr q1, [ x5, #(-12*16 + 2*16)] // .......~........................................................................................................................................'......*............................................................................................................................ + // ldr q5, [x5, #(-12*16 + 3*16)] // .......................................................................................................................................e........'................................................................................................................................... 
+ // ldr q2, [ x5, #(-12*16 + 4*16)] // .......................................................................................................e........................................'......................................................................................................~............................ + // ldr q6, [x5, #(-12*16 + 5*16)] // ................................~...............................................................................................................'...............................*................................................................................................... + // sqrdmulh v27.4s, v11.4s, v4.4s // ..................................................................................................................................e.............'.................................................................................................................................~. + // mul v24.4s, v11.4s, v0.4s // ....................................................................................................................................e...........'................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ............................................................................................................................................e...'................................................................................................................................... + // sub v11.4s, v9.4s, v24.4s // ...........................................................~....................................................................................'..........................................................*........................................................................ 
+ // add v9.4s, v9.4s, v24.4s // ..............................................................~.................................................................................'.............................................................*..................................................................... + // sqrdmulh v27.4s, v12.4s, v4.4s // ..~.............................................................................................................................................'.*................................................................................................................................. + // mul v24.4s, v12.4s, v0.4s // .............................................................................................................................................e..'................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // .............~..................................................................................................................................'............*...................................................................................................................... + // sub v12.4s, v10.4s, v24.4s // ........................~.......................................................................................................................'.......................*........................................................................................................... + // add v10.4s, v10.4s, v24.4s // ....................~...........................................................................................................................'...................*............................................................................................................... 
+ // sqrdmulh v27.4s, v10.4s, v5.4s // ..............................................~.................................................................................................'.............................................*..................................................................................... + // mul v24.4s, v10.4s, v1.4s // ..........................~.....................................................................................................................'.........................*......................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ...........................................................................~....................................................................'..........................................................................*........................................................ + // sub v10.4s, v9.4s, v24.4s // ...................................................................................................................~............................'..................................................................................................................*................ + // add v9.4s, v9.4s, v24.4s // ................................................................................................~...............................................'...............................................................................................*................................... + // sqrdmulh v27.4s, v12.4s, v6.4s // .........................................~......................................................................................................'........................................*.......................................................................................... 
+ // mul v24.4s, v12.4s, v2.4s // ..............................~.................................................................................................................'.............................*..................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..................................................................~.............................................................................'.................................................................*................................................................. + // sub v12.4s, v11.4s, v24.4s // .........................................................................~......................................................................'........................................................................*.......................................................... + // add v11.4s, v11.4s, v24.4s // ..............................................................................~.................................................................'.............................................................................*..................................................... + // ldr q0, [ x5, #(-12*16 + 6*16)] // ...............................................................................................e................................................'..............................................................................................~.................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................................e............................................................................'..................................................................~................................................................ 
+ // ldr q1, [ x5, #(-12*16 + 8*16)] // ......................................................................................................~.........................................'.....................................................................................................*............................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ......................e.........................................................................................................................'.....................~............................................................................................................. + // ldr q2, [ x5, #(-12*16 + 10*16)] // ...........................................................................................................................................e....'................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ......................................................~.........................................................................................'.....................................................*............................................................................. + // sqrdmulh v27.4s, v15.4s, v4.4s // ...............................................................................................................................e................'..............................................................................................................................~.... + // mul v24.4s, v15.4s, v0.4s // .................................................................................................................................e..............'................................................................................................................................~.. 
+ // mls v24.4s, v27.4s, v8.s[0] // ...............................................................................................................................................e'................................................................................................................................... + // sub v15.4s, v13.4s, v24.4s // ............~...................................................................................................................................'...........*....................................................................................................................... + // add v13.4s, v13.4s, v24.4s // .....................................................................................~..........................................................'....................................................................................*.............................................. + // sqrdmulh v27.4s, v16.4s, v4.4s // ......................................................................................................................................e.........'................................................................................................................................... + // mul v24.4s, v16.4s, v0.4s // .....................................................................................................................................e..........'................................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ..............................................................................................................................................e.'................................................................................................................................... 
+ // sub v16.4s, v14.4s, v24.4s // ..........~.....................................................................................................................................'.........*......................................................................................................................... + // add v14.4s, v14.4s, v24.4s // .........~......................................................................................................................................'........*.......................................................................................................................... + // sqrdmulh v27.4s, v14.4s, v5.4s // ................~...............................................................................................................................'...............*................................................................................................................... + // mul v24.4s, v14.4s, v1.4s // ..............................................................................................................~.................................'.............................................................................................................*..................... + // mls v24.4s, v27.4s, v8.s[0] // .....................................................................................................................~..........................'....................................................................................................................*.............. + // sub v14.4s, v13.4s, v24.4s // ................................................................................................................................~...............'...............................................................................................................................*... 
+ // add v13.4s, v13.4s, v24.4s // ............................................................................................................................~...................'...........................................................................................................................*....... + // sqrdmulh v27.4s, v16.4s, v6.4s // .............................................................~..................................................................................'............................................................*...................................................................... + // mul v24.4s, v16.4s, v2.4s // ...............~................................................................................................................................'..............*.................................................................................................................... + // mls v24.4s, v27.4s, v8.s[0] // ........................................................................~.......................................................................'.......................................................................*........................................................... + // sub v16.4s, v15.4s, v24.4s // ..................................................................................................................~.............................'.................................................................................................................*................. + // add v15.4s, v15.4s, v24.4s // ...............................................................................~................................................................'..............................................................................*.................................................... 
+ // st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ........................................................................................................................~.......................'.......................................................................................................................*........... + // st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2], #64 // ...................................................................................................................................~............'..................................................................................................................................* sub count, count, #1 cbnz count, layer45678_start - ins v2.d[0], x13 // ....................*............................................................................ - ldr x10, [x5, #-112] // ...*............................................................................................. - sqrdmulh v22.4S, v26.4S, v27.S[3] // ....*............................................................................................ - ldr x14, [x4, #-16] // ............................*.................................................................... - add x2, x2, #64 // ..............................*.................................................................. - ldr x28, [x4, #-8] // ................................*................................................................ - ldr x27, [x5, #-160] // ...........*..................................................................................... - ins v20.d[0], x8 // .......................*......................................................................... - sub v1.4S, v30.4S, v29.4S // ..........*...................................................................................... - ldr x21, [x5, #-40] // ........................*........................................................................ 
- add x1, x1, #64 // ..*.............................................................................................. - // gap // ................................................................................................. - sub v31.4S, v31.4S, v4.4S // .....*........................................................................................... - ins v16.d[0], x14 // ...............................*................................................................. - ldr x19, [x5, #-64] // ........*........................................................................................ - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v18.4S, v18.4S, v3.4S // ..........................*...................................................................... - ldr x23, [x5, #-56] // ......................*.......................................................................... - // gap // ................................................................................................. - ins v16.d[1], x28 // ...................................*............................................................. - ldr x25, [x5, #-136] // ..................................*.............................................................. - sqrdmulh v7.4S, v31.4S, v5.S[1] // ...................*............................................................................. - // gap // ................................................................................................. - mul v0.4S, v31.4S, v5.S[0] // ................*................................................................................ - ldr x14, [x5, #-24] // .......................................*......................................................... 
- // gap // ................................................................................................. - // gap // ................................................................................................. - mul v9.4S, v1.4S, v16.S[0] // ....................................*............................................................ - trn2 v12.2D, v28.2D, v10.2D // *................................................................................................ - // gap // ................................................................................................. - // gap // ................................................................................................. - sqrdmulh v4.4S, v1.4S, v16.S[1] // .....................................*........................................................... - ins v19.d[0], x29 // .............................*................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - ins v2.d[1], x25 // ......................................*.......................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - ins v20.d[1], x21 // ..............................................................*.................................. - // gap // ................................................................................................. 
- // gap // ................................................................................................. - mls v9.4S, v4.4S, v8.S[0] // ........................................*........................................................ - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - ins v30.d[0], x19 // .................................*............................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - mls v14.4S, v22.4S, v8.S[0] // ..............*.................................................................................. - ins v19.d[1], x14 // ..................................................*.............................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - mls v0.4S, v7.4S, v8.S[0] // .........................*....................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. 
- ins v30.d[1], x23 // .................................................*............................................... - ldr x23, [x5, #-8] // ...................................................*............................................. - ins v29.d[0], x26 // ...................................................................*............................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v22.4S, v18.4S, v0.4S // ...........................................*..................................................... - add v1.4S, v18.4S, v0.4S // .........................................*....................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - ins v29.d[1], x23 // ..............................................................................*.................. - ldr x23, [x5, #-152] // ..................................................................*.............................. - sub v16.4S, v13.4S, v11.4S // .........*....................................................................................... - // gap // ................................................................................................. 
- add v5.4S, v1.4S, v14.4S // .............................................*................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v0.4S, v22.4S, v9.4S // ..............................................*.................................................. - // gap // ................................................................................................. - add v18.4S, v22.4S, v9.4S // ...............................................*................................................. - // gap // ................................................................................................. - sub v7.4S, v1.4S, v14.4S // ............................................*.................................................... - // gap // ................................................................................................. - mul v1.4S, v16.4S, v17.4S // .............*................................................................................... - // gap // ................................................................................................. - trn1 v15.2D, v28.2D, v10.2D // ...............*................................................................................. - // gap // ................................................................................................. - trn2 v9.4S, v18.4S, v0.4S // ......................................................*.......................................... - // gap // ................................................................................................. 
- trn1 v4.4S, v5.4S, v7.4S // ................................................*................................................ - trn1 v31.4S, v18.4S, v0.4S // ....................................................*............................................ - // gap // ................................................................................................. - // gap // ................................................................................................. - trn2 v24.4S, v5.4S, v7.4S // .........................................................*....................................... - ins v14.d[0], x27 // .....................................................*........................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - add v27.4S, v13.4S, v11.4S // .*............................................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - ins v6.d[1], x20 // .......*......................................................................................... - trn2 v0.2D, v4.2D, v31.2D // .......................................................*......................................... - // gap // ................................................................................................. - // gap // ................................................................................................. 
- trn2 v5.2D, v24.2D, v9.2D // ...............................................................*................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - sqrdmulh v7.4S, v0.4S, v21.4S // ...........................................................*..................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - mul v10.4S, v0.4S, v6.4S // ..........................................................*...................................... - sqrdmulh v18.4S, v5.4S, v21.4S // ....................................................................*............................ - mul v22.4S, v5.4S, v6.4S // ......................................................................*.......................... - // gap // ................................................................................................. - // gap // ................................................................................................. - sqrdmulh v3.4S, v12.4S, v23.4S // ......*.......................................................................................... - ins v14.d[1], x23 // .....................................................................*........................... - // gap // ................................................................................................. - // gap // ................................................................................................. 
- mls v10.4S, v7.4S, v8.S[0] // ............................................................*.................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - mls v22.4S, v18.4S, v8.S[0] // ........................................................................*........................ - ldr x23, [x5, #-104] // .........................................................................*....................... - trn1 v0.2D, v4.2D, v31.2D // ........................................................*........................................ - // gap // ................................................................................................. - sqrdmulh v18.4S, v27.4S, v2.4S // ..........................................*...................................................... - trn1 v7.2D, v24.2D, v9.2D // .............................................................*................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - add v6.4S, v0.4S, v10.4S // .................................................................*............................... - mul v24.4S, v12.4S, v25.4S // ............*.................................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. 
- add v21.4S, v7.4S, v22.4S // ...........................................................................*..................... - ins v26.d[0], x10 // ..................*.............................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v7.4S, v7.4S, v22.4S // ...............................................................................*................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - mul v25.4S, v27.4S, v14.4S // .......................................................................*......................... - mls v24.4S, v3.4S, v8.S[0] // .................*............................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - sqrdmulh v5.4S, v7.4S, v29.4S // ....................................................................................*............ - mul v28.4S, v7.4S, v19.4S // ...................................................................................*............. - // gap // ................................................................................................. - // gap // ................................................................................................. 
- mul v13.4S, v21.4S, v30.4S // .........................................................................................*....... - ins v26.d[1], x23 // ..........................................................................*...................... - // gap // ................................................................................................. - // gap // ................................................................................................. - sqrdmulh v27.4S, v21.4S, v20.4S // ..........................................................................................*...... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - mls v28.4S, v5.4S, v8.S[0] // ........................................................................................*........ - sub v3.4S, v15.4S, v24.4S // .....................*........................................................................... - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v30.4S, v0.4S, v10.4S // ................................................................*................................ - sqrdmulh v9.4S, v16.4S, v26.4S // .............................................................................*................... - // gap // ................................................................................................. - // gap // ................................................................................................. 
- mls v13.4S, v27.4S, v8.S[0] // ...........................................................................................*..... - mls v25.4S, v18.4S, v8.S[0] // ............................................................................*.................... - // gap // ................................................................................................. - // gap // ................................................................................................. - add v29.4S, v15.4S, v24.4S // ...........................*..................................................................... - sub v12.4S, v30.4S, v28.4S // ..............................................................................................*.. - // gap // ................................................................................................. - // gap // ................................................................................................. - mls v1.4S, v9.4S, v8.S[0] // ..................................................................................*.............. - add v11.4S, v30.4S, v28.4S // ............................................................................................*.... - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v10.4S, v6.4S, v13.4S // .............................................................................................*... - add v9.4S, v6.4S, v13.4S // ...............................................................................................*. - // gap // ................................................................................................. - // gap // ................................................................................................. 
- sub v21.4S, v29.4S, v25.4S // ................................................................................*................ - add v20.4S, v29.4S, v25.4S // .................................................................................*............... - // gap // ................................................................................................. - // gap // ................................................................................................. - sub v23.4S, v3.4S, v1.4S // .....................................................................................*........... - add v22.4S, v3.4S, v1.4S // ......................................................................................*.......... - st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2], #64 // ................................................................................................* - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 // .......................................................................................*......... - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. 
- // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - // gap // ................................................................................................. - - // original source code - // trn2 v9.2D, v28.2D, v10.2D // ......................*.......................................................................... - // add v12.4S, v13.4S, v11.4S // ...................................................*............................................. - // add x1, x1, #64 // ..........*...................................................................................... - // ldr x27, [x5, #-112] // .*............................................................................................... 
- // sqrdmulh v26.4S, v26.4S, v27.S[3] // ..*.............................................................................................. - // sub v4.4S, v31.4S, v4.4S // ...........*..................................................................................... - // sqrdmulh v22.4S, v9.4S, v23.4S // ...........................................................*..................................... - // ins v6.d[1], x20 // ....................................................*............................................ - // ldr x12, [x5, #-64] // .............*................................................................................... - // sub v15.4S, v13.4S, v11.4S // .......................................*......................................................... - // sub v2.4S, v30.4S, v29.4S // ........*........................................................................................ - // ldr x10, [x5, #-160] // ......*.......................................................................................... - // mul v7.4S, v9.4S, v25.4S // ....................................................................*............................ - // mul v19.4S, v15.4S, v17.4S // ............................................*.................................................... - // mls v14.4S, v26.4S, v8.S[0] // .............................*................................................................... - // trn1 v20.2D, v28.2D, v10.2D // .............................................*................................................... - // mul v13.4S, v4.4S, v5.S[0] // ...................*............................................................................. - // mls v7.4S, v22.4S, v8.S[0] // .........................................................................*....................... - // ins v31.d[0], x27 // ......................................................................*.......................... 
- // sqrdmulh v10.4S, v4.4S, v5.S[1] // ..................*.............................................................................. - // ins v11.d[0], x13 // *................................................................................................ - // sub v28.4S, v20.4S, v7.4S // ................................................................................*................ - // ldr x23, [x5, #-56] // ...............*................................................................................. - // ins v1.d[0], x8 // .......*......................................................................................... - // ldr x19, [x5, #-40] // .........*....................................................................................... - // mls v13.4S, v10.4S, v8.S[0] // ...............................*................................................................. - // sub v16.4S, v18.4S, v3.4S // ..............*.................................................................................. - // add v18.4S, v20.4S, v7.4S // .....................................................................................*........... - // ldr x7, [x4, #-16] // ...*............................................................................................. - // ins v0.d[0], x29 // ........................*........................................................................ - // add x2, x2, #64 // ....*............................................................................................ - // ins v25.d[0], x7 // ............*.................................................................................... - // ldr x7, [x4, #-8] // .....*........................................................................................... - // ins v7.d[0], x12 // ............................*.................................................................... 
- // ldr x17, [x5, #-136] // .................*............................................................................... - // ins v25.d[1], x7 // ................*................................................................................ - // mul v24.4S, v2.4S, v25.S[0] // .....................*........................................................................... - // sqrdmulh v2.4S, v2.4S, v25.S[1] // .......................*......................................................................... - // ins v11.d[1], x17 // .........................*....................................................................... - // ldr x17, [x5, #-24] // ....................*............................................................................ - // mls v24.4S, v2.4S, v8.S[0] // ...........................*..................................................................... - // add v30.4S, v16.4S, v13.4S // ....................................*............................................................ - // sqrdmulh v11.4S, v12.4S, v11.4S // .................................................................*............................... - // sub v2.4S, v16.4S, v13.4S // ...................................*............................................................. - // sub v29.4S, v30.4S, v14.4S // ...........................................*..................................................... - // add v16.4S, v30.4S, v14.4S // ........................................*........................................................ - // sub v14.4S, v2.4S, v24.4S // .........................................*....................................................... - // add v2.4S, v2.4S, v24.4S // ..........................................*...................................................... - // trn1 v3.4S, v16.4S, v29.4S // ...............................................*................................................. 
- // ins v7.d[1], x23 // ................................*................................................................ - // ins v0.d[1], x17 // ..............................*.................................................................. - // ldr x23, [x5, #-8] // .................................*............................................................... - // trn1 v24.4S, v2.4S, v14.4S // ................................................*................................................ - // ins v5.d[0], x10 // ..................................................*.............................................. - // trn2 v30.4S, v2.4S, v14.4S // ..............................................*.................................................. - // trn2 v10.2D, v3.2D, v24.2D // .....................................................*........................................... - // trn1 v2.2D, v3.2D, v24.2D // ................................................................*................................ - // trn2 v29.4S, v16.4S, v29.4S // .................................................*............................................... - // mul v9.4S, v10.4S, v6.4S // ........................................................*........................................ - // sqrdmulh v24.4S, v10.4S, v21.4S // .......................................................*......................................... - // mls v9.4S, v24.4S, v8.S[0] // .............................................................*................................... - // trn1 v3.2D, v29.2D, v30.2D // ..................................................................*.............................. - // ins v1.d[1], x19 // ..........................*...................................................................... - // trn2 v4.2D, v29.2D, v30.2D // ......................................................*.......................................... 
- // sub v24.4S, v2.4S, v9.4S // .................................................................................*............... - // add v10.4S, v2.4S, v9.4S // ...................................................................*............................. - // ldr x27, [x5, #-152] // ......................................*.......................................................... - // ins v9.d[0], x26 // ..................................*.............................................................. - // sqrdmulh v2.4S, v4.4S, v21.4S // .........................................................*....................................... - // ins v5.d[1], x27 // ............................................................*.................................... - // mul v21.4S, v4.4S, v6.4S // ..........................................................*...................................... - // mul v16.4S, v12.4S, v5.4S // ........................................................................*........................ - // mls v21.4S, v2.4S, v8.S[0] // ..............................................................*.................................. - // ldr x27, [x5, #-104] // ...............................................................*................................. - // ins v31.d[1], x27 // .............................................................................*................... - // add v29.4S, v3.4S, v21.4S // .....................................................................*........................... - // mls v16.4S, v11.4S, v8.S[0] // ....................................................................................*............ - // sqrdmulh v14.4S, v15.4S, v31.4S // ..................................................................................*.............. - // ins v9.d[1], x23 // .....................................*........................................................... 
- // sub v5.4S, v3.4S, v21.4S // .......................................................................*......................... - // sub v3.4S, v18.4S, v16.4S // ...........................................................................................*..... - // add v2.4S, v18.4S, v16.4S // ............................................................................................*.... - // mls v19.4S, v14.4S, v8.S[0] // .......................................................................................*......... - // mul v21.4S, v5.4S, v0.4S // ...........................................................................*..................... - // sqrdmulh v0.4S, v5.4S, v9.4S // ..........................................................................*...................... - // sub v5.4S, v28.4S, v19.4S // .............................................................................................*... - // add v4.4S, v28.4S, v19.4S // ..............................................................................................*.. - // st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ................................................................................................* - // mls v21.4S, v0.4S, v8.S[0] // ...............................................................................*................. - // mul v28.4S, v29.4S, v7.4S // ............................................................................*.................... - // sqrdmulh v15.4S, v29.4S, v1.4S // ..............................................................................*.................. - // mls v28.4S, v15.4S, v8.S[0] // ...................................................................................*............. - // add v2.4S, v24.4S, v21.4S // ........................................................................................*........ 
- // sub v1.4S, v10.4S, v28.4S // .........................................................................................*....... - // sub v3.4S, v24.4S, v21.4S // ......................................................................................*.......... - // add v0.4S, v10.4S, v28.4S // ..........................................................................................*...... - // st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x2], #64 // ...............................................................................................*. + // Instructions: 38 + // Expected cycles: 21 + // Expected IPC: 1.81 + // + // Wall time: 0.76s + // User time: 0.76s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + trn1 v25.2D, v12.2D, v14.2D // .*.................................... + sqrdmulh v7.4S, v13.4S, v7.4S // *..................................... + ldr q13, [x5, #-16] // .................*.................... + // gap // ...................................... + sub v28.4S, v18.4S, v10.4S // .....*................................ + trn1 v3.2D, v15.2D, v0.2D // ..*................................... + ldr q30, [x5, #-112] // ..............*....................... + // gap // ...................................... + add v22.4S, v25.4S, v17.4S // ...........................*.......... + sub v4.4S, v25.4S, v17.4S // ......*............................... + ldr q15, [x5, #-64] // .............................*........ + // gap // ...................................... + mls v9.4S, v7.4S, v8.S[0] // .......*.............................. + add v19.4S, v18.4S, v10.4S // ....*................................. + ldr q2, [x5, #-160] // ...*.................................. + // gap // ...................................... + sqrdmulh v21.4S, v28.4S, v13.4S // ...................*.................. + mul v20.4S, v28.4S, v1.4S // ........*............................. + // gap // ...................................... 
+ // gap // ...................................... + sqrdmulh v7.4S, v19.4S, v23.4S // .........*............................ + sub v11.4S, v3.4S, v29.4S // ..................*................... + // gap // ...................................... + // gap // ...................................... + mul v24.4S, v19.4S, v15.4S // ..............................*....... + sub v0.4S, v16.4S, v9.4S // ...........*.......................... + // gap // ...................................... + // gap // ...................................... + add v28.4S, v16.4S, v9.4S // ..........*........................... + mls v20.4S, v21.4S, v8.S[0] // ......................*............... + // gap // ...................................... + // gap // ...................................... + mul v17.4S, v0.4S, v31.4S // .............*........................ + sqrdmulh v21.4S, v0.4S, v30.4S // ...............*...................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v13.4S, v28.4S, v27.4S // ................*..................... + mul v26.4S, v28.4S, v2.4S // ............*......................... + // gap // ...................................... + // gap // ...................................... + mls v24.4S, v7.4S, v8.S[0] // .................................*.... + add v14.4S, v4.4S, v20.4S // ..........................*........... + // gap // ...................................... + // gap // ...................................... + sub v15.4S, v4.4S, v20.4S // ...............................*...... + mls v17.4S, v21.4S, v8.S[0] // .....................*................ + // gap // ...................................... + // gap // ...................................... + add v3.4S, v3.4S, v29.4S // ....................*................. + mls v26.4S, v13.4S, v8.S[0] // ........................*............. + // gap // ...................................... 
+ // gap // ...................................... + add v12.4S, v22.4S, v24.4S // ...................................*.. + sub v13.4S, v22.4S, v24.4S // ....................................*. + // gap // ...................................... + // gap // ...................................... + add v16.4S, v11.4S, v17.4S // .........................*............ + sub v17.4S, v11.4S, v17.4S // .......................*.............. + // gap // ...................................... + // gap // ...................................... + st4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x2], #64 // .....................................* + add v14.4S, v3.4S, v26.4S // ............................*......... + sub v15.4S, v3.4S, v26.4S // ................................*..... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // ..................................*... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // sqrdmulh v25.4S, v13.4S, v7.4S // .*.................................... + // trn1 v6.2D, v12.2D, v14.2D // *..................................... + // trn1 v26.2D, v15.2D, v0.2D // ....*................................. + // ldr q21, [x5, #-160] // ...........*.......................... + // add v14.4S, v18.4S, v10.4S // ..........*........................... + // sub v12.4S, v18.4S, v10.4S // ...*.................................. + // sub v2.4S, v6.4S, v17.4S // .......*.............................. + // mls v9.4S, v25.4S, v8.S[0] // .........*............................ + // mul v25.4S, v12.4S, v1.4S // .............*........................ + // sqrdmulh v4.4S, v14.4S, v23.4S // ..............*....................... + // add v20.4S, v16.4S, v9.4S // ..................*................... + // sub v9.4S, v16.4S, v9.4S // .................*.................... + // mul v16.4S, v20.4S, v21.4S // .......................*.............. + // mul v24.4S, v9.4S, v31.4S // ....................*................. + // ldr q19, [x5, #-112] // .....*................................ + // sqrdmulh v1.4S, v9.4S, v19.4S // .....................*................ + // sqrdmulh v19.4S, v20.4S, v27.4S // ......................*............... + // ldr q22, [x5, #-16] // ..*................................... + // sub v28.4S, v26.4S, v29.4S // ...............*...................... + // sqrdmulh v22.4S, v12.4S, v22.4S // ............*......................... + // add v10.4S, v26.4S, v29.4S // ............................*......... + // mls v24.4S, v1.4S, v8.S[0] // ...........................*.......... + // mls v25.4S, v22.4S, v8.S[0] // ...................*.................. + // sub v22.4S, v28.4S, v24.4S // .................................*.... + // mls v16.4S, v19.4S, v8.S[0] // .............................*........ 
+ // add v21.4S, v28.4S, v24.4S // ................................*..... + // add v28.4S, v2.4S, v25.4S // .........................*............ + // add v11.4S, v6.4S, v17.4S // ......*............................... + // add v19.4S, v10.4S, v16.4S // ...................................*.. + // ldr q29, [x5, #-64] // ........*............................. + // mul v17.4S, v14.4S, v29.4S // ................*..................... + // sub v29.4S, v2.4S, v25.4S // ..........................*........... + // sub v20.4S, v10.4S, v16.4S // ....................................*. + // mls v17.4S, v4.4S, v8.S[0] // ........................*............. + // st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // .....................................* + // add v26.4S, v11.4S, v17.4S // ..............................*....... + // sub v27.4S, v11.4S, v17.4S // ...............................*...... + // st4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x2], #64 // ..................................*... pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_firestorm.s new file mode 100644 index 00000000..b7f1b16c --- /dev/null +++ b/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_firestorm.s @@ -0,0 +1,2101 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and 
this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmla d,a,b + mla \d\().8h, \a\().8h, \b\().8h +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmlaq d,a,b,i + mla \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlaq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vmlaq \dst, t2, consts, 0 +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlaq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr qform_root0,
[r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] +.endm + +.macro load_next_roots_45 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #16 +.endm + +.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 
+#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + + .global ntt_kyber_1234_567 + .global _ntt_kyber_1234_567 + +.p2align 4 +const_addr: .short -3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567: +_ntt_kyber_1234_567: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + src0 .req x6 + src1 .req x7 + src2 .req x8 + src3 .req x9 + src4 .req x10 + src5 .req x11 + src6 .req x12 + src7 .req x13 + src8 .req x14 + src9 .req x15 + src10 .req x16 + src11 .req x17 + src12 .req x18 + src13 .req x19 + src14 .req x20 + src15 .req x21 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + data8 .req v17 + data9 .req v18 + data10 .req v19 + data11 .req v20 + data12 .req v21 + data13 .req v22 + data14 .req v23 + data15 .req v24 + + qform_data0 .req 
q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + qform_data8 .req q17 + qform_data9 .req q18 + qform_data10 .req q19 + qform_data11 .req q20 + qform_data12 .req q21 + qform_data13 .req q22 + qform_data14 .req q23 + qform_data15 .req q24 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + tmp .req v25 + t0 .req v26 + t1 .req v27 + t2 .req v28 + t3 .req v29 + + consts .req v8 + + ASM_LOAD(r_ptr0, roots) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + add src0, x0, #32*0 + add src8, x0, #32*8 + + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + + mov count, #2 + + .p2align 2 + ldr q15, [x14, #192] // ..............*................................................................................................................................................................................. + ldr q16, [x14, #224] // ...............*................................................................................................................................................................................ + ldr q27, [x14, #160] // .............*.................................................................................................................................................................................. + ldr q14, [x14, #64] // ..........*..................................................................................................................................................................................... 
+ ldr q6, [x14, #32] // .........*...................................................................................................................................................................................... + ldr q26, [x6, #0] // *............................................................................................................................................................................................... + ldr q7, [x14, #128] // ............*................................................................................................................................................................................... + ldr q11, [x14, #96] // ...........*.................................................................................................................................................................................... + ldr q13, [x14, #0] // ........*....................................................................................................................................................................................... + ldr q31, [x6, #224] // .......*........................................................................................................................................................................................ + ldr q25, [x6, #64] // ..*............................................................................................................................................................................................. + ldr q22, [x6, #192] // ......*......................................................................................................................................................................................... + mul v17.8H, v15.8H, v0.H[0] // ...............................................*................................................................................................................................................ 
+ sqrdmulh v15.8H, v15.8H, v0.H[1] // ..............................................*................................................................................................................................................. + mul v29.8H, v16.8H, v0.H[0] // ....................................................*........................................................................................................................................... + sqrdmulh v16.8H, v16.8H, v0.H[1] // ...................................................*............................................................................................................................................ + ldr q23, [x6, #160] // .....*.......................................................................................................................................................................................... + ldr q12, [x6, #96] // ...*............................................................................................................................................................................................ + ldr q19, [x6, #128] // ....*........................................................................................................................................................................................... + mla v29.8H, v16.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + mul v9.8H, v27.8H, v0.H[0] // ..........................................*..................................................................................................................................................... 
+ sqrdmulh v5.8H, v27.8H, v0.H[1] // .........................................*...................................................................................................................................................... + mla v17.8H, v15.8H, v8.H[0] // ................................................*............................................................................................................................................... + sqrdmulh v16.8H, v14.8H, v0.H[1] // ..........................*..................................................................................................................................................................... + mul v27.8H, v14.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + mul v24.8H, v7.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + sqrdmulh v21.8H, v7.8H, v0.H[1] // ....................................*........................................................................................................................................................... + mul v18.8H, v13.8H, v0.H[0] // .................*.............................................................................................................................................................................. + sqrdmulh v4.8H, v13.8H, v0.H[1] // ................*............................................................................................................................................................................... 
+ mul v14.8H, v11.8H, v0.H[0] // ................................*............................................................................................................................................................... + sqrdmulh v20.8H, v11.8H, v0.H[1] // ...............................*................................................................................................................................................................ + mla v9.8H, v5.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + sub v15.8H, v22.8H, v17.8H // .................................................*.............................................................................................................................................. + sub v30.8H, v31.8H, v29.8H // ......................................................*......................................................................................................................................... + mul v11.8H, v6.8H, v0.H[0] // ......................*......................................................................................................................................................................... + add v7.8H, v22.8H, v17.8H // ..................................................*............................................................................................................................................. + add v22.8H, v31.8H, v29.8H // .......................................................*........................................................................................................................................ 
+ mla v27.8H, v16.8H, v8.H[0] // ............................*................................................................................................................................................................... + sqrdmulh v17.8H, v22.8H, v0.H[3] // .......................................................................*........................................................................................................................ + mul v5.8H, v22.8H, v0.H[2] // ........................................................................*....................................................................................................................... + sqrdmulh v22.8H, v30.8H, v0.H[5] // ...........................................................................................*.................................................................................................... + mul v13.8H, v30.8H, v0.H[4] // ............................................................................................*................................................................................................... + mul v30.8H, v15.8H, v0.H[4] // .......................................................................................*........................................................................................................ + sub v16.8H, v23.8H, v9.8H // ............................................*................................................................................................................................................... + mla v14.8H, v20.8H, v8.H[0] // .................................*.............................................................................................................................................................. 
+ sqrdmulh v28.8H, v6.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + mla v24.8H, v21.8H, v8.H[0] // ......................................*......................................................................................................................................................... + sqrdmulh v20.8H, v7.8H, v0.H[3] // ..................................................................*............................................................................................................................. + mul v29.8H, v7.8H, v0.H[2] // ...................................................................*............................................................................................................................ + add v31.8H, v25.8H, v27.8H // ..............................*................................................................................................................................................................. + add v21.8H, v23.8H, v9.8H // .............................................*.................................................................................................................................................. + mla v5.8H, v17.8H, v8.H[0] // .........................................................................*...................................................................................................................... + mla v13.8H, v22.8H, v8.H[0] // .............................................................................................*.................................................................................................. 
+ ldr q23, [x6, #32] // .*.............................................................................................................................................................................................. + sqrdmulh v6.8H, v16.8H, v0.H[5] // .................................................................................*.............................................................................................................. + add v9.8H, v12.8H, v14.8H // ...................................*............................................................................................................................................................ + sqrdmulh v7.8H, v15.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mla v11.8H, v28.8H, v8.H[0] // .......................*........................................................................................................................................................................ + mla v30.8H, v7.8H, v8.H[0] // ........................................................................................*....................................................................................................... + add v7.8H, v9.8H, v5.8H // ...........................................................................*.................................................................................................................... + sqrdmulh v17.8H, v21.8H, v0.H[3] // .............................................................*.................................................................................................................................. 
+ mul v22.8H, v21.8H, v0.H[2] // ..............................................................*................................................................................................................................. + mla v18.8H, v4.8H, v8.H[0] // ..................*............................................................................................................................................................................. + add v4.8H, v19.8H, v24.8H // ........................................*....................................................................................................................................................... + sub v28.8H, v25.8H, v27.8H // .............................*.................................................................................................................................................................. + mla v29.8H, v20.8H, v8.H[0] // ....................................................................*........................................................................................................................... + add v21.8H, v23.8H, v11.8H // .........................*...................................................................................................................................................................... + sub v25.8H, v12.8H, v14.8H // ..................................*............................................................................................................................................................. + sqrdmulh v20.8H, v7.8H, v0.H[7] // .....................................................................................................*.......................................................................................... 
+ mul v7.8H, v7.8H, v0.H[6] // ......................................................................................................*......................................................................................... + sub v27.8H, v19.8H, v24.8H // .......................................*........................................................................................................................................................ + mul v14.8H, v16.8H, v0.H[4] // ..................................................................................*............................................................................................................. + mla v22.8H, v17.8H, v8.H[0] // ...............................................................*................................................................................................................................ + add v10.8H, v28.8H, v30.8H // ..........................................................................................*..................................................................................................... + sub v12.8H, v26.8H, v18.8H // ...................*............................................................................................................................................................................ + add v19.8H, v26.8H, v18.8H // ....................*........................................................................................................................................................................... + add v26.8H, v31.8H, v29.8H // ......................................................................*......................................................................................................................... 
+ mul v17.8H, v4.8H, v0.H[2] // .........................................................*...................................................................................................................................... + sub v24.8H, v25.8H, v13.8H // ..............................................................................................*................................................................................................. + mla v7.8H, v20.8H, v8.H[0] // .......................................................................................................*........................................................................................ + sqrdmulh v16.8H, v4.8H, v0.H[3] // ........................................................*....................................................................................................................................... + sqrdmulh v18.8H, v10.8H, v1.H[3] // ....................................................................................................................*........................................................................... + add v20.8H, v21.8H, v22.8H // .................................................................*.............................................................................................................................. + sub v15.8H, v31.8H, v29.8H // .....................................................................*.......................................................................................................................... + mla v14.8H, v6.8H, v8.H[0] // ...................................................................................*............................................................................................................ 
+ mul v29.8H, v24.8H, v1.H[4] // ....................................................................................................................................*........................................................... + add v31.8H, v25.8H, v13.8H // ...............................................................................................*................................................................................................ + sqrdmulh v25.8H, v24.8H, v1.H[5] // ...................................................................................................................................*............................................................ + sub v6.8H, v28.8H, v30.8H // .........................................................................................*...................................................................................................... + mla v17.8H, v16.8H, v8.H[0] // ..........................................................*..................................................................................................................................... + add v24.8H, v20.8H, v7.8H // .........................................................................................................*...................................................................................... + sqrdmulh v4.8H, v15.8H, v1.H[1] // ..........................................................................................................*..................................................................................... + mul v28.8H, v15.8H, v1.H[0] // ...........................................................................................................*.................................................................................... 
+ sqrdmulh v16.8H, v31.8H, v1.H[3] // .........................................................................................................................*...................................................................... + mul v30.8H, v6.8H, v1.H[4] // ...............................................................................................................................*................................................................ + sub v13.8H, v23.8H, v11.8H // ........................*....................................................................................................................................................................... + sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 24 + // Expected IPC: 8.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + sub v5.8H, v9.8H, v5.8H // .....*.......................................................................................................................................................................................... + sub v20.8H, v20.8H, v7.8H // ...*............................................................................................................................................................................................ + mul v23.8H, v26.8H, v0.H[6] // .*.............................................................................................................................................................................................. 
+ mul v10.8H, v10.8H, v1.H[2] // ........*....................................................................................................................................................................................... + sqrdmulh v15.8H, v26.8H, v0.H[7] // .......*........................................................................................................................................................................................ + mla v28.8H, v4.8H, v8.H[0] // .........................................................*...................................................................................................................................... + mul v4.8H, v24.8H, v1.H[6] // ...........*.................................................................................................................................................................................... + sub v26.8H, v19.8H, v17.8H // .............................................................*.................................................................................................................................. + sqrdmulh v7.8H, v5.8H, v1.H[1] // .............*.................................................................................................................................................................................. + mul v5.8H, v5.8H, v1.H[0] // ...............*................................................................................................................................................................................ + mul v11.8H, v20.8H, v2.H[0] // .................................................................*.............................................................................................................................. 
+ sqrdmulh v20.8H, v20.8H, v2.H[1] // ..................................................................*............................................................................................................................. + ldr q9, [x14, #208] // ................................................................................................*............................................................................................... + mla v29.8H, v25.8H, v8.H[0] // .........*...................................................................................................................................................................................... + mul v25.8H, v27.8H, v0.H[4] // *............................................................................................................................................................................................... + sqrdmulh v27.8H, v27.8H, v0.H[5] // ....*........................................................................................................................................................................................... + mla v10.8H, v18.8H, v8.H[0] // ....................*........................................................................................................................................................................... + ldr q18, [x14, #240] // .................................................................................................*.............................................................................................. + sub v21.8H, v21.8H, v22.8H // ...................*............................................................................................................................................................................ 
+ mla v11.8H, v20.8H, v8.H[0] // ...............................................................................*................................................................................................................ + add v20.8H, v26.8H, v28.8H // ........................................................................*....................................................................................................................... + mla v25.8H, v27.8H, v8.H[0] // ................*............................................................................................................................................................................... + sub v27.8H, v13.8H, v14.8H // ..............*................................................................................................................................................................................. + add v13.8H, v13.8H, v14.8H // ..................*............................................................................................................................................................................. + mul v14.8H, v31.8H, v1.H[2] // ..*............................................................................................................................................................................................. + mla v23.8H, v15.8H, v8.H[0] // .................*.............................................................................................................................................................................. + add v22.8H, v19.8H, v17.8H // .......................*........................................................................................................................................................................ 
+ sub v15.8H, v27.8H, v29.8H // .........................................*...................................................................................................................................................... + mla v5.8H, v7.8H, v8.H[0] // .........................*...................................................................................................................................................................... + sqrdmulh v7.8H, v18.8H, v0.H[1] // ...............................................................................................................*................................................................................ + add v31.8H, v27.8H, v29.8H // .....................*.......................................................................................................................................................................... + add v29.8H, v12.8H, v25.8H // ...........................................*.................................................................................................................................................... + sub v25.8H, v12.8H, v25.8H // ...............................*................................................................................................................................................................ + mul v19.8H, v18.8H, v0.H[0] // ..............................................................................................................*................................................................................. + sqrdmulh v27.8H, v6.8H, v1.H[5] // ......*......................................................................................................................................................................................... 
+ mla v14.8H, v16.8H, v8.H[0] // ............*................................................................................................................................................................................... + sub v16.8H, v22.8H, v23.8H // ..........................................................................*..................................................................................................................... + sqrdmulh v17.8H, v24.8H, v1.H[7] // ..........*..................................................................................................................................................................................... + mla v30.8H, v27.8H, v8.H[0] // ...........................*.................................................................................................................................................................... + sub v26.8H, v26.8H, v28.8H // .....................................................................*.......................................................................................................................... + add v12.8H, v21.8H, v5.8H // ......................................*......................................................................................................................................................... + mul v27.8H, v15.8H, v3.H[4] // .......................................................*........................................................................................................................................ + add v24.8H, v22.8H, v23.8H // ..............................*................................................................................................................................................................. 
+ sub v22.8H, v21.8H, v5.8H // .....................................*.......................................................................................................................................................... + sqrdmulh v6.8H, v15.8H, v3.H[5] // ........................................................*....................................................................................................................................... + sqrdmulh v28.8H, v9.8H, v0.H[1] // .............................................................................................................*.................................................................................. + ldr q15, [x14, #176] // ..................................................................................................*............................................................................................. + sqrdmulh v23.8H, v12.8H, v2.H[3] // ...................................................*............................................................................................................................................ + mla v19.8H, v7.8H, v8.H[0] // ...................................................................................................................*............................................................................ + mul v5.8H, v12.8H, v2.H[2] // ...............................................*................................................................................................................................................ + mul v18.8H, v9.8H, v0.H[0] // ............................................................................................................*................................................................................... 
+ ldr q9, [x6, #240] // .........................................................................................................*...................................................................................... + add v21.8H, v25.8H, v30.8H // ........................................*....................................................................................................................................................... + sub v12.8H, v25.8H, v30.8H // .......................................................................*........................................................................................................................ + add v25.8H, v16.8H, v11.8H // ..........................................................................................*..................................................................................................... + add v30.8H, v13.8H, v14.8H // ........................*....................................................................................................................................................................... + mla v27.8H, v6.8H, v8.H[0] // ......................................................................*......................................................................................................................... + sub v16.8H, v16.8H, v11.8H // ...........................................................................................*.................................................................................................... + ldr q6, [x14, #144] // ......................................................................................................*......................................................................................... 
+ str q25, [x6, #64] // ...............................................................................................*................................................................................................ + mul v11.8H, v31.8H, v3.H[2] // ............................*................................................................................................................................................................... + sqrdmulh v25.8H, v31.8H, v3.H[3] // .............................*.................................................................................................................................................................. + str q16, [x6, #96] // ..............................................................................................*................................................................................................. + sqrdmulh v16.8H, v22.8H, v2.H[5] // ..............................................*................................................................................................................................................. + mul v22.8H, v22.8H, v2.H[4] // ....................................................*........................................................................................................................................... + mla v4.8H, v17.8H, v8.H[0] // ......................*......................................................................................................................................................................... + mul v17.8H, v15.8H, v0.H[0] // ....................................................................................................................*........................................................................... 
+ mla v18.8H, v28.8H, v8.H[0] // ......................................................................................................................*......................................................................... + mla v5.8H, v23.8H, v8.H[0] // ...............................................................*................................................................................................................................ + sub v13.8H, v13.8H, v14.8H // ..........................*..................................................................................................................................................................... + sqrdmulh v28.8H, v15.8H, v0.H[1] // .....................................................................................................................*.......................................................................... + add v15.8H, v12.8H, v27.8H // ..................................................................................*............................................................................................................. + sub v12.8H, v12.8H, v27.8H // .................................................................................*.............................................................................................................. + mla v22.8H, v16.8H, v8.H[0] // ................................................................*............................................................................................................................... + mla v11.8H, v25.8H, v8.H[0] // ..........................................*..................................................................................................................................................... 
+ mul v23.8H, v30.8H, v2.H[6] // .................................*.............................................................................................................................................................. + sqrdmulh v27.8H, v30.8H, v2.H[7] // ................................*............................................................................................................................................................... + mla v23.8H, v27.8H, v8.H[0] // ................................................*............................................................................................................................................... + ldr q27, [x14, #112] // .......................................................................................................*........................................................................................ + ldr q16, [x14, #48] // ....................................................................................................*........................................................................................... + mla v17.8H, v28.8H, v8.H[0] // ...............................................................................................................................*................................................................ + ldr q28, [x6, #176] // ................................................................................................................*............................................................................... + sub v25.8H, v26.8H, v22.8H // ..............................................................................*................................................................................................................. 
+ add v30.8H, v9.8H, v19.8H // ....................................................................................................................................*........................................................... + str q15, [x14, #192] // .........................................................................................*...................................................................................................... + mul v31.8H, v13.8H, v3.H[0] // ..................................*............................................................................................................................................................. + sqrdmulh v14.8H, v13.8H, v3.H[1] // ...................................*............................................................................................................................................................ + add v13.8H, v29.8H, v10.8H // .....................................................*.......................................................................................................................................... + str q12, [x14, #224] // ............................................................................................*................................................................................................... + add v12.8H, v20.8H, v5.8H // ....................................................................................*........................................................................................................... + str q25, [x6, #224] // .............................................................................................*.................................................................................................. 
+ sub v25.8H, v20.8H, v5.8H // .............................................................................*.................................................................................................................. + mul v5.8H, v30.8H, v0.H[2] // .......................................................................................................................................*........................................................ + sqrdmulh v15.8H, v30.8H, v0.H[3] // ......................................................................................................................................*......................................................... + sub v30.8H, v29.8H, v10.8H // ..................................................*............................................................................................................................................. + ldr q7, [x14, #16] // ........................................................................................................*....................................................................................... + mla v5.8H, v15.8H, v8.H[0] // ...................................................................................................................................................*............................................ + mla v31.8H, v14.8H, v8.H[0] // .................................................*.............................................................................................................................................. + mul v20.8H, v7.8H, v0.H[0] // ...........................................................................................................................*.................................................................... 
+ sqrdmulh v7.8H, v7.8H, v0.H[1] // ............................................................................................................................*................................................................... + str q12, [x6, #128] // ........................................................................................*....................................................................................................... + ldr q10, [x14, #80] // ...................................................................................................*............................................................................................ + sub v14.8H, v24.8H, v4.8H // .......................................*........................................................................................................................................................ + sub v19.8H, v9.8H, v19.8H // .................................................................................................................................*.............................................................. + str q25, [x6, #160] // ......................................................................................*......................................................................................................... + mul v29.8H, v27.8H, v0.H[0] // .............................................................................................................................*.................................................................. + ldr q9, [x6, #16] // .....................................................................................................*.......................................................................................... 
+ sqrdmulh v12.8H, v27.8H, v0.H[1] // ..............................................................................................................................*................................................................. + ldr q15, [x6, #112] // .................................................................................................................*.............................................................................. + add v4.8H, v24.8H, v4.8H // ....................................*........................................................................................................................................................... + sqrdmulh v27.8H, v16.8H, v0.H[1] // .............................................................................................................................................*.................................................. + mul v25.8H, v16.8H, v0.H[0] // ..................................................................................................................................*............................................................. + ldr q16, [x6, #208] // ...........................................................................................................*.................................................................................... + add v24.8H, v26.8H, v22.8H // ................................................................................*............................................................................................................... + mla v20.8H, v7.8H, v8.H[0] // ..............................................................................................................................................................*................................. 
+ sub v26.8H, v30.8H, v31.8H // ...........................................................................*.................................................................................................................... + add v30.8H, v30.8H, v31.8H // ..........................................................*..................................................................................................................................... + mul v7.8H, v19.8H, v0.H[4] // .........................................................................................................................................*...................................................... + str q14, [x6, #32] // ............................................*................................................................................................................................................... + sqrdmulh v14.8H, v19.8H, v0.H[5] // ........................................................................................................................................*....................................................... + sqrdmulh v22.8H, v10.8H, v0.H[1] // .......................................................................................................................*........................................................................ + str q24, [x6, #192] // .......................................................................................*........................................................................................................ + add v19.8H, v13.8H, v23.8H // ............................................................*................................................................................................................................... 
+ mul v24.8H, v10.8H, v0.H[0] // ........................................................................................................................*....................................................................... + str q30, [x14, #64] // .........................................................................*...................................................................................................................... + mul v30.8H, v6.8H, v0.H[0] // .........................................................................................................................*...................................................................... + mla v29.8H, v12.8H, v8.H[0] // ............................................................................................................................................*................................................... + sqrdmulh v6.8H, v6.8H, v0.H[1] // ..........................................................................................................................*..................................................................... + sub v12.8H, v9.8H, v20.8H // ..........................................................................................................................................................................*..................... + add v31.8H, v16.8H, v18.8H // ...................................................................................................................................*............................................................ + str q19, [x14], #16 // ...................................................................*............................................................................................................................ 
+ add v10.8H, v21.8H, v11.8H // ............................................................................*................................................................................................................... + add v19.8H, v9.8H, v20.8H // ...........................................................................................................................................................................*.................... + sub v23.8H, v13.8H, v23.8H // ...........................................................*.................................................................................................................................... + mla v7.8H, v14.8H, v8.H[0] // ....................................................................................................................................................*........................................... + ldr q20, [x6, #144] // ..................................................................................................................*............................................................................. + str q26, [x14, #80] // ...................................................................................*............................................................................................................ + sub v18.8H, v16.8H, v18.8H // ................................................................................................................................*............................................................... + ldr q16, [x6, #80] // ..........................................................................................................*..................................................................................... 
+ mla v25.8H, v27.8H, v8.H[0] // .........................................................................................................................................................*...................................... + mla v24.8H, v22.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sub v11.8H, v21.8H, v11.8H // ......................................................*......................................................................................................................................... + sub v26.8H, v15.8H, v29.8H // ...................................................................................................................................................................*............................ + add v9.8H, v15.8H, v29.8H // .......................................................................................................................................................*........................................ + ldr q15, [x6, #48] // .....................................................................................................................................................*.......................................... + mla v30.8H, v6.8H, v8.H[0] // ..............................................................................................................................................*................................................. + add v22.8H, v28.8H, v17.8H // ..................................................................................................................................................*............................................. 
+ str q23, [x14, #16] // ....................................................................*........................................................................................................................... + str q10, [x14, #112] // .....................................................................................*.......................................................................................................... + mul v6.8H, v31.8H, v0.H[2] // ................................................................................................................................................*............................................... + sqrdmulh v13.8H, v31.8H, v0.H[3] // ...............................................................................................................................................*................................................ + sqrdmulh v14.8H, v18.8H, v0.H[5] // ........................................................................................................................................................*....................................... + str q11, [x14, #144] // ..............................................................*................................................................................................................................. + str q4, [x6], #16 // .............................................*.................................................................................................................................................. + sub v27.8H, v28.8H, v17.8H // ...........................................................................................................................................*.................................................... 
+ add v31.8H, v26.8H, v7.8H // ......................................................................................................................................................................................*......... + mul v29.8H, v18.8H, v0.H[4] // ..........................................................................................................................................*..................................................... + sqrdmulh v10.8H, v22.8H, v0.H[3] // ............................................................................................................................................................*................................... + add v11.8H, v16.8H, v24.8H // .................................................................................................................................................*.............................................. + add v4.8H, v9.8H, v5.8H // ...........................................................................................................................................................*.................................... + mul v22.8H, v22.8H, v0.H[2] // .............................................................................................................................................................*.................................. + mla v6.8H, v13.8H, v8.H[0] // .................................................................................................................................................................*.............................. + add v28.8H, v20.8H, v30.8H // ...............................................................................................................................................................*................................ 
+ sub v23.8H, v26.8H, v7.8H // ..............................................................................................................................................................................*................. + add v21.8H, v15.8H, v25.8H // ..................................................................................................................................................................*............................. + sqrdmulh v18.8H, v4.8H, v0.H[7] // ....................................................................................................................................................................*........................... + mul v7.8H, v4.8H, v0.H[6] // .....................................................................................................................................................................*.......................... + sub v16.8H, v16.8H, v24.8H // ................................................................................................................................................................*............................... + mla v29.8H, v14.8H, v8.H[0] // ..........................................................................................................................................................*..................................... + mul v17.8H, v28.8H, v0.H[2] // .............................................................................................................................................................................*.................. + mla v22.8H, v10.8H, v8.H[0] // ........................................................................................................................................................................*....................... 
+ sqrdmulh v24.8H, v27.8H, v0.H[5] // ......................................................................................................................................................*......................................... + sqrdmulh v28.8H, v28.8H, v0.H[3] // ................................................................................................................................................................................*............... + mul v14.8H, v27.8H, v0.H[4] // .......................................................................................................................................................................*........................ + sub v13.8H, v15.8H, v25.8H // ...............................................................................................................................................................................................* + sub v27.8H, v20.8H, v30.8H // ......................................................................................................................................................................*......................... + sub v30.8H, v11.8H, v6.8H // ...................................................................................................................................................................................*............ + add v26.8H, v11.8H, v6.8H // ............................................................................................................................................................................*................... + sqrdmulh v25.8H, v23.8H, v1.H[5] // .......................................................................................................................................................................................*........ 
+ mla v7.8H, v18.8H, v8.H[0] // ...............................................................................................................................................................................*................ + add v10.8H, v16.8H, v29.8H // .........................................................................................................................................................................*...................... + mla v17.8H, v28.8H, v8.H[0] // .........................................................................................................................................................................................*...... + mul v28.8H, v30.8H, v1.H[0] // ............................................................................................................................................................................................*... + add v20.8H, v21.8H, v22.8H // ..................................................................................................................................................................................*............. + sub v6.8H, v16.8H, v29.8H // ........................................................................................................................................................................................*....... + sqrdmulh v4.8H, v30.8H, v1.H[1] // ...........................................................................................................................................................................................*.... + sqrdmulh v16.8H, v31.8H, v1.H[3] // .............................................................................................................................................................................................*.. 
+ mla v14.8H, v24.8H, v8.H[0] // ....................................................................................................................................................................................*........... + mul v29.8H, v23.8H, v1.H[4] // .....................................................................................................................................................................................*.......... + add v24.8H, v20.8H, v7.8H // ..........................................................................................................................................................................................*..... + mul v30.8H, v6.8H, v1.H[4] // ..............................................................................................................................................................................................*. + sqrdmulh v18.8H, v10.8H, v1.H[3] // .................................................................................................................................................................................*.............. + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mul v11.8H, v27.8H, v0.H[4] // ..............*................................................................................................................................................................................. 
+ // mul v15.8H, v26.8H, v0.H[6] // ..*............................................................................................................................................................................................. + // mul v23.8H, v31.8H, v1.H[2] // ........................*....................................................................................................................................................................... + // sub v7.8H, v20.8H, v7.8H // .*.............................................................................................................................................................................................. + // sqrdmulh v27.8H, v27.8H, v0.H[5] // ...............*................................................................................................................................................................................ + // sub v5.8H, v9.8H, v5.8H // *............................................................................................................................................................................................... + // sqrdmulh v6.8H, v6.8H, v1.H[5] // ..................................*............................................................................................................................................................. + // sqrdmulh v26.8H, v26.8H, v0.H[7] // ....*........................................................................................................................................................................................... + // mul v9.8H, v10.8H, v1.H[2] // ...*............................................................................................................................................................................................ 
+ // mla v29.8H, v25.8H, v8.H[0] // .............*.................................................................................................................................................................................. + // sqrdmulh v10.8H, v24.8H, v1.H[7] // .....................................*.......................................................................................................................................................... + // mul v25.8H, v24.8H, v1.H[6] // ......*......................................................................................................................................................................................... + // mla v23.8H, v16.8H, v8.H[0] // ...................................*............................................................................................................................................................ + // sqrdmulh v16.8H, v5.8H, v1.H[1] // ........*....................................................................................................................................................................................... + // sub v24.8H, v13.8H, v14.8H // ......................*......................................................................................................................................................................... + // mul v5.8H, v5.8H, v1.H[0] // .........*...................................................................................................................................................................................... + // mla v11.8H, v27.8H, v8.H[0] // .....................*.......................................................................................................................................................................... 
+ // mla v15.8H, v26.8H, v8.H[0] // .........................*...................................................................................................................................................................... + // add v31.8H, v13.8H, v14.8H // .......................*........................................................................................................................................................................ + // sub v26.8H, v21.8H, v22.8H // ..................*............................................................................................................................................................................. + // mla v9.8H, v18.8H, v8.H[0] // ................*............................................................................................................................................................................... + // add v27.8H, v24.8H, v29.8H // ..............................*................................................................................................................................................................. + // mla v25.8H, v10.8H, v8.H[0] // .................................................................*.............................................................................................................................. + // add v21.8H, v19.8H, v17.8H // ..........................*..................................................................................................................................................................... + // add v14.8H, v31.8H, v23.8H // .......................................................*........................................................................................................................................ 
+ // mla v5.8H, v16.8H, v8.H[0] // ............................*................................................................................................................................................................... + // sub v31.8H, v31.8H, v23.8H // .....................................................................*.......................................................................................................................... + // mla v30.8H, v6.8H, v8.H[0] // ......................................*......................................................................................................................................................... + // mul v18.8H, v27.8H, v3.H[2] // ............................................................*................................................................................................................................... + // sqrdmulh v16.8H, v27.8H, v3.H[3] // .............................................................*.................................................................................................................................. + // add v20.8H, v21.8H, v15.8H // ..........................................*..................................................................................................................................................... + // sub v23.8H, v12.8H, v11.8H // ................................*............................................................................................................................................................... + // sqrdmulh v10.8H, v14.8H, v2.H[7] // ............................................................................*................................................................................................................... 
+ // mul v6.8H, v14.8H, v2.H[6] // ...........................................................................*.................................................................................................................... + // mul v13.8H, v31.8H, v3.H[0] // .....................................................................................*.......................................................................................................... + // sqrdmulh v27.8H, v31.8H, v3.H[1] // ......................................................................................*......................................................................................................... + // add v22.8H, v20.8H, v25.8H // .............................................................................................................*.................................................................................. + // sub v14.8H, v26.8H, v5.8H // ...........................................*.................................................................................................................................................... + // add v5.8H, v26.8H, v5.8H // ........................................*....................................................................................................................................................... + // sub v31.8H, v20.8H, v25.8H // ......................................................................................................*......................................................................................... + // add v26.8H, v23.8H, v30.8H // ....................................................*........................................................................................................................................... 
+ // sub v20.8H, v24.8H, v29.8H // ...........................*.................................................................................................................................................................... + // mla v18.8H, v16.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + // add v16.8H, v12.8H, v11.8H // ...............................*................................................................................................................................................................ + // str q31, [x6, #32] // ......................................................................................................................*......................................................................... + // str q22, [x6], #16 // .........................................................................................................................................................*...................................... + // sqrdmulh v29.8H, v14.8H, v2.H[5] // ...............................................................*................................................................................................................................ + // mul v11.8H, v5.8H, v2.H[2] // .................................................*.............................................................................................................................................. + // mla v6.8H, v10.8H, v8.H[0] // .............................................................................*.................................................................................................................. 
+ // mla v13.8H, v27.8H, v8.H[0] // .................................................................................................*.............................................................................................. + // sub v31.8H, v16.8H, v9.8H // ..............................................................................................*................................................................................................. + // sqrdmulh v27.8H, v5.8H, v2.H[3] // ...............................................*................................................................................................................................................ + // mul v24.8H, v14.8H, v2.H[4] // ................................................................*............................................................................................................................... + // add v10.8H, v16.8H, v9.8H // .......................................................................................*........................................................................................................ + // sub v14.8H, v26.8H, v18.8H // .............................................................................................................................................*.................................................. + // mul v22.8H, v20.8H, v3.H[4] // .........................................*...................................................................................................................................................... + // sqrdmulh v5.8H, v20.8H, v3.H[5] // ............................................*................................................................................................................................................... 
+ // mla v28.8H, v4.8H, v8.H[0] // .....*.......................................................................................................................................................................................... + // add v12.8H, v31.8H, v13.8H // ....................................................................................................................*........................................................................... + // sub v25.8H, v10.8H, v6.8H // .....................................................................................................................................*.......................................................... + // add v16.8H, v10.8H, v6.8H // ..........................................................................................................................*..................................................................... + // sub v10.8H, v19.8H, v17.8H // .......*........................................................................................................................................................................................ + // str q14, [x14, #160] // ........................................................................................................................................................*....................................... + // mla v11.8H, v27.8H, v8.H[0] // ....................................................................*........................................................................................................................... + // mla v24.8H, v29.8H, v8.H[0] // .........................................................................*...................................................................................................................... 
+ // mul v9.8H, v7.8H, v2.H[0] // ..........*..................................................................................................................................................................................... + // sqrdmulh v20.8H, v7.8H, v2.H[1] // ...........*.................................................................................................................................................................................... + // str q16, [x14], #16 // ..................................................................................................................................*............................................................. + // str q25, [x14, #16] // ...................................................................................................................................................*............................................ + // sub v6.8H, v10.8H, v28.8H // .......................................*........................................................................................................................................................ + // mla v22.8H, v5.8H, v8.H[0] // ........................................................*....................................................................................................................................... + // sub v16.8H, v23.8H, v30.8H // .....................................................*.......................................................................................................................................... + // add v29.8H, v10.8H, v28.8H // ....................*........................................................................................................................................................................... 
+ // str q12, [x14, #48] // ............................................................................................................................*................................................................... + // sub v7.8H, v21.8H, v15.8H // ....................................*........................................................................................................................................................... + // sub v27.8H, v31.8H, v13.8H // ...................................................................................................................*............................................................................ + // add v13.8H, v26.8H, v18.8H // ...................................................................................................................................*............................................................ + // sub v15.8H, v29.8H, v11.8H // ...........................................................................................*.................................................................................................... + // sub v26.8H, v6.8H, v24.8H // ..................................................................................*............................................................................................................. + // mla v9.8H, v20.8H, v8.H[0] // ...................*............................................................................................................................................................................ + // add v19.8H, v6.8H, v24.8H // .................................................................................................................*.............................................................................. 
+ // sub v14.8H, v16.8H, v22.8H // ........................................................................*....................................................................................................................... + // add v6.8H, v16.8H, v22.8H // .......................................................................*........................................................................................................................ + // str q27, [x14, #80] // ........................................................................................................................................*....................................................... + // add v16.8H, v29.8H, v11.8H // .........................................................................................*...................................................................................................... + // str q13, [x14, #112] // ....................................................................................................................................................*........................................... + // str q15, [x6, #144] // ........................................................................................................*....................................................................................... + // str q19, [x6, #176] // .........................................................................................................................*...................................................................... + // str q16, [x6, #112] // ....................................................................................................*........................................................................................... 
+ // str q6, [x14, #176] // ....................................................................................*........................................................................................................... + // add v25.8H, v7.8H, v9.8H // ......................................................*......................................................................................................................................... + // sub v27.8H, v7.8H, v9.8H // .........................................................*...................................................................................................................................... + // str q14, [x14, #208] // ........................................................................................*....................................................................................................... + // str q26, [x6, #208] // ..........................................................................................*..................................................................................................... + // str q27, [x6, #80] // ..............................................................*................................................................................................................................. + // str q25, [x6, #48] // ...........................................................*.................................................................................................................................... + // ldr q15, [x14, #192] // ............*................................................................................................................................................................................... + // ldr q16, [x14, #224] // .................*.............................................................................................................................................................................. 
+ // ldr q27, [x14, #160] // ..............................................*................................................................................................................................................. + // ldr q14, [x14, #64] // .....................................................................................................*.......................................................................................... + // ldr q6, [x14, #32] // ...............................................................................*................................................................................................................ + // ldr q26, [x6, #0] // ..........................................................................................................*..................................................................................... + // ldr q7, [x14, #128] // ..........................................................*..................................................................................................................................... + // ldr q11, [x14, #96] // ..............................................................................*................................................................................................................. + // ldr q13, [x14, #0] // ...............................................................................................*................................................................................................ + // ldr q31, [x6, #224] // ...................................................*............................................................................................................................................ + // ldr q25, [x6, #64] // ..........................................................................................................................................*..................................................... 
+ // ldr q22, [x6, #192] // ................................................................................................................*............................................................................... + // mul v17.8H, v15.8H, v0.H[0] // ..................................................*............................................................................................................................................. + // sqrdmulh v15.8H, v15.8H, v0.H[1] // .............................................*.................................................................................................................................................. + // mul v29.8H, v16.8H, v0.H[0] // .................................*.............................................................................................................................................................. + // sqrdmulh v16.8H, v16.8H, v0.H[1] // .............................*.................................................................................................................................................................. + // ldr q23, [x6, #160] // .................................................................................*.............................................................................................................. + // ldr q12, [x6, #96] // ............................................................................................................*................................................................................... + // ldr q19, [x6, #128] // .......................................................................................................................................*........................................................ 
+ // mla v29.8H, v16.8H, v8.H[0] // ................................................*............................................................................................................................................... + // mul v9.8H, v27.8H, v0.H[0] // ..................................................................*............................................................................................................................. + // sqrdmulh v5.8H, v27.8H, v0.H[1] // ......................................................................*......................................................................................................................... + // mla v17.8H, v15.8H, v8.H[0] // ...................................................................*............................................................................................................................ + // sqrdmulh v16.8H, v14.8H, v0.H[1] // ........................................................................................................................*....................................................................... + // mul v27.8H, v14.8H, v0.H[0] // ...........................................................................................................................*.................................................................... + // mul v24.8H, v7.8H, v0.H[0] // .............................................................................................................................*.................................................................. + // sqrdmulh v21.8H, v7.8H, v0.H[1] // ...............................................................................................................................*................................................................ 
+ // mul v18.8H, v13.8H, v0.H[0] // ..................................................................................................*............................................................................................. + // sqrdmulh v4.8H, v13.8H, v0.H[1] // ...................................................................................................*............................................................................................ + // mul v14.8H, v11.8H, v0.H[0] // .........................................................................................................*...................................................................................... + // sqrdmulh v20.8H, v11.8H, v0.H[1] // ...........................................................................................................*.................................................................................... + // mla v9.8H, v5.8H, v8.H[0] // ................................................................................*............................................................................................................... + // sub v15.8H, v22.8H, v17.8H // .........................................................................................................................................*...................................................... + // sub v30.8H, v31.8H, v29.8H // .......................................................................................................*........................................................................................ + // mul v11.8H, v6.8H, v0.H[0] // ...............................................................................................................*................................................................................ 
+ // add v7.8H, v22.8H, v17.8H // .................................................................................................................................*.............................................................. + // add v22.8H, v31.8H, v29.8H // ...................................................................................*............................................................................................................ + // mla v27.8H, v16.8H, v8.H[0] // ............................................................................................................................................*................................................... + // sqrdmulh v17.8H, v22.8H, v0.H[3] // .............................................................................................*.................................................................................................. + // mul v5.8H, v22.8H, v0.H[2] // ............................................................................................*................................................................................................... + // sqrdmulh v22.8H, v30.8H, v0.H[5] // .......................................................................................................................*........................................................................ + // mul v13.8H, v30.8H, v0.H[4] // .....................................................................................................................*.......................................................................... + // mul v30.8H, v15.8H, v0.H[4] // ............................................................................................................................................................*................................... 
+ // sub v16.8H, v23.8H, v9.8H // ..........................................................................................................................................................*..................................... + // mla v14.8H, v20.8H, v8.H[0] // ..............................................................................................................................*................................................................. + // sqrdmulh v28.8H, v6.8H, v0.H[1] // ..............................................................................................................*................................................................................. + // mla v24.8H, v21.8H, v8.H[0] // .................................................................................................................................................*.............................................. + // sqrdmulh v20.8H, v7.8H, v0.H[3] // ......................................................................................................................................................*......................................... + // mul v29.8H, v7.8H, v0.H[2] // .....................................................................................................................................................*.......................................... + // add v31.8H, v25.8H, v27.8H // ..............................................................................................................................................................*................................. + // add v21.8H, v23.8H, v9.8H // ..................................................................................................................................................*............................................. 
+ // mla v5.8H, v17.8H, v8.H[0] // ................................................................................................*............................................................................................... + // mla v13.8H, v22.8H, v8.H[0] // ......................................................................................................................................*......................................................... + // ldr q23, [x6, #32] // ................................................................................................................................................*............................................... + // sqrdmulh v6.8H, v16.8H, v0.H[5] // ...........................................................................................................................................................................*.................... + // add v9.8H, v12.8H, v14.8H // ...............................................................................................................................................*................................................ + // sqrdmulh v7.8H, v15.8H, v0.H[5] // .......................................................................................................................................................*........................................ + // mla v11.8H, v28.8H, v8.H[0] // ...........................................................................................................................................*.................................................... + // mla v30.8H, v7.8H, v8.H[0] // ........................................................................................................................................................................*....................... 
+ // add v7.8H, v9.8H, v5.8H // ...............................................................................................................................................................*................................ + // sqrdmulh v17.8H, v21.8H, v0.H[3] // .............................................................................................................................................................*.................................. + // mul v22.8H, v21.8H, v0.H[2] // ................................................................................................................................................................*............................... + // mla v18.8H, v4.8H, v8.H[0] // ..................................................................................................................*............................................................................. + // add v4.8H, v19.8H, v24.8H // ..................................................................................................................................................................*............................. + // sub v28.8H, v25.8H, v27.8H // .......................................................................................................................................................................*........................ + // mla v29.8H, v20.8H, v8.H[0] // .................................................................................................................................................................*.............................. + // add v21.8H, v23.8H, v11.8H // ....................................................................................................................................................................*........................... 
+ // sub v25.8H, v12.8H, v14.8H // ..............................................................................................................................................*................................................. + // sqrdmulh v20.8H, v7.8H, v0.H[7] // .....................................................................................................................................................................*.......................... + // mul v7.8H, v7.8H, v0.H[6] // ......................................................................................................................................................................*......................... + // sub v27.8H, v19.8H, v24.8H // ...............................................................................................................................................................................*................ + // mul v14.8H, v16.8H, v0.H[4] // .............................................................................................................................................................................*.................. + // mla v22.8H, v17.8H, v8.H[0] // ..........................................................................................................................................................................*..................... + // add v10.8H, v28.8H, v30.8H // ....................................................................................................................................................................................*........... + // sub v12.8H, v26.8H, v18.8H // ................................................................................................................................*............................................................... 
+ // add v19.8H, v26.8H, v18.8H // ....................................................................................................................................*........................................................... + // add v26.8H, v31.8H, v29.8H // .................................................................................................................................................................................*.............. + // mul v17.8H, v4.8H, v0.H[2] // .........................................................................................................................................................................*...................... + // sub v24.8H, v25.8H, v13.8H // ...................................................................................................................................................................*............................ + // mla v7.8H, v20.8H, v8.H[0] // ...................................................................................................................................................................................*............ + // sqrdmulh v16.8H, v4.8H, v0.H[3] // ............................................................................................................................................................................*................... + // sqrdmulh v18.8H, v10.8H, v1.H[3] // ...............................................................................................................................................................................................* + // add v20.8H, v21.8H, v22.8H // .......................................................................................................................................................................................*........ 
+ // sub v15.8H, v31.8H, v29.8H // ................................................................................................................................................................................*............... + // mla v14.8H, v6.8H, v8.H[0] // ...........................................................................................................................................................................................*.... + // mul v29.8H, v24.8H, v1.H[4] // ............................................................................................................................................................................................*... + // add v31.8H, v25.8H, v13.8H // ...........................................................................................................................................................*.................................... + // sqrdmulh v25.8H, v24.8H, v1.H[5] // ..................................................................................................................................................................................*............. + // sub v6.8H, v28.8H, v30.8H // ........................................................................................................................................................................................*....... + // mla v17.8H, v16.8H, v8.H[0] // .....................................................................................................................................................................................*.......... + // add v24.8H, v20.8H, v7.8H // .............................................................................................................................................................................................*.. 
+ // sqrdmulh v4.8H, v15.8H, v1.H[1] // .........................................................................................................................................................................................*...... + // mul v28.8H, v15.8H, v1.H[0] // ......................................................................................................................................................................................*......... + // sqrdmulh v16.8H, v31.8H, v1.H[3] // ..........................................................................................................................................................................................*..... + // mul v30.8H, v6.8H, v1.H[4] // ..............................................................................................................................................................................................*. + // sub v13.8H, v23.8H, v11.8H // ..............................................................................................................................................................................*................. + + sub count, count, #1 + cbnz count, layer1234_start + mul v11.8H, v27.8H, v0.H[4] // .............................................................................*.................................................................................................................. + mul v15.8H, v26.8H, v0.H[6] // .................................................................................................*.............................................................................................. + mul v23.8H, v31.8H, v1.H[2] // ..........................................................................................................................*..................................................................... 
+ sub v7.8H, v20.8H, v7.8H // ........................................................................................................*....................................................................................... + sqrdmulh v27.8H, v27.8H, v0.H[5] // ............................................................................*................................................................................................................... + sub v5.8H, v9.8H, v5.8H // ..........................................................................*..................................................................................................................... + sqrdmulh v6.8H, v6.8H, v1.H[5] // ..............................................................................................................................*................................................................. + sqrdmulh v26.8H, v26.8H, v0.H[7] // ................................................................................................*............................................................................................... + mul v9.8H, v10.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + mla v29.8H, v25.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sqrdmulh v10.8H, v24.8H, v1.H[7] // ........................................................................................................................................*....................................................... 
+ mul v25.8H, v24.8H, v1.H[6] // .........................................................................................................................................*...................................................... + mla v23.8H, v16.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + sqrdmulh v16.8H, v5.8H, v1.H[1] // ...............................................................................................................*................................................................................ + sub v24.8H, v13.8H, v14.8H // ....................................................................................*........................................................................................................... + mul v5.8H, v5.8H, v1.H[0] // ................................................................................................................*............................................................................... + mla v11.8H, v27.8H, v8.H[0] // ..............................................................................*................................................................................................................. + mla v15.8H, v26.8H, v8.H[0] // ..................................................................................................*............................................................................................. + add v31.8H, v13.8H, v14.8H // .....................................................................................*.......................................................................................................... 
+ sub v26.8H, v21.8H, v22.8H // ................................................................*............................................................................................................................... + mla v9.8H, v18.8H, v8.H[0] // ......................................................................................................................*......................................................................... + add v27.8H, v24.8H, v29.8H // .......................................................................................................................................*........................................................ + mla v25.8H, v10.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + add v21.8H, v19.8H, v17.8H // ............................................................*................................................................................................................................... + add v14.8H, v31.8H, v23.8H // .............................................................................................................................*.................................................................. + mla v5.8H, v16.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sub v31.8H, v31.8H, v23.8H // ............................................................................................................................*................................................................... 
+ mla v30.8H, v6.8H, v8.H[0] // ................................................................................................................................*............................................................... + mul v18.8H, v27.8H, v3.H[2] // .......................................................................................................................................................................*........................ + sqrdmulh v16.8H, v27.8H, v3.H[3] // ......................................................................................................................................................................*......................... + add v20.8H, v21.8H, v15.8H // ....................................................................................................*........................................................................................... + sub v23.8H, v12.8H, v11.8H // ...............................................................................*................................................................................................................ + sqrdmulh v10.8H, v14.8H, v2.H[7] // ............................................................................................................................................................*................................... + mul v6.8H, v14.8H, v2.H[6] // .............................................................................................................................................................*.................................. + mul v13.8H, v31.8H, v3.H[0] // ..................................................................................................................................................................*............................. 
+ sqrdmulh v27.8H, v31.8H, v3.H[1] // .................................................................................................................................................................*.............................. + add v22.8H, v20.8H, v25.8H // ............................................................................................................................................*................................................... + sub v14.8H, v26.8H, v5.8H // ..................................................................................................................*............................................................................. + add v5.8H, v26.8H, v5.8H // ...................................................................................................................*............................................................................ + sub v31.8H, v20.8H, v25.8H // ...........................................................................................................................................*.................................................... + add v26.8H, v23.8H, v30.8H // ..................................................................................................................................*............................................................. + sub v20.8H, v24.8H, v29.8H // ......................................................................................................................................*......................................................... + mla v18.8H, v16.8H, v8.H[0] // ........................................................................................................................................................................*....................... 
+ add v16.8H, v12.8H, v11.8H // ................................................................................*............................................................................................................... + str q31, [x6, #32] // .................................................................................................................................................................................*.............. + str q22, [x6], #16 // ................................................................................................................................................................................*............... + sqrdmulh v29.8H, v14.8H, v2.H[5] // .......................................................................................................................................................*........................................ + mul v11.8H, v5.8H, v2.H[2] // ...................................................................................................................................................*............................................ + mla v6.8H, v10.8H, v8.H[0] // ..............................................................................................................................................................*................................. + mla v13.8H, v27.8H, v8.H[0] // ...................................................................................................................................................................*............................ + sub v31.8H, v16.8H, v9.8H // .......................................................................................................................*........................................................................ 
+ sqrdmulh v27.8H, v5.8H, v2.H[3] // ..................................................................................................................................................*............................................. + mul v24.8H, v14.8H, v2.H[4] // ........................................................................................................................................................*....................................... + add v10.8H, v16.8H, v9.8H // ........................................................................................................................*....................................................................... + sub v14.8H, v26.8H, v18.8H // .........................................................................................................................................................................*...................... + mul v22.8H, v20.8H, v3.H[4] // ............................................................................................................................................................................*................... + sqrdmulh v5.8H, v20.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + mla v28.8H, v4.8H, v8.H[0] // ............................................................................................................*................................................................................... + add v12.8H, v31.8H, v13.8H // .....................................................................................................................................................................*.......................... 
+ sub v25.8H, v10.8H, v6.8H // ...............................................................................................................................................................*................................ + add v16.8H, v10.8H, v6.8H // ................................................................................................................................................................*............................... + sub v10.8H, v19.8H, v17.8H // ...........................................................*.................................................................................................................................... + str q14, [x14, #160] // .............................................................................................................................................................................................*.. + mla v11.8H, v27.8H, v8.H[0] // ....................................................................................................................................................*........................................... + mla v24.8H, v29.8H, v8.H[0] // .........................................................................................................................................................*...................................... + mul v9.8H, v7.8H, v2.H[0] // ..............................................................................................................................................*................................................. + sqrdmulh v20.8H, v7.8H, v2.H[1] // .............................................................................................................................................*.................................................. 
+ str q16, [x14], #16 // ........................................................................................................................................................................................*....... + str q25, [x14, #16] // .........................................................................................................................................................................................*...... + sub v6.8H, v10.8H, v28.8H // .............................................................................................................*.................................................................................. + mla v22.8H, v5.8H, v8.H[0] // .............................................................................................................................................................................*.................. + sub v16.8H, v23.8H, v30.8H // .................................................................................................................................*.............................................................. + add v29.8H, v10.8H, v28.8H // ..............................................................................................................*................................................................................. + str q12, [x14, #48] // ..........................................................................................................................................................................................*..... + sub v7.8H, v21.8H, v15.8H // ...................................................................................................*............................................................................................ 
+ sub v27.8H, v31.8H, v13.8H // ....................................................................................................................................................................*........................... + add v13.8H, v26.8H, v18.8H // ..........................................................................................................................................................................*..................... + sub v15.8H, v29.8H, v11.8H // .....................................................................................................................................................*.......................................... + sub v26.8H, v6.8H, v24.8H // ..........................................................................................................................................................*..................................... + mla v9.8H, v20.8H, v8.H[0] // ...............................................................................................................................................*................................................ + add v19.8H, v6.8H, v24.8H // ...........................................................................................................................................................*.................................... + sub v14.8H, v16.8H, v22.8H // ..............................................................................................................................................................................*................. + add v6.8H, v16.8H, v22.8H // ...............................................................................................................................................................................*................ 
+ str q27, [x14, #80] // ...........................................................................................................................................................................................*.... + add v16.8H, v29.8H, v11.8H // ......................................................................................................................................................*......................................... + str q13, [x14, #112] // ............................................................................................................................................................................................*... + str q15, [x6, #144] // .....................................................................................................................................................................................*.......... + str q19, [x6, #176] // ......................................................................................................................................................................................*......... + str q16, [x6, #112] // ....................................................................................................................................................................................*........... + str q6, [x14, #176] // ..............................................................................................................................................................................................*. + add v25.8H, v7.8H, v9.8H // .................................................................................................................................................*.............................................. + sub v27.8H, v7.8H, v9.8H // ................................................................................................................................................*............................................... 
+ str q14, [x14, #208] // ...............................................................................................................................................................................................* + str q26, [x6, #208] // .......................................................................................................................................................................................*........ + str q27, [x6, #80] // ...................................................................................................................................................................................*............ + str q25, [x6, #48] // ..................................................................................................................................................................................*............. + + restore inp, STACK0 + mov count, #4 + + ASM_LOAD(r_ptr1, roots_l456) + + add src0, inp, #256*0 + add src1, inp, #256*1 + + .p2align 2 + // Instructions: 127 + // Expected cycles: 49 + // Expected IPC: 2.59 + // + // Cycle bound: 49.0 + // IPC bound: 2.59 + // + // Wall time: 59.87s + // User time: 59.87s + // + // ----------------------------------------------------- original position ------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|- + ldr q11, [x4, #128] // ......................................................*........................................................................ + ldr q12, [x4, #112] // .....................................................*......................................................................... + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x6] // *.............................................................................................................................. 
+ ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x7] // .*............................................................................................................................. + ldr q23, [x4, #208] // ...*........................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + ldr q10, [x4, #192] // ........*...................................................................................................................... + ldr q9, [x4, #144] // ..*............................................................................................................................ + ldr q31, [x4, #176] // .................................................*............................................................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ ldr q13, [x4, #32] // ........................*...................................................................................................... + ldr q17, [x4, #96] // ............................................*.................................................................................. + ldr q30, [x4, #64] // ...............................*............................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + ldr q14, [x4, #16] // ....*.......................................................................................................................... + ldr q15, [x4], #16*14 // .......*....................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + ldr q2, [x4, #-64] // .........*..................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + ldr q0, [x4, #-176] // .....*......................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn1 v16.4S, v6.4S, v22.4S // .........................*..................................................................................................... + trn2 v28.4S, v6.4S, v22.4S // .............*................................................................................................................. + trn2 v7.4S, v3.4S, v19.4S // ............*.................................................................................................................. + ldr q18, [x4, #-144] // ......*........................................................................................................................ + trn2 v6.4S, v4.4S, v20.4S // ...............*............................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ trn2 v1.4S, v5.4S, v21.4S // ...........*................................................................................................................... + trn1 v5.4S, v5.4S, v21.4S // ..........*.................................................................................................................... + trn1 v24.4S, v4.4S, v20.4S // ..........................*.................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqrdmulh v26.8H, v28.8H, v14.8H // ...................*........................................................................................................... + mul v21.8H, v28.8H, v15.8H // ..................*............................................................................................................ + mul v20.8H, v7.8H, v15.8H // .................*............................................................................................................. + mul v4.8H, v6.8H, v15.8H // ....................*.......................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mul v22.8H, v1.8H, v15.8H // .....................*......................................................................................................... + sqrdmulh v29.8H, v6.8H, v14.8H // ......................*........................................................................................................ + trn1 v28.4S, v3.4S, v19.4S // ..............*................................................................................................................ + sqrdmulh v27.8H, v1.8H, v14.8H // .......................*....................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqrdmulh v14.8H, v7.8H, v14.8H // ................*.............................................................................................................. + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v21.8H, v26.8H, v8.H[0] // ............................*.................................................................................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ mla v4.8H, v29.8H, v8.H[0] // ..............................*................................................................................................ + mla v22.8H, v27.8H, v8.H[0] // .............................*................................................................................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v20.8H, v14.8H, v8.H[0] // ...........................*................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sub v27.8H, v16.8H, v21.8H // .................................*............................................................................................. + add v21.8H, v16.8H, v21.8H // ..................................*............................................................................................ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sub v14.8H, v5.8H, v22.8H // ...................................*........................................................................................... + add v26.8H, v5.8H, v22.8H // ....................................*.......................................................................................... 
+ sub v22.8H, v24.8H, v4.8H // .....................................*......................................................................................... + add v15.8H, v24.8H, v4.8H // ..............................................*................................................................................ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqrdmulh v6.8H, v21.8H, v0.8H // ........................................*...................................................................................... + mul v3.8H, v21.8H, v13.8H // .........................................*..................................................................................... + sqrdmulh v16.8H, v27.8H, v18.8H // ......................................*........................................................................................ + mul v1.8H, v27.8H, v30.8H // .......................................*....................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + mul v5.8H, v26.8H, v13.8H // ...........................................*................................................................................... + sqrdmulh v18.8H, v14.8H, v18.8H // ................................................*.............................................................................. + mul v14.8H, v14.8H, v30.8H // ..................................................*............................................................................ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqrdmulh v27.8H, v26.8H, v0.8H // ..........................................*.................................................................................... + add v25.8H, v28.8H, v20.8H // ...................................................*........................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v1.8H, v16.8H, v8.H[0] // ...............................................*............................................................................... + mla v3.8H, v6.8H, v8.H[0] // .............................................*................................................................................. + sub v19.8H, v28.8H, v20.8H // ................................*.............................................................................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v14.8H, v18.8H, v8.H[0] // ............................................................*.................................................................. 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v5.8H, v27.8H, v8.H[0] // ....................................................*.......................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + add v18.8H, v22.8H, v1.8H // ..........................................................*.................................................................... + sub v26.8H, v22.8H, v1.8H // .........................................................*..................................................................... + add v24.8H, v15.8H, v3.8H // ........................................................*...................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sub v0.8H, v15.8H, v3.8H // .......................................................*....................................................................... + add v15.8H, v19.8H, v14.8H // ...........................................................................*................................................... + sub v16.8H, v19.8H, v14.8H // ........................................................................*...................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mul v4.8H, v24.8H, v17.8H // ...............................................................*............................................................... + sqrdmulh v29.8H, v24.8H, v12.8H // .............................................................*................................................................. + sqrdmulh v22.8H, v18.8H, v31.8H // ................................................................*.............................................................. + mul v13.8H, v18.8H, v2.8H // ..............................................................*................................................................ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqrdmulh v7.8H, v0.8H, v9.8H // .................................................................*............................................................. 
+ sqrdmulh v28.8H, v26.8H, v23.8H // ..................................................................*............................................................ + mul v26.8H, v26.8H, v10.8H // ...................................................................*........................................................... + mul v6.8H, v0.8H, v11.8H // ....................................................................*.......................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sub v21.8H, v25.8H, v5.8H // ...........................................................*................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v4.8H, v29.8H, v8.H[0] // ......................................................................*........................................................ + add v27.8H, v25.8H, v5.8H // .....................................................................*......................................................... + mla v13.8H, v22.8H, v8.H[0] // .......................................................................*....................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v26.8H, v28.8H, v8.H[0] // ..........................................................................*.................................................... + mla v6.8H, v7.8H, v8.H[0] // .........................................................................*..................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + add v29.8H, v27.8H, v4.8H // ..............................................................................*................................................ 
+ sub v23.8H, v15.8H, v13.8H // ............................................................................*.................................................. + add v1.8H, v15.8H, v13.8H // .............................................................................*................................................. + sub v17.8H, v27.8H, v4.8H // ...............................................................................*............................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + add v22.8H, v16.8H, v26.8H // ...................................................................................*........................................... + sub v19.8H, v16.8H, v26.8H // ..................................................................................*............................................ + add v3.8H, v21.8H, v6.8H // .................................................................................*............................................. + sub v30.8H, v21.8H, v6.8H // ................................................................................*.............................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqdmulh v0.8H, v23.8H, v8.H[1] // .......................................................................................*....................................... + sqdmulh v7.8H, v1.8H, v8.H[1] // ......................................................................................*........................................ + sqdmulh v25.8H, v29.8H, v8.H[1] // ....................................................................................*.......................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqdmulh v13.8H, v17.8H, v8.H[1] // .....................................................................................*......................................... + sqdmulh v11.8H, v22.8H, v8.H[1] // .........................................................................................*..................................... + sqdmulh v16.8H, v19.8H, v8.H[1] // ..........................................................................................*.................................... + sqdmulh v2.8H, v3.8H, v8.H[1] // ........................................................................................*...................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + sqdmulh v15.8H, v30.8H, v8.H[1] // ...........................................................................................*................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ srshr v5.8H, v25.8H, #11 // ............................................................................................*.................................. + srshr v26.8H, v7.8H, #11 // ..............................................................................................*................................ + srshr v6.8H, v0.8H, #11 // ...............................................................................................*............................... + srshr v0.8H, v13.8H, #11 // .............................................................................................*................................. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + srshr v14.8H, v2.8H, #11 // ................................................................................................*.............................. + srshr v27.8H, v15.8H, #11 // .................................................................................................*............................. + srshr v15.8H, v16.8H, #11 // ...................................................................................................*........................... + srshr v16.8H, v11.8H, #11 // ..................................................................................................*............................ + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v17.8H, v0.8H, v8.H[0] // .....................................................................................................*......................... + mla v29.8H, v5.8H, v8.H[0] // ....................................................................................................*.......................... 
+ mla v23.8H, v6.8H, v8.H[0] // .......................................................................................................*....................... + mla v1.8H, v26.8H, v8.H[0] // ......................................................................................................*........................ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + mla v30.8H, v27.8H, v8.H[0] // ..........................................................................................................*.................... + mla v19.8H, v15.8H, v8.H[0] // ........................................................................................................*...................... + mla v3.8H, v14.8H, v8.H[0] // ...........................................................................................................*................... + mla v22.8H, v16.8H, v8.H[0] // .........................................................................................................*..................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn2 v25.4S, v29.4S, v1.4S // ............................................................................................................*.................. + trn1 v26.4S, v29.4S, v1.4S // .............................................................................................................*................. + trn1 v20.4S, v17.4S, v23.4S // ...............................................................................................................*............... + trn2 v31.4S, v17.4S, v23.4S // ..............................................................................................................*................ 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn2 v16.4S, v30.4S, v19.4S // .................................................................................................................*............. + trn1 v27.4S, v30.4S, v19.4S // ..................................................................................................................*............ + trn1 v14.4S, v3.4S, v22.4S // ................................................................................................................*.............. + trn2 v23.4S, v3.4S, v22.4S // ...................................................................................................................*........... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn1 v11.4S, v26.4S, v20.4S // ....................................................................................................................*.......... 
+ trn1 v6.4S, v25.4S, v31.4S // ......................................................................................................................*........ + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn2 v7.4S, v26.4S, v20.4S // .....................................................................................................................*......... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ trn1 v15.4S, v23.4S, v16.4S // ..........................................................................................................................*.... + trn1 v0.4S, v14.4S, v27.4S // ........................................................................................................................*...... + trn2 v26.4S, v14.4S, v27.4S // .......................................................................................................................*....... + trn2 v27.4S, v23.4S, v16.4S // .........................................................................................................................*..... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn1 v16.2D, v6.2D, v15.2D // ............................................................................................................................*.. + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... 
+ // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + trn1 v14.2D, v11.2D, v0.2D // ...........................................................................................................................*... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + str q14, [x6], #64 // .............................................................................................................................*. 
+ str q16, [x7], #64 // ..............................................................................................................................* + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + // gap // ............................................................................................................................... + + // -------------------------------------------------------- new position --------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|- + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x6] // ..*............................................................................................................................ + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x7] // ...*........................................................................................................................... + // ldr q17, [x4, #144] // ......*........................................................................................................................ + // ldr q29, [x4, #208] // ....*.......................................................................................................................... 
+ // ldr q16, [x4, #16] // ...........*................................................................................................................... + // ldr q25, [x4, #48] // ..............*................................................................................................................ + // ldr q13, [x4, #80] // ..................*............................................................................................................ + // ldr q26, [x4], #16*14 // ............*.................................................................................................................. + // ldr q3, [x4, #-32] // .....*......................................................................................................................... + // ldr q4, [x4, #-64] // .............*................................................................................................................. + // trn1 v19.4S, v23.4S, v11.4S // .....................*......................................................................................................... + // trn2 v28.4S, v23.4S, v11.4S // ....................*.......................................................................................................... + // trn2 v7.4S, v21.4S, v9.4S // .................*............................................................................................................. + // trn2 v0.4S, v24.4S, v12.4S // ................*.............................................................................................................. + // trn1 v5.4S, v21.4S, v9.4S // .............................*................................................................................................. + // trn2 v23.4S, v22.4S, v10.4S // ...................*........................................................................................................... 
+ // sqrdmulh v6.8H, v7.8H, v16.8H // ...............................*............................................................................................... + // mul v7.8H, v7.8H, v26.8H // .........................*..................................................................................................... + // mul v14.8H, v0.8H, v26.8H // ........................*...................................................................................................... + // sqrdmulh v21.8H, v0.8H, v16.8H // .......................*....................................................................................................... + // mul v31.8H, v23.8H, v26.8H // ..........................*.................................................................................................... + // mul v15.8H, v28.8H, v26.8H // ...........................*................................................................................................... + // sqrdmulh v27.8H, v23.8H, v16.8H // ............................*.................................................................................................. + // sqrdmulh v16.8H, v28.8H, v16.8H // ..............................*................................................................................................ + // ldr q23, [x4, #-192] // ........*...................................................................................................................... + // trn1 v11.4S, v24.4S, v12.4S // ...............*............................................................................................................... + // trn1 v0.4S, v22.4S, v10.4S // ......................*........................................................................................................ + // mla v7.8H, v6.8H, v8.H[0] // ...................................*........................................................................................... 
+ // mla v14.8H, v21.8H, v8.H[0] // ................................*.............................................................................................. + // mla v15.8H, v16.8H, v8.H[0] // ..................................*............................................................................................ + // mla v31.8H, v27.8H, v8.H[0] // .................................*............................................................................................. + // ldr q1, [x4, #-160] // ..........*.................................................................................................................... + // sub v22.8H, v5.8H, v7.8H // .....................................................*......................................................................... + // sub v18.8H, v11.8H, v14.8H // ....................................*.......................................................................................... + // add v27.8H, v11.8H, v14.8H // .....................................*......................................................................................... + // sub v26.8H, v19.8H, v15.8H // ......................................*........................................................................................ + // add v2.8H, v19.8H, v15.8H // .......................................*....................................................................................... + // sub v10.8H, v0.8H, v31.8H // ........................................*...................................................................................... + // sqrdmulh v16.8H, v18.8H, v13.8H // ............................................*.................................................................................. + // mul v15.8H, v18.8H, v1.8H // .............................................*................................................................................. 
+ // sqrdmulh v14.8H, v27.8H, v25.8H // ..........................................*.................................................................................... + // mul v28.8H, v27.8H, v23.8H // ...........................................*................................................................................... + // sqrdmulh v6.8H, v2.8H, v25.8H // .................................................*............................................................................. + // mul v25.8H, v2.8H, v23.8H // ..............................................*................................................................................ + // ldr q23, [x4, #-128] // .........*..................................................................................................................... + // mla v28.8H, v14.8H, v8.H[0] // ....................................................*.......................................................................... + // add v14.8H, v0.8H, v31.8H // .........................................*..................................................................................... + // mla v15.8H, v16.8H, v8.H[0] // ...................................................*........................................................................... + // sqrdmulh v16.8H, v26.8H, v13.8H // ...............................................*............................................................................... + // ldr q2, [x4, #-48] // .......*....................................................................................................................... + // mul v13.8H, v26.8H, v1.8H // ................................................*.............................................................................. + // add v26.8H, v5.8H, v7.8H // ..................................................*............................................................................ 
+ // mla v25.8H, v6.8H, v8.H[0] // .......................................................*....................................................................... + // ldr q7, [x4, #-112] // .*............................................................................................................................. + // ldr q1, [x4, #-96] // *.............................................................................................................................. + // sub v6.8H, v14.8H, v28.8H // ...........................................................*................................................................... + // add v27.8H, v14.8H, v28.8H // ..........................................................*.................................................................... + // sub v5.8H, v10.8H, v15.8H // .........................................................*..................................................................... + // add v15.8H, v10.8H, v15.8H // ........................................................*...................................................................... + // sub v31.8H, v26.8H, v25.8H // ......................................................................*........................................................ + // mla v13.8H, v16.8H, v8.H[0] // ......................................................*........................................................................ + // sqrdmulh v7.8H, v27.8H, v7.8H // ...............................................................*............................................................... + // mul v20.8H, v15.8H, v4.8H // .................................................................*............................................................. + // mul v0.8H, v27.8H, v23.8H // ..............................................................*................................................................ 
+ // sqrdmulh v14.8H, v15.8H, v2.8H // ................................................................*.............................................................. + // sqrdmulh v15.8H, v6.8H, v17.8H // ..................................................................*............................................................ + // sqrdmulh v16.8H, v5.8H, v29.8H // ...................................................................*........................................................... + // mul v27.8H, v5.8H, v3.8H // ....................................................................*.......................................................... + // mul v6.8H, v6.8H, v1.8H // .....................................................................*......................................................... + // add v4.8H, v26.8H, v25.8H // ........................................................................*...................................................... + // mla v0.8H, v7.8H, v8.H[0] // .......................................................................*....................................................... + // mla v20.8H, v14.8H, v8.H[0] // .........................................................................*..................................................... + // sub v11.8H, v22.8H, v13.8H // .............................................................*................................................................. + // mla v6.8H, v15.8H, v8.H[0] // ...........................................................................*................................................... + // mla v27.8H, v16.8H, v8.H[0] // ..........................................................................*.................................................... + // add v16.8H, v22.8H, v13.8H // ............................................................*.................................................................. 
+ // sub v2.8H, v16.8H, v20.8H // .............................................................................*................................................. + // add v3.8H, v16.8H, v20.8H // ..............................................................................*................................................ + // add v29.8H, v4.8H, v0.8H // ............................................................................*.................................................. + // sub v1.8H, v4.8H, v0.8H // ...............................................................................*............................................... + // sub v17.8H, v31.8H, v6.8H // ...................................................................................*........................................... + // add v24.8H, v31.8H, v6.8H // ..................................................................................*............................................ + // sub v22.8H, v11.8H, v27.8H // .................................................................................*............................................. + // add v11.8H, v11.8H, v27.8H // ................................................................................*.............................................. + // sqdmulh v7.8H, v29.8H, v8.H[1] // ......................................................................................*........................................ + // sqdmulh v0.8H, v1.8H, v8.H[1] // .......................................................................................*....................................... + // sqdmulh v14.8H, v3.8H, v8.H[1] // .....................................................................................*......................................... + // sqdmulh v27.8H, v2.8H, v8.H[1] // ....................................................................................*.......................................... 
+ // sqdmulh v26.8H, v24.8H, v8.H[1] // ..........................................................................................*.................................... + // sqdmulh v16.8H, v11.8H, v8.H[1] // ........................................................................................*...................................... + // sqdmulh v15.8H, v22.8H, v8.H[1] // .........................................................................................*..................................... + // sqdmulh v6.8H, v17.8H, v8.H[1] // ...........................................................................................*................................... + // srshr v7.8H, v7.8H, #11 // ............................................................................................*.................................. + // srshr v0.8H, v0.8H, #11 // ...............................................................................................*............................... + // srshr v14.8H, v14.8H, #11 // .............................................................................................*................................. + // srshr v27.8H, v27.8H, #11 // ..............................................................................................*................................ + // srshr v26.8H, v26.8H, #11 // ................................................................................................*.............................. + // srshr v6.8H, v6.8H, #11 // .................................................................................................*............................. + // srshr v16.8H, v16.8H, #11 // ...................................................................................................*........................... + // srshr v15.8H, v15.8H, #11 // ..................................................................................................*............................ 
+ // mla v29.8H, v7.8H, v8.H[0] // .....................................................................................................*......................... + // mla v1.8H, v0.8H, v8.H[0] // ....................................................................................................*.......................... + // mla v3.8H, v14.8H, v8.H[0] // .......................................................................................................*....................... + // mla v2.8H, v27.8H, v8.H[0] // ......................................................................................................*........................ + // mla v22.8H, v15.8H, v8.H[0] // .........................................................................................................*..................... + // mla v11.8H, v16.8H, v8.H[0] // ...........................................................................................................*................... + // mla v17.8H, v6.8H, v8.H[0] // ........................................................................................................*...................... + // mla v24.8H, v26.8H, v8.H[0] // ..........................................................................................................*.................... + // trn2 v25.4S, v29.4S, v3.4S // ............................................................................................................*.................. + // trn1 v7.4S, v29.4S, v3.4S // .............................................................................................................*................. + // trn2 v31.4S, v1.4S, v2.4S // ...............................................................................................................*............... + // trn1 v0.4S, v1.4S, v2.4S // ..............................................................................................................*................ 
+ // trn1 v13.4S, v24.4S, v11.4S // ..................................................................................................................*............ + // trn2 v15.4S, v17.4S, v22.4S // ................................................................................................................*.............. + // trn1 v14.4S, v17.4S, v22.4S // .................................................................................................................*............. + // trn2 v21.4S, v24.4S, v11.4S // ...................................................................................................................*........... + // trn1 v11.4S, v7.4S, v0.4S // ....................................................................................................................*.......... + // trn2 v7.4S, v7.4S, v0.4S // ......................................................................................................................*........ + // trn1 v6.4S, v25.4S, v31.4S // .....................................................................................................................*......... + // trn2 v26.4S, v13.4S, v14.4S // .........................................................................................................................*..... + // trn1 v0.4S, v13.4S, v14.4S // ........................................................................................................................*...... + // trn2 v27.4S, v21.4S, v15.4S // ..........................................................................................................................*.... + // trn1 v15.4S, v21.4S, v15.4S // .......................................................................................................................*....... + // trn1 v4.2D, v11.2D, v0.2D // ............................................................................................................................*.. 
+ // trn1 v16.2D, v6.2D, v15.2D // ...........................................................................................................................*... + // str q4, [x6], #64 // .............................................................................................................................*. + // str q16, [x7], #64 // ..............................................................................................................................* + + sub count, count, #1 +layer567_start: + // Instructions: 140 + // Expected cycles: 49 + // Expected IPC: 2.86 + // + // Cycle bound: 49.0 + // IPC bound: 2.86 + // + // Wall time: 600.18s + // User time: 600.18s + // + // ------------------------------------------------------------ original position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + trn2 v4.2D, v6.2D, v15.2D // ....................................................................................................................................*....... + trn2 v28.2D, v11.2D, v0.2D // ........................................................................................................................*................... + trn2 v15.2D, v7.2D, v26.2D // .........................................................................................................................*.................. + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x6] // e........................................................................................................................................... + ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x7] // .e.......................................................................................................................................... 
+ ldr q17, [x4, #144] // ...................e........................................................................................................................ + ldr q29, [x4, #208] // .......................e.................................................................................................................... + // gap // ............................................................................................................................................ + ldr q16, [x4, #16] // ...........e................................................................................................................................ + trn2 v14.4S, v25.4S, v31.4S // .................................................................................................................................*.......... + ldr q25, [x4, #48] // .............e.............................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q28, [x6, #-32] // ..............................................................................................................................*............. 
+ ldr q13, [x4, #80] // ...............e............................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q15, [x6, #-16] // ...............................................................................................................................*............ + trn1 v15.2D, v7.2D, v26.2D // ...........................................................................................................................*................ + ldr q26, [x4], #16*14 // ..........e................................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q4, [x7, #-32] // ..........................................................................................................................................*. + ldr q3, [x4, #-32] // ......................e..................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q15, [x6, #-48] // .............................................................................................................................*.............. 
+ trn2 v15.2D, v14.2D, v27.2D // .....................................................................................................................................*...... + ldr q4, [x4, #-64] // ....................e....................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v19.4S, v23.4S, v11.4S // ......e..................................................................................................................................... + trn2 v28.4S, v23.4S, v11.4S // .......e.................................................................................................................................... + trn2 v7.4S, v21.4S, v9.4S // ...e........................................................................................................................................ + trn2 v0.4S, v24.4S, v12.4S // .........e.................................................................................................................................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v5.4S, v21.4S, v9.4S // ..e......................................................................................................................................... + str q15, [x7, #-16] // ...........................................................................................................................................* + trn1 v15.2D, v14.2D, v27.2D // .......................................................................................................................................*.... + trn2 v23.4S, v22.4S, v10.4S // .....e...................................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v6.8H, v7.8H, v16.8H // ........................e................................................................................................................... 
+ mul v7.8H, v7.8H, v26.8H // .........................e.................................................................................................................. + mul v14.8H, v0.8H, v26.8H // ........................................e................................................................................................... + sqrdmulh v21.8H, v0.8H, v16.8H // .......................................e.................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q15, [x7, #-48] // .........................................................................................................................................*.. + mul v31.8H, v23.8H, v26.8H // ..............................e............................................................................................................. + mul v15.8H, v28.8H, v26.8H // ...................................e........................................................................................................ + sqrdmulh v27.8H, v23.8H, v16.8H // .............................e.............................................................................................................. + sqrdmulh v16.8H, v28.8H, v16.8H // ..................................e......................................................................................................... 
+ ldr q23, [x4, #-192] // ............e............................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v11.4S, v24.4S, v12.4S // ........e................................................................................................................................... + trn1 v0.4S, v22.4S, v10.4S // ....e....................................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v7.8H, v6.8H, v8.H[0] // ..........................e................................................................................................................. 
+ mla v14.8H, v21.8H, v8.H[0] // .........................................e.................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v15.8H, v16.8H, v8.H[0] // ....................................e....................................................................................................... + mla v31.8H, v27.8H, v8.H[0] // ...............................e............................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + ldr q1, [x4, #-160] // ..............e............................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v22.8H, v5.8H, v7.8H // ...........................e................................................................................................................ 
+ sub v18.8H, v11.8H, v14.8H // ..........................................e................................................................................................. + add v27.8H, v11.8H, v14.8H // ...........................................e................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v26.8H, v19.8H, v15.8H // .....................................e...................................................................................................... + add v2.8H, v19.8H, v15.8H // ......................................e..................................................................................................... + sub v10.8H, v0.8H, v31.8H // ................................e........................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v16.8H, v18.8H, v13.8H // ...........................................................e................................................................................ + mul v15.8H, v18.8H, v1.8H // ............................................................e............................................................................... + sqrdmulh v14.8H, v27.8H, v25.8H // .................................................e.......................................................................................... + mul v28.8H, v27.8H, v23.8H // ..................................................e......................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v6.8H, v2.8H, v25.8H // ............................................e............................................................................................... 
+ mul v25.8H, v2.8H, v23.8H // .............................................e.............................................................................................. + ldr q23, [x4, #-128] // ................e........................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v28.8H, v14.8H, v8.H[0] // ...................................................e........................................................................................ + add v14.8H, v0.8H, v31.8H // .................................e.......................................................................................................... + mla v15.8H, v16.8H, v8.H[0] // .............................................................e.............................................................................. + sqrdmulh v16.8H, v26.8H, v13.8H // ......................................................e..................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + ldr q2, [x4, #-48] // .....................e...................................................................................................................... + mul v13.8H, v26.8H, v1.8H // .......................................................e.................................................................................... 
+ add v26.8H, v5.8H, v7.8H // ............................e............................................................................................................... + mla v25.8H, v6.8H, v8.H[0] // ..............................................e............................................................................................. + ldr q7, [x4, #-112] // .................e.......................................................................................................................... + ldr q1, [x4, #-96] // ..................e......................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v6.8H, v14.8H, v28.8H // ....................................................e....................................................................................... + add v27.8H, v14.8H, v28.8H // .....................................................e...................................................................................... + sub v5.8H, v10.8H, v15.8H // ..............................................................e............................................................................. + add v15.8H, v10.8H, v15.8H // ...............................................................e............................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v31.8H, v26.8H, v25.8H // ...............................................e............................................................................................ 
+ mla v13.8H, v16.8H, v8.H[0] // ........................................................e................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v7.8H, v27.8H, v7.8H // ................................................................e........................................................................... + mul v20.8H, v15.8H, v4.8H // ...........................................................................e................................................................ + mul v0.8H, v27.8H, v23.8H // .................................................................e.......................................................................... + sqrdmulh v14.8H, v15.8H, v2.8H // ..........................................................................e................................................................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v15.8H, v6.8H, v17.8H // .....................................................................e...................................................................... + sqrdmulh v16.8H, v5.8H, v29.8H // ...............................................................................e............................................................ + mul v27.8H, v5.8H, v3.8H // ................................................................................e........................................................... + mul v6.8H, v6.8H, v1.8H // ......................................................................e..................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v4.8H, v26.8H, v25.8H // ................................................e........................................................................................... 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v0.8H, v7.8H, v8.H[0] // ..................................................................e......................................................................... + mla v20.8H, v14.8H, v8.H[0] // ............................................................................e............................................................... + sub v11.8H, v22.8H, v13.8H // .........................................................e.................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v6.8H, v15.8H, v8.H[0] // .......................................................................e.................................................................... + mla v27.8H, v16.8H, v8.H[0] // .................................................................................e.......................................................... + add v16.8H, v22.8H, v13.8H // ..........................................................e................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v2.8H, v16.8H, v20.8H // .............................................................................e.............................................................. + add v3.8H, v16.8H, v20.8H // ..............................................................................e............................................................. + add v29.8H, v4.8H, v0.8H // ....................................................................e....................................................................... + sub v1.8H, v4.8H, v0.8H // ...................................................................e........................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v17.8H, v31.8H, v6.8H // ........................................................................e................................................................... + add v24.8H, v31.8H, v6.8H // .........................................................................e.................................................................. + sub v22.8H, v11.8H, v27.8H // ..................................................................................e......................................................... + add v11.8H, v11.8H, v27.8H // ...................................................................................e........................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v7.8H, v29.8H, v8.H[1] // ....................................................................................e....................................................... 
+ sqdmulh v0.8H, v1.8H, v8.H[1] // .......................................................................................e.................................................... + sqdmulh v14.8H, v3.8H, v8.H[1] // ................................................................................................e........................................... + sqdmulh v27.8H, v2.8H, v8.H[1] // ...................................................................................................e........................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v26.8H, v24.8H, v8.H[1] // ..........................................................................................e................................................. + sqdmulh v16.8H, v11.8H, v8.H[1] // ......................................................................................................e..................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + sqdmulh v15.8H, v22.8H, v8.H[1] // .........................................................................................................e.................................. + sqdmulh v6.8H, v17.8H, v8.H[1] // .............................................................................................e.............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v7.8H, v7.8H, #11 // .....................................................................................e...................................................... 
+ srshr v0.8H, v0.8H, #11 // ........................................................................................e................................................... + srshr v14.8H, v14.8H, #11 // .................................................................................................e.......................................... + srshr v27.8H, v27.8H, #11 // ....................................................................................................e....................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v26.8H, v26.8H, #11 // ...........................................................................................e................................................ + srshr v6.8H, v6.8H, #11 // ..............................................................................................e............................................. + srshr v16.8H, v16.8H, #11 // .......................................................................................................e.................................... + srshr v15.8H, v15.8H, #11 // ..........................................................................................................e................................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v29.8H, v7.8H, v8.H[0] // ......................................................................................e..................................................... 
+ mla v1.8H, v0.8H, v8.H[0] // .........................................................................................e.................................................. + mla v3.8H, v14.8H, v8.H[0] // ..................................................................................................e......................................... + mla v2.8H, v27.8H, v8.H[0] // .....................................................................................................e...................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v22.8H, v15.8H, v8.H[0] // ...........................................................................................................e................................ + mla v11.8H, v16.8H, v8.H[0] // ........................................................................................................e................................... + mla v17.8H, v6.8H, v8.H[0] // ...............................................................................................e............................................ + mla v24.8H, v26.8H, v8.H[0] // ............................................................................................e............................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v25.4S, v29.4S, v3.4S // .............................................................................................................e.............................. 
+ trn1 v7.4S, v29.4S, v3.4S // ............................................................................................................e............................... + trn2 v31.4S, v1.4S, v2.4S // ...............................................................................................................e............................ + trn1 v0.4S, v1.4S, v2.4S // ..............................................................................................................e............................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v13.4S, v24.4S, v11.4S // ................................................................................................................e........................... + trn2 v15.4S, v17.4S, v22.4S // ...................................................................................................................e........................ + trn1 v14.4S, v17.4S, v22.4S // ..................................................................................................................e......................... + trn2 v21.4S, v24.4S, v11.4S // .................................................................................................................e.......................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v11.4S, v7.4S, v0.4S // ....................................................................................................................e....................... + trn2 v7.4S, v7.4S, v0.4S // .....................................................................................................................e...................... + trn1 v6.4S, v25.4S, v31.4S // ................................................................................................................................e........... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v26.4S, v13.4S, v14.4S // .......................................................................................................................e.................... 
+ trn1 v0.4S, v13.4S, v14.4S // ......................................................................................................................e..................... + trn2 v27.4S, v21.4S, v15.4S // ...................................................................................................................................e........ + trn1 v15.4S, v21.4S, v15.4S // ..................................................................................................................................e......... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v4.2D, v11.2D, v0.2D // ..........................................................................................................................e................. + trn1 v16.2D, v6.2D, v15.2D // ......................................................................................................................................e..... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q4, [x6], #64 // ............................................................................................................................e............... + str q16, [x7], #64 // ........................................................................................................................................e... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + + // ----------------------------------------------------------------------------- new position ------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------- + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // e........................................................................................................................................'..~............................. + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // .e.......................................................................................................................................'...~............................ + // trn1 v9.4s, v17.4s, v21.4s // .....................e...................................................................................................................'.......................~........ + // trn2 v13.4s, v17.4s, v21.4s // ...................e.....................................................................................................................'.....................~.......... + // trn1 v10.4s, v18.4s, v22.4s // ....................................e....................................................................................................'................................ 
+ // trn2 v14.4s, v18.4s, v22.4s // ........................e................................................................................................................'..........................~..... + // trn1 v11.4s, v19.4s, v23.4s // .................e.......................................................................................................................'...................~............ + // trn2 v15.4s, v19.4s, v23.4s // ..................e......................................................................................................................'....................~........... + // trn1 v12.4s, v20.4s, v24.4s // ...................................e.....................................................................................................'................................ + // trn2 v16.4s, v20.4s, v24.4s // ....................e....................................................................................................................'......................~......... + // ldr q0, [ x4], #16*14 // ...........e.............................................................................................................................'.............~.................. + // ldr q4, [x4, #-16*14+16*1] // ....e....................................................................................................................................'......~......................... + // ldr q1, [ x4, #-16*14+16*2] // ..................................e......................................................................................................'................................ + // ldr q5, [x4, #-16*14+16*3] // ......e..................................................................................................................................'........~....................... 
+ // ldr q2, [ x4, #-16*14+16*4] // .........................................e...............................................................................................'................................ + // ldr q6, [x4, #-16*14+16*5] // ........e................................................................................................................................'..........~..................... + // ldr q3, [ x4, #-16*14+16*6] // ......................................................e..................................................................................'................................ + // ldr q7, [x4, #-16*14+16*7] // ...............................................................e.........................................................................'................................ + // ldr q17, [ x4, #-16*14+16*8] // ................................................................e........................................................................'................................ + // ldr q18, [ x4, #-16*14+16*9] // ..e......................................................................................................................................'....~........................... + // ldr q19, [ x4, #-16*14+16*10] // ................e........................................................................................................................'..................~............. + // ldr q20, [ x4, #-16*14+16*11] // ...........................................................e.............................................................................'................................ + // ldr q21, [ x4, #-16*14+16*12] // .............e...........................................................................................................................'...............~................ 
+ // ldr q22, [ x4, #-16*14+16*13] // ...e.....................................................................................................................................'.....~.......................... + // sqrdmulh v28.8h, v13.8h, v4.8h // .........................e...............................................................................................................'...........................~.... + // mul v25.8h, v13.8h, v0.8h // ..........................e..............................................................................................................'............................~... + // mla v25.8h, v28.8h, v8.h[0] // .....................................e...................................................................................................'................................ + // sub v13.8h, v9.8h, v25.8h // ..........................................e..............................................................................................'................................ + // add v9.8h, v9.8h, v25.8h // .............................................................e...........................................................................'................................ + // sqrdmulh v28.8h, v14.8h, v4.8h // ................................e........................................................................................................'................................ + // mul v25.8h, v14.8h, v0.8h // ..............................e..........................................................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ........................................e................................................................................................'................................ 
+ // sub v14.8h, v10.8h, v25.8h // ...............................................e.........................................................................................'................................ + // add v10.8h, v10.8h, v25.8h // ........................................................e................................................................................'................................ + // sqrdmulh v28.8h, v15.8h, v4.8h // .................................e.......................................................................................................'................................ + // mul v25.8h, v15.8h, v0.8h // ...............................e.........................................................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // .......................................e.................................................................................................'................................ + // sub v15.8h, v11.8h, v25.8h // .............................................e...........................................................................................'................................ + // add v11.8h, v11.8h, v25.8h // ..............................................e..........................................................................................'................................ + // sqrdmulh v28.8h, v16.8h, v4.8h // ............................e............................................................................................................'..............................~. + // mul v25.8h, v16.8h, v0.8h // ...........................e.............................................................................................................'.............................~.. 
+ // mla v25.8h, v28.8h, v8.h[0] // ......................................e..................................................................................................'................................ + // sub v16.8h, v12.8h, v25.8h // ...........................................e.............................................................................................'................................ + // add v12.8h, v12.8h, v25.8h // ............................................e............................................................................................'................................ + // sqrdmulh v28.8h, v11.8h, v5.8h // ....................................................e....................................................................................'................................ + // mul v25.8h, v11.8h, v1.8h // .....................................................e...................................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ..............................................................e..........................................................................'................................ + // sub v11.8h, v9.8h, v25.8h // .....................................................................e...................................................................'................................ + // add v9.8h, v9.8h, v25.8h // ...............................................................................e.........................................................'................................ + // sqrdmulh v28.8h, v12.8h, v5.8h // ..................................................e......................................................................................'................................ 
+ // mul v25.8h, v12.8h, v1.8h // ...................................................e.....................................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // .......................................................e.................................................................................'................................ + // sub v12.8h, v10.8h, v25.8h // .................................................................e.......................................................................'................................ + // add v10.8h, v10.8h, v25.8h // ..................................................................e......................................................................'................................ + // sqrdmulh v28.8h, v15.8h, v6.8h // ..........................................................e..............................................................................'................................ + // mul v25.8h, v15.8h, v2.8h // ............................................................e............................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ......................................................................e..................................................................'................................ + // sub v15.8h, v13.8h, v25.8h // ..................................................................................e......................................................'................................ + // add v13.8h, v13.8h, v25.8h // .....................................................................................e...................................................'................................ 
+ // sqrdmulh v28.8h, v16.8h, v6.8h // ................................................e........................................................................................'................................ + // mul v25.8h, v16.8h, v2.8h // .................................................e.......................................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // .........................................................e...............................................................................'................................ + // sub v16.8h, v14.8h, v25.8h // ...................................................................e.....................................................................'................................ + // add v14.8h, v14.8h, v25.8h // ....................................................................e....................................................................'................................ + // sqrdmulh v28.8h, v10.8h, v7.8h // .......................................................................e.................................................................'................................ + // mul v25.8h, v10.8h, v3.8h // .........................................................................e...............................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ................................................................................e........................................................'................................ + // sub v10.8h, v9.8h, v25.8h // .........................................................................................e...............................................'................................ 
+ // add v9.8h, v9.8h, v25.8h // ........................................................................................e................................................'................................ + // sqrdmulh v28.8h, v12.8h, v18.8h // ...........................................................................e.............................................................'................................ + // mul v25.8h, v12.8h, v17.8h // ..............................................................................e..........................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ...................................................................................e.....................................................'................................ + // sub v12.8h, v11.8h, v25.8h // ..........................................................................................e..............................................'................................ + // add v11.8h, v11.8h, v25.8h // ...........................................................................................e.............................................'................................ + // sqrdmulh v28.8h, v14.8h, v20.8h // ..........................................................................e..............................................................'................................ + // mul v25.8h, v14.8h, v19.8h // ........................................................................e................................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // .................................................................................e.......................................................'................................ 
+ // sub v14.8h, v13.8h, v25.8h // ......................................................................................e..................................................'................................ + // add v13.8h, v13.8h, v25.8h // .......................................................................................e.................................................'................................ + // sqrdmulh v28.8h, v16.8h, v22.8h // ............................................................................e............................................................'................................ + // mul v25.8h, v16.8h, v21.8h // .............................................................................e...........................................................'................................ + // mla v25.8h, v28.8h, v8.h[0] // ....................................................................................e....................................................'................................ + // sub v16.8h, v15.8h, v25.8h // ............................................................................................e............................................'................................ + // add v15.8h, v15.8h, v25.8h // .............................................................................................e...........................................'................................ + // sqdmulh v26.8h, v9.8h, v8.h[1] // ..............................................................................................e..........................................'................................ + // srshr v26.8h, v26.8h, #11 // ......................................................................................................e..................................'................................ 
+ // mla v9.8h, v26.8h, v8.h[0] // ..............................................................................................................e..........................'................................ + // sqdmulh v26.8h, v10.8h, v8.h[1] // ...............................................................................................e.........................................'................................ + // srshr v26.8h, v26.8h, #11 // .......................................................................................................e.................................'................................ + // mla v10.8h, v26.8h, v8.h[0] // ...............................................................................................................e.........................'................................ + // sqdmulh v26.8h, v11.8h, v8.h[1] // ..................................................................................................e......................................'................................ + // srshr v26.8h, v26.8h, #11 // ..........................................................................................................e..............................'................................ + // mla v11.8h, v26.8h, v8.h[0] // .....................................................................................................................e...................'................................ + // sqdmulh v26.8h, v12.8h, v8.h[1] // .....................................................................................................e...................................'................................ + // srshr v26.8h, v26.8h, #11 // ...........................................................................................................e.............................'................................ 
+ // mla v12.8h, v26.8h, v8.h[0] // ....................................................................................................................e....................'................................ + // sqdmulh v26.8h, v13.8h, v8.h[1] // ................................................................................................e........................................'................................ + // srshr v26.8h, v26.8h, #11 // ........................................................................................................e................................'................................ + // mla v13.8h, v26.8h, v8.h[0] // ................................................................................................................e........................'................................ + // sqdmulh v26.8h, v14.8h, v8.h[1] // .................................................................................................e.......................................'................................ + // srshr v26.8h, v26.8h, #11 // .........................................................................................................e...............................'................................ + // mla v14.8h, v26.8h, v8.h[0] // .................................................................................................................e.......................'................................ + // sqdmulh v26.8h, v15.8h, v8.h[1] // ...................................................................................................e.....................................'................................ + // srshr v26.8h, v26.8h, #11 // ............................................................................................................e............................'................................ 
+ // mla v15.8h, v26.8h, v8.h[0] // ...................................................................................................................e.....................'................................ + // sqdmulh v26.8h, v16.8h, v8.h[1] // ....................................................................................................e....................................'................................ + // srshr v26.8h, v26.8h, #11 // .............................................................................................................e...........................'................................ + // mla v16.8h, v26.8h, v8.h[0] // ..................................................................................................................e......................'................................ + // trn1 v17.4s, v9.4s, v13.4s // .......................................................................................................................e.................'................................ + // trn2 v21.4s, v9.4s, v13.4s // ......................................................................................................................e..................'................................ + // trn1 v18.4s, v10.4s, v14.4s // .........................................................................................................................e...............'................................ + // trn2 v22.4s, v10.4s, v14.4s // ........................................................................................................................e................'................................ + // trn1 v19.4s, v11.4s, v15.4s // ..........................................................................................................................e..............'................................ 
+ // trn2 v23.4s, v11.4s, v15.4s // .............................................................................................................................e...........'................................ + // trn1 v20.4s, v12.4s, v16.4s // ............................................................................................................................e............'................................ + // trn2 v24.4s, v12.4s, v16.4s // ...........................................................................................................................e.............'................................ + // trn1 v26.4s, v17.4s, v18.4s // ..............................................................................................................................e..........'................................ + // trn2 v27.4s, v17.4s, v18.4s // ...............................................................................................................................e.........'................................ + // trn1 v28.4s, v19.4s, v20.4s // ..................................................................................................................................e......'................................ + // trn2 v29.4s, v19.4s, v20.4s // .................................................................................................................................e.......'................................ + // trn2 v19.2d, v26.2d, v28.2d // .........................................................................................................................................'*............................... + // trn2 v20.2d, v27.2d, v29.2d // .........................................................................................................................................'.*.............................. 
+ // trn1 v17.2d, v26.2d, v28.2d // .....................................................................................................................................e...'................................ + // trn1 v18.2d, v27.2d, v29.2d // ..........~..............................................................................................................................'............*................... + // str q17, [x6], #64 // .......................................................................................................................................e.'................................ + // str q18, [x6, #(-(64) + 16*1)] // ..............~..........................................................................................................................'................*............... + // str q19, [x6, #(-(64) + 16*2)] // .......~.................................................................................................................................'.........*...................... + // str q20, [x6, #(-(64) + 16*3)] // .........~...............................................................................................................................'...........*.................... + // trn1 v26.4s, v21.4s, v22.4s // ................................................................................................................................e........'................................ + // trn2 v27.4s, v21.4s, v22.4s // .....~...................................................................................................................................'.......*........................ + // trn1 v28.4s, v23.4s, v24.4s // ....................................................................................................................................e....'................................ 
+ // trn2 v29.4s, v23.4s, v24.4s // ...................................................................................................................................e.....'................................ + // trn2 v23.2d, v26.2d, v28.2d // .........................................................................................................................................*................................ + // trn2 v24.2d, v27.2d, v29.2d // ...............~.........................................................................................................................'.................*.............. + // trn1 v21.2d, v26.2d, v28.2d // ......................................................................................................................................e..'................................ + // trn1 v22.2d, v27.2d, v29.2d // .......................~.................................................................................................................'.........................*...... + // str q21, [x7], #64 // ........................................................................................................................................e'................................ + // str q22, [x7, #(-(64) + 16*1)] // .............................~...........................................................................................................'...............................* + // str q23, [x7, #(-(64) + 16*2)] // ............~............................................................................................................................'..............*................. + // str q24, [x7, #(-(64) + 16*3)] // ......................~..................................................................................................................'........................*....... 
+ + sub count, count, #1 + cbnz count, layer567_start + // Instructions: 13 + // Expected cycles: 5 + // Expected IPC: 2.60 + // + // Cycle bound: 5.0 + // IPC bound: 2.60 + // + // Wall time: 0.10s + // User time: 0.10s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + trn2 v14.4S, v25.4S, v31.4S // ...*.......................... + trn2 v15.2D, v6.2D, v15.2D // *............................. + trn2 v16.2D, v11.2D, v0.2D // .*............................ + trn2 v6.2D, v7.2D, v26.2D // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + trn1 v26.2D, v7.2D, v26.2D // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + trn2 v0.2D, v14.2D, v27.2D // .........*.................... + trn1 v27.2D, v14.2D, v27.2D // ...........*.................. + str q15, [x7, #-32] // .......*...................... + str q16, [x6, #-32] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q6, [x6, #-16] // .....*........................ + str q26, [x6, #-48] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q0, [x7, #-16] // ..........*................... + str q27, [x7, #-48] // ............*................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // trn2 v4.2D, v6.2D, v15.2D // .*............................. + // trn2 v28.2D, v11.2D, v0.2D // ..*............................ + // trn2 v15.2D, v7.2D, v26.2D // ...*........................... + // trn2 v14.4S, v25.4S, v31.4S // *.............................. + // str q28, [x6, #-32] // ........*...................... + // str q15, [x6, #-16] // .........*..................... + // trn1 v15.2D, v7.2D, v26.2D // ....*.......................... + // str q4, [x7, #-32] // .......*....................... + // str q15, [x6, #-48] // ..........*.................... + // trn2 v15.2D, v14.2D, v27.2D // .....*......................... + // str q15, [x7, #-16] // ...........*................... + // trn1 v15.2D, v14.2D, v27.2D // ......*........................ + // str q15, [x7, #-48] // ............*.................. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_icestorm.s new file mode 100644 index 00000000..49882ab5 --- /dev/null +++ b/examples/opt/aarch64/ntt_kyber_1234_567_manual_st4_opt_m1_icestorm.s @@ -0,0 +1,1833 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive (used below to load the addresses of roots and const_addr) +#include <hal_env.h> + +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h // vector-by-vector form on .8h lanes +.endm +.macro vmla d,a,b + mla \d\().8h, \a\().8h, \b\().8h +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] // by-element form: each lane of a times lane i of b +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmlaq d,a,b,i + mla \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 // t2 = sqrdmulh(src, const[idx1]); clobbers t2 + vmulq \dst, \src, \const, \idx0 // dst = src * const[idx0] + vmlaq \dst, t2, consts, 0 // dst += t2 * consts[0]; const_addr holds -3329 in lane 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted // same scheme as mulmodq with full vectors instead of lanes; clobbers t2 + mul \dst\().8h, \src\().8h, \const\().8h + vmlaq \dst, t2, consts, 0 +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 // tmp = mulmodq(b, root lanes idx0/idx1); clobbers tmp and t2 + sub \b\().8h, \a\().8h, tmp.8h // b = a - tmp, written first while a still holds its old value + add \a\().8h, \a\().8h, tmp.8h // a += tmp +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted // vector-root variant of ct_butterfly; clobbers tmp and t2 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 // t0 = sqdmulh(a, consts[1]); const_addr holds 20159 in lane 1; clobbers t0 + srshr t0.8h, t0.8h, #11 // rounding shift right by 11 + vmlaq \a, t0, consts, 0 // a += t0 * consts[0] (-3329) +.endm + +.macro load_roots_123 + ldr qform_root0, [r_ptr0], #32 // load root0, then advance r_ptr0 by 32 bytes + ldr qform_root1, [r_ptr0, #-16] // root1 is the second 16-byte vector of the pair just skipped +.endm + +.macro load_next_roots_45 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #16 +.endm + +.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr qform_\root0, [ \r_ptr1], #(6*16) // post-increment by six vectors; the loads below index back from the advanced pointer + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, 
\data1\().4s // stage 1: interleave 32-bit lanes into t0-t3 (clobbered) + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d // stage 2: interleave 64-bit halves back into data0-data3 + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str qform_\a0, [\addr], #\inc // store a0, then advance addr by inc; the stores below index back from the advanced pointer + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes for x19-x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store; redundant but harmless + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the area reserved by save_gprs +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] // argument order: register first, then stack offset +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] // argument order is reversed relative to restore: stack offset first +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // scratch area addressed through save/restore +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + + .global ntt_kyber_1234_567 + .global _ntt_kyber_1234_567 + +.p2align 4 +const_addr: .short -3329 // lane 0 of consts + .short 20159 // lane 1 of consts + .short 0 + .short 0 + .short 0 + 
.short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567: +_ntt_kyber_1234_567: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + src0 .req x6 + src1 .req x7 + src2 .req x8 + src3 .req x9 + src4 .req x10 + src5 .req x11 + src6 .req x12 + src7 .req x13 + src8 .req x14 + src9 .req x15 + src10 .req x16 + src11 .req x17 + src12 .req x18 + src13 .req x19 + src14 .req x20 + src15 .req x21 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + data8 .req v17 + data9 .req v18 + data10 .req v19 + data11 .req v20 + data12 .req v21 + data13 .req v22 + data14 .req v23 + data15 .req v24 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + qform_data8 .req q17 + qform_data9 .req q18 + qform_data10 .req q19 + qform_data11 .req q20 + qform_data12 .req q21 + qform_data13 .req q22 + qform_data14 .req q23 + qform_data15 .req q24 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw 
.req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + tmp .req v25 + t0 .req v26 + t1 .req v27 + t2 .req v28 + t3 .req v29 + + consts .req v8 + + ASM_LOAD(r_ptr0, roots) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + add src0, x0, #32*0 + add src8, x0, #32*8 + + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + + mov count, #2 + + .p2align 2 + ldr q28, [x6, #32] // .*.............................................................................................................................................................................................. + ldr q6, [x14, #0] // ........*....................................................................................................................................................................................... + ldr q26, [x14, #128] // ............*................................................................................................................................................................................... + ldr q22, [x14, #32] // .........*...................................................................................................................................................................................... + ldr q29, [x14, #64] // ..........*..................................................................................................................................................................................... + ldr q16, [x14, #160] // .............*.................................................................................................................................................................................. + ldr q12, [x6, #192] // ......*......................................................................................................................................................................................... 
+ ldr q19, [x14, #224] // ...............*................................................................................................................................................................................ + ldr q7, [x6, #160] // .....*.......................................................................................................................................................................................... + mul v9.8H, v6.8H, v0.H[0] // .................*.............................................................................................................................................................................. + sqrdmulh v6.8H, v6.8H, v0.H[1] // ................*............................................................................................................................................................................... + ldr q4, [x14, #192] // ..............*................................................................................................................................................................................. + sqrdmulh v15.8H, v22.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + ldr q11, [x6, #128] // ....*........................................................................................................................................................................................... + mul v22.8H, v22.8H, v0.H[0] // ......................*......................................................................................................................................................................... + ldr q17, [x6, #64] // ..*............................................................................................................................................................................................. 
+ sqrdmulh v31.8H, v16.8H, v0.H[1] // .........................................*...................................................................................................................................................... + ldr q18, [x14, #96] // ...........*.................................................................................................................................................................................... + ldr q21, [x6, #96] // ...*............................................................................................................................................................................................ + sqrdmulh v20.8H, v4.8H, v0.H[1] // ..............................................*................................................................................................................................................. + mul v13.8H, v4.8H, v0.H[0] // ...............................................*................................................................................................................................................ + mla v22.8H, v15.8H, v8.H[0] // .......................*........................................................................................................................................................................ + sqrdmulh v25.8H, v19.8H, v0.H[1] // ...................................................*............................................................................................................................................ + mla v9.8H, v6.8H, v8.H[0] // ..................*............................................................................................................................................................................. 
+ sqrdmulh v5.8H, v29.8H, v0.H[1] // ..........................*..................................................................................................................................................................... + mla v13.8H, v20.8H, v8.H[0] // ................................................*............................................................................................................................................... + mul v14.8H, v16.8H, v0.H[0] // ..........................................*..................................................................................................................................................... + sub v6.8H, v28.8H, v22.8H // ........................*....................................................................................................................................................................... + mul v30.8H, v19.8H, v0.H[0] // ....................................................*........................................................................................................................................... + mul v27.8H, v29.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + mul v24.8H, v26.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + add v29.8H, v12.8H, v13.8H // ..................................................*............................................................................................................................................. 
+ sub v16.8H, v12.8H, v13.8H // .................................................*.............................................................................................................................................. + sqrdmulh v12.8H, v26.8H, v0.H[1] // ....................................*........................................................................................................................................................... + mla v30.8H, v25.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + ldr q25, [x6, #224] // .......*........................................................................................................................................................................................ + mla v14.8H, v31.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + mla v27.8H, v5.8H, v8.H[0] // ............................*................................................................................................................................................................... + mla v24.8H, v12.8H, v8.H[0] // ......................................*......................................................................................................................................................... + ldr q19, [x6, #0] // *............................................................................................................................................................................................... 
+ sqrdmulh v20.8H, v18.8H, v0.H[1] // ...............................*................................................................................................................................................................ + mul v18.8H, v18.8H, v0.H[0] // ................................*............................................................................................................................................................... + add v22.8H, v28.8H, v22.8H // .........................*...................................................................................................................................................................... + sub v12.8H, v17.8H, v27.8H // .............................*.................................................................................................................................................................. + mul v31.8H, v29.8H, v0.H[2] // ...................................................................*............................................................................................................................ + sqrdmulh v4.8H, v29.8H, v0.H[3] // ..................................................................*............................................................................................................................. + add v26.8H, v7.8H, v14.8H // .............................................*.................................................................................................................................................. + sub v7.8H, v7.8H, v14.8H // ............................................*................................................................................................................................................... 
+ add v27.8H, v17.8H, v27.8H // ..............................*................................................................................................................................................................. + add v17.8H, v25.8H, v30.8H // .......................................................*........................................................................................................................................ + sub v30.8H, v25.8H, v30.8H // ......................................................*......................................................................................................................................... + sqrdmulh v25.8H, v26.8H, v0.H[3] // .............................................................*.................................................................................................................................. + add v15.8H, v19.8H, v9.8H // ....................*........................................................................................................................................................................... + sub v19.8H, v19.8H, v9.8H // ...................*............................................................................................................................................................................ + sqrdmulh v9.8H, v16.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mul v16.8H, v16.8H, v0.H[4] // .......................................................................................*........................................................................................................ 
+ add v5.8H, v11.8H, v24.8H // ........................................*....................................................................................................................................................... + sub v11.8H, v11.8H, v24.8H // .......................................*........................................................................................................................................................ + mul v29.8H, v5.8H, v0.H[2] // .........................................................*...................................................................................................................................... + sqrdmulh v24.8H, v5.8H, v0.H[3] // ........................................................*....................................................................................................................................... + sqrdmulh v10.8H, v7.8H, v0.H[5] // .................................................................................*.............................................................................................................. + mul v28.8H, v7.8H, v0.H[4] // ..................................................................................*............................................................................................................. + mul v13.8H, v26.8H, v0.H[2] // ..............................................................*................................................................................................................................. + sqrdmulh v23.8H, v17.8H, v0.H[3] // .......................................................................*........................................................................................................................ 
+ mla v16.8H, v9.8H, v8.H[0] // ........................................................................................*....................................................................................................... + mla v18.8H, v20.8H, v8.H[0] // .................................*.............................................................................................................................................................. + mla v31.8H, v4.8H, v8.H[0] // ....................................................................*........................................................................................................................... + mla v28.8H, v10.8H, v8.H[0] // ...................................................................................*............................................................................................................ + mla v13.8H, v25.8H, v8.H[0] // ...............................................................*................................................................................................................................ + mul v14.8H, v11.8H, v0.H[4] // .............................................................................*.................................................................................................................. + add v5.8H, v21.8H, v18.8H // ...................................*............................................................................................................................................................ + sqrdmulh v7.8H, v30.8H, v0.H[5] // ...........................................................................................*.................................................................................................... 
+ add v9.8H, v6.8H, v28.8H // .....................................................................................*.......................................................................................................... + sub v28.8H, v6.8H, v28.8H // ....................................................................................*........................................................................................................... + sqrdmulh v6.8H, v11.8H, v0.H[5] // ............................................................................*................................................................................................................... + sub v20.8H, v21.8H, v18.8H // ..................................*............................................................................................................................................................. + sub v18.8H, v22.8H, v13.8H // ................................................................*............................................................................................................................... + add v11.8H, v22.8H, v13.8H // .................................................................*.............................................................................................................................. + mul v4.8H, v30.8H, v0.H[4] // ............................................................................................*................................................................................................... + mla v29.8H, v24.8H, v8.H[0] // ..........................................................*..................................................................................................................................... 
+ mul v24.8H, v17.8H, v0.H[2] // ........................................................................*....................................................................................................................... + add v17.8H, v27.8H, v31.8H // ......................................................................*......................................................................................................................... + sub v26.8H, v27.8H, v31.8H // .....................................................................*.......................................................................................................................... + mla v4.8H, v7.8H, v8.H[0] // .............................................................................................*.................................................................................................. + mla v14.8H, v6.8H, v8.H[0] // ..............................................................................*................................................................................................................. + mul v31.8H, v26.8H, v1.H[0] // ...........................................................................................................*.................................................................................... + mla v24.8H, v23.8H, v8.H[0] // .........................................................................*...................................................................................................................... + sqrdmulh v22.8H, v26.8H, v1.H[1] // ..........................................................................................................*..................................................................................... 
+ sub v27.8H, v12.8H, v16.8H // .........................................................................................*...................................................................................................... + sub v23.8H, v20.8H, v4.8H // ..............................................................................................*................................................................................................. + add v30.8H, v19.8H, v14.8H // ................................................................................*............................................................................................................... + sub v26.8H, v5.8H, v24.8H // ..........................................................................*..................................................................................................................... + sub v10.8H, v19.8H, v14.8H // ...............................................................................*................................................................................................................ + add v7.8H, v5.8H, v24.8H // ...........................................................................*.................................................................................................................... + mul v19.8H, v23.8H, v1.H[4] // ....................................................................................................................................*........................................................... + mul v25.8H, v26.8H, v1.H[0] // ................................................................................................................*............................................................................... 
+ sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 48 + // Expected IPC: 4.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + add v13.8H, v15.8H, v29.8H // ............*................................................................................................................................................................................... + mla v31.8H, v22.8H, v8.H[0] // ..........*..................................................................................................................................................................................... + sqrdmulh v6.8H, v26.8H, v1.H[1] // .*.............................................................................................................................................................................................. + sub v26.8H, v15.8H, v29.8H // .................................*.............................................................................................................................................................. + sqrdmulh v21.8H, v27.8H, v1.H[5] // ...................................................*............................................................................................................................................ + mul v15.8H, v27.8H, v1.H[4] // .......................................................*........................................................................................................................................ 
+ add v27.8H, v26.8H, v31.8H // ....................................................................................*........................................................................................................... + sqrdmulh v14.8H, v7.8H, v0.H[7] // ...*............................................................................................................................................................................................ + mul v24.8H, v7.8H, v0.H[6] // ..*............................................................................................................................................................................................. + add v4.8H, v20.8H, v4.8H // *............................................................................................................................................................................................... + sqrdmulh v22.8H, v23.8H, v1.H[5] // ........................................*....................................................................................................................................................... + mla v25.8H, v6.8H, v8.H[0] // .......*........................................................................................................................................................................................ + sqrdmulh v5.8H, v4.8H, v1.H[3] // .....*.......................................................................................................................................................................................... + mul v6.8H, v4.8H, v1.H[2] // ....*........................................................................................................................................................................................... 
+ mla v15.8H, v21.8H, v8.H[0] // ...............................................................*................................................................................................................................ + add v21.8H, v12.8H, v16.8H // .........................*...................................................................................................................................................................... + mla v24.8H, v14.8H, v8.H[0] // .........*...................................................................................................................................................................................... + sub v23.8H, v18.8H, v25.8H // .............*.................................................................................................................................................................................. + ldr q7, [x14, #176] // .....................................................................................................*.......................................................................................... + sqrdmulh v29.8H, v21.8H, v1.H[3] // .............................*.................................................................................................................................................................. + sub v26.8H, v26.8H, v31.8H // .....................................*.......................................................................................................................................................... + add v12.8H, v10.8H, v15.8H // ..................................................................................*............................................................................................................. 
+ sub v10.8H, v10.8H, v15.8H // ......................................................................*......................................................................................................................... + mul v15.8H, v21.8H, v1.H[2] // ............................*................................................................................................................................................................... + sub v4.8H, v11.8H, v24.8H // ......................................*......................................................................................................................................................... + mla v19.8H, v22.8H, v8.H[0] // ...............................................*................................................................................................................................................ + sqrdmulh v16.8H, v17.8H, v0.H[7] // ........*....................................................................................................................................................................................... + add v21.8H, v11.8H, v24.8H // ...............*................................................................................................................................................................................ + mla v6.8H, v5.8H, v8.H[0] // ...........*.................................................................................................................................................................................... + mla v15.8H, v29.8H, v8.H[0] // ....................................*........................................................................................................................................................... 
+ mul v17.8H, v17.8H, v0.H[6] // ......*......................................................................................................................................................................................... + add v22.8H, v28.8H, v19.8H // ..........................................................*..................................................................................................................................... + sub v19.8H, v28.8H, v19.8H // ......................................................*......................................................................................................................................... + add v29.8H, v9.8H, v6.8H // .................*.............................................................................................................................................................................. + sub v9.8H, v9.8H, v6.8H // .......................*........................................................................................................................................................................ + mla v17.8H, v16.8H, v8.H[0] // ..............*................................................................................................................................................................................. + add v28.8H, v18.8H, v25.8H // ..................................................................*............................................................................................................................. + sqrdmulh v16.8H, v23.8H, v2.H[5] // ................*............................................................................................................................................................................... 
+ sqrdmulh v18.8H, v9.8H, v3.H[1] // ................................*............................................................................................................................................................... + mul v5.8H, v9.8H, v3.H[0] // ..........................*..................................................................................................................................................................... + mul v25.8H, v29.8H, v2.H[6] // ....................*........................................................................................................................................................................... + sqrdmulh v29.8H, v29.8H, v2.H[7] // ......................*......................................................................................................................................................................... + mul v24.8H, v19.8H, v3.H[4] // .............................................................*.................................................................................................................................. + sqrdmulh v19.8H, v19.8H, v3.H[5] // ............................................................*................................................................................................................................... + sub v31.8H, v30.8H, v15.8H // ..........................................*..................................................................................................................................................... + add v6.8H, v30.8H, v15.8H // ............................................*................................................................................................................................................... 
+ mla v25.8H, v29.8H, v8.H[0] // ...............................*................................................................................................................................................................ + sqrdmulh v9.8H, v21.8H, v1.H[7] // ..................*............................................................................................................................................................................. + sub v14.8H, v13.8H, v17.8H // ....................................................................*........................................................................................................................... + mul v21.8H, v21.8H, v1.H[6] // ...................*............................................................................................................................................................................ + add v11.8H, v13.8H, v17.8H // ...........................*.................................................................................................................................................................... + ldr q13, [x14, #144] // ..................................................................................................*............................................................................................. + mul v29.8H, v7.8H, v0.H[0] // ..........................................................................................................................*..................................................................... + add v20.8H, v6.8H, v25.8H // ....................................................*........................................................................................................................................... 
+ sub v15.8H, v6.8H, v25.8H // ................................................*............................................................................................................................................... + mla v21.8H, v9.8H, v8.H[0] // ........................*....................................................................................................................................................................... + mul v6.8H, v23.8H, v2.H[4] // .....................*.......................................................................................................................................................................... + ldr q23, [x14, #48] // ...................................................................................................*............................................................................................ + str q20, [x14], #16 // .........................................................*...................................................................................................................................... + ldr q20, [x14, #64] // ....................................................................................................*........................................................................................... + sqrdmulh v17.8H, v23.8H, v0.H[1] // ............................................................................................................*................................................................................... + mul v25.8H, v23.8H, v0.H[0] // ..............................................................................................................*................................................................................. 
+ mul v30.8H, v28.8H, v2.H[2] // ............................................................................*................................................................................................................... + sqrdmulh v7.8H, v7.8H, v0.H[1] // ................................................................................................................*............................................................................... + str q15, [x14, #16] // ...........................................................*.................................................................................................................................... + ldr q23, [x6, #240] // ...................................................................................................................................*............................................................ + mla v24.8H, v19.8H, v8.H[0] // ...................................................................*............................................................................................................................ + ldr q19, [x6, #176] // ........................................................................................................*....................................................................................... + sqrdmulh v9.8H, v28.8H, v2.H[3] // .......................................................................*........................................................................................................................ + ldr q15, [x6, #80] // ...............................................................................................................*................................................................................ 
+ mla v25.8H, v17.8H, v8.H[0] // .....................................................................................................................*.......................................................................... + add v17.8H, v11.8H, v21.8H // ..............................*................................................................................................................................................................. + ldr q28, [x14, #224] // .......................................................................................................*........................................................................................ + mla v29.8H, v7.8H, v8.H[0] // ....................................................................................................................................*........................................................... + sub v11.8H, v11.8H, v21.8H // .................................................*.............................................................................................................................................. + ldr q21, [x14, #192] // ...........................................................................................................*.................................................................................... + str q17, [x6], #16 // ..................................*............................................................................................................................................................. + sub v17.8H, v10.8H, v24.8H // .........................................................................*...................................................................................................................... 
+ add v7.8H, v10.8H, v24.8H // ..........................................................................*..................................................................................................................... + str q11, [x6, #16] // .....................................................*.......................................................................................................................................... + ldr q11, [x14, #96] // .................................................................................................................*.............................................................................. + mul v24.8H, v20.8H, v0.H[0] // .............................................................................................................................*.................................................................. + sqrdmulh v20.8H, v20.8H, v0.H[1] // ........................................................................................................................*....................................................................... + mla v6.8H, v16.8H, v8.H[0] // ...................................*............................................................................................................................................................ + sqrdmulh v16.8H, v22.8H, v3.H[3] // ........................................................................*....................................................................................................................... + str q7, [x14, #176] // .............................................................................*.................................................................................................................. 
+ mul v10.8H, v22.8H, v3.H[2] // .................................................................*.............................................................................................................................. + sqrdmulh v22.8H, v28.8H, v0.H[1] // ......................................................................................................................*......................................................................... + str q17, [x14, #208] // ........................................................................................*....................................................................................................... + mla v24.8H, v20.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + mul v20.8H, v28.8H, v0.H[0] // ............................................................................................................................*................................................................... + sub v17.8H, v26.8H, v6.8H // .........................................*...................................................................................................................................................... + sqrdmulh v7.8H, v4.8H, v2.H[1] // ................................................................*............................................................................................................................... + add v26.8H, v26.8H, v6.8H // .................................................................................*.............................................................................................................. 
+ ldr q6, [x6, #96] // ..................................................................................................................*............................................................................. + mla v10.8H, v16.8H, v8.H[0] // ...............................................................................*................................................................................................................ + mla v5.8H, v18.8H, v8.H[0] // .......................................*........................................................................................................................................................ + str q17, [x6, #208] // ..............................................*................................................................................................................................................. + mla v20.8H, v22.8H, v8.H[0] // ..................................................................................................................................*............................................................. + ldr q16, [x6, #32] // ................................................................................................*............................................................................................... + add v17.8H, v12.8H, v10.8H // .......................................................................................*........................................................................................................ + str q26, [x6, #176] // ......................................................................................*......................................................................................................... + mul v4.8H, v4.8H, v2.H[0] // ...........................................*.................................................................................................................................................... 
+ sub v10.8H, v12.8H, v10.8H // .........................................................................................*...................................................................................................... + sqrdmulh v26.8H, v11.8H, v0.H[1] // ........................................................................................................................................*....................................................... + str q17, [x14, #112] // ..........................................................................................*..................................................................................................... + sub v22.8H, v23.8H, v20.8H // ..................................................................................................................................................*............................................. + sub v17.8H, v31.8H, v5.8H // .............................................*.................................................................................................................................................. + sub v12.8H, v15.8H, v24.8H // ...........................................................................................................................................*.................................................... + add v15.8H, v15.8H, v24.8H // ................................................................................................................................................*............................................... + str q10, [x14, #144] // .............................................................................................*.................................................................................................. 
+ str q17, [x14, #80] // ..................................................*............................................................................................................................................. + sqrdmulh v24.8H, v21.8H, v0.H[1] // ...................................................................................................................*............................................................................ + mul v17.8H, v21.8H, v0.H[0] // ....................................................................................................................*........................................................................... + mul v11.8H, v11.8H, v0.H[0] // .........................................................................................................................................*...................................................... + add v28.8H, v23.8H, v20.8H // .................................................................................................................................................*.............................................. + mla v4.8H, v7.8H, v8.H[0] // .....................................................................*.......................................................................................................................... + mla v30.8H, v9.8H, v8.H[0] // .....................................................................................*.......................................................................................................... + ldr q9, [x6, #192] // ......................................................................................................*......................................................................................... 
+ mla v17.8H, v24.8H, v8.H[0] // .........................................................................................................................*...................................................................... + sub v21.8H, v16.8H, v25.8H // ...........................................................................................................................*.................................................................... + add v25.8H, v16.8H, v25.8H // ..........................................................................................................................................*..................................................... + add v7.8H, v31.8H, v5.8H // ........................................................*....................................................................................................................................... + add v23.8H, v14.8H, v4.8H // ..............................................................................*................................................................................................................. + add v16.8H, v19.8H, v29.8H // ..............................................................................................................................................*................................................. + str q7, [x14, #48] // ..............................................................*................................................................................................................................. + add v31.8H, v9.8H, v17.8H // ...............................................................................................................................*................................................................ 
+ sqrdmulh v20.8H, v13.8H, v0.H[1] // .................................................................................................................................*.............................................................. + sqrdmulh v7.8H, v16.8H, v0.H[3] // ...................................................................................................................................................*............................................ + str q23, [x6, #48] // ...................................................................................*............................................................................................................ + sub v5.8H, v27.8H, v30.8H // ...........................................................................................*.................................................................................................... + mul v18.8H, v31.8H, v0.H[2] // ............................................................................................................................................*................................................... + sqrdmulh v31.8H, v31.8H, v0.H[3] // .............................................................................................................................................*.................................................. + ldr q10, [x14, #0] // .................................................................................................*.............................................................................................. + mla v11.8H, v26.8H, v8.H[0] // .................................................................................................................................................................*.............................. 
+ add v27.8H, v27.8H, v30.8H // ............................................................................................*................................................................................................... + str q5, [x6, #144] // ..............................................................................................*................................................................................................. + mul v30.8H, v13.8H, v0.H[0] // ..............................................................................................................................*................................................................. + ldr q13, [x6, #128] // .............................................................................................................*.................................................................................. + sqrdmulh v5.8H, v22.8H, v0.H[5] // .......................................................................................................................................................................*........................ + mla v18.8H, v31.8H, v8.H[0] // ..................................................................................................................................................................*............................. + str q27, [x6, #112] // ...............................................................................................*................................................................................................ + sub v27.8H, v9.8H, v17.8H // ................................................................................................................................*............................................................... 
+ sub v31.8H, v14.8H, v4.8H // ...........................................................................*.................................................................................................................... + mul v4.8H, v22.8H, v0.H[4] // ..............................................................................................................................................................................*................. + mul v14.8H, v10.8H, v0.H[0] // .........................................................................................................*...................................................................................... + sqrdmulh v23.8H, v28.8H, v0.H[3] // ...............................................................................................................................................................*................................ + add v17.8H, v15.8H, v18.8H // .................................................................................................................................................................................*.............. + sub v18.8H, v15.8H, v18.8H // ..................................................................................................................................................................................*............. + str q31, [x6, #80] // ................................................................................*............................................................................................................... + mul v15.8H, v28.8H, v0.H[2] // ................................................................................................................................................................................*............... 
+ sub v28.8H, v19.8H, v29.8H // ...............................................................................................................................................*................................................ + mla v30.8H, v20.8H, v8.H[0] // ......................................................................................................................................*......................................................... + mul v31.8H, v18.8H, v1.H[0] // .....................................................................................................................................................................................*.......... + mul v16.8H, v16.8H, v0.H[2] // ..............................................................................................................................................................*................................. + sqrdmulh v26.8H, v28.8H, v0.H[5] // ............................................................................................................................................................*................................... + mla v4.8H, v5.8H, v8.H[0] // ...................................................................................................................................................................................*............ + sqrdmulh v5.8H, v10.8H, v0.H[1] // ..........................................................................................................*..................................................................................... + sqrdmulh v22.8H, v18.8H, v1.H[1] // .......................................................................................................................................................................................*........ 
+ sub v20.8H, v6.8H, v11.8H // ...........................................................................................................................................................................*.................... + mla v16.8H, v7.8H, v8.H[0] // ....................................................................................................................................................................*........................... + add v24.8H, v6.8H, v11.8H // ......................................................................................................................................................................*......................... + mla v14.8H, v5.8H, v8.H[0] // .......................................................................................................................*........................................................................ + mul v6.8H, v28.8H, v0.H[4] // .............................................................................................................................................................*.................................. + sub v10.8H, v13.8H, v30.8H // .........................................................................................................................................................*...................................... + add v13.8H, v13.8H, v30.8H // ........................................................................................................................................................*....................................... + add v11.8H, v25.8H, v16.8H // .............................................................................................................................................................................*.................. 
+ mla v15.8H, v23.8H, v8.H[0] // ......................................................................................................................................................................................*......... + sub v18.8H, v25.8H, v16.8H // ............................................................................................................................................................................*................... + mla v6.8H, v26.8H, v8.H[0] // ...................................................................................................................................................................*............................ + ldr q25, [x6, #0] // .......................................................................................................................................*........................................................ + mul v16.8H, v27.8H, v0.H[4] // .......................................................................................................................................................*........................................ + mul v29.8H, v13.8H, v0.H[2] // ..........................................................................................................................................................*..................................... + sqrdmulh v26.8H, v27.8H, v0.H[5] // ......................................................................................................................................................*......................................... + sub v23.8H, v20.8H, v4.8H // .........................................................................................................................................................................................*...... 
+ add v9.8H, v21.8H, v6.8H // ........................................................................................................................................................................*....................... + mul v19.8H, v10.8H, v0.H[4] // .....................................................................................................................................................................*.......................... + sqrdmulh v5.8H, v10.8H, v0.H[5] // ..........................................................................................................................................................................*..................... + mla v16.8H, v26.8H, v8.H[0] // ................................................................................................................................................................*............................... + sqrdmulh v13.8H, v13.8H, v0.H[3] // ...........................................................................................................................................................*.................................... + sub v26.8H, v24.8H, v15.8H // ...........................................................................................................................................................................................*.... + sub v28.8H, v21.8H, v6.8H // .........................................................................................................................................................................*...................... + mla v19.8H, v5.8H, v8.H[0] // ....................................................................................................................................................................................*........... 
+ sub v5.8H, v25.8H, v14.8H // .....................................................................................................................................................*.......................................... + mla v29.8H, v13.8H, v8.H[0] // ...............................................................................................................................................................................*................ + sub v27.8H, v12.8H, v16.8H // ........................................................................................................................................................................................*....... + add v7.8H, v24.8H, v15.8H // .............................................................................................................................................................................................*.. + add v15.8H, v25.8H, v14.8H // ....................................................................................................................................................*........................................... + add v30.8H, v5.8H, v19.8H // ..........................................................................................................................................................................................*..... + sub v10.8H, v5.8H, v19.8H // ............................................................................................................................................................................................*... + mul v19.8H, v23.8H, v1.H[4] // ..............................................................................................................................................................................................*. 
+ mul v25.8H, v26.8H, v1.H[0] // ...............................................................................................................................................................................................* + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // add v20.8H, v20.8H, v4.8H // .........*...................................................................................................................................................................................... + // sqrdmulh v6.8H, v26.8H, v1.H[1] // ..*............................................................................................................................................................................................. + // mul v21.8H, v7.8H, v0.H[6] // ........*....................................................................................................................................................................................... + // sqrdmulh v7.8H, v7.8H, v0.H[7] // .......*........................................................................................................................................................................................ + // mul v14.8H, v20.8H, v1.H[2] // .............*.................................................................................................................................................................................. 
+ // sqrdmulh v20.8H, v20.8H, v1.H[3] // ............*................................................................................................................................................................................... + // mul v5.8H, v17.8H, v0.H[6] // ..............................*................................................................................................................................................................. + // mla v25.8H, v6.8H, v8.H[0] // ...........*.................................................................................................................................................................................... + // sqrdmulh v6.8H, v17.8H, v0.H[7] // ..........................*..................................................................................................................................................................... + // mla v21.8H, v7.8H, v8.H[0] // ................*............................................................................................................................................................................... + // mla v31.8H, v22.8H, v8.H[0] // .*.............................................................................................................................................................................................. + // mla v14.8H, v20.8H, v8.H[0] // ............................*................................................................................................................................................................... + // add v17.8H, v15.8H, v29.8H // *............................................................................................................................................................................................... 
+ // sub v24.8H, v18.8H, v25.8H // .................*.............................................................................................................................................................................. + // mla v5.8H, v6.8H, v8.H[0] // ...................................*............................................................................................................................................................ + // add v20.8H, v11.8H, v21.8H // ...........................*.................................................................................................................................................................... + // sqrdmulh v22.8H, v24.8H, v2.H[5] // .....................................*.......................................................................................................................................................... + // add v13.8H, v9.8H, v14.8H // .................................*.............................................................................................................................................................. + // sqrdmulh v6.8H, v20.8H, v1.H[7] // ...............................................*................................................................................................................................................ + // mul v7.8H, v20.8H, v1.H[6] // .................................................*.............................................................................................................................................. + // mul v26.8H, v13.8H, v2.H[6] // ........................................*....................................................................................................................................................... 
+ // mul v4.8H, v24.8H, v2.H[4] // ........................................................*....................................................................................................................................... + // sqrdmulh v20.8H, v13.8H, v2.H[7] // .........................................*...................................................................................................................................................... + // sub v9.8H, v9.8H, v14.8H // ..................................*............................................................................................................................................................. + // mla v7.8H, v6.8H, v8.H[0] // .......................................................*........................................................................................................................................ + // add v6.8H, v12.8H, v16.8H // ...............*................................................................................................................................................................................ + // mul v24.8H, v9.8H, v3.H[0] // .......................................*........................................................................................................................................................ + // add v16.8H, v17.8H, v5.8H // ..................................................*............................................................................................................................................. + // mul v14.8H, v6.8H, v1.H[2] // .......................*........................................................................................................................................................................ 
+ // sqrdmulh v12.8H, v6.8H, v1.H[3] // ...................*............................................................................................................................................................................ + // add v6.8H, v16.8H, v7.8H // .......................................................................*........................................................................................................................ + // mla v26.8H, v20.8H, v8.H[0] // ..............................................*................................................................................................................................................. + // sqrdmulh v20.8H, v9.8H, v3.H[1] // ......................................*......................................................................................................................................................... + // sub v29.8H, v15.8H, v29.8H // ...*............................................................................................................................................................................................ + // str q6, [x6], #16 // ............................................................................*................................................................................................................... + // mla v4.8H, v22.8H, v8.H[0] // ...................................................................................*............................................................................................................ + // mla v14.8H, v12.8H, v8.H[0] // .............................*.................................................................................................................................................................. 
+ // sub v15.8H, v29.8H, v31.8H // ....................*........................................................................................................................................................................... + // sub v11.8H, v11.8H, v21.8H // ........................*....................................................................................................................................................................... + // mla v24.8H, v20.8H, v8.H[0] // ................................................................................................*............................................................................................... + // sqrdmulh v23.8H, v23.8H, v1.H[5] // ..........*..................................................................................................................................................................................... + // sub v6.8H, v15.8H, v4.8H // ...........................................................................................*.................................................................................................... + // sub v12.8H, v30.8H, v14.8H // ............................................*................................................................................................................................................... + // mul v21.8H, v11.8H, v2.H[0] // ......................................................................................................*......................................................................................... + // add v9.8H, v30.8H, v14.8H // .............................................*.................................................................................................................................................. 
+ // sub v22.8H, v12.8H, v24.8H // ...........................................................................................................*.................................................................................... + // str q6, [x6, #208] // .................................................................................................*.............................................................................................. + // mla v19.8H, v23.8H, v8.H[0] // .........................*...................................................................................................................................................................... + // sub v20.8H, v9.8H, v26.8H // ......................................................*......................................................................................................................................... + // sub v6.8H, v16.8H, v7.8H // ..........................................................................*..................................................................................................................... + // str q22, [x14, #96] // ...............................................................................................................*................................................................................ + // sqrdmulh v7.8H, v27.8H, v1.H[5] // ....*........................................................................................................................................................................................... + // add v16.8H, v9.8H, v26.8H // .....................................................*.......................................................................................................................................... 
+ // str q6, [x6, #16] // ...............................................................................*................................................................................................................ + // sub v22.8H, v28.8H, v19.8H // ................................*............................................................................................................................................................... + // mul v27.8H, v27.8H, v1.H[4] // .....*.......................................................................................................................................................................................... + // add v6.8H, v12.8H, v24.8H // ..........................................................................................................................*..................................................................... + // str q16, [x14], #16 // ..........................................................*..................................................................................................................................... + // add v26.8H, v28.8H, v19.8H // ...............................*................................................................................................................................................................ + // str q20, [x14, #16] // ................................................................*............................................................................................................................... + // sqrdmulh v16.8H, v22.8H, v3.H[5] // ...........................................*.................................................................................................................................................... 
+ // mul v19.8H, v22.8H, v3.H[4] // ..........................................*..................................................................................................................................................... + // str q6, [x14, #48] // .............................................................................................................................*.................................................................. + // mla v27.8H, v7.8H, v8.H[0] // ..............*................................................................................................................................................................................. + // sqrdmulh v22.8H, v11.8H, v2.H[1] // ............................................................................................*................................................................................................... + // mul v20.8H, v26.8H, v3.H[2] // ......................................................................................*......................................................................................................... + // add v23.8H, v18.8H, v25.8H // ....................................*........................................................................................................................................................... + // mla v19.8H, v16.8H, v8.H[0] // ..................................................................*............................................................................................................................. + // sub v16.8H, v17.8H, v5.8H // ................................................*............................................................................................................................................... 
+ // mla v21.8H, v22.8H, v8.H[0] // ....................................................................................................................*........................................................................... + // sub v6.8H, v10.8H, v27.8H // ......................*......................................................................................................................................................................... + // sqrdmulh v30.8H, v23.8H, v2.H[3] // ....................................................................*........................................................................................................................... + // sqrdmulh v9.8H, v26.8H, v3.H[3] // ....................................................................................*........................................................................................................... + // sub v22.8H, v6.8H, v19.8H // .............................................................................*.................................................................................................................. + // add v18.8H, v6.8H, v19.8H // ..............................................................................*................................................................................................................. + // sub v12.8H, v16.8H, v21.8H // ...............................................................................................................................................*................................................ + // mul v26.8H, v23.8H, v2.H[2] // ..............................................................*................................................................................................................................. 
+ // str q18, [x14, #176] // .....................................................................................*.......................................................................................................... + // add v6.8H, v16.8H, v21.8H // ...........................................................................................................................*.................................................................... + // mla v20.8H, v9.8H, v8.H[0] // ...............................................................................................*................................................................................................ + // str q12, [x6, #80] // .....................................................................................................................................................*.......................................... + // add v18.8H, v15.8H, v4.8H // .............................................................................................*.................................................................................................. + // add v16.8H, v10.8H, v27.8H // .....................*.......................................................................................................................................................................... + // str q6, [x6, #48] // .................................................................................................................................*.............................................................. + // add v19.8H, v29.8H, v31.8H // ......*......................................................................................................................................................................................... 
+ // mla v26.8H, v30.8H, v8.H[0] // .....................................................................................................................*.......................................................................... + // str q18, [x6, #176] // .....................................................................................................*.......................................................................................... + // add v6.8H, v16.8H, v20.8H // ....................................................................................................*........................................................................................... + // str q22, [x14, #208] // ........................................................................................*....................................................................................................... + // sub v16.8H, v16.8H, v20.8H // .......................................................................................................*........................................................................................ + // str q6, [x14, #112] // .........................................................................................................*...................................................................................... + // sub v6.8H, v19.8H, v26.8H // ..................................................................................................................................*............................................................. + // add v18.8H, v19.8H, v26.8H // .......................................................................................................................................*........................................................ 
+ // str q16, [x14, #144] // ..............................................................................................................*................................................................................. + // str q6, [x6, #144] // ........................................................................................................................................*....................................................... + // str q18, [x6, #112] // .............................................................................................................................................*.................................................. + // ldr q28, [x6, #32] // ...................................................................................................*............................................................................................ + // ldr q6, [x14, #0] // .....................................................................................................................................*.......................................................... + // ldr q26, [x14, #128] // ...................................................*............................................................................................................................................ + // ldr q22, [x14, #32] // .........................................................*...................................................................................................................................... + // ldr q29, [x14, #64] // ...........................................................*.................................................................................................................................... + // ldr q16, [x14, #160] // ..................*............................................................................................................................................................................. 
+ // ldr q12, [x6, #192] // ......................................................................................................................*......................................................................... + // ldr q19, [x14, #224] // ........................................................................*....................................................................................................................... + // ldr q7, [x6, #160] // ...................................................................*............................................................................................................................ + // mul v9.8H, v6.8H, v0.H[0] // .................................................................................................................................................*.............................................. + // sqrdmulh v6.8H, v6.8H, v0.H[1] // .............................................................................................................................................................*.................................. + // ldr q4, [x14, #192] // ...........................................................................*.................................................................................................................... + // sqrdmulh v15.8H, v22.8H, v0.H[1] // ............................................................*................................................................................................................................... + // ldr q11, [x6, #128] // ..........................................................................................................................................*..................................................... 
+ // mul v22.8H, v22.8H, v0.H[0] // .............................................................*.................................................................................................................................. + // ldr q17, [x6, #64] // .....................................................................*.......................................................................................................................... + // sqrdmulh v31.8H, v16.8H, v0.H[1] // ...............................................................*................................................................................................................................ + // ldr q18, [x14, #96] // ................................................................................*............................................................................................................... + // ldr q21, [x6, #96] // ..............................................................................................*................................................................................................. + // sqrdmulh v20.8H, v4.8H, v0.H[1] // ................................................................................................................*............................................................................... + // mul v13.8H, v4.8H, v0.H[0] // .................................................................................................................*.............................................................................. + // mla v22.8H, v15.8H, v8.H[0] // ......................................................................*......................................................................................................................... 
+ // sqrdmulh v25.8H, v19.8H, v0.H[1] // .......................................................................................*........................................................................................................ + // mla v9.8H, v6.8H, v8.H[0] // ..................................................................................................................................................................*............................. + // sqrdmulh v5.8H, v29.8H, v0.H[1] // ..................................................................................*............................................................................................................. + // mla v13.8H, v20.8H, v8.H[0] // .......................................................................................................................*........................................................................ + // mul v14.8H, v16.8H, v0.H[0] // ....................................................*........................................................................................................................................... + // sub v6.8H, v28.8H, v22.8H // ........................................................................................................................*....................................................................... + // mul v30.8H, v19.8H, v0.H[0] // ..........................................................................................*..................................................................................................... + // mul v27.8H, v29.8H, v0.H[0] // .................................................................................*.............................................................................................................. 
+ // mul v24.8H, v26.8H, v0.H[0] // .........................................................................................................................................*...................................................... + // add v29.8H, v12.8H, v13.8H // ..............................................................................................................................*................................................................. + // sub v16.8H, v12.8H, v13.8H // ..............................................................................................................................................*................................................. + // sqrdmulh v12.8H, v26.8H, v0.H[1] // ...............................................................................................................................*................................................................ + // mla v30.8H, v25.8H, v8.H[0] // ..................................................................................................*............................................................................................. + // ldr q25, [x6, #224] // .................................................................*.............................................................................................................................. + // mla v14.8H, v31.8H, v8.H[0] // .........................................................................*...................................................................................................................... + // mla v27.8H, v5.8H, v8.H[0] // .........................................................................................*...................................................................................................... 
+ // mla v24.8H, v12.8H, v8.H[0] // ........................................................................................................................................................*....................................... + // ldr q19, [x6, #0] // ..........................................................................................................................................................................*..................... + // sqrdmulh v20.8H, v18.8H, v0.H[1] // ........................................................................................................*....................................................................................... + // mul v18.8H, v18.8H, v0.H[0] // ..................................................................................................................*............................................................................. + // add v22.8H, v28.8H, v22.8H // .........................................................................................................................*...................................................................... + // sub v12.8H, v17.8H, v27.8H // ............................................................................................................*................................................................................... + // mul v31.8H, v29.8H, v0.H[2] // ...................................................................................................................................*............................................................ + // sqrdmulh v4.8H, v29.8H, v0.H[3] // ....................................................................................................................................*........................................................... 
+ // add v26.8H, v7.8H, v14.8H // ............................................................................................................................*................................................................... + // sub v7.8H, v7.8H, v14.8H // .......................................................................................................................................................*........................................ + // add v27.8H, v17.8H, v27.8H // .............................................................................................................*.................................................................................. + // add v17.8H, v25.8H, v30.8H // ...................................................................................................................*............................................................................ + // sub v30.8H, v25.8H, v30.8H // ..........................................................................................................*..................................................................................... + // sqrdmulh v25.8H, v26.8H, v0.H[3] // ................................................................................................................................*............................................................... + // add v15.8H, v19.8H, v9.8H // ...........................................................................................................................................................................................*.... + // sub v19.8H, v19.8H, v9.8H // .......................................................................................................................................................................................*........ 
+ // sqrdmulh v9.8H, v16.8H, v0.H[5] // .............................................................................................................................................................................*.................. + // mul v16.8H, v16.8H, v0.H[4] // ...........................................................................................................................................................................*.................... + // add v5.8H, v11.8H, v24.8H // .....................................................................................................................................................................*.......................... + // sub v11.8H, v11.8H, v24.8H // ....................................................................................................................................................................*........................... + // mul v29.8H, v5.8H, v0.H[2] // ............................................................................................................................................................................*................... + // sqrdmulh v24.8H, v5.8H, v0.H[3] // ...................................................................................................................................................................................*............ + // sqrdmulh v10.8H, v7.8H, v0.H[5] // ...........................................................................................................................................................*.................................... + // mul v28.8H, v7.8H, v0.H[4] // ...................................................................................................................................................................*............................ 
+ // mul v13.8H, v26.8H, v0.H[2] // ..........................................................................................................................................................*..................................... + // sqrdmulh v23.8H, v17.8H, v0.H[3] // ..................................................................................................................................................*............................................. + // mla v16.8H, v9.8H, v8.H[0] // ..................................................................................................................................................................................*............. + // mla v18.8H, v20.8H, v8.H[0] // ......................................................................................................................................*......................................................... + // mla v31.8H, v4.8H, v8.H[0] // ............................................................................................................................................*................................................... + // mla v28.8H, v10.8H, v8.H[0] // .........................................................................................................................................................................*...................... + // mla v13.8H, v25.8H, v8.H[0] // ................................................................................................................................................................*............................... + // mul v14.8H, v11.8H, v0.H[4] // ................................................................................................................................................................................*............... 
+ // add v5.8H, v21.8H, v18.8H // .................................................................................................................................................................*.............................. + // sqrdmulh v7.8H, v30.8H, v0.H[5] // ...........................................................................................................................................*.................................................... + // add v9.8H, v6.8H, v28.8H // ...............................................................................................................................................................................*................ + // sub v28.8H, v6.8H, v28.8H // .....................................................................................................................................................................................*.......... + // sqrdmulh v6.8H, v11.8H, v0.H[5] // .................................................................................................................................................................................*.............. + // sub v20.8H, v21.8H, v18.8H // ...............................................................................................................................................................*................................ + // sub v18.8H, v22.8H, v13.8H // ........................................................................................................................................................................*....................... + // add v11.8H, v22.8H, v13.8H // ......................................................................................................................................................................*......................... 
+ // mul v4.8H, v30.8H, v0.H[4] // ................................................................................................................................................*............................................... + // mla v29.8H, v24.8H, v8.H[0] // ........................................................................................................................................................................................*....... + // mul v24.8H, v17.8H, v0.H[2] // ......................................................................................................................................................*......................................... + // add v17.8H, v27.8H, v31.8H // ...................................................................................................................................................*............................................ + // sub v26.8H, v27.8H, v31.8H // ....................................................................................................................................................*........................................... + // mla v4.8H, v7.8H, v8.H[0] // ............................................................................................................................................................*................................... + // mla v14.8H, v6.8H, v8.H[0] // ......................................................................................................................................................................................*......... + // mul v31.8H, v26.8H, v1.H[0] // .........................................................................................................................................................*...................................... 
+ // mla v24.8H, v23.8H, v8.H[0] // .......................................................................................................................................................................*........................ + // sqrdmulh v22.8H, v26.8H, v1.H[1] // ..............................................................................................................................................................*................................. + // sub v27.8H, v12.8H, v16.8H // .........................................................................................................................................................................................*...... + // sub v23.8H, v20.8H, v4.8H // ..............................................................................................................................................................................*................. + // add v30.8H, v19.8H, v14.8H // ............................................................................................................................................................................................*... + // sub v26.8H, v5.8H, v24.8H // ....................................................................................................................................................................................*........... + // sub v10.8H, v19.8H, v14.8H // .............................................................................................................................................................................................*.. + // add v7.8H, v5.8H, v24.8H // ..........................................................................................................................................................................................*..... 
+ // mul v19.8H, v23.8H, v1.H[4] // ..............................................................................................................................................................................................*. + // mul v25.8H, v26.8H, v1.H[0] // ...............................................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer1234_start + add v20.8H, v20.8H, v4.8H // ...............................................................................................*................................................................................................ + sqrdmulh v6.8H, v26.8H, v1.H[1] // ...............................................................................................................*................................................................................ + mul v21.8H, v7.8H, v0.H[6] // ......................................................................................................*......................................................................................... + sqrdmulh v7.8H, v7.8H, v0.H[7] // .....................................................................................................*.......................................................................................... + mul v14.8H, v20.8H, v1.H[2] // ..........................................................................................................................*..................................................................... + sqrdmulh v20.8H, v20.8H, v1.H[3] // .........................................................................................................................*...................................................................... 
+ mul v5.8H, v17.8H, v0.H[6] // .................................................................................................*.............................................................................................. + mla v25.8H, v6.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sqrdmulh v6.8H, v17.8H, v0.H[7] // ................................................................................................*............................................................................................... + mla v21.8H, v7.8H, v8.H[0] // .......................................................................................................*........................................................................................ + mla v31.8H, v22.8H, v8.H[0] // ............................................................................................................*................................................................................... + mla v14.8H, v20.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + add v17.8H, v15.8H, v29.8H // ............................................................*................................................................................................................................... + sub v24.8H, v18.8H, v25.8H // ..................................................................................................................*............................................................................. 
+ mla v5.8H, v6.8H, v8.H[0] // ..................................................................................................*............................................................................................. + add v20.8H, v11.8H, v21.8H // .........................................................................................................*...................................................................................... + sqrdmulh v22.8H, v24.8H, v2.H[5] // .......................................................................................................................................................*........................................ + add v13.8H, v9.8H, v14.8H // .............................................................................................................................*.................................................................. + sqrdmulh v6.8H, v20.8H, v1.H[7] // ........................................................................................................................................*....................................................... + mul v7.8H, v20.8H, v1.H[6] // .........................................................................................................................................*...................................................... + mul v26.8H, v13.8H, v2.H[6] // .............................................................................................................................................................*.................................. + mul v4.8H, v24.8H, v2.H[4] // ........................................................................................................................................................*....................................... 
+ sqrdmulh v20.8H, v13.8H, v2.H[7] // ............................................................................................................................................................*................................... + sub v9.8H, v9.8H, v14.8H // ............................................................................................................................*................................................................... + mla v7.8H, v6.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + add v6.8H, v12.8H, v16.8H // ..........................................................................................*..................................................................................................... + mul v24.8H, v9.8H, v3.H[0] // ..................................................................................................................................................................*............................. + add v16.8H, v17.8H, v5.8H // ....................................................................................................*........................................................................................... + mul v14.8H, v6.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + sqrdmulh v12.8H, v6.8H, v1.H[3] // ....................................................................................................................*........................................................................... 
+ add v6.8H, v16.8H, v7.8H // ............................................................................................................................................*................................................... + mla v26.8H, v20.8H, v8.H[0] // ..............................................................................................................................................................*................................. + sqrdmulh v20.8H, v9.8H, v3.H[1] // .................................................................................................................................................................*.............................. + sub v29.8H, v15.8H, v29.8H // ...........................................................*.................................................................................................................................... + str q6, [x6], #16 // ................................................................................................................................................................................*............... + mla v4.8H, v22.8H, v8.H[0] // .........................................................................................................................................................*...................................... + mla v14.8H, v12.8H, v8.H[0] // ......................................................................................................................*......................................................................... + sub v15.8H, v29.8H, v31.8H // .............................................................................................................*.................................................................................. 
+ sub v11.8H, v11.8H, v21.8H // ........................................................................................................*....................................................................................... + mla v24.8H, v20.8H, v8.H[0] // ...................................................................................................................................................................*............................ + sqrdmulh v23.8H, v23.8H, v1.H[5] // ...................................................................................................................................*............................................................ + sub v6.8H, v15.8H, v4.8H // ..........................................................................................................................................................*..................................... + sub v12.8H, v30.8H, v14.8H // .......................................................................................................................*........................................................................ + mul v21.8H, v11.8H, v2.H[0] // ..............................................................................................................................................*................................................. + add v9.8H, v30.8H, v14.8H // ........................................................................................................................*....................................................................... + sub v22.8H, v12.8H, v24.8H // ....................................................................................................................................................................*........................... 
+ str q6, [x6, #208] // .......................................................................................................................................................................................*........ + mla v19.8H, v23.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sub v20.8H, v9.8H, v26.8H // ...............................................................................................................................................................*................................ + sub v6.8H, v16.8H, v7.8H // ...........................................................................................................................................*.................................................... + str q22, [x14, #96] // ...........................................................................................................................................................................................*.... + sqrdmulh v7.8H, v27.8H, v1.H[5] // ..............................................................................................................................*................................................................. + add v16.8H, v9.8H, v26.8H // ................................................................................................................................................................*............................... + str q6, [x6, #16] // .................................................................................................................................................................................*.............. 
+ sub v22.8H, v28.8H, v19.8H // ......................................................................................................................................*......................................................... + mul v27.8H, v27.8H, v1.H[4] // ...............................................................................................................................*................................................................ + add v6.8H, v12.8H, v24.8H // .....................................................................................................................................................................*.......................... + str q16, [x14], #16 // ........................................................................................................................................................................................*....... + add v26.8H, v28.8H, v19.8H // .......................................................................................................................................*........................................................ + str q20, [x14, #16] // .........................................................................................................................................................................................*...... + sqrdmulh v16.8H, v22.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + mul v19.8H, v22.8H, v3.H[4] // ............................................................................................................................................................................*................... 
+ str q6, [x14, #48] // ..........................................................................................................................................................................................*..... + mla v27.8H, v7.8H, v8.H[0] // ................................................................................................................................*............................................................... + sqrdmulh v22.8H, v11.8H, v2.H[1] // .............................................................................................................................................*.................................................. + mul v20.8H, v26.8H, v3.H[2] // .......................................................................................................................................................................*........................ + add v23.8H, v18.8H, v25.8H // ...................................................................................................................*............................................................................ + mla v19.8H, v16.8H, v8.H[0] // .............................................................................................................................................................................*.................. + sub v16.8H, v17.8H, v5.8H // ...................................................................................................*............................................................................................ + mla v21.8H, v22.8H, v8.H[0] // ...............................................................................................................................................*................................................ 
+ sub v6.8H, v10.8H, v27.8H // .................................................................................................................................*.............................................................. + sqrdmulh v30.8H, v23.8H, v2.H[3] // ..................................................................................................................................................*............................................. + sqrdmulh v9.8H, v26.8H, v3.H[3] // ......................................................................................................................................................................*......................... + sub v22.8H, v6.8H, v19.8H // ..............................................................................................................................................................................*................. + add v18.8H, v6.8H, v19.8H // ...............................................................................................................................................................................*................ + sub v12.8H, v16.8H, v21.8H // ................................................................................................................................................*............................................... + mul v26.8H, v23.8H, v2.H[2] // ...................................................................................................................................................*............................................ + str q18, [x14, #176] // ..............................................................................................................................................................................................*. 
+ add v6.8H, v16.8H, v21.8H // .................................................................................................................................................*.............................................. + mla v20.8H, v9.8H, v8.H[0] // ........................................................................................................................................................................*....................... + str q12, [x6, #80] // ...................................................................................................................................................................................*............ + add v18.8H, v15.8H, v4.8H // ...........................................................................................................................................................*.................................... + add v16.8H, v10.8H, v27.8H // ..................................................................................................................................*............................................................. + str q6, [x6, #48] // ..................................................................................................................................................................................*............. + add v19.8H, v29.8H, v31.8H // ..............................................................................................................*................................................................................. + mla v26.8H, v30.8H, v8.H[0] // ....................................................................................................................................................*........................................... + str q18, [x6, #176] // ......................................................................................................................................................................................*......... 
+ add v6.8H, v16.8H, v20.8H // ..........................................................................................................................................................................*..................... + str q22, [x14, #208] // ...............................................................................................................................................................................................* + sub v16.8H, v16.8H, v20.8H // .........................................................................................................................................................................*...................... + str q6, [x14, #112] // ............................................................................................................................................................................................*... + sub v6.8H, v19.8H, v26.8H // .....................................................................................................................................................*.......................................... + add v18.8H, v19.8H, v26.8H // ......................................................................................................................................................*......................................... + str q16, [x14, #144] // .............................................................................................................................................................................................*.. + str q6, [x6, #144] // .....................................................................................................................................................................................*.......... + str q18, [x6, #112] // ....................................................................................................................................................................................*........... 
+ + restore inp, STACK0 + mov count, #4 + + ASM_LOAD(r_ptr1, roots_l456) + + add src0, inp, #256*0 + add src1, inp, #256*1 + + .p2align 2 + // Instructions: 2 + // Expected cycles: 1 + // Expected IPC: 2.00 + // + // Cycle bound: 1.0 + // IPC bound: 2.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q26, [x4, #112] // *............................. + ldr q31, [x4, #16] // .*............................ + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q26, [x4, #112] // *.............................. + // ldr q31, [x4, #16] // .*............................. + + sub count, count, #1 +layer567_start: + // Instructions: 140 + // Expected cycles: 70 + // Expected IPC: 2.00 + // + // Cycle bound: 58.0 + // IPC bound: 2.41 + // + // Wall time: 3611.65s + // User time: 3611.65s + // + // ------------------------------------------------------------ original position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + ldr q0, [x4, #32] // ............*............................................................................................................................... + ldr q14, [x4, #176] // .....................*...................................................................................................................... + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x6] // *........................................................................................................................................... 
+ ld4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x7] // .*.......................................................................................................................................... + ldr q1, [x4, #96] // ................*........................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + ldr q25, [x4, #144] // ...................*........................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + ldr q9, [x4, #80] // ...............*............................................................................................................................ + ldr q30, [x4, #208] // .......................*.................................................................................................................... + ldr q7, [x4, #192] // ......................*..................................................................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + ldr q11, [x4, #48] // .............*.............................................................................................................................. + ldr q6, [x4], #16*14 // ..........*................................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v22.4S, v21.4S, v5.4S // .........*.................................................................................................................................. + trn1 v13.4S, v19.4S, v3.4S // ....*....................................................................................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + trn2 v3.4S, v19.4S, v3.4S // .....*...................................................................................................................................... + trn2 v19.4S, v20.4S, v4.4S // .......*.................................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v27.4S, v18.4S, v2.4S // ...*........................................................................................................................................ + trn1 v12.4S, v20.4S, v4.4S // ......*..................................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v23.8H, v3.8H, v6.8H // ..............................*............................................................................................................. + mul v16.8H, v19.8H, v6.8H // ...................................*........................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + sqrdmulh v19.8H, v19.8H, v31.8H // ..................................*......................................................................................................... + mul v4.8H, v27.8H, v6.8H // .........................*.................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v6.8H, v22.8H, v6.8H // ........................................*................................................................................................... + sqrdmulh v22.8H, v22.8H, v31.8H // .......................................*.................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v20.8H, v3.8H, v31.8H // .............................*.............................................................................................................. + sqrdmulh v17.8H, v27.8H, v31.8H // ........................*................................................................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + trn1 v31.4S, v21.4S, v5.4S // ........*................................................................................................................................... + mla v16.8H, v19.8H, v8.H[0] // ....................................*....................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v6.8H, v22.8H, v8.H[0] // .........................................*.................................................................................................. + trn1 v18.4S, v18.4S, v2.4S // ..*......................................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v23.8H, v20.8H, v8.H[0] // ...............................*............................................................................................................ + mla v4.8H, v17.8H, v8.H[0] // ..........................*................................................................................................................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + add v20.8H, v12.8H, v16.8H // ......................................*..................................................................................................... + sub v21.8H, v12.8H, v16.8H // .....................................*...................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v22.8H, v31.8H, v6.8H // ...........................................*................................................................................................ + sub v6.8H, v31.8H, v6.8H // ..........................................*................................................................................................. + // gap // ............................................................................................................................................ + ldr q12, [x4, #-160] // ..............*............................................................................................................................. + sub v27.8H, v18.8H, v4.8H // ...........................*................................................................................................................ + sub v19.8H, v13.8H, v23.8H // ................................*........................................................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + mul v15.8H, v22.8H, v0.8H // ..................................................*......................................................................................... + sqrdmulh v3.8H, v22.8H, v11.8H // .................................................*.......................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v31.8H, v21.8H, v9.8H // ......................................................*..................................................................................... + mul v16.8H, v20.8H, v0.8H // .............................................*.............................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v20.8H, v20.8H, v11.8H // ............................................*............................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ mul v10.8H, v21.8H, v12.8H // .......................................................*.................................................................................... + sqrdmulh v22.8H, v6.8H, v9.8H // ...........................................................*................................................................................ + mla v15.8H, v3.8H, v8.H[0] // ...................................................*........................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v6.8H, v6.8H, v12.8H // ............................................................*............................................................................... + add v9.8H, v13.8H, v23.8H // .................................*.......................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v10.8H, v31.8H, v8.H[0] // ........................................................*................................................................................... + mla v16.8H, v20.8H, v8.H[0] // ..............................................*............................................................................................. + // gap // ............................................................................................................................................ 
+ ldr q21, [x4, #-96] // ..................*......................................................................................................................... + add v31.8H, v9.8H, v15.8H // .....................................................*...................................................................................... + sub v12.8H, v9.8H, v15.8H // ....................................................*....................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v6.8H, v22.8H, v8.H[0] // .............................................................*.............................................................................. + add v18.8H, v18.8H, v4.8H // ............................*............................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v20.8H, v12.8H, v25.8H // .....................................................................*...................................................................... + mul v22.8H, v31.8H, v1.8H // .................................................................*.......................................................................... + // gap // ............................................................................................................................................ 
+ ldr q25, [x4, #-64] // ....................*....................................................................................................................... + mul v2.8H, v12.8H, v21.8H // ......................................................................*..................................................................... + sub v17.8H, v18.8H, v16.8H // ...............................................*............................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v3.8H, v31.8H, v26.8H // ................................................................*........................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v26.8H, v19.8H, v6.8H // ...............................................................*............................................................................ + add v18.8H, v18.8H, v16.8H // ................................................*........................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sub v6.8H, v19.8H, v6.8H // ..............................................................*............................................................................. + mul v16.8H, v26.8H, v25.8H // ...........................................................................*................................................................ + sqrdmulh v15.8H, v26.8H, v14.8H // ..........................................................................*................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v26.8H, v6.8H, v7.8H // ................................................................................*........................................................... + sqrdmulh v6.8H, v6.8H, v30.8H // ...............................................................................*............................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v12.8H, v27.8H, v10.8H // ..........................................................*................................................................................. 
+ mla v22.8H, v3.8H, v8.H[0] // ..................................................................*......................................................................... + mla v16.8H, v15.8H, v8.H[0] // ............................................................................*............................................................... + mla v2.8H, v20.8H, v8.H[0] // .......................................................................*.................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v26.8H, v6.8H, v8.H[0] // .................................................................................*.......................................................... + sub v9.8H, v27.8H, v10.8H // .........................................................*.................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v30.8H, v18.8H, v22.8H // ...................................................................*........................................................................ + add v18.8H, v18.8H, v22.8H // ....................................................................*....................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + sub v13.8H, v12.8H, v16.8H // .............................................................................*.............................................................. + add v16.8H, v12.8H, v16.8H // ..............................................................................*............................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v27.8H, v17.8H, v2.8H // ........................................................................*................................................................... + add v7.8H, v17.8H, v2.8H // .........................................................................*.................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v22.8H, v18.8H, v8.H[1] // ....................................................................................*....................................................... 
+ sqdmulh v19.8H, v16.8H, v8.H[1] // ................................................................................................*........................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v3.8H, v30.8H, v8.H[1] // .......................................................................................*.................................................... + sqdmulh v6.8H, v27.8H, v8.H[1] // .............................................................................................*.............................................. + add v2.8H, v9.8H, v26.8H // ...................................................................................*........................................................ + sub v4.8H, v9.8H, v26.8H // ..................................................................................*......................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v26.8H, v19.8H, #11 // .................................................................................................*.......................................... + srshr v9.8H, v22.8H, #11 // .....................................................................................*...................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + sqdmulh v22.8H, v13.8H, v8.H[1] // ...................................................................................................*........................................ + srshr v19.8H, v6.8H, #11 // ..............................................................................................*............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v6.8H, v7.8H, v8.H[1] // ..........................................................................................*................................................. + srshr v3.8H, v3.8H, #11 // ........................................................................................*................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v16.8H, v26.8H, v8.H[0] // ..................................................................................................*......................................... + mla v18.8H, v9.8H, v8.H[0] // ......................................................................................*..................................................... + ldr q26, [x4, #112] // .................e.......................................................................................................................... 
+ // gap // ............................................................................................................................................ + srshr v20.8H, v22.8H, #11 // ....................................................................................................*....................................... + sqdmulh v22.8H, v4.8H, v8.H[1] // .........................................................................................................*.................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v27.8H, v19.8H, v8.H[0] // ...............................................................................................*............................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqdmulh v19.8H, v2.8H, v8.H[1] // ......................................................................................................*..................................... + trn1 v15.4S, v18.4S, v16.4S // ............................................................................................................*............................... + trn2 v18.4S, v18.4S, v16.4S // .............................................................................................................*.............................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + srshr v16.8H, v22.8H, #11 // ..........................................................................................................*................................. + srshr v22.8H, v6.8H, #11 // ...........................................................................................*................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + srshr v6.8H, v19.8H, #11 // .......................................................................................................*.................................... + mla v30.8H, v3.8H, v8.H[0] // .........................................................................................*.................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v13.8H, v20.8H, v8.H[0] // .....................................................................................................*...................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + mla v4.8H, v16.8H, v8.H[0] // ...........................................................................................................*................................ + mla v7.8H, v22.8H, v8.H[0] // ............................................................................................*............................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mla v2.8H, v6.8H, v8.H[0] // ........................................................................................................*................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v12.4S, v30.4S, v13.4S // ...............................................................................................................*............................ + trn1 v6.4S, v30.4S, v13.4S // ..............................................................................................................*............................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + trn2 v31.4S, v27.4S, v4.4S // ...................................................................................................................*........................ + trn1 v17.4S, v27.4S, v4.4S // ..................................................................................................................*......................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v13.4S, v18.4S, v12.4S // ................................................................................................................................*........... + trn1 v22.4S, v7.4S, v2.4S // ................................................................................................................*........................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v9.4S, v15.4S, v6.4S // ....................................................................................................................*....................... + trn2 v20.4S, v7.4S, v2.4S // .................................................................................................................*.......................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + trn2 v3.4S, v15.4S, v6.4S // .....................................................................................................................*...................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v19.4S, v22.4S, v17.4S // .......................................................................................................................*.................... + trn1 v27.4S, v22.4S, v17.4S // ......................................................................................................................*..................... + trn2 v16.4S, v18.4S, v12.4S // .................................................................................................................................*.......... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v18.2D, v3.2D, v19.2D // ...........................................................................................................................*................ + trn2 v22.2D, v3.2D, v19.2D // .........................................................................................................................*.................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + trn2 v6.4S, v20.4S, v31.4S // ...................................................................................................................................*........ + trn1 v31.4S, v20.4S, v31.4S // ..................................................................................................................................*......... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q22, [x6, #48] // ...............................................................................................................................*............ + trn1 v19.2D, v9.2D, v27.2D // ..........................................................................................................................*................. + trn2 v20.2D, v9.2D, v27.2D // ........................................................................................................................*................... + // gap // ............................................................................................................................................ + str q18, [x6, #16] // .............................................................................................................................*.............. + trn2 v22.2D, v13.2D, v31.2D // ....................................................................................................................................*....... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + str q20, [x6, #32] // ..............................................................................................................................*............. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v18.2D, v16.2D, v6.2D // .....................................................................................................................................*...... + trn1 v31.2D, v13.2D, v31.2D // ......................................................................................................................................*..... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q19, [x6], #64 // ............................................................................................................................*............... + str q18, [x7, #48] // ...........................................................................................................................................* + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ trn1 v6.2D, v16.2D, v6.2D // .......................................................................................................................................*.... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q31, [x7], #64 // ........................................................................................................................................*... + ldr q31, [x4, #16] // ...........e................................................................................................................................ + str q22, [x7, #-32] // ..........................................................................................................................................*. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q6, [x7, #-48] // .........................................................................................................................................*.. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + + // ------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------- + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // ..............................................'.*......................................................................................................................................... + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // ..............................................'..*........................................................................................................................................ + // trn1 v9.4s, v17.4s, v21.4s // ..............................................'...........................*............................................................................................................... + // trn2 v13.4s, v17.4s, v21.4s // ..............................................'..............*............................................................................................................................ + // trn1 v10.4s, v18.4s, v22.4s // ..............................................'...........*............................................................................................................................... + // trn2 v14.4s, v18.4s, v22.4s // ..............................................'............*.............................................................................................................................. 
+ // trn1 v11.4s, v19.4s, v23.4s // ..............................................'...............*........................................................................................................................... + // trn2 v15.4s, v19.4s, v23.4s // ..............................................'.............*............................................................................................................................. + // trn1 v12.4s, v20.4s, v24.4s // ..............................................'........................*.................................................................................................................. + // trn2 v16.4s, v20.4s, v24.4s // ..............................................'..........*................................................................................................................................ + // ldr q0, [ x4], #16*14 // ..............................................'.........*................................................................................................................................. + // ldr q4, [x4, #-16*14+16*1] // ...........................................e..'........................................................................................................................................~.. + // ldr q1, [ x4, #-16*14+16*2] // ..............................................*........................................................................................................................................... + // ldr q5, [x4, #-16*14+16*3] // ..............................................'........*.................................................................................................................................. + // ldr q2, [ x4, #-16*14+16*4] // ..............................................'..................................*........................................................................................................ 
+ // ldr q6, [x4, #-16*14+16*5] // ..............................................'.....*..................................................................................................................................... + // ldr q3, [ x4, #-16*14+16*6] // ..............................................'...*....................................................................................................................................... + // ldr q7, [x4, #-16*14+16*7] // e.............................................'.............................................................................................~............................................. + // ldr q17, [ x4, #-16*14+16*8] // ..............................................'.................................................*......................................................................................... + // ldr q18, [ x4, #-16*14+16*9] // ..............................................'....*...................................................................................................................................... + // ldr q19, [ x4, #-16*14+16*10] // ..............................................'........................................................*.................................................................................. + // ldr q20, [ x4, #-16*14+16*11] // ..............................................'*.......................................................................................................................................... + // ldr q21, [ x4, #-16*14+16*12] // ..............................................'.......*................................................................................................................................... 
+ // ldr q22, [ x4, #-16*14+16*13] // ..............................................'......*.................................................................................................................................... + // sqrdmulh v28.8h, v13.8h, v4.8h // ..............................................'.......................*................................................................................................................... + // mul v25.8h, v13.8h, v0.8h // ..............................................'...................*....................................................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'.............................*............................................................................................................. + // sub v13.8h, v9.8h, v25.8h // ..............................................'...................................*....................................................................................................... + // add v9.8h, v9.8h, v25.8h // ..............................................'.....................................................*..................................................................................... + // sqrdmulh v28.8h, v14.8h, v4.8h // ..............................................'......................*.................................................................................................................... + // mul v25.8h, v14.8h, v0.8h // ..............................................'................*.......................................................................................................................... 
+ // mla v25.8h, v28.8h, v8.h[0] // ..............................................'............................*.............................................................................................................. + // sub v14.8h, v10.8h, v25.8h // ..............................................'....................................*...................................................................................................... + // add v10.8h, v10.8h, v25.8h // ..............................................'..............................................*............................................................................................ + // sqrdmulh v28.8h, v15.8h, v4.8h // ..............................................'..................*........................................................................................................................ + // mul v25.8h, v15.8h, v0.8h // ..............................................'.................*......................................................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'.........................*................................................................................................................. + // sub v15.8h, v11.8h, v25.8h // ..............................................'...............................*........................................................................................................... + // add v11.8h, v11.8h, v25.8h // ..............................................'..............................*............................................................................................................ 
+ // sqrdmulh v28.8h, v16.8h, v4.8h // ..............................................'.....................*..................................................................................................................... + // mul v25.8h, v16.8h, v0.8h // ..............................................'....................*...................................................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'..........................*................................................................................................................ + // sub v16.8h, v12.8h, v25.8h // ..............................................'.................................*......................................................................................................... + // add v12.8h, v12.8h, v25.8h // ..............................................'................................*.......................................................................................................... + // sqrdmulh v28.8h, v11.8h, v5.8h // ..............................................'.........................................*................................................................................................. + // mul v25.8h, v11.8h, v1.8h // ..............................................'........................................*.................................................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'................................................*.......................................................................................... 
+ // sub v11.8h, v9.8h, v25.8h // ..............................................'..........................................................*................................................................................ + // add v9.8h, v9.8h, v25.8h // ..............................................'.............................................................*............................................................................. + // sqrdmulh v28.8h, v12.8h, v5.8h // ..............................................'......................................*.................................................................................................... + // mul v25.8h, v12.8h, v1.8h // ..............................................'.....................................*..................................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'............................................*.............................................................................................. + // sub v12.8h, v10.8h, v25.8h // ..............................................'...................................................*....................................................................................... + // add v10.8h, v10.8h, v25.8h // ..............................................'..................................................*........................................................................................ + // sqrdmulh v28.8h, v15.8h, v6.8h // ..............................................'.......................................*................................................................................................... 
+ // mul v25.8h, v15.8h, v2.8h // ..............................................'..........................................*................................................................................................ + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'...............................................*........................................................................................... + // sub v15.8h, v13.8h, v25.8h // ..............................................'........................................................................*.................................................................. + // add v13.8h, v13.8h, v25.8h // ..............................................'...................................................................*....................................................................... + // sqrdmulh v28.8h, v16.8h, v6.8h // ..............................................'...........................................*............................................................................................... + // mul v25.8h, v16.8h, v2.8h // ..............................................'.............................................*............................................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'....................................................*...................................................................................... + // sub v16.8h, v14.8h, v25.8h // ..............................................'..............................................................*............................................................................ 
+ // add v14.8h, v14.8h, v25.8h // ..............................................'............................................................*.............................................................................. + // sqrdmulh v28.8h, v10.8h, v7.8h // ..............................................'...........................................................*............................................................................... + // mul v25.8h, v10.8h, v3.8h // ..............................................'.......................................................*................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'....................................................................*...................................................................... + // sub v10.8h, v9.8h, v25.8h // ..............................................'.........................................................................*................................................................. + // add v9.8h, v9.8h, v25.8h // ..............................................'..........................................................................*................................................................ + // sqrdmulh v28.8h, v12.8h, v18.8h // ..............................................'......................................................*.................................................................................... + // mul v25.8h, v12.8h, v17.8h // ..............................................'.........................................................*................................................................................. 
+ // mla v25.8h, v28.8h, v8.h[0] // ..............................................'......................................................................*.................................................................... + // sub v12.8h, v11.8h, v25.8h // ..............................................'.............................................................................*............................................................. + // add v11.8h, v11.8h, v25.8h // ..............................................'..............................................................................*............................................................ + // sqrdmulh v28.8h, v14.8h, v20.8h // ..............................................'................................................................*.......................................................................... + // mul v25.8h, v14.8h, v19.8h // ..............................................'...............................................................*........................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'.....................................................................*..................................................................... + // sub v14.8h, v13.8h, v25.8h // ..............................................'...........................................................................*............................................................... + // add v13.8h, v13.8h, v25.8h // ..............................................'............................................................................*.............................................................. 
+ // sqrdmulh v28.8h, v16.8h, v22.8h // ..............................................'..................................................................*........................................................................ + // mul v25.8h, v16.8h, v21.8h // ..............................................'.................................................................*......................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................................'.......................................................................*................................................................... + // sub v16.8h, v15.8h, v25.8h // ..............................................'....................................................................................*...................................................... + // add v15.8h, v15.8h, v25.8h // ..............................................'...................................................................................*....................................................... + // sqdmulh v26.8h, v9.8h, v8.h[1] // ..............................................'...............................................................................*........................................................... + // srshr v26.8h, v26.8h, #11 // ..............................................'......................................................................................*.................................................... + // mla v9.8h, v26.8h, v8.h[0] // ..............................................'............................................................................................*.............................................. 
+ // sqdmulh v26.8h, v10.8h, v8.h[1] // ..............................................'.................................................................................*......................................................... + // srshr v26.8h, v26.8h, #11 // ..............................................'..........................................................................................*................................................ + // mla v10.8h, v26.8h, v8.h[0] // ..........~...................................'.......................................................................................................*................................... + // sqdmulh v26.8h, v11.8h, v8.h[1] // ..............................................'.........................................................................................*................................................. + // srshr v26.8h, v26.8h, #11 // ........~.....................................'.....................................................................................................*..................................... + // mla v11.8h, v26.8h, v8.h[0] // .............~................................'..........................................................................................................*................................ + // sqdmulh v26.8h, v12.8h, v8.h[1] // ..............................................'..................................................................................*........................................................ + // srshr v26.8h, v26.8h, #11 // ..............................................'........................................................................................*.................................................. 
+ // mla v12.8h, v26.8h, v8.h[0] // ...~..........................................'................................................................................................*.......................................... + // sqdmulh v26.8h, v13.8h, v8.h[1] // ..............................................'................................................................................*.......................................................... + // srshr v26.8h, v26.8h, #11 // ..............................................'.....................................................................................*..................................................... + // mla v13.8h, v26.8h, v8.h[0] // ..............................................'...........................................................................................*............................................... + // sqdmulh v26.8h, v14.8h, v8.h[1] // ..............................................'.......................................................................................*................................................... + // srshr v26.8h, v26.8h, #11 // .~............................................'..............................................................................................*............................................ + // mla v14.8h, v26.8h, v8.h[0] // ...........~..................................'........................................................................................................*.................................. + // sqdmulh v26.8h, v15.8h, v8.h[1] // ....~.........................................'.................................................................................................*......................................... 
+ // srshr v26.8h, v26.8h, #11 // .........~....................................'......................................................................................................*.................................... + // mla v15.8h, v26.8h, v8.h[0] // ..............~...............................'...........................................................................................................*............................... + // sqdmulh v26.8h, v16.8h, v8.h[1] // ..~...........................................'...............................................................................................*........................................... + // srshr v26.8h, v26.8h, #11 // .......~......................................'....................................................................................................*...................................... + // mla v16.8h, v26.8h, v8.h[0] // ............~.................................'.........................................................................................................*................................. + // trn1 v17.4s, v9.4s, v13.4s // .....~........................................'..................................................................................................*........................................ + // trn2 v21.4s, v9.4s, v13.4s // ......~.......................................'...................................................................................................*....................................... + // trn1 v18.4s, v10.4s, v14.4s // ................~.............................'.............................................................................................................*............................. 
+ // trn2 v22.4s, v10.4s, v14.4s // ...............~..............................'............................................................................................................*.............................. + // trn1 v19.4s, v11.4s, v15.4s // ....................~.........................'.................................................................................................................*......................... + // trn2 v23.4s, v11.4s, v15.4s // ......................~.......................'...................................................................................................................*....................... + // trn1 v20.4s, v12.4s, v16.4s // ..................~...........................'...............................................................................................................*........................... + // trn2 v24.4s, v12.4s, v16.4s // .................~............................'..............................................................................................................*............................ + // trn1 v26.4s, v17.4s, v18.4s // .....................~........................'..................................................................................................................*........................ + // trn2 v27.4s, v17.4s, v18.4s // .......................~......................'....................................................................................................................*...................... + // trn1 v28.4s, v19.4s, v20.4s // .........................~....................'......................................................................................................................*.................... 
+ // trn2 v29.4s, v19.4s, v20.4s // ........................~.....................'.....................................................................................................................*..................... + // trn2 v19.2d, v26.2d, v28.2d // .................................~............'..............................................................................................................................*............ + // trn2 v20.2d, v27.2d, v29.2d // ............................~.................'.........................................................................................................................*................. + // trn1 v17.2d, v26.2d, v28.2d // ................................~.............'.............................................................................................................................*............. + // trn1 v18.2d, v27.2d, v29.2d // ...........................~..................'........................................................................................................................*.................. + // str q17, [x6], #64 // .......................................~......'....................................................................................................................................*...... + // str q18, [x6, #(-(64) + 16*1)] // ..................................~...........'...............................................................................................................................*........... + // str q19, [x6, #(-(64) + 16*2)] // ....................................~.........'.................................................................................................................................*......... 
+ // str q20, [x6, #(-(64) + 16*3)] // ...............................~..............'............................................................................................................................*.............. + // trn1 v26.4s, v21.4s, v22.4s // ...................~..........................'................................................................................................................*.......................... + // trn2 v27.4s, v21.4s, v22.4s // ..........................~...................'.......................................................................................................................*................... + // trn1 v28.4s, v23.4s, v24.4s // ..............................~...............'...........................................................................................................................*............... + // trn2 v29.4s, v23.4s, v24.4s // .............................~................'..........................................................................................................................*................ + // trn2 v23.2d, v26.2d, v28.2d // ...................................~..........'................................................................................................................................*.......... + // trn2 v24.2d, v27.2d, v29.2d // .....................................~........'..................................................................................................................................*........ + // trn1 v21.2d, v26.2d, v28.2d // ......................................~.......'...................................................................................................................................*....... 
+ // trn1 v22.2d, v27.2d, v29.2d // .........................................~....'......................................................................................................................................*.... + // str q21, [x7], #64 // ..........................................~...'.......................................................................................................................................*... + // str q22, [x7, #(-(64) + 16*1)] // .............................................~'..........................................................................................................................................* + // str q23, [x7, #(-(64) + 16*2)] // ............................................~.'.........................................................................................................................................*. + // str q24, [x7, #(-(64) + 16*3)] // ........................................~.....'.....................................................................................................................................*..... + + sub count, count, #1 + cbnz count, layer567_start + // Instructions: 138 + // Expected cycles: 68 + // Expected IPC: 2.03 + // + // Cycle bound: 68.0 + // IPC bound: 2.03 + // + // Wall time: 1406.02s + // User time: 1406.02s + // + // ----------------------------------------------------------- original position -----------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------ + ldr q28, [x4, #176] // .*........................................................................................................................................ + ldr q24, [x4, #32] // *......................................................................................................................................... 
+ ld4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x7] // ...*...................................................................................................................................... + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x6] // ..*....................................................................................................................................... + ldr q2, [x4, #208] // .......*.................................................................................................................................. + ldr q16, [x4, #96] // ....*..................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + ldr q5, [x4, #80] // ......*................................................................................................................................... + ldr q7, [x4], #16*14 // ..........*............................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + ldr q3, [x4, #-160] // ...................................*...................................................................................................... + ldr q17, [x4, #-96] // ..................................................*....................................................................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + ldr q9, [x4, #-64] // .........................................................*................................................................................ + ldr q23, [x4, #-32] // ........*................................................................................................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v22.4S, v18.4S, v12.4S // ...............*.......................................................................................................................... + trn2 v6.4S, v21.4S, v15.4S // ...........*.............................................................................................................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v1.4S, v19.4S, v13.4S // .............*............................................................................................................................ + trn2 v11.4S, v20.4S, v14.4S // ..............*........................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqrdmulh v0.8H, v6.8H, v31.8H // ......................*................................................................................................................... + mul v10.8H, v6.8H, v7.8H // .....................*.................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mul v30.8H, v11.8H, v7.8H // ..................*....................................................................................................................... + sqrdmulh v4.8H, v1.8H, v31.8H // .......................*.................................................................................................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mul v29.8H, v1.8H, v7.8H // .................*........................................................................................................................ + mul v7.8H, v22.8H, v7.8H // ....................*..................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v10.8H, v0.8H, v8.H[0] // ...........................*.............................................................................................................. + sqrdmulh v6.8H, v22.8H, v31.8H // ........................*................................................................................................................. + ldr q0, [x4, #-176] // .........*................................................................................................................................ + // gap // .......................................................................................................................................... + sqrdmulh v25.8H, v11.8H, v31.8H // ...................*...................................................................................................................... + trn1 v27.4S, v21.4S, v15.4S // .........................*................................................................................................................ 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v31.4S, v20.4S, v14.4S // ................*......................................................................................................................... + mla v29.8H, v4.8H, v8.H[0] // .............................*............................................................................................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + add v22.8H, v27.8H, v10.8H // .................................*........................................................................................................ + sub v15.8H, v27.8H, v10.8H // ..................................*....................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v7.8H, v6.8H, v8.H[0] // ..............................*........................................................................................................... + mla v30.8H, v25.8H, v8.H[0] // ..........................*............................................................................................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqrdmulh v6.8H, v22.8H, v0.8H // .......................................*.................................................................................................. + mul v20.8H, v22.8H, v24.8H // ......................................*................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v21.4S, v18.4S, v12.4S // ............................*............................................................................................................. + trn1 v12.4S, v19.4S, v13.4S // ............*............................................................................................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v18.8H, v31.8H, v30.8H // ................................*......................................................................................................... + add v4.8H, v31.8H, v30.8H // ...............................*.......................................................................................................... 
+ ldr q30, [x4, #-80] // .....*.................................................................................................................................... + // gap // .......................................................................................................................................... + add v25.8H, v21.8H, v7.8H // ......................................................*................................................................................... + mla v20.8H, v6.8H, v8.H[0] // .............................................*............................................................................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqrdmulh v31.8H, v18.8H, v5.8H // ........................................*................................................................................................. + add v14.8H, v12.8H, v29.8H // ...............................................*.......................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mul v13.8H, v15.8H, v3.8H // ..............................................*........................................................................................... + sqrdmulh v22.8H, v4.8H, v0.8H // ..........................................*............................................................................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + add v0.8H, v14.8H, v20.8H // ...................................................*...................................................................................... + mul v19.8H, v4.8H, v24.8H // .........................................*................................................................................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v10.8H, v14.8H, v20.8H // ....................................................*..................................................................................... + mul v4.8H, v18.8H, v3.8H // ...........................................*.............................................................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqrdmulh v18.8H, v0.8H, v26.8H // ............................................................*............................................................................. + mul v6.8H, v0.8H, v16.8H // ........................................................*................................................................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqrdmulh v16.8H, v10.8H, v30.8H // .......................................................*.................................................................................. + mla v19.8H, v22.8H, v8.H[0] // .................................................*........................................................................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v26.8H, v12.8H, v29.8H // .....................................*.................................................................................................... + mul v12.8H, v10.8H, v17.8H // ..........................................................*............................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v6.8H, v18.8H, v8.H[0] // .....................................................................*.................................................................... + sqrdmulh v30.8H, v15.8H, v5.8H // ............................................*............................................................................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + add v18.8H, v25.8H, v19.8H // ..............................................................*........................................................................... + sub v20.8H, v25.8H, v19.8H // ...........................................................*.............................................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v12.8H, v16.8H, v8.H[0] // .......................................................................*.................................................................. + sub v7.8H, v21.8H, v7.8H // ....................................*..................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v13.8H, v30.8H, v8.H[0] // .....................................................*.................................................................................... + add v27.8H, v18.8H, v6.8H // ...........................................................................*.............................................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v14.8H, v18.8H, v6.8H // ..........................................................................*............................................................... + mla v4.8H, v31.8H, v8.H[0] // ................................................*......................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v21.8H, v20.8H, v12.8H // ..............................................................................*........................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqdmulh v31.8H, v27.8H, v8.H[1] // ................................................................................*......................................................... + add v12.8H, v20.8H, v12.8H // ...............................................................................*.......................................................... + // gap // .......................................................................................................................................... 
+ // gap // .......................................................................................................................................... + sub v15.8H, v26.8H, v13.8H // ...............................................................*.......................................................................... + add v18.8H, v26.8H, v13.8H // .............................................................*............................................................................ + sqdmulh v22.8H, v21.8H, v8.H[1] // ...................................................................................*...................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mul v16.8H, v15.8H, v23.8H // ..................................................................*....................................................................... + sqrdmulh v6.8H, v15.8H, v2.8H // ...................................................................*...................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mul v3.8H, v18.8H, v9.8H // ................................................................*......................................................................... + sqrdmulh v17.8H, v18.8H, v28.8H // .................................................................*........................................................................ 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v19.8H, v7.8H, v4.8H // .........................................................................*................................................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + srshr v30.8H, v22.8H, #11 // .........................................................................................*................................................ + sqdmulh v22.8H, v12.8H, v8.H[1] // ..........................................................................................*............................................... + mla v16.8H, v6.8H, v8.H[0] // ........................................................................*................................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + add v4.8H, v7.8H, v4.8H // ....................................................................*..................................................................... + mla v3.8H, v17.8H, v8.H[0] // ......................................................................*................................................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + srshr v20.8H, v31.8H, #11 // .......................................................................................*.................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqdmulh v15.8H, v14.8H, v8.H[1] // ..................................................................................*....................................................... + sub v31.8H, v19.8H, v16.8H // .....................................................................................*.................................................... + add v9.8H, v19.8H, v16.8H // ....................................................................................*..................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sub v26.8H, v4.8H, v3.8H // ............................................................................*............................................................. + add v28.8H, v4.8H, v3.8H // .............................................................................*............................................................ 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqdmulh v6.8H, v9.8H, v8.H[1] // .................................................................................................*........................................ + sqdmulh v13.8H, v31.8H, v8.H[1] // ...............................................................................................*.......................................... + sqdmulh v19.8H, v26.8H, v8.H[1] // ........................................................................................*................................................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + sqdmulh v16.8H, v28.8H, v8.H[1] // .................................................................................*........................................................ + srshr v18.8H, v15.8H, #11 // ...........................................................................................*.............................................. + srshr v3.8H, v22.8H, #11 // .....................................................................................................*.................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + srshr v22.8H, v13.8H, #11 // ....................................................................................................*..................................... + srshr v6.8H, v6.8H, #11 // ......................................................................................................*................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v21.8H, v30.8H, v8.H[0] // ................................................................................................*......................................... + srshr v16.8H, v16.8H, #11 // ......................................................................................*................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v12.8H, v3.8H, v8.H[0] // ..........................................................................................................*............................... + srshr v24.8H, v19.8H, #11 // ..............................................................................................*........................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v31.8H, v22.8H, v8.H[0] // .........................................................................................................*................................ + mla v9.8H, v6.8H, v8.H[0] // ...........................................................................................................*.............................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v28.8H, v16.8H, v8.H[0] // ............................................................................................*............................................. + mla v27.8H, v20.8H, v8.H[0] // .............................................................................................*............................................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + mla v26.8H, v24.8H, v8.H[0] // ........................................................................................................*................................. + mla v14.8H, v18.8H, v8.H[0] // .......................................................................................................*.................................. 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v6.4S, v21.4S, v31.4S // ...............................................................................................................*.......................... + trn1 v3.4S, v12.4S, v9.4S // .................................................................................................................*........................ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v24.4S, v21.4S, v31.4S // ..............................................................................................................*........................... + trn1 v4.4S, v27.4S, v28.4S // ..................................................................................................*....................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v31.4S, v14.4S, v26.4S // .............................................................................................................*............................ + // gap // .......................................................................................................................................... 
+ // gap // .......................................................................................................................................... + trn1 v7.4S, v3.4S, v6.4S // ......................................................................................................................*................... + trn2 v19.4S, v27.4S, v28.4S // ...................................................................................................*...................................... + trn2 v23.4S, v14.4S, v26.4S // ............................................................................................................*............................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v29.4S, v3.4S, v6.4S // .....................................................................................................................*.................... + trn2 v11.4S, v4.4S, v31.4S // ....................................................................................................................*..................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v27.4S, v4.4S, v31.4S // ..................................................................................................................*....................... + trn2 v12.4S, v12.4S, v9.4S // ...................................................................................................................*...................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v10.4S, v19.4S, v23.4S // .......................................................................................................................*.................. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn2 v1.2D, v11.2D, v29.2D // .........................................................................................................................*................ + trn1 v2.2D, v27.2D, v7.2D // .............................................................................................................................*............ + trn2 v16.4S, v12.4S, v24.4S // ..........................................................................................................................*............... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v0.2D, v11.2D, v29.2D // ........................................................................................................................*................. + trn1 v5.4S, v19.4S, v23.4S // ................................................................................................................*......................... 
+ str q1, [x6, #48] // ............................................................................................................................*............. + // gap // .......................................................................................................................................... + trn1 v6.4S, v12.4S, v24.4S // ...........................................................................................................................*.............. + str q2, [x6], #64 // ....................................................................................................................................*..... + trn1 v26.2D, v10.2D, v16.2D // ......................................................................................................................................*... + // gap // .......................................................................................................................................... + trn2 v25.2D, v10.2D, v16.2D // ..................................................................................................................................*....... + str q0, [x6, #-48] // ...............................................................................................................................*.......... + trn2 v13.2D, v27.2D, v7.2D // ..............................................................................................................................*........... + // gap // .......................................................................................................................................... + str q26, [x7, #16] // .........................................................................................................................................* + trn2 v18.2D, v5.2D, v6.2D // ................................................................................................................................*......... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + str q25, [x7, #48] // .....................................................................................................................................*.... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + trn1 v22.2D, v5.2D, v6.2D // ...................................................................................................................................*...... + str q13, [x6, #-32] // .................................................................................................................................*........ + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + str q22, [x7], #64 // .......................................................................................................................................*.. + // gap // .......................................................................................................................................... 
+ // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + str q18, [x7, #-32] // ........................................................................................................................................*. + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + // gap // .......................................................................................................................................... + + // ------------------------------------------------------------- new position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------ + // ldr q0, [x4, #32] // .*........................................................................................................................................ + // ldr q14, [x4, #176] // *......................................................................................................................................... + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x6] // ...*...................................................................................................................................... + // ld4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x7] // ..*....................................................................................................................................... 
+ // ldr q1, [x4, #96] // .....*.................................................................................................................................... + // ldr q25, [x4, #144] // .......................................*.................................................................................................. + // ldr q9, [x4, #80] // ......*................................................................................................................................... + // ldr q30, [x4, #208] // ....*..................................................................................................................................... + // ldr q7, [x4, #192] // ...........*.............................................................................................................................. + // ldr q11, [x4, #48] // ........................*................................................................................................................. + // ldr q6, [x4], #16*14 // .......*.................................................................................................................................. + // trn2 v22.4S, v21.4S, v5.4S // .............*............................................................................................................................ + // trn1 v13.4S, v19.4S, v3.4S // ....................................*..................................................................................................... + // trn2 v3.4S, v19.4S, v3.4S // ..............*........................................................................................................................... + // trn2 v19.4S, v20.4S, v4.4S // ...............*.......................................................................................................................... 
+ // trn2 v27.4S, v18.4S, v2.4S // ............*............................................................................................................................. + // trn1 v12.4S, v20.4S, v4.4S // ...........................*.............................................................................................................. + // mul v23.8H, v3.8H, v6.8H // ....................*..................................................................................................................... + // mul v16.8H, v19.8H, v6.8H // ..................*....................................................................................................................... + // sqrdmulh v19.8H, v19.8H, v31.8H // .........................*................................................................................................................ + // mul v4.8H, v27.8H, v6.8H // .....................*.................................................................................................................... + // mul v6.8H, v22.8H, v6.8H // .................*........................................................................................................................ + // sqrdmulh v22.8H, v22.8H, v31.8H // ................*......................................................................................................................... + // sqrdmulh v20.8H, v3.8H, v31.8H // ...................*...................................................................................................................... + // sqrdmulh v17.8H, v27.8H, v31.8H // .......................*.................................................................................................................. + // trn1 v31.4S, v21.4S, v5.4S // ..........................*............................................................................................................... 
+ // mla v16.8H, v19.8H, v8.H[0] // ................................*......................................................................................................... + // mla v6.8H, v22.8H, v8.H[0] // ......................*................................................................................................................... + // trn1 v18.4S, v18.4S, v2.4S // ...................................*...................................................................................................... + // mla v23.8H, v20.8H, v8.H[0] // ............................*............................................................................................................. + // mla v4.8H, v17.8H, v8.H[0] // ...............................*.......................................................................................................... + // add v20.8H, v12.8H, v16.8H // ......................................*................................................................................................... + // sub v21.8H, v12.8H, v16.8H // .....................................*.................................................................................................... + // add v22.8H, v31.8H, v6.8H // .............................*............................................................................................................ + // sub v6.8H, v31.8H, v6.8H // ..............................*........................................................................................................... + // ldr q12, [x4, #-160] // ........*................................................................................................................................. + // sub v27.8H, v18.8H, v4.8H // .............................................................*............................................................................ 
+ // sub v19.8H, v13.8H, v23.8H // ......................................................*................................................................................... + // mul v15.8H, v22.8H, v0.8H // ..................................*....................................................................................................... + // sqrdmulh v3.8H, v22.8H, v11.8H // .................................*........................................................................................................ + // sqrdmulh v31.8H, v21.8H, v9.8H // ..........................................*............................................................................................... + // mul v16.8H, v20.8H, v0.8H // ...............................................*.......................................................................................... + // sqrdmulh v20.8H, v20.8H, v11.8H // .............................................*............................................................................................ + // mul v10.8H, v21.8H, v12.8H // .................................................*........................................................................................ + // sqrdmulh v22.8H, v6.8H, v9.8H // .........................................................*................................................................................ + // mla v15.8H, v3.8H, v8.H[0] // .........................................*................................................................................................ + // mul v6.8H, v6.8H, v12.8H // ............................................*............................................................................................. + // add v9.8H, v13.8H, v23.8H // ...........................................*.............................................................................................. 
+ // mla v10.8H, v31.8H, v8.H[0] // .................................................................*........................................................................ + // mla v16.8H, v20.8H, v8.H[0] // .....................................................*.................................................................................... + // ldr q21, [x4, #-96] // .........*................................................................................................................................ + // add v31.8H, v9.8H, v15.8H // ..............................................*........................................................................................... + // sub v12.8H, v9.8H, v15.8H // ................................................*......................................................................................... + // mla v6.8H, v22.8H, v8.H[0] // ..............................................................*........................................................................... + // add v18.8H, v18.8H, v4.8H // ........................................*................................................................................................. + // sqrdmulh v20.8H, v12.8H, v25.8H // ....................................................*..................................................................................... + // mul v22.8H, v31.8H, v1.8H // ...................................................*...................................................................................... + // ldr q25, [x4, #-64] // ..........*............................................................................................................................... + // mul v2.8H, v12.8H, v21.8H // .......................................................*.................................................................................. 
+ // sub v17.8H, v18.8H, v16.8H // ...........................................................*.............................................................................. + // sqrdmulh v3.8H, v31.8H, v26.8H // ..................................................*....................................................................................... + // add v26.8H, v19.8H, v6.8H // ......................................................................*................................................................... + // add v18.8H, v18.8H, v16.8H // ..........................................................*............................................................................... + // sub v6.8H, v19.8H, v6.8H // .....................................................................*.................................................................... + // mul v16.8H, v26.8H, v25.8H // ..........................................................................*............................................................... + // sqrdmulh v15.8H, v26.8H, v14.8H // ...........................................................................*.............................................................. + // mul v26.8H, v6.8H, v7.8H // ........................................................................*................................................................. + // sqrdmulh v6.8H, v6.8H, v30.8H // .........................................................................*................................................................ + // add v12.8H, v27.8H, v10.8H // ................................................................................*......................................................... + // mla v22.8H, v3.8H, v8.H[0] // ........................................................*................................................................................. 
+ // mla v16.8H, v15.8H, v8.H[0] // .................................................................................*........................................................ + // mla v2.8H, v20.8H, v8.H[0] // ............................................................*............................................................................. + // mla v26.8H, v6.8H, v8.H[0] // ...............................................................................*.......................................................... + // sub v9.8H, v27.8H, v10.8H // ............................................................................*............................................................. + // sub v30.8H, v18.8H, v22.8H // ................................................................*......................................................................... + // add v18.8H, v18.8H, v22.8H // ...............................................................*.......................................................................... + // sub v13.8H, v12.8H, v16.8H // ......................................................................................*................................................... + // add v16.8H, v12.8H, v16.8H // .......................................................................................*.................................................. + // sub v27.8H, v17.8H, v2.8H // ..................................................................*....................................................................... + // add v7.8H, v17.8H, v2.8H // ....................................................................*..................................................................... + // sqdmulh v22.8H, v18.8H, v8.H[1] // ...................................................................*...................................................................... 
+ // sqdmulh v19.8H, v16.8H, v8.H[1] // ...........................................................................................*.............................................. + // sqdmulh v3.8H, v30.8H, v8.H[1] // ...................................................................................*...................................................... + // sqdmulh v6.8H, v27.8H, v8.H[1] // .......................................................................*.................................................................. + // add v2.8H, v9.8H, v26.8H // .....................................................................................*.................................................... + // sub v4.8H, v9.8H, v26.8H // ....................................................................................*..................................................... + // srshr v26.8H, v19.8H, #11 // .................................................................................................*........................................ + // srshr v9.8H, v22.8H, #11 // ..................................................................................*....................................................... + // sqdmulh v22.8H, v13.8H, v8.H[1] // ..........................................................................................*............................................... + // srshr v19.8H, v6.8H, #11 // .............................................................................*............................................................ + // sqdmulh v6.8H, v7.8H, v8.H[1] // ..............................................................................*........................................................... + // srshr v3.8H, v3.8H, #11 // ............................................................................................*............................................. 
+ // mla v16.8H, v26.8H, v8.H[0] // ......................................................................................................*................................... + // mla v18.8H, v9.8H, v8.H[0] // .......................................................................................................*.................................. + // srshr v20.8H, v22.8H, #11 // ...................................................................................................*...................................... + // sqdmulh v22.8H, v4.8H, v8.H[1] // .........................................................................................*................................................ + // mla v27.8H, v19.8H, v8.H[0] // ................................................................................................*......................................... + // sqdmulh v19.8H, v2.8H, v8.H[1] // ........................................................................................*................................................. + // trn1 v15.4S, v18.4S, v16.4S // .............................................................................................................*............................ + // trn2 v18.4S, v18.4S, v16.4S // ................................................................................................................*......................... + // srshr v16.8H, v22.8H, #11 // ..............................................................................................*........................................... + // srshr v22.8H, v6.8H, #11 // .............................................................................................*............................................ + // srshr v6.8H, v19.8H, #11 // ...............................................................................................*.......................................... 
+ // mla v30.8H, v3.8H, v8.H[0] // .........................................................................................................*................................ + // mla v13.8H, v20.8H, v8.H[0] // ........................................................................................................*................................. + // mla v4.8H, v16.8H, v8.H[0] // ....................................................................................................*..................................... + // mla v7.8H, v22.8H, v8.H[0] // ..................................................................................................*....................................... + // mla v2.8H, v6.8H, v8.H[0] // .....................................................................................................*.................................... + // trn2 v12.4S, v30.4S, v13.4S // .................................................................................................................*........................ + // trn1 v6.4S, v30.4S, v13.4S // ..............................................................................................................*........................... + // trn2 v31.4S, v27.4S, v4.4S // ............................................................................................................*............................. + // trn1 v17.4S, v27.4S, v4.4S // ..........................................................................................................*............................... + // trn1 v13.4S, v18.4S, v12.4S // ...........................................................................................................................*.............. + // trn1 v22.4S, v7.4S, v2.4S // ...........................................................................................................*.............................. 
+ // trn1 v9.4S, v15.4S, v6.4S // ....................................................................................................................*..................... + // trn2 v20.4S, v7.4S, v2.4S // .....................................................................................................................*.................... + // trn2 v3.4S, v15.4S, v6.4S // ...................................................................................................................*...................... + // trn2 v19.4S, v22.4S, v17.4S // ..................................................................................................................*....................... + // trn1 v27.4S, v22.4S, v17.4S // ...............................................................................................................*.......................... + // trn2 v16.4S, v18.4S, v12.4S // ......................................................................................................................*................... + // trn1 v18.2D, v3.2D, v19.2D // ..........................................................................................................................*............... + // trn2 v22.2D, v3.2D, v19.2D // .......................................................................................................................*.................. + // trn2 v6.4S, v20.4S, v31.4S // .........................................................................................................................*................ + // trn1 v31.4S, v20.4S, v31.4S // .............................................................................................................................*............ + // str q22, [x6, #48] // ............................................................................................................................*............. 
+ // trn1 v19.2D, v9.2D, v27.2D // ........................................................................................................................*................. + // trn2 v20.2D, v9.2D, v27.2D // ..................................................................................................................................*....... + // str q18, [x6, #16] // .................................................................................................................................*........ + // trn2 v22.2D, v13.2D, v31.2D // ....................................................................................................................................*..... + // str q20, [x6, #32] // .......................................................................................................................................*.. + // trn2 v18.2D, v16.2D, v6.2D // ................................................................................................................................*......... + // trn1 v31.2D, v13.2D, v31.2D // ......................................................................................................................................*... + // str q19, [x6], #64 // ..............................................................................................................................*........... + // str q18, [x7, #48] // .....................................................................................................................................*.... + // trn1 v6.2D, v16.2D, v6.2D // ...............................................................................................................................*.......... + // str q31, [x7], #64 // ........................................................................................................................................*. 
+ // str q22, [x7, #-32] // .........................................................................................................................................* + // str q6, [x7, #-48] // ...................................................................................................................................*...... + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_opt_a55.s b/examples/opt/aarch64/ntt_kyber_1234_567_opt_a55.s new file mode 100644 index 00000000..2161a72e --- /dev/null +++ b/examples/opt/aarch64/ntt_kyber_1234_567_opt_a55.s @@ -0,0 +1,1830 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmla d,a,b + mla \d\().8h, \a\().8h, \b\().8h +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmlaq d,a,b,i + mla \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlaq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vmlaq \dst, t2, consts, 0 +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlaq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] +.endm + +.macro load_next_roots_45 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #16 +.endm + +.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, 
\data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + // Spill x19-x29 into the 6*16-byte frame reserved above, one store per slot + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + + .global ntt_kyber_1234_567_opt_a55 + .global _ntt_kyber_1234_567_opt_a55 // must name the underscore-prefixed label defined below + +.p2align 4 +const_addr: .short -3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567_opt_a55: +_ntt_kyber_1234_567_opt_a55: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + src0 .req x6 + src1 .req x7 + src2 .req x8 + 
src3 .req x9 + src4 .req x10 + src5 .req x11 + src6 .req x12 + src7 .req x13 + src8 .req x14 + src9 .req x15 + src10 .req x16 + src11 .req x17 + src12 .req x18 + src13 .req x19 + src14 .req x20 + src15 .req x21 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + data8 .req v17 + data9 .req v18 + data10 .req v19 + data11 .req v20 + data12 .req v21 + data13 .req v22 + data14 .req v23 + data15 .req v24 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + qform_data8 .req q17 + qform_data9 .req q18 + qform_data10 .req q19 + qform_data11 .req q20 + qform_data12 .req q21 + qform_data13 .req q22 + qform_data14 .req q23 + qform_data15 .req q24 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + tmp .req v25 + t0 .req v26 + t1 .req v27 + t2 .req v28 + t3 .req v29 + + consts .req v8 + + ASM_LOAD(r_ptr0, roots) + + ASM_LOAD(xtmp, 
const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + add src0, x0, #32*0 + add src8, x0, #32*8 + + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + + mov count, #2 + + .p2align 2 + ldr q15, [x14, #160] // .............*.................................................................................................................................................................................. + ldr q16, [x14, #0] // ........*....................................................................................................................................................................................... + sqrdmulh v27.8H, v15.8H, v0.H[1] // .........................................*...................................................................................................................................................... + mul v15.8H, v15.8H, v0.H[0] // ..........................................*..................................................................................................................................................... + sqrdmulh v14.8H, v16.8H, v0.H[1] // ................*............................................................................................................................................................................... + mul v16.8H, v16.8H, v0.H[0] // .................*.............................................................................................................................................................................. + ldr q6, [x6, #160] // .....*.......................................................................................................................................................................................... + mla v15.8H, v27.8H, v8.H[0] // ...........................................*.................................................................................................................................................... 
+ mla v16.8H, v14.8H, v8.H[0] // ..................*............................................................................................................................................................................. + ldr q27, [x6, #128] // ....*........................................................................................................................................................................................... + sub v14.8H, v6.8H, v15.8H // ............................................*................................................................................................................................................... + add v15.8H, v6.8H, v15.8H // .............................................*.................................................................................................................................................. + ldr q6, [x14, #128] // ............*................................................................................................................................................................................... + sqrdmulh v26.8H, v14.8H, v0.H[5] // .................................................................................*.............................................................................................................. + mul v14.8H, v14.8H, v0.H[4] // ..................................................................................*............................................................................................................. + mul v7.8H, v15.8H, v0.H[2] // ..............................................................*................................................................................................................................. 
+ sqrdmulh v11.8H, v6.8H, v0.H[1] // ....................................*........................................................................................................................................................... + mul v6.8H, v6.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + ldr q13, [x6, #0] // *............................................................................................................................................................................................... + mla v14.8H, v26.8H, v8.H[0] // ...................................................................................*............................................................................................................ + mla v6.8H, v11.8H, v8.H[0] // ......................................*......................................................................................................................................................... + sqrdmulh v15.8H, v15.8H, v0.H[3] // .............................................................*.................................................................................................................................. + sub v26.8H, v13.8H, v16.8H // ...................*............................................................................................................................................................................ + add v16.8H, v13.8H, v16.8H // ....................*........................................................................................................................................................................... 
+ sub v11.8H, v27.8H, v6.8H // .......................................*........................................................................................................................................................ + add v27.8H, v27.8H, v6.8H // ........................................*....................................................................................................................................................... + mla v7.8H, v15.8H, v8.H[0] // ...............................................................*................................................................................................................................ + ldr q15, [x14, #32] // .........*...................................................................................................................................................................................... + sqrdmulh v6.8H, v11.8H, v0.H[5] // ............................................................................*................................................................................................................... + mul v11.8H, v11.8H, v0.H[4] // .............................................................................*.................................................................................................................. + sqrdmulh v13.8H, v15.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + mul v15.8H, v15.8H, v0.H[0] // ......................*......................................................................................................................................................................... 
+ ldr q31, [x6, #32] // .*.............................................................................................................................................................................................. + mla v11.8H, v6.8H, v8.H[0] // ..............................................................................*................................................................................................................. + mla v15.8H, v13.8H, v8.H[0] // .......................*........................................................................................................................................................................ + mul v6.8H, v27.8H, v0.H[2] // .........................................................*...................................................................................................................................... + sqrdmulh v13.8H, v27.8H, v0.H[3] // ........................................................*....................................................................................................................................... + add v25.8H, v26.8H, v11.8H // ................................................................................*............................................................................................................... + sub v10.8H, v26.8H, v11.8H // ...............................................................................*................................................................................................................ + sub v27.8H, v31.8H, v15.8H // ........................*....................................................................................................................................................................... 
+ add v15.8H, v31.8H, v15.8H // .........................*...................................................................................................................................................................... + mla v6.8H, v13.8H, v8.H[0] // ..........................................................*..................................................................................................................................... + add v18.8H, v27.8H, v14.8H // .....................................................................................*.......................................................................................................... + sub v27.8H, v27.8H, v14.8H // ....................................................................................*........................................................................................................... + add v14.8H, v15.8H, v7.8H // .................................................................*.............................................................................................................................. + sub v15.8H, v15.8H, v7.8H // ................................................................*............................................................................................................................... + sub v26.8H, v16.8H, v6.8H // ...........................................................*.................................................................................................................................... + add v16.8H, v16.8H, v6.8H // ............................................................*................................................................................................................................... 
+ ldr q6, [x14, #192] // ..............*................................................................................................................................................................................. + ldr q7, [x14, #64] // ..........*..................................................................................................................................................................................... + mul v11.8H, v6.8H, v0.H[0] // ...............................................*................................................................................................................................................ + sqrdmulh v6.8H, v6.8H, v0.H[1] // ..............................................*................................................................................................................................................. + mul v13.8H, v7.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + ldr q31, [x6, #192] // ......*......................................................................................................................................................................................... + mla v11.8H, v6.8H, v8.H[0] // ................................................*............................................................................................................................................... + ldr q6, [x14, #96] // ...........*.................................................................................................................................................................................... 
+ sqrdmulh v7.8H, v7.8H, v0.H[1] // ..........................*..................................................................................................................................................................... + sub v22.8H, v31.8H, v11.8H // .................................................*.............................................................................................................................................. + mla v13.8H, v7.8H, v8.H[0] // ............................*................................................................................................................................................................... + sqrdmulh v7.8H, v22.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mul v22.8H, v22.8H, v0.H[4] // .......................................................................................*........................................................................................................ + sqrdmulh v17.8H, v6.8H, v0.H[1] // ...............................*................................................................................................................................................................ + mul v29.8H, v6.8H, v0.H[0] // ................................*............................................................................................................................................................... + add v6.8H, v31.8H, v11.8H // ..................................................*............................................................................................................................................. 
+ ldr q11, [x6, #64] // ..*............................................................................................................................................................................................. + mul v31.8H, v6.8H, v0.H[2] // ...................................................................*............................................................................................................................ + mla v22.8H, v7.8H, v8.H[0] // ........................................................................................*....................................................................................................... + sub v7.8H, v11.8H, v13.8H // .............................*.................................................................................................................................................................. + sqrdmulh v6.8H, v6.8H, v0.H[3] // ..................................................................*............................................................................................................................. + add v11.8H, v11.8H, v13.8H // ..............................*................................................................................................................................................................. + add v13.8H, v7.8H, v22.8H // ..........................................................................................*..................................................................................................... + mla v29.8H, v17.8H, v8.H[0] // .................................*.............................................................................................................................................................. 
+ mla v31.8H, v6.8H, v8.H[0] // ....................................................................*........................................................................................................................... + sqrdmulh v6.8H, v13.8H, v1.H[3] // ....................................................................................................................*........................................................................... + mul v13.8H, v13.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + sub v7.8H, v7.8H, v22.8H // .........................................................................................*...................................................................................................... + sub v24.8H, v11.8H, v31.8H // .....................................................................*.......................................................................................................................... + add v11.8H, v11.8H, v31.8H // ......................................................................*......................................................................................................................... + mul v22.8H, v24.8H, v1.H[0] // ...........................................................................................................*.................................................................................... + sqrdmulh v31.8H, v24.8H, v1.H[1] // ..........................................................................................................*..................................................................................... 
+ mul v30.8H, v11.8H, v0.H[6] // .................................................................................................*.............................................................................................. + mul v20.8H, v7.8H, v1.H[4] // ...............................................................................................................................*................................................................ + sqrdmulh v7.8H, v7.8H, v1.H[5] // ..............................................................................................................................*................................................................. + mla v22.8H, v31.8H, v8.H[0] // ............................................................................................................*................................................................................... + sqrdmulh v24.8H, v11.8H, v0.H[7] // ................................................................................................*............................................................................................... + ldr q5, [x14, #224] // ...............*................................................................................................................................................................................ + sub v21.8H, v26.8H, v22.8H // .............................................................................................................*.................................................................................. + add v28.8H, v26.8H, v22.8H // ..............................................................................................................*................................................................................. 
+ sqrdmulh v26.8H, v5.8H, v0.H[1] // ...................................................*............................................................................................................................................ + mul v19.8H, v5.8H, v0.H[0] // ....................................................*........................................................................................................................................... + ldr q22, [x6, #96] // ...*............................................................................................................................................................................................ + mla v30.8H, v24.8H, v8.H[0] // ..................................................................................................*............................................................................................. + mla v19.8H, v26.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + ldr q24, [x6, #224] // .......*........................................................................................................................................................................................ + sub v11.8H, v16.8H, v30.8H // ...................................................................................................*............................................................................................ + mla v20.8H, v7.8H, v8.H[0] // ................................................................................................................................*............................................................... 
+ sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 96 + // Expected IPC: 2.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + add v7.8H, v24.8H, v19.8H // *............................................................................................................................................................................................... + sub v12.8H, v22.8H, v29.8H // ...........*.................................................................................................................................................................................... + sub v4.8H, v24.8H, v19.8H // .*.............................................................................................................................................................................................. + mul v26.8H, v7.8H, v0.H[2] // ...*............................................................................................................................................................................................ + sqrdmulh v17.8H, v7.8H, v0.H[3] // .....*.......................................................................................................................................................................................... + mul v19.8H, v4.8H, v0.H[4] // ......*......................................................................................................................................................................................... 
+ sqrdmulh v31.8H, v4.8H, v0.H[5] // ....*........................................................................................................................................................................................... + mla v13.8H, v6.8H, v8.H[0] // ........*....................................................................................................................................................................................... + mla v26.8H, v17.8H, v8.H[0] // .........*...................................................................................................................................................................................... + add v23.8H, v22.8H, v29.8H // .......*........................................................................................................................................................................................ + mla v19.8H, v31.8H, v8.H[0] // ..........*..................................................................................................................................................................................... + sub v5.8H, v25.8H, v13.8H // ........................*....................................................................................................................................................................... + sub v17.8H, v23.8H, v26.8H // ...............*................................................................................................................................................................................ + add v9.8H, v23.8H, v26.8H // .............*.................................................................................................................................................................................. 
+ sub v4.8H, v12.8H, v19.8H // ..............................*................................................................................................................................................................. + mul v29.8H, v17.8H, v1.H[0] // ...................*............................................................................................................................................................................ + sqrdmulh v31.8H, v17.8H, v1.H[1] // ..................*............................................................................................................................................................................. + sqrdmulh v7.8H, v4.8H, v1.H[5] // ............................................*................................................................................................................................................... + mul v22.8H, v4.8H, v1.H[4] // .............................................*.................................................................................................................................................. + mla v29.8H, v31.8H, v8.H[0] // .......................*........................................................................................................................................................................ + mla v22.8H, v7.8H, v8.H[0] // ...................................................*............................................................................................................................................ + sub v26.8H, v10.8H, v20.8H // ............*................................................................................................................................................................................... 
+ add v24.8H, v10.8H, v20.8H // ..*............................................................................................................................................................................................. + sub v7.8H, v15.8H, v29.8H // ...........................*.................................................................................................................................................................... + sqrdmulh v31.8H, v9.8H, v0.H[7] // .................*.............................................................................................................................................................................. + mul v17.8H, v9.8H, v0.H[6] // ................*............................................................................................................................................................................... + sqrdmulh v6.8H, v7.8H, v2.H[5] // ..............................................................*................................................................................................................................. + mul v23.8H, v7.8H, v2.H[4] // ...............................................................*................................................................................................................................ + add v12.8H, v12.8H, v19.8H // ..............*................................................................................................................................................................................. + add v15.8H, v15.8H, v29.8H // ..................................................*............................................................................................................................................. 
+ add v29.8H, v27.8H, v22.8H // ........................................................*....................................................................................................................................... + mla v23.8H, v6.8H, v8.H[0] // ...................................................................*............................................................................................................................ + sub v27.8H, v27.8H, v22.8H // .......................................................*........................................................................................................................................ + add v10.8H, v16.8H, v30.8H // .............................................................................*.................................................................................................................. + add v20.8H, v25.8H, v13.8H // ......................................................*......................................................................................................................................... + sub v25.8H, v21.8H, v23.8H // .......................................................................*........................................................................................................................ + add v13.8H, v21.8H, v23.8H // ........................................................................*....................................................................................................................... + mla v17.8H, v31.8H, v8.H[0] // .....................*.......................................................................................................................................................................... 
+ add v16.8H, v14.8H, v17.8H // ....................................................*........................................................................................................................................... + sqrdmulh v6.8H, v15.8H, v2.H[3] // .....................................................*.......................................................................................................................................... + mul v15.8H, v15.8H, v2.H[2] // .........................................................*...................................................................................................................................... + sqrdmulh v7.8H, v16.8H, v1.H[7] // ....................................................................*........................................................................................................................... + mul v16.8H, v16.8H, v1.H[6] // .....................................................................*.......................................................................................................................... + mul v31.8H, v12.8H, v1.H[2] // ......................*......................................................................................................................................................................... + sqrdmulh v22.8H, v12.8H, v1.H[3] // ....................*........................................................................................................................................................................... + sub v14.8H, v14.8H, v17.8H // .........................*...................................................................................................................................................................... 
+ mla v16.8H, v7.8H, v8.H[0] // .........................................................................*...................................................................................................................... + str q13, [x6, #192] // ...........................................................................*.................................................................................................................... + mul v7.8H, v29.8H, v3.H[2] // ...........................................................*.................................................................................................................................... + sqrdmulh v17.8H, v29.8H, v3.H[3] // ............................................................*................................................................................................................................... + add v13.8H, v10.8H, v16.8H // .................................................................................*.............................................................................................................. + sub v16.8H, v10.8H, v16.8H // ...................................................................................*............................................................................................................ + mul v29.8H, v27.8H, v3.H[4] // .................................................................*.............................................................................................................................. + sqrdmulh v27.8H, v27.8H, v3.H[5] // ................................................................................*............................................................................................................... 
+ mla v15.8H, v6.8H, v8.H[0] // .............................................................*.................................................................................................................................. + mla v7.8H, v17.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + mla v31.8H, v22.8H, v8.H[0] // ..........................*..................................................................................................................................................................... + mla v29.8H, v27.8H, v8.H[0] // .......................................................................................*........................................................................................................ + sub v27.8H, v28.8H, v15.8H // .....................................................................................*.......................................................................................................... + add v15.8H, v28.8H, v15.8H // ..................................................................*............................................................................................................................. + sub v23.8H, v18.8H, v31.8H // ...............................*................................................................................................................................................................ + add v6.8H, v26.8H, v29.8H // ...........................................................................................*.................................................................................................... 
+ add v12.8H, v24.8H, v7.8H // ..............................................................................*................................................................................................................. + sub v9.8H, v26.8H, v29.8H // .............................................................................................*.................................................................................................. + mul v17.8H, v14.8H, v2.H[0] // ............................*................................................................................................................................................................... + sqrdmulh v14.8H, v14.8H, v2.H[1] // .............................*.................................................................................................................................................................. + str q13, [x6], #16 // ....................................................................................*........................................................................................................... + sub v21.8H, v24.8H, v7.8H // .........................................................................................*...................................................................................................... + str q16, [x6, #16] // ......................................................................................*......................................................................................................... + ldr q13, [x14, #16] // .................................................................................................*.............................................................................................. 
+ mla v17.8H, v14.8H, v8.H[0] // .................................*.............................................................................................................................................................. + sqrdmulh v14.8H, v23.8H, v3.H[1] // ..................................*............................................................................................................................................................. + mul v22.8H, v23.8H, v3.H[0] // ...................................*............................................................................................................................................................ + str q25, [x6, #208] // ..........................................................................................*..................................................................................................... + add v31.8H, v18.8H, v31.8H // ................................*............................................................................................................................................................... + str q27, [x6, #144] // ........................................................................................*....................................................................................................... + mla v22.8H, v14.8H, v8.H[0] // .......................................*........................................................................................................................................................ + str q15, [x6, #112] // ......................................................................*......................................................................................................................... 
+ sub v15.8H, v5.8H, v22.8H // ..............................................*................................................................................................................................................. + add v28.8H, v5.8H, v22.8H // ...........................................*.................................................................................................................................................... + mul v27.8H, v31.8H, v2.H[6] // .........................................*...................................................................................................................................................... + sqrdmulh v14.8H, v31.8H, v2.H[7] // ....................................*........................................................................................................................................................... + str q6, [x14, #192] // ..............................................................................................*................................................................................................. + mul v6.8H, v13.8H, v0.H[0] // .....................................................................................................*.......................................................................................... + sqrdmulh v26.8H, v13.8H, v0.H[1] // ....................................................................................................*........................................................................................... + str q12, [x14, #128] // ..................................................................................*............................................................................................................. 
+ sub v7.8H, v11.8H, v17.8H // ......................................*......................................................................................................................................................... + add v11.8H, v11.8H, v17.8H // .....................................*.......................................................................................................................................................... + str q9, [x14, #224] // ...............................................................................................*................................................................................................ + mla v27.8H, v14.8H, v8.H[0] // ................................................*............................................................................................................................................... + str q21, [x14, #160] // ............................................................................................*................................................................................................... + mla v6.8H, v26.8H, v8.H[0] // ........................................................................................................*....................................................................................... + str q15, [x14, #96] // .................................................*.............................................................................................................................................. + sub v15.8H, v20.8H, v27.8H // ..........................................................*..................................................................................................................................... 
+ add v27.8H, v20.8H, v27.8H // ............................................................................*................................................................................................................... + ldr q24, [x6, #224] // .............................................................................................................................................................................................*.. + str q15, [x14, #32] // ................................................................*............................................................................................................................... + ldr q15, [x6, #0] // ..................................................................................................................*............................................................................. + ldr q16, [x14, #144] // ............................................................................................................*................................................................................... + str q7, [x6, #80] // ..........................................*..................................................................................................................................................... + sub v13.8H, v15.8H, v6.8H // ......................................................................................................................*......................................................................... + mul v14.8H, v16.8H, v0.H[0] // .................................................................................................................*.............................................................................. + sqrdmulh v16.8H, v16.8H, v0.H[1] // ................................................................................................................*............................................................................... 
+ str q11, [x6, #48] // ........................................*....................................................................................................................................................... + add v15.8H, v15.8H, v6.8H // .......................................................................................................................*........................................................................ + str q27, [x14], #16 // ...............................................................................*................................................................................................................ + ldr q27, [x6, #128] // .........................................................................................................*...................................................................................... + ldr q6, [x14, #160] // ................................................................................................*............................................................................................... + mla v14.8H, v16.8H, v8.H[0] // ....................................................................................................................*........................................................................... + ldr q16, [x6, #160] // ......................................................................................................*......................................................................................... + mul v7.8H, v6.8H, v0.H[0] // ...................................................................................................*............................................................................................ + add v11.8H, v27.8H, v14.8H // .........................................................................................................................*...................................................................... 
+ sub v27.8H, v27.8H, v14.8H // ........................................................................................................................*....................................................................... + sqrdmulh v6.8H, v6.8H, v0.H[1] // ..................................................................................................*............................................................................................. + mul v26.8H, v11.8H, v0.H[2] // ...................................................................................................................................*............................................................ + mul v14.8H, v27.8H, v0.H[4] // .............................................................................................................................*.................................................................. + sqrdmulh v27.8H, v27.8H, v0.H[5] // ............................................................................................................................*................................................................... + sqrdmulh v21.8H, v11.8H, v0.H[3] // ....................................................................................................................................*........................................................... + mla v7.8H, v6.8H, v8.H[0] // .......................................................................................................*........................................................................................ + str q28, [x14, #48] // ...............................................*................................................................................................................................................ 
+ ldr q6, [x14, #32] // ...........................................................................................................................*.................................................................... + mla v14.8H, v27.8H, v8.H[0] // .................................................................................................................................*.............................................................. + mla v26.8H, v21.8H, v8.H[0] // .........................................................................................................................................*...................................................... + mul v18.8H, v6.8H, v0.H[0] // ...............................................................................................................................*................................................................ + sqrdmulh v30.8H, v6.8H, v0.H[1] // ..............................................................................................................................*................................................................. + sub v27.8H, v16.8H, v7.8H // ..........................................................................................................*..................................................................................... + add v6.8H, v16.8H, v7.8H // ...........................................................................................................*.................................................................................... + add v25.8H, v13.8H, v14.8H // .....................................................................................................................................*.......................................................... 
+ mla v18.8H, v30.8H, v8.H[0] // ..................................................................................................................................*............................................................. + sub v10.8H, v13.8H, v14.8H // ......................................................................................................................................*......................................................... + add v16.8H, v15.8H, v26.8H // ...............................................................................................................................................*................................................ + sqrdmulh v14.8H, v27.8H, v0.H[5] // .............................................................................................................*.................................................................................. + sub v19.8H, v15.8H, v26.8H // ..............................................................................................................................................*................................................. + mul v15.8H, v27.8H, v0.H[4] // ..............................................................................................................*................................................................................. + sqrdmulh v27.8H, v6.8H, v0.H[3] // .....................................................................................................................*.......................................................................... + mul v17.8H, v6.8H, v0.H[2] // ...............................................................................................................*................................................................................ 
+ ldr q26, [x6, #32] // ................................................................................................................................*............................................................... + mla v15.8H, v14.8H, v8.H[0] // ...................................................................................................................*............................................................................ + mla v17.8H, v27.8H, v8.H[0] // ..........................................................................................................................*..................................................................... + ldr q14, [x14, #64] // .................................................................................................................................................*.............................................. + ldr q6, [x14, #192] // ................................................................................................................................................*............................................... + mul v31.8H, v14.8H, v0.H[0] // ....................................................................................................................................................*........................................... + add v22.8H, v26.8H, v18.8H // ........................................................................................................................................*....................................................... + sqrdmulh v13.8H, v14.8H, v0.H[1] // ........................................................................................................................................................*....................................... 
+ sub v26.8H, v26.8H, v18.8H // .......................................................................................................................................*........................................................ + add v14.8H, v22.8H, v17.8H // ............................................................................................................................................*................................................... + mul v11.8H, v6.8H, v0.H[0] // ..................................................................................................................................................*............................................. + sub v27.8H, v26.8H, v15.8H // ...........................................................................................................................................*.................................................... + add v18.8H, v26.8H, v15.8H // ..........................................................................................................................................*..................................................... + ldr q26, [x6, #64] // ................................................................................................................................................................*............................... + sqrdmulh v28.8H, v6.8H, v0.H[1] // ...................................................................................................................................................*............................................ + mla v31.8H, v13.8H, v8.H[0] // ..........................................................................................................................................................*..................................... 
+ ldr q6, [x6, #192] // .....................................................................................................................................................*.......................................... + mla v11.8H, v28.8H, v8.H[0] // ......................................................................................................................................................*......................................... + add v12.8H, v26.8H, v31.8H // .....................................................................................................................................................................*.......................... + sub v15.8H, v22.8H, v17.8H // .............................................................................................................................................*.................................................. + sub v17.8H, v26.8H, v31.8H // ...................................................................................................................................................................*............................ + add v7.8H, v6.8H, v11.8H // ...............................................................................................................................................................*................................ + sub v6.8H, v6.8H, v11.8H // .........................................................................................................................................................*...................................... + ldr q26, [x14, #96] // .......................................................................................................................................................*........................................ 
+ mul v22.8H, v7.8H, v0.H[2] // .................................................................................................................................................................*.............................. + sqrdmulh v13.8H, v7.8H, v0.H[3] // ....................................................................................................................................................................*........................... + mul v29.8H, v26.8H, v0.H[0] // ..............................................................................................................................................................*................................. + sqrdmulh v31.8H, v26.8H, v0.H[1] // .............................................................................................................................................................*.................................. + sqrdmulh v11.8H, v6.8H, v0.H[5] // ...........................................................................................................................................................*.................................... + mla v22.8H, v13.8H, v8.H[0] // ........................................................................................................................................................................*....................... + mul v7.8H, v6.8H, v0.H[4] // ............................................................................................................................................................*................................... + ldr q23, [x14, #224] // .....................................................................................................................................................................................*.......... 
+ sub v6.8H, v12.8H, v22.8H // ............................................................................................................................................................................*................... + mla v29.8H, v31.8H, v8.H[0] // .......................................................................................................................................................................*........................ + add v26.8H, v12.8H, v22.8H // .............................................................................................................................................................................*.................. + mul v31.8H, v6.8H, v1.H[0] // ..............................................................................................................................................................................*................. + sqrdmulh v6.8H, v6.8H, v1.H[1] // ...............................................................................................................................................................................*................ + mla v7.8H, v11.8H, v8.H[0] // ..................................................................................................................................................................*............................. + mul v30.8H, v26.8H, v0.H[6] // ................................................................................................................................................................................*............... + sqrdmulh v26.8H, v26.8H, v0.H[7] // ....................................................................................................................................................................................*........... 
+ mla v31.8H, v6.8H, v8.H[0] // ...................................................................................................................................................................................*............ + sqrdmulh v22.8H, v23.8H, v0.H[1] // ........................................................................................................................................................................................*....... + add v11.8H, v17.8H, v7.8H // ......................................................................................................................................................................*......................... + sub v6.8H, v17.8H, v7.8H // ...........................................................................................................................................................................*.................... + sub v21.8H, v19.8H, v31.8H // ......................................................................................................................................................................................*......... + add v28.8H, v19.8H, v31.8H // .......................................................................................................................................................................................*........ + mul v13.8H, v11.8H, v1.H[2] // ..........................................................................................................................................................................*..................... + mul v19.8H, v23.8H, v0.H[0] // .........................................................................................................................................................................................*...... 
+ mul v20.8H, v6.8H, v1.H[4] // .................................................................................................................................................................................*.............. + mla v30.8H, v26.8H, v8.H[0] // ...........................................................................................................................................................................................*.... + sqrdmulh v26.8H, v6.8H, v1.H[5] // ..................................................................................................................................................................................*............. + mla v19.8H, v22.8H, v8.H[0] // ............................................................................................................................................................................................*... + sqrdmulh v6.8H, v11.8H, v1.H[3] // .........................................................................................................................................................................*...................... + sub v11.8H, v16.8H, v30.8H // ..............................................................................................................................................................................................*. + mla v20.8H, v26.8H, v8.H[0] // ...............................................................................................................................................................................................* + ldr q22, [x6, #96] // ..........................................................................................................................................................................................*..... 
+ + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // add v23.8H, v24.8H, v19.8H // *............................................................................................................................................................................................... + // sub v17.8H, v24.8H, v19.8H // ..*............................................................................................................................................................................................. + // add v9.8H, v10.8H, v20.8H // ......................*......................................................................................................................................................................... + // mul v31.8H, v23.8H, v0.H[2] // ...*............................................................................................................................................................................................ + // sqrdmulh v7.8H, v17.8H, v0.H[5] // ......*......................................................................................................................................................................................... + // sqrdmulh v23.8H, v23.8H, v0.H[3] // ....*........................................................................................................................................................................................... 
+ // mul v17.8H, v17.8H, v0.H[4] // .....*.......................................................................................................................................................................................... + // add v5.8H, v22.8H, v29.8H // .........*...................................................................................................................................................................................... + // mla v13.8H, v6.8H, v8.H[0] // .......*........................................................................................................................................................................................ + // mla v31.8H, v23.8H, v8.H[0] // ........*....................................................................................................................................................................................... + // mla v17.8H, v7.8H, v8.H[0] // ..........*..................................................................................................................................................................................... + // sub v29.8H, v22.8H, v29.8H // .*.............................................................................................................................................................................................. + // sub v7.8H, v10.8H, v20.8H // .....................*.......................................................................................................................................................................... + // add v23.8H, v5.8H, v31.8H // .............*.................................................................................................................................................................................. 
+ // add v26.8H, v29.8H, v17.8H // ............................*................................................................................................................................................................... + // sub v12.8H, v5.8H, v31.8H // ............*................................................................................................................................................................................... + // mul v5.8H, v23.8H, v0.H[6] // .........................*...................................................................................................................................................................... + // sqrdmulh v31.8H, v23.8H, v0.H[7] // ........................*....................................................................................................................................................................... + // sqrdmulh v10.8H, v12.8H, v1.H[1] // ................*............................................................................................................................................................................... + // mul v12.8H, v12.8H, v1.H[0] // ...............*................................................................................................................................................................................ + // sqrdmulh v4.8H, v26.8H, v1.H[3] // ............................................*................................................................................................................................................... + // mla v5.8H, v31.8H, v8.H[0] // .....................................*.......................................................................................................................................................... 
+ // mul v31.8H, v26.8H, v1.H[2] // ...........................................*.................................................................................................................................................... + // mla v12.8H, v10.8H, v8.H[0] // ...................*............................................................................................................................................................................ + // sub v23.8H, v25.8H, v13.8H // ...........*.................................................................................................................................................................................... + // sub v24.8H, v14.8H, v5.8H // .............................................*.................................................................................................................................................. + // mla v31.8H, v4.8H, v8.H[0] // ........................................................*....................................................................................................................................... + // sub v19.8H, v15.8H, v12.8H // .......................*........................................................................................................................................................................ + // mul v10.8H, v24.8H, v2.H[0] // ................................................................*............................................................................................................................... + // sqrdmulh v22.8H, v24.8H, v2.H[1] // .................................................................*.............................................................................................................................. 
+ // sub v20.8H, v29.8H, v17.8H // ..............*................................................................................................................................................................................. + // sub v26.8H, v18.8H, v31.8H // ............................................................*................................................................................................................................... + // add v6.8H, v18.8H, v31.8H // ..........................................................................*..................................................................................................................... + // mla v10.8H, v22.8H, v8.H[0] // ......................................................................*......................................................................................................................... + // sqrdmulh v24.8H, v26.8H, v3.H[1] // .......................................................................*........................................................................................................................ + // mul v31.8H, v26.8H, v3.H[0] // ........................................................................*....................................................................................................................... + // sqrdmulh v26.8H, v6.8H, v2.H[7] // .................................................................................*.............................................................................................................. + // add v22.8H, v11.8H, v10.8H // .......................................................................................*........................................................................................................ 
+ // sub v29.8H, v11.8H, v10.8H // ......................................................................................*......................................................................................................... + // mla v31.8H, v24.8H, v8.H[0] // ............................................................................*................................................................................................................... + // str q22, [x6, #64] // .......................................................................................................*........................................................................................ + // mul v17.8H, v6.8H, v2.H[6] // ................................................................................*............................................................................................................... + // str q29, [x6, #96] // ...................................................................................................*............................................................................................ + // add v18.8H, v23.8H, v31.8H // ...............................................................................*................................................................................................................ + // sqrdmulh v4.8H, v20.8H, v1.H[5] // .................*.............................................................................................................................................................................. + // mul v22.8H, v20.8H, v1.H[4] // ..................*............................................................................................................................................................................. 
+ // sub v11.8H, v23.8H, v31.8H // ..............................................................................*................................................................................................................. + // str q18, [x14, #64] // .......................................................................................................................*........................................................................ + // mla v17.8H, v26.8H, v8.H[0] // .........................................................................................*...................................................................................................... + // str q11, [x14, #96] // ............................................................................................*................................................................................................... + // add v29.8H, v15.8H, v12.8H // .............................*.................................................................................................................................................................. + // mla v22.8H, v4.8H, v8.H[0] // ....................*........................................................................................................................................................................... + // add v31.8H, v14.8H, v5.8H // ......................................*......................................................................................................................................................... + // sqrdmulh v14.8H, v29.8H, v2.H[3] // .......................................*........................................................................................................................................................ 
+ // add v13.8H, v25.8H, v13.8H // ..................................*............................................................................................................................................................. + // sub v20.8H, v27.8H, v22.8H // ................................*............................................................................................................................................................... + // add v18.8H, v27.8H, v22.8H // ..............................*................................................................................................................................................................. + // mul v29.8H, v29.8H, v2.H[2] // ........................................*....................................................................................................................................................... + // sub v26.8H, v13.8H, v17.8H // .............................................................................................*.................................................................................................. + // mul v4.8H, v18.8H, v3.H[2] // ................................................*............................................................................................................................................... + // sqrdmulh v22.8H, v18.8H, v3.H[3] // .................................................*.............................................................................................................................................. + // mla v29.8H, v14.8H, v8.H[0] // ......................................................*......................................................................................................................................... 
+ // sqrdmulh v6.8H, v19.8H, v2.H[5] // ..........................*..................................................................................................................................................................... + // mul v19.8H, v19.8H, v2.H[4] // ...........................*.................................................................................................................................................................... + // str q26, [x14, #32] // ................................................................................................*............................................................................................... + // mul v25.8H, v20.8H, v3.H[4] // ....................................................*........................................................................................................................................... + // add v14.8H, v28.8H, v29.8H // ...........................................................*.................................................................................................................................... + // mla v19.8H, v6.8H, v8.H[0] // ...............................*................................................................................................................................................................ + // sqrdmulh v27.8H, v31.8H, v1.H[7] // .........................................*...................................................................................................................................................... + // mul v11.8H, v31.8H, v1.H[6] // ..........................................*..................................................................................................................................................... 
+ // str q14, [x6, #128] // .............................................................................*.................................................................................................................. + // sub v5.8H, v21.8H, v19.8H // ...................................*............................................................................................................................................................ + // add v14.8H, v21.8H, v19.8H // ....................................*........................................................................................................................................................... + // mla v11.8H, v27.8H, v8.H[0] // ..............................................*................................................................................................................................................. + // mla v4.8H, v22.8H, v8.H[0] // .......................................................*........................................................................................................................................ + // str q14, [x6, #192] // ...............................................*................................................................................................................................................ + // add v13.8H, v13.8H, v17.8H // ..............................................................................................*................................................................................................. + // add v6.8H, v16.8H, v30.8H // .................................*.............................................................................................................................................................. 
+ // add v24.8H, v9.8H, v4.8H // ..............................................................*................................................................................................................................. + // str q13, [x14], #16 // .........................................................................................................*...................................................................................... + // sqrdmulh v26.8H, v20.8H, v3.H[5] // .....................................................*.......................................................................................................................................... + // add v23.8H, v6.8H, v11.8H // ..................................................*............................................................................................................................................. + // str q24, [x14, #112] // .....................................................................................*.......................................................................................................... + // sub v22.8H, v6.8H, v11.8H // ...................................................*............................................................................................................................................ + // str q23, [x6], #16 // ..................................................................*............................................................................................................................. + // sub v15.8H, v28.8H, v29.8H // ..........................................................*..................................................................................................................................... 
+ // str q22, [x6, #16] // ....................................................................*........................................................................................................................... + // mla v25.8H, v26.8H, v8.H[0] // .........................................................*...................................................................................................................................... + // str q15, [x6, #144] // ...........................................................................*.................................................................................................................... + // sub v29.8H, v9.8H, v4.8H // ...................................................................*............................................................................................................................ + // str q5, [x6, #208] // .........................................................................*...................................................................................................................... + // add v17.8H, v7.8H, v25.8H // .............................................................*.................................................................................................................................. + // str q29, [x14, #144] // ..........................................................................................*..................................................................................................... + // sub v11.8H, v7.8H, v25.8H // ...............................................................*................................................................................................................................ 
+ // str q17, [x14, #176] // ..................................................................................*............................................................................................................. + // str q11, [x14, #208] // ........................................................................................*....................................................................................................... + // ldr q15, [x14, #160] // ...........................................................................................................*.................................................................................... + // ldr q16, [x14, #0] // .....................................................................*.......................................................................................................................... + // sqrdmulh v27.8H, v15.8H, v0.H[1] // .................................................................................................................*.............................................................................. + // mul v15.8H, v15.8H, v0.H[0] // ..............................................................................................................*................................................................................. + // sqrdmulh v14.8H, v16.8H, v0.H[1] // ....................................................................................*........................................................................................................... + // mul v16.8H, v16.8H, v0.H[0] // ...................................................................................*............................................................................................................ 
+ // ldr q6, [x6, #160] // .............................................................................................................*.................................................................................. + // mla v15.8H, v27.8H, v8.H[0] // ......................................................................................................................*......................................................................... + // mla v16.8H, v14.8H, v8.H[0] // ...........................................................................................*.................................................................................................... + // ldr q27, [x6, #128] // ..........................................................................................................*..................................................................................... + // sub v14.8H, v6.8H, v15.8H // .............................................................................................................................*.................................................................. + // add v15.8H, v6.8H, v15.8H // ..............................................................................................................................*................................................................. + // ldr q6, [x14, #128] // ..................................................................................................*............................................................................................. + // sqrdmulh v26.8H, v14.8H, v0.H[5] // ...................................................................................................................................*............................................................ 
+ // mul v14.8H, v14.8H, v0.H[4] // .....................................................................................................................................*.......................................................... + // mul v7.8H, v15.8H, v0.H[2] // .......................................................................................................................................*........................................................ + // sqrdmulh v11.8H, v6.8H, v0.H[1] // ......................................................................................................*......................................................................................... + // mul v6.8H, v6.8H, v0.H[0] // .....................................................................................................*.......................................................................................... + // ldr q13, [x6, #0] // .................................................................................................*.............................................................................................. + // mla v14.8H, v26.8H, v8.H[0] // .........................................................................................................................................*...................................................... + // mla v6.8H, v11.8H, v8.H[0] // ............................................................................................................*................................................................................... + // sqrdmulh v15.8H, v15.8H, v0.H[3] // ......................................................................................................................................*......................................................... 
+ // sub v26.8H, v13.8H, v16.8H // ....................................................................................................*........................................................................................... + // add v16.8H, v13.8H, v16.8H // ........................................................................................................*....................................................................................... + // sub v11.8H, v27.8H, v6.8H // ................................................................................................................*............................................................................... + // add v27.8H, v27.8H, v6.8H // ...............................................................................................................*................................................................................ + // mla v7.8H, v15.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + // ldr q15, [x14, #32] // ........................................................................................................................*....................................................................... + // sqrdmulh v6.8H, v11.8H, v0.H[5] // ....................................................................................................................*........................................................................... + // mul v11.8H, v11.8H, v0.H[4] // ...................................................................................................................*............................................................................ 
+ // sqrdmulh v13.8H, v15.8H, v0.H[1] // ............................................................................................................................*................................................................... + // mul v15.8H, v15.8H, v0.H[0] // ...........................................................................................................................*.................................................................... + // ldr q31, [x6, #32] // ........................................................................................................................................*....................................................... + // mla v11.8H, v6.8H, v8.H[0] // .........................................................................................................................*...................................................................... + // mla v15.8H, v13.8H, v8.H[0] // ................................................................................................................................*............................................................... + // mul v6.8H, v27.8H, v0.H[2] // ..................................................................................................................*............................................................................. + // sqrdmulh v13.8H, v27.8H, v0.H[3] // .....................................................................................................................*.......................................................................... + // add v25.8H, v26.8H, v11.8H // ...............................................................................................................................*................................................................ 
+ // sub v10.8H, v26.8H, v11.8H // .................................................................................................................................*.............................................................. + // sub v27.8H, v31.8H, v15.8H // ................................................................................................................................................*............................................... + // add v15.8H, v31.8H, v15.8H // ..............................................................................................................................................*................................................. + // mla v6.8H, v13.8H, v8.H[0] // ..........................................................................................................................*..................................................................... + // add v18.8H, v27.8H, v14.8H // ....................................................................................................................................................*........................................... + // sub v27.8H, v27.8H, v14.8H // ...................................................................................................................................................*............................................ + // add v14.8H, v15.8H, v7.8H // .................................................................................................................................................*.............................................. + // sub v15.8H, v15.8H, v7.8H // ...........................................................................................................................................................*.................................... 
+ // sub v26.8H, v16.8H, v6.8H // ....................................................................................................................................*........................................................... + // add v16.8H, v16.8H, v6.8H // ..................................................................................................................................*............................................................. + // ldr q6, [x14, #192] // ............................................................................................................................................*................................................... + // ldr q7, [x14, #64] // ...........................................................................................................................................*.................................................... + // mul v11.8H, v6.8H, v0.H[0] // ..................................................................................................................................................*............................................. + // sqrdmulh v6.8H, v6.8H, v0.H[1] // ......................................................................................................................................................*......................................... + // mul v13.8H, v7.8H, v0.H[0] // .............................................................................................................................................*.................................................. + // ldr q31, [x6, #192] // ........................................................................................................................................................*....................................... 
+ // mla v11.8H, v6.8H, v8.H[0] // .........................................................................................................................................................*...................................... + // ldr q6, [x14, #96] // ...............................................................................................................................................................*................................ + // sqrdmulh v7.8H, v7.8H, v0.H[1] // ...............................................................................................................................................*................................................ + // sub v22.8H, v31.8H, v11.8H // ..............................................................................................................................................................*................................. + // mla v13.8H, v7.8H, v8.H[0] // .......................................................................................................................................................*........................................ + // sqrdmulh v7.8H, v22.8H, v0.H[5] // ....................................................................................................................................................................*........................... + // mul v22.8H, v22.8H, v0.H[4] // ......................................................................................................................................................................*......................... + // sqrdmulh v17.8H, v6.8H, v0.H[1] // ...................................................................................................................................................................*............................ 
+ // mul v29.8H, v6.8H, v0.H[0] // ..................................................................................................................................................................*............................. + // add v6.8H, v31.8H, v11.8H // .............................................................................................................................................................*.................................. + // ldr q11, [x6, #64] // .....................................................................................................................................................*.......................................... + // mul v31.8H, v6.8H, v0.H[2] // ................................................................................................................................................................*............................... + // mla v22.8H, v7.8H, v8.H[0] // .............................................................................................................................................................................*.................. + // sub v7.8H, v11.8H, v13.8H // ............................................................................................................................................................*................................... + // sqrdmulh v6.8H, v6.8H, v0.H[3] // .................................................................................................................................................................*.............................. + // add v11.8H, v11.8H, v13.8H // ..........................................................................................................................................................*..................................... 
+ // add v13.8H, v7.8H, v22.8H // ..................................................................................................................................................................................*............. + // mla v29.8H, v17.8H, v8.H[0] // .........................................................................................................................................................................*...................... + // mla v31.8H, v6.8H, v8.H[0] // .....................................................................................................................................................................*.......................... + // sqrdmulh v6.8H, v13.8H, v1.H[3] // ............................................................................................................................................................................................*... + // mul v13.8H, v13.8H, v1.H[2] // ......................................................................................................................................................................................*......... + // sub v7.8H, v7.8H, v22.8H // ...................................................................................................................................................................................*............ + // sub v24.8H, v11.8H, v31.8H // ........................................................................................................................................................................*....................... + // add v11.8H, v11.8H, v31.8H // ..........................................................................................................................................................................*..................... 
+ // mul v22.8H, v24.8H, v1.H[0] // ...........................................................................................................................................................................*.................... + // sqrdmulh v31.8H, v24.8H, v1.H[1] // ............................................................................................................................................................................*................... + // mul v30.8H, v11.8H, v0.H[6] // ..............................................................................................................................................................................*................. + // mul v20.8H, v7.8H, v1.H[4] // ........................................................................................................................................................................................*....... + // sqrdmulh v7.8H, v7.8H, v1.H[5] // ..........................................................................................................................................................................................*..... + // mla v22.8H, v31.8H, v8.H[0] // ................................................................................................................................................................................*............... + // sqrdmulh v24.8H, v11.8H, v0.H[7] // ...............................................................................................................................................................................*................ + // ldr q5, [x14, #224] // .......................................................................................................................................................................*........................ 
+ // sub v21.8H, v26.8H, v22.8H // ....................................................................................................................................................................................*........... + // add v28.8H, v26.8H, v22.8H // .....................................................................................................................................................................................*.......... + // sqrdmulh v26.8H, v5.8H, v0.H[1] // .................................................................................................................................................................................*.............. + // mul v19.8H, v5.8H, v0.H[0] // .......................................................................................................................................................................................*........ + // ldr q22, [x6, #96] // ...............................................................................................................................................................................................* + // mla v30.8H, v24.8H, v8.H[0] // .........................................................................................................................................................................................*...... + // mla v19.8H, v26.8H, v8.H[0] // ...........................................................................................................................................................................................*.... + // ldr q24, [x6, #224] // ...............................................................................................*................................................................................................ 
+ // sub v11.8H, v16.8H, v30.8H // .............................................................................................................................................................................................*.. + // mla v20.8H, v7.8H, v8.H[0] // ..............................................................................................................................................................................................*. + + sub count, count, #1 + cbnz count, layer1234_start + add v23.8H, v24.8H, v19.8H // .......................................................*........................................................................................................................................ + sub v17.8H, v24.8H, v19.8H // ......................................................*......................................................................................................................................... + add v9.8H, v10.8H, v20.8H // ..................................................................................................................................*............................................................. + mul v31.8H, v23.8H, v0.H[2] // ........................................................................*....................................................................................................................... + sqrdmulh v7.8H, v17.8H, v0.H[5] // ...........................................................................................*.................................................................................................... + sqrdmulh v23.8H, v23.8H, v0.H[3] // .......................................................................*........................................................................................................................ 
+ mul v17.8H, v17.8H, v0.H[4] // ............................................................................................*................................................................................................... + add v5.8H, v22.8H, v29.8H // ...................................*............................................................................................................................................................ + mla v13.8H, v6.8H, v8.H[0] // ......................................................................................................................*......................................................................... + mla v31.8H, v23.8H, v8.H[0] // .........................................................................*...................................................................................................................... + mla v17.8H, v7.8H, v8.H[0] // .............................................................................................*.................................................................................................. + sub v29.8H, v22.8H, v29.8H // ..................................*............................................................................................................................................................. + sub v7.8H, v10.8H, v20.8H // .................................................................................................................................*.............................................................. + add v23.8H, v5.8H, v31.8H // ...........................................................................*.................................................................................................................... 
+ add v26.8H, v29.8H, v17.8H // ...............................................................................................*................................................................................................ + sub v12.8H, v5.8H, v31.8H // ..........................................................................*..................................................................................................................... + mul v5.8H, v23.8H, v0.H[6] // ......................................................................................................*......................................................................................... + sqrdmulh v31.8H, v23.8H, v0.H[7] // .....................................................................................................*.......................................................................................... + sqrdmulh v10.8H, v12.8H, v1.H[1] // ...............................................................................................................*................................................................................ + mul v12.8H, v12.8H, v1.H[0] // ................................................................................................................*............................................................................... + sqrdmulh v4.8H, v26.8H, v1.H[3] // .........................................................................................................................*...................................................................... + mla v5.8H, v31.8H, v8.H[0] // .......................................................................................................*........................................................................................ 
+ mul v31.8H, v26.8H, v1.H[2] // ..........................................................................................................................*..................................................................... + mla v12.8H, v10.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sub v23.8H, v25.8H, v13.8H // .......................................................................................................................*........................................................................ + sub v24.8H, v14.8H, v5.8H // ........................................................................................................*....................................................................................... + mla v31.8H, v4.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + sub v19.8H, v15.8H, v12.8H // ..................................................................................................................*............................................................................. + mul v10.8H, v24.8H, v2.H[0] // ..............................................................................................................................................*................................................. + sqrdmulh v22.8H, v24.8H, v2.H[1] // .............................................................................................................................................*.................................................. 
+ sub v20.8H, v29.8H, v17.8H // ..............................................................................................*................................................................................................. + sub v26.8H, v18.8H, v31.8H // ............................................................................................................................*................................................................... + add v6.8H, v18.8H, v31.8H // .............................................................................................................................*.................................................................. + mla v10.8H, v22.8H, v8.H[0] // ...............................................................................................................................................*................................................ + sqrdmulh v24.8H, v26.8H, v3.H[1] // .................................................................................................................................................................*.............................. + mul v31.8H, v26.8H, v3.H[0] // ..................................................................................................................................................................*............................. + sqrdmulh v26.8H, v6.8H, v2.H[7] // ............................................................................................................................................................*................................... + add v22.8H, v11.8H, v10.8H // .................................................................................................................................................*.............................................. 
+ sub v29.8H, v11.8H, v10.8H // ................................................................................................................................................*............................................... + mla v31.8H, v24.8H, v8.H[0] // ...................................................................................................................................................................*............................ + str q22, [x6, #64] // ..................................................................................................................................................................................*............. + mul v17.8H, v6.8H, v2.H[6] // .............................................................................................................................................................*.................................. + str q29, [x6, #96] // ...................................................................................................................................................................................*............ + add v18.8H, v23.8H, v31.8H // .....................................................................................................................................................................*.......................... + sqrdmulh v4.8H, v20.8H, v1.H[5] // ...................................................................................................................................*............................................................ + mul v22.8H, v20.8H, v1.H[4] // ....................................................................................................................................*........................................................... 
+ sub v11.8H, v23.8H, v31.8H // ....................................................................................................................................................................*........................... + str q18, [x14, #64] // ..........................................................................................................................................................................................*..... + mla v17.8H, v26.8H, v8.H[0] // ..............................................................................................................................................................*................................. + str q11, [x14, #96] // ...........................................................................................................................................................................................*.... + add v29.8H, v15.8H, v12.8H // ...................................................................................................................*............................................................................ + mla v22.8H, v4.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + add v31.8H, v14.8H, v5.8H // .........................................................................................................*...................................................................................... + sqrdmulh v14.8H, v29.8H, v2.H[3] // ..................................................................................................................................................*............................................. 
+ add v13.8H, v25.8H, v13.8H // ........................................................................................................................*....................................................................... + sub v20.8H, v27.8H, v22.8H // ......................................................................................................................................*......................................................... + add v18.8H, v27.8H, v22.8H // .......................................................................................................................................*........................................................ + mul v29.8H, v29.8H, v2.H[2] // ...................................................................................................................................................*............................................ + sub v26.8H, v13.8H, v17.8H // ...............................................................................................................................................................*................................ + mul v4.8H, v18.8H, v3.H[2] // .......................................................................................................................................................................*........................ + sqrdmulh v22.8H, v18.8H, v3.H[3] // ......................................................................................................................................................................*......................... + mla v29.8H, v14.8H, v8.H[0] // ....................................................................................................................................................*........................................... 
+ sqrdmulh v6.8H, v19.8H, v2.H[5] // .......................................................................................................................................................*........................................ + mul v19.8H, v19.8H, v2.H[4] // ........................................................................................................................................................*....................................... + str q26, [x14, #32] // .........................................................................................................................................................................................*...... + mul v25.8H, v20.8H, v3.H[4] // ............................................................................................................................................................................*................... + add v14.8H, v28.8H, v29.8H // ......................................................................................................................................................*......................................... + mla v19.8H, v6.8H, v8.H[0] // .........................................................................................................................................................*...................................... + sqrdmulh v27.8H, v31.8H, v1.H[7] // ........................................................................................................................................*....................................................... + mul v11.8H, v31.8H, v1.H[6] // .........................................................................................................................................*...................................................... 
+ str q14, [x6, #128] // ....................................................................................................................................................................................*........... + sub v5.8H, v21.8H, v19.8H // ..........................................................................................................................................................*..................................... + add v14.8H, v21.8H, v19.8H // ...........................................................................................................................................................*.................................... + mla v11.8H, v27.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + mla v4.8H, v22.8H, v8.H[0] // ........................................................................................................................................................................*....................... + str q14, [x6, #192] // ......................................................................................................................................................................................*......... + add v13.8H, v13.8H, v17.8H // ................................................................................................................................................................*............................... + add v6.8H, v16.8H, v30.8H // ....................................................................................................*........................................................................................... 
+ add v24.8H, v9.8H, v4.8H // ..........................................................................................................................................................................*..................... + str q13, [x14], #16 // ........................................................................................................................................................................................*....... + sqrdmulh v26.8H, v20.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + add v23.8H, v6.8H, v11.8H // ............................................................................................................................................*................................................... + str q24, [x14, #112] // ............................................................................................................................................................................................*... + sub v22.8H, v6.8H, v11.8H // ...........................................................................................................................................*.................................................... + str q23, [x6], #16 // ................................................................................................................................................................................*............... + sub v15.8H, v28.8H, v29.8H // .....................................................................................................................................................*.......................................... + str q22, [x6, #16] // .................................................................................................................................................................................*.............. 
+ mla v25.8H, v26.8H, v8.H[0] // .............................................................................................................................................................................*.................. + str q15, [x6, #144] // .....................................................................................................................................................................................*.......... + sub v29.8H, v9.8H, v4.8H // .........................................................................................................................................................................*...................... + str q5, [x6, #208] // .......................................................................................................................................................................................*........ + add v17.8H, v7.8H, v25.8H // ...............................................................................................................................................................................*................ + str q29, [x14, #144] // .............................................................................................................................................................................................*.. + sub v11.8H, v7.8H, v25.8H // ..............................................................................................................................................................................*................. + str q17, [x14, #176] // ..............................................................................................................................................................................................*. 
+ str q11, [x14, #208] // ...............................................................................................................................................................................................* + + restore inp, STACK0 + mov count, #4 + + ASM_LOAD(r_ptr1, roots_l456) + + add src0, inp, #256*0 + add src1, inp, #256*1 + + .p2align 2 + // Instructions: 118 + // Expected cycles: 152 + // Expected IPC: 0.78 + // + // Cycle bound: 152.0 + // IPC bound: 0.78 + // + // Wall time: 3.08s + // User time: 3.08s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ldr q15, [x4], #16*14 // ..*................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x6] // *..................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ ld4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x7] // .*.................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q14, [x4, #-64] // .................*.................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v6.4S, v16.4S, v27.4S // ....*................................................................................................................. + // gap // ...................................................................................................................... + trn2 v26.4S, v18.4S, v29.4S // .....*................................................................................................................ + // gap // ...................................................................................................................... + mul v0.8H, v6.8H, v15.8H // ......*............................................................................................................... + // gap // ...................................................................................................................... 
+ mul v7.8H, v26.8H, v15.8H // ........*............................................................................................................. + // gap // ...................................................................................................................... + trn2 v11.4S, v19.4S, v30.4S // .........*............................................................................................................ + // gap // ...................................................................................................................... + trn2 v13.4S, v17.4S, v28.4S // ...........................*.......................................................................................... + // gap // ...................................................................................................................... + mul v31.8H, v11.8H, v15.8H // ...........*.......................................................................................................... + // gap // ...................................................................................................................... + mul v15.8H, v13.8H, v15.8H // ..............................*....................................................................................... + // gap // ...................................................................................................................... + trn1 v16.4S, v16.4S, v27.4S // ..................*................................................................................................... + // gap // ...................................................................................................................... + trn1 v27.4S, v17.4S, v28.4S // .........................................*............................................................................ + // gap // ...................................................................................................................... 
+ trn1 v25.4S, v18.4S, v29.4S // ...*.................................................................................................................. + // gap // ...................................................................................................................... + trn1 v22.4S, v19.4S, v30.4S // ...............*...................................................................................................... + // gap // ...................................................................................................................... + ldr q3, [x4, #-208] // .......*.............................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q2, [x4, #-160] // ....................*................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v26.8H, v3.8H // ..........*........................................................................................................... + // gap // ...................................................................................................................... 
+ sqrdmulh v11.8H, v11.8H, v3.8H // ............*......................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v6.8H, v3.8H // .............*........................................................................................................ + // gap // ...................................................................................................................... + sqrdmulh v13.8H, v13.8H, v3.8H // .............................*........................................................................................ + // gap // ...................................................................................................................... + mla v7.8H, v26.8H, v8.H[0] // ..............*....................................................................................................... + // gap // ...................................................................................................................... + mla v31.8H, v11.8H, v8.H[0] // ................*..................................................................................................... + // gap // ...................................................................................................................... + mla v0.8H, v6.8H, v8.H[0] // ................................*..................................................................................... + // gap // ...................................................................................................................... + ldr q6, [x4, #-144] // ..........................*........................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v26.8H, v22.8H, v31.8H // ...................*.................................................................................................. + // gap // ...................................................................................................................... + add v11.8H, v22.8H, v31.8H // ............................*......................................................................................... + // gap // ...................................................................................................................... + add v31.8H, v25.8H, v7.8H // ..................................*................................................................................... + // gap // ...................................................................................................................... + mul v22.8H, v26.8H, v2.8H // ......................*............................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v26.8H, v6.8H // .................................................*.................................................................... + // gap // ...................................................................................................................... + mla v15.8H, v13.8H, v8.H[0] // ....................................*................................................................................. + // gap // ...................................................................................................................... 
+ add v13.8H, v16.8H, v0.8H // ...................................*.................................................................................. + // gap // ...................................................................................................................... + sub v7.8H, v25.8H, v7.8H // ........................................*............................................................................. + // gap // ...................................................................................................................... + mla v22.8H, v26.8H, v8.H[0] // .........................................................*............................................................ + // gap // ...................................................................................................................... + sub v26.8H, v27.8H, v15.8H // ........................................................*............................................................. + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v7.8H, v6.8H // ...........................................*.......................................................................... + // gap // ...................................................................................................................... + add v15.8H, v27.8H, v15.8H // .............................................*........................................................................ + // gap // ...................................................................................................................... + add v27.8H, v26.8H, v22.8H // ..............................................................*....................................................... + // gap // ...................................................................................................................... 
+ sub v26.8H, v26.8H, v22.8H // .............................................................*........................................................ + // gap // ...................................................................................................................... + sub v16.8H, v16.8H, v0.8H // ...............................................................................*...................................... + // gap // ...................................................................................................................... + mul v14.8H, v27.8H, v14.8H // ..................................................................*................................................... + // gap // ...................................................................................................................... + mul v0.8H, v7.8H, v2.8H // .......................................................................*.............................................. + // gap // ...................................................................................................................... + ldr q7, [x4, #-96] // .....................*................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q25, [x4, #-48] // .......................*.............................................................................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v0.8H, v6.8H, v8.H[0] // ............................................................................*......................................... + // gap // ...................................................................................................................... + ldr q6, [x4, #-32] // ........................*............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v27.8H, v25.8H // ...........................................................................*.......................................... + // gap // ...................................................................................................................... + add v25.8H, v16.8H, v0.8H // ..................................................................................*................................... + // gap // ...................................................................................................................... + mul v6.8H, v26.8H, v6.8H // ................................................................................*..................................... + // gap // ...................................................................................................................... 
+ sub v16.8H, v16.8H, v0.8H // .........................................................................................*............................ + // gap // ...................................................................................................................... + mla v14.8H, v27.8H, v8.H[0] // .................................................................................*.................................... + // gap // ...................................................................................................................... + ldr q27, [x4, #-112] // .........................*............................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q0, [x4, #-192] // ...............................*...................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v22.8H, v25.8H, v14.8H // ....................................................................................*................................. + // gap // ...................................................................................................................... 
+ sub v14.8H, v25.8H, v14.8H // .....................................................................................*................................ + // gap // ...................................................................................................................... + mul v25.8H, v31.8H, v0.8H // .....................................*................................................................................ + // gap // ...................................................................................................................... + mul v0.8H, v11.8H, v0.8H // ............................................*......................................................................... + // gap // ...................................................................................................................... + sqdmulh v3.8H, v22.8H, v8.H[1] // .......................................................................................*.............................. + // gap // ...................................................................................................................... + sqdmulh v2.8H, v14.8H, v8.H[1] // ........................................................................................*............................. + // gap // ...................................................................................................................... + ldr q17, [x4, #-176] // .................................*.................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ srshr v3.8H, v3.8H, #11 // ...........................................................................................*.......................... + // gap // ...................................................................................................................... + srshr v2.8H, v2.8H, #11 // ............................................................................................*......................... + // gap // ...................................................................................................................... + sqrdmulh v31.8H, v31.8H, v17.8H // ......................................*............................................................................... + // gap // ...................................................................................................................... + sqrdmulh v28.8H, v11.8H, v17.8H // .......................................*.............................................................................. + // gap // ...................................................................................................................... + ldr q11, [x4, #-16] // ...................................................................................*.................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v25.8H, v31.8H, v8.H[0] // ..........................................*........................................................................... + // gap // ...................................................................................................................... 
+ mla v0.8H, v28.8H, v8.H[0] // ................................................*..................................................................... + // gap // ...................................................................................................................... + ldr q28, [x4, #-80] // ..................................................*................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v31.8H, v13.8H, v25.8H // ..............................................*....................................................................... + // gap // ...................................................................................................................... + add v13.8H, v13.8H, v25.8H // ...............................................*...................................................................... + // gap // ...................................................................................................................... + add v25.8H, v15.8H, v0.8H // ...................................................*.................................................................. + // gap // ...................................................................................................................... + ldr q21, [x4, #-128] // ....................................................*................................................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v15.8H, v15.8H, v0.8H // ......................................................*............................................................... + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v25.8H, v27.8H // .....................................................*................................................................ + // gap // ...................................................................................................................... + mul v0.8H, v25.8H, v21.8H // .......................................................*.............................................................. + // gap // ...................................................................................................................... + mul v21.8H, v15.8H, v7.8H // ..........................................................*........................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.8H, v15.8H, v28.8H // ...........................................................*.......................................................... + // gap // ...................................................................................................................... + mla v22.8H, v3.8H, v8.H[0] // ...............................................................................................*...................... + // gap // ...................................................................................................................... 
+ mla v0.8H, v27.8H, v8.H[0] // ............................................................*......................................................... + // gap // ...................................................................................................................... + mla v14.8H, v2.8H, v8.H[0] // ................................................................................................*..................... + // gap // ...................................................................................................................... + mla v21.8H, v15.8H, v8.H[0] // ...............................................................*...................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.8H, v26.8H, v11.8H // ......................................................................................*............................... + // gap // ...................................................................................................................... + add v27.8H, v13.8H, v0.8H // ................................................................*..................................................... + // gap // ...................................................................................................................... + sub v26.8H, v13.8H, v0.8H // .................................................................*.................................................... + // gap // ...................................................................................................................... + sub v0.8H, v31.8H, v21.8H // ...................................................................*.................................................. + // gap // ...................................................................................................................... 
+ add v7.8H, v31.8H, v21.8H // .....................................................................*................................................ + // gap // ...................................................................................................................... + sqdmulh v11.8H, v26.8H, v8.H[1] // ....................................................................*................................................. + // gap // ...................................................................................................................... + sqdmulh v13.8H, v0.8H, v8.H[1] // ......................................................................*............................................... + // gap // ...................................................................................................................... + sqdmulh v31.8H, v7.8H, v8.H[1] // .........................................................................*............................................ + // gap // ...................................................................................................................... + mla v6.8H, v15.8H, v8.H[0] // ..........................................................................................*........................... + // gap // ...................................................................................................................... + srshr v15.8H, v11.8H, #11 // ........................................................................*............................................. + // gap // ...................................................................................................................... + srshr v11.8H, v13.8H, #11 // ..........................................................................*........................................... + // gap // ...................................................................................................................... 
+ srshr v13.8H, v31.8H, #11 // ..............................................................................*....................................... + // gap // ...................................................................................................................... + sqdmulh v31.8H, v27.8H, v8.H[1] // .............................................................................................*........................ + // gap // ...................................................................................................................... + mla v0.8H, v11.8H, v8.H[0] // .............................................................................*........................................ + // gap // ...................................................................................................................... + sub v11.8H, v16.8H, v6.8H // ..............................................................................................*....................... + // gap // ...................................................................................................................... + add v16.8H, v16.8H, v6.8H // .................................................................................................*.................... + // gap // ...................................................................................................................... + srshr v6.8H, v31.8H, #11 // ..................................................................................................*................... + // gap // ...................................................................................................................... + sqdmulh v31.8H, v11.8H, v8.H[1] // ...................................................................................................*.................. + // gap // ...................................................................................................................... 
+ sqdmulh v25.8H, v16.8H, v8.H[1] // ....................................................................................................*................. + // gap // ...................................................................................................................... + mla v27.8H, v6.8H, v8.H[0] // .....................................................................................................*................ + // gap // ...................................................................................................................... + mla v7.8H, v13.8H, v8.H[0] // ......................................................................................................*............... + // gap // ...................................................................................................................... + srshr v6.8H, v31.8H, #11 // .......................................................................................................*.............. + // gap // ...................................................................................................................... + srshr v13.8H, v25.8H, #11 // ........................................................................................................*............. + // gap // ...................................................................................................................... + trn1 v3.4S, v27.4S, v22.4S // .........................................................................................................*............ + // gap // ...................................................................................................................... + mla v11.8H, v6.8H, v8.H[0] // ..........................................................................................................*........... + // gap // ...................................................................................................................... 
+ mla v16.8H, v13.8H, v8.H[0] // ...........................................................................................................*.......... + // gap // ...................................................................................................................... + mla v26.8H, v15.8H, v8.H[0] // ............................................................................................................*......... + // gap // ...................................................................................................................... + trn2 v27.4S, v27.4S, v22.4S // .............................................................................................................*........ + // gap // ...................................................................................................................... + trn1 v6.4S, v0.4S, v11.4S // ..............................................................................................................*....... + // gap // ...................................................................................................................... + trn2 v29.4S, v7.4S, v16.4S // ...............................................................................................................*...... + // gap // ...................................................................................................................... + trn1 v4.4S, v26.4S, v14.4S // ................................................................................................................*..... + // gap // ...................................................................................................................... + trn2 v28.4S, v26.4S, v14.4S // .................................................................................................................*.... + // gap // ...................................................................................................................... 
+ trn2 v30.4S, v0.4S, v11.4S // ..................................................................................................................*... + // gap // ...................................................................................................................... + trn1 v5.4S, v7.4S, v16.4S // ...................................................................................................................*.. + // gap // ...................................................................................................................... + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x7], #64 // ....................................................................................................................*. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x6], #64 // .....................................................................................................................* + // gap // ...................................................................................................................... + + // --------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + // ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x6] // .*.................................................................................................................... + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x7] // ..*................................................................................................................... + // ldr q15, [x4], #16*14 // *..................................................................................................................... + // trn1 v13.4S, v22.4S, v11.4S // ..............*....................................................................................................... + // trn2 v14.4S, v20.4S, v9.4S // ....*................................................................................................................. + // trn2 v2.4S, v22.4S, v11.4S // .....*................................................................................................................ + // mul v17.8H, v14.8H, v15.8H // ......*............................................................................................................... + // ldr q19, [x4, #-208] // ................*..................................................................................................... + // mul v26.8H, v2.8H, v15.8H // .......*.............................................................................................................. 
+ // trn2 v6.4S, v23.4S, v12.4S // ........*............................................................................................................. + // sqrdmulh v28.8H, v2.8H, v19.8H // ..................*................................................................................................... + // mul v16.8H, v6.8H, v15.8H // ..........*........................................................................................................... + // sqrdmulh v27.8H, v6.8H, v19.8H // ...................*.................................................................................................. + // sqrdmulh v14.8H, v14.8H, v19.8H // ....................*................................................................................................. + // mla v26.8H, v28.8H, v8.H[0] // ......................*............................................................................................... + // trn1 v0.4S, v23.4S, v12.4S // ...............*...................................................................................................... + // mla v16.8H, v27.8H, v8.H[0] // .......................*.............................................................................................. + // ldr q31, [x4, #-64] // ...*.................................................................................................................. + // trn1 v24.4S, v20.4S, v9.4S // ............*......................................................................................................... + // sub v7.8H, v0.8H, v16.8H // ..........................*........................................................................................... + // ldr q23, [x4, #-160] // .................*.................................................................................................... + // ldr q22, [x4, #-96] // ...........................................*.......................................................................... 
+ // mul v4.8H, v7.8H, v23.8H // .............................*........................................................................................ + // ldr q25, [x4, #-48] // ............................................*......................................................................... + // ldr q28, [x4, #-32] // ..............................................*....................................................................... + // ldr q29, [x4, #-112] // ....................................................*................................................................. + // ldr q1, [x4, #-144] // .........................*............................................................................................ + // trn2 v11.4S, v21.4S, v10.4S // .........*............................................................................................................ + // add v6.8H, v0.8H, v16.8H // ...........................*.......................................................................................... + // sqrdmulh v27.8H, v11.8H, v19.8H // .....................*................................................................................................ + // mul v11.8H, v11.8H, v15.8H // ...........*.......................................................................................................... + // ldr q3, [x4, #-192] // .....................................................*................................................................ + // mla v17.8H, v14.8H, v8.H[0] // ........................*............................................................................................. + // ldr q0, [x4, #-176] // ............................................................*......................................................... + // add v16.8H, v13.8H, v26.8H // ............................*......................................................................................... 
+ // add v14.8H, v24.8H, v17.8H // ................................*..................................................................................... + // mla v11.8H, v27.8H, v8.H[0] // ...............................*...................................................................................... + // mul v27.8H, v16.8H, v3.8H // ........................................................*............................................................. + // sqrdmulh v15.8H, v16.8H, v0.8H // ...............................................................*...................................................... + // sqrdmulh v16.8H, v6.8H, v0.8H // ................................................................*..................................................... + // sub v13.8H, v13.8H, v26.8H // .................................*.................................................................................... + // trn1 v2.4S, v21.4S, v10.4S // .............*........................................................................................................ + // mla v27.8H, v15.8H, v8.H[0] // ..................................................................*................................................... + // sqrdmulh v5.8H, v13.8H, v1.8H // ....................................*................................................................................. + // mul v15.8H, v6.8H, v3.8H // .........................................................*............................................................ + // add v3.8H, v2.8H, v11.8H // .....................................*................................................................................ + // sub v0.8H, v14.8H, v27.8H // .....................................................................*................................................ 
+ // add v26.8H, v14.8H, v27.8H // ......................................................................*............................................... + // mla v15.8H, v16.8H, v8.H[0] // ...................................................................*.................................................. + // sqrdmulh v19.8H, v7.8H, v1.8H // ..............................*....................................................................................... + // ldr q10, [x4, #-80] // ....................................................................*................................................. + // add v12.8H, v3.8H, v15.8H // .......................................................................*.............................................. + // ldr q16, [x4, #-128] // ........................................................................*............................................. + // sqrdmulh v14.8H, v12.8H, v29.8H // ..........................................................................*........................................... + // sub v21.8H, v3.8H, v15.8H // .........................................................................*............................................ + // mul v27.8H, v12.8H, v16.8H // ...........................................................................*.......................................... + // sub v6.8H, v2.8H, v11.8H // ...................................*.................................................................................. + // mla v4.8H, v19.8H, v8.H[0] // ..................................*................................................................................... + // mul v15.8H, v21.8H, v22.8H // ............................................................................*......................................... + // sqrdmulh v16.8H, v21.8H, v10.8H // .............................................................................*........................................ 
+ // mla v27.8H, v14.8H, v8.H[0] // ...............................................................................*...................................... + // sub v1.8H, v6.8H, v4.8H // .......................................*.............................................................................. + // add v14.8H, v6.8H, v4.8H // ......................................*............................................................................... + // mla v15.8H, v16.8H, v8.H[0] // .................................................................................*.................................... + // add v22.8H, v26.8H, v27.8H // ...................................................................................*.................................. + // sub v3.8H, v26.8H, v27.8H // ....................................................................................*................................. + // mul v27.8H, v14.8H, v31.8H // .........................................*............................................................................ + // sub v31.8H, v0.8H, v15.8H // .....................................................................................*................................ + // sqdmulh v16.8H, v3.8H, v8.H[1] // .......................................................................................*.............................. + // add v11.8H, v0.8H, v15.8H // ......................................................................................*............................... + // sqdmulh v15.8H, v31.8H, v8.H[1] // ........................................................................................*............................. + // mul v30.8H, v13.8H, v23.8H // ..........................................*........................................................................... 
+ // srshr v13.8H, v16.8H, #11 // ...........................................................................................*.......................... + // sqdmulh v16.8H, v11.8H, v8.H[1] // .........................................................................................*............................ + // srshr v15.8H, v15.8H, #11 // ............................................................................................*......................... + // sqrdmulh v26.8H, v14.8H, v25.8H // ...............................................*...................................................................... + // mla v30.8H, v5.8H, v8.H[0] // .............................................*........................................................................ + // mla v31.8H, v15.8H, v8.H[0] // ...............................................................................................*...................... + // srshr v14.8H, v16.8H, #11 // .............................................................................................*........................ + // sub v21.8H, v24.8H, v17.8H // ........................................*............................................................................. + // mul v6.8H, v1.8H, v28.8H // .................................................*.................................................................... + // mla v27.8H, v26.8H, v8.H[0] // ...................................................*.................................................................. + // add v2.8H, v21.8H, v30.8H // ................................................*..................................................................... + // ldr q4, [x4, #-16] // .................................................................*.................................................... + // add v10.8H, v2.8H, v27.8H // ......................................................*............................................................... 
+ // sub v7.8H, v2.8H, v27.8H // .......................................................*.............................................................. + // sqrdmulh v26.8H, v1.8H, v4.8H // ..................................................................................*................................... + // sqdmulh v16.8H, v10.8H, v8.H[1] // ..........................................................*........................................................... + // sqdmulh v15.8H, v7.8H, v8.H[1] // ...........................................................*.......................................................... + // sub v24.8H, v21.8H, v30.8H // ..................................................*................................................................... + // mla v6.8H, v26.8H, v8.H[0] // ..........................................................................................*........................... + // srshr v16.8H, v16.8H, #11 // .............................................................*........................................................ + // srshr v15.8H, v15.8H, #11 // ..............................................................*....................................................... + // sqdmulh v27.8H, v22.8H, v8.H[1] // ..............................................................................................*....................... + // sub v0.8H, v24.8H, v6.8H // ................................................................................................*..................... + // mla v10.8H, v16.8H, v8.H[0] // ..............................................................................*....................................... + // mla v7.8H, v15.8H, v8.H[0] // ................................................................................*..................................... 
+ // add v26.8H, v24.8H, v6.8H // .................................................................................................*.................... + // srshr v6.8H, v27.8H, #11 // ..................................................................................................*................... + // sqdmulh v15.8H, v0.8H, v8.H[1] // ...................................................................................................*.................. + // sqdmulh v16.8H, v26.8H, v8.H[1] // ....................................................................................................*................. + // mla v22.8H, v6.8H, v8.H[0] // .....................................................................................................*................ + // mla v11.8H, v14.8H, v8.H[0] // ......................................................................................................*............... + // srshr v15.8H, v15.8H, #11 // .......................................................................................................*.............. + // srshr v14.8H, v16.8H, #11 // ........................................................................................................*............. + // trn1 v27.4S, v22.4S, v10.4S // .........................................................................................................*............ + // mla v0.8H, v15.8H, v8.H[0] // ..........................................................................................................*........... + // mla v26.8H, v14.8H, v8.H[0] // ...........................................................................................................*.......... + // mla v3.8H, v13.8H, v8.H[0] // ............................................................................................................*......... 
+ // trn2 v15.4S, v22.4S, v10.4S // .............................................................................................................*........ + // trn1 v30.4S, v31.4S, v0.4S // ..............................................................................................................*....... + // trn2 v17.4S, v11.4S, v26.4S // ...............................................................................................................*...... + // trn1 v28.4S, v3.4S, v7.4S // ................................................................................................................*..... + // trn2 v16.4S, v3.4S, v7.4S // .................................................................................................................*.... + // trn2 v18.4S, v31.4S, v0.4S // ..................................................................................................................*... + // trn1 v29.4S, v11.4S, v26.4S // ...................................................................................................................*.. + // st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // ....................................................................................................................*. 
+ // st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x6], #64 // .....................................................................................................................* + + sub count, count, #1 +layer567_start: + // Instructions: 118 + // Expected cycles: 156 + // Expected IPC: 0.76 + // + // Cycle bound: 156.0 + // IPC bound: 0.76 + // + // Wall time: 508.40s + // User time: 508.40s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x6] // e..................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x7] // .e.................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q15, [x4], #16*14 // ..........e........................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + trn1 v13.4S, v22.4S, v11.4S // ......e............................................................................................................... + // gap // ...................................................................................................................... + trn2 v14.4S, v20.4S, v9.4S // ...e.................................................................................................................. + // gap // ...................................................................................................................... + trn2 v2.4S, v22.4S, v11.4S // .......e.............................................................................................................. + // gap // ...................................................................................................................... + mul v17.8H, v14.8H, v15.8H // .........................e............................................................................................ + // gap // ...................................................................................................................... + ldr q19, [x4, #-208] // ...........e.......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v26.8H, v2.8H, v15.8H // ...................................e.................................................................................. 
+ // gap // ...................................................................................................................... + trn2 v6.4S, v23.4S, v12.4S // .........e............................................................................................................ + // gap // ...................................................................................................................... + sqrdmulh v28.8H, v2.8H, v19.8H // ..................................e................................................................................... + // gap // ...................................................................................................................... + mul v16.8H, v6.8H, v15.8H // ........................................e............................................................................. + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v6.8H, v19.8H // .......................................e.............................................................................. + // gap // ...................................................................................................................... + sqrdmulh v14.8H, v14.8H, v19.8H // ........................e............................................................................................. + // gap // ...................................................................................................................... + mla v26.8H, v28.8H, v8.H[0] // ....................................e................................................................................. + // gap // ...................................................................................................................... + trn1 v0.4S, v23.4S, v12.4S // ........e............................................................................................................. 
+ // gap // ...................................................................................................................... + mla v16.8H, v27.8H, v8.H[0] // .........................................e............................................................................ + // gap // ...................................................................................................................... + ldr q31, [x4, #-64] // ....................e................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v24.4S, v20.4S, v9.4S // ..e................................................................................................................... + // gap // ...................................................................................................................... + sub v7.8H, v0.8H, v16.8H // ..........................................e........................................................................... + // gap // ...................................................................................................................... + ldr q23, [x4, #-160] // ..............e....................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + ldr q22, [x4, #-96] // ..................e................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v4.8H, v7.8H, v23.8H // ............................................................e......................................................... + // gap // ...................................................................................................................... + ldr q25, [x4, #-48] // .....................e................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q28, [x4, #-32] // ......................e............................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + ldr q29, [x4, #-112] // .................e.................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q1, [x4, #-144] // ...............e...................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v11.4S, v21.4S, v10.4S // .....e................................................................................................................ + // gap // ...................................................................................................................... + add v6.8H, v0.8H, v16.8H // ...........................................e.......................................................................... + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v11.8H, v19.8H // .............................e........................................................................................ 
+ // gap // ...................................................................................................................... + mul v11.8H, v11.8H, v15.8H // ..............................e....................................................................................... + // gap // ...................................................................................................................... + ldr q3, [x4, #-192] // ............e......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v17.8H, v14.8H, v8.H[0] // ..........................e........................................................................................... + // gap // ...................................................................................................................... + ldr q0, [x4, #-176] // .............e........................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v16.8H, v13.8H, v26.8H // ......................................e............................................................................... 
+ // gap // ...................................................................................................................... + add v14.8H, v24.8H, v17.8H // ............................e......................................................................................... + // gap // ...................................................................................................................... + mla v11.8H, v27.8H, v8.H[0] // ...............................e...................................................................................... + // gap // ...................................................................................................................... + mul v27.8H, v16.8H, v3.8H // .............................................e........................................................................ + // gap // ...................................................................................................................... + sqrdmulh v15.8H, v16.8H, v0.8H // ............................................e......................................................................... + // gap // ...................................................................................................................... + sqrdmulh v16.8H, v6.8H, v0.8H // .................................................e.................................................................... + // gap // ...................................................................................................................... + sub v13.8H, v13.8H, v26.8H // .....................................e................................................................................ + // gap // ...................................................................................................................... + trn1 v2.4S, v21.4S, v10.4S // ....e................................................................................................................. 
+ // gap // ...................................................................................................................... + mla v27.8H, v15.8H, v8.H[0] // ..............................................e....................................................................... + // gap // ...................................................................................................................... + sqrdmulh v5.8H, v13.8H, v1.8H // ......................................................e............................................................... + // gap // ...................................................................................................................... + mul v15.8H, v6.8H, v3.8H // ..................................................e................................................................... + // gap // ...................................................................................................................... + add v3.8H, v2.8H, v11.8H // .................................e.................................................................................... + // gap // ...................................................................................................................... + sub v0.8H, v14.8H, v27.8H // ...............................................e...................................................................... + // gap // ...................................................................................................................... + add v26.8H, v14.8H, v27.8H // ................................................e..................................................................... + // gap // ...................................................................................................................... + mla v15.8H, v16.8H, v8.H[0] // ...................................................e.................................................................. 
+ // gap // ...................................................................................................................... + sqrdmulh v19.8H, v7.8H, v1.8H // ...........................................................e.......................................................... + // gap // ...................................................................................................................... + ldr q10, [x4, #-80] // ...................e.................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v12.8H, v3.8H, v15.8H // .....................................................e................................................................ + // gap // ...................................................................................................................... + ldr q16, [x4, #-128] // ................e..................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v14.8H, v12.8H, v29.8H // ................................................................e..................................................... 
+ // gap // ...................................................................................................................... + sub v21.8H, v3.8H, v15.8H // ....................................................e................................................................. + // gap // ...................................................................................................................... + mul v27.8H, v12.8H, v16.8H // .................................................................e.................................................... + // gap // ...................................................................................................................... + sub v6.8H, v2.8H, v11.8H // ................................e..................................................................................... + // gap // ...................................................................................................................... + mla v4.8H, v19.8H, v8.H[0] // .............................................................e........................................................ + // gap // ...................................................................................................................... + mul v15.8H, v21.8H, v22.8H // ......................................................................e............................................... + // gap // ...................................................................................................................... + sqrdmulh v16.8H, v21.8H, v10.8H // .....................................................................e................................................ + // gap // ...................................................................................................................... + mla v27.8H, v14.8H, v8.H[0] // ..................................................................e................................................... 
+ // gap // ...................................................................................................................... + sub v1.8H, v6.8H, v4.8H // ..............................................................e....................................................... + // gap // ...................................................................................................................... + add v14.8H, v6.8H, v4.8H // ...............................................................e...................................................... + // gap // ...................................................................................................................... + mla v15.8H, v16.8H, v8.H[0] // .......................................................................e.............................................. + // gap // ...................................................................................................................... + add v22.8H, v26.8H, v27.8H // ....................................................................e................................................. + // gap // ...................................................................................................................... + sub v3.8H, v26.8H, v27.8H // ...................................................................e.................................................. + // gap // ...................................................................................................................... + mul v27.8H, v14.8H, v31.8H // ...........................................................................e.......................................... + // gap // ...................................................................................................................... + sub v31.8H, v0.8H, v15.8H // ........................................................................e............................................. 
+ // gap // ...................................................................................................................... + sqdmulh v16.8H, v3.8H, v8.H[1] // .......................................................................................e.............................. + // gap // ...................................................................................................................... + add v11.8H, v0.8H, v15.8H // .........................................................................e............................................ + // gap // ...................................................................................................................... + sqdmulh v15.8H, v31.8H, v8.H[1] // .............................................................................................e........................ + // gap // ...................................................................................................................... + mul v30.8H, v13.8H, v23.8H // .......................................................e.............................................................. + // gap // ...................................................................................................................... + srshr v13.8H, v16.8H, #11 // ........................................................................................e............................. + // gap // ...................................................................................................................... + sqdmulh v16.8H, v11.8H, v8.H[1] // ..........................................................................................e........................... + // gap // ...................................................................................................................... + srshr v15.8H, v15.8H, #11 // ..............................................................................................e....................... 
+ // gap // ...................................................................................................................... + sqrdmulh v26.8H, v14.8H, v25.8H // ..........................................................................e........................................... + // gap // ...................................................................................................................... + mla v30.8H, v5.8H, v8.H[0] // ........................................................e............................................................. + // gap // ...................................................................................................................... + mla v31.8H, v15.8H, v8.H[0] // ...............................................................................................e...................... + // gap // ...................................................................................................................... + srshr v14.8H, v16.8H, #11 // ...........................................................................................e.......................... + // gap // ...................................................................................................................... + sub v21.8H, v24.8H, v17.8H // ...........................e.......................................................................................... + // gap // ...................................................................................................................... + mul v6.8H, v1.8H, v28.8H // ................................................................................e..................................... + // gap // ...................................................................................................................... + mla v27.8H, v26.8H, v8.H[0] // ............................................................................e......................................... 
+ // gap // ...................................................................................................................... + add v2.8H, v21.8H, v30.8H // ..........................................................e........................................................... + // gap // ...................................................................................................................... + ldr q4, [x4, #-16] // .......................e.............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v10.8H, v2.8H, v27.8H // ..............................................................................e....................................... + // gap // ...................................................................................................................... + sub v7.8H, v2.8H, v27.8H // .............................................................................e........................................ + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v1.8H, v4.8H // ...............................................................................e...................................... + // gap // ...................................................................................................................... + sqdmulh v16.8H, v10.8H, v8.H[1] // ................................................................................................e..................... 
+ // gap // ...................................................................................................................... + sqdmulh v15.8H, v7.8H, v8.H[1] // ...................................................................................................e.................. + // gap // ...................................................................................................................... + sub v24.8H, v21.8H, v30.8H // .........................................................e............................................................ + // gap // ...................................................................................................................... + mla v6.8H, v26.8H, v8.H[0] // .................................................................................e.................................... + // gap // ...................................................................................................................... + srshr v16.8H, v16.8H, #11 // .................................................................................................e.................... + // gap // ...................................................................................................................... + srshr v15.8H, v15.8H, #11 // ....................................................................................................e................. + // gap // ...................................................................................................................... + sqdmulh v27.8H, v22.8H, v8.H[1] // ....................................................................................e................................. + // gap // ...................................................................................................................... + sub v0.8H, v24.8H, v6.8H // ..................................................................................e................................... 
+ // gap // ...................................................................................................................... + mla v10.8H, v16.8H, v8.H[0] // ..................................................................................................e................... + // gap // ...................................................................................................................... + mla v7.8H, v15.8H, v8.H[0] // .....................................................................................................e................ + // gap // ...................................................................................................................... + add v26.8H, v24.8H, v6.8H // ...................................................................................e.................................. + // gap // ...................................................................................................................... + srshr v6.8H, v27.8H, #11 // .....................................................................................e................................ + // gap // ...................................................................................................................... + sqdmulh v15.8H, v0.8H, v8.H[1] // .........................................................................................................e............ + // gap // ...................................................................................................................... + sqdmulh v16.8H, v26.8H, v8.H[1] // ......................................................................................................e............... + // gap // ...................................................................................................................... + mla v22.8H, v6.8H, v8.H[0] // ......................................................................................e............................... 
+ // gap // ...................................................................................................................... + mla v11.8H, v14.8H, v8.H[0] // ............................................................................................e......................... + // gap // ...................................................................................................................... + srshr v15.8H, v15.8H, #11 // ..........................................................................................................e........... + // gap // ...................................................................................................................... + srshr v14.8H, v16.8H, #11 // .......................................................................................................e.............. + // gap // ...................................................................................................................... + trn1 v27.4S, v22.4S, v10.4S // ............................................................................................................e......... + // gap // ...................................................................................................................... + mla v0.8H, v15.8H, v8.H[0] // ...........................................................................................................e.......... + // gap // ...................................................................................................................... + mla v26.8H, v14.8H, v8.H[0] // ........................................................................................................e............. + // gap // ...................................................................................................................... + mla v3.8H, v13.8H, v8.H[0] // .........................................................................................e............................ 
+ // gap // ...................................................................................................................... + trn2 v15.4S, v22.4S, v10.4S // .............................................................................................................e........ + // gap // ...................................................................................................................... + trn1 v30.4S, v31.4S, v0.4S // ..................................................................................................................e... + // gap // ...................................................................................................................... + trn2 v17.4S, v11.4S, v26.4S // .................................................................................................................e.... + // gap // ...................................................................................................................... + trn1 v28.4S, v3.4S, v7.4S // ..............................................................................................................e....... + // gap // ...................................................................................................................... + trn2 v16.4S, v3.4S, v7.4S // ...............................................................................................................e...... + // gap // ...................................................................................................................... + trn2 v18.4S, v31.4S, v0.4S // ...................................................................................................................e.. + // gap // ...................................................................................................................... + trn1 v29.4S, v11.4S, v26.4S // ................................................................................................................e..... 
+ // gap // ...................................................................................................................... + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // .....................................................................................................................e + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x6], #64 // ....................................................................................................................e. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + + // ---------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------ + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // e.....................................................................................................................' + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // .e....................................................................................................................' + // trn1 v9.4s, v17.4s, v21.4s // ..................e...................................................................................................' + // trn2 v13.4s, v17.4s, v21.4s // ....e.................................................................................................................' + // trn1 v10.4s, v18.4s, v22.4s // .........................................e............................................................................' 
+ // trn2 v14.4s, v18.4s, v22.4s // ...........................e..........................................................................................' + // trn1 v11.4s, v19.4s, v23.4s // ...e..................................................................................................................' + // trn2 v15.4s, v19.4s, v23.4s // .....e................................................................................................................' + // trn1 v12.4s, v20.4s, v24.4s // ...............e......................................................................................................' + // trn2 v16.4s, v20.4s, v24.4s // .........e............................................................................................................' + // ldr q0, [ x4], #16*14 // ..e...................................................................................................................' + // ldr q4, [x4, #-16*14+16*1] // .......e..............................................................................................................' + // ldr q1, [ x4, #-16*14+16*2] // ...............................e......................................................................................' + // ldr q5, [x4, #-16*14+16*3] // .................................e....................................................................................' + // ldr q2, [ x4, #-16*14+16*4] // ....................e.................................................................................................' + // ldr q6, [x4, #-16*14+16*5] // ..........................e...........................................................................................' + // ldr q3, [ x4, #-16*14+16*6] // ....................................................e.................................................................' 
+ // ldr q7, [x4, #-16*14+16*7] // .........................e............................................................................................' + // ldr q17, [ x4, #-16*14+16*8] // .....................e................................................................................................' + // ldr q18, [ x4, #-16*14+16*9] // ..................................................e...................................................................' + // ldr q19, [ x4, #-16*14+16*10] // .................e....................................................................................................' + // ldr q20, [ x4, #-16*14+16*11] // .......................e..............................................................................................' + // ldr q21, [ x4, #-16*14+16*12] // ........................e.............................................................................................' + // ldr q22, [ x4, #-16*14+16*13] // ...................................................................................e..................................' + // sqrdmulh v28.8h, v13.8h, v4.8h // .............e........................................................................................................' + // mul v25.8h, v13.8h, v0.8h // ......e...............................................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ................................e.....................................................................................' + // sub v13.8h, v9.8h, v25.8h // ...............................................................................e......................................' + // add v9.8h, v9.8h, v25.8h // ...................................e..................................................................................' 
+ // sqrdmulh v28.8h, v14.8h, v4.8h // .............................e........................................................................................' + // mul v25.8h, v14.8h, v0.8h // ..............................e.......................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ....................................e.................................................................................' + // sub v14.8h, v10.8h, v25.8h // ........................................................e.............................................................' + // add v10.8h, v10.8h, v25.8h // .............................................e........................................................................' + // sqrdmulh v28.8h, v15.8h, v4.8h // ..........e...........................................................................................................' + // mul v25.8h, v15.8h, v0.8h // ........e.............................................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ..............e.......................................................................................................' + // sub v15.8h, v11.8h, v25.8h // ........................................e.............................................................................' + // add v11.8h, v11.8h, v25.8h // ..................................e...................................................................................' + // sqrdmulh v28.8h, v16.8h, v4.8h // ............e.........................................................................................................' + // mul v25.8h, v16.8h, v0.8h // ...........e..........................................................................................................' 
+ // mla v25.8h, v28.8h, v8.h[0] // ................e.....................................................................................................' + // sub v16.8h, v12.8h, v25.8h // ...................e..................................................................................................' + // add v12.8h, v12.8h, v25.8h // ............................e.........................................................................................' + // sqrdmulh v28.8h, v11.8h, v5.8h // ......................................e...............................................................................' + // mul v25.8h, v11.8h, v1.8h // .....................................e................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ..........................................e...........................................................................' + // sub v11.8h, v9.8h, v25.8h // ..............................................e.......................................................................' + // add v9.8h, v9.8h, v25.8h // ...............................................e......................................................................' + // sqrdmulh v28.8h, v12.8h, v5.8h // .......................................e..............................................................................' + // mul v25.8h, v12.8h, v1.8h // ............................................e.........................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ................................................e.....................................................................' + // sub v12.8h, v10.8h, v25.8h // ......................................................e...............................................................' 
+ // add v10.8h, v10.8h, v25.8h // ...................................................e..................................................................' + // sqrdmulh v28.8h, v15.8h, v6.8h // ...........................................e..........................................................................' + // mul v25.8h, v15.8h, v2.8h // .......................................................................e..............................................' + // mla v25.8h, v28.8h, v8.h[0] // ............................................................................e.........................................' + // sub v15.8h, v13.8h, v25.8h // .........................................................................................e............................' + // add v13.8h, v13.8h, v25.8h // ..................................................................................e...................................' + // sqrdmulh v28.8h, v16.8h, v6.8h // .................................................e....................................................................' + // mul v25.8h, v16.8h, v2.8h // ......................e...............................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // .........................................................e............................................................' + // sub v16.8h, v14.8h, v25.8h // .............................................................e........................................................' + // add v14.8h, v14.8h, v25.8h // ..............................................................e.......................................................' + // sqrdmulh v28.8h, v10.8h, v7.8h // .....................................................e................................................................' 
+ // mul v25.8h, v10.8h, v3.8h // .......................................................e..............................................................' + // mla v25.8h, v28.8h, v8.h[0] // ............................................................e.........................................................' + // sub v10.8h, v9.8h, v25.8h // .................................................................e....................................................' + // add v9.8h, v9.8h, v25.8h // ................................................................e.....................................................' + // sqrdmulh v28.8h, v12.8h, v18.8h // ...........................................................e..........................................................' + // mul v25.8h, v12.8h, v17.8h // ..........................................................e...........................................................' + // mla v25.8h, v28.8h, v8.h[0] // ...............................................................e......................................................' + // sub v12.8h, v11.8h, v25.8h // ...................................................................e..................................................' + // add v11.8h, v11.8h, v25.8h // .....................................................................e................................................' + // sqrdmulh v28.8h, v14.8h, v20.8h // ...........................................................................e..........................................' + // mul v25.8h, v14.8h, v19.8h // ..................................................................e...................................................' + // mla v25.8h, v28.8h, v8.h[0] // .................................................................................e....................................' 
+ // sub v14.8h, v13.8h, v25.8h // .....................................................................................e................................' + // add v13.8h, v13.8h, v25.8h // ....................................................................................e.................................' + // sqrdmulh v28.8h, v16.8h, v22.8h // ......................................................................................e...............................' + // mul v25.8h, v16.8h, v21.8h // ................................................................................e.....................................' + // mla v25.8h, v28.8h, v8.h[0] // ..........................................................................................e...........................' + // sub v16.8h, v15.8h, v25.8h // ..............................................................................................e.......................' + // add v15.8h, v15.8h, v25.8h // .................................................................................................e....................' + // sqdmulh v26.8h, v9.8h, v8.h[1] // .............................................................................................e........................' + // srshr v26.8h, v26.8h, #11 // ..................................................................................................e...................' + // mla v9.8h, v26.8h, v8.h[0] // .....................................................................................................e................' + // sqdmulh v26.8h, v10.8h, v8.h[1] // ....................................................................e.................................................' + // srshr v26.8h, v26.8h, #11 // ........................................................................e.............................................' 
+ // mla v10.8h, v26.8h, v8.h[0] // ............................................................................................................e.........' + // sqdmulh v26.8h, v11.8h, v8.h[1] // .........................................................................e............................................' + // srshr v26.8h, v26.8h, #11 // ..............................................................................e.......................................' + // mla v11.8h, v26.8h, v8.h[0] // ......................................................................................................e...............' + // sqdmulh v26.8h, v12.8h, v8.h[1] // ......................................................................e...............................................' + // srshr v26.8h, v26.8h, #11 // ..........................................................................e...........................................' + // mla v12.8h, v26.8h, v8.h[0] // .............................................................................e........................................' + // sqdmulh v26.8h, v13.8h, v8.h[1] // .......................................................................................e..............................' + // srshr v26.8h, v26.8h, #11 // ...........................................................................................e..........................' + // mla v13.8h, v26.8h, v8.h[0] // ...............................................................................................e......................' + // sqdmulh v26.8h, v14.8h, v8.h[1] // ........................................................................................e.............................' + // srshr v26.8h, v26.8h, #11 // ............................................................................................e.........................' 
+ // mla v14.8h, v26.8h, v8.h[0] // ................................................................................................e.....................' + // sqdmulh v26.8h, v15.8h, v8.h[1] // ....................................................................................................e.................' + // srshr v26.8h, v26.8h, #11 // ........................................................................................................e.............' + // mla v15.8h, v26.8h, v8.h[0] // ...........................................................................................................e..........' + // sqdmulh v26.8h, v16.8h, v8.h[1] // ...................................................................................................e..................' + // srshr v26.8h, v26.8h, #11 // .......................................................................................................e..............' + // mla v16.8h, v26.8h, v8.h[0] // ..........................................................................................................e...........' + // trn1 v17.4s, v9.4s, v13.4s // .........................................................................................................e............' + // trn2 v21.4s, v9.4s, v13.4s // .............................................................................................................e........' + // trn1 v18.4s, v10.4s, v14.4s // ................................................................................................................e.....' + // trn2 v22.4s, v10.4s, v14.4s // .................................................................................................................e....' + // trn1 v19.4s, v11.4s, v15.4s // ...................................................................................................................e..' 
+ // trn2 v23.4s, v11.4s, v15.4s // ...............................................................................................................e......' + // trn1 v20.4s, v12.4s, v16.4s // ..............................................................................................................e.......' + // trn2 v24.4s, v12.4s, v16.4s // ..................................................................................................................e...' + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6], #64 // .....................................................................................................................e' + // st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7], #64 // ....................................................................................................................e.' + + sub count, count, #1 + cbnz count, layer567_start + // Instructions: 0 + // Expected cycles: 0 + // Expected IPC: 0.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_opt_a72.s b/examples/opt/aarch64/ntt_kyber_1234_567_opt_a72.s index 44eee5e0..b18532a9 100644 --- a/examples/opt/aarch64/ntt_kyber_1234_567_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_1234_567_opt_a72.s @@ -26,26 +26,11 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, \offset] +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s .endm -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], \inc -.endm -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, \offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], \inc +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s .endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -67,15 +52,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmlaq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlaq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,55 +69,49 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h add \a\().8h, \a\().8h, tmp.8h .endm -.macro barrett_reduce a, barrett_const, barrett_const_idx - vqdmulhq t0, \a, \barrett_const, \barrett_const_idx - srshr t0.8H, t0.8H, #11 - vmla \a, t0, modulus +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlaq \a, t0, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 + ldr qform_\root0, [\r_ptr0], #16 .endm .macro load_next_roots_67 root0, root0_tw, root1, root1_tw, 
root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) + ldr qform_\root0, [ \r_ptr1], #(6*16) + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s - - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -143,7 +122,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -153,7 +132,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -161,7 +140,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ 
-172,19 +151,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -197,11 +176,18 @@ roots: .text .global ntt_kyber_1234_567_opt_a72 - .global _ntt_kyber_1234_567_opt_a72 + .global _ntt_kyber_1234_567 .p2align 4 -modulus_addr: .quad -3329 -barrett_const_addr: .quad 20159 +const_addr: .short -3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + ntt_kyber_1234_567_opt_a72: _ntt_kyber_1234_567_opt_a72: push_stack @@ -321,497 +307,1639 @@ _ntt_kyber_1234_567_opt_a72: t2 .req v28 t3 .req v29 - barrett_const .req v8 - modulus .req v30 + consts .req v8 ASM_LOAD(r_ptr0, roots) - ASM_LOAD(xtmp, modulus_addr) - ld1r {modulus.8h}, [xtmp] + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] save STACK0, in add src0, x0, #32*0 add src8, x0, #32*8 - ld1 { root0.8H, root1.8H, root2.8H, root3.8H}, [r_ptr0], #64 + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 mov count, #2 .p2align 2 - ldr_vo v26, x6, 0 - ldr_vo v4, x14, 0 - ldr_vo v16, x6, 64 - ldr_vo v21, x6, 32 - ldr_vo v31, x6, 96 - ldr_vo v7, x6, 128 - ldr_vo v9, x6, 192 - ldr_vo v17, x6, 160 - ldr_vo v19, x6, 224 - sqrdmulh v24.8H, v4.8H, v0.H[1] - ldr_vo v28, x14, 32 - ldr_vo v15, x14, 64 - ldr_vo v5, x14, 96 - mul v4.8H, v4.8H, v0.H[0] - ldr_vo v18, x14, 128 - ldr_vo v8, x14, 160 - ldr_vo v20, x14, 192 - mla v4.8H, v24.8H, v30.8H - sqrdmulh v24.8H, v15.8H, v0.H[1] - mul v10.8H, v18.8H, v0.H[0] - ldr_vo v25, x14, 224 - sqrdmulh v18.8H, v18.8H, v0.H[1] - mul v29.8H, v15.8H, v0.H[0] - mla v29.8H, v24.8H, 
v30.8H - sqrdmulh v6.8H, v20.8H, v0.H[1] - mul v20.8H, v20.8H, v0.H[0] - mla v20.8H, v6.8H, v30.8H - mla v10.8H, v18.8H, v30.8H - sub v15.8H, v9.8H, v20.8H - sqrdmulh v6.8H, v8.8H, v0.H[1] - mul v24.8H, v8.8H, v0.H[0] - add v22.8H, v7.8H, v10.8H - sub v11.8H, v7.8H, v10.8H - add v8.8H, v9.8H, v20.8H - mla v24.8H, v6.8H, v30.8H - sub v27.8H, v26.8H, v4.8H - sqrdmulh v10.8H, v25.8H, v0.H[1] - mul v25.8H, v25.8H, v0.H[0] - mla v25.8H, v10.8H, v30.8H - add v4.8H, v26.8H, v4.8H - sub v26.8H, v17.8H, v24.8H - add v9.8H, v17.8H, v24.8H - mul v24.8H, v15.8H, v0.H[4] - sub v12.8H, v16.8H, v29.8H - sqrdmulh v15.8H, v15.8H, v0.H[5] - add v16.8H, v16.8H, v29.8H - add v18.8H, v19.8H, v25.8H - sub v19.8H, v19.8H, v25.8H - sqrdmulh v7.8H, v28.8H, v0.H[1] - mul v17.8H, v28.8H, v0.H[0] - sqrdmulh v28.8H, v8.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[2] - sqrdmulh v20.8H, v5.8H, v0.H[1] - mul v5.8H, v5.8H, v0.H[0] - sqrdmulh v6.8H, v22.8H, v0.H[3] - mul v22.8H, v22.8H, v0.H[2] - sqrdmulh v14.8H, v11.8H, v0.H[5] - mul v29.8H, v11.8H, v0.H[4] - mla v17.8H, v7.8H, v30.8H - mla v5.8H, v20.8H, v30.8H - mla v29.8H, v14.8H, v30.8H - add v7.8H, v21.8H, v17.8H - sub v21.8H, v21.8H, v17.8H - mla v22.8H, v6.8H, v30.8H - sub v20.8H, v31.8H, v5.8H - add v31.8H, v31.8H, v5.8H - mla v24.8H, v15.8H, v30.8H - sub v17.8H, v27.8H, v29.8H - add v15.8H, v27.8H, v29.8H - mla v8.8H, v28.8H, v30.8H - add v28.8H, v4.8H, v22.8H - sub v4.8H, v4.8H, v22.8H - mul v5.8H, v9.8H, v0.H[2] - add v22.8H, v12.8H, v24.8H - sub v24.8H, v12.8H, v24.8H - sqrdmulh v9.8H, v9.8H, v0.H[3] - add v12.8H, v16.8H, v8.8H - sub v27.8H, v16.8H, v8.8H - sqrdmulh v23.8H, v12.8H, v0.H[7] - mul v29.8H, v12.8H, v0.H[6] - mla v29.8H, v23.8H, v30.8H - sqrdmulh v16.8H, v18.8H, v0.H[3] - sub v13.8H, v28.8H, v29.8H - mul v23.8H, v18.8H, v0.H[2] - add v10.8H, v28.8H, v29.8H - mla v23.8H, v16.8H, v30.8H - mul v16.8H, v19.8H, v0.H[4] - add v11.8H, v31.8H, v23.8H - mla v5.8H, v9.8H, v30.8H - mul v14.8H, v26.8H, v0.H[4] - sqrdmulh v28.8H, v11.8H, 
v0.H[7] - sub v29.8H, v7.8H, v5.8H - mul v25.8H, v11.8H, v0.H[6] - add v7.8H, v7.8H, v5.8H - mla v25.8H, v28.8H, v30.8H - sqrdmulh v26.8H, v26.8H, v0.H[5] + ldr q20, [x6, #0] // *............................................................................................................................................................................................... + ldr q14, [x14, #160] // .............*.................................................................................................................................................................................. + ldr q16, [x6, #32] // .*.............................................................................................................................................................................................. + ldr q11, [x14, #192] // ..............*................................................................................................................................................................................. + ldr q25, [x14, #96] // ...........*.................................................................................................................................................................................... + ldr q27, [x14, #224] // ...............*................................................................................................................................................................................ + ldr q17, [x6, #224] // .......*........................................................................................................................................................................................ + mul v22.8H, v14.8H, v0.H[0] // ..........................................*..................................................................................................................................................... 
+ ldr q26, [x6, #96] // ...*............................................................................................................................................................................................ + ldr q21, [x14, #0] // ........*....................................................................................................................................................................................... + sqrdmulh v15.8H, v27.8H, v0.H[1] // ...................................................*............................................................................................................................................ + ldr q19, [x14, #32] // .........*...................................................................................................................................................................................... + ldr q30, [x6, #64] // ..*............................................................................................................................................................................................. + mul v13.8H, v27.8H, v0.H[0] // ....................................................*........................................................................................................................................... + ldr q12, [x6, #128] // ....*........................................................................................................................................................................................... + ldr q6, [x14, #128] // ............*................................................................................................................................................................................... + ldr q28, [x14, #64] // ..........*..................................................................................................................................................................................... 
+ sqrdmulh v23.8H, v25.8H, v0.H[1] // ...............................*................................................................................................................................................................ + ldr q5, [x6, #160] // .....*.......................................................................................................................................................................................... + ldr q9, [x6, #192] // ......*......................................................................................................................................................................................... + sqrdmulh v27.8H, v14.8H, v0.H[1] // .........................................*...................................................................................................................................................... + sqrdmulh v29.8H, v21.8H, v0.H[1] // ................*............................................................................................................................................................................... + mul v14.8H, v21.8H, v0.H[0] // .................*.............................................................................................................................................................................. + mla v22.8H, v27.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + mul v31.8H, v19.8H, v0.H[0] // ......................*......................................................................................................................................................................... 
+ mla v14.8H, v29.8H, v8.H[0] // ..................*............................................................................................................................................................................. + add v7.8H, v5.8H, v22.8H // .............................................*.................................................................................................................................................. + mla v13.8H, v15.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + sub v27.8H, v5.8H, v22.8H // ............................................*................................................................................................................................................... + sqrdmulh v21.8H, v7.8H, v0.H[3] // .............................................................*.................................................................................................................................. + sub v10.8H, v20.8H, v14.8H // ...................*............................................................................................................................................................................ + add v24.8H, v20.8H, v14.8H // ....................*........................................................................................................................................................................... + mul v20.8H, v25.8H, v0.H[0] // ................................*............................................................................................................................................................... 
+ mla v20.8H, v23.8H, v8.H[0] // .................................*.............................................................................................................................................................. + mul v14.8H, v7.8H, v0.H[2] // ..............................................................*................................................................................................................................. + add v25.8H, v17.8H, v13.8H // .......................................................*........................................................................................................................................ + sqrdmulh v5.8H, v11.8H, v0.H[1] // ..............................................*................................................................................................................................................. + sub v7.8H, v17.8H, v13.8H // ......................................................*......................................................................................................................................... + mul v11.8H, v11.8H, v0.H[0] // ...............................................*................................................................................................................................................ + add v13.8H, v26.8H, v20.8H // ...................................*............................................................................................................................................................ + sub v26.8H, v26.8H, v20.8H // ..................................*............................................................................................................................................................. 
+ sqrdmulh v17.8H, v7.8H, v0.H[5] // ...........................................................................................*.................................................................................................... + mul v23.8H, v7.8H, v0.H[4] // ............................................................................................*................................................................................................... + mul v29.8H, v27.8H, v0.H[4] // ..................................................................................*............................................................................................................. + mla v23.8H, v17.8H, v8.H[0] // .............................................................................................*.................................................................................................. + sqrdmulh v4.8H, v27.8H, v0.H[5] // .................................................................................*.............................................................................................................. + mla v11.8H, v5.8H, v8.H[0] // ................................................*............................................................................................................................................... + add v5.8H, v26.8H, v23.8H // ...............................................................................................*................................................................................................ + sub v26.8H, v26.8H, v23.8H // ..............................................................................................*................................................................................................. 
+ sqrdmulh v20.8H, v6.8H, v0.H[1] // ....................................*........................................................................................................................................................... + sqrdmulh v22.8H, v5.8H, v1.H[3] // .........................................................................................................................*...................................................................... + sub v7.8H, v9.8H, v11.8H // .................................................*.............................................................................................................................................. + add v9.8H, v9.8H, v11.8H // ..................................................*............................................................................................................................................. + sqrdmulh v15.8H, v26.8H, v1.H[5] // ...................................................................................................................................*............................................................ + sqrdmulh v17.8H, v7.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mul v18.8H, v6.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + mla v18.8H, v20.8H, v8.H[0] // ......................................*......................................................................................................................................................... 
+ mul v20.8H, v26.8H, v1.H[4] // ....................................................................................................................................*........................................................... + mla v20.8H, v15.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sub v6.8H, v12.8H, v18.8H // .......................................*........................................................................................................................................................ + add v26.8H, v12.8H, v18.8H // ........................................*....................................................................................................................................................... + sqrdmulh v18.8H, v19.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + mul v27.8H, v7.8H, v0.H[4] // .......................................................................................*........................................................................................................ + mla v27.8H, v17.8H, v8.H[0] // ........................................................................................*....................................................................................................... + mla v31.8H, v18.8H, v8.H[0] // .......................*........................................................................................................................................................................ 
+ sqrdmulh v23.8H, v25.8H, v0.H[3] // .......................................................................*........................................................................................................................ + mla v14.8H, v21.8H, v8.H[0] // ...............................................................*................................................................................................................................ + add v18.8H, v16.8H, v31.8H // .........................*...................................................................................................................................................................... + mla v29.8H, v4.8H, v8.H[0] // ...................................................................................*............................................................................................................ + mul v17.8H, v28.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + sub v21.8H, v16.8H, v31.8H // ........................*....................................................................................................................................................................... + add v31.8H, v18.8H, v14.8H // .................................................................*.............................................................................................................................. + sub v16.8H, v18.8H, v14.8H // ................................................................*............................................................................................................................... 
+ sqrdmulh v14.8H, v28.8H, v0.H[1] // ..........................*..................................................................................................................................................................... + mul v19.8H, v25.8H, v0.H[2] // ........................................................................*....................................................................................................................... + mla v19.8H, v23.8H, v8.H[0] // .........................................................................*...................................................................................................................... + sub v23.8H, v21.8H, v29.8H // ....................................................................................*........................................................................................................... + sub v18.8H, v23.8H, v20.8H // ......................................................................................................................................*......................................................... + sqrdmulh v25.8H, v26.8H, v0.H[3] // ........................................................*....................................................................................................................................... + add v20.8H, v23.8H, v20.8H // .......................................................................................................................................*........................................................ + sub v7.8H, v13.8H, v19.8H // ..........................................................................*..................................................................................................................... 
+ sqrdmulh v11.8H, v9.8H, v0.H[3] // ..................................................................*............................................................................................................................. + add v23.8H, v21.8H, v29.8H // .....................................................................................*.......................................................................................................... + sqrdmulh v15.8H, v20.8H, v3.H[3] // ......................................................................................................................................................................*......................... + mul v28.8H, v26.8H, v0.H[2] // .........................................................*...................................................................................................................................... + mla v17.8H, v14.8H, v8.H[0] // ............................*................................................................................................................................................................... + mul v4.8H, v9.8H, v0.H[2] // ...................................................................*............................................................................................................................ + add v9.8H, v13.8H, v19.8H // ...........................................................................*.................................................................................................................... + mla v4.8H, v11.8H, v8.H[0] // ....................................................................*........................................................................................................................... 
+ add v19.8H, v30.8H, v17.8H // ..............................*................................................................................................................................................................. + sub v12.8H, v30.8H, v17.8H // .............................*.................................................................................................................................................................. + mul v30.8H, v9.8H, v0.H[6] // ......................................................................................................*......................................................................................... + sqrdmulh v14.8H, v9.8H, v0.H[7] // .....................................................................................................*.......................................................................................... + add v21.8H, v19.8H, v4.8H // ......................................................................*......................................................................................................................... + sub v26.8H, v19.8H, v4.8H // .....................................................................*.......................................................................................................................... + mul v9.8H, v5.8H, v1.H[2] // ..........................................................................................................................*..................................................................... 
sub count, count, #1 -.p2align 2 layer1234_start: - ldr_vo v18, x14, 112 - sub v8.8H, v7.8H, v25.8H - sqrdmulh v11.8H, v24.8H, v1.H[5] - sub v12.8H, v31.8H, v23.8H - ldr_vo v9, x14, 208 // gap(s) to follow - sqrdmulh v23.8H, v22.8H, v1.H[3] // gap(s) to follow - sqrdmulh v31.8H, v8.8H, v2.H[1] // gap(s) to follow - mul v6.8H, v8.8H, v2.H[0] // gap(s) to follow - mla v6.8H, v31.8H, v30.8H // gap(s) to follow - mul v31.8H, v22.8H, v1.H[2] // gap(s) to follow - mla v31.8H, v23.8H, v30.8H // gap(s) to follow - mul v23.8H, v24.8H, v1.H[4] - add v24.8H, v7.8H, v25.8H // gap(s) to follow - sqrdmulh v7.8H, v9.8H, v0.H[1] // gap(s) to follow - sqrdmulh v8.8H, v24.8H, v1.H[7] // gap(s) to follow - mul v5.8H, v9.8H, v0.H[0] // gap(s) to follow - mla v5.8H, v7.8H, v30.8H // gap(s) to follow - mla v23.8H, v11.8H, v30.8H // gap(s) to follow - mla v14.8H, v26.8H, v30.8H // gap(s) to follow - sqrdmulh v7.8H, v18.8H, v0.H[1] - add v26.8H, v17.8H, v23.8H // gap(s) to follow - sub v28.8H, v17.8H, v23.8H // gap(s) to follow - add v25.8H, v21.8H, v14.8H - sqrdmulh v19.8H, v19.8H, v0.H[5] // gap(s) to follow - sub v23.8H, v21.8H, v14.8H // gap(s) to follow - mul v21.8H, v18.8H, v0.H[0] // gap(s) to follow - mla v21.8H, v7.8H, v30.8H // gap(s) to follow - mla v16.8H, v19.8H, v30.8H - sub v19.8H, v15.8H, v31.8H // gap(s) to follow - sqrdmulh v7.8H, v27.8H, v1.H[1] // gap(s) to follow - mul v22.8H, v27.8H, v1.H[0] - sub v14.8H, v20.8H, v16.8H // gap(s) to follow - add v17.8H, v20.8H, v16.8H // gap(s) to follow - add v27.8H, v15.8H, v31.8H - mul v9.8H, v24.8H, v1.H[6] // gap(s) to follow - sub v11.8H, v13.8H, v6.8H // gap(s) to follow - mla v9.8H, v8.8H, v30.8H - add v31.8H, v13.8H, v6.8H // gap(s) to follow - ldr_vo v6, x14, 144 - sqrdmulh v18.8H, v17.8H, v1.H[3] - sub v24.8H, v10.8H, v9.8H - add v16.8H, v10.8H, v9.8H - ldr_vo v20, x6, 112 - str_vo v11, x6, 96 - mul v13.8H, v17.8H, v1.H[2] - str_vo v31, x6, 64 - ldr_vo v15, x6, 208 - str_vo v24, x6, 32 // gap(s) to follow - mla v13.8H, 
v18.8H, v30.8H - str_vi v16, x6, 16 // gap(s) to follow - mla v22.8H, v7.8H, v30.8H - add v24.8H, v15.8H, v5.8H // gap(s) to follow - sub v16.8H, v15.8H, v5.8H // gap(s) to follow - mul v10.8H, v12.8H, v1.H[0] - add v11.8H, v25.8H, v13.8H // gap(s) to follow - sub v8.8H, v25.8H, v13.8H // gap(s) to follow - add v18.8H, v4.8H, v22.8H - sqrdmulh v31.8H, v14.8H, v1.H[5] // gap(s) to follow - sub v15.8H, v4.8H, v22.8H - ldr_vo v22, x14, 48 - mul v13.8H, v14.8H, v1.H[4] // gap(s) to follow - mla v13.8H, v31.8H, v30.8H - add v31.8H, v20.8H, v21.8H // gap(s) to follow - sqrdmulh v25.8H, v12.8H, v1.H[1] - ldr_vo v12, x14, 176 - sub v20.8H, v20.8H, v21.8H // gap(s) to follow - add v4.8H, v23.8H, v13.8H - mul v17.8H, v8.8H, v3.H[0] // gap(s) to follow - sub v13.8H, v23.8H, v13.8H // gap(s) to follow - mla v10.8H, v25.8H, v30.8H // gap(s) to follow - sqrdmulh v23.8H, v4.8H, v3.H[3] // gap(s) to follow - sqrdmulh v7.8H, v11.8H, v2.H[7] // gap(s) to follow - sub v21.8H, v29.8H, v10.8H - add v9.8H, v29.8H, v10.8H // gap(s) to follow - mul v10.8H, v4.8H, v3.H[2] // gap(s) to follow - mla v10.8H, v23.8H, v30.8H // gap(s) to follow - ldr_vo v23, x14, 80 - ldr_vo v5, x14, 240 - add v14.8H, v26.8H, v10.8H - mul v29.8H, v22.8H, v0.H[0] // gap(s) to follow - sub v10.8H, v26.8H, v10.8H - sqrdmulh v25.8H, v22.8H, v0.H[1] // gap(s) to follow - mul v22.8H, v5.8H, v0.H[0] - str_vo v14, x14, 128 // gap(s) to follow - str_vo v10, x14, 160 // gap(s) to follow - sqrdmulh v10.8H, v23.8H, v0.H[1] // gap(s) to follow - sqrdmulh v4.8H, v12.8H, v0.H[1] // gap(s) to follow - sqrdmulh v14.8H, v8.8H, v3.H[1] // gap(s) to follow - sqrdmulh v26.8H, v13.8H, v3.H[5] // gap(s) to follow - mul v8.8H, v23.8H, v0.H[0] // gap(s) to follow - mla v8.8H, v10.8H, v30.8H // gap(s) to follow - mla v17.8H, v14.8H, v30.8H // gap(s) to follow - mul v10.8H, v11.8H, v2.H[6] // gap(s) to follow - sub v14.8H, v19.8H, v17.8H // gap(s) to follow - mla v10.8H, v7.8H, v30.8H - add v17.8H, v19.8H, v17.8H // gap(s) to follow - 
mla v29.8H, v25.8H, v30.8H - str_vo v14, x14, 96 - sub v7.8H, v27.8H, v10.8H - add v23.8H, v27.8H, v10.8H - str_vo v17, x14, 64 // gap(s) to follow - sqrdmulh v19.8H, v5.8H, v0.H[1] - ldr_vo v5, x14, 16 // gap(s) to follow - str_vo v7, x14, 32 // gap(s) to follow - mul v17.8H, v12.8H, v0.H[0] - str_vi v23, x14, 16 // gap(s) to follow - sqrdmulh v10.8H, v5.8H, v0.H[1] // gap(s) to follow - mul v25.8H, v5.8H, v0.H[0] // gap(s) to follow - sqrdmulh v7.8H, v9.8H, v2.H[3] // gap(s) to follow - mul v23.8H, v9.8H, v2.H[2] // gap(s) to follow - mla v23.8H, v7.8H, v30.8H // gap(s) to follow - mla v17.8H, v4.8H, v30.8H // gap(s) to follow - sqrdmulh v4.8H, v21.8H, v2.H[5] // gap(s) to follow - ldr_vo v7, x6, 160 // gap(s) to follow - sqrdmulh v11.8H, v6.8H, v0.H[1] // gap(s) to follow - ldr_vo v9, x6, 224 // gap(s) to follow - mla v22.8H, v19.8H, v30.8H // gap(s) to follow - sub v12.8H, v18.8H, v23.8H // gap(s) to follow - add v5.8H, v18.8H, v23.8H // gap(s) to follow - mul v27.8H, v6.8H, v0.H[0] - sub v6.8H, v7.8H, v17.8H // gap(s) to follow - str_vo v12, x6, 144 // gap(s) to follow - mla v27.8H, v11.8H, v30.8H - str_vo v5, x6, 112 - ldr_vo v12, x6, 32 // gap(s) to follow - mul v21.8H, v21.8H, v2.H[4] // gap(s) to follow - ldr_vo v23, x6, 0 // gap(s) to follow - mla v21.8H, v4.8H, v30.8H - ldr_vo v4, x6, 128 // gap(s) to follow - ldr_vo v5, x6, 64 // gap(s) to follow - mla v25.8H, v10.8H, v30.8H // gap(s) to follow - mul v13.8H, v13.8H, v3.H[4] - add v18.8H, v15.8H, v21.8H - add v11.8H, v4.8H, v27.8H // gap(s) to follow - sub v15.8H, v15.8H, v21.8H - mla v13.8H, v26.8H, v30.8H // gap(s) to follow - add v21.8H, v7.8H, v17.8H - str_vo v18, x6, 176 // gap(s) to follow - mul v18.8H, v11.8H, v0.H[2] - add v7.8H, v9.8H, v22.8H // gap(s) to follow - str_vo v15, x6, 208 // gap(s) to follow - sub v19.8H, v28.8H, v13.8H - sqrdmulh v17.8H, v21.8H, v0.H[3] // gap(s) to follow - mul v15.8H, v21.8H, v0.H[2] - sub v21.8H, v12.8H, v29.8H // gap(s) to follow - str_vo v19, x14, 208 - sub 
v19.8H, v9.8H, v22.8H // gap(s) to follow - mla v15.8H, v17.8H, v30.8H - sub v9.8H, v4.8H, v27.8H // gap(s) to follow - sub v10.8H, v23.8H, v25.8H - add v14.8H, v23.8H, v25.8H - sqrdmulh v25.8H, v7.8H, v0.H[3] // gap(s) to follow - add v4.8H, v5.8H, v8.8H // gap(s) to follow - mul v23.8H, v7.8H, v0.H[2] - add v7.8H, v28.8H, v13.8H // gap(s) to follow - sqrdmulh v22.8H, v24.8H, v0.H[3] - sub v8.8H, v5.8H, v8.8H // gap(s) to follow - str_vo v7, x14, 176 // gap(s) to follow - mul v26.8H, v24.8H, v0.H[2] // gap(s) to follow - mla v26.8H, v22.8H, v30.8H // gap(s) to follow - sqrdmulh v24.8H, v16.8H, v0.H[5] // gap(s) to follow - mul v22.8H, v16.8H, v0.H[4] // gap(s) to follow - sub v27.8H, v4.8H, v26.8H // gap(s) to follow - add v4.8H, v4.8H, v26.8H - mla v22.8H, v24.8H, v30.8H // gap(s) to follow - add v26.8H, v12.8H, v29.8H // gap(s) to follow - mla v23.8H, v25.8H, v30.8H // gap(s) to follow - sqrdmulh v17.8H, v4.8H, v0.H[7] - add v7.8H, v26.8H, v15.8H // gap(s) to follow - sub v29.8H, v26.8H, v15.8H // gap(s) to follow - sqrdmulh v12.8H, v11.8H, v0.H[3] // gap(s) to follow - mul v13.8H, v4.8H, v0.H[6] // gap(s) to follow - add v16.8H, v31.8H, v23.8H // gap(s) to follow - sqrdmulh v28.8H, v9.8H, v0.H[5] // gap(s) to follow - sqrdmulh v4.8H, v16.8H, v0.H[7] // gap(s) to follow - mul v25.8H, v16.8H, v0.H[6] // gap(s) to follow - mul v5.8H, v9.8H, v0.H[4] // gap(s) to follow - mla v5.8H, v28.8H, v30.8H // gap(s) to follow - mla v18.8H, v12.8H, v30.8H // gap(s) to follow - mla v13.8H, v17.8H, v30.8H - sub v17.8H, v10.8H, v5.8H // gap(s) to follow - add v15.8H, v10.8H, v5.8H // gap(s) to follow - add v5.8H, v14.8H, v18.8H // gap(s) to follow - mul v16.8H, v19.8H, v0.H[4] - sub v24.8H, v8.8H, v22.8H // gap(s) to follow - mla v25.8H, v4.8H, v30.8H // gap(s) to follow - add v10.8H, v5.8H, v13.8H // gap(s) to follow - sqrdmulh v26.8H, v6.8H, v0.H[5] - sub v4.8H, v14.8H, v18.8H // gap(s) to follow - sub v13.8H, v5.8H, v13.8H // gap(s) to follow - add v22.8H, v8.8H, v22.8H // 
gap(s) to follow - mul v14.8H, v6.8H, v0.H[4] // gap(s) to follow - subs count, count, #1 + // Instructions: 192 + // Expected cycles: 64 + // Expected IPC: 3.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + sub v19.8H, v12.8H, v27.8H // .*.............................................................................................................................................................................................. + sqrdmulh v17.8H, v6.8H, v0.H[5] // ..........*..................................................................................................................................................................................... + add v12.8H, v12.8H, v27.8H // ......*......................................................................................................................................................................................... + ldr q27, [x14, #112] // ....................................................................................................*........................................................................................... + mla v30.8H, v14.8H, v8.H[0] // .....*.......................................................................................................................................................................................... + mul v4.8H, v20.8H, v3.H[2] // *............................................................................................................................................................................................... 
+ mul v20.8H, v6.8H, v0.H[4] // ...............*................................................................................................................................................................................ + mla v20.8H, v17.8H, v8.H[0] // ................*............................................................................................................................................................................... + mul v13.8H, v21.8H, v0.H[6] // .........................................*...................................................................................................................................................... + sqrdmulh v6.8H, v19.8H, v1.H[5] // ............................*................................................................................................................................................................... + add v11.8H, v10.8H, v20.8H // ....................*........................................................................................................................................................................... + sub v10.8H, v10.8H, v20.8H // ...................*............................................................................................................................................................................ + mul v19.8H, v19.8H, v1.H[4] // ...............................*................................................................................................................................................................ + sqrdmulh v20.8H, v21.8H, v0.H[7] // ..............*................................................................................................................................................................................. 
+ mla v19.8H, v6.8H, v8.H[0] // .....................................*.......................................................................................................................................................... + mla v4.8H, v15.8H, v8.H[0] // ..*............................................................................................................................................................................................. + mul v21.8H, v12.8H, v1.H[2] // ..................*............................................................................................................................................................................. + add v29.8H, v10.8H, v19.8H // ........................................*....................................................................................................................................................... + sub v6.8H, v10.8H, v19.8H // .................................................*.............................................................................................................................................. + mul v10.8H, v26.8H, v1.H[0] // .........*...................................................................................................................................................................................... + sub v15.8H, v29.8H, v4.8H // .............................................*.................................................................................................................................................. + ldr q14, [x14, #176] // .................................................................................................*.............................................................................................. 
+ sqrdmulh v26.8H, v26.8H, v1.H[1] // ..........................................................*..................................................................................................................................... + str q15, [x14, #160] // ...................................................*............................................................................................................................................ + sub v15.8H, v31.8H, v30.8H // ...........*.................................................................................................................................................................................... + sqrdmulh v5.8H, v14.8H, v0.H[1] // ....................................................................................................................*........................................................................... + mla v13.8H, v20.8H, v8.H[0] // ......................................................*......................................................................................................................................... + mul v17.8H, v15.8H, v2.H[0] // .......................................................*........................................................................................................................................ + sqrdmulh v19.8H, v15.8H, v2.H[1] // .....................................................*.......................................................................................................................................... + add v15.8H, v31.8H, v30.8H // ............*................................................................................................................................................................................... 
+ mla v28.8H, v25.8H, v8.H[0] // ......................................*......................................................................................................................................................... + mul v30.8H, v15.8H, v1.H[6] // ....................................................................*........................................................................................................................... + sqrdmulh v25.8H, v15.8H, v1.H[7] // .................................................................*.............................................................................................................................. + sqrdmulh v15.8H, v7.8H, v1.H[1] // ....*........................................................................................................................................................................................... + mla v17.8H, v19.8H, v8.H[0] // ........................................................*....................................................................................................................................... + mul v31.8H, v7.8H, v1.H[0] // .............*.................................................................................................................................................................................. + ldr q7, [x14, #16] // .........................................................................................................*...................................................................................... + mla v9.8H, v22.8H, v8.H[0] // ...*............................................................................................................................................................................................ 
+ sub v20.8H, v24.8H, v28.8H // ..........................................*..................................................................................................................................................... + mla v31.8H, v15.8H, v8.H[0] // ..........................*..................................................................................................................................................................... + ldr q15, [x14, #208] // ...................................................................................................*............................................................................................ + add v28.8H, v24.8H, v28.8H // ..............................................*................................................................................................................................................. + mul v22.8H, v7.8H, v0.H[0] // ......................................................................................................................*......................................................................... + add v24.8H, v29.8H, v4.8H // ............................................*................................................................................................................................................... + sqrdmulh v19.8H, v12.8H, v1.H[3] // .................*.............................................................................................................................................................................. + ldr q29, [x14, #240] // .....................................................................................................*.......................................................................................... 
+ sub v4.8H, v28.8H, v13.8H // .........................................................*...................................................................................................................................... + add v28.8H, v28.8H, v13.8H // ...........................................................*.................................................................................................................................... + sqrdmulh v7.8H, v7.8H, v0.H[1] // .....................................................................................................................*.......................................................................... + str q24, [x14, #128] // ................................................*............................................................................................................................................... + sub v24.8H, v16.8H, v31.8H // .................................*.............................................................................................................................................................. + add v12.8H, v16.8H, v31.8H // ................................*............................................................................................................................................................... + sqrdmulh v16.8H, v29.8H, v0.H[1] // ..........................................................................................................*..................................................................................... + sub v13.8H, v4.8H, v17.8H // ...............................................................*................................................................................................................................ 
+ add v4.8H, v4.8H, v17.8H // .............................................................*.................................................................................................................................. + mla v21.8H, v19.8H, v8.H[0] // ......................*......................................................................................................................................................................... + str q13, [x6, #96] // ..................................................................*............................................................................................................................. + str q4, [x6, #64] // ................................................................*............................................................................................................................... + add v4.8H, v11.8H, v21.8H // .........................*...................................................................................................................................................................... + ldr q31, [x6, #176] // ..................................................................................................................*............................................................................. + sqrdmulh v17.8H, v24.8H, v2.H[5] // ........................................................................*....................................................................................................................... + add v13.8H, v23.8H, v9.8H // .......*........................................................................................................................................................................................ 
+ sub v23.8H, v23.8H, v9.8H // ........*....................................................................................................................................................................................... + mla v30.8H, v25.8H, v8.H[0] // .....................................................................*.......................................................................................................................... + mul v9.8H, v24.8H, v2.H[4] // ............................................................................*................................................................................................................... + mla v9.8H, v17.8H, v8.H[0] // ...............................................................................*................................................................................................................ + add v25.8H, v28.8H, v30.8H // ...........................................................................*.................................................................................................................... + sub v19.8H, v28.8H, v30.8H // .............................................................................*.................................................................................................................. + sqrdmulh v30.8H, v13.8H, v2.H[7] // ....................................................*........................................................................................................................................... + str q25, [x6], #16 // ................................................................................*............................................................................................................... 
+ mul v24.8H, v13.8H, v2.H[6] // ..................................................*............................................................................................................................................. + str q19, [x6, #16] // ..................................................................................*............................................................................................................. + ldr q19, [x14, #80] // ................................................................................................................*............................................................................... + sqrdmulh v17.8H, v18.8H, v3.H[5] // ...................................*............................................................................................................................................................ + mla v24.8H, v30.8H, v8.H[0] // .........................................................................*...................................................................................................................... + mul v28.8H, v18.8H, v3.H[4] // ............................................................*................................................................................................................................... + mla v28.8H, v17.8H, v8.H[0] // ..............................................................*................................................................................................................................. + ldr q13, [x6, #224] // ......................................................................................................*......................................................................................... 
+ mul v29.8H, v29.8H, v0.H[0] // .............................................................................................................*.................................................................................. + ldr q18, [x14, #48] // ...........................................................................................................*.................................................................................... + sub v17.8H, v6.8H, v28.8H // .................................................................................*.............................................................................................................. + add v30.8H, v4.8H, v24.8H // ...................................................................................*............................................................................................................ + mla v29.8H, v16.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + add v25.8H, v6.8H, v28.8H // ........................................................................................*....................................................................................................... + ldr q28, [x6, #96] // ........................................................................................................*....................................................................................... + str q17, [x14, #224] // ......................................................................................*......................................................................................................... 
+ sub v17.8H, v11.8H, v21.8H // ...........................*.................................................................................................................................................................... + mla v10.8H, v26.8H, v8.H[0] // ...................................................................*............................................................................................................................ + str q30, [x14], #16 // .......................................................................................*........................................................................................................ + sub v16.8H, v4.8H, v24.8H // ....................................................................................*........................................................................................................... + sqrdmulh v30.8H, v15.8H, v0.H[1] // ....................................................................................................................................*........................................................... + str q25, [x14, #176] // .............................................................................................*.................................................................................................. + add v25.8H, v13.8H, v29.8H // ...................................................................................................................................*............................................................ + sub v11.8H, v13.8H, v29.8H // .....................................................................................................................................*.......................................................... 
+ ldr q13, [x6, #192] // ...................................................................................................................*............................................................................ + mul v4.8H, v19.8H, v0.H[0] // .....................................................................................................................................................................*.......................... + add v21.8H, v20.8H, v10.8H // ......................................................................*......................................................................................................................... + mul v15.8H, v15.8H, v0.H[0] // ......................................................................................................................................*......................................................... + str q16, [x14, #16] // ...........................................................................................*.................................................................................................... + sub v16.8H, v20.8H, v10.8H // .......................................................................*........................................................................................................................ + mla v15.8H, v30.8H, v8.H[0] // ..............................................................................................................................................*................................................. + add v29.8H, v16.8H, v9.8H // .........................................................................................*...................................................................................................... 
+ mul v30.8H, v14.8H, v0.H[0] // .......................................................................................................*........................................................................................ + sub v20.8H, v16.8H, v9.8H // ..........................................................................................*..................................................................................................... + sqrdmulh v9.8H, v19.8H, v0.H[1] // .........................................................................................................................................................................*...................... + ldr q19, [x14, #128] // ...............................................................................................................*................................................................................ + str q29, [x6, #176] // ..............................................................................................*................................................................................................. + sub v29.8H, v13.8H, v15.8H // ...................................................................................................................................................*............................................ + mla v30.8H, v5.8H, v8.H[0] // .......................................................................................................................*........................................................................ + str q20, [x6, #208] // ...............................................................................................*................................................................................................ 
+ add v5.8H, v13.8H, v15.8H // ....................................................................................................................................................*........................................... + sqrdmulh v16.8H, v29.8H, v0.H[5] // ......................................................................................................................................................*......................................... + sqrdmulh v15.8H, v5.8H, v0.H[3] // .................................................................................................................................................................................*.............. + add v20.8H, v31.8H, v30.8H // ..........................................................................................................................*..................................................................... + sqrdmulh v26.8H, v19.8H, v0.H[1] // .................................................................................................................................................*.............................................. + sub v30.8H, v31.8H, v30.8H // ............................................................................................................................*................................................................... + mul v5.8H, v5.8H, v0.H[2] // ......................................................................................................................................................................................*......... + mla v5.8H, v15.8H, v8.H[0] // ........................................................................................................................................................................................*....... 
+ mla v22.8H, v7.8H, v8.H[0] // .........................................................................................................................*...................................................................... + mla v4.8H, v9.8H, v8.H[0] // .....................................................................................................................................................................................*.......... + sqrdmulh v6.8H, v27.8H, v0.H[1] // .................................................................................................................*.............................................................................. + mul v10.8H, v27.8H, v0.H[0] // ................................................................................................................................*............................................................... + sqrdmulh v15.8H, v11.8H, v0.H[5] // .........................................................................................................................................*...................................................... + mla v10.8H, v6.8H, v8.H[0] // .................................................................................................................................*.............................................................. + mul v9.8H, v11.8H, v0.H[4] // ..........................................................................................................................................*..................................................... + mla v9.8H, v15.8H, v8.H[0] // ............................................................................................................................................*................................................... 
+ ldr q15, [x6, #0] // ................................................................................................*............................................................................................... + add v13.8H, v28.8H, v10.8H // .......................................................................................................................................*........................................................ + sqrdmulh v31.8H, v23.8H, v3.H[1] // .....................*.......................................................................................................................................................................... + sub v6.8H, v28.8H, v10.8H // ........................................................................................................................................*....................................................... + sqrdmulh v27.8H, v18.8H, v0.H[1] // .............................................................................................................................................................*.................................. + add v24.8H, v15.8H, v22.8H // ...............................................................................................................................*................................................................ + add v14.8H, v6.8H, v9.8H // ...............................................................................................................................................*................................................ + mul v11.8H, v23.8H, v3.H[0] // .......................*........................................................................................................................................................................ 
+ sub v10.8H, v15.8H, v22.8H // ..............................................................................................................................*................................................................. + sub v6.8H, v6.8H, v9.8H // ................................................................................................................................................*............................................... + sqrdmulh v22.8H, v14.8H, v1.H[3] // ..................................................................................................................................................*............................................. + sqrdmulh v15.8H, v6.8H, v1.H[5] // .....................................................................................................................................................*.......................................... + mla v11.8H, v31.8H, v8.H[0] // ........................*....................................................................................................................................................................... + mul v31.8H, v18.8H, v0.H[0] // ........................................................................................................................*....................................................................... + ldr q18, [x6, #64] // ............................................................................................................*................................................................................... + mla v31.8H, v27.8H, v8.H[0] // ................................................................................................................................................................*............................... 
+ mul v27.8H, v6.8H, v1.H[4] // .........................................................................................................................................................*...................................... + mla v27.8H, v15.8H, v8.H[0] // ..........................................................................................................................................................*..................................... + sub v15.8H, v17.8H, v11.8H // .............................*.................................................................................................................................................................. + sqrdmulh v6.8H, v25.8H, v0.H[3] // .................................................................................................................................................................*.............................. + str q15, [x14, #80] // ..................................*............................................................................................................................................................. + add v15.8H, v17.8H, v11.8H // ..............................*................................................................................................................................................................. + ldr q17, [x6, #128] // ..............................................................................................................*................................................................................. + mul v28.8H, v25.8H, v0.H[2] // ..........................................................................................................................................................................*..................... 
+ str q15, [x14, #48] // ....................................*........................................................................................................................................................... + sqrdmulh v7.8H, v20.8H, v0.H[3] // .............................................................................................................................*.................................................................. + mla v28.8H, v6.8H, v8.H[0] // ...........................................................................................................................................................................*.................... + mul v11.8H, v20.8H, v0.H[2] // ..................................................................................................................................*............................................................. + mla v11.8H, v7.8H, v8.H[0] // ..................................................................................................................................................................*............................. + sub v7.8H, v13.8H, v28.8H // ................................................................................................................................................................................*............... + add v28.8H, v13.8H, v28.8H // .......................................................................................................................................................................................*........ + sqrdmulh v6.8H, v30.8H, v0.H[5] // .............................................................................................................................................*.................................................. 
+ mul v9.8H, v14.8H, v1.H[2] // ...............................................................................................................................................................................................* + ldr q14, [x6, #32] // ..................................................................................................*............................................................................................. + mul v23.8H, v30.8H, v0.H[4] // ...........................................................................................................................................*.................................................... + mla v23.8H, v6.8H, v8.H[0] // ....................................................................................................................................................................*........................... + sub v6.8H, v14.8H, v31.8H // ......................................................................................................................................................................*......................... + mul v19.8H, v19.8H, v0.H[0] // .......................................................................................................................................................*........................................ + sqrdmulh v30.8H, v12.8H, v2.H[3] // .......................................*........................................................................................................................................................ + sub v15.8H, v6.8H, v23.8H // ............................................................................................................................................................................*................... 
+ mla v19.8H, v26.8H, v8.H[0] // ........................................................................................................................................................*....................................... + add v23.8H, v6.8H, v23.8H // ..................................................................................................................................................................................*............. + add v26.8H, v18.8H, v4.8H // .........................................................................................................................................................................................*...... + mul v13.8H, v12.8H, v2.H[2] // ...........................................*.................................................................................................................................................... + sub v12.8H, v18.8H, v4.8H // ..........................................................................................................................................................................................*..... + mla v13.8H, v30.8H, v8.H[0] // ...............................................*................................................................................................................................................ + add v6.8H, v17.8H, v19.8H // ............................................................................................................................................................*................................... + mul v30.8H, v28.8H, v0.H[6] // ...........................................................................................................................................................................................*.... 
+ sub v18.8H, v15.8H, v27.8H // .............................................................................................................................................................................*.................. + sqrdmulh v25.8H, v6.8H, v0.H[3] // ..............................................................................................................................................................................*................. + add v20.8H, v15.8H, v27.8H // ...............................................................................................................................................................................*................ + mul v27.8H, v29.8H, v0.H[4] // ..............................................................................................................................................................*................................. + sub v15.8H, v21.8H, v13.8H // .....................................................................................*.......................................................................................................... + mla v27.8H, v16.8H, v8.H[0] // ...............................................................................................................................................................*................................ + add v16.8H, v21.8H, v13.8H // ..........................................................................*..................................................................................................................... + str q15, [x6, #144] // ............................................................................................*................................................................................................... 
+ add v21.8H, v26.8H, v5.8H // .............................................................................................................................................................................................*.. + add v15.8H, v14.8H, v31.8H // ...................................................................................................................................................................*............................ + sqrdmulh v14.8H, v28.8H, v0.H[7] // ............................................................................................................................................................................................*... + sub v26.8H, v26.8H, v5.8H // ..............................................................................................................................................................................................*. + str q16, [x6, #112] // ..............................................................................*................................................................................................................. + mul v28.8H, v6.8H, v0.H[2] // ....................................................................................................................................................................................*........... + sub v6.8H, v17.8H, v19.8H // ...........................................................................................................................................................*.................................... + sub v16.8H, v15.8H, v11.8H // ........................................................................................................................................................................*....................... 
+ add v31.8H, v15.8H, v11.8H // .......................................................................................................................................................................*........................ + sqrdmulh v15.8H, v20.8H, v3.H[3] // ...................................................................................................................................................................................*............ + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mul v20.8H, v20.8H, v3.H[2] // .....*.......................................................................................................................................................................................... + // sub v17.8H, v12.8H, v27.8H // *............................................................................................................................................................................................... + // mla v20.8H, v15.8H, v8.H[0] // ...............*................................................................................................................................................................................ + // mla v9.8H, v22.8H, v8.H[0] // .....................................*.......................................................................................................................................................... 
+ // sqrdmulh v11.8H, v7.8H, v1.H[1] // .................................*.............................................................................................................................................................. + // mla v30.8H, v14.8H, v8.H[0] // ....*........................................................................................................................................................................................... + // add v14.8H, v12.8H, v27.8H // ..*............................................................................................................................................................................................. + // add v4.8H, v23.8H, v9.8H // .............................................................*.................................................................................................................................. + // sub v13.8H, v23.8H, v9.8H // ..............................................................*................................................................................................................................. + // mul v9.8H, v26.8H, v1.H[0] // ...................*............................................................................................................................................................................ + // sqrdmulh v15.8H, v6.8H, v0.H[5] // .*.............................................................................................................................................................................................. + // sub v19.8H, v31.8H, v30.8H // ........................*....................................................................................................................................................................... 
+ // add v27.8H, v31.8H, v30.8H // .............................*.................................................................................................................................................................. + // mul v12.8H, v7.8H, v1.H[0] // ...................................*............................................................................................................................................................ + // sqrdmulh v7.8H, v21.8H, v0.H[7] // .............*.................................................................................................................................................................................. + // mul v22.8H, v6.8H, v0.H[4] // ......*......................................................................................................................................................................................... + // mla v22.8H, v15.8H, v8.H[0] // .......*........................................................................................................................................................................................ + // sqrdmulh v15.8H, v14.8H, v1.H[3] // ............................................*................................................................................................................................................... + // mul v29.8H, v14.8H, v1.H[2] // ................*............................................................................................................................................................................... + // sub v30.8H, v10.8H, v22.8H // ...........*.................................................................................................................................................................................... 
+ // add v10.8H, v10.8H, v22.8H // ..........*..................................................................................................................................................................................... + // sqrdmulh v22.8H, v13.8H, v3.H[1] // ................................................................................................................................*............................................................... + // mla v29.8H, v15.8H, v8.H[0] // .......................................................*........................................................................................................................................ + // mul v13.8H, v13.8H, v3.H[0] // .....................................................................................................................................*.......................................................... + // mla v13.8H, v22.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + // add v31.8H, v10.8H, v29.8H // ..........................................................*..................................................................................................................................... + // mla v12.8H, v11.8H, v8.H[0] // .......................................*........................................................................................................................................................ + // sub v22.8H, v10.8H, v29.8H // ......................................................................................*......................................................................................................... 
+ // sqrdmulh v11.8H, v17.8H, v1.H[5] // .........*...................................................................................................................................................................................... + // sub v14.8H, v22.8H, v13.8H // ................................................................................................................................................*............................................... + // add v15.8H, v22.8H, v13.8H // ...................................................................................................................................................*............................................ + // mul v10.8H, v17.8H, v1.H[4] // ............*................................................................................................................................................................................... + // add v23.8H, v16.8H, v12.8H // ...................................................*............................................................................................................................................ + // sub v22.8H, v16.8H, v12.8H // ..................................................*............................................................................................................................................. + // str q14, [x14, #96] // ..................................................................................................................................................*............................................. + // sqrdmulh v14.8H, v18.8H, v3.H[5] // .........................................................................*...................................................................................................................... 
+ // str q15, [x14, #64] // ......................................................................................................................................................*......................................... + // mla v10.8H, v11.8H, v8.H[0] // ..............*................................................................................................................................................................................. + // mla v28.8H, v25.8H, v8.H[0] // ..............................*................................................................................................................................................................. + // sqrdmulh v25.8H, v23.8H, v2.H[3] // ....................................................................................................................................................................*........................... + // add v15.8H, v30.8H, v10.8H // .................*.............................................................................................................................................................................. + // mul v11.8H, v21.8H, v0.H[6] // ........*....................................................................................................................................................................................... + // sub v29.8H, v24.8H, v28.8H // ......................................*......................................................................................................................................................... + // mul v16.8H, v23.8H, v2.H[2] // .........................................................................................................................................................................*...................... 
+ // add v12.8H, v15.8H, v20.8H // ...........................................*.................................................................................................................................................... + // sub v15.8H, v15.8H, v20.8H // ....................*........................................................................................................................................................................... + // add v13.8H, v24.8H, v28.8H // .........................................*...................................................................................................................................................... + // mla v16.8H, v25.8H, v8.H[0] // ...........................................................................................................................................................................*.................... + // str q12, [x14, #128] // .................................................*.............................................................................................................................................. + // sub v20.8H, v30.8H, v10.8H // ..................*............................................................................................................................................................................. + // mul v17.8H, v4.8H, v2.H[6] // ......................................................................*......................................................................................................................... + // str q15, [x14, #160] // .......................*........................................................................................................................................................................ 
+ // sqrdmulh v23.8H, v4.8H, v2.H[7] // ....................................................................*........................................................................................................................... + // sqrdmulh v15.8H, v19.8H, v2.H[1] // ............................*................................................................................................................................................................... + // mla v11.8H, v7.8H, v8.H[0] // ..........................*..................................................................................................................................................................... + // mul v30.8H, v19.8H, v2.H[0] // ...........................*.................................................................................................................................................................... + // mla v30.8H, v15.8H, v8.H[0] // ..................................*............................................................................................................................................................. + // sub v6.8H, v13.8H, v11.8H // ..............................................*................................................................................................................................................. + // sqrdmulh v19.8H, v26.8H, v1.H[1] // ......................*......................................................................................................................................................................... + // add v26.8H, v13.8H, v11.8H // ...............................................*................................................................................................................................................ 
+ // mul v13.8H, v18.8H, v3.H[4] // ...........................................................................*.................................................................................................................... + // add v12.8H, v6.8H, v30.8H // ......................................................*......................................................................................................................................... + // mla v13.8H, v14.8H, v8.H[0] // ............................................................................*................................................................................................................... + // sub v7.8H, v6.8H, v30.8H // .....................................................*.......................................................................................................................................... + // str q12, [x6, #64] // .........................................................*...................................................................................................................................... + // sqrdmulh v12.8H, v27.8H, v1.H[7] // ................................*............................................................................................................................................................... + // str q7, [x6, #96] // ........................................................*....................................................................................................................................... + // mla v9.8H, v19.8H, v8.H[0] // .......................................................................................*........................................................................................................ 
+ // mul v5.8H, v27.8H, v1.H[6] // ...............................*................................................................................................................................................................ + // mla v5.8H, v12.8H, v8.H[0] // ...............................................................*................................................................................................................................ + // add v28.8H, v29.8H, v9.8H // ................................................................................................*............................................................................................... + // sub v27.8H, v29.8H, v9.8H // ...................................................................................................*............................................................................................ + // sqrdmulh v15.8H, v22.8H, v2.H[5] // ............................................................*................................................................................................................................... + // mla v17.8H, v23.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + // add v10.8H, v28.8H, v16.8H // ....................................................................................................................................................................................*........... + // add v9.8H, v26.8H, v5.8H // ..................................................................*............................................................................................................................. 
+ // mul v22.8H, v22.8H, v2.H[4] // ................................................................*............................................................................................................................... + // sub v26.8H, v26.8H, v5.8H // ...................................................................*............................................................................................................................ + // str q10, [x6, #128] // ..........................................................................................................................................................................................*..... + // mla v22.8H, v15.8H, v8.H[0] // .................................................................*.............................................................................................................................. + // str q9, [x6], #16 // .....................................................................*.......................................................................................................................... + // sub v9.8H, v20.8H, v13.8H // ................................................................................*............................................................................................................... + // str q26, [x6, #16] // .......................................................................*........................................................................................................................ + // add v15.8H, v31.8H, v17.8H // .................................................................................*.............................................................................................................. 
+ // sub v6.8H, v31.8H, v17.8H // .........................................................................................*...................................................................................................... + // sub v17.8H, v28.8H, v16.8H // ..................................................................................................................................................................................*............. + // str q9, [x14, #224] // .....................................................................................*.......................................................................................................... + // str q15, [x14], #16 // ........................................................................................*....................................................................................................... + // add v15.8H, v20.8H, v13.8H // ...................................................................................*............................................................................................................ + // add v5.8H, v27.8H, v22.8H // .....................................................................................................*.......................................................................................... + // sub v14.8H, v27.8H, v22.8H // .......................................................................................................*........................................................................................ + // str q6, [x14, #16] // ..................................................................................................*............................................................................................. 
+ // str q17, [x6, #144] // .....................................................................................................................................................................................*.......... + // str q15, [x14, #176] // ...........................................................................................*.................................................................................................... + // str q5, [x6, #176] // ..........................................................................................................*..................................................................................... + // str q14, [x6, #208] // .............................................................................................................*.................................................................................. + // ldr q20, [x6, #0] // ..............................................................................................................................*................................................................. + // ldr q14, [x14, #160] // .....................*.......................................................................................................................................................................... + // ldr q16, [x6, #32] // ...............................................................................................................................................................*................................ + // ldr q11, [x14, #192] // ........................................*....................................................................................................................................................... + // ldr q25, [x14, #96] // ...*............................................................................................................................................................................................ 
+ // ldr q27, [x14, #224] // .............................................*.................................................................................................................................................. + // ldr q17, [x6, #224] // .............................................................................*.................................................................................................................. + // mul v22.8H, v14.8H, v0.H[0] // ......................................................................................................*......................................................................................... + // ldr q26, [x6, #96] // ....................................................................................*........................................................................................................... + // ldr q21, [x14, #0] // ....................................*........................................................................................................................................................... + // sqrdmulh v15.8H, v27.8H, v0.H[1] // ....................................................*........................................................................................................................................... + // ldr q19, [x14, #32] // ...............................................................................*................................................................................................................ + // ldr q30, [x6, #64] // ............................................................................................................................................*................................................... 
+ // mul v13.8H, v27.8H, v0.H[0] // ..............................................................................*................................................................................................................. + // ldr q12, [x6, #128] // ....................................................................................................................................................*........................................... + // ldr q6, [x14, #128] // .........................................................................................................*...................................................................................... + // ldr q28, [x14, #64] // ........................................................................*....................................................................................................................... + // sqrdmulh v23.8H, v25.8H, v0.H[1] // ........................................................................................................................*....................................................................... + // ldr q5, [x6, #160] // ...........................................................*.................................................................................................................................... + // ldr q9, [x6, #192] // ..............................................................................................*................................................................................................. + // sqrdmulh v27.8H, v14.8H, v0.H[1] // .........................*...................................................................................................................................................................... 
+ // sqrdmulh v29.8H, v21.8H, v0.H[1] // ................................................*............................................................................................................................................... + // mul v14.8H, v21.8H, v0.H[0] // ..........................................*..................................................................................................................................................... + // mla v22.8H, v27.8H, v8.H[0] // ............................................................................................................*................................................................................... + // mul v31.8H, v19.8H, v0.H[0] // ...........................................................................................................................................*.................................................... + // mla v14.8H, v29.8H, v8.H[0] // ......................................................................................................................*......................................................................... + // add v7.8H, v5.8H, v22.8H // .................................................................................................................*.............................................................................. + // mla v13.8H, v15.8H, v8.H[0] // ..................................................................................*............................................................................................................. + // sub v27.8H, v5.8H, v22.8H // ...................................................................................................................*............................................................................ 
+ // sqrdmulh v21.8H, v7.8H, v0.H[3] // .......................................................................................................................................................*........................................ + // sub v10.8H, v20.8H, v14.8H // ......................................................................................................................................*......................................................... + // add v24.8H, v20.8H, v14.8H // ...................................................................................................................................*............................................................ + // mul v20.8H, v25.8H, v0.H[0] // .........................................................................................................................*...................................................................... + // mla v20.8H, v23.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + // mul v14.8H, v7.8H, v0.H[2] // .........................................................................................................................................................*...................................... + // add v25.8H, v17.8H, v13.8H // ............................................................................................*................................................................................................... + // sqrdmulh v5.8H, v11.8H, v0.H[1] // ..........................................................................................*..................................................................................................... 
+ // sub v7.8H, v17.8H, v13.8H // .............................................................................................*.................................................................................................. + // mul v11.8H, v11.8H, v0.H[0] // .................................................................................................*.............................................................................................. + // add v13.8H, v26.8H, v20.8H // ...............................................................................................................................*................................................................ + // sub v26.8H, v26.8H, v20.8H // .................................................................................................................................*.............................................................. + // sqrdmulh v17.8H, v7.8H, v0.H[5] // ..........................................................................................................................*..................................................................... + // mul v23.8H, v7.8H, v0.H[4] // ............................................................................................................................*................................................................... + // mul v29.8H, v27.8H, v0.H[4] // ................................................................................................................................................................*............................... + // mla v23.8H, v17.8H, v8.H[0] // .............................................................................................................................*.................................................................. 
+ // sqrdmulh v4.8H, v27.8H, v0.H[5] // .............................................................................................................................................................*.................................. + // mla v11.8H, v5.8H, v8.H[0] // ....................................................................................................*........................................................................................... + // add v5.8H, v26.8H, v23.8H // ....................................................................................................................................*........................................................... + // sub v26.8H, v26.8H, v23.8H // .......................................................................................................................................*........................................................ + // sqrdmulh v20.8H, v6.8H, v0.H[1] // ..................................................................................................................*............................................................................. + // sqrdmulh v22.8H, v5.8H, v1.H[3] // ........................................................................................................................................*....................................................... + // sub v7.8H, v9.8H, v11.8H // ...........................................................................................................*.................................................................................... + // add v9.8H, v9.8H, v11.8H // ..............................................................................................................*................................................................................. 
+ // sqrdmulh v15.8H, v26.8H, v1.H[5] // .........................................................................................................................................*...................................................... + // sqrdmulh v17.8H, v7.8H, v0.H[5] // ...............................................................................................................*................................................................................ + // mul v18.8H, v6.8H, v0.H[0] // ...................................................................................................................................................................*............................ + // mla v18.8H, v20.8H, v8.H[0] // ......................................................................................................................................................................*......................... + // mul v20.8H, v26.8H, v1.H[4] // ..............................................................................................................................................*................................................. + // mla v20.8H, v15.8H, v8.H[0] // ...............................................................................................................................................*................................................ + // sub v6.8H, v12.8H, v18.8H // ............................................................................................................................................................................................*... + // add v26.8H, v12.8H, v18.8H // ............................................................................................................................................................................*................... 
+ // sqrdmulh v18.8H, v19.8H, v0.H[1] // ..................................................................................................................................*............................................................. + // mul v27.8H, v7.8H, v0.H[4] // .................................................................................................................................................................................*.............. + // mla v27.8H, v17.8H, v8.H[0] // ...................................................................................................................................................................................*............ + // mla v31.8H, v18.8H, v8.H[0] // .............................................................................................................................................*.................................................. + // sqrdmulh v23.8H, v25.8H, v0.H[3] // .................................................................................................................................................*.............................................. + // mla v14.8H, v21.8H, v8.H[0] // ..........................................................................................................................................................*..................................... + // add v18.8H, v16.8H, v31.8H // .......................................................................................................................................................................................*........ + // mla v29.8H, v4.8H, v8.H[0] // .................................................................................................................................................................*.............................. 
+ // mul v17.8H, v28.8H, v0.H[0] // ...............................................................................................*................................................................................................ + // sub v21.8H, v16.8H, v31.8H // ..................................................................................................................................................................*............................. + // add v31.8H, v18.8H, v14.8H // ..............................................................................................................................................................................................*. + // sub v16.8H, v18.8H, v14.8H // .............................................................................................................................................................................................*.. + // sqrdmulh v14.8H, v28.8H, v0.H[1] // ........................................................................................................*....................................................................................... + // mul v19.8H, v25.8H, v0.H[2] // .....................................................................................................................................................*.......................................... + // mla v19.8H, v23.8H, v8.H[0] // ........................................................................................................................................................*....................................... + // sub v23.8H, v21.8H, v29.8H // .....................................................................................................................................................................*.......................... 
+ // sub v18.8H, v23.8H, v20.8H // ..............................................................................................................................................................................*................. + // sqrdmulh v25.8H, v26.8H, v0.H[3] // ...............................................................................................................................................................................*................ + // add v20.8H, v23.8H, v20.8H // ................................................................................................................................................................................*............... + // sub v7.8H, v13.8H, v19.8H // ...........................................................................................................................................................*.................................... + // sqrdmulh v11.8H, v9.8H, v0.H[3] // ................................................................................................................*............................................................................... + // add v23.8H, v21.8H, v29.8H // .......................................................................................................................................................................*........................ + // sqrdmulh v15.8H, v20.8H, v3.H[3] // ...............................................................................................................................................................................................* + // mul v28.8H, v26.8H, v0.H[2] // ...........................................................................................................................................................................................*.... 
+ // mla v17.8H, v14.8H, v8.H[0] // .......................................................................................................................*........................................................................ + // mul v4.8H, v9.8H, v0.H[2] // ....................................................................................................................*........................................................................... + // add v9.8H, v13.8H, v19.8H // ............................................................................................................................................................*................................... + // mla v4.8H, v11.8H, v8.H[0] // .....................................................................................................................*.......................................................................... + // add v19.8H, v30.8H, v17.8H // ........................................................................................................................................................................*....................... + // sub v12.8H, v30.8H, v17.8H // ..........................................................................................................................................................................*..................... + // mul v30.8H, v9.8H, v0.H[6] // .............................................................................................................................................................................*.................. + // sqrdmulh v14.8H, v9.8H, v0.H[7] // ........................................................................................................................................................................................*....... 
+ // add v21.8H, v19.8H, v4.8H // ......................................................................................................................................................................................*......... + // sub v26.8H, v19.8H, v4.8H // .........................................................................................................................................................................................*...... + // mul v9.8H, v5.8H, v1.H[2] // ..............................................................................................................................................................*................................. + + sub count, count, #1 cbnz count, layer1234_start - add v5.8H, v7.8H, v25.8H - sqrdmulh v19.8H, v19.8H, v0.H[5] - sub v18.8H, v31.8H, v23.8H - sub v8.8H, v7.8H, v25.8H - mul v6.8H, v22.8H, v1.H[2] - sqrdmulh v9.8H, v5.8H, v1.H[7] - sqrdmulh v28.8H, v8.8H, v2.H[1] - mul v7.8H, v8.8H, v2.H[0] - mla v16.8H, v19.8H, v30.8H - mla v7.8H, v28.8H, v30.8H - mul v11.8H, v5.8H, v1.H[6] - mla v11.8H, v9.8H, v30.8H - add v31.8H, v13.8H, v7.8H - sub v5.8H, v13.8H, v7.8H - sqrdmulh v28.8H, v22.8H, v1.H[3] - str_vo v31, x6, 64 - mla v14.8H, v26.8H, v30.8H - str_vo v5, x6, 96 - add v7.8H, v10.8H, v11.8H - sub v26.8H, v10.8H, v11.8H - add v31.8H, v20.8H, v16.8H - sqrdmulh v25.8H, v27.8H, v1.H[1] - str_vo v26, x6, 32 - sub v8.8H, v20.8H, v16.8H - mul v16.8H, v27.8H, v1.H[0] - str_vi v7, x6, 16 - sub v12.8H, v21.8H, v14.8H - sqrdmulh v26.8H, v24.8H, v1.H[5] - mul v27.8H, v24.8H, v1.H[4] - mla v27.8H, v26.8H, v30.8H - mla v16.8H, v25.8H, v30.8H - mul v20.8H, v8.8H, v1.H[4] - add v22.8H, v17.8H, v27.8H - sub v9.8H, v17.8H, v27.8H - sub v25.8H, v4.8H, v16.8H - sqrdmulh v26.8H, v8.8H, v1.H[5] - add v11.8H, v4.8H, v16.8H - sqrdmulh v19.8H, v31.8H, v1.H[3] - mla v20.8H, v26.8H, v30.8H - mul v23.8H, v31.8H, v1.H[2] - mla v23.8H, v19.8H, v30.8H - mla v6.8H, v28.8H, v30.8H - add v4.8H, v21.8H, v14.8H - sub v28.8H, v12.8H, 
v20.8H - add v8.8H, v12.8H, v20.8H - sqrdmulh v21.8H, v18.8H, v1.H[1] - sub v20.8H, v4.8H, v23.8H - add v26.8H, v4.8H, v23.8H - mul v27.8H, v18.8H, v1.H[0] - sub v7.8H, v15.8H, v6.8H - add v23.8H, v15.8H, v6.8H - mla v27.8H, v21.8H, v30.8H - sqrdmulh v4.8H, v26.8H, v2.H[7] - mul v16.8H, v26.8H, v2.H[6] - mla v16.8H, v4.8H, v30.8H - sqrdmulh v4.8H, v20.8H, v3.H[1] - sub v6.8H, v29.8H, v27.8H - sqrdmulh v17.8H, v8.8H, v3.H[3] - mul v24.8H, v20.8H, v3.H[0] - add v21.8H, v29.8H, v27.8H - mla v24.8H, v4.8H, v30.8H - sqrdmulh v5.8H, v6.8H, v2.H[5] - sub v4.8H, v7.8H, v24.8H - mul v13.8H, v6.8H, v2.H[4] - add v27.8H, v23.8H, v16.8H - sqrdmulh v26.8H, v21.8H, v2.H[3] - str_vo v4, x14, 96 - sub v6.8H, v23.8H, v16.8H - str_vi v27, x14, 16 - sqrdmulh v31.8H, v28.8H, v3.H[5] - mul v23.8H, v21.8H, v2.H[2] - mla v23.8H, v26.8H, v30.8H - mul v26.8H, v8.8H, v3.H[2] - mla v13.8H, v5.8H, v30.8H - sub v12.8H, v11.8H, v23.8H - mla v26.8H, v17.8H, v30.8H - str_vo v12, x6, 144 - mul v4.8H, v28.8H, v3.H[4] - add v10.8H, v25.8H, v13.8H - str_vo v6, x14, 16 - sub v6.8H, v25.8H, v13.8H - mla v4.8H, v31.8H, v30.8H - sub v16.8H, v22.8H, v26.8H - str_vo v10, x6, 176 - add v21.8H, v11.8H, v23.8H - add v11.8H, v22.8H, v26.8H - add v7.8H, v7.8H, v24.8H - str_vo v6, x6, 208 - str_vo v16, x14, 144 - sub v23.8H, v9.8H, v4.8H - add v26.8H, v9.8H, v4.8H - str_vo v11, x14, 112 - str_vo v21, x6, 112 - str_vo v7, x14, 48 - str_vo v23, x14, 208 - str_vo v26, x14, 176 + mul v20.8H, v20.8H, v3.H[2] // .......................................................................................................................................................................*........................ + sub v17.8H, v12.8H, v27.8H // .........................................................................................*...................................................................................................... 
+ mla v20.8H, v15.8H, v8.H[0] // ........................................................................................................................................................................*....................... + mla v9.8H, v22.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + sqrdmulh v11.8H, v7.8H, v1.H[1] // ...............................................................................................................*................................................................................ + mla v30.8H, v14.8H, v8.H[0] // .......................................................................................................*........................................................................................ + add v14.8H, v12.8H, v27.8H // ..........................................................................................*..................................................................................................... + add v4.8H, v23.8H, v9.8H // .............................................................................................................................*.................................................................. + sub v13.8H, v23.8H, v9.8H // ............................................................................................................................*................................................................... + mul v9.8H, v26.8H, v1.H[0] // ...........................................................................................................*.................................................................................... 
+ sqrdmulh v15.8H, v6.8H, v0.H[5] // ............................................................................*................................................................................................................... + sub v19.8H, v31.8H, v30.8H // ........................................................................................................*....................................................................................... + add v27.8H, v31.8H, v30.8H // .........................................................................................................*...................................................................................... + mul v12.8H, v7.8H, v1.H[0] // ................................................................................................................*............................................................................... + sqrdmulh v7.8H, v21.8H, v0.H[7] // ................................................................................................*............................................................................................... + mul v22.8H, v6.8H, v0.H[4] // .............................................................................*.................................................................................................................. + mla v22.8H, v15.8H, v8.H[0] // ..............................................................................*................................................................................................................. + sqrdmulh v15.8H, v14.8H, v1.H[3] // ....................................................................................................................*........................................................................... 
+ mul v29.8H, v14.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + sub v30.8H, v10.8H, v22.8H // ...............................................................................*................................................................................................................ + add v10.8H, v10.8H, v22.8H // ................................................................................*............................................................................................................... + sqrdmulh v22.8H, v13.8H, v3.H[1] // .................................................................................................................................................................*.............................. + mla v29.8H, v15.8H, v8.H[0] // ......................................................................................................................*......................................................................... + mul v13.8H, v13.8H, v3.H[0] // ..................................................................................................................................................................*............................. + mla v13.8H, v22.8H, v8.H[0] // ...................................................................................................................................................................*............................ + add v31.8H, v10.8H, v29.8H // ........................................................................................................................*....................................................................... 
+ mla v12.8H, v11.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sub v22.8H, v10.8H, v29.8H // .......................................................................................................................*........................................................................ + sqrdmulh v11.8H, v17.8H, v1.H[5] // ..............................................................................................................................*................................................................. + sub v14.8H, v22.8H, v13.8H // ....................................................................................................................................................................*........................... + add v15.8H, v22.8H, v13.8H // .....................................................................................................................................................................*.......................... + mul v10.8H, v17.8H, v1.H[4] // ...............................................................................................................................*................................................................ + add v23.8H, v16.8H, v12.8H // ...................................................................................................................*............................................................................ + sub v22.8H, v16.8H, v12.8H // ..................................................................................................................*............................................................................. 
+ str q14, [x14, #96] // ...........................................................................................................................................................................................*.... + sqrdmulh v14.8H, v18.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + str q15, [x14, #64] // ..........................................................................................................................................................................................*..... + mla v10.8H, v11.8H, v8.H[0] // ................................................................................................................................*............................................................... + mla v28.8H, v25.8H, v8.H[0] // ..........................................................*..................................................................................................................................... + sqrdmulh v25.8H, v23.8H, v2.H[3] // ..................................................................................................................................................*............................................. + add v15.8H, v30.8H, v10.8H // ..................................................................................................................................*............................................................. + mul v11.8H, v21.8H, v0.H[6] // .................................................................................................*.............................................................................................. 
+ sub v29.8H, v24.8H, v28.8H // ...........................................................*.................................................................................................................................... + mul v16.8H, v23.8H, v2.H[2] // ...................................................................................................................................................*............................................ + add v12.8H, v15.8H, v20.8H // ..........................................................................................................................................................................*..................... + sub v15.8H, v15.8H, v20.8H // .........................................................................................................................................................................*...................... + add v13.8H, v24.8H, v28.8H // ............................................................*................................................................................................................................... + mla v16.8H, v25.8H, v8.H[0] // ....................................................................................................................................................*........................................... + str q12, [x14, #128] // ............................................................................................................................................................................................*... + sub v20.8H, v30.8H, v10.8H // .................................................................................................................................*.............................................................. 
+ mul v17.8H, v4.8H, v2.H[6] // .............................................................................................................................................................*.................................. + str q15, [x14, #160] // .............................................................................................................................................................................................*.. + sqrdmulh v23.8H, v4.8H, v2.H[7] // ............................................................................................................................................................*................................... + sqrdmulh v15.8H, v19.8H, v2.H[1] // .............................................................................................................................................*.................................................. + mla v11.8H, v7.8H, v8.H[0] // ..................................................................................................*............................................................................................. + mul v30.8H, v19.8H, v2.H[0] // ..............................................................................................................................................*................................................. + mla v30.8H, v15.8H, v8.H[0] // ...............................................................................................................................................*................................................ + sub v6.8H, v13.8H, v11.8H // ...................................................................................................*............................................................................................ 
+ sqrdmulh v19.8H, v26.8H, v1.H[1] // ..........................................................................................................*..................................................................................... + add v26.8H, v13.8H, v11.8H // ....................................................................................................*........................................................................................... + mul v13.8H, v18.8H, v3.H[4] // ............................................................................................................................................................................*................... + add v12.8H, v6.8H, v30.8H // .................................................................................................................................................*.............................................. + mla v13.8H, v14.8H, v8.H[0] // .............................................................................................................................................................................*.................. + sub v7.8H, v6.8H, v30.8H // ................................................................................................................................................*............................................... + str q12, [x6, #64] // ..................................................................................................................................................................................*............. + sqrdmulh v12.8H, v27.8H, v1.H[7] // ........................................................................................................................................*....................................................... 
+ str q7, [x6, #96] // ...................................................................................................................................................................................*............ + mla v9.8H, v19.8H, v8.H[0] // ............................................................................................................*................................................................................... + mul v5.8H, v27.8H, v1.H[6] // .........................................................................................................................................*...................................................... + mla v5.8H, v12.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + add v28.8H, v29.8H, v9.8H // ..............................................................................................................*................................................................................. + sub v27.8H, v29.8H, v9.8H // .............................................................................................................*.................................................................................. + sqrdmulh v15.8H, v22.8H, v2.H[5] // .......................................................................................................................................................*........................................ + mla v17.8H, v23.8H, v8.H[0] // ..............................................................................................................................................................*................................. 
+ add v10.8H, v28.8H, v16.8H // ......................................................................................................................................................*......................................... + add v9.8H, v26.8H, v5.8H // ............................................................................................................................................*................................................... + mul v22.8H, v22.8H, v2.H[4] // ........................................................................................................................................................*....................................... + sub v26.8H, v26.8H, v5.8H // ...........................................................................................................................................*.................................................... + str q10, [x6, #128] // ....................................................................................................................................................................................*........... + mla v22.8H, v15.8H, v8.H[0] // .........................................................................................................................................................*...................................... + str q9, [x6], #16 // ................................................................................................................................................................................*............... + sub v9.8H, v20.8H, v13.8H // ..............................................................................................................................................................................*................. + str q26, [x6, #16] // .................................................................................................................................................................................*.............. 
+ add v15.8H, v31.8H, v17.8H // ................................................................................................................................................................*............................... + sub v6.8H, v31.8H, v17.8H // ...............................................................................................................................................................*................................ + sub v17.8H, v28.8H, v16.8H // .....................................................................................................................................................*.......................................... + str q9, [x14, #224] // ...............................................................................................................................................................................................* + str q15, [x14], #16 // ........................................................................................................................................................................................*....... + add v15.8H, v20.8H, v13.8H // ...............................................................................................................................................................................*................ + add v5.8H, v27.8H, v22.8H // ...........................................................................................................................................................*.................................... + sub v14.8H, v27.8H, v22.8H // ..........................................................................................................................................................*..................................... 
+ str q6, [x14, #16] // .........................................................................................................................................................................................*...... + str q17, [x6, #144] // .....................................................................................................................................................................................*.......... + str q15, [x14, #176] // ..............................................................................................................................................................................................*. + str q5, [x6, #176] // ......................................................................................................................................................................................*......... + str q14, [x6, #208] // .......................................................................................................................................................................................*........ restore inp, STACK0 mov count, #4 - ASM_LOAD(xtmp, barrett_const_addr) - ld1r {barrett_const.8h}, [xtmp] ASM_LOAD(r_ptr1, roots_l456) add src0, inp, #256*0 add src1, inp, #256*1 .p2align 2 + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Cycle bound: 1.0 + // IPC bound: 1.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + // gap // .............................. + // gap // .............................. + ldr q19, [x4, #16] // *............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q19, [x4, #16] // *.............................. 
+ + sub count, count, #1 layer567_start: + // Instructions: 118 + // Expected cycles: 121 + // Expected IPC: 0.98 + // + // Cycle bound: 121.0 + // IPC bound: 0.98 + // + // Wall time: 3607.83s + // User time: 3607.83s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x6] // *..................................................................................................................... + ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x7] // .*.................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v20.4S, v15.4S, v24.4S // ..*................................................................................................................... + ldr q13, [x4, #144] // ...................*.................................................................................................. + trn2 v14.4S, v17.4S, v26.4S // .......*.............................................................................................................. + trn2 v9.4S, v15.4S, v24.4S // ...*.................................................................................................................. + ldr q1, [x4, #192] // ......................*............................................................................................... + ldr q5, [x4], #16*14 // ..........*........................................................................................................... + ldr q7, [x4, #-176] // .............*........................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q23, [x4, #-64] // ....................*................................................................................................. + sqrdmulh v30.8H, v14.8H, v19.8H // ..................................*................................................................................... + // gap // ...................................................................................................................... 
+ ldr q4, [x4, #-96] // ..................*................................................................................................... + // gap // ...................................................................................................................... + trn2 v3.4S, v18.4S, v27.4S // .........*............................................................................................................ + mul v29.8H, v14.8H, v5.8H // ...................................*.................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q24, [x4, #-48] // .....................*................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q11, [x4, #-128] // ................*..................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.8H, v3.8H, v19.8H // .......................................*.............................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ trn2 v21.4S, v16.4S, v25.4S // .....*................................................................................................................ + mla v29.8H, v30.8H, v8.H[0] // ....................................*................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v21.8H, v19.8H // .............................*........................................................................................ + trn1 v2.4S, v17.4S, v26.4S // ......*............................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v31.8H, v3.8H, v5.8H // ........................................*............................................................................. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v0.8H, v2.8H, v29.8H // ......................................*............................................................................... + mla v31.8H, v15.8H, v8.H[0] // .........................................*............................................................................ + // gap // ...................................................................................................................... + ldr q22, [x4, #-192] // ............*......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v17.8H, v0.8H, v7.8H // ............................................*......................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v14.8H, v0.8H, v22.8H // .............................................*........................................................................ + // gap // ...................................................................................................................... + trn1 v3.4S, v18.4S, v27.4S // ........*............................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q12, [x4, #-144] // ...............*...................................................................................................... + mul v0.8H, v21.8H, v5.8H // ..............................*....................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ add v15.8H, v3.8H, v31.8H // ...........................................*.......................................................................... + sub v31.8H, v3.8H, v31.8H // ..........................................*........................................................................... + mla v0.8H, v6.8H, v8.H[0] // ...............................*...................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v15.8H, v7.8H // .................................................*.................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v9.8H, v19.8H // ........................*............................................................................................. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v28.8H, v15.8H, v22.8H // ..................................................*................................................................... + trn1 v30.4S, v16.4S, v25.4S // ....*................................................................................................................. + ldr q3, [x4, #-16] // .......................*.............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v28.8H, v27.8H, v8.H[0] // ...................................................*.................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v16.8H, v30.8H, v0.8H // .................................*.................................................................................... + mul v22.8H, v9.8H, v5.8H // .........................*............................................................................................ + ldr q5, [x4, #-112] // .................*.................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v22.8H, v26.8H, v8.H[0] // ..........................*........................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v15.8H, v16.8H, v28.8H // .....................................................*................................................................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mla v14.8H, v17.8H, v8.H[0] // ..............................................*....................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v27.8H, v16.8H, v28.8H // ....................................................*................................................................. + sqrdmulh v16.8H, v15.8H, v5.8H // ................................................................*..................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v26.8H, v20.8H, v22.8H // ............................*......................................................................................... + mul v6.8H, v15.8H, v11.8H // .................................................................*.................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v5.8H, v26.8H, v14.8H // ...............................................*...................................................................... + sqrdmulh v15.8H, v27.8H, v13.8H // .....................................................................*................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v6.8H, v16.8H, v8.H[0] // ..................................................................*................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v27.8H, v27.8H, v4.8H // ......................................................................*............................................... + add v10.8H, v26.8H, v14.8H // ................................................*..................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v27.8H, v15.8H, v8.H[0] // .......................................................................*.............................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v15.8H, v10.8H, v6.8H // ....................................................................*................................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sub v16.8H, v10.8H, v6.8H // ...................................................................*.................................................. + ldr q4, [x4, #-160] // ..............*....................................................................................................... + sqrdmulh v26.8H, v31.8H, v12.8H // ...........................................................*.......................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v17.8H, v15.8H, v8.H[1] // ....................................................................................*................................. + add v14.8H, v5.8H, v27.8H // .........................................................................*............................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v27.8H, v5.8H, v27.8H // ........................................................................*............................................. 
+ mul v7.8H, v31.8H, v4.8H // ............................................................*......................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v31.8H, v14.8H, v8.H[1] // ..........................................................................................*........................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v19.8H, v2.8H, v29.8H // .....................................*................................................................................ + mla v7.8H, v26.8H, v8.H[0] // .............................................................*........................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v13.8H, v19.8H, v4.8H // .......................................................*.............................................................. + // gap // ...................................................................................................................... + sub v25.8H, v30.8H, v0.8H // ................................*..................................................................................... + srshr v0.8H, v31.8H, #11 // ...........................................................................................*.......................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v31.8H, v19.8H, v12.8H // ......................................................*............................................................... + ldr q19, [x4, #16] // ...........e.......................................................................................................... + // gap // ...................................................................................................................... + sub v26.8H, v25.8H, v7.8H // ..............................................................*....................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mla v14.8H, v0.8H, v8.H[0] // ............................................................................................*......................... + sub v0.8H, v20.8H, v22.8H // ...........................*.......................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v26.8H, v3.8H // ...............................................................................*...................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v13.8H, v31.8H, v8.H[0] // ........................................................*............................................................. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v22.8H, v26.8H, v1.8H // ................................................................................*..................................... + add v26.8H, v25.8H, v7.8H // ...............................................................*...................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v22.8H, v6.8H, v8.H[0] // .................................................................................*.................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v2.8H, v0.8H, v13.8H // .........................................................*............................................................ + sqrdmulh v29.8H, v26.8H, v24.8H // ..........................................................................*........................................... + add v3.8H, v0.8H, v13.8H // ..........................................................*........................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v0.8H, v26.8H, v23.8H // ...........................................................................*.......................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v7.8H, v2.8H, v22.8H // ...................................................................................*.................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v25.8H, v27.8H, v8.H[1] // .............................................................................................*........................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v0.8H, v29.8H, v8.H[0] // ............................................................................*......................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqdmulh v31.8H, v7.8H, v8.H[1] // ......................................................................................................*............... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v11.8H, v2.8H, v22.8H // ..................................................................................*................................... + sqdmulh v22.8H, v16.8H, v8.H[1] // .......................................................................................*.............................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v26.8H, v3.8H, v0.8H // ..............................................................................*....................................... + // gap // ...................................................................................................................... + srshr v13.8H, v25.8H, #11 // ..............................................................................................*....................... + sqdmulh v25.8H, v11.8H, v8.H[1] // .........................................................................................................*............ 
+ sub v6.8H, v3.8H, v0.8H // .............................................................................*........................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v0.8H, v26.8H, v8.H[1] // ................................................................................................*..................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v27.8H, v13.8H, v8.H[0] // ...............................................................................................*...................... + srshr v22.8H, v22.8H, #11 // ........................................................................................*............................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqdmulh v13.8H, v6.8H, v8.H[1] // ...................................................................................................*.................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v0.8H, v0.8H, #11 // .................................................................................................*.................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v16.8H, v22.8H, v8.H[0] // .........................................................................................*............................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v22.8H, v17.8H, #11 // .....................................................................................*................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v26.8H, v0.8H, v8.H[0] // ..................................................................................................*................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v13.8H, v13.8H, #11 // ....................................................................................................*................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v15.8H, v22.8H, v8.H[0] // ......................................................................................*............................... + srshr v0.8H, v31.8H, #11 // .......................................................................................................*.............. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v6.8H, v13.8H, v8.H[0] // .....................................................................................................*................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ srshr v13.8H, v25.8H, #11 // ..........................................................................................................*........... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v7.8H, v0.8H, v8.H[0] // ........................................................................................................*............. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v11.8H, v13.8H, v8.H[0] // ...........................................................................................................*.......... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v1.4S, v16.4S, v6.4S // ..............................................................................................................*....... + trn2 v16.4S, v16.4S, v6.4S // ...............................................................................................................*...... + trn1 v2.4S, v14.4S, v7.4S // ................................................................................................................*..... + trn2 v17.4S, v14.4S, v7.4S // .................................................................................................................*.... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v0.4S, v15.4S, v26.4S // ............................................................................................................*......... + trn2 v15.4S, v15.4S, v26.4S // .............................................................................................................*........ + trn1 v3.4S, v27.4S, v11.4S // ..................................................................................................................*... + trn2 v18.4S, v27.4S, v11.4S // ...................................................................................................................*.. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x6], #64 // ....................................................................................................................*. + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // .....................................................................................................................* + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... - ld4 {data8.4S, data9.4S, data10.4S, data11.4S}, [src0] - ld4 {data12.4S, data13.4S, data14.4S, data15.4S}, [src1] - - trn1_s data0, data8, data12 - trn2_s data4, data8, data12 - trn1_s data1, data9, data13 - trn2_s data5, data9, data13 - trn1_s data2, data10, data14 - trn2_s data6, data10, data14 - trn1_s data3, data11, data15 - trn2_s data7, data11, data15 - - // load twiddle factors - ldr_vi root0, r_ptr1, 16*14 - ldr_vo root0_tw, r_ptr1, -16*14+16*1 - ldr_vo root1, r_ptr1, -16*14+16*2 - ldr_vo root1_tw, r_ptr1, -16*14+16*3 - ldr_vo root2, r_ptr1, -16*14+16*4 - ldr_vo root2_tw, r_ptr1, -16*14+16*5 - ldr_vo root3, r_ptr1, -16*14+16*6 - ldr_vo root3_tw, r_ptr1, -16*14+16*7 - - ldr_vo data8, r_ptr1, -16*14+16*8 - ldr_vo data9, r_ptr1, -16*14+16*9 - ldr_vo data10, r_ptr1, -16*14+16*10 - ldr_vo data11, r_ptr1, -16*14+16*11 - ldr_vo data12, r_ptr1, -16*14+16*12 - ldr_vo data13, r_ptr1, -16*14+16*13 - - // butterflies - ct_butterfly_v data0, data4, root0, root0_tw - ct_butterfly_v data1, data5, root0, root0_tw - ct_butterfly_v data2, data6, root0, root0_tw - ct_butterfly_v data3, data7, root0, root0_tw - - ct_butterfly_v data0, data2, root1, root1_tw - ct_butterfly_v data1, data3, root1, root1_tw - ct_butterfly_v data4, data6, root2, root2_tw - ct_butterfly_v data5, data7, root2, root2_tw - - ct_butterfly_v data0, data1, root3, root3_tw - ct_butterfly_v data2, data3, data8, data9 - ct_butterfly_v data4, data5, data10, data11 - ct_butterfly_v data6, data7, data12, data13 - - // transpose back - trn1_s data8, data0, data4 - trn2_s data12, data0, data4 - trn1_s data9, data1, data5 - 
trn2_s data13, data1, data5 - trn1_s data10, data2, data6 - trn2_s data14, data2, data6 - trn1_s data11, data3, data7 - trn2_s data15, data3, data7 - - // reduce - barrett_reduce data8, barrett_const, 0 - barrett_reduce data9, barrett_const, 0 - barrett_reduce data10, barrett_const, 0 - barrett_reduce data11, barrett_const, 0 - barrett_reduce data12, barrett_const, 0 - barrett_reduce data13, barrett_const, 0 - barrett_reduce data14, barrett_const, 0 - barrett_reduce data15, barrett_const, 0 - - st4 {data8.4S, data9.4S, data10.4S, data11.4S}, [src0], #64 - st4 {data12.4S, data13.4S, data14.4S, data15.4S}, [src1], #64 - - subs count, count, #1 + // --------------------------------------------------------------------------- new position ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // ................................................*..................................................................................................................... + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // ................................................'*.................................................................................................................... + // trn1 v9.4s, v17.4s, v21.4s // ................................................'.*................................................................................................................... + // trn2 v13.4s, v17.4s, v21.4s // ................................................'....*................................................................................................................ 
+ // trn1 v10.4s, v18.4s, v22.4s // ................................................'...................................*................................................................................. + // trn2 v14.4s, v18.4s, v22.4s // ................................................'................*.................................................................................................... + // trn1 v11.4s, v19.4s, v23.4s // ................................................'...................*................................................................................................. + // trn2 v15.4s, v19.4s, v23.4s // ................................................'...*................................................................................................................. + // trn1 v12.4s, v20.4s, v24.4s // ................................................'..........................*.......................................................................................... + // trn2 v16.4s, v20.4s, v24.4s // ................................................'...........*......................................................................................................... + // ldr q0, [ x4], #16*14 // ................................................'......*.............................................................................................................. + // ldr q4, [x4, #-16*14+16*1] // e...............................................'.....................................................................~............................................... + // ldr q1, [ x4, #-16*14+16*2] // ................................................'.......................*............................................................................................. 
+ // ldr q5, [x4, #-16*14+16*3] // ................................................'.......*............................................................................................................. + // ldr q2, [ x4, #-16*14+16*4] // ................................................'........................................................*............................................................ + // ldr q6, [x4, #-16*14+16*5] // ................................................'...........................*......................................................................................... + // ldr q3, [ x4, #-16*14+16*6] // ................................................'..............*...................................................................................................... + // ldr q7, [x4, #-16*14+16*7] // ................................................'........................................*............................................................................ + // ldr q17, [ x4, #-16*14+16*8] // ................................................'..........*.......................................................................................................... + // ldr q18, [ x4, #-16*14+16*9] // ................................................'..*.................................................................................................................. + // ldr q19, [ x4, #-16*14+16*10] // ................................................'........*............................................................................................................ + // ldr q20, [ x4, #-16*14+16*11] // ................................................'.............*....................................................................................................... 
+ // ldr q21, [ x4, #-16*14+16*12] // ................................................'.....*............................................................................................................... + // ldr q22, [ x4, #-16*14+16*13] // ................................................'....................................*................................................................................ + // sqrdmulh v28.8h, v13.8h, v4.8h // ................................................'.................................*................................................................................... + // mul v25.8h, v13.8h, v0.8h // ................................................'.......................................*............................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ................................................'.........................................*........................................................................... + // sub v13.8h, v9.8h, v25.8h // ...~............................................'........................................................................*............................................ + // add v9.8h, v9.8h, v25.8h // ................................................'..............................................*...................................................................... + // sqrdmulh v28.8h, v14.8h, v4.8h // ................................................'..................*.................................................................................................. + // mul v25.8h, v14.8h, v0.8h // ................................................'............................*........................................................................................ 
+ // mla v25.8h, v28.8h, v8.h[0] // ................................................'...............................*..................................................................................... + // sub v14.8h, v10.8h, v25.8h // ................................................'..................................................................*.................................................. + // add v10.8h, v10.8h, v25.8h // ................................................'......................................*.............................................................................. + // sqrdmulh v28.8h, v15.8h, v4.8h // ................................................'.........*........................................................................................................... + // mul v25.8h, v15.8h, v0.8h // ................................................'............*........................................................................................................ + // mla v25.8h, v28.8h, v8.h[0] // ................................................'.................*................................................................................................... + // sub v15.8h, v11.8h, v25.8h // ................................................'...............................................................*..................................................... + // add v11.8h, v11.8h, v25.8h // ................................................'.....................*............................................................................................... + // sqrdmulh v28.8h, v16.8h, v4.8h // ................................................'...............*..................................................................................................... 
+ // mul v25.8h, v16.8h, v0.8h // ................................................'....................*................................................................................................ + // mla v25.8h, v28.8h, v8.h[0] // ................................................'......................*.............................................................................................. + // sub v16.8h, v12.8h, v25.8h // ................................................'..............................*...................................................................................... + // add v12.8h, v12.8h, v25.8h // ................................................'.............................*....................................................................................... + // sqrdmulh v28.8h, v11.8h, v5.8h // ................................................'........................*............................................................................................ + // mul v25.8h, v11.8h, v1.8h // ................................................'.........................*........................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ................................................'...........................................*......................................................................... + // sub v11.8h, v9.8h, v25.8h // ................................................'................................................*.................................................................... + // add v9.8h, v9.8h, v25.8h // ................................................'....................................................*................................................................ 
+ // sqrdmulh v28.8h, v12.8h, v5.8h // ................................................'................................*.................................................................................... + // mul v25.8h, v12.8h, v1.8h // ................................................'..................................*.................................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ................................................'.....................................*............................................................................... + // sub v12.8h, v10.8h, v25.8h // ................................................'............................................*........................................................................ + // add v10.8h, v10.8h, v25.8h // ................................................'..........................................*.......................................................................... + // sqrdmulh v28.8h, v15.8h, v6.8h // ................................................'....................................................................*................................................ + // mul v25.8h, v15.8h, v2.8h // ................................................'.................................................................*................................................... + // mla v25.8h, v28.8h, v8.h[0] // .....~..........................................'..........................................................................*.......................................... + // sub v15.8h, v13.8h, v25.8h // .........~......................................'..............................................................................*...................................... 
+ // add v13.8h, v13.8h, v25.8h // ...........~....................................'................................................................................*.................................... + // sqrdmulh v28.8h, v16.8h, v6.8h // ................................................'.........................................................*........................................................... + // mul v25.8h, v16.8h, v2.8h // ................................................'.............................................................*....................................................... + // mla v25.8h, v28.8h, v8.h[0] // ................................................'................................................................*.................................................... + // sub v16.8h, v14.8h, v25.8h // .~..............................................'......................................................................*.............................................. + // add v14.8h, v14.8h, v25.8h // .......~........................................'............................................................................*........................................ + // sqrdmulh v28.8h, v10.8h, v7.8h // ................................................'.............................................*....................................................................... + // mul v25.8h, v10.8h, v3.8h // ................................................'...............................................*..................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ................................................'..................................................*.................................................................. 
+ // sub v10.8h, v9.8h, v25.8h // ................................................'.......................................................*............................................................. + // add v9.8h, v9.8h, v25.8h // ................................................'......................................................*.............................................................. + // sqrdmulh v28.8h, v12.8h, v18.8h // ................................................'.................................................*................................................................... + // mul v25.8h, v12.8h, v17.8h // ................................................'...................................................*................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ................................................'.....................................................*............................................................... + // sub v12.8h, v11.8h, v25.8h // ................................................'............................................................*........................................................ + // add v11.8h, v11.8h, v25.8h // ................................................'...........................................................*......................................................... + // sqrdmulh v28.8h, v14.8h, v20.8h // ..........~.....................................'...............................................................................*..................................... + // mul v25.8h, v14.8h, v19.8h // ............~...................................'.................................................................................*................................... 
+ // mla v25.8h, v28.8h, v8.h[0] // ...............~................................'....................................................................................*................................ + // sub v14.8h, v13.8h, v25.8h // ......................~.........................'...........................................................................................*......................... + // add v13.8h, v13.8h, v25.8h // ...................~............................'........................................................................................*............................ + // sqrdmulh v28.8h, v16.8h, v22.8h // ....~...........................................'.........................................................................*........................................... + // mul v25.8h, v16.8h, v21.8h // ......~.........................................'...........................................................................*......................................... + // mla v25.8h, v28.8h, v8.h[0] // ........~.......................................'.............................................................................*....................................... + // sub v16.8h, v15.8h, v25.8h // .................~..............................'......................................................................................*.............................. + // add v15.8h, v15.8h, v25.8h // .............~..................................'..................................................................................*.................................. + // sqdmulh v26.8h, v9.8h, v8.h[1] // ................................................'..........................................................*.......................................................... 
+ // srshr v26.8h, v26.8h, #11 // .............................~..................'..................................................................................................*.................. + // mla v9.8h, v26.8h, v8.h[0] // ................................~...............'.....................................................................................................*............... + // sqdmulh v26.8h, v10.8h, v8.h[1] // ..................~.............................'.......................................................................................*............................. + // srshr v26.8h, v26.8h, #11 // .........................~......................'..............................................................................................*...................... + // mla v10.8h, v26.8h, v8.h[0] // ............................~...................'.................................................................................................*................... + // sqdmulh v26.8h, v11.8h, v8.h[1] // ................................................'..............................................................*...................................................... + // srshr v26.8h, v26.8h, #11 // ................................................'...................................................................*................................................. + // mla v11.8h, v26.8h, v8.h[0] // ..~.............................................'.......................................................................*............................................. + // sqdmulh v26.8h, v12.8h, v8.h[1] // ..............~.................................'...................................................................................*................................. 
+ // srshr v26.8h, v26.8h, #11 // ....................~...........................'.........................................................................................*........................... + // mla v12.8h, v26.8h, v8.h[0] // ........................~.......................'.............................................................................................*....................... + // sqdmulh v26.8h, v13.8h, v8.h[1] // .......................~........................'............................................................................................*........................ + // srshr v26.8h, v26.8h, #11 // ...........................~....................'................................................................................................*.................... + // mla v13.8h, v26.8h, v8.h[0] // ..............................~.................'...................................................................................................*................. + // sqdmulh v26.8h, v14.8h, v8.h[1] // ..........................~.....................'...............................................................................................*..................... + // srshr v26.8h, v26.8h, #11 // ...............................~................'....................................................................................................*................ + // mla v14.8h, v26.8h, v8.h[0] // ..................................~.............'.......................................................................................................*............. + // sqdmulh v26.8h, v15.8h, v8.h[1] // ................~...............................'.....................................................................................*............................... 
+ // srshr v26.8h, v26.8h, #11 // .................................~..............'......................................................................................................*.............. + // mla v15.8h, v26.8h, v8.h[0] // ....................................~...........'.........................................................................................................*........... + // sqdmulh v26.8h, v16.8h, v8.h[1] // .....................~..........................'..........................................................................................*.......................... + // srshr v26.8h, v26.8h, #11 // ...................................~............'........................................................................................................*............ + // mla v16.8h, v26.8h, v8.h[0] // .....................................~..........'..........................................................................................................*.......... + // trn1 v17.4s, v9.4s, v13.4s // ..........................................~.....'...............................................................................................................*..... + // trn2 v21.4s, v9.4s, v13.4s // ...........................................~....'................................................................................................................*.... + // trn1 v18.4s, v10.4s, v14.4s // ......................................~.........'...........................................................................................................*......... + // trn2 v22.4s, v10.4s, v14.4s // .......................................~........'............................................................................................................*........ 
+ // trn1 v19.4s, v11.4s, v15.4s // ........................................~.......'.............................................................................................................*....... + // trn2 v23.4s, v11.4s, v15.4s // .........................................~......'..............................................................................................................*...... + // trn1 v20.4s, v12.4s, v16.4s // ............................................~...'.................................................................................................................*... + // trn2 v24.4s, v12.4s, v16.4s // .............................................~..'..................................................................................................................*.. + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6], #64 // ..............................................~.'...................................................................................................................*. + // st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7], #64 // ...............................................~'....................................................................................................................* + + sub count, count, #1 cbnz count, layer567_start + // Instructions: 117 + // Expected cycles: 118 + // Expected IPC: 0.99 + // + // Cycle bound: 118.0 + // IPC bound: 0.99 + // + // Wall time: 22.37s + // User time: 22.37s + // + // ------------------------------------------------ original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---------------- + ldr q12, [x4, #80] // ............................*........................................................................................ 
+ ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x6] // *.................................................................................................................... + ld4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x7] // .*................................................................................................................... + ldr q9, [x4, #112] // .........................................*........................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + ldr q1, [x4, #32] // ........................*............................................................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + ldr q14, [x4, #144] // ...*................................................................................................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + trn2 v0.4S, v17.4S, v29.4S // ....*................................................................................................................ + ldr q5, [x4, #208] // .....................................*............................................................................... + trn2 v26.4S, v18.4S, v30.4S // ............*........................................................................................................ 
+ trn1 v25.4S, v17.4S, v29.4S // ....................*................................................................................................ + trn2 v20.4S, v15.4S, v27.4S // .....*............................................................................................................... + ldr q17, [x4, #96] // ...............*..................................................................................................... + trn1 v11.4S, v18.4S, v30.4S // ...........................*......................................................................................... + trn2 v22.4S, v16.4S, v28.4S // .................*................................................................................................... + ldr q6, [x4, #192] // ......*.............................................................................................................. + trn1 v15.4S, v15.4S, v27.4S // ..*.................................................................................................................. + sqrdmulh v7.8H, v26.8H, v19.8H // ................*.................................................................................................... + ldr q31, [x4], #16*14 // .......*............................................................................................................. + ldr q27, [x4, #-64] // .........*........................................................................................................... + trn1 v28.4S, v16.4S, v28.4S // ....................................*................................................................................ + ldr q13, [x4, #-96] // ...........*......................................................................................................... + sqrdmulh v23.8H, v0.8H, v19.8H // ..........*.......................................................................................................... 
+ ldr q10, [x4, #-160] // .........................................................*........................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v4.8H, v26.8H, v31.8H // .....................*............................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v4.8H, v7.8H, v8.H[0] // .......................*............................................................................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v26.8H, v0.8H, v31.8H // .............*....................................................................................................... + ldr q0, [x4, #-176] // ........*............................................................................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v26.8H, v23.8H, v8.H[0] // ..................*.................................................................................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v7.8H, v11.8H, v4.8H // ..............................*...................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + sqrdmulh v18.8H, v22.8H, v19.8H // ...................*................................................................................................. + sub v3.8H, v11.8H, v4.8H // ...............................*..................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqrdmulh v16.8H, v7.8H, v0.8H // .................................*................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqrdmulh v2.8H, v20.8H, v19.8H // ..................................*.................................................................................. 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v11.8H, v22.8H, v31.8H // .............................*....................................................................................... + // gap // ..................................................................................................................... + ldr q22, [x4, #-48] // ..............*...................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v11.8H, v18.8H, v8.H[0] // ................................*.................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v19.8H, v7.8H, v1.8H // ...................................*................................................................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v19.8H, v16.8H, v8.H[0] // ......................................*.............................................................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v30.8H, v28.8H, v11.8H // .......................................*............................................................................. + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + sub v16.8H, v28.8H, v11.8H // ...................................................................*................................................. + sqrdmulh v11.8H, v3.8H, v12.8H // ..........................................................*.......................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v7.8H, v20.8H, v31.8H // ........................................*............................................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v31.8H, v30.8H, v19.8H // .............................................*....................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v19.8H, v30.8H, v19.8H // ...........................................*......................................................................... 
+ // gap // ..................................................................................................................... + mul v3.8H, v3.8H, v10.8H // ..............................................................*...................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v29.8H, v31.8H, v13.8H // ....................................................*................................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v13.8H, v25.8H, v26.8H // ................................................................*.................................................... + sqrdmulh v14.8H, v31.8H, v14.8H // ..................................................*.................................................................. + add v31.8H, v25.8H, v26.8H // ......................*.............................................................................................. + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v3.8H, v11.8H, v8.H[0] // .................................................................*................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqrdmulh v0.8H, v31.8H, v0.8H // .........................*........................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ mul v21.8H, v19.8H, v17.8H // ................................................*.................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v11.8H, v16.8H, v3.8H // ......................................................................*.............................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v17.8H, v31.8H, v1.8H // ..........................*.......................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v17.8H, v0.8H, v8.H[0] // ............................................*........................................................................ + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v7.8H, v2.8H, v8.H[0] // ..........................................*.......................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v16.8H, v16.8H, v3.8H // ............................................................................*........................................ + sqrdmulh v3.8H, v19.8H, v9.8H // ..............................................*...................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqrdmulh v20.8H, v11.8H, v5.8H // .........................................................................*........................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v23.8H, v15.8H, v7.8H // ...............................................*..................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v28.8H, v15.8H, v7.8H // ........................................................................*............................................ + sqrdmulh v15.8H, v16.8H, v22.8H // ...............................................................................*..................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ add v25.8H, v23.8H, v17.8H // .....................................................*............................................................... + sqrdmulh v22.8H, v13.8H, v12.8H // .....................................................................*............................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v27.8H, v16.8H, v27.8H // .................................................................................*................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v16.8H, v13.8H, v10.8H // ..................................................................*.................................................. + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v16.8H, v22.8H, v8.H[0] // ..........................................................................*.......................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mul v26.8H, v11.8H, v6.8H // ...........................................................................*......................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + mla v26.8H, v20.8H, v8.H[0] // .............................................................................*....................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v11.8H, v28.8H, v16.8H // ..............................................................................*...................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v27.8H, v15.8H, v8.H[0] // ....................................................................................*................................ + add v16.8H, v28.8H, v16.8H // ................................................................................*.................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v21.8H, v3.8H, v8.H[0] // ...................................................*................................................................. 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v13.8H, v11.8H, v26.8H // ..................................................................................*.................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v29.8H, v14.8H, v8.H[0] // ......................................................*.............................................................. + sub v14.8H, v23.8H, v17.8H // .................................................*................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + add v7.8H, v16.8H, v27.8H // ........................................................................................*............................ + sub v15.8H, v16.8H, v27.8H // ...........................................................................................*......................... + // gap // ..................................................................................................................... + sqdmulh v27.8H, v13.8H, v8.H[1] // .....................................................................................*............................... 
+ sub v6.8H, v11.8H, v26.8H // ......................................................................................*.............................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v3.8H, v7.8H, v8.H[1] // ............................................................................................*........................ + sub v31.8H, v25.8H, v21.8H // ........................................................*............................................................ + // gap // ..................................................................................................................... + add v2.8H, v14.8H, v29.8H // ............................................................*........................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sub v14.8H, v14.8H, v29.8H // .............................................................*....................................................... + // gap // ..................................................................................................................... + sqdmulh v22.8H, v15.8H, v8.H[1] // ...............................................................................................*..................... + srshr v27.8H, v27.8H, #11 // ......................................................................................................*.............. + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v16.8H, v2.8H, v8.H[1] // ...............................................................*..................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v13.8H, v27.8H, v8.H[0] // .........................................................................................................*........... + add v27.8H, v25.8H, v21.8H // .......................................................*............................................................. + // gap // ..................................................................................................................... + srshr v26.8H, v22.8H, #11 // ....................................................................................................*................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v0.8H, v31.8H, v8.H[1] // .......................................................................................*............................. 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + srshr v1.8H, v16.8H, #11 // ....................................................................*................................................ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v25.8H, v14.8H, v8.H[1] // ...................................................................................*................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v15.8H, v26.8H, v8.H[0] // .......................................................................................................*............. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ srshr v16.8H, v0.8H, #11 // ..............................................................................................*...................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v26.8H, v27.8H, v8.H[1] // ...........................................................*......................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + srshr v11.8H, v3.8H, #11 // ................................................................................................*.................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v31.8H, v16.8H, v8.H[0] // .................................................................................................*................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + mla v2.8H, v1.8H, v8.H[0] // .......................................................................*............................................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + srshr v16.8H, v26.8H, #11 // ..................................................................................................*.................. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + sqdmulh v22.8H, v6.8H, v8.H[1] // ..........................................................................................*.......................... + trn1 v28.4S, v31.4S, v15.4S // ...........................................................................................................*......... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v7.8H, v11.8H, v8.H[0] // ...................................................................................................*................. 
+ trn2 v17.4S, v31.4S, v15.4S // ............................................................................................................*........ + // gap // ..................................................................................................................... + srshr v15.8H, v25.8H, #11 // .........................................................................................*........................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v27.8H, v16.8H, v8.H[0] // .....................................................................................................*............... + trn2 v18.4S, v2.4S, v13.4S // ..............................................................................................................*...... + // gap // ..................................................................................................................... + srshr v25.8H, v22.8H, #11 // ........................................................................................................*............ + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v14.8H, v15.8H, v8.H[0] // .............................................................................................*....................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + mla v6.8H, v25.8H, v8.H[0] // ..........................................................................................................*.......... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + trn2 v16.4S, v27.4S, v7.4S // ................................................................................................................*.... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + trn1 v29.4S, v2.4S, v13.4S // .............................................................................................................*....... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... 
+ // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + trn1 v27.4S, v27.4S, v7.4S // ...............................................................................................................*..... + trn1 v30.4S, v14.4S, v6.4S // .................................................................................................................*... + trn2 v19.4S, v14.4S, v6.4S // ..................................................................................................................*.. + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + // gap // ..................................................................................................................... + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x6], #64 // ...................................................................................................................*. 
+ st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x7], #64 // ....................................................................................................................* + // gap // ..................................................................................................................... + + // --------------------------------------------------- new position ---------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---------------- + // ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x6] // .*................................................................................................................... + // ld4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x7] // ..*.................................................................................................................. + // trn1 v20.4S, v15.4S, v24.4S // ...............*..................................................................................................... + // ldr q13, [x4, #144] // .....*............................................................................................................... + // trn2 v14.4S, v17.4S, v26.4S // ......*.............................................................................................................. + // trn2 v9.4S, v15.4S, v24.4S // ..........*.......................................................................................................... + // ldr q1, [x4, #192] // ..............*...................................................................................................... + // ldr q5, [x4], #16*14 // .................*................................................................................................... + // ldr q7, [x4, #-176] // ..........................*.......................................................................................... 
+ // ldr q23, [x4, #-64] // ..................*.................................................................................................. + // sqrdmulh v30.8H, v14.8H, v19.8H // .....................*............................................................................................... + // ldr q4, [x4, #-96] // ....................*................................................................................................ + // trn2 v3.4S, v18.4S, v27.4S // ........*............................................................................................................ + // mul v29.8H, v14.8H, v5.8H // .........................*........................................................................................... + // ldr q24, [x4, #-48] // ..................................*.................................................................................. + // ldr q11, [x4, #-128] // ...........*......................................................................................................... + // sqrdmulh v15.8H, v3.8H, v19.8H // ................*.................................................................................................... + // trn2 v21.4S, v16.4S, v25.4S // .............*....................................................................................................... + // mla v29.8H, v30.8H, v8.H[0] // ...........................*......................................................................................... + // sqrdmulh v6.8H, v21.8H, v19.8H // .............................*....................................................................................... + // trn1 v2.4S, v17.4S, v26.4S // .........*........................................................................................................... + // mul v31.8H, v3.8H, v5.8H // .......................*............................................................................................. 
+ // add v0.8H, v2.8H, v29.8H // ................................................*.................................................................... + // mla v31.8H, v15.8H, v8.H[0] // ........................*............................................................................................ + // ldr q22, [x4, #-192] // ....*................................................................................................................ + // sqrdmulh v17.8H, v0.8H, v7.8H // ..................................................*.................................................................. + // mul v14.8H, v0.8H, v22.8H // .....................................................*............................................................... + // trn1 v3.4S, v18.4S, v27.4S // ............*........................................................................................................ + // ldr q12, [x4, #-144] // *.................................................................................................................... + // mul v0.8H, v21.8H, v5.8H // .................................*................................................................................... + // add v15.8H, v3.8H, v31.8H // ............................*........................................................................................ + // sub v31.8H, v3.8H, v31.8H // ..............................*...................................................................................... + // mla v0.8H, v6.8H, v8.H[0] // ...................................*................................................................................. + // sqrdmulh v27.8H, v15.8H, v7.8H // ...............................*..................................................................................... + // sqrdmulh v26.8H, v9.8H, v19.8H // ................................*.................................................................................... 
+ // mul v28.8H, v15.8H, v22.8H // ....................................*................................................................................ + // trn1 v30.4S, v16.4S, v25.4S // ...................*................................................................................................. + // ldr q3, [x4, #-16] // .......*............................................................................................................. + // mla v28.8H, v27.8H, v8.H[0] // .....................................*............................................................................... + // add v16.8H, v30.8H, v0.8H // ......................................*.............................................................................. + // mul v22.8H, v9.8H, v5.8H // .........................................*........................................................................... + // ldr q5, [x4, #-112] // ...*................................................................................................................. + // mla v22.8H, v26.8H, v8.H[0] // .......................................................*............................................................. + // add v15.8H, v16.8H, v28.8H // ...........................................*......................................................................... + // mla v14.8H, v17.8H, v8.H[0] // ......................................................*.............................................................. + // sub v27.8H, v16.8H, v28.8H // ..........................................*.......................................................................... + // sqrdmulh v16.8H, v15.8H, v5.8H // .........................................................*........................................................... + // add v26.8H, v20.8H, v22.8H // ...........................................................*......................................................... 
+ // mul v6.8H, v15.8H, v11.8H // ...................................................*................................................................. + // sub v5.8H, v26.8H, v14.8H // ...........................................................................*......................................... + // sqrdmulh v15.8H, v27.8H, v13.8H // ...............................................*..................................................................... + // mla v6.8H, v16.8H, v8.H[0] // ........................................................................*............................................ + // mul v27.8H, v27.8H, v4.8H // .............................................*....................................................................... + // add v10.8H, v26.8H, v14.8H // ..............................................................*...................................................... + // mla v27.8H, v15.8H, v8.H[0] // ..........................................................................*.......................................... + // add v15.8H, v10.8H, v6.8H // ........................................................................................*............................ + // sub v16.8H, v10.8H, v6.8H // .................................................................................*................................... + // ldr q4, [x4, #-160] // ......................*.............................................................................................. + // sqrdmulh v26.8H, v31.8H, v12.8H // ........................................*............................................................................ + // sqdmulh v17.8H, v15.8H, v8.H[1] // ...............................................................................................*..................... + // add v14.8H, v5.8H, v27.8H // ..................................................................................*.................................. 
+ // sub v27.8H, v5.8H, v27.8H // ...................................................................................*................................. + // mul v7.8H, v31.8H, v4.8H // ............................................*........................................................................ + // sqdmulh v31.8H, v14.8H, v8.H[1] // ......................................................................................*.............................. + // sub v19.8H, v2.8H, v29.8H // ..............................................*...................................................................... + // mla v7.8H, v26.8H, v8.H[0] // .................................................*................................................................... + // mul v13.8H, v19.8H, v4.8H // .................................................................*................................................... + // sub v25.8H, v30.8H, v0.8H // .......................................*............................................................................. + // srshr v0.8H, v31.8H, #11 // ...........................................................................................*......................... + // sqrdmulh v31.8H, v19.8H, v12.8H // ...............................................................*..................................................... + // sub v26.8H, v25.8H, v7.8H // ....................................................*................................................................ + // mla v14.8H, v0.8H, v8.H[0] // ..................................................................................................*.................. + // sub v0.8H, v20.8H, v22.8H // ............................................................*........................................................ + // sqrdmulh v6.8H, v26.8H, v3.8H // ..........................................................*.......................................................... 
+ // mla v13.8H, v31.8H, v8.H[0] // ..................................................................*.................................................. + // mul v22.8H, v26.8H, v1.8H // ...................................................................*................................................. + // add v26.8H, v25.8H, v7.8H // ........................................................*............................................................ + // mla v22.8H, v6.8H, v8.H[0] // ....................................................................*................................................ + // sub v2.8H, v0.8H, v13.8H // .....................................................................*............................................... + // sqrdmulh v29.8H, v26.8H, v24.8H // .............................................................*....................................................... + // add v3.8H, v0.8H, v13.8H // .......................................................................*............................................. + // mul v0.8H, v26.8H, v23.8H // ................................................................*.................................................... + // add v7.8H, v2.8H, v22.8H // .........................................................................*........................................... + // sqdmulh v25.8H, v27.8H, v8.H[1] // ............................................................................................*........................ + // mla v0.8H, v29.8H, v8.H[0] // ......................................................................*.............................................. + // sqdmulh v31.8H, v7.8H, v8.H[1] // ..............................................................................*...................................... + // sub v11.8H, v2.8H, v22.8H // ...............................................................................*..................................... 
+ // sqdmulh v22.8H, v16.8H, v8.H[1] // ..........................................................................................*.......................... + // add v26.8H, v3.8H, v0.8H // ............................................................................*........................................ + // srshr v13.8H, v25.8H, #11 // ........................................................................................................*............ + // sqdmulh v25.8H, v11.8H, v8.H[1] // ....................................................................................................*................ + // sub v6.8H, v3.8H, v0.8H // .............................................................................*....................................... + // sqdmulh v0.8H, v26.8H, v8.H[1] // ................................................................................*.................................... + // mla v27.8H, v13.8H, v8.H[0] // ............................................................................................................*........ + // srshr v22.8H, v22.8H, #11 // ..............................................................................................*...................... + // sqdmulh v13.8H, v6.8H, v8.H[1] // ....................................................................................*................................ + // srshr v0.8H, v0.8H, #11 // ................................................................................................*.................... + // mla v16.8H, v22.8H, v8.H[0] // .................................................................................................*................... + // srshr v22.8H, v17.8H, #11 // ...................................................................................................*................. + // mla v26.8H, v0.8H, v8.H[0] // ......................................................................................................*.............. 
+ // srshr v13.8H, v13.8H, #11 // .........................................................................................*........................... + // mla v15.8H, v22.8H, v8.H[0] // .........................................................................................................*........... + // srshr v0.8H, v31.8H, #11 // .....................................................................................*............................... + // mla v6.8H, v13.8H, v8.H[0] // .............................................................................................*....................... + // srshr v13.8H, v25.8H, #11 // ...........................................................................................................*......... + // mla v7.8H, v0.8H, v8.H[0] // .......................................................................................*............................. + // mla v11.8H, v13.8H, v8.H[0] // .............................................................................................................*....... + // trn1 v1.4S, v16.4S, v6.4S // .....................................................................................................*............... + // trn2 v16.4S, v16.4S, v6.4S // .......................................................................................................*............. + // trn1 v2.4S, v14.4S, v7.4S // ...............................................................................................................*..... + // trn2 v17.4S, v14.4S, v7.4S // ..........................................................................................................*.......... + // trn1 v0.4S, v15.4S, v26.4S // ................................................................................................................*.... + // trn2 v15.4S, v15.4S, v26.4S // ..............................................................................................................*...... 
+ // trn1 v3.4S, v27.4S, v11.4S // .................................................................................................................*... + // trn2 v18.4S, v27.4S, v11.4S // ..................................................................................................................*.. + // st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x6], #64 // ...................................................................................................................*. + // st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // ....................................................................................................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_firestorm.s new file mode 100644 index 00000000..717542ca --- /dev/null +++ b/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_firestorm.s @@ -0,0 +1,1934 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+.macro trn1_s d,a,b // d = trn1 of a and b on 32-bit lanes (.4s)
+        trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b // d = trn2 of a and b on 32-bit lanes (.4s)
+        trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), 16-bit lanes, vector by vector
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmla d,a,b // d += a * b, 16-bit lanes, vector by vector
+        mla \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, lane i of b), 16-bit lanes
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i // d = sqdmulh(a, lane i of b), 16-bit lanes
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i // d = a * (lane i of b), 16-bit lanes
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmlaq d,a,b,i // d += a * (lane i of b), 16-bit lanes
+        mla \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // dst = src*const[idx0] + consts[0]*sqrdmulh(src, const[idx1]); clobbers t2
+        vqrdmulhq t2, \src, \const, \idx1
+        vmulq \dst, \src, \const, \idx0
+        vmlaq \dst, t2, consts, 0
+.endm
+
+.macro mulmod dst, src, const, const_twisted // as mulmodq, but both multipliers are full vectors; clobbers t2
+        vqrdmulh t2, \src, \const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vmlaq \dst, t2, consts, 0
+.endm
+
+.macro ct_butterfly a, b, root, idx0, idx1 // tmp = mulmodq(b, root lanes idx0/idx1); b = a - tmp; a = a + tmp
+        mulmodq tmp, \b, \root, \idx0, \idx1
+        sub \b\().8h, \a\().8h, tmp.8h // difference first, while a still holds its input value
+        add \a\().8h, \a\().8h, tmp.8h
+.endm
+
+.macro ct_butterfly_v a, b, root, root_twisted // vector-root variant of ct_butterfly (uses mulmod)
+        mulmod tmp, \b, \root, \root_twisted
+        sub \b\().8h, \a\().8h, tmp.8h // difference first, while a still holds its input value
+        add \a\().8h, \a\().8h, tmp.8h
+.endm
+
+.macro barrett_reduce a // a += consts[0] * srshr(sqdmulh(a, consts[1]), 11); clobbers t0
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlaq \a, t0, consts, 0
+.endm
+
+.macro load_roots_123 // loads two consecutive q registers from r_ptr0 and advances it by 32 bytes
+        ldr qform_root0, [r_ptr0], #32
+        ldr qform_root1, [r_ptr0, #-16] // second half of the 32 bytes just stepped over
+.endm
+
+.macro load_next_roots_45 root0, r_ptr0 // loads one q register and advances the pointer by 16 bytes
+        ldr qform_\root0, [\r_ptr0], #16
+.endm
+
+.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 // loads six consecutive q registers, advancing r_ptr1 by 96 bytes
+        ldr qform_\root0, [ \r_ptr1], #(6*16)
+        ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] // offsets are relative to the already advanced pointer
+        ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)]
+        ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)]
+        ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)]
+        ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)]
+.endm
+
+.macro transpose4 data // in-place 4x4 transpose of the 32-bit words in data0..data3; clobbers t0-t3
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s // stage 1: interleave 32-bit lanes of row pairs
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d // stage 2: combine 64-bit halves into the final rows
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+.macro save_gprs // @slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes, then spill x19-x29 into them
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store above; redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // @slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reload x19-x29 from the save_gprs frame
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96-byte frame
+.endm
+
+.macro save_vregs // @slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes, then spill d8-d15 into them
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // @slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reload d8-d15 from the save_vregs frame
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64-byte frame
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // @slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // @slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+.macro push_stack // @slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // scratch area addressed through the save and restore macros
+.endm
+
+.macro pop_stack // @slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "ntt_kyber_1234_567_twiddles.s"
+.text
+
+        .global ntt_kyber_1234_567_opt_m1_firestorm
+        .global _ntt_kyber_1234_567_opt_m1_firestorm // must match the underscore-prefixed entry label defined below
+
+.p2align 4
+const_addr: .short -3329 // lane 0 of the eight 16-bit constants at const_addr
+        .short
20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567_opt_m1_firestorm: +_ntt_kyber_1234_567_opt_m1_firestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + src0 .req x6 + src1 .req x7 + src2 .req x8 + src3 .req x9 + src4 .req x10 + src5 .req x11 + src6 .req x12 + src7 .req x13 + src8 .req x14 + src9 .req x15 + src10 .req x16 + src11 .req x17 + src12 .req x18 + src13 .req x19 + src14 .req x20 + src15 .req x21 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + data8 .req v17 + data9 .req v18 + data10 .req v19 + data11 .req v20 + data12 .req v21 + data13 .req v22 + data14 .req v23 + data15 .req v24 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + qform_data8 .req q17 + qform_data9 .req q18 + qform_data10 .req q19 + qform_data11 .req q20 + qform_data12 .req q21 + qform_data13 .req q22 + qform_data14 .req q23 + qform_data15 .req q24 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + qform_root0 .req q0 + 
qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + tmp .req v25 + t0 .req v26 + t1 .req v27 + t2 .req v28 + t3 .req v29 + + consts .req v8 + + ASM_LOAD(r_ptr0, roots) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + add src0, x0, #32*0 + add src8, x0, #32*8 + + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + + mov count, #2 + + .p2align 2 + ldr q15, [x14, #64] // ..........*..................................................................................................................................................................................... + ldr q27, [x14, #224] // ...............*................................................................................................................................................................................ + ldr q16, [x14, #0] // ........*....................................................................................................................................................................................... + ldr q6, [x14, #96] // ...........*.................................................................................................................................................................................... + ldr q26, [x14, #160] // .............*.................................................................................................................................................................................. + ldr q14, [x14, #192] // ..............*................................................................................................................................................................................. 
+ ldr q30, [x6, #0] // *............................................................................................................................................................................................... + ldr q11, [x6, #96] // ...*............................................................................................................................................................................................ + ldr q7, [x14, #128] // ............*................................................................................................................................................................................... + ldr q17, [x6, #160] // .....*.......................................................................................................................................................................................... + ldr q31, [x14, #32] // .........*...................................................................................................................................................................................... + ldr q13, [x6, #64] // ..*............................................................................................................................................................................................. + ldr q12, [x6, #32] // .*.............................................................................................................................................................................................. + mul v9.8H, v27.8H, v0.H[0] // ....................................................*........................................................................................................................................... + sqrdmulh v27.8H, v27.8H, v0.H[1] // ...................................................*............................................................................................................................................ 
+ sqrdmulh v18.8H, v16.8H, v0.H[1] // ................*............................................................................................................................................................................... + mul v29.8H, v16.8H, v0.H[0] // .................*.............................................................................................................................................................................. + ldr q19, [x6, #128] // ....*........................................................................................................................................................................................... + ldr q23, [x6, #192] // ......*......................................................................................................................................................................................... + mla v9.8H, v27.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + sqrdmulh v10.8H, v6.8H, v0.H[1] // ...............................*................................................................................................................................................................ + mla v29.8H, v18.8H, v8.H[0] // ..................*............................................................................................................................................................................. + ldr q18, [x6, #224] // .......*........................................................................................................................................................................................ 
+ mul v22.8H, v6.8H, v0.H[0] // ................................*............................................................................................................................................................... + mul v28.8H, v26.8H, v0.H[0] // ..........................................*..................................................................................................................................................... + sqrdmulh v5.8H, v14.8H, v0.H[1] // ..............................................*................................................................................................................................................. + mul v20.8H, v14.8H, v0.H[0] // ...............................................*................................................................................................................................................ + mul v14.8H, v7.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + sqrdmulh v24.8H, v15.8H, v0.H[1] // ..........................*..................................................................................................................................................................... + sqrdmulh v27.8H, v7.8H, v0.H[1] // ....................................*........................................................................................................................................................... + sqrdmulh v26.8H, v26.8H, v0.H[1] // .........................................*...................................................................................................................................................... 
+ mul v16.8H, v15.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + sub v21.8H, v30.8H, v29.8H // ...................*............................................................................................................................................................................ + add v4.8H, v30.8H, v29.8H // ....................*........................................................................................................................................................................... + mla v22.8H, v10.8H, v8.H[0] // .................................*.............................................................................................................................................................. + sqrdmulh v25.8H, v31.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + sub v15.8H, v18.8H, v9.8H // ......................................................*......................................................................................................................................... + add v18.8H, v18.8H, v9.8H // .......................................................*........................................................................................................................................ + mul v9.8H, v31.8H, v0.H[0] // ......................*......................................................................................................................................................................... 
+ mla v16.8H, v24.8H, v8.H[0] // ............................*................................................................................................................................................................... + add v10.8H, v11.8H, v22.8H // ...................................*............................................................................................................................................................ + mul v7.8H, v15.8H, v0.H[4] // ............................................................................................*................................................................................................... + sqrdmulh v31.8H, v15.8H, v0.H[5] // ...........................................................................................*.................................................................................................... + mla v28.8H, v26.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + mla v14.8H, v27.8H, v8.H[0] // ......................................*......................................................................................................................................................... + sub v29.8H, v11.8H, v22.8H // ..................................*............................................................................................................................................................. + mla v20.8H, v5.8H, v8.H[0] // ................................................*............................................................................................................................................... 
+ sqrdmulh v24.8H, v18.8H, v0.H[3] // .......................................................................*........................................................................................................................ + add v27.8H, v13.8H, v16.8H // ..............................*................................................................................................................................................................. + mul v15.8H, v18.8H, v0.H[2] // ........................................................................*....................................................................................................................... + sub v30.8H, v13.8H, v16.8H // .............................*.................................................................................................................................................................. + mla v9.8H, v25.8H, v8.H[0] // .......................*........................................................................................................................................................................ + sub v11.8H, v17.8H, v28.8H // ............................................*................................................................................................................................................... + add v17.8H, v17.8H, v28.8H // .............................................*.................................................................................................................................................. + mla v7.8H, v31.8H, v8.H[0] // .............................................................................................*.................................................................................................. 
+ add v6.8H, v19.8H, v14.8H // ........................................*....................................................................................................................................................... + sub v31.8H, v23.8H, v20.8H // .................................................*.............................................................................................................................................. + sub v22.8H, v19.8H, v14.8H // .......................................*........................................................................................................................................................ + sub v16.8H, v29.8H, v7.8H // ..............................................................................................*................................................................................................. + mul v5.8H, v11.8H, v0.H[4] // ..................................................................................*............................................................................................................. + sqrdmulh v11.8H, v11.8H, v0.H[5] // .................................................................................*.............................................................................................................. + sqrdmulh v14.8H, v17.8H, v0.H[3] // .............................................................*.................................................................................................................................. + mul v17.8H, v17.8H, v0.H[2] // ..............................................................*................................................................................................................................. 
+ mla v15.8H, v24.8H, v8.H[0] // .........................................................................*...................................................................................................................... + add v26.8H, v23.8H, v20.8H // ..................................................*............................................................................................................................................. + sub v13.8H, v12.8H, v9.8H // ........................*....................................................................................................................................................................... + add v18.8H, v12.8H, v9.8H // .........................*...................................................................................................................................................................... + sqrdmulh v20.8H, v16.8H, v1.H[5] // ...................................................................................................................................*............................................................ + mul v19.8H, v22.8H, v0.H[4] // .............................................................................*.................................................................................................................. + mul v28.8H, v16.8H, v1.H[4] // ....................................................................................................................................*........................................................... + mla v5.8H, v11.8H, v8.H[0] // ...................................................................................*............................................................................................................ 
+ mul v16.8H, v6.8H, v0.H[2] // .........................................................*...................................................................................................................................... + sqrdmulh v24.8H, v22.8H, v0.H[5] // ............................................................................*................................................................................................................... + add v12.8H, v29.8H, v7.8H // ...............................................................................................*................................................................................................ + mla v17.8H, v14.8H, v8.H[0] // ...............................................................*................................................................................................................................ + sub v23.8H, v10.8H, v15.8H // ..........................................................................*..................................................................................................................... + add v11.8H, v10.8H, v15.8H // ...........................................................................*.................................................................................................................... + sqrdmulh v29.8H, v31.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mul v22.8H, v31.8H, v0.H[4] // .......................................................................................*........................................................................................................ 
+ mul v31.8H, v11.8H, v0.H[6] // ......................................................................................................*......................................................................................... + sqrdmulh v15.8H, v11.8H, v0.H[7] // .....................................................................................................*.......................................................................................... + sqrdmulh v7.8H, v26.8H, v0.H[3] // ..................................................................*............................................................................................................................. + mul v14.8H, v26.8H, v0.H[2] // ...................................................................*............................................................................................................................ + mla v28.8H, v20.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sqrdmulh v10.8H, v12.8H, v1.H[3] // .........................................................................................................................*...................................................................... + sub v26.8H, v13.8H, v5.8H // ....................................................................................*........................................................................................................... + mul v9.8H, v12.8H, v1.H[2] // ..........................................................................................................................*..................................................................... 
+ sqrdmulh v25.8H, v6.8H, v0.H[3] // ........................................................*....................................................................................................................................... + mla v19.8H, v24.8H, v8.H[0] // ..............................................................................*................................................................................................................. + mul v11.8H, v23.8H, v1.H[0] // ................................................................................................................*............................................................................... + mla v22.8H, v29.8H, v8.H[0] // ........................................................................................*....................................................................................................... + mla v31.8H, v15.8H, v8.H[0] // .......................................................................................................*........................................................................................ + sqrdmulh v12.8H, v23.8H, v1.H[1] // ...............................................................................................................*................................................................................ + add v29.8H, v26.8H, v28.8H // .......................................................................................................................................*........................................................ + sub v26.8H, v26.8H, v28.8H // ......................................................................................................................................*......................................................... 
+ add v23.8H, v18.8H, v17.8H // .................................................................*.............................................................................................................................. + sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 24 + // Expected IPC: 8.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + mul v28.8H, v29.8H, v3.H[2] // ....*........................................................................................................................................................................................... + sqrdmulh v24.8H, v29.8H, v3.H[3] // .....*.......................................................................................................................................................................................... + sub v29.8H, v23.8H, v31.8H // ..*............................................................................................................................................................................................. + sub v15.8H, v30.8H, v22.8H // ...............*................................................................................................................................................................................ + ldr q6, [x14, #240] // .................................................................................................*.............................................................................................. 
+ mla v11.8H, v12.8H, v8.H[0] // .......*........................................................................................................................................................................................ + mla v14.8H, v7.8H, v8.H[0] // *............................................................................................................................................................................................... + add v23.8H, v23.8H, v31.8H // ...*............................................................................................................................................................................................ + mla v16.8H, v25.8H, v8.H[0] // ...................*............................................................................................................................................................................ + mul v12.8H, v29.8H, v2.H[0] // .......................................*........................................................................................................................................................ + sub v25.8H, v18.8H, v17.8H // ........*....................................................................................................................................................................................... + sqrdmulh v31.8H, v15.8H, v1.H[5] // .....................*.......................................................................................................................................................................... + mul v15.8H, v15.8H, v1.H[4] // ........................*....................................................................................................................................................................... 
+ mla v28.8H, v24.8H, v8.H[0] // ................*............................................................................................................................................................................... + add v13.8H, v13.8H, v5.8H // .*.............................................................................................................................................................................................. + sqrdmulh v20.8H, v23.8H, v1.H[7] // ..........*..................................................................................................................................................................................... + mul v24.8H, v23.8H, v1.H[6] // .........*...................................................................................................................................................................................... + add v5.8H, v27.8H, v14.8H // .........................*...................................................................................................................................................................... + mul v23.8H, v6.8H, v0.H[0] // .............................................................................................................*.................................................................................. + mla v15.8H, v31.8H, v8.H[0] // ......................................*......................................................................................................................................................... + mul v18.8H, v5.8H, v0.H[6] // ...................................*............................................................................................................................................................ 
+ sqrdmulh v5.8H, v5.8H, v0.H[7] // .................................*.............................................................................................................................................................. + ldr q17, [x14, #176] // ....................................................................................................*........................................................................................... + mla v9.8H, v10.8H, v8.H[0] // ..................................*............................................................................................................................................................. + sub v31.8H, v21.8H, v19.8H // ...........*.................................................................................................................................................................................... + add v10.8H, v21.8H, v19.8H // .............................*.................................................................................................................................................................. + mul v19.8H, v26.8H, v3.H[4] // .........................................*...................................................................................................................................................... + sqrdmulh v26.8H, v26.8H, v3.H[5] // ................................................*............................................................................................................................................... + sqrdmulh v7.8H, v29.8H, v2.H[1] // ........................................*....................................................................................................................................................... 
+ sub v29.8H, v25.8H, v11.8H // .................*.............................................................................................................................................................................. + add v21.8H, v25.8H, v11.8H // ..................*............................................................................................................................................................................. + mla v24.8H, v20.8H, v8.H[0] // ....................*........................................................................................................................................................................... + add v20.8H, v31.8H, v15.8H // ...........................................................*.................................................................................................................................... + sub v15.8H, v31.8H, v15.8H // ...................................................*............................................................................................................................................ + sqrdmulh v6.8H, v6.8H, v0.H[1] // ..............................................................................................................*................................................................................. + sub v11.8H, v13.8H, v9.8H // ............................................*................................................................................................................................................... + add v25.8H, v30.8H, v22.8H // ......*......................................................................................................................................................................................... 
+ sqrdmulh v31.8H, v17.8H, v0.H[1] // ..............................................................................................................................*................................................................. + sqrdmulh v22.8H, v25.8H, v1.H[3] // ..............*................................................................................................................................................................................. + mla v18.8H, v5.8H, v8.H[0] // .............................................*.................................................................................................................................................. + mla v19.8H, v26.8H, v8.H[0] // ..................................................................*............................................................................................................................. + sub v30.8H, v20.8H, v28.8H // .....................................................................*.......................................................................................................................... + ldr q26, [x14, #48] // ..........................................................................................................*..................................................................................... + mul v5.8H, v17.8H, v0.H[0] // ........................................................................................................................*....................................................................... + mul v25.8H, v25.8H, v1.H[2] // .............*.................................................................................................................................................................................. 
+ add v28.8H, v20.8H, v28.8H // ......................................................................*......................................................................................................................... + add v20.8H, v4.8H, v16.8H // .................................................*.............................................................................................................................................. + mla v23.8H, v6.8H, v8.H[0] // ...................................................................................................................*............................................................................ + mla v12.8H, v7.8H, v8.H[0] // .................................................................*.............................................................................................................................. + ldr q7, [x14, #112] // ...................................................................................................*............................................................................................ + str q30, [x14, #160] // .............................................................................*.................................................................................................................. + sqrdmulh v6.8H, v21.8H, v2.H[3] // ..........................*..................................................................................................................................................................... + sub v30.8H, v15.8H, v19.8H // ................................................................................*............................................................................................................... 
+ add v15.8H, v15.8H, v19.8H // .................................................................................*.............................................................................................................. + add v19.8H, v20.8H, v18.8H // ...............................................................*................................................................................................................................ + sub v18.8H, v20.8H, v18.8H // .............................................................*.................................................................................................................................. + ldr q20, [x6, #240] // ......................................................................................................................*......................................................................... + str q28, [x14, #128] // ............................................................................*................................................................................................................... + sqrdmulh v17.8H, v7.8H, v0.H[1] // ....................................................................................................................*........................................................................... + str q15, [x14, #192] // ..........................................................................................*..................................................................................................... + sub v15.8H, v4.8H, v16.8H // .....................................*.......................................................................................................................................................... 
+ mul v7.8H, v7.8H, v0.H[0] // .......................................................................................................................*........................................................................ + str q30, [x14, #224] // .............................................................................................*.................................................................................................. + add v13.8H, v13.8H, v9.8H // ..................................................*............................................................................................................................................. + ldr q30, [x14, #16] // ..................................................................................................*............................................................................................. + sub v9.8H, v20.8H, v23.8H // ....................................................................................................................................*........................................................... + add v20.8H, v20.8H, v23.8H // .....................................................................................................................................*.......................................................... + add v23.8H, v18.8H, v12.8H // ..............................................................................*................................................................................................................. + sqrdmulh v16.8H, v29.8H, v2.H[5] // ............................*................................................................................................................................................................... 
+ sub v12.8H, v18.8H, v12.8H // ...............................................................................*................................................................................................................ + ldr q4, [x14, #208] // .....................................................................................................*.......................................................................................... + mul v28.8H, v21.8H, v2.H[2] // ...............................*................................................................................................................................................................ + mul v18.8H, v29.8H, v2.H[4] // ..............................*................................................................................................................................................................. + sub v14.8H, v27.8H, v14.8H // ............*................................................................................................................................................................................... + mla v25.8H, v22.8H, v8.H[0] // ...........................*.................................................................................................................................................................... + mla v7.8H, v17.8H, v8.H[0] // ..................................................................................................................................*............................................................. + sub v22.8H, v19.8H, v24.8H // ........................................................................*....................................................................................................................... 
+ mla v5.8H, v31.8H, v8.H[0] // ...........................................................................................................................................*.................................................... + add v19.8H, v19.8H, v24.8H // .........................................................................*...................................................................................................................... + str q12, [x6, #96] // .........................................................................................*...................................................................................................... + mul v31.8H, v14.8H, v1.H[0] // .......................*........................................................................................................................................................................ + ldr q27, [x6, #176] // .........................................................................................................*...................................................................................... + str q23, [x6, #64] // ........................................................................................*....................................................................................................... + sqrdmulh v29.8H, v14.8H, v1.H[1] // ......................*......................................................................................................................................................................... + ldr q24, [x14, #144] // ........................................................................................................*....................................................................................... 
+ sub v17.8H, v10.8H, v25.8H // .......................................................................*........................................................................................................................ + str q22, [x6, #32] // ..................................................................................*............................................................................................................. + ldr q14, [x6, #144] // .................................................................................................................*.............................................................................. + ldr q21, [x6, #112] // .......................................................................................................*........................................................................................ + mul v22.8H, v20.8H, v0.H[2] // .................................................................................................................................................*.............................................. + sqrdmulh v23.8H, v9.8H, v0.H[5] // ..........................................................................................................................................*..................................................... + sqrdmulh v12.8H, v20.8H, v0.H[3] // ...............................................................................................................................................*................................................ + str q19, [x6], #16 // ...................................................................................*............................................................................................................ 
+ add v19.8H, v10.8H, v25.8H // ....................................*........................................................................................................................................................... + mla v28.8H, v6.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + mla v18.8H, v16.8H, v8.H[0] // ..........................................*..................................................................................................................................................... + mul v20.8H, v26.8H, v0.H[0] // ......................................................................................................................................*......................................................... + sqrdmulh v6.8H, v11.8H, v3.H[1] // .......................................................*........................................................................................................................................ + mul v25.8H, v11.8H, v3.H[0] // ....................................................*........................................................................................................................................... + mul v16.8H, v24.8H, v0.H[0] // ...........................................................................................................................*.................................................................... + sqrdmulh v11.8H, v13.8H, v2.H[7] // ..........................................................*..................................................................................................................................... 
+ mul v13.8H, v13.8H, v2.H[6] // .........................................................*...................................................................................................................................... + mla v22.8H, v12.8H, v8.H[0] // ...............................................................................................................................................................*................................ + sub v10.8H, v21.8H, v7.8H // .............................................................................................................................................*.................................................. + sqrdmulh v12.8H, v24.8H, v0.H[1] // .............................................................................................................................*.................................................................. + mla v31.8H, v29.8H, v8.H[0] // ................................*............................................................................................................................................................... + sqrdmulh v24.8H, v4.8H, v0.H[1] // .........................................................................................................................*...................................................................... + mul v9.8H, v9.8H, v0.H[4] // .........................................................................................................................................*...................................................... + sqrdmulh v29.8H, v26.8H, v0.H[1] // ...................................................................................................................................*............................................................ 
+ mla v25.8H, v6.8H, v8.H[0] // ....................................................................*........................................................................................................................... + add v6.8H, v27.8H, v5.8H // .....................................................................................................................................................*.......................................... + mul v4.8H, v4.8H, v0.H[0] // ..........................................................................................................................*..................................................................... + sqrdmulh v26.8H, v30.8H, v0.H[1] // ...............................................................................................................*................................................................................ + mul v30.8H, v30.8H, v0.H[0] // ................................................................................................................*............................................................................... + sub v5.8H, v27.8H, v5.8H // ....................................................................................................................................................*........................................... + mla v13.8H, v11.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + mla v16.8H, v12.8H, v8.H[0] // ............................................................................................................................................*................................................... 
+ sub v12.8H, v15.8H, v31.8H // ..............................................*................................................................................................................................................. + mla v9.8H, v23.8H, v8.H[0] // ......................................................................................................................................................*......................................... + add v7.8H, v21.8H, v7.8H // ........................................................................................................................................*....................................................... + ldr q27, [x6, #32] // ............................................................................................................*................................................................................... + add v21.8H, v17.8H, v25.8H // ....................................................................................*........................................................................................................... + mla v20.8H, v29.8H, v8.H[0] // ...................................................................................................................................................*............................................ + add v23.8H, v15.8H, v31.8H // ...............................................*................................................................................................................................................ + sub v15.8H, v17.8H, v25.8H // .....................................................................................*.......................................................................................................... 
+ sub v25.8H, v19.8H, v13.8H // .......................................................................................*........................................................................................................ + ldr q11, [x6, #0] // ......................................................................................................*......................................................................................... + add v19.8H, v19.8H, v13.8H // ......................................................................................*......................................................................................................... + mla v4.8H, v24.8H, v8.H[0] // ..............................................................................................................................................*................................................. + mla v30.8H, v26.8H, v8.H[0] // .....................................................................................................................*.......................................................................... + ldr q26, [x6, #192] // ..................................................................................................................*............................................................................. + mul v17.8H, v6.8H, v0.H[2] // ..............................................................................................................................................................*................................. + add v31.8H, v7.8H, v22.8H // ............................................................................................................................................................................*................... 
+ add v13.8H, v12.8H, v18.8H // ......................................................*......................................................................................................................................... + sqrdmulh v24.8H, v5.8H, v0.H[5] // ............................................................................................................................................................*................................... + add v29.8H, v10.8H, v9.8H // .........................................................................................................................................................................*...................... + str q21, [x14, #64] // ...............................................................................................*................................................................................................ + str q19, [x14], #16 // ............................................................................................*................................................................................................... + sub v22.8H, v7.8H, v22.8H // ...........................................................................................................................................................................*.................... + ldr q21, [x14, #64] // ................................................................................................*............................................................................................... + sub v18.8H, v12.8H, v18.8H // ................................................................*............................................................................................................................... 
+ add v19.8H, v23.8H, v28.8H // ........................................................*....................................................................................................................................... + sub v23.8H, v23.8H, v28.8H // .....................................................*.......................................................................................................................................... + str q15, [x14, #80] // ...........................................................................................*.................................................................................................... + mul v5.8H, v5.8H, v0.H[4] // ...........................................................................................................................................................*.................................... + sub v15.8H, v14.8H, v16.8H // .........................................................................................................................................................*...................................... + sub v12.8H, v10.8H, v9.8H // ..........................................................................................................................................................*..................................... + sqrdmulh v9.8H, v6.8H, v0.H[3] // .............................................................................................................................................................*.................................. + add v7.8H, v26.8H, v4.8H // ................................................................................................................................................................*............................... 
+ str q18, [x6, #208] // ...........................................................................*.................................................................................................................... + str q19, [x6, #112] // ...................................................................*............................................................................................................................ + sqrdmulh v6.8H, v31.8H, v0.H[7] // ................................................................................................................................................................................*............... + mul v31.8H, v31.8H, v0.H[6] // ...............................................................................................................................................................................*................ + sqrdmulh v10.8H, v12.8H, v1.H[5] // ...................................................................................................................................................................*............................ + str q23, [x6, #144] // ..............................................................*................................................................................................................................. + str q25, [x14, #16] // ..............................................................................................*................................................................................................. + mul v19.8H, v15.8H, v0.H[4] // ....................................................................................................................................................................*........................... 
+ sqrdmulh v25.8H, v15.8H, v0.H[5] // ........................................................................................................................................................................*....................... + mul v15.8H, v12.8H, v1.H[4] // .....................................................................................................................................................................*.......................... + sqrdmulh v12.8H, v22.8H, v1.H[1] // ............................................................................................................................................................................................*... + str q13, [x6, #176] // ............................................................*................................................................................................................................... + mla v5.8H, v24.8H, v8.H[0] // ......................................................................................................................................................................*......................... + mul v23.8H, v21.8H, v0.H[0] // ...............................................................................................................................*................................................................ + sqrdmulh v24.8H, v21.8H, v0.H[1] // ............................................................................................................................*................................................................... + mla v17.8H, v9.8H, v8.H[0] // ..........................................................................................................................................................................*..................... 
+ ldr q28, [x6, #64] // ...........................................................................................................*.................................................................................... + sub v21.8H, v11.8H, v30.8H // ................................................................................................................................*............................................................... + sub v13.8H, v27.8H, v20.8H // .................................................................................................................................................................*.............................. + sub v26.8H, v26.8H, v4.8H // ........................................................................................................................................................*....................................... + add v4.8H, v11.8H, v30.8H // .................................................................................................................................*.............................................................. + mla v19.8H, v25.8H, v8.H[0] // ........................................................................................................................................................................................*....... + mla v15.8H, v10.8H, v8.H[0] // ...................................................................................................................................................................................*............ + mla v31.8H, v6.8H, v8.H[0] // ...........................................................................................................................................................................................*.... 
+ mla v23.8H, v24.8H, v8.H[0] // .......................................................................................................................................*........................................................ + sqrdmulh v10.8H, v29.8H, v1.H[3] // ....................................................................................................................................................................................*........... + sqrdmulh v24.8H, v26.8H, v0.H[5] // .............................................................................................................................................................................*.................. + sub v6.8H, v13.8H, v5.8H // .....................................................................................................................................................................................*.......... + add v30.8H, v14.8H, v16.8H // .......................................................................................................................................................*........................................ + mul v14.8H, v7.8H, v0.H[2] // ..................................................................................................................................................................................*............. + mul v11.8H, v22.8H, v1.H[0] // .........................................................................................................................................................................................*...... + mul v22.8H, v26.8H, v0.H[4] // ..............................................................................................................................................................................*................. 
+ mul v9.8H, v29.8H, v1.H[2] // ......................................................................................................................................................................................*......... + add v18.8H, v27.8H, v20.8H // ..................................................................................................................................................................*............................. + sub v26.8H, v6.8H, v15.8H // ..............................................................................................................................................................................................*. + sqrdmulh v25.8H, v30.8H, v0.H[3] // .......................................................................................................................................................................................*........ + mul v16.8H, v30.8H, v0.H[2] // .......................................................................................................................................................................*........................ + add v27.8H, v28.8H, v23.8H // ................................................................................................................................................*............................................... + sub v30.8H, v28.8H, v23.8H // ..................................................................................................................................................*............................................. 
+ add v23.8H, v18.8H, v17.8H // ...............................................................................................................................................................................................* + add v29.8H, v6.8H, v15.8H // .............................................................................................................................................................................................*.. + mla v22.8H, v24.8H, v8.H[0] // ..........................................................................................................................................................................................*..... + sqrdmulh v7.8H, v7.8H, v0.H[3] // .................................................................................................................................................................................*.............. + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // mla v14.8H, v7.8H, v8.H[0] // ......*......................................................................................................................................................................................... + // add v20.8H, v13.8H, v5.8H // ..............*................................................................................................................................................................................. 
+ // sub v5.8H, v23.8H, v31.8H // ..*............................................................................................................................................................................................. + // add v24.8H, v23.8H, v31.8H // .......*........................................................................................................................................................................................ + // mul v23.8H, v29.8H, v3.H[2] // *............................................................................................................................................................................................... + // sqrdmulh v15.8H, v29.8H, v3.H[3] // .*.............................................................................................................................................................................................. + // add v29.8H, v30.8H, v22.8H // ....................................*........................................................................................................................................................... + // mla v11.8H, v12.8H, v8.H[0] // .....*.......................................................................................................................................................................................... + // sub v28.8H, v18.8H, v17.8H // ..........*..................................................................................................................................................................................... + // mul v18.8H, v24.8H, v1.H[6] // ................*............................................................................................................................................................................... 
+ // sqrdmulh v13.8H, v24.8H, v1.H[7] // ...............*................................................................................................................................................................................ + // sub v7.8H, v21.8H, v19.8H // ........................*....................................................................................................................................................................... + // sub v6.8H, v27.8H, v14.8H // .........................................................................*...................................................................................................................... + // mul v12.8H, v29.8H, v1.H[2] // ............................................*................................................................................................................................................... + // sqrdmulh v29.8H, v29.8H, v1.H[3] // ......................................*......................................................................................................................................................... + // sub v24.8H, v30.8H, v22.8H // ...*............................................................................................................................................................................................ + // mla v23.8H, v15.8H, v8.H[0] // .............*.................................................................................................................................................................................. + // sub v30.8H, v28.8H, v11.8H // .............................*.................................................................................................................................................................. 
+ // add v11.8H, v28.8H, v11.8H // ..............................*................................................................................................................................................................. + // mla v16.8H, v25.8H, v8.H[0] // ........*....................................................................................................................................................................................... + // mla v18.8H, v13.8H, v8.H[0] // ...............................*................................................................................................................................................................ + // sqrdmulh v13.8H, v24.8H, v1.H[5] // ...........*.................................................................................................................................................................................... + // sqrdmulh v28.8H, v6.8H, v1.H[1] // ...................................................................................*............................................................................................................ + // mul v25.8H, v6.8H, v1.H[0] // ................................................................................*............................................................................................................... + // mul v15.8H, v24.8H, v1.H[4] // ............*................................................................................................................................................................................... + // add v31.8H, v27.8H, v14.8H // .................*.............................................................................................................................................................................. 
+ // sqrdmulh v27.8H, v11.8H, v2.H[3] // ...................................................*............................................................................................................................................ + // mla v12.8H, v29.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + // sqrdmulh v24.8H, v30.8H, v2.H[5] // ....................................................................*........................................................................................................................... + // add v14.8H, v21.8H, v19.8H // .........................*...................................................................................................................................................................... + // mul v19.8H, v30.8H, v2.H[4] // ........................................................................*....................................................................................................................... + // mul v11.8H, v11.8H, v2.H[2] // .......................................................................*........................................................................................................................ + // mla v25.8H, v28.8H, v8.H[0] // .........................................................................................................*...................................................................................... + // sqrdmulh v21.8H, v31.8H, v0.H[7] // .....................*.......................................................................................................................................................................... 
+ // mla v9.8H, v10.8H, v8.H[0] // .......................*........................................................................................................................................................................ + // mul v10.8H, v31.8H, v0.H[6] // ....................*........................................................................................................................................................................... + // add v6.8H, v14.8H, v12.8H // .............................................................................................*.................................................................................................. + // sub v22.8H, v4.8H, v16.8H // ............................................................*................................................................................................................................... + // mla v15.8H, v13.8H, v8.H[0] // ...................*............................................................................................................................................................................ + // mul v17.8H, v5.8H, v2.H[0] // .........*...................................................................................................................................................................................... + // sqrdmulh v13.8H, v5.8H, v2.H[1] // ............................*................................................................................................................................................................... + // mul v28.8H, v26.8H, v3.H[4] // ..........................*..................................................................................................................................................................... 
+ // mla v19.8H, v24.8H, v8.H[0] // ...............................................................................................*................................................................................................ + // mla v11.8H, v27.8H, v8.H[0] // ..............................................................................................*................................................................................................. + // sub v31.8H, v20.8H, v9.8H // ...................................*............................................................................................................................................................ + // mla v10.8H, v21.8H, v8.H[0] // .......................................*........................................................................................................................................................ + // sub v29.8H, v22.8H, v25.8H // .....................................................................................................................*.......................................................................... + // add v22.8H, v22.8H, v25.8H // ...........................................................................................................................*.................................................................... + // sqrdmulh v30.8H, v26.8H, v3.H[5] // ...........................*.................................................................................................................................................................... + // add v16.8H, v4.8H, v16.8H // ..............................................*................................................................................................................................................. 
+ // add v20.8H, v20.8H, v9.8H // ...............................................................*................................................................................................................................ + // sub v21.8H, v7.8H, v15.8H // .................................*.............................................................................................................................................................. + // mul v24.8H, v31.8H, v3.H[0] // ..................................................................................................*............................................................................................. + // sub v25.8H, v22.8H, v11.8H // ..............................................................................................................................................*................................................. + // add v4.8H, v29.8H, v19.8H // .....................................................................................................................................*.......................................................... + // sqrdmulh v9.8H, v31.8H, v3.H[1] // .................................................................................................*.............................................................................................. + // add v5.8H, v22.8H, v11.8H // .............................................................................................................................................*.................................................. + // mul v11.8H, v20.8H, v2.H[6] // .....................................................................................................*.......................................................................................... 
+ // sqrdmulh v22.8H, v20.8H, v2.H[7] // ....................................................................................................*........................................................................................... + // add v20.8H, v7.8H, v15.8H // ................................*............................................................................................................................................................... + // str q4, [x6, #192] // ................................................................................................................................................................*............................... + // sub v27.8H, v16.8H, v10.8H // .......................................................*........................................................................................................................................ + // str q25, [x6, #160] // ..........................................................................................................................................................*..................................... + // add v31.8H, v16.8H, v10.8H // ......................................................*......................................................................................................................................... + // sub v26.8H, v29.8H, v19.8H // ............................................................................................................................................*................................................... + // mla v17.8H, v13.8H, v8.H[0] // ................................................*............................................................................................................................................... 
+ // mla v28.8H, v30.8H, v8.H[0] // ........................................*....................................................................................................................................................... + // str q5, [x6, #128] // ......................................................................................................................................................*......................................... + // mla v24.8H, v9.8H, v8.H[0] // .............................................................................................................*.................................................................................. + // sub v29.8H, v20.8H, v23.8H // .........................................*...................................................................................................................................................... + // add v23.8H, v20.8H, v23.8H // .............................................*.................................................................................................................................................. + // sub v30.8H, v14.8H, v12.8H // .....................................................................................*.......................................................................................................... + // sub v15.8H, v31.8H, v18.8H // ............................................................................*................................................................................................................... + // add v16.8H, v31.8H, v18.8H // ..............................................................................*................................................................................................................. 
+ // mla v11.8H, v22.8H, v8.H[0] // ...................................................................................................................*............................................................................ + // str q26, [x6, #224] // .....................................................................................................................................................*.......................................... + // str q23, [x14, #128] // .........................................................*...................................................................................................................................... + // str q29, [x14, #160] // ..................................................*............................................................................................................................................. + // add v26.8H, v27.8H, v17.8H // ...................................................................*............................................................................................................................ + // sub v14.8H, v27.8H, v17.8H // .....................................................................*.......................................................................................................................... + // sub v27.8H, v21.8H, v28.8H // ....................................................*........................................................................................................................................... + // add v28.8H, v21.8H, v28.8H // .....................................................*.......................................................................................................................................... 
+ // str q15, [x6, #32] // ......................................................................................*......................................................................................................... + // str q16, [x6], #16 // ............................................................................................*................................................................................................... + // add v16.8H, v30.8H, v24.8H // .........................................................................................................................*...................................................................... + // sub v15.8H, v30.8H, v24.8H // ............................................................................................................................*................................................................... + // add v21.8H, v6.8H, v11.8H // ...............................................................................................................................*................................................................ + // sub v4.8H, v6.8H, v11.8H // .............................................................................................................................*.................................................................. + // str q26, [x6, #48] // ..................................................................................*............................................................................................................. + // str q14, [x6, #80] // ...............................................................................*................................................................................................................ 
+ // str q28, [x14, #192] // ...........................................................*.................................................................................................................................... + // str q15, [x14, #96] // ...............................................................................................................................................*................................................ + // str q21, [x14], #16 // .........................................................................................................................................*...................................................... + // str q27, [x14, #208] // ..............................................................*................................................................................................................................. + // str q4, [x14, #16] // ...........................................................................................................................................................*.................................... + // str q16, [x14, #48] // ........................................................................................................................................*....................................................... + // ldr q15, [x14, #64] // ...........................................................................................................................................*.................................................... + // ldr q27, [x14, #224] // ....*........................................................................................................................................................................................... + // ldr q16, [x14, #0] // ................................................................*............................................................................................................................... 
+ // ldr q6, [x14, #96] // .................................................*.............................................................................................................................................. + // ldr q26, [x14, #160] // ......................*......................................................................................................................................................................... + // ldr q14, [x14, #192] // ......................................................................*......................................................................................................................... + // ldr q30, [x6, #0] // ..............................................................................................................................*................................................................. + // ldr q11, [x6, #96] // ........................................................................................*....................................................................................................... + // ldr q7, [x14, #128] // ....................................................................................*........................................................................................................... + // ldr q17, [x6, #160] // .................................................................................*.............................................................................................................. + // ldr q31, [x14, #32] // ..........................................*..................................................................................................................................................... + // ldr q13, [x6, #64] // .....................................................................................................................................................................*.......................... 
+ // ldr q12, [x6, #32] // ........................................................................................................................*....................................................................... + // mul v9.8H, v27.8H, v0.H[0] // ..................*............................................................................................................................................................................. + // sqrdmulh v27.8H, v27.8H, v0.H[1] // ..................................*............................................................................................................................................................. + // sqrdmulh v18.8H, v16.8H, v0.H[1] // ................................................................................................................*............................................................................... + // mul v29.8H, v16.8H, v0.H[0] // .................................................................................................................*.............................................................................. + // ldr q19, [x6, #128] // .......................................................................................*........................................................................................................ + // ldr q23, [x6, #192] // ..................................................................................................................................*............................................................. + // mla v9.8H, v27.8H, v8.H[0] // ...............................................*................................................................................................................................................ 
+ // sqrdmulh v10.8H, v6.8H, v0.H[1] // ..........................................................*..................................................................................................................................... + // mla v29.8H, v18.8H, v8.H[0] // .................................................................................................................................*.............................................................. + // ldr q18, [x6, #224] // ........................................................*....................................................................................................................................... + // mul v22.8H, v6.8H, v0.H[0] // .............................................................*.................................................................................................................................. + // mul v28.8H, v26.8H, v0.H[0] // ...........................................*.................................................................................................................................................... + // sqrdmulh v5.8H, v14.8H, v0.H[1] // ..........................................................................................................*..................................................................................... + // mul v20.8H, v14.8H, v0.H[0] // ...............................................................................................................*................................................................................ + // mul v14.8H, v7.8H, v0.H[0] // ...................................................................................................*............................................................................................ 
+ // sqrdmulh v24.8H, v15.8H, v0.H[1] // ...................................................................................................................................................................*............................ + // sqrdmulh v27.8H, v7.8H, v0.H[1] // ........................................................................................................*....................................................................................... + // sqrdmulh v26.8H, v26.8H, v0.H[1] // .....................................*.......................................................................................................................................................... + // mul v16.8H, v15.8H, v0.H[0] // ..................................................................................................................................................................*............................. + // sub v21.8H, v30.8H, v29.8H // ......................................................................................................................................................................*......................... + // add v4.8H, v30.8H, v29.8H // .........................................................................................................................................................................*...................... + // mla v22.8H, v10.8H, v8.H[0] // ...........................................................................*.................................................................................................................... + // sqrdmulh v25.8H, v31.8H, v0.H[1] // ............................................................................................................*................................................................................... 
+ // sub v15.8H, v18.8H, v9.8H // .................................................................*.............................................................................................................................. + // add v18.8H, v18.8H, v9.8H // ..................................................................*............................................................................................................................. + // mul v9.8H, v31.8H, v0.H[0] // ................................................................................................*............................................................................................... + // mla v16.8H, v24.8H, v8.H[0] // .............................................................................................................................................................................*.................. + // add v10.8H, v11.8H, v22.8H // .......................................................................................................................*........................................................................ + // mul v7.8H, v15.8H, v0.H[4] // ...........................................................................................................*.................................................................................... + // sqrdmulh v31.8H, v15.8H, v0.H[5] // ..........................................................................................*..................................................................................................... + // mla v28.8H, v26.8H, v8.H[0] // .............................................................................*.................................................................................................................. 
+ // mla v14.8H, v27.8H, v8.H[0] // ....................................................................................................................*........................................................................... + // sub v29.8H, v11.8H, v22.8H // .......................................................................................................*........................................................................................ + // mla v20.8H, v5.8H, v8.H[0] // ................................................................................................................................*............................................................... + // sqrdmulh v24.8H, v18.8H, v0.H[3] // ...........................................................................................*.................................................................................................... + // add v27.8H, v13.8H, v16.8H // ..........................................................................................................................................................................................*..... + // mul v15.8H, v18.8H, v0.H[2] // .........................................................................................*...................................................................................................... + // sub v30.8H, v13.8H, v16.8H // ...........................................................................................................................................................................................*.... + // mla v9.8H, v25.8H, v8.H[0] // ..........................................................................................................................*..................................................................... 
+ // sub v11.8H, v17.8H, v28.8H // ..................................................................................................................*............................................................................. + // add v17.8H, v17.8H, v28.8H // ..............................................................................................................*................................................................................. + // mla v7.8H, v31.8H, v8.H[0] // ......................................................................................................................*......................................................................... + // add v6.8H, v19.8H, v14.8H // .................................................................................................................................................................................*.............. + // sub v31.8H, v23.8H, v20.8H // ........................................................................................................................................................................*....................... + // sub v22.8H, v19.8H, v14.8H // .................................................................................................................................................*.............................................. + // sub v16.8H, v29.8H, v7.8H // ..................................................................................................................................................*............................................. + // mul v5.8H, v11.8H, v0.H[4] // ................................................................................................................................................*............................................... 
+ // sqrdmulh v11.8H, v11.8H, v0.H[5] // ......................................................................................................................................*......................................................... + // sqrdmulh v14.8H, v17.8H, v0.H[3] // ...................................................................................................................................................*............................................ + // mul v17.8H, v17.8H, v0.H[2] // ...................................................................................................................................*............................................................ + // mla v15.8H, v24.8H, v8.H[0] // ......................................................................................................*......................................................................................... + // add v26.8H, v23.8H, v20.8H // ....................................................................................................................................................*........................................... + // sub v13.8H, v12.8H, v9.8H // .......................................................................................................................................................................*........................ + // add v18.8H, v12.8H, v9.8H // ......................................................................................................................................................................................*......... + // sqrdmulh v20.8H, v16.8H, v1.H[5] // .........................................................................................................................................................*...................................... 
+ // mul v19.8H, v22.8H, v0.H[4] // ............................................................................................................................................................*................................... + // mul v28.8H, v16.8H, v1.H[4] // ..............................................................................................................................................................*................................. + // mla v5.8H, v11.8H, v8.H[0] // .................................................................................................................................................................*.............................. + // mul v16.8H, v6.8H, v0.H[2] // .........................................................................................................................................................................................*...... + // sqrdmulh v24.8H, v22.8H, v0.H[5] // .............................................................................................................................................................*.................................. + // add v12.8H, v29.8H, v7.8H // .......................................................................................................................................*........................................................ + // mla v17.8H, v14.8H, v8.H[0] // ....................................................................................................................................................................*........................... + // sub v23.8H, v10.8H, v15.8H // ..........................................................................................................................................*..................................................... 
+ // add v11.8H, v10.8H, v15.8H // ....................................................................................................................................*........................................................... + // sqrdmulh v29.8H, v31.8H, v0.H[5] // ...............................................................................................................................................................................*................ + // mul v22.8H, v31.8H, v0.H[4] // ....................................................................................................................................................................................*........... + // mul v31.8H, v11.8H, v0.H[6] // ........................................................................................................................................................*....................................... + // sqrdmulh v15.8H, v11.8H, v0.H[7] // .......................................................................................................................................................*........................................ + // sqrdmulh v7.8H, v26.8H, v0.H[3] // ...............................................................................................................................................................................................* + // mul v14.8H, v26.8H, v0.H[2] // ..................................................................................................................................................................................*............. + // mla v28.8H, v20.8H, v8.H[0] // ...........................................................................................................................................................................*.................... 
+ // sqrdmulh v10.8H, v12.8H, v1.H[3] // ..............................................................................................................................................................................*................. + // sub v26.8H, v13.8H, v5.8H // ................................................................................................................................................................................*............... + // mul v9.8H, v12.8H, v1.H[2] // .....................................................................................................................................................................................*.......... + // sqrdmulh v25.8H, v6.8H, v0.H[3] // ........................................................................................................................................................................................*....... + // mla v19.8H, v24.8H, v8.H[0] // ..........................................................................................................................................................................*..................... + // mul v11.8H, v23.8H, v1.H[0] // ...................................................................................................................................................................................*............ + // mla v22.8H, v29.8H, v8.H[0] // ..............................................................................................................................................................................................*. + // mla v31.8H, v15.8H, v8.H[0] // ............................................................................................................................................................................*................... 
+ // sqrdmulh v12.8H, v23.8H, v1.H[1] // ...............................................................................................................................................................*................................ + // add v29.8H, v26.8H, v28.8H // .............................................................................................................................................................................................*.. + // sub v26.8H, v26.8H, v28.8H // .......................................................................................................................................................................................*........ + // add v23.8H, v18.8H, v17.8H // ............................................................................................................................................................................................*... + + sub count, count, #1 + cbnz count, layer1234_start + mla v14.8H, v7.8H, v8.H[0] // ....................................................................*........................................................................................................................... + add v20.8H, v13.8H, v5.8H // .....................................................................................*.......................................................................................................... + sub v5.8H, v23.8H, v31.8H // ........................................................................................................*....................................................................................... + add v24.8H, v23.8H, v31.8H // .........................................................................................................*...................................................................................... 
+ mul v23.8H, v29.8H, v3.H[2] // .......................................................................................................................................................................*........................ + sqrdmulh v15.8H, v29.8H, v3.H[3] // ......................................................................................................................................................................*......................... + add v29.8H, v30.8H, v22.8H // ..........................................................................................*..................................................................................................... + mla v11.8H, v12.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sub v28.8H, v18.8H, v17.8H // ................................................................*............................................................................................................................... + mul v18.8H, v24.8H, v1.H[6] // .........................................................................................................................................*...................................................... + sqrdmulh v13.8H, v24.8H, v1.H[7] // ........................................................................................................................................*....................................................... + sub v7.8H, v21.8H, v19.8H // ...............................................................................*................................................................................................................ 
+ sub v6.8H, v27.8H, v14.8H // .....................................................................*.......................................................................................................................... + mul v12.8H, v29.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + sqrdmulh v29.8H, v29.8H, v1.H[3] // ....................................................................................................................*........................................................................... + sub v24.8H, v30.8H, v22.8H // .........................................................................................*...................................................................................................... + mla v23.8H, v15.8H, v8.H[0] // ........................................................................................................................................................................*....................... + sub v30.8H, v28.8H, v11.8H // ..................................................................................................................*............................................................................. + add v11.8H, v28.8H, v11.8H // ...................................................................................................................*............................................................................ + mla v16.8H, v25.8H, v8.H[0] // ..........................................................*..................................................................................................................................... 
+ mla v18.8H, v13.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + sqrdmulh v13.8H, v24.8H, v1.H[5] // ..............................................................................................................................*................................................................. + sqrdmulh v28.8H, v6.8H, v1.H[1] // ..........................................................................................................*..................................................................................... + mul v25.8H, v6.8H, v1.H[0] // ...........................................................................................................*.................................................................................... + mul v15.8H, v24.8H, v1.H[4] // ...............................................................................................................................*................................................................ + add v31.8H, v27.8H, v14.8H // ......................................................................*......................................................................................................................... + sqrdmulh v27.8H, v11.8H, v2.H[3] // ..................................................................................................................................................*............................................. + mla v12.8H, v29.8H, v8.H[0] // ......................................................................................................................*......................................................................... 
+ sqrdmulh v24.8H, v30.8H, v2.H[5] // .......................................................................................................................................................*........................................ + add v14.8H, v21.8H, v19.8H // ................................................................................*............................................................................................................... + mul v19.8H, v30.8H, v2.H[4] // ........................................................................................................................................................*....................................... + mul v11.8H, v11.8H, v2.H[2] // ...................................................................................................................................................*............................................ + mla v25.8H, v28.8H, v8.H[0] // ............................................................................................................*................................................................................... + sqrdmulh v21.8H, v31.8H, v0.H[7] // ................................................................................................*............................................................................................... + mla v9.8H, v10.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + mul v10.8H, v31.8H, v0.H[6] // .................................................................................................*.............................................................................................. 
+ add v6.8H, v14.8H, v12.8H // ........................................................................................................................*....................................................................... + sub v22.8H, v4.8H, v16.8H // ...........................................................*.................................................................................................................................... + mla v15.8H, v13.8H, v8.H[0] // ................................................................................................................................*............................................................... + mul v17.8H, v5.8H, v2.H[0] // ..............................................................................................................................................*................................................. + sqrdmulh v13.8H, v5.8H, v2.H[1] // .............................................................................................................................................*.................................................. + mul v28.8H, v26.8H, v3.H[4] // ............................................................................................................................................................................*................... + mla v19.8H, v24.8H, v8.H[0] // .........................................................................................................................................................*...................................... + mla v11.8H, v27.8H, v8.H[0] // ....................................................................................................................................................*........................................... 
+ sub v31.8H, v20.8H, v9.8H // ............................................................................................................................*................................................................... + mla v10.8H, v21.8H, v8.H[0] // ..................................................................................................*............................................................................................. + sub v29.8H, v22.8H, v25.8H // .............................................................................................................*.................................................................................. + add v22.8H, v22.8H, v25.8H // ..............................................................................................................*................................................................................. + sqrdmulh v30.8H, v26.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + add v16.8H, v4.8H, v16.8H // ............................................................*................................................................................................................................... + add v20.8H, v20.8H, v9.8H // .............................................................................................................................*.................................................................. + sub v21.8H, v7.8H, v15.8H // .................................................................................................................................*.............................................................. 
+ mul v24.8H, v31.8H, v3.H[0] // ..................................................................................................................................................................*............................. + sub v25.8H, v22.8H, v11.8H // .....................................................................................................................................................*.......................................... + add v4.8H, v29.8H, v19.8H // ...........................................................................................................................................................*.................................... + sqrdmulh v9.8H, v31.8H, v3.H[1] // .................................................................................................................................................................*.............................. + add v5.8H, v22.8H, v11.8H // ......................................................................................................................................................*......................................... + mul v11.8H, v20.8H, v2.H[6] // .............................................................................................................................................................*.................................. + sqrdmulh v22.8H, v20.8H, v2.H[7] // ............................................................................................................................................................*................................... + add v20.8H, v7.8H, v15.8H // ..................................................................................................................................*............................................................. 
+ str q4, [x6, #192] // ......................................................................................................................................................................................*......... + sub v27.8H, v16.8H, v10.8H // ...................................................................................................*............................................................................................ + str q25, [x6, #160] // .....................................................................................................................................................................................*.......... + add v31.8H, v16.8H, v10.8H // ....................................................................................................*........................................................................................... + sub v26.8H, v29.8H, v19.8H // ..........................................................................................................................................................*..................................... + mla v17.8H, v13.8H, v8.H[0] // ...............................................................................................................................................*................................................ + mla v28.8H, v30.8H, v8.H[0] // .............................................................................................................................................................................*.................. + str q5, [x6, #128] // ....................................................................................................................................................................................*........... 
+ mla v24.8H, v9.8H, v8.H[0] // ...................................................................................................................................................................*............................ + sub v29.8H, v20.8H, v23.8H // .........................................................................................................................................................................*...................... + add v23.8H, v20.8H, v23.8H // ..........................................................................................................................................................................*..................... + sub v30.8H, v14.8H, v12.8H // .......................................................................................................................*........................................................................ + sub v15.8H, v31.8H, v18.8H // ...........................................................................................................................................*.................................................... + add v16.8H, v31.8H, v18.8H // ............................................................................................................................................*................................................... + mla v11.8H, v22.8H, v8.H[0] // ..............................................................................................................................................................*................................. + str q26, [x6, #224] // .......................................................................................................................................................................................*........ 
+ str q23, [x14, #128] // ............................................................................................................................................................................................*... + str q29, [x14, #160] // .............................................................................................................................................................................................*.. + add v26.8H, v27.8H, v17.8H // .................................................................................................................................................*.............................................. + sub v14.8H, v27.8H, v17.8H // ................................................................................................................................................*............................................... + sub v27.8H, v21.8H, v28.8H // ..............................................................................................................................................................................*................. + add v28.8H, v21.8H, v28.8H // ...............................................................................................................................................................................*................ + str q15, [x6, #32] // .................................................................................................................................................................................*.............. + str q16, [x6], #16 // ................................................................................................................................................................................*............... + add v16.8H, v30.8H, v24.8H // .....................................................................................................................................................................*.......................... 
+ sub v15.8H, v30.8H, v24.8H // ....................................................................................................................................................................*........................... + add v21.8H, v6.8H, v11.8H // ................................................................................................................................................................*............................... + sub v4.8H, v6.8H, v11.8H // ...............................................................................................................................................................*................................ + str q26, [x6, #48] // ..................................................................................................................................................................................*............. + str q14, [x6, #80] // ...................................................................................................................................................................................*............ + str q28, [x14, #192] // ..............................................................................................................................................................................................*. + str q15, [x14, #96] // ...........................................................................................................................................................................................*.... + str q21, [x14], #16 // ........................................................................................................................................................................................*....... 
+ str q27, [x14, #208] // ...............................................................................................................................................................................................* + str q4, [x14, #16] // .........................................................................................................................................................................................*...... + str q16, [x14, #48] // ..........................................................................................................................................................................................*..... + + restore inp, STACK0 + mov count, #4 + + ASM_LOAD(r_ptr1, roots_l456) + + add src0, inp, #256*0 + add src1, inp, #256*1 + + .p2align 2 + // Instructions: 118 + // Expected cycles: 45 + // Expected IPC: 2.62 + // + // Cycle bound: 45.0 + // IPC bound: 2.62 + // + // Wall time: 20.96s + // User time: 20.96s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ldr q13, [x4, #32] // .....*................................................................................................................ + ldr q30, [x4, #64] // ....*................................................................................................................. + ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x6] // *..................................................................................................................... + ld4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x7] // ..*................................................................................................................... + ldr q27, [x4, #16] // ............*......................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q15, [x4, #80] // ........*............................................................................................................. + ldr q29, [x4, #144] // ...*.................................................................................................................. + ldr q0, [x4], #16*14 // ..........*........................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q7, [x4, #-32] // ................................................*..................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q25, [x4, #-96] // .......*.............................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q24, [x4, #-64] // .........*............................................................................................................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q1, [x4, #-112] // ..............................................*....................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v11.4S, v22.4S, v4.4S // .................*.................................................................................................... 
+ trn2 v18.4S, v21.4S, v3.4S // ...............*...................................................................................................... + trn1 v16.4S, v21.4S, v3.4S // ..................*................................................................................................... + ldr q9, [x4, #-176] // ......*............................................................................................................... + trn2 v26.4S, v23.4S, v5.4S // ................*..................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v6.4S, v22.4S, v4.4S // ...................*.................................................................................................. + trn1 v10.4S, v20.4S, v2.4S // .............*........................................................................................................ + trn2 v31.4S, v20.4S, v2.4S // ..............*....................................................................................................... + trn1 v20.4S, v23.4S, v5.4S // ..........................*........................................................................................... + ldr q14, [x4, #-48] // ...........*.......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqrdmulh v23.8H, v18.8H, v27.8H // ....................*................................................................................................. + mul v19.8H, v18.8H, v0.8H // .....................*................................................................................................ + mul v18.8H, v26.8H, v0.8H // ......................*............................................................................................... + sqrdmulh v26.8H, v26.8H, v27.8H // .......................*.............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v17.8H, v6.8H, v0.8H // ........................*............................................................................................. + sqrdmulh v6.8H, v6.8H, v27.8H // .........................*............................................................................................ + mul v12.8H, v31.8H, v0.8H // ...........................*.......................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v28.8H, v31.8H, v27.8H // ..............................*....................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v18.8H, v26.8H, v8.H[0] // ............................*......................................................................................... + mla v19.8H, v23.8H, v8.H[0] // .............................*........................................................................................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v17.8H, v6.8H, v8.H[0] // ...............................*...................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v12.8H, v28.8H, v8.H[0] // .................................*.................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v4.8H, v16.8H, v19.8H // ................................*..................................................................................... + add v21.8H, v16.8H, v19.8H // ....................................................*................................................................. + add v16.8H, v20.8H, v18.8H // ...................................*.................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ sub v26.8H, v20.8H, v18.8H // ..................................*................................................................................... + add v3.8H, v11.8H, v17.8H // .....................................*................................................................................ + sub v6.8H, v11.8H, v17.8H // ....................................*................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v11.8H, v16.8H, v9.8H // ......................................*............................................................................... + mul v17.8H, v16.8H, v13.8H // .......................................*.............................................................................. + mul v18.8H, v26.8H, v30.8H // ........................................*............................................................................. + sqrdmulh v5.8H, v26.8H, v15.8H // .........................................*............................................................................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v20.8H, v6.8H, v15.8H // .............................................*........................................................................ + mul v0.8H, v6.8H, v30.8H // ............................................*......................................................................... + mul v26.8H, v3.8H, v13.8H // ...............................................*...................................................................... + ldr q6, [x4, #-16] // .*.................................................................................................................... + sqrdmulh v27.8H, v3.8H, v9.8H // ..........................................*........................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v9.8H, v10.8H, v12.8H // .................................................*.................................................................... + sub v10.8H, v10.8H, v12.8H // ...........................................*.......................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v18.8H, v5.8H, v8.H[0] // ...................................................*.................................................................. + mla v17.8H, v11.8H, v8.H[0] // ..................................................*................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v0.8H, v20.8H, v8.H[0] // .....................................................*................................................................ 
+ mla v26.8H, v27.8H, v8.H[0] // ......................................................*............................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q28, [x4, #-128] // .......................................................*.............................................................. 
+ sub v23.8H, v4.8H, v18.8H // .........................................................*............................................................ + add v5.8H, v4.8H, v18.8H // ........................................................*............................................................. + add v12.8H, v21.8H, v17.8H // ............................................................*......................................................... + sub v17.8H, v21.8H, v17.8H // ..........................................................*........................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v21.8H, v9.8H, v26.8H // ...........................................................................*.......................................... + add v15.8H, v9.8H, v26.8H // .....................................................................*................................................ + sub v30.8H, v10.8H, v0.8H // ..........................................................................*........................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v19.8H, v23.8H, v6.8H // ................................................................*..................................................... + sqrdmulh v4.8H, v17.8H, v29.8H // ..............................................................*....................................................... + mul v16.8H, v17.8H, v25.8H // .............................................................*........................................................ + mul v20.8H, v23.8H, v7.8H // ...............................................................*...................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v9.8H, v12.8H, v28.8H // ....................................................................*................................................. + sqrdmulh v6.8H, v5.8H, v14.8H // .................................................................*.................................................... + sqrdmulh v28.8H, v12.8H, v1.8H // ...................................................................*.................................................. 
+ mul v29.8H, v5.8H, v24.8H // ..................................................................*................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v17.8H, v10.8H, v0.8H // ...........................................................*.......................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v16.8H, v4.8H, v8.H[0] // ......................................................................*............................................... 
+ mla v20.8H, v19.8H, v8.H[0] // .......................................................................*.............................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v29.8H, v6.8H, v8.H[0] // ........................................................................*............................................. + mla v9.8H, v28.8H, v8.H[0] // .........................................................................*............................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v10.8H, v21.8H, v16.8H // .............................................................................*........................................ + sub v23.8H, v21.8H, v16.8H // ............................................................................*......................................... + add v11.8H, v30.8H, v20.8H // ...............................................................................*...................................... + sub v13.8H, v30.8H, v20.8H // ..............................................................................*....................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v16.8H, v17.8H, v29.8H // .................................................................................*.................................... + sub v17.8H, v17.8H, v29.8H // ................................................................................*..................................... + sub v3.8H, v15.8H, v9.8H // ..................................................................................*................................... + add v7.8H, v15.8H, v9.8H // ...................................................................................*.................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v0.8H, v23.8H, v8.H[1] // .....................................................................................*................................ + sqdmulh v14.8H, v13.8H, v8.H[1] // ....................................................................................*................................. + sqdmulh v26.8H, v10.8H, v8.H[1] // ......................................................................................*............................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v29.8H, v11.8H, v8.H[1] // .......................................................................................*.............................. + sqdmulh v6.8H, v17.8H, v8.H[1] // .........................................................................................*............................ + sqdmulh v27.8H, v7.8H, v8.H[1] // ..........................................................................................*........................... + sqdmulh v12.8H, v16.8H, v8.H[1] // ........................................................................................*............................. + sqdmulh v15.8H, v3.8H, v8.H[1] // ...........................................................................................*.......................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v9.8H, v0.8H, #11 // ..............................................................................................*....................... + srshr v0.8H, v29.8H, #11 // ............................................................................................*......................... + srshr v14.8H, v14.8H, #11 // ...............................................................................................*...................... + srshr v21.8H, v26.8H, #11 // .............................................................................................*........................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + srshr v19.8H, v27.8H, #11 // ..................................................................................................*................... + srshr v26.8H, v12.8H, #11 // ................................................................................................*..................... + srshr v6.8H, v6.8H, #11 // .................................................................................................*.................... + srshr v15.8H, v15.8H, #11 // ...................................................................................................*.................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v10.8H, v21.8H, v8.H[0] // .....................................................................................................*................ + mla v11.8H, v0.8H, v8.H[0] // .......................................................................................................*.............. + mla v23.8H, v9.8H, v8.H[0] // ....................................................................................................*................. + mla v13.8H, v14.8H, v8.H[0] // ......................................................................................................*............... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v3.8H, v15.8H, v8.H[0] // ...........................................................................................................*.......... + mla v7.8H, v19.8H, v8.H[0] // .........................................................................................................*............ + mla v16.8H, v26.8H, v8.H[0] // ..........................................................................................................*........... 
+ mla v17.8H, v6.8H, v8.H[0] // ........................................................................................................*............. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v27.4S, v23.4S, v13.4S // ............................................................................................................*......... 
+ trn1 v15.4S, v23.4S, v13.4S // .............................................................................................................*........ + trn1 v14.4S, v10.4S, v11.4S // ...............................................................................................................*...... + trn2 v26.4S, v10.4S, v11.4S // ..............................................................................................................*....... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v13.4S, v3.4S, v17.4S // .................................................................................................................*.... + trn2 v25.4S, v3.4S, v17.4S // ..................................................................................................................*... + trn2 v24.4S, v7.4S, v16.4S // ................................................................................................................*..... + trn1 v12.4S, v7.4S, v16.4S // ...................................................................................................................*.. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x7], #64 // ....................................................................................................................*. + st4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x6], #64 // .....................................................................................................................* + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + + // --------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x6] // ..*................................................................................................................... + // ldr q9, [x4, #208] // ...............................................*...................................................................... + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x7] // ...*.................................................................................................................. + // ldr q30, [x4, #144] // ......*............................................................................................................... + // ldr q22, [x4, #64] // .*.................................................................................................................... + // ldr q13, [x4, #32] // *..................................................................................................................... + // ldr q28, [x4, #48] // ...............*...................................................................................................... + // ldr q12, [x4, #128] // .........*............................................................................................................ 
+ // ldr q23, [x4, #80] // .....*................................................................................................................ + // ldr q2, [x4, #160] // ..........*........................................................................................................... + // ldr q0, [x4], #16*14 // .......*.............................................................................................................. + // ldr q17, [x4, #-48] // .....................*................................................................................................ + // ldr q7, [x4, #-208] // ....*................................................................................................................. + // trn1 v24.4S, v18.4S, v3.4S // ..................*................................................................................................... + // trn2 v18.4S, v18.4S, v3.4S // ...................*.................................................................................................. + // trn2 v11.4S, v19.4S, v4.4S // .............*........................................................................................................ + // trn2 v27.4S, v21.4S, v6.4S // ................*..................................................................................................... + // trn1 v10.4S, v20.4S, v5.4S // ............*......................................................................................................... + // trn1 v1.4S, v19.4S, v4.4S // ..............*....................................................................................................... + // trn2 v14.4S, v20.4S, v5.4S // .................*.................................................................................................... + // sqrdmulh v26.8H, v11.8H, v7.8H // ......................*............................................................................................... 
+ // mul v31.8H, v11.8H, v0.8H // .......................*.............................................................................................. + // mul v15.8H, v27.8H, v0.8H // ........................*............................................................................................. + // sqrdmulh v16.8H, v27.8H, v7.8H // .........................*............................................................................................ + // mul v27.8H, v14.8H, v0.8H // ..........................*........................................................................................... + // sqrdmulh v14.8H, v14.8H, v7.8H // ...........................*.......................................................................................... + // trn1 v11.4S, v21.4S, v6.4S // ....................*................................................................................................. + // mul v5.8H, v18.8H, v0.8H // ............................*......................................................................................... + // mla v15.8H, v16.8H, v8.H[0] // ..............................*....................................................................................... + // mla v31.8H, v26.8H, v8.H[0] // ...............................*...................................................................................... + // sqrdmulh v0.8H, v18.8H, v7.8H // .............................*........................................................................................ + // mla v27.8H, v14.8H, v8.H[0] // ................................*..................................................................................... + // sub v29.8H, v1.8H, v31.8H // ..................................*................................................................................... 
+ // mla v5.8H, v0.8H, v8.H[0] // .................................*.................................................................................... + // sub v16.8H, v11.8H, v15.8H // .....................................*................................................................................ + // add v15.8H, v11.8H, v15.8H // ....................................*................................................................................. + // sub v7.8H, v10.8H, v27.8H // .......................................*.............................................................................. + // add v0.8H, v10.8H, v27.8H // ......................................*............................................................................... + // sqrdmulh v6.8H, v15.8H, v28.8H // ........................................*............................................................................. + // mul v14.8H, v15.8H, v13.8H // .........................................*............................................................................ + // mul v15.8H, v16.8H, v22.8H // ..........................................*........................................................................... + // sqrdmulh v16.8H, v16.8H, v23.8H // ...........................................*.......................................................................... + // sqrdmulh v26.8H, v0.8H, v28.8H // ................................................*..................................................................... + // sub v3.8H, v24.8H, v5.8H // ..................................................*................................................................... + // mul v11.8H, v7.8H, v22.8H // .............................................*........................................................................ 
+ // sqrdmulh v27.8H, v7.8H, v23.8H // ............................................*......................................................................... + // ldr q22, [x4, #-112] // ...........*.......................................................................................................... + // mul v13.8H, v0.8H, v13.8H // ..............................................*....................................................................... + // ldr q7, [x4, #-32] // ........*............................................................................................................. + // add v25.8H, v24.8H, v5.8H // .................................................*.................................................................... + // mla v14.8H, v6.8H, v8.H[0] // ....................................................*................................................................. + // mla v15.8H, v16.8H, v8.H[0] // ...................................................*.................................................................. + // add v0.8H, v1.8H, v31.8H // ...................................*.................................................................................. + // mla v11.8H, v27.8H, v8.H[0] // .....................................................*................................................................ + // mla v13.8H, v26.8H, v8.H[0] // ......................................................*............................................................... + // ldr q1, [x4, #-128] // .......................................................*.............................................................. + // add v27.8H, v29.8H, v15.8H // .........................................................*............................................................ + // sub v16.8H, v29.8H, v15.8H // ........................................................*............................................................. 
+ // sub v26.8H, v0.8H, v14.8H // ...........................................................*.......................................................... + // add v31.8H, v3.8H, v11.8H // .......................................................................*.............................................. + // add v0.8H, v0.8H, v14.8H // ..........................................................*........................................................... + // mul v6.8H, v26.8H, v12.8H // .................................................................*.................................................... + // sqrdmulh v26.8H, v26.8H, v30.8H // ................................................................*..................................................... + // mul v15.8H, v16.8H, v7.8H // ..................................................................*................................................... + // sqrdmulh v16.8H, v16.8H, v9.8H // ...............................................................*...................................................... + // sqrdmulh v14.8H, v27.8H, v17.8H // ....................................................................*................................................. + // mul v27.8H, v27.8H, v2.8H // ......................................................................*............................................... + // sqrdmulh v7.8H, v0.8H, v22.8H // .....................................................................*................................................ + // mul v0.8H, v0.8H, v1.8H // ...................................................................*.................................................. + // add v2.8H, v25.8H, v13.8H // .............................................................*........................................................ 
+ // mla v6.8H, v26.8H, v8.H[0] // ........................................................................*............................................. + // mla v15.8H, v16.8H, v8.H[0] // .........................................................................*............................................ + // mla v27.8H, v14.8H, v8.H[0] // ..........................................................................*........................................... + // mla v0.8H, v7.8H, v8.H[0] // ...........................................................................*.......................................... + // sub v14.8H, v3.8H, v11.8H // ..............................................................*....................................................... + // sub v7.8H, v25.8H, v13.8H // ............................................................*......................................................... + // sub v3.8H, v7.8H, v6.8H // .............................................................................*........................................ + // add v22.8H, v7.8H, v6.8H // ............................................................................*......................................... + // sub v13.8H, v14.8H, v15.8H // ...............................................................................*...................................... + // add v11.8H, v14.8H, v15.8H // ..............................................................................*....................................... + // sub v25.8H, v31.8H, v27.8H // .................................................................................*.................................... + // add v31.8H, v31.8H, v27.8H // ................................................................................*..................................... + // sub v1.8H, v2.8H, v0.8H // ..................................................................................*................................... 
+ // add v2.8H, v2.8H, v0.8H // ...................................................................................*.................................. + // sqdmulh v15.8H, v13.8H, v8.H[1] // .....................................................................................*................................ + // sqdmulh v6.8H, v3.8H, v8.H[1] // ....................................................................................*................................. + // sqdmulh v26.8H, v22.8H, v8.H[1] // ......................................................................................*............................... + // sqdmulh v16.8H, v11.8H, v8.H[1] // .......................................................................................*.............................. + // sqdmulh v14.8H, v31.8H, v8.H[1] // ..........................................................................................*........................... + // sqdmulh v27.8H, v25.8H, v8.H[1] // ........................................................................................*............................. + // sqdmulh v7.8H, v2.8H, v8.H[1] // .........................................................................................*............................ + // sqdmulh v0.8H, v1.8H, v8.H[1] // ...........................................................................................*.......................... + // srshr v16.8H, v16.8H, #11 // .............................................................................................*........................ + // srshr v26.8H, v26.8H, #11 // ...............................................................................................*...................... + // srshr v6.8H, v6.8H, #11 // ............................................................................................*......................... 
+ // srshr v15.8H, v15.8H, #11 // ..............................................................................................*....................... + // srshr v14.8H, v14.8H, #11 // .................................................................................................*.................... + // srshr v27.8H, v27.8H, #11 // ..................................................................................................*................... + // srshr v7.8H, v7.8H, #11 // ................................................................................................*..................... + // srshr v0.8H, v0.8H, #11 // ...................................................................................................*.................. + // mla v3.8H, v6.8H, v8.H[0] // ......................................................................................................*............... + // mla v22.8H, v26.8H, v8.H[0] // ....................................................................................................*................. + // mla v13.8H, v15.8H, v8.H[0] // .......................................................................................................*.............. + // mla v11.8H, v16.8H, v8.H[0] // .....................................................................................................*................ + // mla v25.8H, v27.8H, v8.H[0] // ...........................................................................................................*.......... + // mla v2.8H, v7.8H, v8.H[0] // .........................................................................................................*............ + // mla v31.8H, v14.8H, v8.H[0] // ..........................................................................................................*........... + // mla v1.8H, v0.8H, v8.H[0] // ........................................................................................................*............. 
+ // trn2 v18.4S, v3.4S, v13.4S // ............................................................................................................*......... + // trn1 v30.4S, v3.4S, v13.4S // .............................................................................................................*........ + // trn2 v17.4S, v22.4S, v11.4S // ...............................................................................................................*...... + // trn1 v29.4S, v22.4S, v11.4S // ..............................................................................................................*....... + // trn2 v15.4S, v2.4S, v31.4S // ..................................................................................................................*... + // trn1 v28.4S, v1.4S, v25.4S // ................................................................................................................*..... + // trn2 v16.4S, v1.4S, v25.4S // .................................................................................................................*.... + // trn1 v27.4S, v2.4S, v31.4S // ...................................................................................................................*.. + // st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // ....................................................................................................................*. 
+ // st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x6], #64 // .....................................................................................................................* + + sub count, count, #1 +layer567_start: + // Instructions: 118 + // Expected cycles: 45 + // Expected IPC: 2.62 + // + // Cycle bound: 45.0 + // IPC bound: 2.62 + // + // Wall time: 316.01s + // User time: 316.01s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x6] // e..................................................................................................................... + ldr q9, [x4, #208] // .......................e.............................................................................................. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x7] // .e.................................................................................................................... + ldr q30, [x4, #144] // ...................e.................................................................................................. + ldr q22, [x4, #64] // ..............e....................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q13, [x4, #32] // ............e......................................................................................................... 
+ ldr q28, [x4, #48] // .............e........................................................................................................ + ldr q12, [x4, #128] // ..................e................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q23, [x4, #80] // ...............e...................................................................................................... + ldr q2, [x4, #160] // ....................e................................................................................................. + ldr q0, [x4], #16*14 // ..........e........................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + ldr q17, [x4, #-48] // .....................e................................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q7, [x4, #-208] // ...........e.......................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v24.4S, v18.4S, v3.4S // ..e................................................................................................................... + trn2 v18.4S, v18.4S, v3.4S // ...e.................................................................................................................. + trn2 v11.4S, v19.4S, v4.4S // .....e................................................................................................................ + trn2 v27.4S, v21.4S, v6.4S // .........e............................................................................................................ 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v10.4S, v20.4S, v5.4S // ......e............................................................................................................... + trn1 v1.4S, v19.4S, v4.4S // ....e................................................................................................................. + trn2 v14.4S, v20.4S, v5.4S // .......e.............................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v11.8H, v7.8H // .............................e........................................................................................ + mul v31.8H, v11.8H, v0.8H // ..............................e....................................................................................... 
+ mul v15.8H, v27.8H, v0.8H // ........................................e............................................................................. + sqrdmulh v16.8H, v27.8H, v7.8H // .......................................e.............................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v27.8H, v14.8H, v0.8H // ...................................e.................................................................................. + sqrdmulh v14.8H, v14.8H, v7.8H // ..................................e................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ trn1 v11.4S, v21.4S, v6.4S // ........e............................................................................................................. + mul v5.8H, v18.8H, v0.8H // .........................e............................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v15.8H, v16.8H, v8.H[0] // .........................................e............................................................................ + mla v31.8H, v26.8H, v8.H[0] // ...............................e...................................................................................... + sqrdmulh v0.8H, v18.8H, v7.8H // ........................e............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v27.8H, v14.8H, v8.H[0] // ....................................e................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v29.8H, v1.8H, v31.8H // ................................e..................................................................................... + mla v5.8H, v0.8H, v8.H[0] // ..........................e........................................................................................... + sub v16.8H, v11.8H, v15.8H // ..........................................e........................................................................... + add v15.8H, v11.8H, v15.8H // ...........................................e.......................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v7.8H, v10.8H, v27.8H // .....................................e................................................................................ + add v0.8H, v10.8H, v27.8H // ......................................e............................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v15.8H, v28.8H // .................................................e.................................................................... + mul v14.8H, v15.8H, v13.8H // ..................................................e................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v15.8H, v16.8H, v22.8H // ............................................................e......................................................... + sqrdmulh v16.8H, v16.8H, v23.8H // ...........................................................e.......................................................... + sqrdmulh v26.8H, v0.8H, v28.8H // ............................................e......................................................................... 
+ sub v3.8H, v24.8H, v5.8H // ...........................e.......................................................................................... + mul v11.8H, v7.8H, v22.8H // .......................................................e.............................................................. + sqrdmulh v27.8H, v7.8H, v23.8H // ......................................................e............................................................... + ldr q22, [x4, #-112] // .................e.................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v13.8H, v0.8H, v13.8H // .............................................e........................................................................ + ldr q7, [x4, #-32] // ......................e............................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + add v25.8H, v24.8H, v5.8H // ............................e......................................................................................... + mla v14.8H, v6.8H, v8.H[0] // ...................................................e.................................................................. + mla v15.8H, v16.8H, v8.H[0] // .............................................................e........................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v0.8H, v1.8H, v31.8H // .................................e.................................................................................... + mla v11.8H, v27.8H, v8.H[0] // ........................................................e............................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v13.8H, v26.8H, v8.H[0] // ..............................................e....................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q1, [x4, #-128] // ................e..................................................................................................... + add v27.8H, v29.8H, v15.8H // ...............................................................e...................................................... + sub v16.8H, v29.8H, v15.8H // ..............................................................e....................................................... + sub v26.8H, v0.8H, v14.8H // ....................................................e................................................................. 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v31.8H, v3.8H, v11.8H // ..........................................................e........................................................... + add v0.8H, v0.8H, v14.8H // .....................................................e................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v6.8H, v26.8H, v12.8H // ......................................................................e............................................... 
+ sqrdmulh v26.8H, v26.8H, v30.8H // .....................................................................e................................................ + mul v15.8H, v16.8H, v7.8H // ................................................................................e..................................... + sqrdmulh v16.8H, v16.8H, v9.8H // ...............................................................................e...................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v14.8H, v27.8H, v17.8H // ..........................................................................e........................................... + mul v27.8H, v27.8H, v2.8H // ...........................................................................e.......................................... + sqrdmulh v7.8H, v0.8H, v22.8H // ................................................................e..................................................... + mul v0.8H, v0.8H, v1.8H // .................................................................e.................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + add v2.8H, v25.8H, v13.8H // ................................................e..................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v6.8H, v26.8H, v8.H[0] // .......................................................................e.............................................. + mla v15.8H, v16.8H, v8.H[0] // .................................................................................e.................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v27.8H, v14.8H, v8.H[0] // ............................................................................e......................................... + mla v0.8H, v7.8H, v8.H[0] // ..................................................................e................................................... + sub v14.8H, v3.8H, v11.8H // .........................................................e............................................................ + sub v7.8H, v25.8H, v13.8H // ...............................................e...................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v3.8H, v7.8H, v6.8H // ........................................................................e............................................. + add v22.8H, v7.8H, v6.8H // .........................................................................e............................................ + sub v13.8H, v14.8H, v15.8H // ..................................................................................e................................... + add v11.8H, v14.8H, v15.8H // ...................................................................................e.................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v25.8H, v31.8H, v27.8H // .............................................................................e........................................ 
+ add v31.8H, v31.8H, v27.8H // ..............................................................................e....................................... + sub v1.8H, v2.8H, v0.8H // ...................................................................e.................................................. + add v2.8H, v2.8H, v0.8H // ....................................................................e................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v15.8H, v13.8H, v8.H[1] // .........................................................................................................e............ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v6.8H, v3.8H, v8.H[1] // .............................................................................................e........................ + sqdmulh v26.8H, v22.8H, v8.H[1] // ..........................................................................................e........................... 
+ sqdmulh v16.8H, v11.8H, v8.H[1] // ......................................................................................................e............... + sqdmulh v14.8H, v31.8H, v8.H[1] // ................................................................................................e..................... + sqdmulh v27.8H, v25.8H, v8.H[1] // ...................................................................................................e.................. + sqdmulh v7.8H, v2.8H, v8.H[1] // ....................................................................................e................................. + sqdmulh v0.8H, v1.8H, v8.H[1] // .......................................................................................e.............................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v16.8H, v16.8H, #11 // .......................................................................................................e.............. + srshr v26.8H, v26.8H, #11 // ...........................................................................................e.......................... + srshr v6.8H, v6.8H, #11 // ..............................................................................................e....................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v15.8H, v15.8H, #11 // ..........................................................................................................e........... + srshr v14.8H, v14.8H, #11 // .................................................................................................e.................... + srshr v27.8H, v27.8H, #11 // ....................................................................................................e................. + srshr v7.8H, v7.8H, #11 // .....................................................................................e................................ 
+ srshr v0.8H, v0.8H, #11 // ........................................................................................e............................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v3.8H, v6.8H, v8.H[0] // ...............................................................................................e...................... 
+ mla v22.8H, v26.8H, v8.H[0] // ............................................................................................e......................... + mla v13.8H, v15.8H, v8.H[0] // ...........................................................................................................e.......... + mla v11.8H, v16.8H, v8.H[0] // ........................................................................................................e............. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v25.8H, v27.8H, v8.H[0] // .....................................................................................................e................ + mla v2.8H, v7.8H, v8.H[0] // ......................................................................................e............................... + mla v31.8H, v14.8H, v8.H[0] // ..................................................................................................e................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ mla v1.8H, v0.8H, v8.H[0] // .........................................................................................e............................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v18.4S, v3.4S, v13.4S // ...................................................................................................................e.. + trn1 v30.4S, v3.4S, v13.4S // ..................................................................................................................e... + trn2 v17.4S, v22.4S, v11.4S // .................................................................................................................e.... + trn1 v29.4S, v22.4S, v11.4S // ................................................................................................................e..... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v15.4S, v2.4S, v31.4S // .............................................................................................................e........ + trn1 v28.4S, v1.4S, v25.4S // ..............................................................................................................e....... + trn2 v16.4S, v1.4S, v25.4S // ...............................................................................................................e...... + trn1 v27.4S, v2.4S, v31.4S // ............................................................................................................e......... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x7], #64 // .....................................................................................................................e + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x6], #64 // ....................................................................................................................e. 
+ + // ---------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------ + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // e.....................................................................................................................' + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // ..e...................................................................................................................' + // trn1 v9.4s, v17.4s, v21.4s // .............e........................................................................................................' + // trn2 v13.4s, v17.4s, v21.4s // ..............e.......................................................................................................' + // trn1 v10.4s, v18.4s, v22.4s // ..................e...................................................................................................' + // trn2 v14.4s, v18.4s, v22.4s // ...............e......................................................................................................' + // trn1 v11.4s, v19.4s, v23.4s // .................e....................................................................................................' + // trn2 v15.4s, v19.4s, v23.4s // ...................e..................................................................................................' + // trn1 v12.4s, v20.4s, v24.4s // ..........................e...........................................................................................' + // trn2 v16.4s, v20.4s, v24.4s // ................e.....................................................................................................' 
+ // ldr q0, [ x4], #16*14 // ..........e...........................................................................................................' + // ldr q4, [x4, #-16*14+16*1] // ............e.........................................................................................................' + // ldr q1, [ x4, #-16*14+16*2] // .....e................................................................................................................' + // ldr q5, [x4, #-16*14+16*3] // ......e...............................................................................................................' + // ldr q2, [ x4, #-16*14+16*4] // ....e.................................................................................................................' + // ldr q6, [x4, #-16*14+16*5] // ........e.............................................................................................................' + // ldr q3, [ x4, #-16*14+16*6] // .......................................................e..............................................................' + // ldr q7, [x4, #-16*14+16*7] // ..............................................e.......................................................................' + // ldr q17, [ x4, #-16*14+16*8] // .......e..............................................................................................................' + // ldr q18, [ x4, #-16*14+16*9] // ...e..................................................................................................................' + // ldr q19, [ x4, #-16*14+16*10] // .........e............................................................................................................' + // ldr q20, [ x4, #-16*14+16*11] // ...........e..........................................................................................................' 
+ // ldr q21, [ x4, #-16*14+16*12] // ................................................e.....................................................................' + // ldr q22, [ x4, #-16*14+16*13] // .e....................................................................................................................' + // sqrdmulh v28.8h, v13.8h, v4.8h // ..............................e.......................................................................................' + // mul v25.8h, v13.8h, v0.8h // ...........................e..........................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // .................................e....................................................................................' + // sub v13.8h, v9.8h, v25.8h // ...........................................e..........................................................................' + // add v9.8h, v9.8h, v25.8h // .................................................e....................................................................' + // sqrdmulh v28.8h, v14.8h, v4.8h // ....................e.................................................................................................' + // mul v25.8h, v14.8h, v0.8h // .....................e................................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // .............................e........................................................................................' + // sub v14.8h, v10.8h, v25.8h // ................................e.....................................................................................' + // add v10.8h, v10.8h, v25.8h // ....................................................e.................................................................' 
+ // sqrdmulh v28.8h, v15.8h, v4.8h // .........................e............................................................................................' + // mul v25.8h, v15.8h, v0.8h // ........................e.............................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ...............................e......................................................................................' + // sub v15.8h, v11.8h, v25.8h // ....................................e.................................................................................' + // add v11.8h, v11.8h, v25.8h // .....................................e................................................................................' + // sqrdmulh v28.8h, v16.8h, v4.8h // .......................e..............................................................................................' + // mul v25.8h, v16.8h, v0.8h // ......................e...............................................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ............................e.........................................................................................' + // sub v16.8h, v12.8h, v25.8h // ..................................e...................................................................................' + // add v12.8h, v12.8h, v25.8h // ...................................e..................................................................................' + // sqrdmulh v28.8h, v11.8h, v5.8h // ..........................................e...........................................................................' + // mul v25.8h, v11.8h, v1.8h // ...............................................e......................................................................' 
+ // mla v25.8h, v28.8h, v8.h[0] // ......................................................e...............................................................' + // sub v11.8h, v9.8h, v25.8h // ...........................................................................e..........................................' + // add v9.8h, v9.8h, v25.8h // .....................................................................e................................................' + // sqrdmulh v28.8h, v12.8h, v5.8h // ......................................e...............................................................................' + // mul v25.8h, v12.8h, v1.8h // .......................................e..............................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ..................................................e...................................................................' + // sub v12.8h, v10.8h, v25.8h // ..........................................................e...........................................................' + // add v10.8h, v10.8h, v25.8h // ............................................................e.........................................................' + // sqrdmulh v28.8h, v15.8h, v6.8h // .............................................e........................................................................' + // mul v25.8h, v15.8h, v2.8h // ............................................e.........................................................................' + // mla v25.8h, v28.8h, v8.h[0] // .....................................................e................................................................' + // sub v15.8h, v13.8h, v25.8h // ..........................................................................e...........................................' 
+ // add v13.8h, v13.8h, v25.8h // ...........................................................e..........................................................' + // sqrdmulh v28.8h, v16.8h, v6.8h // .........................................e............................................................................' + // mul v25.8h, v16.8h, v2.8h // ........................................e.............................................................................' + // mla v25.8h, v28.8h, v8.h[0] // ...................................................e..................................................................' + // sub v16.8h, v14.8h, v25.8h // .........................................................e............................................................' + // add v14.8h, v14.8h, v25.8h // ........................................................e.............................................................' + // sqrdmulh v28.8h, v10.8h, v7.8h // ...................................................................e..................................................' + // mul v25.8h, v10.8h, v3.8h // ....................................................................e.................................................' + // mla v25.8h, v28.8h, v8.h[0] // .........................................................................e............................................' + // sub v10.8h, v9.8h, v25.8h // ..................................................................................e...................................' + // add v9.8h, v9.8h, v25.8h // ...................................................................................e..................................' + // sqrdmulh v28.8h, v12.8h, v18.8h // ..............................................................e.......................................................' 
+ // mul v25.8h, v12.8h, v17.8h // .............................................................e........................................................' + // mla v25.8h, v28.8h, v8.h[0] // ......................................................................e...............................................' + // sub v12.8h, v11.8h, v25.8h // ............................................................................e.........................................' + // add v11.8h, v11.8h, v25.8h // .............................................................................e........................................' + // sqrdmulh v28.8h, v14.8h, v20.8h // .................................................................e....................................................' + // mul v25.8h, v14.8h, v19.8h // ..................................................................e...................................................' + // mla v25.8h, v28.8h, v8.h[0] // ........................................................................e.............................................' + // sub v14.8h, v13.8h, v25.8h // ................................................................................e.....................................' + // add v13.8h, v13.8h, v25.8h // .................................................................................e....................................' + // sqrdmulh v28.8h, v16.8h, v22.8h // ................................................................e.....................................................' + // mul v25.8h, v16.8h, v21.8h // ...............................................................e......................................................' + // mla v25.8h, v28.8h, v8.h[0] // .......................................................................e..............................................' 
+ // sub v16.8h, v15.8h, v25.8h // ..............................................................................e.......................................' + // add v15.8h, v15.8h, v25.8h // ...............................................................................e......................................' + // sqdmulh v26.8h, v9.8h, v8.h[1] // ..........................................................................................e...........................' + // srshr v26.8h, v26.8h, #11 // ..................................................................................................e...................' + // mla v9.8h, v26.8h, v8.h[0] // .........................................................................................................e............' + // sqdmulh v26.8h, v10.8h, v8.h[1] // ...........................................................................................e..........................' + // srshr v26.8h, v26.8h, #11 // ...................................................................................................e..................' + // mla v10.8h, v26.8h, v8.h[0] // ...........................................................................................................e..........' + // sqdmulh v26.8h, v11.8h, v8.h[1] // ......................................................................................e...............................' + // srshr v26.8h, v26.8h, #11 // .............................................................................................e........................' + // mla v11.8h, v26.8h, v8.h[0] // .....................................................................................................e................' + // sqdmulh v26.8h, v12.8h, v8.h[1] // .....................................................................................e................................' 
+ // srshr v26.8h, v26.8h, #11 // ..............................................................................................e.......................' + // mla v12.8h, v26.8h, v8.h[0] // ....................................................................................................e.................' + // sqdmulh v26.8h, v13.8h, v8.h[1] // ........................................................................................e.............................' + // srshr v26.8h, v26.8h, #11 // ................................................................................................e.....................' + // mla v13.8h, v26.8h, v8.h[0] // ..........................................................................................................e...........' + // sqdmulh v26.8h, v14.8h, v8.h[1] // .........................................................................................e............................' + // srshr v26.8h, v26.8h, #11 // .................................................................................................e....................' + // mla v14.8h, v26.8h, v8.h[0] // ........................................................................................................e.............' + // sqdmulh v26.8h, v15.8h, v8.h[1] // .......................................................................................e..............................' + // srshr v26.8h, v26.8h, #11 // ............................................................................................e.........................' + // mla v15.8h, v26.8h, v8.h[0] // .......................................................................................................e..............' + // sqdmulh v26.8h, v16.8h, v8.h[1] // ....................................................................................e.................................' 
+ // srshr v26.8h, v26.8h, #11 // ...............................................................................................e......................' + // mla v16.8h, v26.8h, v8.h[0] // ......................................................................................................e...............' + // trn1 v17.4s, v9.4s, v13.4s // ...................................................................................................................e..' + // trn2 v21.4s, v9.4s, v13.4s // ................................................................................................................e.....' + // trn1 v18.4s, v10.4s, v14.4s // .................................................................................................................e....' + // trn2 v22.4s, v10.4s, v14.4s // ..................................................................................................................e...' + // trn1 v19.4s, v11.4s, v15.4s // ...............................................................................................................e......' + // trn2 v23.4s, v11.4s, v15.4s // ..............................................................................................................e.......' + // trn1 v20.4s, v12.4s, v16.4s // .............................................................................................................e........' + // trn2 v24.4s, v12.4s, v16.4s // ............................................................................................................e.........' + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6], #64 // .....................................................................................................................e' + // st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7], #64 // ....................................................................................................................e.' 
+ + sub count, count, #1 + cbnz count, layer567_start + // Instructions: 0 + // Expected cycles: 0 + // Expected IPC: 0.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_icestorm.s new file mode 100644 index 00000000..98a2fa6a --- /dev/null +++ b/examples/opt/aarch64/ntt_kyber_1234_567_opt_m1_icestorm.s @@ -0,0 +1,1316 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive. NOTE(review): the include line below names no header - confirm the intended file +#include + +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmla d,a,b + mla \d\().8h, \a\().8h, \b\().8h +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmlaq d,a,b,i + mla \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 // t2 = sqrdmulh of src by lane idx1 of const + vmulq \dst, \src, \const, \idx0 // dst = src * lane idx0 of const (low 16 bits per lane) + vmlaq \dst, t2, consts, 0 // dst += t2 * consts.h[0] +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted // t2 = sqrdmulh of src by const_twisted, lane by lane + mul \dst\().8h, \src\().8h, \const\().8h // dst = src * const, lane by lane (low 16 bits) + vmlaq \dst, t2, consts, 0 // dst += t2 * consts.h[0] +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 // tmp = mulmodq of b by the selected lanes of root + sub \b\().8h, \a\().8h, tmp.8h // b = a - tmp, computed before a is overwritten + add \a\().8h, \a\().8h, tmp.8h // a = a + tmp +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted // tmp = mulmod of b by the full root vectors + sub \b\().8h, \a\().8h, tmp.8h // b = a - tmp, computed before a is overwritten + add \a\().8h, \a\().8h, tmp.8h // a = a + tmp +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 // t0 = sqdmulh of a by consts.h[1] + srshr t0.8h, t0.8h, #11 // rounding arithmetic shift right by 11 + vmlaq \a, t0, consts, 0 // a += t0 * consts.h[0] +.endm + +.macro load_roots_123 + ldr qform_root0, [r_ptr0], #32 // post-index: r_ptr0 advances by 32 bytes after the load + ldr qform_root1, [r_ptr0, #-16] // second 16 bytes of the 32-byte chunk just stepped over +.endm + +.macro load_next_roots_45 root0, r_ptr0 + ldr qform_\root0, [\r_ptr0], #16 // post-index: pointer advances by 16 bytes after the load +.endm + +.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr qform_\root0, [ \r_ptr1], #(6*16) // post-index: pointer steps over six 16-byte entries + ldr qform_\root0_tw, [\r_ptr1, #(-6*16 + 1*16)] // negative offsets address entries 1..5 of the block just stepped over + ldr qform_\root1, [ \r_ptr1, #(-6*16 + 2*16)] + ldr qform_\root1_tw, [\r_ptr1, #(-6*16 + 3*16)] + ldr qform_\root2, [ \r_ptr1, #(-6*16 + 4*16)] + ldr qform_\root2_tw, [\r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, 
\data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes: five register pairs plus x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store above; redundant but harmless + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the area reserved by save_gprs +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the area reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // scratch area addressed through the save/restore macros +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + + .global ntt_kyber_1234_567_opt_m1_icestorm + .global _ntt_kyber_1234_567_opt_m1_icestorm // must name the underscore-prefixed label defined below so that it is exported + +.p2align 4 +const_addr: .short -3329 // loaded into consts.h[0] + .short 20159 // loaded into consts.h[1] + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567_opt_m1_icestorm: +_ntt_kyber_1234_567_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + src0 .req x6 + src1 
.req x7 + src2 .req x8 + src3 .req x9 + src4 .req x10 + src5 .req x11 + src6 .req x12 + src7 .req x13 + src8 .req x14 + src9 .req x15 + src10 .req x16 + src11 .req x17 + src12 .req x18 + src13 .req x19 + src14 .req x20 + src15 .req x21 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + data8 .req v17 + data9 .req v18 + data10 .req v19 + data11 .req v20 + data12 .req v21 + data13 .req v22 + data14 .req v23 + data15 .req v24 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + qform_data8 .req q17 + qform_data9 .req q18 + qform_data10 .req q19 + qform_data11 .req q20 + qform_data12 .req q21 + qform_data13 .req q22 + qform_data14 .req q23 + qform_data15 .req q24 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + tmp .req v25 + t0 .req v26 + t1 .req v27 + t2 .req v28 + t3 .req v29 + + consts .req v8 + + ASM_LOAD(r_ptr0, roots) + + 
ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + add src0, x0, #32*0 + add src8, x0, #32*8 + + ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + + mov count, #2 + + .p2align 2 + ldr q15, [x14, #224] // ...............*................................................................................................................................................................................ + ldr q16, [x6, #128] // ....*........................................................................................................................................................................................... + ldr q14, [x6, #32] // .*.............................................................................................................................................................................................. + ldr q27, [x14, #128] // ............*................................................................................................................................................................................... + ldr q26, [x6, #224] // .......*........................................................................................................................................................................................ + ldr q6, [x14, #0] // ........*....................................................................................................................................................................................... + ldr q7, [x14, #96] // ...........*.................................................................................................................................................................................... + ldr q11, [x14, #192] // ..............*................................................................................................................................................................................. 
+ ldr q25, [x6, #96] // ...*............................................................................................................................................................................................ + ldr q31, [x14, #32] // .........*...................................................................................................................................................................................... + mul v13.8H, v15.8H, v0.H[0] // ....................................................*........................................................................................................................................... + sqrdmulh v15.8H, v15.8H, v0.H[1] // ...................................................*............................................................................................................................................ + ldr q29, [x14, #160] // .............*.................................................................................................................................................................................. + mul v22.8H, v27.8H, v0.H[0] // .....................................*.......................................................................................................................................................... + sqrdmulh v27.8H, v27.8H, v0.H[1] // ....................................*........................................................................................................................................................... + ldr q23, [x6, #64] // ..*............................................................................................................................................................................................. 
+ ldr q5, [x6, #192] // ......*......................................................................................................................................................................................... + mul v12.8H, v6.8H, v0.H[0] // .................*.............................................................................................................................................................................. + sqrdmulh v6.8H, v6.8H, v0.H[1] // ................*............................................................................................................................................................................... + ldr q10, [x6, #160] // .....*.......................................................................................................................................................................................... + sqrdmulh v19.8H, v31.8H, v0.H[1] // .....................*.......................................................................................................................................................................... + mla v13.8H, v15.8H, v8.H[0] // .....................................................*.......................................................................................................................................... + mla v12.8H, v6.8H, v8.H[0] // ..................*............................................................................................................................................................................. + mla v22.8H, v27.8H, v8.H[0] // ......................................*......................................................................................................................................................... 
+ mul v4.8H, v31.8H, v0.H[0] // ......................*......................................................................................................................................................................... + mul v17.8H, v11.8H, v0.H[0] // ...............................................*................................................................................................................................................ + sqrdmulh v11.8H, v11.8H, v0.H[1] // ..............................................*................................................................................................................................................. + add v24.8H, v26.8H, v13.8H // .......................................................*........................................................................................................................................ + sqrdmulh v27.8H, v29.8H, v0.H[1] // .........................................*...................................................................................................................................................... + mul v29.8H, v29.8H, v0.H[0] // ..........................................*..................................................................................................................................................... + mul v31.8H, v24.8H, v0.H[2] // ........................................................................*....................................................................................................................... + sqrdmulh v21.8H, v24.8H, v0.H[3] // .......................................................................*........................................................................................................................ 
+ mla v17.8H, v11.8H, v8.H[0] // ................................................*............................................................................................................................................... + mla v4.8H, v19.8H, v8.H[0] // .......................*........................................................................................................................................................................ + mla v29.8H, v27.8H, v8.H[0] // ...........................................*.................................................................................................................................................... + sub v30.8H, v16.8H, v22.8H // .......................................*........................................................................................................................................................ + ldr q24, [x6, #0] // *............................................................................................................................................................................................... + mla v31.8H, v21.8H, v8.H[0] // .........................................................................*...................................................................................................................... + ldr q19, [x14, #64] // ..........*..................................................................................................................................................................................... + sub v27.8H, v10.8H, v29.8H // ............................................*................................................................................................................................................... 
+ sub v21.8H, v14.8H, v4.8H // ........................*....................................................................................................................................................................... + sub v18.8H, v5.8H, v17.8H // .................................................*.............................................................................................................................................. + add v28.8H, v10.8H, v29.8H // .............................................*.................................................................................................................................................. + mul v15.8H, v27.8H, v0.H[4] // ..................................................................................*............................................................................................................. + sqrdmulh v20.8H, v27.8H, v0.H[5] // .................................................................................*.............................................................................................................. + mul v11.8H, v28.8H, v0.H[2] // ..............................................................*................................................................................................................................. + sqrdmulh v28.8H, v28.8H, v0.H[3] // .............................................................*.................................................................................................................................. + sqrdmulh v9.8H, v19.8H, v0.H[1] // ..........................*..................................................................................................................................................................... 
+ mul v19.8H, v19.8H, v0.H[0] // ...........................*.................................................................................................................................................................... + mla v15.8H, v20.8H, v8.H[0] // ...................................................................................*............................................................................................................ + mul v6.8H, v18.8H, v0.H[4] // .......................................................................................*........................................................................................................ + sqrdmulh v29.8H, v18.8H, v0.H[5] // ......................................................................................*......................................................................................................... + mla v11.8H, v28.8H, v8.H[0] // ...............................................................*................................................................................................................................ + add v14.8H, v14.8H, v4.8H // .........................*...................................................................................................................................................................... + mla v19.8H, v9.8H, v8.H[0] // ............................*................................................................................................................................................................... + add v27.8H, v16.8H, v22.8H // ........................................*....................................................................................................................................................... 
+ add v18.8H, v21.8H, v15.8H // .....................................................................................*.......................................................................................................... + add v9.8H, v14.8H, v11.8H // .................................................................*.............................................................................................................................. + sqrdmulh v20.8H, v7.8H, v0.H[1] // ...............................*................................................................................................................................................................ + mul v16.8H, v7.8H, v0.H[0] // ................................*............................................................................................................................................................... + sub v15.8H, v21.8H, v15.8H // ....................................................................................*........................................................................................................... + add v7.8H, v5.8H, v17.8H // ..................................................*............................................................................................................................................. + sub v4.8H, v26.8H, v13.8H // ......................................................*......................................................................................................................................... + sqrdmulh v21.8H, v27.8H, v0.H[3] // ........................................................*....................................................................................................................................... 
+ mla v6.8H, v29.8H, v8.H[0] // ........................................................................................*....................................................................................................... + mla v16.8H, v20.8H, v8.H[0] // .................................*.............................................................................................................................................................. + mul v26.8H, v4.8H, v0.H[4] // ............................................................................................*................................................................................................... + sqrdmulh v17.8H, v4.8H, v0.H[5] // ...........................................................................................*.................................................................................................... + sqrdmulh v4.8H, v30.8H, v0.H[5] // ............................................................................*................................................................................................................... + sub v20.8H, v23.8H, v19.8H // .............................*.................................................................................................................................................................. + sqrdmulh v5.8H, v7.8H, v0.H[3] // ..................................................................*............................................................................................................................. + add v29.8H, v25.8H, v16.8H // ...................................*............................................................................................................................................................ 
+ mla v26.8H, v17.8H, v8.H[0] // .............................................................................................*.................................................................................................. + mul v10.8H, v27.8H, v0.H[2] // .........................................................*...................................................................................................................................... + sub v25.8H, v25.8H, v16.8H // ..................................*............................................................................................................................................................. + sub v17.8H, v29.8H, v31.8H // ..........................................................................*..................................................................................................................... + add v27.8H, v29.8H, v31.8H // ...........................................................................*.................................................................................................................... + add v13.8H, v23.8H, v19.8H // ..............................*................................................................................................................................................................. + sub v16.8H, v24.8H, v12.8H // ...................*............................................................................................................................................................................ + sqrdmulh v29.8H, v27.8H, v0.H[7] // .....................................................................................................*.......................................................................................... 
+ mul v31.8H, v27.8H, v0.H[6] // ......................................................................................................*......................................................................................... + add v27.8H, v24.8H, v12.8H // ....................*........................................................................................................................................................................... + mul v22.8H, v30.8H, v0.H[4] // .............................................................................*.................................................................................................................. + mul v12.8H, v7.8H, v0.H[2] // ...................................................................*............................................................................................................................ + sub v19.8H, v25.8H, v26.8H // ..............................................................................................*................................................................................................. + add v26.8H, v25.8H, v26.8H // ...............................................................................................*................................................................................................ + mla v31.8H, v29.8H, v8.H[0] // .......................................................................................................*........................................................................................ + mla v22.8H, v4.8H, v8.H[0] // ..............................................................................*................................................................................................................. 
+ sub v14.8H, v14.8H, v11.8H // ................................................................*............................................................................................................................... + mla v12.8H, v5.8H, v8.H[0] // ....................................................................*........................................................................................................................... + sub v30.8H, v20.8H, v6.8H // .........................................................................................*...................................................................................................... + sqrdmulh v23.8H, v26.8H, v1.H[3] // .........................................................................................................................*...................................................................... + add v7.8H, v9.8H, v31.8H // .........................................................................................................*...................................................................................... + add v11.8H, v16.8H, v22.8H // ................................................................................*............................................................................................................... + mul v25.8H, v26.8H, v1.H[2] // ..........................................................................................................................*..................................................................... + sqrdmulh v28.8H, v7.8H, v1.H[7] // ........................................................................................................................................*....................................................... 
+ sub count, count, #1 +layer1234_start: + // Instructions: 192 + // Expected cycles: 48 + // Expected IPC: 4.00 + + // -------------------------------------------------------------------------------------- original position --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + sqrdmulh v29.8H, v19.8H, v1.H[5] // ...................................................*............................................................................................................................................ + mul v5.8H, v19.8H, v1.H[4] // .....................................................*.......................................................................................................................................... + mla v10.8H, v21.8H, v8.H[0] // ..*............................................................................................................................................................................................. + mla v25.8H, v23.8H, v8.H[0] // ...............................*................................................................................................................................................................ + mul v23.8H, v7.8H, v1.H[6] // ......................................*......................................................................................................................................................... + sub v4.8H, v16.8H, v22.8H // ................................................................*............................................................................................................................... 
+ sqrdmulh v24.8H, v30.8H, v1.H[5] // ..............................................................*................................................................................................................................. + mul v7.8H, v30.8H, v1.H[4] // ...................................................................*............................................................................................................................ + mla v5.8H, v29.8H, v8.H[0] // ...........................................................*.................................................................................................................................... + sub v29.8H, v27.8H, v10.8H // ...................*............................................................................................................................................................................ + add v30.8H, v27.8H, v10.8H // ...........*.................................................................................................................................................................................... + mla v23.8H, v28.8H, v8.H[0] // ..................................................*............................................................................................................................................. + mla v7.8H, v24.8H, v8.H[0] // ..........................................................................*..................................................................................................................... + mul v24.8H, v17.8H, v1.H[0] // .*.............................................................................................................................................................................................. 
+ add v27.8H, v20.8H, v6.8H // .....*.......................................................................................................................................................................................... + sub v28.8H, v15.8H, v5.8H // ....................................................................*........................................................................................................................... + sqrdmulh v17.8H, v17.8H, v1.H[1] // *............................................................................................................................................................................................... + sub v26.8H, v18.8H, v25.8H // ...........................................*.................................................................................................................................................... + sqrdmulh v10.8H, v28.8H, v3.H[5] // ........................................................................*....................................................................................................................... + sqrdmulh v21.8H, v27.8H, v1.H[3] // ....................................*........................................................................................................................................................... + sub v20.8H, v4.8H, v7.8H // .................................................................................*.............................................................................................................. + mul v19.8H, v28.8H, v3.H[4] // .........................................................................*...................................................................................................................... 
+ add v16.8H, v15.8H, v5.8H // .....................................................................*.......................................................................................................................... + ldr q5, [x14, #176] // ............................................................................................................*................................................................................... + mla v24.8H, v17.8H, v8.H[0] // .......*........................................................................................................................................................................................ + mul v22.8H, v27.8H, v1.H[2] // ..................................*............................................................................................................................................................. + sqrdmulh v27.8H, v16.8H, v3.H[3] // ...........................................................................*.................................................................................................................... + add v15.8H, v18.8H, v25.8H // .......................................*........................................................................................................................................................ + mla v19.8H, v10.8H, v8.H[0] // ...............................................................................*................................................................................................................ + mul v17.8H, v16.8H, v3.H[2] // .............................................................................*.................................................................................................................. 
+ mla v22.8H, v21.8H, v8.H[0] // ..........................................*..................................................................................................................................................... + sub v18.8H, v14.8H, v24.8H // .....................*.......................................................................................................................................................................... + mul v10.8H, v15.8H, v2.H[6] // .............................................*.................................................................................................................................................. + add v24.8H, v14.8H, v24.8H // .............*.................................................................................................................................................................................. + add v21.8H, v20.8H, v19.8H // .......................................................................................*........................................................................................................ + sqrdmulh v16.8H, v26.8H, v3.H[1] // ...............................................*................................................................................................................................................ + mla v17.8H, v27.8H, v8.H[0] // .....................................................................................*.......................................................................................................... + sub v25.8H, v20.8H, v19.8H // .........................................................................................*...................................................................................................... 
+ sub v19.8H, v9.8H, v31.8H // ........*....................................................................................................................................................................................... + str q25, [x14, #224] // .............................................................................................*.................................................................................................. + ldr q25, [x6, #176] // ...................................................................................................................*............................................................................ + add v6.8H, v13.8H, v12.8H // ....*........................................................................................................................................................................................... + add v28.8H, v4.8H, v7.8H // ....................................................................................*........................................................................................................... + ldr q9, [x14, #208] // .......................................................................................................*........................................................................................ + mul v31.8H, v5.8H, v0.H[0] // .............................................................................................................................*.................................................................. + str q21, [x14, #192] // ...........................................................................................*.................................................................................................... + sqrdmulh v7.8H, v6.8H, v0.H[7] // ...............*................................................................................................................................................................................ 
+ mul v6.8H, v6.8H, v0.H[6] // ..........*..................................................................................................................................................................................... + sqrdmulh v14.8H, v19.8H, v2.H[1] // ............*................................................................................................................................................................................... + sqrdmulh v4.8H, v5.8H, v0.H[1] // ............................................................................................................................*................................................................... + mul v5.8H, v19.8H, v2.H[0] // ..................*............................................................................................................................................................................. + sub v12.8H, v13.8H, v12.8H // ...*............................................................................................................................................................................................ + mla v6.8H, v7.8H, v8.H[0] // ....................*........................................................................................................................................................................... + mul v7.8H, v26.8H, v3.H[0] // .................................................*.............................................................................................................................................. + sqrdmulh v19.8H, v9.8H, v0.H[1] // ..........................................................................................................................*..................................................................... 
+ sqrdmulh v26.8H, v18.8H, v2.H[5] // .........................*...................................................................................................................................................................... + mla v5.8H, v14.8H, v8.H[0] // ........................*....................................................................................................................................................................... + mul v13.8H, v18.8H, v2.H[4] // ...........................*.................................................................................................................................................................... + mla v31.8H, v4.8H, v8.H[0] // ..................................................................................................................................*............................................................. + sqrdmulh v14.8H, v15.8H, v2.H[7] // ..............................................*................................................................................................................................................. + ldr q18, [x14, #240] // ................................................................................................*............................................................................................... + sqrdmulh v15.8H, v24.8H, v2.H[3] // .................*.............................................................................................................................................................................. + add v21.8H, v30.8H, v6.8H // ............................*................................................................................................................................................................... 
+ sub v27.8H, v30.8H, v6.8H // ..........................*..................................................................................................................................................................... + mul v24.8H, v24.8H, v2.H[2] // ................*............................................................................................................................................................................... + ldr q30, [x14, #112] // ......................................................................................................*......................................................................................... + add v6.8H, v25.8H, v31.8H // ..........................................................................................................................................*..................................................... + sub v31.8H, v25.8H, v31.8H // .......................................................................................................................................*........................................................ + mul v4.8H, v12.8H, v1.H[0] // ......*......................................................................................................................................................................................... + add v20.8H, v11.8H, v22.8H // .......................................................*........................................................................................................................................ + mla v7.8H, v16.8H, v8.H[0] // ......................................................*......................................................................................................................................... 
+ sub v11.8H, v11.8H, v22.8H // .................................................................*.............................................................................................................................. + mul v25.8H, v30.8H, v0.H[0] // ...........................................................................................................................................................*.................................... + sqrdmulh v22.8H, v12.8H, v1.H[1] // .........*...................................................................................................................................................................................... + sub v12.8H, v28.8H, v17.8H // ............................................................................................*................................................................................................... + mla v24.8H, v15.8H, v8.H[0] // .......................*........................................................................................................................................................................ + add v17.8H, v28.8H, v17.8H // ..........................................................................................*..................................................................................................... + mla v4.8H, v22.8H, v8.H[0] // ..............*................................................................................................................................................................................. + mla v10.8H, v14.8H, v8.H[0] // ....................................................*........................................................................................................................................... 
+ ldr q16, [x6, #144] // .................................................................................................*.............................................................................................. + ldr q22, [x14, #80] // ......................................................................................................................................*......................................................... + mul v9.8H, v9.8H, v0.H[0] // .........................................................................................................................*...................................................................... + str q12, [x14, #160] // ...............................................................................................*................................................................................................ + sub v28.8H, v11.8H, v7.8H // ..............................................................................*................................................................................................................. + ldr q12, [x14, #144] // ...................................................................................................*............................................................................................ + add v7.8H, v11.8H, v7.8H // ......................................................................*......................................................................................................................... + mla v13.8H, v26.8H, v8.H[0] // .................................*.............................................................................................................................................................. + str q17, [x14, #128] // ..............................................................................................*................................................................................................. 
+ ldr q17, [x6, #208] // ................................................................................................................*............................................................................... + sub v26.8H, v29.8H, v4.8H // .....................................*.......................................................................................................................................................... + add v29.8H, v29.8H, v4.8H // ......................*......................................................................................................................................................................... + str q28, [x14, #96] // ........................................................................................*....................................................................................................... + ldr q15, [x14, #16] // .....................................................................................................*.......................................................................................... + str q7, [x14, #64] // ..................................................................................*............................................................................................................. + sqrdmulh v11.8H, v22.8H, v0.H[1] // ...............................................................................................................................................*................................................ + mul v4.8H, v22.8H, v0.H[0] // ................................................................................................................................................*............................................... + sub v14.8H, v29.8H, v24.8H // ............................................................................*................................................................................................................... 
+ sqrdmulh v22.8H, v31.8H, v0.H[5] // ............................................................................................................................................*................................................... + mul v28.8H, v31.8H, v0.H[4] // ...........................................................................................................................................*.................................................... + sqrdmulh v31.8H, v30.8H, v0.H[1] // ..........................................................................................................................................................*..................................... + add v7.8H, v29.8H, v24.8H // .............................*.................................................................................................................................................................. + str q14, [x6, #160] // ...................................................................................*............................................................................................................ + mul v24.8H, v12.8H, v0.H[0] // .............................................................................................................*.................................................................................. + mla v4.8H, v11.8H, v8.H[0] // ......................................................................................................................................................*......................................... + sqrdmulh v14.8H, v18.8H, v0.H[1] // ...........................................................................................................*.................................................................................... 
+ str q7, [x6, #128] // ................................*............................................................................................................................................................... + sqrdmulh v11.8H, v15.8H, v0.H[1] // ..................................................................................................................*............................................................................. + mul v15.8H, v15.8H, v0.H[0] // .................................................................................................................*.............................................................................. + mla v28.8H, v22.8H, v8.H[0] // .................................................................................................................................................*.............................................. + mul v22.8H, v18.8H, v0.H[0] // ..........................................................................................................*..................................................................................... + mla v9.8H, v19.8H, v8.H[0] // ................................................................................................................................*............................................................... + sqrdmulh v12.8H, v12.8H, v0.H[1] // ..............................................................................................................*................................................................................. + ldr q19, [x6, #16] // ....................................................................................................................................*........................................................... 
+ mla v15.8H, v11.8H, v8.H[0] // ......................................................................................................................*......................................................................... + add v7.8H, v27.8H, v5.8H // ........................................................*....................................................................................................................................... + ldr q29, [x6, #240] // ....................................................................................................*........................................................................................... + mla v22.8H, v14.8H, v8.H[0] // .....................................................................................................................*.......................................................................... + sub v27.8H, v27.8H, v5.8H // ..............................*................................................................................................................................................................. + mla v24.8H, v12.8H, v8.H[0] // .......................................................................................................................*........................................................................ + str q7, [x6, #64] // .......................................................................*........................................................................................................................ + sub v11.8H, v17.8H, v9.8H // .........................................................................................................................................*...................................................... + str q27, [x6, #96] // ...................................*............................................................................................................................................................ 
+ add v27.8H, v19.8H, v15.8H // .................................................................................................................................................................................*.............. + add v12.8H, v17.8H, v9.8H // .............................................................................................................................................................*.................................. + sub v14.8H, v26.8H, v13.8H // ........................................*....................................................................................................................................................... + add v17.8H, v29.8H, v22.8H // ...........................................................................................................................*.................................................................... + add v7.8H, v16.8H, v24.8H // .......................................................................................................................................................*........................................ + add v18.8H, v26.8H, v13.8H // .........................................*...................................................................................................................................................... + sub v13.8H, v16.8H, v24.8H // ...................................................................................................................................*............................................................ + sub v26.8H, v29.8H, v22.8H // ..............................................................................................................................................................*................................. 
+ sqrdmulh v24.8H, v12.8H, v0.H[3] // ......................................................................................................................................................................*......................... + str q14, [x6, #224] // ................................................*............................................................................................................................................... + add v14.8H, v20.8H, v10.8H // ..........................................................*..................................................................................................................................... + ldr q5, [x6, #112] // ........................................................................................................*....................................................................................... + sqrdmulh v30.8H, v13.8H, v0.H[5] // ....................................................................................................................................................................*........................... + mla v25.8H, v31.8H, v8.H[0] // .................................................................................................................................................................*.............................. + sub v16.8H, v19.8H, v15.8H // ..............................................................................................................................................................................*................. + str q18, [x6, #192] // ............................................*................................................................................................................................................... 
+ mul v15.8H, v17.8H, v0.H[2] // ..............................................................................................................................*................................................................. + ldr q18, [x14, #48] // .........................................................................................................*...................................................................................... + mul v22.8H, v13.8H, v0.H[4] // ..................................................................................................................................................................................*............. + str q14, [x14], #16 // ...............................................................*................................................................................................................................ + sqrdmulh v14.8H, v17.8H, v0.H[3] // ...............................................................................................................................*................................................................ + sub v19.8H, v20.8H, v10.8H // ............................................................*................................................................................................................................... + mul v10.8H, v7.8H, v0.H[2] // .........................................................................................................................................................................*...................... + ldr q13, [x6, #80] // ...............................................................................................................*................................................................................ 
+ mul v31.8H, v26.8H, v0.H[4] // ..................................................................................................................................................................*............................. + sqrdmulh v26.8H, v26.8H, v0.H[5] // ...................................................................................................................................................................*............................ + mla v15.8H, v14.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + add v17.8H, v5.8H, v25.8H // .......................................................................................................................................................................*........................ + str q19, [x14, #16] // ..................................................................*............................................................................................................................. + sqrdmulh v20.8H, v6.8H, v0.H[3] // ..............................................................................................................................................*................................................. + sqrdmulh v9.8H, v18.8H, v0.H[1] // ....................................................................................................................*........................................................................... + mla v31.8H, v26.8H, v8.H[0] // ........................................................................................................................................................................*....................... 
+ mul v6.8H, v6.8H, v0.H[2] // .............................................................................................................................................*.................................................. + mla v22.8H, v30.8H, v8.H[0] // .......................................................................................................................................................................................*........ + add v29.8H, v17.8H, v15.8H // ............................................................................................................................................................................*................... + sub v17.8H, v17.8H, v15.8H // ...........................................................................................................................................................................*.................... + mul v15.8H, v18.8H, v0.H[0] // ........................................................................................................................*....................................................................... + mla v6.8H, v20.8H, v8.H[0] // ....................................................................................................................................................*........................................... + sub v26.8H, v21.8H, v23.8H // ................................................................................*............................................................................................................... + mul v12.8H, v12.8H, v0.H[2] // ...................................................................................................................................................................................*............ 
+ add v14.8H, v21.8H, v23.8H // .........................................................*...................................................................................................................................... + ldr q21, [x6, #48] // ..................................................................................................*............................................................................................. + str q26, [x6, #32] // ......................................................................................*......................................................................................................... + sub v25.8H, v5.8H, v25.8H // ..........................................................................................................................................................................*..................... + mla v15.8H, v9.8H, v8.H[0] // .................................................................................................................................*.............................................................. + sqrdmulh v30.8H, v29.8H, v0.H[7] // ...............................................................................................................................................................................*................ + str q14, [x6], #16 // .............................................................*.................................................................................................................................. + sqrdmulh v23.8H, v11.8H, v0.H[5] // ...................................................................................................................................................*............................................ 
+ add v26.8H, v25.8H, v31.8H // .....................................................................................................................................................................................*.......... + mla v12.8H, v24.8H, v8.H[0] // .........................................................................................................................................................................................*...... + add v5.8H, v21.8H, v15.8H // .....................................................................................................................................................*.......................................... + sub v19.8H, v25.8H, v31.8H // ....................................................................................................................................................................................*........... + mul v25.8H, v26.8H, v1.H[2] // ..............................................................................................................................................................................................*. + mul v31.8H, v29.8H, v0.H[6] // ................................................................................................................................................................................*............... + sub v20.8H, v21.8H, v15.8H // ........................................................................................................................................*....................................................... + sub v14.8H, v5.8H, v6.8H // ........................................................................................................................................................................................*....... 
+ add v9.8H, v5.8H, v6.8H // .........................................................................................................................................................*...................................... + mul v6.8H, v11.8H, v0.H[4] // ..................................................................................................................................................*............................................. + add v18.8H, v20.8H, v28.8H // ........................................................................................................................................................*....................................... + mla v31.8H, v30.8H, v8.H[0] // ......................................................................................................................................................................................*......... + sub v15.8H, v20.8H, v28.8H // ............................................................................................................................................................*................................... + sub v20.8H, v13.8H, v4.8H // .....................................................................................................................................................................*.......................... + mla v6.8H, v23.8H, v8.H[0] // ................................................................................................................................................................*............................... + add v11.8H, v16.8H, v22.8H // .............................................................................................................................................................................................*.. 
+ sqrdmulh v21.8H, v7.8H, v0.H[3] // ...............................................................................................................................................................*................................ + add v7.8H, v9.8H, v31.8H // ............................................................................................................................................................................................*... + add v13.8H, v13.8H, v4.8H // .............................................................................................................................................................................*.................. + sqrdmulh v23.8H, v26.8H, v1.H[3] // ...........................................................................................................................................................................................*.... + sub v30.8H, v20.8H, v6.8H // ..........................................................................................................................................................................................*..... 
+ sqrdmulh v28.8H, v7.8H, v1.H[7] // ...............................................................................................................................................................................................* + + // ---------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // sqrdmulh v26.8H, v17.8H, v1.H[1] // ................*............................................................................................................................................................................... + // mul v4.8H, v17.8H, v1.H[0] // .............*.................................................................................................................................................................................. + // mla v10.8H, v21.8H, v8.H[0] // ..*............................................................................................................................................................................................. + // sub v17.8H, v13.8H, v12.8H // ...................................................*............................................................................................................................................ + // add v13.8H, v13.8H, v12.8H // .........................................*...................................................................................................................................................... 
+ // add v21.8H, v20.8H, v6.8H // ..............*................................................................................................................................................................................. + // mul v24.8H, v17.8H, v1.H[0] // ....................................................................*........................................................................................................................... + // mla v4.8H, v26.8H, v8.H[0] // ........................*....................................................................................................................................................................... + // sub v29.8H, v9.8H, v31.8H // ......................................*......................................................................................................................................................... + // sqrdmulh v31.8H, v17.8H, v1.H[1] // .........................................................................*...................................................................................................................... + // mul v6.8H, v13.8H, v0.H[6] // ...............................................*................................................................................................................................................ + // add v9.8H, v27.8H, v10.8H // ..........*..................................................................................................................................................................................... + // sqrdmulh v17.8H, v29.8H, v2.H[1] // ................................................*............................................................................................................................................... 
+ // add v26.8H, v14.8H, v4.8H // .................................*.............................................................................................................................................................. + // mla v24.8H, v31.8H, v8.H[0] // .............................................................................*.................................................................................................................. + // sqrdmulh v5.8H, v13.8H, v0.H[7] // ..............................................*................................................................................................................................................. + // mul v13.8H, v26.8H, v2.H[2] // ................................................................*............................................................................................................................... + // sqrdmulh v26.8H, v26.8H, v2.H[3] // .............................................................*.................................................................................................................................. + // mul v31.8H, v29.8H, v2.H[0] // ..................................................*............................................................................................................................................. + // sub v10.8H, v27.8H, v10.8H // .........*...................................................................................................................................................................................... + // mla v6.8H, v5.8H, v8.H[0] // ....................................................*........................................................................................................................................... 
+ // sub v29.8H, v14.8H, v4.8H // ...............................*................................................................................................................................................................ + // add v14.8H, v10.8H, v24.8H // ..........................................................................................*..................................................................................................... + // mla v13.8H, v26.8H, v8.H[0] // ...........................................................................*.................................................................................................................... + // mla v31.8H, v17.8H, v8.H[0] // ........................................................*....................................................................................................................................... + // sqrdmulh v5.8H, v29.8H, v2.H[5] // .......................................................*........................................................................................................................................ + // sub v12.8H, v9.8H, v6.8H // ...............................................................*................................................................................................................................ + // mul v29.8H, v29.8H, v2.H[4] // .........................................................*...................................................................................................................................... + // add v6.8H, v9.8H, v6.8H // ..............................................................*................................................................................................................................. 
+ // add v27.8H, v14.8H, v13.8H // ....................................................................................................*........................................................................................... + // sub v26.8H, v12.8H, v31.8H // .....................................................................................................................*.......................................................................... + // mla v25.8H, v23.8H, v8.H[0] // ...*............................................................................................................................................................................................ + // str q27, [x6, #128] // .........................................................................................................*...................................................................................... + // mla v29.8H, v5.8H, v8.H[0] // ......................................................................................*......................................................................................................... + // mul v9.8H, v21.8H, v1.H[2] // .........................*...................................................................................................................................................................... + // str q26, [x6, #96] // .........................................................................................................................*...................................................................... + // sqrdmulh v23.8H, v21.8H, v1.H[3] // ...................*............................................................................................................................................................................ 
+ // sub v17.8H, v10.8H, v24.8H // .........................................................................................*...................................................................................................... + // mul v4.8H, v7.8H, v1.H[6] // ....*........................................................................................................................................................................................... + // add v7.8H, v18.8H, v25.8H // ...........................*.................................................................................................................................................................... + // sub v26.8H, v17.8H, v29.8H // ............................................................................................................................*................................................................... + // add v27.8H, v17.8H, v29.8H // ...............................................................................................................................*................................................................ + // mla v9.8H, v23.8H, v8.H[0] // ..............................*................................................................................................................................................................. + // sub v25.8H, v18.8H, v25.8H // .................*.............................................................................................................................................................................. + // str q27, [x6, #192] // .........................................................................................................................................*...................................................... 
+ // mul v17.8H, v7.8H, v2.H[6] // ................................*............................................................................................................................................................... + // sqrdmulh v29.8H, v7.8H, v2.H[7] // ...........................................................*.................................................................................................................................... + // sqrdmulh v27.8H, v25.8H, v3.H[1] // ...................................*............................................................................................................................................................ + // str q26, [x6, #224] // ...................................................................................................................................*............................................................ + // mul v23.8H, v25.8H, v3.H[0] // .....................................................*.......................................................................................................................................... + // mla v4.8H, v28.8H, v8.H[0] // ...........*.................................................................................................................................................................................... + // sqrdmulh v24.8H, v19.8H, v1.H[5] // *............................................................................................................................................................................................... + // mla v17.8H, v29.8H, v8.H[0] // ..............................................................................*................................................................................................................. 
+ // mul v19.8H, v19.8H, v1.H[4] // .*.............................................................................................................................................................................................. + // mla v23.8H, v27.8H, v8.H[0] // ......................................................................*......................................................................................................................... + // add v7.8H, v11.8H, v9.8H // .....................................................................*.......................................................................................................................... + // add v29.8H, v12.8H, v31.8H // ..................................................................................................................*............................................................................. + // add v25.8H, v6.8H, v4.8H // ..................................................................................................................................................................*............................. + // add v26.8H, v7.8H, v17.8H // ....................................................................................................................................*........................................................... + // mla v19.8H, v24.8H, v8.H[0] // ........*....................................................................................................................................................................................... + // sub v5.8H, v7.8H, v17.8H // ...............................................................................................................................................*................................................ 
+ // str q25, [x6], #16 // ........................................................................................................................................................................*....................... + // sqrdmulh v17.8H, v30.8H, v1.H[5] // ......*......................................................................................................................................................................................... + // str q26, [x14], #16 // .............................................................................................................................................*.................................................. + // sub v31.8H, v16.8H, v22.8H // .....*.......................................................................................................................................................................................... + // sub v7.8H, v11.8H, v9.8H // .......................................................................*........................................................................................................................ + // str q5, [x14, #16] // ......................................................................................................................................................*......................................... + // mul v25.8H, v30.8H, v1.H[4] // .......*........................................................................................................................................................................................ + // sub v27.8H, v15.8H, v19.8H // ...............*................................................................................................................................................................................ 
+ // add v15.8H, v15.8H, v19.8H // ......................*......................................................................................................................................................................... + // add v16.8H, v7.8H, v23.8H // .....................................................................................*.......................................................................................................... + // str q29, [x6, #48] // .......................................................................................................................*........................................................................ + // sqrdmulh v11.8H, v27.8H, v3.H[5] // ..................*............................................................................................................................................................................. + // mul v26.8H, v27.8H, v3.H[4] // .....................*.......................................................................................................................................................................... + // mla v25.8H, v17.8H, v8.H[0] // ............*................................................................................................................................................................................... + // sqrdmulh v17.8H, v15.8H, v3.H[3] // ..........................*..................................................................................................................................................................... + // sub v27.8H, v14.8H, v13.8H // ................................................................................................*............................................................................................... 
+ // mul v29.8H, v15.8H, v3.H[2] // .............................*.................................................................................................................................................................. + // sub v14.8H, v7.8H, v23.8H // ...................................................................................*............................................................................................................ + // mla v26.8H, v11.8H, v8.H[0] // ............................*................................................................................................................................................................... + // sub v22.8H, v6.8H, v4.8H // ................................................................................................................................................................*............................... + // sub v7.8H, v31.8H, v25.8H // ....................*........................................................................................................................................................................... + // str q16, [x14, #48] // .............................................................................................*.................................................................................................. + // str q27, [x6, #144] // .....................................................................................................*.......................................................................................... + // add v6.8H, v31.8H, v25.8H // ..........................................*..................................................................................................................................................... 
+ // mla v29.8H, v17.8H, v8.H[0] // ....................................*........................................................................................................................................................... + // str q22, [x6, #16] // ....................................................................................................................................................................*........................... + // add v15.8H, v7.8H, v26.8H // ..................................*............................................................................................................................................................. + // str q14, [x14, #80] // ...........................................................................................*.................................................................................................... + // sub v16.8H, v7.8H, v26.8H // .....................................*.......................................................................................................................................................... + // add v27.8H, v6.8H, v29.8H // ............................................................................*................................................................................................................... + // str q15, [x14, #176] // .............................................*.................................................................................................................................................. + // sub v15.8H, v6.8H, v29.8H // ..........................................................................*..................................................................................................................... 
+ // str q16, [x14, #208] // .......................................*........................................................................................................................................................ + // str q27, [x14, #112] // .......................................................................................*........................................................................................................ + // str q15, [x14, #144] // ..................................................................................*............................................................................................................. + // ldr q15, [x14, #224] // ............................................................*................................................................................................................................... + // ldr q16, [x6, #128] // ...............................................................................*................................................................................................................ + // ldr q14, [x6, #32] // ...................................................................................................................................................................*............................ + // ldr q27, [x14, #128] // ....................................................................................*........................................................................................................... + // ldr q26, [x6, #224] // ...................................................................................................................*............................................................................ + // ldr q6, [x14, #0] // ............................................................................................*................................................................................................... 
+ // ldr q7, [x14, #96] // .................................................................*.............................................................................................................................. + // ldr q11, [x14, #192] // ...........................................*.................................................................................................................................................... + // ldr q25, [x6, #96] // .....................................................................................................................................*.......................................................... + // ldr q31, [x14, #32] // ...........................................................................................................................................*.................................................... + // mul v13.8H, v15.8H, v0.H[0] // .............................................................................................................*.................................................................................. + // sqrdmulh v15.8H, v15.8H, v0.H[1] // ........................................................................................................*....................................................................................... + // ldr q29, [x14, #160] // .......................*........................................................................................................................................................................ + // mul v22.8H, v27.8H, v0.H[0] // ......................................................................................................*......................................................................................... 
+ // sqrdmulh v27.8H, v27.8H, v0.H[1] // ...............................................................................................................*................................................................................ + // ldr q23, [x6, #64] // .................................................................................................................................................*.............................................. + // ldr q5, [x6, #192] // ........................................................................................*....................................................................................................... + // mul v12.8H, v6.8H, v0.H[0] // ...........................................................................................................*.................................................................................... + // sqrdmulh v6.8H, v6.8H, v0.H[1] // ..........................................................................................................*..................................................................................... + // ldr q10, [x6, #160] // ........................................*....................................................................................................................................................... + // sqrdmulh v19.8H, v31.8H, v0.H[1] // ........................................................................................................................................................*....................................... + // mla v13.8H, v15.8H, v8.H[0] // ....................................................................................................................*........................................................................... 
+ // mla v12.8H, v6.8H, v8.H[0] // .................................................................................................................*.............................................................................. + // mla v22.8H, v27.8H, v8.H[0] // ......................................................................................................................*......................................................................... + // mul v4.8H, v31.8H, v0.H[0] // ..............................................................................................................................................................*................................. + // mul v17.8H, v11.8H, v0.H[0] // .................................................................................*.............................................................................................................. + // sqrdmulh v11.8H, v11.8H, v0.H[1] // ......................................................*......................................................................................................................................... + // add v24.8H, v26.8H, v13.8H // .............................................................................................................................*.................................................................. + // sqrdmulh v27.8H, v29.8H, v0.H[1] // .................................................*.............................................................................................................................................. + // mul v29.8H, v29.8H, v0.H[0] // ............................................*................................................................................................................................................... 
+ // mul v31.8H, v24.8H, v0.H[2] // ..........................................................................................................................................*..................................................... + // sqrdmulh v21.8H, v24.8H, v0.H[3] // ..............................................................................................................................................*................................................. + // mla v17.8H, v11.8H, v8.H[0] // ..............................................................................................................*................................................................................. + // mla v4.8H, v19.8H, v8.H[0] // ......................................................................................................................................................................*......................... + // mla v29.8H, v27.8H, v8.H[0] // ..........................................................*..................................................................................................................................... + // sub v30.8H, v16.8H, v22.8H // ................................................................................................................................*............................................................... + // ldr q24, [x6, #0] // ................................................................................................................*............................................................................... + // mla v31.8H, v21.8H, v8.H[0] // ....................................................................................................................................................*........................................... 
+ // ldr q19, [x14, #64] // ................................................................................*............................................................................................................... + // sub v27.8H, v10.8H, v29.8H // ...................................................................*............................................................................................................................ + // sub v21.8H, v14.8H, v4.8H // ................................................................................................................................................................................*............... + // sub v18.8H, v5.8H, v17.8H // ........................................................................................................................*....................................................................... + // add v28.8H, v10.8H, v29.8H // ..................................................................*............................................................................................................................. + // mul v15.8H, v27.8H, v0.H[4] // ..................................................................................................*............................................................................................. + // sqrdmulh v20.8H, v27.8H, v0.H[5] // .................................................................................................*.............................................................................................. + // mul v11.8H, v28.8H, v0.H[2] // ..........................................................................................................................................................*..................................... 
+ // sqrdmulh v28.8H, v28.8H, v0.H[3] // .......................................................................................................................................................*........................................ + // sqrdmulh v9.8H, v19.8H, v0.H[1] // ..............................................................................................*................................................................................................. + // mul v19.8H, v19.8H, v0.H[0] // ...............................................................................................*................................................................................................ + // mla v15.8H, v20.8H, v8.H[0] // ............................................................................................................*................................................................................... + // mul v6.8H, v18.8H, v0.H[4] // ...................................................................................................................................................................................*............ + // sqrdmulh v29.8H, v18.8H, v0.H[5] // .........................................................................................................................................................................*...................... + // mla v11.8H, v28.8H, v8.H[0] // ...............................................................................................................................................................*................................ + // add v14.8H, v14.8H, v4.8H // ............................................................................................................................................................................*................... 
+ // mla v19.8H, v9.8H, v8.H[0] // .......................................................................................................*........................................................................................ + // add v27.8H, v16.8H, v22.8H // ..............................................................................................................................*................................................................. + // add v18.8H, v21.8H, v15.8H // ....................................................................................................................................................................................*........... + // add v9.8H, v14.8H, v11.8H // ..................................................................................................................................................................................*............. + // sqrdmulh v20.8H, v7.8H, v0.H[1] // ...................................................................................................*............................................................................................ + // mul v16.8H, v7.8H, v0.H[0] // ........................................................................*....................................................................................................................... + // sub v15.8H, v21.8H, v15.8H // ......................................................................................................................................................................................*......... + // add v7.8H, v5.8H, v17.8H // ...........................................................................................................................*.................................................................... 
+ // sub v4.8H, v26.8H, v13.8H // .................................................................................................................................*.............................................................. + // sqrdmulh v21.8H, v27.8H, v0.H[3] // ..........................................................................................................................................................................................*..... + // mla v6.8H, v29.8H, v8.H[0] // ........................................................................................................................................................................................*....... + // mla v16.8H, v20.8H, v8.H[0] // .......................................................................................................................................*........................................................ + // mul v26.8H, v4.8H, v0.H[4] // ..................................................................................................................................................*............................................. + // sqrdmulh v17.8H, v4.8H, v0.H[5] // ...................................................................................................................................................*............................................ + // sqrdmulh v4.8H, v30.8H, v0.H[5] // ......................................................................................................................................*......................................................... + // sub v20.8H, v23.8H, v19.8H // .......................................................................................................................................................................................*........ 
+ // sqrdmulh v5.8H, v7.8H, v0.H[3] // ..................................................................................................................................*............................................................. + // add v29.8H, v25.8H, v16.8H // .....................................................................................................................................................*.......................................... + // mla v26.8H, v17.8H, v8.H[0] // .........................................................................................................................................................*...................................... + // mul v10.8H, v27.8H, v0.H[2] // ................................................................................................................................................*............................................... + // sub v25.8H, v25.8H, v16.8H // .....................................................................................................................................................................*.......................... + // sub v17.8H, v29.8H, v31.8H // .............................................................................................................................................................*.................................. + // add v27.8H, v29.8H, v31.8H // ............................................................................................................................................................*................................... + // add v13.8H, v23.8H, v19.8H // ............................................................................................................................................................................................*... 
+ // sub v16.8H, v24.8H, v12.8H // ........................................................................................................................................*....................................................... + // sqrdmulh v29.8H, v27.8H, v0.H[7] // .......................................................................................................................................................................*........................ + // mul v31.8H, v27.8H, v0.H[6] // ...............................................................................................................................................................................*................ + // add v27.8H, v24.8H, v12.8H // ..........................................................................................................................*..................................................................... + // mul v22.8H, v30.8H, v0.H[4] // ............................................................................................................................................*................................................... + // mul v12.8H, v7.8H, v0.H[2] // .................................................................................................................................................................*.............................. + // sub v19.8H, v25.8H, v26.8H // .............................................................................................................................................................................*.................. + // add v26.8H, v25.8H, v26.8H // ..........................................................................................................................................................................*..................... 
+ // mla v31.8H, v29.8H, v8.H[0] // .....................................................................................................................................................................................*.......... + // mla v22.8H, v4.8H, v8.H[0] // ...........................................................................................................................................................*.................................... + // sub v14.8H, v14.8H, v11.8H // .................................................................................................................................................................................*.............. + // mla v12.8H, v5.8H, v8.H[0] // ...........................................................................................................................................................................*.................... + // sub v30.8H, v20.8H, v6.8H // ..............................................................................................................................................................................................*. + // sqrdmulh v23.8H, v26.8H, v1.H[3] // .............................................................................................................................................................................................*.. + // add v7.8H, v9.8H, v31.8H // ...........................................................................................................................................................................................*.... + // add v11.8H, v16.8H, v22.8H // .........................................................................................................................................................................................*...... 
+ // mul v25.8H, v26.8H, v1.H[2] // ..............................................................................................................................................................................*................. + // sqrdmulh v28.8H, v7.8H, v1.H[7] // ...............................................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer1234_start + sqrdmulh v26.8H, v17.8H, v1.H[1] // ...............................................................................................................*................................................................................ + mul v4.8H, v17.8H, v1.H[0] // ................................................................................................................*............................................................................... + mla v10.8H, v21.8H, v8.H[0] // ..........................................................*..................................................................................................................................... + sub v17.8H, v13.8H, v12.8H // .....................................................................*.......................................................................................................................... + add v13.8H, v13.8H, v12.8H // ......................................................................*......................................................................................................................... + add v21.8H, v20.8H, v6.8H // ..........................................................................................*..................................................................................................... 
+ mul v24.8H, v17.8H, v1.H[0] // ...........................................................................................................*.................................................................................... + mla v4.8H, v26.8H, v8.H[0] // .................................................................................................................*.............................................................................. + sub v29.8H, v9.8H, v31.8H // ........................................................................................................*....................................................................................... + sqrdmulh v31.8H, v17.8H, v1.H[1] // ..........................................................................................................*..................................................................................... + mul v6.8H, v13.8H, v0.H[6] // .................................................................................................*.............................................................................................. + add v9.8H, v27.8H, v10.8H // ............................................................*................................................................................................................................... + sqrdmulh v17.8H, v29.8H, v2.H[1] // .............................................................................................................................................*.................................................. + add v26.8H, v14.8H, v4.8H // ...................................................................................................................*............................................................................ 
+ mla v24.8H, v31.8H, v8.H[0] // ............................................................................................................*................................................................................... + sqrdmulh v5.8H, v13.8H, v0.H[7] // ................................................................................................*............................................................................................... + mul v13.8H, v26.8H, v2.H[2] // ...................................................................................................................................................*............................................ + sqrdmulh v26.8H, v26.8H, v2.H[3] // ..................................................................................................................................................*............................................. + mul v31.8H, v29.8H, v2.H[0] // ..............................................................................................................................................*................................................. + sub v10.8H, v27.8H, v10.8H // ...........................................................*.................................................................................................................................... + mla v6.8H, v5.8H, v8.H[0] // ..................................................................................................*............................................................................................. + sub v29.8H, v14.8H, v4.8H // ..................................................................................................................*............................................................................. 
+ add v14.8H, v10.8H, v24.8H // ..............................................................................................................*................................................................................. + mla v13.8H, v26.8H, v8.H[0] // ....................................................................................................................................................*........................................... + mla v31.8H, v17.8H, v8.H[0] // ...............................................................................................................................................*................................................ + sqrdmulh v5.8H, v29.8H, v2.H[5] // .......................................................................................................................................................*........................................ + sub v12.8H, v9.8H, v6.8H // ...................................................................................................*............................................................................................ + mul v29.8H, v29.8H, v2.H[4] // ........................................................................................................................................................*....................................... + add v6.8H, v9.8H, v6.8H // ....................................................................................................*........................................................................................... + add v27.8H, v14.8H, v13.8H // ......................................................................................................................................................*......................................... 
+ sub v26.8H, v12.8H, v31.8H // ................................................................................................................................................*............................................... + mla v25.8H, v23.8H, v8.H[0] // ...........................................................................................................................*.................................................................... + str q27, [x6, #128] // ....................................................................................................................................................................................*........... + mla v29.8H, v5.8H, v8.H[0] // .........................................................................................................................................................*...................................... + mul v9.8H, v21.8H, v1.H[2] // .....................................................................................................................*.......................................................................... + str q26, [x6, #96] // ...................................................................................................................................................................................*............ + sqrdmulh v23.8H, v21.8H, v1.H[3] // ....................................................................................................................*........................................................................... + sub v17.8H, v10.8H, v24.8H // .............................................................................................................*.................................................................................. 
+ mul v4.8H, v7.8H, v1.H[6] // .........................................................................................................................................*...................................................... + add v7.8H, v18.8H, v25.8H // .............................................................................................................................*.................................................................. + sub v26.8H, v17.8H, v29.8H // ..........................................................................................................................................................*..................................... + add v27.8H, v17.8H, v29.8H // ...........................................................................................................................................................*.................................... + mla v9.8H, v23.8H, v8.H[0] // ......................................................................................................................*......................................................................... + sub v25.8H, v18.8H, v25.8H // ............................................................................................................................*................................................................... + str q27, [x6, #192] // ......................................................................................................................................................................................*......... + mul v17.8H, v7.8H, v2.H[6] // .............................................................................................................................................................*.................................. 
+ sqrdmulh v29.8H, v7.8H, v2.H[7] // ............................................................................................................................................................*................................... + sqrdmulh v27.8H, v25.8H, v3.H[1] // .................................................................................................................................................................*.............................. + str q26, [x6, #224] // .......................................................................................................................................................................................*........ + mul v23.8H, v25.8H, v3.H[0] // ..................................................................................................................................................................*............................. + mla v4.8H, v28.8H, v8.H[0] // ..........................................................................................................................................*..................................................... + sqrdmulh v24.8H, v19.8H, v1.H[5] // ...................................................................................................................................*............................................................ + mla v17.8H, v29.8H, v8.H[0] // ..............................................................................................................................................................*................................. + mul v19.8H, v19.8H, v1.H[4] // ....................................................................................................................................*........................................................... 
+ mla v23.8H, v27.8H, v8.H[0] // ...................................................................................................................................................................*............................ + add v7.8H, v11.8H, v9.8H // ........................................................................................................................*....................................................................... + add v29.8H, v12.8H, v31.8H // .................................................................................................................................................*.............................................. + add v25.8H, v6.8H, v4.8H // ............................................................................................................................................*................................................... + add v26.8H, v7.8H, v17.8H // ................................................................................................................................................................*............................... + mla v19.8H, v24.8H, v8.H[0] // .....................................................................................................................................*.......................................................... + sub v5.8H, v7.8H, v17.8H // ...............................................................................................................................................................*................................ + str q25, [x6], #16 // ................................................................................................................................................................................*............... 
+ sqrdmulh v17.8H, v30.8H, v1.H[5] // ..............................................................................................................................*................................................................. + str q26, [x14], #16 // ........................................................................................................................................................................................*....... + sub v31.8H, v16.8H, v22.8H // ...............................................................................*................................................................................................................ + sub v7.8H, v11.8H, v9.8H // .......................................................................................................................*........................................................................ + str q5, [x14, #16] // .........................................................................................................................................................................................*...... + mul v25.8H, v30.8H, v1.H[4] // ...............................................................................................................................*................................................................ + sub v27.8H, v15.8H, v19.8H // ......................................................................................................................................*......................................................... + add v15.8H, v15.8H, v19.8H // .......................................................................................................................................*........................................................ 
+ add v16.8H, v7.8H, v23.8H // .....................................................................................................................................................................*.......................... + str q29, [x6, #48] // ..................................................................................................................................................................................*............. + sqrdmulh v11.8H, v27.8H, v3.H[5] // ...........................................................................................................................................................................*.................... + mul v26.8H, v27.8H, v3.H[4] // ............................................................................................................................................................................*................... + mla v25.8H, v17.8H, v8.H[0] // ................................................................................................................................*............................................................... + sqrdmulh v17.8H, v15.8H, v3.H[3] // ......................................................................................................................................................................*......................... + sub v27.8H, v14.8H, v13.8H // .....................................................................................................................................................*.......................................... + mul v29.8H, v15.8H, v3.H[2] // .......................................................................................................................................................................*........................ 
+ sub v14.8H, v7.8H, v23.8H // ....................................................................................................................................................................*........................... + mla v26.8H, v11.8H, v8.H[0] // .............................................................................................................................................................................*.................. + sub v22.8H, v6.8H, v4.8H // ...........................................................................................................................................*.................................................... + sub v7.8H, v31.8H, v25.8H // .................................................................................................................................*.............................................................. + str q16, [x14, #48] // ..........................................................................................................................................................................................*..... + str q27, [x6, #144] // .....................................................................................................................................................................................*.......... + add v6.8H, v31.8H, v25.8H // ..................................................................................................................................*............................................................. + mla v29.8H, v17.8H, v8.H[0] // ........................................................................................................................................................................*....................... + str q22, [x6, #16] // .................................................................................................................................................................................*.............. 
+ add v15.8H, v7.8H, v26.8H // ...............................................................................................................................................................................*................ + str q14, [x14, #80] // ...........................................................................................................................................................................................*.... + sub v16.8H, v7.8H, v26.8H // ..............................................................................................................................................................................*................. + add v27.8H, v6.8H, v29.8H // ..........................................................................................................................................................................*..................... + str q15, [x14, #176] // ..............................................................................................................................................................................................*. + sub v15.8H, v6.8H, v29.8H // .........................................................................................................................................................................*...................... + str q16, [x14, #208] // ...............................................................................................................................................................................................* + str q27, [x14, #112] // ............................................................................................................................................................................................*... + str q15, [x14, #144] // .............................................................................................................................................................................................*.. 
+ + restore inp, STACK0 + mov count, #4 + + ASM_LOAD(r_ptr1, roots_l456) + + add src0, inp, #256*0 + add src1, inp, #256*1 + + .p2align 2 + // Instructions: 0 + // Expected cycles: 0 + // Expected IPC: 0.00 + // + // Wall time: 0.04s + // User time: 0.04s + // +layer567_start: + // Instructions: 118 + // Expected cycles: 58 + // Expected IPC: 2.03 + // + // Cycle bound: 52.0 + // IPC bound: 2.27 + // + // Wall time: 3607.24s + // User time: 3607.24s + // + // ------------------------------------------------- original position -------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + ld4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x7] // .*.................................................................................................................... + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x6] // *..................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + ldr q14, [x4], #16*14 // ..........*........................................................................................................... + ldr q21, [x4, #-48] // .....................*................................................................................................ + ldr q0, [x4, #-208] // ...........*.......................................................................................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + trn1 v10.4S, v16.4S, v27.4S // ....*................................................................................................................. + ldr q30, [x4, #-192] // ............*......................................................................................................... + trn2 v2.4S, v18.4S, v29.4S // .........*............................................................................................................ + ldr q25, [x4, #-160] // ..............*....................................................................................................... + trn2 v3.4S, v17.4S, v28.4S // .......*.............................................................................................................. + ldr q31, [x4, #-144] // ...............*...................................................................................................... + trn2 v23.4S, v15.4S, v26.4S // ...*.................................................................................................................. + ldr q9, [x4, #-128] // ................*..................................................................................................... + ldr q1, [x4, #-176] // .............*........................................................................................................ + ldr q24, [x4, #-64] // ....................*................................................................................................. + trn1 v15.4S, v15.4S, v26.4S // ..*................................................................................................................... + trn2 v26.4S, v16.4S, v27.4S // .....*................................................................................................................ 
+ mul v6.8H, v3.8H, v14.8H // ...................................*.................................................................................. + sqrdmulh v7.8H, v3.8H, v0.8H // ..................................*................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v3.8H, v26.8H, v14.8H // ..............................*....................................................................................... + mul v16.8H, v2.8H, v14.8H // ........................................*............................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v27.8H, v2.8H, v0.8H // .......................................*.............................................................................. + mul v22.8H, v23.8H, v14.8H // .........................*............................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v14.8H, v26.8H, v0.8H // .............................*........................................................................................ 
+ sqrdmulh v0.8H, v23.8H, v0.8H // ........................*............................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v12.4S, v18.4S, v29.4S // ........*............................................................................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v6.8H, v7.8H, v8.H[0] // ....................................*................................................................................. + mla v16.8H, v27.8H, v8.H[0] // .........................................*............................................................................ + trn1 v13.4S, v17.4S, v28.4S // ......*............................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v3.8H, v14.8H, v8.H[0] // ...............................*...................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ mla v22.8H, v0.8H, v8.H[0] // ..........................*........................................................................................... + add v27.8H, v13.8H, v6.8H // ......................................*............................................................................... + sub v11.8H, v13.8H, v6.8H // .....................................*................................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v29.8H, v12.8H, v16.8H // ..........................................*........................................................................... + add v14.8H, v12.8H, v16.8H // ...........................................*.......................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v26.8H, v11.8H, v31.8H // ......................................................*............................................................... + mul v13.8H, v27.8H, v30.8H // .............................................*........................................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ sqrdmulh v16.8H, v29.8H, v31.8H // ...........................................................*.......................................................... + mul v30.8H, v14.8H, v30.8H // ..................................................*................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v6.8H, v27.8H, v1.8H // ............................................*......................................................................... + sqrdmulh v1.8H, v14.8H, v1.8H // .................................................*.................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v7.8H, v29.8H, v25.8H // ............................................................*......................................................... + mul v31.8H, v11.8H, v25.8H // .......................................................*.............................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v2.8H, v10.8H, v3.8H // .................................*.................................................................................... 
+ sub v23.8H, v15.8H, v22.8H // ...........................*.......................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v30.8H, v1.8H, v8.H[0] // ...................................................*.................................................................. + mla v13.8H, v6.8H, v8.H[0] // ..............................................*....................................................................... + ldr q17, [x4, #-16] // .......................*.............................................................................................. + ldr q27, [x4, #-112] // .................*.................................................................................................... + mla v7.8H, v16.8H, v8.H[0] // .............................................................*........................................................ + add v5.8H, v15.8H, v22.8H // ............................*......................................................................................... + // gap // ...................................................................................................................... + ldr q1, [x4, #-80] // ...................*.................................................................................................. + mla v31.8H, v26.8H, v8.H[0] // ........................................................*............................................................. + sub v25.8H, v10.8H, v3.8H // ................................*..................................................................................... 
+ // gap // ...................................................................................................................... + ldr q12, [x4, #-96] // ..................*................................................................................................... + add v14.8H, v5.8H, v13.8H // ................................................*..................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v6.8H, v2.8H, v30.8H // .....................................................*................................................................ + add v15.8H, v25.8H, v7.8H // ...............................................................*...................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v0.8H, v2.8H, v30.8H // ....................................................*................................................................. + mul v26.8H, v6.8H, v9.8H // .................................................................*.................................................... + sqrdmulh v27.8H, v6.8H, v27.8H // ................................................................*..................................................... + // gap // ...................................................................................................................... + ldr q4, [x4, #-32] // ......................*............................................................................................... 
+ mul v6.8H, v0.8H, v12.8H // ......................................................................*............................................... + sqrdmulh v16.8H, v0.8H, v1.8H // .....................................................................*................................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mul v0.8H, v15.8H, v24.8H // ...........................................................................*.......................................... + sub v11.8H, v25.8H, v7.8H // ..............................................................*....................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqrdmulh v15.8H, v15.8H, v21.8H // ..........................................................................*........................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v26.8H, v27.8H, v8.H[0] // ..................................................................*................................................... + mul v27.8H, v11.8H, v4.8H // ................................................................................*..................................... + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + mla v6.8H, v16.8H, v8.H[0] // .......................................................................*.............................................. + sqrdmulh v16.8H, v11.8H, v17.8H // ...............................................................................*...................................... + sub v17.8H, v5.8H, v13.8H // ...............................................*...................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + add v7.8H, v23.8H, v31.8H // ..........................................................*........................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v0.8H, v15.8H, v8.H[0] // ............................................................................*......................................... + sub v2.8H, v23.8H, v31.8H // .........................................................*............................................................ + add v12.8H, v17.8H, v6.8H // .........................................................................*............................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ mla v27.8H, v16.8H, v8.H[0] // .................................................................................*.................................... + // gap // ...................................................................................................................... + sub v3.8H, v14.8H, v26.8H // ...................................................................*.................................................. + // gap // ...................................................................................................................... + add v22.8H, v14.8H, v26.8H // ....................................................................*................................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v15.8H, v12.8H, v8.H[1] // ..........................................................................................*........................... + add v31.8H, v7.8H, v0.8H // ..............................................................................*....................................... + sub v25.8H, v7.8H, v0.8H // .............................................................................*........................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v11.8H, v17.8H, v6.8H // ........................................................................*............................................. + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqdmulh v6.8H, v3.8H, v8.H[1] // .......................................................................................*.............................. + srshr v16.8H, v15.8H, #11 // ...........................................................................................*.......................... + sqdmulh v15.8H, v25.8H, v8.H[1] // ...................................................................................................*.................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sub v13.8H, v2.8H, v27.8H // ..................................................................................*................................... + add v14.8H, v2.8H, v27.8H // ...................................................................................*.................................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v7.8H, v6.8H, #11 // ........................................................................................*............................. + sqdmulh v26.8H, v31.8H, v8.H[1] // ................................................................................................*..................... 
+ mla v12.8H, v16.8H, v8.H[0] // ............................................................................................*......................... + srshr v6.8H, v15.8H, #11 // ....................................................................................................*................. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v15.8H, v13.8H, v8.H[1] // .........................................................................................................*............ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + sqdmulh v18.8H, v11.8H, v8.H[1] // .............................................................................................*........................ + mla v3.8H, v7.8H, v8.H[0] // .........................................................................................*............................ + sqdmulh v16.8H, v14.8H, v8.H[1] // ......................................................................................................*............... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v25.8H, v6.8H, v8.H[0] // .....................................................................................................*................ + // gap // ...................................................................................................................... 
+ // gap // ...................................................................................................................... + sqdmulh v6.8H, v22.8H, v8.H[1] // ....................................................................................*................................. + srshr v15.8H, v15.8H, #11 // ..........................................................................................................*........... + srshr v0.8H, v18.8H, #11 // ..............................................................................................*....................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v26.8H, v26.8H, #11 // .................................................................................................*.................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + srshr v16.8H, v16.8H, #11 // .......................................................................................................*.............. + srshr v6.8H, v6.8H, #11 // .....................................................................................*................................ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v1.4S, v3.4S, v25.4S // ...............................................................................................................*...... 
+ mla v13.8H, v15.8H, v8.H[0] // ...........................................................................................................*.......... + mla v11.8H, v0.8H, v8.H[0] // ...............................................................................................*...................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v14.8H, v16.8H, v8.H[0] // ........................................................................................................*............. + mla v31.8H, v26.8H, v8.H[0] // ..................................................................................................*................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + mla v22.8H, v6.8H, v8.H[0] // ......................................................................................*............................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v16.4S, v3.4S, v25.4S // ..............................................................................................................*....... + trn2 v3.4S, v11.4S, v13.4S // ...................................................................................................................*.. + trn1 v18.4S, v11.4S, v13.4S // ..................................................................................................................*... 
+ // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v2.4S, v12.4S, v14.4S // .................................................................................................................*.... + trn1 v17.4S, v12.4S, v14.4S // ................................................................................................................*..... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn1 v15.4S, v22.4S, v31.4S // ............................................................................................................*......... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + trn2 v0.4S, v22.4S, v31.4S // .............................................................................................................*........ + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... 
+ st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x7], #64 // .....................................................................................................................* + st4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x6], #64 // ....................................................................................................................*. + // gap // ...................................................................................................................... + // gap // ...................................................................................................................... + + // --------------------------------------------------- new position ----------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|----------------- + // ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6] // .*.................................................................................................................... + // ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7] // *..................................................................................................................... + // trn1 v9.4s, v17.4s, v21.4s // ...............*...................................................................................................... + // trn2 v13.4s, v17.4s, v21.4s // ...........*.......................................................................................................... + // trn1 v10.4s, v18.4s, v22.4s // .....*................................................................................................................ + // trn2 v14.4s, v18.4s, v22.4s // ................*..................................................................................................... + // trn1 v11.4s, v19.4s, v23.4s // ............................*......................................................................................... 
+ // trn2 v15.4s, v19.4s, v23.4s // .........*............................................................................................................ + // trn1 v12.4s, v20.4s, v24.4s // .........................*............................................................................................ + // trn2 v16.4s, v20.4s, v24.4s // .......*.............................................................................................................. + // ldr q0, [ x4], #16*14 // ..*................................................................................................................... + // ldr q4, [x4, #-16*14+16*1] // ....*................................................................................................................. + // ldr q1, [ x4, #-16*14+16*2] // ......*............................................................................................................... + // ldr q5, [x4, #-16*14+16*3] // .............*........................................................................................................ + // ldr q2, [ x4, #-16*14+16*4] // ........*............................................................................................................. + // ldr q6, [x4, #-16*14+16*5] // ..........*........................................................................................................... + // ldr q3, [ x4, #-16*14+16*6] // ............*......................................................................................................... + // ldr q7, [x4, #-16*14+16*7] // ................................................*..................................................................... + // ldr q17, [ x4, #-16*14+16*8] // ......................................................*............................................................... 
+ // ldr q18, [ x4, #-16*14+16*9] // ...................................................*.................................................................. + // ldr q19, [ x4, #-16*14+16*10] // ..............*....................................................................................................... + // ldr q20, [ x4, #-16*14+16*11] // ...*.................................................................................................................. + // ldr q21, [ x4, #-16*14+16*12] // .............................................................*........................................................ + // ldr q22, [ x4, #-16*14+16*13] // ...............................................*...................................................................... + // sqrdmulh v28.8h, v13.8h, v4.8h // ........................*............................................................................................. + // mul v25.8h, v13.8h, v0.8h // ......................*............................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..............................*....................................................................................... + // sub v13.8h, v9.8h, v25.8h // ............................................*......................................................................... + // add v9.8h, v9.8h, v25.8h // ..................................................*................................................................... + // sqrdmulh v28.8h, v14.8h, v4.8h // .......................*.............................................................................................. + // mul v25.8h, v14.8h, v0.8h // ...................*.................................................................................................. 
+ // mla v25.8h, v28.8h, v8.h[0] // .............................*........................................................................................ + // sub v14.8h, v10.8h, v25.8h // .....................................................*................................................................ + // add v10.8h, v10.8h, v25.8h // ...........................................*.......................................................................... + // sqrdmulh v28.8h, v15.8h, v4.8h // ..................*................................................................................................... + // mul v25.8h, v15.8h, v0.8h // .................*.................................................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ..........................*........................................................................................... + // sub v15.8h, v11.8h, v25.8h // ................................*..................................................................................... + // add v11.8h, v11.8h, v25.8h // ...............................*...................................................................................... + // sqrdmulh v28.8h, v16.8h, v4.8h // .....................*................................................................................................ + // mul v25.8h, v16.8h, v0.8h // ....................*................................................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ...........................*.......................................................................................... + // sub v16.8h, v12.8h, v25.8h // .................................*.................................................................................... 
+ // add v12.8h, v12.8h, v25.8h // ..................................*................................................................................... + // sqrdmulh v28.8h, v11.8h, v5.8h // .......................................*.............................................................................. + // mul v25.8h, v11.8h, v1.8h // ....................................*................................................................................. + // mla v25.8h, v28.8h, v8.h[0] // ..............................................*....................................................................... + // sub v11.8h, v9.8h, v25.8h // .......................................................................*.............................................. + // add v9.8h, v9.8h, v25.8h // .......................................................*.............................................................. + // sqrdmulh v28.8h, v12.8h, v5.8h // ........................................*............................................................................. + // mul v25.8h, v12.8h, v1.8h // ......................................*............................................................................... + // mla v25.8h, v28.8h, v8.h[0] // .............................................*........................................................................ + // sub v12.8h, v10.8h, v25.8h // ..........................................................*........................................................... + // add v10.8h, v10.8h, v25.8h // ........................................................*............................................................. + // sqrdmulh v28.8h, v15.8h, v6.8h // ...................................*.................................................................................. 
+ // mul v25.8h, v15.8h, v2.8h // ..........................................*........................................................................... + // mla v25.8h, v28.8h, v8.h[0] // ....................................................*................................................................. + // sub v15.8h, v13.8h, v25.8h // ..........................................................................*........................................... + // add v13.8h, v13.8h, v25.8h // ........................................................................*............................................. + // sqrdmulh v28.8h, v16.8h, v6.8h // .....................................*................................................................................ + // mul v25.8h, v16.8h, v2.8h // .........................................*............................................................................ + // mla v25.8h, v28.8h, v8.h[0] // .................................................*.................................................................... + // sub v16.8h, v14.8h, v25.8h // .................................................................*.................................................... + // add v14.8h, v14.8h, v25.8h // .........................................................*............................................................ + // sqrdmulh v28.8h, v10.8h, v7.8h // ............................................................*......................................................... + // mul v25.8h, v10.8h, v3.8h // ...........................................................*.......................................................... + // mla v25.8h, v28.8h, v8.h[0] // ...................................................................*.................................................. 
+ // sub v10.8h, v9.8h, v25.8h // .............................................................................*........................................ + // add v9.8h, v9.8h, v25.8h // ..............................................................................*....................................... + // sqrdmulh v28.8h, v12.8h, v18.8h // ...............................................................*...................................................... + // mul v25.8h, v12.8h, v17.8h // ..............................................................*....................................................... + // mla v25.8h, v28.8h, v8.h[0] // .....................................................................*................................................ + // sub v12.8h, v11.8h, v25.8h // ..................................................................................*................................... + // add v11.8h, v11.8h, v25.8h // ...........................................................................*.......................................... + // sqrdmulh v28.8h, v14.8h, v20.8h // ..................................................................*................................................... + // mul v25.8h, v14.8h, v19.8h // ................................................................*..................................................... + // mla v25.8h, v28.8h, v8.h[0] // .........................................................................*............................................ + // sub v14.8h, v13.8h, v25.8h // .................................................................................*.................................... + // add v13.8h, v13.8h, v25.8h // ................................................................................*..................................... 
+ // sqrdmulh v28.8h, v16.8h, v22.8h // ......................................................................*............................................... + // mul v25.8h, v16.8h, v21.8h // ....................................................................*................................................. + // mla v25.8h, v28.8h, v8.h[0] // ............................................................................*......................................... + // sub v16.8h, v15.8h, v25.8h // ......................................................................................*............................... + // add v15.8h, v15.8h, v25.8h // .......................................................................................*.............................. + // sqdmulh v26.8h, v9.8h, v8.h[1] // .................................................................................................*.................... + // srshr v26.8h, v26.8h, #11 // ......................................................................................................*............... + // mla v9.8h, v26.8h, v8.h[0] // ............................................................................................................*......... + // sqdmulh v26.8h, v10.8h, v8.h[1] // ...................................................................................*.................................. + // srshr v26.8h, v26.8h, #11 // ........................................................................................*............................. + // mla v10.8h, v26.8h, v8.h[0] // ..............................................................................................*....................... + // sqdmulh v26.8h, v11.8h, v8.h[1] // ...............................................................................*...................................... 
+ // srshr v26.8h, v26.8h, #11 // ....................................................................................*................................. + // mla v11.8h, v26.8h, v8.h[0] // ..........................................................................................*........................... + // sqdmulh v26.8h, v12.8h, v8.h[1] // .............................................................................................*........................ + // srshr v26.8h, v26.8h, #11 // ...................................................................................................*.................. + // mla v12.8h, v26.8h, v8.h[0] // .........................................................................................................*............ + // sqdmulh v26.8h, v13.8h, v8.h[1] // .........................................................................................*............................ + // srshr v26.8h, v26.8h, #11 // ....................................................................................................*................. + // mla v13.8h, v26.8h, v8.h[0] // ...........................................................................................................*.......... + // sqdmulh v26.8h, v14.8h, v8.h[1] // .....................................................................................*................................ + // srshr v26.8h, v26.8h, #11 // ...........................................................................................*.......................... + // mla v14.8h, v26.8h, v8.h[0] // ................................................................................................*..................... + // sqdmulh v26.8h, v15.8h, v8.h[1] // ...............................................................................................*...................... 
+ // srshr v26.8h, v26.8h, #11 // .....................................................................................................*................ + // mla v15.8h, v26.8h, v8.h[0] // ..........................................................................................................*........... + // sqdmulh v26.8h, v16.8h, v8.h[1] // ............................................................................................*......................... + // srshr v26.8h, v26.8h, #11 // ..................................................................................................*................... + // mla v16.8h, v26.8h, v8.h[0] // ........................................................................................................*............. + // trn1 v17.4s, v9.4s, v13.4s // ..................................................................................................................*... + // trn2 v21.4s, v9.4s, v13.4s // ...................................................................................................................*.. + // trn1 v18.4s, v10.4s, v14.4s // .............................................................................................................*........ + // trn2 v22.4s, v10.4s, v14.4s // .......................................................................................................*.............. + // trn1 v19.4s, v11.4s, v15.4s // .................................................................................................................*.... + // trn2 v23.4s, v11.4s, v15.4s // ................................................................................................................*..... + // trn1 v20.4s, v12.4s, v16.4s // ...............................................................................................................*...... 
+ // trn2 v24.4s, v12.4s, v16.4s // ..............................................................................................................*....... + // st4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x6], #64 // .....................................................................................................................* + // st4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x7], #64 // ....................................................................................................................*. + + sub count, count, #1 + cbnz count, layer567_start + // Instructions: 0 + // Expected cycles: 0 + // Expected IPC: 0.00 + // + // Wall time: 0.04s + // User time: 0.04s + // + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a55.s b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a55.s index d0ac4bb7..efe52444 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a55.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a55.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,50 +73,50 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 
3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +127,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, 
x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,19 +156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -211,7 +181,7 @@ roots: .text .global ntt_kyber_123_4567_manual_st4_opt_a55 - .global _ntt_kyber_123_4567_manual_st4_opt_a55 + .global _ntt_kyber_123_4567_manual_st4 .p2align 4 const_addr: .short 3329 @@ -337,1083 +307,1165 @@ _ntt_kyber_123_4567_manual_st4_opt_a55: load_roots_123 .p2align 2 - ldr_vo v30, x0, 0 // *......... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v9, x0, 64 // .*........ - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v5, x0, 128 // ..*....... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v10, x0, 192 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v4, x0, 256 // ....*..... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v17, x0, 448 // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v16.8H, v4.8H, v0.H[1] // ......*... - // gap // .......... 
- ldr_vo v23, x0, 320 // .....*.... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v29.8H, v17.8H, v0.H[1] // .........* - // gap // .......... - ldr_vo v22, x0, 384 // ........*. - // gap // .......... - - // original source code - // ldr_vo v30, x0, 0 // *......... || *................ - // ldr_vo v9, x0, 64 // .*........ || ..*.............. - // ldr_vo v5, x0, 128 // ..*....... || ....*............ - // ldr_vo v10, x0, 192 // ...*...... || ......*.......... - // ldr_vo v4, x0, 256 // ....*..... || ........*........ - // ldr_vo v23, x0, 320 // .......*.. || .............*... - // sqrdmulh v16.8H, v4.8H, v0.H[1] // ......*... || ............*.... - // ldr_vo v17, x0, 448 // .....*.... || ..........*...... - // ldr_vo v22, x0, 384 // .........* || ................* - // sqrdmulh v29.8H, v17.8H, v0.H[1] // ........*. || ...............*. - + // Instructions: 10 + // Expected cycles: 16 + // Expected IPC: 0.62 + // + // Cycle bound: 16.0 + // IPC bound: 0.62 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q15, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #64] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x0, #256] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.8H, v29.8H, v0.H[1] // ......*....................... + // gap // .............................. 
+ mul v26.8H, v29.8H, v0.H[0] // .......*...................... + // gap // .............................. + ldr q31, [x0, #320] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #384] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v26.8H, v18.8H, v7.H[0] // .........*.................... + // gap // .............................. + ldr q12, [x0, #192] // ...*.......................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q15, [x0, #448] // *.............................. + // ldr q20, [x0, #64] // .*............................. + // ldr q11, [x0, #128] // ...*........................... + // ldr q12, [x0, #192] // .........*..................... + // ldr q18, [x0, #256] // ..*............................ + // ldr q31, [x0, #320] // ......*........................ + // sqrdmulh v29.8H, v18.8H, v0.H[1] // ....*.......................... + // mul v26.8H, v18.8H, v0.H[0] // .....*......................... + // ldr q25, [x0, #384] // .......*....................... + // mls v26.8H, v29.8H, v7.H[0] // ........*...................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v4.8H, v4.8H, v0.H[0] // ........*................................................................... - // gap // ............................................................................ - mul v6.8H, v23.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ - sqrdmulh v14.8H, v23.8H, v0.H[1] // ..............*............................................................. 
- // gap // ............................................................................ - mul v28.8H, v22.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - mls v4.8H, v16.8H, v7.H[0] // ..........*................................................................. - // gap // ............................................................................ - sqrdmulh v23.8H, v22.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - mls v6.8H, v14.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - mul v14.8H, v17.8H, v0.H[0] // .......................*.................................................... - // gap // ............................................................................ - sub v16.8H, v30.8H, v4.8H // ...........*................................................................ - // gap // ............................................................................ - mls v28.8H, v23.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - add v4.8H, v30.8H, v4.8H // ............*............................................................... - // gap // ............................................................................ - sub v23.8H, v9.8H, v6.8H // ................*........................................................... - // gap // ............................................................................ - add v6.8H, v9.8H, v6.8H // .................*.......................................................... 
- // gap // ............................................................................ - sub v17.8H, v5.8H, v28.8H // .....................*...................................................... - // gap // ............................................................................ - add v28.8H, v5.8H, v28.8H // ......................*..................................................... - // gap // ............................................................................ - mls v14.8H, v29.8H, v7.H[0] // .........................*.................................................. - // gap // ............................................................................ - mul v5.8H, v17.8H, v0.H[4] // ......................................*..................................... - // gap // ............................................................................ - sqrdmulh v17.8H, v17.8H, v0.H[5] // .......................................*.................................... - // gap // ............................................................................ - mul v30.8H, v28.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - sub v9.8H, v10.8H, v14.8H // ..........................*................................................. - // gap // ............................................................................ - add v14.8H, v10.8H, v14.8H // ...........................*................................................ - // gap // ............................................................................ - mls v5.8H, v17.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - sqrdmulh v28.8H, v28.8H, v0.H[3] // .............................*.............................................. 
- // gap // ............................................................................ - mul v17.8H, v9.8H, v0.H[4] // ...........................................*................................ - // gap // ............................................................................ - sqrdmulh v9.8H, v9.8H, v0.H[5] // ............................................*............................... - // gap // ............................................................................ - sub v10.8H, v16.8H, v5.8H // .........................................*.................................. - // gap // ............................................................................ - add v16.8H, v16.8H, v5.8H // ..........................................*................................. - // gap // ............................................................................ - mls v30.8H, v28.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - mul v28.8H, v14.8H, v0.H[2] // .................................*.......................................... - // gap // ............................................................................ - mls v17.8H, v9.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - sqrdmulh v14.8H, v14.8H, v0.H[3] // ..................................*......................................... - // gap // ............................................................................ - sub v5.8H, v4.8H, v30.8H // ...............................*............................................ - // gap // ............................................................................ - add v4.8H, v4.8H, v30.8H // ................................*........................................... 
- // gap // ............................................................................ - sub v30.8H, v23.8H, v17.8H // ..............................................*............................. - // gap // ............................................................................ - add v23.8H, v23.8H, v17.8H // ...............................................*............................ - // gap // ............................................................................ - mls v28.8H, v14.8H, v7.H[0] // ...................................*........................................ - // gap // ............................................................................ - mul v14.8H, v30.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - mul v17.8H, v23.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - sqrdmulh v23.8H, v23.8H, v1.H[3] // ...........................................................*................ - // gap // ............................................................................ - sub v9.8H, v6.8H, v28.8H // ....................................*....................................... - // gap // ............................................................................ - add v6.8H, v6.8H, v28.8H // .....................................*...................................... - // gap // ............................................................................ - sqrdmulh v28.8H, v30.8H, v1.H[5] // ................................................................*........... - // gap // ............................................................................ - mul v30.8H, v9.8H, v1.H[0] // .....................................................*...................... 
- // gap // ............................................................................ - mul v22.8H, v6.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - sqrdmulh v6.8H, v6.8H, v0.H[7] // .................................................*.......................... - // gap // ............................................................................ - sqrdmulh v9.8H, v9.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - mls v17.8H, v23.8H, v7.H[0] // ............................................................*............... - // gap // ............................................................................ - mls v14.8H, v28.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - mls v22.8H, v6.8H, v7.H[0] // ..................................................*......................... - // gap // ............................................................................ - mls v30.8H, v9.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - sub v6.8H, v16.8H, v17.8H // .............................................................*.............. - // gap // ............................................................................ - sub v28.8H, v10.8H, v14.8H // ..................................................................*......... - // gap // ............................................................................ - add v14.8H, v10.8H, v14.8H // ...................................................................*........ 
- // gap // ............................................................................ - add v23.8H, v16.8H, v17.8H // ..............................................................*............. - // gap // ............................................................................ - sub v16.8H, v4.8H, v22.8H // ...................................................*........................ - // gap // ............................................................................ - add v4.8H, v4.8H, v22.8H // ....................................................*....................... - // gap // ............................................................................ - sub v17.8H, v5.8H, v30.8H // ........................................................*................... - // gap // ............................................................................ - add v5.8H, v5.8H, v30.8H // .........................................................*.................. - // gap // ............................................................................ - str_vi v4, x0, 16 // ....................................................................*....... - // gap // ............................................................................ - ldr_vo v30, x0, 0 // e........................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v16, x0, 48 // .....................................................................*...... - // gap // ............................................................................ - ldr_vo v9, x0, 64 // .e.......................................................................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v5, x0, 112 // ......................................................................*..... - // gap // ............................................................................ - ldr_vo v5, x0, 128 // ..e......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v17, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - ldr_vo v10, x0, 192 // ...e........................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v23, x0, 240 // ........................................................................*... - // gap // ............................................................................ - ldr_vo v4, x0, 256 // ....e....................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- str_vo v6, x0, 304 // .........................................................................*.. - // gap // ............................................................................ - ldr_vo v23, x0, 320 // .....e...................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v14, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - sqrdmulh v16.8H, v4.8H, v0.H[1] // .........e.................................................................. - // gap // ............................................................................ - str_vo v28, x0, 432 // ...........................................................................* - // gap // ............................................................................ - ldr_vo v17, x0, 448 // .......e.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr_vo v22, x0, 384 // ......e..................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v29.8H, v17.8H, v0.H[1] // ........................e................................................... 
- // gap // ............................................................................ - - // original source code - // ldr_vo v8, x0, 0 // e......................................................................................... || e....................................................................................................... - // ldr_vo v9, x0, 64 // ..e....................................................................................... || ...e.................................................................................................... - // ldr_vo v10, x0, 128 // ....e..................................................................................... || ......e................................................................................................. - // ldr_vo v11, x0, 192 // ......e................................................................................... || .........e.............................................................................................. - // ldr_vo v12, x0, 256 // ........e................................................................................. || ............e........................................................................................... - // ldr_vo v13, x0, 320 // ..........e............................................................................... || ...............e........................................................................................ - // ldr_vo v14, x0, 384 // ...............e.......................................................................... || ......................e................................................................................. - // ldr_vo v15, x0, 448 // ..............e........................................................................... || ....................e................................................................................... 
- // mul v24.8H, v12.8H, v0.H[0] // .................*........................................................................ || .........................*.............................................................................. - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............e............................................................................. || ..................e..................................................................................... - // mls v24.8H, v12.8H, v7.H[0] // .....................*.................................................................... || .............................*.......................................................................... - // sub v12.8H, v8.8H, v24.8H // .........................*................................................................ || .................................*...................................................................... - // add v8.8H, v8.8H, v24.8H // ...........................*.............................................................. || ...................................*.................................................................... - // mul v24.8H, v13.8H, v0.H[0] // ..................*....................................................................... || ..........................*............................................................................. - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ...................*...................................................................... || ...........................*............................................................................ - // mls v24.8H, v13.8H, v7.H[0] // .......................*.................................................................. || ...............................*........................................................................ 
- // sub v13.8H, v9.8H, v24.8H // ............................*............................................................. || ....................................*................................................................... - // add v9.8H, v9.8H, v24.8H // .............................*............................................................ || .....................................*.................................................................. - // mul v24.8H, v14.8H, v0.H[0] // ....................*..................................................................... || ............................*........................................................................... - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ......................*................................................................... || ..............................*......................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ..........................*............................................................... || ..................................*..................................................................... - // sub v14.8H, v10.8H, v24.8H // ..............................*........................................................... || ......................................*................................................................. - // add v10.8H, v10.8H, v24.8H // ...............................*.......................................................... || .......................................*................................................................ - // mul v24.8H, v15.8H, v0.H[0] // ........................*................................................................. || ................................*....................................................................... 
- // sqrdmulh v15.8H, v15.8H, v0.H[1] // ................e......................................................................... || ........................e............................................................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................*......................................................... || ........................................*............................................................... - // sub v15.8H, v11.8H, v24.8H // ....................................*..................................................... || ............................................*........................................................... - // add v11.8H, v11.8H, v24.8H // .....................................*.................................................... || .............................................*.......................................................... - // mul v24.8H, v10.8H, v0.H[2] // ...................................*...................................................... || ...........................................*............................................................ - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .......................................*.................................................. || ...............................................*........................................................ - // mls v24.8H, v10.8H, v7.H[0] // ............................................*............................................. || ....................................................*................................................... - // sub v10.8H, v8.8H, v24.8H // ................................................*......................................... || ........................................................*............................................... 
- // add v8.8H, v8.8H, v24.8H // .................................................*........................................ || .........................................................*.............................................. - // mul v24.8H, v11.8H, v0.H[2] // .............................................*............................................ || .....................................................*.................................................. - // sqrdmulh v11.8H, v11.8H, v0.H[3] // ...............................................*.......................................... || .......................................................*................................................ - // mls v24.8H, v11.8H, v7.H[0] // ....................................................*..................................... || ............................................................*........................................... - // sub v11.8H, v9.8H, v24.8H // ........................................................*................................. || ................................................................*....................................... - // add v9.8H, v9.8H, v24.8H // .........................................................*................................ || .................................................................*...................................... - // mul v24.8H, v14.8H, v0.H[4] // .................................*........................................................ || .........................................*.............................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ..................................*....................................................... || ..........................................*............................................................. 
- // mls v24.8H, v14.8H, v7.H[0] // ......................................*................................................... || ..............................................*......................................................... - // sub v14.8H, v12.8H, v24.8H // ..........................................*............................................... || ..................................................*..................................................... - // add v12.8H, v12.8H, v24.8H // ...........................................*.............................................. || ...................................................*.................................................... - // mul v24.8H, v15.8H, v0.H[4] // ........................................*................................................. || ................................................*....................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // .........................................*................................................ || .................................................*...................................................... - // mls v24.8H, v15.8H, v7.H[0] // ..............................................*........................................... || ......................................................*................................................. - // sub v15.8H, v13.8H, v24.8H // ..................................................*....................................... || ..........................................................*............................................. - // add v13.8H, v13.8H, v24.8H // ...................................................*...................................... || ...........................................................*............................................ 
- // mul v24.8H, v9.8H, v0.H[6] // ............................................................*............................. || ....................................................................*................................... - // sqrdmulh v9.8H, v9.8H, v0.H[7] // .............................................................*............................ || .....................................................................*.................................. - // mls v24.8H, v9.8H, v7.H[0] // .................................................................*........................ || .........................................................................*.............................. - // sub v9.8H, v8.8H, v24.8H // .......................................................................*.................. || ...............................................................................*........................ - // add v8.8H, v8.8H, v24.8H // ........................................................................*................. || ................................................................................*....................... - // mul v24.8H, v11.8H, v1.H[0] // ...........................................................*.............................. || ...................................................................*.................................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..............................................................*........................... || ......................................................................*................................. - // mls v24.8H, v11.8H, v7.H[0] // ..................................................................*....................... || ..........................................................................*............................. 
- // sub v11.8H, v10.8H, v24.8H // .........................................................................*................ || .................................................................................*...................... - // add v10.8H, v10.8H, v24.8H // ..........................................................................*............... || ..................................................................................*..................... - // mul v24.8H, v13.8H, v1.H[2] // ......................................................*................................... || ..............................................................*......................................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................*.................................. || ...............................................................*........................................ - // mls v24.8H, v13.8H, v7.H[0] // ...............................................................*.......................... || .......................................................................*................................ - // sub v13.8H, v12.8H, v24.8H // ...................................................................*...................... || ...........................................................................*............................ - // add v12.8H, v12.8H, v24.8H // ......................................................................*................... || ..............................................................................*......................... - // mul v24.8H, v15.8H, v1.H[4] // .....................................................*.................................... || .............................................................*.......................................... 
- // sqrdmulh v15.8H, v15.8H, v1.H[5] // ..........................................................*............................... || ..................................................................*..................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................................................*......................... || ........................................................................*............................... - // sub v15.8H, v14.8H, v24.8H // ....................................................................*..................... || ............................................................................*........................... - // add v14.8H, v14.8H, v24.8H // .....................................................................*.................... || .............................................................................*.......................... - // str_vi v8, x0, 16 // ...........................................................................*.............. || ...................................................................................*.................... - // str_vo v9, x0, 48 // .............................................................................*............ || ......................................................................................*................. - // str_vo v10, x0, 112 // ...............................................................................*.......... || .........................................................................................*.............. - // str_vo v11, x0, 176 // .................................................................................*........ || ............................................................................................*........... - // str_vo v12, x0, 240 // ...................................................................................*...... 
|| ...............................................................................................*........ - // str_vo v13, x0, 304 // .....................................................................................*.... || ..................................................................................................*..... - // str_vo v14, x0, 368 // .......................................................................................*.. || .....................................................................................................*.. - // str_vo v15, x0, 432 // .........................................................................................* || .......................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 6.34s + // User time: 6.34s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q29, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v18.8H, v31.8H, v0.H[1] // .............*.............................................................. + // gap // ............................................................................ + mul v6.8H, v31.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ 
+ sub v2.8H, v29.8H, v26.8H // ...........*................................................................ + // gap // ............................................................................ + add v17.8H, v29.8H, v26.8H // ............*............................................................... + // gap // ............................................................................ + sqrdmulh v26.8H, v25.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + mls v6.8H, v18.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + mul v18.8H, v25.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + sqrdmulh v25.8H, v15.8H, v0.H[1] // .......................*.................................................... + // gap // ............................................................................ + mul v16.8H, v15.8H, v0.H[0] // ........................*................................................... + // gap // ............................................................................ + sub v19.8H, v20.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + mls v18.8H, v26.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + add v6.8H, v20.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ 
+ ldr q15, [x0, #464] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v3.8H, v11.8H, v18.8H // .....................*...................................................... + // gap // ............................................................................ + add v5.8H, v11.8H, v18.8H // ......................*..................................................... + // gap // ............................................................................ + mls v16.8H, v25.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + sqrdmulh v20.8H, v3.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + mul v26.8H, v3.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v11.8H, v5.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + mul v18.8H, v5.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + add v3.8H, v12.8H, v16.8H // ...........................*................................................ + // gap // ............................................................................ 
+ sub v9.8H, v12.8H, v16.8H // ..........................*................................................. + // gap // ............................................................................ + mls v26.8H, v20.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + mls v18.8H, v11.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + mul v25.8H, v9.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + sqrdmulh v20.8H, v9.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + add v24.8H, v2.8H, v26.8H // ..........................................*................................. + // gap // ............................................................................ + sub v11.8H, v2.8H, v26.8H // .........................................*.................................. + // gap // ............................................................................ + mul v16.8H, v3.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + mls v25.8H, v20.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v5.8H, v3.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ 
+ add v9.8H, v17.8H, v18.8H // ................................*........................................... + // gap // ............................................................................ + sub v26.8H, v17.8H, v18.8H // ...............................*............................................ + // gap // ............................................................................ + sub v20.8H, v19.8H, v25.8H // ..............................................*............................. + // gap // ............................................................................ + add v25.8H, v19.8H, v25.8H // ...............................................*............................ + // gap // ............................................................................ + mls v16.8H, v5.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v29.8H, v20.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + sqrdmulh v19.8H, v25.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + mul v25.8H, v25.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + sub v12.8H, v6.8H, v16.8H // ....................................*....................................... + // gap // ............................................................................ + add v2.8H, v6.8H, v16.8H // .....................................*...................................... + // gap // ............................................................................ 
+ mul v16.8H, v20.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v20.8H, v12.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v8.8H, v2.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + mul v6.8H, v2.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + mls v25.8H, v19.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + mul v12.8H, v12.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + mls v6.8H, v8.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + sub v29.8H, v24.8H, v25.8H // .............................................................*.............. + // gap // ............................................................................ + add v31.8H, v24.8H, v25.8H // ..............................................................*............. + // gap // ............................................................................ 
+ mls v12.8H, v20.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + add v18.8H, v9.8H, v6.8H // ....................................................*....................... + // gap // ............................................................................ + sub v25.8H, v9.8H, v6.8H // ...................................................*........................ + // gap // ............................................................................ + sub v6.8H, v11.8H, v16.8H // ..................................................................*......... + // gap // ............................................................................ + sub v19.8H, v26.8H, v12.8H // ........................................................*................... + // gap // ............................................................................ + add v26.8H, v26.8H, v12.8H // .........................................................*.................. + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + add v16.8H, v11.8H, v16.8H // ...................................................................*........ + // gap // ............................................................................ + ldr q20, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q25, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + ldr q11, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q12, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q18, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q31, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q31, [x0, #320] // .....e...................................................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q29, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + sqrdmulh v29.8H, v18.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + str q16, [x0, #368] // ..........................................................................*. + // gap // ............................................................................ + mul v26.8H, v18.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + str q6, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + ldr q25, [x0, #384] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.8H, v29.8H, v7.H[0] // ..........e................................................................. + // gap // ............................................................................ 
+ + // ------------------------------------------------------------- new position -------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q8, [x0, #0] // ...............................................................*......................................................................... + // ldr q9, [x0, #(1*(512/8))] // ...............................................e...............'...........................................................~............. + // ldr q10, [x0, #(2*(512/8))] // .................................................e.............'.............................................................~........... + // ldr q11, [x0, #(3*(512/8))] // ...................................................e...........'...............................................................~......... + // ldr q12, [x0, #(4*(512/8))] // .....................................................e.........'.................................................................~....... + // ldr q13, [x0, #(5*(512/8))] // .......................................................e.......'...................................................................~..... + // ldr q14, [x0, #(6*(512/8))] // .............................................................e.'......................................................................... + // ldr q15, [x0, #(7*(512/8))] // e..............................................................'............~............................................................ + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .........................................................e.....'.....................................................................~... 
+ // mul v24.8h, v12.8h, v0.h[0] // ...........................................................e...'.......................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ..............................................................e'......................................................................... + // sub v12.8h, v8.8h, v24.8h // ...............................................................'..*...................................................................... + // add v8.8h, v8.8h, v24.8h // ...............................................................'...*..................................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ...............................................................'*........................................................................ + // mul v24.8h, v13.8h, v0.h[0] // ...............................................................'.*....................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................................'.....*................................................................... + // sub v13.8h, v9.8h, v24.8h // ...............................................................'.........*............................................................... + // add v9.8h, v9.8h, v24.8h // ...............................................................'...........*............................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ...............................................................'....*.................................................................... + // mul v24.8h, v14.8h, v0.h[0] // ...............................................................'......*.................................................................. 
+ // mls v24.8h, v27.8h, v7.h[0] // ...............................................................'..........*.............................................................. + // sub v14.8h, v10.8h, v24.8h // .~.............................................................'.............*........................................................... + // add v10.8h, v10.8h, v24.8h // ..~............................................................'..............*.......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ...............................................................'.......*................................................................. + // mul v24.8h, v15.8h, v0.h[0] // ...............................................................'........*................................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...~...........................................................'...............*......................................................... + // sub v15.8h, v11.8h, v24.8h // .........~.....................................................'.....................*................................................... + // add v11.8h, v11.8h, v24.8h // ........~......................................................'....................*.................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ......~........................................................'..................*...................................................... + // mul v24.8h, v10.8h, v0.h[2] // .......~.......................................................'...................*..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........~...................................................'.......................*................................................. 
+ // sub v10.8h, v8.8h, v24.8h // ....................~..........................................'................................*........................................ + // add v8.8h, v8.8h, v24.8h // ...................~...........................................'...............................*......................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..................~............................................'..............................*.......................................... + // mul v24.8h, v11.8h, v0.h[2] // ................~..............................................'............................*............................................ + // mls v24.8h, v27.8h, v7.h[0] // .......................~.......................................'...................................*..................................... + // sub v11.8h, v9.8h, v24.8h // ...........................~...................................'.......................................*................................. + // add v9.8h, v9.8h, v24.8h // ............................~..................................'........................................*................................ + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ....~..........................................................'................*........................................................ + // mul v24.8h, v14.8h, v0.h[4] // .....~.........................................................'.................*....................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........~....................................................'......................*.................................................. + // sub v14.8h, v12.8h, v24.8h // ...............~...............................................'...........................*............................................. 
+ // add v12.8h, v12.8h, v24.8h // ..............~................................................'..........................*.............................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .............~.................................................'.........................*............................................... + // mul v24.8h, v15.8h, v0.h[4] // ............~..................................................'........................*................................................ + // mls v24.8h, v27.8h, v7.h[0] // .................~.............................................'.............................*........................................... + // sub v15.8h, v13.8h, v24.8h // .....................~.........................................'.................................*....................................... + // add v13.8h, v13.8h, v24.8h // ......................~........................................'..................................*...................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ...............................~...............................'...........................................*............................. + // mul v24.8h, v9.8h, v0.h[6] // ................................~..............................'............................................*............................ + // mls v24.8h, v27.8h, v7.h[0] // ....................................~..........................'................................................*........................ + // sub v9.8h, v8.8h, v24.8h // .........................................~.....................'.....................................................*................... + // add v8.8h, v8.8h, v24.8h // ........................................~......................'....................................................*.................... 
+ // sqrdmulh v27.8h, v11.8h, v1.h[1] // ..............................~................................'..........................................*.............................. + // mul v24.8h, v11.8h, v1.h[0] // ...................................~...........................'...............................................*......................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................~.......................'...................................................*..................... + // sub v11.8h, v10.8h, v24.8h // ...........................................~...................'.......................................................*................. + // add v10.8h, v10.8h, v24.8h // ............................................~..................'........................................................*................ + // sqrdmulh v27.8h, v13.8h, v1.h[3] // .........................~.....................................'.....................................*................................... + // mul v24.8h, v13.8h, v1.h[2] // ..........................~....................................'......................................*.................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................~.............................'.............................................*........................... + // sub v13.8h, v12.8h, v24.8h // .....................................~.........................'.................................................*....................... + // add v12.8h, v12.8h, v24.8h // ......................................~........................'..................................................*...................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ........................~......................................'....................................*.................................... 
+ // mul v24.8h, v15.8h, v1.h[4] // .............................~.................................'.........................................*............................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................~............................'..............................................*.......................... + // sub v15.8h, v14.8h, v24.8h // ..........................................~....................'......................................................*.................. + // add v14.8h, v14.8h, v24.8h // ..............................................~................'..........................................................*.............. + // str q8, [x0], #(16) // .............................................~.................'.........................................................*............... + // str q9, [x0, #(-16 + 1*(512/8))] // ................................................~..............'............................................................*............ + // str q10, [x0, #(-16 + 2*(512/8))] // ..................................................~............'..............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // ....................................................~..........'................................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // ......................................................~........'..................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ........................................................~......'....................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // ..........................................................~....'......................................................................*.. 
+ // str q15, [x0, #(-16 + 7*(512/8))] // ............................................................~..'........................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v6.8H, v4.8H, v0.H[0] // *................................................................. - // gap // .................................................................. - mul v8.8H, v17.8H, v0.H[0] // .......*.......................................................... - // gap // .................................................................. - mul v4.8H, v22.8H, v0.H[0] // ...*.............................................................. - // gap // .................................................................. - sqrdmulh v15.8H, v23.8H, v0.H[1] // ..*............................................................... - // gap // .................................................................. - mul v14.8H, v23.8H, v0.H[0] // .*................................................................ - // gap // .................................................................. - mls v8.8H, v29.8H, v7.H[0] // ...............*.................................................. - // gap // .................................................................. - sqrdmulh v13.8H, v22.8H, v0.H[1] // .....*............................................................ - // gap // .................................................................. - mls v6.8H, v16.8H, v7.H[0] // ....*............................................................. - // gap // .................................................................. - mls v14.8H, v15.8H, v7.H[0] // ......*........................................................... - // gap // .................................................................. - sub v20.8H, v10.8H, v8.8H // ...................*.............................................. 
- // gap // .................................................................. - mls v4.8H, v13.8H, v7.H[0] // .........*........................................................ - // gap // .................................................................. - add v17.8H, v10.8H, v8.8H // ....................*............................................. - // gap // .................................................................. - add v31.8H, v9.8H, v14.8H // ............*..................................................... - // gap // .................................................................. - sub v12.8H, v9.8H, v14.8H // ...........*...................................................... - // gap // .................................................................. - sub v18.8H, v5.8H, v4.8H // .............*.................................................... - // gap // .................................................................. - add v4.8H, v5.8H, v4.8H // ..............*................................................... - // gap // .................................................................. - sqrdmulh v9.8H, v17.8H, v0.H[3] // ..............................*................................... - // gap // .................................................................. - mul v21.8H, v17.8H, v0.H[2] // ............................*..................................... - // gap // .................................................................. - mul v10.8H, v4.8H, v0.H[2] // ..................*............................................... - // gap // .................................................................. - sqrdmulh v3.8H, v4.8H, v0.H[3] // ......................*........................................... - // gap // .................................................................. - add v4.8H, v30.8H, v6.8H // ..........*....................................................... 
- // gap // .................................................................. - sub v15.8H, v30.8H, v6.8H // ........*......................................................... - // gap // .................................................................. - mls v21.8H, v9.8H, v7.H[0] // ...................................*.............................. - // gap // .................................................................. - mul v24.8H, v20.8H, v0.H[4] // .......................*.......................................... - // gap // .................................................................. - sqrdmulh v25.8H, v20.8H, v0.H[5] // ........................*......................................... - // gap // .................................................................. - mls v10.8H, v3.8H, v7.H[0] // ...........................*...................................... - // gap // .................................................................. - add v27.8H, v31.8H, v21.8H // ........................................*......................... - // gap // .................................................................. - sub v19.8H, v31.8H, v21.8H // .......................................*.......................... - // gap // .................................................................. - sqrdmulh v21.8H, v18.8H, v0.H[5] // .................*................................................ - // gap // .................................................................. - add v26.8H, v4.8H, v10.8H // ................................*................................. - // gap // .................................................................. - sub v17.8H, v4.8H, v10.8H // ...............................*.................................. - // gap // .................................................................. - mul v31.8H, v27.8H, v0.H[6] // ...........................................*...................... 
- // gap // .................................................................. - mul v2.8H, v19.8H, v1.H[0] // ..........................................*....................... - // gap // .................................................................. - mul v4.8H, v18.8H, v0.H[4] // ................*................................................. - // gap // .................................................................. - mls v24.8H, v25.8H, v7.H[0] // .............................*.................................... - // gap // .................................................................. - sqrdmulh v13.8H, v19.8H, v1.H[1] // .............................................*.................... - // gap // .................................................................. - sqrdmulh v11.8H, v27.8H, v0.H[7] // ............................................*..................... - // gap // .................................................................. - mls v4.8H, v21.8H, v7.H[0] // .....................*............................................ - // gap // .................................................................. - sub v27.8H, v12.8H, v24.8H // .................................*................................ - // gap // .................................................................. - mls v2.8H, v13.8H, v7.H[0] // .................................................*................ - // gap // .................................................................. - add v13.8H, v12.8H, v24.8H // ..................................*............................... - // gap // .................................................................. - mul v24.8H, v27.8H, v1.H[4] // ....................................*............................. - // gap // .................................................................. - mls v31.8H, v11.8H, v7.H[0] // ................................................*................. 
- // gap // .................................................................. - mul v5.8H, v13.8H, v1.H[2] // .....................................*............................ - // gap // .................................................................. - sqrdmulh v12.8H, v13.8H, v1.H[3] // ......................................*........................... - // gap // .................................................................. - add v13.8H, v17.8H, v2.8H // .........................................................*........ - // gap // .................................................................. - sub v18.8H, v15.8H, v4.8H // .........................*........................................ - // gap // .................................................................. - add v29.8H, v26.8H, v31.8H // .......................................................*.......... - // gap // .................................................................. - add v4.8H, v15.8H, v4.8H // ..........................*....................................... - // gap // .................................................................. - sub v25.8H, v26.8H, v31.8H // ......................................................*........... - // gap // .................................................................. - str_vi v29, x0, 16 // ..........................................................*....... - // gap // .................................................................. - sqrdmulh v31.8H, v27.8H, v1.H[5] // .........................................*........................ - // gap // .................................................................. - str_vo v25, x0, 48 // ...........................................................*...... - // gap // .................................................................. - mls v5.8H, v12.8H, v7.H[0] // ..............................................*................... 
- // gap // .................................................................. - str_vo v13, x0, 112 // ............................................................*..... - // gap // .................................................................. - sub v13.8H, v17.8H, v2.8H // ........................................................*......... - // gap // .................................................................. - mls v24.8H, v31.8H, v7.H[0] // ...............................................*.................. - // gap // .................................................................. - add v3.8H, v4.8H, v5.8H // .....................................................*............ - // gap // .................................................................. - str_vo v13, x0, 176 // .............................................................*.... - // gap // .................................................................. - sub v4.8H, v4.8H, v5.8H // ..................................................*............... - // gap // .................................................................. - str_vo v3, x0, 240 // ..............................................................*... - // gap // .................................................................. - add v11.8H, v18.8H, v24.8H // ....................................................*............. - // gap // .................................................................. - str_vo v4, x0, 304 // ...............................................................*.. - // gap // .................................................................. - sub v4.8H, v18.8H, v24.8H // ...................................................*.............. - // gap // .................................................................. - str_vo v11, x0, 368 // ................................................................*. - // gap // .................................................................. 
- // gap // .................................................................. - // gap // .................................................................. - str_vo v4, x0, 432 // .................................................................* - // gap // .................................................................. - - // original source code - // mul v4.8H, v4.8H, v0.H[0] // *................................................................. || *.................................................................. - // mul v6.8H, v23.8H, v0.H[0] // ....*............................................................. || ....*.............................................................. - // sqrdmulh v14.8H, v23.8H, v0.H[1] // ...*.............................................................. || ...*............................................................... - // mul v28.8H, v22.8H, v0.H[0] // ..*............................................................... || ..*................................................................ - // mls v4.8H, v16.8H, v7.H[0] // .......*.......................................................... || .......*........................................................... - // sqrdmulh v23.8H, v22.8H, v0.H[1] // ......*........................................................... || ......*............................................................ - // mls v6.8H, v14.8H, v7.H[0] // ........*......................................................... || ........*.......................................................... - // mul v14.8H, v17.8H, v0.H[0] // .*................................................................ || .*................................................................. - // sub v16.8H, v30.8H, v4.8H // .....................*............................................ || .....................*............................................. 
- // mls v28.8H, v23.8H, v7.H[0] // ..........*....................................................... || ..........*........................................................ - // add v4.8H, v30.8H, v4.8H // ....................*............................................. || ....................*.............................................. - // sub v23.8H, v9.8H, v6.8H // .............*.................................................... || .............*..................................................... - // add v6.8H, v9.8H, v6.8H // ............*..................................................... || ............*...................................................... - // sub v17.8H, v5.8H, v28.8H // ..............*................................................... || ..............*.................................................... - // add v28.8H, v5.8H, v28.8H // ...............*.................................................. || ...............*................................................... - // mls v14.8H, v29.8H, v7.H[0] // .....*............................................................ || .....*............................................................. - // mul v5.8H, v17.8H, v0.H[4] // .................................*................................ || .................................*................................. - // sqrdmulh v17.8H, v17.8H, v0.H[5] // ............................*..................................... || ............................*...................................... - // mul v30.8H, v28.8H, v0.H[2] // ..................*............................................... || ..................*................................................ - // sub v9.8H, v10.8H, v14.8H // .........*........................................................ || .........*......................................................... 
- // add v14.8H, v10.8H, v14.8H // ...........*...................................................... || ...........*....................................................... - // mls v5.8H, v17.8H, v7.H[0] // .....................................*............................ || .....................................*............................. - // sqrdmulh v28.8H, v28.8H, v0.H[3] // ...................*.............................................. || ...................*............................................... - // mul v17.8H, v9.8H, v0.H[4] // .......................*.......................................... || .......................*........................................... - // sqrdmulh v9.8H, v9.8H, v0.H[5] // ........................*......................................... || ........................*.......................................... - // sub v10.8H, v16.8H, v5.8H // ..............................................*................... || ..............................................*.................... - // add v16.8H, v16.8H, v5.8H // ................................................*................. || ................................................*.................. - // mls v30.8H, v28.8H, v7.H[0] // .........................*........................................ || .........................*......................................... - // mul v28.8H, v14.8H, v0.H[2] // .................*................................................ || .................*................................................. - // mls v17.8H, v9.8H, v7.H[0] // ..................................*............................... || ..................................*................................ - // sqrdmulh v14.8H, v14.8H, v0.H[3] // ................*................................................. || ................*.................................................. 
- // sub v5.8H, v4.8H, v30.8H // ..............................*................................... || ..............................*.................................... - // add v4.8H, v4.8H, v30.8H // .............................*.................................... || .............................*..................................... - // sub v30.8H, v23.8H, v17.8H // ......................................*........................... || ......................................*............................ - // add v23.8H, v23.8H, v17.8H // ........................................*......................... || ........................................*.......................... - // mls v28.8H, v14.8H, v7.H[0] // ......................*........................................... || ......................*............................................ - // mul v14.8H, v30.8H, v1.H[4] // .........................................*........................ || .........................................*......................... - // mul v17.8H, v23.8H, v1.H[2] // ...........................................*...................... || ...........................................*....................... - // sqrdmulh v23.8H, v23.8H, v1.H[3] // ............................................*..................... || ............................................*...................... - // sub v9.8H, v6.8H, v28.8H // ...........................*...................................... || ...........................*....................................... - // add v6.8H, v6.8H, v28.8H // ..........................*....................................... || ..........................*........................................ - // sqrdmulh v28.8H, v30.8H, v1.H[5] // ...................................................*.............. || ...................................................*............... 
- // mul v30.8H, v9.8H, v1.H[0] // ................................*................................. || ................................*.................................. - // mul v22.8H, v6.8H, v0.H[6] // ...............................*.................................. || ...............................*................................... - // sqrdmulh v6.8H, v6.8H, v0.H[7] // ....................................*............................. || ....................................*.............................. - // sqrdmulh v9.8H, v9.8H, v1.H[1] // ...................................*.............................. || ...................................*............................... - // mls v17.8H, v23.8H, v7.H[0] // .....................................................*............ || .....................................................*............. - // mls v14.8H, v28.8H, v7.H[0] // ........................................................*......... || ........................................................*.......... - // mls v22.8H, v6.8H, v7.H[0] // ..........................................*....................... || ..........................................*........................ - // mls v30.8H, v9.8H, v7.H[0] // .......................................*.......................... || .......................................*........................... - // sub v6.8H, v16.8H, v17.8H // ...........................................................*...... || ...........................................................*....... - // sub v28.8H, v10.8H, v14.8H // ...............................................................*.. || ...............................................................*... - // add v14.8H, v10.8H, v14.8H // .............................................................*.... || .............................................................*..... 
- // add v23.8H, v16.8H, v17.8H // .........................................................*........ || .........................................................*......... - // sub v16.8H, v4.8H, v22.8H // .................................................*................ || .................................................*................. - // add v4.8H, v4.8H, v22.8H // ...............................................*.................. || ...............................................*................... - // sub v17.8H, v5.8H, v30.8H // .......................................................*.......... || .......................................................*........... - // add v5.8H, v5.8H, v30.8H // .............................................*.................... || .............................................*..................... - // str_vi v4, x0, 16 // ..................................................*............... || ..................................................*................ - // str_vo v16, x0, 48 // ....................................................*............. || ....................................................*.............. - // str_vo v5, x0, 112 // ......................................................*........... || ......................................................*............ - // str_vo v17, x0, 176 // ..........................................................*....... || ..........................................................*........ - // str_vo v23, x0, 240 // ............................................................*..... || ............................................................*...... - // str_vo v6, x0, 304 // ..............................................................*... || ..............................................................*.... - // str_vo v14, x0, 368 // ................................................................*. 
|| ................................................................*.. - // str_vo v28, x0, 432 // .................................................................* || ..................................................................* - + // Instructions: 66 + // Expected cycles: 68 + // Expected IPC: 0.97 + // + // Cycle bound: 68.0 + // IPC bound: 0.97 + // + // Wall time: 19.38s + // User time: 19.38s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + mul v18.8H, v25.8H, v0.H[0] // .......*.......................................................... + // gap // .................................................................. + sqrdmulh v21.8H, v25.8H, v0.H[1] // .....*............................................................ + // gap // .................................................................. + ldr q14, [x0, #0] // *................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + sqrdmulh v4.8H, v15.8H, v0.H[1] // ........*......................................................... + // gap // .................................................................. + mls v18.8H, v21.8H, v7.H[0] // ...........*...................................................... + // gap // .................................................................. + add v16.8H, v14.8H, v26.8H // ....*............................................................. + // gap // .................................................................. + mul v29.8H, v15.8H, v0.H[0] // .........*........................................................ + // gap // .................................................................. 
+ sqrdmulh v6.8H, v31.8H, v0.H[1] // .*................................................................ + // gap // .................................................................. + add v24.8H, v11.8H, v18.8H // ..............*................................................... + // gap // .................................................................. + sub v28.8H, v11.8H, v18.8H // .............*.................................................... + // gap // .................................................................. + mls v29.8H, v4.8H, v7.H[0] // ...............*.................................................. + // gap // .................................................................. + mul v25.8H, v31.8H, v0.H[0] // ..*............................................................... + // gap // .................................................................. + mul v31.8H, v24.8H, v0.H[2] // ...................*.............................................. + // gap // .................................................................. + sqrdmulh v23.8H, v24.8H, v0.H[3] // ..................*............................................... + // gap // .................................................................. + mul v15.8H, v28.8H, v0.H[4] // .................*................................................ + // gap // .................................................................. + add v10.8H, v12.8H, v29.8H // ....................*............................................. + // gap // .................................................................. + mls v25.8H, v6.8H, v7.H[0] // ......*........................................................... + // gap // .................................................................. + mls v31.8H, v23.8H, v7.H[0] // .......................*.......................................... + // gap // .................................................................. 
+ mul v5.8H, v10.8H, v0.H[2] // ............................*..................................... + // gap // .................................................................. + sub v11.8H, v12.8H, v29.8H // .....................*............................................ + // gap // .................................................................. + add v8.8H, v20.8H, v25.8H // ............*..................................................... + // gap // .................................................................. + sqrdmulh v6.8H, v10.8H, v0.H[3] // ..............................*................................... + // gap // .................................................................. + mul v19.8H, v11.8H, v0.H[4] // ........................*......................................... + // gap // .................................................................. + sqrdmulh v21.8H, v11.8H, v0.H[5] // .........................*........................................ + // gap // .................................................................. + sqrdmulh v18.8H, v28.8H, v0.H[5] // ................*................................................. + // gap // .................................................................. + mls v5.8H, v6.8H, v7.H[0] // ...................................*.............................. + // gap // .................................................................. + sub v29.8H, v20.8H, v25.8H // ..........*....................................................... + // gap // .................................................................. + mls v19.8H, v21.8H, v7.H[0] // .............................*.................................... + // gap // .................................................................. + mls v15.8H, v18.8H, v7.H[0] // ......................*........................................... + // gap // .................................................................. 
+ add v23.8H, v8.8H, v5.8H // ........................................*......................... + // gap // .................................................................. + add v22.8H, v16.8H, v31.8H // ...............................*.................................. + // gap // .................................................................. + add v30.8H, v29.8H, v19.8H // ..................................*............................... + // gap // .................................................................. + sqrdmulh v18.8H, v23.8H, v0.H[7] // ...........................................*...................... + // gap // .................................................................. + mul v10.8H, v23.8H, v0.H[6] // ............................................*..................... + // gap // .................................................................. + sqrdmulh v27.8H, v30.8H, v1.H[3] // .....................................*............................ + // gap // .................................................................. + mul v25.8H, v30.8H, v1.H[2] // ......................................*........................... + // gap // .................................................................. + sub v13.8H, v14.8H, v26.8H // ...*.............................................................. + // gap // .................................................................. + mls v10.8H, v18.8H, v7.H[0] // ................................................*................. + // gap // .................................................................. + sub v6.8H, v16.8H, v31.8H // ................................*................................. + // gap // .................................................................. + mls v25.8H, v27.8H, v7.H[0] // .............................................*.................... + // gap // .................................................................. 
+ sub v2.8H, v8.8H, v5.8H // .......................................*.......................... + // gap // .................................................................. + sub v12.8H, v22.8H, v10.8H // .....................................................*............ + // gap // .................................................................. + add v26.8H, v13.8H, v15.8H // ..........................*....................................... + // gap // .................................................................. + sqrdmulh v24.8H, v2.8H, v1.H[1] // ..........................................*....................... + // gap // .................................................................. + mul v16.8H, v2.8H, v1.H[0] // ...............................................*.................. + // gap // .................................................................. + str q12, [x0, #64] // ...........................................................*...... + // gap // .................................................................. + sub v11.8H, v29.8H, v19.8H // .................................*................................ + // gap // .................................................................. + add v31.8H, v26.8H, v25.8H // ..................................................*............... + // gap // .................................................................. + mls v16.8H, v24.8H, v7.H[0] // ...................................................*.............. + // gap // .................................................................. + mul v29.8H, v11.8H, v1.H[4] // .........................................*........................ + // gap // .................................................................. + sqrdmulh v19.8H, v11.8H, v1.H[5] // ....................................*............................. + // gap // .................................................................. 
+ str q31, [x0, #256] // ..............................................................*... + // gap // .................................................................. + add v18.8H, v6.8H, v16.8H // ........................................................*......... + // gap // .................................................................. + sub v25.8H, v26.8H, v25.8H // .................................................*................ + // gap // .................................................................. + mls v29.8H, v19.8H, v7.H[0] // ..............................................*................... + // gap // .................................................................. + str q18, [x0, #128] // ............................................................*..... + // gap // .................................................................. + sub v9.8H, v13.8H, v15.8H // ...........................*...................................... + // gap // .................................................................. + add v13.8H, v22.8H, v10.8H // ....................................................*............. + // gap // .................................................................. + str q25, [x0, #320] // ...............................................................*.. + // gap // .................................................................. + sub v12.8H, v9.8H, v29.8H // ......................................................*........... + // gap // .................................................................. + str q13, [x0], #(16) // .........................................................*........ + // gap // .................................................................. + add v31.8H, v9.8H, v29.8H // ..........................................................*....... + // gap // .................................................................. 
+ str q12, [x0, #432] // .................................................................* + // gap // .................................................................. + sub v24.8H, v6.8H, v16.8H // .......................................................*.......... + // gap // .................................................................. + str q31, [x0, #368] // ................................................................*. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q24, [x0, #176] // .............................................................*.... + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // ldr q29, [x0, #0] // ..*............................................................... + // sqrdmulh v18.8H, v31.8H, v0.H[1] // .......*.......................................................... + // mul v6.8H, v31.8H, v0.H[0] // ...........*...................................................... + // sub v2.8H, v29.8H, v26.8H // ....................................*............................. + // add v17.8H, v29.8H, v26.8H // .....*............................................................ + // sqrdmulh v26.8H, v25.8H, v0.H[1] // .*................................................................ + // mls v6.8H, v18.8H, v7.H[0] // ................*................................................. + // mul v18.8H, v25.8H, v0.H[0] // *................................................................. + // sqrdmulh v25.8H, v15.8H, v0.H[1] // ...*.............................................................. 
+ // mul v16.8H, v15.8H, v0.H[0] // ......*........................................................... + // sub v19.8H, v20.8H, v6.8H // ..........................*....................................... + // mls v18.8H, v26.8H, v7.H[0] // ....*............................................................. + // add v6.8H, v20.8H, v6.8H // ....................*............................................. + // sub v3.8H, v11.8H, v18.8H // .........*........................................................ + // add v5.8H, v11.8H, v18.8H // ........*......................................................... + // mls v16.8H, v25.8H, v7.H[0] // ..........*....................................................... + // sqrdmulh v20.8H, v3.8H, v0.H[5] // ........................*......................................... + // mul v26.8H, v3.8H, v0.H[4] // ..............*................................................... + // sqrdmulh v11.8H, v5.8H, v0.H[3] // .............*.................................................... + // mul v18.8H, v5.8H, v0.H[2] // ............*..................................................... + // add v3.8H, v12.8H, v16.8H // ...............*.................................................. + // sub v9.8H, v12.8H, v16.8H // ...................*.............................................. + // mls v26.8H, v20.8H, v7.H[0] // ............................*..................................... + // mls v18.8H, v11.8H, v7.H[0] // .................*................................................ + // mul v25.8H, v9.8H, v0.H[4] // ......................*........................................... + // sqrdmulh v20.8H, v9.8H, v0.H[5] // .......................*.......................................... + // add v24.8H, v2.8H, v26.8H // ..........................................*....................... + // sub v11.8H, v2.8H, v26.8H // ........................................................*......... 
+ // mul v16.8H, v3.8H, v0.H[2] // ..................*............................................... + // mls v25.8H, v20.8H, v7.H[0] // ...........................*...................................... + // sqrdmulh v5.8H, v3.8H, v0.H[3] // .....................*............................................ + // add v9.8H, v17.8H, v18.8H // ..............................*................................... + // sub v26.8H, v17.8H, v18.8H // ......................................*........................... + // sub v20.8H, v19.8H, v25.8H // ..............................................*................... + // add v25.8H, v19.8H, v25.8H // ...............................*.................................. + // mls v16.8H, v5.8H, v7.H[0] // .........................*........................................ + // sqrdmulh v29.8H, v20.8H, v1.H[5] // ..................................................*............... + // sqrdmulh v19.8H, v25.8H, v1.H[3] // ..................................*............................... + // mul v25.8H, v25.8H, v1.H[2] // ...................................*.............................. + // sub v12.8H, v6.8H, v16.8H // ........................................*......................... + // add v2.8H, v6.8H, v16.8H // .............................*.................................... + // mul v16.8H, v20.8H, v1.H[4] // .................................................*................ + // sqrdmulh v20.8H, v12.8H, v1.H[1] // ...........................................*...................... + // sqrdmulh v8.8H, v2.8H, v0.H[7] // ................................*................................. + // mul v6.8H, v2.8H, v0.H[6] // .................................*................................ + // mls v25.8H, v19.8H, v7.H[0] // .......................................*.......................... + // mls v16.8H, v29.8H, v7.H[0] // ......................................................*........... 
+ // mul v12.8H, v12.8H, v1.H[0] // ............................................*..................... + // mls v6.8H, v8.8H, v7.H[0] // .....................................*............................ + // sub v29.8H, v24.8H, v25.8H // .....................................................*............ + // add v31.8H, v24.8H, v25.8H // ...............................................*.................. + // mls v12.8H, v20.8H, v7.H[0] // ................................................*................. + // add v18.8H, v9.8H, v6.8H // .........................................................*........ + // sub v25.8H, v9.8H, v6.8H // .........................................*........................ + // sub v6.8H, v11.8H, v16.8H // ...........................................................*...... + // sub v19.8H, v26.8H, v12.8H // ...............................................................*.. + // add v26.8H, v26.8H, v12.8H // ....................................................*............. + // str q18, [x0], #(16) // ............................................................*..... + // add v16.8H, v11.8H, v16.8H // .............................................................*.... + // str q25, [x0, #48] // .............................................*.................... + // str q26, [x0, #112] // .......................................................*.......... + // str q19, [x0, #176] // .................................................................* + // str q31, [x0, #240] // ...................................................*.............. + // str q29, [x0, #304] // ..........................................................*....... + // str q16, [x0, #368] // ................................................................*. + // str q6, [x0, #432] // ..............................................................*... + restore inp, STACK0 mov count, #8 .p2align 2 - // gap // ................................................. 
- ldr_vi v5, x3, 16 // .....*........................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr_vo v12, x1, 48 // ....*............................................ - // gap // ................................................. - // gap // ................................................. - ldr_vo v11, x1, 32 // ......*.......................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v14.8H, v12.8H, v5.H[0] // .......*......................................... - // gap // ................................................. - sqrdmulh v25.8H, v12.8H, v5.H[1] // ........*........................................ - // gap // ................................................. - ldr_vo v17, x1, 16 // .*............................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v12.8H, v11.8H, v5.H[0] // ...........*..................................... - // gap // ................................................. - mls v14.8H, v25.8H, v7.H[0] // ..........*...................................... - // gap // ................................................. - sqrdmulh v26.8H, v11.8H, v5.H[1] // .............*................................... - // gap // ................................................. - ldr_vo v16, x1, 0 // ..*.............................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. 
- sub v30.8H, v17.8H, v14.8H // ............*.................................... - // gap // ................................................. - add v6.8H, v17.8H, v14.8H // ..............*.................................. - // gap // ................................................. - mls v12.8H, v26.8H, v7.H[0] // .................*............................... - // gap // ................................................. - mul v31.8H, v30.8H, v5.H[4] // ...............*................................. - // gap // ................................................. - sqrdmulh v4.8H, v30.8H, v5.H[5] // ................*................................ - // gap // ................................................. - mul v13.8H, v6.8H, v5.H[2] // ..................*.............................. - // gap // ................................................. - sqrdmulh v18.8H, v6.8H, v5.H[3] // ...................*............................. - // gap // ................................................. - sub v15.8H, v16.8H, v12.8H // .....................*........................... - // gap // ................................................. - mls v31.8H, v4.8H, v7.H[0] // ....................*............................ - // gap // ................................................. - add v25.8H, v16.8H, v12.8H // ......................*.......................... - // gap // ................................................. - mls v13.8H, v18.8H, v7.H[0] // .......................*......................... - // gap // ................................................. - ldr_vi v23, x4, 96 // .........*....................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - add v6.8H, v15.8H, v31.8H // .........................*....................... - // gap // ................................................. 
- sub v1.8H, v15.8H, v31.8H // ........................*........................ - // gap // ................................................. - add v18.8H, v25.8H, v13.8H // ...........................*..................... - // gap // ................................................. - sub v15.8H, v25.8H, v13.8H // ..........................*...................... - // gap // ................................................. - trn2 v12.4S, v6.4S, v1.4S // ...............................*................. - // gap // ................................................. - trn1 v14.4S, v6.4S, v1.4S // ............................*.................... - // gap // ................................................. - trn2 v13.4S, v18.4S, v15.4S // ..............................*.................. - // gap // ................................................. - ldr_vo v20, x4, -80 // *................................................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - trn1 v0.2D, v13.2D, v12.2D // .................................*............... - // gap // ................................................. - trn2 v4.2D, v13.2D, v12.2D // ................................*................ - // gap // ................................................. - trn1 v13.4S, v18.4S, v15.4S // .............................*................... - // gap // ................................................. - mul v31.8H, v4.8H, v23.8H // ..................................*.............. - // gap // ................................................. - sqrdmulh v5.8H, v4.8H, v20.8H // ...................................*............. - // gap // ................................................. - trn2 v28.2D, v13.2D, v14.2D // ......................................*.......... - // gap // ................................................. 
- ldr_vo v2, x4, -16 // ...*............................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v4.8H, v28.8H, v23.8H // .......................................*......... - // gap // ................................................. - mls v31.8H, v5.8H, v7.H[0] // .....................................*........... - // gap // ................................................. - sqrdmulh v15.8H, v28.8H, v20.8H // .........................................*....... - // gap // ................................................. - ldr_vo v23, x4, -32 // ....................................*............ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sub v18.8H, v0.8H, v31.8H // ........................................*........ - // gap // ................................................. - mls v4.8H, v15.8H, v7.H[0] // ............................................*.... - // gap // ................................................. - trn1 v11.2D, v13.2D, v14.2D // .............................................*... - // gap // ................................................. - mul v19.8H, v18.8H, v23.8H // ..........................................*...... - // gap // ................................................. - sqrdmulh v26.8H, v18.8H, v2.8H // ...........................................*..... - // gap // ................................................. - sub v28.8H, v11.8H, v4.8H // ...............................................*. - // gap // ................................................. - add v5.8H, v11.8H, v4.8H // ................................................* - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - mls v19.8H, v26.8H, v7.H[0] // ..............................................*.. - // gap // ................................................. - - // original source code - // ldr_vo v11, x4, 16 // .............................*................... || ...................................*....................... - // ldr_vo v26, x1, 16 // .....*........................................... || ........*.................................................. - // ldr_vo v30, x1, 0 // .........*....................................... || .............*............................................. - // ldr_vo v2, x4, 80 // ....................................*............ || ...........................................*............... - // ldr_vo v12, x1, 48 // .*............................................... || ..*........................................................ - // ldr_vi v8, x3, 16 // *................................................ || *.......................................................... - // ldr_vo v3, x1, 32 // ..*.............................................. || ....*...................................................... - // mul v23.8H, v12.8H, v8.H[0] // ...*............................................. || ......*.................................................... - // sqrdmulh v16.8H, v12.8H, v8.H[1] // ....*............................................ || .......*................................................... - // ldr_vi v6, x4, 96 // .....................*........................... || ..........................*................................ - // mls v23.8H, v16.8H, v7.H[0] // .......*......................................... || ...........*............................................... - // mul v28.8H, v3.8H, v8.H[0] // ......*.......................................... 
|| ..........*................................................ - // sub v24.8H, v26.8H, v23.8H // ..........*...................................... || ...............*........................................... - // sqrdmulh v15.8H, v3.8H, v8.H[1] // ........*........................................ || ............*.............................................. - // add v1.8H, v26.8H, v23.8H // ...........*..................................... || ................*.......................................... - // mul v22.8H, v24.8H, v8.H[4] // .............*................................... || ..................*........................................ - // sqrdmulh v12.8H, v24.8H, v8.H[5] // ..............*.................................. || ...................*....................................... - // mls v28.8H, v15.8H, v7.H[0] // ............*.................................... || .................*......................................... - // mul v23.8H, v1.8H, v8.H[2] // ...............*................................. || ....................*...................................... - // sqrdmulh v20.8H, v1.8H, v8.H[3] // ................*................................ || .....................*..................................... - // mls v22.8H, v12.8H, v7.H[0] // ..................*.............................. || .......................*................................... - // sub v9.8H, v30.8H, v28.8H // .................*............................... || ......................*.................................... - // add v24.8H, v30.8H, v28.8H // ...................*............................. || ........................*.................................. - // mls v23.8H, v20.8H, v7.H[0] // ....................*............................ || .........................*................................. - // sub v14.8H, v9.8H, v22.8H // .......................*......................... 
|| .............................*............................. - // add v20.8H, v9.8H, v22.8H // ......................*.......................... || ............................*.............................. - // sub v21.8H, v24.8H, v23.8H // .........................*....................... || ...............................*........................... - // add v15.8H, v24.8H, v23.8H // ........................*........................ || ..............................*............................ - // trn1 v27.4S, v20.4S, v14.4S // ...........................*..................... || .................................*......................... - // trn1 v30.4S, v15.4S, v21.4S // ................................*................ || .......................................*................... - // trn2 v25.4S, v15.4S, v21.4S // ............................*.................... || ..................................*........................ - // trn2 v1.4S, v20.4S, v14.4S // ..........................*...................... || ................................*.......................... - // trn2 v18.2D, v25.2D, v1.2D // ...............................*................. || ......................................*.................... - // trn1 v0.2D, v25.2D, v1.2D // ..............................*.................. || .....................................*..................... - // mul v31.8H, v18.8H, v6.8H // .................................*............... || ........................................*.................. - // sqrdmulh v26.8H, v18.8H, v11.8H // ..................................*.............. || .........................................*................. - // ldr_vo v14, x4, -32 // ........................................*........ || ................................................*.......... - // mls v31.8H, v26.8H, v7.H[0] // ......................................*.......... || ..............................................*............ 
- // trn2 v19.2D, v30.2D, v27.2D // ...................................*............. || ..........................................*................ - // mul v17.8H, v19.8H, v6.8H // .....................................*........... || .............................................*............. - // sub v21.8H, v0.8H, v31.8H // .........................................*....... || ..................................................*........ - // sqrdmulh v18.8H, v19.8H, v11.8H // .......................................*......... || ...............................................*........... - // mul v19.8H, v21.8H, v14.8H // ............................................*.... || .....................................................*..... - // sqrdmulh v13.8H, v21.8H, v2.8H // .............................................*... || ......................................................*.... - // mls v17.8H, v18.8H, v7.H[0] // ..........................................*...... || ...................................................*....... - // trn1 v14.2D, v30.2D, v27.2D // ...........................................*..... || ....................................................*...... - // mls v19.8H, v13.8H, v7.H[0] // ................................................* || ..........................................................* - // sub v28.8H, v14.8H, v17.8H // ..............................................*.. || .......................................................*... - // add v5.8H, v14.8H, v17.8H // ...............................................*. || ........................................................*.. - + // Instructions: 20 + // Expected cycles: 28 + // Expected IPC: 0.71 + // + // Cycle bound: 28.0 + // IPC bound: 0.71 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q14, [x3], #16 // .*............................ + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + ldr q1, [x1, #48] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x1, #0] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v5.8H, v1.8H, v14.H[1] // .....*........................ + // gap // .............................. + mul v1.8H, v1.8H, v14.H[0] // ......*....................... + // gap // .............................. + ldr q17, [x4], #(6*16) // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q27, [x4, #-16] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v1.8H, v5.8H, v7.H[0] // ........*..................... + // gap // .............................. + ldr q10, [x1, #16] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x4, #-80] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v2.8H, v10.8H, v1.8H // .............*................ + // gap // .............................. + add v1.8H, v10.8H, v1.8H // ..............*............... + // gap // .............................. + ldr q10, [x1, #32] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v18.8H, v1.8H, v14.H[2] // .................*............ 
+ // gap // .............................. + sqrdmulh v1.8H, v1.8H, v14.H[3] // ................*............. + // gap // .............................. + mul v3.8H, v10.8H, v14.H[0] // ..*........................... + // gap // .............................. + sqrdmulh v25.8H, v10.8H, v14.H[1] // ....*......................... + // gap // .............................. + sqrdmulh v6.8H, v2.8H, v14.H[5] // ...............*.............. + // gap // .............................. + mls v18.8H, v1.8H, v7.H[0] // ...................*.......... + // gap // .............................. + ldr q10, [x4, #-64] // .......*...................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q16, [x1, #32] // ............*.................. + // ldr q14, [x3], #16 // *.............................. + // mul v3.8H, v16.8H, v14.H[0] // ...............*............... + // ldr q6, [x1, #48] // .*............................. + // sqrdmulh v25.8H, v16.8H, v14.H[1] // ................*.............. + // sqrdmulh v5.8H, v6.8H, v14.H[1] // ...*........................... + // mul v12.8H, v6.8H, v14.H[0] // ....*.......................... + // ldr q10, [x4, #32] // ...................*........... + // mls v12.8H, v5.8H, v7.H[0] // .......*....................... + // ldr q29, [x1, #0] // ..*............................ + // ldr q17, [x4], #(6*16) // .....*......................... + // ldr q18, [x1, #16] // ........*...................... + // ldr q27, [x4, #-16] // ......*........................ + // sub v2.8H, v18.8H, v12.8H // ..........*.................... + // add v18.8H, v18.8H, v12.8H // ...........*................... + // sqrdmulh v6.8H, v2.8H, v14.H[5] // .................*............. + // sqrdmulh v28.8H, v18.8H, v14.H[3] // ..............*................ + // mul v18.8H, v18.8H, v14.H[2] // .............*................. 
+ // ldr q12, [x4, #-80] // .........*..................... + // mls v18.8H, v28.8H, v7.H[0] // ..................*............ + sub count, count, #1 -.p2align 2 layer4567_start: - sub v29.8H, v28.8H, v19.8H // .........................................................*......................... + // Instructions: 83 + // Expected cycles: 94 + // Expected IPC: 0.88 + // + // Cycle bound: 94.0 + // IPC bound: 0.88 + // + // Wall time: 8.82s + // User time: 8.82s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + mls v3.8H, v25.8H, v7.H[0] // .......*........................................................................... // gap // ................................................................................... - ldr_vo v11, x4, 16 // ..................................e................................................ + mul v25.8H, v2.8H, v14.H[4] // .....................*............................................................. // gap // ................................................................................... + ldr q16, [x1, #96] // ..e................................................................................ // gap // ................................................................................... // gap // ................................................................................... - add v13.8H, v28.8H, v19.8H // ..........................................................*........................ // gap // ................................................................................... + sub v31.8H, v29.8H, v3.8H // ........*.......................................................................... // gap // ................................................................................... 
- ldr_vo v26, x1, 80 // .e................................................................................. + mls v25.8H, v6.8H, v7.H[0] // ......................*............................................................ // gap // ................................................................................... + add v29.8H, v29.8H, v3.8H // .........*......................................................................... // gap // ................................................................................... - sqdmulh v10.8H, v29.8H, v7.H[1] // ....................................................................*.............. + ldr q14, [x3], #16 // ....e.............................................................................. // gap // ................................................................................... - ldr_vo v30, x1, 64 // e.................................................................................. // gap // ................................................................................... // gap // ................................................................................... + sub v6.8H, v31.8H, v25.8H // .......................*........................................................... // gap // ................................................................................... - ldr_vo v2, x4, 80 // ......................................e............................................ + add v31.8H, v31.8H, v25.8H // ........................*.......................................................... // gap // ................................................................................... + add v28.8H, v29.8H, v18.8H // ...................*............................................................... // gap // ................................................................................... + sub v5.8H, v29.8H, v18.8H // ..................*................................................................ 
// gap // ................................................................................... - ldr_vo v12, x1, 112 // ...e............................................................................... + trn1 v29.4S, v31.4S, v6.4S // ...........................*....................................................... // gap // ................................................................................... + trn2 v6.4S, v31.4S, v6.4S // ............................*...................................................... // gap // ................................................................................... + trn2 v18.4S, v28.4S, v5.4S // ..........................*........................................................ // gap // ................................................................................... - ldr_vi v8, x3, 16 // ....e.............................................................................. + trn1 v11.4S, v28.4S, v5.4S // .........................*......................................................... // gap // ................................................................................... + mul v3.8H, v16.8H, v14.H[0] // ......e............................................................................ // gap // ................................................................................... + trn2 v8.2D, v18.2D, v6.2D // ..............................*.................................................... // gap // ................................................................................... - ldr_vo v3, x1, 96 // ..e................................................................................ + trn2 v25.2D, v11.2D, v29.2D // .............................*..................................................... // gap // ................................................................................... 
+ mul v26.8H, v8.8H, v17.8H // .............................................*..................................... // gap // ................................................................................... + sqrdmulh v31.8H, v8.8H, v12.8H // ............................................*...................................... // gap // ................................................................................... - sqdmulh v1.8H, v13.8H, v7.H[1] // .................................................................*................. + sqrdmulh v30.8H, v25.8H, v12.8H // .......................................*........................................... // gap // ................................................................................... - mul v23.8H, v12.8H, v8.H[0] // ..........e........................................................................ + mul v19.8H, v25.8H, v17.8H // ........................................*.......................................... // gap // ................................................................................... - sqrdmulh v16.8H, v12.8H, v8.H[1] // ...........e....................................................................... + trn1 v18.2D, v18.2D, v6.2D // ................................*.................................................. // gap // ................................................................................... - srshr v10.8H, v10.8H, #11 // .....................................................................*............. + mls v26.8H, v31.8H, v7.H[0] // ..............................................*.................................... // gap // ................................................................................... - ldr_vi v6, x4, 96 // .................................e................................................. + ldr q6, [x4, #-48] // ....................................*.............................................. 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v23.8H, v16.8H, v7.H[0] // ............e...................................................................... + mls v19.8H, v30.8H, v7.H[0] // .........................................*......................................... // gap // ................................................................................... - srshr v18.8H, v1.8H, #11 // ..................................................................*................ + sub v1.8H, v18.8H, v26.8H // ...............................................*................................... // gap // ................................................................................... - mls v29.8H, v10.8H, v7.H[0] // ......................................................................*............ + add v18.8H, v18.8H, v26.8H // ................................................*.................................. // gap // ................................................................................... - mul v28.8H, v3.8H, v8.H[0] // .....e............................................................................. + ldr q26, [x4, #-32] // .....................................*............................................. // gap // ................................................................................... - sub v24.8H, v26.8H, v23.8H // .............e..................................................................... // gap // ................................................................................... - sqrdmulh v15.8H, v3.8H, v8.H[1] // ......e............................................................................ 
// gap // ................................................................................... - add v1.8H, v26.8H, v23.8H // ..............e.................................................................... + sqrdmulh v6.8H, v18.8H, v6.8H // .................................................*................................. // gap // ................................................................................... - mul v22.8H, v24.8H, v8.H[4] // ....................e.............................................................. + mul v18.8H, v18.8H, v10.8H // ..................................................*................................ // gap // ................................................................................... - sqrdmulh v12.8H, v24.8H, v8.H[5] // .....................e............................................................. + mul v26.8H, v1.8H, v26.8H // .......................................................*........................... // gap // ................................................................................... - mls v13.8H, v18.8H, v7.H[0] // ...................................................................*............... + sqrdmulh v25.8H, v1.8H, v27.8H // ......................................................*............................ // gap // ................................................................................... - mls v28.8H, v15.8H, v7.H[0] // .......e........................................................................... + trn1 v22.2D, v11.2D, v29.2D // ...............................*................................................... // gap // ................................................................................... - mul v23.8H, v1.8H, v8.H[2] // ...............e................................................................... + mls v18.8H, v6.8H, v7.H[0] // ...................................................*............................... 
// gap // ................................................................................... - sqrdmulh v20.8H, v1.8H, v8.H[3] // ................e.................................................................. + add v29.8H, v22.8H, v19.8H // ...........................................*....................................... // gap // ................................................................................... - mls v22.8H, v12.8H, v7.H[0] // ......................e............................................................ + mls v26.8H, v25.8H, v7.H[0] // ........................................................*.......................... // gap // ................................................................................... - ldr_vo v4, x4, -144 // ....................................*.............................................. + sub v31.8H, v22.8H, v19.8H // ..........................................*........................................ // gap // ................................................................................... + sub v25.8H, v29.8H, v18.8H // ....................................................*.............................. // gap // ................................................................................... + add v18.8H, v29.8H, v18.8H // .....................................................*............................. // gap // ................................................................................... - sub v9.8H, v30.8H, v28.8H // ........e.......................................................................... + add v17.8H, v31.8H, v26.8H // ..........................................................*........................ // gap // ................................................................................... - add v24.8H, v30.8H, v28.8H // .........e......................................................................... 
+ sqdmulh v19.8H, v25.8H, v7.H[1] // ..............................................................*.................... // gap // ................................................................................... - mls v23.8H, v20.8H, v7.H[0] // .................e................................................................. + sqdmulh v24.8H, v18.8H, v7.H[1] // ...........................................................*....................... // gap // ................................................................................... - sub v14.8H, v9.8H, v22.8H // .......................e........................................................... + sqdmulh v20.8H, v17.8H, v7.H[1] // .................................................................*................. // gap // ................................................................................... - add v20.8H, v9.8H, v22.8H // ........................e.......................................................... + sub v29.8H, v31.8H, v26.8H // .........................................................*......................... // gap // ................................................................................... - add v17.8H, v0.8H, v31.8H // ................................................*.................................. + srshr v19.8H, v19.8H, #11 // ...............................................................*................... // gap // ................................................................................... - sub v21.8H, v24.8H, v23.8H // ..................e................................................................ + srshr v10.8H, v24.8H, #11 // ............................................................*...................... // gap // ................................................................................... - add v15.8H, v24.8H, v23.8H // ...................e............................................................... 
+ srshr v20.8H, v20.8H, #11 // ..................................................................*................ // gap // ................................................................................... - sqrdmulh v31.8H, v17.8H, v4.8H // ..................................................*................................ + mls v25.8H, v19.8H, v7.H[0] // ................................................................*.................. // gap // ................................................................................... - ldr_vo v4, x4, -160 // ...................................*............................................... + mls v18.8H, v10.8H, v7.H[0] // .............................................................*..................... // gap // ................................................................................... + sqdmulh v11.8H, v29.8H, v7.H[1] // ....................................................................*.............. // gap // ................................................................................... + ldr q6, [x1, #112] // ...e............................................................................... // gap // ................................................................................... - trn1 v27.4S, v20.4S, v14.4S // ...........................e....................................................... // gap // ................................................................................... - trn1 v30.4S, v15.4S, v21.4S // .........................e......................................................... // gap // ................................................................................... - mul v3.8H, v17.8H, v4.8H // .................................................*................................. + trn1 v26.4S, v18.4S, v25.4S // .......................................................................*........... 
// gap // ................................................................................... - trn2 v25.4S, v15.4S, v21.4S // ..........................e........................................................ + trn2 v22.4S, v18.4S, v25.4S // ........................................................................*.......... // gap // ................................................................................... - trn2 v1.4S, v20.4S, v14.4S // ............................e...................................................... + sqrdmulh v25.8H, v16.8H, v14.H[1] // .....e............................................................................. // gap // ................................................................................... - trn1 v15.4S, v13.4S, v29.4S // .........................................................................*......... + srshr v9.8H, v11.8H, #11 // .....................................................................*............. // gap // ................................................................................... - mls v3.8H, v31.8H, v7.H[0] // ...................................................*............................... + mls v17.8H, v20.8H, v7.H[0] // ...................................................................*............... // gap // ................................................................................... - trn2 v18.2D, v25.2D, v1.2D // ..............................e.................................................... + sqrdmulh v5.8H, v6.8H, v14.H[1] // ..........e........................................................................ // gap // ................................................................................... - trn1 v0.2D, v25.2D, v1.2D // ................................e.................................................. + mls v29.8H, v9.8H, v7.H[0] // ......................................................................*............ 
// gap // ................................................................................... - mul v31.8H, v18.8H, v6.8H // ............................................e...................................... + mul v12.8H, v6.8H, v14.H[0] // ...........e....................................................................... // gap // ................................................................................... - sub v25.8H, v5.8H, v3.8H // ....................................................*.............................. + ldr q10, [x4, #32] // ...................................e............................................... // gap // ................................................................................... - add v23.8H, v5.8H, v3.8H // .....................................................*............................. // gap // ................................................................................... - sqrdmulh v26.8H, v18.8H, v11.8H // .............................................e..................................... // gap // ................................................................................... - sqdmulh v1.8H, v25.8H, v7.H[1] // ..............................................................*.................... + trn2 v13.4S, v17.4S, v29.4S // ..........................................................................*........ // gap // ................................................................................... - sqdmulh v22.8H, v23.8H, v7.H[1] // ...........................................................*....................... + mls v12.8H, v5.8H, v7.H[0] // ............e...................................................................... // gap // ................................................................................... - ldr_vo v14, x4, -32 // .....................................e............................................. 
+ trn1 v29.4S, v17.4S, v29.4S // .........................................................................*......... // gap // ................................................................................... + trn1 v18.2D, v22.2D, v13.2D // ..............................................................................*.... // gap // ................................................................................... + trn2 v4.2D, v22.2D, v13.2D // ............................................................................*...... // gap // ................................................................................... - srshr v28.8H, v1.8H, #11 // ...............................................................*................... + trn2 v31.2D, v26.2D, v29.2D // ...........................................................................*....... // gap // ................................................................................... - srshr v19.8H, v22.8H, #11 // ............................................................*...................... + trn1 v26.2D, v26.2D, v29.2D // .............................................................................*..... // gap // ................................................................................... - mls v31.8H, v26.8H, v7.H[0] // ..............................................e.................................... + ldr q29, [x1, #64] // e.................................................................................. // gap // ................................................................................... - mls v25.8H, v28.8H, v7.H[0] // ................................................................*.................. // gap // ................................................................................... - mls v23.8H, v19.8H, v7.H[0] // .............................................................*..................... 
// gap // ................................................................................... - trn2 v19.2D, v30.2D, v27.2D // .............................e..................................................... + str q26, [x1], #64 // ...............................................................................*... // gap // ................................................................................... - trn2 v26.4S, v13.4S, v29.4S // ..........................................................................*........ + ldr q17, [x4], #(6*16) // .................................e................................................. // gap // ................................................................................... - mul v17.8H, v19.8H, v6.8H // .......................................e........................................... // gap // ................................................................................... - trn1 v12.4S, v23.4S, v25.4S // .......................................................................*........... // gap // ................................................................................... - trn2 v1.4S, v23.4S, v25.4S // ........................................................................*.......... + str q18, [x1, #-48] // ................................................................................*.. // gap // ................................................................................... - sub v21.8H, v0.8H, v31.8H // ...............................................e................................... + ldr q18, [x1, #16] // .e................................................................................. // gap // ................................................................................... - trn1 v9.2D, v12.2D, v15.2D // .............................................................................*..... 
// gap // ................................................................................... - trn2 v15.2D, v12.2D, v15.2D // ...........................................................................*....... // gap // ................................................................................... - str_vi v9, x1, 64 // ...............................................................................*... + ldr q27, [x4, #-16] // ......................................e............................................ // gap // ................................................................................... - trn2 v13.2D, v1.2D, v26.2D // ............................................................................*...... // gap // ................................................................................... - str_vo v15, x1, -32 // .................................................................................*. // gap // ................................................................................... - sqrdmulh v18.8H, v19.8H, v11.8H // ........................................e.......................................... + sub v2.8H, v18.8H, v12.8H // .............e..................................................................... // gap // ................................................................................... - str_vo v13, x1, -16 // ..................................................................................* + add v18.8H, v18.8H, v12.8H // ..............e.................................................................... // gap // ................................................................................... - mul v19.8H, v21.8H, v14.8H // ......................................................e............................ + str q31, [x1, #-32] // .................................................................................*. // gap // ................................................................................... 
- sqrdmulh v13.8H, v21.8H, v2.8H // .......................................................e........................... + sqrdmulh v6.8H, v2.8H, v14.H[5] // ....................e.............................................................. // gap // ................................................................................... - mls v17.8H, v18.8H, v7.H[0] // .........................................e......................................... + sqrdmulh v28.8H, v18.8H, v14.H[3] // ...............e................................................................... // gap // ................................................................................... - trn1 v20.2D, v1.2D, v26.2D // ..............................................................................*.... + mul v18.8H, v18.8H, v14.H[2] // ................e.................................................................. // gap // ................................................................................... - trn1 v14.2D, v30.2D, v27.2D // ...............................e................................................... + str q4, [x1, #-16] // ..................................................................................* // gap // ................................................................................... - mls v19.8H, v13.8H, v7.H[0] // ........................................................e.......................... + ldr q12, [x4, #-80] // ..................................e................................................ // gap // ................................................................................... - sub v28.8H, v14.8H, v17.8H // ..........................................e........................................ // gap // ................................................................................... - add v5.8H, v14.8H, v17.8H // ...........................................e....................................... 
// gap // ................................................................................... - str_vo v20, x1, -48 // ................................................................................*.. + mls v18.8H, v28.8H, v7.H[0] // .................e................................................................. // gap // ................................................................................... - - // original source code - // ldr_vo v8, x1, 0 // ....e................................................................................................................................................................ || ......e.................................................................................................................................................................................... - // ldr_vo v9, x1, 16 // ..e.................................................................................................................................................................. || ...e....................................................................................................................................................................................... - // ldr_vo v10, x1, 32 // ........e............................................................................................................................................................ || ..............e............................................................................................................................................................................ - // ldr_vo v11, x1, 48 // ......e.............................................................................................................................................................. || ..........e................................................................................................................................................................................ 
- // ldr_vi v0, x3, 16 // .......e............................................................................................................................................................. || ............e.............................................................................................................................................................................. - // mul v24.8H, v10.8H, v0.H[0] // .................e................................................................................................................................................... || .........................e................................................................................................................................................................. - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ...................e................................................................................................................................................. || ...........................e............................................................................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ........................e............................................................................................................................................ || ................................e.......................................................................................................................................................... - // sub v10.8H, v8.8H, v24.8H // .............................e....................................................................................................................................... || ......................................e.................................................................................................................................................... 
- // add v8.8H, v8.8H, v24.8H // ..............................e...................................................................................................................................... || .......................................e................................................................................................................................................... - // mul v24.8H, v11.8H, v0.H[0] // ..........e.......................................................................................................................................................... || .................e......................................................................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // ...........e......................................................................................................................................................... || ..................e........................................................................................................................................................................ - // mls v24.8H, v11.8H, v7.H[0] // ..............e...................................................................................................................................................... || ......................e.................................................................................................................................................................... - // sub v11.8H, v9.8H, v24.8H // ..................e.................................................................................................................................................. || ..........................e................................................................................................................................................................ 
- // add v9.8H, v9.8H, v24.8H // ....................e................................................................................................................................................ || ............................e.............................................................................................................................................................. - // mul v24.8H, v9.8H, v0.H[2] // .........................e........................................................................................................................................... || .................................e......................................................................................................................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ..........................e.......................................................................................................................................... || ..................................e........................................................................................................................................................ - // mls v24.8H, v9.8H, v7.H[0] // ...............................e..................................................................................................................................... || ........................................e.................................................................................................................................................. - // sub v9.8H, v8.8H, v24.8H // ...................................e................................................................................................................................. || ............................................e.............................................................................................................................................. 
- // add v8.8H, v8.8H, v24.8H // ....................................e................................................................................................................................ || .............................................e............................................................................................................................................. - // mul v24.8H, v11.8H, v0.H[4] // .....................e............................................................................................................................................... || .............................e............................................................................................................................................................. - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ......................e.............................................................................................................................................. || ..............................e............................................................................................................................................................ - // mls v24.8H, v11.8H, v7.H[0] // ...........................e......................................................................................................................................... || ...................................e....................................................................................................................................................... - // sub v11.8H, v10.8H, v24.8H // ................................e.................................................................................................................................... || .........................................e................................................................................................................................................. 
- // add v10.8H, v10.8H, v24.8H // .................................e................................................................................................................................... || ..........................................e................................................................................................................................................ - // trn1 v25.4S, v8.4S, v9.4S // ........................................e............................................................................................................................ || ..................................................e........................................................................................................................................ - // trn2 v26.4S, v8.4S, v9.4S // ..........................................e.......................................................................................................................... || ....................................................e...................................................................................................................................... - // trn1 v27.4S, v10.4S, v11.4S // .......................................e............................................................................................................................. || .................................................e......................................................................................................................................... - // trn2 v28.4S, v10.4S, v11.4S // ...........................................e......................................................................................................................... || .....................................................e..................................................................................................................................... 
- // trn2 v10.2D, v25.2D, v27.2D // ............................................................e........................................................................................................ || .......................................................................e................................................................................................................... - // trn2 v11.2D, v26.2D, v28.2D // ..............................................e...................................................................................................................... || ........................................................e.................................................................................................................................. - // trn1 v8.2D, v25.2D, v27.2D // .............................................................................e....................................................................................... || ........................................................................................e.................................................................................................. - // trn1 v9.2D, v26.2D, v28.2D // ...............................................e..................................................................................................................... || .........................................................e................................................................................................................................. - // ldr_vi v0, x4, 96 // .............e....................................................................................................................................................... || ....................e...................................................................................................................................................................... 
- // ldr_vo v4, x4, -80 // e.................................................................................................................................................................... || e.......................................................................................................................................................................................... - // ldr_vo v1, x4, -64 // .........................................................................................................................*........................................... || .............................................................................................................................................*............................................. - // ldr_vo v5, x4, -48 // ...............................................................................................................*..................................................... || ..................................................................................................................................*........................................................ - // ldr_vo v2, x4, -32 // ......................................................e.............................................................................................................. || ................................................................e.......................................................................................................................... - // ldr_vo v6, x4, -16 // .....e............................................................................................................................................................... || ........e.................................................................................................................................................................................. 
- // mul v24.8H, v10.8H, v0.8H // ..............................................................e...................................................................................................... || .........................................................................e................................................................................................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // .......................................................................e............................................................................................. || ..................................................................................e........................................................................................................ - // mls v24.8H, v10.8H, v7.H[0] // ...........................................................................e......................................................................................... || ......................................................................................e.................................................................................................... - // sub v10.8H, v8.8H, v24.8H // ...............................................................................e..................................................................................... || ..........................................................................................e................................................................................................ - // add v8.8H, v8.8H, v24.8H // ................................................................................e.................................................................................... || ...........................................................................................e............................................................................................... 
- // mul v24.8H, v11.8H, v0.8H // ................................................e.................................................................................................................... || ..........................................................e................................................................................................................................ - // sqrdmulh v11.8H, v11.8H, v4.8H // ...................................................e................................................................................................................. || .............................................................e............................................................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // .........................................................e........................................................................................................... || ....................................................................e...................................................................................................................... - // sub v11.8H, v9.8H, v24.8H // .................................................................e................................................................................................... || ............................................................................e.............................................................................................................. - // add v9.8H, v9.8H, v24.8H // .....................................................................................................................*............................................... || .........................................................................................................................................*................................................. 
- // mul v24.8H, v9.8H, v1.8H // ............................................................................................................................*........................................ || .................................................................................................................................................*......................................... - // sqrdmulh v9.8H, v9.8H, v5.8H // ........................................................................................................................*............................................ || ............................................................................................................................................*.............................................. - // mls v24.8H, v9.8H, v7.H[0] // ................................................................................................................................*.................................... || .....................................................................................................................................................*..................................... - // sub v9.8H, v8.8H, v24.8H // ....................................................................................................................................*................................ || .........................................................................................................................................................*................................. - // add v8.8H, v8.8H, v24.8H // .....................................................................................................................................*............................... || ..........................................................................................................................................................*................................ 
- // mul v24.8H, v11.8H, v2.8H // .........................................................................e........................................................................................... || ....................................................................................e...................................................................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // ..........................................................................e.......................................................................................... || .....................................................................................e..................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ..............................................................................e...................................................................................... || .........................................................................................e................................................................................................. - // sub v11.8H, v10.8H, v24.8H // ..................................................................................*.................................................................................. || .............................................................................................*............................................................................................. - // add v10.8H, v10.8H, v24.8H // ....................................................................................*................................................................................ || ................................................................................................*.......................................................................................... 
- // sqdmulh v25.8H, v8.8H, v7.H[1] // ........................................................................................................................................*............................ || .............................................................................................................................................................*............................. - // srshr v25.8H, v25.8H, #11 // ...........................................................................................................................................*......................... || .................................................................................................................................................................*......................... - // mls v8.8H, v25.8H, v7.H[0] // ..............................................................................................................................................*...................... || ....................................................................................................................................................................*...................... - // sqdmulh v25.8H, v9.8H, v7.H[1] // .......................................................................................................................................*............................. || ............................................................................................................................................................*.............................. - // srshr v25.8H, v25.8H, #11 // ..........................................................................................................................................*.......................... || ................................................................................................................................................................*.......................... 
- // mls v9.8H, v25.8H, v7.H[0] // .............................................................................................................................................*....................... || ...................................................................................................................................................................*....................... - // sqdmulh v25.8H, v10.8H, v7.H[1] // ............................................................................................*........................................................................ || ..............................................................................................................*............................................................................ - // srshr v25.8H, v25.8H, #11 // ..................................................................................................*.................................................................. || .....................................................................................................................*..................................................................... - // mls v10.8H, v25.8H, v7.H[0] // ..........................................................................................................*.......................................................... || .............................................................................................................................*............................................................. - // sqdmulh v25.8H, v11.8H, v7.H[1] // ......................................................................................*.............................................................................. || ...................................................................................................*....................................................................................... 
- // srshr v25.8H, v25.8H, #11 // ...............................................................................................*..................................................................... || .................................................................................................................*......................................................................... - // mls v11.8H, v25.8H, v7.H[0] // ...................................................................................................*................................................................. || ......................................................................................................................*.................................................................... - // trn1 v25.4S, v8.4S, v9.4S // ..................................................................................................................................................*.................. || ........................................................................................................................................................................*.................. - // trn2 v26.4S, v8.4S, v9.4S // ...................................................................................................................................................*................. || .........................................................................................................................................................................*................. - // trn1 v27.4S, v10.4S, v11.4S // ...............................................................................................................................*..................................... || ....................................................................................................................................................*...................................... 
- // trn2 v28.4S, v10.4S, v11.4S // ................................................................................................................................................*.................... || ......................................................................................................................................................................*.................... - // trn2 v10.2D, v25.2D, v27.2D // ......................................................................................................................................................*.............. || ............................................................................................................................................................................*.............. - // trn2 v11.2D, v26.2D, v28.2D // ........................................................................................................................................................*............ || ..............................................................................................................................................................................*............ - // trn1 v8.2D, v25.2D, v27.2D // .....................................................................................................................................................*............... || ...........................................................................................................................................................................*............... - // trn1 v9.2D, v26.2D, v28.2D // ...............................................................................................................................................................*..... || .....................................................................................................................................................................................*..... 
- // str_vi v8, x1, 64 // .......................................................................................................................................................*............. || .............................................................................................................................................................................*............. - // str_vo v9, x1, -48 // ....................................................................................................................................................................* || ..........................................................................................................................................................................................* - // str_vo v10, x1, -32 // .........................................................................................................................................................*........... || ...............................................................................................................................................................................*........... - // str_vo v11, x1, -16 // ...........................................................................................................................................................*......... || .................................................................................................................................................................................*......... 
- - subs count, count, #1 + + // ------------------------------------------------------------------------- new position --------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q8, [x1, #(16*0)] // ..................................................................e..............'...................................................................~............ + // ldr q9, [x1, #(16*1)] // ......................................................................e..........'.......................................................................~........ + // ldr q10, [x1, #(16*2)] // e................................................................................'.~.............................................................................. + // ldr q11, [x1, #(16*3)] // .................................................e...............................'..................................................~............................. + // ldr q0, [x3], #16 // ....e............................................................................'.....~.......................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ....................................................e............................'.....................................................~.......................... + // mul v24.8h, v10.8h, v0.h[0] // .............e...................................................................'..............~................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................................................................*................................................................................ 
+ // sub v10.8h, v8.8h, v24.8h // .~...............................................................................'..*............................................................................. + // add v8.8h, v8.8h, v24.8h // ...~.............................................................................'....*........................................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .......................................................e.........................'........................................................~....................... + // mul v24.8h, v11.8h, v0.h[0] // .........................................................e.......................'..........................................................~..................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................................e....................'.............................................................~.................. + // sub v11.8h, v9.8h, v24.8h // ........................................................................e........'.........................................................................~...... + // add v9.8h, v9.8h, v24.8h // .........................................................................e.......'..........................................................................~..... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ............................................................................e....'.............................................................................~.. + // mul v24.8h, v9.8h, v0.h[2] // .............................................................................e...'..............................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ................................................................................e'................................................................................ 
+ // sub v9.8h, v8.8h, v24.8h // ........~........................................................................'.........*...................................................................... + // add v8.8h, v8.8h, v24.8h // .......~.........................................................................'........*....................................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ...........................................................................e.....'............................................................................~... + // mul v24.8h, v11.8h, v0.h[4] // .................................................................................'*............................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..~..............................................................................'...*............................................................................ + // sub v11.8h, v10.8h, v24.8h // .....~...........................................................................'......*......................................................................... + // add v10.8h, v10.8h, v24.8h // ......~..........................................................................'.......*........................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ............~....................................................................'.............*.................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ...........~.....................................................................'............*................................................................... + // trn1 v27.4s, v10.4s, v11.4s // .........~.......................................................................'..........*..................................................................... 
+ // trn2 v28.4s, v10.4s, v11.4s // ..........~......................................................................'...........*.................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...............~.................................................................'................*............................................................... + // trn2 v11.2d, v26.2d, v28.2d // ..............~..................................................................'...............*................................................................ + // trn1 v8.2d, v25.2d, v27.2d // ...............................~.................................................'................................*............................................... + // trn1 v9.2d, v26.2d, v28.2d // ....................~............................................................'.....................*.......................................................... + // ldr q0, [ x4], #(6*16) // ....................................................................e............'.....................................................................~.......... + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............................................................................e.'................................................................................ + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..........................................................e......................'...........................................................~.................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ......................~..........................................................'.......................*........................................................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // ..........................~......................................................'...........................*.................................................... 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // .......................................................................e.........'........................................................................~....... + // sqrdmulh v27.8h, v10.8h, v4.8h // ..................~..............................................................'...................*............................................................ + // mul v24.8h, v10.8h, v0.8h // ...................~.............................................................'....................*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................~.........................................................'........................*....................................................... + // sub v10.8h, v8.8h, v24.8h // ...................................~.............................................'....................................*........................................... + // add v8.8h, v8.8h, v24.8h // .................................~...............................................'..................................*............................................. + // sqrdmulh v27.8h, v11.8h, v4.8h // .................~...............................................................'..................*............................................................. + // mul v24.8h, v11.8h, v0.8h // ................~................................................................'.................*.............................................................. + // mls v24.8h, v27.8h, v7.h[0] // .....................~...........................................................'......................*......................................................... + // sub v11.8h, v9.8h, v24.8h // ........................~........................................................'.........................*...................................................... 
+ // add v9.8h, v9.8h, v24.8h // .........................~.......................................................'..........................*..................................................... + // sqrdmulh v27.8h, v9.8h, v5.8h // ...........................~.....................................................'............................*................................................... + // mul v24.8h, v9.8h, v1.8h // ............................~....................................................'.............................*.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ................................~................................................'.................................*.............................................. + // sub v9.8h, v8.8h, v24.8h // ....................................~............................................'.....................................*.......................................... + // add v8.8h, v8.8h, v24.8h // .....................................~...........................................'......................................*......................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ..............................~..................................................'...............................*................................................ + // mul v24.8h, v11.8h, v2.8h // .............................~...................................................'..............................*................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................~..............................................'...................................*............................................ + // sub v11.8h, v10.8h, v24.8h // ..........................................~......................................'...........................................*.................................... 
+ // add v10.8h, v10.8h, v24.8h // ......................................~..........................................'.......................................*........................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ........................................~........................................'.........................................*...................................... + // srshr v25.8h, v25.8h, #11 // ............................................~....................................'.............................................*.................................. + // mls v8.8h, v25.8h, v7.h[0] // ...............................................~.................................'................................................*............................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .......................................~.........................................'........................................*....................................... + // srshr v25.8h, v25.8h, #11 // ...........................................~.....................................'............................................*................................... + // mls v9.8h, v25.8h, v7.h[0] // ..............................................~..................................'...............................................*................................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // .........................................~.......................................'..........................................*..................................... + // srshr v25.8h, v25.8h, #11 // .............................................~...................................'..............................................*................................. + // mls v10.8h, v25.8h, v7.h[0] // ......................................................~..........................'.......................................................*........................ 
+ // sqdmulh v25.8h, v11.8h, v7.h[1] // ................................................~................................'.................................................*.............................. + // srshr v25.8h, v25.8h, #11 // .....................................................~...........................'......................................................*......................... + // mls v11.8h, v25.8h, v7.h[0] // ........................................................~........................'.........................................................*...................... + // trn1 v25.4s, v8.4s, v9.4s // ..................................................~..............................'...................................................*............................ + // trn2 v26.4s, v8.4s, v9.4s // ...................................................~.............................'....................................................*........................... + // trn1 v27.4s, v10.4s, v11.4s // .............................................................~...................'..............................................................*................. + // trn2 v28.4s, v10.4s, v11.4s // ...........................................................~.....................'............................................................*................... + // trn2 v10.2d, v25.2d, v27.2d // ................................................................~................'.................................................................*.............. + // trn2 v11.2d, v26.2d, v28.2d // ...............................................................~.................'................................................................*............... + // trn1 v8.2d, v25.2d, v27.2d // .................................................................~...............'..................................................................*............. 
+ // trn1 v9.2d, v26.2d, v28.2d // ..............................................................~..................'...............................................................*................ + // str q8, [x1], #64 // ...................................................................~.............'....................................................................*........... + // str q9, [x1, #(-(64) + 16*1)] // .....................................................................~...........'......................................................................*......... + // str q10, [x1, #(-(64) + 16*2)] // ..........................................................................~......'...........................................................................*.... + // str q11, [x1, #(-(64) + 16*3)] // ..............................................................................~..'...............................................................................* + + sub count, count, #1 cbnz count, layer4567_start - add v4.8H, v0.8H, v31.8H // .........*........................ - // gap // .................................. - ldr_vo v13, x4, -48 // ........*......................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - ldr_vo v15, x4, -64 // ...........*...................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - add v23.8H, v28.8H, v19.8H // .*................................ - // gap // .................................. - sqrdmulh v8.8H, v4.8H, v13.8H // ..........*....................... - // gap // .................................. - mul v30.8H, v4.8H, v15.8H // ............*..................... - // gap // .................................. - sqdmulh v4.8H, v23.8H, v7.H[1] // ...*.............................. 
- // gap // .................................. - sub v6.8H, v28.8H, v19.8H // *................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - mls v30.8H, v8.8H, v7.H[0] // ..............*................... - // gap // .................................. - srshr v15.8H, v4.8H, #11 // .....*............................ - // gap // .................................. - sqdmulh v17.8H, v6.8H, v7.H[1] // ..*............................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - sub v13.8H, v5.8H, v30.8H // ...............*.................. - // gap // .................................. - add v4.8H, v5.8H, v30.8H // ................*................. - // gap // .................................. - mls v23.8H, v15.8H, v7.H[0] // .......*.......................... - // gap // .................................. - sqdmulh v28.8H, v13.8H, v7.H[1] // .................*................ - // gap // .................................. - sqdmulh v26.8H, v4.8H, v7.H[1] // ..................*............... - // gap // .................................. - srshr v18.8H, v17.8H, #11 // ....*............................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - srshr v16.8H, v28.8H, #11 // ...................*.............. - // gap // .................................. - srshr v22.8H, v26.8H, #11 // ....................*............. - // gap // .................................. - mls v6.8H, v18.8H, v7.H[0] // ......*........................... - // gap // .................................. - mls v13.8H, v16.8H, v7.H[0] // .....................*............ - // gap // .................................. - mls v4.8H, v22.8H, v7.H[0] // ......................*........... 
- // gap // .................................. - // gap // .................................. - // gap // .................................. - trn1 v19.4S, v23.4S, v6.4S // .............*.................... - // gap // .................................. - trn2 v14.4S, v23.4S, v6.4S // .......................*.......... - // gap // .................................. - trn1 v23.4S, v4.4S, v13.4S // ........................*......... - // gap // .................................. - trn2 v6.4S, v4.4S, v13.4S // .........................*........ - // gap // .................................. - // gap // .................................. - // gap // .................................. - trn2 v4.2D, v23.2D, v19.2D // ...........................*...... - // gap // .................................. - trn2 v13.2D, v6.2D, v14.2D // .............................*.... - // gap // .................................. - str_vo v4, x1, 32 // ..............................*... - // gap // .................................. - trn1 v9.2D, v23.2D, v19.2D // ..........................*....... - // gap // .................................. - str_vo v13, x1, 48 // ...............................*.. - // gap // .................................. - trn1 v12.2D, v6.2D, v14.2D // ................................*. - // gap // .................................. - str_vi v9, x1, 64 // ............................*..... - // gap // .................................. - // gap // .................................. - // gap // .................................. - str_vo v12, x1, -48 // .................................* - // gap // .................................. - - // original source code - // sub v29.8H, v28.8H, v19.8H // .......*.......................... || .........*................................ - // add v13.8H, v28.8H, v19.8H // ...*.............................. || .....*.................................... - // sqdmulh v10.8H, v29.8H, v7.H[1] // ..........*....................... 
|| .............*............................ - // sqdmulh v1.8H, v13.8H, v7.H[1] // ......*........................... || ........*................................. - // srshr v10.8H, v10.8H, #11 // ................*................. || ....................*..................... - // srshr v18.8H, v1.8H, #11 // .........*........................ || ............*............................. - // mls v29.8H, v10.8H, v7.H[0] // ...................*.............. || ........................*................. - // mls v13.8H, v18.8H, v7.H[0] // .............*.................... || .................*........................ - // ldr_vo v4, x4, -48 // .*................................ || .*........................................ - // add v17.8H, v0.8H, v31.8H // *................................. || *......................................... - // sqrdmulh v31.8H, v17.8H, v4.8H // ....*............................. || ......*................................... - // ldr_vo v4, x4, -64 // ..*............................... || ...*...................................... - // mul v3.8H, v17.8H, v4.8H // .....*............................ || .......*.................................. - // trn1 v15.4S, v13.4S, v29.4S // ......................*........... || ............................*............. - // mls v3.8H, v31.8H, v7.H[0] // ........*......................... || ...........*.............................. - // sub v25.8H, v5.8H, v3.8H // ...........*...................... || ...............*.......................... - // add v23.8H, v5.8H, v3.8H // ............*..................... || ................*......................... - // sqdmulh v1.8H, v25.8H, v7.H[1] // ..............*................... || ..................*....................... - // sqdmulh v22.8H, v23.8H, v7.H[1] // ...............*.................. || ...................*...................... - // srshr v28.8H, v1.8H, #11 // .................*................ 
|| ......................*................... - // srshr v19.8H, v22.8H, #11 // ..................*............... || .......................*.................. - // mls v25.8H, v28.8H, v7.H[0] // ....................*............. || .........................*................ - // mls v23.8H, v19.8H, v7.H[0] // .....................*............ || ..........................*............... - // trn2 v26.4S, v13.4S, v29.4S // .......................*.......... || .............................*............ - // trn1 v12.4S, v23.4S, v25.4S // ........................*......... || ..............................*........... - // trn2 v1.4S, v23.4S, v25.4S // .........................*........ || ...............................*.......... - // trn1 v9.2D, v12.2D, v15.2D // .............................*.... || ....................................*..... - // trn2 v15.2D, v12.2D, v15.2D // ..........................*....... || .................................*........ - // str_vi v9, x1, 64 // ................................*. || .......................................*.. - // trn2 v13.2D, v1.2D, v26.2D // ...........................*...... || ..................................*....... - // str_vo v15, x1, -32 // ............................*..... || ...................................*...... - // str_vo v13, x1, -16 // ..............................*... || .....................................*.... - // trn1 v20.2D, v1.2D, v26.2D // ...............................*.. || ......................................*... 
- // str_vo v20, x1, -48 // .................................* || .........................................* - + // Instructions: 63 + // Expected cycles: 70 + // Expected IPC: 0.90 + // + // Cycle bound: 70.0 + // IPC bound: 0.90 + // + // Wall time: 14.39s + // User time: 14.39s + // + // --------------------- original position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------------ + mls v3.8H, v25.8H, v7.H[0] // *.............................................................. + // gap // ............................................................... + mul v15.8H, v2.8H, v14.H[4] // .*............................................................. + // gap // ............................................................... + ldr q31, [x4, #-32] // .........................*..................................... + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + add v26.8H, v29.8H, v3.8H // ....*.......................................................... + // gap // ............................................................... + mls v15.8H, v6.8H, v7.H[0] // ...*........................................................... + // gap // ............................................................... + sub v25.8H, v29.8H, v3.8H // ..*............................................................ + // gap // ............................................................... + sub v29.8H, v26.8H, v18.8H // ........*...................................................... + // gap // ............................................................... + add v18.8H, v26.8H, v18.8H // .......*....................................................... + // gap // ............................................................... 
+ add v6.8H, v25.8H, v15.8H // ......*........................................................ + // gap // ............................................................... + sub v16.8H, v25.8H, v15.8H // .....*......................................................... + // gap // ............................................................... + trn1 v26.4S, v18.4S, v29.4S // ............*.................................................. + // gap // ............................................................... + trn2 v25.4S, v18.4S, v29.4S // ...........*................................................... + // gap // ............................................................... + trn2 v15.4S, v6.4S, v16.4S // ..........*.................................................... + // gap // ............................................................... + trn1 v19.4S, v6.4S, v16.4S // .........*..................................................... + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + trn2 v6.2D, v25.2D, v15.2D // .............*................................................. + // gap // ............................................................... + trn2 v29.2D, v26.2D, v19.2D // ..............*................................................ + // gap // ............................................................... + sqrdmulh v18.8H, v6.8H, v12.8H // ................*.............................................. + // gap // ............................................................... + mul v14.8H, v6.8H, v17.8H // ...............*............................................... + // gap // ............................................................... + sqrdmulh v16.8H, v29.8H, v12.8H // .................*............................................. 
+ // gap // ............................................................... + mul v12.8H, v29.8H, v17.8H // ..................*............................................ + // gap // ............................................................... + trn1 v29.2D, v25.2D, v15.2D // ...................*........................................... + // gap // ............................................................... + mls v14.8H, v18.8H, v7.H[0] // ....................*.......................................... + // gap // ............................................................... + ldr q25, [x4, #-48] // .....................*......................................... + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + trn1 v18.2D, v26.2D, v19.2D // ..............................*................................ + // gap // ............................................................... + sub v6.8H, v29.8H, v14.8H // .......................*....................................... + // gap // ............................................................... + add v26.8H, v29.8H, v14.8H // ........................*...................................... + // gap // ............................................................... + mls v12.8H, v16.8H, v7.H[0] // ......................*........................................ + // gap // ............................................................... + sqrdmulh v29.8H, v6.8H, v27.8H // .............................*................................. + // gap // ............................................................... + mul v6.8H, v6.8H, v31.8H // ............................*.................................. + // gap // ............................................................... 
+ mul v31.8H, v26.8H, v10.8H // ...........................*................................... + // gap // ............................................................... + sqrdmulh v25.8H, v26.8H, v25.8H // ..........................*.................................... + // gap // ............................................................... + sub v26.8H, v18.8H, v12.8H // ..................................*............................ + // gap // ............................................................... + mls v6.8H, v29.8H, v7.H[0] // .................................*............................. + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + mls v31.8H, v25.8H, v7.H[0] // ...............................*............................... + // gap // ............................................................... + add v18.8H, v18.8H, v12.8H // ................................*.............................. + // gap // ............................................................... + sub v16.8H, v26.8H, v6.8H // .........................................*..................... + // gap // ............................................................... + add v19.8H, v26.8H, v6.8H // .....................................*......................... + // gap // ............................................................... + add v26.8H, v18.8H, v31.8H // ....................................*.......................... + // gap // ............................................................... + sub v20.8H, v18.8H, v31.8H // ...................................*........................... + // gap // ............................................................... + sqdmulh v29.8H, v19.8H, v7.H[1] // ........................................*...................... 
+ // gap // ............................................................... + sqdmulh v6.8H, v26.8H, v7.H[1] // .......................................*....................... + // gap // ............................................................... + sqdmulh v31.8H, v20.8H, v7.H[1] // ......................................*........................ + // gap // ............................................................... + sqdmulh v18.8H, v16.8H, v7.H[1] // ...............................................*............... + // gap // ............................................................... + srshr v29.8H, v29.8H, #11 // ............................................*.................. + // gap // ............................................................... + srshr v6.8H, v6.8H, #11 // ...........................................*................... + // gap // ............................................................... + srshr v31.8H, v31.8H, #11 // ..........................................*.................... + // gap // ............................................................... + srshr v18.8H, v18.8H, #11 // ..................................................*............ + // gap // ............................................................... + mls v26.8H, v6.8H, v7.H[0] // ..............................................*................ + // gap // ............................................................... + mls v20.8H, v31.8H, v7.H[0] // .............................................*................. + // gap // ............................................................... + mls v16.8H, v18.8H, v7.H[0] // ....................................................*.......... + // gap // ............................................................... + mls v19.8H, v29.8H, v7.H[0] // ...................................................*........... + // gap // ............................................................... 
+ // gap // ............................................................... + // gap // ............................................................... + trn1 v25.4S, v26.4S, v20.4S // ................................................*.............. + // gap // ............................................................... + trn2 v26.4S, v26.4S, v20.4S // .................................................*............. + // gap // ............................................................... + trn2 v31.4S, v19.4S, v16.4S // .....................................................*......... + // gap // ............................................................... + trn1 v6.4S, v19.4S, v16.4S // ......................................................*........ + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + trn2 v29.2D, v26.2D, v31.2D // ........................................................*...... + // gap // ............................................................... + trn2 v18.2D, v25.2D, v6.2D // .........................................................*..... + // gap // ............................................................... + str q29, [x1, #48] // ..............................................................* + // gap // ............................................................... + trn1 v29.2D, v26.2D, v31.2D // .......................................................*....... + // gap // ............................................................... + str q18, [x1, #32] // .............................................................*. + // gap // ............................................................... + trn1 v18.2D, v25.2D, v6.2D // ..........................................................*.... 
+ // gap // ............................................................... + str q29, [x1, #16] // ............................................................*.. + // gap // ............................................................... + // gap // ............................................................... + // gap // ............................................................... + str q18, [x1], #64 // ...........................................................*... + // gap // ............................................................... + + // ------------------------ new position ------------------------> + // 0 25 50 + // |------------------------|------------------------|------------ + // mls v3.8H, v25.8H, v7.H[0] // *.............................................................. + // mul v25.8H, v2.8H, v14.H[4] // .*............................................................. + // sub v31.8H, v29.8H, v3.8H // .....*......................................................... + // mls v25.8H, v6.8H, v7.H[0] // ....*.......................................................... + // add v29.8H, v29.8H, v3.8H // ...*........................................................... + // sub v6.8H, v31.8H, v25.8H // .........*..................................................... + // add v31.8H, v31.8H, v25.8H // ........*...................................................... + // add v28.8H, v29.8H, v18.8H // .......*....................................................... + // sub v5.8H, v29.8H, v18.8H // ......*........................................................ + // trn1 v29.4S, v31.4S, v6.4S // .............*................................................. + // trn2 v6.4S, v31.4S, v6.4S // ............*.................................................. + // trn2 v18.4S, v28.4S, v5.4S // ...........*................................................... + // trn1 v11.4S, v28.4S, v5.4S // ..........*.................................................... 
+ // trn2 v8.2D, v18.2D, v6.2D // ..............*................................................ + // trn2 v25.2D, v11.2D, v29.2D // ...............*............................................... + // mul v26.8H, v8.8H, v17.8H // .................*............................................. + // sqrdmulh v31.8H, v8.8H, v12.8H // ................*.............................................. + // sqrdmulh v30.8H, v25.8H, v12.8H // ..................*............................................ + // mul v19.8H, v25.8H, v17.8H // ...................*........................................... + // trn1 v18.2D, v18.2D, v6.2D // ....................*.......................................... + // mls v26.8H, v31.8H, v7.H[0] // .....................*......................................... + // ldr q6, [x4, #-48] // ......................*........................................ + // mls v19.8H, v30.8H, v7.H[0] // ..........................*.................................... + // sub v1.8H, v18.8H, v26.8H // ........................*...................................... + // add v18.8H, v18.8H, v26.8H // .........................*..................................... + // ldr q26, [x4, #-32] // ..*............................................................ + // sqrdmulh v6.8H, v18.8H, v6.8H // ..............................*................................ + // mul v18.8H, v18.8H, v10.8H // .............................*................................. + // mul v26.8H, v1.8H, v26.8H // ............................*.................................. + // sqrdmulh v25.8H, v1.8H, v27.8H // ...........................*................................... + // trn1 v22.2D, v11.2D, v29.2D // .......................*....................................... + // mls v18.8H, v6.8H, v7.H[0] // .................................*............................. + // add v29.8H, v22.8H, v19.8H // ..................................*............................ 
+ // mls v26.8H, v25.8H, v7.H[0] // ................................*.............................. + // sub v31.8H, v22.8H, v19.8H // ...............................*............................... + // sub v25.8H, v29.8H, v18.8H // ......................................*........................ + // add v18.8H, v29.8H, v18.8H // .....................................*......................... + // add v17.8H, v31.8H, v26.8H // ....................................*.......................... + // sqdmulh v19.8H, v25.8H, v7.H[1] // .........................................*..................... + // sqdmulh v24.8H, v18.8H, v7.H[1] // ........................................*...................... + // sqdmulh v20.8H, v17.8H, v7.H[1] // .......................................*....................... + // sub v29.8H, v31.8H, v26.8H // ...................................*........................... + // srshr v19.8H, v19.8H, #11 // .............................................*................. + // srshr v10.8H, v24.8H, #11 // ............................................*.................. + // srshr v20.8H, v20.8H, #11 // ...........................................*................... + // mls v25.8H, v19.8H, v7.H[0] // ................................................*.............. + // mls v18.8H, v10.8H, v7.H[0] // ...............................................*............... + // sqdmulh v11.8H, v29.8H, v7.H[1] // ..........................................*.................... + // trn1 v26.4S, v18.4S, v25.4S // ...................................................*........... + // trn2 v22.4S, v18.4S, v25.4S // ....................................................*.......... + // srshr v9.8H, v11.8H, #11 // ..............................................*................ + // mls v17.8H, v20.8H, v7.H[0] // ..................................................*............ 
+ // mls v29.8H, v9.8H, v7.H[0] // .................................................*............. + // trn2 v13.4S, v17.4S, v29.4S // .....................................................*......... + // trn1 v29.4S, v17.4S, v29.4S // ......................................................*........ + // trn1 v18.2D, v22.2D, v13.2D // ..........................................................*.... + // trn2 v4.2D, v22.2D, v13.2D // .......................................................*....... + // trn2 v31.2D, v26.2D, v29.2D // ........................................................*...... + // trn1 v26.2D, v26.2D, v29.2D // ............................................................*.. + // str q26, [x1], #64 // ..............................................................* + // str q18, [x1, #-48] // .............................................................*. + // str q31, [x1, #-32] // ...........................................................*... + // str q4, [x1, #-16] // .........................................................*..... + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a72.s b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a72.s index 25e53c30..8dc7a487 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_a72.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -98,55 +68,55 @@ .macro barrett_reduce a vqdmulhq t0, \a, consts, 1 - srshr t0.8H, t0.8H, #11 + srshr t0.8h, t0.8h, #11 vmlsq \a, t0, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr 
qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +127,7 @@ str x29, [sp, 
#16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,19 +156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -211,7 +181,7 @@ roots: .text .global ntt_kyber_123_4567_manual_st4_opt_a72 - .global _ntt_kyber_123_4567_manual_st4_opt_a72 + .global _ntt_kyber_123_4567_manual_st4 .p2align 4 const_addr: .short 3329 @@ -337,1317 +307,1390 @@ _ntt_kyber_123_4567_manual_st4_opt_a72: load_roots_123 .p2align 2 - ldr_vo v23, x0, 192 // ..*......... - ldr_vo v14, x0, 448 // .*.......... - // gap // ............ - ldr_vo v8, x0, 256 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v4.8H, v14.8H, v0.H[1] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v14.8H, v14.8H, v0.H[0] // ...*........ 
- // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v30.8H, v8.8H, v0.H[1] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v14.8H, v4.8H, v7.H[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v17.8H, v8.8H, v0.H[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v17.8H, v30.8H, v7.H[0] // ...........* - // gap // ............ - // gap // ............ - add v30.8H, v23.8H, v14.8H // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v28.8H, v30.8H, v0.H[3] // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v22.8H, v30.8H, v0.H[2] // ..........*. - // gap // ............ - // gap // ............ - - // original source code - // ldr_vo v3, x0, 256 // ..*......... || .*................... - // ldr_vo v16, x0, 448 // .*.......... || *.................... - // ldr_vo v23, x0, 192 // *........... || *.................... - // mul v14.8H, v16.8H, v0.H[0] // ....*....... || ......*.............. - // sqrdmulh v28.8H, v16.8H, v0.H[1] // ...*........ || ....*................ - // mls v14.8H, v28.8H, v7.H[0] // ......*..... || ..........*.......... - // mul v17.8H, v3.8H, v0.H[0] // .......*.... || ............*........ - // add v9.8H, v23.8H, v14.8H // .........*.. || ...............*..... - // sqrdmulh v6.8H, v3.8H, v0.H[1] // .....*...... || ........*............ - // sqrdmulh v28.8H, v9.8H, v0.H[3] // ..........*. 
|| ..................*.. - // mul v22.8H, v9.8H, v0.H[2] // ...........* || ....................* - // mls v17.8H, v6.8H, v7.H[0] // ........*... || ..............*...... - + // Instructions: 9 + // Expected cycles: 11 + // Expected IPC: 0.82 + // + // Cycle bound: 11.0 + // IPC bound: 0.82 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q16, [x0, #320] // *............................. + // gap // .............................. + // gap // .............................. + ldr q24, [x0, #0] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q5, [x0, #128] // ........*..................... + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #448] // .*............................ + // gap // .............................. + // gap // .............................. + mul v6.8H, v16.8H, v0.H[0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v11.8H, v16.8H, v0.H[1] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v14.8H, v15.8H, v0.H[1] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v30.8H, v15.8H, v0.H[0] // .......*...................... + ldr q15, [x0, #64] // ....*......................... 
+ // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q22, [x0, #320] // *.............................. + // ldr q31, [x0, #448] // ...*........................... + // ldr q24, [x0, #0] // .*............................. + // mul v6.8H, v22.8H, v0.H[0] // ....*.......................... + // ldr q15, [x0, #64] // ........*...................... + // sqrdmulh v11.8H, v22.8H, v0.H[1] // .....*......................... + // sqrdmulh v14.8H, v31.8H, v0.H[1] // ......*........................ + // mul v30.8H, v31.8H, v0.H[0] // .......*....................... + // ldr q5, [x0, #128] // ..*............................ + sub count, count, #1 -.p2align 2 layer123_start: - ldr_vo v30, x0, 0 // *........................................................................... - sub v14.8H, v23.8H, v14.8H // ..........................*................................................. - ldr_vo v8, x0, 128 // ..*......................................................................... - ldr_vo v9, x0, 384 // ......*..................................................................... - ldr_vo v6, x0, 64 // .*.......................................................................... - mls v22.8H, v28.8H, v7.H[0] // ...................................*........................................ - ldr_vo v3, x0, 272 // ....e....................................................................... - ldr_vo v28, x0, 320 // .....*...................................................................... - // gap // ............................................................................ - mul v13.8H, v14.8H, v0.H[4] // ...........................................*................................ - ldr_vo v16, x0, 464 // .......e.................................................................... - ldr_vo v23, x0, 208 // ...e........................................................................ 
- sub v18.8H, v30.8H, v17.8H // ...........*................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v30.8H, v30.8H, v17.8H // ............*............................................................... - sqrdmulh v14.8H, v14.8H, v0.H[5] // ............................................*............................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v17.8H, v9.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v9.8H, v9.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- sqrdmulh v31.8H, v28.8H, v0.H[1] // ..............*............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v9.8H, v17.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v28.8H, v28.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v28.8H, v31.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- sub v17.8H, v8.8H, v9.8H // .....................*...................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v8.8H, v8.8H, v9.8H // ......................*..................................................... - mls v13.8H, v14.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v14.8H, v16.8H, v0.H[0] // .......................e.................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v9.8H, v6.8H, v28.8H // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v6.8H, v6.8H, v28.8H // ................*........................................................... - sqrdmulh v28.8H, v16.8H, v0.H[1] // ........................e................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - sqrdmulh v16.8H, v17.8H, v0.H[5] // .......................................*.................................... - sub v31.8H, v9.8H, v22.8H // ....................................*....................................... - // gap // ............................................................................ - add v9.8H, v9.8H, v22.8H // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.8H, v8.8H, v0.H[3] // .............................*.............................................. - sub v4.8H, v6.8H, v13.8H // ..............................................*............................. - // gap // ............................................................................ - add v6.8H, v6.8H, v13.8H // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v13.8H, v17.8H, v0.H[4] // ......................................*..................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v13.8H, v16.8H, v7.H[0] // ........................................*................................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v8.8H, v8.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v8.8H, v22.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v22.8H, v18.8H, v13.8H // .........................................*.................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v13.8H, v18.8H, v13.8H // ..........................................*................................. - mul v16.8H, v9.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v9.8H, v9.8H, v0.H[7] // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v18.8H, v30.8H, v8.8H // ...............................*............................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v30.8H, v30.8H, v8.8H // ................................*........................................... - mul v8.8H, v31.8H, v1.H[0] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v17.8H, v31.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v16.8H, v9.8H, v7.H[0] // ..................................................*......................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v9.8H, v6.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v8.8H, v17.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v17.8H, v30.8H, v16.8H // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v6.8H, v6.8H, v1.H[3] // ...........................................................*................ - add v30.8H, v30.8H, v16.8H // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - str_vo v17, x0, 64 // .....................................................................*...... - mul v16.8H, v4.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - str_vi v30, x0, 16 // ....................................................................*....... - sub v30.8H, v18.8H, v8.8H // ........................................................*................... - // gap // ............................................................................ - sqrdmulh v17.8H, v4.8H, v1.H[5] // ................................................................*........... - add v8.8H, v18.8H, v8.8H // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v9.8H, v6.8H, v7.H[0] // ............................................................*............... - str_vo v30, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - str_vo v8, x0, 112 // ......................................................................*..... - // gap // ............................................................................ - // gap // ............................................................................ 
- mls v14.8H, v28.8H, v7.H[0] // .........................e.................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v16.8H, v17.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - // gap // ............................................................................ - sub v30.8H, v13.8H, v9.8H // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v8.8H, v13.8H, v9.8H // ..............................................................*............. - mul v17.8H, v3.8H, v0.H[0] // ........e................................................................... - // gap // ............................................................................ - add v9.8H, v23.8H, v14.8H // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v6.8H, v3.8H, v0.H[1] // .........e.................................................................. - str_vo v30, x0, 304 // .........................................................................*.. 
- // gap // ............................................................................ - str_vo v8, x0, 240 // ........................................................................*... - sub v30.8H, v22.8H, v16.8H // ..................................................................*......... - // gap // ............................................................................ - add v8.8H, v22.8H, v16.8H // ...................................................................*........ - sqrdmulh v28.8H, v9.8H, v0.H[3] // ..................................e......................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v22.8H, v9.8H, v0.H[2] // .................................e.......................................... - str_vo v30, x0, 432 // ...........................................................................* - // gap // ............................................................................ - str_vo v8, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - mls v17.8H, v6.8H, v7.H[0] // ..........e................................................................. - // gap // ............................................................................ - // gap // ............................................................................ 
- - // original source code - // ldr_vo v8, x0, 0 // ......................................................................*.......................................................................... || ......................................................................*...................................................................... - // ldr_vo v9, x0, 64 // ..........................................................................*...................................................................... || .......................................................................*..................................................................... - // ldr_vo v10, x0, 128 // ........................................................................*........................................................................ || ......................................................................*...................................................................... - // ldr_vo v11, x0, 192 // ....e............................................................................................................................................ || .e........................................................................................................................................... - // ldr_vo v12, x0, 256 // e................................................................................................................................................ || e............................................................................................................................................ - // ldr_vo v13, x0, 320 // .............................................................................*................................................................... || ........................................................................*.................................................................... 
- // ldr_vo v14, x0, 384 // .........................................................................*....................................................................... || .......................................................................*..................................................................... - // ldr_vo v15, x0, 448 // ...e............................................................................................................................................. || .e........................................................................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // ..........................................................e...................................................................................... || .............................................................e............................................................................... - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............................................................e.................................................................................... || ...............................................................e............................................................................. - // mls v24.8H, v12.8H, v7.H[0] // .....................................................................e........................................................................... || .....................................................................e....................................................................... - // sub v12.8H, v8.8H, v24.8H // .................................................................................*............................................................... || ..........................................................................*.................................................................. 
- // add v8.8H, v8.8H, v24.8H // ..................................................................................*.............................................................. || ...........................................................................*................................................................. - // mul v24.8H, v13.8H, v0.H[0] // ........................................................................................*........................................................ || .....................................................................................*....................................................... - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ......................................................................................*.......................................................... || .................................................................................*........................................................... - // mls v24.8H, v13.8H, v7.H[0] // .........................................................................................*....................................................... || .......................................................................................*..................................................... - // sub v13.8H, v9.8H, v24.8H // ...............................................................................................*................................................. || .............................................................................................*............................................... - // add v9.8H, v9.8H, v24.8H // ..............................................................................................*.................................................. || ............................................................................................*................................................ 
- // mul v24.8H, v14.8H, v0.H[0] // .....................................................................................*........................................................... || ...............................................................................*............................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ....................................................................................*............................................................ || .............................................................................*............................................................... - // mls v24.8H, v14.8H, v7.H[0] // .......................................................................................*......................................................... || ...................................................................................*......................................................... - // sub v14.8H, v10.8H, v24.8H // ..........................................................................................*...................................................... || ........................................................................................*.................................................... - // add v10.8H, v10.8H, v24.8H // ...........................................................................................*..................................................... || .........................................................................................*................................................... - // mul v24.8H, v15.8H, v0.H[0] // .................e............................................................................................................................... || ...................e......................................................................................................................... 
- // sqrdmulh v15.8H, v15.8H, v0.H[1] // ....................e............................................................................................................................ || .....................e....................................................................................................................... - // mls v24.8H, v15.8H, v7.H[0] // ......................................................e.......................................................................................... || .........................................................e................................................................................... - // sub v15.8H, v11.8H, v24.8H // .......................................................................*......................................................................... || ......................................................................*...................................................................... - // add v11.8H, v11.8H, v24.8H // ...........................................................e..................................................................................... || ..............................................................e.............................................................................. - // mul v24.8H, v10.8H, v0.H[2] // .........................................................................................................*....................................... || .......................................................................................................*..................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // ....................................................................................................*............................................ || .................................................................................................*........................................... 
- // mls v24.8H, v10.8H, v7.H[0] // ..........................................................................................................*...................................... || .........................................................................................................*................................... - // sub v10.8H, v8.8H, v24.8H // ...............................................................................................................*................................. || ..............................................................................................................*.............................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................................*................................ || ...............................................................................................................*............................. - // mul v24.8H, v11.8H, v0.H[2] // ..................................................................e.............................................................................. || ...................................................................e......................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // .................................................................e............................................................................... || .................................................................e........................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...........................................................................*..................................................................... || .......................................................................*..................................................................... 
- // sub v11.8H, v9.8H, v24.8H // ..................................................................................................*.............................................. || ...............................................................................................*............................................. - // add v9.8H, v9.8H, v24.8H // ...................................................................................................*............................................. || ................................................................................................*............................................ - // mul v24.8H, v14.8H, v0.H[4] // .......................................................................................................*......................................... || ...................................................................................................*......................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // .................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.8H, v14.8H, v7.H[0] // ........................................................................................................*........................................ || .....................................................................................................*....................................... - // sub v14.8H, v12.8H, v24.8H // ...........................................................................................................*..................................... || ..........................................................................................................*.................................. 
- // add v12.8H, v12.8H, v24.8H // ............................................................................................................*.................................... || ...........................................................................................................*................................. - // mul v24.8H, v15.8H, v0.H[4] // ..............................................................................*.................................................................. || .........................................................................*................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ...................................................................................*............................................................. || ...........................................................................*................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ............................................................................................*.................................................... || .........................................................................................*................................................... - // sub v15.8H, v13.8H, v24.8H // .....................................................................................................*........................................... || .................................................................................................*........................................... - // add v13.8H, v13.8H, v24.8H // ......................................................................................................*.......................................... || ..................................................................................................*.......................................... 
- // mul v24.8H, v9.8H, v0.H[6] // .............................................................................................................*................................... || ...........................................................................................................*................................. - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ..............................................................................................................*.................................. || .............................................................................................................*............................... - // mls v24.8H, v9.8H, v7.H[0] // ...................................................................................................................*............................. || ...................................................................................................................*......................... - // sub v9.8H, v8.8H, v24.8H // ......................................................................................................................*.......................... || ........................................................................................................................*.................... - // add v8.8H, v8.8H, v24.8H // ........................................................................................................................*........................ || .........................................................................................................................*................... - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................*............................... || ...............................................................................................................*............................. 
- // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..................................................................................................................*.............................. || .................................................................................................................*........................... - // mls v24.8H, v11.8H, v7.H[0] // .....................................................................................................................*........................... || .......................................................................................................................*..................... - // sub v11.8H, v10.8H, v24.8H // ............................................................................................................................*.................... || ............................................................................................................................*................ - // add v10.8H, v10.8H, v24.8H // ..............................................................................................................................*.................. || .............................................................................................................................*............... - // mul v24.8H, v13.8H, v1.H[2] // ....................................................................................................................*............................ || .....................................................................................................................*....................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................................................................................*......................... || .........................................................................................................................*................... 
- // mls v24.8H, v13.8H, v7.H[0] // ...............................................................................................................................*................. || ...............................................................................................................................*............. - // sub v13.8H, v12.8H, v24.8H // ....................................................................................................................................*............ || ....................................................................................................................................*........ - // add v12.8H, v12.8H, v24.8H // .....................................................................................................................................*........... || .....................................................................................................................................*....... - // mul v24.8H, v15.8H, v1.H[4] // ..........................................................................................................................*...................... || ...........................................................................................................................*................. - // sqrdmulh v15.8H, v15.8H, v1.H[5] // .............................................................................................................................*................... || .............................................................................................................................*............... - // mls v24.8H, v15.8H, v7.H[0] // ...................................................................................................................................*............. || ...................................................................................................................................*......... 
- // sub v15.8H, v14.8H, v24.8H // ...........................................................................................................................................*..... || ........................................................................................................................................*.... - // add v14.8H, v14.8H, v24.8H // ............................................................................................................................................*.... || .........................................................................................................................................*... - // str_vi v8, x0, 16 // ...........................................................................................................................*..................... || ............................................................................................................................*................ - // str_vo v9, x0, 48 // .........................................................................................................................*....................... || ...........................................................................................................................*................. - // str_vo v10, x0, 112 // .................................................................................................................................*............... || ................................................................................................................................*............ - // str_vo v11, x0, 176 // ................................................................................................................................*................ || ...............................................................................................................................*............. 
- // str_vo v12, x0, 240 // ..........................................................................................................................................*...... || ........................................................................................................................................*.... - // str_vo v13, x0, 304 // .........................................................................................................................................*....... || .......................................................................................................................................*..... - // str_vo v14, x0, 368 // ................................................................................................................................................* || ............................................................................................................................................* - // str_vo v15, x0, 432 // ...............................................................................................................................................*. || ...........................................................................................................................................*. - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 49.10s + // User time: 49.10s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q26, [x0, #384] // ......*..................................................................... + ldr q9, [x0, #256] // ....*....................................................................... + mls v6.8H, v11.8H, v7.H[0] // ...............*............................................................ 
+ ldr q22, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + ldr q3, [x0, #192] // ...*........................................................................ + mls v30.8H, v14.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + ldr q31, [x0, #464] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v29.8H, v9.8H, v0.H[1] // ........*................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v20.8H, v15.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.8H, v9.8H, v0.H[0] // .........*.................................................................. + sub v14.8H, v15.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + add v15.8H, v3.8H, v30.8H // ...........................*................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v16.8H, v3.8H, v30.8H // ..........................*................................................. + sqrdmulh v25.8H, v26.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.8H, v15.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.8H, v15.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.8H, v29.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.8H, v16.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v11.8H, v26.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v2.8H, v24.8H, v13.8H // ...........*................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v13.8H, v24.8H, v13.8H // ............*............................................................... + mls v11.8H, v25.8H, v7.H[0] // ....................*....................................................... + ldr q24, [x0, #16] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v15.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v5.8H, v11.8H // .....................*...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v6.8H, v27.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v3.8H, v26.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v15.8H, v14.8H, v16.8H // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + add v14.8H, v14.8H, v16.8H // ...............................................*............................ + mul v27.8H, v26.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sub v10.8H, v20.8H, v6.8H // ....................................*....................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v12.8H, v20.8H, v6.8H // .....................................*...................................... + sqrdmulh v16.8H, v15.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + add v6.8H, v5.8H, v11.8H // ......................*..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v27.8H, v3.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v15.8H, v15.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v15.8H, v16.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.8H, v2.8H, v27.8H // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v27.8H, v2.8H, v27.8H // ..........................................*................................. + sqrdmulh v26.8H, v6.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.8H, v6.8H, v0.H[2] // .............................*.............................................. 
+ // gap // ............................................................................ + // gap // ............................................................................ + add v17.8H, v16.8H, v15.8H // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.8H, v16.8H, v15.8H // ..................................................................*......... + sqrdmulh v15.8H, v12.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q17, [x0, #384] // ..........................................................................*. + mls v6.8H, v26.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + str q28, [x0, #448] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + mul v16.8H, v12.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v15.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v13.8H, v6.8H // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v6.8H, v13.8H, v6.8H // ................................*........................................... + sqrdmulh v15.8H, v14.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v25.8H, v14.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + add v14.8H, v6.8H, v16.8H // ....................................................*....................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v6.8H, v6.8H, v16.8H // ...................................................*........................ + sqrdmulh v16.8H, v10.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q14, [x0], #(16) // ....................................................................*....... + mls v25.8H, v15.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + str q6, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + // gap // ............................................................................ + mul v14.8H, v10.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v14.8H, v16.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v15.8H, v27.8H, v25.8H // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.8H, v22.8H, v0.H[0] // ..............e............................................................. + add v16.8H, v27.8H, v25.8H // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q15, [x0, #304] // .........................................................................*.. + ldr q15, [x0, #64] // .e.......................................................................... + sqrdmulh v11.8H, v22.8H, v0.H[1] // .............e.............................................................. + str q16, [x0, #240] // ........................................................................*... + add v16.8H, v26.8H, v14.8H // .........................................................*.................. + // gap // ............................................................................ + sub v27.8H, v26.8H, v14.8H // ........................................................*................... + // gap // ............................................................................ + sqrdmulh v14.8H, v31.8H, v0.H[1] // .......................e.................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #112] // ......................................................................*..... + mul v30.8H, v31.8H, v0.H[0] // ........................e................................................... + // gap // ............................................................................ + str q27, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q5, [x0, #128] // ..e......................................................................... + + // ------------------------------------------------------------------ new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // ...................e.....................................................'.....................~.................................................... + // ldr q9, [x0, #(1*(512/8))] // ...............................................................e.........'.................................................................~........ + // ldr q10, [x0, #(2*(512/8))] // ........................................................................e'.......................................................................... + // ldr q11, [x0, #(3*(512/8))] // .~.......................................................................'...*...................................................................... 
+ // ldr q12, [x0, #(4*(512/8))] // .........................................................................'*......................................................................... + // ldr q13, [x0, #(5*(512/8))] // e........................................................................'..~....................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........................................................................*.......................................................................... + // ldr q15, [x0, #(7*(512/8))] // ...e.....................................................................'.....~.................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ....~....................................................................'......*................................................................... + // mul v24.8h, v12.8h, v0.h[0] // ......~..................................................................'........*................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .............~...........................................................'...............*.......................................................... + // sub v12.8h, v8.8h, v24.8h // ................~........................................................'..................*....................................................... + // add v8.8h, v8.8h, v24.8h // .................~.......................................................'...................*...................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ................................................................e........'..................................................................~....... 
+ // mul v24.8h, v13.8h, v0.h[0] // ............................................................e............'..............................................................~........... + // mls v24.8h, v27.8h, v7.h[0] // .........................................................................'.*........................................................................ + // sub v13.8h, v9.8h, v24.8h // .......~.................................................................'.........*................................................................ + // add v9.8h, v9.8h, v24.8h // .....~...................................................................'.......*.................................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..........~..............................................................'............*............................................................. + // mul v24.8h, v14.8h, v0.h[0] // ...............~.........................................................'.................*........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ..................~......................................................'....................*..................................................... + // sub v14.8h, v10.8h, v24.8h // ......................~..................................................'........................*................................................. + // add v10.8h, v10.8h, v24.8h // ...............................~.........................................'.................................*........................................ + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ....................................................................e....'......................................................................~... 
+ // mul v24.8h, v15.8h, v0.h[0] // ......................................................................e..'........................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ..~......................................................................'....*..................................................................... + // sub v15.8h, v11.8h, v24.8h // .........~...............................................................'...........*.............................................................. + // add v11.8h, v11.8h, v24.8h // ........~................................................................'..........*............................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....................................~...................................'.......................................*.................................. + // mul v24.8h, v10.8h, v0.h[2] // ......................................~..................................'........................................*................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................................~.............................'.............................................*............................ + // sub v10.8h, v8.8h, v24.8h // ...............................................~.........................'.................................................*........................ + // add v8.8h, v8.8h, v24.8h // ................................................~........................'..................................................*....................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ...........~.............................................................'.............*............................................................ 
+ // mul v24.8h, v11.8h, v0.h[2] // ............~............................................................'..............*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................~.................................................'.........................*................................................ + // sub v11.8h, v9.8h, v24.8h // ............................~............................................'..............................*........................................... + // add v9.8h, v9.8h, v24.8h // .............................~...........................................'...............................*.......................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ........................~................................................'..........................*............................................... + // mul v24.8h, v14.8h, v0.h[4] // ...........................~.............................................'.............................*............................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................~........................................'..................................*....................................... + // sub v14.8h, v12.8h, v24.8h // ...................................~.....................................'.....................................*.................................... + // add v12.8h, v12.8h, v24.8h // ....................................~....................................'......................................*................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ..............~..........................................................'................*......................................................... 
+ // mul v24.8h, v15.8h, v0.h[4] // ....................~....................................................'......................*................................................... + // mls v24.8h, v27.8h, v7.h[0] // .....................~...................................................'.......................*.................................................. + // sub v15.8h, v13.8h, v24.8h // .........................~...............................................'...........................*.............................................. + // add v13.8h, v13.8h, v24.8h // ..........................~..............................................'............................*............................................. + // sqrdmulh v27.8h, v9.8h, v0.h[7] // .........................................~...............................'...........................................*.............................. + // mul v24.8h, v9.8h, v0.h[6] // .............................................~...........................'...............................................*.......................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................~..........................'................................................*......................... + // sub v9.8h, v8.8h, v24.8h // ....................................................~....................'......................................................*................... + // add v8.8h, v8.8h, v24.8h // ...................................................~.....................'.....................................................*.................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // .....................................................~...................'.......................................................*.................. 
+ // mul v24.8h, v11.8h, v1.h[0] // .........................................................~...............'...........................................................*.............. + // mls v24.8h, v27.8h, v7.h[0] // ..........................................................~..............'............................................................*............. + // sub v11.8h, v10.8h, v24.8h // ...................................................................~.....'.....................................................................*.... + // add v10.8h, v10.8h, v24.8h // ..................................................................~......'....................................................................*..... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // .................................................~.......................'...................................................*...................... + // mul v24.8h, v13.8h, v1.h[2] // ..................................................~......................'....................................................*..................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................................~.................'.........................................................*................ + // sub v13.8h, v12.8h, v24.8h // ...........................................................~.............'.............................................................*............ + // add v12.8h, v12.8h, v24.8h // .............................................................~...........'...............................................................*.......... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..............................~..........................................'................................*......................................... 
+ // mul v24.8h, v15.8h, v1.h[4] // .................................~.......................................'...................................*...................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................~......................................'....................................*..................................... + // sub v15.8h, v14.8h, v24.8h // ........................................~................................'..........................................*............................... + // add v14.8h, v14.8h, v24.8h // .......................................~.................................'.........................................*................................ + // str q8, [x0], #(16) // ......................................................~..................'........................................................*................. + // str q9, [x0, #(-16 + 1*(512/8))] // ........................................................~................'..........................................................*............... + // str q10, [x0, #(-16 + 2*(512/8))] // .....................................................................~...'.......................................................................*.. + // str q11, [x0, #(-16 + 3*(512/8))] // .......................................................................~.'.........................................................................* + // str q12, [x0, #(-16 + 4*(512/8))] // .................................................................~.......'...................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ..............................................................~..........'................................................................*......... 
+ // str q14, [x0, #(-16 + 6*(512/8))] // ..........................................~..............................'............................................*............................. + // str q15, [x0, #(-16 + 7*(512/8))] // ............................................~............................'..............................................*........................... + + sub count, count, #1 cbnz count, layer123_start - sub v14.8H, v23.8H, v14.8H // .*.............................................................. - mls v22.8H, v28.8H, v7.H[0] // .....*.......................................................... - ldr_vo v30, x0, 0 // *............................................................... - ldr_vo v8, x0, 384 // ...*............................................................ - ldr_vo v9, x0, 128 // ..*............................................................. - // gap // ................................................................ - ldr_vo v6, x0, 320 // ......*......................................................... - ldr_vo v28, x0, 64 // ....*........................................................... - // gap // ................................................................ - mul v3.8H, v14.8H, v0.H[4] // .......*........................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v13.8H, v30.8H, v17.8H // ........*....................................................... - // gap // ................................................................ - // gap // ................................................................ - add v30.8H, v30.8H, v17.8H // .........*...................................................... - sqrdmulh v14.8H, v14.8H, v0.H[5] // ..........*..................................................... 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v16.8H, v8.8H, v0.H[1] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v8.8H, v8.8H, v0.H[0] // ............*................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v23.8H, v6.8H, v0.H[1] // .............*.................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v8.8H, v16.8H, v7.H[0] // ..............*................................................. - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v6.8H, v6.8H, v0.H[0] // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v6.8H, v23.8H, v7.H[0] // ................*............................................... - // gap // ................................................................ - // gap // ................................................................ - sub v16.8H, v9.8H, v8.8H // .................*.............................................. - // gap // ................................................................ - // gap // ................................................................ - add v8.8H, v9.8H, v8.8H // ..................*............................................. - mls v3.8H, v14.8H, v7.H[0] // ...................*............................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v14.8H, v16.8H, v0.H[5] // ......................*......................................... - // gap // ................................................................ 
- // gap // ................................................................ - add v9.8H, v28.8H, v6.8H // ....................*........................................... - // gap // ................................................................ - // gap // ................................................................ - sub v6.8H, v28.8H, v6.8H // .....................*.......................................... - sqrdmulh v28.8H, v8.8H, v0.H[3] // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v16.8H, v16.8H, v0.H[4] // ............................*................................... - sub v23.8H, v9.8H, v22.8H // .......................*........................................ - // gap // ................................................................ - add v9.8H, v9.8H, v22.8H // ........................*....................................... - // gap // ................................................................ - // gap // ................................................................ - mls v16.8H, v14.8H, v7.H[0] // .............................*.................................. - sub v14.8H, v6.8H, v3.8H // ..........................*..................................... - // gap // ................................................................ - add v6.8H, v6.8H, v3.8H // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ - mul v8.8H, v8.8H, v0.H[2] // ..............................*................................. 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v8.8H, v28.8H, v7.H[0] // ...............................*................................ - // gap // ................................................................ - // gap // ................................................................ - sub v22.8H, v13.8H, v16.8H // ................................*............................... - // gap // ................................................................ - // gap // ................................................................ - add v28.8H, v13.8H, v16.8H // .................................*.............................. - mul v3.8H, v9.8H, v0.H[6] // ..................................*............................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v9.8H, v9.8H, v0.H[7] // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - sub v13.8H, v30.8H, v8.8H // ....................................*........................... - // gap // ................................................................ - // gap // ................................................................ - add v30.8H, v30.8H, v8.8H // .....................................*.......................... 
- mul v8.8H, v23.8H, v1.H[0] // ......................................*......................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v16.8H, v23.8H, v1.H[1] // .......................................*........................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v3.8H, v9.8H, v7.H[0] // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v9.8H, v6.8H, v1.H[2] // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v8.8H, v16.8H, v7.H[0] // ..........................................*..................... 
- // gap // ................................................................ - // gap // ................................................................ - add v16.8H, v30.8H, v3.8H // .............................................*.................. - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v6.8H, v6.8H, v1.H[3] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str_vi v16, x0, 16 // ................................................*............... - sqrdmulh v16.8H, v14.8H, v1.H[5] // ..................................................*............. - // gap // ................................................................ - sub v23.8H, v13.8H, v8.8H // .................................................*.............. - // gap // ................................................................ - // gap // ................................................................ - mul v14.8H, v14.8H, v1.H[4] // ...............................................*................ - add v8.8H, v13.8H, v8.8H // ...................................................*............ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- mls v9.8H, v6.8H, v7.H[0] // ....................................................*........... - str_vo v23, x0, 176 // .....................................................*.......... - // gap // ................................................................ - str_vo v8, x0, 112 // ......................................................*......... - // gap // ................................................................ - // gap // ................................................................ - mls v14.8H, v16.8H, v7.H[0] // .......................................................*........ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v30.8H, v30.8H, v3.8H // ...........................................*.................... - // gap // ................................................................ - // gap // ................................................................ - sub v8.8H, v28.8H, v9.8H // ........................................................*....... - add v9.8H, v28.8H, v9.8H // .........................................................*...... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v6.8H, v22.8H, v14.8H // ............................................................*... - add v14.8H, v22.8H, v14.8H // .............................................................*.. 
- str_vo v30, x0, 48 // ..............................................*................. - str_vo v8, x0, 304 // ..........................................................*..... - str_vo v9, x0, 240 // ...........................................................*.... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str_vo v6, x0, 432 // ..............................................................*. - str_vo v14, x0, 368 // ...............................................................* - // gap // ................................................................ - - // original source code - // ldr_vo v30, x0, 0 // ..*............................................................. || *............................................................... - // sub v14.8H, v23.8H, v14.8H // *............................................................... || *............................................................... - // ldr_vo v8, x0, 128 // ....*........................................................... || .*.............................................................. - // ldr_vo v9, x0, 384 // ...*............................................................ || .*.............................................................. - // ldr_vo v6, x0, 64 // ......*......................................................... || ..*............................................................. - // mls v22.8H, v28.8H, v7.H[0] // .*.............................................................. || *............................................................... - // ldr_vo v28, x0, 320 // .....*.......................................................... || ..*............................................................. 
- // mul v13.8H, v14.8H, v0.H[4] // .......*........................................................ || ...*............................................................ - // sub v18.8H, v30.8H, v17.8H // ........*....................................................... || ....*........................................................... - // add v30.8H, v30.8H, v17.8H // .........*...................................................... || .....*.......................................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ..........*..................................................... || .....*.......................................................... - // sqrdmulh v17.8H, v9.8H, v0.H[1] // ...........*.................................................... || .......*........................................................ - // mul v9.8H, v9.8H, v0.H[0] // ............*................................................... || .........*...................................................... - // sqrdmulh v31.8H, v28.8H, v0.H[1] // .............*.................................................. || ...........*.................................................... - // mls v9.8H, v17.8H, v7.H[0] // ..............*................................................. || .............*.................................................. - // mul v28.8H, v28.8H, v0.H[0] // ...............*................................................ || ...............*................................................ - // mls v28.8H, v31.8H, v7.H[0] // ................*............................................... || .................*.............................................. - // sub v17.8H, v8.8H, v9.8H // .................*.............................................. || ..................*............................................. - // add v8.8H, v8.8H, v9.8H // ..................*............................................. 
|| ...................*............................................ - // mls v13.8H, v14.8H, v7.H[0] // ...................*............................................ || ...................*............................................ - // add v9.8H, v6.8H, v28.8H // .....................*.......................................... || ......................*......................................... - // sub v6.8H, v6.8H, v28.8H // ......................*......................................... || .......................*........................................ - // sqrdmulh v16.8H, v17.8H, v0.H[5] // ....................*........................................... || .....................*.......................................... - // sub v31.8H, v9.8H, v22.8H // .........................*...................................... || .........................*...................................... - // add v9.8H, v9.8H, v22.8H // ..........................*..................................... || ..........................*..................................... - // sqrdmulh v22.8H, v8.8H, v0.H[3] // .......................*........................................ || .......................*........................................ - // sub v4.8H, v6.8H, v13.8H // ............................*................................... || ...........................*.................................... - // add v6.8H, v6.8H, v13.8H // .............................*.................................. || ............................*................................... - // mul v13.8H, v17.8H, v0.H[4] // ........................*....................................... || .........................*...................................... - // mls v13.8H, v16.8H, v7.H[0] // ...........................*.................................... || ...........................*.................................... 
- // mul v8.8H, v8.8H, v0.H[2] // ..............................*................................. || .............................*.................................. - // mls v8.8H, v22.8H, v7.H[0] // ...............................*................................ || ...............................*................................ - // sub v22.8H, v18.8H, v13.8H // ................................*............................... || ................................*............................... - // add v13.8H, v18.8H, v13.8H // .................................*.............................. || .................................*.............................. - // mul v16.8H, v9.8H, v0.H[6] // ..................................*............................. || .................................*.............................. - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ...................................*............................ || ...................................*............................ - // sub v18.8H, v30.8H, v8.8H // ....................................*........................... || ....................................*........................... - // add v30.8H, v30.8H, v8.8H // .....................................*.......................... || .....................................*.......................... - // mul v8.8H, v31.8H, v1.H[0] // ......................................*......................... || .....................................*.......................... - // sqrdmulh v17.8H, v31.8H, v1.H[1] // .......................................*........................ || .......................................*........................ - // mls v16.8H, v9.8H, v7.H[0] // ........................................*....................... || .........................................*...................... - // mul v9.8H, v6.8H, v1.H[2] // .........................................*...................... 
|| ...........................................*.................... - // mls v8.8H, v17.8H, v7.H[0] // ..........................................*..................... || .............................................*.................. - // sub v17.8H, v30.8H, v16.8H // ......................................................*......... || .........................................................*...... - // sqrdmulh v6.8H, v6.8H, v1.H[3] // ............................................*................... || ...............................................*................ - // add v30.8H, v30.8H, v16.8H // ...........................................*.................... || ..............................................*................. - // str_vo v17, x0, 64 // ...........................................................*.... || ............................................................*... - // mul v16.8H, v4.8H, v1.H[4] // ................................................*............... || ...................................................*............ - // str_vi v30, x0, 16 // .............................................*.................. || .................................................*.............. - // sub v30.8H, v18.8H, v8.8H // ...............................................*................ || ..................................................*............. - // sqrdmulh v17.8H, v4.8H, v1.H[5] // ..............................................*................. || .................................................*.............. - // add v8.8H, v18.8H, v8.8H // .................................................*.............. || ...................................................*............ - // mls v9.8H, v6.8H, v7.H[0] // ..................................................*............. || .....................................................*.......... - // str_vo v30, x0, 176 // ...................................................*............ 
|| .....................................................*.......... - // str_vo v8, x0, 112 // ....................................................*........... || ......................................................*......... - // mls v16.8H, v17.8H, v7.H[0] // .....................................................*.......... || .......................................................*........ - // sub v30.8H, v13.8H, v9.8H // .......................................................*........ || ..........................................................*..... - // add v8.8H, v13.8H, v9.8H // ........................................................*....... || ..........................................................*..... - // str_vo v30, x0, 304 // ............................................................*... || .............................................................*.. - // str_vo v8, x0, 240 // .............................................................*.. || .............................................................*.. - // sub v30.8H, v22.8H, v16.8H // .........................................................*...... || ............................................................*... - // add v8.8H, v22.8H, v16.8H // ..........................................................*..... || ............................................................*... - // str_vo v30, x0, 432 // ..............................................................*. 
|| ...............................................................* - // str_vo v8, x0, 368 // ...............................................................* || ...............................................................* - + // Instructions: 67 + // Expected cycles: 71 + // Expected IPC: 0.94 + // + // Cycle bound: 71.0 + // IPC bound: 0.94 + // + // Wall time: 3.64s + // User time: 3.64s + // + // ----------------------- original position ------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + ldr q4, [x0, #384] // *.................................................................. + // gap // ................................................................... + mls v30.8H, v14.8H, v7.H[0] // ....*.............................................................. + // gap // ................................................................... + // gap // ................................................................... + ldr q20, [x0, #192] // ...*............................................................... + mls v6.8H, v11.8H, v7.H[0] // ..*................................................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v21.8H, v4.8H, v0.H[1] // ...........*....................................................... + // gap // ................................................................... + // gap // ................................................................... + add v3.8H, v20.8H, v30.8H // .........*......................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + sub v20.8H, v20.8H, v30.8H // ..........*........................................................ + mul v18.8H, v4.8H, v0.H[0] // ................*.................................................. + // gap // ................................................................... + sub v29.8H, v15.8H, v6.8H // ........*.......................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v28.8H, v3.8H, v0.H[3] // ............*...................................................... + add v25.8H, v15.8H, v6.8H // ......*............................................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v18.8H, v21.8H, v7.H[0] // ...................*............................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v19.8H, v3.8H, v0.H[2] // .............*..................................................... + // gap // ................................................................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v19.8H, v28.8H, v7.H[0] // .......................*........................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sub v3.8H, v5.8H, v18.8H // ......................*............................................ + add v26.8H, v5.8H, v18.8H // ...............................*................................... + sqrdmulh v10.8H, v20.8H, v0.H[5] // ...............*................................................... + ldr q5, [x0, #256] // .*................................................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v31.8H, v20.8H, v0.H[4] // ....................*.............................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sub v18.8H, v25.8H, v19.8H // ............................*...................................... + add v17.8H, v25.8H, v19.8H // .............................*..................................... 
+ sqrdmulh v25.8H, v3.8H, v0.H[5] // ........................*.......................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v31.8H, v10.8H, v7.H[0] // .....................*............................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v21.8H, v26.8H, v0.H[3] // .....................................*............................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v16.8H, v17.8H, v0.H[7] // .........................................*......................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... 
+ mul v26.8H, v26.8H, v0.H[2] // ......................................*............................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v26.8H, v21.8H, v7.H[0] // ...........................................*....................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v20.8H, v5.8H, v0.H[1] // .....*............................................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v13.8H, v5.8H, v0.H[0] // .......*........................................................... + sub v5.8H, v29.8H, v31.8H // .........................*......................................... + // gap // ................................................................... + add v31.8H, v29.8H, v31.8H // ..........................*........................................ + // gap // ................................................................... 
+ // gap // ................................................................... + mul v14.8H, v18.8H, v1.H[0] // .........................................................*......... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v27.8H, v5.8H, v1.H[5] // ..............................*.................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v13.8H, v20.8H, v7.H[0] // ..............*.................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v15.8H, v5.8H, v1.H[4] // .................................*................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + mls v15.8H, v27.8H, v7.H[0] // ..................................*................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v27.8H, v17.8H, v0.H[6] // .............................................*..................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v27.8H, v16.8H, v7.H[0] // ..............................................*.................... + // gap // ................................................................... + sub v23.8H, v24.8H, v13.8H // .................*................................................. + add v11.8H, v24.8H, v13.8H // ..................*................................................ + // gap // ................................................................... + // gap // ................................................................... + mul v17.8H, v3.8H, v0.H[4] // ...........................*....................................... + // gap // ................................................................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + add v21.8H, v11.8H, v26.8H // ................................................*.................. + sqrdmulh v4.8H, v31.8H, v1.H[3] // .................................................*................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v17.8H, v25.8H, v7.H[0] // ................................*.................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + add v3.8H, v21.8H, v27.8H // ...................................................*............... + sqrdmulh v28.8H, v18.8H, v1.H[1] // .....................................................*............. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + str q3, [x0], #(16) // ......................................................*............ + mul v6.8H, v31.8H, v1.H[2] // ..................................................*................ 
+ // gap // ................................................................... + sub v16.8H, v23.8H, v17.8H // ...................................*............................... + // gap // ................................................................... + // gap // ................................................................... + mls v6.8H, v4.8H, v7.H[0] // .......................................................*........... + add v2.8H, v23.8H, v17.8H // ....................................*.............................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v14.8H, v28.8H, v7.H[0] // ..........................................................*........ + add v8.8H, v16.8H, v15.8H // .......................................*........................... + // gap // ................................................................... + sub v29.8H, v16.8H, v15.8H // ........................................*.......................... + // gap // ................................................................... + // gap // ................................................................... + sub v11.8H, v11.8H, v26.8H // ...............................................*................... + sub v24.8H, v21.8H, v27.8H // ....................................................*.............. + // gap // ................................................................... + str q8, [x0, #368] // ..........................................*........................ + sub v26.8H, v2.8H, v6.8H // ...........................................................*....... + // gap // ................................................................... 
+ add v9.8H, v2.8H, v6.8H // ............................................................*...... + str q29, [x0, #432] // ............................................*...................... + // gap // ................................................................... + add v27.8H, v11.8H, v14.8H // ...............................................................*... + sub v16.8H, v11.8H, v14.8H // ................................................................*.. + str q24, [x0, #48] // ........................................................*.......... + str q26, [x0, #304] // .............................................................*..... + // gap // ................................................................... + // gap // ................................................................... + str q9, [x0, #240] // ..............................................................*.... + // gap // ................................................................... + // gap // ................................................................... + str q27, [x0, #112] // .................................................................*. + str q16, [x0, #176] // ..................................................................* + // gap // ................................................................... + + // -------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + // ldr q26, [x0, #384] // *.................................................................. + // ldr q9, [x0, #256] // .................*................................................. + // mls v6.8H, v11.8H, v7.H[0] // ...*............................................................... + // ldr q3, [x0, #192] // ..*................................................................ + // mls v30.8H, v14.8H, v7.H[0] // .*................................................................. 
+ // sqrdmulh v29.8H, v9.8H, v0.H[1] // ...........................*....................................... + // add v20.8H, v15.8H, v6.8H // ..........*........................................................ + // mul v13.8H, v9.8H, v0.H[0] // ............................*...................................... + // sub v14.8H, v15.8H, v6.8H // ........*.......................................................... + // add v15.8H, v3.8H, v30.8H // .....*............................................................. + // sub v16.8H, v3.8H, v30.8H // ......*............................................................ + // sqrdmulh v25.8H, v26.8H, v0.H[1] // ....*.............................................................. + // sqrdmulh v27.8H, v15.8H, v0.H[3] // .........*......................................................... + // mul v6.8H, v15.8H, v0.H[2] // ............*...................................................... + // mls v13.8H, v29.8H, v7.H[0] // .................................*................................. + // sqrdmulh v15.8H, v16.8H, v0.H[5] // ................*.................................................. + // mul v11.8H, v26.8H, v0.H[0] // .......*........................................................... + // sub v2.8H, v24.8H, v13.8H // ......................................*............................ + // add v13.8H, v24.8H, v13.8H // .......................................*........................... + // mls v11.8H, v25.8H, v7.H[0] // ...........*....................................................... + // mul v16.8H, v16.8H, v0.H[4] // ..................*................................................ + // mls v16.8H, v15.8H, v7.H[0] // ......................*............................................ + // sub v26.8H, v5.8H, v11.8H // ..............*.................................................... + // mls v6.8H, v27.8H, v7.H[0] // .............*..................................................... 
+ // sqrdmulh v3.8H, v26.8H, v0.H[5] // .....................*............................................. + // sub v15.8H, v14.8H, v16.8H // .............................*..................................... + // add v14.8H, v14.8H, v16.8H // ..............................*.................................... + // mul v27.8H, v26.8H, v0.H[4] // ........................................*.......................... + // sub v10.8H, v20.8H, v6.8H // ...................*............................................... + // add v12.8H, v20.8H, v6.8H // ....................*.............................................. + // sqrdmulh v16.8H, v15.8H, v1.H[5] // ................................*.................................. + // add v6.8H, v5.8H, v11.8H // ...............*................................................... + // mls v27.8H, v3.8H, v7.H[0] // ...........................................*....................... + // mul v15.8H, v15.8H, v1.H[4] // ..................................*................................ + // mls v15.8H, v16.8H, v7.H[0] // ...................................*............................... + // sub v16.8H, v2.8H, v27.8H // ................................................*.................. + // add v27.8H, v2.8H, v27.8H // ..................................................*................ + // sqrdmulh v26.8H, v6.8H, v0.H[3] // .......................*........................................... + // mul v6.8H, v6.8H, v0.H[2] // .........................*......................................... + // add v17.8H, v16.8H, v15.8H // ....................................................*.............. + // sub v28.8H, v16.8H, v15.8H // .....................................................*............. + // sqrdmulh v15.8H, v12.8H, v0.H[7] // ........................*.......................................... + // str q17, [x0, #384] // ........................................................*.......... 
+ // mls v6.8H, v26.8H, v7.H[0] // ..........................*........................................ + // str q28, [x0, #448] // ...........................................................*....... + // mul v16.8H, v12.8H, v0.H[6] // ....................................*.............................. + // mls v16.8H, v15.8H, v7.H[0] // .....................................*............................. + // sub v26.8H, v13.8H, v6.8H // ......................................................*............ + // add v6.8H, v13.8H, v6.8H // .........................................*......................... + // sqrdmulh v15.8H, v14.8H, v1.H[3] // ..........................................*........................ + // mul v25.8H, v14.8H, v1.H[2] // ...............................................*................... + // add v14.8H, v6.8H, v16.8H // ............................................*...................... + // sub v6.8H, v6.8H, v16.8H // .......................................................*........... + // sqrdmulh v16.8H, v10.8H, v1.H[1] // .............................................*..................... + // str q14, [x0], #(16) // ..............................................*.................... + // mls v25.8H, v15.8H, v7.H[0] // .................................................*................. + // str q6, [x0, #48] // ..............................................................*.... + // mul v14.8H, v10.8H, v1.H[0] // ...............................*................................... + // mls v14.8H, v16.8H, v7.H[0] // ...................................................*............... + // sub v15.8H, v27.8H, v25.8H // .........................................................*......... + // add v16.8H, v27.8H, v25.8H // ..........................................................*........ + // str q15, [x0, #304] // ...............................................................*... 
+ // str q16, [x0, #240] // ................................................................*.. + // add v16.8H, v26.8H, v14.8H // ............................................................*...... + // sub v27.8H, v26.8H, v14.8H // .............................................................*..... + // str q16, [x0, #112] // .................................................................*. + // str q27, [x0, #176] // ..................................................................* + restore inp, STACK0 mov count, #8 .p2align 2 - ldr_vo v20, x1, 48 // *.............................................. - ldr_vi v13, x3, 16 // .*............................................. - // gap // ............................................... - ldr_vo v3, x1, 0 // ...*........................................... - ldr_vo v21, x1, 32 // ..*............................................ - // gap // ............................................... - ldr_vo v9, x1, 16 // ......*........................................ - // gap // ............................................... - // gap // ............................................... - ldr_vo v6, x4, 16 // .........................*..................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v11.8H, v20.8H, v13.H[1] // ....*.......................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v25.8H, v21.8H, v13.H[1] // .......*....................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - mul v31.8H, v20.8H, v13.H[0] // .....*......................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v31.8H, v11.8H, v7.H[0] // ........*...................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v30.8H, v21.8H, v13.H[0] // .........*..................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v30.8H, v25.8H, v7.H[0] // ............*.................................. - // gap // ............................................... - // gap // ............................................... - sub v8.8H, v9.8H, v31.8H // ..........*.................................... - // gap // ............................................... - // gap // ............................................... - add v28.8H, v9.8H, v31.8H // ...........*................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - mul v16.8H, v8.8H, v13.H[4] // ..................*............................ - // gap // ............................................... - // gap // ............................................... - sub v0.8H, v3.8H, v30.8H // ...............*............................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v22.8H, v28.8H, v13.H[3] // ..............*................................ - add v14.8H, v3.8H, v30.8H // .................*............................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v20.8H, v8.8H, v13.H[5] // .............*................................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v23.8H, v28.8H, v13.H[2] // ................*.............................. - ldr_vo v28, x4, 32 // .....................................*......... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v23.8H, v22.8H, v7.H[0] // ....................*.......................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - mls v16.8H, v20.8H, v7.H[0] // ...................*........................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v8.8H, v14.8H, v23.8H // .......................*....................... - add v30.8H, v14.8H, v23.8H // ........................*...................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - add v15.8H, v0.8H, v16.8H // ......................*........................ - sub v20.8H, v0.8H, v16.8H // .....................*......................... - // gap // ............................................... - trn2 v9.4S, v30.4S, v8.4S // ...........................*................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - trn1 v14.4S, v15.4S, v20.4S // ...............................*............... - trn2 v15.4S, v15.4S, v20.4S // ..........................*.................... - // gap // ............................................... - trn1 v20.4S, v30.4S, v8.4S // ............................*.................. - ldr_vi v8, x4, 96 // .............................*................. 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - trn2 v30.2D, v9.2D, v15.2D // ..............................*................ - // gap // ............................................... - // gap // ............................................... - trn1 v17.2D, v20.2D, v14.2D // ..................................*............ - trn2 v14.2D, v20.2D, v14.2D // .................................*............. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v3.8H, v30.8H, v6.8H // ................................*.............. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v27.8H, v14.8H, v6.8H // .......................................*....... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v30.8H, v30.8H, v8.8H // ...................................*........... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- mls v30.8H, v3.8H, v7.H[0] // ......................................*........ - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v5.8H, v14.8H, v8.8H // .........................................*..... - trn1 v8.2D, v9.2D, v15.2D // ....................................*.......... - ldr_vo v14, x4, -32 // ........................................*...... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - add v21.8H, v8.8H, v30.8H // .............................................*. - sub v12.8H, v8.8H, v30.8H // ...........................................*... - ldr_vo v30, x4, -16 // ..........................................*.... - mls v5.8H, v27.8H, v7.H[0] // ............................................*.. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v3.8H, v12.8H, v14.8H // ..............................................* - // gap // ............................................... - // gap // ............................................... - - // original source code - // ldr_vo v15, x1, 48 // *.............................................. || *........................................................ 
- // ldr_vi v0, x3, 16 // .*............................................. || *........................................................ - // ldr_vo v24, x1, 32 // ...*........................................... || .*....................................................... - // ldr_vo v17, x1, 0 // ..*............................................ || .*....................................................... - // sqrdmulh v28.8H, v15.8H, v0.H[1] // ......*........................................ || ....*.................................................... - // mul v5.8H, v15.8H, v0.H[0] // ........*...................................... || ........*................................................ - // ldr_vo v20, x1, 16 // ....*.......................................... || ..*...................................................... - // sqrdmulh v30.8H, v24.8H, v0.H[1] // .......*....................................... || ......*.................................................. - // mls v5.8H, v28.8H, v7.H[0] // .........*..................................... || ..........*.............................................. - // mul v2.8H, v24.8H, v0.H[0] // ..........*.................................... || ............*............................................ - // sub v31.8H, v20.8H, v5.8H // ............*.................................. || ...............*......................................... - // add v22.8H, v20.8H, v5.8H // .............*................................. || ................*........................................ - // mls v2.8H, v30.8H, v7.H[0] // ...........*................................... || ..............*.......................................... - // sqrdmulh v30.8H, v31.8H, v0.H[5] // ..................*............................ || ......................*.................................. - // sqrdmulh v18.8H, v22.8H, v0.H[3] // ................*.............................. 
|| ....................*.................................... - // sub v9.8H, v17.8H, v2.8H // ...............*............................... || ...................*..................................... - // mul v21.8H, v22.8H, v0.H[2] // ...................*........................... || ........................*................................ - // add v23.8H, v17.8H, v2.8H // .................*............................. || ....................*.................................... - // mul v8.8H, v31.8H, v0.H[4] // ..............*................................ || ..................*...................................... - // mls v8.8H, v30.8H, v7.H[0] // ......................*........................ || ............................*............................ - // mls v21.8H, v18.8H, v7.H[0] // .....................*......................... || ..........................*.............................. - // sub v24.8H, v9.8H, v8.8H // ..........................*.................... || .................................*....................... - // add v4.8H, v9.8H, v8.8H // .........................*..................... || .................................*....................... - // sub v17.8H, v23.8H, v21.8H // .......................*....................... || ...............................*......................... - // add v18.8H, v23.8H, v21.8H // ........................*...................... || ...............................*......................... - // ldr_vo v23, x4, 16 // .....*......................................... || ...*..................................................... - // trn2 v28.4S, v4.4S, v24.4S // .............................*................. || ....................................*.................... - // trn2 v31.4S, v18.4S, v17.4S // ...........................*................... || ..................................*...................... 
- // trn1 v17.4S, v18.4S, v17.4S // ..............................*................ || .....................................*................... - // ldr_vi v18, x4, 96 // ...............................*............... || .....................................*................... - // trn2 v6.2D, v31.2D, v28.2D // ................................*.............. || .......................................*................. - // trn1 v9.4S, v4.4S, v24.4S // ............................*.................. || ....................................*.................... - // sqrdmulh v8.8H, v6.8H, v23.8H // ...................................*........... || ..........................................*.............. - // trn2 v22.2D, v17.2D, v9.2D // ..................................*............ || ........................................*................ - // trn1 v17.2D, v17.2D, v9.2D // .................................*............. || ........................................*................ - // mul v9.8H, v6.8H, v18.8H // .....................................*......... || ..............................................*.......... - // trn1 v6.2D, v31.2D, v28.2D // ........................................*...... || ..................................................*...... - // ldr_vo v28, x4, -64 // ....................*.......................... || ........................*................................ - // mls v9.8H, v8.8H, v7.H[0] // ......................................*........ || ................................................*........ - // sqrdmulh v24.8H, v22.8H, v23.8H // ....................................*.......... || ............................................*............ - // ldr_vo v16, x4, -32 // .........................................*..... || ..................................................*...... - // mul v5.8H, v22.8H, v18.8H // .......................................*....... || ..................................................*...... 
- // ldr_vo v30, x4, -16 // ............................................*.. || .....................................................*... - // sub v12.8H, v6.8H, v9.8H // ...........................................*... || .....................................................*... - // mls v5.8H, v24.8H, v7.H[0] // .............................................*. || ......................................................*.. - // add v21.8H, v6.8H, v9.8H // ..........................................*.... || .....................................................*... - // mul v3.8H, v12.8H, v16.8H // ..............................................* || ........................................................* - + // Instructions: 47 + // Expected cycles: 57 + // Expected IPC: 0.82 + // + // Cycle bound: 57.0 + // IPC bound: 0.82 + // + // Wall time: 1.56s + // User time: 1.56s + // + // ------------- original position --------------> + // 0 25 + // |------------------------|--------------------- + ldr q12, [x3], #16 // *.............................................. + ldr q19, [x1, #48] // .*............................................. + // gap // ............................................... + ldr q31, [x4, #48] // ............................*.................. + ldr q2, [x1, #0] // ...*........................................... + // gap // ............................................... + ldr q5, [x4, #32] // ....................*.......................... + ldr q6, [x1, #32] // .....*......................................... + // gap // ............................................... + ldr q9, [x4, #16] // ..*............................................ + ldr q10, [x4], #(6*16) // ........................*...................... + // gap // ............................................... + ldr q30, [x4, #-16] // .........................................*..... + sqrdmulh v21.8H, v19.8H, v12.H[1] // ......*........................................ 
+ // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + ldr q0, [x1, #16] // ....*.......................................... + mul v22.8H, v19.8H, v12.H[0] // .......*....................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v28.8H, v6.8H, v12.H[1] // ........*...................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v22.8H, v21.8H, v7.H[0] // .........*..................................... + ldr q21, [x4, #-32] // ..........................................*.... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mul v26.8H, v6.8H, v12.H[0] // ..........*.................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v26.8H, v28.8H, v7.H[0] // ...........*................................... + // gap // ............................................... + // gap // ............................................... 
+ sub v28.8H, v0.8H, v22.8H // .............*................................. + // gap // ............................................... + // gap // ............................................... + add v16.8H, v0.8H, v22.8H // ............*.................................. + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v24.8H, v28.8H, v12.H[5] // ................*.............................. + // gap // ............................................... + // gap // ............................................... + sub v0.8H, v2.8H, v26.8H // ...............*............................... + // gap // ............................................... + // gap // ............................................... + add v14.8H, v2.8H, v26.8H // .........................*..................... + mul v18.8H, v28.8H, v12.H[4] // .................*............................. + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v20.8H, v16.8H, v12.H[3] // ..............*................................ + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v18.8H, v24.8H, v7.H[0] // ...................*........................... + // gap // ............................................... + // gap // ............................................... 
+ // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mul v4.8H, v16.8H, v12.H[2] // ..................*............................ + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v4.8H, v20.8H, v7.H[0] // ......................*........................ + // gap // ............................................... + // gap // ............................................... + sub v27.8H, v0.8H, v18.8H // .....................*......................... + // gap // ............................................... + // gap // ............................................... + add v18.8H, v0.8H, v18.8H // .......................*....................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + add v15.8H, v14.8H, v4.8H // ...........................*................... + sub v4.8H, v14.8H, v4.8H // ..........................*.................... + // gap // ............................................... + trn1 v22.4S, v18.4S, v27.4S // .................................*............. + trn2 v27.4S, v18.4S, v27.4S // .............................*................. + // gap // ............................................... 
+ // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + trn2 v20.4S, v15.4S, v4.4S // ..............................*................ + trn1 v3.4S, v15.4S, v4.4S // ...............................*............... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + trn2 v4.2D, v20.2D, v27.2D // ................................*.............. + // gap // ............................................... + // gap // ............................................... + trn1 v24.2D, v20.2D, v27.2D // ..................................*............ + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v15.8H, v4.8H, v9.8H // ...................................*........... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mul v18.8H, v4.8H, v10.8H // ....................................*.......... + trn2 v4.2D, v3.2D, v22.2D // .....................................*......... + // gap // ............................................... 
+ // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mls v18.8H, v15.8H, v7.H[0] // ......................................*........ + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + sqrdmulh v20.8H, v4.8H, v9.8H // .......................................*....... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... + mul v11.8H, v4.8H, v10.8H // ........................................*...... + sub v27.8H, v24.8H, v18.8H // ...........................................*... + // gap // ............................................... + add v28.8H, v24.8H, v18.8H // .............................................*. + // gap // ............................................... + // gap // ............................................... + mls v11.8H, v20.8H, v7.H[0] // ............................................*.. + // gap // ............................................... + // gap // ............................................... + // gap // ............................................... 
+ // gap // ............................................... + // gap // ............................................... + sqrdmulh v0.8H, v27.8H, v30.8H // ..............................................* + // gap // ............................................... + // gap // ............................................... + + // ---------------- new position ----------------> + // 0 25 + // |------------------------|--------------------- + // ldr q9, [x3], #16 // *.............................................. + // ldr q20, [x1, #48] // .*............................................. + // ldr q2, [x4, #16] // ......*........................................ + // ldr q19, [x1, #0] // ...*........................................... + // ldr q10, [x1, #16] // ..........*.................................... + // ldr q24, [x1, #32] // .....*......................................... + // sqrdmulh v18.8H, v20.8H, v9.H[1] // .........*..................................... + // mul v13.8H, v20.8H, v9.H[0] // ...........*................................... + // sqrdmulh v25.8H, v24.8H, v9.H[1] // ............*.................................. + // mls v13.8H, v18.8H, v7.H[0] // .............*................................. + // mul v3.8H, v24.8H, v9.H[0] // ...............*............................... + // mls v3.8H, v25.8H, v7.H[0] // ................*.............................. + // add v26.8H, v10.8H, v13.8H // ..................*............................ + // sub v11.8H, v10.8H, v13.8H // .................*............................. + // sqrdmulh v30.8H, v26.8H, v9.H[3] // .......................*....................... + // sub v13.8H, v19.8H, v3.8H // ....................*.......................... + // sqrdmulh v8.8H, v11.8H, v9.H[5] // ...................*........................... + // mul v25.8H, v11.8H, v9.H[4] // ......................*........................ + // mul v17.8H, v26.8H, v9.H[2] // .........................*..................... 
+ // mls v25.8H, v8.8H, v7.H[0] // ........................*...................... + // ldr q5, [x4, #32] // ....*.......................................... + // sub v22.8H, v13.8H, v25.8H // ...........................*................... + // mls v17.8H, v30.8H, v7.H[0] // ..........................*.................... + // add v25.8H, v13.8H, v25.8H // ............................*.................. + // ldr q30, [x4], #(6*16) // .......*....................................... + // add v13.8H, v19.8H, v3.8H // .....................*......................... + // sub v4.8H, v13.8H, v17.8H // ..............................*................ + // add v28.8H, v13.8H, v17.8H // .............................*................. + // ldr q31, [x4, #-48] // ..*............................................ + // trn2 v14.4S, v25.4S, v22.4S // ................................*.............. + // trn2 v18.4S, v28.4S, v4.4S // .................................*............. + // trn1 v3.4S, v28.4S, v4.4S // ..................................*............ + // trn2 v27.2D, v18.2D, v14.2D // ...................................*........... + // trn1 v22.4S, v25.4S, v22.4S // ...............................*............... + // trn1 v18.2D, v18.2D, v14.2D // ....................................*.......... + // sqrdmulh v15.8H, v27.8H, v2.8H // .....................................*......... + // mul v28.8H, v27.8H, v30.8H // ......................................*........ + // trn2 v13.2D, v3.2D, v22.2D // .......................................*....... + // mls v28.8H, v15.8H, v7.H[0] // ........................................*...... + // sqrdmulh v6.8H, v13.8H, v2.8H // .........................................*..... + // mul v11.8H, v13.8H, v30.8H // ..........................................*.... + // ldr q17, [x4, #-16] // ........*...................................... + // ldr q21, [x4, #-32] // ..............*................................ 
+ // sub v27.8H, v18.8H, v28.8H // ...........................................*... + // mls v11.8H, v6.8H, v7.H[0] // .............................................*. + // add v28.8H, v18.8H, v28.8H // ............................................*.. + // sqrdmulh v0.8H, v27.8H, v17.8H // ..............................................* + sub count, count, #1 -.p2align 2 layer4567_start: - ldr_vo v15, x1, 112 // ...e............................................................................... - ldr_vi v0, x3, 16 // ....e.............................................................................. - mul v6.8H, v21.8H, v28.8H // .................................................*................................. - ldr_vo v14, x4, -48 // ....................................*.............................................. - sub v29.8H, v17.8H, v5.8H // ..........................................*........................................ - // gap // ................................................................................... - sqrdmulh v8.8H, v12.8H, v30.8H // .......................................................*........................... - add v10.8H, v17.8H, v5.8H // ...........................................*....................................... - ldr_vo v24, x1, 96 // ..e................................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - ldr_vo v17, x1, 64 // e.................................................................................. - // gap // ................................................................................... - sqrdmulh v28.8H, v15.8H, v0.H[1] // ...........e....................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v5.8H, v15.8H, v0.H[0] // ..........e........................................................................ - ldr_vo v20, x1, 80 // .e................................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v30.8H, v24.8H, v0.H[1] // ......e............................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v3.8H, v8.8H, v7.H[0] // ........................................................*.......................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v5.8H, v28.8H, v7.H[0] // ............e...................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v2.8H, v24.8H, v0.H[0] // .....e............................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v9.8H, v21.8H, v14.8H // ..................................................*................................ - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sub v31.8H, v20.8H, v5.8H // .............e..................................................................... - // gap // ................................................................................... 
- add v22.8H, v20.8H, v5.8H // ..............e.................................................................... - mls v2.8H, v30.8H, v7.H[0] // .......e........................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v30.8H, v31.8H, v0.H[5] // .....................e............................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v6.8H, v9.8H, v7.H[0] // ...................................................*............................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v18.8H, v22.8H, v0.H[3] // ................e.................................................................. 
- // gap // ................................................................................... - // gap // ................................................................................... - sub v9.8H, v17.8H, v2.8H // ........e.......................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v21.8H, v22.8H, v0.H[2] // ...............e................................................................... - add v23.8H, v17.8H, v2.8H // .........e......................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - mul v8.8H, v31.8H, v0.H[4] // ....................e.............................................................. - // gap // ................................................................................... - // gap // ................................................................................... - sub v14.8H, v10.8H, v6.8H // ....................................................*.............................. - // gap // ................................................................................... - // gap // ................................................................................... - mls v8.8H, v30.8H, v7.H[0] // ......................e............................................................ - sub v16.8H, v29.8H, v3.8H // .........................................................*......................... 
- // gap // ................................................................................... - add v13.8H, v10.8H, v6.8H // .....................................................*............................. - // gap // ................................................................................... - // gap // ................................................................................... - mls v21.8H, v18.8H, v7.H[0] // .................e................................................................. - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqdmulh v30.8H, v13.8H, v7.H[1] // ...........................................................*....................... - add v3.8H, v29.8H, v3.8H // ..........................................................*........................ - // gap // ................................................................................... - sub v24.8H, v9.8H, v8.8H // .......................e........................................................... - // gap // ................................................................................... - // gap // ................................................................................... - add v4.8H, v9.8H, v8.8H // ........................e.......................................................... - sqdmulh v8.8H, v16.8H, v7.H[1] // ....................................................................*.............. - // gap // ................................................................................... 
- sub v17.8H, v23.8H, v21.8H // ..................e................................................................ - // gap // ................................................................................... - // gap // ................................................................................... - add v18.8H, v23.8H, v21.8H // ...................e............................................................... - sqdmulh v6.8H, v14.8H, v7.H[1] // ..............................................................*.................... - ldr_vo v23, x4, 16 // ..................................e................................................ - srshr v22.8H, v30.8H, #11 // ............................................................*...................... - // gap // ................................................................................... - // gap // ................................................................................... - sqdmulh v9.8H, v3.8H, v7.H[1] // .................................................................*................. - trn2 v28.4S, v4.4S, v24.4S // ............................e...................................................... - // gap // ................................................................................... - // gap // ................................................................................... - srshr v30.8H, v8.8H, #11 // .....................................................................*............. - // gap // ................................................................................... - mls v13.8H, v22.8H, v7.H[0] // .............................................................*..................... - trn2 v31.4S, v18.4S, v17.4S // ..........................e........................................................ - // gap // ................................................................................... 
- srshr v8.8H, v6.8H, #11 // ...............................................................*................... - // gap // ................................................................................... - // gap // ................................................................................... - mls v16.8H, v30.8H, v7.H[0] // ......................................................................*............ - trn1 v17.4S, v18.4S, v17.4S // .........................e......................................................... - ldr_vi v18, x4, 96 // .................................e................................................. - // gap // ................................................................................... - trn2 v6.2D, v31.2D, v28.2D // ..............................e.................................................... - // gap // ................................................................................... - mls v14.8H, v8.8H, v7.H[0] // ................................................................*.................. - srshr v30.8H, v9.8H, #11 // ..................................................................*................ - // gap // ................................................................................... - trn1 v9.4S, v4.4S, v24.4S // ...........................e....................................................... - // gap // ................................................................................... - // gap // ................................................................................... - sqrdmulh v8.8H, v6.8H, v23.8H // .............................................e..................................... - // gap // ................................................................................... - // gap // ................................................................................... - // gap // ................................................................................... 
- // gap // ................................................................................... - // gap // ................................................................................... - mls v3.8H, v30.8H, v7.H[0] // ...................................................................*............... - trn2 v22.2D, v17.2D, v9.2D // .............................e..................................................... - // gap // ................................................................................... - trn1 v30.4S, v13.4S, v14.4S // .......................................................................*........... - // gap // ................................................................................... - // gap // ................................................................................... - trn1 v17.2D, v17.2D, v9.2D // ...............................e................................................... - mul v9.8H, v6.8H, v18.8H // ............................................e...................................... - // gap // ................................................................................... - trn1 v6.2D, v31.2D, v28.2D // ................................e.................................................. - ldr_vo v28, x4, -64 // ...................................e............................................... - // gap // ................................................................................... - mls v9.8H, v8.8H, v7.H[0] // ..............................................e.................................... - // gap // ................................................................................... - // gap // ................................................................................... - trn1 v8.4S, v3.4S, v16.4S // .........................................................................*......... - // gap // ................................................................................... 
- // gap // ................................................................................... - sqrdmulh v24.8H, v22.8H, v23.8H // ........................................e.......................................... - trn2 v20.4S, v13.4S, v14.4S // ........................................................................*.......... - // gap // ................................................................................... - trn2 v14.4S, v3.4S, v16.4S // ..........................................................................*........ - ldr_vo v16, x4, -32 // .....................................e............................................. - // gap // ................................................................................... - mul v5.8H, v22.8H, v18.8H // .......................................e........................................... - trn2 v25.2D, v30.2D, v8.2D // ...........................................................................*....... - // gap // ................................................................................... - trn1 v15.2D, v30.2D, v8.2D // .............................................................................*..... - ldr_vo v30, x4, -16 // ......................................e............................................ - // gap // ................................................................................... - trn2 v11.2D, v20.2D, v14.2D // ............................................................................*...... - sub v12.8H, v6.8H, v9.8H // ...............................................e................................... - // gap // ................................................................................... - mls v5.8H, v24.8H, v7.H[0] // .........................................e......................................... - trn1 v14.2D, v20.2D, v14.2D // ..............................................................................*.... 
- str_vo v25, x1, 32 // .................................................................................*. - add v21.8H, v6.8H, v9.8H // ................................................e.................................. - str_vi v15, x1, 64 // ...............................................................................*... - // gap // ................................................................................... - mul v3.8H, v12.8H, v16.8H // ......................................................e............................ - str_vo v11, x1, -16 // ..................................................................................* - // gap // ................................................................................... - str_vo v14, x1, -48 // ................................................................................*.. - // gap // ................................................................................... - // gap // ................................................................................... - - // original source code - // ldr_vo v8, x1, 0 // ........e............................................................................................................................................................. || ....e............................................................................................................................. - // ldr_vo v9, x1, 16 // ...........e.......................................................................................................................................................... || ......e........................................................................................................................... - // ldr_vo v10, x1, 32 // .......e.............................................................................................................................................................. 
|| ..e............................................................................................................................... - // ldr_vo v11, x1, 48 // e..................................................................................................................................................................... || e................................................................................................................................. - // ldr_vi v0, x3, 16 // .e.................................................................................................................................................................... || e................................................................................................................................. - // mul v24.8H, v10.8H, v0.H[0] // ...............e...................................................................................................................................................... || ..............e................................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ............e......................................................................................................................................................... || ........e......................................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ...................e.................................................................................................................................................. || ..................e............................................................................................................... 
- // sub v10.8H, v8.8H, v24.8H // .......................e.............................................................................................................................................. || .........................e........................................................................................................ - // add v8.8H, v8.8H, v24.8H // .........................e............................................................................................................................................ || ..........................e....................................................................................................... - // mul v24.8H, v11.8H, v0.H[0] // ..........e........................................................................................................................................................... || ......e........................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .........e............................................................................................................................................................ || ....e............................................................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ..............e....................................................................................................................................................... || ............e..................................................................................................................... - // sub v11.8H, v9.8H, v24.8H // .................e.................................................................................................................................................... 
|| .................e................................................................................................................ - // add v9.8H, v9.8H, v24.8H // ..................e................................................................................................................................................... || ..................e............................................................................................................... - // mul v24.8H, v9.8H, v0.H[2] // ........................e............................................................................................................................................. || ..........................e....................................................................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ......................e............................................................................................................................................... || ........................e......................................................................................................... - // mls v24.8H, v9.8H, v7.H[0] // ...............................e...................................................................................................................................... || ................................e................................................................................................. - // sub v9.8H, v8.8H, v24.8H // .....................................e................................................................................................................................ || .....................................e............................................................................................ 
- // add v8.8H, v8.8H, v24.8H // ......................................e............................................................................................................................... || ......................................e........................................................................................... - // mul v24.8H, v11.8H, v0.H[4] // ..........................e........................................................................................................................................... || ............................e..................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ....................e................................................................................................................................................. || ....................e............................................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ............................e......................................................................................................................................... || ..............................e................................................................................................... - // sub v11.8H, v10.8H, v24.8H // ..................................e................................................................................................................................... || ...................................e.............................................................................................. - // add v10.8H, v10.8H, v24.8H // ...................................e.................................................................................................................................. 
|| ....................................e............................................................................................. - // trn1 v25.4S, v8.4S, v9.4S // .................................................e.................................................................................................................... || ............................................e..................................................................................... - // trn2 v26.4S, v8.4S, v9.4S // ..............................................e....................................................................................................................... || ..........................................e....................................................................................... - // trn1 v27.4S, v10.4S, v11.4S // ......................................................e............................................................................................................... || ...............................................e.................................................................................. - // trn2 v28.4S, v10.4S, v11.4S // ...........................................e.......................................................................................................................... || ........................................e......................................................................................... - // trn2 v10.2D, v25.2D, v27.2D // .........................................................e............................................................................................................ || ..................................................e............................................................................... 
- // trn2 v11.2D, v26.2D, v28.2D // ...................................................e.................................................................................................................. || .............................................e.................................................................................... - // trn1 v8.2D, v25.2D, v27.2D // ...........................................................e.......................................................................................................... || ....................................................e............................................................................. - // trn1 v9.2D, v26.2D, v28.2D // .............................................................e........................................................................................................ || .....................................................e............................................................................ - // ldr_vi v0, x4, 96 // ..................................................e................................................................................................................... || ............................................e..................................................................................... - // ldr_vo v4, x4, -80 // ........................................e............................................................................................................................. || ......................................e........................................................................................... - // ldr_vo v1, x4, -64 // ..............................................................e....................................................................................................... || .....................................................e............................................................................ 
- // ldr_vo v5, x4, -48 // ......................................................................................*............................................................................... || ..................................................................*............................................................... - // ldr_vo v2, x4, -32 // ....................................................................e................................................................................................. || .........................................................e........................................................................ - // ldr_vo v6, x4, -16 // ........................................................................e............................................................................................. || ...........................................................e...................................................................... - // mul v24.8H, v10.8H, v0.8H // .....................................................................e................................................................................................ || ..........................................................e....................................................................... - // sqrdmulh v10.8H, v10.8H, v4.8H // .................................................................e.................................................................................................... || ........................................................e......................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ...........................................................................e.......................................................................................... || .............................................................e.................................................................... 
- // sub v10.8H, v8.8H, v24.8H // .......................................................................................*.............................................................................. || ..................................................................*............................................................... - // add v8.8H, v8.8H, v24.8H // .........................................................................................*............................................................................ || ...................................................................*.............................................................. - // mul v24.8H, v11.8H, v0.8H // ............................................................e......................................................................................................... || ....................................................e............................................................................. - // sqrdmulh v11.8H, v11.8H, v4.8H // .......................................................e.............................................................................................................. || ................................................e................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ...............................................................e...................................................................................................... || ......................................................e........................................................................... - // sub v11.8H, v9.8H, v24.8H // ..........................................................................e........................................................................................... 
|| ............................................................e..................................................................... - // add v9.8H, v9.8H, v24.8H // ..............................................................................e....................................................................................... || ..............................................................e................................................................... - // mul v24.8H, v9.8H, v1.8H // .....................................................................................*................................................................................ || .................................................................*................................................................ - // sqrdmulh v9.8H, v9.8H, v5.8H // ...................................................................................................*.................................................................. || .................................................................................*................................................ - // mls v24.8H, v9.8H, v7.H[0] // ........................................................................................................*............................................................. || .......................................................................................*.......................................... - // sub v9.8H, v8.8H, v24.8H // ..............................................................................................................*....................................................... || ..............................................................................................*................................... 
- // add v8.8H, v8.8H, v24.8H // .................................................................................................................*.................................................... || ................................................................................................*................................. - // mul v24.8H, v11.8H, v2.8H // ................................................................................e..................................................................................... || ...............................................................e.................................................................. - // sqrdmulh v11.8H, v11.8H, v6.8H // ........................................................................................*............................................................................. || ...................................................................*.............................................................. - // mls v24.8H, v11.8H, v7.H[0] // ................................................................................................*..................................................................... || ...........................................................................*...................................................... - // sub v11.8H, v10.8H, v24.8H // ................................................................................................................*..................................................... || ...............................................................................................*.................................. - // add v10.8H, v10.8H, v24.8H // ....................................................................................................................*................................................. 
|| ...................................................................................................*.............................. - // sqdmulh v25.8H, v8.8H, v7.H[1] // ...................................................................................................................*.................................................. || ...................................................................................................*.............................. - // srshr v25.8H, v25.8H, #11 // ............................................................................................................................*......................................... || ........................................................................................................*......................... - // mls v8.8H, v25.8H, v7.H[0] // ................................................................................................................................*..................................... || ...........................................................................................................*...................... - // sqdmulh v25.8H, v9.8H, v7.H[1] // ..........................................................................................................................*........................................... || .......................................................................................................*.......................... - // srshr v25.8H, v25.8H, #11 // ..................................................................................................................................*................................... || ............................................................................................................*..................... 
- // mls v9.8H, v25.8H, v7.H[0] // .......................................................................................................................................*.............................. || ...............................................................................................................*.................. - // sqdmulh v25.8H, v10.8H, v7.H[1] // .............................................................................................................................*........................................ || .........................................................................................................*........................ - // srshr v25.8H, v25.8H, #11 // ........................................................................................................................................*............................. || ...............................................................................................................*.................. - // mls v10.8H, v25.8H, v7.H[0] // ...........................................................................................................................................*.......................... || ...................................................................................................................*.............. - // sqdmulh v25.8H, v11.8H, v7.H[1] // .......................................................................................................................*.............................................. || .....................................................................................................*............................ - // srshr v25.8H, v25.8H, #11 // ...............................................................................................................................*...................................... 
|| ..........................................................................................................*....................... - // mls v11.8H, v25.8H, v7.H[0] // ...................................................................................................................................*.................................. || .............................................................................................................*.................... - // trn1 v25.4S, v8.4S, v9.4S // .............................................................................................................................................*........................ || ....................................................................................................................*............. - // trn2 v26.4S, v8.4S, v9.4S // .....................................................................................................................................................*................ || .........................................................................................................................*........ - // trn1 v27.4S, v10.4S, v11.4S // ...................................................................................................................................................*.................. || ........................................................................................................................*......... - // trn2 v28.4S, v10.4S, v11.4S // ......................................................................................................................................................*............... || ..........................................................................................................................*....... 
- // trn2 v10.2D, v25.2D, v27.2D // .........................................................................................................................................................*............ || ...........................................................................................................................*...... - // trn2 v11.2D, v26.2D, v28.2D // ............................................................................................................................................................*......... || .............................................................................................................................*.... - // trn1 v8.2D, v25.2D, v27.2D // ..........................................................................................................................................................*........... || ............................................................................................................................*..... - // trn1 v9.2D, v26.2D, v28.2D // ...............................................................................................................................................................*...... || ..............................................................................................................................*... - // str_vi v8, x1, 64 // ..................................................................................................................................................................*... || ...............................................................................................................................*.. 
- // str_vo v9, x1, -48 // .....................................................................................................................................................................* || .................................................................................................................................* - // str_vo v10, x1, -32 // ................................................................................................................................................................*..... || ..............................................................................................................................*... - // str_vo v11, x1, -16 // ....................................................................................................................................................................*. || ................................................................................................................................*. - - subs count, count, #1 + // Instructions: 83 + // Expected cycles: 65 + // Expected IPC: 1.28 + // + // Cycle bound: 65.0 + // IPC bound: 1.28 + // + // Wall time: 469.55s + // User time: 469.55s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + ldr q9, [x3], #16 // ....e.............................................................................. + mul v27.8H, v27.8H, v21.8H // .......................................................*........................... + ldr q20, [x1, #112] // ...e............................................................................... + ldr q2, [x4, #16] // ..................................e................................................ + ldr q19, [x1, #64] // e.................................................................................. 
+ trn1 v23.2D, v3.2D, v22.2D // ...............................*................................................... + ldr q10, [x1, #80] // .e................................................................................. + sqrdmulh v16.8H, v28.8H, v31.8H // .................................................*................................. + // gap // ................................................................................... + ldr q24, [x1, #96] // ..e................................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + sub v15.8H, v23.8H, v11.8H // ..........................................*........................................ + sqrdmulh v18.8H, v20.8H, v9.H[1] // ..........e........................................................................ + // gap // ................................................................................... + add v14.8H, v23.8H, v11.8H // ...........................................*....................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v13.8H, v20.8H, v9.H[0] // ...........e....................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ sqrdmulh v25.8H, v24.8H, v9.H[1] // .....e............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v13.8H, v18.8H, v7.H[0] // ............e...................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v3.8H, v24.8H, v9.H[0] // ......e............................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v3.8H, v25.8H, v7.H[0] // .......e........................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + add v26.8H, v10.8H, v13.8H // ..............e.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v27.8H, v0.8H, v7.H[0] // ........................................................*.......................... + sub v11.8H, v10.8H, v13.8H // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v30.8H, v26.8H, v9.H[3] // ...............e................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v13.8H, v19.8H, v3.8H // ........e.......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v8.8H, v11.8H, v9.H[5] // ....................e.............................................................. + // gap // ................................................................................... 
+ // gap // ................................................................................... + add v6.8H, v15.8H, v27.8H // ..........................................................*........................ + // gap // ................................................................................... + // gap // ................................................................................... + mul v25.8H, v11.8H, v9.H[4] // .....................e............................................................. + sub v11.8H, v15.8H, v27.8H // .........................................................*......................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v17.8H, v26.8H, v9.H[2] // ................e.................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v25.8H, v8.8H, v7.H[0] // ......................e............................................................ + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v15.8H, v28.8H, v5.8H // ..................................................*................................ + ldr q5, [x4, #32] // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v15.8H, v16.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v22.8H, v13.8H, v25.8H // .......................e........................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v17.8H, v30.8H, v7.H[0] // .................e................................................................. + add v25.8H, v13.8H, v25.8H // ........................e.......................................................... + ldr q30, [x4], #(6*16) // .................................e................................................. + add v13.8H, v19.8H, v3.8H // .........e......................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v16.8H, v6.8H, v7.H[1] // .................................................................*................. + add v26.8H, v14.8H, v15.8H // .....................................................*............................. + // gap // ................................................................................... + // gap // ................................................................................... + sub v0.8H, v14.8H, v15.8H // ....................................................*.............................. + sqdmulh v15.8H, v11.8H, v7.H[1] // ....................................................................*.............. + // gap // ................................................................................... + sub v4.8H, v13.8H, v17.8H // ..................e................................................................ + // gap // ................................................................................... + // gap // ................................................................................... + add v28.8H, v13.8H, v17.8H // ...................e............................................................... + // gap // ................................................................................... + sqdmulh v13.8H, v26.8H, v7.H[1] // ...........................................................*....................... + srshr v27.8H, v16.8H, #11 // ..................................................................*................ + // gap // ................................................................................... 
+ // gap // ................................................................................... + ldr q31, [x4, #-48] // ....................................e.............................................. + trn2 v14.4S, v25.4S, v22.4S // ............................e...................................................... + sqdmulh v16.8H, v0.8H, v7.H[1] // ..............................................................*.................... + srshr v15.8H, v15.8H, #11 // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + mls v6.8H, v27.8H, v7.H[0] // ...................................................................*............... + // gap // ................................................................................... + trn2 v18.4S, v28.4S, v4.4S // ..........................e........................................................ + srshr v1.8H, v13.8H, #11 // ............................................................*...................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v11.8H, v15.8H, v7.H[0] // ......................................................................*............ + trn1 v3.4S, v28.4S, v4.4S // .........................e......................................................... + // gap // ................................................................................... + trn2 v27.2D, v18.2D, v14.2D // ..............................e.................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + mls v26.8H, v1.8H, v7.H[0] // .............................................................*..................... + srshr v28.8H, v16.8H, #11 // ...............................................................*................... + // gap // ................................................................................... + trn1 v22.4S, v25.4S, v22.4S // ...........................e....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v18.2D, v18.2D, v14.2D // ................................e.................................................. + sqrdmulh v15.8H, v27.8H, v2.8H // ............................................e...................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v16.4S, v6.4S, v11.4S // ..........................................................................*........ + mls v0.8H, v28.8H, v7.H[0] // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ mul v28.8H, v27.8H, v30.8H // .............................................e..................................... + trn1 v27.4S, v6.4S, v11.4S // .........................................................................*......... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v13.2D, v3.2D, v22.2D // .............................e..................................................... + mls v28.8H, v15.8H, v7.H[0] // ..............................................e.................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v21.4S, v26.4S, v0.4S // .......................................................................*........... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v6.8H, v13.8H, v2.8H // .......................................e........................................... + trn2 v14.4S, v26.4S, v0.4S // ........................................................................*.......... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v11.8H, v13.8H, v30.8H // ........................................e.......................................... 
+ trn2 v15.2D, v21.2D, v27.2D // ...........................................................................*....... + // gap // ................................................................................... + trn1 v23.2D, v21.2D, v27.2D // .............................................................................*..... + // gap // ................................................................................... + ldr q17, [x4, #-16] // ......................................e............................................ + ldr q21, [x4, #-32] // .....................................e............................................. + sub v27.8H, v18.8H, v28.8H // ...............................................e................................... + trn1 v24.2D, v14.2D, v16.2D // ..............................................................................*.... + trn2 v16.2D, v14.2D, v16.2D // ............................................................................*...... + mls v11.8H, v6.8H, v7.H[0] // .........................................e......................................... + str q15, [x1, #32] // .................................................................................*. + add v28.8H, v18.8H, v28.8H // ................................................e.................................. + str q23, [x1], #64 // ...............................................................................*... + // gap // ................................................................................... + sqrdmulh v0.8H, v27.8H, v17.8H // ......................................................e............................ + str q24, [x1, #-48] // ................................................................................*.. + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + str q16, [x1, #-16] // ..................................................................................* + + // --------------------------------------------------------------------------- new position ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // ....e..............................................................................'...~.............................................................................. + // ldr q9, [x1, #(16*1)] // ......e............................................................................'.....~............................................................................ + // ldr q10, [x1, #(16*2)] // ........e..........................................................................'.......~.......................................................................... + // ldr q11, [x1, #(16*3)] // ..e................................................................................'.~................................................................................ + // ldr q0, [x3], #16 // e..................................................................................~.................................................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .............e.....................................................................'............~..................................................................... + // mul v24.8h, v10.8h, v0.h[0] // ...............e...................................................................'..............~................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ................e..................................................................'...............~.................................................................. + // sub v10.8h, v8.8h, v24.8h // .....................e.............................................................'....................~............................................................. + // add v8.8h, v8.8h, v24.8h // ...................................e...............................................'..................................~............................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ..........e........................................................................'.........~........................................................................ + // mul v24.8h, v11.8h, v0.h[0] // ............e......................................................................'...........~...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............e....................................................................'.............~.................................................................... + // sub v11.8h, v9.8h, v24.8h // ...................e...............................................................'..................~............................................................... + // add v9.8h, v9.8h, v24.8h // .................e.................................................................'................~................................................................. + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ....................e..............................................................'...................~.............................................................. 
+ // mul v24.8h, v9.8h, v0.h[2] // ..........................e........................................................'.........................~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................e..................................................'...............................~.................................................. + // sub v9.8h, v8.8h, v24.8h // ........................................e..........................................'.......................................~.......................................... + // add v8.8h, v8.8h, v24.8h // .........................................e.........................................'........................................~......................................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ......................e............................................................'.....................~............................................................ + // mul v24.8h, v11.8h, v0.h[4] // ........................e..........................................................'.......................~.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................e.......................................................'..........................~....................................................... + // sub v11.8h, v10.8h, v24.8h // ...............................e...................................................'..............................~................................................... + // add v10.8h, v10.8h, v24.8h // .................................e.................................................'................................~................................................. 
+ // trn1 v25.4s, v8.4s, v9.4s // ....................................................e..............................'...................................................~.............................. + // trn2 v26.4s, v8.4s, v9.4s // .................................................e.................................'................................................~................................. + // trn1 v27.4s, v10.4s, v11.4s // ........................................................e..........................'.......................................................~.......................... + // trn2 v28.4s, v10.4s, v11.4s // .............................................e.....................................'............................................~..................................... + // trn2 v10.2d, v25.2d, v27.2d // ...............................................................e...................'..............................................................~................... + // trn2 v11.2d, v26.2d, v28.2d // .....................................................e.............................'....................................................~............................. + // trn1 v8.2d, v25.2d, v27.2d // .....~.............................................................................'....*............................................................................. + // trn1 v9.2d, v26.2d, v28.2d // .........................................................e.........................'........................................................~......................... + // ldr q0, [ x4], #(6*16) // ..................................e................................................'.................................~................................................ 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // ...e...............................................................................'..~............................................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .............................e.....................................................'............................~..................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ............................................e......................................'...........................................~...................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ........................................................................e..........'.......................................................................~.......... + // ldr q6, [x4, #(-6*16 + 5*16)] // .......................................................................e...........'......................................................................~........... + // sqrdmulh v27.8h, v10.8h, v4.8h // ..................................................................e................'.................................................................~................ + // mul v24.8h, v10.8h, v0.8h // ....................................................................e..............'...................................................................~.............. + // mls v24.8h, v27.8h, v7.h[0] // ............................................................................e......'...........................................................................~...... + // sub v10.8h, v8.8h, v24.8h // .........~.........................................................................'........*......................................................................... 
+ // add v8.8h, v8.8h, v24.8h // ...........~.......................................................................'..........*....................................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ..........................................................e........................'.........................................................~........................ + // mul v24.8h, v11.8h, v0.8h // .............................................................e.....................'............................................................~..................... + // mls v24.8h, v27.8h, v7.h[0] // ................................................................e..................'...............................................................~.................. + // sub v11.8h, v9.8h, v24.8h // .........................................................................e.........'........................................................................~......... + // add v9.8h, v9.8h, v24.8h // ..............................................................................e....'.............................................................................~.... + // sqrdmulh v27.8h, v9.8h, v5.8h // .......~...........................................................................'......*........................................................................... + // mul v24.8h, v9.8h, v1.8h // ............................~......................................................'...........................*...................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................~....................................................'.............................*.................................................... 
+ // sub v9.8h, v8.8h, v24.8h // ......................................~............................................'.....................................*............................................ + // add v8.8h, v8.8h, v24.8h // .....................................~.............................................'....................................*............................................. + // sqrdmulh v27.8h, v11.8h, v6.8h // ................................................................................e..'...............................................................................~.. + // mul v24.8h, v11.8h, v2.8h // .~.................................................................................'*................................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................~................................................................'.................*................................................................ + // sub v11.8h, v10.8h, v24.8h // .........................~.........................................................'........................*......................................................... + // add v10.8h, v10.8h, v24.8h // .......................~...........................................................'......................*........................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..........................................~........................................'.........................................*........................................ + // srshr v25.8h, v25.8h, #11 // ..................................................~................................'.................................................*................................ 
+ // mls v8.8h, v25.8h, v7.h[0] // ......................................................~............................'.....................................................*............................ + // sqdmulh v25.8h, v9.8h, v7.h[1] // ..............................................~....................................'.............................................*.................................... + // srshr v25.8h, v25.8h, #11 // .......................................................~...........................'......................................................*........................... + // mls v9.8h, v25.8h, v7.h[0] // ............................................................~......................'...........................................................*...................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ....................................~..............................................'...................................*.............................................. + // srshr v25.8h, v25.8h, #11 // ...........................................~.......................................'..........................................*....................................... + // mls v10.8h, v25.8h, v7.h[0] // ................................................~..................................'...............................................*.................................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................................~...........................................'......................................*........................................... + // srshr v25.8h, v25.8h, #11 // ...............................................~...................................'..............................................*................................... 
+ // mls v11.8h, v25.8h, v7.h[0] // ...................................................~...............................'..................................................*............................... + // trn1 v25.4s, v8.4s, v9.4s // .................................................................~.................'................................................................*................. + // trn2 v26.4s, v8.4s, v9.4s // ...................................................................~...............'..................................................................*............... + // trn1 v27.4s, v10.4s, v11.4s // ..............................................................~....................'.............................................................*.................... + // trn2 v28.4s, v10.4s, v11.4s // ...........................................................~.......................'..........................................................*....................... + // trn2 v10.2d, v25.2d, v27.2d // .....................................................................~.............'....................................................................*............. + // trn2 v11.2d, v26.2d, v28.2d // ...........................................................................~.......'..........................................................................*....... + // trn1 v8.2d, v25.2d, v27.2d // ......................................................................~............'.....................................................................*............ + // trn1 v9.2d, v26.2d, v28.2d // ..........................................................................~........'.........................................................................*........ 
+ // str q8, [x1], #64 // ...............................................................................~...'..............................................................................*... + // str q9, [x1, #(-(64) + 16*1)] // .................................................................................~.'................................................................................*. + // str q10, [x1, #(-(64) + 16*2)] // .............................................................................~.....'............................................................................*..... + // str q11, [x1, #(-(64) + 16*3)] // ..................................................................................~'.................................................................................* + + sub count, count, #1 cbnz count, layer4567_start - sqrdmulh v13.8H, v12.8H, v30.8H // ...*................................ - ldr_vo v14, x4, -48 // .*.................................. - sub v15.8H, v17.8H, v5.8H // ..*................................. - add v8.8H, v17.8H, v5.8H // ....*............................... - // gap // .................................... - // gap // .................................... - mul v22.8H, v21.8H, v28.8H // *................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sqrdmulh v14.8H, v21.8H, v14.8H // ......*............................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v3.8H, v13.8H, v7.H[0] // .....*.............................. 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v22.8H, v14.8H, v7.H[0] // .......*............................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v9.8H, v15.8H, v3.8H // ............*....................... - sub v4.8H, v15.8H, v3.8H // .........*.......................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v1.8H, v8.8H, v22.8H // ..........*......................... - sqdmulh v19.8H, v9.8H, v7.H[1] // ................*................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v0.8H, v8.8H, v22.8H // ........*........................... - // gap // .................................... - sqdmulh v5.8H, v4.8H, v7.H[1] // .............*...................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sqdmulh v18.8H, v1.8H, v7.H[1] // ...........*........................ - // gap // .................................... - // gap // .................................... 
- srshr v2.8H, v19.8H, #11 // ......................*............. - // gap // .................................... - // gap // .................................... - sqdmulh v20.8H, v0.8H, v7.H[1] // ..............*..................... - // gap // .................................... - // gap // .................................... - srshr v23.8H, v5.8H, #11 // .................*.................. - // gap // .................................... - // gap // .................................... - mls v9.8H, v2.8H, v7.H[0] // .......................*............ - // gap // .................................... - // gap // .................................... - srshr v30.8H, v18.8H, #11 // ...............*.................... - // gap // .................................... - // gap // .................................... - mls v4.8H, v23.8H, v7.H[0] // ....................*............... - // gap // .................................... - // gap // .................................... - srshr v20.8H, v20.8H, #11 // ...................*................ - // gap // .................................... - // gap // .................................... - mls v1.8H, v30.8H, v7.H[0] // ..................*................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v0.8H, v20.8H, v7.H[0] // .....................*.............. - // gap // .................................... - // gap // .................................... - trn1 v30.4S, v9.4S, v4.4S // .........................*.......... - // gap // .................................... - // gap // .................................... - trn2 v11.4S, v9.4S, v4.4S // ...........................*........ - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - trn2 v5.4S, v1.4S, v0.4S // ..........................*......... - trn1 v14.4S, v1.4S, v0.4S // ........................*........... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - trn2 v12.2D, v14.2D, v30.2D // ............................*....... - trn1 v29.2D, v14.2D, v30.2D // .............................*...... - // gap // .................................... - trn2 v22.2D, v5.2D, v11.2D // ..............................*..... - trn1 v20.2D, v5.2D, v11.2D // ...............................*.... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str_vo v12, x1, 32 // ................................*... - str_vi v29, x1, 64 // .................................*.. - // gap // .................................... - str_vo v22, x1, -16 // ..................................*. - str_vo v20, x1, -48 // ...................................* - // gap // .................................... - - // original source code - // mul v6.8H, v21.8H, v28.8H // ....*............................... || ..*...................................... - // ldr_vo v14, x4, -48 // .*.................................. || *........................................ - // sub v29.8H, v17.8H, v5.8H // ..*................................. 
|| *........................................ - // sqrdmulh v8.8H, v12.8H, v30.8H // *................................... || *........................................ - // add v10.8H, v17.8H, v5.8H // ...*................................ || .*....................................... - // mls v3.8H, v8.8H, v7.H[0] // ......*............................. || ......*.................................. - // sqrdmulh v9.8H, v21.8H, v14.8H // .....*.............................. || ....*.................................... - // mls v6.8H, v9.8H, v7.H[0] // .......*............................ || .........*............................... - // sub v14.8H, v10.8H, v6.8H // ............*....................... || ................*........................ - // sub v16.8H, v29.8H, v3.8H // .........*.......................... || ...........*............................. - // add v13.8H, v10.8H, v6.8H // ..........*......................... || ..............*.......................... - // sqdmulh v30.8H, v13.8H, v7.H[1] // ..............*..................... || ..................*...................... - // add v3.8H, v29.8H, v3.8H // ........*........................... || ...........*............................. - // sqdmulh v8.8H, v16.8H, v7.H[1] // .............*...................... || ................*........................ - // sqdmulh v6.8H, v14.8H, v7.H[1] // ................*................... || ....................*.................... - // srshr v22.8H, v30.8H, #11 // ...................*................ || .......................*................. - // sqdmulh v9.8H, v3.8H, v7.H[1] // ...........*........................ || ..............*.......................... - // srshr v30.8H, v8.8H, #11 // .................*.................. || .....................*................... - // mls v13.8H, v22.8H, v7.H[0] // ......................*............. || ..........................*.............. - // srshr v8.8H, v6.8H, #11 // .....................*.............. 
|| .........................*............... - // mls v16.8H, v30.8H, v7.H[0] // ....................*............... || ........................*................ - // mls v14.8H, v8.8H, v7.H[0] // .......................*............ || ............................*............ - // srshr v30.8H, v9.8H, #11 // ...............*.................... || ...................*..................... - // mls v3.8H, v30.8H, v7.H[0] // ..................*................. || ......................*.................. - // trn1 v30.4S, v13.4S, v14.4S // ...........................*........ || .................................*....... - // trn1 v8.4S, v3.4S, v16.4S // ........................*........... || .............................*........... - // trn2 v20.4S, v13.4S, v14.4S // ..........................*......... || .................................*....... - // trn2 v14.4S, v3.4S, v16.4S // .........................*.......... || ..............................*.......... - // trn2 v25.2D, v30.2D, v8.2D // ............................*....... || ....................................*.... - // trn1 v15.2D, v30.2D, v8.2D // .............................*...... || ....................................*.... - // trn2 v11.2D, v20.2D, v14.2D // ..............................*..... || .....................................*... - // trn1 v14.2D, v20.2D, v14.2D // ...............................*.... || .....................................*... - // str_vo v25, x1, 32 // ................................*... || .......................................*. - // str_vi v15, x1, 64 // .................................*.. || .......................................*. - // str_vo v11, x1, -16 // ..................................*. 
|| ........................................* - // str_vo v14, x1, -48 // ...................................* || ........................................* - + // Instructions: 36 + // Expected cycles: 39 + // Expected IPC: 0.92 + // + // Cycle bound: 39.0 + // IPC bound: 0.92 + // + // Wall time: 0.59s + // User time: 0.59s + // + // -------- original position --------> + // 0 25 + // |------------------------|---------- + trn1 v14.2D, v3.2D, v22.2D // .*.................................. + mul v8.8H, v27.8H, v21.8H // *................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sqrdmulh v26.8H, v28.8H, v31.8H // ..*................................. + // gap // .................................... + // gap // .................................... + sub v10.8H, v14.8H, v11.8H // ...*................................ + // gap // .................................... + // gap // .................................... + mls v8.8H, v0.8H, v7.H[0] // .....*.............................. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v1.8H, v28.8H, v5.8H // ........*........................... + add v5.8H, v14.8H, v11.8H // ....*............................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v1.8H, v26.8H, v7.H[0] // .........*.......................... + // gap // .................................... + // gap // .................................... + add v20.8H, v10.8H, v8.8H // ......*............................. 
+ // gap // .................................... + // gap // .................................... + sub v10.8H, v10.8H, v8.8H // .......*............................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sqdmulh v21.8H, v20.8H, v7.H[1] // ..........*......................... + // gap // .................................... + // gap // .................................... + add v2.8H, v5.8H, v1.8H // ...........*........................ + // gap // .................................... + // gap // .................................... + sub v26.8H, v5.8H, v1.8H // ............*....................... + // gap // .................................... + sqdmulh v16.8H, v10.8H, v7.H[1] // .............*...................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sqdmulh v4.8H, v2.8H, v7.H[1] // ..............*..................... + srshr v28.8H, v21.8H, #11 // ...............*.................... + // gap // .................................... + // gap // .................................... + sqdmulh v31.8H, v26.8H, v7.H[1] // ................*................... + // gap // .................................... + // gap // .................................... + srshr v13.8H, v16.8H, #11 // .................*.................. + // gap // .................................... + // gap // .................................... + mls v20.8H, v28.8H, v7.H[0] // ..................*................. + // gap // .................................... + // gap // .................................... + srshr v24.8H, v4.8H, #11 // ...................*................ 
+ // gap // .................................... + // gap // .................................... + mls v10.8H, v13.8H, v7.H[0] // ....................*............... + // gap // .................................... + // gap // .................................... + srshr v21.8H, v31.8H, #11 // ......................*............. + // gap // .................................... + // gap // .................................... + mls v2.8H, v24.8H, v7.H[0] // .....................*.............. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v26.8H, v21.8H, v7.H[0] // ........................*........... + // gap // .................................... + // gap // .................................... + trn2 v12.4S, v20.4S, v10.4S // .......................*............ + // gap // .................................... + // gap // .................................... + trn1 v30.4S, v20.4S, v10.4S // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + trn1 v19.4S, v2.4S, v26.4S // ..........................*......... + // gap // .................................... + // gap // .................................... + trn2 v10.4S, v2.4S, v26.4S // ...........................*........ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + trn2 v29.2D, v19.2D, v30.2D // ............................*....... + trn1 v24.2D, v19.2D, v30.2D // .............................*...... + // gap // .................................... + // gap // .................................... + trn2 v5.2D, v10.2D, v12.2D // ...............................*.... + trn1 v30.2D, v10.2D, v12.2D // ..............................*..... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q29, [x1, #32] // ................................*... + str q24, [x1], #64 // .................................*.. + // gap // .................................... + // gap // .................................... + str q5, [x1, #-16] // ...................................* + str q30, [x1, #-48] // ..................................*. + + // ---------- new position -----------> + // 0 25 + // |------------------------|---------- + // mul v27.8H, v27.8H, v21.8H // .*.................................. + // trn1 v23.2D, v3.2D, v22.2D // *................................... + // sqrdmulh v16.8H, v28.8H, v31.8H // ..*................................. + // sub v15.8H, v23.8H, v11.8H // ...*................................ + // add v14.8H, v23.8H, v11.8H // ......*............................. + // mls v27.8H, v0.8H, v7.H[0] // ....*............................... + // add v6.8H, v15.8H, v27.8H // ........*........................... + // sub v11.8H, v15.8H, v27.8H // .........*.......................... + // mul v15.8H, v28.8H, v5.8H // .....*.............................. + // mls v15.8H, v16.8H, v7.H[0] // .......*............................ + // sqdmulh v16.8H, v6.8H, v7.H[1] // ..........*......................... + // add v26.8H, v14.8H, v15.8H // ...........*........................ + // sub v0.8H, v14.8H, v15.8H // ............*....................... 
+ // sqdmulh v15.8H, v11.8H, v7.H[1] // .............*...................... + // sqdmulh v13.8H, v26.8H, v7.H[1] // ..............*..................... + // srshr v27.8H, v16.8H, #11 // ...............*.................... + // sqdmulh v16.8H, v0.8H, v7.H[1] // ................*................... + // srshr v15.8H, v15.8H, #11 // .................*.................. + // mls v6.8H, v27.8H, v7.H[0] // ..................*................. + // srshr v1.8H, v13.8H, #11 // ...................*................ + // mls v11.8H, v15.8H, v7.H[0] // ....................*............... + // mls v26.8H, v1.8H, v7.H[0] // ......................*............. + // srshr v28.8H, v16.8H, #11 // .....................*.............. + // trn2 v16.4S, v6.4S, v11.4S // ........................*........... + // mls v0.8H, v28.8H, v7.H[0] // .......................*............ + // trn1 v27.4S, v6.4S, v11.4S // .........................*.......... + // trn1 v21.4S, v26.4S, v0.4S // ..........................*......... + // trn2 v14.4S, v26.4S, v0.4S // ...........................*........ + // trn2 v15.2D, v21.2D, v27.2D // ............................*....... + // trn1 v23.2D, v21.2D, v27.2D // .............................*...... + // trn1 v24.2D, v14.2D, v16.2D // ...............................*.... + // trn2 v16.2D, v14.2D, v16.2D // ..............................*..... + // str q15, [x1, #32] // ................................*... + // str q23, [x1], #64 // .................................*.. + // str q24, [x1, #-48] // ...................................* + // str q16, [x1, #-16] // ..................................*. 
+ pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_firestorm.s index 57104a31..f7c28511 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_firestorm.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro 
ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,28 +73,28 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data @@ -146,7 +116,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +127,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm 
-.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,19 +156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -337,900 +307,968 @@ _ntt_kyber_123_4567_manual_st4_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr q3, [x0, #448] // .*.................................... - ldr q27, [x0, #320] // *..................................... - ldr q31, [x0, #256] // ....*................................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - ldr q29, [x0, #192] // .....*................................ - ldr q8, [x0, #384] // ..*................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - ldr q9, [x0, #128] // ...*.................................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v4.8H, v31.8H, v0.H[1] // ............*......................... - sqrdmulh v6.8H, v3.8H, v0.H[1] // .........*............................ - mul v20.8H, v3.8H, v0.H[0] // ..........*........................... - sqrdmulh v23.8H, v27.8H, v0.H[1] // ..............*....................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v2.8H, v27.8H, v0.H[0] // ...........*.......................... - mul v30.8H, v31.8H, v0.H[0] // .............*........................ - mul v17.8H, v8.8H, v0.H[0] // .......*.............................. - sqrdmulh v14.8H, v8.8H, v0.H[1] // ........*............................. - ldr q8, [x0, #64] // ......*............................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- mls v20.8H, v6.8H, v7.H[0] // ................*..................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v30.8H, v4.8H, v7.H[0] // .................*.................... - mls v2.8H, v23.8H, v7.H[0] // ..................*................... - mls v17.8H, v14.8H, v7.H[0] // ...............*...................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - add v12.8H, v29.8H, v20.8H // ...................*.................. - sub v10.8H, v29.8H, v20.8H // .....................*................ - ldr q20, [x0, #0] // .............................*........ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v22.8H, v9.8H, v17.8H // ....................*................. - add v3.8H, v9.8H, v17.8H // ........................*............. - sub v29.8H, v8.8H, v2.8H // ......................*............... 
- add v16.8H, v8.8H, v2.8H // .......................*.............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v27.8H, v12.8H, v0.H[3] // ..............................*....... - mul v15.8H, v12.8H, v0.H[2] // ...............................*...... - sqrdmulh v5.8H, v10.8H, v0.H[5] // ............................*......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqrdmulh v8.8H, v22.8H, v0.H[5] // .........................*............ - mul v14.8H, v22.8H, v0.H[4] // ..........................*........... - mul v22.8H, v10.8H, v0.H[4] // ...........................*.......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v28.8H, v20.8H, v30.8H // ....................................*. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v15.8H, v27.8H, v7.H[0] // .....................................* - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - mul v2.8H, v3.8H, v0.H[2] // ..................................*... - sqrdmulh v3.8H, v3.8H, v0.H[3] // ................................*..... - mls v22.8H, v5.8H, v7.H[0] // ...................................*.. - mls v14.8H, v8.8H, v7.H[0] // .................................*.... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... + // Instructions: 32 + // Expected cycles: 16 + // Expected IPC: 2.00 + // + // Cycle bound: 16.0 + // IPC bound: 2.00 + // + // Wall time: 0.37s + // User time: 0.37s + // + // ------ original position ------> + // 0 25 + // |------------------------|------ + ldr q15, [x0, #448] // .*.............................. + ldr q27, [x0, #64] // *............................... + ldr q14, [x0, #128] // ..*............................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + ldr q16, [x0, #320] // ...*............................ + ldr q6, [x0, #384] // ....*........................... + ldr q26, [x0, #256] // .............*.................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + ldr q11, [x0, #192] // .....*.......................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sqrdmulh v13.8H, v15.8H, v0.H[1] // ......*......................... + mul v15.8H, v15.8H, v0.H[0] // .......*........................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v31.8H, v16.8H, v0.H[0] // ...........*.................... + sqrdmulh v16.8H, v16.8H, v0.H[1] // ............*................... + sqrdmulh v25.8H, v6.8H, v0.H[1] // ........*....................... + mul v22.8H, v6.8H, v0.H[0] // ..........*..................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sqrdmulh v23.8H, v26.8H, v0.H[1] // ......................*......... + mul v19.8H, v26.8H, v0.H[0] // ........................*....... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v15.8H, v13.8H, v7.H[0] // .........*...................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + mls v31.8H, v16.8H, v7.H[0] // .....................*.......... + mls v22.8H, v25.8H, v7.H[0] // ................*............... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v6.8H, v11.8H, v15.8H // ..............*................. + sub v15.8H, v11.8H, v15.8H // ...............*................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sub v16.8H, v14.8H, v22.8H // .......................*........ + add v13.8H, v27.8H, v31.8H // ............................*... + sub v28.8H, v27.8H, v31.8H // .............................*.. + add v11.8H, v14.8H, v22.8H // .........................*...... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v5.8H, v6.8H, v0.H[2] // ...................*............ + sqrdmulh v27.8H, v6.8H, v0.H[3] // ....................*........... + sqrdmulh v14.8H, v15.8H, v0.H[5] // .................*.............. + mul v12.8H, v15.8H, v0.H[4] // ..................*............. 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sqrdmulh v10.8H, v16.8H, v0.H[5] // ..............................*. + mul v30.8H, v16.8H, v0.H[4] // ...............................* + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v5.8H, v27.8H, v7.H[0] // ..........................*..... + mls v12.8H, v14.8H, v7.H[0] // ...........................*.... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ - // original source code - // ldr q10, [x0, #320] // .*.................................... - // ldr q8, [x0, #448] // *..................................... - // ldr q11, [x0, #384] // ....*................................. - // ldr q13, [x0, #128] // .....*................................ - // ldr q12, [x0, #256] // ..*................................... - // ldr q17, [x0, #192] // ...*.................................. - // ldr q24, [x0, #64] // ..............*....................... - // mul v21.8H, v11.8H, v0.H[0] // ............*......................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .............*........................ 
- // sqrdmulh v16.8H, v8.8H, v0.H[1] // .......*.............................. - // mul v15.8H, v8.8H, v0.H[0] // ........*............................. - // mul v5.8H, v10.8H, v0.H[0] // ..........*........................... - // sqrdmulh v25.8H, v12.8H, v0.H[1] // ......*............................... - // mul v30.8H, v12.8H, v0.H[0] // ...........*.......................... - // sqrdmulh v9.8H, v10.8H, v0.H[1] // .........*............................ - // mls v21.8H, v11.8H, v7.H[0] // ..................*................... - // mls v15.8H, v16.8H, v7.H[0] // ...............*...................... - // mls v30.8H, v25.8H, v7.H[0] // ................*..................... - // mls v5.8H, v9.8H, v7.H[0] // .................*.................... - // add v27.8H, v17.8H, v15.8H // ...................*.................. - // sub v22.8H, v13.8H, v21.8H // ......................*............... - // sub v6.8H, v17.8H, v15.8H // ....................*................. - // sub v29.8H, v24.8H, v5.8H // ........................*............. - // add v16.8H, v24.8H, v5.8H // .........................*............ - // add v31.8H, v13.8H, v21.8H // .......................*.............. - // sqrdmulh v4.8H, v22.8H, v0.H[5] // .............................*........ - // mul v14.8H, v22.8H, v0.H[4] // ..............................*....... - // mul v22.8H, v6.8H, v0.H[4] // ...............................*...... - // sqrdmulh v23.8H, v6.8H, v0.H[5] // ............................*......... - // ldr q20, [x0, #0] // .....................*................ - // sqrdmulh v19.8H, v27.8H, v0.H[3] // ..........................*........... - // mul v15.8H, v27.8H, v0.H[2] // ...........................*.......... - // sqrdmulh v3.8H, v31.8H, v0.H[3] // ...................................*.. - // mls v14.8H, v4.8H, v7.H[0] // .....................................* - // mul v2.8H, v31.8H, v0.H[2] // ..................................*... 
- // mls v22.8H, v23.8H, v7.H[0] // ....................................*. - // sub v28.8H, v20.8H, v30.8H // ................................*..... - // mls v15.8H, v19.8H, v7.H[0] // .................................*.... + // -------- new position ---------> + // 0 25 + // |------------------------|------ + // ldr q25, [x0, #64] // .*.............................. + // ldr q23, [x0, #448] // *............................... + // ldr q16, [x0, #128] // ..*............................. + // ldr q21, [x0, #320] // ...*............................ + // ldr q11, [x0, #384] // ....*........................... + // ldr q15, [x0, #192] // ......*......................... + // sqrdmulh v14.8H, v23.8H, v0.H[1] // .......*........................ + // mul v31.8H, v23.8H, v0.H[0] // ........*....................... + // sqrdmulh v20.8H, v11.8H, v0.H[1] // ...........*.................... + // mls v31.8H, v14.8H, v7.H[0] // ...............*................ + // mul v17.8H, v11.8H, v0.H[0] // ............*................... + // mul v22.8H, v21.8H, v0.H[0] // .........*...................... + // sqrdmulh v24.8H, v21.8H, v0.H[1] // ..........*..................... + // ldr q13, [x0, #256] // .....*.......................... + // add v21.8H, v15.8H, v31.8H // ..................*............. + // sub v15.8H, v15.8H, v31.8H // ...................*............ + // mls v17.8H, v20.8H, v7.H[0] // .................*.............. + // sqrdmulh v20.8H, v15.8H, v0.H[5] // ..........................*..... + // mul v12.8H, v15.8H, v0.H[4] // ...........................*.... + // mul v5.8H, v21.8H, v0.H[2] // ........................*....... + // sqrdmulh v3.8H, v21.8H, v0.H[3] // .........................*...... + // mls v22.8H, v24.8H, v7.H[0] // ................*............... + // sqrdmulh v23.8H, v13.8H, v0.H[1] // .............*.................. + // sub v27.8H, v16.8H, v17.8H // ....................*........... 
+ // mul v19.8H, v13.8H, v0.H[0] // ..............*................. + // add v11.8H, v16.8H, v17.8H // .......................*........ + // mls v5.8H, v3.8H, v7.H[0] // ..............................*. + // mls v12.8H, v20.8H, v7.H[0] // ...............................* + // add v13.8H, v25.8H, v22.8H // .....................*.......... + // sub v28.8H, v25.8H, v22.8H // ......................*......... + // sqrdmulh v10.8H, v27.8H, v0.H[5] // ............................*... + // mul v30.8H, v27.8H, v0.H[4] // .............................*.. sub count, count, #1 layer123_start: - ldr q10, [x0, #336] // .....e...................................................................... - ldr q8, [x0, #464] // .......e.................................................................... - ldr q11, [x0, #400] // ......e..................................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Cycle bound: 16.0 + // IPC bound: 4.75 + // + // Wall time: 3600.50s + // User time: 3600.50s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q25, [x0, #80] // .e.......................................................................... + mls v19.8H, v23.8H, v7.H[0] // ..........*................................................................. + ldr q23, [x0, #464] // .......e.................................................................... + ldr q24, [x0, #0] // *........................................................................... + mul v27.8H, v11.8H, v0.H[2] // .............................*.............................................. + sqrdmulh v14.8H, v11.8H, v0.H[3] // ............................*............................................... // gap // ............................................................................ 
// gap // ............................................................................ + add v3.8H, v13.8H, v5.8H // .....................................*...................................... + sub v29.8H, v13.8H, v5.8H // ....................................*....................................... + sub v22.8H, v28.8H, v12.8H // ..............................................*............................. + add v8.8H, v28.8H, v12.8H // ...............................................*............................ + ldr q16, [x0, #144] // ..e......................................................................... + ldr q21, [x0, #336] // .....e...................................................................... // gap // ............................................................................ // gap // ............................................................................ - add v31.8H, v20.8H, v30.8H // ............*............................................................... - add v6.8H, v28.8H, v14.8H // ..........................................*................................. - mls v2.8H, v3.8H, v7.H[0] // ..............................*............................................. - add v26.8H, v29.8H, v22.8H // ...............................................*............................ - ldr q13, [x0, #144] // ..e......................................................................... - ldr q12, [x0, #272] // ....e....................................................................... - sub v22.8H, v29.8H, v22.8H // ..............................................*............................. + mls v30.8H, v10.8H, v7.H[0] // ........................................*................................... // gap // ............................................................................ + ldr q11, [x0, #400] // ......e..................................................................... 
// gap // ............................................................................ - add v3.8H, v16.8H, v15.8H // .....................................*...................................... - ldr q17, [x0, #208] // ...e........................................................................ - sub v28.8H, v28.8H, v14.8H // .........................................*.................................. - sub v19.8H, v16.8H, v15.8H // ....................................*....................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v27.8H, v26.8H, v1.H[2] // ..........................................................*................. - sqrdmulh v4.8H, v26.8H, v1.H[3] // ...........................................................*................ - sqrdmulh v23.8H, v22.8H, v1.H[5] // ................................................................*........... - mul v22.8H, v22.8H, v1.H[4] // ...............................................................*............ - ldr q24, [x0, #80] // .e.......................................................................... + sqrdmulh v18.8H, v22.8H, v1.H[5] // ...............................................................*............ + mul v12.8H, v22.8H, v1.H[4] // ................................................................*........... + mul v4.8H, v29.8H, v1.H[0] // ......................................................*..................... + sqrdmulh v9.8H, v29.8H, v1.H[1] // .....................................................*...................... + ldr q15, [x0, #208] // ...e........................................................................ 
// gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v21.8H, v11.8H, v0.H[0] // ..................e......................................................... - sqrdmulh v11.8H, v11.8H, v0.H[1] // ...................e........................................................ - sqrdmulh v16.8H, v8.8H, v0.H[1] // ........................e................................................... - mul v15.8H, v8.8H, v0.H[0] // .......................e.................................................... + mls v27.8H, v14.8H, v7.H[0] // ..............................*............................................. + sqrdmulh v14.8H, v23.8H, v0.H[1] // .......................e.................................................... + mul v31.8H, v23.8H, v0.H[0] // ........................e................................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v5.8H, v10.8H, v0.H[0] // .............e.............................................................. - sqrdmulh v25.8H, v12.8H, v0.H[1] // .........e.................................................................. - mul v30.8H, v12.8H, v0.H[0] // ........e................................................................... - sqrdmulh v9.8H, v10.8H, v0.H[1] // ..............e............................................................. + sub v10.8H, v24.8H, v19.8H // ...........*................................................................ 
+ add v26.8H, v24.8H, v19.8H // ............*............................................................... + mul v28.8H, v8.8H, v1.H[2] // ...........................................................*................ + sqrdmulh v23.8H, v8.8H, v1.H[3] // ..........................................................*................. + mul v19.8H, v3.8H, v0.H[6] // .................................................*.......................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v10.8H, v19.8H, v1.H[0] // .....................................................*...................... - mls v27.8H, v4.8H, v7.H[0] // ............................................................*............... - mul v12.8H, v3.8H, v0.H[6] // ................................................*........................... - mls v22.8H, v23.8H, v7.H[0] // .................................................................*.......... + sqrdmulh v29.8H, v3.8H, v0.H[7] // ................................................*........................... + mls v12.8H, v18.8H, v7.H[0] // .................................................................*.......... + mls v4.8H, v9.8H, v7.H[0] // .......................................................*.................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ 
- sqrdmulh v26.8H, v19.8H, v1.H[1] // ......................................................*..................... - sqrdmulh v14.8H, v3.8H, v0.H[7] // .................................................*.......................... - mls v21.8H, v11.8H, v7.H[0] // ....................e....................................................... - mls v15.8H, v16.8H, v7.H[0] // .........................e.................................................. + sqrdmulh v20.8H, v11.8H, v0.H[1] // ..................e......................................................... + mls v31.8H, v14.8H, v7.H[0] // .........................e.................................................. + sub v3.8H, v10.8H, v30.8H // .........................................*.................................. + mul v17.8H, v11.8H, v0.H[0] // ...................e........................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v8.8H, v31.8H, v2.8H // ...............................*............................................ - add v2.8H, v31.8H, v2.8H // ................................*........................................... - mls v30.8H, v25.8H, v7.H[0] // ..........e................................................................. - mls v5.8H, v9.8H, v7.H[0] // ...............e............................................................ + sub v11.8H, v26.8H, v27.8H // ...............................*............................................ + mul v22.8H, v21.8H, v0.H[0] // ..............e............................................................. 
+ sqrdmulh v24.8H, v21.8H, v0.H[1] // .............e.............................................................. + mls v28.8H, v23.8H, v7.H[0] // ............................................................*............... + add v8.8H, v10.8H, v30.8H // ..........................................*................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sub v25.8H, v6.8H, v27.8H // .............................................................*.............. - add v23.8H, v28.8H, v22.8H // ...................................................................*........ - sub v4.8H, v28.8H, v22.8H // ..................................................................*......... - add v3.8H, v6.8H, v27.8H // ..............................................................*............. + add v30.8H, v3.8H, v12.8H // ...................................................................*........ + sub v9.8H, v3.8H, v12.8H // ..................................................................*......... + mls v19.8H, v29.8H, v7.H[0] // ..................................................*......................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + ldr q13, [x0, #272] // ....e....................................................................... + add v12.8H, v11.8H, v4.8H // .........................................................*.................. 
+ add v21.8H, v15.8H, v31.8H // ...........................e................................................ + sub v15.8H, v15.8H, v31.8H // ..........................e................................................. + sub v14.8H, v11.8H, v4.8H // ........................................................*................... + mls v17.8H, v20.8H, v7.H[0] // ....................e....................................................... // gap // ............................................................................ - add v27.8H, v17.8H, v15.8H // ...........................e................................................ - mls v12.8H, v14.8H, v7.H[0] // ..................................................*......................... - sub v22.8H, v13.8H, v21.8H // .....................e...................................................... - sub v6.8H, v17.8H, v15.8H // ..........................e................................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + str q9, [x0, #448] // ...........................................................................* + str q12, [x0, #128] // ......................................................................*..... + add v6.8H, v8.8H, v28.8H // ..............................................................*............. + sub v28.8H, v8.8H, v28.8H // .............................................................*.............. + add v23.8H, v26.8H, v27.8H // ................................*........................................... // gap // ............................................................................ - sub v29.8H, v24.8H, v5.8H // ................e........................................................... 
- add v16.8H, v24.8H, v5.8H // .................e.......................................................... - str q25, [x0, #320] // .........................................................................*.. - str q4, [x0, #448] // ...........................................................................* - mls v10.8H, v26.8H, v7.H[0] // .......................................................*.................... - add v31.8H, v13.8H, v21.8H // ......................e..................................................... // gap // ............................................................................ // gap // ............................................................................ - str q3, [x0, #256] // ........................................................................*... - str q23, [x0, #384] // ..........................................................................*. - sqrdmulh v4.8H, v22.8H, v0.H[5] // .......................................e.................................... - mul v14.8H, v22.8H, v0.H[4] // ......................................e..................................... - mul v22.8H, v6.8H, v0.H[4] // ...........................................e................................ - sqrdmulh v23.8H, v6.8H, v0.H[5] // ............................................e............................... + sqrdmulh v20.8H, v15.8H, v0.H[5] // ...........................................e................................ + str q14, [x0, #192] // .......................................................................*.... + mul v12.8H, v15.8H, v0.H[4] // ............................................e............................... + str q30, [x0, #384] // ..........................................................................*. + mul v5.8H, v21.8H, v0.H[2] // ..................................e......................................... 
+ sqrdmulh v3.8H, v21.8H, v0.H[3] // .................................e.......................................... // gap // ............................................................................ - ldr q20, [x0, #16] // e........................................................................... - sqrdmulh v19.8H, v27.8H, v0.H[3] // ..................................e......................................... - mul v15.8H, v27.8H, v0.H[2] // .................................e.......................................... - add v11.8H, v2.8H, v12.8H // ....................................................*....................... // gap // ............................................................................ + str q28, [x0, #320] // .........................................................................*.. + sub v26.8H, v23.8H, v19.8H // ...................................................*........................ + add v2.8H, v23.8H, v19.8H // ....................................................*....................... + mls v22.8H, v24.8H, v7.H[0] // ...............e............................................................ + str q6, [x0, #256] // ........................................................................*... + sqrdmulh v23.8H, v13.8H, v0.H[1] // ........e................................................................... // gap // ............................................................................ // gap // ............................................................................ + sub v27.8H, v16.8H, v17.8H // .....................e...................................................... + mul v19.8H, v13.8H, v0.H[0] // .........e.................................................................. + add v11.8H, v16.8H, v17.8H // ......................e..................................................... // gap // ............................................................................ 
- sub v24.8H, v2.8H, v12.8H // ...................................................*........................ - sqrdmulh v3.8H, v31.8H, v0.H[3] // .............................e.............................................. - add v12.8H, v8.8H, v10.8H // .........................................................*.................. - sub v27.8H, v8.8H, v10.8H // ........................................................*................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v5.8H, v3.8H, v7.H[0] // ...................................e........................................ + mls v12.8H, v20.8H, v7.H[0] // .............................................e.............................. + str q26, [x0, #64] // .....................................................................*...... + str q2, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ - mls v14.8H, v4.8H, v7.H[0] // ........................................e................................... - mul v2.8H, v31.8H, v0.H[2] // ............................e............................................... - str q11, [x0], #(16) // ....................................................................*....... - mls v22.8H, v23.8H, v7.H[0] // .............................................e.............................. - str q24, [x0, #48] // .....................................................................*...... // gap // ............................................................................ // gap // ............................................................................ 
// gap // ............................................................................ - sub v28.8H, v20.8H, v30.8H // ...........e................................................................ - mls v15.8H, v19.8H, v7.H[0] // ...................................e........................................ - str q27, [x0, #176] // .......................................................................*.... - str q12, [x0, #112] // ......................................................................*..... + add v13.8H, v25.8H, v22.8H // .................e.......................................................... + sub v28.8H, v25.8H, v22.8H // ................e........................................................... + sqrdmulh v10.8H, v27.8H, v0.H[5] // ......................................e..................................... + mul v30.8H, v27.8H, v0.H[4] // .......................................e.................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - // original source code - // ldr q8, [x0, #0] // ...........................................................e................|..........................................................e................ - // ldr q9, [x0, #(1*(512/8))] // ..................e.........................................................|.................e......................................................... - // ldr q10, [x0, #(2*(512/8))] // .......e....................................................................|......e.................................................................... 
- // ldr q11, [x0, #(3*(512/8))] // ...........e................................................................|..........e................................................................ - // ldr q12, [x0, #(4*(512/8))] // ........e...................................................................|.......e................................................................... - // ldr q13, [x0, #(5*(512/8))] // e...........................................................................e........................................................................... - // ldr q14, [x0, #(6*(512/8))] // ..e.........................................................................|.e......................................................................... - // ldr q15, [x0, #(7*(512/8))] // .e..........................................................................|e.......................................................................... - // mul v24.8h, v12.8h, v0.h[0] // .........................e..................................................|........................e.................................................. - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ........................e...................................................|.......................e................................................... - // mls v24.8h, v12.8h, v7.h[0] // .....................................e......................................|....................................e...................................... - // sub v12.8h, v8.8h, v24.8h // ........................................................................e...|.......................................................................e... - // add v8.8h, v8.8h, v24.8h // ...*........................................................................|..*........................................................................ 
- // mul v24.8h, v13.8h, v0.h[0] // .......................e....................................................|......................e.................................................... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ..........................e.................................................|.........................e................................................. - // mls v24.8h, v13.8h, v7.h[0] // ......................................e.....................................|.....................................e..................................... - // sub v13.8h, v9.8h, v24.8h // ...............................................e............................|..............................................e............................ - // add v9.8h, v9.8h, v24.8h // ................................................e...........................|...............................................e........................... - // mul v24.8h, v14.8h, v0.h[0] // ...................e........................................................|..................e........................................................ - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ....................e.......................................................|...................e....................................................... - // mls v24.8h, v14.8h, v7.h[0] // .................................e..........................................|................................e.......................................... - // sub v14.8h, v10.8h, v24.8h // .............................................e..............................|............................................e.............................. - // add v10.8h, v10.8h, v24.8h // ....................................................e.......................|...................................................e....................... 
- // mul v24.8h, v15.8h, v0.h[0] // ......................e.....................................................|.....................e..................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[1] // .....................e......................................................|....................e...................................................... - // mls v24.8h, v15.8h, v7.h[0] // ..................................e.........................................|.................................e......................................... - // sub v15.8h, v11.8h, v24.8h // ..............................................e.............................|.............................................e............................. - // add v11.8h, v11.8h, v24.8h // ...........................................e................................|..........................................e................................ - // mul v24.8h, v10.8h, v0.h[2] // ....................................................................e.......|...................................................................e....... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ................................................................e...........|...............................................................e........... - // mls v24.8h, v10.8h, v7.h[0] // .....*......................................................................|....*...................................................................... - // sub v10.8h, v8.8h, v24.8h // ...................................*........................................|..................................*........................................ - // add v8.8h, v8.8h, v24.8h // ....................................*.......................................|...................................*....................................... 
- // mul v24.8h, v11.8h, v0.h[2] // .............................................................e..............|............................................................e.............. - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ............................................................e...............|...........................................................e............... - // mls v24.8h, v11.8h, v7.h[0] // .........................................................................e..|........................................................................e.. - // sub v11.8h, v9.8h, v24.8h // .............*..............................................................|............*.............................................................. - // add v9.8h, v9.8h, v24.8h // ..........*.................................................................|.........*................................................................. - // mul v24.8h, v14.8h, v0.h[4] // ........................................................e...................|.......................................................e................... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .......................................................e....................|......................................................e.................... - // mls v24.8h, v14.8h, v7.h[0] // ...................................................................e........|..................................................................e........ - // sub v14.8h, v12.8h, v24.8h // ............*...............................................................|...........*............................................................... - // add v12.8h, v12.8h, v24.8h // ....*.......................................................................|...*....................................................................... 
- // mul v24.8h, v15.8h, v0.h[4] // .........................................................e..................|........................................................e.................. - // sqrdmulh v15.8h, v15.8h, v0.h[5] // ..........................................................e.................|.........................................................e................. - // mls v24.8h, v15.8h, v7.h[0] // ......................................................................e.....|.....................................................................e..... - // sub v15.8h, v13.8h, v24.8h // .........*..................................................................|........*.................................................................. - // add v13.8h, v13.8h, v24.8h // ......*.....................................................................|.....*..................................................................... - // mul v24.8h, v9.8h, v0.h[6] // .............................*..............................................|............................*.............................................. - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ................................*...........................................|...............................*........................................... - // mls v24.8h, v9.8h, v7.h[0] // ............................................*...............................|...........................................*............................... - // sub v9.8h, v8.8h, v24.8h // ...............................................................*............|..............................................................*............ - // add v8.8h, v8.8h, v24.8h // ..............................................................*.............|.............................................................*............. 
- // mul v24.8h, v11.8h, v1.h[0] // ...........................*................................................|..........................*................................................ - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...............................*............................................|..............................*............................................ - // mls v24.8h, v11.8h, v7.h[0] // ...................................................*........................|..................................................*........................ - // sub v11.8h, v10.8h, v24.8h // ..................................................................*.........|.................................................................*......... - // add v10.8h, v10.8h, v24.8h // .................................................................*..........|................................................................*.......... - // mul v24.8h, v13.8h, v1.h[2] // ..............*.............................................................|.............*............................................................. - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ...............*............................................................|..............*............................................................ - // mls v24.8h, v13.8h, v7.h[0] // ............................*...............................................|...........................*............................................... - // sub v13.8h, v12.8h, v24.8h // .......................................*....................................|......................................*.................................... - // add v12.8h, v12.8h, v24.8h // ..........................................*.................................|.........................................*................................. 
- // mul v24.8h, v15.8h, v1.h[4] // .................*..........................................................|................*.......................................................... - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ................*...........................................................|...............*........................................................... - // mls v24.8h, v15.8h, v7.h[0] // ..............................*.............................................|.............................*............................................. - // sub v15.8h, v14.8h, v24.8h // .........................................*..................................|........................................*.................................. - // add v14.8h, v14.8h, v24.8h // ........................................*...................................|.......................................*................................... - // str q8, [x0], #(16) // .....................................................................*......|....................................................................*...... - // str q9, [x0, #(-16 + 1*(512/8))] // .......................................................................*....|......................................................................*.... - // str q10, [x0, #(-16 + 2*(512/8))] // ...........................................................................*|..........................................................................* - // str q11, [x0, #(-16 + 3*(512/8))] // ..........................................................................*.|.........................................................................*. - // str q12, [x0, #(-16 + 4*(512/8))] // .....................................................*......................|....................................................*...................... 
- // str q13, [x0, #(-16 + 5*(512/8))] // .................................................*..........................|................................................*.......................... - // str q14, [x0, #(-16 + 6*(512/8))] // ......................................................*.....................|.....................................................*..................... - // str q15, [x0, #(-16 + 7*(512/8))] // ..................................................*.........................|.................................................*......................... + // ------------------------------------------------------------------ new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // ...~........................................................................'..*.................................................................... + // ldr q9, [x0, #(1*(512/8))] // e...........................................................................~....................................................................... + // ldr q10, [x0, #(2*(512/8))] // ..........e.................................................................'.........~............................................................. + // ldr q11, [x0, #(3*(512/8))] // ..................e.........................................................'.................~..................................................... + // ldr q12, [x0, #(4*(512/8))] // ..........................................e.................................'.........................................~............................. 
+ // ldr q13, [x0, #(5*(512/8))] // ...........e................................................................'..........~............................................................ + // ldr q14, [x0, #(6*(512/8))] // .............e..............................................................'............~.......................................................... + // ldr q15, [x0, #(7*(512/8))] // ..e.........................................................................'.~..................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ................................................................e...........'...............................................................~....... + // mul v24.8h, v12.8h, v0.h[0] // ..................................................................e.........'.................................................................~..... + // mls v24.8h, v27.8h, v7.h[0] // .~..........................................................................'*...................................................................... + // sub v12.8h, v8.8h, v24.8h // ......................~.....................................................'.....................*................................................. + // add v8.8h, v8.8h, v24.8h // .......................~....................................................'......................*................................................ + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ....................................e.......................................'...................................~................................... + // mul v24.8h, v13.8h, v0.h[0] // ...................................e........................................'..................................~.................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..............................................................e.............'.............................................................~......... + // sub v13.8h, v9.8h, v24.8h // .........................................................................e..'....................................................................... + // add v9.8h, v9.8h, v24.8h // ........................................................................e...'....................................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..............................e.............................................'.............................~......................................... + // mul v24.8h, v14.8h, v0.h[0] // .................................e..........................................'................................~...................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................e............................'..............................................~........................ + // sub v14.8h, v10.8h, v24.8h // .................................................................e..........'................................................................~...... + // add v10.8h, v10.8h, v24.8h // ...................................................................e........'..................................................................~.... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ....................e.......................................................'...................~................................................... + // mul v24.8h, v15.8h, v0.h[0] // .....................e......................................................'....................~.................................................. 
+ // mls v24.8h, v27.8h, v7.h[0] // ...............................e............................................'..............................~........................................ + // sub v15.8h, v11.8h, v24.8h // .............................................e..............................'............................................~.......................... + // add v11.8h, v11.8h, v24.8h // ............................................e...............................'...........................................~........................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....~......................................................................'....*.................................................................. + // mul v24.8h, v10.8h, v0.h[2] // ....~.......................................................................'...*................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................~........................................................'..................*.................................................... + // sub v10.8h, v8.8h, v24.8h // ..................................~.........................................'.................................*..................................... + // add v8.8h, v8.8h, v24.8h // ....................................................~.......................'...................................................*................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..........................................................e.................'.........................................................~............. + // mul v24.8h, v11.8h, v0.h[2] // .........................................................e..................'........................................................~.............. 
+ // mls v24.8h, v27.8h, v7.h[0] // ....................................................................e.......'...................................................................~... + // sub v11.8h, v9.8h, v24.8h // .......~....................................................................'......*................................................................ + // add v9.8h, v9.8h, v24.8h // ......~.....................................................................'.....*................................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ..........................................................................e.'....................................................................... + // mul v24.8h, v14.8h, v0.h[4] // ...........................................................................e'....................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............~...............................................................'...........*........................................................... + // sub v14.8h, v12.8h, v24.8h // ................................~...........................................'...............................*....................................... + // add v12.8h, v12.8h, v24.8h // ......................................~.....................................'.....................................*................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .....................................................e......................'....................................................~.................. + // mul v24.8h, v15.8h, v0.h[4] // .......................................................e....................'......................................................~................ 
+ // mls v24.8h, v27.8h, v7.h[0] // .....................................................................e......'....................................................................~.. + // sub v15.8h, v13.8h, v24.8h // ........~...................................................................'.......*............................................................... + // add v13.8h, v13.8h, v24.8h // .........~..................................................................'........*.............................................................. + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ...........................~................................................'..........................*............................................ + // mul v24.8h, v9.8h, v0.h[6] // ..........................~.................................................'.........................*............................................. + // mls v24.8h, v27.8h, v7.h[0] // .........................................~..................................'........................................*.............................. + // sub v9.8h, v8.8h, v24.8h // ............................................................~...............'...........................................................*........... + // add v8.8h, v8.8h, v24.8h // .............................................................~..............'............................................................*.......... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // .................~..........................................................'................*...................................................... + // mul v24.8h, v11.8h, v1.h[0] // ................~...........................................................'...............*....................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .............................~..............................................'............................*.......................................... + // sub v11.8h, v10.8h, v24.8h // ..............................................~.............................'.............................................*......................... + // add v10.8h, v10.8h, v24.8h // ...........................................~................................'..........................................*............................ + // sqrdmulh v27.8h, v13.8h, v1.h[3] // .........................~..................................................'........................*.............................................. + // mul v24.8h, v13.8h, v1.h[2] // ........................~...................................................'.......................*............................................... + // mls v24.8h, v27.8h, v7.h[0] // .....................................~......................................'....................................*.................................. + // sub v13.8h, v12.8h, v24.8h // ...................................................~........................'..................................................*.................... + // add v12.8h, v12.8h, v24.8h // ..................................................~.........................'.................................................*..................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..............~.............................................................'.............*......................................................... + // mul v24.8h, v15.8h, v1.h[4] // ...............~............................................................'..............*........................................................ 
+ // mls v24.8h, v27.8h, v7.h[0] // ............................~...............................................'...........................*........................................... + // sub v15.8h, v14.8h, v24.8h // ........................................~...................................'.......................................*............................... + // add v14.8h, v14.8h, v24.8h // .......................................~....................................'......................................*................................ + // str q8, [x0], #(16) // .......................................................................~....'......................................................................* + // str q9, [x0, #(-16 + 1*(512/8))] // ......................................................................~.....'.....................................................................*. + // str q10, [x0, #(-16 + 2*(512/8))] // .................................................~..........................'................................................*...................... + // str q11, [x0, #(-16 + 3*(512/8))] // ......................................................~.....................'.....................................................*................. + // str q12, [x0, #(-16 + 4*(512/8))] // ...............................................................~............'..............................................................*........ + // str q13, [x0, #(-16 + 5*(512/8))] // ...........................................................~................'..........................................................*............ + // str q14, [x0, #(-16 + 6*(512/8))] // ........................................................~...................'.......................................................*............... 
+ // str q15, [x0, #(-16 + 7*(512/8))] // ................................................~...........................'...............................................*....................... sub count, count, #1 cbnz count, layer123_start - add v23.8H, v16.8H, v15.8H // .....*................................ - add v13.8H, v28.8H, v14.8H // .*.................................... - sub v8.8H, v28.8H, v14.8H // ......*............................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v26.8H, v29.8H, v22.8H // ....*................................. - add v22.8H, v29.8H, v22.8H // ...*.................................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v25.8H, v16.8H, v15.8H // .......*.............................. - mls v2.8H, v3.8H, v7.H[0] // ..*................................... - mul v10.8H, v23.8H, v0.H[6] // ..............*....................... - sqrdmulh v6.8H, v23.8H, v0.H[7] // .................*.................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v3.8H, v26.8H, v1.H[4] // ...........*.......................... - sqrdmulh v4.8H, v22.8H, v1.H[3] // .........*............................ - sqrdmulh v23.8H, v26.8H, v1.H[5] // ..........*........................... - // gap // ...................................... - // gap // ...................................... 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mul v26.8H, v22.8H, v1.H[2] // ........*............................. - add v22.8H, v20.8H, v30.8H // *..................................... - sqrdmulh v30.8H, v25.8H, v1.H[1] // ................*..................... - mul v11.8H, v25.8H, v1.H[0] // ............*......................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v10.8H, v6.8H, v7.H[0] // ........................*............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v3.8H, v23.8H, v7.H[0] // ...............*...................... - add v16.8H, v22.8H, v2.8H // ...................*.................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - mls v26.8H, v4.8H, v7.H[0] // .............*........................ - mls v11.8H, v30.8H, v7.H[0] // ...........................*.......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- add v31.8H, v16.8H, v10.8H // ..............................*....... - sub v4.8H, v22.8H, v2.8H // ..................*................... - sub v24.8H, v16.8H, v10.8H // ...............................*...... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v23.8H, v8.8H, v3.8H // ......................*............... - add v21.8H, v8.8H, v3.8H // .....................*................ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sub v25.8H, v4.8H, v11.8H // .................................*.... - add v11.8H, v4.8H, v11.8H // ................................*..... - str q31, [x0], #(16) // ..................................*... - str q24, [x0, #48] // ...................................*.. - add v17.8H, v13.8H, v26.8H // .......................*.............. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q21, [x0, #368] // .............................*........ - str q23, [x0, #432] // ..........................*........... - sub v19.8H, v13.8H, v26.8H // ....................*................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q17, [x0, #240] // ............................*......... - str q25, [x0, #176] // ....................................*. 
- // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - str q11, [x0, #112] // .....................................* - str q19, [x0, #304] // .........................*............ - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... + // Instructions: 44 + // Expected cycles: 14 + // Expected IPC: 3.14 + // + // Cycle bound: 14.0 + // IPC bound: 3.14 + // + // Wall time: 0.84s + // User time: 0.84s + // + // ------------ original position ------------> + // 0 25 + // |------------------------|------------------ + add v18.8H, v13.8H, v5.8H // ....*....................................... + sub v6.8H, v28.8H, v12.8H // ......*..................................... + ldr q22, [x0, #0] // .*.......................................... + add v24.8H, v28.8H, v12.8H // .......*.................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sqrdmulh v17.8H, v11.8H, v0.H[3] // ...*........................................ + mls v19.8H, v23.8H, v7.H[0] // *........................................... + mls v30.8H, v10.8H, v7.H[0] // ........*................................... + sub v28.8H, v13.8H, v5.8H // .....*...................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ 
+ // gap // ............................................ + // gap // ............................................ + sqrdmulh v20.8H, v24.8H, v1.H[3] // .................*.......................... + mul v8.8H, v24.8H, v1.H[2] // ................*........................... + mul v24.8H, v11.8H, v0.H[2] // ..*......................................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mul v27.8H, v6.8H, v1.H[4] // ..........*................................. + mul v16.8H, v18.8H, v0.H[6] // ..................*......................... + sqrdmulh v13.8H, v6.8H, v1.H[5] // .........*.................................. + sqrdmulh v2.8H, v18.8H, v0.H[7] // ...................*........................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mul v15.8H, v28.8H, v1.H[0] // ...........*................................ + add v9.8H, v22.8H, v19.8H // ...............*............................ + sub v26.8H, v22.8H, v19.8H // ..............*............................. + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v8.8H, v20.8H, v7.H[0] // ........................*................... + sqrdmulh v4.8H, v28.8H, v1.H[1] // ............*............................... + mls v24.8H, v17.8H, v7.H[0] // .............*.............................. + // gap // ............................................ 
+ // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sub v22.8H, v26.8H, v30.8H // ......................*..................... + add v26.8H, v26.8H, v30.8H // .........................*.................. + mls v27.8H, v13.8H, v7.H[0] // ....................*....................... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + mls v16.8H, v2.8H, v7.H[0] // ............................*............... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + add v12.8H, v9.8H, v24.8H // ...................................*........ + mls v15.8H, v4.8H, v7.H[0] // .....................*...................... + add v23.8H, v26.8H, v8.8H // .................................*.......... + sub v14.8H, v26.8H, v8.8H // ..................................*......... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sub v8.8H, v9.8H, v24.8H // .......................*.................... + add v21.8H, v22.8H, v27.8H // ..........................*................. + sub v13.8H, v22.8H, v27.8H // ...........................*................ 
+ // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + sub v11.8H, v12.8H, v16.8H // .......................................*.... + add v20.8H, v12.8H, v16.8H // ........................................*... + str q23, [x0, #256] // .........................................*.. + str q14, [x0, #320] // ......................................*..... + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + str q21, [x0, #384] // .....................................*...... + sub v27.8H, v8.8H, v15.8H // ..............................*............. + add v26.8H, v8.8H, v15.8H // .............................*.............. + str q13, [x0, #448] // ...............................*............ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + str q11, [x0, #64] // ..........................................*. + str q20, [x0], #(16) // ...........................................* + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + str q27, [x0, #176] // ....................................*....... + str q26, [x0, #112] // ................................*........... 
+ // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ + // gap // ............................................ - // original source code - // add v31.8H, v20.8H, v30.8H // .............*........................ - // add v6.8H, v28.8H, v14.8H // .*.................................... - // mls v2.8H, v3.8H, v7.H[0] // ......*............................... - // add v26.8H, v29.8H, v22.8H // ....*................................. - // sub v22.8H, v29.8H, v22.8H // ...*.................................. - // add v3.8H, v16.8H, v15.8H // *..................................... - // sub v28.8H, v28.8H, v14.8H // ..*................................... - // sub v19.8H, v16.8H, v15.8H // .....*................................ - // mul v27.8H, v26.8H, v1.H[2] // ............*......................... - // sqrdmulh v4.8H, v26.8H, v1.H[3] // ..........*........................... - // sqrdmulh v23.8H, v22.8H, v1.H[5] // ...........*.......................... - // mul v22.8H, v22.8H, v1.H[4] // .........*............................ - // mul v10.8H, v19.8H, v1.H[0] // ...............*...................... - // mls v27.8H, v4.8H, v7.H[0] // ...................*.................. - // mul v12.8H, v3.8H, v0.H[6] // .......*.............................. - // mls v22.8H, v23.8H, v7.H[0] // .................*.................... - // sqrdmulh v26.8H, v19.8H, v1.H[1] // ..............*....................... - // sqrdmulh v14.8H, v3.8H, v0.H[7] // ........*............................. - // sub v8.8H, v31.8H, v2.8H // ......................*............... - // add v2.8H, v31.8H, v2.8H // ..................*................... - // sub v25.8H, v6.8H, v27.8H // .................................*.... 
- // add v23.8H, v28.8H, v22.8H // .........................*............ - // sub v4.8H, v28.8H, v22.8H // ........................*............. - // add v3.8H, v6.8H, v27.8H // ..............................*....... - // mls v12.8H, v14.8H, v7.H[0] // ................*..................... - // str q25, [x0, #320] // .....................................* - // str q4, [x0, #448] // ................................*..... - // mls v10.8H, v26.8H, v7.H[0] // ....................*................. - // str q3, [x0, #256] // ..................................*... - // str q23, [x0, #384] // ...............................*...... - // add v11.8H, v2.8H, v12.8H // .....................*................ - // sub v24.8H, v2.8H, v12.8H // .......................*.............. - // add v12.8H, v8.8H, v10.8H // ...........................*.......... - // sub v27.8H, v8.8H, v10.8H // ..........................*........... - // str q11, [x0], #(16) // ............................*......... - // str q24, [x0, #48] // .............................*........ - // str q27, [x0, #176] // ...................................*.. - // str q12, [x0, #112] // ....................................*. + // -------------- new position ---------------> + // 0 25 + // |------------------------|------------------ + // mls v19.8H, v23.8H, v7.H[0] // .....*...................................... + // ldr q24, [x0, #0] // ..*......................................... + // mul v27.8H, v11.8H, v0.H[2] // ..........*................................. + // sqrdmulh v14.8H, v11.8H, v0.H[3] // ....*....................................... + // add v3.8H, v13.8H, v5.8H // *........................................... + // sub v29.8H, v13.8H, v5.8H // .......*.................................... + // sub v22.8H, v28.8H, v12.8H // .*.......................................... + // add v8.8H, v28.8H, v12.8H // ...*........................................ 
+ // mls v30.8H, v10.8H, v7.H[0] // ......*..................................... + // sqrdmulh v18.8H, v22.8H, v1.H[5] // .............*.............................. + // mul v12.8H, v22.8H, v1.H[4] // ...........*................................ + // mul v4.8H, v29.8H, v1.H[0] // ...............*............................ + // sqrdmulh v9.8H, v29.8H, v1.H[1] // ...................*........................ + // mls v27.8H, v14.8H, v7.H[0] // ....................*....................... + // sub v10.8H, v24.8H, v19.8H // .................*.......................... + // add v26.8H, v24.8H, v19.8H // ................*........................... + // mul v28.8H, v8.8H, v1.H[2] // .........*.................................. + // sqrdmulh v23.8H, v8.8H, v1.H[3] // ........*................................... + // mul v19.8H, v3.8H, v0.H[6] // ............*............................... + // sqrdmulh v29.8H, v3.8H, v0.H[7] // ..............*............................. + // mls v12.8H, v18.8H, v7.H[0] // .......................*.................... + // mls v4.8H, v9.8H, v7.H[0] // ..........................*................. + // sub v3.8H, v10.8H, v30.8H // .....................*...................... + // sub v11.8H, v26.8H, v27.8H // .............................*.............. + // mls v28.8H, v23.8H, v7.H[0] // ..................*......................... + // add v8.8H, v10.8H, v30.8H // ......................*..................... + // add v30.8H, v3.8H, v12.8H // ..............................*............. + // sub v9.8H, v3.8H, v12.8H // ...............................*............ + // mls v19.8H, v29.8H, v7.H[0] // ........................*................... + // add v12.8H, v11.8H, v4.8H // ......................................*..... + // sub v14.8H, v11.8H, v4.8H // .....................................*...... + // str q9, [x0, #448] // .......................................*.... 
+ // str q12, [x0, #128] // ...........................................* + // add v6.8H, v8.8H, v28.8H // ...........................*................ + // sub v28.8H, v8.8H, v28.8H // ............................*............... + // add v23.8H, v26.8H, v27.8H // .........................*.................. + // str q14, [x0, #192] // ..........................................*. + // str q30, [x0, #384] // ....................................*....... + // str q28, [x0, #320] // ...................................*........ + // sub v26.8H, v23.8H, v19.8H // ................................*........... + // add v2.8H, v23.8H, v19.8H // .................................*.......... + // str q6, [x0, #256] // ..................................*......... + // str q26, [x0, #64] // ........................................*... + // str q2, [x0], #(16) // .........................................*.. restore inp, STACK0 mov count, #8 .p2align 2 - ldr q4, [x1, #48] // .*.......................................... - ldr q25, [x1, #32] // ..*......................................... - ldr q10, [x3], #16 // *........................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - ldr q27, [x4, #16] // ................................*........... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - ldr q11, [x4, #64] // .............................*.............. 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - ldr q22, [x1, #16] // ........*................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mul v13.8H, v25.8H, v10.H[0] // ....*....................................... - sqrdmulh v20.8H, v25.8H, v10.H[1] // .....*...................................... - ldr q23, [x1, #0] // ...*........................................ - sqrdmulh v21.8H, v4.8H, v10.H[1] // ......*..................................... - mul v18.8H, v4.8H, v10.H[0] // .......*.................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - ldr q12, [x4], #(6*16) // ...............*............................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v13.8H, v20.8H, v7.H[0] // .........*.................................. - mls v18.8H, v21.8H, v7.H[0] // ..........*................................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sub v15.8H, v22.8H, v18.8H // ...........*................................ - add v26.8H, v22.8H, v18.8H // ............*............................... 
- sub v0.8H, v23.8H, v13.8H // .............*.............................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - add v17.8H, v23.8H, v13.8H // ..............*............................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mul v23.8H, v15.8H, v10.H[4] // ................*........................... - sqrdmulh v3.8H, v15.8H, v10.H[5] // .................*.......................... - mul v22.8H, v26.8H, v10.H[2] // ...................*........................ - sqrdmulh v4.8H, v26.8H, v10.H[3] // ..................*......................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v22.8H, v4.8H, v7.H[0] // .....................*...................... - mls v23.8H, v3.8H, v7.H[0] // ....................*....................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - add v10.8H, v0.8H, v23.8H // .......................*.................... - sub v23.8H, v0.8H, v23.8H // ........................*................... - add v26.8H, v17.8H, v22.8H // .........................*.................. 
- sub v0.8H, v17.8H, v22.8H // ..........................*................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn2 v13.4S, v26.4S, v0.4S // ..............................*............. - trn1 v3.4S, v26.4S, v0.4S // ...............................*............ - trn2 v25.4S, v10.4S, v23.4S // ............................*............... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn1 v0.4S, v10.4S, v23.4S // ...........................*................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn2 v22.2D, v13.2D, v25.2D // .................................*.......... - trn1 v19.2D, v13.2D, v25.2D // ...................................*........ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqrdmulh v31.8H, v22.8H, v27.8H // ....................................*....... - mul v26.8H, v22.8H, v12.8H // .....................................*...... - ldr q22, [x4, #-64] // .........................................*.. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn2 v13.2D, v3.2D, v0.2D // ..................................*......... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn1 v10.2D, v3.2D, v0.2D // ..........................................*. - ldr q3, [x4, #-16] // ......................*..................... - ldr q0, [x4, #-48] // ......................................*..... 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v26.8H, v31.8H, v7.H[0] // ...........................................* - sqrdmulh v4.8H, v13.8H, v27.8H // ........................................*... - mul v12.8H, v13.8H, v12.8H // .......................................*.... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ + // Instructions: 41 + // Expected cycles: 28 + // Expected IPC: 1.46 + // + // Cycle bound: 28.0 + // IPC bound: 1.46 + // + // Wall time: 0.58s + // User time: 0.58s + // + // ---------- original position -----------> + // 0 25 + // |------------------------|--------------- + ldr q28, [x1, #32] // ....*.................................... + ldr q4, [x1, #48] // ..*...................................... + // gap // ......................................... + // gap // ......................................... + ldr q9, [x3], #16 // .*....................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q15, [x4, #32] // ......................................*.. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ ldr q24, [x1, #16] // .....*................................... + ldr q10, [x1, #0] // ...*..................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q20, [x4, #48] // ............................*............ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v21.8H, v28.8H, v9.H[0] // ..........*.............................. + sqrdmulh v28.8H, v28.8H, v9.H[1] // ...........*............................. + sqrdmulh v18.8H, v4.8H, v9.H[1] // .......*................................. + mul v4.8H, v4.8H, v9.H[0] // ........*................................ + ldr q5, [x4], #(6*16) // *........................................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q19, [x4, #-80] // ......*.................................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v21.8H, v28.8H, v7.H[0] // .............*........................... + mls v4.8H, v18.8H, v7.H[0] // ............*............................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v28.8H, v10.8H, v21.8H // ....................*.................... + add v30.8H, v24.8H, v4.8H // ...............*......................... + sub v4.8H, v24.8H, v4.8H // ..............*.......................... 
+ add v18.8H, v10.8H, v21.8H // .....................*................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sqrdmulh v21.8H, v30.8H, v9.H[3] // ..................*...................... + mul v24.8H, v30.8H, v9.H[2] // ...................*..................... + mul v10.8H, v4.8H, v9.H[4] // .................*....................... + sqrdmulh v4.8H, v4.8H, v9.H[5] // ................*........................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v10.8H, v4.8H, v7.H[0] // ......................*.................. + mls v24.8H, v21.8H, v7.H[0] // .......................*................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v4.8H, v28.8H, v10.8H // .........................*............... + add v28.8H, v28.8H, v10.8H // ..........................*.............. + sub v21.8H, v18.8H, v24.8H // ...........................*............. + add v18.8H, v18.8H, v24.8H // ........................*................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v12.4S, v28.4S, v4.4S // .............................*........... + trn1 v28.4S, v28.4S, v4.4S // ................................*........ + trn1 v4.4S, v18.4S, v21.4S // ...............................*......... + trn2 v22.4S, v18.4S, v21.4S // ..............................*.......... + ldr q18, [x4, #-32] // .........*............................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v27.2D, v4.2D, v28.2D // .....................................*... + trn2 v21.2D, v22.2D, v12.2D // .................................*....... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v8.8H, v27.8H, v5.8H // .......................................*. + mul v30.8H, v21.8H, v5.8H // ...................................*..... + trn1 v6.2D, v4.2D, v28.2D // ..................................*...... + sqrdmulh v28.8H, v21.8H, v19.8H // ....................................*.... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v30.8H, v28.8H, v7.H[0] // ........................................* + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... - // original source code - // ldr q2, [x3], #16 // ..*......................................... - // ldr q30, [x1, #48] // *........................................... - // ldr q23, [x1, #32] // .*.......................................... - // ldr q15, [x1, #0] // ........*................................... - // mul v19.8H, v23.8H, v2.H[0] // ......*..................................... - // sqrdmulh v6.8H, v23.8H, v2.H[1] // .......*.................................... - // sqrdmulh v23.8H, v30.8H, v2.H[1] // .........*.................................. - // mul v26.8H, v30.8H, v2.H[0] // ..........*................................. - // ldr q14, [x1, #16] // .....*...................................... - // mls v19.8H, v6.8H, v7.H[0] // ............*............................... - // mls v26.8H, v23.8H, v7.H[0] // .............*.............................. - // sub v29.8H, v14.8H, v26.8H // ..............*............................. - // add v16.8H, v14.8H, v26.8H // ...............*............................ - // sub v9.8H, v15.8H, v19.8H // ................*........................... - // add v13.8H, v15.8H, v19.8H // .................*.......................... - // ldr q6, [x4], #(6*16) // ...........*................................ - // mul v12.8H, v29.8H, v2.H[4] // ..................*......................... - // sqrdmulh v15.8H, v29.8H, v2.H[5] // ...................*........................ - // sqrdmulh v24.8H, v16.8H, v2.H[3] // .....................*...................... - // mul v14.8H, v16.8H, v2.H[2] // ....................*....................... - // mls v12.8H, v15.8H, v7.H[0] // .......................*.................... - // mls v14.8H, v24.8H, v7.H[0] // ......................*..................... 
- // ldr q3, [x4, #-16] // .......................................*.... - // add v11.8H, v9.8H, v12.8H // ........................*................... - // sub v12.8H, v9.8H, v12.8H // .........................*.................. - // add v1.8H, v13.8H, v14.8H // ..........................*................. - // sub v14.8H, v13.8H, v14.8H // ...........................*................ - // trn1 v28.4S, v11.4S, v12.4S // ...............................*............ - // trn2 v20.4S, v11.4S, v12.4S // ..............................*............. - // ldr q11, [x4, #-32] // ....*....................................... - // trn2 v19.4S, v1.4S, v14.4S // ............................*............... - // trn1 v5.4S, v1.4S, v14.4S // .............................*.............. - // ldr q14, [x4, #-80] // ...*........................................ - // trn2 v0.2D, v19.2D, v20.2D // ................................*........... - // trn2 v22.2D, v5.2D, v28.2D // .....................................*...... - // trn1 v19.2D, v19.2D, v20.2D // .................................*.......... - // sqrdmulh v21.8H, v0.8H, v14.8H // ..................................*......... - // mul v26.8H, v0.8H, v6.8H // ...................................*........ - // ldr q0, [x4, #-48] // ........................................*... - // mul v12.8H, v22.8H, v6.8H // ...........................................* - // sqrdmulh v4.8H, v22.8H, v14.8H // ..........................................*. - // ldr q22, [x4, #-64] // ....................................*....... - // trn1 v10.2D, v5.2D, v28.2D // ......................................*..... - // mls v26.8H, v21.8H, v7.H[0] // .........................................*.. + // ------------- new position -------------> + // 0 25 + // |------------------------|--------------- + // ldr q23, [x4], #(6*16) // ...........*............................. + // ldr q3, [x3], #16 // ..*...................................... 
+ // ldr q11, [x1, #48] // .*....................................... + // ldr q22, [x1, #0] // .....*................................... + // ldr q13, [x1, #32] // *........................................ + // ldr q31, [x1, #16] // ....*.................................... + // ldr q19, [x4, #-80] // ............*............................ + // sqrdmulh v24.8H, v11.8H, v3.H[1] // .........*............................... + // mul v5.8H, v11.8H, v3.H[0] // ..........*.............................. + // ldr q18, [x4, #-32] // .................................*....... + // mul v21.8H, v13.8H, v3.H[0] // .......*................................. + // sqrdmulh v28.8H, v13.8H, v3.H[1] // ........*................................ + // mls v5.8H, v24.8H, v7.H[0] // ..............*.......................... + // mls v21.8H, v28.8H, v7.H[0] // .............*........................... + // sub v16.8H, v31.8H, v5.8H // .................*....................... + // add v28.8H, v31.8H, v5.8H // ................*........................ + // sqrdmulh v13.8H, v16.8H, v3.H[5] // ......................*.................. + // mul v20.8H, v16.8H, v3.H[4] // .....................*................... + // sqrdmulh v25.8H, v28.8H, v3.H[3] // ...................*..................... + // mul v31.8H, v28.8H, v3.H[2] // ....................*.................... + // sub v4.8H, v22.8H, v21.8H // ...............*......................... + // add v21.8H, v22.8H, v21.8H // ..................*...................... + // mls v20.8H, v13.8H, v7.H[0] // .......................*................. + // mls v31.8H, v25.8H, v7.H[0] // ........................*................ + // add v0.8H, v21.8H, v31.8H // ............................*............ + // sub v29.8H, v4.8H, v20.8H // .........................*............... + // add v25.8H, v4.8H, v20.8H // ..........................*.............. + // sub v2.8H, v21.8H, v31.8H // ...........................*............. 
+ // ldr q20, [x4, #-48] // ......*.................................. + // trn2 v12.4S, v25.4S, v29.4S // .............................*........... + // trn2 v22.4S, v0.4S, v2.4S // ................................*........ + // trn1 v13.4S, v0.4S, v2.4S // ...............................*......... + // trn1 v31.4S, v25.4S, v29.4S // ..............................*.......... + // trn2 v0.2D, v22.2D, v12.2D // ...................................*..... + // trn1 v6.2D, v13.2D, v31.2D // ......................................*.. + // mul v30.8H, v0.8H, v23.8H // .....................................*... + // sqrdmulh v9.8H, v0.8H, v19.8H // .......................................*. + // trn2 v27.2D, v13.2D, v31.2D // ..................................*...... + // ldr q15, [x4, #-64] // ...*..................................... + // mul v8.8H, v27.8H, v23.8H // ....................................*.... + // mls v30.8H, v9.8H, v7.H[0] // ........................................* sub count, count, #1 layer4567_start: - mls v12.8H, v4.8H, v7.H[0] // .........................................*......................................... - ldr q2, [x3], #16 // ....e.............................................................................. - ldr q30, [x1, #112] // ...e............................................................................... + // Instructions: 83 + // Expected cycles: 28 + // Expected IPC: 2.96 + // + // Cycle bound: 28.0 + // IPC bound: 2.96 + // + // Wall time: 42.21s + // User time: 42.21s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + ldr q23, [x4], #(6*16) // .................................e................................................. + trn1 v12.2D, v22.2D, v12.2D // ................................*.................................................. 
+ sqrdmulh v17.8H, v27.8H, v19.8H // .......................................*........................................... + ldr q3, [x3], #16 // ....e.............................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + ldr q11, [x1, #112] // ...e............................................................................... + ldr q22, [x1, #64] // e.................................................................................. + ldr q9, [x4, #-112] // ......................................*............................................ + ldr q13, [x1, #96] // ..e................................................................................ // gap // ................................................................................... - ldr q23, [x1, #96] // ..e................................................................................ - ldr q15, [x1, #64] // e.................................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + add v16.8H, v12.8H, v30.8H // ................................................*.................................. + sub v1.8H, v12.8H, v30.8H // ...............................................*................................... + ldr q31, [x1, #80] // .e................................................................................. 
+ ldr q19, [x4, #-80] // ..................................e................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - add v27.8H, v19.8H, v26.8H // ................................................*.................................. // gap // ................................................................................... + mls v8.8H, v17.8H, v7.H[0] // .........................................*......................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sub v4.8H, v10.8H, v12.8H // ..........................................*........................................ // gap // ................................................................................... + sqrdmulh v24.8H, v11.8H, v3.H[1] // ..........e........................................................................ + sqrdmulh v14.8H, v16.8H, v20.8H // .................................................*................................. + mul v27.8H, v16.8H, v15.8H // ..................................................*................................ + mul v5.8H, v11.8H, v3.H[0] // ...........e....................................................................... 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + sqrdmulh v16.8H, v1.8H, v9.8H // ......................................................*............................ + mul v15.8H, v1.8H, v18.8H // .......................................................*........................... + ldr q18, [x4, #-32] // .....................................e............................................. + mul v21.8H, v13.8H, v3.H[0] // ......e............................................................................ // gap // ................................................................................... - sub v8.8H, v19.8H, v26.8H // ...............................................*................................... - mul v19.8H, v23.8H, v2.H[0] // .....e............................................................................. - sqrdmulh v6.8H, v23.8H, v2.H[1] // ......e............................................................................ - sqrdmulh v23.8H, v30.8H, v2.H[1] // ...........e....................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + sqrdmulh v28.8H, v13.8H, v3.H[1] // .....e............................................................................. + sub v0.8H, v6.8H, v8.8H // ..........................................*........................................ 
+ add v10.8H, v6.8H, v8.8H // ...........................................*....................................... // gap // ................................................................................... - mul v26.8H, v30.8H, v2.H[0] // ..........e........................................................................ - sqrdmulh v3.8H, v8.8H, v3.8H // .......................................................*........................... - mul v28.8H, v27.8H, v22.8H // .................................................*................................. - sqrdmulh v0.8H, v27.8H, v0.8H // ..................................................*................................ - mul v22.8H, v8.8H, v11.8H // ......................................................*............................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v5.8H, v24.8H, v7.H[0] // ............e...................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v27.8H, v14.8H, v7.H[0] // ...................................................*............................... 
// gap // ................................................................................... - ldr q14, [x1, #80] // .e................................................................................. - mls v19.8H, v6.8H, v7.H[0] // .......e........................................................................... - add v6.8H, v10.8H, v12.8H // ...........................................*....................................... - mls v26.8H, v23.8H, v7.H[0] // ............e...................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v22.8H, v3.8H, v7.H[0] // ........................................................*.......................... - mls v28.8H, v0.8H, v7.H[0] // ...................................................*............................... // gap // ................................................................................... + mls v15.8H, v16.8H, v7.H[0] // ........................................................*.......................... + mls v21.8H, v28.8H, v7.H[0] // .......e........................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... @@ -1238,485 +1276,515 @@ layer4567_start: // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... + sub v1.8H, v10.8H, v27.8H // ....................................................*.............................. + add v11.8H, v10.8H, v27.8H // .....................................................*............................. + sub v16.8H, v31.8H, v5.8H // .............e..................................................................... + add v28.8H, v31.8H, v5.8H // ..............e.................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + sub v6.8H, v0.8H, v15.8H // .........................................................*......................... + add v26.8H, v0.8H, v15.8H // ..........................................................*........................ // gap // ................................................................................... // gap // ................................................................................... - sub v29.8H, v14.8H, v26.8H // .............e..................................................................... - add v16.8H, v14.8H, v26.8H // ..............e.................................................................... - sub v9.8H, v15.8H, v19.8H // ........e.......................................................................... - add v13.8H, v15.8H, v19.8H // .........e......................................................................... // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - sub v17.8H, v6.8H, v28.8H // ....................................................*.............................. - add v10.8H, v6.8H, v28.8H // .....................................................*............................. - ldr q6, [x4], #(6*16) // .................................e................................................. - add v0.8H, v4.8H, v22.8H // ..........................................................*........................ - sub v26.8H, v4.8H, v22.8H // .........................................................*......................... + sqrdmulh v13.8H, v16.8H, v3.H[5] // ....................e.............................................................. + mul v20.8H, v16.8H, v3.H[4] // .....................e............................................................. + sqrdmulh v25.8H, v28.8H, v3.H[3] // ...............e................................................................... + mul v31.8H, v28.8H, v3.H[2] // ................e.................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mul v12.8H, v29.8H, v2.H[4] // ....................e.............................................................. - sqrdmulh v15.8H, v29.8H, v2.H[5] // .....................e............................................................. - sqrdmulh v24.8H, v16.8H, v2.H[3] // ................e.................................................................. 
// gap // ................................................................................... + sqdmulh v27.8H, v1.8H, v7.H[1] // ..............................................................*.................... + sqdmulh v16.8H, v26.8H, v7.H[1] // .................................................................*................. + sqdmulh v15.8H, v6.8H, v7.H[1] // ....................................................................*.............. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mul v14.8H, v16.8H, v2.H[2] // ...............e................................................................... - sqdmulh v4.8H, v17.8H, v7.H[1] // ..............................................................*.................... - sqdmulh v22.8H, v0.8H, v7.H[1] // .................................................................*................. - sqdmulh v23.8H, v26.8H, v7.H[1] // ....................................................................*.............. // gap // ................................................................................... + sqdmulh v14.8H, v11.8H, v7.H[1] // ...........................................................*....................... + sub v4.8H, v22.8H, v21.8H // ........e.......................................................................... + add v21.8H, v22.8H, v21.8H // .........e......................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
- sqdmulh v3.8H, v10.8H, v7.H[1] // ...........................................................*....................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v20.8H, v13.8H, v7.H[0] // ......................e............................................................ + mls v31.8H, v25.8H, v7.H[0] // .................e................................................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v12.8H, v15.8H, v7.H[0] // ......................e............................................................ - mls v14.8H, v24.8H, v7.H[0] // .................e................................................................. // gap // ................................................................................... + srshr v15.8H, v15.8H, #11 // .....................................................................*............. + srshr v27.8H, v27.8H, #11 // ...............................................................*................... + srshr v14.8H, v14.8H, #11 // ............................................................*...................... // gap // ................................................................................... // gap // ................................................................................... 
// gap // ................................................................................... // gap // ................................................................................... + srshr v16.8H, v16.8H, #11 // ..................................................................*................ // gap // ................................................................................... - srshr v27.8H, v3.8H, #11 // ............................................................*...................... - srshr v23.8H, v23.8H, #11 // .....................................................................*............. - srshr v22.8H, v22.8H, #11 // ..................................................................*................ - ldr q3, [x4, #-16] // ......................................e............................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - srshr v4.8H, v4.8H, #11 // ...............................................................*................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + add v0.8H, v21.8H, v31.8H // ...................e............................................................... + sub v29.8H, v4.8H, v20.8H // .......................e........................................................... + add v25.8H, v4.8H, v20.8H // ........................e.......................................................... 
+ sub v2.8H, v21.8H, v31.8H // ..................e................................................................ + ldr q20, [x4, #-48] // ....................................e.............................................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + mls v1.8H, v27.8H, v7.H[0] // ................................................................*.................. + mls v11.8H, v14.8H, v7.H[0] // .............................................................*..................... + mls v6.8H, v15.8H, v7.H[0] // ......................................................................*............ + mls v26.8H, v16.8H, v7.H[0] // ...................................................................*............... // gap // ................................................................................... - add v11.8H, v9.8H, v12.8H // ........................e.......................................................... - sub v12.8H, v9.8H, v12.8H // .......................e........................................................... - add v1.8H, v13.8H, v14.8H // ...................e............................................................... - sub v14.8H, v13.8H, v14.8H // ..................e................................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + trn2 v12.4S, v25.4S, v29.4S // ............................e...................................................... 
+ trn2 v22.4S, v0.4S, v2.4S // ..........................e........................................................ + trn1 v13.4S, v0.4S, v2.4S // .........................e......................................................... // gap // ................................................................................... - mls v26.8H, v23.8H, v7.H[0] // ......................................................................*............ - mls v0.8H, v22.8H, v7.H[0] // ...................................................................*............... - mls v17.8H, v4.8H, v7.H[0] // ................................................................*.................. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v10.8H, v27.8H, v7.H[0] // .............................................................*..................... - trn1 v28.4S, v11.4S, v12.4S // ...........................e....................................................... - trn2 v20.4S, v11.4S, v12.4S // ............................e...................................................... - ldr q11, [x4, #-32] // .....................................e............................................. - trn2 v19.4S, v1.4S, v14.4S // ..........................e........................................................ - trn1 v5.4S, v1.4S, v14.4S // .........................e......................................................... + trn1 v31.4S, v25.4S, v29.4S // ...........................e....................................................... // gap // ................................................................................... 
// gap // ................................................................................... - ldr q14, [x4, #-80] // ..................................e................................................ // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + trn1 v17.4S, v26.4S, v6.4S // .........................................................................*......... + trn2 v15.4S, v26.4S, v6.4S // ..........................................................................*........ + trn2 v0.2D, v22.2D, v12.2D // ..............................e.................................................... + trn1 v27.4S, v11.4S, v1.4S // .......................................................................*........... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - trn2 v23.4S, v0.4S, v26.4S // ..........................................................................*........ - trn1 v13.4S, v0.4S, v26.4S // .........................................................................*......... - trn2 v0.2D, v19.2D, v20.2D // ..............................e.................................................... - trn2 v15.4S, v10.4S, v17.4S // ........................................................................*.......... // gap // ................................................................................... 
+ trn2 v16.4S, v11.4S, v1.4S // ........................................................................*.......... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - trn2 v22.2D, v5.2D, v28.2D // .............................e..................................................... - trn1 v19.2D, v19.2D, v20.2D // ................................e.................................................. - trn1 v20.4S, v10.4S, v17.4S // .......................................................................*........... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... + trn1 v6.2D, v13.2D, v31.2D // ...............................e................................................... + trn1 v4.2D, v27.2D, v17.2D // .............................................................................*..... + trn2 v14.2D, v27.2D, v17.2D // ...........................................................................*....... + mul v30.8H, v0.8H, v23.8H // .............................................e..................................... + sqrdmulh v9.8H, v0.8H, v19.8H // ............................................e...................................... // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v21.8H, v0.8H, v14.8H // .............................................e..................................... 
- mul v26.8H, v0.8H, v6.8H // ............................................e...................................... - trn2 v25.2D, v15.2D, v23.2D // ............................................................................*...... - trn1 v23.2D, v15.2D, v23.2D // ..............................................................................*.... - ldr q0, [x4, #-48] // ....................................e.............................................. // gap // ................................................................................... // gap // ................................................................................... + trn2 v27.2D, v13.2D, v31.2D // .............................e..................................................... + trn2 v3.2D, v16.2D, v15.2D // ............................................................................*...... + trn1 v16.2D, v16.2D, v15.2D // ..............................................................................*.... // gap // ................................................................................... - mul v12.8H, v22.8H, v6.8H // .......................................e........................................... - sqrdmulh v4.8H, v22.8H, v14.8H // ........................................e.......................................... - ldr q22, [x4, #-64] // ...................................e............................................... - trn2 v27.2D, v20.2D, v13.2D // ...........................................................................*....... - trn1 v9.2D, v20.2D, v13.2D // .............................................................................*..... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... 
- trn1 v10.2D, v5.2D, v28.2D // ...............................e................................................... - str q25, [x1, #48] // ..................................................................................* - str q23, [x1, #16] // ................................................................................*.. + ldr q15, [x4, #-64] // ...................................e............................................... + str q4, [x1], #64 // ...............................................................................*... + str q14, [x1, #-32] // .................................................................................*. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - mls v26.8H, v21.8H, v7.H[0] // ..............................................e.................................... - str q27, [x1, #32] // .................................................................................*. - str q9, [x1], #64 // ...............................................................................*... // gap // ................................................................................... + mul v8.8H, v27.8H, v23.8H // ........................................e.......................................... + mls v30.8H, v9.8H, v7.H[0] // ..............................................e.................................... 
+ str q3, [x1, #-16] // ..................................................................................* + str q16, [x1, #-48] // ................................................................................*.. // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... // gap // ................................................................................... - // original source code - // ldr q8, [x1, #(16*0)] // ...e..............................................................................|...e.............................................................................. - // ldr q9, [x1, #(16*1)] // ...............e..................................................................|...............e.................................................................. - // ldr q10, [x1, #(16*2)] // ..e...............................................................................|..e............................................................................... - // ldr q11, [x1, #(16*3)] // .e................................................................................|.e................................................................................ - // ldr q0, [x3], #16 // e.................................................................................|e................................................................................. - // mul v24.8h, v10.8h, v0.h[0] // .......e..........................................................................|.......e.......................................................................... 
- // sqrdmulh v10.8h, v10.8h, v0.h[1] // ........e.........................................................................|........e......................................................................... - // mls v24.8h, v10.8h, v7.h[0] // ................e.................................................................|................e................................................................. - // sub v10.8h, v8.8h, v24.8h // .......................e..........................................................|.......................e.......................................................... - // add v8.8h, v8.8h, v24.8h // ........................e.........................................................|........................e......................................................... - // mul v24.8h, v11.8h, v0.h[0] // ..........e.......................................................................|..........e....................................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[1] // .........e........................................................................|.........e........................................................................ - // mls v24.8h, v11.8h, v7.h[0] // ..................e...............................................................|..................e............................................................... - // sub v11.8h, v9.8h, v24.8h // .....................e............................................................|.....................e............................................................ - // add v9.8h, v9.8h, v24.8h // ......................e...........................................................|......................e........................................................... 
- // mul v24.8h, v9.8h, v0.h[2] // .................................e................................................|.................................e................................................ - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ................................e.................................................|................................e................................................. - // mls v24.8h, v9.8h, v7.h[0] // .......................................e..........................................|.......................................e.......................................... - // sub v9.8h, v8.8h, v24.8h // ................................................e.................................|................................................e................................. - // add v8.8h, v8.8h, v24.8h // ...............................................e..................................|...............................................e.................................. - // mul v24.8h, v11.8h, v0.h[4] // ..............................e...................................................|..............................e................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ...............................e..................................................|...............................e.................................................. - // mls v24.8h, v11.8h, v7.h[0] // ......................................e...........................................|......................................e........................................... - // sub v11.8h, v10.8h, v24.8h // ..............................................e...................................|..............................................e................................... 
- // add v10.8h, v10.8h, v24.8h // .............................................e....................................|.............................................e.................................... - // trn1 v25.4s, v8.4s, v9.4s // .........................................................e........................|.........................................................e........................ - // trn2 v26.4s, v8.4s, v9.4s // ........................................................e.........................|........................................................e......................... - // trn1 v27.4s, v10.4s, v11.4s // .....................................................e............................|.....................................................e............................ - // trn2 v28.4s, v10.4s, v11.4s // ......................................................e...........................|......................................................e........................... - // trn2 v10.2d, v25.2d, v27.2d // ...............................................................e..................|...............................................................e.................. - // trn2 v11.2d, v26.2d, v28.2d // .............................................................e....................|.............................................................e.................... - // trn1 v8.2d, v25.2d, v27.2d // ............................................................................e.....|............................................................................e..... - // trn1 v9.2d, v26.2d, v28.2d // ................................................................e.................|................................................................e................. 
- // ldr q0, [x4], #(6*16) // ...........................e......................................................|...........................e...................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ..........................................................e.......................|..........................................................e....................... - // ldr q1, [x4, #(-6*16 + 2*16)] // .........................................................................e........|.........................................................................e........ - // ldr q5, [x4, #(-6*16 + 3*16)] // ......................................................................e...........|......................................................................e........... - // ldr q2, [x4, #(-6*16 + 4*16)] // .......................................................e..........................|.......................................................e.......................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ...........................................e......................................|...........................................e...................................... - // mul v24.8h, v10.8h, v0.8h // .......................................................................e..........|.......................................................................e.......... - // sqrdmulh v10.8h, v10.8h, v4.8h // ........................................................................e.........|........................................................................e......... - // mls v24.8h, v10.8h, v7.h[0] // ..................................................................................*.................................................................................. 
- // sub v10.8h, v8.8h, v24.8h // .....*............................................................................|.....*............................................................................ - // add v8.8h, v8.8h, v24.8h // .................*................................................................|.................*................................................................ - // mul v24.8h, v11.8h, v0.8h // ...................................................................e..............|...................................................................e.............. - // sqrdmulh v11.8h, v11.8h, v4.8h // ..................................................................e...............|..................................................................e............... - // mls v24.8h, v11.8h, v7.h[0] // ...............................................................................e..|...............................................................................e.. - // sub v11.8h, v9.8h, v24.8h // ......*...........................................................................|......*........................................................................... - // add v9.8h, v9.8h, v24.8h // ....*.............................................................................|....*............................................................................. - // mul v24.8h, v9.8h, v1.8h // ............*.....................................................................|............*..................................................................... - // sqrdmulh v9.8h, v9.8h, v5.8h // .............*....................................................................|.............*.................................................................... 
- // mls v24.8h, v9.8h, v7.h[0] // ....................*.............................................................|....................*............................................................. - // sub v9.8h, v8.8h, v24.8h // .........................*........................................................|.........................*........................................................ - // add v8.8h, v8.8h, v24.8h // ..........................*.......................................................|..........................*....................................................... - // mul v24.8h, v11.8h, v2.8h // ..............*...................................................................|..............*................................................................... - // sqrdmulh v11.8h, v11.8h, v6.8h // ...........*......................................................................|...........*...................................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...................*..............................................................|...................*.............................................................. - // sub v11.8h, v10.8h, v24.8h // .............................*....................................................|.............................*.................................................... - // add v10.8h, v10.8h, v24.8h // ............................*.....................................................|............................*..................................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // .....................................*............................................|.....................................*............................................ 
- // srshr v25.8h, v25.8h, #11 // ........................................*.........................................|........................................*......................................... - // mls v8.8h, v25.8h, v7.h[0] // ....................................................*.............................|....................................................*............................. - // sqdmulh v25.8h, v9.8h, v7.h[1] // ..................................*...............................................|..................................*............................................... - // srshr v25.8h, v25.8h, #11 // ............................................*.....................................|............................................*..................................... - // mls v9.8h, v25.8h, v7.h[0] // ...................................................*..............................|...................................................*.............................. - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................*..............................................|...................................*.............................................. - // srshr v25.8h, v25.8h, #11 // ..........................................*.......................................|..........................................*....................................... - // mls v10.8h, v25.8h, v7.h[0] // ..................................................*...............................|..................................................*............................... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ....................................*.............................................|....................................*............................................. 
- // srshr v25.8h, v25.8h, #11 // .........................................*........................................|.........................................*........................................ - // mls v11.8h, v25.8h, v7.h[0] // .................................................*................................|.................................................*................................ - // trn1 v25.4s, v8.4s, v9.4s // .................................................................*................|.................................................................*................ - // trn2 v26.4s, v8.4s, v9.4s // ..............................................................*...................|..............................................................*................... - // trn1 v27.4s, v10.4s, v11.4s // ............................................................*.....................|............................................................*..................... - // trn2 v28.4s, v10.4s, v11.4s // ...........................................................*......................|...........................................................*...................... - // trn2 v10.2d, v25.2d, v27.2d // ..........................................................................*.......|..........................................................................*....... - // trn2 v11.2d, v26.2d, v28.2d // ....................................................................*.............|....................................................................*............. - // trn1 v8.2d, v25.2d, v27.2d // ...........................................................................*......|...........................................................................*...... 
- // trn1 v9.2d, v26.2d, v28.2d // .....................................................................*............|.....................................................................*............ - // str q8, [x1], #64 // .................................................................................*|.................................................................................* - // str q9, [x1, #(-(64) + 16*1)] // ..............................................................................*...|..............................................................................*... - // str q10, [x1, #(-(64) + 16*2)] // ................................................................................*.|................................................................................*. - // str q11, [x1, #(-(64) + 16*3)] // .............................................................................*....|.............................................................................*.... + // --------------------------------------------------------------------------- new position ----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // .....e.............................................................................'....~............................................................................. + // ldr q9, [x1, #(16*1)] // ..........e........................................................................'.........~........................................................................ + // ldr q10, [x1, #(16*2)] // .......e...........................................................................'......~........................................................................... 
+ // ldr q11, [x1, #(16*3)] // ....e..............................................................................'...~.............................................................................. + // ldr q0, [x3], #16 // ...e...............................................................................'..~............................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .....................e.............................................................'....................~............................................................. + // mul v24.8h, v10.8h, v0.h[0] // ....................e..............................................................'...................~.............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................e.......................................................'..........................~....................................................... + // sub v10.8h, v8.8h, v24.8h // ..........................................e........................................'.........................................~........................................ + // add v8.8h, v8.8h, v24.8h // ...........................................e.......................................'..........................................~....................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .............e.....................................................................'............~..................................................................... + // mul v24.8h, v11.8h, v0.h[0] // ................e..................................................................'...............~.................................................................. 
+ // mls v24.8h, v27.8h, v7.h[0] // ........................e..........................................................'.......................~.......................................................... + // sub v11.8h, v9.8h, v24.8h // ..............................e....................................................'.............................~.................................................... + // add v9.8h, v9.8h, v24.8h // ...............................e...................................................'..............................~................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ....................................e..............................................'...................................~.............................................. + // mul v24.8h, v9.8h, v0.h[2] // .....................................e.............................................'....................................~............................................. + // mls v24.8h, v27.8h, v7.h[0] // .............................................e.....................................'............................................~..................................... + // sub v9.8h, v8.8h, v24.8h // .....................................................e.............................'....................................................~............................. + // add v8.8h, v8.8h, v24.8h // ..................................................e................................'.................................................~................................ + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..................................e................................................'.................................~................................................ 
+ // mul v24.8h, v11.8h, v0.h[4] // ...................................e...............................................'..................................~............................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................e......................................'...........................................~...................................... + // sub v11.8h, v10.8h, v24.8h // ...................................................e...............................'..................................................~............................... + // add v10.8h, v10.8h, v24.8h // ....................................................e..............................'...................................................~.............................. + // trn1 v25.4s, v8.4s, v9.4s // .............................................................e.....................'............................................................~..................... + // trn2 v26.4s, v8.4s, v9.4s // ............................................................e......................'...........................................................~...................... + // trn1 v27.4s, v10.4s, v11.4s // ..............................................................e....................'.............................................................~.................... + // trn2 v28.4s, v10.4s, v11.4s // ...........................................................e.......................'..........................................................~....................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................................................e.........'........................................................................~......... 
+ // trn2 v11.2d, v26.2d, v28.2d // .................................................................e.................'................................................................~................. + // trn1 v8.2d, v25.2d, v27.2d // ....................................................................e..............'...................................................................~.............. + // trn1 v9.2d, v26.2d, v28.2d // .~.................................................................................'*................................................................................. + // ldr q0, [ x4], #(6*16) // e..................................................................................~.................................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ...........e.......................................................................'..........~....................................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ............................................................................e......'...........................................................................~...... + // ldr q5, [x4, #(-6*16 + 3*16)] // ......................................................e............................'.....................................................~............................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // ...................e...............................................................'..................~............................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ......~............................................................................'.....*............................................................................ 
+ // sqrdmulh v27.8h, v10.8h, v4.8h // ..~................................................................................'.*................................................................................ + // mul v24.8h, v10.8h, v0.8h // ...............................................................................e...'..............................................................................~... + // mls v24.8h, v27.8h, v7.h[0] // ............~......................................................................'...........*...................................................................... + // sub v10.8h, v8.8h, v24.8h // ......................~............................................................'.....................*............................................................ + // add v8.8h, v8.8h, v24.8h // .......................~...........................................................'......................*........................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ........................................................................e..........'.......................................................................~.......... + // mul v24.8h, v11.8h, v0.8h // .......................................................................e...........'......................................................................~........... + // mls v24.8h, v27.8h, v7.h[0] // ................................................................................e..'...............................................................................~.. + // sub v11.8h, v9.8h, v24.8h // .........~.........................................................................'........*......................................................................... 
+ // add v9.8h, v9.8h, v24.8h // ........~..........................................................................'.......*.......................................................................... + // sqrdmulh v27.8h, v9.8h, v5.8h // ..............~....................................................................'.............*.................................................................... + // mul v24.8h, v9.8h, v1.8h // ...............~...................................................................'..............*................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................~.........................................................'........................*......................................................... + // sub v9.8h, v8.8h, v24.8h // ............................~......................................................'...........................*...................................................... + // add v8.8h, v8.8h, v24.8h // .............................~.....................................................'............................*..................................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // .................~.................................................................'................*................................................................. + // mul v24.8h, v11.8h, v2.8h // ..................~................................................................'.................*................................................................ + // mls v24.8h, v27.8h, v7.h[0] // ..........................~........................................................'.........................*........................................................ 
+ // sub v11.8h, v10.8h, v24.8h // ................................~..................................................'...............................*.................................................. + // add v10.8h, v10.8h, v24.8h // .................................~.................................................'................................*................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // .........................................~.........................................'........................................*......................................... + // srshr v25.8h, v25.8h, #11 // ................................................~..................................'...............................................*.................................. + // mls v8.8h, v25.8h, v7.h[0] // ........................................................~..........................'.......................................................*.......................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................~............................................'.....................................*............................................ + // srshr v25.8h, v25.8h, #11 // ...............................................~...................................'..............................................*................................... + // mls v9.8h, v25.8h, v7.h[0] // .......................................................~...........................'......................................................*........................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .......................................~...........................................'......................................*........................................... 
+ // srshr v25.8h, v25.8h, #11 // .................................................~.................................'................................................*................................. + // mls v10.8h, v25.8h, v7.h[0] // ..........................................................~........................'.........................................................*........................ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ........................................~..........................................'.......................................*.......................................... + // srshr v25.8h, v25.8h, #11 // ..............................................~....................................'.............................................*.................................... + // mls v11.8h, v25.8h, v7.h[0] // .........................................................~.........................'........................................................*......................... + // trn1 v25.4s, v8.4s, v9.4s // ..................................................................~................'.................................................................*................ + // trn2 v26.4s, v8.4s, v9.4s // ...................................................................~...............'..................................................................*............... + // trn1 v27.4s, v10.4s, v11.4s // ...............................................................~...................'..............................................................*................... + // trn2 v28.4s, v10.4s, v11.4s // ................................................................~..................'...............................................................*.................. 
+ // trn2 v10.2d, v25.2d, v27.2d // ......................................................................~............'.....................................................................*............ + // trn2 v11.2d, v26.2d, v28.2d // ..........................................................................~........'.........................................................................*........ + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................~.............'....................................................................*............. + // trn1 v9.2d, v26.2d, v28.2d // ...........................................................................~.......'..........................................................................*....... + // str q8, [x1], #64 // .............................................................................~.....'............................................................................*..... + // str q9, [x1, #(-(64) + 16*1)] // ..................................................................................~'.................................................................................* + // str q10, [x1, #(-(64) + 16*2)] // ..............................................................................~....'.............................................................................*.... + // str q11, [x1, #(-(64) + 16*3)] // .................................................................................~.'................................................................................*. sub count, count, #1 cbnz count, layer4567_start - sub v2.8H, v19.8H, v26.8H // ...*................................... - add v8.8H, v19.8H, v26.8H // .*..................................... - mls v12.8H, v4.8H, v7.H[0] // *...................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqrdmulh v24.8H, v2.8H, v3.8H // ....*.................................. - mul v16.8H, v2.8H, v11.8H // .......*............................... - sqrdmulh v13.8H, v8.8H, v0.8H // ......*................................ - mul v5.8H, v8.8H, v22.8H // .....*................................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v23.8H, v10.8H, v12.8H // ..*.................................... - add v15.8H, v10.8H, v12.8H // ........*.............................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- mls v16.8H, v24.8H, v7.H[0] // .........*............................. - mls v5.8H, v13.8H, v7.H[0] // ..........*............................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v22.8H, v15.8H, v5.8H // ...........*........................... - add v9.8H, v15.8H, v5.8H // ............*.......................... - sub v15.8H, v23.8H, v16.8H // ..............*........................ - add v13.8H, v23.8H, v16.8H // .............*......................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqdmulh v23.8H, v13.8H, v7.H[1] // ................*...................... - sqdmulh v12.8H, v9.8H, v7.H[1] // ..................*.................... - sqdmulh v26.8H, v15.8H, v7.H[1] // .................*..................... - sqdmulh v18.8H, v22.8H, v7.H[1] // ...............*....................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - srshr v4.8H, v12.8H, #11 // ...................*................... - srshr v30.8H, v26.8H, #11 // ....................*.................. - srshr v14.8H, v18.8H, #11 // ......................*................ - srshr v3.8H, v23.8H, #11 // .....................*................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v22.8H, v14.8H, v7.H[0] // .........................*............. - mls v15.8H, v30.8H, v7.H[0] // .......................*............... - mls v9.8H, v4.8H, v7.H[0] // ..........................*............ - mls v13.8H, v3.8H, v7.H[0] // ........................*.............. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn2 v12.4S, v9.4S, v22.4S // .............................*......... - trn1 v8.4S, v9.4S, v22.4S // ..............................*........ - trn2 v28.4S, v13.4S, v15.4S // ...........................*........... - trn1 v6.4S, v13.4S, v15.4S // ............................*.......... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn2 v23.2D, v12.2D, v28.2D // ...............................*....... - trn1 v2.2D, v12.2D, v28.2D // ................................*...... - trn1 v25.2D, v8.2D, v6.2D // ..................................*.... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - trn2 v24.2D, v8.2D, v6.2D // .................................*..... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - str q25, [x1], #64 // ......................................* - str q23, [x1, #-16] // ...................................*... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - str q2, [x1, #-48] // ....................................*.. - str q24, [x1, #-32] // .....................................*. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... + // Instructions: 42 + // Expected cycles: 27 + // Expected IPC: 1.56 + // + // Cycle bound: 27.0 + // IPC bound: 1.56 + // + // Wall time: 0.89s + // User time: 0.89s + // + // ----------- original position -----------> + // 0 25 + // |------------------------|---------------- + trn1 v16.2D, v22.2D, v12.2D // *......................................... + sqrdmulh v27.8H, v27.8H, v19.8H // .*........................................ + ldr q14, [x4, #-16] // ..*....................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v26.8H, v16.8H, v30.8H // ...*...................................... + sub v16.8H, v16.8H, v30.8H // ....*..................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v8.8H, v27.8H, v7.H[0] // .....*.................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqrdmulh v27.8H, v26.8H, v20.8H // ......*................................... + mul v15.8H, v26.8H, v15.8H // .......*.................................. + mul v26.8H, v16.8H, v18.8H // .........*................................ + sqrdmulh v16.8H, v16.8H, v14.8H // ........*................................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v14.8H, v6.8H, v8.8H // ..........*............................... + add v6.8H, v6.8H, v8.8H // ...........*.............................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v15.8H, v27.8H, v7.H[0] // ............*............................. + mls v26.8H, v16.8H, v7.H[0] // .............*............................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + sub v16.8H, v14.8H, v26.8H // ................*......................... + add v14.8H, v14.8H, v26.8H // .................*........................ + sub v26.8H, v6.8H, v15.8H // ..............*........................... + add v15.8H, v6.8H, v15.8H // ...............*.......................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqdmulh v27.8H, v26.8H, v7.H[1] // ..................*....................... + sqdmulh v6.8H, v14.8H, v7.H[1] // ...................*...................... + sqdmulh v0.8H, v16.8H, v7.H[1] // ....................*..................... + sqdmulh v11.8H, v15.8H, v7.H[1] // .....................*.................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + srshr v27.8H, v27.8H, #11 // .......................*.................. + srshr v6.8H, v6.8H, #11 // .........................*................ + srshr v0.8H, v0.8H, #11 // ......................*................... + srshr v11.8H, v11.8H, #11 // ........................*................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v14.8H, v6.8H, v7.H[0] // .............................*............ + mls v16.8H, v0.8H, v7.H[0] // ............................*............. 
+ mls v26.8H, v27.8H, v7.H[0] // ..........................*............... + mls v15.8H, v11.8H, v7.H[0] // ...........................*.............. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + trn1 v27.4S, v15.4S, v26.4S // ................................*......... + trn1 v6.4S, v14.4S, v16.4S // ..............................*........... + trn2 v16.4S, v14.4S, v16.4S // ...............................*.......... + trn2 v15.4S, v15.4S, v26.4S // .................................*........ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + trn1 v14.2D, v27.2D, v6.2D // ..................................*....... + trn2 v27.2D, v27.2D, v6.2D // ...................................*...... + trn2 v6.2D, v15.2D, v16.2D // ....................................*..... + trn1 v15.2D, v15.2D, v16.2D // .....................................*.... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q14, [x1], #64 // ......................................*... + str q27, [x1, #-32] // .......................................*.. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q6, [x1, #-16] // ........................................*. + str q15, [x1, #-48] // .........................................* + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... - // original source code - // mls v12.8H, v4.8H, v7.H[0] // ..*.................................... - // add v27.8H, v19.8H, v26.8H // .*..................................... - // sub v4.8H, v10.8H, v12.8H // .......*............................... - // sub v8.8H, v19.8H, v26.8H // *...................................... - // sqrdmulh v3.8H, v8.8H, v3.8H // ...*................................... - // mul v28.8H, v27.8H, v22.8H // ......*................................ - // sqrdmulh v0.8H, v27.8H, v0.8H // .....*................................. - // mul v22.8H, v8.8H, v11.8H // ....*.................................. - // add v6.8H, v10.8H, v12.8H // ........*.............................. - // mls v22.8H, v3.8H, v7.H[0] // .........*............................. - // mls v28.8H, v0.8H, v7.H[0] // ..........*............................ - // sub v17.8H, v6.8H, v28.8H // ...........*........................... - // add v10.8H, v6.8H, v28.8H // ............*.......................... - // add v0.8H, v4.8H, v22.8H // ..............*........................ - // sub v26.8H, v4.8H, v22.8H // .............*......................... - // sqdmulh v4.8H, v17.8H, v7.H[1] // ..................*.................... - // sqdmulh v22.8H, v0.8H, v7.H[1] // ...............*....................... - // sqdmulh v23.8H, v26.8H, v7.H[1] // .................*..................... - // sqdmulh v3.8H, v10.8H, v7.H[1] // ................*...................... - // srshr v27.8H, v3.8H, #11 // ...................*................... - // srshr v23.8H, v23.8H, #11 // ....................*.................. - // srshr v22.8H, v22.8H, #11 // ......................*................ - // srshr v4.8H, v4.8H, #11 // .....................*................. - // mls v26.8H, v23.8H, v7.H[0] // ........................*.............. 
- // mls v0.8H, v22.8H, v7.H[0] // ..........................*............ - // mls v17.8H, v4.8H, v7.H[0] // .......................*............... - // mls v10.8H, v27.8H, v7.H[0] // .........................*............. - // trn2 v23.4S, v0.4S, v26.4S // .............................*......... - // trn1 v13.4S, v0.4S, v26.4S // ..............................*........ - // trn2 v15.4S, v10.4S, v17.4S // ...........................*........... - // trn1 v20.4S, v10.4S, v17.4S // ............................*.......... - // trn2 v25.2D, v15.2D, v23.2D // ...............................*....... - // trn1 v23.2D, v15.2D, v23.2D // ................................*...... - // trn2 v27.2D, v20.2D, v13.2D // ..................................*.... - // trn1 v9.2D, v20.2D, v13.2D // .................................*..... - // str q25, [x1, #48] // ....................................*.. - // str q23, [x1, #16] // .....................................*. - // str q27, [x1, #32] // ......................................* - // str q9, [x1], #64 // ...................................*... + // ------------- new position --------------> + // 0 25 + // |------------------------|---------------- + // trn1 v12.2D, v22.2D, v12.2D // *......................................... + // sqrdmulh v17.8H, v27.8H, v19.8H // .*........................................ + // ldr q9, [x4, #-16] // ..*....................................... + // add v16.8H, v12.8H, v30.8H // ...*...................................... + // sub v1.8H, v12.8H, v30.8H // ....*..................................... + // mls v8.8H, v17.8H, v7.H[0] // .....*.................................... + // sqrdmulh v14.8H, v16.8H, v20.8H // ......*................................... + // mul v27.8H, v16.8H, v15.8H // .......*.................................. + // sqrdmulh v16.8H, v1.8H, v9.8H // .........*................................ + // mul v15.8H, v1.8H, v18.8H // ........*................................. 
+ // sub v0.8H, v6.8H, v8.8H // ..........*............................... + // add v10.8H, v6.8H, v8.8H // ...........*.............................. + // mls v27.8H, v14.8H, v7.H[0] // ............*............................. + // mls v15.8H, v16.8H, v7.H[0] // .............*............................ + // sub v1.8H, v10.8H, v27.8H // ................*......................... + // add v11.8H, v10.8H, v27.8H // .................*........................ + // sub v6.8H, v0.8H, v15.8H // ..............*........................... + // add v26.8H, v0.8H, v15.8H // ...............*.......................... + // sqdmulh v27.8H, v1.8H, v7.H[1] // ..................*....................... + // sqdmulh v16.8H, v26.8H, v7.H[1] // ...................*...................... + // sqdmulh v15.8H, v6.8H, v7.H[1] // ....................*..................... + // sqdmulh v14.8H, v11.8H, v7.H[1] // .....................*.................... + // srshr v15.8H, v15.8H, #11 // ........................*................. + // srshr v27.8H, v27.8H, #11 // ......................*................... + // srshr v14.8H, v14.8H, #11 // .........................*................ + // srshr v16.8H, v16.8H, #11 // .......................*.................. + // mls v1.8H, v27.8H, v7.H[0] // ............................*............. + // mls v11.8H, v14.8H, v7.H[0] // .............................*............ + // mls v6.8H, v15.8H, v7.H[0] // ...........................*.............. + // mls v26.8H, v16.8H, v7.H[0] // ..........................*............... + // trn1 v17.4S, v26.4S, v6.4S // ...............................*.......... + // trn2 v15.4S, v26.4S, v6.4S // ................................*......... + // trn1 v27.4S, v11.4S, v1.4S // ..............................*........... + // trn2 v16.4S, v11.4S, v1.4S // .................................*........ + // trn1 v4.2D, v27.2D, v17.2D // ..................................*....... 
+ // trn2 v14.2D, v27.2D, v17.2D // ...................................*...... + // trn2 v3.2D, v16.2D, v15.2D // ....................................*..... + // trn1 v16.2D, v16.2D, v15.2D // .....................................*.... + // str q4, [x1], #64 // ......................................*... + // str q14, [x1, #-32] // .......................................*.. + // str q3, [x1, #-16] // ........................................*. + // str q16, [x1, #-48] // .........................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_icestorm.s index d32eafce..a4c1190e 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_manual_st4_opt_m1_icestorm.s @@ -26,30 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -67,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,28 +73,28 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, 
#(-6*16 + 5*16)] .endm .macro store_vectors_with_inc a0, a1, a2, a3, addr, inc - str_vi \a0, \addr, \inc - str_vo \a1, \addr, (-(\inc) + 16*1) - str_vo \a2, \addr, (-(\inc) + 16*2) - str_vo \a3, \addr, (-(\inc) + 16*3) + str qform_\a0, [\addr], #\inc + str qform_\a1, [\addr, #(-(\inc) + 16*1)] + str qform_\a2, [\addr, #(-(\inc) + 16*2)] + str qform_\a3, [\addr, #(-(\inc) + 16*3)] .endm .macro transpose4 data @@ -146,7 +116,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -157,7 +127,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -167,7 +137,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -175,7 +145,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -186,19 +156,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -337,1006 +307,1112 @@ _ntt_kyber_123_4567_manual_st4_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr q27, [x0, #256] // ...........*............................................ 
- // gap // ........................................................ - // gap // ........................................................ - ldr q12, [x0, #448] // *....................................................... - ldr q31, [x0, #192] // ...*.................................................... - // gap // ........................................................ - // gap // ........................................................ - ldr q4, [x0, #384] // ..*..................................................... - ldr q19, [x0, #0] // ....*................................................... - ldr q17, [x0, #320] // .*...................................................... - // gap // ........................................................ - // gap // ........................................................ - ldr q13, [x0, #128] // ........*............................................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v21.8H, v12.8H, v0.H[1] // ......*................................................. - mul v28.8H, v12.8H, v0.H[0] // .......*................................................ - // gap // ........................................................ - // gap // ........................................................ - mul v18.8H, v4.8H, v0.H[0] // .........*.............................................. - sqrdmulh v5.8H, v4.8H, v0.H[1] // ..........*............................................. - // gap // ........................................................ - // gap // ........................................................ - mul v9.8H, v17.8H, v0.H[0] // ...............*........................................ - sqrdmulh v10.8H, v17.8H, v0.H[1] // ............*........................................... 
- // gap // ........................................................ - // gap // ........................................................ - mul v6.8H, v27.8H, v0.H[0] // ..........................*............................. - mls v28.8H, v21.8H, v7.H[0] // .............*.......................................... - // gap // ........................................................ - // gap // ........................................................ - mls v18.8H, v5.8H, v7.H[0] // ..............*......................................... - sqrdmulh v3.8H, v27.8H, v0.H[1] // ................*....................................... - // gap // ........................................................ - // gap // ........................................................ - mls v9.8H, v10.8H, v7.H[0] // ....................*................................... - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - add v22.8H, v31.8H, v28.8H // .................*...................................... - sub v25.8H, v31.8H, v28.8H // ..................*..................................... - // gap // ........................................................ - // gap // ........................................................ - sub v12.8H, v13.8H, v18.8H // ...................*.................................... - add v8.8H, v13.8H, v18.8H // .........................*.............................. - ldr q13, [x0, #64] // .....*.................................................. - // gap // ........................................................ - mul v30.8H, v22.8H, v0.H[2] // .....................*.................................. - sqrdmulh v22.8H, v22.8H, v0.H[3] // ......................*................................. - // gap // ........................................................ 
- // gap // ........................................................ - mls v6.8H, v3.8H, v7.H[0] // ...................................*.................... - mul v2.8H, v25.8H, v0.H[4] // ....................................*................... - // gap // ........................................................ - // gap // ........................................................ - mul v28.8H, v12.8H, v0.H[4] // .......................*................................ - sqrdmulh v12.8H, v12.8H, v0.H[5] // ........................*............................... - // gap // ........................................................ - // gap // ........................................................ - mls v30.8H, v22.8H, v7.H[0] // ...........................*............................ - mul v31.8H, v8.8H, v0.H[2] // ...............................*........................ - // gap // ........................................................ - // gap // ........................................................ - sqrdmulh v27.8H, v8.8H, v0.H[3] // ................................*....................... - add v14.8H, v13.8H, v9.8H // ..............................*......................... - // gap // ........................................................ - // gap // ........................................................ - sub v29.8H, v13.8H, v9.8H // ............................................*........... - sqrdmulh v25.8H, v25.8H, v0.H[5] // ............................*........................... - // gap // ........................................................ - // gap // ........................................................ - sub v17.8H, v14.8H, v30.8H // .................................*...................... - add v9.8H, v14.8H, v30.8H // ..................................*..................... - // gap // ........................................................ - // gap // ........................................................ 
- mls v31.8H, v27.8H, v7.H[0] // ........................................*............... - add v24.8H, v19.8H, v6.8H // ..........................................*............. - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - mul v22.8H, v17.8H, v1.H[0] // .....................................*.................. - // gap // ........................................................ - sqrdmulh v8.8H, v17.8H, v1.H[1] // ......................................*................. - mls v2.8H, v25.8H, v7.H[0] // ..............................................*......... - sqrdmulh v26.8H, v9.8H, v0.H[7] // .........................................*.............. - // gap // ........................................................ - // gap // ........................................................ - mul v27.8H, v9.8H, v0.H[6] // .......................................*................ - sub v3.8H, v24.8H, v31.8H // .............................................*.......... - // gap // ........................................................ - // gap // ........................................................ - add v10.8H, v24.8H, v31.8H // .................................................*...... - // gap // ........................................................ - // gap // ........................................................ - mls v22.8H, v8.8H, v7.H[0] // ...........................................*............ - // gap // ........................................................ - sub v13.8H, v29.8H, v2.8H // ..................................................*..... - // gap // ........................................................ - // gap // ........................................................ - add v21.8H, v29.8H, v2.8H // ....................................................*... 
- mls v27.8H, v26.8H, v7.H[0] // ...............................................*........ - // gap // ........................................................ - // gap // ........................................................ - mls v28.8H, v12.8H, v7.H[0] // .............................*.......................... - // gap // ........................................................ - // gap // ........................................................ - add v5.8H, v3.8H, v22.8H // ................................................*....... - sqrdmulh v8.8H, v13.8H, v1.H[5] // .......................................................* - // gap // ........................................................ - // gap // ........................................................ - // gap // ........................................................ - sub v30.8H, v10.8H, v27.8H // .....................................................*.. - mul v9.8H, v13.8H, v1.H[4] // ......................................................*. - // gap // ........................................................ - str q5, [x0, #128] // ...................................................*.... + // Instructions: 27 + // Expected cycles: 16 + // Expected IPC: 1.69 + // + // Cycle bound: 16.0 + // IPC bound: 1.69 + // + // Wall time: 0.21s + // User time: 0.21s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q9, [x0, #0] // ..........*................... + ldr q19, [x0, #384] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #448] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q14, [x0, #320] // ...................*.......... + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ ldr q12, [x0, #256] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v13.8H, v19.8H, v0.H[0] // .....*........................ + sqrdmulh v29.8H, v19.8H, v0.H[1] // ....*......................... + ldr q27, [x0, #192] // ........*..................... + // gap // .............................. + mul v28.8H, v15.8H, v0.H[0] // ......*....................... + sqrdmulh v15.8H, v15.8H, v0.H[1] // .......*...................... + ldr q5, [x0, #64] // ........................*..... + // gap // .............................. + ldr q25, [x0, #128] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v13.8H, v29.8H, v7.H[0] // ...........*.................. + mul v23.8H, v12.8H, v0.H[0] // .........*.................... + // gap // .............................. + // gap // .............................. + mls v28.8H, v15.8H, v7.H[0] // ...............*.............. + sqrdmulh v15.8H, v12.8H, v0.H[1] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v16.8H, v25.8H, v13.8H // .............*................ + add v12.8H, v25.8H, v13.8H // ..........................*... + // gap // .............................. + // gap // .............................. + add v6.8H, v27.8H, v28.8H // ..................*........... + mls v23.8H, v15.8H, v7.H[0] // ..............*............... + // gap // .............................. + // gap // .............................. + mul v11.8H, v16.8H, v0.H[4] // ................*............. + sqrdmulh v15.8H, v16.8H, v0.H[5] // .................*............ 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v26.8H, v27.8H, v28.8H // ......................*....... + add v31.8H, v9.8H, v23.8H // ....................*......... + // gap // .............................. + // gap // .............................. + sqrdmulh v27.8H, v6.8H, v0.H[3] // .....................*........ + mls v11.8H, v15.8H, v7.H[0] // .........................*.... + mul v15.8H, v6.8H, v0.H[2] // .......................*...... + // gap // .............................. + // gap // .............................. - // original source code - // ldr q26, [x0, #448] // .*...................................................... - // ldr q6, [x0, #320] // .....*.................................................. - // ldr q14, [x0, #384] // ...*.................................................... - // ldr q5, [x0, #192] // ..*..................................................... - // ldr q19, [x0, #0] // ....*................................................... - // ldr q4, [x0, #64] // ......................*................................. - // sqrdmulh v22.8H, v26.8H, v0.H[1] // .......*................................................ - // mul v24.8H, v26.8H, v0.H[0] // ........*............................................... - // ldr q16, [x0, #128] // ......*................................................. - // mul v8.8H, v14.8H, v0.H[0] // .........*.............................................. - // sqrdmulh v12.8H, v14.8H, v0.H[1] // ..........*............................................. - // ldr q27, [x0, #256] // *....................................................... - // sqrdmulh v11.8H, v6.8H, v0.H[1] // ............*........................................... - // mls v24.8H, v22.8H, v7.H[0] // ..............*......................................... 
- // mls v8.8H, v12.8H, v7.H[0] // ...............*........................................ - // mul v18.8H, v6.8H, v0.H[0] // ...........*............................................ - // sqrdmulh v23.8H, v27.8H, v0.H[1] // ................*....................................... - // add v29.8H, v5.8H, v24.8H // ..................*..................................... - // sub v5.8H, v5.8H, v24.8H // ...................*.................................... - // sub v31.8H, v16.8H, v8.8H // ....................*................................... - // mls v18.8H, v11.8H, v7.H[0] // .................*...................................... - // mul v2.8H, v29.8H, v0.H[2] // .......................*................................ - // sqrdmulh v9.8H, v29.8H, v0.H[3] // ........................*............................... - // mul v28.8H, v31.8H, v0.H[4] // ...........................*............................ - // sqrdmulh v11.8H, v31.8H, v0.H[5] // ............................*........................... - // add v21.8H, v16.8H, v8.8H // .....................*.................................. - // mul v6.8H, v27.8H, v0.H[0] // .............*.......................................... - // mls v2.8H, v9.8H, v7.H[0] // .............................*.......................... - // sqrdmulh v29.8H, v5.8H, v0.H[5] // ..................................*..................... - // mls v28.8H, v11.8H, v7.H[0] // ..................................................*..... - // add v16.8H, v4.8H, v18.8H // ................................*....................... - // mul v8.8H, v21.8H, v0.H[2] // ..............................*......................... - // sqrdmulh v13.8H, v21.8H, v0.H[3] // ...............................*........................ - // sub v12.8H, v16.8H, v2.8H // ...................................*.................... - // add v17.8H, v16.8H, v2.8H // ....................................*................... 
- // mls v6.8H, v23.8H, v7.H[0] // .........................*.............................. - // mul v21.8H, v5.8H, v0.H[4] // ..........................*............................. - // mul v22.8H, v12.8H, v1.H[0] // .......................................*................ - // sqrdmulh v16.8H, v12.8H, v1.H[1] // ........................................*............... - // mul v27.8H, v17.8H, v0.H[6] // ...........................................*............ - // mls v8.8H, v13.8H, v7.H[0] // .....................................*.................. - // sqrdmulh v31.8H, v17.8H, v0.H[7] // ..........................................*............. - // add v10.8H, v19.8H, v6.8H // ......................................*................. - // mls v22.8H, v16.8H, v7.H[0] // ..............................................*......... - // sub v2.8H, v4.8H, v18.8H // .................................*...................... - // sub v3.8H, v10.8H, v8.8H // ............................................*........... - // mls v21.8H, v29.8H, v7.H[0] // .........................................*.............. - // mls v27.8H, v31.8H, v7.H[0] // .................................................*...... - // add v9.8H, v3.8H, v22.8H // ...................................................*.... - // add v10.8H, v10.8H, v8.8H // .............................................*.......... - // sub v8.8H, v2.8H, v21.8H // ...............................................*........ - // str q9, [x0, #128] // .......................................................* - // add v21.8H, v2.8H, v21.8H // ................................................*....... - // sub v30.8H, v10.8H, v27.8H // .....................................................*.. - // mul v9.8H, v8.8H, v1.H[4] // ......................................................*. - // sqrdmulh v8.8H, v8.8H, v1.H[5] // ....................................................*... 
+ // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q18, [x0, #256] // ....*.......................... + // ldr q3, [x0, #384] // .*............................. + // ldr q25, [x0, #448] // ..*............................ + // ldr q21, [x0, #128] // ...........*................... + // sqrdmulh v13.8H, v3.8H, v0.H[1] // ......*........................ + // mul v4.8H, v3.8H, v0.H[0] // .....*......................... + // mul v17.8H, v25.8H, v0.H[0] // ........*...................... + // sqrdmulh v25.8H, v25.8H, v0.H[1] // .........*..................... + // ldr q24, [x0, #192] // .......*....................... + // mul v23.8H, v18.8H, v0.H[0] // .............*................. + // ldr q9, [x0, #0] // *.............................. + // mls v4.8H, v13.8H, v7.H[0] // ............*.................. + // sqrdmulh v29.8H, v18.8H, v0.H[1] // ...............*............... + // sub v5.8H, v21.8H, v4.8H // ................*.............. + // mls v23.8H, v29.8H, v7.H[0] // ...................*........... + // mls v17.8H, v25.8H, v7.H[0] // ..............*................ + // mul v11.8H, v5.8H, v0.H[4] // ....................*.......... + // sqrdmulh v25.8H, v5.8H, v0.H[5] // .....................*......... + // add v6.8H, v24.8H, v17.8H // ..................*............ + // ldr q14, [x0, #320] // ...*........................... + // add v31.8H, v9.8H, v23.8H // .......................*....... + // sqrdmulh v27.8H, v6.8H, v0.H[3] // ........................*...... + // sub v26.8H, v24.8H, v17.8H // ......................*........ + // mul v15.8H, v6.8H, v0.H[2] // ..........................*.... + // ldr q5, [x0, #64] // ..........*.................... + // mls v11.8H, v25.8H, v7.H[0] // .........................*..... + // add v12.8H, v21.8H, v4.8H // .................*............. sub count, count, #1 layer123_start: - str q30, [x0, #64] // .....................................................................*...... 
- mul v20.8H, v21.8H, v1.H[2] // ..........................................................*................. - sqrdmulh v2.8H, v21.8H, v1.H[3] // ...........................................................*................ - ldr q26, [x0, #464] // .......e.................................................................... - sub v30.8H, v19.8H, v6.8H // ...........*................................................................ - ldr q6, [x0, #336] // .....e...................................................................... - ldr q14, [x0, #400] // ......e..................................................................... - add v27.8H, v10.8H, v27.8H // ....................................................*....................... - ldr q5, [x0, #208] // ...e........................................................................ - ldr q19, [x0, #16] // e........................................................................... - mls v9.8H, v8.8H, v7.H[0] // .................................................................*.......... - sub v12.8H, v3.8H, v22.8H // ........................................................*................... - mls v20.8H, v2.8H, v7.H[0] // ............................................................*............... - str q27, [x0], #(16) // ....................................................................*....... - sub v18.8H, v30.8H, v28.8H // .........................................*.................................. - ldr q4, [x0, #64] // .e.......................................................................... - sqrdmulh v22.8H, v26.8H, v0.H[1] // ........................e................................................... - mul v24.8H, v26.8H, v0.H[0] // .......................e.................................................... - str q12, [x0, #176] // .......................................................................*.... 
- ldr q16, [x0, #128] // ..e......................................................................... - mul v8.8H, v14.8H, v0.H[0] // ..................e......................................................... - sqrdmulh v12.8H, v14.8H, v0.H[1] // ...................e........................................................ + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Cycle bound: 30.0 + // IPC bound: 2.53 + // + // Wall time: 53.19s + // User time: 53.19s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| // gap // ............................................................................ - ldr q27, [x0, #256] // ....e....................................................................... - add v17.8H, v18.8H, v9.8H // ...................................................................*........ - sub v9.8H, v18.8H, v9.8H // ..................................................................*......... // gap // ............................................................................ + sqrdmulh v13.8H, v14.8H, v0.H[1] // .............*.............................................................. + mul v6.8H, v26.8H, v0.H[4] // ............................................*............................... + ldr q18, [x0, #272] // ....e....................................................................... + sqrdmulh v16.8H, v26.8H, v0.H[5] // ...........................................*................................ + ldr q3, [x0, #400] // ......e..................................................................... + mul v14.8H, v14.8H, v0.H[0] // ..............*............................................................. + mls v15.8H, v27.8H, v7.H[0] // ...................................*........................................ 
+ sqrdmulh v26.8H, v12.8H, v0.H[3] // ............................*............................................... + ldr q25, [x0, #464] // .......e.................................................................... + ldr q21, [x0, #144] // ..e......................................................................... + mul v27.8H, v12.8H, v0.H[2] // .............................*.............................................. + sub v22.8H, v9.8H, v23.8H // ...........*................................................................ // gap // ............................................................................ - sqrdmulh v11.8H, v6.8H, v0.H[1] // ..............e............................................................. - mls v24.8H, v22.8H, v7.H[0] // .........................e.................................................. // gap // ............................................................................ + mls v6.8H, v16.8H, v7.H[0] // .............................................*.............................. // gap // ............................................................................ - mls v8.8H, v12.8H, v7.H[0] // ....................e....................................................... - mul v18.8H, v6.8H, v0.H[0] // .............e.............................................................. - str q9, [x0, #432] // ...........................................................................* // gap // ............................................................................ - add v30.8H, v30.8H, v28.8H // ..........................................*................................. - str q17, [x0, #368] // ..........................................................................*. - sqrdmulh v23.8H, v27.8H, v0.H[1] // .........e.................................................................. + mls v14.8H, v13.8H, v7.H[0] // ...............*............................................................ 
+ sqrdmulh v13.8H, v3.8H, v0.H[1] // ..................e......................................................... + mul v4.8H, v3.8H, v0.H[0] // ...................e........................................................ // gap // ............................................................................ - add v29.8H, v5.8H, v24.8H // ...........................e................................................ - sub v5.8H, v5.8H, v24.8H // ..........................e................................................. // gap // ............................................................................ // gap // ............................................................................ - sub v31.8H, v16.8H, v8.8H // .....................e...................................................... - mls v18.8H, v11.8H, v7.H[0] // ...............e............................................................ // gap // ............................................................................ + mls v27.8H, v26.8H, v7.H[0] // ..............................*............................................. + mul v17.8H, v25.8H, v0.H[0] // ........................e................................................... + sqrdmulh v25.8H, v25.8H, v0.H[1] // .......................e.................................................... + ldr q24, [x0, #208] // ...e........................................................................ // gap // ............................................................................ - mul v2.8H, v29.8H, v0.H[2] // .................................e.......................................... - sqrdmulh v9.8H, v29.8H, v0.H[3] // ..................................e......................................... + mul v23.8H, v18.8H, v0.H[0] // .........e.................................................................. + ldr q9, [x0, #16] // e........................................................................... 
+ mls v4.8H, v13.8H, v7.H[0] // ....................e....................................................... // gap // ............................................................................ + add v16.8H, v5.8H, v14.8H // .................*.......................................................... + sub v19.8H, v31.8H, v27.8H // ...............................*............................................ + add v2.8H, v22.8H, v11.8H // ..........................................*................................. // gap // ............................................................................ - mul v28.8H, v31.8H, v0.H[4] // ......................................e..................................... - sqrdmulh v11.8H, v31.8H, v0.H[5] // .......................................e.................................... // gap // ............................................................................ + add v30.8H, v31.8H, v27.8H // ................................*........................................... // gap // ............................................................................ - add v21.8H, v16.8H, v8.8H // ......................e..................................................... - mul v6.8H, v27.8H, v0.H[0] // ........e................................................................... // gap // ............................................................................ + add v26.8H, v16.8H, v15.8H // .....................................*...................................... + sub v27.8H, v5.8H, v14.8H // ................*........................................................... + sqrdmulh v29.8H, v18.8H, v0.H[1] // ........e................................................................... // gap // ............................................................................ - mls v2.8H, v9.8H, v7.H[0] // ...................................e........................................ 
- sqrdmulh v29.8H, v5.8H, v0.H[5] // ............................................e............................... // gap // ............................................................................ // gap // ............................................................................ - mls v28.8H, v11.8H, v7.H[0] // ........................................e................................... - add v16.8H, v4.8H, v18.8H // .................e.......................................................... // gap // ............................................................................ + mul v13.8H, v26.8H, v0.H[6] // .................................................*.......................... + sub v8.8H, v16.8H, v15.8H // ....................................*....................................... // gap // ............................................................................ - mul v8.8H, v21.8H, v0.H[2] // ............................e............................................... - sqrdmulh v13.8H, v21.8H, v0.H[3] // .............................e.............................................. // gap // ............................................................................ + sub v16.8H, v27.8H, v6.8H // ..............................................*............................. + add v28.8H, v27.8H, v6.8H // ...............................................*............................ // gap // ............................................................................ - sub v12.8H, v16.8H, v2.8H // ....................................e....................................... - add v17.8H, v16.8H, v2.8H // .....................................e...................................... // gap // ............................................................................ + sqrdmulh v27.8H, v26.8H, v0.H[7] // ................................................*........................... 
+ sqrdmulh v15.8H, v8.8H, v1.H[1] // .....................................................*...................... // gap // ............................................................................ - mls v6.8H, v23.8H, v7.H[0] // ..........e................................................................. - mul v21.8H, v5.8H, v0.H[4] // ...........................................e................................ + sqrdmulh v6.8H, v16.8H, v1.H[5] // ...............................................................*............ // gap // ............................................................................ + mul v26.8H, v8.8H, v1.H[0] // ......................................................*..................... + mul v16.8H, v16.8H, v1.H[4] // ................................................................*........... + sub v5.8H, v21.8H, v4.8H // .....................e...................................................... // gap // ............................................................................ - mul v22.8H, v12.8H, v1.H[0] // .....................................................e...................... - sqrdmulh v16.8H, v12.8H, v1.H[1] // ......................................................e..................... // gap // ............................................................................ + mls v13.8H, v27.8H, v7.H[0] // ..................................................*......................... // gap // ............................................................................ - mul v27.8H, v17.8H, v0.H[6] // ................................................e........................... - mls v8.8H, v13.8H, v7.H[0] // ..............................e............................................. // gap // ............................................................................ + sub v22.8H, v22.8H, v11.8H // .........................................*.................................. 
// gap // ............................................................................ - sqrdmulh v31.8H, v17.8H, v0.H[7] // .................................................e.......................... + sqrdmulh v27.8H, v28.8H, v1.H[3] // ..........................................................*................. // gap // ............................................................................ + mls v26.8H, v15.8H, v7.H[0] // .......................................................*.................... + mls v16.8H, v6.8H, v7.H[0] // .................................................................*.......... + mul v14.8H, v28.8H, v1.H[2] // ...........................................................*................ // gap // ............................................................................ - add v10.8H, v19.8H, v6.8H // ............e............................................................... - mls v22.8H, v16.8H, v7.H[0] // .......................................................e.................... - sub v2.8H, v4.8H, v18.8H // ................e........................................................... // gap // ............................................................................ + add v11.8H, v30.8H, v13.8H // ....................................................*....................... // gap // ............................................................................ - sub v3.8H, v10.8H, v8.8H // ...............................e............................................ - mls v21.8H, v29.8H, v7.H[0] // .............................................e.............................. // gap // ............................................................................ + mls v23.8H, v29.8H, v7.H[0] // ..........e................................................................. // gap // ............................................................................ 
- mls v27.8H, v31.8H, v7.H[0] // ..................................................e......................... // gap // ............................................................................ + mls v17.8H, v25.8H, v7.H[0] // .........................e.................................................. + sub v6.8H, v19.8H, v26.8H // ........................................................*................... + sub v15.8H, v22.8H, v16.8H // ..................................................................*......... // gap // ............................................................................ - sub v12.8H, v30.8H, v20.8H // .............................................................*.............. - add v9.8H, v3.8H, v22.8H // .........................................................e.................. - add v10.8H, v10.8H, v8.8H // ................................e........................................... + mls v14.8H, v27.8H, v7.H[0] // ............................................................*............... + str q11, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ + sub v27.8H, v30.8H, v13.8H // ...................................................*........................ + mul v11.8H, v5.8H, v0.H[4] // .......................................e.................................... + str q6, [x0, #176] // .......................................................................*.... + add v3.8H, v22.8H, v16.8H // ...................................................................*........ + str q15, [x0, #432] // ...........................................................................* // gap // ............................................................................ - sub v8.8H, v2.8H, v21.8H // ..............................................e............................. 
- str q12, [x0, #304] // .........................................................................*.. - add v11.8H, v30.8H, v20.8H // ..............................................................*............. + sqrdmulh v25.8H, v5.8H, v0.H[5] // ......................................e..................................... + str q27, [x0, #48] // .....................................................................*...... + add v6.8H, v24.8H, v17.8H // ...........................e................................................ + sub v27.8H, v2.8H, v14.8H // .............................................................*.............. // gap // ............................................................................ - str q9, [x0, #128] // ......................................................................e..... - add v21.8H, v2.8H, v21.8H // ...............................................e............................ - sub v30.8H, v10.8H, v27.8H // ...................................................e........................ + str q3, [x0, #368] // ..........................................................................*. + add v15.8H, v2.8H, v14.8H // ..............................................................*............. + ldr q14, [x0, #320] // .....e...................................................................... + add v31.8H, v9.8H, v23.8H // ............e............................................................... + add v16.8H, v19.8H, v26.8H // .........................................................*.................. + str q27, [x0, #304] // .........................................................................*.. // gap // ............................................................................ - str q11, [x0, #240] // ........................................................................*... - mul v9.8H, v8.8H, v1.H[4] // ...............................................................e............ 
- sqrdmulh v8.8H, v8.8H, v1.H[5] // ................................................................e........... + sqrdmulh v27.8H, v6.8H, v0.H[3] // .................................e.......................................... + str q15, [x0, #240] // ........................................................................*... + sub v26.8H, v24.8H, v17.8H // ..........................e................................................. + mul v15.8H, v6.8H, v0.H[2] // ..................................e......................................... // gap // ............................................................................ + str q16, [x0, #112] // ......................................................................*..... + ldr q5, [x0, #64] // .e.......................................................................... + mls v11.8H, v25.8H, v7.H[0] // ........................................e................................... + add v12.8H, v21.8H, v4.8H // ......................e..................................................... - // original source code - // ldr q8, [x0, #0] // ......e..................................................................|........e................................................................ - // ldr q9, [x0, #(1*(512/8))] // ............e............................................................|..............e.......................................................... - // ldr q10, [x0, #(2*(512/8))] // ................e........................................................|..................e...................................................... - // ldr q11, [x0, #(3*(512/8))] // .....e...................................................................|.......e................................................................. - // ldr q12, [x0, #(4*(512/8))] // ...................e.....................................................|.....................e................................................... 
- // ldr q13, [x0, #(5*(512/8))] // ..e......................................................................|....e.................................................................... - // ldr q14, [x0, #(6*(512/8))] // ...e.....................................................................|.....e................................................................... - // ldr q15, [x0, #(7*(512/8))] // e........................................................................|..e...................................................................... - // mul v24.8h, v12.8h, v0.h[0] // .......................................e.................................|.........................................e............................... - // sqrdmulh v12.8h, v12.8h, v0.h[1] // .............................e...........................................|...............................e......................................... - // mls v24.8h, v12.8h, v7.h[0] // ................................................e........................|..................................................e...................... - // sub v12.8h, v8.8h, v24.8h // .*.......................................................................|...*..................................................................... - // add v8.8h, v8.8h, v24.8h // .......................................................e.................|.........................................................e............... - // mul v24.8h, v13.8h, v0.h[0] // .........................e...............................................|...........................e............................................. - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ......................e..................................................|........................e................................................ 
- // mls v24.8h, v13.8h, v7.h[0] // .................................e.......................................|...................................e..................................... - // sub v13.8h, v9.8h, v24.8h // .........................................................e...............|...........................................................e............. - // add v9.8h, v9.8h, v24.8h // ...........................................e.............................|.............................................e........................... - // mul v24.8h, v14.8h, v0.h[0] // .................e.......................................................|...................e..................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ..................e......................................................|....................e.................................................... - // mls v24.8h, v14.8h, v7.h[0] // ........................e................................................|..........................e.............................................. - // sub v14.8h, v10.8h, v24.8h // ................................e........................................|..................................e...................................... - // add v10.8h, v10.8h, v24.8h // ......................................e..................................|........................................e................................ - // mul v24.8h, v15.8h, v0.h[0] // ..............e..........................................................|................e........................................................ - // sqrdmulh v15.8h, v15.8h, v0.h[1] // .............e...........................................................|...............e......................................................... 
- // mls v24.8h, v15.8h, v7.h[0] // .......................e.................................................|.........................e............................................... - // sub v15.8h, v11.8h, v24.8h // ...............................e.........................................|.................................e....................................... - // add v11.8h, v11.8h, v24.8h // ..............................e..........................................|................................e........................................ - // mul v24.8h, v10.8h, v0.h[2] // ............................................e............................|..............................................e.......................... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // .............................................e...........................|...............................................e......................... - // mls v24.8h, v10.8h, v7.h[0] // .....................................................e...................|.......................................................e................. - // sub v10.8h, v8.8h, v24.8h // ..........................................................e..............|............................................................e............ - // add v8.8h, v8.8h, v24.8h // ...............................................................e.........|.................................................................e....... - // mul v24.8h, v11.8h, v0.h[2] // ..................................e......................................|....................................e.................................... - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ...................................e.....................................|.....................................e................................... 
- // mls v24.8h, v11.8h, v7.h[0] // ........................................e................................|..........................................e.............................. - // sub v11.8h, v9.8h, v24.8h // ..............................................e..........................|................................................e........................ - // add v9.8h, v9.8h, v24.8h // ...............................................e.........................|.................................................e....................... - // mul v24.8h, v14.8h, v0.h[4] // ....................................e....................................|......................................e.................................. - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .....................................e...................................|.......................................e................................. - // mls v24.8h, v14.8h, v7.h[0] // ..........................................e..............................|............................................e............................ - // sub v14.8h, v12.8h, v24.8h // ...........*.............................................................|.............*........................................................... - // add v12.8h, v12.8h, v24.8h // ...........................*.............................................|.............................*........................................... - // mul v24.8h, v15.8h, v0.h[4] // .................................................e.......................|...................................................e..................... - // sqrdmulh v15.8h, v15.8h, v0.h[5] // .........................................e...............................|...........................................e............................. 
- // mls v24.8h, v15.8h, v7.h[0] // ...........................................................e.............|.............................................................e........... - // sub v15.8h, v13.8h, v24.8h // ................................................................e........|..................................................................e...... - // add v13.8h, v13.8h, v24.8h // ....................................................................e....|......................................................................e.. - // mul v24.8h, v9.8h, v0.h[6] // ....................................................e....................|......................................................e.................. - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ......................................................e..................|........................................................e................ - // mls v24.8h, v9.8h, v7.h[0] // ............................................................e............|..............................................................e.......... - // sub v9.8h, v8.8h, v24.8h // .....................................................................e...|.......................................................................e. - // add v8.8h, v8.8h, v24.8h // ....*....................................................................|......*.................................................................. - // mul v24.8h, v11.8h, v1.h[0] // ..................................................e......................|....................................................e.................... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...................................................e.....................|.....................................................e................... 
- // mls v24.8h, v11.8h, v7.h[0] // ........................................................e................|..........................................................e.............. - // sub v11.8h, v10.8h, v24.8h // ........*................................................................|..........*.............................................................. - // add v10.8h, v10.8h, v24.8h // ..............................................................e..........|................................................................e........ - // mul v24.8h, v13.8h, v1.h[2] // .........................................................................|*........................................................................ - // sqrdmulh v13.8h, v13.8h, v1.h[3] // .........................................................................|.*....................................................................... - // mls v24.8h, v13.8h, v7.h[0] // .........*...............................................................|...........*............................................................. - // sub v13.8h, v12.8h, v24.8h // .............................................................*...........|...............................................................*......... - // add v12.8h, v12.8h, v24.8h // ..................................................................*......|....................................................................*.... - // mul v24.8h, v15.8h, v1.h[4] // .......................................................................e.|......................................................................... - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ........................................................................e|......................................................................... 
- // mls v24.8h, v15.8h, v7.h[0] // .......*.................................................................|.........*............................................................... - // sub v15.8h, v14.8h, v24.8h // .....................*...................................................|.......................*................................................. - // add v14.8h, v14.8h, v24.8h // ....................*....................................................|......................*.................................................. - // str q8, [x0], #(16) // ..........*..............................................................|............*............................................................ - // str q9, [x0, #(-16 + 1*(512/8))] // .........................................................................*......................................................................... - // str q10, [x0, #(-16 + 2*(512/8))] // ...................................................................e.....|.....................................................................e... - // str q11, [x0, #(-16 + 3*(512/8))] // ...............*.........................................................|.................*....................................................... - // str q12, [x0, #(-16 + 4*(512/8))] // ......................................................................*..|........................................................................* - // str q13, [x0, #(-16 + 5*(512/8))] // .................................................................*.......|...................................................................*..... - // str q14, [x0, #(-16 + 6*(512/8))] // ............................*............................................|..............................*.......................................... 
- // str q15, [x0, #(-16 + 7*(512/8))] // ..........................*..............................................|............................*............................................ + // ------------------------------------------------------------------ new position ------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------------- + // ldr q8, [x0, #0] // ...................e......................................................'....................~................................................... + // ldr q9, [x0, #(1*(512/8))] // .......................................................................e..'........................................................................ + // ldr q10, [x0, #(2*(512/8))] // .......e..................................................................'........~............................................................... + // ldr q11, [x0, #(3*(512/8))] // .................e........................................................'..................~..................................................... + // ldr q12, [x0, #(4*(512/8))] // e.........................................................................'.~...................................................................... + // ldr q13, [x0, #(5*(512/8))] // ..............................................................e...........'...............................................................~........ + // ldr q14, [x0, #(6*(512/8))] // ..e.......................................................................'...~.................................................................... + // ldr q15, [x0, #(7*(512/8))] // ......e...................................................................'.......~................................................................ 
+ // sqrdmulh v27.8h, v12.8h, v0.h[1] // ...........................e..............................................'............................~........................................... + // mul v24.8h, v12.8h, v0.h[0] // ..................e.......................................................'...................~.................................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................................e............................'..............................................~......................... + // sub v12.8h, v8.8h, v24.8h // .........~................................................................'..........*............................................................. + // add v8.8h, v8.8h, v24.8h // ...............................................................e..........'................................................................~....... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..........................................................................*........................................................................ + // mul v24.8h, v13.8h, v0.h[0] // ...~......................................................................'....*................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........~..............................................................'............*........................................................... + // sub v13.8h, v9.8h, v24.8h // ..........................~...............................................'...........................*............................................ + // add v9.8h, v9.8h, v24.8h // .....................~....................................................'......................*................................................. 
+ // sqrdmulh v27.8h, v14.8h, v0.h[1] // ............e.............................................................'.............~.......................................................... + // mul v24.8h, v14.8h, v0.h[0] // .............e............................................................'..............~......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................e.....................................................'.....................~.................................................. + // sub v14.8h, v10.8h, v24.8h // .....................................e....................................'......................................~................................. + // add v10.8h, v10.8h, v24.8h // .........................................................................e'........................................................................ + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ................e.........................................................'.................~...................................................... + // mul v24.8h, v15.8h, v0.h[0] // ...............e..........................................................'................~....................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................e...........................'...............................................~........................ + // sub v15.8h, v11.8h, v24.8h // ....................................................................e.....'.....................................................................~.. + // add v11.8h, v11.8h, v24.8h // ..........................................................e...............'...........................................................~............ 
+ // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....~....................................................................'......*................................................................. + // mul v24.8h, v10.8h, v0.h[2] // ........~.................................................................'.........*.............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..............~...........................................................'...............*........................................................ + // sub v10.8h, v8.8h, v24.8h // ......................~...................................................'.......................*................................................ + // add v8.8h, v8.8h, v24.8h // ........................~.................................................'.........................*.............................................. + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..................................................................e.......'...................................................................~.... + // mul v24.8h, v11.8h, v0.h[2] // .....................................................................e....'......................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ....~.....................................................................'.....*.................................................................. + // sub v11.8h, v9.8h, v24.8h // .............................~............................................'..............................*......................................... + // add v9.8h, v9.8h, v24.8h // .........................~................................................'..........................*............................................. 
+ // sqrdmulh v27.8h, v14.8h, v0.h[5] // ........................................................e.................'.........................................................~.............. + // mul v24.8h, v14.8h, v0.h[4] // ....................................................e.....................'.....................................................~.................. + // mls v24.8h, v27.8h, v7.h[0] // ........................................................................e.'........................................................................ + // sub v14.8h, v12.8h, v24.8h // .......................................~..................................'........................................*............................... + // add v12.8h, v12.8h, v24.8h // .......................~..................................................'........................*............................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .~........................................................................'..*..................................................................... + // mul v24.8h, v15.8h, v0.h[4] // ..........................................................................'*....................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........~...............................................................'...........*............................................................ + // sub v15.8h, v13.8h, v24.8h // ..............................~...........................................'...............................*........................................ + // add v13.8h, v13.8h, v24.8h // ...............................~..........................................'................................*....................................... 
+ // sqrdmulh v27.8h, v9.8h, v0.h[7] // ................................~.........................................'.................................*...................................... + // mul v24.8h, v9.8h, v0.h[6] // ............................~.............................................'.............................*.......................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................~...................................'.......................................*................................ + // sub v9.8h, v8.8h, v24.8h // ...................................................~......................'....................................................*................... + // add v8.8h, v8.8h, v24.8h // ............................................~.............................'.............................................*.......................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // .................................~........................................'..................................*..................................... + // mul v24.8h, v11.8h, v1.h[0] // ...................................~......................................'....................................*................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................~................................'..........................................*............................. + // sub v11.8h, v10.8h, v24.8h // ...............................................~..........................'................................................*....................... + // add v10.8h, v10.8h, v24.8h // ................................................................~.........'.................................................................*...... 
+ // sqrdmulh v27.8h, v13.8h, v1.h[3] // ........................................~.................................'.........................................*.............................. + // mul v24.8h, v13.8h, v1.h[2] // ...........................................~..............................'............................................*........................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................~........................'..................................................*..................... + // sub v13.8h, v12.8h, v24.8h // ...........................................................~..............'............................................................*........... + // add v12.8h, v12.8h, v24.8h // .............................................................~............'..............................................................*......... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..................................~.......................................'...................................*.................................... + // mul v24.8h, v15.8h, v1.h[4] // ....................................~.....................................'.....................................*.................................. + // mls v24.8h, v27.8h, v7.h[0] // ..........................................~...............................'...........................................*............................ + // sub v15.8h, v14.8h, v24.8h // ................................................~.........................'.................................................*...................... + // add v14.8h, v14.8h, v24.8h // ......................................................~...................'.......................................................*................ 
+ // str q8, [x0], #(16) // ..................................................~.......................'...................................................*.................... + // str q9, [x0, #(-16 + 1*(512/8))] // .........................................................~................'..........................................................*............. + // str q10, [x0, #(-16 + 2*(512/8))] // ......................................................................~...'.......................................................................* + // str q11, [x0, #(-16 + 3*(512/8))] // .....................................................~....................'......................................................*................. + // str q12, [x0, #(-16 + 4*(512/8))] // ...................................................................~......'....................................................................*... + // str q13, [x0, #(-16 + 5*(512/8))] // .................................................................~........'..................................................................*..... + // str q14, [x0, #(-16 + 6*(512/8))] // ............................................................~.............'.............................................................*.......... + // str q15, [x0, #(-16 + 7*(512/8))] // .......................................................~..................'........................................................*............... sub count, count, #1 cbnz count, layer123_start - str q30, [x0, #64] // *................... - mul v20.8H, v21.8H, v1.H[2] // .*.................. - sqrdmulh v12.8H, v21.8H, v1.H[3] // ..*................. - // gap // .................... - sub v30.8H, v19.8H, v6.8H // ...*................ - add v27.8H, v10.8H, v27.8H // ....*............... - // gap // .................... - // gap // .................... - mls v9.8H, v8.8H, v7.H[0] // .....*.............. - // gap // .................... 
- // gap // .................... - // gap // .................... - sub v26.8H, v3.8H, v22.8H // ......*............. - str q27, [x0], #(16) // ........*........... - sub v16.8H, v30.8H, v28.8H // .........*.......... - // gap // .................... - add v18.8H, v30.8H, v28.8H // ..............*..... - mls v20.8H, v12.8H, v7.H[0] // .......*............ - // gap // .................... - // gap // .................... - str q26, [x0, #176] // ..........*......... - add v25.8H, v16.8H, v9.8H // ...........*........ - // gap // .................... - // gap // .................... - sub v22.8H, v16.8H, v9.8H // ............*....... - // gap // .................... - // gap // .................... - // gap // .................... - str q25, [x0, #368] // ...............*.... - sub v19.8H, v18.8H, v20.8H // ................*... - // gap // .................... - // gap // .................... - add v18.8H, v18.8H, v20.8H // ..................*. - str q22, [x0, #432] // .............*...... - // gap // .................... - // gap // .................... - str q19, [x0, #304] // .................*.. - // gap // .................... - // gap // .................... - // gap // .................... - str q18, [x0, #240] // ...................* - // gap // .................... - // gap // .................... - // gap // .................... + // Instructions: 49 + // Expected cycles: 26 + // Expected IPC: 1.88 + // + // Cycle bound: 26.0 + // IPC bound: 1.88 + // + // Wall time: 1.42s + // User time: 1.42s + // + // -------------- original position ---------------> + // 0 25 + // |------------------------|----------------------- + sqrdmulh v13.8H, v14.8H, v0.H[1] // *................................................ + mul v14.8H, v14.8H, v0.H[0] // ...*............................................. + // gap // ................................................. + // gap // ................................................. 
+ sub v16.8H, v9.8H, v23.8H // .......*......................................... + mul v6.8H, v26.8H, v0.H[4] // .*............................................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v25.8H, v12.8H, v0.H[3] // .....*........................................... + mul v22.8H, v12.8H, v0.H[2] // ......*.......................................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v26.8H, v26.8H, v0.H[5] // ..*.............................................. + // gap // ................................................. + mls v14.8H, v13.8H, v7.H[0] // .........*....................................... + // gap // ................................................. + add v3.8H, v16.8H, v11.8H // .............*................................... + mls v15.8H, v27.8H, v7.H[0] // ....*............................................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v22.8H, v25.8H, v7.H[0] // ..........*...................................... + // gap // ................................................. + // gap // ................................................. + add v27.8H, v5.8H, v14.8H // ...........*..................................... + sub v16.8H, v16.8H, v11.8H // ...........................*..................... + // gap // ................................................. + // gap // ................................................. + mls v6.8H, v26.8H, v7.H[0] // ........*........................................ + // gap // ................................................. + // gap // ................................................. 
+ sub v14.8H, v5.8H, v14.8H // ................*................................ + add v13.8H, v27.8H, v15.8H // ...............*................................. + // gap // ................................................. + add v26.8H, v31.8H, v22.8H // ..............*.................................. + // gap // ................................................. + sub v11.8H, v31.8H, v22.8H // ............*.................................... + // gap // ................................................. + // gap // ................................................. + sub v15.8H, v27.8H, v15.8H // ..................*.............................. + mul v28.8H, v13.8H, v0.H[6] // .................*............................... + sqrdmulh v21.8H, v13.8H, v0.H[7] // .....................*........................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v31.8H, v15.8H, v1.H[1] // ......................*.......................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v15.8H, v15.8H, v1.H[0] // ........................*........................ + add v27.8H, v14.8H, v6.8H // ....................*............................ + // gap // ................................................. + // gap // ................................................. + sub v14.8H, v14.8H, v6.8H // ...................*............................. + mls v28.8H, v21.8H, v7.H[0] // ..........................*...................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v22.8H, v27.8H, v1.H[3] // ............................*.................... + mul v27.8H, v27.8H, v1.H[2] // ...............................*................. 
+ // gap // ................................................. + // gap // ................................................. + mls v15.8H, v31.8H, v7.H[0] // .............................*................... + sqrdmulh v25.8H, v14.8H, v1.H[5] // .......................*......................... + // gap // ................................................. + // gap // ................................................. + mul v14.8H, v14.8H, v1.H[4] // .........................*....................... + add v21.8H, v26.8H, v28.8H // ................................*................ + // gap // ................................................. + // gap // ................................................. + mls v27.8H, v22.8H, v7.H[0] // ...................................*............. + sub v6.8H, v26.8H, v28.8H // .....................................*........... + // gap // ................................................. + // gap // ................................................. + sub v13.8H, v11.8H, v15.8H // .................................*............... + add v15.8H, v11.8H, v15.8H // .............................................*... + str q21, [x0], #(16) // ....................................*............ + // gap // ................................................. + str q6, [x0, #48] // .........................................*....... + mls v14.8H, v25.8H, v7.H[0] // ..............................*.................. + // gap // ................................................. + // gap // ................................................. + sub v26.8H, v3.8H, v27.8H // ..........................................*...... + add v27.8H, v3.8H, v27.8H // ............................................*.... + str q13, [x0, #176] // ......................................*.......... + // gap // ................................................. 
+ str q15, [x0, #112] // ................................................* + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v6.8H, v16.8H, v14.8H // ..................................*.............. + add v16.8H, v16.8H, v14.8H // .......................................*......... + str q26, [x0, #304] // ..............................................*.. + // gap // ................................................. + str q27, [x0, #240] // ...............................................*. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + str q6, [x0, #432] // ........................................*........ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + str q16, [x0, #368] // ...........................................*..... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. - // original source code - // str q30, [x0, #64] // *................... - // mul v20.8H, v21.8H, v1.H[2] // .*.................. - // sqrdmulh v2.8H, v21.8H, v1.H[3] // ..*................. - // sub v30.8H, v19.8H, v6.8H // ...*................ - // add v27.8H, v10.8H, v27.8H // ....*............... - // mls v9.8H, v8.8H, v7.H[0] // .....*.............. - // sub v12.8H, v3.8H, v22.8H // ......*............. - // mls v20.8H, v2.8H, v7.H[0] // ..........*......... - // str q27, [x0], #(16) // .......*............ - // sub v18.8H, v30.8H, v28.8H // ........*........... - // str q12, [x0, #176] // ...........*........ 
- // add v17.8H, v18.8H, v9.8H // ............*....... - // sub v9.8H, v18.8H, v9.8H // .............*...... - // str q9, [x0, #432] // .................*.. - // add v30.8H, v30.8H, v28.8H // .........*.......... - // str q17, [x0, #368] // ..............*..... - // sub v12.8H, v30.8H, v20.8H // ...............*.... - // str q12, [x0, #304] // ..................*. - // add v11.8H, v30.8H, v20.8H // ................*... - // str q11, [x0, #240] // ...................* + // ----------------- new position -----------------> + // 0 25 + // |------------------------|----------------------- + // sqrdmulh v13.8H, v14.8H, v0.H[1] // *................................................ + // mul v6.8H, v26.8H, v0.H[4] // ...*............................................. + // sqrdmulh v16.8H, v26.8H, v0.H[5] // ......*.......................................... + // mul v14.8H, v14.8H, v0.H[0] // .*............................................... + // mls v15.8H, v27.8H, v7.H[0] // .........*....................................... + // sqrdmulh v26.8H, v12.8H, v0.H[3] // ....*............................................ + // mul v27.8H, v12.8H, v0.H[2] // .....*........................................... + // sub v22.8H, v9.8H, v23.8H // ..*.............................................. + // mls v6.8H, v16.8H, v7.H[0] // .............*................................... + // mls v14.8H, v13.8H, v7.H[0] // .......*......................................... + // mls v27.8H, v26.8H, v7.H[0] // ..........*...................................... + // add v16.8H, v5.8H, v14.8H // ...........*..................................... + // sub v19.8H, v31.8H, v27.8H // .................*............................... + // add v2.8H, v22.8H, v11.8H // ........*........................................ + // add v30.8H, v31.8H, v27.8H // ................*................................ + // add v26.8H, v16.8H, v15.8H // ...............*................................. 
+ // sub v27.8H, v5.8H, v14.8H // ..............*.................................. + // mul v13.8H, v26.8H, v0.H[6] // ...................*............................. + // sub v8.8H, v16.8H, v15.8H // ..................*.............................. + // sub v16.8H, v27.8H, v6.8H // ........................*........................ + // add v28.8H, v27.8H, v6.8H // .......................*......................... + // sqrdmulh v27.8H, v26.8H, v0.H[7] // ....................*............................ + // sqrdmulh v15.8H, v8.8H, v1.H[1] // .....................*........................... + // sqrdmulh v6.8H, v16.8H, v1.H[5] // .............................*................... + // mul v26.8H, v8.8H, v1.H[0] // ......................*.......................... + // mul v16.8H, v16.8H, v1.H[4] // ..............................*.................. + // mls v13.8H, v27.8H, v7.H[0] // .........................*....................... + // sub v22.8H, v22.8H, v11.8H // ............*.................................... + // sqrdmulh v27.8H, v28.8H, v1.H[3] // ..........................*...................... + // mls v26.8H, v15.8H, v7.H[0] // ............................*.................... + // mls v16.8H, v6.8H, v7.H[0] // ......................................*.......... + // mul v14.8H, v28.8H, v1.H[2] // ...........................*..................... + // add v11.8H, v30.8H, v13.8H // ...............................*................. + // sub v6.8H, v19.8H, v26.8H // ..................................*.............. + // sub v15.8H, v22.8H, v16.8H // ...........................................*..... + // mls v14.8H, v27.8H, v7.H[0] // ................................*................ + // str q11, [x0], #(16) // ....................................*............ + // sub v27.8H, v30.8H, v13.8H // .................................*............... + // str q6, [x0, #176] // .........................................*....... 
+ // add v3.8H, v22.8H, v16.8H // ............................................*.... + // str q15, [x0, #432] // ...............................................*. + // str q27, [x0, #48] // .....................................*........... + // sub v27.8H, v2.8H, v14.8H // .......................................*......... + // str q3, [x0, #368] // ................................................* + // add v15.8H, v2.8H, v14.8H // ........................................*........ + // add v16.8H, v19.8H, v26.8H // ...................................*............. + // str q27, [x0, #304] // .............................................*... + // str q15, [x0, #240] // ..............................................*.. + // str q16, [x0, #112] // ..........................................*...... restore inp, STACK0 mov count, #8 .p2align 2 - ldr q14, [x1, #48] // *.................................. - ldr q2, [x3], #16 // .*................................. - // gap // ................................... - // gap // ................................... - ldr q5, [x1, #32] // ..*................................ - // gap // ................................... - // gap // ................................... - // gap // ................................... - ldr q1, [x1, #0] // .....*............................. - // gap // ................................... - // gap // ................................... - // gap // ................................... - ldr q23, [x4], #(6*16) // ........*.......................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sqrdmulh v22.8H, v14.8H, v2.H[1] // ......*............................ - mul v11.8H, v14.8H, v2.H[0] // .......*........................... - ldr q16, [x4, #-48] // ............*...................... - // gap // ................................... - sqrdmulh v8.8H, v5.8H, v2.H[1] // .........*......................... 
- ldr q18, [x1, #16] // ...*............................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v11.8H, v22.8H, v7.H[0] // ...........*....................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mul v17.8H, v5.8H, v2.H[0] // ..........*........................ - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - add v13.8H, v18.8H, v11.8H // .............*..................... - sub v24.8H, v18.8H, v11.8H // ....................*.............. - // gap // ................................... - // gap // ................................... - mls v17.8H, v8.8H, v7.H[0] // ..............*.................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sqrdmulh v27.8H, v13.8H, v2.H[3] // .................*................. - mul v13.8H, v13.8H, v2.H[2] // ................*.................. - // gap // ................................... - // gap // ................................... - sqrdmulh v19.8H, v24.8H, v2.H[5] // .....................*............. - mul v10.8H, v24.8H, v2.H[4] // ......................*............ - // gap // ................................... - // gap // ................................... - sub v12.8H, v1.8H, v17.8H // .........................*......... 
- // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v13.8H, v27.8H, v7.H[0] // ...................*............... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v10.8H, v19.8H, v7.H[0] // ........................*.......... - add v2.8H, v1.8H, v17.8H // ..................*................ - ldr q1, [x4, #-64] // ...............*................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - add v24.8H, v2.8H, v13.8H // .......................*........... - sub v0.8H, v2.8H, v13.8H // ...........................*....... - // gap // ................................... - // gap // ................................... - add v8.8H, v12.8H, v10.8H // ............................*...... - sub v26.8H, v12.8H, v10.8H // ..........................*........ - // gap // ................................... - ldr q10, [x4, #-80] // ....*.............................. - trn1 v9.4S, v24.4S, v0.4S // ...............................*... - // gap // ................................... - // gap // ................................... - // gap // ................................... - trn2 v0.4S, v24.4S, v0.4S // .............................*..... - trn2 v12.4S, v8.4S, v26.4S // ..............................*.... - // gap // ................................... - // gap // ................................... - trn1 v18.4S, v8.4S, v26.4S // ................................*.. - // gap // ................................... - // gap // ................................... - // gap // ................................... 
- trn1 v20.2D, v0.2D, v12.2D // ..................................* - trn2 v11.2D, v0.2D, v12.2D // .................................*. - // gap // ................................... - // gap // ................................... + // Instructions: 48 + // Expected cycles: 34 + // Expected IPC: 1.41 + // + // Cycle bound: 34.0 + // IPC bound: 1.41 + // + // Wall time: 1.18s + // User time: 1.18s + // + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + ldr q15, [x1, #48] // .*.............................................. + // gap // ................................................ + // gap // ................................................ + ldr q12, [x3], #16 // *............................................... + ldr q27, [x1, #32] // ....*........................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + ldr q25, [x4, #48] // ...*............................................ + ldr q28, [x1, #0] // ........*....................................... + // gap // ................................................ + // gap // ................................................ + ldr q3, [x4, #16] // ......*......................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + ldr q8, [x4, #32] // .....*.......................................... + mul v16.8H, v15.8H, v12.H[0] // .........*...................................... + sqrdmulh v31.8H, v15.8H, v12.H[1] // .......*........................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + mul v26.8H, v27.8H, v12.H[0] // ...........*.................................... + ldr q30, [x1, #16] // ..........*..................................... + sqrdmulh v6.8H, v27.8H, v12.H[1] // .............*.................................. + // gap // ................................................ + mls v16.8H, v31.8H, v7.H[0] // ............*................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v26.8H, v6.8H, v7.H[0] // ...................*............................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v14.8H, v30.8H, v16.8H // ...............*................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v15.8H, v30.8H, v16.8H // ..............*................................. + // gap // ................................................ + // gap // ................................................ + sqrdmulh v5.8H, v14.8H, v12.H[3] // ..................*............................. + mul v14.8H, v14.8H, v12.H[2] // ....................*........................... + // gap // ................................................ 
+ // gap // ................................................ + sqrdmulh v16.8H, v15.8H, v12.H[5] // ................*............................... + mul v27.8H, v15.8H, v12.H[4] // .................*.............................. + // gap // ................................................ + // gap // ................................................ + sub v15.8H, v28.8H, v26.8H // .......................*........................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v14.8H, v5.8H, v7.H[0] // ......................*......................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v27.8H, v16.8H, v7.H[0] // .....................*.......................... + add v16.8H, v28.8H, v26.8H // ........................*....................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v4.8H, v16.8H, v14.8H // ...........................*.................... + add v26.8H, v16.8H, v14.8H // ............................*................... + // gap // ................................................ + // gap // ................................................ + sub v30.8H, v15.8H, v27.8H // .........................*...................... + add v12.8H, v15.8H, v27.8H // ..........................*..................... + // gap // ................................................ + // gap // ................................................ 
+ trn2 v27.4S, v26.4S, v4.4S // ...............................*................ + // gap // ................................................ + ldr q14, [x4], #(6*16) // ..............................*................. + // gap // ................................................ + trn2 v6.4S, v12.4S, v30.4S // .............................*.................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + trn1 v15.4S, v26.4S, v4.4S // ..................................*............. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + trn2 v28.2D, v27.2D, v6.2D // ...................................*............ + trn1 v0.2D, v27.2D, v6.2D // ......................................*......... + trn1 v16.4S, v12.4S, v30.4S // .................................*.............. + // gap // ................................................ + // gap // ................................................ + mul v21.8H, v28.8H, v14.8H // .....................................*.......... + // gap // ................................................ + // gap // ................................................ + sqrdmulh v24.8H, v28.8H, v3.8H // ....................................*........... + trn1 v20.2D, v15.2D, v16.2D // ............................................*... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + trn2 v13.2D, v15.2D, v16.2D // .......................................*........ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + ldr q15, [x4, #-32] // ..*............................................. + // gap // ................................................ + mls v21.8H, v24.8H, v7.H[0] // ........................................*....... + ldr q9, [x4, #-16] // ................................*............... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mul v6.8H, v13.8H, v14.8H // .........................................*...... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v14.8H, v0.8H, v21.8H // .............................................*.. + // gap // ................................................ + // gap // ................................................ + sub v28.8H, v0.8H, v21.8H // ..........................................*..... + // gap // ................................................ + sqrdmulh v21.8H, v13.8H, v3.8H // ...........................................*.... + // gap // ................................................ + // gap // ................................................ + sqrdmulh v16.8H, v28.8H, v9.8H // ...............................................* + // gap // ................................................ + // gap // ................................................ + mul v0.8H, v28.8H, v15.8H // ..............................................*. - // original source code - // ldr q4, [x1, #48] // *.................................. - // ldr q8, [x3], #16 // .*................................. 
- // ldr q3, [x1, #32] // ..*................................ - // ldr q21, [x1, #16] // .........*......................... - // ldr q10, [x4, #16] // ............................*...... - // ldr q31, [x1, #0] // ...*............................... - // sqrdmulh v2.8H, v4.8H, v8.H[1] // .....*............................. - // mul v26.8H, v4.8H, v8.H[0] // ......*............................ - // ldr q23, [x4], #(6*16) // ....*.............................. - // sqrdmulh v5.8H, v3.8H, v8.H[1] // ........*.......................... - // mul v11.8H, v3.8H, v8.H[0] // ...........*....................... - // mls v26.8H, v2.8H, v7.H[0] // ..........*........................ - // ldr q16, [x4, #-48] // .......*........................... - // add v29.8H, v21.8H, v26.8H // ............*...................... - // mls v11.8H, v5.8H, v7.H[0] // ..............*.................... - // ldr q1, [x4, #-64] // .......................*........... - // mul v19.8H, v29.8H, v8.H[2] // ................*.................. - // sqrdmulh v29.8H, v29.8H, v8.H[3] // ...............*................... - // add v30.8H, v31.8H, v11.8H // ......................*............ - // mls v19.8H, v29.8H, v7.H[0] // ....................*.............. - // sub v28.8H, v21.8H, v26.8H // .............*..................... - // sqrdmulh v18.8H, v28.8H, v8.H[5] // .................*................. - // mul v26.8H, v28.8H, v8.H[4] // ..................*................ - // add v0.8H, v30.8H, v19.8H // ........................*.......... - // mls v26.8H, v18.8H, v7.H[0] // .....................*............. - // sub v31.8H, v31.8H, v11.8H // ...................*............... - // sub v24.8H, v31.8H, v26.8H // ...........................*....... - // sub v17.8H, v30.8H, v19.8H // .........................*......... - // add v6.8H, v31.8H, v26.8H // ..........................*........ - // trn2 v22.4S, v0.4S, v17.4S // ..............................*.... 
- // trn2 v27.4S, v6.4S, v24.4S // ...............................*... - // trn1 v9.4S, v0.4S, v17.4S // .............................*..... - // trn1 v18.4S, v6.4S, v24.4S // ................................*.. - // trn2 v11.2D, v22.2D, v27.2D // ..................................* - // trn1 v20.2D, v22.2D, v27.2D // .................................*. + // ---------------- new position -----------------> + // 0 25 + // |------------------------|---------------------- + // ldr q1, [x3], #16 // .*.............................................. + // ldr q3, [x1, #48] // *............................................... + // ldr q19, [x4, #64] // .......................................*........ + // ldr q25, [x4, #48] // ...*............................................ + // ldr q13, [x1, #32] // ..*............................................. + // ldr q8, [x4, #32] // ......*......................................... + // ldr q2, [x4, #16] // .....*.......................................... + // sqrdmulh v14.8H, v3.8H, v1.H[1] // ........*....................................... + // ldr q17, [x1, #0] // ....*........................................... + // mul v29.8H, v3.8H, v1.H[0] // .......*........................................ + // ldr q26, [x1, #16] // ..........*..................................... + // mul v22.8H, v13.8H, v1.H[0] // .........*...................................... + // mls v29.8H, v14.8H, v7.H[0] // ............*................................... + // sqrdmulh v18.8H, v13.8H, v1.H[1] // ...........*.................................... + // sub v30.8H, v26.8H, v29.8H // ...............*................................ + // add v0.8H, v26.8H, v29.8H // ..............*................................. + // sqrdmulh v15.8H, v30.8H, v1.H[5] // ..................*............................. + // mul v23.8H, v30.8H, v1.H[4] // ...................*............................ 
+ // sqrdmulh v26.8H, v0.8H, v1.H[3] // ................*............................... + // mls v22.8H, v18.8H, v7.H[0] // .............*.................................. + // mul v28.8H, v0.8H, v1.H[2] // .................*.............................. + // mls v23.8H, v15.8H, v7.H[0] // ......................*......................... + // mls v28.8H, v26.8H, v7.H[0] // .....................*.......................... + // sub v26.8H, v17.8H, v22.8H // ....................*........................... + // add v15.8H, v17.8H, v22.8H // .......................*........................ + // sub v14.8H, v26.8H, v23.8H // ..........................*..................... + // add v22.8H, v26.8H, v23.8H // ...........................*.................... + // sub v0.8H, v15.8H, v28.8H // ........................*....................... + // add v15.8H, v15.8H, v28.8H // .........................*...................... + // trn2 v26.4S, v22.4S, v14.4S // ..............................*................. + // ldr q13, [x4], #(6*16) // .............................*.................. + // trn2 v17.4S, v15.4S, v0.4S // ............................*................... + // ldr q23, [x4, #-16] // .........................................*...... + // trn1 v12.4S, v22.4S, v14.4S // ..................................*............. + // trn1 v1.4S, v15.4S, v0.4S // ...............................*................ + // trn2 v15.2D, v17.2D, v26.2D // ................................*............... + // sqrdmulh v14.8H, v15.8H, v2.8H // ....................................*........... + // mul v0.8H, v15.8H, v13.8H // ...................................*............ + // trn1 v11.2D, v17.2D, v26.2D // .................................*.............. + // trn2 v31.2D, v1.2D, v12.2D // ......................................*......... + // mls v0.8H, v14.8H, v7.H[0] // ........................................*....... 
+ // mul v6.8H, v31.8H, v13.8H // ..........................................*..... + // sub v15.8H, v11.8H, v0.8H // ............................................*... + // sqrdmulh v21.8H, v31.8H, v2.8H // .............................................*.. + // trn1 v20.2D, v1.2D, v12.2D // .....................................*.......... + // add v14.8H, v11.8H, v0.8H // ...........................................*.... + // mul v0.8H, v15.8H, v19.8H // ...............................................* + // sqrdmulh v16.8H, v15.8H, v23.8H // ..............................................*. sub count, count, #1 layer4567_start: - ldr q4, [x1, #112] // ...e............................................................................... - trn2 v22.2D, v9.2D, v18.2D // .............................*..................................................... - ldr q8, [x3], #16 // ....e.............................................................................. - trn1 v18.2D, v9.2D, v18.2D // ...............................*................................................... - sqrdmulh v14.8H, v11.8H, v10.8H // .............................................*..................................... - mul v0.8H, v11.8H, v23.8H // ............................................*...................................... - ldr q3, [x1, #96] // ..e................................................................................ - ldr q13, [x4, #-32] // .....................................*............................................. - ldr q9, [x4, #-16] // ......................................*............................................ - ldr q21, [x1, #80] // .e................................................................................. - sqrdmulh v19.8H, v22.8H, v10.8H // ........................................*.......................................... - mul v25.8H, v22.8H, v23.8H // .......................................*........................................... 
- ldr q10, [x4, #16] // ..................................e................................................ - ldr q31, [x1, #64] // e.................................................................................. + // Instructions: 83 + // Expected cycles: 35 + // Expected IPC: 2.37 + // + // Cycle bound: 34.0 + // IPC bound: 2.44 + // + // Wall time: 3600.55s + // User time: 3600.55s + // + // ------------------------------- original position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------- + sqrdmulh v15.8H, v14.8H, v25.8H // .................................................*................................. + mls v6.8H, v21.8H, v7.H[0] // .........................................*......................................... + ldr q1, [x3], #16 // ....e.............................................................................. + ldr q3, [x1, #112] // ...e............................................................................... + ldr q19, [x4, #64] // .....................................e............................................. + ldr q25, [x4, #48] // ....................................e.............................................. // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v2.8H, v4.8H, v8.H[1] // ...........e....................................................................... + mls v0.8H, v16.8H, v7.H[0] // ........................................................*.......................... // gap // ................................................................................... - mls v0.8H, v14.8H, v7.H[0] // ..............................................*.................................... + ldr q13, [x1, #96] // ..e................................................................................ 
+ mul v16.8H, v14.8H, v8.8H // ..................................................*................................ + ldr q8, [x4, #32] // ...................................e............................................... + sub v26.8H, v20.8H, v6.8H // ..........................................*........................................ + ldr q2, [x4, #16] // ..................................e................................................ + add v27.8H, v20.8H, v6.8H // ...........................................*....................................... + sqrdmulh v14.8H, v3.8H, v1.H[1] // ..........e........................................................................ // gap // ................................................................................... - mul v26.8H, v4.8H, v8.H[0] // ..........e........................................................................ - mls v25.8H, v19.8H, v7.H[0] // .........................................*......................................... + ldr q17, [x1, #64] // e.................................................................................. + mul v29.8H, v3.8H, v1.H[0] // ...........e....................................................................... + add v6.8H, v26.8H, v0.8H // ..........................................................*........................ + mls v16.8H, v15.8H, v7.H[0] // ...................................................*............................... // gap // ................................................................................... - ldr q23, [x4], #(6*16) // .................................e................................................. // gap // ................................................................................... - sqrdmulh v5.8H, v3.8H, v8.H[1] // ......e............................................................................ - mul v11.8H, v3.8H, v8.H[0] // .....e............................................................................. 
+ sub v3.8H, v26.8H, v0.8H // .........................................................*......................... // gap // ................................................................................... - add v4.8H, v20.8H, v0.8H // ................................................*.................................. - sub v19.8H, v20.8H, v0.8H // ...............................................*................................... + ldr q26, [x1, #80] // .e................................................................................. + mul v22.8H, v13.8H, v1.H[0] // ......e............................................................................ + sqdmulh v15.8H, v6.8H, v7.H[1] // .................................................................*................. // gap // ................................................................................... // gap // ................................................................................... - add v17.8H, v18.8H, v25.8H // ...........................................*....................................... - mls v26.8H, v2.8H, v7.H[0] // ............e...................................................................... + mls v29.8H, v14.8H, v7.H[0] // ............e...................................................................... + add v11.8H, v27.8H, v16.8H // .....................................................*............................. + sub v31.8H, v27.8H, v16.8H // ....................................................*.............................. // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v9.8H, v19.8H, v9.8H // .......................................................*........................... - mul v20.8H, v19.8H, v13.8H // ......................................................*............................ 
+ sqdmulh v16.8H, v3.8H, v7.H[1] // ....................................................................*.............. + sqrdmulh v18.8H, v13.8H, v1.H[1] // .....e............................................................................. // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v2.8H, v4.8H, v16.8H // ..................................................*................................ - mul v28.8H, v4.8H, v1.8H // .................................................*................................. + srshr v13.8H, v15.8H, #11 // ..................................................................*................ // gap // ................................................................................... - ldr q16, [x4, #-48] // ....................................e.............................................. - add v29.8H, v21.8H, v26.8H // ..............e.................................................................... - mls v11.8H, v5.8H, v7.H[0] // .......e........................................................................... - ldr q1, [x4, #-64] // ...................................e............................................... // gap // ................................................................................... - sub v22.8H, v18.8H, v25.8H // ..........................................*........................................ - mls v20.8H, v9.8H, v7.H[0] // ........................................................*.......................... + sub v30.8H, v26.8H, v29.8H // .............e..................................................................... + sqdmulh v27.8H, v11.8H, v7.H[1] // ...........................................................*....................... + add v0.8H, v26.8H, v29.8H // ..............e.................................................................... 
// gap // ................................................................................... // gap // ................................................................................... - mul v19.8H, v29.8H, v8.H[2] // ...............e................................................................... - mls v28.8H, v2.8H, v7.H[0] // ...................................................*............................... + sqrdmulh v15.8H, v30.8H, v1.H[5] // ....................e.............................................................. + srshr v16.8H, v16.8H, #11 // .....................................................................*............. // gap // ................................................................................... // gap // ................................................................................... - sqrdmulh v29.8H, v29.8H, v8.H[3] // ................e.................................................................. - add v30.8H, v31.8H, v11.8H // .........e......................................................................... + mul v23.8H, v30.8H, v1.H[4] // .....................e............................................................. // gap // ................................................................................... // gap // ................................................................................... - add v3.8H, v22.8H, v20.8H // ..........................................................*........................ - sub v5.8H, v22.8H, v20.8H // .........................................................*......................... + sqrdmulh v26.8H, v0.8H, v1.H[3] // ...............e................................................................... + mls v22.8H, v18.8H, v7.H[0] // .......e........................................................................... // gap // ................................................................................... 
+ mul v28.8H, v0.8H, v1.H[2] // ................e.................................................................. // gap // ................................................................................... - add v2.8H, v17.8H, v28.8H // .....................................................*............................. - sub v4.8H, v17.8H, v28.8H // ....................................................*.............................. + mls v3.8H, v16.8H, v7.H[0] // ......................................................................*............ // gap // ................................................................................... // gap // ................................................................................... - sqdmulh v9.8H, v5.8H, v7.H[1] // ....................................................................*.............. + srshr v27.8H, v27.8H, #11 // ............................................................*...................... + mls v23.8H, v15.8H, v7.H[0] // ......................e............................................................ // gap // ................................................................................... // gap // ................................................................................... - sqdmulh v20.8H, v3.8H, v7.H[1] // .................................................................*................. - sqdmulh v12.8H, v2.8H, v7.H[1] // ...........................................................*....................... + sqdmulh v16.8H, v31.8H, v7.H[1] // ..............................................................*.................... + mls v28.8H, v26.8H, v7.H[0] // .................e................................................................. + sub v26.8H, v17.8H, v22.8H // ........e.......................................................................... // gap // ................................................................................... 
// gap // ................................................................................... - sqdmulh v27.8H, v4.8H, v7.H[1] // ..............................................................*.................... - mls v19.8H, v29.8H, v7.H[0] // .................e................................................................. - sub v28.8H, v21.8H, v26.8H // .............e..................................................................... + mls v11.8H, v27.8H, v7.H[0] // .............................................................*..................... + add v15.8H, v17.8H, v22.8H // .........e......................................................................... // gap // ................................................................................... // gap // ................................................................................... - srshr v22.8H, v20.8H, #11 // ..................................................................*................ + sub v14.8H, v26.8H, v23.8H // .......................e........................................................... // gap // ................................................................................... - srshr v20.8H, v9.8H, #11 // .....................................................................*............. // gap // ................................................................................... - srshr v9.8H, v27.8H, #11 // ...............................................................*................... + add v22.8H, v26.8H, v23.8H // ........................e.......................................................... + sub v0.8H, v15.8H, v28.8H // ..................e................................................................ // gap // ................................................................................... // gap // ................................................................................... 
- srshr v27.8H, v12.8H, #11 // ............................................................*...................... - sqrdmulh v18.8H, v28.8H, v8.H[5] // .....................e............................................................. - mul v26.8H, v28.8H, v8.H[4] // ....................e.............................................................. + add v15.8H, v15.8H, v28.8H // ...................e............................................................... + srshr v27.8H, v16.8H, #11 // ...............................................................*................... // gap // ................................................................................... // gap // ................................................................................... - mls v5.8H, v20.8H, v7.H[0] // ......................................................................*............ + trn2 v26.4S, v22.4S, v14.4S // ............................e...................................................... + mls v6.8H, v13.8H, v7.H[0] // ...................................................................*............... + ldr q13, [x4], #(6*16) // .................................e................................................. + trn2 v17.4S, v15.4S, v0.4S // ..........................e........................................................ // gap // ................................................................................... - mls v3.8H, v22.8H, v7.H[0] // ...................................................................*............... + ldr q23, [x4, #-16] // ......................................e............................................ + trn1 v12.4S, v22.4S, v14.4S // ...........................e....................................................... + trn1 v1.4S, v15.4S, v0.4S // .........................e......................................................... // gap // ................................................................................... 
- mls v4.8H, v9.8H, v7.H[0] // ................................................................*.................. - mls v2.8H, v27.8H, v7.H[0] // .............................................................*..................... + mls v31.8H, v27.8H, v7.H[0] // ................................................................*.................. // gap // ................................................................................... // gap // ................................................................................... - add v0.8H, v30.8H, v19.8H // ...................e............................................................... - mls v26.8H, v18.8H, v7.H[0] // ......................e............................................................ + trn2 v15.2D, v17.2D, v26.2D // ..............................e.................................................... + trn2 v16.4S, v6.4S, v3.4S // ..........................................................................*........ + trn1 v29.4S, v6.4S, v3.4S // .........................................................................*......... // gap // ................................................................................... // gap // ................................................................................... - sub v31.8H, v31.8H, v11.8H // ........e.......................................................................... - trn2 v8.4S, v3.4S, v5.4S // ..........................................................................*........ + sqrdmulh v14.8H, v15.8H, v2.8H // ............................................e...................................... // gap // ................................................................................... + mul v0.8H, v15.8H, v13.8H // .............................................e..................................... // gap // ................................................................................... 
- trn1 v22.4S, v2.4S, v4.4S // .......................................................................*........... - trn2 v11.4S, v2.4S, v4.4S // ........................................................................*.......... + trn2 v6.4S, v11.4S, v31.4S // ........................................................................*.......... + trn1 v22.4S, v11.4S, v31.4S // .......................................................................*........... // gap // ................................................................................... // gap // ................................................................................... - sub v24.8H, v31.8H, v26.8H // .......................e........................................................... - trn1 v20.4S, v3.4S, v5.4S // .........................................................................*......... + trn1 v11.2D, v17.2D, v26.2D // ................................e.................................................. + trn2 v31.2D, v1.2D, v12.2D // .............................e..................................................... // gap // ................................................................................... // gap // ................................................................................... - trn1 v14.2D, v11.2D, v8.2D // ..............................................................................*.... - trn2 v11.2D, v11.2D, v8.2D // ............................................................................*...... + mls v0.8H, v14.8H, v7.H[0] // ..............................................e.................................... // gap // ................................................................................... // gap // ................................................................................... - sub v17.8H, v30.8H, v19.8H // ..................e................................................................ 
- add v6.8H, v31.8H, v26.8H // ........................e.......................................................... + trn2 v15.2D, v22.2D, v29.2D // ...........................................................................*....... + trn2 v14.2D, v6.2D, v16.2D // ............................................................................*...... + trn1 v27.2D, v6.2D, v16.2D // ..............................................................................*.... // gap // ................................................................................... // gap // ................................................................................... - str q14, [x1, #16] // ................................................................................*.. - trn1 v9.2D, v22.2D, v20.2D // .............................................................................*..... - trn2 v2.2D, v22.2D, v20.2D // ...........................................................................*....... + trn1 v16.2D, v22.2D, v29.2D // .............................................................................*..... + mul v6.8H, v31.8H, v13.8H // ........................................e.......................................... // gap // ................................................................................... - trn2 v22.4S, v0.4S, v17.4S // ..........................e........................................................ - str q11, [x1, #48] // ..................................................................................* + str q15, [x1, #32] // .................................................................................*. + sub v15.8H, v11.8H, v0.8H // ...............................................e................................... + str q14, [x1, #48] // ..................................................................................* + sqrdmulh v21.8H, v31.8H, v2.8H // .......................................e........................................... 
// gap // ................................................................................... - trn2 v27.4S, v6.4S, v24.4S // ............................e...................................................... - str q9, [x1], #64 // ...............................................................................*... - trn1 v9.4S, v0.4S, v17.4S // .........................e......................................................... - trn1 v18.4S, v6.4S, v24.4S // ...........................e....................................................... + trn1 v20.2D, v1.2D, v12.2D // ...............................e................................................... + add v14.8H, v11.8H, v0.8H // ................................................e.................................. + str q16, [x1], #64 // ...............................................................................*... // gap // ................................................................................... - trn2 v11.2D, v22.2D, v27.2D // ..............................e.................................................... - trn1 v20.2D, v22.2D, v27.2D // ................................e.................................................. - str q2, [x1, #-32] // .................................................................................*. + mul v0.8H, v15.8H, v19.8H // .......................................................e........................... + sqrdmulh v16.8H, v15.8H, v23.8H // ......................................................e............................ + str q27, [x1, #-48] // ................................................................................*.. // gap // ................................................................................... - // original source code - // ldr q8, [x1, #(16*0)] // .............e.....................................................................|............e..................................................................... 
- // ldr q9, [x1, #(16*1)] // .........e.........................................................................|........e......................................................................... - // ldr q10, [x1, #(16*2)] // ......e............................................................................|.....e............................................................................ - // ldr q11, [x1, #(16*3)] // e..................................................................................e.................................................................................. - // ldr q0, [x3], #16 // ..e................................................................................|.e................................................................................ - // mul v24.8h, v10.8h, v0.h[0] // ....................e..............................................................|...................e.............................................................. - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ...................e...............................................................|..................e............................................................... - // mls v24.8h, v10.8h, v7.h[0] // ...............................e...................................................|..............................e................................................... - // sub v10.8h, v8.8h, v24.8h // .............................................................e.....................|............................................................e..................... - // add v8.8h, v8.8h, v24.8h // ......................................e............................................|.....................................e............................................ 
- // mul v24.8h, v11.8h, v0.h[0] // ................e..................................................................|...............e.................................................................. - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..............e....................................................................|.............e.................................................................... - // mls v24.8h, v11.8h, v7.h[0] // ........................e..........................................................|.......................e.......................................................... - // sub v11.8h, v9.8h, v24.8h // ................................................e..................................|...............................................e.................................. - // add v9.8h, v9.8h, v24.8h // ..............................e....................................................|.............................e.................................................... - // mul v24.8h, v9.8h, v0.h[2] // ...................................e...............................................|..................................e............................................... - // sqrdmulh v9.8h, v9.8h, v0.h[3] // .....................................e.............................................|....................................e............................................. - // mls v24.8h, v9.8h, v7.h[0] // ...............................................e...................................|..............................................e................................... - // sub v9.8h, v8.8h, v24.8h // .....................................................................e.............|....................................................................e............. 
- // add v8.8h, v8.8h, v24.8h // ...........................................................e.......................|..........................................................e....................... - // mul v24.8h, v11.8h, v0.h[4] // ......................................................e............................|.....................................................e............................ - // sqrdmulh v11.8h, v11.8h, v0.h[5] // .....................................................e.............................|....................................................e............................. - // mls v24.8h, v11.8h, v7.h[0] // ............................................................e......................|...........................................................e...................... - // sub v11.8h, v10.8h, v24.8h // .................................................................e.................|................................................................e................. - // add v10.8h, v10.8h, v24.8h // ......................................................................e............|.....................................................................e............ - // trn1 v25.4s, v8.4s, v9.4s // ..............................................................................e....|.............................................................................e.... - // trn2 v26.4s, v8.4s, v9.4s // ..........................................................................e........|.........................................................................e........ - // trn1 v27.4s, v10.4s, v11.4s // ...............................................................................e...|..............................................................................e... 
- // trn2 v28.4s, v10.4s, v11.4s // ............................................................................e......|...........................................................................e...... - // trn2 v10.2d, v25.2d, v27.2d // .*.................................................................................|*................................................................................. - // trn2 v11.2d, v26.2d, v28.2d // ................................................................................e..|...............................................................................e.. - // trn1 v8.2d, v25.2d, v27.2d // ...*...............................................................................|..*............................................................................... - // trn1 v9.2d, v26.2d, v28.2d // .................................................................................e.|................................................................................e. - // ldr q0, [x4], #(6*16) // ..................e................................................................|.................e................................................................ - // ldr q4, [x4, #(-6*16 + 1*16)] // ............e......................................................................|...........e...................................................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ................................e..................................................|...............................e.................................................. - // ldr q5, [x4, #(-6*16 + 3*16)] // .............................e.....................................................|............................e..................................................... 
- // ldr q2, [x4, #(-6*16 + 4*16)] // .......*...........................................................................|......*........................................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ........*..........................................................................|.......*.......................................................................... - // mul v24.8h, v10.8h, v0.8h // ...........*.......................................................................|..........*....................................................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // ..........*........................................................................|.........*........................................................................ - // mls v24.8h, v10.8h, v7.h[0] // .................*.................................................................|................*................................................................. - // sub v10.8h, v8.8h, v24.8h // .................................*.................................................|................................*................................................. - // add v8.8h, v8.8h, v24.8h // .......................*...........................................................|......................*........................................................... - // mul v24.8h, v11.8h, v0.8h // .....*.............................................................................|....*............................................................................. - // sqrdmulh v11.8h, v11.8h, v4.8h // ....*..............................................................................|...*.............................................................................. 
- // mls v24.8h, v11.8h, v7.h[0] // ...............*...................................................................|..............*................................................................... - // sub v11.8h, v9.8h, v24.8h // ......................*............................................................|.....................*............................................................ - // add v9.8h, v9.8h, v24.8h // .....................*.............................................................|....................*............................................................. - // mul v24.8h, v9.8h, v1.8h // ............................*......................................................|...........................*...................................................... - // sqrdmulh v9.8h, v9.8h, v5.8h // ...........................*.......................................................|..........................*....................................................... - // mls v24.8h, v9.8h, v7.h[0] // ....................................*..............................................|...................................*.............................................. - // sub v9.8h, v8.8h, v24.8h // ..........................................*........................................|.........................................*........................................ - // add v8.8h, v8.8h, v24.8h // .........................................*.........................................|........................................*......................................... - // mul v24.8h, v11.8h, v2.8h // ..........................*........................................................|.........................*........................................................ 
- // sqrdmulh v11.8h, v11.8h, v6.8h // .........................*.........................................................|........................*......................................................... - // mls v24.8h, v11.8h, v7.h[0] // ..................................*................................................|.................................*................................................ - // sub v11.8h, v10.8h, v24.8h // ........................................*..........................................|.......................................*.......................................... - // add v10.8h, v10.8h, v24.8h // .......................................*...........................................|......................................*........................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // .............................................*.....................................|............................................*..................................... - // srshr v25.8h, v25.8h, #11 // ....................................................*..............................|...................................................*.............................. - // mls v8.8h, v25.8h, v7.h[0] // ..........................................................*........................|.........................................................*........................ - // sqdmulh v25.8h, v9.8h, v7.h[1] // ..............................................*....................................|.............................................*.................................... - // srshr v25.8h, v25.8h, #11 // ...................................................*...............................|..................................................*............................... 
- // mls v9.8h, v25.8h, v7.h[0] // .........................................................*.........................|........................................................*......................... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ............................................*......................................|...........................................*...................................... - // srshr v25.8h, v25.8h, #11 // .................................................*.................................|................................................*................................. - // mls v10.8h, v25.8h, v7.h[0] // ........................................................*..........................|.......................................................*.......................... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ...........................................*.......................................|..........................................*....................................... - // srshr v25.8h, v25.8h, #11 // ..................................................*................................|.................................................*................................ - // mls v11.8h, v25.8h, v7.h[0] // .......................................................*...........................|......................................................*........................... - // trn1 v25.4s, v8.4s, v9.4s // ...............................................................*...................|..............................................................*................... - // trn2 v26.4s, v8.4s, v9.4s // ................................................................*..................|...............................................................*.................. 
- // trn1 v27.4s, v10.4s, v11.4s // ..................................................................*................|.................................................................*................ - // trn2 v28.4s, v10.4s, v11.4s // ..............................................................*....................|.............................................................*.................... - // trn2 v10.2d, v25.2d, v27.2d // .........................................................................*.........|........................................................................*......... - // trn2 v11.2d, v26.2d, v28.2d // ....................................................................*..............|...................................................................*.............. - // trn1 v8.2d, v25.2d, v27.2d // ........................................................................*..........|.......................................................................*.......... - // trn1 v9.2d, v26.2d, v28.2d // ...................................................................*...............|..................................................................*............... - // str q8, [x1], #64 // .............................................................................*.....|............................................................................*..... - // str q9, [x1, #(-(64) + 16*1)] // .......................................................................*...........|......................................................................*........... 
- // str q10, [x1, #(-(64) + 16*2)] // ..................................................................................*|.................................................................................* - // str q11, [x1, #(-(64) + 16*3)] // ...........................................................................*.......|..........................................................................*....... + // -------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q8, [x1, #(16*0)] // ............e....................................................................'.............~.................................................................... + // ldr q9, [x1, #(16*1)] // .................e...............................................................'..................~............................................................... + // ldr q10, [x1, #(16*2)] // .....e...........................................................................'......~........................................................................... + // ldr q11, [x1, #(16*3)] // .e...............................................................................'..~............................................................................... + // ldr q0, [x3], #16 // e................................................................................'.~................................................................................ + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ........................e........................................................'.........................~........................................................ 
+ // mul v24.8h, v10.8h, v0.h[0] // ..................e..............................................................'...................~.............................................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................e...............................................'..................................~............................................... + // sub v10.8h, v8.8h, v24.8h // ........................................e........................................'.........................................~........................................ + // add v8.8h, v8.8h, v24.8h // ..........................................e......................................'...........................................~...................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...........e.....................................................................'............~..................................................................... + // mul v24.8h, v11.8h, v0.h[0] // .............e...................................................................'..............~................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................e............................................................'.....................~............................................................ + // sub v11.8h, v9.8h, v24.8h // ..........................e......................................................'...........................~...................................................... + // add v9.8h, v9.8h, v24.8h // ............................e....................................................'.............................~.................................................... 
+ // sqrdmulh v27.8h, v9.8h, v0.h[3] // ................................e................................................'.................................~................................................ + // mul v24.8h, v9.8h, v0.h[2] // ..................................e..............................................'...................................~.............................................. + // mls v24.8h, v27.8h, v7.h[0] // .......................................e.........................................'........................................~......................................... + // sub v9.8h, v8.8h, v24.8h // .............................................e...................................'..............................................~................................... + // add v8.8h, v8.8h, v24.8h // ..............................................e..................................'...............................................~.................................. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // .............................e...................................................'..............................~................................................... + // mul v24.8h, v11.8h, v0.h[4] // ...............................e.................................................'................................~................................................. + // mls v24.8h, v27.8h, v7.h[0] // .....................................e...........................................'......................................~........................................... + // sub v11.8h, v10.8h, v24.8h // ...........................................e.....................................'............................................~..................................... 
+ // add v10.8h, v10.8h, v24.8h // ............................................e....................................'.............................................~.................................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................e..........................'.......................................................~.......................... + // trn2 v26.4s, v8.4s, v9.4s // ...................................................e.............................'....................................................~............................. + // trn1 v27.4s, v10.4s, v11.4s // .....................................................e...........................'......................................................~........................... + // trn2 v28.4s, v10.4s, v11.4s // ................................................e................................'.................................................~................................ + // trn2 v10.2d, v25.2d, v27.2d // ................................................................e................'.................................................................~................ + // trn2 v11.2d, v26.2d, v28.2d // ........................................................e........................'.........................................................~........................ + // trn1 v8.2d, v25.2d, v27.2d // ...........................................................................e.....'............................................................................~..... + // trn1 v9.2d, v26.2d, v28.2d // ...............................................................e.................'................................................................~................. + // ldr q0, [ x4], #(6*16) // ..................................................e..............................'...................................................~.............................. 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // .........e.......................................................................'..........~....................................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .......e.........................................................................'........~......................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ...e.............................................................................'....~............................................................................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // ..e..............................................................................'...~.............................................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ....................................................e............................'.....................................................~............................ + // sqrdmulh v27.8h, v10.8h, v4.8h // ..........................................................................e......'...........................................................................~...... + // mul v24.8h, v10.8h, v0.8h // ......................................................................e..........'.......................................................................~.......... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................................'*................................................................................. + // sub v10.8h, v8.8h, v24.8h // ........~........................................................................'.........*........................................................................ 
+ // add v8.8h, v8.8h, v24.8h // ..........~......................................................................'...........*...................................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ...........................................................e.....................'............................................................~..................... + // mul v24.8h, v11.8h, v0.8h // ............................................................e....................'.............................................................~.................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e...............'..................................................................~............... + // sub v11.8h, v9.8h, v24.8h // ........................................................................e........'.........................................................................~........ + // add v9.8h, v9.8h, v24.8h // ............................................................................e....'.............................................................................~.... + // sqrdmulh v27.8h, v9.8h, v5.8h // .................................................................................*.................................................................................. + // mul v24.8h, v9.8h, v1.8h // ......~..........................................................................'.......*.......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............~.................................................................'................*................................................................. + // sub v9.8h, v8.8h, v24.8h // ......................~..........................................................'.......................*.......................................................... 
+ // add v8.8h, v8.8h, v24.8h // .....................~...........................................................'......................*........................................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ...............................................................................e.'................................................................................~. + // mul v24.8h, v11.8h, v2.8h // ..............................................................................e..'...............................................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // ....~............................................................................'.....*............................................................................ + // sub v11.8h, v10.8h, v24.8h // ................~................................................................'.................*................................................................ + // add v10.8h, v10.8h, v24.8h // ..............~..................................................................'...............*.................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................~.....................................................'............................*..................................................... + // srshr v25.8h, v25.8h, #11 // ....................................~............................................'.....................................*............................................ + // mls v8.8h, v25.8h, v7.h[0] // .........................................~.......................................'..........................................*....................................... 
+ // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................~..........................................'.......................................*.......................................... + // srshr v25.8h, v25.8h, #11 // ...............................................~.................................'................................................*................................. + // mls v9.8h, v25.8h, v7.h[0] // .......................................................~.........................'........................................................*......................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................~.............................................................'....................*............................................................. + // srshr v25.8h, v25.8h, #11 // .........................~.......................................................'..........................*....................................................... + // mls v10.8h, v25.8h, v7.h[0] // .................................................~...............................'..................................................*............................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................~.........................................................'........................*......................................................... + // srshr v25.8h, v25.8h, #11 // ..............................~..................................................'...............................*.................................................. + // mls v11.8h, v25.8h, v7.h[0] // ...................................~.............................................'....................................*............................................. 
+ // trn1 v25.4s, v8.4s, v9.4s // ..............................................................~..................'...............................................................*.................. + // trn2 v26.4s, v8.4s, v9.4s // .............................................................~...................'..............................................................*................... + // trn1 v27.4s, v10.4s, v11.4s // ..........................................................~......................'...........................................................*...................... + // trn2 v28.4s, v10.4s, v11.4s // .........................................................~.......................'..........................................................*....................... + // trn2 v10.2d, v25.2d, v27.2d // ..................................................................~..............'...................................................................*.............. + // trn2 v11.2d, v26.2d, v28.2d // ...................................................................~.............'....................................................................*............. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................~...........'......................................................................*........... + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................~............'.....................................................................*............ + // str q8, [x1], #64 // .............................................................................~...'..............................................................................*... 
+ // str q9, [x1, #(-(64) + 16*1)] // ................................................................................~'.................................................................................* + // str q10, [x1, #(-(64) + 16*2)] // .......................................................................~.........'........................................................................*......... + // str q11, [x1, #(-(64) + 16*3)] // .........................................................................~.......'..........................................................................*....... sub count, count, #1 cbnz count, layer4567_start - mul v22.8H, v11.8H, v23.8H // ...*............................................ - sqrdmulh v25.8H, v11.8H, v10.8H // ..*............................................. - ldr q6, [x4, #-16] // .....*.......................................... - // gap // ................................................ - trn2 v26.2D, v9.2D, v18.2D // *............................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v22.8H, v25.8H, v7.H[0] // ........*....................................... - sqrdmulh v28.8H, v26.8H, v10.8H // ......*......................................... - // gap // ................................................ - // gap // ................................................ - mul v27.8H, v26.8H, v23.8H // .......*........................................ - // gap // ................................................ - // gap // ................................................ 
- // gap // ................................................ - trn1 v26.2D, v9.2D, v18.2D // .*.............................................. - // gap // ................................................ - // gap // ................................................ - ldr q4, [x4, #-32] // ....*........................................... - add v9.8H, v20.8H, v22.8H // ..........*..................................... - sub v24.8H, v20.8H, v22.8H // ...........*.................................... - // gap // ................................................ - // gap // ................................................ - mls v27.8H, v28.8H, v7.H[0] // .........*...................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mul v8.8H, v9.8H, v1.8H // ................*............................... - sqrdmulh v9.8H, v9.8H, v16.8H // ...............*................................ - // gap // ................................................ - // gap // ................................................ - mul v4.8H, v24.8H, v4.8H // ..............*................................. - sqrdmulh v31.8H, v24.8H, v6.8H // .............*.................................. - // gap // ................................................ - // gap // ................................................ - add v12.8H, v26.8H, v27.8H // ............*................................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v8.8H, v9.8H, v7.H[0] // ...................*............................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ 
- mls v4.8H, v31.8H, v7.H[0] // ..................*............................. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - sub v11.8H, v26.8H, v27.8H // .................*.............................. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - add v1.8H, v12.8H, v8.8H // ......................*......................... - sub v8.8H, v12.8H, v8.8H // .......................*........................ - // gap // ................................................ - // gap // ................................................ - sub v21.8H, v11.8H, v4.8H // .....................*.......................... - add v4.8H, v11.8H, v4.8H // ....................*........................... - // gap // ................................................ - // gap // ................................................ - sqdmulh v12.8H, v1.8H, v7.H[1] // ..........................*..................... - // gap // ................................................ - // gap // ................................................ - sqdmulh v9.8H, v8.8H, v7.H[1] // ...........................*.................... - sqdmulh v20.8H, v4.8H, v7.H[1] // .........................*...................... - // gap // ................................................ - // gap // ................................................ - sqdmulh v19.8H, v21.8H, v7.H[1] // ........................*....................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - srshr v17.8H, v9.8H, #11 // ..............................*................. 
- srshr v11.8H, v12.8H, #11 // ...............................*................ - // gap // ................................................ - // gap // ................................................ - srshr v3.8H, v19.8H, #11 // .............................*.................. - srshr v9.8H, v20.8H, #11 // ............................*................... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - mls v8.8H, v17.8H, v7.H[0] // ..................................*............. - mls v1.8H, v11.8H, v7.H[0] // ...................................*............ - // gap // ................................................ - // gap // ................................................ - mls v4.8H, v9.8H, v7.H[0] // .................................*.............. - mls v21.8H, v3.8H, v7.H[0] // ................................*............... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - trn1 v17.4S, v1.4S, v8.4S // .....................................*.......... - trn2 v29.4S, v1.4S, v8.4S // ......................................*......... - // gap // ................................................ - // gap // ................................................ - trn2 v16.4S, v4.4S, v21.4S // ....................................*........... - trn1 v23.4S, v4.4S, v21.4S // .......................................*........ 
- // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - trn1 v0.2D, v17.2D, v23.2D // ...........................................*.... - trn1 v13.2D, v29.2D, v16.2D // ........................................*....... - // gap // ................................................ - // gap // ................................................ - trn2 v2.2D, v17.2D, v23.2D // ............................................*... - trn2 v9.2D, v29.2D, v16.2D // .........................................*...... - // gap // ................................................ - // gap // ................................................ - str q0, [x1], #64 // ..............................................*. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - str q9, [x1, #-16] // .............................................*.. - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - str q13, [x1, #-48] // ..........................................*..... - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ - str q2, [x1, #-32] // ...............................................* - // gap // ................................................ - // gap // ................................................ - // gap // ................................................ 
+ // Instructions: 35 + // Expected cycles: 26 + // Expected IPC: 1.35 + // + // Cycle bound: 26.0 + // IPC bound: 1.35 + // + // Wall time: 0.50s + // User time: 0.50s + // + // ------- original position --------> + // 0 25 + // |------------------------|--------- + sqrdmulh v15.8H, v14.8H, v25.8H // *.................................. + mul v27.8H, v14.8H, v8.8H // ...*............................... + // gap // ................................... + // gap // ................................... + mls v6.8H, v21.8H, v7.H[0] // .*................................. + mls v0.8H, v16.8H, v7.H[0] // ..*................................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v27.8H, v15.8H, v7.H[0] // .......*........................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sub v15.8H, v20.8H, v6.8H // ....*.............................. + add v16.8H, v20.8H, v6.8H // .....*............................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + add v14.8H, v15.8H, v0.8H // ......*............................ + sub v15.8H, v15.8H, v0.8H // ........*.......................... + // gap // ................................... + // gap // ................................... + add v6.8H, v16.8H, v27.8H // ..........*........................ + sub v16.8H, v16.8H, v27.8H // ...........*....................... + // gap // ................................... 
+ // gap // ................................... + sqdmulh v27.8H, v14.8H, v7.H[1] // .........*......................... + sqdmulh v26.8H, v15.8H, v7.H[1] // ............*...................... + // gap // ................................... + // gap // ................................... + sqdmulh v0.8H, v6.8H, v7.H[1] // ..............*.................... + sqdmulh v11.8H, v16.8H, v7.H[1] // ..................*................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + srshr v27.8H, v27.8H, #11 // .............*..................... + srshr v26.8H, v26.8H, #11 // ...............*................... + // gap // ................................... + // gap // ................................... + srshr v0.8H, v0.8H, #11 // .................*................. + srshr v11.8H, v11.8H, #11 // ....................*.............. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v15.8H, v26.8H, v7.H[0] // ................*.................. + mls v14.8H, v27.8H, v7.H[0] // .....................*............. + // gap // ................................... + // gap // ................................... + mls v6.8H, v0.8H, v7.H[0] // ...................*............... + mls v16.8H, v11.8H, v7.H[0] // ......................*............ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + trn2 v27.4S, v14.4S, v15.4S // .......................*........... + trn1 v15.4S, v14.4S, v15.4S // ........................*.......... + // gap // ................................... + // gap // ................................... + trn2 v14.4S, v6.4S, v16.4S // .........................*......... + trn1 v16.4S, v6.4S, v16.4S // ..........................*........ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + trn2 v6.2D, v16.2D, v15.2D // ...........................*....... + trn2 v26.2D, v14.2D, v27.2D // ............................*...... + // gap // ................................... + // gap // ................................... + trn1 v27.2D, v14.2D, v27.2D // .............................*..... + trn1 v15.2D, v16.2D, v15.2D // ..............................*.... + // gap // ................................... + // gap // ................................... + str q6, [x1, #32] // ...............................*... + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q26, [x1, #48] // ................................*.. + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q15, [x1], #64 // .................................*. + // gap // ................................... + // gap // ................................... + // gap // ................................... + str q27, [x1, #-48] // ..................................* + // gap // ................................... + // gap // ................................... + // gap // ................................... 
- // original source code - // trn2 v22.2D, v9.2D, v18.2D // ...*............................................ - // trn1 v18.2D, v9.2D, v18.2D // .......*........................................ - // sqrdmulh v14.8H, v11.8H, v10.8H // .*.............................................. - // mul v0.8H, v11.8H, v23.8H // *............................................... - // ldr q13, [x4, #-32] // ........*....................................... - // ldr q9, [x4, #-16] // ..*............................................. - // sqrdmulh v19.8H, v22.8H, v10.8H // .....*.......................................... - // mul v25.8H, v22.8H, v23.8H // ......*......................................... - // mls v0.8H, v14.8H, v7.H[0] // ....*........................................... - // mls v25.8H, v19.8H, v7.H[0] // ...........*.................................... - // add v4.8H, v20.8H, v0.8H // .........*...................................... - // sub v19.8H, v20.8H, v0.8H // ..........*..................................... - // add v17.8H, v18.8H, v25.8H // ................*............................... - // sqrdmulh v9.8H, v19.8H, v9.8H // ...............*................................ - // mul v20.8H, v19.8H, v13.8H // ..............*................................. - // sqrdmulh v2.8H, v4.8H, v16.8H // .............*.................................. - // mul v28.8H, v4.8H, v1.8H // ............*................................... - // sub v22.8H, v18.8H, v25.8H // ...................*............................ - // mls v20.8H, v9.8H, v7.H[0] // ..................*............................. - // mls v28.8H, v2.8H, v7.H[0] // .................*.............................. - // add v3.8H, v22.8H, v20.8H // .......................*........................ - // sub v5.8H, v22.8H, v20.8H // ......................*......................... - // add v2.8H, v17.8H, v28.8H // ....................*........................... 
- // sub v4.8H, v17.8H, v28.8H // .....................*.......................... - // sqdmulh v9.8H, v5.8H, v7.H[1] // ...........................*.................... - // sqdmulh v20.8H, v3.8H, v7.H[1] // ..........................*..................... - // sqdmulh v12.8H, v2.8H, v7.H[1] // ........................*....................... - // sqdmulh v27.8H, v4.8H, v7.H[1] // .........................*...................... - // srshr v22.8H, v20.8H, #11 // ...............................*................ - // srshr v20.8H, v9.8H, #11 // ..............................*................. - // srshr v9.8H, v27.8H, #11 // ............................*................... - // srshr v27.8H, v12.8H, #11 // .............................*.................. - // mls v5.8H, v20.8H, v7.H[0] // ...................................*............ - // mls v3.8H, v22.8H, v7.H[0] // ..................................*............. - // mls v4.8H, v9.8H, v7.H[0] // ................................*............... - // mls v2.8H, v27.8H, v7.H[0] // .................................*.............. - // trn2 v8.4S, v3.4S, v5.4S // ......................................*......... - // trn1 v22.4S, v2.4S, v4.4S // ....................................*........... - // trn2 v11.4S, v2.4S, v4.4S // .....................................*.......... - // trn1 v20.4S, v3.4S, v5.4S // .......................................*........ - // trn1 v14.2D, v11.2D, v8.2D // .........................................*...... - // trn2 v11.2D, v11.2D, v8.2D // ...........................................*.... - // str q14, [x1, #16] // ..............................................*. - // trn1 v9.2D, v22.2D, v20.2D // ........................................*....... - // trn2 v2.2D, v22.2D, v20.2D // ..........................................*..... - // str q11, [x1, #48] // .............................................*.. - // str q9, [x1], #64 // ............................................*... 
- // str q2, [x1, #-32] // ...............................................* + // ---------- new position ----------> + // 0 25 + // |------------------------|--------- + // sqrdmulh v15.8H, v14.8H, v25.8H // *.................................. + // mls v6.8H, v21.8H, v7.H[0] // ..*................................ + // mls v0.8H, v16.8H, v7.H[0] // ...*............................... + // mul v16.8H, v14.8H, v8.8H // .*................................. + // sub v26.8H, v20.8H, v6.8H // .....*............................. + // add v27.8H, v20.8H, v6.8H // ......*............................ + // add v6.8H, v26.8H, v0.8H // .......*........................... + // mls v16.8H, v15.8H, v7.H[0] // ....*.............................. + // sub v3.8H, v26.8H, v0.8H // ........*.......................... + // sqdmulh v15.8H, v6.8H, v7.H[1] // ...........*....................... + // add v11.8H, v27.8H, v16.8H // .........*......................... + // sub v31.8H, v27.8H, v16.8H // ..........*........................ + // sqdmulh v16.8H, v3.8H, v7.H[1] // ............*...................... + // srshr v13.8H, v15.8H, #11 // ...............*................... + // sqdmulh v27.8H, v11.8H, v7.H[1] // .............*..................... + // srshr v16.8H, v16.8H, #11 // ................*.................. + // mls v3.8H, v16.8H, v7.H[0] // ...................*............... + // srshr v27.8H, v27.8H, #11 // .................*................. + // sqdmulh v16.8H, v31.8H, v7.H[1] // ..............*.................... + // mls v11.8H, v27.8H, v7.H[0] // .....................*............. + // srshr v27.8H, v16.8H, #11 // ..................*................ + // mls v6.8H, v13.8H, v7.H[0] // ....................*.............. + // mls v31.8H, v27.8H, v7.H[0] // ......................*............ + // trn2 v16.4S, v6.4S, v3.4S // .......................*........... + // trn1 v29.4S, v6.4S, v3.4S // ........................*.......... 
+ // trn2 v6.4S, v11.4S, v31.4S // .........................*......... + // trn1 v22.4S, v11.4S, v31.4S // ..........................*........ + // trn2 v15.2D, v22.2D, v29.2D // ...........................*....... + // trn2 v14.2D, v6.2D, v16.2D // ............................*...... + // trn1 v27.2D, v6.2D, v16.2D // .............................*..... + // trn1 v16.2D, v22.2D, v29.2D // ..............................*.... + // str q15, [x1, #32] // ...............................*... + // str q14, [x1, #48] // ................................*.. + // str q16, [x1], #64 // .................................*. + // str q27, [x1, #-48] // ..................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s b/examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s index 95fd7392..8d7af65a 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_opt_a55.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -67,15 +44,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +61,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,43 +74,43 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr 
qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -150,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -160,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -168,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp 
d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -179,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -204,7 +175,7 @@ roots: .text .global ntt_kyber_123_4567_opt_a55 - .global _ntt_kyber_123_4567_opt_a55 + .global _ntt_kyber_123_4567 .p2align 4 const_addr: .short 3329 @@ -330,1017 +301,1103 @@ _ntt_kyber_123_4567_opt_a55: load_roots_123 .p2align 2 - ldr_vo v29, x0, 0 // *......... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v17, x0, 64 // .*........ - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v10, x0, 128 // ..*....... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v15, x0, 192 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v11, x0, 256 // ....*..... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v24, x0, 448 // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v25.8H, v11.8H, v0.H[1] // ......*... - // gap // .......... - ldr_vo v2, x0, 320 // .....*.... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v30.8H, v24.8H, v0.H[1] // .........* - // gap // .......... - ldr_vo v9, x0, 384 // ........*. - // gap // .......... - - // original source code - // ldr_vo v29, x0, 0 // *......... || *................ - // ldr_vo v17, x0, 64 // .*........ || ..*.............. - // ldr_vo v10, x0, 128 // ..*....... 
|| ....*............ - // ldr_vo v15, x0, 192 // ...*...... || ......*.......... - // ldr_vo v11, x0, 256 // ....*..... || ........*........ - // ldr_vo v2, x0, 320 // .......*.. || .............*... - // sqrdmulh v25.8H, v11.8H, v0.H[1] // ......*... || ............*.... - // ldr_vo v24, x0, 448 // .....*.... || ..........*...... - // ldr_vo v9, x0, 384 // .........* || ................* - // sqrdmulh v30.8H, v24.8H, v0.H[1] // ........*. || ...............*. - + // Instructions: 10 + // Expected cycles: 17 + // Expected IPC: 0.59 + // + // Cycle bound: 17.0 + // IPC bound: 0.59 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q20, [x0, #0] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #64] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q8, [x0, #192] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x0, #256] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #448] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v26.8H, v29.8H, v0.H[0] // ......*....................... + // gap // .............................. + ldr q6, [x0, #320] // .....*........................ 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v19.8H, v25.8H, v0.H[0] // .........*.................... + // gap // .............................. + ldr q16, [x0, #384] // ........*..................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q20, [x0, #0] // *.............................. + // ldr q11, [x0, #64] // .*............................. + // ldr q12, [x0, #128] // ..*............................ + // ldr q8, [x0, #192] // ...*........................... + // ldr q29, [x0, #256] // ....*.......................... + // ldr q6, [x0, #320] // .......*....................... + // mul v26.8H, v29.8H, v0.H[0] // ......*........................ + // ldr q25, [x0, #448] // .....*......................... + // ldr q16, [x0, #384] // .........*..................... + // mul v19.8H, v25.8H, v0.H[0] // ........*...................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v11.8H, v11.8H, v0.H[0] // ........*................................................................... - // gap // ............................................................................ - mul v18.8H, v2.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ - sqrdmulh v6.8H, v2.8H, v0.H[1] // ..............*............................................................. - // gap // ............................................................................ - mul v20.8H, v9.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - mls v11.8H, v25.8H, v7.H[0] // ..........*................................................................. 
- // gap // ............................................................................ - sqrdmulh v2.8H, v9.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - mls v18.8H, v6.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - mul v6.8H, v24.8H, v0.H[0] // .......................*.................................................... - // gap // ............................................................................ - sub v25.8H, v29.8H, v11.8H // ...........*................................................................ - // gap // ............................................................................ - mls v20.8H, v2.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - add v11.8H, v29.8H, v11.8H // ............*............................................................... - // gap // ............................................................................ - sub v2.8H, v17.8H, v18.8H // ................*........................................................... - // gap // ............................................................................ - add v18.8H, v17.8H, v18.8H // .................*.......................................................... - // gap // ............................................................................ - sub v24.8H, v10.8H, v20.8H // .....................*...................................................... - // gap // ............................................................................ - add v20.8H, v10.8H, v20.8H // ......................*..................................................... 
- // gap // ............................................................................ - mls v6.8H, v30.8H, v7.H[0] // .........................*.................................................. - // gap // ............................................................................ - mul v10.8H, v24.8H, v0.H[4] // ......................................*..................................... - // gap // ............................................................................ - sqrdmulh v24.8H, v24.8H, v0.H[5] // .......................................*.................................... - // gap // ............................................................................ - mul v29.8H, v20.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - sub v17.8H, v15.8H, v6.8H // ..........................*................................................. - // gap // ............................................................................ - add v6.8H, v15.8H, v6.8H // ...........................*................................................ - // gap // ............................................................................ - mls v10.8H, v24.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - sqrdmulh v20.8H, v20.8H, v0.H[3] // .............................*.............................................. - // gap // ............................................................................ - mul v24.8H, v17.8H, v0.H[4] // ...........................................*................................ - // gap // ............................................................................ - sqrdmulh v17.8H, v17.8H, v0.H[5] // ............................................*............................... 
- // gap // ............................................................................ - sub v15.8H, v25.8H, v10.8H // .........................................*.................................. - // gap // ............................................................................ - add v25.8H, v25.8H, v10.8H // ..........................................*................................. - // gap // ............................................................................ - mls v29.8H, v20.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - mul v20.8H, v6.8H, v0.H[2] // .................................*.......................................... - // gap // ............................................................................ - mls v24.8H, v17.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - sqrdmulh v6.8H, v6.8H, v0.H[3] // ..................................*......................................... - // gap // ............................................................................ - sub v10.8H, v11.8H, v29.8H // ...............................*............................................ - // gap // ............................................................................ - add v11.8H, v11.8H, v29.8H // ................................*........................................... - // gap // ............................................................................ - sub v29.8H, v2.8H, v24.8H // ..............................................*............................. - // gap // ............................................................................ - add v2.8H, v2.8H, v24.8H // ...............................................*............................ 
- // gap // ............................................................................ - mls v20.8H, v6.8H, v7.H[0] // ...................................*........................................ - // gap // ............................................................................ - mul v6.8H, v29.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - mul v24.8H, v2.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - sqrdmulh v2.8H, v2.8H, v1.H[3] // ...........................................................*................ - // gap // ............................................................................ - sub v17.8H, v18.8H, v20.8H // ....................................*....................................... - // gap // ............................................................................ - add v18.8H, v18.8H, v20.8H // .....................................*...................................... - // gap // ............................................................................ - sqrdmulh v20.8H, v29.8H, v1.H[5] // ................................................................*........... - // gap // ............................................................................ - mul v29.8H, v17.8H, v1.H[0] // .....................................................*...................... - // gap // ............................................................................ - mul v9.8H, v18.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - sqrdmulh v18.8H, v18.8H, v0.H[7] // .................................................*.......................... 
- // gap // ............................................................................ - sqrdmulh v17.8H, v17.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - mls v24.8H, v2.8H, v7.H[0] // ............................................................*............... - // gap // ............................................................................ - mls v6.8H, v20.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - mls v9.8H, v18.8H, v7.H[0] // ..................................................*......................... - // gap // ............................................................................ - mls v29.8H, v17.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - sub v18.8H, v25.8H, v24.8H // .............................................................*.............. - // gap // ............................................................................ - sub v20.8H, v15.8H, v6.8H // ..................................................................*......... - // gap // ............................................................................ - add v6.8H, v15.8H, v6.8H // ...................................................................*........ - // gap // ............................................................................ - add v2.8H, v25.8H, v24.8H // ..............................................................*............. - // gap // ............................................................................ - sub v25.8H, v11.8H, v9.8H // ...................................................*........................ 
- // gap // ............................................................................ - add v11.8H, v11.8H, v9.8H // ....................................................*....................... - // gap // ............................................................................ - sub v24.8H, v10.8H, v29.8H // ........................................................*................... - // gap // ............................................................................ - add v10.8H, v10.8H, v29.8H // .........................................................*.................. - // gap // ............................................................................ - str_vi v11, x0, 16 // ....................................................................*....... - // gap // ............................................................................ - ldr_vo v29, x0, 0 // e........................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v25, x0, 48 // .....................................................................*...... - // gap // ............................................................................ - ldr_vo v17, x0, 64 // .e.......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v10, x0, 112 // ......................................................................*..... - // gap // ............................................................................ 
- ldr_vo v10, x0, 128 // ..e......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v24, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - ldr_vo v15, x0, 192 // ...e........................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v2, x0, 240 // ........................................................................*... - // gap // ............................................................................ - ldr_vo v11, x0, 256 // ....e....................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v18, x0, 304 // .........................................................................*.. - // gap // ............................................................................ - ldr_vo v2, x0, 320 // .....e...................................................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - str_vo v6, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - sqrdmulh v25.8H, v11.8H, v0.H[1] // .........e.................................................................. - // gap // ............................................................................ - str_vo v20, x0, 432 // ...........................................................................* - // gap // ............................................................................ - ldr_vo v24, x0, 448 // .......e.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr_vo v9, x0, 384 // ......e..................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v30.8H, v24.8H, v0.H[1] // ........................e................................................... - // gap // ............................................................................ - - // original source code - // ldr_vo v8, x0, 0 // e......................................................................................... || e....................................................................................................... - // ldr_vo v9, x0, 64 // ..e....................................................................................... 
|| ...e.................................................................................................... - // ldr_vo v10, x0, 128 // ....e..................................................................................... || ......e................................................................................................. - // ldr_vo v11, x0, 192 // ......e................................................................................... || .........e.............................................................................................. - // ldr_vo v12, x0, 256 // ........e................................................................................. || ............e........................................................................................... - // ldr_vo v13, x0, 320 // ..........e............................................................................... || ...............e........................................................................................ - // ldr_vo v14, x0, 384 // ...............e.......................................................................... || ......................e................................................................................. - // ldr_vo v15, x0, 448 // ..............e........................................................................... || ....................e................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // .................*........................................................................ || .........................*.............................................................................. - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............e............................................................................. || ..................e..................................................................................... 
- // mls v24.8H, v12.8H, v7.H[0] // .....................*.................................................................... || .............................*.......................................................................... - // sub v12.8H, v8.8H, v24.8H // .........................*................................................................ || .................................*...................................................................... - // add v8.8H, v8.8H, v24.8H // ...........................*.............................................................. || ...................................*.................................................................... - // mul v24.8H, v13.8H, v0.H[0] // ..................*....................................................................... || ..........................*............................................................................. - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ...................*...................................................................... || ...........................*............................................................................ - // mls v24.8H, v13.8H, v7.H[0] // .......................*.................................................................. || ...............................*........................................................................ - // sub v13.8H, v9.8H, v24.8H // ............................*............................................................. || ....................................*................................................................... - // add v9.8H, v9.8H, v24.8H // .............................*............................................................ || .....................................*.................................................................. 
- // mul v24.8H, v14.8H, v0.H[0] // ....................*..................................................................... || ............................*........................................................................... - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ......................*................................................................... || ..............................*......................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ..........................*............................................................... || ..................................*..................................................................... - // sub v14.8H, v10.8H, v24.8H // ..............................*........................................................... || ......................................*................................................................. - // add v10.8H, v10.8H, v24.8H // ...............................*.......................................................... || .......................................*................................................................ - // mul v24.8H, v15.8H, v0.H[0] // ........................*................................................................. || ................................*....................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ................e......................................................................... || ........................e............................................................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................*......................................................... || ........................................*............................................................... 
- // sub v15.8H, v11.8H, v24.8H // ....................................*..................................................... || ............................................*........................................................... - // add v11.8H, v11.8H, v24.8H // .....................................*.................................................... || .............................................*.......................................................... - // mul v24.8H, v10.8H, v0.H[2] // ...................................*...................................................... || ...........................................*............................................................ - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .......................................*.................................................. || ...............................................*........................................................ - // mls v24.8H, v10.8H, v7.H[0] // ............................................*............................................. || ....................................................*................................................... - // sub v10.8H, v8.8H, v24.8H // ................................................*......................................... || ........................................................*............................................... - // add v8.8H, v8.8H, v24.8H // .................................................*........................................ || .........................................................*.............................................. - // mul v24.8H, v11.8H, v0.H[2] // .............................................*............................................ || .....................................................*.................................................. 
- // sqrdmulh v11.8H, v11.8H, v0.H[3] // ...............................................*.......................................... || .......................................................*................................................ - // mls v24.8H, v11.8H, v7.H[0] // ....................................................*..................................... || ............................................................*........................................... - // sub v11.8H, v9.8H, v24.8H // ........................................................*................................. || ................................................................*....................................... - // add v9.8H, v9.8H, v24.8H // .........................................................*................................ || .................................................................*...................................... - // mul v24.8H, v14.8H, v0.H[4] // .................................*........................................................ || .........................................*.............................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ..................................*....................................................... || ..........................................*............................................................. - // mls v24.8H, v14.8H, v7.H[0] // ......................................*................................................... || ..............................................*......................................................... - // sub v14.8H, v12.8H, v24.8H // ..........................................*............................................... || ..................................................*..................................................... 
- // add v12.8H, v12.8H, v24.8H // ...........................................*.............................................. || ...................................................*.................................................... - // mul v24.8H, v15.8H, v0.H[4] // ........................................*................................................. || ................................................*....................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // .........................................*................................................ || .................................................*...................................................... - // mls v24.8H, v15.8H, v7.H[0] // ..............................................*........................................... || ......................................................*................................................. - // sub v15.8H, v13.8H, v24.8H // ..................................................*....................................... || ..........................................................*............................................. - // add v13.8H, v13.8H, v24.8H // ...................................................*...................................... || ...........................................................*............................................ - // mul v24.8H, v9.8H, v0.H[6] // ............................................................*............................. || ....................................................................*................................... - // sqrdmulh v9.8H, v9.8H, v0.H[7] // .............................................................*............................ || .....................................................................*.................................. 
- // mls v24.8H, v9.8H, v7.H[0] // .................................................................*........................ || .........................................................................*.............................. - // sub v9.8H, v8.8H, v24.8H // .......................................................................*.................. || ...............................................................................*........................ - // add v8.8H, v8.8H, v24.8H // ........................................................................*................. || ................................................................................*....................... - // mul v24.8H, v11.8H, v1.H[0] // ...........................................................*.............................. || ...................................................................*.................................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..............................................................*........................... || ......................................................................*................................. - // mls v24.8H, v11.8H, v7.H[0] // ..................................................................*....................... || ..........................................................................*............................. - // sub v11.8H, v10.8H, v24.8H // .........................................................................*................ || .................................................................................*...................... - // add v10.8H, v10.8H, v24.8H // ..........................................................................*............... || ..................................................................................*..................... 
- // mul v24.8H, v13.8H, v1.H[2] // ......................................................*................................... || ..............................................................*......................................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................*.................................. || ...............................................................*........................................ - // mls v24.8H, v13.8H, v7.H[0] // ...............................................................*.......................... || .......................................................................*................................ - // sub v13.8H, v12.8H, v24.8H // ...................................................................*...................... || ...........................................................................*............................ - // add v12.8H, v12.8H, v24.8H // ......................................................................*................... || ..............................................................................*......................... - // mul v24.8H, v15.8H, v1.H[4] // .....................................................*.................................... || .............................................................*.......................................... - // sqrdmulh v15.8H, v15.8H, v1.H[5] // ..........................................................*............................... || ..................................................................*..................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................................................*......................... || ........................................................................*............................... 
- // sub v15.8H, v14.8H, v24.8H // ....................................................................*..................... || ............................................................................*........................... - // add v14.8H, v14.8H, v24.8H // .....................................................................*.................... || .............................................................................*.......................... - // str_vi v8, x0, 16 // ...........................................................................*.............. || ...................................................................................*.................... - // str_vo v9, x0, 48 // .............................................................................*............ || ......................................................................................*................. - // str_vo v10, x0, 112 // ...............................................................................*.......... || .........................................................................................*.............. - // str_vo v11, x0, 176 // .................................................................................*........ || ............................................................................................*........... - // str_vo v12, x0, 240 // ...................................................................................*...... || ...............................................................................................*........ - // str_vo v13, x0, 304 // .....................................................................................*.... || ..................................................................................................*..... - // str_vo v14, x0, 368 // .......................................................................................*.. 
|| .....................................................................................................*.. - // str_vo v15, x0, 432 // .........................................................................................* || .......................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 4.51s + // User time: 4.51s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sqrdmulh v18.8H, v29.8H, v0.H[1] // ........*................................................................... + // gap // ............................................................................ + sqrdmulh v29.8H, v6.8H, v0.H[1] // .............*.............................................................. + // gap // ............................................................................ + mul v6.8H, v6.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ + sqrdmulh v31.8H, v16.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + mls v26.8H, v18.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + mul v18.8H, v16.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ 
+ mls v6.8H, v29.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + sqrdmulh v29.8H, v25.8H, v0.H[1] // .......................*.................................................... + // gap // ............................................................................ + sub v25.8H, v20.8H, v26.8H // ...........*................................................................ + // gap // ............................................................................ + mls v18.8H, v31.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + sub v31.8H, v11.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + add v6.8H, v11.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ + add v26.8H, v20.8H, v26.8H // ............*............................................................... + // gap // ............................................................................ + sub v16.8H, v12.8H, v18.8H // .....................*...................................................... + // gap // ............................................................................ + add v18.8H, v12.8H, v18.8H // ......................*..................................................... + // gap // ............................................................................ + mls v19.8H, v29.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ 
+ sqrdmulh v29.8H, v16.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v18.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + sub v11.8H, v8.8H, v19.8H // ..........................*................................................. + // gap // ............................................................................ + add v19.8H, v8.8H, v19.8H // ...........................*................................................ + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sqrdmulh v29.8H, v11.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + mul v11.8H, v11.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + mul v18.8H, v18.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + sub v12.8H, v25.8H, v16.8H // .........................................*.................................. + // gap // ............................................................................ 
+ add v25.8H, v25.8H, v16.8H // ..........................................*................................. + // gap // ............................................................................ + mls v11.8H, v29.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v29.8H, v19.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + mul v16.8H, v19.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + mls v18.8H, v20.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + sub v19.8H, v31.8H, v11.8H // ..............................................*............................. + // gap // ............................................................................ + add v31.8H, v31.8H, v11.8H // ...............................................*............................ + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + sub v29.8H, v26.8H, v18.8H // ...............................*............................................ + // gap // ............................................................................ + add v18.8H, v26.8H, v18.8H // ................................*........................................... + // gap // ............................................................................ 
+ sqrdmulh v26.8H, v31.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + sub v20.8H, v6.8H, v16.8H // ....................................*....................................... + // gap // ............................................................................ + add v6.8H, v6.8H, v16.8H // .....................................*...................................... + // gap // ............................................................................ + mul v31.8H, v31.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + sqrdmulh v16.8H, v20.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v11.8H, v6.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + mul v6.8H, v6.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + mul v20.8H, v20.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + mls v31.8H, v26.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + sqrdmulh v26.8H, v19.8H, v1.H[5] // ...............................................................*............ 
+ // gap // ............................................................................ + mls v6.8H, v11.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + mls v20.8H, v16.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v16.8H, v25.8H, v31.8H // .............................................................*.............. + // gap // ............................................................................ + add v31.8H, v25.8H, v31.8H // ..............................................................*............. + // gap // ............................................................................ + sub v25.8H, v18.8H, v6.8H // ...................................................*........................ + // gap // ............................................................................ + mul v19.8H, v19.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + add v18.8H, v18.8H, v6.8H // ....................................................*....................... + // gap // ............................................................................ + sub v6.8H, v29.8H, v20.8H // ........................................................*................... + // gap // ............................................................................ + add v29.8H, v29.8H, v20.8H // .........................................................*.................. + // gap // ............................................................................ + mls v19.8H, v26.8H, v7.H[0] // .................................................................*.......... 
+ // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + ldr q20, [x0, #0] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v18.8H, v12.8H, v19.8H // ..................................................................*......... + // gap // ............................................................................ + add v26.8H, v12.8H, v19.8H // ...................................................................*........ + // gap // ............................................................................ + str q25, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + ldr q11, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q29, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q12, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + str q6, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q8, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q31, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q29, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + ldr q6, [x0, #320] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #368] // ..........................................................................*. 
+ // gap // ............................................................................ + mul v26.8H, v29.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + str q18, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + ldr q25, [x0, #448] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q16, [x0, #384] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v19.8H, v25.8H, v0.H[0] // ........................e................................................... + // gap // ............................................................................ + + // -------------------------------------- new position ---------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|---------------- + // ldr q8, [x0, #0] // e..................'........................................................~............... + // ldr q9, [x0, #(1*(512/8))] // ....e..............'............................................................~........... 
+ // ldr q10, [x0, #(2*(512/8))] // ......e............'..............................................................~......... + // ldr q11, [x0, #(3*(512/8))] // ........e..........'................................................................~....... + // ldr q12, [x0, #(4*(512/8))] // ..........e........'..................................................................~..... + // ldr q13, [x0, #(5*(512/8))] // ............e......'....................................................................~... + // ldr q14, [x0, #(6*(512/8))] // .................e.'........................................................................ + // ldr q15, [x0, #(7*(512/8))] // ................e..'........................................................................ + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ...................*........................................................................ + // mul v24.8h, v12.8h, v0.h[0] // ..............e....'......................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ...................'...*.................................................................... + // sub v12.8h, v8.8h, v24.8h // ...................'.......*................................................................ + // add v8.8h, v8.8h, v24.8h // ...................'...........*............................................................ + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ...................'*....................................................................... + // mul v24.8h, v13.8h, v0.h[0] // ...................'.*...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................'.....*.................................................................. + // sub v13.8h, v9.8h, v24.8h // ...................'.........*.............................................................. 
+ // add v9.8h, v9.8h, v24.8h // ...................'..........*............................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ...................'..*..................................................................... + // mul v24.8h, v14.8h, v0.h[0] // ...................'....*................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................'........*............................................................... + // sub v14.8h, v10.8h, v24.8h // ...................'............*........................................................... + // add v10.8h, v10.8h, v24.8h // ...................'.............*.......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ...................'......*................................................................. + // mul v24.8h, v15.8h, v0.h[0] // ..................e'........................................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...................'..............*......................................................... + // sub v15.8h, v11.8h, v24.8h // ...................'..................*..................................................... + // add v11.8h, v11.8h, v24.8h // ...................'...................*.................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ...................'.................*...................................................... + // mul v24.8h, v10.8h, v0.h[2] // ...................'.......................*................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...................'.............................*.......................................... + // sub v10.8h, v8.8h, v24.8h // ...................'.................................*...................................... 
+ // add v8.8h, v8.8h, v24.8h // ...................'..................................*..................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ...................'...........................*............................................ + // mul v24.8h, v11.8h, v0.h[2] // ...................'............................*........................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................'................................*....................................... + // sub v11.8h, v9.8h, v24.8h // ...................'....................................*................................... + // add v9.8h, v9.8h, v24.8h // ...................'.....................................*.................................. + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ...................'...............*........................................................ + // mul v24.8h, v14.8h, v0.h[4] // ...................'................*....................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................'....................*................................................... + // sub v14.8h, v12.8h, v24.8h // ...................'........................*............................................... + // add v12.8h, v12.8h, v24.8h // ...................'.........................*.............................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ...................'.....................*.................................................. + // mul v24.8h, v15.8h, v0.h[4] // ...................'......................*................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...................'..........................*............................................. + // sub v15.8h, v13.8h, v24.8h // ...................'..............................*......................................... 
+ // add v13.8h, v13.8h, v24.8h // ...................'...............................*........................................ + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ...................'........................................*............................... + // mul v24.8h, v9.8h, v0.h[6] // ...................'.........................................*.............................. + // mls v24.8h, v27.8h, v7.h[0] // ...................'.............................................*.......................... + // sub v9.8h, v8.8h, v24.8h // ...................'.................................................*...................... + // add v8.8h, v8.8h, v24.8h // ...................'...................................................*.................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ...................'.......................................*................................ + // mul v24.8h, v11.8h, v1.h[0] // ...................'..........................................*............................. + // mls v24.8h, v27.8h, v7.h[0] // ...................'..............................................*......................... + // sub v11.8h, v10.8h, v24.8h // ...................'....................................................*................... + // add v10.8h, v10.8h, v24.8h // ...................'.....................................................*.................. + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ...................'...................................*.................................... + // mul v24.8h, v13.8h, v1.h[2] // ...................'......................................*................................. + // mls v24.8h, v27.8h, v7.h[0] // ...................'...........................................*............................ + // sub v13.8h, v12.8h, v24.8h // ...................'...............................................*........................ 
+ // add v12.8h, v12.8h, v24.8h // ...................'................................................*....................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ...................'............................................*........................... + // mul v24.8h, v15.8h, v1.h[4] // ...................'..................................................*..................... + // mls v24.8h, v27.8h, v7.h[0] // ...................'......................................................*................. + // sub v15.8h, v14.8h, v24.8h // .~.................'.........................................................*.............. + // add v14.8h, v14.8h, v24.8h // ..~................'..........................................................*............. + // str q8, [x0], #(16) // ...................'.......................................................*................ + // str q9, [x0, #(-16 + 1*(512/8))] // ...~...............'...........................................................*............ + // str q10, [x0, #(-16 + 2*(512/8))] // .....~.............'.............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // .......~...........'...............................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // .........~.........'.................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ...........~.......'...................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // .............~.....'.....................................................................*.. + // str q15, [x0, #(-16 + 7*(512/8))] // ...............~...'.......................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v18.8H, v11.8H, v0.H[0] // *................................................................. 
- // gap // .................................................................. - mul v23.8H, v24.8H, v0.H[0] // .......*.......................................................... - // gap // .................................................................. - mul v11.8H, v9.8H, v0.H[0] // ...*.............................................................. - // gap // .................................................................. - sqrdmulh v12.8H, v2.8H, v0.H[1] // ..*............................................................... - // gap // .................................................................. - mul v27.8H, v2.8H, v0.H[0] // .*................................................................ - // gap // .................................................................. - mls v23.8H, v30.8H, v7.H[0] // ...............*.................................................. - // gap // .................................................................. - mls v18.8H, v25.8H, v7.H[0] // ....*............................................................. - // gap // .................................................................. - sqrdmulh v3.8H, v9.8H, v0.H[1] // .....*............................................................ - // gap // .................................................................. - mls v27.8H, v12.8H, v7.H[0] // ......*........................................................... - // gap // .................................................................. - add v24.8H, v15.8H, v23.8H // ....................*............................................. - // gap // .................................................................. - sub v14.8H, v15.8H, v23.8H // ...................*.............................................. - // gap // .................................................................. - mls v11.8H, v3.8H, v7.H[0] // .........*........................................................ 
- // gap // .................................................................. - mul v21.8H, v24.8H, v0.H[2] // ............................*..................................... - // gap // .................................................................. - add v19.8H, v17.8H, v27.8H // ............*..................................................... - // gap // .................................................................. - sub v26.8H, v17.8H, v27.8H // ...........*...................................................... - // gap // .................................................................. - sub v22.8H, v10.8H, v11.8H // .............*.................................................... - // gap // .................................................................. - add v11.8H, v10.8H, v11.8H // ..............*................................................... - // gap // .................................................................. - sub v3.8H, v29.8H, v18.8H // ........*......................................................... - // gap // .................................................................. - sqrdmulh v31.8H, v24.8H, v0.H[3] // ..............................*................................... - // gap // .................................................................. - mul v8.8H, v11.8H, v0.H[2] // ..................*............................................... - // gap // .................................................................. - sqrdmulh v30.8H, v11.8H, v0.H[3] // ......................*........................................... - // gap // .................................................................. - add v11.8H, v29.8H, v18.8H // ..........*....................................................... - // gap // .................................................................. - mul v15.8H, v14.8H, v0.H[4] // .......................*.......................................... 
- // gap // .................................................................. - mls v21.8H, v31.8H, v7.H[0] // ...................................*.............................. - // gap // .................................................................. - mls v8.8H, v30.8H, v7.H[0] // ...........................*...................................... - // gap // .................................................................. - sqrdmulh v12.8H, v14.8H, v0.H[5] // ........................*......................................... - // gap // .................................................................. - sqrdmulh v13.8H, v22.8H, v0.H[5] // .................*................................................ - // gap // .................................................................. - sub v2.8H, v19.8H, v21.8H // .......................................*.......................... - // gap // .................................................................. - add v29.8H, v11.8H, v8.8H // ................................*................................. - // gap // .................................................................. - sub v9.8H, v11.8H, v8.8H // ...............................*.................................. - // gap // .................................................................. - add v11.8H, v19.8H, v21.8H // ........................................*......................... - // gap // .................................................................. - mls v15.8H, v12.8H, v7.H[0] // .............................*.................................... - // gap // .................................................................. - mul v21.8H, v2.8H, v1.H[0] // ..........................................*....................... - // gap // .................................................................. - mul v8.8H, v11.8H, v0.H[6] // ...........................................*...................... 
- // gap // .................................................................. - sqrdmulh v28.8H, v11.8H, v0.H[7] // ............................................*..................... - // gap // .................................................................. - mul v11.8H, v22.8H, v0.H[4] // ................*................................................. - // gap // .................................................................. - sub v31.8H, v26.8H, v15.8H // .................................*................................ - // gap // .................................................................. - sqrdmulh v27.8H, v2.8H, v1.H[1] // .............................................*.................... - // gap // .................................................................. - mls v8.8H, v28.8H, v7.H[0] // ................................................*................. - // gap // .................................................................. - mls v11.8H, v13.8H, v7.H[0] // .....................*............................................ - // gap // .................................................................. - add v28.8H, v26.8H, v15.8H // ..................................*............................... - // gap // .................................................................. - sqrdmulh v26.8H, v31.8H, v1.H[5] // .........................................*........................ - // gap // .................................................................. - add v13.8H, v29.8H, v8.8H // .......................................................*.......... - // gap // .................................................................. - sub v14.8H, v3.8H, v11.8H // .........................*........................................ - // gap // .................................................................. - add v5.8H, v3.8H, v11.8H // ..........................*....................................... 
- // gap // .................................................................. - mul v3.8H, v31.8H, v1.H[4] // ....................................*............................. - // gap // .................................................................. - mls v21.8H, v27.8H, v7.H[0] // .................................................*................ - // gap // .................................................................. - mul v27.8H, v28.8H, v1.H[2] // .....................................*............................ - // gap // .................................................................. - str_vi v13, x0, 16 // ..........................................................*....... - // gap // .................................................................. - sqrdmulh v13.8H, v28.8H, v1.H[3] // ......................................*........................... - // gap // .................................................................. - mls v3.8H, v26.8H, v7.H[0] // ...............................................*.................. - // gap // .................................................................. - sub v30.8H, v29.8H, v8.8H // ......................................................*........... - // gap // .................................................................. - add v12.8H, v9.8H, v21.8H // .........................................................*........ - // gap // .................................................................. - mls v27.8H, v13.8H, v7.H[0] // ..............................................*................... - // gap // .................................................................. - str_vo v30, x0, 48 // ...........................................................*...... - // gap // .................................................................. - sub v21.8H, v9.8H, v21.8H // ........................................................*......... 
- // gap // .................................................................. - str_vo v12, x0, 112 // ............................................................*..... - // gap // .................................................................. - add v6.8H, v5.8H, v27.8H // .....................................................*............ - // gap // .................................................................. - str_vo v21, x0, 176 // .............................................................*.... - // gap // .................................................................. - sub v11.8H, v5.8H, v27.8H // ..................................................*............... - // gap // .................................................................. - str_vo v6, x0, 240 // ..............................................................*... - // gap // .................................................................. - add v21.8H, v14.8H, v3.8H // ....................................................*............. - // gap // .................................................................. - str_vo v11, x0, 304 // ...............................................................*.. - // gap // .................................................................. - sub v11.8H, v14.8H, v3.8H // ...................................................*.............. - // gap // .................................................................. - str_vo v21, x0, 368 // ................................................................*. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str_vo v11, x0, 432 // .................................................................* - // gap // .................................................................. 
- - // original source code - // mul v11.8H, v11.8H, v0.H[0] // *................................................................. || *.................................................................. - // mul v18.8H, v2.8H, v0.H[0] // ....*............................................................. || ....*.............................................................. - // sqrdmulh v6.8H, v2.8H, v0.H[1] // ...*.............................................................. || ...*............................................................... - // mul v20.8H, v9.8H, v0.H[0] // ..*............................................................... || ..*................................................................ - // mls v11.8H, v25.8H, v7.H[0] // ......*........................................................... || ......*............................................................ - // sqrdmulh v2.8H, v9.8H, v0.H[1] // .......*.......................................................... || .......*........................................................... - // mls v18.8H, v6.8H, v7.H[0] // ........*......................................................... || ........*.......................................................... - // mul v6.8H, v24.8H, v0.H[0] // .*................................................................ || .*................................................................. - // sub v25.8H, v29.8H, v11.8H // .................*................................................ || .................*................................................. - // mls v20.8H, v2.8H, v7.H[0] // ...........*...................................................... || ...........*....................................................... - // add v11.8H, v29.8H, v11.8H // .....................*............................................ || .....................*............................................. 
- // sub v2.8H, v17.8H, v18.8H // ..............*................................................... || ..............*.................................................... - // add v18.8H, v17.8H, v18.8H // .............*.................................................... || .............*..................................................... - // sub v24.8H, v10.8H, v20.8H // ...............*.................................................. || ...............*................................................... - // add v20.8H, v10.8H, v20.8H // ................*................................................. || ................*.................................................. - // mls v6.8H, v30.8H, v7.H[0] // .....*............................................................ || .....*............................................................. - // mul v10.8H, v24.8H, v0.H[4] // ...................................*.............................. || ...................................*............................... - // sqrdmulh v24.8H, v24.8H, v0.H[5] // ..........................*....................................... || ..........................*........................................ - // mul v29.8H, v20.8H, v0.H[2] // ...................*.............................................. || ...................*............................................... - // sub v17.8H, v15.8H, v6.8H // ..........*....................................................... || ..........*........................................................ - // add v6.8H, v15.8H, v6.8H // .........*........................................................ || .........*......................................................... - // mls v10.8H, v24.8H, v7.H[0] // .......................................*.......................... || .......................................*........................... 
- // sqrdmulh v20.8H, v20.8H, v0.H[3] // ....................*............................................. || ....................*.............................................. - // mul v24.8H, v17.8H, v0.H[4] // ......................*........................................... || ......................*............................................ - // sqrdmulh v17.8H, v17.8H, v0.H[5] // .........................*........................................ || .........................*......................................... - // sub v15.8H, v25.8H, v10.8H // ...........................................*...................... || ...........................................*....................... - // add v25.8H, v25.8H, v10.8H // ............................................*..................... || ............................................*...................... - // mls v29.8H, v20.8H, v7.H[0] // ........................*......................................... || ........................*.......................................... - // mul v20.8H, v6.8H, v0.H[2] // ............*..................................................... || ............*...................................................... - // mls v24.8H, v17.8H, v7.H[0] // ...............................*.................................. || ...............................*................................... - // sqrdmulh v6.8H, v6.8H, v0.H[3] // ..................*............................................... || ..................*................................................ - // sub v10.8H, v11.8H, v29.8H // .............................*.................................... || .............................*..................................... - // add v11.8H, v11.8H, v29.8H // ............................*..................................... || ............................*...................................... 
- // sub v29.8H, v2.8H, v24.8H // ....................................*............................. || ....................................*.............................. - // add v2.8H, v2.8H, v24.8H // ........................................*......................... || ........................................*.......................... - // mls v20.8H, v6.8H, v7.H[0] // .......................*.......................................... || .......................*........................................... - // mul v6.8H, v29.8H, v1.H[4] // .............................................*.................... || .............................................*..................... - // mul v24.8H, v2.8H, v1.H[2] // ...............................................*.................. || ...............................................*................... - // sqrdmulh v2.8H, v2.8H, v1.H[3] // .................................................*................ || .................................................*................. - // sub v17.8H, v18.8H, v20.8H // ...........................*...................................... || ...........................*....................................... - // add v18.8H, v18.8H, v20.8H // ..............................*................................... || ..............................*.................................... - // sqrdmulh v20.8H, v29.8H, v1.H[5] // .........................................*........................ || .........................................*......................... - // mul v29.8H, v17.8H, v1.H[0] // ................................*................................. || ................................*.................................. - // mul v9.8H, v18.8H, v0.H[6] // .................................*................................ || .................................*................................. 
- // sqrdmulh v18.8H, v18.8H, v0.H[7] // ..................................*............................... || ..................................*................................ - // sqrdmulh v17.8H, v17.8H, v1.H[1] // .....................................*............................ || .....................................*............................. - // mls v24.8H, v2.8H, v7.H[0] // .....................................................*............ || .....................................................*............. - // mls v6.8H, v20.8H, v7.H[0] // ..................................................*............... || ..................................................*................ - // mls v9.8H, v18.8H, v7.H[0] // ......................................*........................... || ......................................*............................ - // mls v29.8H, v17.8H, v7.H[0] // ..............................................*................... || ..............................................*.................... - // sub v18.8H, v25.8H, v24.8H // ...........................................................*...... || ...........................................................*....... - // sub v20.8H, v15.8H, v6.8H // ...............................................................*.. || ...............................................................*... - // add v6.8H, v15.8H, v6.8H // .............................................................*.... || .............................................................*..... - // add v2.8H, v25.8H, v24.8H // .........................................................*........ || .........................................................*......... - // sub v25.8H, v11.8H, v9.8H // ...................................................*.............. || ...................................................*............... 
- // add v11.8H, v11.8H, v9.8H // ..........................................*....................... || ..........................................*........................ - // sub v24.8H, v10.8H, v29.8H // .......................................................*.......... || .......................................................*........... - // add v10.8H, v10.8H, v29.8H // ....................................................*............. || ....................................................*.............. - // str_vi v11, x0, 16 // ................................................*................. || ................................................*.................. - // str_vo v25, x0, 48 // ......................................................*........... || ......................................................*............ - // str_vo v10, x0, 112 // ........................................................*......... || ........................................................*.......... - // str_vo v24, x0, 176 // ..........................................................*....... || ..........................................................*........ - // str_vo v2, x0, 240 // ............................................................*..... || ............................................................*...... - // str_vo v18, x0, 304 // ..............................................................*... || ..............................................................*.... - // str_vo v6, x0, 368 // ................................................................*. || ................................................................*.. 
- // str_vo v20, x0, 432 // .................................................................* || ..................................................................* - + // Instructions: 66 + // Expected cycles: 67 + // Expected IPC: 0.99 + // + // Cycle bound: 67.0 + // IPC bound: 0.99 + // + // Wall time: 14.42s + // User time: 14.42s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + sqrdmulh v18.8H, v29.8H, v0.H[1] // *................................................................. + // gap // .................................................................. + sqrdmulh v30.8H, v25.8H, v0.H[1] // .......*.......................................................... + // gap // .................................................................. + mul v29.8H, v6.8H, v0.H[0] // ..*............................................................... + // gap // .................................................................. + sqrdmulh v23.8H, v6.8H, v0.H[1] // .*................................................................ + // gap // .................................................................. + mls v26.8H, v18.8H, v7.H[0] // ....*............................................................. + // gap // .................................................................. + mls v19.8H, v30.8H, v7.H[0] // ...............*.................................................. + // gap // .................................................................. + mul v15.8H, v16.8H, v0.H[0] // .....*............................................................ + // gap // .................................................................. + mls v29.8H, v23.8H, v7.H[0] // ......*........................................................... + // gap // .................................................................. 
+ sqrdmulh v23.8H, v16.8H, v0.H[1] // ...*.............................................................. + // gap // .................................................................. + add v16.8H, v8.8H, v19.8H // ....................*............................................. + // gap // .................................................................. + sub v22.8H, v8.8H, v19.8H // ...................*.............................................. + // gap // .................................................................. + add v30.8H, v11.8H, v29.8H // ...........*...................................................... + // gap // .................................................................. + sqrdmulh v24.8H, v16.8H, v0.H[3] // ............................*..................................... + // gap // .................................................................. + mul v13.8H, v16.8H, v0.H[2] // .............................*.................................... + // gap // .................................................................. + sqrdmulh v3.8H, v22.8H, v0.H[5] // ......................*........................................... + // gap // .................................................................. + mul v25.8H, v22.8H, v0.H[4] // .......................*.......................................... + // gap // .................................................................. + mls v15.8H, v23.8H, v7.H[0] // .........*........................................................ + // gap // .................................................................. + mls v13.8H, v24.8H, v7.H[0] // .................................*................................ + // gap // .................................................................. + sub v19.8H, v11.8H, v29.8H // ..........*....................................................... + // gap // .................................................................. 
+ mls v25.8H, v3.8H, v7.H[0] // ...........................*...................................... + // gap // .................................................................. + add v2.8H, v12.8H, v15.8H // ..............*................................................... + // gap // .................................................................. + add v22.8H, v30.8H, v13.8H // ......................................*........................... + // gap // .................................................................. + add v18.8H, v20.8H, v26.8H // ............*..................................................... + // gap // .................................................................. + sub v4.8H, v20.8H, v26.8H // ........*......................................................... + // gap // .................................................................. + sqrdmulh v3.8H, v2.8H, v0.H[3] // ..................*............................................... + // gap // .................................................................. + mul v8.8H, v2.8H, v0.H[2] // ........................*......................................... + // gap // .................................................................. + sqrdmulh v24.8H, v22.8H, v0.H[7] // .........................................*........................ + // gap // .................................................................. + sub v9.8H, v30.8H, v13.8H // .....................................*............................ + // gap // .................................................................. + add v31.8H, v19.8H, v25.8H // ................................*................................. + // gap // .................................................................. + mls v8.8H, v3.8H, v7.H[0] // ..............................*................................... + // gap // .................................................................. 
+ sqrdmulh v20.8H, v9.8H, v1.H[1] // ........................................*......................... + // gap // .................................................................. + sub v14.8H, v19.8H, v25.8H // ...............................*.................................. + // gap // .................................................................. + sub v28.8H, v12.8H, v15.8H // .............*.................................................... + // gap // .................................................................. + add v17.8H, v18.8H, v8.8H // ...................................*.............................. + // gap // .................................................................. + sub v19.8H, v18.8H, v8.8H // ..................................*............................... + // gap // .................................................................. + mul v15.8H, v9.8H, v1.H[0] // ...........................................*...................... + // gap // .................................................................. + mul v25.8H, v22.8H, v0.H[6] // ..........................................*....................... + // gap // .................................................................. + sqrdmulh v6.8H, v31.8H, v1.H[3] // ....................................*............................. + // gap // .................................................................. + sqrdmulh v16.8H, v14.8H, v1.H[5] // .............................................*.................... + // gap // .................................................................. + sqrdmulh v5.8H, v28.8H, v0.H[5] // ................*................................................. + // gap // .................................................................. + mls v25.8H, v24.8H, v7.H[0] // ..............................................*................... + // gap // .................................................................. 
+ mul v11.8H, v28.8H, v0.H[4] // .................*................................................ + // gap // .................................................................. + mul v21.8H, v14.8H, v1.H[4] // ...................................................*.............. + // gap // .................................................................. + mul v31.8H, v31.8H, v1.H[2] // .......................................*.......................... + // gap // .................................................................. + add v13.8H, v17.8H, v25.8H // ....................................................*............. + // gap // .................................................................. + mls v11.8H, v5.8H, v7.H[0] // .....................*............................................ + // gap // .................................................................. + mls v21.8H, v16.8H, v7.H[0] // .......................................................*.......... + // gap // .................................................................. + str q13, [x0], #(16) // ........................................................*......... + // gap // .................................................................. + mls v31.8H, v6.8H, v7.H[0] // ............................................*..................... + // gap // .................................................................. + add v10.8H, v4.8H, v11.8H // ..........................*....................................... + // gap // .................................................................. + sub v5.8H, v17.8H, v25.8H // ..................................................*............... + // gap // .................................................................. + mls v15.8H, v20.8H, v7.H[0] // ...............................................*.................. + // gap // .................................................................. 
+ add v17.8H, v10.8H, v31.8H // .................................................*................ + // gap // .................................................................. + str q5, [x0, #48] // ...........................................................*...... + // gap // .................................................................. + sub v2.8H, v10.8H, v31.8H // ................................................*................. + // gap // .................................................................. + str q17, [x0, #240] // ..............................................................*... + // gap // .................................................................. + sub v29.8H, v19.8H, v15.8H // .....................................................*............ + // gap // .................................................................. + str q2, [x0, #304] // ...............................................................*.. + // gap // .................................................................. + sub v12.8H, v4.8H, v11.8H // .........................*........................................ + // gap // .................................................................. + add v26.8H, v19.8H, v15.8H // ......................................................*........... + // gap // .................................................................. + str q29, [x0, #176] // .............................................................*.... + // gap // .................................................................. + sub v19.8H, v12.8H, v21.8H // .........................................................*........ + // gap // .................................................................. + str q26, [x0, #112] // ............................................................*..... + // gap // .................................................................. 
+ add v9.8H, v12.8H, v21.8H // ..........................................................*....... + // gap // .................................................................. + str q19, [x0, #432] // .................................................................* + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q9, [x0, #368] // ................................................................*. + // gap // .................................................................. + + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // sqrdmulh v18.8H, v29.8H, v0.H[1] // *................................................................. + // sqrdmulh v29.8H, v6.8H, v0.H[1] // ...*.............................................................. + // mul v6.8H, v6.8H, v0.H[0] // ..*............................................................... + // sqrdmulh v31.8H, v16.8H, v0.H[1] // ........*......................................................... + // mls v26.8H, v18.8H, v7.H[0] // ....*............................................................. + // mul v18.8H, v16.8H, v0.H[0] // ......*........................................................... + // mls v6.8H, v29.8H, v7.H[0] // .......*.......................................................... + // sqrdmulh v29.8H, v25.8H, v0.H[1] // .*................................................................ + // sub v25.8H, v20.8H, v26.8H // .......................*.......................................... + // mls v18.8H, v31.8H, v7.H[0] // ................*................................................. + // sub v31.8H, v11.8H, v6.8H // ..................*............................................... 
+ // add v6.8H, v11.8H, v6.8H // ...........*...................................................... + // add v26.8H, v20.8H, v26.8H // ......................*........................................... + // sub v16.8H, v12.8H, v18.8H // ................................*................................. + // add v18.8H, v12.8H, v18.8H // ....................*............................................. + // mls v19.8H, v29.8H, v7.H[0] // .....*............................................................ + // sqrdmulh v29.8H, v16.8H, v0.H[5] // .......................................*.......................... + // mul v16.8H, v16.8H, v0.H[4] // .........................................*........................ + // sqrdmulh v20.8H, v18.8H, v0.H[3] // ........................*......................................... + // sub v11.8H, v8.8H, v19.8H // ..........*....................................................... + // add v19.8H, v8.8H, v19.8H // .........*........................................................ + // mls v16.8H, v29.8H, v7.H[0] // .............................................*.................... + // sqrdmulh v29.8H, v11.8H, v0.H[5] // ..............*................................................... + // mul v11.8H, v11.8H, v0.H[4] // ...............*.................................................. + // mul v18.8H, v18.8H, v0.H[2] // .........................*........................................ + // sub v12.8H, v25.8H, v16.8H // ..........................................................*....... + // add v25.8H, v25.8H, v16.8H // .................................................*................ + // mls v11.8H, v29.8H, v7.H[0] // ...................*.............................................. + // sqrdmulh v29.8H, v19.8H, v0.H[3] // ............*..................................................... + // mul v16.8H, v19.8H, v0.H[2] // .............*.................................................... 
+ // mls v18.8H, v20.8H, v7.H[0] // .............................*.................................... + // sub v19.8H, v31.8H, v11.8H // ...............................*.................................. + // add v31.8H, v31.8H, v11.8H // ............................*..................................... + // mls v16.8H, v29.8H, v7.H[0] // .................*................................................ + // sub v29.8H, v26.8H, v18.8H // ..................................*............................... + // add v18.8H, v26.8H, v18.8H // .................................*................................ + // sqrdmulh v26.8H, v31.8H, v1.H[3] // .....................................*............................ + // sub v20.8H, v6.8H, v16.8H // ...........................*...................................... + // add v6.8H, v6.8H, v16.8H // .....................*............................................ + // mul v31.8H, v31.8H, v1.H[2] // ...........................................*...................... + // sqrdmulh v16.8H, v20.8H, v1.H[1] // ..............................*................................... + // sqrdmulh v11.8H, v6.8H, v0.H[7] // ..........................*....................................... + // mul v6.8H, v6.8H, v0.H[6] // ....................................*............................. + // mul v20.8H, v20.8H, v1.H[0] // ...................................*.............................. + // mls v31.8H, v26.8H, v7.H[0] // ................................................*................. + // sqrdmulh v26.8H, v19.8H, v1.H[5] // ......................................*........................... + // mls v6.8H, v11.8H, v7.H[0] // ........................................*......................... + // mls v20.8H, v16.8H, v7.H[0] // ...................................................*.............. + // sub v16.8H, v25.8H, v31.8H // ......................................................*........... 
+ // add v31.8H, v25.8H, v31.8H // ....................................................*............. + // sub v25.8H, v18.8H, v6.8H // ..................................................*............... + // mul v19.8H, v19.8H, v1.H[4] // ..........................................*....................... + // add v18.8H, v18.8H, v6.8H // ............................................*..................... + // sub v6.8H, v29.8H, v20.8H // ........................................................*......... + // add v29.8H, v29.8H, v20.8H // ...........................................................*...... + // mls v19.8H, v26.8H, v7.H[0] // ..............................................*................... + // str q18, [x0], #(16) // ...............................................*.................. + // sub v18.8H, v12.8H, v19.8H // .............................................................*.... + // add v26.8H, v12.8H, v19.8H // ...............................................................*.. + // str q25, [x0, #48] // .....................................................*............ + // str q29, [x0, #112] // ..............................................................*... + // str q6, [x0, #176] // ............................................................*..... + // str q31, [x0, #240] // .......................................................*.......... + // str q16, [x0, #304] // .........................................................*........ + // str q26, [x0, #368] // .................................................................* + // str q18, [x0, #432] // ................................................................*. + restore inp, STACK0 mov count, #8 .p2align 2 - // gap // ............................... - ldr_vo v6, x1, 48 // .*............................. - // gap // ............................... - // gap // ............................... - ldr_vi v9, x3, 16 // ..*............................ 
- // gap // ............................... - // gap // ............................... - // gap // ............................... - ldr_vo v31, x1, 32 // .......*....................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - sqrdmulh v2.8H, v6.8H, v9.H[1] // ....*.......................... - // gap // ............................... - mul v20.8H, v6.8H, v9.H[0] // ...*........................... - // gap // ............................... - ldr_vo v24, x1, 16 // *.............................. - // gap // ............................... - // gap // ............................... - // gap // ............................... - mul v3.8H, v31.8H, v9.H[0] // ..........*.................... - // gap // ............................... - mls v20.8H, v2.8H, v7.H[0] // ......*........................ - // gap // ............................... - sqrdmulh v11.8H, v31.8H, v9.H[1] // ...........*................... - // gap // ............................... - ldr_vo v18, x1, 0 // .....*......................... - // gap // ............................... - // gap // ............................... - // gap // ............................... - add v2.8H, v24.8H, v20.8H // ........*...................... - // gap // ............................... - sub v27.8H, v24.8H, v20.8H // .........*..................... - // gap // ............................... - mls v3.8H, v11.8H, v7.H[0] // ...............*............... - // gap // ............................... - mul v10.8H, v2.8H, v9.H[2] // ..............*................ - // gap // ............................... - sqrdmulh v24.8H, v27.8H, v9.H[5] // ............*.................. - // gap // ............................... - mul v11.8H, v27.8H, v9.H[4] // .............*................. - // gap // ............................... - sqrdmulh v28.8H, v2.8H, v9.H[3] // ........................*...... 
- // gap // ............................... - sub v31.8H, v18.8H, v3.8H // ..................*............ - // gap // ............................... - add v26.8H, v18.8H, v3.8H // ......................*........ - // gap // ............................... - mls v11.8H, v24.8H, v7.H[0] // .................*............. - // gap // ............................... - mls v10.8H, v28.8H, v7.H[0] // .........................*..... - // gap // ............................... - ldr_vi v19, x4, 96 // ................*.............. - // gap // ............................... - // gap // ............................... - // gap // ............................... - add v29.8H, v31.8H, v11.8H // ....................*.......... - // gap // ............................... - sub v9.8H, v31.8H, v11.8H // .....................*......... - // gap // ............................... - sub v12.8H, v26.8H, v10.8H // ..........................*.... - // gap // ............................... - add v0.8H, v26.8H, v10.8H // ...........................*... - // gap // ............................... - trn1 v22.4S, v29.4S, v9.4S // .......................*....... - // gap // ............................... - // gap // ............................... - // gap // ............................... - trn1 v26.4S, v0.4S, v12.4S // ............................*.. - // gap // ............................... - ldr_vo v6, x4, -48 // ...................*........... - // gap // ............................... - // gap // ............................... - // gap // ............................... - trn2 v24.2D, v26.2D, v22.2D // ..............................* - // gap // ............................... - ldr_vo v3, x4, -16 // .............................*. - // gap // ............................... - - // original source code - // ldr_vo v14, x1, 16 // .....*......................... || ........*.............................. - // ldr_vo v13, x1, 48 // *.............................. 
|| *...................................... - // ldr_vi v1, x3, 16 // .*............................. || ..*.................................... - // mul v31.8H, v13.8H, v1.H[0] // ....*.......................... || .......*............................... - // sqrdmulh v19.8H, v13.8H, v1.H[1] // ...*........................... || ......*................................ - // ldr_vo v23, x1, 0 // .........*..................... || .............*......................... - // mls v31.8H, v19.8H, v7.H[0] // .......*....................... || ...........*........................... - // ldr_vo v30, x1, 32 // ..*............................ || ....*.................................. - // add v16.8H, v14.8H, v31.8H // ..........*.................... || ...............*....................... - // sub v0.8H, v14.8H, v31.8H // ...........*................... || ................*...................... - // mul v14.8H, v30.8H, v1.H[0] // ......*........................ || ..........*............................ - // sqrdmulh v30.8H, v30.8H, v1.H[1] // ........*...................... || ............*.......................... - // sqrdmulh v10.8H, v0.8H, v1.H[5] // ..............*................ || ...................*................... - // mul v31.8H, v0.8H, v1.H[4] // ...............*............... || ....................*.................. - // mul v26.8H, v16.8H, v1.H[2] // .............*................. || ..................*.................... - // mls v14.8H, v30.8H, v7.H[0] // ............*.................. || .................*..................... - // ldr_vi v19, x4, 96 // .....................*......... || ..........................*............ - // mls v31.8H, v10.8H, v7.H[0] // ...................*........... || ........................*.............. - // sub v12.8H, v23.8H, v14.8H // .................*............. || ......................*................ - // ldr_vo v6, x4, -48 // ............................*.. 
|| ...................................*... - // add v29.8H, v12.8H, v31.8H // ......................*........ || ............................*.......... - // sub v9.8H, v12.8H, v31.8H // .......................*....... || .............................*......... - // add v24.8H, v23.8H, v14.8H // ..................*............ || .......................*............... - // trn1 v22.4S, v29.4S, v9.4S // ..........................*.... || ................................*...... - // sqrdmulh v0.8H, v16.8H, v1.H[3] // ................*.............. || .....................*................. - // mls v26.8H, v0.8H, v7.H[0] // ....................*.......... || .........................*............. - // sub v12.8H, v24.8H, v26.8H // ........................*...... || ..............................*........ - // add v0.8H, v24.8H, v26.8H // .........................*..... || ...............................*....... - // trn1 v26.4S, v0.4S, v12.4S // ...........................*... || ..................................*.... - // ldr_vo v3, x4, -16 // ..............................* || ......................................* - // trn2 v24.2D, v26.2D, v22.2D // .............................*. || .....................................*. - + // Instructions: 11 + // Expected cycles: 15 + // Expected IPC: 0.73 + // + // Cycle bound: 15.0 + // IPC bound: 0.73 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x1, #32] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x3], #16 // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x1, #48] // .*............................ + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mul v16.8H, v18.8H, v11.H[0] // ....*......................... + // gap // .............................. + sqrdmulh v18.8H, v18.8H, v11.H[1] // ........*..................... + // gap // .............................. + mul v6.8H, v29.8H, v11.H[0] // .....*........................ + // gap // .............................. + sqrdmulh v29.8H, v29.8H, v11.H[1] // ......*....................... + // gap // .............................. + ldr q20, [x4, #16] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v16.8H, v18.8H, v7.H[0] // ..........*................... + // gap // .............................. + mls v6.8H, v29.8H, v7.H[0] // .......*...................... + // gap // .............................. + ldr q2, [x4, #80] // .........*.................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q13, [x1, #32] // *.............................. + // ldr q12, [x1, #48] // ..*............................ + // ldr q11, [x3], #16 // .*............................. + // ldr q20, [x4, #16] // .......*....................... + // mul v16.8H, v13.8H, v11.H[0] // ...*........................... + // mul v6.8H, v12.8H, v11.H[0] // .....*......................... + // sqrdmulh v29.8H, v12.8H, v11.H[1] // ......*........................ + // mls v6.8H, v29.8H, v7.H[0] // .........*..................... + // sqrdmulh v18.8H, v13.8H, v11.H[1] // ....*.......................... + // ldr q2, [x4, #80] // ..........*.................... + // mls v16.8H, v18.8H, v7.H[0] // ........*...................... + sub count, count, #1 -.p2align 2 layer4567_start: - mul v17.8H, v24.8H, v19.8H // .......................................*................................ 
- // gap // ........................................................................ - trn2 v9.4S, v29.4S, v9.4S // ............................*........................................... - // gap // ........................................................................ - trn2 v29.4S, v0.4S, v12.4S // ..........................*............................................. - // gap // ........................................................................ - // gap // ........................................................................ - ldr_vo v14, x1, 80 // .e...................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v12.2D, v29.2D, v9.2D // ..............................*......................................... - // gap // ........................................................................ - trn1 v5.2D, v29.2D, v9.2D // ................................*....................................... - // gap // ........................................................................ - ldr_vo v21, x4, -32 // .....................................*.................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v9.8H, v12.8H, v19.8H // ............................................*........................... - // gap // ........................................................................ - // gap // ........................................................................ - ldr_vo v23, x4, -80 // ..................................*..................................... 
- // gap // ........................................................................ - // gap // ........................................................................ - ldr_vo v13, x1, 112 // ...e.................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v19.8H, v12.8H, v23.8H // .............................................*.......................... - // gap // ........................................................................ - // gap // ........................................................................ - ldr_vi v1, x3, 16 // ....e................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v10.8H, v24.8H, v23.8H // ........................................*............................... - // gap // ........................................................................ - mls v9.8H, v19.8H, v7.H[0] // ..............................................*......................... - // gap // ........................................................................ - mul v31.8H, v13.8H, v1.H[0] // ..........e............................................................. - // gap // ........................................................................ - sqrdmulh v19.8H, v13.8H, v1.H[1] // ...........e............................................................ - // gap // ........................................................................ - mls v17.8H, v10.8H, v7.H[0] // .........................................*.............................. 
- // gap // ........................................................................ - sub v28.8H, v5.8H, v9.8H // ...............................................*........................ - // gap // ........................................................................ - ldr_vo v23, x1, 64 // e....................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v15.8H, v28.8H, v3.8H // .......................................................*................ - // gap // ........................................................................ - trn1 v24.2D, v26.2D, v22.2D // ...............................*........................................ - // gap // ........................................................................ - mul v21.8H, v28.8H, v21.8H // ......................................................*................. - // gap // ........................................................................ - add v2.8H, v24.8H, v17.8H // ...........................................*............................ - // gap // ........................................................................ - mls v31.8H, v19.8H, v7.H[0] // ............e........................................................... - // gap // ........................................................................ - add v3.8H, v5.8H, v9.8H // ................................................*....................... - // gap // ........................................................................ - ldr_vo v30, x1, 96 // ..e..................................................................... - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - add v16.8H, v14.8H, v31.8H // ..............e......................................................... - // gap // ........................................................................ - sub v0.8H, v14.8H, v31.8H // .............e.......................................................... - // gap // ........................................................................ - mul v14.8H, v30.8H, v1.H[0] // .....e.................................................................. - // gap // ........................................................................ - sqrdmulh v25.8H, v3.8H, v6.8H // ..................................................*..................... - // gap // ........................................................................ - mls v21.8H, v15.8H, v7.H[0] // ........................................................*............... - // gap // ........................................................................ - sqrdmulh v30.8H, v30.8H, v1.H[1] // ......e................................................................. - // gap // ........................................................................ - sqrdmulh v10.8H, v0.8H, v1.H[5] // .....................e.................................................. - // gap // ........................................................................ - mul v31.8H, v0.8H, v1.H[4] // ....................e................................................... - // gap // ........................................................................ - mul v26.8H, v16.8H, v1.H[2] // ...............e........................................................ - // gap // ........................................................................ 
- mls v14.8H, v30.8H, v7.H[0] // .......e................................................................ - // gap // ........................................................................ - ldr_vi v19, x4, 96 // .................................e...................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v31.8H, v10.8H, v7.H[0] // ......................e................................................. - // gap // ........................................................................ - sub v12.8H, v23.8H, v14.8H // ........e............................................................... - // gap // ........................................................................ - ldr_vo v6, x4, -48 // ....................................e................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v29.8H, v12.8H, v31.8H // ........................e............................................... - // gap // ........................................................................ - sub v9.8H, v12.8H, v31.8H // .......................e................................................ - // gap // ........................................................................ - sub v5.8H, v24.8H, v17.8H // ..........................................*............................. - // gap // ........................................................................ - add v24.8H, v23.8H, v14.8H // .........e.............................................................. 
- // gap // ........................................................................ - trn1 v22.4S, v29.4S, v9.4S // ...........................e............................................ - // gap // ........................................................................ - sub v18.8H, v5.8H, v21.8H // .........................................................*.............. - // gap // ........................................................................ - // gap // ........................................................................ - ldr_vo v27, x4, -160 // ...................................*.................................... - // gap // ........................................................................ - // gap // ........................................................................ - add v17.8H, v5.8H, v21.8H // ..........................................................*............. - // gap // ........................................................................ - sqdmulh v4.8H, v18.8H, v7.H[1] // ....................................................................*... - // gap // ........................................................................ - mul v3.8H, v3.8H, v27.8H // .................................................*...................... - // gap // ........................................................................ - sqdmulh v11.8H, v17.8H, v7.H[1] // .................................................................*...... - // gap // ........................................................................ - sqrdmulh v0.8H, v16.8H, v1.H[3] // ................e....................................................... - // gap // ........................................................................ - srshr v21.8H, v4.8H, #11 // .....................................................................*.. - // gap // ........................................................................ 
- mls v3.8H, v25.8H, v7.H[0] // ...................................................*.................... - // gap // ........................................................................ - srshr v27.8H, v11.8H, #11 // ..................................................................*..... - // gap // ........................................................................ - mls v26.8H, v0.8H, v7.H[0] // .................e...................................................... - // gap // ........................................................................ - mls v18.8H, v21.8H, v7.H[0] // ......................................................................*. - // gap // ........................................................................ - sub v16.8H, v2.8H, v3.8H // ....................................................*................... - // gap // ........................................................................ - add v15.8H, v2.8H, v3.8H // .....................................................*.................. - // gap // ........................................................................ - mls v17.8H, v27.8H, v7.H[0] // ...................................................................*.... - // gap // ........................................................................ - sqdmulh v20.8H, v16.8H, v7.H[1] // ..............................................................*......... - // gap // ........................................................................ - sqdmulh v3.8H, v15.8H, v7.H[1] // ...........................................................*............ - // gap // ........................................................................ - sub v12.8H, v24.8H, v26.8H // ..................e..................................................... - // gap // ........................................................................ 
- add v0.8H, v24.8H, v26.8H // ...................e.................................................... - // gap // ........................................................................ - srshr v30.8H, v20.8H, #11 // ...............................................................*........ - // gap // ........................................................................ - srshr v4.8H, v3.8H, #11 // ............................................................*........... - // gap // ........................................................................ - trn1 v26.4S, v0.4S, v12.4S // .........................e.............................................. - // gap // ........................................................................ - mls v16.8H, v30.8H, v7.H[0] // ................................................................*....... - // gap // ........................................................................ - mls v15.8H, v4.8H, v7.H[0] // .............................................................*.......... - // gap // ........................................................................ - ldr_vo v3, x4, -16 // ......................................e................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v24.2D, v26.2D, v22.2D // .............................e.......................................... - // gap // ........................................................................ - st4 {v15.4S,v16.4S,v17.4S,v18.4S}, [x1], #64 // .......................................................................* - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - - // original source code - // ldr_vo v8, x1, 0 // ...............e............................................................................................................................. || ....................e.................................................................................................................................................. - // ldr_vo v9, x1, 16 // e............................................................................................................................................ || e...................................................................................................................................................................... - // ldr_vo v10, x1, 32 // ......................e...................................................................................................................... || ............................e.......................................................................................................................................... - // ldr_vo v11, x1, 48 // ......e...................................................................................................................................... || .........e............................................................................................................................................................. 
- // ldr_vi v0, x3, 16 // ........e.................................................................................................................................... || ............e.......................................................................................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // .........................e................................................................................................................... || ................................e...................................................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ............................e................................................................................................................ || ...................................e................................................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ................................e............................................................................................................ || .......................................e............................................................................................................................... - // sub v10.8H, v8.8H, v24.8H // ...................................e......................................................................................................... || ...........................................e........................................................................................................................... - // add v8.8H, v8.8H, v24.8H // ........................................e.................................................................................................... 
|| .................................................e..................................................................................................................... - // mul v24.8H, v11.8H, v0.H[0] // ...........e................................................................................................................................. || ................e...................................................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // ............e................................................................................................................................ || .................e..................................................................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ....................e........................................................................................................................ || ..........................e............................................................................................................................................ - // sub v11.8H, v9.8H, v24.8H // ........................e.................................................................................................................... || ...............................e....................................................................................................................................... - // add v9.8H, v9.8H, v24.8H // .......................e..................................................................................................................... || ..............................e........................................................................................................................................ 
- // mul v24.8H, v9.8H, v0.H[2] // ...............................e............................................................................................................. || ......................................e................................................................................................................................ - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ................................................e............................................................................................ || ..........................................................e............................................................................................................ - // mls v24.8H, v9.8H, v7.H[0] // ....................................................e........................................................................................ || ..............................................................e........................................................................................................ - // sub v9.8H, v8.8H, v24.8H // ...........................................................e................................................................................. || .....................................................................e................................................................................................. - // add v8.8H, v8.8H, v24.8H // ............................................................e................................................................................ || ......................................................................e................................................................................................ - // mul v24.8H, v11.8H, v0.H[4] // ..............................e.............................................................................................................. 
|| .....................................e................................................................................................................................. - // sqrdmulh v11.8H, v11.8H, v0.H[5] // .............................e............................................................................................................... || ....................................e.................................................................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ..................................e.......................................................................................................... || ..........................................e............................................................................................................................ - // sub v11.8H, v10.8H, v24.8H // ......................................e...................................................................................................... || ...............................................e....................................................................................................................... - // add v10.8H, v10.8H, v24.8H // .....................................e....................................................................................................... || ..............................................e........................................................................................................................ - // trn1 v25.4S, v8.4S, v9.4S // ...............................................................e............................................................................. || .........................................................................e............................................................................................. 
- // trn2 v26.4S, v8.4S, v9.4S // .......................................................................*..................................................................... || ......................................................................................*................................................................................ - // trn1 v27.4S, v10.4S, v11.4S // .........................................e................................................................................................... || ..................................................e.................................................................................................................... - // trn2 v28.4S, v10.4S, v11.4S // ......................................................................*...................................................................... || .....................................................................................*................................................................................. - // trn2 v10.2D, v25.2D, v27.2D // ...................................................................e......................................................................... || ..............................................................................e........................................................................................ - // trn2 v11.2D, v26.2D, v28.2D // .........................................................................*................................................................... || .........................................................................................*............................................................................. - // trn1 v8.2D, v25.2D, v27.2D // .........................................................................................*................................................... 
|| ..............................................................................................................*........................................................ - // trn1 v9.2D, v26.2D, v28.2D // ..........................................................................*.................................................................. || ..........................................................................................*............................................................................ - // ldr_vi v0, x4, 96 // .................................e........................................................................................................... || ........................................e.............................................................................................................................. - // ldr_vo v4, x4, -80 // .............................................................................*............................................................... || ..............................................................................................*........................................................................ - // ldr_vo v1, x4, -64 // ...................................................................................................................*......................... || ...........................................................................................................................................*........................... - // ldr_vo v5, x4, -48 // ....................................e........................................................................................................ || ............................................e.......................................................................................................................... 
- // ldr_vo v2, x4, -32 // ...........................................................................*................................................................. || ...........................................................................................*........................................................................... - // ldr_vo v6, x4, -16 // ..................................................................e.......................................................................... || ............................................................................e.......................................................................................... - // mul v24.8H, v10.8H, v0.8H // .....................................................................*....................................................................... || ....................................................................................*.................................................................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // .................................................................................*........................................................... || .....................................................................................................*................................................................. - // mls v24.8H, v10.8H, v7.H[0] // .....................................................................................*....................................................... || .........................................................................................................*............................................................. - // sub v10.8H, v8.8H, v24.8H // ...............................................................................................................*............................. 
|| .......................................................................................................................................*............................... - // add v8.8H, v8.8H, v24.8H // ...........................................................................................*................................................. || ................................................................................................................*...................................................... - // mul v24.8H, v11.8H, v0.8H // ............................................................................*................................................................ || .............................................................................................*......................................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // ...............................................................................*............................................................. || ..................................................................................................*.................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ..................................................................................*.......................................................... || ......................................................................................................*................................................................ - // sub v11.8H, v9.8H, v24.8H // ......................................................................................*...................................................... || ..........................................................................................................*............................................................ 
- // add v9.8H, v9.8H, v24.8H // .............................................................................................*............................................... || ..................................................................................................................*.................................................... - // mul v24.8H, v9.8H, v1.8H // ......................................................................................................................*...................... || ...............................................................................................................................................*....................... - // sqrdmulh v9.8H, v9.8H, v5.8H // ..................................................................................................*.......................................... || ........................................................................................................................*.............................................. - // mls v24.8H, v9.8H, v7.H[0] // ..........................................................................................................................*.................. || ...................................................................................................................................................*................... - // sub v9.8H, v8.8H, v24.8H // ..............................................................................................................................*.............. || .......................................................................................................................................................*............... - // add v8.8H, v8.8H, v24.8H // ...............................................................................................................................*............. 
|| ........................................................................................................................................................*.............. - // mul v24.8H, v11.8H, v2.8H // ..........................................................................................*.................................................. || ...............................................................................................................*....................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // ........................................................................................*.................................................... || .............................................................................................................*......................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...................................................................................................*......................................... || .........................................................................................................................*............................................. - // sub v11.8H, v10.8H, v24.8H // ..................................................................................................................*.......................... || ..........................................................................................................................................*............................ - // add v10.8H, v10.8H, v24.8H // ....................................................................................................................*........................ || .............................................................................................................................................*......................... 
- // sqdmulh v25.8H, v8.8H, v7.H[1] // ..................................................................................................................................*.......... || ...........................................................................................................................................................*........... - // srshr v25.8H, v25.8H, #11 // ......................................................................................................................................*...... || ...............................................................................................................................................................*....... - // mls v8.8H, v25.8H, v7.H[0] // .........................................................................................................................................*... || ..................................................................................................................................................................*.... - // sqdmulh v25.8H, v9.8H, v7.H[1] // .................................................................................................................................*........... || ..........................................................................................................................................................*............ - // srshr v25.8H, v25.8H, #11 // .....................................................................................................................................*....... || ..............................................................................................................................................................*........ - // mls v9.8H, v25.8H, v7.H[0] // ........................................................................................................................................*.... 
|| .................................................................................................................................................................*..... - // sqdmulh v25.8H, v10.8H, v7.H[1] // .......................................................................................................................*..................... || ................................................................................................................................................*...................... - // srshr v25.8H, v25.8H, #11 // ...........................................................................................................................*................. || ....................................................................................................................................................*.................. - // mls v10.8H, v25.8H, v7.H[0] // ................................................................................................................................*............ || .........................................................................................................................................................*............. - // sqdmulh v25.8H, v11.8H, v7.H[1] // .....................................................................................................................*....................... || ..............................................................................................................................................*........................ - // srshr v25.8H, v25.8H, #11 // .........................................................................................................................*................... || ..................................................................................................................................................*.................... 
- // mls v11.8H, v25.8H, v7.H[0] // .............................................................................................................................*............... || ......................................................................................................................................................*................ - // st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // ............................................................................................................................................* || ......................................................................................................................................................................* - - subs count, count, #1 + // Instructions: 72 + // Expected cycles: 87 + // Expected IPC: 0.83 + // + // Cycle bound: 87.0 + // IPC bound: 0.83 + // + // Wall time: 306.06s + // User time: 306.06s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q3, [x1, #0] // *....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q24, [x1, #16] // .*...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v29.8H, v3.8H, v16.8H // .........*.............................................................. + // gap // ........................................................................ 
+ ldr q13, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v17.8H, v24.8H, v6.8H // .............*.......................................................... + // gap // ........................................................................ + sub v1.8H, v3.8H, v16.8H // ........*............................................................... + // gap // ........................................................................ + add v18.8H, v24.8H, v6.8H // ..............*......................................................... + // gap // ........................................................................ + sqrdmulh v6.8H, v17.8H, v11.H[5] // ....................*................................................... + // gap // ........................................................................ + mul v25.8H, v17.8H, v11.H[4] // .....................*.................................................. + // gap // ........................................................................ + ldr q14, [x4], #(6*16) // .................................*...................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v16.8H, v18.8H, v11.H[3] // ...............*........................................................ + // gap // ........................................................................ + mls v25.8H, v6.8H, v7.H[0] // ......................*................................................. 
+ // gap // ........................................................................ + mul v18.8H, v18.8H, v11.H[2] // ................*....................................................... + // gap // ........................................................................ + ldr q12, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v10.8H, v1.8H, v25.8H // .......................*................................................ + // gap // ........................................................................ + mls v18.8H, v16.8H, v7.H[0] // .................*...................................................... + // gap // ........................................................................ + add v31.8H, v1.8H, v25.8H // ........................*............................................... + // gap // ........................................................................ + ldr q11, [x3], #16 // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v25.8H, v29.8H, v18.8H // ..................*..................................................... + // gap // ........................................................................ + add v18.8H, v29.8H, v18.8H // ...................*.................................................... + // gap // ........................................................................ 
+ trn1 v24.4S, v31.4S, v10.4S // ...........................*............................................ + // gap // ........................................................................ + trn2 v6.4S, v31.4S, v10.4S // ............................*........................................... + // gap // ........................................................................ + trn1 v31.4S, v18.4S, v25.4S // .........................*.............................................. + // gap // ........................................................................ + trn2 v27.4S, v18.4S, v25.4S // ..........................*............................................. + // gap // ........................................................................ + ldr q25, [x4, #-64] // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v18.2D, v27.2D, v6.2D // ................................*....................................... + // gap // ........................................................................ + trn2 v8.2D, v31.2D, v24.2D // .............................*.......................................... + // gap // ........................................................................ + trn2 v16.2D, v27.2D, v6.2D // ..............................*......................................... + // gap // ........................................................................ + sqrdmulh v6.8H, v8.8H, v20.8H // .......................................*................................ + // gap // ........................................................................ + mul v19.8H, v8.8H, v14.8H // ........................................*............................... 
+ // gap // ........................................................................ + trn1 v29.2D, v31.2D, v24.2D // ...............................*........................................ + // gap // ........................................................................ + mul v26.8H, v16.8H, v14.8H // .............................................*.......................... + // gap // ........................................................................ + sqrdmulh v28.8H, v16.8H, v20.8H // ............................................*........................... + // gap // ........................................................................ + mls v19.8H, v6.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + ldr q6, [x4, #-48] // ....................................*................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v28.8H, v7.H[0] // ..............................................*......................... + // gap // ........................................................................ + ldr q20, [x4, #16] // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v4.8H, v29.8H, v19.8H // ..........................................*............................. + // gap // ........................................................................ 
+ sub v27.8H, v18.8H, v26.8H // ...............................................*........................ + // gap // ........................................................................ + add v18.8H, v18.8H, v26.8H // ................................................*....................... + // gap // ........................................................................ + ldr q26, [x4, #-32] // .....................................*.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.8H, v18.8H, v6.8H // .................................................*...................... + // gap // ........................................................................ + mul v18.8H, v18.8H, v25.8H // ..................................................*..................... + // gap // ........................................................................ + mul v26.8H, v27.8H, v26.8H // .......................................................*................ + // gap // ........................................................................ + sqrdmulh v22.8H, v27.8H, v2.8H // ......................................................*................. + // gap // ........................................................................ + add v29.8H, v29.8H, v19.8H // ...........................................*............................ + // gap // ........................................................................ + mls v18.8H, v14.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + mul v16.8H, v13.8H, v11.H[0] // ......e................................................................. 
+ // gap // ........................................................................ + mls v26.8H, v22.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + mul v6.8H, v12.8H, v11.H[0] // ...........e............................................................ + // gap // ........................................................................ + sub v25.8H, v29.8H, v18.8H // ....................................................*................... + // gap // ........................................................................ + add v24.8H, v29.8H, v18.8H // .....................................................*.................. + // gap // ........................................................................ + sqrdmulh v29.8H, v12.8H, v11.H[1] // ..........e............................................................. + // gap // ........................................................................ + sub v27.8H, v4.8H, v26.8H // .........................................................*.............. + // gap // ........................................................................ + sqdmulh v19.8H, v24.8H, v7.H[1] // ...........................................................*............ + // gap // ........................................................................ + add v26.8H, v4.8H, v26.8H // ..........................................................*............. + // gap // ........................................................................ + sqdmulh v14.8H, v27.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + mls v6.8H, v29.8H, v7.H[0] // ............e........................................................... 
+ // gap // ........................................................................ + srshr v18.8H, v19.8H, #11 // ............................................................*........... + // gap // ........................................................................ + sqdmulh v29.8H, v25.8H, v7.H[1] // ..............................................................*......... + // gap // ........................................................................ + sqdmulh v31.8H, v26.8H, v7.H[1] // .................................................................*...... + // gap // ........................................................................ + mls v24.8H, v18.8H, v7.H[0] // .............................................................*.......... + // gap // ........................................................................ + srshr v18.8H, v14.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + srshr v0.8H, v29.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + srshr v31.8H, v31.8H, #11 // ..................................................................*..... + // gap // ........................................................................ + mls v27.8H, v18.8H, v7.H[0] // ......................................................................*. + // gap // ........................................................................ + sqrdmulh v18.8H, v13.8H, v11.H[1] // .....e.................................................................. + // gap // ........................................................................ + mls v25.8H, v0.8H, v7.H[0] // ................................................................*....... + // gap // ........................................................................ 
+ mls v26.8H, v31.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + ldr q2, [x4, #80] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v16.8H, v18.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ + // --------------------------------------------------------------- new position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x1, #(16*0)] // .....................................................................*....................................................................... + // ldr q9, [x1, #(16*1)] // .....................................................................'*...................................................................... + // ldr q10, [x1, #(16*2)] // e....................................................................'..~.................................................................... + // ldr q11, [x1, #(16*3)] // ..........e..........................................................'............~.......................................................... + // ldr q0, [x3], #16 // ..............e......................................................'................~...................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...............................................................e.....'.................................................................~..... + // mul v24.8h, v10.8h, v0.h[0] // ............................................e........................'..............................................~........................ + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................e.'.....................................................................~. + // sub v10.8h, v8.8h, v24.8h // ..~..................................................................'....*.................................................................. 
+ // add v8.8h, v8.8h, v24.8h // .....................................................................'.*..................................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .................................................e...................'...................................................~................... + // mul v24.8h, v11.8h, v0.h[0] // ..............................................e......................'................................................~...................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................e..............'........................................................~.............. + // sub v11.8h, v9.8h, v24.8h // .~...................................................................'...*................................................................... + // add v9.8h, v9.8h, v24.8h // ...~.................................................................'.....*................................................................. + // sqrdmulh v27.8h, v9.8h, v0.h[3] // .......~.............................................................'.........*............................................................. + // mul v24.8h, v9.8h, v0.h[2] // .........~...........................................................'...........*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............~........................................................'..............*........................................................ + // sub v9.8h, v8.8h, v24.8h // ...............~.....................................................'.................*..................................................... + // add v8.8h, v8.8h, v24.8h // ................~....................................................'..................*.................................................... 
+ // sqrdmulh v27.8h, v11.8h, v0.h[5] // ....~................................................................'......*................................................................ + // mul v24.8h, v11.8h, v0.h[4] // .....~...............................................................'.......*............................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........~............................................................'..........*............................................................ + // sub v11.8h, v10.8h, v24.8h // ...........~.........................................................'.............*......................................................... + // add v10.8h, v10.8h, v24.8h // .............~.......................................................'...............*....................................................... + // trn1 v25.4s, v8.4s, v9.4s // ...................~.................................................'.....................*................................................. + // trn2 v26.4s, v8.4s, v9.4s // ....................~................................................'......................*................................................ + // trn1 v27.4s, v10.4s, v11.4s // .................~...................................................'...................*................................................... + // trn2 v28.4s, v10.4s, v11.4s // ..................~..................................................'....................*.................................................. + // trn2 v10.2d, v25.2d, v27.2d // .......................~.............................................'.........................*............................................. + // trn2 v11.2d, v26.2d, v28.2d // ........................~............................................'..........................*............................................ 
+ // trn1 v8.2d, v25.2d, v27.2d // ...........................~.........................................'.............................*......................................... + // trn1 v9.2d, v26.2d, v28.2d // ......................~..............................................'........................*.............................................. + // ldr q0, [ x4], #(6*16) // ......~..............................................................'........*.............................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .................................e...................................'...................................~................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .....................~...............................................'.......................*............................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ...............................~.....................................'.................................*..................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // .....................................~...............................'.......................................*............................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..................................................................e..'....................................................................~.. + // sqrdmulh v27.8h, v10.8h, v4.8h // .........................~...........................................'...........................*........................................... + // mul v24.8h, v10.8h, v0.8h // ..........................~..........................................'............................*.......................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................~......................................'................................*...................................... 
+ // sub v10.8h, v8.8h, v24.8h // ..................................~..................................'....................................*.................................. + // add v8.8h, v8.8h, v24.8h // ..........................................~..........................'............................................*.......................... + // sqrdmulh v27.8h, v11.8h, v4.8h // .............................~.......................................'...............................*....................................... + // mul v24.8h, v11.8h, v0.8h // ............................~........................................'..............................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................~....................................'..................................*.................................... + // sub v11.8h, v9.8h, v24.8h // ...................................~.................................'.....................................*................................. + // add v9.8h, v9.8h, v24.8h // ....................................~................................'......................................*................................ + // sqrdmulh v27.8h, v9.8h, v5.8h // ......................................~..............................'........................................*.............................. + // mul v24.8h, v9.8h, v1.8h // .......................................~.............................'.........................................*............................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................................~.........................'.............................................*......................... + // sub v9.8h, v8.8h, v24.8h // ...............................................~.....................'.................................................*..................... 
+ // add v8.8h, v8.8h, v24.8h // ................................................~....................'..................................................*.................... + // sqrdmulh v27.8h, v11.8h, v6.8h // .........................................~...........................'...........................................*........................... + // mul v24.8h, v11.8h, v2.8h // ........................................~............................'..........................................*............................ + // mls v24.8h, v27.8h, v7.h[0] // .............................................~.......................'...............................................*....................... + // sub v11.8h, v10.8h, v24.8h // ..................................................~..................'....................................................*.................. + // add v10.8h, v10.8h, v24.8h // ....................................................~................'......................................................*................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................................................~.................'.....................................................*................. + // srshr v25.8h, v25.8h, #11 // .......................................................~.............'.........................................................*............. + // mls v8.8h, v25.8h, v7.h[0] // ..........................................................~..........'............................................................*.......... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................................................~............'..........................................................*............ + // srshr v25.8h, v25.8h, #11 // ............................................................~........'..............................................................*........ 
+ // mls v9.8h, v25.8h, v7.h[0] // ................................................................~....'..................................................................*.... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .........................................................~...........'...........................................................*........... + // srshr v25.8h, v25.8h, #11 // .............................................................~.......'...............................................................*....... + // mls v10.8h, v25.8h, v7.h[0] // .................................................................~...'...................................................................*... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .....................................................~...............'.......................................................*............... + // srshr v25.8h, v25.8h, #11 // ...........................................................~.........'.............................................................*......... + // mls v11.8h, v25.8h, v7.h[0] // ..............................................................~......'................................................................*...... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ....................................................................~'......................................................................* + + sub count, count, #1 cbnz count, layer4567_start - trn2 v11.4S, v29.4S, v9.4S // .*....................................... - // gap // ......................................... - trn2 v21.4S, v0.4S, v12.4S // ..*...................................... - // gap // ......................................... - ldr_vo v10, x4, -80 // .......*................................. - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... 
- trn2 v20.2D, v21.2D, v11.2D // ...*..................................... - // gap // ......................................... - mul v4.8H, v24.8H, v19.8H // *........................................ - // gap // ......................................... - mul v25.8H, v20.8H, v19.8H // ......*.................................. - // gap // ......................................... - sqrdmulh v20.8H, v20.8H, v10.8H // ........*................................ - // gap // ......................................... - sqrdmulh v13.8H, v24.8H, v10.8H // .........*............................... - // gap // ......................................... - ldr_vo v28, x4, -32 // .....*................................... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - mls v25.8H, v20.8H, v7.H[0] // ..........*.............................. - // gap // ......................................... - trn1 v21.2D, v21.2D, v11.2D // ....*.................................... - // gap // ......................................... - ldr_vo v2, x4, -64 // ......................*.................. - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - add v11.8H, v21.8H, v25.8H // .................*....................... - // gap // ......................................... - mls v4.8H, v13.8H, v7.H[0] // ...........*............................. - // gap // ......................................... - sub v31.8H, v21.8H, v25.8H // ............*............................ - // gap // ......................................... - mul v2.8H, v11.8H, v2.8H // .........................*............... - // gap // ......................................... - sqrdmulh v11.8H, v11.8H, v6.8H // ..................*...................... 
- // gap // ......................................... - mul v27.8H, v31.8H, v28.8H // ...............*......................... - // gap // ......................................... - sqrdmulh v21.8H, v31.8H, v3.8H // .............*........................... - // gap // ......................................... - trn1 v18.2D, v26.2D, v22.2D // ..............*.......................... - // gap // ......................................... - mls v2.8H, v11.8H, v7.H[0] // ............................*............ - // gap // ......................................... - add v11.8H, v18.8H, v4.8H // ................*........................ - // gap // ......................................... - mls v27.8H, v21.8H, v7.H[0] // ...................*..................... - // gap // ......................................... - sub v3.8H, v18.8H, v4.8H // ....................*.................... - // gap // ......................................... - sub v16.8H, v11.8H, v2.8H // ...............................*......... - // gap // ......................................... - add v15.8H, v11.8H, v2.8H // ................................*........ - // gap // ......................................... - sub v18.8H, v3.8H, v27.8H // .....................*................... - // gap // ......................................... - add v17.8H, v3.8H, v27.8H // .......................*................. - // gap // ......................................... - sqdmulh v30.8H, v16.8H, v7.H[1] // ..................................*...... - // gap // ......................................... - sqdmulh v11.8H, v15.8H, v7.H[1] // ...................................*..... - // gap // ......................................... - sqdmulh v3.8H, v18.8H, v7.H[1] // ........................*................ - // gap // ......................................... - sqdmulh v20.8H, v17.8H, v7.H[1] // ..........................*.............. - // gap // ......................................... 
- srshr v21.8H, v30.8H, #11 // ....................................*.... - // gap // ......................................... - srshr v13.8H, v11.8H, #11 // .....................................*... - // gap // ......................................... - srshr v11.8H, v3.8H, #11 // ...........................*............. - // gap // ......................................... - mls v16.8H, v21.8H, v7.H[0] // ......................................*.. - // gap // ......................................... - srshr v21.8H, v20.8H, #11 // .............................*........... - // gap // ......................................... - mls v18.8H, v11.8H, v7.H[0] // ..............................*.......... - // gap // ......................................... - mls v15.8H, v13.8H, v7.H[0] // .......................................*. - // gap // ......................................... - mls v17.8H, v21.8H, v7.H[0] // .................................*....... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - st4 {v15.4S,v16.4S,v17.4S,v18.4S}, [x1], #64 // ........................................* - // gap // ......................................... - - // original source code - // mul v17.8H, v24.8H, v19.8H // ....*.................................... || .....*......................................... - // trn2 v9.4S, v29.4S, v9.4S // *........................................ || *.............................................. - // trn2 v29.4S, v0.4S, v12.4S // .*....................................... || .*............................................. - // trn2 v12.2D, v29.2D, v9.2D // ...*..................................... 
|| ....*.......................................... - // trn1 v5.2D, v29.2D, v9.2D // ..........*.............................. || ............*.................................. - // ldr_vo v21, x4, -32 // ........*................................ || .........*..................................... - // mul v9.8H, v12.8H, v19.8H // .....*................................... || ......*........................................ - // ldr_vo v23, x4, -80 // ..*...................................... || ..*............................................ - // sqrdmulh v19.8H, v12.8H, v23.8H // ......*.................................. || .......*....................................... - // sqrdmulh v10.8H, v24.8H, v23.8H // .......*................................. || ........*...................................... - // mls v9.8H, v19.8H, v7.H[0] // .........*............................... || ...........*................................... - // mls v17.8H, v10.8H, v7.H[0] // .............*........................... || ................*.............................. - // sub v28.8H, v5.8H, v9.8H // ..............*.......................... || .................*............................. - // sqrdmulh v15.8H, v28.8H, v3.8H // ..................*...................... || .....................*......................... - // trn1 v24.2D, v26.2D, v22.2D // ...................*..................... || ......................*........................ - // mul v21.8H, v28.8H, v21.8H // .................*....................... || ....................*.......................... - // add v2.8H, v24.8H, v17.8H // .....................*................... || ........................*...................... - // add v3.8H, v5.8H, v9.8H // ............*............................ || ...............*............................... - // sqrdmulh v25.8H, v3.8H, v6.8H // ................*........................ || ...................*........................... 
- // mls v21.8H, v15.8H, v7.H[0] // ......................*.................. || .........................*..................... - // sub v5.8H, v24.8H, v17.8H // .......................*................. || ..........................*.................... - // sub v18.8H, v5.8H, v21.8H // ..........................*.............. || .............................*................. - // ldr_vo v27, x4, -64 // ...........*............................. || .............*................................. - // add v17.8H, v5.8H, v21.8H // ...........................*............. || ..............................*................ - // sqdmulh v4.8H, v18.8H, v7.H[1] // ..............................*.......... || .................................*............. - // mul v3.8H, v3.8H, v27.8H // ...............*......................... || ..................*............................ - // sqdmulh v11.8H, v17.8H, v7.H[1] // ...............................*......... || ..................................*............ - // srshr v21.8H, v4.8H, #11 // ..................................*...... || .....................................*......... - // mls v3.8H, v25.8H, v7.H[0] // ....................*.................... || .......................*....................... - // srshr v27.8H, v11.8H, #11 // ....................................*.... || .......................................*....... - // mls v18.8H, v21.8H, v7.H[0] // .....................................*... || ........................................*...... - // sub v16.8H, v2.8H, v3.8H // ........................*................ || ...........................*................... - // add v15.8H, v2.8H, v3.8H // .........................*............... || ............................*.................. - // mls v17.8H, v27.8H, v7.H[0] // .......................................*. || ..........................................*.... - // sqdmulh v20.8H, v16.8H, v7.H[1] // ............................*............ 
|| ...............................*............... - // sqdmulh v3.8H, v15.8H, v7.H[1] // .............................*........... || ................................*.............. - // srshr v30.8H, v20.8H, #11 // ................................*........ || ...................................*........... - // srshr v4.8H, v3.8H, #11 // .................................*....... || ....................................*.......... - // mls v16.8H, v30.8H, v7.H[0] // ...................................*..... || ......................................*........ - // mls v15.8H, v4.8H, v7.H[0] // ......................................*.. || .........................................*..... - // st4 {v15.4S,v16.4S,v17.4S,v18.4S}, [x1], #64 // ........................................* || ..............................................* - + // Instructions: 61 + // Expected cycles: 70 + // Expected IPC: 0.87 + // + // Cycle bound: 70.0 + // IPC bound: 0.87 + // + // Wall time: 14.77s + // User time: 14.77s + // + // -------------------- original position ---------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + ldr q30, [x1, #16] // .*........................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + ldr q22, [x1, #0] // *............................................................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v18.8H, v30.8H, v6.8H // ...*......................................................... + // gap // ............................................................. 
+ add v29.8H, v30.8H, v6.8H // .....*....................................................... + // gap // ............................................................. + add v14.8H, v22.8H, v16.8H // ..*.......................................................... + // gap // ............................................................. + mul v12.8H, v18.8H, v11.H[4] // .......*..................................................... + // gap // ............................................................. + sqrdmulh v18.8H, v18.8H, v11.H[5] // ......*...................................................... + // gap // ............................................................. + mul v26.8H, v29.8H, v11.H[2] // ...........*................................................. + // gap // ............................................................. + sqrdmulh v8.8H, v29.8H, v11.H[3] // .........*................................................... + // gap // ............................................................. + sub v15.8H, v22.8H, v16.8H // ....*........................................................ + // gap // ............................................................. + mls v12.8H, v18.8H, v7.H[0] // ..........*.................................................. + // gap // ............................................................. + ldr q29, [x4, #64] // ....................................*........................ + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + mls v26.8H, v8.8H, v7.H[0] // .............*............................................... + // gap // ............................................................. + add v18.8H, v15.8H, v12.8H // ..............*.............................................. 
+ // gap // ............................................................. + ldr q11, [x4, #32] // .....................*....................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v6.8H, v15.8H, v12.8H // ............*................................................ + // gap // ............................................................. + add v16.8H, v14.8H, v26.8H // ................*............................................ + // gap // ............................................................. + sub v31.8H, v14.8H, v26.8H // ...............*............................................. + // gap // ............................................................. + trn2 v19.4S, v18.4S, v6.4S // ..................*.......................................... + // gap // ............................................................. + trn1 v12.4S, v18.4S, v6.4S // .................*........................................... + // gap // ............................................................. + trn2 v26.4S, v16.4S, v31.4S // ....................*........................................ + // gap // ............................................................. + ldr q22, [x4], #(6*16) // ........*.................................................... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + trn2 v6.2D, v26.2D, v19.2D // ........................*.................................... + // gap // ............................................................. + trn1 v8.4S, v16.4S, v31.4S // ...................*......................................... 
+ // gap // ............................................................. + sqrdmulh v18.8H, v6.8H, v20.8H // .............................*............................... + // gap // ............................................................. + mul v6.8H, v6.8H, v22.8H // ............................*................................ + // gap // ............................................................. + trn2 v31.2D, v8.2D, v12.2D // .......................*..................................... + // gap // ............................................................. + trn1 v19.2D, v26.2D, v19.2D // ......................*...................................... + // gap // ............................................................. + mul v16.8H, v31.8H, v22.8H // ..........................*.................................. + // gap // ............................................................. + mls v6.8H, v18.8H, v7.H[0] // ................................*............................ + // gap // ............................................................. + sqrdmulh v18.8H, v31.8H, v20.8H // .........................*................................... + // gap // ............................................................. + ldr q26, [x4, #-48] // ...............................*............................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + sub v25.8H, v19.8H, v6.8H // ..................................*.......................... + // gap // ............................................................. + mls v16.8H, v18.8H, v7.H[0] // ..............................*.............................. + // gap // ............................................................. + add v6.8H, v19.8H, v6.8H // ...................................*......................... 
+ // gap // ............................................................. + sqrdmulh v18.8H, v25.8H, v2.8H // ........................................*.................... + // gap // ............................................................. + mul v29.8H, v25.8H, v29.8H // .......................................*..................... + // gap // ............................................................. + mul v31.8H, v6.8H, v11.8H // ......................................*...................... + // gap // ............................................................. + sqrdmulh v6.8H, v6.8H, v26.8H // .....................................*....................... + // gap // ............................................................. + trn1 v25.2D, v8.2D, v12.2D // ...........................*................................. + // gap // ............................................................. + mls v29.8H, v18.8H, v7.H[0] // ...........................................*................. + // gap // ............................................................. + sub v26.8H, v25.8H, v16.8H // .................................*........................... + // gap // ............................................................. + mls v31.8H, v6.8H, v7.H[0] // ..........................................*.................. + // gap // ............................................................. + add v6.8H, v25.8H, v16.8H // .........................................*................... + // gap // ............................................................. + sub v28.8H, v26.8H, v29.8H // ..............................................*.............. + // gap // ............................................................. + add v27.8H, v26.8H, v29.8H // ................................................*............ + // gap // ............................................................. 
+ sub v26.8H, v6.8H, v31.8H // ............................................*................ + // gap // ............................................................. + sqdmulh v29.8H, v28.8H, v7.H[1] // .................................................*........... + // gap // ............................................................. + sqdmulh v18.8H, v27.8H, v7.H[1] // ....................................................*........ + // gap // ............................................................. + add v25.8H, v6.8H, v31.8H // .............................................*............... + // gap // ............................................................. + sqdmulh v6.8H, v26.8H, v7.H[1] // ...................................................*......... + // gap // ............................................................. + srshr v29.8H, v29.8H, #11 // ......................................................*...... + // gap // ............................................................. + sqdmulh v13.8H, v25.8H, v7.H[1] // ...............................................*............. + // gap // ............................................................. + srshr v18.8H, v18.8H, #11 // ........................................................*.... + // gap // ............................................................. + mls v28.8H, v29.8H, v7.H[0] // .........................................................*... + // gap // ............................................................. + srshr v29.8H, v6.8H, #11 // .......................................................*..... + // gap // ............................................................. + srshr v6.8H, v13.8H, #11 // ..................................................*.......... + // gap // ............................................................. + mls v27.8H, v18.8H, v7.H[0] // ...........................................................*. 
+ // gap // ............................................................. + mls v26.8H, v29.8H, v7.H[0] // ..........................................................*.. + // gap // ............................................................. + mls v25.8H, v6.8H, v7.H[0] // .....................................................*....... + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + // gap // ............................................................. + st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ............................................................* + // gap // ............................................................. + + // ----------------------- new position -----------------------> + // 0 25 50 + // |------------------------|------------------------|---------- + // ldr q3, [x1, #0] // .*........................................................... + // ldr q24, [x1, #16] // *............................................................ + // add v29.8H, v3.8H, v16.8H // ....*........................................................ + // sub v17.8H, v24.8H, v6.8H // ..*.......................................................... + // sub v1.8H, v3.8H, v16.8H // .........*................................................... + // add v18.8H, v24.8H, v6.8H // ...*......................................................... + // sqrdmulh v6.8H, v17.8H, v11.H[5] // ......*...................................................... + // mul v25.8H, v17.8H, v11.H[4] // .....*....................................................... 
+ // ldr q14, [x4], #(6*16) // .....................*....................................... + // sqrdmulh v16.8H, v18.8H, v11.H[3] // ........*.................................................... + // mls v25.8H, v6.8H, v7.H[0] // ..........*.................................................. + // mul v18.8H, v18.8H, v11.H[2] // .......*..................................................... + // sub v10.8H, v1.8H, v25.8H // ...............*............................................. + // mls v18.8H, v16.8H, v7.H[0] // ............*................................................ + // add v31.8H, v1.8H, v25.8H // .............*............................................... + // sub v25.8H, v29.8H, v18.8H // .................*........................................... + // add v18.8H, v29.8H, v18.8H // ................*............................................ + // trn1 v24.4S, v31.4S, v10.4S // ...................*......................................... + // trn2 v6.4S, v31.4S, v10.4S // ..................*.......................................... + // trn1 v31.4S, v18.4S, v25.4S // .......................*..................................... + // trn2 v27.4S, v18.4S, v25.4S // ....................*........................................ + // ldr q25, [x4, #-64] // ..............*.............................................. + // trn1 v18.2D, v27.2D, v6.2D // ...........................*................................. + // trn2 v8.2D, v31.2D, v24.2D // ..........................*.................................. + // trn2 v16.2D, v27.2D, v6.2D // ......................*...................................... + // sqrdmulh v6.8H, v8.8H, v20.8H // ..............................*.............................. + // mul v19.8H, v8.8H, v14.8H // ............................*................................ + // trn1 v29.2D, v31.2D, v24.2D // .......................................*..................... 
+ // mul v26.8H, v16.8H, v14.8H // .........................*................................... + // sqrdmulh v28.8H, v16.8H, v20.8H // ........................*.................................... + // mls v19.8H, v6.8H, v7.H[0] // .................................*........................... + // ldr q6, [x4, #-48] // ...............................*............................. + // mls v26.8H, v28.8H, v7.H[0] // .............................*............................... + // sub v4.8H, v29.8H, v19.8H // .........................................*................... + // sub v27.8H, v18.8H, v26.8H // ................................*............................ + // add v18.8H, v18.8H, v26.8H // ..................................*.......................... + // ldr q26, [x4, #-32] // ...........*................................................. + // sqrdmulh v14.8H, v18.8H, v6.8H // ......................................*...................... + // mul v18.8H, v18.8H, v25.8H // .....................................*....................... + // mul v26.8H, v27.8H, v26.8H // ....................................*........................ + // sqrdmulh v22.8H, v27.8H, v2.8H // ...................................*......................... + // add v29.8H, v29.8H, v19.8H // ...........................................*................. + // mls v18.8H, v14.8H, v7.H[0] // ..........................................*.................. + // mls v26.8H, v22.8H, v7.H[0] // ........................................*.................... + // sub v25.8H, v29.8H, v18.8H // ..............................................*.............. + // add v24.8H, v29.8H, v18.8H // .................................................*........... + // sub v27.8H, v4.8H, v26.8H // ............................................*................ + // sqdmulh v19.8H, v24.8H, v7.H[1] // ....................................................*........ 
+ // add v26.8H, v4.8H, v26.8H // .............................................*............... + // sqdmulh v14.8H, v27.8H, v7.H[1] // ...............................................*............. + // srshr v18.8H, v19.8H, #11 // ........................................................*.... + // sqdmulh v29.8H, v25.8H, v7.H[1] // ..................................................*.......... + // sqdmulh v31.8H, v26.8H, v7.H[1] // ................................................*............ + // mls v24.8H, v18.8H, v7.H[0] // ...........................................................*. + // srshr v18.8H, v14.8H, #11 // ...................................................*......... + // srshr v0.8H, v29.8H, #11 // .......................................................*..... + // srshr v31.8H, v31.8H, #11 // .....................................................*....... + // mls v27.8H, v18.8H, v7.H[0] // ......................................................*...... + // mls v25.8H, v0.8H, v7.H[0] // ..........................................................*.. + // mls v26.8H, v31.8H, v7.H[0] // .........................................................*... + // st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // ............................................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_opt_a72.s b/examples/opt/aarch64/ntt_kyber_123_4567_opt_a72.s index f6226f02..35c2f995 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_opt_a72.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. 
-// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -67,15 +44,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +61,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -98,48 +69,48 @@ .macro barrett_reduce a vqdmulhq t0, \a, consts, 1 - srshr t0.8H, t0.8H, #11 + srshr t0.8h, t0.8h, #11 vmlsq \a, t0, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo 
root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -150,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -160,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // 
slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -168,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -179,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -204,7 +175,7 @@ roots: .text .global ntt_kyber_123_4567_opt_a72 - .global _ntt_kyber_123_4567_opt_a72 + .global _ntt_kyber_123_4567 .p2align 4 const_addr: .short 3329 @@ -330,1265 +301,1356 @@ _ntt_kyber_123_4567_opt_a72: load_roots_123 .p2align 2 - ldr_vo v16, x0, 192 // ..*......... - ldr_vo v11, x0, 448 // .*.......... - // gap // ............ - ldr_vo v18, x0, 256 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v15.8H, v11.8H, v0.H[1] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v11.8H, v11.8H, v0.H[0] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v4.8H, v18.8H, v0.H[1] // ........*... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ 
- mls v11.8H, v15.8H, v7.H[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v27.8H, v18.8H, v0.H[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v27.8H, v4.8H, v7.H[0] // ...........* - // gap // ............ - // gap // ............ - add v4.8H, v16.8H, v11.8H // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v24.8H, v4.8H, v0.H[3] // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v10.8H, v4.8H, v0.H[2] // ..........*. - // gap // ............ - // gap // ............ - - // original source code - // ldr_vo v31, x0, 256 // ..*......... || .*................... - // ldr_vo v5, x0, 448 // .*.......... || *.................... - // ldr_vo v16, x0, 192 // *........... || *.................... - // mul v11.8H, v5.8H, v0.H[0] // ....*....... || ......*.............. - // sqrdmulh v24.8H, v5.8H, v0.H[1] // ...*........ || ....*................ - // mls v11.8H, v24.8H, v7.H[0] // ......*..... || ..........*.......... - // mul v27.8H, v31.8H, v0.H[0] // .......*.... || ............*........ - // add v22.8H, v16.8H, v11.8H // .........*.. || ...............*..... - // sqrdmulh v14.8H, v31.8H, v0.H[1] // .....*...... || ........*............ - // sqrdmulh v24.8H, v22.8H, v0.H[3] // ..........*. || ..................*.. - // mul v10.8H, v22.8H, v0.H[2] // ...........* || ....................* - // mls v27.8H, v14.8H, v7.H[0] // ........*... || ..............*...... 
- + // Instructions: 12 + // Expected cycles: 15 + // Expected IPC: 0.80 + // + // Cycle bound: 15.0 + // IPC bound: 0.80 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q20, [x0, #64] // ...*.......................... + ldr q15, [x0, #320] // .*............................ + // gap // .............................. + ldr q16, [x0, #128] // ....*......................... + // gap // .............................. + // gap // .............................. + ldr q28, [x0, #256] // ......*....................... + // gap // .............................. + // gap // .............................. + ldr q17, [x0, #0] // ...........*.................. + // gap // .............................. + // gap // .............................. + sqrdmulh v27.8H, v15.8H, v0.H[1] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v18.8H, v15.8H, v0.H[0] // .....*........................ + ldr q15, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v18.8H, v27.8H, v7.H[0] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v8.8H, v15.8H, v0.H[1] // ........*..................... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v12.8H, v15.8H, v0.H[0] // .........*.................... + // gap // .............................. + // gap // .............................. + add v3.8H, v20.8H, v18.8H // ..........*................... + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0, #448] // .......*....................... + // ldr q29, [x0, #320] // .*............................. + // sqrdmulh v31.8H, v29.8H, v0.H[1] // .....*......................... + // ldr q20, [x0, #64] // *.............................. + // ldr q16, [x0, #128] // ..*............................ + // mul v18.8H, v29.8H, v0.H[0] // ......*........................ + // ldr q28, [x0, #256] // ...*........................... + // mls v18.8H, v31.8H, v7.H[0] // ........*...................... + // sqrdmulh v8.8H, v6.8H, v0.H[1] // .........*..................... + // mul v12.8H, v6.8H, v0.H[0] // ..........*.................... + // add v3.8H, v20.8H, v18.8H // ...........*................... + // ldr q17, [x0, #0] // ....*.......................... + sub count, count, #1 -.p2align 2 layer123_start: - ldr_vo v4, x0, 0 // *........................................................................... - sub v11.8H, v16.8H, v11.8H // ..........................*................................................. - ldr_vo v18, x0, 128 // ..*......................................................................... - ldr_vo v22, x0, 384 // ......*..................................................................... - ldr_vo v14, x0, 64 // .*.......................................................................... - mls v10.8H, v24.8H, v7.H[0] // ...................................*........................................ 
- ldr_vo v31, x0, 272 // ....e....................................................................... - ldr_vo v24, x0, 320 // .....*...................................................................... - // gap // ............................................................................ - mul v8.8H, v11.8H, v0.H[4] // ...........................................*................................ - ldr_vo v5, x0, 464 // .......e.................................................................... - ldr_vo v16, x0, 208 // ...e........................................................................ - sub v23.8H, v4.8H, v27.8H // ...........*................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v4.8H, v4.8H, v27.8H // ............*............................................................... - sqrdmulh v11.8H, v11.8H, v0.H[5] // ............................................*............................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v27.8H, v22.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - mul v22.8H, v22.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v26.8H, v24.8H, v0.H[1] // ..............*............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v22.8H, v27.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v24.8H, v24.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v24.8H, v26.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v27.8H, v18.8H, v22.8H // .....................*...................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v18.8H, v18.8H, v22.8H // ......................*..................................................... - mls v8.8H, v11.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v11.8H, v5.8H, v0.H[0] // .......................e.................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v22.8H, v14.8H, v24.8H // .................*.......................................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - sub v14.8H, v14.8H, v24.8H // ................*........................................................... - sqrdmulh v24.8H, v5.8H, v0.H[1] // ........................e................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v5.8H, v27.8H, v0.H[5] // .......................................*.................................... - sub v26.8H, v22.8H, v10.8H // ....................................*....................................... - // gap // ............................................................................ - add v22.8H, v22.8H, v10.8H // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v10.8H, v18.8H, v0.H[3] // .............................*.............................................. - sub v15.8H, v14.8H, v8.8H // ..............................................*............................. - // gap // ............................................................................ - add v14.8H, v14.8H, v8.8H // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v8.8H, v27.8H, v0.H[4] // ......................................*..................................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v8.8H, v5.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v18.8H, v18.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v18.8H, v10.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v10.8H, v23.8H, v8.8H // .........................................*.................................. 
- // gap // ............................................................................ - // gap // ............................................................................ - add v8.8H, v23.8H, v8.8H // ..........................................*................................. - mul v5.8H, v22.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v22.8H, v22.8H, v0.H[7] // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v23.8H, v4.8H, v18.8H // ...............................*............................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v4.8H, v4.8H, v18.8H // ................................*........................................... - mul v18.8H, v26.8H, v1.H[0] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- sqrdmulh v27.8H, v26.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v22.8H, v7.H[0] // ..................................................*......................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v22.8H, v14.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v18.8H, v27.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ 
- sub v27.8H, v4.8H, v5.8H // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v14.8H, v14.8H, v1.H[3] // ...........................................................*................ - add v4.8H, v4.8H, v5.8H // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v27, x0, 64 // .....................................................................*...... - mul v5.8H, v15.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - str_vi v4, x0, 16 // ....................................................................*....... - sub v4.8H, v23.8H, v18.8H // ........................................................*................... - // gap // ............................................................................ - sqrdmulh v27.8H, v15.8H, v1.H[5] // ................................................................*........... - add v18.8H, v23.8H, v18.8H // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - mls v22.8H, v14.8H, v7.H[0] // ............................................................*............... - str_vo v4, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - str_vo v18, x0, 112 // ......................................................................*..... - // gap // ............................................................................ - // gap // ............................................................................ - mls v11.8H, v24.8H, v7.H[0] // .........................e.................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v27.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - // gap // ............................................................................ - sub v4.8H, v8.8H, v22.8H // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v18.8H, v8.8H, v22.8H // ..............................................................*............. 
- mul v27.8H, v31.8H, v0.H[0] // ........e................................................................... - // gap // ............................................................................ - add v22.8H, v16.8H, v11.8H // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v14.8H, v31.8H, v0.H[1] // .........e.................................................................. - str_vo v4, x0, 304 // .........................................................................*.. - // gap // ............................................................................ - str_vo v18, x0, 240 // ........................................................................*... - sub v4.8H, v10.8H, v5.8H // ..................................................................*......... - // gap // ............................................................................ - add v18.8H, v10.8H, v5.8H // ...................................................................*........ - sqrdmulh v24.8H, v22.8H, v0.H[3] // ..................................e......................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v10.8H, v22.8H, v0.H[2] // .................................e.......................................... - str_vo v4, x0, 432 // ...........................................................................* - // gap // ............................................................................ 
- str_vo v18, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - mls v27.8H, v14.8H, v7.H[0] // ..........e................................................................. - // gap // ............................................................................ - // gap // ............................................................................ - - // original source code - // ldr_vo v8, x0, 0 // ......................................................................*.......................................................................... || ......................................................................*...................................................................... - // ldr_vo v9, x0, 64 // ..........................................................................*...................................................................... || .......................................................................*..................................................................... - // ldr_vo v10, x0, 128 // ........................................................................*........................................................................ || ......................................................................*...................................................................... - // ldr_vo v11, x0, 192 // ....e............................................................................................................................................ || .e........................................................................................................................................... 
- // ldr_vo v12, x0, 256 // e................................................................................................................................................ || e............................................................................................................................................ - // ldr_vo v13, x0, 320 // .............................................................................*................................................................... || ........................................................................*.................................................................... - // ldr_vo v14, x0, 384 // .........................................................................*....................................................................... || .......................................................................*..................................................................... - // ldr_vo v15, x0, 448 // ...e............................................................................................................................................. || .e........................................................................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // ..........................................................e...................................................................................... || .............................................................e............................................................................... - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............................................................e.................................................................................... || ...............................................................e............................................................................. 
- // mls v24.8H, v12.8H, v7.H[0] // .....................................................................e........................................................................... || .....................................................................e....................................................................... - // sub v12.8H, v8.8H, v24.8H // .................................................................................*............................................................... || ..........................................................................*.................................................................. - // add v8.8H, v8.8H, v24.8H // ..................................................................................*.............................................................. || ...........................................................................*................................................................. - // mul v24.8H, v13.8H, v0.H[0] // ........................................................................................*........................................................ || .....................................................................................*....................................................... - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ......................................................................................*.......................................................... || .................................................................................*........................................................... - // mls v24.8H, v13.8H, v7.H[0] // .........................................................................................*....................................................... || .......................................................................................*..................................................... 
- // sub v13.8H, v9.8H, v24.8H // ...............................................................................................*................................................. || .............................................................................................*............................................... - // add v9.8H, v9.8H, v24.8H // ..............................................................................................*.................................................. || ............................................................................................*................................................ - // mul v24.8H, v14.8H, v0.H[0] // .....................................................................................*........................................................... || ...............................................................................*............................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ....................................................................................*............................................................ || .............................................................................*............................................................... - // mls v24.8H, v14.8H, v7.H[0] // .......................................................................................*......................................................... || ...................................................................................*......................................................... - // sub v14.8H, v10.8H, v24.8H // ..........................................................................................*...................................................... || ........................................................................................*.................................................... 
- // add v10.8H, v10.8H, v24.8H // ...........................................................................................*..................................................... || .........................................................................................*................................................... - // mul v24.8H, v15.8H, v0.H[0] // .................e............................................................................................................................... || ...................e......................................................................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ....................e............................................................................................................................ || .....................e....................................................................................................................... - // mls v24.8H, v15.8H, v7.H[0] // ......................................................e.......................................................................................... || .........................................................e................................................................................... - // sub v15.8H, v11.8H, v24.8H // .......................................................................*......................................................................... || ......................................................................*...................................................................... - // add v11.8H, v11.8H, v24.8H // ...........................................................e..................................................................................... || ..............................................................e.............................................................................. 
- // mul v24.8H, v10.8H, v0.H[2] // .........................................................................................................*....................................... || .......................................................................................................*..................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // ....................................................................................................*............................................ || .................................................................................................*........................................... - // mls v24.8H, v10.8H, v7.H[0] // ..........................................................................................................*...................................... || .........................................................................................................*................................... - // sub v10.8H, v8.8H, v24.8H // ...............................................................................................................*................................. || ..............................................................................................................*.............................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................................*................................ || ...............................................................................................................*............................. - // mul v24.8H, v11.8H, v0.H[2] // ..................................................................e.............................................................................. || ...................................................................e......................................................................... 
- // sqrdmulh v11.8H, v11.8H, v0.H[3] // .................................................................e............................................................................... || .................................................................e........................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...........................................................................*..................................................................... || .......................................................................*..................................................................... - // sub v11.8H, v9.8H, v24.8H // ..................................................................................................*.............................................. || ...............................................................................................*............................................. - // add v9.8H, v9.8H, v24.8H // ...................................................................................................*............................................. || ................................................................................................*............................................ - // mul v24.8H, v14.8H, v0.H[4] // .......................................................................................................*......................................... || ...................................................................................................*......................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // .................................................................................................*............................................... || ...............................................................................................*............................................. 
- // mls v24.8H, v14.8H, v7.H[0] // ........................................................................................................*........................................ || .....................................................................................................*....................................... - // sub v14.8H, v12.8H, v24.8H // ...........................................................................................................*..................................... || ..........................................................................................................*.................................. - // add v12.8H, v12.8H, v24.8H // ............................................................................................................*.................................... || ...........................................................................................................*................................. - // mul v24.8H, v15.8H, v0.H[4] // ..............................................................................*.................................................................. || .........................................................................*................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ...................................................................................*............................................................. || ...........................................................................*................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ............................................................................................*.................................................... || .........................................................................................*................................................... 
- // sub v15.8H, v13.8H, v24.8H // .....................................................................................................*........................................... || .................................................................................................*........................................... - // add v13.8H, v13.8H, v24.8H // ......................................................................................................*.......................................... || ..................................................................................................*.......................................... - // mul v24.8H, v9.8H, v0.H[6] // .............................................................................................................*................................... || ...........................................................................................................*................................. - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ..............................................................................................................*.................................. || .............................................................................................................*............................... - // mls v24.8H, v9.8H, v7.H[0] // ...................................................................................................................*............................. || ...................................................................................................................*......................... - // sub v9.8H, v8.8H, v24.8H // ......................................................................................................................*.......................... || ........................................................................................................................*.................... 
- // add v8.8H, v8.8H, v24.8H // ........................................................................................................................*........................ || .........................................................................................................................*................... - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................*............................... || ...............................................................................................................*............................. - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..................................................................................................................*.............................. || .................................................................................................................*........................... - // mls v24.8H, v11.8H, v7.H[0] // .....................................................................................................................*........................... || .......................................................................................................................*..................... - // sub v11.8H, v10.8H, v24.8H // ............................................................................................................................*.................... || ............................................................................................................................*................ - // add v10.8H, v10.8H, v24.8H // ..............................................................................................................................*.................. || .............................................................................................................................*............... 
- // mul v24.8H, v13.8H, v1.H[2] // ....................................................................................................................*............................ || .....................................................................................................................*....................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................................................................................*......................... || .........................................................................................................................*................... - // mls v24.8H, v13.8H, v7.H[0] // ...............................................................................................................................*................. || ...............................................................................................................................*............. - // sub v13.8H, v12.8H, v24.8H // ....................................................................................................................................*............ || ....................................................................................................................................*........ - // add v12.8H, v12.8H, v24.8H // .....................................................................................................................................*........... || .....................................................................................................................................*....... - // mul v24.8H, v15.8H, v1.H[4] // ..........................................................................................................................*...................... || ...........................................................................................................................*................. 
- // sqrdmulh v15.8H, v15.8H, v1.H[5] // .............................................................................................................................*................... || .............................................................................................................................*............... - // mls v24.8H, v15.8H, v7.H[0] // ...................................................................................................................................*............. || ...................................................................................................................................*......... - // sub v15.8H, v14.8H, v24.8H // ...........................................................................................................................................*..... || ........................................................................................................................................*.... - // add v14.8H, v14.8H, v24.8H // ............................................................................................................................................*.... || .........................................................................................................................................*... - // str_vi v8, x0, 16 // ...........................................................................................................................*..................... || ............................................................................................................................*................ - // str_vo v9, x0, 48 // .........................................................................................................................*....................... || ...........................................................................................................................*................. 
- // str_vo v10, x0, 112 // .................................................................................................................................*............... || ................................................................................................................................*............ - // str_vo v11, x0, 176 // ................................................................................................................................*................ || ...............................................................................................................................*............. - // str_vo v12, x0, 240 // ..........................................................................................................................................*...... || ........................................................................................................................................*.... - // str_vo v13, x0, 304 // .........................................................................................................................................*....... || .......................................................................................................................................*..... - // str_vo v14, x0, 368 // ................................................................................................................................................* || ............................................................................................................................................* - // str_vo v15, x0, 432 // ...............................................................................................................................................*. || ...........................................................................................................................................*. 
- - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 48.13s + // User time: 48.13s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + sub v30.8H, v20.8H, v18.8H // ................*........................................................... + ldr q26, [x0, #384] // ......*..................................................................... + sqrdmulh v18.8H, v28.8H, v0.H[1] // ........*................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + ldr q6, [x0, #464] // .......e.................................................................... + mls v12.8H, v8.8H, v7.H[0] // .........................*.................................................. + ldr q20, [x0, #192] // ...*........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q29, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.8H, v26.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v22.8H, v26.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v24.8H, v20.8H, v12.8H // ..........................*................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v9.8H, v20.8H, v12.8H // ...........................*................................................ + // gap // ............................................................................ + mul v23.8H, v28.8H, v0.H[0] // .........*.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v22.8H, v2.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sqrdmulh v19.8H, v24.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v23.8H, v18.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v28.8H, v16.8H, v22.8H // ......................*..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v21.8H, v16.8H, v22.8H // .....................*...................................................... + mul v16.8H, v24.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v19.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v4.8H, v17.8H, v23.8H // ...........*................................................................ + add v18.8H, v17.8H, v23.8H // ............*............................................................... + sqrdmulh v10.8H, v9.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v26.8H, v21.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v5.8H, v30.8H, v16.8H // ..............................................*............................. + mul v2.8H, v21.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v20.8H, v5.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v2.8H, v26.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.8H, v5.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.8H, v20.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v14.8H, v4.8H, v2.8H // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v15.8H, v4.8H, v2.8H // ..........................................*................................. + mul v21.8H, v9.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v21.8H, v10.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v5.8H, v14.8H, v13.8H // ..................................................................*......... + sqrdmulh v12.8H, v28.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q5, [x0, #448] // ...........................................................................* + sqrdmulh v31.8H, v29.8H, v0.H[1] // .............e.............................................................. + // gap // ............................................................................ + add v2.8H, v3.8H, v21.8H // .....................................*...................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v20.8H, v3.8H, v21.8H // ....................................*....................................... + mul v3.8H, v28.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v3.8H, v12.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v28.8H, v20.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v23.8H, v20.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + add v12.8H, v18.8H, v3.8H // ................................*........................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v24.8H, v2.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + sub v21.8H, v18.8H, v3.8H // ...............................*............................................ + add v3.8H, v30.8H, v16.8H // ...............................................*............................ + ldr q20, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v22.8H, v2.8H, v0.H[6] // .................................................*.......................... + add v16.8H, v14.8H, v13.8H // ...................................................................*........ 
+ // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v5.8H, v3.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #384] // ..........................................................................*. + mls v22.8H, v24.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v2.8H, v3.8H, v1.H[2] // ...........................................................*................ + ldr q16, [x0, #144] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v2.8H, v5.8H, v7.H[0] // ............................................................*............... 
+ // gap // ............................................................................ + // gap // ............................................................................ + sub v19.8H, v12.8H, v22.8H // ...................................................*........................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v23.8H, v28.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v17.8H, v12.8H, v22.8H // ....................................................*....................... + // gap // ............................................................................ + mul v18.8H, v29.8H, v0.H[0] // ..............e............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v4.8H, v15.8H, v2.8H // .............................................................*.............. + ldr q28, [x0, #272] // ....e....................................................................... + // gap // ............................................................................ + str q17, [x0], #(16) // ....................................................................*....... + add v25.8H, v15.8H, v2.8H // ..............................................................*............. + mls v18.8H, v31.8H, v7.H[0] // ...............e............................................................ 
+ add v26.8H, v21.8H, v23.8H // .........................................................*.................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v11.8H, v21.8H, v23.8H // ........................................................*................... + sqrdmulh v8.8H, v6.8H, v0.H[1] // .......................e.................................................... + str q4, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + str q25, [x0, #240] // ........................................................................*... + str q19, [x0, #48] // .....................................................................*...... + str q26, [x0, #112] // ......................................................................*..... + mul v12.8H, v6.8H, v0.H[0] // ........................e................................................... + add v3.8H, v20.8H, v18.8H // .................e.......................................................... + str q11, [x0, #176] // .......................................................................*.... + ldr q17, [x0, #0] // e........................................................................... 
+ + // ------------------------------------------------------------------ new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // ........................................................................e'.......................................................................... + // ldr q9, [x0, #(1*(512/8))] // ............................................e............................'..............................................~........................... + // ldr q10, [x0, #(2*(512/8))] // ...................................................e.....................'.....................................................~.................... + // ldr q11, [x0, #(3*(512/8))] // ..~......................................................................'....*..................................................................... + // ldr q12, [x0, #(4*(512/8))] // ..........................................................e..............'............................................................~............. + // ldr q13, [x0, #(5*(512/8))] // ...e.....................................................................'.....~.................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........................................................................'*......................................................................... + // ldr q15, [x0, #(7*(512/8))] // e........................................................................'..~....................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .........................................................................'.*........................................................................ 
+ // mul v24.8h, v12.8h, v0.h[0] // ........~................................................................'..........*............................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........~.............................................................'.............*............................................................ + // sub v12.8h, v8.8h, v24.8h // ................~........................................................'..................*....................................................... + // add v8.8h, v8.8h, v24.8h // .................~.......................................................'...................*...................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // .................................e.......................................'...................................~...................................... + // mul v24.8h, v13.8h, v0.h[0] // ........................................................e................'..........................................................~............... + // mls v24.8h, v27.8h, v7.h[0] // .............................................................e...........'...............................................................~.......... + // sub v13.8h, v9.8h, v24.8h // .........................................................................*.......................................................................... + // add v9.8h, v9.8h, v24.8h // ......................................................................e..'........................................................................~. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ....~....................................................................'......*................................................................... 
+ // mul v24.8h, v14.8h, v0.h[0] // .....~...................................................................'.......*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .........~...............................................................'...........*.............................................................. + // sub v14.8h, v10.8h, v24.8h // .............~...........................................................'...............*.......................................................... + // add v10.8h, v10.8h, v24.8h // ............~............................................................'..............*........................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ................................................................e........'..................................................................~....... + // mul v24.8h, v15.8h, v0.h[0] // .....................................................................e...'.......................................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // .~.......................................................................'...*...................................................................... + // sub v15.8h, v11.8h, v24.8h // ......~..................................................................'........*................................................................. + // add v11.8h, v11.8h, v24.8h // .......~.................................................................'.........*................................................................ + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ...............................~.........................................'.................................*........................................ 
+ // mul v24.8h, v10.8h, v0.h[2] // ....................................~....................................'......................................*................................... + // mls v24.8h, v27.8h, v7.h[0] // .....................................~...................................'.......................................*.................................. + // sub v10.8h, v8.8h, v24.8h // ..........................................~..............................'............................................*............................. + // add v8.8h, v8.8h, v24.8h // ........................................~................................'..........................................*............................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..................~......................................................'....................*..................................................... + // mul v24.8h, v11.8h, v0.h[2] // ............................~............................................'..............................*........................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................~...........................................'...............................*.......................................... + // sub v11.8h, v9.8h, v24.8h // ...................................~.....................................'.....................................*.................................... + // add v9.8h, v9.8h, v24.8h // ..................................~......................................'....................................*..................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ...................~.....................................................'.....................*.................................................... 
+ // mul v24.8h, v14.8h, v0.h[4] // .....................~...................................................'.......................*.................................................. + // mls v24.8h, v27.8h, v7.h[0] // .......................~.................................................'.........................*................................................ + // sub v14.8h, v12.8h, v24.8h // ..........................~..............................................'............................*............................................. + // add v12.8h, v12.8h, v24.8h // ...........................~.............................................'.............................*............................................ + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ..........~..............................................................'............*............................................................. + // mul v24.8h, v15.8h, v0.h[4] // ..............~..........................................................'................*......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............~.........................................................'.................*........................................................ + // sub v15.8h, v13.8h, v24.8h // ....................~....................................................'......................*................................................... + // add v13.8h, v13.8h, v24.8h // ...........................................~.............................'.............................................*............................ + // sqrdmulh v27.8h, v9.8h, v0.h[7] // .........................................~...............................'...........................................*.............................. 
+ // mul v24.8h, v9.8h, v0.h[6] // .............................................~...........................'...............................................*.......................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................~.......................'...................................................*...................... + // sub v9.8h, v8.8h, v24.8h // .....................................................~...................'.......................................................*.................. + // add v8.8h, v8.8h, v24.8h // .......................................................~.................'.........................................................*................ + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ......................................~..................................'........................................*................................. + // mul v24.8h, v11.8h, v1.h[0] // .......................................~.................................'.........................................*................................ + // mls v24.8h, v27.8h, v7.h[0] // ......................................................~..................'........................................................*................. + // sub v11.8h, v10.8h, v24.8h // ...............................................................~.........'.................................................................*........ + // add v10.8h, v10.8h, v24.8h // ..............................................................~..........'................................................................*......... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ...............................................~.........................'.................................................*........................ 
+ // mul v24.8h, v13.8h, v1.h[2] // ..................................................~......................'....................................................*..................... + // mls v24.8h, v27.8h, v7.h[0] // ....................................................~....................'......................................................*................... + // sub v13.8h, v12.8h, v24.8h // .........................................................~...............'...........................................................*.............. + // add v12.8h, v12.8h, v24.8h // ............................................................~............'..............................................................*........... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ......................~..................................................'........................*................................................. + // mul v24.8h, v15.8h, v1.h[4] // ........................~................................................'..........................*............................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................~...............................................'...........................*.............................................. + // sub v15.8h, v14.8h, v24.8h // ..............................~..........................................'................................*......................................... + // add v14.8h, v14.8h, v24.8h // ..............................................~..........................'................................................*......................... + // str q8, [x0], #(16) // ...........................................................~.............'.............................................................*............ 
+ // str q9, [x0, #(-16 + 1*(512/8))] // ...................................................................~.....'.....................................................................*.... + // str q10, [x0, #(-16 + 2*(512/8))] // ....................................................................~....'......................................................................*... + // str q11, [x0, #(-16 + 3*(512/8))] // .......................................................................~.'.........................................................................* + // str q12, [x0, #(-16 + 4*(512/8))] // ..................................................................~......'....................................................................*..... + // str q13, [x0, #(-16 + 5*(512/8))] // .................................................................~.......'...................................................................*...... + // str q14, [x0, #(-16 + 6*(512/8))] // ................................................~........................'..................................................*....................... + // str q15, [x0, #(-16 + 7*(512/8))] // ................................~........................................'..................................*....................................... + + sub count, count, #1 cbnz count, layer123_start - sub v11.8H, v16.8H, v11.8H // .*.............................................................. - mls v10.8H, v24.8H, v7.H[0] // .....*.......................................................... - ldr_vo v4, x0, 0 // *............................................................... - ldr_vo v18, x0, 384 // ...*............................................................ - ldr_vo v22, x0, 128 // ..*............................................................. - // gap // ................................................................ 
- ldr_vo v14, x0, 320 // ......*......................................................... - ldr_vo v24, x0, 64 // ....*........................................................... - // gap // ................................................................ - mul v31.8H, v11.8H, v0.H[4] // .......*........................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v8.8H, v4.8H, v27.8H // ........*....................................................... - // gap // ................................................................ - // gap // ................................................................ - add v4.8H, v4.8H, v27.8H // .........*...................................................... - sqrdmulh v11.8H, v11.8H, v0.H[5] // ..........*..................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v5.8H, v18.8H, v0.H[1] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v18.8H, v18.8H, v0.H[0] // ............*................................................... - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v16.8H, v14.8H, v0.H[1] // .............*.................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v18.8H, v5.8H, v7.H[0] // ..............*................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v14.8H, v14.8H, v0.H[0] // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v14.8H, v16.8H, v7.H[0] // ................*............................................... - // gap // ................................................................ 
- // gap // ................................................................ - sub v5.8H, v22.8H, v18.8H // .................*.............................................. - // gap // ................................................................ - // gap // ................................................................ - add v18.8H, v22.8H, v18.8H // ..................*............................................. - mls v31.8H, v11.8H, v7.H[0] // ...................*............................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v11.8H, v5.8H, v0.H[5] // ......................*......................................... - // gap // ................................................................ - // gap // ................................................................ - add v22.8H, v24.8H, v14.8H // ....................*........................................... - // gap // ................................................................ - // gap // ................................................................ - sub v14.8H, v24.8H, v14.8H // .....................*.......................................... - sqrdmulh v24.8H, v18.8H, v0.H[3] // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v5.8H, v5.8H, v0.H[4] // ............................*................................... 
- sub v16.8H, v22.8H, v10.8H // .......................*........................................ - // gap // ................................................................ - add v22.8H, v22.8H, v10.8H // ........................*....................................... - // gap // ................................................................ - // gap // ................................................................ - mls v5.8H, v11.8H, v7.H[0] // .............................*.................................. - sub v11.8H, v14.8H, v31.8H // ..........................*..................................... - // gap // ................................................................ - add v14.8H, v14.8H, v31.8H // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ - mul v18.8H, v18.8H, v0.H[2] // ..............................*................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v18.8H, v24.8H, v7.H[0] // ...............................*................................ - // gap // ................................................................ - // gap // ................................................................ - sub v10.8H, v8.8H, v5.8H // ................................*............................... - // gap // ................................................................ - // gap // ................................................................ 
- add v24.8H, v8.8H, v5.8H // .................................*.............................. - mul v31.8H, v22.8H, v0.H[6] // ..................................*............................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v22.8H, v22.8H, v0.H[7] // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - sub v8.8H, v4.8H, v18.8H // ....................................*........................... - // gap // ................................................................ - // gap // ................................................................ - add v4.8H, v4.8H, v18.8H // .....................................*.......................... - mul v18.8H, v16.8H, v1.H[0] // ......................................*......................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v5.8H, v16.8H, v1.H[1] // .......................................*........................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- mls v31.8H, v22.8H, v7.H[0] // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v22.8H, v14.8H, v1.H[2] // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v18.8H, v5.8H, v7.H[0] // ..........................................*..................... - // gap // ................................................................ - // gap // ................................................................ - add v5.8H, v4.8H, v31.8H // .............................................*.................. - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v14.8H, v14.8H, v1.H[3] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- str_vi v5, x0, 16 // ................................................*............... - sqrdmulh v5.8H, v11.8H, v1.H[5] // ..................................................*............. - // gap // ................................................................ - sub v16.8H, v8.8H, v18.8H // .................................................*.............. - // gap // ................................................................ - // gap // ................................................................ - mul v11.8H, v11.8H, v1.H[4] // ...............................................*................ - add v18.8H, v8.8H, v18.8H // ...................................................*............ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v22.8H, v14.8H, v7.H[0] // ....................................................*........... - str_vo v16, x0, 176 // .....................................................*.......... - // gap // ................................................................ - str_vo v18, x0, 112 // ......................................................*......... - // gap // ................................................................ - // gap // ................................................................ - mls v11.8H, v5.8H, v7.H[0] // .......................................................*........ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- // gap // ................................................................ - sub v4.8H, v4.8H, v31.8H // ...........................................*.................... - // gap // ................................................................ - // gap // ................................................................ - sub v18.8H, v24.8H, v22.8H // ........................................................*....... - add v22.8H, v24.8H, v22.8H // .........................................................*...... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v14.8H, v10.8H, v11.8H // ............................................................*... - add v11.8H, v10.8H, v11.8H // .............................................................*.. - str_vo v4, x0, 48 // ..............................................*................. - str_vo v18, x0, 304 // ..........................................................*..... - str_vo v22, x0, 240 // ...........................................................*.... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str_vo v14, x0, 432 // ..............................................................*. - str_vo v11, x0, 368 // ...............................................................* - // gap // ................................................................ - - // original source code - // ldr_vo v4, x0, 0 // ..*............................................................. 
|| *............................................................... - // sub v11.8H, v16.8H, v11.8H // *............................................................... || *............................................................... - // ldr_vo v18, x0, 128 // ....*........................................................... || .*.............................................................. - // ldr_vo v22, x0, 384 // ...*............................................................ || .*.............................................................. - // ldr_vo v14, x0, 64 // ......*......................................................... || ..*............................................................. - // mls v10.8H, v24.8H, v7.H[0] // .*.............................................................. || *............................................................... - // ldr_vo v24, x0, 320 // .....*.......................................................... || ..*............................................................. - // mul v8.8H, v11.8H, v0.H[4] // .......*........................................................ || ...*............................................................ - // sub v23.8H, v4.8H, v27.8H // ........*....................................................... || ....*........................................................... - // add v4.8H, v4.8H, v27.8H // .........*...................................................... || .....*.......................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ..........*..................................................... || .....*.......................................................... - // sqrdmulh v27.8H, v22.8H, v0.H[1] // ...........*.................................................... || .......*........................................................ - // mul v22.8H, v22.8H, v0.H[0] // ............*................................................... 
|| .........*...................................................... - // sqrdmulh v26.8H, v24.8H, v0.H[1] // .............*.................................................. || ...........*.................................................... - // mls v22.8H, v27.8H, v7.H[0] // ..............*................................................. || .............*.................................................. - // mul v24.8H, v24.8H, v0.H[0] // ...............*................................................ || ...............*................................................ - // mls v24.8H, v26.8H, v7.H[0] // ................*............................................... || .................*.............................................. - // sub v27.8H, v18.8H, v22.8H // .................*.............................................. || ..................*............................................. - // add v18.8H, v18.8H, v22.8H // ..................*............................................. || ...................*............................................ - // mls v8.8H, v11.8H, v7.H[0] // ...................*............................................ || ...................*............................................ - // add v22.8H, v14.8H, v24.8H // .....................*.......................................... || ......................*......................................... - // sub v14.8H, v14.8H, v24.8H // ......................*......................................... || .......................*........................................ - // sqrdmulh v5.8H, v27.8H, v0.H[5] // ....................*........................................... || .....................*.......................................... - // sub v26.8H, v22.8H, v10.8H // .........................*...................................... || .........................*...................................... 
- // add v22.8H, v22.8H, v10.8H // ..........................*..................................... || ..........................*..................................... - // sqrdmulh v10.8H, v18.8H, v0.H[3] // .......................*........................................ || .......................*........................................ - // sub v15.8H, v14.8H, v8.8H // ............................*................................... || ...........................*.................................... - // add v14.8H, v14.8H, v8.8H // .............................*.................................. || ............................*................................... - // mul v8.8H, v27.8H, v0.H[4] // ........................*....................................... || .........................*...................................... - // mls v8.8H, v5.8H, v7.H[0] // ...........................*.................................... || ...........................*.................................... - // mul v18.8H, v18.8H, v0.H[2] // ..............................*................................. || .............................*.................................. - // mls v18.8H, v10.8H, v7.H[0] // ...............................*................................ || ...............................*................................ - // sub v10.8H, v23.8H, v8.8H // ................................*............................... || ................................*............................... - // add v8.8H, v23.8H, v8.8H // .................................*.............................. || .................................*.............................. - // mul v5.8H, v22.8H, v0.H[6] // ..................................*............................. || .................................*.............................. - // sqrdmulh v22.8H, v22.8H, v0.H[7] // ...................................*............................ 
|| ...................................*............................ - // sub v23.8H, v4.8H, v18.8H // ....................................*........................... || ....................................*........................... - // add v4.8H, v4.8H, v18.8H // .....................................*.......................... || .....................................*.......................... - // mul v18.8H, v26.8H, v1.H[0] // ......................................*......................... || .....................................*.......................... - // sqrdmulh v27.8H, v26.8H, v1.H[1] // .......................................*........................ || .......................................*........................ - // mls v5.8H, v22.8H, v7.H[0] // ........................................*....................... || .........................................*...................... - // mul v22.8H, v14.8H, v1.H[2] // .........................................*...................... || ...........................................*.................... - // mls v18.8H, v27.8H, v7.H[0] // ..........................................*..................... || .............................................*.................. - // sub v27.8H, v4.8H, v5.8H // ......................................................*......... || .........................................................*...... - // sqrdmulh v14.8H, v14.8H, v1.H[3] // ............................................*................... || ...............................................*................ - // add v4.8H, v4.8H, v5.8H // ...........................................*.................... || ..............................................*................. - // str_vo v27, x0, 64 // ...........................................................*.... || ............................................................*... 
- // mul v5.8H, v15.8H, v1.H[4] // ................................................*............... || ...................................................*............ - // str_vi v4, x0, 16 // .............................................*.................. || .................................................*.............. - // sub v4.8H, v23.8H, v18.8H // ...............................................*................ || ..................................................*............. - // sqrdmulh v27.8H, v15.8H, v1.H[5] // ..............................................*................. || .................................................*.............. - // add v18.8H, v23.8H, v18.8H // .................................................*.............. || ...................................................*............ - // mls v22.8H, v14.8H, v7.H[0] // ..................................................*............. || .....................................................*.......... - // str_vo v4, x0, 176 // ...................................................*............ || .....................................................*.......... - // str_vo v18, x0, 112 // ....................................................*........... || ......................................................*......... - // mls v5.8H, v27.8H, v7.H[0] // .....................................................*.......... || .......................................................*........ - // sub v4.8H, v8.8H, v22.8H // .......................................................*........ || ..........................................................*..... - // add v18.8H, v8.8H, v22.8H // ........................................................*....... || ..........................................................*..... - // str_vo v4, x0, 304 // ............................................................*... || .............................................................*.. 
- // str_vo v18, x0, 240 // .............................................................*.. || .............................................................*.. - // sub v4.8H, v10.8H, v5.8H // .........................................................*...... || ............................................................*... - // add v18.8H, v10.8H, v5.8H // ..........................................................*..... || ............................................................*... - // str_vo v4, x0, 432 // ..............................................................*. || ...............................................................* - // str_vo v18, x0, 368 // ...............................................................* || ...............................................................* - + // Instructions: 64 + // Expected cycles: 69 + // Expected IPC: 0.93 + // + // Cycle bound: 69.0 + // IPC bound: 0.93 + // + // Wall time: 3.28s + // User time: 3.28s + // + // ---------------------- original position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + sub v21.8H, v20.8H, v18.8H // *............................................................... + sqrdmulh v25.8H, v28.8H, v0.H[1] // ..*............................................................. + ldr q30, [x0, #384] // .*.............................................................. + ldr q26, [x0, #192] // ....*........................................................... + // gap // ................................................................ + // gap // ................................................................ + mls v12.8H, v8.8H, v7.H[0] // ...*............................................................ + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v18.8H, v28.8H, v0.H[0] // .........*...................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v10.8H, v30.8H, v0.H[0] // ......*......................................................... + // gap // ................................................................ + // gap // ................................................................ + add v5.8H, v26.8H, v12.8H // ........*....................................................... + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v20.8H, v30.8H, v0.H[1] // .....*.......................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sub v6.8H, v26.8H, v12.8H // .......*........................................................ + sqrdmulh v27.8H, v5.8H, v0.H[3] // ...................*............................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v4.8H, v5.8H, v0.H[2] // .............................*.................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v10.8H, v20.8H, v7.H[0] // ..........*..................................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v4.8H, v27.8H, v7.H[0] // ..............................*................................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v2.8H, v6.8H, v0.H[5] // ...........*.................................................... + // gap // ................................................................ + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v18.8H, v25.8H, v7.H[0] // ............*................................................... + // gap // ................................................................ + // gap // ................................................................ + add v24.8H, v3.8H, v4.8H // ..................................*............................. + // gap // ................................................................ + // gap // ................................................................ + sub v15.8H, v3.8H, v4.8H // ...................................*............................ + mul v28.8H, v6.8H, v0.H[4] // ...............*................................................ + // gap // ................................................................ + add v3.8H, v16.8H, v10.8H // .............*.................................................. + // gap // ................................................................ + // gap // ................................................................ + mls v28.8H, v2.8H, v7.H[0] // ................*............................................... + // gap // ................................................................ + sub v29.8H, v16.8H, v10.8H // ..............*................................................. + sub v25.8H, v17.8H, v18.8H // .................*.............................................. + // gap // ................................................................ + // gap // ................................................................ + add v9.8H, v17.8H, v18.8H // ..................*............................................. + sqrdmulh v4.8H, v3.8H, v0.H[3] // ................................*............................... 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v13.8H, v29.8H, v0.H[4] // ......................*......................................... + // gap // ................................................................ + // gap // ................................................................ + sub v2.8H, v21.8H, v28.8H // .....................*.......................................... + // gap // ................................................................ + // gap // ................................................................ + add v11.8H, v21.8H, v28.8H // ...........................................*.................... + sqrdmulh v21.8H, v29.8H, v0.H[5] // ....................*........................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v28.8H, v3.8H, v0.H[2] // ....................................*........................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v29.8H, v24.8H, v0.H[7] // .........................................*...................... + // gap // ................................................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v28.8H, v4.8H, v7.H[0] // .....................................*.......................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v27.8H, v24.8H, v0.H[6] // ............................................*................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v27.8H, v29.8H, v7.H[0] // ................................................*............... + // gap // ................................................................ + // gap // ................................................................ + add v31.8H, v9.8H, v28.8H // ........................................*....................... + // gap // ................................................................ + // gap // ................................................................ + sub v10.8H, v9.8H, v28.8H // ..........................................*..................... + sqrdmulh v5.8H, v15.8H, v1.H[1] // ......................................*......................... 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v29.8H, v11.8H, v1.H[2] // .................................................*.............. + // gap // ................................................................ + // gap // ................................................................ + sub v22.8H, v31.8H, v27.8H // ...................................................*............ + // gap // ................................................................ + // gap // ................................................................ + add v9.8H, v31.8H, v27.8H // .....................................................*.......... + sqrdmulh v31.8H, v11.8H, v1.H[3] // ..............................................*................. + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + str q22, [x0, #64] // .............................................................*.. + // gap // ................................................................ + mul v27.8H, v15.8H, v1.H[0] // .......................................*........................ + str q9, [x0], #(16) // .......................................................*........ + // gap // ................................................................ + // gap // ................................................................ + sqrdmulh v6.8H, v2.8H, v1.H[5] // .......................*........................................ 
+ // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v27.8H, v5.8H, v7.H[0] // ....................................................*........... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mls v13.8H, v21.8H, v7.H[0] // ........................*....................................... + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + mul v26.8H, v2.8H, v1.H[4] // .........................*...................................... + // gap // ................................................................ + // gap // ................................................................ + add v15.8H, v10.8H, v27.8H // .........................................................*...... + // gap // ................................................................ + // gap // ................................................................ + sub v11.8H, v10.8H, v27.8H // ..........................................................*..... 
+ mls v26.8H, v6.8H, v7.H[0] // ..........................*..................................... + // gap // ................................................................ + sub v20.8H, v25.8H, v13.8H // ...........................*.................................... + // gap // ................................................................ + // gap // ................................................................ + str q15, [x0, #112] // ..............................................................*. + mls v29.8H, v31.8H, v7.H[0] // ..................................................*............. + // gap // ................................................................ + add v13.8H, v25.8H, v13.8H // ............................*................................... + str q11, [x0, #176] // ...............................................................* + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + // gap // ................................................................ + sub v16.8H, v20.8H, v26.8H // ...............................*................................ + // gap // ................................................................ + // gap // ................................................................ + add v26.8H, v20.8H, v26.8H // .............................................*.................. + // gap // ................................................................ + // gap // ................................................................ + add v20.8H, v13.8H, v29.8H // ........................................................*....... + sub v23.8H, v13.8H, v29.8H // ......................................................*......... + // gap // ................................................................ 
+ str q16, [x0, #432] // .................................*.............................. + // gap // ................................................................ + // gap // ................................................................ + str q26, [x0, #368] // ...............................................*................ + // gap // ................................................................ + // gap // ................................................................ + str q20, [x0, #240] // ............................................................*... + // gap // ................................................................ + str q23, [x0, #304] // ...........................................................*.... + + // ------------------------ new position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + // sub v30.8H, v20.8H, v18.8H // *............................................................... + // ldr q26, [x0, #384] // ..*............................................................. + // sqrdmulh v18.8H, v28.8H, v0.H[1] // .*.............................................................. + // mls v12.8H, v8.8H, v7.H[0] // ....*........................................................... + // ldr q20, [x0, #192] // ...*............................................................ + // sqrdmulh v2.8H, v26.8H, v0.H[1] // ........*....................................................... + // mul v22.8H, v26.8H, v0.H[0] // ......*......................................................... + // sub v24.8H, v20.8H, v12.8H // .........*...................................................... + // add v9.8H, v20.8H, v12.8H // .......*........................................................ + // mul v23.8H, v28.8H, v0.H[0] // .....*.......................................................... + // mls v22.8H, v2.8H, v7.H[0] // ............*................................................... 
+ // sqrdmulh v19.8H, v24.8H, v0.H[5] // ..............*................................................. + // mls v23.8H, v18.8H, v7.H[0] // ...............*................................................ + // add v28.8H, v16.8H, v22.8H // ...................*............................................ + // sub v21.8H, v16.8H, v22.8H // .....................*.......................................... + // mul v16.8H, v24.8H, v0.H[4] // ..................*............................................. + // mls v16.8H, v19.8H, v7.H[0] // ....................*........................................... + // sub v4.8H, v17.8H, v23.8H // ......................*......................................... + // add v18.8H, v17.8H, v23.8H // .......................*........................................ + // sqrdmulh v10.8H, v9.8H, v0.H[3] // ..........*..................................................... + // sqrdmulh v26.8H, v21.8H, v0.H[5] // ............................*................................... + // sub v5.8H, v30.8H, v16.8H // ..........................*..................................... + // mul v2.8H, v21.8H, v0.H[4] // .........................*...................................... + // sqrdmulh v20.8H, v5.8H, v1.H[5] // ............................................*................... + // mls v2.8H, v26.8H, v7.H[0] // ..............................................*................. + // mul v13.8H, v5.8H, v1.H[4] // ...............................................*................ + // mls v13.8H, v20.8H, v7.H[0] // ..................................................*............. + // sub v14.8H, v4.8H, v2.8H // ...................................................*............ + // add v15.8H, v4.8H, v2.8H // ......................................................*......... + // mul v21.8H, v9.8H, v0.H[2] // ...........*.................................................... 
+ // mls v21.8H, v10.8H, v7.H[0] // .............*.................................................. + // sub v5.8H, v14.8H, v13.8H // ........................................................*....... + // sqrdmulh v12.8H, v28.8H, v0.H[3] // ........................*....................................... + // str q5, [x0, #448] // ............................................................*... + // add v2.8H, v3.8H, v21.8H // ................*............................................... + // sub v20.8H, v3.8H, v21.8H // .................*.............................................. + // mul v3.8H, v28.8H, v0.H[2] // .............................*.................................. + // mls v3.8H, v12.8H, v7.H[0] // ...............................*................................ + // sqrdmulh v28.8H, v20.8H, v1.H[1] // ....................................*........................... + // mul v23.8H, v20.8H, v1.H[0] // ..........................................*..................... + // add v12.8H, v18.8H, v3.8H // ..................................*............................. + // sqrdmulh v24.8H, v2.8H, v0.H[7] // ..............................*................................. + // sub v21.8H, v18.8H, v3.8H // ...................................*............................ + // add v3.8H, v30.8H, v16.8H // ...........................*.................................... + // mul v22.8H, v2.8H, v0.H[6] // ................................*............................... + // add v16.8H, v14.8H, v13.8H // .........................................................*...... + // sqrdmulh v5.8H, v3.8H, v1.H[3] // ........................................*....................... + // str q16, [x0, #384] // .............................................................*.. + // mls v22.8H, v24.8H, v7.H[0] // .................................*.............................. 
+ // mul v2.8H, v3.8H, v1.H[2] // .....................................*.......................... + // mls v2.8H, v5.8H, v7.H[0] // .....................................................*.......... + // sub v19.8H, v12.8H, v22.8H // ......................................*......................... + // mls v23.8H, v28.8H, v7.H[0] // .............................................*.................. + // add v17.8H, v12.8H, v22.8H // .......................................*........................ + // sub v4.8H, v15.8H, v2.8H // ...........................................................*.... + // str q17, [x0], #(16) // ...........................................*.................... + // add v25.8H, v15.8H, v2.8H // ..........................................................*..... + // add v26.8H, v21.8H, v23.8H // ................................................*............... + // sub v11.8H, v21.8H, v23.8H // .................................................*.............. + // str q4, [x0, #304] // ...............................................................* + // str q25, [x0, #240] // ..............................................................*. + // str q19, [x0, #48] // .........................................*...................... + // str q26, [x0, #112] // ....................................................*........... + // str q11, [x0, #176] // .......................................................*........ + restore inp, STACK0 mov count, #8 .p2align 2 - ldr_vo v30, x1, 48 // .*................................................ - // gap // .................................................. - ldr_vi v15, x3, 16 // *................................................. - ldr_vo v13, x1, 0 // ..*............................................... - // gap // .................................................. - // gap // .................................................. - ldr_vo v11, x1, 32 // ...*.............................................. 
- // gap // .................................................. - // gap // .................................................. - ldr_vo v10, x4, 16 // ......*........................................... - // gap // .................................................. - // gap // .................................................. - ldr_vo v5, x4, 64 // ....*............................................. - sqrdmulh v22.8H, v30.8H, v15.H[1] // .......*.......................................... - // gap // .................................................. - ldr_vo v8, x1, 16 // .....*............................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v1.8H, v30.8H, v15.H[0] // ........*......................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v1.8H, v22.8H, v7.H[0] // ..........*....................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v24.8H, v11.8H, v15.H[1] // .........*........................................ - // gap // .................................................. - // gap // .................................................. 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v4.8H, v11.8H, v15.H[0] // ...........*...................................... - // gap // .................................................. - // gap // .................................................. - sub v11.8H, v8.8H, v1.8H // ..............*................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v19.8H, v8.8H, v1.8H // ............*..................................... - // gap // .................................................. - mls v4.8H, v24.8H, v7.H[0] // .............*.................................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v23.8H, v11.8H, v15.H[4] // .................*................................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v25.8H, v11.8H, v15.H[5] // ................*................................. - // gap // .................................................. - // gap // .................................................. - add v0.8H, v13.8H, v4.8H // ....................*............................. - // gap // .................................................. 
- // gap // .................................................. - sqrdmulh v28.8H, v19.8H, v15.H[3] // ...............*.................................. - sub v16.8H, v13.8H, v4.8H // .....................*............................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v2.8H, v19.8H, v15.H[2] // ..................*............................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v23.8H, v25.8H, v7.H[0] // ...................*.............................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v2.8H, v28.8H, v7.H[0] // ......................*........................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v28.8H, v16.8H, v23.8H // .......................*.......................... 
- sub v14.8H, v16.8H, v23.8H // ........................*......................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - add v3.8H, v0.8H, v2.8H // .........................*........................ - sub v21.8H, v0.8H, v2.8H // ..........................*....................... - // gap // .................................................. - trn2 v18.4S, v28.4S, v14.4S // ............................*..................... - // gap // .................................................. - // gap // .................................................. - trn1 v2.4S, v28.4S, v14.4S // ...............................*.................. - // gap // .................................................. - // gap // .................................................. - trn2 v4.4S, v3.4S, v21.4S // .............................*.................... - // gap // .................................................. - // gap // .................................................. - trn1 v1.4S, v3.4S, v21.4S // .................................*................ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - trn2 v11.2D, v4.2D, v18.2D // ................................*................. - trn1 v19.2D, v4.2D, v18.2D // ..................................*............... - // gap // .................................................. - trn2 v0.2D, v1.2D, v2.2D // ....................................*............. - // gap // .................................................. 
- // gap // .................................................. - trn1 v16.2D, v1.2D, v2.2D // .....................................*............ - // gap // .................................................. - // gap // .................................................. - sqrdmulh v28.8H, v11.8H, v10.8H // ...................................*.............. - ldr_vi v2, x4, 96 // ..............................*................... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v1.8H, v0.8H, v10.8H // .......................................*.......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v3.8H, v11.8H, v2.8H // ......................................*........... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v3.8H, v28.8H, v7.H[0] // ........................................*......... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v12.8H, v0.8H, v2.8H // ..........................................*....... 
- // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mls v12.8H, v1.8H, v7.H[0] // .............................................*.... - ldr_vo v1, x4, -48 // ...........................*...................... - // gap // .................................................. - add v0.8H, v19.8H, v3.8H // ............................................*..... - // gap // .................................................. - // gap // .................................................. - sub v19.8H, v19.8H, v3.8H // ...........................................*...... - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - sqrdmulh v27.8H, v0.8H, v1.8H // .................................................* - ldr_vo v1, x4, -16 // .........................................*........ - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - mul v13.8H, v19.8H, v5.8H // ...............................................*.. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. - // gap // .................................................. 
- sqrdmulh v29.8H, v19.8H, v1.8H // ................................................*. - ldr_vo v19, x4, -64 // ..............................................*... - // gap // .................................................. - - // original source code - // ldr_vi v8, x3, 16 // .*................................................ || *............................................................ - // ldr_vo v20, x1, 48 // *................................................. || *............................................................ - // ldr_vo v6, x1, 0 // ..*............................................... || .*........................................................... - // ldr_vo v30, x1, 32 // ...*.............................................. || ..*.......................................................... - // ldr_vo v17, x4, 64 // .....*............................................ || ....*........................................................ - // ldr_vo v10, x1, 16 // .......*.......................................... || .....*....................................................... - // ldr_vo v11, x4, 16 // ....*............................................. || ...*......................................................... - // sqrdmulh v14.8H, v20.8H, v8.H[1] // ......*........................................... || ....*........................................................ - // mul v9.8H, v20.8H, v8.H[0] // ........*......................................... || .......*..................................................... - // sqrdmulh v21.8H, v30.8H, v8.H[1] // ..........*....................................... || ...........*................................................. - // mls v9.8H, v14.8H, v7.H[0] // .........*........................................ || .........*................................................... - // mul v2.8H, v30.8H, v8.H[0] // ...........*...................................... 
|| .............*............................................... - // add v16.8H, v10.8H, v9.8H // .............*.................................... || ...............*............................................. - // mls v2.8H, v21.8H, v7.H[0] // ..............*................................... || ................*............................................ - // sub v15.8H, v10.8H, v9.8H // ............*..................................... || ..............*.............................................. - // sqrdmulh v12.8H, v16.8H, v8.H[3] // ..................*............................... || ......................*...................................... - // sqrdmulh v5.8H, v15.8H, v8.H[5] // ................*................................. || ....................*........................................ - // mul v20.8H, v15.8H, v8.H[4] // ...............*.................................. || ..................*.......................................... - // mul v3.8H, v16.8H, v8.H[2] // ....................*............................. || ........................*.................................... - // mls v20.8H, v5.8H, v7.H[0] // .....................*............................ || ..........................*.................................. - // add v10.8H, v6.8H, v2.8H // .................*................................ || .....................*....................................... - // sub v4.8H, v6.8H, v2.8H // ...................*.............................. || ......................*...................................... - // mls v3.8H, v12.8H, v7.H[0] // ......................*........................... || ............................*................................ - // add v8.8H, v4.8H, v20.8H // .......................*.......................... || ...............................*............................. - // sub v31.8H, v4.8H, v20.8H // ........................*......................... 
|| ...............................*............................. - // add v2.8H, v10.8H, v3.8H // .........................*........................ || .................................*........................... - // sub v5.8H, v10.8H, v3.8H // ..........................*....................... || .................................*........................... - // ldr_vo v10, x4, 48 // ..........................................*....... || ....................................................*........ - // trn2 v30.4S, v8.4S, v31.4S // ...........................*...................... || ..................................*.......................... - // trn2 v3.4S, v2.4S, v5.4S // .............................*.................... || ....................................*........................ - // ldr_vi v13, x4, 96 // ....................................*............. || ..........................................*.................. - // trn1 v20.4S, v8.4S, v31.4S // ............................*..................... || ...................................*......................... - // trn2 v6.2D, v3.2D, v30.2D // ...............................*.................. || .......................................*..................... - // trn1 v19.4S, v2.4S, v5.4S // ..............................*................... || .....................................*....................... - // trn1 v15.2D, v3.2D, v30.2D // ................................*................. || .......................................*..................... - // sqrdmulh v4.8H, v6.8H, v11.8H // ...................................*.............. || ..........................................*.................. - // trn2 v29.2D, v19.2D, v20.2D // .................................*................ || ........................................*.................... - // trn1 v16.2D, v19.2D, v20.2D // ..................................*............... 
|| .........................................*................... - // mul v2.8H, v6.8H, v13.8H // ......................................*........... || ..............................................*.............. - // sqrdmulh v19.8H, v29.8H, v11.8H // .....................................*............ || ............................................*................ - // mls v2.8H, v4.8H, v7.H[0] // .......................................*.......... || ................................................*............ - // ldr_vo v4, x4, -16 // ..............................................*... || ........................................................*.... - // mul v12.8H, v29.8H, v13.8H // ........................................*......... || ..................................................*.......... - // sub v21.8H, v15.8H, v2.8H // ............................................*..... || ......................................................*...... - // add v0.8H, v15.8H, v2.8H // ...........................................*...... || .....................................................*....... - // mls v12.8H, v19.8H, v7.H[0] // .........................................*........ || ....................................................*........ - // ldr_vo v19, x4, -64 // .................................................* || ............................................................* - // mul v13.8H, v21.8H, v17.8H // ...............................................*.. || ..........................................................*.. - // sqrdmulh v29.8H, v21.8H, v4.8H // ................................................*. || ............................................................* - // sqrdmulh v27.8H, v0.8H, v10.8H // .............................................*.... || ........................................................*.... 
- + // Instructions: 37 + // Expected cycles: 47 + // Expected IPC: 0.79 + // + // Cycle bound: 47.0 + // IPC bound: 0.79 + // + // Wall time: 0.61s + // User time: 0.61s + // + // -------- original position ---------> + // 0 25 + // |------------------------|----------- + ldr q14, [x1, #48] // *.................................... + ldr q4, [x3], #16 // .*................................... + // gap // ..................................... + ldr q30, [x4, #16] // ...........................*......... + ldr q16, [x1, #16] // .....*............................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v28.8H, v14.8H, v4.H[1] // ..*.................................. + ldr q21, [x1, #32] // ...*................................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v23.8H, v14.8H, v4.H[0] // ....*................................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v6.8H, v21.8H, v4.H[1] // .......*............................. + ldr q15, [x1, #0] // ............*........................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v23.8H, v28.8H, v7.H[0] // ......*.............................. 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v11.8H, v21.8H, v4.H[0] // ........*............................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v11.8H, v6.8H, v7.H[0] // .............*....................... + // gap // ..................................... + // gap // ..................................... + add v21.8H, v16.8H, v23.8H // ..........*.......................... + // gap // ..................................... + // gap // ..................................... + sub v8.8H, v16.8H, v23.8H // .........*........................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v19.8H, v21.8H, v4.H[3] // ..............*...................... + // gap // ..................................... + // gap // ..................................... + add v17.8H, v15.8H, v11.8H // .................*................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v28.8H, v8.8H, v4.H[5] // ................*.................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v23.8H, v21.8H, v4.H[2] // ..................*.................. 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v23.8H, v19.8H, v7.H[0] // ...................*................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v18.8H, v8.8H, v4.H[4] // ...........*......................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v18.8H, v28.8H, v7.H[0] // ....................*................ + // gap // ..................................... + // gap // ..................................... + add v28.8H, v17.8H, v23.8H // .......................*............. + // gap // ..................................... + // gap // ..................................... + sub v15.8H, v15.8H, v11.8H // ...............*..................... + // gap // ..................................... + // gap // ..................................... + sub v27.8H, v17.8H, v23.8H // .....................*............... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v9.8H, v15.8H, v18.8H // ......................*.............. + add v16.8H, v15.8H, v18.8H // ........................*............ + // gap // ..................................... + trn2 v26.4S, v28.4S, v27.4S // ............................*........ 
+ trn1 v15.4S, v28.4S, v27.4S // .........................*........... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn1 v20.4S, v16.4S, v9.4S // ..........................*.......... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn2 v28.4S, v16.4S, v9.4S // ...............................*..... + ldr q27, [x4], #(6*16) // .............................*....... + trn2 v10.2D, v15.2D, v20.2D // ..............................*...... + // gap // ..................................... + // gap // ..................................... + trn1 v11.2D, v15.2D, v20.2D // ................................*.... + // gap // ..................................... + // gap // ..................................... + trn2 v9.2D, v26.2D, v28.2D // ..................................*.. + // gap // ..................................... + // gap // ..................................... + sqrdmulh v8.8H, v10.8H, v30.8H // .................................*... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v16.8H, v9.8H, v30.8H // ....................................* + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ mul v14.8H, v10.8H, v27.8H // ...................................*. + // gap // ..................................... + // gap // ..................................... + + // ----------- new position -----------> + // 0 25 + // |------------------------|----------- + // ldr q2, [x1, #48] // *.................................... + // ldr q3, [x3], #16 // .*................................... + // sqrdmulh v22.8H, v2.8H, v3.H[1] // ....*................................ + // ldr q6, [x1, #32] // .....*............................... + // mul v31.8H, v2.8H, v3.H[0] // ......*.............................. + // ldr q21, [x1, #16] // ...*................................. + // mls v31.8H, v22.8H, v7.H[0] // .........*........................... + // sqrdmulh v4.8H, v6.8H, v3.H[1] // .......*............................. + // mul v0.8H, v6.8H, v3.H[0] // ..........*.......................... + // sub v5.8H, v21.8H, v31.8H // .............*....................... + // add v1.8H, v21.8H, v31.8H // ............*........................ + // mul v24.8H, v5.8H, v3.H[4] // ...................*................. + // ldr q27, [x1, #0] // ........*............................ + // mls v0.8H, v4.8H, v7.H[0] // ...........*......................... + // sqrdmulh v14.8H, v1.8H, v3.H[3] // ..............*...................... + // sub v31.8H, v27.8H, v0.8H // ......................*.............. + // sqrdmulh v16.8H, v5.8H, v3.H[5] // ................*.................... + // add v17.8H, v27.8H, v0.8H // ...............*..................... + // mul v27.8H, v1.8H, v3.H[2] // .................*................... + // mls v27.8H, v14.8H, v7.H[0] // ..................*.................. + // mls v24.8H, v16.8H, v7.H[0] // ....................*................ + // sub v3.8H, v17.8H, v27.8H // .......................*............. + // sub v10.8H, v31.8H, v24.8H // ........................*............ + // add v1.8H, v17.8H, v27.8H // .....................*............... 
+ // add v13.8H, v31.8H, v24.8H // .........................*........... + // trn1 v4.4S, v1.4S, v3.4S // ...........................*......... + // trn1 v24.4S, v13.4S, v10.4S // ............................*........ + // ldr q16, [x4, #16] // ..*.................................. + // trn2 v26.4S, v1.4S, v3.4S // ..........................*.......... + // ldr q27, [x4], #(6*16) // ..............................*...... + // trn2 v6.2D, v4.2D, v24.2D // ...............................*..... + // trn2 v28.4S, v13.4S, v10.4S // .............................*....... + // trn1 v11.2D, v4.2D, v24.2D // ................................*.... + // sqrdmulh v8.8H, v6.8H, v16.8H // ..................................*.. + // trn2 v9.2D, v26.2D, v28.2D // .................................*... + // mul v14.8H, v6.8H, v27.8H // ....................................* + // sqrdmulh v16.8H, v9.8H, v16.8H // ...................................*. + sub count, count, #1 -.p2align 2 layer4567_start: - ldr_vi v8, x3, 16 // ....e................................................................... - mul v31.8H, v0.8H, v19.8H // .................................................*...................... - ldr_vo v20, x1, 112 // ...e.................................................................... - ldr_vo v6, x1, 64 // e....................................................................... - ldr_vo v30, x1, 96 // ..e..................................................................... - // gap // ........................................................................ - ldr_vo v17, x4, 64 // .....................................e.................................. - ldr_vo v10, x1, 80 // .e...................................................................... - mls v13.8H, v29.8H, v7.H[0] // ........................................................*............... - ldr_vo v11, x4, 16 // ..................................e..................................... 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v14.8H, v20.8H, v8.H[1] // ...........e............................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v0.8H, v16.8H, v12.8H // ..........................................*............................. - // gap // ........................................................................ - mul v9.8H, v20.8H, v8.H[0] // ..........e............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v26.8H, v16.8H, v12.8H // ...........................................*............................ - sqrdmulh v21.8H, v30.8H, v8.H[1] // ......e................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- mls v9.8H, v14.8H, v7.H[0] // ............e........................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v31.8H, v27.8H, v7.H[0] // ...................................................*.................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v2.8H, v30.8H, v8.H[0] // .....e.................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - add v16.8H, v10.8H, v9.8H // ..............e......................................................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v2.8H, v21.8H, v7.H[0] // .......e................................................................ - sub v15.8H, v10.8H, v9.8H // .............e.......................................................... - // gap // ........................................................................ - sub v24.8H, v26.8H, v31.8H // ....................................................*................... - // gap // ........................................................................ 
- // gap // ........................................................................ - add v23.8H, v26.8H, v31.8H // .....................................................*.................. - sqrdmulh v12.8H, v16.8H, v8.H[3] // ................e....................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v5.8H, v15.8H, v8.H[5] // .....................e.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v20.8H, v15.8H, v8.H[4] // ....................e................................................... - // gap // ........................................................................ - // gap // ........................................................................ - sub v26.8H, v0.8H, v13.8H // .........................................................*.............. - // gap // ........................................................................ - // gap // ........................................................................ - mul v3.8H, v16.8H, v8.H[2] // ...............e........................................................ - add v25.8H, v0.8H, v13.8H // ..........................................................*............. 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v20.8H, v5.8H, v7.H[0] // ......................e................................................. - // gap // ........................................................................ - // gap // ........................................................................ - add v10.8H, v6.8H, v2.8H // .........e.............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - sub v4.8H, v6.8H, v2.8H // ........e............................................................... - mls v3.8H, v12.8H, v7.H[0] // .................e...................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v1.8H, v26.8H, v7.H[1] // ....................................................................*... - // gap // ........................................................................ - // gap // ........................................................................ - add v8.8H, v4.8H, v20.8H // ........................e............................................... - // gap // ........................................................................ - // gap // ........................................................................ 
- sub v31.8H, v4.8H, v20.8H // .......................e................................................ - sqdmulh v12.8H, v25.8H, v7.H[1] // .................................................................*...... - // gap // ........................................................................ - add v2.8H, v10.8H, v3.8H // ...................e.................................................... - // gap // ........................................................................ - // gap // ........................................................................ - sub v5.8H, v10.8H, v3.8H // ..................e..................................................... - sqdmulh v19.8H, v23.8H, v7.H[1] // ...........................................................*............ - ldr_vo v10, x4, 48 // ....................................e................................... - srshr v18.8H, v1.8H, #11 // .....................................................................*.. - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v28.8H, v24.8H, v7.H[1] // ..............................................................*......... - trn2 v30.4S, v8.4S, v31.4S // ............................e........................................... - // gap // ........................................................................ - srshr v14.8H, v12.8H, #11 // ..................................................................*..... - // gap // ........................................................................ - // gap // ........................................................................ - mls v26.8H, v18.8H, v7.H[0] // ......................................................................*. - trn2 v3.4S, v2.4S, v5.4S // ..........................e............................................. 
- ldr_vi v13, x4, 96 // .................................e...................................... - srshr v27.8H, v19.8H, #11 // ............................................................*........... - // gap // ........................................................................ - // gap // ........................................................................ - mls v25.8H, v14.8H, v7.H[0] // ...................................................................*.... - trn1 v20.4S, v8.4S, v31.4S // ...........................e............................................ - // gap // ........................................................................ - trn2 v6.2D, v3.2D, v30.2D // ..............................e......................................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v23.8H, v27.8H, v7.H[0] // .............................................................*.......... - trn1 v19.4S, v2.4S, v5.4S // .........................e.............................................. - // gap // ........................................................................ - srshr v22.8H, v28.8H, #11 // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v15.2D, v3.2D, v30.2D // ................................e....................................... - sqrdmulh v4.8H, v6.8H, v11.8H // .............................................e.......................... - // gap // ........................................................................ - trn2 v29.2D, v19.2D, v20.2D // .............................e.......................................... - // gap // ........................................................................ 
- // gap // ........................................................................ - trn1 v16.2D, v19.2D, v20.2D // ...............................e........................................ - mul v2.8H, v6.8H, v13.8H // ............................................e........................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v19.8H, v29.8H, v11.8H // ........................................e............................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v2.8H, v4.8H, v7.H[0] // ..............................................e......................... - ldr_vo v4, x4, -16 // ......................................e................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v24.8H, v22.8H, v7.H[0] // ................................................................*....... - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v12.8H, v29.8H, v13.8H // .......................................e................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v21.8H, v15.8H, v2.8H // ...............................................e........................ - // gap // ........................................................................ - // gap // ........................................................................ - add v0.8H, v15.8H, v2.8H // ................................................e....................... - mls v12.8H, v19.8H, v7.H[0] // .........................................e.............................. - ldr_vo v19, x4, -64 // ...................................e.................................... - st4 {v23.4S,v24.4S,v25.4S,v26.4S}, [x1], #64 // .......................................................................* - // gap // ........................................................................ - // gap // ........................................................................ - mul v13.8H, v21.8H, v17.8H // ......................................................e................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- sqrdmulh v29.8H, v21.8H, v4.8H // .......................................................e................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v27.8H, v0.8H, v10.8H // ..................................................e..................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - - // original source code - // ldr_vo v8, x1, 0 // ...e......................................................................................................................................... || .e........................................................................................................................ - // ldr_vo v9, x1, 16 // ......e...................................................................................................................................... || ..e....................................................................................................................... - // ldr_vo v10, x1, 32 // ....e........................................................................................................................................ || .e........................................................................................................................ 
- // ldr_vo v11, x1, 48 // ..e.......................................................................................................................................... || e......................................................................................................................... - // ldr_vi v0, x3, 16 // e............................................................................................................................................ || e......................................................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // ................e............................................................................................................................ || ..............e........................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // .............e............................................................................................................................... || ........e................................................................................................................. - // mls v24.8H, v10.8H, v7.H[0] // ..................e.......................................................................................................................... || ................e......................................................................................................... - // sub v10.8H, v8.8H, v24.8H // ..............................e.............................................................................................................. || ............................e............................................................................................. - // add v8.8H, v8.8H, v24.8H // .............................e............................................................................................................... 
|| ...........................e.............................................................................................. - // mul v24.8H, v11.8H, v0.H[0] // ...........e................................................................................................................................. || ......e................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .........e................................................................................................................................... || ....e..................................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ..............e.............................................................................................................................. || ..........e............................................................................................................... - // sub v11.8H, v9.8H, v24.8H // ...................e......................................................................................................................... || ................e......................................................................................................... - // add v9.8H, v9.8H, v24.8H // .................e........................................................................................................................... || ...............e.......................................................................................................... - // mul v24.8H, v9.8H, v0.H[2] // ..........................e.................................................................................................................. || ........................e................................................................................................. 
- // sqrdmulh v9.8H, v9.8H, v0.H[3] // ......................e...................................................................................................................... || ..................e....................................................................................................... - // mls v24.8H, v9.8H, v7.H[0] // ...............................e............................................................................................................. || ............................e............................................................................................. - // sub v9.8H, v8.8H, v24.8H // .....................................e....................................................................................................... || ..................................e....................................................................................... - // add v8.8H, v8.8H, v24.8H // ....................................e........................................................................................................ || .................................e........................................................................................ - // mul v24.8H, v11.8H, v0.H[4] // ........................e.................................................................................................................... || ......................e................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // .......................e..................................................................................................................... || ....................e..................................................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ............................e................................................................................................................ || ..........................e............................................................................................... - // sub v11.8H, v10.8H, v24.8H // ..................................e.......................................................................................................... || ................................e......................................................................................... - // add v10.8H, v10.8H, v24.8H // .................................e........................................................................................................... || ...............................e.......................................................................................... - // trn1 v25.4S, v8.4S, v9.4S // ....................................................e........................................................................................ || ..........................................e............................................................................... - // trn2 v26.4S, v8.4S, v9.4S // .............................................e............................................................................................... || ......................................e................................................................................... - // trn1 v27.4S, v10.4S, v11.4S // .................................................e........................................................................................... || ........................................e................................................................................. - // trn2 v28.4S, v10.4S, v11.4S // ..........................................e.................................................................................................. 
|| ....................................e..................................................................................... - // trn2 v10.2D, v25.2D, v27.2D // ........................................................e.................................................................................... || .............................................e............................................................................ - // trn2 v11.2D, v26.2D, v28.2D // ..................................................e.......................................................................................... || .........................................e................................................................................ - // trn1 v8.2D, v25.2D, v27.2D // .........................................................e................................................................................... || ..............................................e........................................................................... - // trn1 v9.2D, v26.2D, v28.2D // ......................................................e...................................................................................... || ............................................e............................................................................. - // ldr_vi v0, x4, 96 // ..............................................e.............................................................................................. || ......................................e................................................................................... - // ldr_vo v4, x4, -80 // ........e.................................................................................................................................... || ...e...................................................................................................................... 
- // ldr_vo v1, x4, -64 // ...................................................................e......................................................................... || ........................................................e................................................................. - // ldr_vo v5, x4, -48 // .......................................e..................................................................................................... || ..................................e....................................................................................... - // ldr_vo v2, x4, -32 // .....e....................................................................................................................................... || ..e....................................................................................................................... - // ldr_vo v6, x4, -16 // .............................................................e............................................................................... || ..................................................e....................................................................... - // mul v24.8H, v10.8H, v0.8H // ...............................................................e............................................................................. || ......................................................e................................................................... - // sqrdmulh v10.8H, v10.8H, v4.8H // ...........................................................e................................................................................. || ................................................e......................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ..................................................................e.......................................................................... 
|| ........................................................e................................................................. - // sub v10.8H, v8.8H, v24.8H // ..................................................................................*.......................................................... || ......................................................................*................................................... - // add v8.8H, v8.8H, v24.8H // ....................................................................................*........................................................ || ........................................................................*................................................. - // mul v24.8H, v11.8H, v0.8H // ..........................................................e.................................................................................. || ..............................................e........................................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // .......................................................e..................................................................................... || ............................................e............................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ............................................................e................................................................................ || ..................................................e....................................................................... - // sub v11.8H, v9.8H, v24.8H // ................................................................e............................................................................ || .......................................................e.................................................................. 
- // add v9.8H, v9.8H, v24.8H // .................................................................e........................................................................... || ........................................................e................................................................. - // mul v24.8H, v9.8H, v1.8H // .........................................................................*................................................................... || ................................................................*......................................................... - // sqrdmulh v9.8H, v9.8H, v5.8H // .......................................................................e..................................................................... || ..............................................................e........................................................... - // mls v24.8H, v9.8H, v7.H[0] // .......................................................................................*..................................................... || ............................................................................*............................................. - // sub v9.8H, v8.8H, v24.8H // ............................................................................................*................................................ || .................................................................................*........................................ - // add v8.8H, v8.8H, v24.8H // .............................................................................................*............................................... || ..................................................................................*....................................... - // mul v24.8H, v11.8H, v2.8H // .....................................................................e....................................................................... 
|| ..........................................................e............................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // ......................................................................e...................................................................... || ............................................................e............................................................. - // mls v24.8H, v11.8H, v7.H[0] // ...............................................................................*............................................................. || ..................................................................*....................................................... - // sub v11.8H, v10.8H, v24.8H // .................................................................................................*........................................... || .......................................................................................*.................................. - // add v10.8H, v10.8H, v24.8H // ...................................................................................................*......................................... || ........................................................................................*................................. - // sqdmulh v25.8H, v8.8H, v7.H[1] // ..............................................................................................................*.............................. || ..................................................................................................*....................... - // srshr v25.8H, v25.8H, #11 // .......................................................................................................................*..................... || .......................................................................................................*.................. 
- // mls v8.8H, v25.8H, v7.H[0] // ...........................................................................................................................*................. || ..........................................................................................................*............... - // sqdmulh v25.8H, v9.8H, v7.H[1] // .................................................................................................................*........................... || ....................................................................................................*..................... - // srshr v25.8H, v25.8H, #11 // .............................................................................................................................*............... || ...........................................................................................................*.............. - // mls v9.8H, v25.8H, v7.H[0] // ......................................................................................................................................*...... || ....................................................................................................................*..... - // sqdmulh v25.8H, v10.8H, v7.H[1] // ...........................................................................................................*................................. || ................................................................................................*......................... - // srshr v25.8H, v25.8H, #11 // ...................................................................................................................*......................... || .....................................................................................................*.................... 
- // mls v10.8H, v25.8H, v7.H[0] // ........................................................................................................................*.................... || ........................................................................................................*................. - // sqdmulh v25.8H, v11.8H, v7.H[1] // ........................................................................................................*.................................... || ..............................................................................................*........................... - // srshr v25.8H, v25.8H, #11 // ................................................................................................................*............................ || ...................................................................................................*...................... - // mls v11.8H, v25.8H, v7.H[0] // ....................................................................................................................*........................ || ......................................................................................................*................... 
- // st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // ............................................................................................................................................* || .........................................................................................................................* - - subs count, count, #1 + // Instructions: 72 + // Expected cycles: 64 + // Expected IPC: 1.12 + // + // Cycle bound: 64.0 + // IPC bound: 1.12 + // + // Wall time: 97.44s + // User time: 97.44s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q2, [x1, #112] // ...e.................................................................... + ldr q3, [x3], #16 // ....e................................................................... + mls v14.8H, v8.8H, v7.H[0] // .........................................*.............................. + ldr q20, [x4, #-64] // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + mul v15.8H, v9.8H, v27.8H // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqrdmulh v22.8H, v2.8H, v3.H[1] // ..........e............................................................. + trn1 v0.2D, v26.2D, v28.2D // ................................*....................................... + ldr q26, [x4, #-48] // ....................................*................................... + // gap // ........................................................................ + mls v15.8H, v16.8H, v7.H[0] // ..............................................*......................... + ldr q16, [x4, #-16] // ......................................*................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q6, [x1, #96] // ..e..................................................................... + mul v31.8H, v2.8H, v3.H[0] // ...........e............................................................ + ldr q21, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v31.8H, v22.8H, v7.H[0] // ............e........................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ add v27.8H, v0.8H, v15.8H // ................................................*....................... + sqrdmulh v4.8H, v6.8H, v3.H[1] // .....e.................................................................. + sub v13.8H, v0.8H, v15.8H // ...............................................*........................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v0.8H, v6.8H, v3.H[0] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v5.8H, v21.8H, v31.8H // .............e.......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v1.8H, v21.8H, v31.8H // ..............e......................................................... + sqrdmulh v15.8H, v13.8H, v16.8H // ......................................................*................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v16.8H, v27.8H, v26.8H // .................................................*...................... + // gap // ........................................................................ 
+ ldr q6, [x4, #-32] // .....................................*.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v24.8H, v5.8H, v3.H[4] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v18.8H, v13.8H, v6.8H // .......................................................*................ + sub v6.8H, v11.8H, v14.8H // ..........................................*............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v21.8H, v27.8H, v20.8H // ..................................................*..................... + ldr q27, [x1, #64] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v21.8H, v16.8H, v7.H[0] // ...................................................*.................... + add v16.8H, v11.8H, v14.8H // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v4.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v18.8H, v15.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + sub v20.8H, v16.8H, v21.8H // ....................................................*................... + // gap // ........................................................................ + // gap // ........................................................................ + add v19.8H, v16.8H, v21.8H // .....................................................*.................. + // gap // ........................................................................ + sqrdmulh v14.8H, v1.8H, v3.H[3] // ...............e........................................................ 
+ sub v31.8H, v27.8H, v0.8H // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v16.8H, v5.8H, v3.H[5] // ....................e................................................... + add v17.8H, v27.8H, v0.8H // .........e.............................................................. + // gap // ........................................................................ + sub v22.8H, v6.8H, v18.8H // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + mul v27.8H, v1.8H, v3.H[2] // ................e....................................................... + add v21.8H, v6.8H, v18.8H // ..........................................................*............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v14.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v24.8H, v16.8H, v7.H[0] // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v8.8H, v22.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + // gap // ........................................................................ + sub v3.8H, v17.8H, v27.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v23.8H, v20.8H, v7.H[1] // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + sub v10.8H, v31.8H, v24.8H // .......................e................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v1.8H, v17.8H, v27.8H // ...................e.................................................... + sqdmulh v15.8H, v19.8H, v7.H[1] // ...........................................................*............ + // gap // ........................................................................ 
+ srshr v17.8H, v8.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + add v13.8H, v31.8H, v24.8H // ........................e............................................... + sqdmulh v16.8H, v21.8H, v7.H[1] // .................................................................*...... + // gap // ........................................................................ + srshr v28.8H, v23.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + mls v22.8H, v17.8H, v7.H[0] // ......................................................................*. + trn1 v4.4S, v1.4S, v3.4S // .........................e.............................................. + // gap // ........................................................................ + srshr v27.8H, v15.8H, #11 // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + mls v20.8H, v28.8H, v7.H[0] // ................................................................*....... + // gap // ........................................................................ + trn1 v24.4S, v13.4S, v10.4S // ...........................e............................................ + srshr v15.8H, v16.8H, #11 // ..................................................................*..... + ldr q16, [x4, #16] // ..................................e..................................... + // gap // ........................................................................ 
+ mls v19.8H, v27.8H, v7.H[0] // .............................................................*.......... + trn2 v26.4S, v1.4S, v3.4S // ..........................e............................................. + ldr q27, [x4], #(6*16) // .................................e...................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v6.2D, v4.2D, v24.2D // .............................e.......................................... + mls v21.8H, v15.8H, v7.H[0] // ...................................................................*.... + trn2 v28.4S, v13.4S, v10.4S // ............................e........................................... + // gap // ........................................................................ + trn1 v11.2D, v4.2D, v24.2D // ...............................e........................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v8.8H, v6.8H, v16.8H // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v9.2D, v26.2D, v28.2D // ..............................e......................................... + mul v14.8H, v6.8H, v27.8H // ........................................e............................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v16.8H, v9.8H, v16.8H // ............................................e........................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // ...........................e............................................'..........................~........................................... + // ldr q9, [x1, #(16*1)] // ............e...........................................................'...........~.......................................................... + // ldr q10, [x1, #(16*2)] // ..........e.............................................................'.........~............................................................ + // ldr q11, [x1, #(16*3)] // e.......................................................................~...................................................................... 
+ // ldr q0, [x3], #16 // .e......................................................................'~..................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...............e........................................................'..............~....................................................... + // mul v24.8h, v10.8h, v0.h[0] // .................e......................................................'................~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................e.........................................'.............................~........................................ + // sub v10.8h, v8.8h, v24.8h // ...................................e....................................'..................................~................................... + // add v8.8h, v8.8h, v24.8h // .....................................e..................................'....................................~................................. + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .....e..................................................................'....~................................................................. + // mul v24.8h, v11.8h, v0.h[0] // ...........e............................................................'..........~........................................................... + // mls v24.8h, v27.8h, v7.h[0] // .............e..........................................................'............~......................................................... + // sub v11.8h, v9.8h, v24.8h // ..................e.....................................................'.................~.................................................... + // add v9.8h, v9.8h, v24.8h // ...................e....................................................'..................~................................................... 
+ // sqrdmulh v27.8h, v9.8h, v0.h[3] // ..................................e.....................................'.................................~.................................... + // mul v24.8h, v9.8h, v0.h[2] // .......................................e................................'......................................~............................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................e..............................'........................................~............................. + // sub v9.8h, v8.8h, v24.8h // ............................................e...........................'...........................................~.......................... + // add v8.8h, v8.8h, v24.8h // ...............................................e........................'..............................................~....................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ....................................e...................................'...................................~.................................. + // mul v24.8h, v11.8h, v0.h[4] // .......................e................................................'......................~............................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................e.............................'.........................................~............................ + // sub v11.8h, v10.8h, v24.8h // ..............................................e.........................'.............................................~........................ + // add v10.8h, v10.8h, v24.8h // ..................................................e.....................'.................................................~.................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................e.................'.....................................................~................ 
+ // trn2 v26.4s, v8.4s, v9.4s // .............................................................e..........'............................................................~......... + // trn1 v27.4s, v10.4s, v11.4s // .........................................................e..............'........................................................~............. + // trn2 v28.4s, v10.4s, v11.4s // .................................................................e......'................................................................~..... + // trn2 v10.2d, v25.2d, v27.2d // ...............................................................e........'..............................................................~....... + // trn2 v11.2d, v26.2d, v28.2d // ....................................................................e...'...................................................................~.. + // trn1 v8.2d, v25.2d, v27.2d // ..................................................................e.....'.................................................................~.... + // trn1 v9.2d, v26.2d, v28.2d // ......~.................................................................'.....*................................................................ + // ldr q0, [ x4], #(6*16) // ..............................................................e.........'.............................................................~........ + // ldr q4, [x4, #(-6*16 + 1*16)] // ...........................................................e............'..........................................................~........... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ...~....................................................................'..*................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .......~................................................................'......*............................................................... 
+ // ldr q2, [ x4, #(-6*16 + 4*16)] // ......................~.................................................'.....................*................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // .........~..............................................................'........*............................................................. + // sqrdmulh v27.8h, v10.8h, v4.8h // ...................................................................e....'..................................................................~... + // mul v24.8h, v10.8h, v0.8h // .....................................................................e..'....................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ..~.....................................................................'.*.................................................................... + // sub v10.8h, v8.8h, v24.8h // .........................~..............................................'........................*............................................. + // add v8.8h, v8.8h, v24.8h // .............................~..........................................'............................*......................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // .......................................................................e'...................................................................... + // mul v24.8h, v11.8h, v0.8h // ....~...................................................................'...*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ........~...............................................................'.......*.............................................................. + // sub v11.8h, v9.8h, v24.8h // ................~.......................................................'...............*...................................................... 
+ // add v9.8h, v9.8h, v24.8h // ..............~.........................................................'.............*........................................................ + // sqrdmulh v27.8h, v9.8h, v5.8h // .....................~..................................................'....................*................................................. + // mul v24.8h, v9.8h, v1.8h // ..........................~.............................................'.........................*............................................ + // mls v24.8h, v27.8h, v7.h[0] // ............................~...........................................'...........................*.......................................... + // sub v9.8h, v8.8h, v24.8h // ................................~.......................................'...............................*...................................... + // add v8.8h, v8.8h, v24.8h // .................................~......................................'................................*..................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ....................~...................................................'...................*.................................................. + // mul v24.8h, v11.8h, v2.8h // ........................~...............................................'.......................*.............................................. + // mls v24.8h, v27.8h, v7.h[0] // ...............................~........................................'..............................*....................................... + // sub v11.8h, v10.8h, v24.8h // ......................................~.................................'.....................................*................................ + // add v10.8h, v10.8h, v24.8h // ........................................~...............................'.......................................*.............................. 
+ // sqdmulh v25.8h, v8.8h, v7.h[1] // ................................................~.......................'...............................................*...................... + // srshr v25.8h, v25.8h, #11 // .......................................................~................'......................................................*............... + // mls v8.8h, v25.8h, v7.h[0] // ............................................................~...........'...........................................................*.......... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .............................................~..........................'............................................*......................... + // srshr v25.8h, v25.8h, #11 // ....................................................~...................'...................................................*.................. + // mls v9.8h, v25.8h, v7.h[0] // ........................................................~...............'.......................................................*.............. + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................~....................'..................................................*................... + // srshr v25.8h, v25.8h, #11 // ..........................................................~.............'.........................................................*............ + // mls v10.8h, v25.8h, v7.h[0] // ................................................................~.......'...............................................................*...... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ...........................................~............................'..........................................*........................... + // srshr v25.8h, v25.8h, #11 // .................................................~......................'................................................*..................... 
+ // mls v11.8h, v25.8h, v7.h[0] // .....................................................~..................'....................................................*................. + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................................~.'.....................................................................* + + sub count, count, #1 cbnz count, layer4567_start - sub v1.8H, v16.8H, v12.8H // ..*................... - mls v13.8H, v29.8H, v7.H[0] // .*.................... - // gap // ...................... - add v11.8H, v16.8H, v12.8H // ...*.................. - // gap // ...................... - // gap // ...................... - mul v4.8H, v0.8H, v19.8H // *..................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v4.8H, v27.8H, v7.H[0] // ....*................. - // gap // ...................... - // gap // ...................... - sub v18.8H, v1.8H, v13.8H // .......*.............. - // gap // ...................... - // gap // ...................... - add v17.8H, v1.8H, v13.8H // ........*............. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqdmulh v1.8H, v18.8H, v7.H[1] // .........*............ - // gap // ...................... - // gap // ...................... - sub v16.8H, v11.8H, v4.8H // .....*................ - // gap // ...................... - // gap // ...................... - add v15.8H, v11.8H, v4.8H // ......*............... - sqdmulh v11.8H, v17.8H, v7.H[1] // ..........*........... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - sqdmulh v4.8H, v16.8H, v7.H[1] // .............*........ 
- // gap // ...................... - // gap // ...................... - srshr v1.8H, v1.8H, #11 // ............*......... - // gap // ...................... - // gap // ...................... - sqdmulh v22.8H, v15.8H, v7.H[1] // ...........*.......... - // gap // ...................... - // gap // ...................... - srshr v11.8H, v11.8H, #11 // ..............*....... - // gap // ...................... - // gap // ...................... - mls v18.8H, v1.8H, v7.H[0] // ...............*...... - // gap // ...................... - // gap // ...................... - srshr v1.8H, v4.8H, #11 // ...................*.. - // gap // ...................... - // gap // ...................... - mls v17.8H, v11.8H, v7.H[0] // .................*.... - // gap // ...................... - // gap // ...................... - srshr v11.8H, v22.8H, #11 // ................*..... - // gap // ...................... - // gap // ...................... - mls v16.8H, v1.8H, v7.H[0] // ....................*. - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - mls v15.8H, v11.8H, v7.H[0] // ..................*... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - // gap // ...................... - st4 {v15.4S,v16.4S,v17.4S,v18.4S}, [x1], #64 // .....................* - // gap // ...................... - // gap // ...................... - - // original source code - // mul v31.8H, v0.8H, v19.8H // ...*.................. || ..*......................... 
- // mls v13.8H, v29.8H, v7.H[0] // .*.................... || *........................... - // sub v0.8H, v16.8H, v12.8H // *..................... || *........................... - // add v26.8H, v16.8H, v12.8H // ..*................... || .*.......................... - // mls v31.8H, v27.8H, v7.H[0] // ....*................. || ....*....................... - // sub v24.8H, v26.8H, v31.8H // ........*............. || .........*.................. - // add v23.8H, v26.8H, v31.8H // .........*............ || ..........*................. - // sub v26.8H, v0.8H, v13.8H // .....*................ || .....*...................... - // add v25.8H, v0.8H, v13.8H // ......*............... || ......*..................... - // sqdmulh v1.8H, v26.8H, v7.H[1] // .......*.............. || ........*................... - // sqdmulh v12.8H, v25.8H, v7.H[1] // ..........*........... || ..........*................. - // sqdmulh v19.8H, v23.8H, v7.H[1] // .............*........ || ..............*............. - // srshr v18.8H, v1.8H, #11 // ............*......... || .............*.............. - // sqdmulh v28.8H, v24.8H, v7.H[1] // ...........*.......... || ............*............... - // srshr v14.8H, v12.8H, #11 // ..............*....... || ...............*............ - // mls v26.8H, v18.8H, v7.H[0] // ...............*...... || ................*........... - // srshr v27.8H, v19.8H, #11 // ..................*... || ...................*........ - // mls v25.8H, v14.8H, v7.H[0] // .................*.... || ..................*......... - // mls v23.8H, v27.8H, v7.H[0] // ....................*. || ......................*..... - // srshr v22.8H, v28.8H, #11 // ................*..... || .................*.......... - // mls v24.8H, v22.8H, v7.H[0] // ...................*.. || ....................*....... 
- // st4 {v23.4S,v24.4S,v25.4S,v26.4S}, [x1], #64 // .....................* || ...........................* - + // Instructions: 35 + // Expected cycles: 44 + // Expected IPC: 0.80 + // + // Cycle bound: 44.0 + // IPC bound: 0.80 + // + // Wall time: 0.59s + // User time: 0.59s + // + // ------- original position --------> + // 0 25 + // |------------------------|--------- + mul v0.8H, v9.8H, v27.8H // ..*................................ + ldr q10, [x4, #-16] // ......*............................ + // gap // ................................... + ldr q15, [x4, #-48] // ....*.............................. + // gap // ................................... + // gap // ................................... + mls v0.8H, v16.8H, v7.H[0] // .....*............................. + trn1 v6.2D, v26.2D, v28.2D // ...*............................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v14.8H, v8.8H, v7.H[0] // *.................................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + add v17.8H, v6.8H, v0.8H // .......*........................... + // gap // ................................... + // gap // ................................... + sub v19.8H, v6.8H, v0.8H // ........*.......................... + ldr q6, [x4, #-32] // ...........*....................... + // gap // ................................... + add v2.8H, v11.8H, v14.8H // ................*.................. + sub v13.8H, v11.8H, v14.8H // .............*..................... 
+ ldr q14, [x4, #-64] // .*................................. + sqrdmulh v1.8H, v17.8H, v15.8H // ..........*........................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqrdmulh v12.8H, v19.8H, v10.8H // .........*......................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mul v26.8H, v17.8H, v14.8H // ..............*.................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v26.8H, v1.8H, v7.H[0] // ...............*................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mul v6.8H, v19.8H, v6.8H // ............*...................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v6.8H, v12.8H, v7.H[0] // .................*................. + // gap // ................................... + // gap // ................................... + add v19.8H, v2.8H, v26.8H // ...................*............... + // gap // ................................... + // gap // ................................... 
+ sub v20.8H, v2.8H, v26.8H // ..................*................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqdmulh v28.8H, v19.8H, v7.H[1] // ........................*.......... + // gap // ................................... + // gap // ................................... + sub v22.8H, v13.8H, v6.8H // ....................*.............. + // gap // ................................... + // gap // ................................... + add v21.8H, v13.8H, v6.8H // .....................*............. + sqdmulh v9.8H, v20.8H, v7.H[1] // .......................*........... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqdmulh v18.8H, v22.8H, v7.H[1] // ......................*............ + srshr v5.8H, v28.8H, #11 // .............................*..... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqdmulh v1.8H, v21.8H, v7.H[1] // ..........................*........ + srshr v4.8H, v9.8H, #11 // ...........................*....... + // gap // ................................... + // gap // ................................... + mls v19.8H, v5.8H, v7.H[0] // ................................*.. + // gap // ................................... + // gap // ................................... + srshr v10.8H, v18.8H, #11 // .........................*......... + // gap // ................................... + // gap // ................................... 
+ mls v20.8H, v4.8H, v7.H[0] // ..............................*.... + // gap // ................................... + // gap // ................................... + srshr v18.8H, v1.8H, #11 // ...............................*... + // gap // ................................... + // gap // ................................... + mls v22.8H, v10.8H, v7.H[0] // ............................*...... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v21.8H, v18.8H, v7.H[0] // .................................*. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // ..................................* + // gap // ................................... + // gap // ................................... + + // ---------- new position ----------> + // 0 25 + // |------------------------|--------- + // mls v14.8H, v8.8H, v7.H[0] // .....*............................. + // ldr q20, [x4, #-64] // ...........*....................... + // mul v15.8H, v9.8H, v27.8H // *.................................. + // trn1 v0.2D, v26.2D, v28.2D // ....*.............................. + // ldr q26, [x4, #-48] // ..*................................ 
+ // mls v15.8H, v16.8H, v7.H[0] // ...*............................... + // ldr q16, [x4, #-16] // .*................................. + // add v27.8H, v0.8H, v15.8H // ......*............................ + // sub v13.8H, v0.8H, v15.8H // .......*........................... + // sqrdmulh v15.8H, v13.8H, v16.8H // .............*..................... + // sqrdmulh v16.8H, v27.8H, v26.8H // ............*...................... + // ldr q6, [x4, #-32] // ........*.......................... + // mul v18.8H, v13.8H, v6.8H // ................*.................. + // sub v6.8H, v11.8H, v14.8H // ..........*........................ + // mul v21.8H, v27.8H, v20.8H // ..............*.................... + // mls v21.8H, v16.8H, v7.H[0] // ...............*................... + // add v16.8H, v11.8H, v14.8H // .........*......................... + // mls v18.8H, v15.8H, v7.H[0] // .................*................. + // sub v20.8H, v16.8H, v21.8H // ...................*............... + // add v19.8H, v16.8H, v21.8H // ..................*................ + // sub v22.8H, v6.8H, v18.8H // .....................*............. + // add v21.8H, v6.8H, v18.8H // ......................*............ + // sqdmulh v8.8H, v22.8H, v7.H[1] // ........................*.......... + // sqdmulh v23.8H, v20.8H, v7.H[1] // .......................*........... + // sqdmulh v15.8H, v19.8H, v7.H[1] // ....................*.............. + // srshr v17.8H, v8.8H, #11 // .............................*..... + // sqdmulh v16.8H, v21.8H, v7.H[1] // ..........................*........ + // srshr v28.8H, v23.8H, #11 // ...........................*....... + // mls v22.8H, v17.8H, v7.H[0] // ................................*.. + // srshr v27.8H, v15.8H, #11 // .........................*......... + // mls v20.8H, v28.8H, v7.H[0] // ..............................*.... + // srshr v15.8H, v16.8H, #11 // ...............................*... + // mls v19.8H, v27.8H, v7.H[0] // ............................*...... 
+ // mls v21.8H, v15.8H, v7.H[0] // .................................*. + // st4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1], #64 // ..................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_firestorm.s index b062b636..cb1bcd2e 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_firestorm.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -67,15 +44,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +61,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,21 +74,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 
5*16)] .endm .macro transpose4 data @@ -139,7 +110,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -150,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -160,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -168,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -179,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -330,878 +301,951 @@ _ntt_kyber_123_4567_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr q18, [x0, #128] // ......*.............................. - ldr q25, [x0, #256] // .*................................... - ldr q9, [x0, #448] // ..*.................................. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... 
- ldr q28, [x0, #64] // .......*............................. - ldr q8, [x0, #192] // ...*................................. - ldr q11, [x0, #384] // ....*................................ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - ldr q22, [x0, #320] // *.................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v12.8H, v25.8H, v0.H[1] // ..........*.......................... - mul v17.8H, v25.8H, v0.H[0] // ...........*......................... - mul v15.8H, v9.8H, v0.H[0] // ........*............................ - sqrdmulh v20.8H, v9.8H, v0.H[1] // .........*........................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v10.8H, v11.8H, v0.H[1] // ..............*...................... - mul v11.8H, v11.8H, v0.H[0] // .............*....................... - // gap // ..................................... - // gap // ..................................... 
- // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v26.8H, v22.8H, v0.H[1] // ............*........................ - mul v2.8H, v22.8H, v0.H[0] // ..................*.................. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v17.8H, v12.8H, v7.H[0] // ................*.................... - mls v15.8H, v20.8H, v7.H[0] // ...............*..................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v11.8H, v10.8H, v7.H[0] // .................*................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v2.8H, v26.8H, v7.H[0] // ...........................*......... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v20.8H, v8.8H, v15.8H // ...................*................. 
- add v29.8H, v8.8H, v15.8H // ....................*................ - ldr q8, [x0, #0] // .....*............................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v12.8H, v18.8H, v11.8H // ..........................*.......... - add v18.8H, v18.8H, v11.8H // ............................*........ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sqrdmulh v11.8H, v29.8H, v0.H[3] // ......................*.............. - mul v22.8H, v29.8H, v0.H[2] // .........................*........... - sqrdmulh v3.8H, v20.8H, v0.H[5] // .......................*............. - mul v16.8H, v20.8H, v0.H[4] // ........................*............ - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mul v21.8H, v18.8H, v0.H[2] // .................................*... - sqrdmulh v27.8H, v18.8H, v0.H[3] // ..................................*.. - mul v14.8H, v12.8H, v0.H[4] // ..............................*...... - sqrdmulh v5.8H, v12.8H, v0.H[5] // .............................*....... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - sub v18.8H, v8.8H, v17.8H // .....................*............... 
- sub v6.8H, v28.8H, v2.8H // ....................................* - add v25.8H, v28.8H, v2.8H // ...................................*. - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - mls v16.8H, v3.8H, v7.H[0] // ...............................*..... - mls v22.8H, v11.8H, v7.H[0] // ................................*.... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... - // gap // ..................................... + // Instructions: 34 + // Expected cycles: 16 + // Expected IPC: 2.12 + // + // Cycle bound: 16.0 + // IPC bound: 2.12 + // + // Wall time: 0.39s + // User time: 0.39s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + ldr q15, [x0, #384] // *................................. + ldr q16, [x0, #448] // .*................................ + ldr q27, [x0, #320] // ..*............................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q6, [x0, #128] // ...*.............................. + ldr q26, [x0, #192] // ....*............................. + ldr q11, [x0, #64] // .....*............................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q14, [x0, #256] // ................*................. 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v13.8H, v15.8H, v0.H[1] // .........*........................ + mul v15.8H, v15.8H, v0.H[0] // ............*..................... + mul v31.8H, v16.8H, v0.H[0] // ......*........................... + sqrdmulh v16.8H, v16.8H, v0.H[1] // .......*.......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v25.8H, v27.8H, v0.H[1] // ........*......................... + mul v27.8H, v27.8H, v0.H[0] // ...........*...................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v9.8H, v14.8H, v0.H[0] // ............................*..... + sqrdmulh v29.8H, v14.8H, v0.H[1] // .............................*.... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ // gap // .................................. + mls v15.8H, v13.8H, v7.H[0] // .................*................ + mls v31.8H, v16.8H, v7.H[0] // ..........*....................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v27.8H, v25.8H, v7.H[0] // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v14.8H, v26.8H, v31.8H // .............*.................... + add v16.8H, v6.8H, v15.8H // .........................*........ + sub v15.8H, v6.8H, v15.8H // ........................*......... + sub v6.8H, v26.8H, v31.8H // ..............*................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v19.8H, v11.8H, v27.8H // ......................*........... + sub v30.8H, v11.8H, v27.8H // .......................*.......... + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v27.8H, v14.8H, v0.H[3] // ..................*............... + mul v12.8H, v14.8H, v0.H[2] // ....................*............. + sqrdmulh v14.8H, v6.8H, v0.H[5] // ...................*.............. + mul v25.8H, v6.8H, v0.H[4] // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v5.8H, v16.8H, v0.H[3] // ..............................*... + mul v23.8H, v16.8H, v0.H[2] // .................................* + sqrdmulh v3.8H, v15.8H, v0.H[5] // ...............................*.. + mul v13.8H, v15.8H, v0.H[4] // ................................*. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v12.8H, v27.8H, v7.H[0] // ...........................*...... + mls v25.8H, v14.8H, v7.H[0] // ..........................*....... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. - // original source code - // ldr q12, [x0, #320] // ......*.............................. 
- // ldr q2, [x0, #256] // .*................................... - // ldr q24, [x0, #448] // ..*.................................. - // ldr q13, [x0, #192] // ....*................................ - // ldr q29, [x0, #384] // .....*............................... - // ldr q8, [x0, #0] // .....................*............... - // ldr q5, [x0, #128] // *.................................... - // ldr q30, [x0, #64] // ...*................................. - // mul v19.8H, v24.8H, v0.H[0] // .........*........................... - // sqrdmulh v10.8H, v24.8H, v0.H[1] // ..........*.......................... - // sqrdmulh v6.8H, v2.8H, v0.H[1] // .......*............................. - // mul v17.8H, v2.8H, v0.H[0] // ........*............................ - // sqrdmulh v11.8H, v12.8H, v0.H[1] // .............*....................... - // mul v15.8H, v29.8H, v0.H[0] // ............*........................ - // sqrdmulh v29.8H, v29.8H, v0.H[1] // ...........*......................... - // mls v19.8H, v10.8H, v7.H[0] // ................*.................... - // mls v17.8H, v6.8H, v7.H[0] // ...............*..................... - // mls v15.8H, v29.8H, v7.H[0] // .................*................... - // mul v12.8H, v12.8H, v0.H[0] // ..............*...................... - // sub v29.8H, v13.8H, v19.8H // ...................*................. - // add v19.8H, v13.8H, v19.8H // ....................*................ - // sub v18.8H, v8.8H, v17.8H // ................................*.... - // sqrdmulh v28.8H, v19.8H, v0.H[3] // ........................*............ - // sqrdmulh v6.8H, v29.8H, v0.H[5] // ..........................*.......... - // mul v16.8H, v29.8H, v0.H[4] // ...........................*......... - // mul v22.8H, v19.8H, v0.H[2] // .........................*........... - // sub v29.8H, v5.8H, v15.8H // ......................*.............. - // mls v12.8H, v11.8H, v7.H[0] // ..................*.................. 
- // add v27.8H, v5.8H, v15.8H // .......................*............. - // sqrdmulh v5.8H, v29.8H, v0.H[5] // ...............................*..... - // mul v14.8H, v29.8H, v0.H[4] // ..............................*...... - // mls v16.8H, v6.8H, v7.H[0] // ...................................*. - // mls v22.8H, v28.8H, v7.H[0] // ....................................* - // mul v21.8H, v27.8H, v0.H[2] // ............................*........ - // sqrdmulh v27.8H, v27.8H, v0.H[3] // .............................*....... - // add v25.8H, v30.8H, v12.8H // ..................................*.. - // sub v6.8H, v30.8H, v12.8H // .................................*... + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // ldr q31, [x0, #384] // *................................. + // ldr q22, [x0, #448] // .*................................ + // ldr q26, [x0, #320] // ..*............................... + // ldr q28, [x0, #128] // ...*.............................. + // ldr q21, [x0, #192] // ....*............................. + // ldr q10, [x0, #64] // .....*............................ + // mul v30.8H, v22.8H, v0.H[0] // .........*........................ + // sqrdmulh v8.8H, v22.8H, v0.H[1] // ..........*....................... + // sqrdmulh v3.8H, v26.8H, v0.H[1] // ...........*...................... + // sqrdmulh v19.8H, v31.8H, v0.H[1] // .......*.......................... + // mls v30.8H, v8.8H, v7.H[0] // ................*................. + // mul v24.8H, v26.8H, v0.H[0] // ............*..................... + // mul v5.8H, v31.8H, v0.H[0] // ........*......................... + // add v2.8H, v21.8H, v30.8H // ..................*............... + // sub v8.8H, v21.8H, v30.8H // .....................*............ + // mls v24.8H, v3.8H, v7.H[0] // .................*................ + // ldr q23, [x0, #256] // ......*........................... + // mls v5.8H, v19.8H, v7.H[0] // ...............*.................. 
+ // sqrdmulh v22.8H, v2.8H, v0.H[3] // ........................*......... + // sqrdmulh v3.8H, v8.8H, v0.H[5] // ..........................*....... + // mul v12.8H, v2.8H, v0.H[2] // .........................*........ + // mul v25.8H, v8.8H, v0.H[4] // ...........................*...... + // add v19.8H, v10.8H, v24.8H // ......................*........... + // sub v30.8H, v10.8H, v24.8H // .......................*.......... + // sub v8.8H, v28.8H, v5.8H // ....................*............. + // add v21.8H, v28.8H, v5.8H // ...................*.............. + // mls v25.8H, v3.8H, v7.H[0] // .................................* + // mls v12.8H, v22.8H, v7.H[0] // ................................*. + // mul v9.8H, v23.8H, v0.H[0] // .............*.................... + // sqrdmulh v29.8H, v23.8H, v0.H[1] // ..............*................... + // sqrdmulh v5.8H, v21.8H, v0.H[3] // ............................*..... + // sqrdmulh v3.8H, v8.8H, v0.H[5] // ..............................*... + // mul v13.8H, v8.8H, v0.H[4] // ...............................*.. + // mul v23.8H, v21.8H, v0.H[2] // .............................*.... sub count, count, #1 layer123_start: - ldr q12, [x0, #336] // .....e...................................................................... - ldr q2, [x0, #272] // ....e....................................................................... - ldr q24, [x0, #464] // .......e.................................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Cycle bound: 16.0 + // IPC bound: 4.75 + // + // Wall time: 3600.44s + // User time: 3600.44s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q4, [x0, #0] // *........................................................................... 
+ ldr q31, [x0, #400] // ......e..................................................................... + ldr q22, [x0, #464] // .......e.................................................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - add v28.8H, v8.8H, v17.8H // ............*............................................................... - mls v14.8H, v5.8H, v7.H[0] // ........................................*................................... - ldr q13, [x0, #208] // ...e........................................................................ - ldr q29, [x0, #400] // ......e..................................................................... - add v26.8H, v6.8H, v16.8H // ...............................................*............................ - sub v16.8H, v6.8H, v16.8H // ..............................................*............................. - ldr q8, [x0, #16] // e........................................................................... - sub v11.8H, v25.8H, v22.8H // ....................................*....................................... // gap // ............................................................................ - ldr q5, [x0, #144] // ..e......................................................................... + add v2.8H, v19.8H, v12.8H // .....................................*...................................... + ldr q26, [x0, #336] // .....e...................................................................... + ldr q28, [x0, #144] // ..e......................................................................... 
+ sub v8.8H, v30.8H, v25.8H // ..............................................*............................. // gap // ............................................................................ - add v3.8H, v25.8H, v22.8H // .....................................*...................................... // gap // ............................................................................ + sub v17.8H, v19.8H, v12.8H // ....................................*....................................... + mls v9.8H, v29.8H, v7.H[0] // ..........*................................................................. + mls v13.8H, v3.8H, v7.H[0] // ........................................*................................... + ldr q21, [x0, #208] // ...e........................................................................ + mls v23.8H, v5.8H, v7.H[0] // ..............................*............................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mls v21.8H, v27.8H, v7.H[0] // ..............................*............................................. - sqrdmulh v31.8H, v16.8H, v1.H[5] // ................................................................*........... - sqrdmulh v22.8H, v26.8H, v1.H[3] // ...........................................................*................ - sqrdmulh v4.8H, v11.8H, v1.H[1] // ......................................................*..................... - mul v23.8H, v11.8H, v1.H[0] // .....................................................*...................... - ldr q30, [x0, #80] // .e.......................................................................... // gap // ............................................................................ 
+ add v20.8H, v30.8H, v25.8H // ...............................................*............................ + sqrdmulh v15.8H, v8.8H, v1.H[5] // ...............................................................*............ + mul v27.8H, v8.8H, v1.H[4] // ................................................................*........... + sqrdmulh v18.8H, v17.8H, v1.H[1] // .....................................................*...................... + mul v17.8H, v17.8H, v1.H[0] // ......................................................*..................... // gap // ............................................................................ // gap // ............................................................................ - mul v20.8H, v16.8H, v1.H[4] // ...............................................................*............ - mul v19.8H, v24.8H, v0.H[0] // .......................e.................................................... - sqrdmulh v10.8H, v24.8H, v0.H[1] // ........................e................................................... - sqrdmulh v6.8H, v2.8H, v0.H[1] // .........e.................................................................. // gap // ............................................................................ // gap // ............................................................................ + sub v24.8H, v4.8H, v9.8H // ...........*................................................................ + add v14.8H, v4.8H, v9.8H // ............*............................................................... + ldr q10, [x0, #80] // .e.......................................................................... // gap // ............................................................................ + mul v30.8H, v22.8H, v0.H[0] // ........................e................................................... + sqrdmulh v8.8H, v22.8H, v0.H[1] // .......................e.................................................... 
// gap // ............................................................................ - mul v17.8H, v2.8H, v0.H[0] // ........e................................................................... - sqrdmulh v11.8H, v12.8H, v0.H[1] // ..............e............................................................. - mul v2.8H, v3.8H, v0.H[6] // ................................................*........................... - sqrdmulh v3.8H, v3.8H, v0.H[7] // .................................................*.......................... // gap // ............................................................................ + sqrdmulh v3.8H, v26.8H, v0.H[1] // .............e.............................................................. + sqrdmulh v11.8H, v20.8H, v1.H[3] // ..........................................................*................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - mul v15.8H, v29.8H, v0.H[0] // ..................e......................................................... - sub v27.8H, v28.8H, v21.8H // ...............................*............................................ - mls v23.8H, v4.8H, v7.H[0] // .......................................................*.................... // gap // ............................................................................ + mul v6.8H, v20.8H, v1.H[2] // ...........................................................*................ + mul v4.8H, v2.8H, v0.H[6] // .................................................*.......................... + sub v22.8H, v14.8H, v23.8H // ...............................*............................................ + sqrdmulh v19.8H, v31.8H, v0.H[1] // ..................e......................................................... 
+ mls v27.8H, v15.8H, v7.H[0] // .................................................................*.......... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v29.8H, v29.8H, v0.H[1] // ...................e........................................................ - mls v20.8H, v31.8H, v7.H[0] // .................................................................*.......... - sub v9.8H, v18.8H, v14.8H // .........................................*.................................. - mul v4.8H, v26.8H, v1.H[2] // ..........................................................*................. // gap // ............................................................................ - mls v19.8H, v10.8H, v7.H[0] // .........................e.................................................. + mls v17.8H, v18.8H, v7.H[0] // .......................................................*.................... + add v9.8H, v24.8H, v13.8H // ..........................................*................................. // gap // ............................................................................ + sub v29.8H, v24.8H, v13.8H // .........................................*.................................. + mls v30.8H, v8.8H, v7.H[0] // .........................e.................................................. // gap // ............................................................................ // gap // ............................................................................ - mls v17.8H, v6.8H, v7.H[0] // ..........e................................................................. - add v31.8H, v28.8H, v21.8H // ................................*........................................... 
// gap // ............................................................................ + mul v24.8H, v26.8H, v0.H[0] // ..............e............................................................. + mls v6.8H, v11.8H, v7.H[0] // ............................................................*............... // gap // ............................................................................ // gap // ............................................................................ - mls v2.8H, v3.8H, v7.H[0] // ..................................................*......................... // gap // ............................................................................ // gap // ............................................................................ - add v3.8H, v27.8H, v23.8H // .........................................................*.................. - sub v6.8H, v27.8H, v23.8H // ........................................................*................... - mls v15.8H, v29.8H, v7.H[0] // ....................e....................................................... - mul v12.8H, v12.8H, v0.H[0] // .............e.............................................................. // gap // ............................................................................ + mul v5.8H, v31.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v25.8H, v2.8H, v0.H[7] // ................................................*........................... + add v26.8H, v22.8H, v17.8H // .........................................................*.................. + sub v11.8H, v22.8H, v17.8H // ........................................................*................... + add v15.8H, v29.8H, v27.8H // ...................................................................*........ + sub v16.8H, v29.8H, v27.8H // ..................................................................*......... 
// gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - add v23.8H, v9.8H, v20.8H // ...................................................................*........ - sub v29.8H, v13.8H, v19.8H // ..........................e................................................. - mls v4.8H, v22.8H, v7.H[0] // ............................................................*............... - add v19.8H, v13.8H, v19.8H // ...........................e................................................ // gap // ............................................................................ + add v2.8H, v21.8H, v30.8H // ...........................e................................................ + sub v8.8H, v21.8H, v30.8H // ..........................e................................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - str q3, [x0, #128] // ......................................................................*..... // gap // ............................................................................ - sub v10.8H, v9.8H, v20.8H // ..................................................................*......... - str q6, [x0, #192] // .......................................................................*.... - add v20.8H, v18.8H, v14.8H // ..........................................*................................. + add v17.8H, v14.8H, v23.8H // ................................*........................................... + mls v24.8H, v3.8H, v7.H[0] // ...............e............................................................ 
+ str q16, [x0, #448] // ...........................................................................* + str q15, [x0, #384] // ..........................................................................*. + ldr q23, [x0, #272] // ....e....................................................................... // gap // ............................................................................ - sub v26.8H, v31.8H, v2.8H // ...................................................*........................ - sub v18.8H, v8.8H, v17.8H // ...........e................................................................ - sqrdmulh v28.8H, v19.8H, v0.H[3] // ..................................e......................................... - sqrdmulh v6.8H, v29.8H, v0.H[5] // ............................................e............................... - mul v16.8H, v29.8H, v0.H[4] // ...........................................e................................ - mul v22.8H, v19.8H, v0.H[2] // .................................e.......................................... - str q23, [x0, #384] // ..........................................................................*. // gap // ............................................................................ + mls v4.8H, v25.8H, v7.H[0] // ..................................................*......................... + mls v5.8H, v19.8H, v7.H[0] // ....................e....................................................... + add v27.8H, v9.8H, v6.8H // ..............................................................*............. + str q11, [x0, #192] // .......................................................................*.... + str q26, [x0, #128] // ......................................................................*..... + sqrdmulh v22.8H, v2.8H, v0.H[3] // .................................e.......................................... // gap // ............................................................................ 
+ sqrdmulh v3.8H, v8.8H, v0.H[5] // ...........................................e................................ + mul v12.8H, v2.8H, v0.H[2] // ..................................e......................................... // gap // ............................................................................ - str q26, [x0, #64] // .....................................................................*...... - sub v29.8H, v5.8H, v15.8H // .....................e...................................................... - add v26.8H, v31.8H, v2.8H // ....................................................*....................... + mul v25.8H, v8.8H, v0.H[4] // ............................................e............................... + add v19.8H, v10.8H, v24.8H // .................e.......................................................... + str q27, [x0, #256] // ........................................................................*... + sub v30.8H, v10.8H, v24.8H // ................e........................................................... // gap // ............................................................................ - sub v24.8H, v20.8H, v4.8H // .............................................................*.............. - mls v12.8H, v11.8H, v7.H[0] // ...............e............................................................ + sub v14.8H, v9.8H, v6.8H // .............................................................*.............. // gap // ............................................................................ // gap // ............................................................................ - add v27.8H, v5.8H, v15.8H // ......................e..................................................... - add v23.8H, v20.8H, v4.8H // ..............................................................*............. 
- str q10, [x0, #448] // ...........................................................................* // gap // ............................................................................ + sub v8.8H, v28.8H, v5.8H // .....................e...................................................... + add v16.8H, v17.8H, v4.8H // ....................................................*....................... + sub v15.8H, v17.8H, v4.8H // ...................................................*........................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v5.8H, v29.8H, v0.H[5] // .......................................e.................................... - mul v14.8H, v29.8H, v0.H[4] // ......................................e..................................... - str q24, [x0, #320] // .........................................................................*.. - mls v16.8H, v6.8H, v7.H[0] // .............................................e.............................. + add v21.8H, v28.8H, v5.8H // ......................e..................................................... + str q14, [x0, #320] // .........................................................................*.. + mls v25.8H, v3.8H, v7.H[0] // .............................................e.............................. + mls v12.8H, v22.8H, v7.H[0] // ...................................e........................................ + mul v9.8H, v23.8H, v0.H[0] // .........e.................................................................. // gap // ............................................................................ 
+ sqrdmulh v29.8H, v23.8H, v0.H[1] // ........e................................................................... // gap // ............................................................................ // gap // ............................................................................ - mls v22.8H, v28.8H, v7.H[0] // ...................................e........................................ - mul v21.8H, v27.8H, v0.H[2] // ............................e............................................... - str q26, [x0], #(16) // ....................................................................*....... - sqrdmulh v27.8H, v27.8H, v0.H[3] // .............................e.............................................. - str q23, [x0, #240] // ........................................................................*... - add v25.8H, v30.8H, v12.8H // .................e.......................................................... - sub v6.8H, v30.8H, v12.8H // ................e........................................................... + sqrdmulh v5.8H, v21.8H, v0.H[3] // ............................e............................................... + str q16, [x0], #(16) // ....................................................................*....... + sqrdmulh v3.8H, v8.8H, v0.H[5] // ......................................e..................................... + mul v13.8H, v8.8H, v0.H[4] // .......................................e.................................... + mul v23.8H, v21.8H, v0.H[2] // .............................e.............................................. + str q15, [x0, #48] // .....................................................................*...... // gap // ............................................................................ // gap // ............................................................................ 
- // original source code - // ldr q8, [x0, #0] // .........e..................................................................|........e................................................................ - // ldr q9, [x0, #(1*(512/8))] // ..................e.........................................................|.................e....................................................... - // ldr q10, [x0, #(2*(512/8))] // ...........e................................................................|..........e.............................................................. - // ldr q11, [x0, #(3*(512/8))] // .....e......................................................................|....e.................................................................... - // ldr q12, [x0, #(4*(512/8))] // .e..........................................................................|e........................................................................ - // ldr q13, [x0, #(5*(512/8))] // e...........................................................................e......................................................................... - // ldr q14, [x0, #(6*(512/8))] // ......e.....................................................................|.....e................................................................... - // ldr q15, [x0, #(7*(512/8))] // ..e.........................................................................|.e....................................................................... - // mul v24.8h, v12.8h, v0.h[0] // .......................e....................................................|......................e.................................................. - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ......................e.....................................................|.....................e................................................... 
- // mls v24.8h, v12.8h, v7.h[0] // ...................................e........................................|..................................e...................................... - // sub v12.8h, v8.8h, v24.8h // ...................................................e........................|..................................................e...................... - // add v8.8h, v8.8h, v24.8h // ...*........................................................................|..*...................................................................... - // mul v24.8h, v13.8h, v0.h[0] // .........................................e..................................|........................................e................................ - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ........................e...................................................|.......................e................................................. - // mls v24.8h, v13.8h, v7.h[0] // .............................................................e..............|............................................................e............ - // sub v13.8h, v9.8h, v24.8h // ...........................................................................e|......................................................................... - // add v9.8h, v9.8h, v24.8h // ..........................................................................e.|......................................................................... - // mul v24.8h, v14.8h, v0.h[0] // ...........................e................................................|..........................e.............................................. - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ..............................e.............................................|.............................e........................................... 
- // mls v24.8h, v14.8h, v7.h[0] // ........................................e...................................|.......................................e................................. - // sub v14.8h, v10.8h, v24.8h // ..........................................................e.................|.........................................................e............... - // add v10.8h, v10.8h, v24.8h // ..............................................................e.............|.............................................................e........... - // mul v24.8h, v15.8h, v0.h[0] // ....................e.......................................................|...................e..................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[1] // .....................e......................................................|....................e.................................................... - // mls v24.8h, v15.8h, v7.h[0] // ..................................e.........................................|.................................e....................................... - // sub v15.8h, v11.8h, v24.8h // ...........................................e................................|..........................................e.............................. - // add v11.8h, v11.8h, v24.8h // .............................................e..............................|............................................e............................ - // mul v24.8h, v10.8h, v0.h[2] // ......................................................................e.....|.....................................................................e... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ........................................................................e...|.......................................................................e. 
- // mls v24.8h, v10.8h, v7.h[0] // .............*..............................................................|............*............................................................ - // sub v10.8h, v8.8h, v24.8h // ............................*...............................................|...........................*............................................. - // add v8.8h, v8.8h, v24.8h // ....................................*.......................................|...................................*..................................... - // mul v24.8h, v11.8h, v0.h[2] // .......................................................e....................|......................................................e.................. - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ....................................................e.......................|...................................................e..................... - // mls v24.8h, v11.8h, v7.h[0] // .....................................................................e......|....................................................................e.... - // sub v11.8h, v9.8h, v24.8h // ..........*.................................................................|.........*............................................................... - // add v9.8h, v9.8h, v24.8h // ............*...............................................................|...........*............................................................. - // mul v24.8h, v14.8h, v0.h[4] // ..................................................................e.........|.................................................................e....... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .................................................................e..........|................................................................e........ 
- // mls v24.8h, v14.8h, v7.h[0] // ....*.......................................................................|...*..................................................................... - // sub v14.8h, v12.8h, v24.8h // ................................*...........................................|...............................*......................................... - // add v12.8h, v12.8h, v24.8h // .................................................*..........................|................................................*........................ - // mul v24.8h, v15.8h, v0.h[4] // ......................................................e.....................|.....................................................e................... - // sqrdmulh v15.8h, v15.8h, v0.h[5] // .....................................................e......................|....................................................e.................... - // mls v24.8h, v15.8h, v7.h[0] // ....................................................................e.......|...................................................................e..... - // sub v15.8h, v13.8h, v24.8h // ........*...................................................................|.......*................................................................. - // add v13.8h, v13.8h, v24.8h // .......*....................................................................|......*.................................................................. - // mul v24.8h, v9.8h, v0.h[6] // .........................*..................................................|........................*................................................ - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ..........................*.................................................|.........................*............................................... 
- // mls v24.8h, v9.8h, v7.h[0] // .....................................*......................................|....................................*.................................... - // sub v9.8h, v8.8h, v24.8h // ..................................................*.........................|.................................................*....................... - // add v8.8h, v8.8h, v24.8h // ...........................................................*................|..........................................................*.............. - // mul v24.8h, v11.8h, v1.h[0] // .................*..........................................................|................*........................................................ - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ................*...........................................................|...............*......................................................... - // mls v24.8h, v11.8h, v7.h[0] // .............................*..............................................|............................*............................................ - // sub v11.8h, v10.8h, v24.8h // .......................................*....................................|......................................*.................................. - // add v10.8h, v10.8h, v24.8h // ......................................*.....................................|.....................................*................................... - // mul v24.8h, v13.8h, v1.h[2] // .................................*..........................................|................................*........................................ - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ...............*............................................................|..............*.......................................................... 
- // mls v24.8h, v13.8h, v7.h[0] // ............................................*...............................|...........................................*............................. - // sub v13.8h, v12.8h, v24.8h // ............................................................*...............|...........................................................*............. - // add v12.8h, v12.8h, v24.8h // ...............................................................*............|..............................................................*.......... - // mul v24.8h, v15.8h, v1.h[4] // ...................*........................................................|..................*...................................................... - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ..............*.............................................................|.............*........................................................... - // mls v24.8h, v15.8h, v7.h[0] // ...............................*............................................|..............................*.......................................... - // sub v15.8h, v14.8h, v24.8h // ...............................................*............................|..............................................*.......................... - // add v14.8h, v14.8h, v24.8h // ..........................................*.................................|.........................................*............................... - // str q8, [x0], #(16) // .......................................................................*....|......................................................................*.. - // str q9, [x0, #(-16 + 1*(512/8))] // .........................................................*..................|........................................................*................ 
- // str q10, [x0, #(-16 + 2*(512/8))] // ..............................................*.............................|.............................................*........................... - // str q11, [x0, #(-16 + 3*(512/8))] // ................................................*...........................|...............................................*......................... - // str q12, [x0, #(-16 + 4*(512/8))] // .........................................................................*..|........................................................................* - // str q13, [x0, #(-16 + 5*(512/8))] // ...................................................................*........|..................................................................*...... - // str q14, [x0, #(-16 + 6*(512/8))] // ........................................................*...................|.......................................................*................. - // str q15, [x0, #(-16 + 7*(512/8))] // ................................................................*...........|...............................................................*......... + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #0] // ...........................................................................*........................................................................... + // ldr q9, [x0, #(1*(512/8))] // ..................e........................................................'..................~........................................................ 
+ // ldr q10, [x0, #(2*(512/8))] // ....e......................................................................'....~...................................................................... + // ldr q11, [x0, #(3*(512/8))] // .........e.................................................................'.........~................................................................. + // ldr q12, [x0, #(4*(512/8))] // ..............................................e............................'..............................................~............................ + // ldr q13, [x0, #(5*(512/8))] // ...e.......................................................................'...~....................................................................... + // ldr q14, [x0, #(6*(512/8))] // e..........................................................................'~.......................................................................... + // ldr q15, [x0, #(7*(512/8))] // .e.........................................................................'.~......................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ....................................................................e......'....................................................................~...... + // mul v24.8h, v12.8h, v0.h[0] // ...................................................................e.......'...................................................................~....... + // mls v24.8h, v27.8h, v7.h[0] // .......~...................................................................'.......*................................................................... + // sub v12.8h, v8.8h, v24.8h // ................~..........................................................'................*.......................................................... 
+ // add v8.8h, v8.8h, v24.8h // .................~.........................................................'.................*......................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // .....................e.....................................................'.....................~..................................................... + // mul v24.8h, v13.8h, v0.h[0] // ................................e..........................................'................................~.......................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................................e...............................'...........................................~............................... + // sub v13.8h, v9.8h, v24.8h // ..........................................................e................'..........................................................~................ + // add v9.8h, v9.8h, v24.8h // ........................................................e..................'........................................................~.................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..........................e................................................'..........................~................................................ + // mul v24.8h, v14.8h, v0.h[0] // ..................................e........................................'..................................~........................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................................e..........................'................................................~.......................... + // sub v14.8h, v10.8h, v24.8h // ............................................................e..............'............................................................~.............. 
+ // add v10.8h, v10.8h, v24.8h // ...............................................................e...........'...............................................................~........... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ....................e......................................................'....................~...................................................... + // mul v24.8h, v15.8h, v0.h[0] // ...................e.......................................................'...................~....................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................e...........................................'...............................~........................................... + // sub v15.8h, v11.8h, v24.8h // .........................................e.................................'.........................................~................................. + // add v11.8h, v11.8h, v24.8h // ........................................e..................................'........................................~.................................. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....................................................................e.....'.....................................................................~..... + // mul v24.8h, v10.8h, v0.h[2] // .........................................................................e.'.........................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ..........~................................................................'..........*................................................................ + // sub v10.8h, v8.8h, v24.8h // .........................~.................................................'.........................*................................................. 
+ // add v8.8h, v8.8h, v24.8h // ..........................................~................................'..........................................*................................ + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ....................................................e......................'....................................................~...................... + // mul v24.8h, v11.8h, v0.h[2] // ......................................................e....................'......................................................~.................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................................................e........'..................................................................~........ + // sub v11.8h, v9.8h, v24.8h // ......~....................................................................'......*.................................................................... + // add v9.8h, v9.8h, v24.8h // ..~........................................................................'..*........................................................................ + // sqrdmulh v27.8h, v14.8h, v0.h[5] // .......................................................................e...'.......................................................................~... + // mul v24.8h, v14.8h, v0.h[4] // ........................................................................e..'........................................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // ........~..................................................................'........*.................................................................. + // sub v14.8h, v12.8h, v24.8h // ..............................~............................................'..............................*............................................ 
+ // add v12.8h, v12.8h, v24.8h // .............................~.............................................'.............................*............................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .....................................................e.....................'.....................................................~..................... + // mul v24.8h, v15.8h, v0.h[4] // .......................................................e...................'.......................................................~................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e.........'.................................................................~......... + // sub v15.8h, v13.8h, v24.8h // .....~.....................................................................'.....*..................................................................... + // add v13.8h, v13.8h, v24.8h // ...........~...............................................................'...........*............................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ...................................~.......................................'...................................*....................................... + // mul v24.8h, v9.8h, v0.h[6] // ........................~..................................................'........................*.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...............................................~...........................'...............................................*........................... + // sub v9.8h, v8.8h, v24.8h // ..............................................................~............'..............................................................*............ 
+ // add v8.8h, v8.8h, v24.8h // .............................................................~.............'.............................................................*............. + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ..............~............................................................'..............*............................................................ + // mul v24.8h, v11.8h, v1.h[0] // ...............~...........................................................'...............*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................~..............................................'............................*.............................................. + // sub v11.8h, v10.8h, v24.8h // .....................................~.....................................'.....................................*..................................... + // add v10.8h, v10.8h, v24.8h // ....................................~......................................'....................................*...................................... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ......................~....................................................'......................*.................................................... + // mul v24.8h, v13.8h, v1.h[2] // .......................~...................................................'.......................*................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................~.........................................'.................................*......................................... + // sub v13.8h, v12.8h, v24.8h // ...........................................................~...............'...........................................................*............... 
+ // add v12.8h, v12.8h, v24.8h // .................................................~.........................'.................................................*......................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ............~..............................................................'............*.............................................................. + // mul v24.8h, v15.8h, v1.h[4] // .............~.............................................................'.............*............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................~...............................................'...........................*............................................... + // sub v15.8h, v14.8h, v24.8h // .......................................~...................................'.......................................*................................... + // add v14.8h, v14.8h, v24.8h // ......................................~....................................'......................................*.................................... + // str q8, [x0], #(16) // ......................................................................~....'......................................................................*.... + // str q9, [x0, #(-16 + 1*(512/8))] // ..........................................................................~'..........................................................................* + // str q10, [x0, #(-16 + 2*(512/8))] // ...................................................~.......................'...................................................*....................... + // str q11, [x0, #(-16 + 3*(512/8))] // ..................................................~........................'..................................................*........................ 
+ // str q12, [x0, #(-16 + 4*(512/8))] // .........................................................~.................'.........................................................*................. + // str q13, [x0, #(-16 + 5*(512/8))] // ................................................................~..........'................................................................*.......... + // str q14, [x0, #(-16 + 6*(512/8))] // .............................................~.............................'.............................................*............................. + // str q15, [x0, #(-16 + 7*(512/8))] // ............................................~..............................'............................................*.............................. sub count, count, #1 cbnz count, layer123_start - sub v4.8H, v25.8H, v22.8H // ....*.................................. - add v13.8H, v25.8H, v22.8H // .....*................................. - add v29.8H, v8.8H, v17.8H // *...................................... - add v12.8H, v6.8H, v16.8H // ..*.................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v16.8H, v6.8H, v16.8H // ...*................................... - mls v21.8H, v27.8H, v7.H[0] // ......*................................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqrdmulh v11.8H, v12.8H, v1.H[3] // ........*.............................. - mul v3.8H, v12.8H, v1.H[2] // ..................*.................... - mul v10.8H, v4.8H, v1.H[0] // ..........*............................ 
- sqrdmulh v2.8H, v4.8H, v1.H[1] // .........*............................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqrdmulh v19.8H, v16.8H, v1.H[5] // .......*............................... - mul v16.8H, v16.8H, v1.H[4] // ...........*........................... - mls v14.8H, v5.8H, v7.H[0] // .*..................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mul v27.8H, v13.8H, v0.H[6] // ............*.......................... - sqrdmulh v23.8H, v13.8H, v0.H[7] // .............*......................... - sub v9.8H, v29.8H, v21.8H // ..............*........................ - add v4.8H, v29.8H, v21.8H // ...................*................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v3.8H, v11.8H, v7.H[0] // ........................*.............. - mls v10.8H, v2.8H, v7.H[0] // ...............*....................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v16.8H, v19.8H, v7.H[0] // ................*...................... - add v13.8H, v18.8H, v14.8H // ............................*.......... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v26.8H, v18.8H, v14.8H // .................*..................... - mls v27.8H, v23.8H, v7.H[0] // ....................*.................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v11.8H, v9.8H, v10.8H // ......................*................ - add v10.8H, v9.8H, v10.8H // .....................*................. - sub v14.8H, v13.8H, v3.8H // .................................*..... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - add v23.8H, v13.8H, v3.8H // ..................................*.... - add v22.8H, v26.8H, v16.8H // .......................*............... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - str q14, [x0, #320] // ....................................*.. - str q11, [x0, #192] // ...........................*........... - add v25.8H, v4.8H, v27.8H // ................................*...... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - str q23, [x0, #256] // ......................................* - str q22, [x0, #384] // ..............................*........ - sub v22.8H, v4.8H, v27.8H // .............................*......... - sub v4.8H, v26.8H, v16.8H // ..........................*............ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - str q10, [x0, #128] // .........................*............. - str q25, [x0], #(16) // .....................................*. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - str q22, [x0, #48] // ...............................*....... - str q4, [x0, #432] // ...................................*... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... + // Instructions: 42 + // Expected cycles: 14 + // Expected IPC: 3.00 + // + // Cycle bound: 14.0 + // IPC bound: 3.00 + // + // Wall time: 0.77s + // User time: 0.77s + // + // ----------- original position -----------> + // 0 25 + // |------------------------|---------------- + mls v9.8H, v29.8H, v7.H[0] // ....*..................................... + sub v29.8H, v30.8H, v25.8H // ..*....................................... + sub v21.8H, v19.8H, v12.8H // ...*...................................... 
+ add v15.8H, v19.8H, v12.8H // .*........................................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + ldr q14, [x0, #0] // *......................................... + mls v13.8H, v3.8H, v7.H[0] // .....*.................................... + add v3.8H, v30.8H, v25.8H // .......*.................................. + mls v23.8H, v5.8H, v7.H[0] // ......*................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mul v28.8H, v21.8H, v1.H[0] // ...........*.............................. + sqrdmulh v27.8H, v21.8H, v1.H[1] // ..........*............................... + mul v2.8H, v15.8H, v0.H[6] // ................*......................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqrdmulh v15.8H, v15.8H, v0.H[7] // .......................*.................. + sqrdmulh v16.8H, v3.8H, v1.H[3] // ..............*........................... + mul v11.8H, v3.8H, v1.H[2] // ...............*.......................... + sqrdmulh v22.8H, v29.8H, v1.H[5] // ........*................................. + mul v25.8H, v29.8H, v1.H[4] // .........*................................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v6.8H, v14.8H, v9.8H // ............*............................. 
+ add v31.8H, v14.8H, v9.8H // .............*............................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v2.8H, v15.8H, v7.H[0] // ...............................*.......... + mls v28.8H, v27.8H, v7.H[0] // ...................*...................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v26.8H, v6.8H, v13.8H // ....................*..................... + add v12.8H, v31.8H, v23.8H // ............................*............. + sub v15.8H, v31.8H, v23.8H // .................*........................ + mls v25.8H, v22.8H, v7.H[0] // ..................*....................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v11.8H, v16.8H, v7.H[0] // ......................*................... + sub v14.8H, v6.8H, v13.8H // .....................*.................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v27.8H, v15.8H, v28.8H // ........................*................. 
+ sub v15.8H, v15.8H, v28.8H // .........................*................ + add v16.8H, v12.8H, v2.8H // .....................................*.... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v28.8H, v14.8H, v25.8H // ..........................*............... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q15, [x0, #192] // .................................*........ + sub v6.8H, v14.8H, v25.8H // ...........................*.............. + sub v15.8H, v12.8H, v2.8H // ......................................*... + str q16, [x0], #(16) // ........................................*. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q28, [x0, #368] // ..............................*........... + str q27, [x0, #112] // ..................................*....... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v27.8H, v26.8H, v11.8H // ....................................*..... + add v14.8H, v26.8H, v11.8H // ................................*......... 
+ str q15, [x0, #48] // .........................................* + str q6, [x0, #432] // .............................*............ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q14, [x0, #240] // ...................................*...... + str q27, [x0, #304] // .......................................*.. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... - // original source code - // add v28.8H, v8.8H, v17.8H // ..*.................................... - // mls v14.8H, v5.8H, v7.H[0] // ............*.......................... - // add v26.8H, v6.8H, v16.8H // ...*................................... - // sub v16.8H, v6.8H, v16.8H // ....*.................................. - // sub v11.8H, v25.8H, v22.8H // *...................................... - // add v3.8H, v25.8H, v22.8H // .*..................................... - // mls v21.8H, v27.8H, v7.H[0] // .....*................................. - // sqrdmulh v31.8H, v16.8H, v1.H[5] // ..........*............................ - // sqrdmulh v22.8H, v26.8H, v1.H[3] // ......*................................ - // sqrdmulh v4.8H, v11.8H, v1.H[1] // .........*............................. - // mul v23.8H, v11.8H, v1.H[0] // ........*.............................. - // mul v20.8H, v16.8H, v1.H[4] // ...........*........................... - // mul v2.8H, v3.8H, v0.H[6] // .............*......................... 
- // sqrdmulh v3.8H, v3.8H, v0.H[7] // ..............*........................ - // sub v27.8H, v28.8H, v21.8H // ...............*....................... - // mls v23.8H, v4.8H, v7.H[0] // ..................*.................... - // mls v20.8H, v31.8H, v7.H[0] // ...................*................... - // sub v9.8H, v18.8H, v14.8H // .....................*................. - // mul v4.8H, v26.8H, v1.H[2] // .......*............................... - // add v31.8H, v28.8H, v21.8H // ................*...................... - // mls v2.8H, v3.8H, v7.H[0] // ......................*................ - // add v3.8H, v27.8H, v23.8H // ........................*.............. - // sub v6.8H, v27.8H, v23.8H // .......................*............... - // add v23.8H, v9.8H, v20.8H // ...........................*........... - // mls v4.8H, v22.8H, v7.H[0] // .................*..................... - // str q3, [x0, #128] // ...................................*... - // sub v10.8H, v9.8H, v20.8H // ..................................*.... - // str q6, [x0, #192] // .............................*......... - // add v20.8H, v18.8H, v14.8H // ....................*.................. - // sub v26.8H, v31.8H, v2.8H // .................................*..... - // str q23, [x0, #384] // ................................*...... - // str q26, [x0, #64] // .....................................*. - // add v26.8H, v31.8H, v2.8H // ..............................*........ - // sub v24.8H, v20.8H, v4.8H // .........................*............. - // add v23.8H, v20.8H, v4.8H // ..........................*............ - // str q10, [x0, #448] // ......................................* - // str q24, [x0, #320] // ............................*.......... - // str q26, [x0], #(16) // ....................................*.. - // str q23, [x0, #240] // ...............................*....... 
+ // ------------- new position --------------> + // 0 25 + // |------------------------|---------------- + // ldr q4, [x0, #0] // ....*..................................... + // add v2.8H, v19.8H, v12.8H // ...*...................................... + // sub v8.8H, v30.8H, v25.8H // .*........................................ + // sub v17.8H, v19.8H, v12.8H // ..*....................................... + // mls v9.8H, v29.8H, v7.H[0] // *......................................... + // mls v13.8H, v3.8H, v7.H[0] // .....*.................................... + // mls v23.8H, v5.8H, v7.H[0] // .......*.................................. + // add v20.8H, v30.8H, v25.8H // ......*................................... + // sqrdmulh v15.8H, v8.8H, v1.H[5] // ..............*........................... + // mul v27.8H, v8.8H, v1.H[4] // ...............*.......................... + // sqrdmulh v18.8H, v17.8H, v1.H[1] // .........*................................ + // mul v17.8H, v17.8H, v1.H[0] // ........*................................. + // sub v24.8H, v4.8H, v9.8H // ................*......................... + // add v14.8H, v4.8H, v9.8H // .................*........................ + // sqrdmulh v11.8H, v20.8H, v1.H[3] // ............*............................. + // mul v6.8H, v20.8H, v1.H[2] // .............*............................ + // mul v4.8H, v2.8H, v0.H[6] // ..........*............................... + // sub v22.8H, v14.8H, v23.8H // ......................*................... + // mls v27.8H, v15.8H, v7.H[0] // .......................*.................. + // mls v17.8H, v18.8H, v7.H[0] // ...................*...................... + // add v9.8H, v24.8H, v13.8H // ....................*..................... + // sub v29.8H, v24.8H, v13.8H // .........................*................ + // mls v6.8H, v11.8H, v7.H[0] // ........................*................. + // sqrdmulh v25.8H, v2.8H, v0.H[7] // ...........*.............................. 
+ // add v26.8H, v22.8H, v17.8H // ..........................*............... + // sub v11.8H, v22.8H, v17.8H // ...........................*.............. + // add v15.8H, v29.8H, v27.8H // .............................*............ + // sub v16.8H, v29.8H, v27.8H // ...............................*.......... + // add v17.8H, v14.8H, v23.8H // .....................*.................... + // str q16, [x0, #448] // .......................................*.. + // str q15, [x0, #384] // ..................................*....... + // mls v4.8H, v25.8H, v7.H[0] // ..................*....................... + // add v27.8H, v9.8H, v6.8H // .....................................*.... + // str q11, [x0, #192] // ..............................*........... + // str q26, [x0, #128] // ...................................*...... + // str q27, [x0, #256] // ........................................*. + // sub v14.8H, v9.8H, v6.8H // ....................................*..... + // add v16.8H, v17.8H, v4.8H // ............................*............. + // sub v15.8H, v17.8H, v4.8H // ................................*......... + // str q14, [x0, #320] // .........................................* + // str q16, [x0], #(16) // .................................*........ + // str q15, [x0, #48] // ......................................*... restore inp, STACK0 mov count, #8 .p2align 2 - ldr q9, [x3], #16 // *................................ - ldr q15, [x1, #32] // .*............................... - ldr q31, [x1, #48] // ..*.............................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - ldr q13, [x1, #16] // ....*............................ - ldr q4, [x1, #0] // ...*............................. - ldr q23, [x4], #(6*16) // .........................*....... 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sqrdmulh v2.8H, v15.8H, v9.H[1] // ......*.......................... - mul v17.8H, v31.8H, v9.H[0] // .......*......................... - sqrdmulh v20.8H, v31.8H, v9.H[1] // ........*........................ - mul v26.8H, v15.8H, v9.H[0] // .....*........................... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - mls v17.8H, v20.8H, v7.H[0] // ..........*...................... - mls v26.8H, v2.8H, v7.H[0] // .........*....................... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sub v5.8H, v4.8H, v26.8H // ...........*..................... - add v18.8H, v4.8H, v26.8H // .............*................... - sub v19.8H, v13.8H, v17.8H // ..............*.................. - add v1.8H, v13.8H, v17.8H // ............*.................... - ldr q4, [x4, #-80] // ..............................*.. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sqrdmulh v12.8H, v19.8H, v9.H[5] // ...............*................. - mul v11.8H, v19.8H, v9.H[4] // ..................*.............. - sqrdmulh v19.8H, v1.8H, v9.H[3] // ................*................ - mul v16.8H, v1.8H, v9.H[2] // .................*............... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - mls v16.8H, v19.8H, v7.H[0] // ...................*............. - mls v11.8H, v12.8H, v7.H[0] // ....................*............ - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - add v19.8H, v18.8H, v16.8H // ......................*.......... - sub v16.8H, v18.8H, v16.8H // .....................*........... - sub v25.8H, v5.8H, v11.8H // .......................*......... - add v13.8H, v5.8H, v11.8H // ........................*........ - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - trn2 v10.4S, v13.4S, v25.4S // ............................*.... - trn1 v24.4S, v13.4S, v25.4S // .............................*... - trn2 v12.4S, v19.4S, v16.4S // ..........................*...... - trn1 v21.4S, v19.4S, v16.4S // ...........................*..... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - trn2 v0.2D, v21.2D, v24.2D // ...............................*. - trn2 v3.2D, v12.2D, v10.2D // ................................* - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. + // Instructions: 37 + // Expected cycles: 23 + // Expected IPC: 1.61 + // + // Cycle bound: 23.0 + // IPC bound: 1.61 + // + // Wall time: 0.53s + // User time: 0.53s + // + // -------- original position ---------> + // 0 25 + // |------------------------|----------- + ldr q27, [x1, #32] // ..*.................................. + ldr q13, [x1, #48] // .*................................... + ldr q15, [x3], #16 // *.................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + ldr q0, [x4, #48] // ..............................*...... + ldr q23, [x1, #0] // ....*................................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + ldr q6, [x4, #16] // ................................*.... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + ldr q16, [x1, #16] // ...*................................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v19.8H, v27.8H, v15.H[1] // ........*............................ + mul v26.8H, v27.8H, v15.H[0] // .........*........................... + sqrdmulh v28.8H, v13.8H, v15.H[1] // ......*.............................. + mul v21.8H, v13.8H, v15.H[0] // .......*............................. + ldr q11, [x4, #32] // ....................................* + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v26.8H, v19.8H, v7.H[0] // ...........*......................... 
+ mls v21.8H, v28.8H, v7.H[0] // ..........*.......................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v14.8H, v23.8H, v26.8H // ............*........................ + add v24.8H, v16.8H, v21.8H // ..............*...................... + sub v28.8H, v16.8H, v21.8H // .............*....................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ // gap // ..................................... + sqrdmulh v12.8H, v24.8H, v15.H[3] // .................*................... + mul v16.8H, v24.8H, v15.H[2] // ..................*.................. + mul v4.8H, v28.8H, v15.H[4] // ................*.................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v15.8H, v28.8H, v15.H[5] // ...............*..................... + // gap // ..................................... + ldr q24, [x4], #(6*16) // .....*............................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v16.8H, v12.8H, v7.H[0] // .....................*............... + mls v4.8H, v15.8H, v7.H[0] // ....................*................ + add v15.8H, v23.8H, v26.8H // ...................*................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + ldr q26, [x4, #-32] // ...............................*..... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sub v28.8H, v15.8H, v16.8H // ......................*.............. + add v25.8H, v15.8H, v16.8H // .........................*........... + add v1.8H, v14.8H, v4.8H // .......................*............. + sub v12.8H, v14.8H, v4.8H // ........................*............ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn2 v15.4S, v25.4S, v28.4S // ...........................*......... + trn1 v4.4S, v25.4S, v28.4S // ..........................*.......... + trn2 v16.4S, v1.4S, v12.4S // .............................*....... + trn1 v21.4S, v1.4S, v12.4S // ............................*........ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn2 v5.2D, v15.2D, v16.2D // .................................*... + trn1 v15.2D, v15.2D, v16.2D // ..................................*.. + trn2 v28.2D, v4.2D, v21.2D // ...................................*. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... - // original source code - // ldr q1, [x3], #16 // *................................ - // ldr q27, [x1, #32] // .*............................... - // ldr q13, [x1, #48] // ..*.............................. - // ldr q30, [x1, #0] // ....*............................ - // ldr q9, [x1, #16] // ...*............................. - // mul v15.8H, v27.8H, v1.H[0] // .........*....................... - // sqrdmulh v16.8H, v27.8H, v1.H[1] // ......*.......................... - // mul v19.8H, v13.8H, v1.H[0] // .......*......................... - // sqrdmulh v28.8H, v13.8H, v1.H[1] // ........*........................ - // mls v15.8H, v16.8H, v7.H[0] // ...........*..................... - // mls v19.8H, v28.8H, v7.H[0] // ..........*...................... - // sub v14.8H, v30.8H, v15.8H // ............*.................... - // add v27.8H, v9.8H, v19.8H // ...............*................. - // add v0.8H, v30.8H, v15.8H // .............*................... - // sub v26.8H, v9.8H, v19.8H // ..............*.................. 
- // sqrdmulh v17.8H, v26.8H, v1.H[5] // .................*............... - // sqrdmulh v10.8H, v27.8H, v1.H[3] // ...................*............. - // mul v30.8H, v27.8H, v1.H[2] // ....................*............ - // mul v16.8H, v26.8H, v1.H[4] // ..................*.............. - // mls v30.8H, v10.8H, v7.H[0] // .....................*........... - // mls v16.8H, v17.8H, v7.H[0] // ......................*.......... - // sub v17.8H, v0.8H, v30.8H // ........................*........ - // add v13.8H, v0.8H, v30.8H // .......................*......... - // sub v24.8H, v14.8H, v16.8H // .........................*....... - // add v0.8H, v14.8H, v16.8H // ..........................*...... - // ldr q23, [x4], #(6*16) // .....*........................... - // trn2 v12.4S, v13.4S, v17.4S // .............................*... - // trn1 v21.4S, v13.4S, v17.4S // ..............................*.. - // trn2 v10.4S, v0.4S, v24.4S // ...........................*..... - // trn1 v24.4S, v0.4S, v24.4S // ............................*.... - // ldr q4, [x4, #-80] // ................*................ - // trn2 v0.2D, v21.2D, v24.2D // ...............................*. - // trn2 v3.2D, v12.2D, v10.2D // ................................* + // ----------- new position -----------> + // 0 25 + // |------------------------|----------- + // ldr q1, [x3], #16 // ..*.................................. + // ldr q8, [x1, #48] // .*................................... + // ldr q9, [x1, #32] // *.................................... + // ldr q29, [x1, #16] // ......*.............................. + // ldr q2, [x1, #0] // ....*................................ + // ldr q24, [x4], #(6*16) // .....................*............... + // sqrdmulh v25.8H, v8.8H, v1.H[1] // .........*........................... + // mul v13.8H, v8.8H, v1.H[0] // ..........*.......................... + // sqrdmulh v5.8H, v9.8H, v1.H[1] // .......*............................. 
+ // mul v22.8H, v9.8H, v1.H[0] // ........*............................ + // mls v13.8H, v25.8H, v7.H[0] // .............*....................... + // mls v22.8H, v5.8H, v7.H[0] // ............*........................ + // sub v21.8H, v2.8H, v22.8H // ..............*...................... + // sub v28.8H, v29.8H, v13.8H // ................*.................... + // add v26.8H, v29.8H, v13.8H // ...............*..................... + // sqrdmulh v25.8H, v28.8H, v1.H[5] // ....................*................ + // mul v0.8H, v28.8H, v1.H[4] // ...................*................. + // sqrdmulh v29.8H, v26.8H, v1.H[3] // .................*................... + // mul v20.8H, v26.8H, v1.H[2] // ..................*.................. + // add v15.8H, v2.8H, v22.8H // ........................*............ + // mls v0.8H, v25.8H, v7.H[0] // .......................*............. + // mls v20.8H, v29.8H, v7.H[0] // ......................*.............. + // sub v25.8H, v15.8H, v20.8H // ..........................*.......... + // add v26.8H, v21.8H, v0.8H // ............................*........ + // sub v0.8H, v21.8H, v0.8H // .............................*....... + // add v15.8H, v15.8H, v20.8H // ...........................*......... + // trn1 v4.4S, v15.4S, v25.4S // ...............................*..... + // trn2 v15.4S, v15.4S, v25.4S // ..............................*...... + // trn1 v21.4S, v26.4S, v0.4S // .................................*... + // trn2 v18.4S, v26.4S, v0.4S // ................................*.... + // ldr q0, [x4, #-48] // ...*................................. + // ldr q26, [x4, #-32] // .........................*........... + // ldr q6, [x4, #-80] // .....*............................... + // trn2 v5.2D, v15.2D, v18.2D // ..................................*.. + // trn1 v15.2D, v15.2D, v18.2D // ...................................*. 
+ // trn2 v28.2D, v4.2D, v21.2D // ....................................* + // ldr q11, [x4, #-64] // ...........*......................... sub count, count, #1 layer4567_start: - mul v18.8H, v3.8H, v23.8H // ............................................*........................... - mul v20.8H, v0.8H, v23.8H // .......................................*................................ + // Instructions: 72 + // Expected cycles: 26 + // Expected IPC: 2.77 + // + // Cycle bound: 26.0 + // IPC bound: 2.77 + // + // Wall time: 29.81s + // User time: 29.81s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- ldr q1, [x3], #16 // ....e................................................................... - ldr q27, [x1, #96] // ..e..................................................................... - ldr q13, [x1, #112] // ...e.................................................................... + ldr q8, [x1, #112] // ...e.................................................................... + sqrdmulh v27.8H, v28.8H, v6.8H // .......................................*................................ + ldr q9, [x1, #96] // ..e..................................................................... + mul v19.8H, v28.8H, v24.8H // ........................................*............................... + mul v31.8H, v5.8H, v24.8H // .............................................*.......................... // gap // ........................................................................ - trn1 v26.2D, v21.2D, v24.2D // ...............................*........................................ - sqrdmulh v8.8H, v3.8H, v4.8H // .............................................*.......................... - ldr q30, [x1, #64] // e....................................................................... 
- trn1 v23.2D, v12.2D, v10.2D // ................................*....................................... - sqrdmulh v6.8H, v0.8H, v4.8H // ........................................*............................... + sqrdmulh v6.8H, v5.8H, v6.8H // ............................................*........................... + ldr q29, [x1, #80] // .e...................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - ldr q9, [x1, #80] // .e...................................................................... // gap // ........................................................................ // gap // ........................................................................ + ldr q2, [x1, #64] // e....................................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - mls v18.8H, v8.8H, v7.H[0] // ..............................................*......................... - ldr q4, [x4, #-16] // ......................................*................................. // gap // ........................................................................ + trn1 v20.2D, v4.2D, v21.2D // ...............................*........................................ 
+ mls v31.8H, v6.8H, v7.H[0] // ..............................................*......................... + mls v19.8H, v27.8H, v7.H[0] // .........................................*.............................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - ldr q25, [x4, #-32] // .....................................*.................................. - mul v15.8H, v27.8H, v1.H[0] // .....e.................................................................. - sqrdmulh v16.8H, v27.8H, v1.H[1] // ......e................................................................. - mul v19.8H, v13.8H, v1.H[0] // ..........e............................................................. - sqrdmulh v28.8H, v13.8H, v1.H[1] // ...........e............................................................ // gap // ........................................................................ + ldr q24, [x4], #(6*16) // .................................e...................................... + sqrdmulh v25.8H, v8.8H, v1.H[1] // ..........e............................................................. + mul v13.8H, v8.8H, v1.H[0] // ...........e............................................................ + sqrdmulh v5.8H, v9.8H, v1.H[1] // .....e.................................................................. // gap // ........................................................................ - ldr q10, [x4, #-64] // ...................................*.................................... - ldr q27, [x4, #-48] // ....................................*................................... - mls v20.8H, v6.8H, v7.H[0] // .........................................*.............................. 
// gap // ........................................................................ // gap // ........................................................................ + mul v22.8H, v9.8H, v1.H[0] // ......e................................................................. + ldr q14, [x4, #-112] // ......................................*................................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - add v3.8H, v23.8H, v18.8H // ................................................*....................... - sub v23.8H, v23.8H, v18.8H // ...............................................*........................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + add v16.8H, v20.8H, v19.8H // ...........................................*............................ + sub v6.8H, v20.8H, v19.8H // ..........................................*............................. + add v27.8H, v15.8H, v31.8H // ................................................*....................... + sub v15.8H, v15.8H, v31.8H // ...............................................*........................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ 
- mls v15.8H, v16.8H, v7.H[0] // .......e................................................................ - mls v19.8H, v28.8H, v7.H[0] // ............e........................................................... // gap // ........................................................................ + mls v13.8H, v25.8H, v7.H[0] // ............e........................................................... + mls v22.8H, v5.8H, v7.H[0] // .......e................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - sqrdmulh v31.8H, v23.8H, v4.8H // .......................................................*................ - mul v4.8H, v23.8H, v25.8H // ......................................................*................. - sqrdmulh v17.8H, v3.8H, v27.8H // ..................................................*..................... - mul v3.8H, v3.8H, v10.8H // .................................................*...................... // gap // ........................................................................ + sqrdmulh v8.8H, v15.8H, v14.8H // ......................................................*................. + sqrdmulh v31.8H, v27.8H, v0.8H // .................................................*...................... + mul v27.8H, v27.8H, v11.8H // ..................................................*..................... + mul v15.8H, v15.8H, v26.8H // .......................................................*................ // gap // ........................................................................ 
// gap // ........................................................................ // gap // ........................................................................ - sub v21.8H, v26.8H, v20.8H // ..........................................*............................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - add v22.8H, v26.8H, v20.8H // ...........................................*............................ - sub v14.8H, v30.8H, v15.8H // ........e............................................................... - add v27.8H, v9.8H, v19.8H // ..............e......................................................... - add v0.8H, v30.8H, v15.8H // .........e.............................................................. - sub v26.8H, v9.8H, v19.8H // .............e.......................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sub v21.8H, v2.8H, v22.8H // ........e............................................................... + sub v28.8H, v29.8H, v13.8H // .............e.......................................................... + add v26.8H, v29.8H, v13.8H // ..............e......................................................... // gap // ........................................................................ 
- mls v4.8H, v31.8H, v7.H[0] // ........................................................*............... - mls v3.8H, v17.8H, v7.H[0] // ...................................................*.................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + mls v15.8H, v8.8H, v7.H[0] // ........................................................*............... + mls v27.8H, v31.8H, v7.H[0] // ...................................................*.................... // gap // ........................................................................ // gap // ........................................................................ - sqrdmulh v17.8H, v26.8H, v1.H[5] // .....................e.................................................. - sqrdmulh v10.8H, v27.8H, v1.H[3] // ................e....................................................... - mul v30.8H, v27.8H, v1.H[2] // ...............e........................................................ - mul v16.8H, v26.8H, v1.H[4] // ....................e................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sqrdmulh v25.8H, v28.8H, v1.H[5] // ....................e................................................... + mul v0.8H, v28.8H, v1.H[4] // .....................e.................................................. 
+ sqrdmulh v29.8H, v26.8H, v1.H[3] // ...............e........................................................ + mul v20.8H, v26.8H, v1.H[2] // ................e....................................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ @@ -1210,87 +1254,89 @@ layer4567_start: // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - add v27.8H, v21.8H, v4.8H // ..........................................................*............. - sub v28.8H, v21.8H, v4.8H // .........................................................*.............. - sub v26.8H, v22.8H, v3.8H // ....................................................*................... - add v25.8H, v22.8H, v3.8H // .....................................................*.................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - mls v30.8H, v10.8H, v7.H[0] // .................e...................................................... + sub v13.8H, v6.8H, v15.8H // .........................................................*.............. + add v12.8H, v6.8H, v15.8H // ..........................................................*............. + sub v11.8H, v16.8H, v27.8H // ....................................................*................... 
+ add v10.8H, v16.8H, v27.8H // .....................................................*.................. // gap // ........................................................................ // gap // ........................................................................ - mls v16.8H, v17.8H, v7.H[0] // ......................e................................................. // gap // ........................................................................ // gap // ........................................................................ + add v15.8H, v2.8H, v22.8H // .........e.............................................................. + mls v0.8H, v25.8H, v7.H[0] // ......................e................................................. + mls v20.8H, v29.8H, v7.H[0] // .................e...................................................... // gap // ........................................................................ // gap // ........................................................................ - sqdmulh v23.8H, v26.8H, v7.H[1] // ..............................................................*......... - sqdmulh v1.8H, v25.8H, v7.H[1] // ...........................................................*............ - sqdmulh v22.8H, v28.8H, v7.H[1] // ....................................................................*... - sqdmulh v20.8H, v27.8H, v7.H[1] // .................................................................*...... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sqdmulh v6.8H, v11.8H, v7.H[1] // ..............................................................*......... // gap // ........................................................................ // gap // ........................................................................ 
// gap // ........................................................................ // gap // ........................................................................ + sqdmulh v16.8H, v13.8H, v7.H[1] // ....................................................................*... + sqdmulh v27.8H, v12.8H, v7.H[1] // .................................................................*...... + sqdmulh v14.8H, v10.8H, v7.H[1] // ...........................................................*............ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - sub v17.8H, v0.8H, v30.8H // ..................e..................................................... - add v13.8H, v0.8H, v30.8H // ...................e.................................................... - sub v24.8H, v14.8H, v16.8H // .......................e................................................ - add v0.8H, v14.8H, v16.8H // ........................e............................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + sub v25.8H, v15.8H, v20.8H // ..................e..................................................... + add v26.8H, v21.8H, v0.8H // ........................e............................................... + sub v0.8H, v21.8H, v0.8H // .......................e................................................ + add v15.8H, v15.8H, v20.8H // ...................e.................................................... 
// gap // ........................................................................ - srshr v4.8H, v23.8H, #11 // ...............................................................*........ - srshr v3.8H, v1.8H, #11 // ............................................................*........... - srshr v5.8H, v22.8H, #11 // .....................................................................*.. - srshr v22.8H, v20.8H, #11 // ..................................................................*..... - ldr q23, [x4], #(6*16) // .................................e...................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - trn2 v12.4S, v13.4S, v17.4S // ..........................e............................................. - trn1 v21.4S, v13.4S, v17.4S // .........................e.............................................. + srshr v2.8H, v14.8H, #11 // ............................................................*........... + srshr v14.8H, v6.8H, #11 // ...............................................................*........ + srshr v27.8H, v27.8H, #11 // ..................................................................*..... + srshr v16.8H, v16.8H, #11 // .....................................................................*.. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + trn1 v4.4S, v15.4S, v25.4S // .........................e.............................................. 
+ trn2 v15.4S, v15.4S, v25.4S // ..........................e............................................. + trn1 v21.4S, v26.4S, v0.4S // ...........................e............................................ // gap // ........................................................................ // gap // ........................................................................ - trn2 v10.4S, v0.4S, v24.4S // ............................e........................................... - trn1 v24.4S, v0.4S, v24.4S // ...........................e............................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ + trn2 v18.4S, v26.4S, v0.4S // ............................e........................................... + ldr q0, [x4, #-48] // ....................................e................................... + ldr q26, [x4, #-32] // .....................................e.................................. // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - mls v27.8H, v22.8H, v7.H[0] // ...................................................................*.... - mls v26.8H, v4.8H, v7.H[0] // ................................................................*....... - mls v28.8H, v5.8H, v7.H[0] // ......................................................................*. - mls v25.8H, v3.8H, v7.H[0] // .............................................................*.......... - ldr q4, [x4, #-80] // ..................................e..................................... // gap // ........................................................................ 
// gap // ........................................................................ + mls v13.8H, v16.8H, v7.H[0] // ......................................................................*. + mls v12.8H, v27.8H, v7.H[0] // ...................................................................*.... + mls v11.8H, v14.8H, v7.H[0] // ................................................................*....... + mls v10.8H, v2.8H, v7.H[0] // .............................................................*.......... // gap // ........................................................................ - trn2 v0.2D, v21.2D, v24.2D // .............................e.......................................... // gap // ........................................................................ // gap // ........................................................................ + ldr q6, [x4, #-80] // ..................................e..................................... + trn2 v5.2D, v15.2D, v18.2D // ..............................e......................................... + trn1 v15.2D, v15.2D, v18.2D // ................................e....................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - trn2 v3.2D, v12.2D, v10.2D // ..............................e......................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ 
@@ -1298,8 +1344,10 @@ layer4567_start: // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ - st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // .......................................................................* // gap // ........................................................................ + trn2 v28.2D, v4.2D, v21.2D // .............................e.......................................... + st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x1], #64 // .......................................................................* + ldr q11, [x4, #-64] // ...................................e.................................... // gap // ........................................................................ // gap // ........................................................................ // gap // ........................................................................ @@ -1307,331 +1355,344 @@ layer4567_start: // gap // ........................................................................ // gap // ........................................................................ - // original source code - // ldr q8, [x1, #(16*0)] // .....e................................................................|......e................................................................ - // ldr q9, [x1, #(16*1)] // ........e.............................................................|.........e............................................................. - // ldr q10, [x1, #(16*2)] // .e....................................................................|..e.................................................................... 
- // ldr q11, [x1, #(16*3)] // ..e...................................................................|...e................................................................... - // ldr q0, [x3], #16 // e.....................................................................|.e..................................................................... - // mul v24.8h, v10.8h, v0.h[0] // ............e.........................................................|.............e......................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[1] // .............e........................................................|..............e........................................................ - // mls v24.8h, v10.8h, v7.h[0] // .....................e................................................|......................e................................................ - // sub v10.8h, v8.8h, v24.8h // .............................e........................................|..............................e........................................ - // add v8.8h, v8.8h, v24.8h // ...............................e......................................|................................e...................................... - // mul v24.8h, v11.8h, v0.h[0] // ..............e.......................................................|...............e....................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ...............e......................................................|................e...................................................... - // mls v24.8h, v11.8h, v7.h[0] // ......................e...............................................|.......................e............................................... - // sub v11.8h, v9.8h, v24.8h // ................................e.....................................|.................................e..................................... 
- // add v9.8h, v9.8h, v24.8h // ..............................e.......................................|...............................e....................................... - // mul v24.8h, v9.8h, v0.h[2] // .....................................e................................|......................................e................................ - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ....................................e.................................|.....................................e................................. - // mls v24.8h, v9.8h, v7.h[0] // ...........................................e..........................|............................................e.......................... - // sub v9.8h, v8.8h, v24.8h // .................................................e....................|..................................................e.................... - // add v8.8h, v8.8h, v24.8h // ..................................................e...................|...................................................e................... - // mul v24.8h, v11.8h, v0.h[4] // ......................................e...............................|.......................................e............................... - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ...................................e..................................|....................................e.................................. - // mls v24.8h, v11.8h, v7.h[0] // ............................................e.........................|.............................................e......................... - // sub v11.8h, v10.8h, v24.8h // ...................................................e..................|....................................................e.................. - // add v10.8h, v10.8h, v24.8h // ....................................................e.................|.....................................................e................. 
- // trn1 v25.4s, v8.4s, v9.4s // ...........................................................e..........|............................................................e.......... - // trn2 v26.4s, v8.4s, v9.4s // ..........................................................e...........|...........................................................e........... - // trn1 v27.4s, v10.4s, v11.4s // .............................................................e........|..............................................................e........ - // trn2 v28.4s, v10.4s, v11.4s // ............................................................e.........|.............................................................e......... - // trn2 v10.2d, v25.2d, v27.2d // ...................................................................e..|....................................................................e.. - // trn2 v11.2d, v26.2d, v28.2d // ....................................................................e.|.....................................................................e. - // trn1 v8.2d, v25.2d, v27.2d // ...*..................................................................|....*.................................................................. - // trn1 v9.2d, v26.2d, v28.2d // ......*...............................................................|.......*............................................................... - // ldr q0, [x4], #(6*16) // .........................................................e............|..........................................................e............ - // ldr q4, [x4, #(-6*16 + 1*16)] // ..................................................................e...|...................................................................e... - // ldr q1, [x4, #(-6*16 + 2*16)] // ................*.....................................................|.................*..................................................... 
- // ldr q5, [x4, #(-6*16 + 3*16)] // .................*....................................................|..................*.................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ...........*..........................................................|............*.......................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..........*...........................................................|...........*........................................................... - // mul v24.8h, v10.8h, v0.8h // ......................................................................|*...................................................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // .......*..............................................................|........*.............................................................. - // mls v24.8h, v10.8h, v7.h[0] // ..................*...................................................|...................*................................................... - // sub v10.8h, v8.8h, v24.8h // ...........................*..........................................|............................*.......................................... - // add v8.8h, v8.8h, v24.8h // ............................*.........................................|.............................*......................................... - // mul v24.8h, v11.8h, v0.8h // ......................................................................*....................................................................... - // sqrdmulh v11.8h, v11.8h, v4.8h // ....*.................................................................|.....*................................................................. - // mls v24.8h, v11.8h, v7.h[0] // .........*............................................................|..........*............................................................ 
- // sub v11.8h, v9.8h, v24.8h // ....................*.................................................|.....................*................................................. - // add v9.8h, v9.8h, v24.8h // ...................*..................................................|....................*.................................................. - // mul v24.8h, v9.8h, v1.8h // ..........................*...........................................|...........................*........................................... - // sqrdmulh v9.8h, v9.8h, v5.8h // .........................*............................................|..........................*............................................ - // mls v24.8h, v9.8h, v7.h[0] // ..................................*...................................|...................................*................................... - // sub v9.8h, v8.8h, v24.8h // .........................................*............................|..........................................*............................ - // add v8.8h, v8.8h, v24.8h // ..........................................*...........................|...........................................*........................... - // mul v24.8h, v11.8h, v2.8h // ........................*.............................................|.........................*............................................. - // sqrdmulh v11.8h, v11.8h, v6.8h // .......................*..............................................|........................*.............................................. - // mls v24.8h, v11.8h, v7.h[0] // .................................*....................................|..................................*.................................... - // sub v11.8h, v10.8h, v24.8h // ........................................*.............................|.........................................*............................. 
- // add v10.8h, v10.8h, v24.8h // .......................................*..............................|........................................*.............................. - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..............................................*.......................|...............................................*....................... - // srshr v25.8h, v25.8h, #11 // ......................................................*...............|.......................................................*............... - // mls v8.8h, v25.8h, v7.h[0] // .................................................................*....|..................................................................*.... - // sqdmulh v25.8h, v9.8h, v7.h[1] // .............................................*........................|..............................................*........................ - // srshr v25.8h, v25.8h, #11 // .....................................................*................|......................................................*................ - // mls v9.8h, v25.8h, v7.h[0] // ...............................................................*......|................................................................*...... - // sqdmulh v25.8h, v10.8h, v7.h[1] // ................................................*.....................|.................................................*..................... - // srshr v25.8h, v25.8h, #11 // ........................................................*.............|.........................................................*............. - // mls v10.8h, v25.8h, v7.h[0] // ..............................................................*.......|...............................................................*....... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ...............................................*......................|................................................*...................... 
- // srshr v25.8h, v25.8h, #11 // .......................................................*..............|........................................................*.............. - // mls v11.8h, v25.8h, v7.h[0] // ................................................................*.....|.................................................................*..... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .....................................................................*|......................................................................* + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // ........e...............................................................'.......~.............................................................. + // ldr q9, [x1, #(16*1)] // .......e................................................................'......~............................................................... + // ldr q10, [x1, #(16*2)] // ...e....................................................................'..~................................................................... + // ldr q11, [x1, #(16*3)] // .e......................................................................'~..................................................................... + // ldr q0, [x3], #16 // e.......................................................................~...................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...............e........................................................'..............~....................................................... 
+ // mul v24.8h, v10.8h, v0.h[0] // ................e.......................................................'...............~...................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................e................................................'......................~............................................... + // sub v10.8h, v8.8h, v24.8h // ............................e...........................................'...........................~.......................................... + // add v8.8h, v8.8h, v24.8h // .........................................e..............................'........................................~............................. + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .............e..........................................................'............~......................................................... + // mul v24.8h, v11.8h, v0.h[0] // ..............e.........................................................'.............~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ......................e.................................................'.....................~................................................ + // sub v11.8h, v9.8h, v24.8h // .............................e..........................................'............................~......................................... + // add v9.8h, v9.8h, v24.8h // ..............................e.........................................'.............................~........................................ + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...................................e....................................'..................................~................................... + // mul v24.8h, v9.8h, v0.h[2] // ....................................e...................................'...................................~.................................. 
+ // mls v24.8h, v27.8h, v7.h[0] // ...........................................e............................'..........................................~........................... + // sub v9.8h, v8.8h, v24.8h // ................................................e.......................'...............................................~...................... + // add v8.8h, v8.8h, v24.8h // ...................................................e....................'..................................................~................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // .................................e......................................'................................~..................................... + // mul v24.8h, v11.8h, v0.h[4] // ..................................e.....................................'.................................~.................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................e.............................'.........................................~............................ + // sub v11.8h, v10.8h, v24.8h // ..................................................e.....................'.................................................~.................... + // add v10.8h, v10.8h, v24.8h // .................................................e......................'................................................~..................... + // trn1 v25.4s, v8.4s, v9.4s // ........................................................e...............'.......................................................~.............. + // trn2 v26.4s, v8.4s, v9.4s // .........................................................e..............'........................................................~............. + // trn1 v27.4s, v10.4s, v11.4s // ..........................................................e.............'.........................................................~............ 
+ // trn2 v28.4s, v10.4s, v11.4s // ...........................................................e............'..........................................................~........... + // trn2 v10.2d, v25.2d, v27.2d // .....................................................................e..'....................................................................~. + // trn2 v11.2d, v26.2d, v28.2d // ...................................................................e....'..................................................................~... + // trn1 v8.2d, v25.2d, v27.2d // .........~..............................................................'........*............................................................. + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e...'...................................................................~.. + // ldr q0, [ x4], #(6*16) // ............e...........................................................'...........~.......................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ..................................................................e.....'.................................................................~.... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .......................................................................e'...................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ............................................................e...........'...........................................................~.......... + // ldr q2, [ x4, #(-6*16 + 4*16)] // .............................................................e..........'............................................................~......... + // ldr q6, [x4, #(-6*16 + 5*16)] // .................~......................................................'................*..................................................... 
+ // sqrdmulh v27.8h, v10.8h, v4.8h // ..~.....................................................................'.*.................................................................... + // mul v24.8h, v10.8h, v0.8h // ....~...................................................................'...*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........~............................................................'..........*........................................................... + // sub v10.8h, v8.8h, v24.8h // ...................~....................................................'..................*................................................... + // add v8.8h, v8.8h, v24.8h // ..................~.....................................................'.................*.................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ......~.................................................................'.....*................................................................ + // mul v24.8h, v11.8h, v0.8h // .....~..................................................................'....*................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..........~.............................................................'.........*............................................................ + // sub v11.8h, v9.8h, v24.8h // .....................~..................................................'....................*................................................. + // add v9.8h, v9.8h, v24.8h // ....................~...................................................'...................*.................................................. + // sqrdmulh v27.8h, v9.8h, v5.8h // .........................~..............................................'........................*............................................. 
+ // mul v24.8h, v9.8h, v1.8h // ..........................~.............................................'.........................*............................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................~.......................................'...............................*...................................... + // sub v9.8h, v8.8h, v24.8h // .......................................~................................'......................................*............................... + // add v8.8h, v8.8h, v24.8h // ........................................~...............................'.......................................*.............................. + // sqrdmulh v27.8h, v11.8h, v6.8h // ........................~...............................................'.......................*.............................................. + // mul v24.8h, v11.8h, v2.8h // ...........................~............................................'..........................*........................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................~........................................'..............................*....................................... + // sub v11.8h, v10.8h, v24.8h // .....................................~..................................'....................................*................................. + // add v10.8h, v10.8h, v24.8h // ......................................~.................................'.....................................*................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...............................................~........................'..............................................*....................... + // srshr v25.8h, v25.8h, #11 // ....................................................~...................'...................................................*.................. 
+ // mls v8.8h, v25.8h, v7.h[0] // .................................................................~......'................................................................*..... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ............................................~...........................'...........................................*.......................... + // srshr v25.8h, v25.8h, #11 // .....................................................~..................'....................................................*................. + // mls v9.8h, v25.8h, v7.h[0] // ................................................................~.......'...............................................................*...... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..............................................~.........................'.............................................*........................ + // srshr v25.8h, v25.8h, #11 // ......................................................~.................'.....................................................*................ + // mls v10.8h, v25.8h, v7.h[0] // ...............................................................~........'..............................................................*....... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .............................................~..........................'............................................*......................... + // srshr v25.8h, v25.8h, #11 // .......................................................~................'......................................................*............... + // mls v11.8h, v25.8h, v7.h[0] // ..............................................................~.........'.............................................................*........ 
+ // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................................~.'.....................................................................* sub count, count, #1 cbnz count, layer4567_start - mul v18.8H, v0.8H, v23.8H // .*..................................... - sqrdmulh v26.8H, v0.8H, v4.8H // .....*................................. - ldr q0, [x4, #-32] // ........*.............................. - sqrdmulh v20.8H, v3.8H, v4.8H // ...*................................... - mul v5.8H, v3.8H, v23.8H // *...................................... - ldr q30, [x4, #-16] // .......*............................... - ldr q23, [x4, #-64] // .........*............................. - // gap // ....................................... - trn1 v29.2D, v12.2D, v10.2D // ....*.................................. - trn1 v9.2D, v21.2D, v24.2D // ..*.................................... - ldr q27, [x4, #-48] // ..........*............................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v18.8H, v26.8H, v7.H[0] // ...........*........................... - mls v5.8H, v20.8H, v7.H[0] // ......*................................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sub v13.8H, v9.8H, v18.8H // ..................*.................... - add v26.8H, v9.8H, v18.8H // ...................*................... - add v9.8H, v29.8H, v5.8H // ............*.......................... - sub v17.8H, v29.8H, v5.8H // .............*......................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mul v31.8H, v9.8H, v23.8H // .................*..................... 
- sqrdmulh v18.8H, v9.8H, v27.8H // ................*...................... - mul v20.8H, v17.8H, v0.8H // ...............*....................... - sqrdmulh v29.8H, v17.8H, v30.8H // ..............*........................ - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v20.8H, v29.8H, v7.H[0] // ....................*.................. - mls v31.8H, v18.8H, v7.H[0] // .....................*................. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - add v24.8H, v13.8H, v20.8H // ......................*................ - sub v25.8H, v13.8H, v20.8H // .......................*............... - sub v23.8H, v26.8H, v31.8H // ........................*.............. - add v22.8H, v26.8H, v31.8H // .........................*............. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - sqdmulh v3.8H, v25.8H, v7.H[1] // ............................*.......... - sqdmulh v4.8H, v23.8H, v7.H[1] // ..........................*............ - sqdmulh v26.8H, v22.8H, v7.H[1] // ...........................*........... - sqdmulh v29.8H, v24.8H, v7.H[1] // .............................*......... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - srshr v20.8H, v4.8H, #11 // ..............................*........ - srshr v18.8H, v26.8H, #11 // ...............................*....... - srshr v29.8H, v29.8H, #11 // .................................*..... - srshr v13.8H, v3.8H, #11 // ................................*...... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
- // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - mls v23.8H, v20.8H, v7.H[0] // ...................................*... - mls v22.8H, v18.8H, v7.H[0] // .....................................*. - mls v24.8H, v29.8H, v7.H[0] // ..................................*.... - mls v25.8H, v13.8H, v7.H[0] // ....................................*.. - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - st4 {v22.4S, v23.4S, v24.4S, v25.4S}, [x1], #64 // ......................................* - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... - // gap // ....................................... 
+ // Instructions: 35 + // Expected cycles: 26 + // Expected IPC: 1.35 + // + // Cycle bound: 26.0 + // IPC bound: 1.35 + // + // Wall time: 0.71s + // User time: 0.71s + // + // ------- original position --------> + // 0 25 + // |------------------------|--------- + mul v13.8H, v5.8H, v24.8H // ..*................................ + mul v31.8H, v28.8H, v24.8H // .*................................. + sqrdmulh v9.8H, v5.8H, v6.8H // ...*............................... + sqrdmulh v14.8H, v28.8H, v6.8H // *.................................. + ldr q25, [x4, #-16] // .......*........................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + trn1 v16.2D, v4.2D, v21.2D // ....*.............................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v13.8H, v9.8H, v7.H[0] // .....*............................. + mls v31.8H, v14.8H, v7.H[0] // ......*............................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + add v14.8H, v15.8H, v13.8H // ..........*........................ + sub v13.8H, v15.8H, v13.8H // ...........*....................... + add v27.8H, v16.8H, v31.8H // ........*.......................... + sub v15.8H, v16.8H, v31.8H // .........*......................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mul v6.8H, v14.8H, v11.8H // ..............*.................... + sqrdmulh v20.8H, v14.8H, v0.8H // .............*..................... + mul v0.8H, v13.8H, v26.8H // ...............*................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + sqrdmulh v22.8H, v13.8H, v25.8H // ............*...................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v6.8H, v20.8H, v7.H[0] // .................*................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v0.8H, v22.8H, v7.H[0] // ................*.................. + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + sub v9.8H, v27.8H, v6.8H // ....................*.............. + add v8.8H, v27.8H, v6.8H // .....................*............. + add v10.8H, v15.8H, v0.8H // ...................*............... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sub v11.8H, v15.8H, v0.8H // ..................*................ + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqdmulh v2.8H, v9.8H, v7.H[1] // ......................*............ + sqdmulh v15.8H, v8.8H, v7.H[1] // .........................*......... + sqdmulh v26.8H, v10.8H, v7.H[1] // ........................*.......... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + sqdmulh v16.8H, v11.8H, v7.H[1] // .......................*........... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + srshr v12.8H, v15.8H, #11 // ..........................*........ + srshr v21.8H, v2.8H, #11 // ...........................*....... + srshr v14.8H, v16.8H, #11 // .............................*..... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + srshr v15.8H, v26.8H, #11 // ............................*...... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v8.8H, v12.8H, v7.H[0] // .................................*. + mls v9.8H, v21.8H, v7.H[0] // ................................*.. + mls v10.8H, v15.8H, v7.H[0] // ...............................*... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + mls v11.8H, v14.8H, v7.H[0] // ..............................*.... + // gap // ................................... + // gap // ................................... 
+ // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + // gap // ................................... + st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ..................................* - // original source code - // mul v18.8H, v3.8H, v23.8H // ....*.................................. - // mul v20.8H, v0.8H, v23.8H // *...................................... - // trn1 v26.2D, v21.2D, v24.2D // ........*.............................. - // sqrdmulh v8.8H, v3.8H, v4.8H // ...*................................... - // trn1 v23.2D, v12.2D, v10.2D // .......*............................... - // sqrdmulh v6.8H, v0.8H, v4.8H // .*..................................... - // mls v18.8H, v8.8H, v7.H[0] // ...........*........................... - // ldr q4, [x4, #-16] // .....*................................. - // ldr q25, [x4, #-32] // ..*.................................... - // ldr q10, [x4, #-64] // ......*................................ - // ldr q27, [x4, #-48] // .........*............................. - // mls v20.8H, v6.8H, v7.H[0] // ..........*............................ 
- // add v3.8H, v23.8H, v18.8H // ..............*........................ - // sub v23.8H, v23.8H, v18.8H // ...............*....................... - // sqrdmulh v31.8H, v23.8H, v4.8H // ...................*................... - // mul v4.8H, v23.8H, v25.8H // ..................*.................... - // sqrdmulh v17.8H, v3.8H, v27.8H // .................*..................... - // mul v3.8H, v3.8H, v10.8H // ................*...................... - // sub v21.8H, v26.8H, v20.8H // ............*.......................... - // add v22.8H, v26.8H, v20.8H // .............*......................... - // mls v4.8H, v31.8H, v7.H[0] // ....................*.................. - // mls v3.8H, v17.8H, v7.H[0] // .....................*................. - // add v27.8H, v21.8H, v4.8H // ......................*................ - // sub v28.8H, v21.8H, v4.8H // .......................*............... - // sub v26.8H, v22.8H, v3.8H // ........................*.............. - // add v25.8H, v22.8H, v3.8H // .........................*............. - // sqdmulh v23.8H, v26.8H, v7.H[1] // ...........................*........... - // sqdmulh v1.8H, v25.8H, v7.H[1] // ............................*.......... - // sqdmulh v22.8H, v28.8H, v7.H[1] // ..........................*............ - // sqdmulh v20.8H, v27.8H, v7.H[1] // .............................*......... - // srshr v4.8H, v23.8H, #11 // ..............................*........ - // srshr v3.8H, v1.8H, #11 // ...............................*....... - // srshr v5.8H, v22.8H, #11 // .................................*..... - // srshr v22.8H, v20.8H, #11 // ................................*...... - // mls v27.8H, v22.8H, v7.H[0] // ....................................*.. - // mls v26.8H, v4.8H, v7.H[0] // ..................................*.... - // mls v28.8H, v5.8H, v7.H[0] // .....................................*. - // mls v25.8H, v3.8H, v7.H[0] // ...................................*... 
- // st4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1], #64 // ......................................* + // ---------- new position ----------> + // 0 25 + // |------------------------|--------- + // sqrdmulh v27.8H, v28.8H, v6.8H // ...*............................... + // mul v19.8H, v28.8H, v24.8H // .*................................. + // mul v31.8H, v5.8H, v24.8H // *.................................. + // sqrdmulh v6.8H, v5.8H, v6.8H // ..*................................ + // trn1 v20.2D, v4.2D, v21.2D // .....*............................. + // mls v31.8H, v6.8H, v7.H[0] // ......*............................ + // mls v19.8H, v27.8H, v7.H[0] // .......*........................... + // ldr q14, [x4, #-16] // ....*.............................. + // add v16.8H, v20.8H, v19.8H // ..........*........................ + // sub v6.8H, v20.8H, v19.8H // ...........*....................... + // add v27.8H, v15.8H, v31.8H // ........*.......................... + // sub v15.8H, v15.8H, v31.8H // .........*......................... + // sqrdmulh v8.8H, v15.8H, v14.8H // ...............*................... + // sqrdmulh v31.8H, v27.8H, v0.8H // .............*..................... + // mul v27.8H, v27.8H, v11.8H // ............*...................... + // mul v15.8H, v15.8H, v26.8H // ..............*.................... + // mls v15.8H, v8.8H, v7.H[0] // .................*................. + // mls v27.8H, v31.8H, v7.H[0] // ................*.................. + // sub v13.8H, v6.8H, v15.8H // .....................*............. + // add v12.8H, v6.8H, v15.8H // ....................*.............. + // sub v11.8H, v16.8H, v27.8H // ..................*................ + // add v10.8H, v16.8H, v27.8H // ...................*............... + // sqdmulh v6.8H, v11.8H, v7.H[1] // ......................*............ + // sqdmulh v16.8H, v13.8H, v7.H[1] // .........................*......... + // sqdmulh v27.8H, v12.8H, v7.H[1] // ........................*.......... 
+ // sqdmulh v14.8H, v10.8H, v7.H[1] // .......................*........... + // srshr v2.8H, v14.8H, #11 // ..........................*........ + // srshr v14.8H, v6.8H, #11 // ...........................*....... + // srshr v27.8H, v27.8H, #11 // .............................*..... + // srshr v16.8H, v16.8H, #11 // ............................*...... + // mls v13.8H, v16.8H, v7.H[0] // .................................*. + // mls v12.8H, v27.8H, v7.H[0] // ................................*.. + // mls v11.8H, v14.8H, v7.H[0] // ...............................*... + // mls v10.8H, v2.8H, v7.H[0] // ..............................*.... + // st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x1], #64 // ..................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_icestorm.s index c99a3cfd..ad001418 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_opt_m1_icestorm.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm - -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h @@ -67,15 +44,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +61,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -103,21 +74,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 
5*16)] .endm .macro transpose4 data @@ -139,7 +110,7 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -150,7 +121,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -160,7 +131,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -168,7 +139,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -179,19 +150,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -330,972 +301,1038 @@ _ntt_kyber_123_4567_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr q31, [x0, #192] // ..*................................ - ldr q12, [x0, #448] // .*................................. - // gap // ................................... - // gap // ................................... - ldr q9, [x0, #384] // *.................................. - // gap // ................................... - // gap // ................................... - ldr q21, [x0, #128] // ...*............................... 
- ldr q26, [x0, #0] // .........*......................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - ldr q29, [x0, #256] // ...........................*....... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sqrdmulh v20.8H, v12.8H, v0.H[1] // .......*........................... - mul v13.8H, v12.8H, v0.H[0] // ......*............................ - ldr q12, [x0, #64] // .....*............................. - ldr q19, [x0, #320] // ....*.............................. - sqrdmulh v24.8H, v9.8H, v0.H[1] // ..............*.................... - mul v9.8H, v9.8H, v0.H[0] // .............*..................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v13.8H, v20.8H, v7.H[0] // ........*.......................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v9.8H, v24.8H, v7.H[0] // ..................*................ - mul v22.8H, v19.8H, v0.H[0] // ...........*....................... - // gap // ................................... - // gap // ................................... - sqrdmulh v19.8H, v19.8H, v0.H[1] // ............*...................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - add v5.8H, v31.8H, v13.8H // ..........*........................ - // gap // ................................... - // gap // ................................... - // gap // ................................... 
- add v30.8H, v21.8H, v9.8H // .....................*............. - sub v28.8H, v21.8H, v9.8H // ......................*............ - // gap // ................................... - // gap // ................................... - sqrdmulh v6.8H, v5.8H, v0.H[3] // ...............*................... - mul v9.8H, v5.8H, v0.H[2] // ................*.................. - // gap // ................................... - // gap // ................................... - mls v22.8H, v19.8H, v7.H[0] // .................*................. - mul v15.8H, v30.8H, v0.H[2] // .........................*......... - // gap // ................................... - // gap // ................................... - mul v8.8H, v28.8H, v0.H[4] // ................................*.. - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v9.8H, v6.8H, v7.H[0] // ...................*............... - // gap // ................................... - // gap // ................................... - // gap // ................................... - add v4.8H, v12.8H, v22.8H // ....................*.............. - // gap // ................................... - // gap // ................................... - // gap // ................................... - sub v25.8H, v12.8H, v22.8H // ..............................*.... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - // gap // ................................... - sub v11.8H, v4.8H, v9.8H // .......................*........... - sqrdmulh v12.8H, v30.8H, v0.H[3] // ..........................*........ - // gap // ................................... - // gap // ................................... - // gap // ................................... 
- sqrdmulh v6.8H, v11.8H, v1.H[1] // .............................*..... - mul v22.8H, v11.8H, v1.H[0] // ............................*...... - // gap // ................................... - // gap // ................................... - add v30.8H, v4.8H, v9.8H // ........................*.......... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v15.8H, v12.8H, v7.H[0] // ...............................*... - // gap // ................................... - // gap // ................................... - // gap // ................................... - mls v22.8H, v6.8H, v7.H[0] // .................................*. - // gap // ................................... - // gap // ................................... - sqrdmulh v6.8H, v29.8H, v0.H[1] // ..................................* + // Instructions: 59 + // Expected cycles: 30 + // Expected IPC: 1.97 + // + // Cycle bound: 29.0 + // IPC bound: 2.03 + // + // Wall time: 3600.20s + // User time: 3600.20s + // + // ------------------- original position --------------------> + // 0 25 50 + // |------------------------|------------------------|-------- + ldr q16, [x0, #0] // *.......................................................... + ldr q15, [x0, #256] // .*......................................................... + // gap // ........................................................... + // gap // ........................................................... + ldr q27, [x0, #320] // ..*........................................................ + ldr q31, [x0, #448] // ...*....................................................... + // gap // ........................................................... + // gap // ........................................................... + ldr q25, [x0, #384] // ....*...................................................... 
+ ldr q17, [x0, #192] // .....*..................................................... + // gap // ........................................................... + // gap // ........................................................... + ldr q10, [x0, #64] // ..........*................................................ + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mul v11.8H, v15.8H, v0.H[0] // ......*.................................................... + sqrdmulh v26.8H, v15.8H, v0.H[1] // .......*................................................... + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v13.8H, v31.8H, v0.H[1] // ...........*............................................... + mul v31.8H, v31.8H, v0.H[0] // .........*................................................. + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v14.8H, v25.8H, v0.H[1] // .................*......................................... + mul v24.8H, v25.8H, v0.H[0] // ................*.......................................... + // gap // ........................................................... + // gap // ........................................................... + mul v6.8H, v27.8H, v0.H[0] // ............*.............................................. + sqrdmulh v15.8H, v27.8H, v0.H[1] // .............*............................................. + // gap // ........................................................... + // gap // ........................................................... + mls v31.8H, v13.8H, v7.H[0] // ...............*........................................... 
+ mls v11.8H, v26.8H, v7.H[0] // ..............*............................................ + // gap // ........................................................... + ldr q23, [x0, #128] // ........*.................................................. + mls v24.8H, v14.8H, v7.H[0] // ......................*.................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + mls v6.8H, v15.8H, v7.H[0] // .......................*................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sub v15.8H, v17.8H, v31.8H // .....................*..................................... + add v27.8H, v17.8H, v31.8H // ....................*...................................... + // gap // ........................................................... + // gap // ........................................................... + sub v3.8H, v23.8H, v24.8H // ............................*.............................. + add v31.8H, v23.8H, v24.8H // ..................................*........................ + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v23.8H, v15.8H, v0.H[5] // .........................*................................. + mul v18.8H, v15.8H, v0.H[4] // ........................*.................................. + mul v17.8H, v27.8H, v0.H[2] // ..........................*................................ 
+ sqrdmulh v27.8H, v27.8H, v0.H[3] // ...........................*............................... + // gap // ........................................................... + // gap // ........................................................... + mul v29.8H, v31.8H, v0.H[2] // ..........................................*................ + sub v26.8H, v10.8H, v6.8H // ...............................*........................... + // gap // ........................................................... + // gap // ........................................................... + mul v9.8H, v3.8H, v0.H[4] // ................................*.......................... + mls v18.8H, v23.8H, v7.H[0] // ..............................*............................ + // gap // ........................................................... + // gap // ........................................................... + mls v17.8H, v27.8H, v7.H[0] // .................................*......................... + add v8.8H, v16.8H, v11.8H // ..................*........................................ + // gap // ........................................................... + // gap // ........................................................... + sub v2.8H, v16.8H, v11.8H // ...................*....................................... + add v11.8H, v10.8H, v6.8H // .............................*............................. + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v6.8H, v3.8H, v0.H[5] // ...................................*....................... + sub v14.8H, v26.8H, v18.8H // .....................................*..................... + // gap // ........................................................... + // gap // ........................................................... + add v28.8H, v11.8H, v17.8H // .......................................*................... 
+ sub v13.8H, v11.8H, v17.8H // .....................................................*..... + // gap // ........................................................... + // gap // ........................................................... + mul v16.8H, v14.8H, v1.H[4] // ........................................*.................. + sqrdmulh v15.8H, v14.8H, v1.H[5] // .........................................*................. + // gap // ........................................................... + // gap // ........................................................... + add v27.8H, v26.8H, v18.8H // ....................................*...................... + // gap // ........................................................... + // gap // ........................................................... + mls v9.8H, v6.8H, v7.H[0] // ...........................................*............... + sqrdmulh v11.8H, v31.8H, v0.H[3] // ......................................*.................... + sqrdmulh v6.8H, v28.8H, v0.H[7] // .............................................*............. + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v18.8H, v27.8H, v1.H[3] // ...............................................*........... + mls v16.8H, v15.8H, v7.H[0] // ..............................................*............ + // gap // ........................................................... + // gap // ........................................................... + sub v22.8H, v2.8H, v9.8H // .................................................*......... + mul v3.8H, v28.8H, v0.H[6] // ............................................*.............. + // gap // ........................................................... + // gap // ........................................................... + mls v29.8H, v11.8H, v7.H[0] // ................................................*.......... 
+ mul v14.8H, v27.8H, v1.H[2] // ..................................................*........ + // gap // ........................................................... + // gap // ........................................................... + add v25.8H, v2.8H, v9.8H // ......................................................*.... + sub v15.8H, v22.8H, v16.8H // ....................................................*...... + // gap // ........................................................... + // gap // ........................................................... + mls v3.8H, v6.8H, v7.H[0] // ...................................................*....... + // gap // ........................................................... + // gap // ........................................................... + sqrdmulh v11.8H, v13.8H, v1.H[1] // ..........................................................* + mls v14.8H, v18.8H, v7.H[0] // ........................................................*.. + add v19.8H, v8.8H, v29.8H // .......................................................*... + str q15, [x0, #448] // .........................................................*. + // gap // ........................................................... - // original source code - // ldr q14, [x0, #384] // ..*................................ - // ldr q18, [x0, #448] // .*................................. - // ldr q31, [x0, #192] // *.................................. - // ldr q4, [x0, #128] // ...*............................... - // ldr q12, [x0, #320] // .........*......................... - // ldr q24, [x0, #64] // ........*.......................... - // mul v13.8H, v18.8H, v0.H[0] // .......*........................... - // sqrdmulh v27.8H, v18.8H, v0.H[1] // ......*............................ - // mls v13.8H, v27.8H, v7.H[0] // ............*...................... - // ldr q26, [x0, #0] // ....*.............................. - // add v29.8H, v31.8H, v13.8H // ................*.................. 
- // mul v6.8H, v12.8H, v0.H[0] // ..............*.................... - // sqrdmulh v25.8H, v12.8H, v0.H[1] // ...............*................... - // mul v30.8H, v14.8H, v0.H[0] // ...........*....................... - // sqrdmulh v22.8H, v14.8H, v0.H[1] // ..........*........................ - // sqrdmulh v12.8H, v29.8H, v0.H[3] // ...................*............... - // mul v16.8H, v29.8H, v0.H[2] // ....................*.............. - // mls v6.8H, v25.8H, v7.H[0] // .....................*............. - // mls v30.8H, v22.8H, v7.H[0] // .............*..................... - // mls v16.8H, v12.8H, v7.H[0] // ........................*.......... - // add v22.8H, v24.8H, v6.8H // .........................*......... - // add v11.8H, v4.8H, v30.8H // .................*................. - // sub v28.8H, v4.8H, v30.8H // ..................*................ - // sub v5.8H, v22.8H, v16.8H // ...........................*....... - // add v30.8H, v22.8H, v16.8H // ...............................*... - // mul v15.8H, v11.8H, v0.H[2] // ......................*............ - // sqrdmulh v27.8H, v11.8H, v0.H[3] // ............................*...... - // ldr q29, [x0, #256] // .....*............................. - // mul v22.8H, v5.8H, v1.H[0] // ..............................*.... - // sqrdmulh v9.8H, v5.8H, v1.H[1] // .............................*..... - // sub v25.8H, v24.8H, v6.8H // ..........................*........ - // mls v15.8H, v27.8H, v7.H[0] // ................................*.. - // mul v8.8H, v28.8H, v0.H[4] // .......................*........... - // mls v22.8H, v9.8H, v7.H[0] // .................................*. - // sqrdmulh v6.8H, v29.8H, v0.H[1] // ..................................* + // ---------------------- new position ----------------------> + // 0 25 50 + // |------------------------|------------------------|-------- + // ldr q23, [x0, #0] // *.......................................................... 
+ // ldr q2, [x0, #256] // .*......................................................... + // ldr q13, [x0, #320] // ..*........................................................ + // ldr q31, [x0, #448] // ...*....................................................... + // ldr q14, [x0, #384] // ....*...................................................... + // ldr q20, [x0, #192] // .....*..................................................... + // mul v16.8H, v2.8H, v0.H[0] // .......*................................................... + // sqrdmulh v15.8H, v2.8H, v0.H[1] // ........*.................................................. + // ldr q3, [x0, #128] // .................*......................................... + // mul v26.8H, v31.8H, v0.H[0] // ..........*................................................ + // ldr q12, [x0, #64] // ......*.................................................... + // sqrdmulh v31.8H, v31.8H, v0.H[1] // .........*................................................. + // mul v19.8H, v13.8H, v0.H[0] // .............*............................................. + // sqrdmulh v30.8H, v13.8H, v0.H[1] // ..............*............................................ + // mls v16.8H, v15.8H, v7.H[0] // ................*.......................................... + // mls v26.8H, v31.8H, v7.H[0] // ...............*........................................... + // mul v13.8H, v14.8H, v0.H[0] // ............*.............................................. + // sqrdmulh v14.8H, v14.8H, v0.H[1] // ...........*............................................... + // add v8.8H, v23.8H, v16.8H // .................................*......................... + // sub v9.8H, v23.8H, v16.8H // ..................................*........................ + // add v31.8H, v20.8H, v26.8H // .....................*..................................... + // sub v25.8H, v20.8H, v26.8H // ....................*...................................... 
+ // mls v13.8H, v14.8H, v7.H[0] // ..................*........................................ + // mls v19.8H, v30.8H, v7.H[0] // ...................*....................................... + // mul v15.8H, v25.8H, v0.H[4] // .........................*................................. + // sqrdmulh v16.8H, v25.8H, v0.H[5] // ........................*.................................. + // mul v2.8H, v31.8H, v0.H[2] // ..........................*................................ + // sqrdmulh v4.8H, v31.8H, v0.H[3] // ...........................*............................... + // sub v23.8H, v3.8H, v13.8H // ......................*.................................... + // add v5.8H, v12.8H, v19.8H // ...................................*....................... + // mls v15.8H, v16.8H, v7.H[0] // ...............................*........................... + // sub v18.8H, v12.8H, v19.8H // .............................*............................. + // mul v31.8H, v23.8H, v0.H[4] // ..............................*............................ + // mls v2.8H, v4.8H, v7.H[0] // ................................*.......................... + // add v6.8H, v3.8H, v13.8H // .......................*................................... + // sqrdmulh v17.8H, v23.8H, v0.H[5] // ....................................*...................... + // add v13.8H, v18.8H, v15.8H // ..........................................*................ + // sub v15.8H, v18.8H, v15.8H // .....................................*..................... + // sqrdmulh v11.8H, v6.8H, v0.H[3] // ............................................*.............. + // add v14.8H, v5.8H, v2.8H // ......................................*.................... + // mul v16.8H, v15.8H, v1.H[4] // ........................................*.................. + // sqrdmulh v27.8H, v15.8H, v1.H[5] // .........................................*................. 
+ // mul v29.8H, v6.8H, v0.H[2] // ............................*.............................. + // mls v31.8H, v17.8H, v7.H[0] // ...........................................*............... + // mul v3.8H, v14.8H, v0.H[6] // .................................................*......... + // sqrdmulh v26.8H, v14.8H, v0.H[7] // .............................................*............. + // mls v16.8H, v27.8H, v7.H[0] // ...............................................*........... + // sqrdmulh v6.8H, v13.8H, v1.H[3] // ..............................................*............ + // mls v29.8H, v11.8H, v7.H[0] // ..................................................*........ + // sub v22.8H, v9.8H, v31.8H // ................................................*.......... + // mul v14.8H, v13.8H, v1.H[2] // ...................................................*....... + // mls v3.8H, v26.8H, v7.H[0] // ......................................................*.... + // sub v15.8H, v22.8H, v16.8H // .....................................................*..... + // sub v13.8H, v5.8H, v2.8H // .......................................*................... + // add v25.8H, v9.8H, v31.8H // ....................................................*...... + // add v19.8H, v8.8H, v29.8H // .........................................................*. + // mls v14.8H, v6.8H, v7.H[0] // ........................................................*.. + // str q15, [x0, #448] // ..........................................................* + // sqrdmulh v11.8H, v13.8H, v1.H[1] // .......................................................*... sub count, count, #1 layer123_start: - mul v20.8H, v29.8H, v0.H[0] // ........*................................................................... - ldr q14, [x0, #400] // ......e..................................................................... - sub v19.8H, v31.8H, v13.8H // ..........................*................................................. 
- ldr q18, [x0, #464] // .......e.................................................................... - ldr q31, [x0, #208] // ...e........................................................................ - ldr q4, [x0, #144] // ..e......................................................................... - mul v21.8H, v30.8H, v0.H[6] // ................................................*........................... - sqrdmulh v2.8H, v28.8H, v0.H[5] // .......................................*.................................... - sqrdmulh v5.8H, v19.8H, v0.H[5] // ............................................*............................... - mul v11.8H, v19.8H, v0.H[4] // ...........................................*................................ - ldr q12, [x0, #336] // .....e...................................................................... - ldr q24, [x0, #80] // .e.......................................................................... - mls v20.8H, v6.8H, v7.H[0] // ..........*................................................................. + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Cycle bound: 30.0 + // IPC bound: 2.53 + // + // Wall time: 69.86s + // User time: 69.86s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q23, [x0, #16] // e........................................................................... + add v27.8H, v19.8H, v3.8H // ....................................................*....................... + mul v6.8H, v13.8H, v1.H[0] // ......................................................*..................... + ldr q2, [x0, #272] // ....e....................................................................... + add v15.8H, v22.8H, v16.8H // ...................................................................*........ 
+ sub v26.8H, v19.8H, v3.8H // ...................................................*........................ + ldr q13, [x0, #336] // .....e...................................................................... + ldr q31, [x0, #464] // .......e.................................................................... + str q27, [x0], #(16) // ....................................................................*....... + add v16.8H, v25.8H, v14.8H // ..............................................................*............. + sub v27.8H, v25.8H, v14.8H // .............................................................*.............. + ldr q14, [x0, #384] // ......e..................................................................... + sub v4.8H, v8.8H, v29.8H // ...............................*............................................ + ldr q20, [x0, #192] // ...e........................................................................ + str q15, [x0, #368] // ..........................................................................*. + mls v6.8H, v11.8H, v7.H[0] // .......................................................*.................... + str q16, [x0, #240] // ........................................................................*... + mul v16.8H, v2.8H, v0.H[0] // .........e.................................................................. + sqrdmulh v15.8H, v2.8H, v0.H[1] // ........e................................................................... + ldr q3, [x0, #128] // ..e......................................................................... + str q26, [x0, #48] // .....................................................................*...... + mul v26.8H, v31.8H, v0.H[0] // ........................e................................................... + ldr q12, [x0, #64] // .e.......................................................................... 
+ sqrdmulh v31.8H, v31.8H, v0.H[1] // .......................e.................................................... + add v11.8H, v4.8H, v6.8H // .........................................................*.................. + mul v19.8H, v13.8H, v0.H[0] // ..............e............................................................. // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v9.8H, v30.8H, v0.H[7] // .................................................*.......................... - mul v13.8H, v18.8H, v0.H[0] // .......................e.................................................... - sqrdmulh v27.8H, v18.8H, v0.H[1] // ........................e................................................... + sqrdmulh v30.8H, v13.8H, v0.H[1] // .............e.............................................................. + mls v16.8H, v15.8H, v7.H[0] // ..........e................................................................. // gap // ............................................................................ // gap // ............................................................................ - mls v11.8H, v5.8H, v7.H[0] // .............................................*.............................. - mls v8.8H, v2.8H, v7.H[0] // ........................................*................................... + mls v26.8H, v31.8H, v7.H[0] // .........................e.................................................. + sub v15.8H, v4.8H, v6.8H // ........................................................*................... // gap // ............................................................................ // gap // ............................................................................ - mls v21.8H, v9.8H, v7.H[0] // ..................................................*......................... 
- add v9.8H, v26.8H, v20.8H // ............*............................................................... + mul v13.8H, v14.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v14.8H, v14.8H, v0.H[1] // ..................e......................................................... // gap // ............................................................................ // gap // ............................................................................ - mls v13.8H, v27.8H, v7.H[0] // .........................e.................................................. - sub v19.8H, v26.8H, v20.8H // ...........*................................................................ - ldr q26, [x0, #16] // e........................................................................... + str q15, [x0, #176] // .......................................................................*.... + add v8.8H, v23.8H, v16.8H // ............e............................................................... + sub v9.8H, v23.8H, v16.8H // ...........e................................................................ // gap // ............................................................................ - add v2.8H, v25.8H, v11.8H // ...............................................*............................ - sub v11.8H, v25.8H, v11.8H // ..............................................*............................. + add v31.8H, v20.8H, v26.8H // ...........................e................................................ + str q11, [x0, #112] // ......................................................................*..... // gap // ............................................................................ + sub v25.8H, v20.8H, v26.8H // ..........................e................................................. + str q27, [x0, #304] // .........................................................................*.. 
+ mls v13.8H, v14.8H, v7.H[0] // ....................e....................................................... + mls v19.8H, v30.8H, v7.H[0] // ...............e............................................................ // gap // ............................................................................ - add v5.8H, v9.8H, v15.8H // ................................*........................................... - sub v28.8H, v9.8H, v15.8H // ...............................*............................................ + mul v15.8H, v25.8H, v0.H[4] // ............................................e............................... + sqrdmulh v16.8H, v25.8H, v0.H[5] // ...........................................e................................ // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v9.8H, v11.8H, v1.H[5] // ................................................................*........... - mul v11.8H, v11.8H, v1.H[4] // ...............................................................*............ + mul v2.8H, v31.8H, v0.H[2] // ..................................e......................................... + sqrdmulh v4.8H, v31.8H, v0.H[3] // .................................e.......................................... // gap // ............................................................................ // gap // ............................................................................ - sub v27.8H, v28.8H, v22.8H // ........................................................*................... - add v22.8H, v28.8H, v22.8H // .........................................................*.................. + sub v23.8H, v3.8H, v13.8H // .....................e...................................................... + add v5.8H, v12.8H, v19.8H // .................e.......................................................... 
// gap // ............................................................................ // gap // ............................................................................ - add v29.8H, v31.8H, v13.8H // ...........................e................................................ - add v18.8H, v5.8H, v21.8H // ....................................................*....................... + mls v15.8H, v16.8H, v7.H[0] // .............................................e.............................. + sub v18.8H, v12.8H, v19.8H // ................e........................................................... // gap // ............................................................................ // gap // ............................................................................ - mul v23.8H, v2.8H, v1.H[2] // ..........................................................*................. - str q22, [x0, #128] // ......................................................................*..... - mls v11.8H, v9.8H, v7.H[0] // .................................................................*.......... + mul v31.8H, v23.8H, v0.H[4] // .......................................e.................................... + mls v2.8H, v4.8H, v7.H[0] // ...................................e........................................ // gap // ............................................................................ - str q18, [x0], #(16) // ....................................................................*....... - sqrdmulh v20.8H, v2.8H, v1.H[3] // ...........................................................*................ - sub v10.8H, v19.8H, v8.8H // .........................................*.................................. // gap // ............................................................................ - mul v6.8H, v12.8H, v0.H[0] // .............e.............................................................. 
- sqrdmulh v25.8H, v12.8H, v0.H[1] // ..............e............................................................. - str q27, [x0, #176] // .......................................................................*.... + add v6.8H, v3.8H, v13.8H // ......................e..................................................... + sqrdmulh v17.8H, v23.8H, v0.H[5] // ......................................e..................................... // gap // ............................................................................ - add v9.8H, v10.8H, v11.8H // ...................................................................*........ - sub v11.8H, v10.8H, v11.8H // ..................................................................*......... // gap // ............................................................................ + add v13.8H, v18.8H, v15.8H // ...............................................e............................ + sub v15.8H, v18.8H, v15.8H // ..............................................e............................. // gap // ............................................................................ - mul v30.8H, v14.8H, v0.H[0] // ..................e......................................................... - sqrdmulh v22.8H, v14.8H, v0.H[1] // ...................e........................................................ // gap // ............................................................................ + sqrdmulh v11.8H, v6.8H, v0.H[3] // ............................e............................................... + add v14.8H, v5.8H, v2.8H // .....................................e...................................... // gap // ............................................................................ - str q9, [x0, #368] // ..........................................................................*. - sqrdmulh v12.8H, v29.8H, v0.H[3] // ..................................e......................................... 
- mul v16.8H, v29.8H, v0.H[2] // .................................e.......................................... // gap // ............................................................................ - str q11, [x0, #432] // ...........................................................................* - mls v23.8H, v20.8H, v7.H[0] // ............................................................*............... - mls v6.8H, v25.8H, v7.H[0] // ...............e............................................................ + mul v16.8H, v15.8H, v1.H[4] // ................................................................e........... + sqrdmulh v27.8H, v15.8H, v1.H[5] // ...............................................................e............ // gap // ............................................................................ - mls v30.8H, v22.8H, v7.H[0] // ....................e....................................................... - add v14.8H, v19.8H, v8.8H // ..........................................*................................. // gap // ............................................................................ + mul v29.8H, v6.8H, v0.H[2] // .............................e.............................................. + mls v31.8H, v17.8H, v7.H[0] // ........................................e................................... // gap // ............................................................................ - mls v16.8H, v12.8H, v7.H[0] // ...................................e........................................ - sub v27.8H, v5.8H, v21.8H // ...................................................*........................ // gap // ............................................................................ + mul v3.8H, v14.8H, v0.H[6] // .................................................e.......................... + sqrdmulh v26.8H, v14.8H, v0.H[7] // ................................................e........................... 
// gap // ............................................................................ - add v22.8H, v24.8H, v6.8H // .................e.......................................................... - sub v9.8H, v14.8H, v23.8H // .............................................................*.............. // gap // ............................................................................ + mls v16.8H, v27.8H, v7.H[0] // .................................................................e.......... + sqrdmulh v6.8H, v13.8H, v1.H[3] // ..........................................................e................. // gap // ............................................................................ - add v11.8H, v4.8H, v30.8H // ......................e..................................................... - sub v28.8H, v4.8H, v30.8H // .....................e...................................................... - str q27, [x0, #48] // .....................................................................*...... // gap // ............................................................................ - sub v5.8H, v22.8H, v16.8H // ....................................e....................................... - add v30.8H, v22.8H, v16.8H // .....................................e...................................... - str q9, [x0, #304] // .........................................................................*.. + mls v29.8H, v11.8H, v7.H[0] // ..............................e............................................. + sub v22.8H, v9.8H, v31.8H // .........................................e.................................. // gap // ............................................................................ - mul v15.8H, v11.8H, v0.H[2] // ............................e............................................... - sqrdmulh v27.8H, v11.8H, v0.H[3] // .............................e.............................................. 
// gap // ............................................................................ - ldr q29, [x0, #256] // ....e....................................................................... - mul v22.8H, v5.8H, v1.H[0] // .....................................................e...................... - sqrdmulh v9.8H, v5.8H, v1.H[1] // ......................................................e..................... + mul v14.8H, v13.8H, v1.H[2] // ...........................................................e................ + mls v3.8H, v26.8H, v7.H[0] // ..................................................e......................... // gap // ............................................................................ // gap // ............................................................................ - sub v25.8H, v24.8H, v6.8H // ................e........................................................... - add v20.8H, v14.8H, v23.8H // ..............................................................*............. + sub v15.8H, v22.8H, v16.8H // ..................................................................e......... + sub v13.8H, v5.8H, v2.8H // ....................................e....................................... // gap // ............................................................................ // gap // ............................................................................ - mls v15.8H, v27.8H, v7.H[0] // ..............................e............................................. - mul v8.8H, v28.8H, v0.H[4] // ......................................e..................................... + add v25.8H, v9.8H, v31.8H // ..........................................e................................. + add v19.8H, v8.8H, v29.8H // ................................e........................................... // gap // ............................................................................ 
// gap // ............................................................................ - str q20, [x0, #240] // ........................................................................*... - mls v22.8H, v9.8H, v7.H[0] // .......................................................e.................... - sqrdmulh v6.8H, v29.8H, v0.H[1] // .........e.................................................................. + mls v14.8H, v6.8H, v7.H[0] // ............................................................e............... + str q15, [x0, #448] // ...........................................................................e + sqrdmulh v11.8H, v13.8H, v1.H[1] // .....................................................e...................... // gap // ............................................................................ - // original source code - // ldr q8, [x0, #0] // .....................e.....................................................|.....................e................................................... - // ldr q9, [x0, #(1*(512/8))] // ..........e................................................................|..........e.............................................................. - // ldr q10, [x0, #(2*(512/8))] // ....e......................................................................|....e.................................................................... - // ldr q11, [x0, #(3*(512/8))] // ...e.......................................................................|...e..................................................................... - // ldr q12, [x0, #(4*(512/8))] // .................................................................e.........|.................................................................e....... - // ldr q13, [x0, #(5*(512/8))] // .........e.................................................................|.........e............................................................... 
- // ldr q14, [x0, #(6*(512/8))] // e..........................................................................|e........................................................................ - // ldr q15, [x0, #(7*(512/8))] // ..e........................................................................|..e...................................................................... - // mul v24.8h, v12.8h, v0.h[0] // ...........................................................................*......................................................................... - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ..........................................................................e|......................................................................... - // mls v24.8h, v12.8h, v7.h[0] // ...........*...............................................................|...........*............................................................. - // sub v12.8h, v8.8h, v24.8h // ....................*......................................................|....................*.................................................... - // add v8.8h, v8.8h, v24.8h // ..................*........................................................|..................*...................................................... - // mul v24.8h, v13.8h, v0.h[0] // ......................................e....................................|......................................e.................................. - // sqrdmulh v13.8h, v13.8h, v0.h[1] // .......................................e...................................|.......................................e................................. - // mls v24.8h, v13.8h, v7.h[0] // ..................................................e........................|..................................................e...................... 
- // sub v13.8h, v9.8h, v24.8h // ....................................................................e......|....................................................................e.... - // add v9.8h, v9.8h, v24.8h // .......................................................e...................|.......................................................e................. - // mul v24.8h, v14.8h, v0.h[0] // ...........................................e...............................|...........................................e............................. - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ............................................e..............................|............................................e............................ - // mls v24.8h, v14.8h, v7.h[0] // ...................................................e.......................|...................................................e..................... - // sub v14.8h, v10.8h, v24.8h // ..........................................................e................|..........................................................e.............. - // add v10.8h, v10.8h, v24.8h // .........................................................e.................|.........................................................e............... - // mul v24.8h, v15.8h, v0.h[0] // .............e.............................................................|.............e........................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[1] // ..............e............................................................|..............e.......................................................... - // mls v24.8h, v15.8h, v7.h[0] // ...................e.......................................................|...................e..................................................... 
- // sub v15.8h, v11.8h, v24.8h // .*.........................................................................|.*....................................................................... - // add v11.8h, v11.8h, v24.8h // ..............................e............................................|..............................e.......................................... - // mul v24.8h, v10.8h, v0.h[2] // ...............................................................e...........|...............................................................e......... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ................................................................e..........|................................................................e........ - // mls v24.8h, v10.8h, v7.h[0] // ......................................................................e....|......................................................................e.. - // sub v10.8h, v8.8h, v24.8h // .........................*.................................................|.........................*............................................... - // add v8.8h, v8.8h, v24.8h // ........................*..................................................|........................*................................................ - // mul v24.8h, v11.8h, v0.h[2] // ...............................................e...........................|...............................................e......................... - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ..............................................e............................|..............................................e.......................... - // mls v24.8h, v11.8h, v7.h[0] // .....................................................e.....................|.....................................................e................... 
- // sub v11.8h, v9.8h, v24.8h // ............................................................e..............|............................................................e............ - // add v9.8h, v9.8h, v24.8h // .............................................................e.............|.............................................................e........... - // mul v24.8h, v14.8h, v0.h[4] // .......................................................................e...|.......................................................................e. - // sqrdmulh v14.8h, v14.8h, v0.h[5] // ......*....................................................................|......*.................................................................. - // mls v24.8h, v14.8h, v7.h[0] // ................*..........................................................|................*........................................................ - // sub v14.8h, v12.8h, v24.8h // .....................................*.....................................|.....................................*................................... - // add v12.8h, v12.8h, v24.8h // ....................................................*......................|....................................................*.................... - // mul v24.8h, v15.8h, v0.h[4] // ........*..................................................................|........*................................................................ - // sqrdmulh v15.8h, v15.8h, v0.h[5] // .......*...................................................................|.......*................................................................. - // mls v24.8h, v15.8h, v7.h[0] // ...............*...........................................................|...............*......................................................... 
- // sub v15.8h, v13.8h, v24.8h // .......................*...................................................|.......................*................................................. - // add v13.8h, v13.8h, v24.8h // ......................*....................................................|......................*.................................................. - // mul v24.8h, v9.8h, v0.h[6] // .....*.....................................................................|.....*................................................................... - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ............*..............................................................|............*............................................................ - // mls v24.8h, v9.8h, v7.h[0] // .................*.........................................................|.................*....................................................... - // sub v9.8h, v8.8h, v24.8h // ......................................................*....................|......................................................*.................. - // add v8.8h, v8.8h, v24.8h // ...............................*...........................................|...............................*......................................... - // mul v24.8h, v11.8h, v1.h[0] // ..................................................................e........|..................................................................e...... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...................................................................e.......|...................................................................e..... - // mls v24.8h, v11.8h, v7.h[0] // .........................................................................e.|......................................................................... 
- // sub v11.8h, v10.8h, v24.8h // ............................*..............................................|............................*............................................ - // add v10.8h, v10.8h, v24.8h // .............................*.............................................|.............................*........................................... - // mul v24.8h, v13.8h, v1.h[2] // ................................*..........................................|................................*........................................ - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ....................................*......................................|....................................*.................................... - // mls v24.8h, v13.8h, v7.h[0] // .................................................*.........................|.................................................*....................... - // sub v13.8h, v12.8h, v24.8h // ........................................................*..................|........................................................*................ - // add v12.8h, v12.8h, v24.8h // .....................................................................*.....|.....................................................................*... - // mul v24.8h, v15.8h, v1.h[4] // ...........................*...............................................|...........................*............................................. - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ..........................*................................................|..........................*.............................................. - // mls v24.8h, v15.8h, v7.h[0] // ..................................*........................................|..................................*...................................... 
- // sub v15.8h, v14.8h, v24.8h // ..........................................*................................|..........................................*.............................. - // add v14.8h, v14.8h, v24.8h // .........................................*.................................|.........................................*............................... - // str q8, [x0], #(16) // ...................................*.......................................|...................................*..................................... - // str q9, [x0, #(-16 + 1*(512/8))] // ...........................................................*...............|...........................................................*............. - // str q10, [x0, #(-16 + 2*(512/8))] // .................................*.........................................|.................................*....................................... - // str q11, [x0, #(-16 + 3*(512/8))] // ........................................*..................................|........................................*................................ - // str q12, [x0, #(-16 + 4*(512/8))] // ........................................................................*..|........................................................................* - // str q13, [x0, #(-16 + 5*(512/8))] // ..............................................................*............|..............................................................*.......... - // str q14, [x0, #(-16 + 6*(512/8))] // .............................................*.............................|.............................................*........................... - // str q15, [x0, #(-16 + 7*(512/8))] // ................................................*..........................|................................................*........................ 
+ // -------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q8, [x0, #0] // e...........................................................................~...................................... + // ldr q9, [x0, #(1*(512/8))] // ......................e.....................................................'.....................~................ + // ldr q10, [x0, #(2*(512/8))] // ...................e........................................................'..................~................... + // ldr q11, [x0, #(3*(512/8))] // .............e..............................................................'............~......................... + // ldr q12, [x0, #(4*(512/8))] // ...e........................................................................'..~................................... + // ldr q13, [x0, #(5*(512/8))] // ......e.....................................................................'.....~................................ + // ldr q14, [x0, #(6*(512/8))] // ...........e................................................................'..........~........................... + // ldr q15, [x0, #(7*(512/8))] // .......e....................................................................'......~............................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ..................e.........................................................'.................~.................... + // mul v24.8h, v12.8h, v0.h[0] // .................e..........................................................'................~..................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................e................................................'..........................~........... 
+ // sub v12.8h, v8.8h, v24.8h // ..................................e.........................................'.................................~.... + // add v8.8h, v8.8h, v24.8h // .................................e..........................................'................................~..... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..........................e.................................................'.........................~............ + // mul v24.8h, v13.8h, v0.h[0] // .........................e..................................................'........................~............. + // mls v24.8h, v27.8h, v7.h[0] // ........................................e...................................'...................................... + // sub v13.8h, v9.8h, v24.8h // ................................................e...........................'...................................... + // add v9.8h, v9.8h, v24.8h // ..............................................e.............................'...................................... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ...............................e............................................'..............................~....... + // mul v24.8h, v14.8h, v0.h[0] // ..............................e.............................................'.............................~........ + // mls v24.8h, v27.8h, v7.h[0] // .......................................e....................................'...................................... + // sub v14.8h, v10.8h, v24.8h // .............................................e..............................'...................................... + // add v10.8h, v10.8h, v24.8h // ...................................................e........................'...................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .......................e....................................................'......................~............... 
+ // mul v24.8h, v15.8h, v0.h[0] // .....................e......................................................'....................~................. + // mls v24.8h, v27.8h, v7.h[0] // ............................e...............................................'...........................~.......... + // sub v15.8h, v11.8h, v24.8h // .....................................e......................................'....................................~. + // add v11.8h, v11.8h, v24.8h // ...................................e........................................'..................................~... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .......................................................e....................'...................................... + // mul v24.8h, v10.8h, v0.h[2] // ...........................................................e................'...................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e..........'...................................... + // sub v10.8h, v8.8h, v24.8h // ............~...............................................................'...........*.......................... + // add v8.8h, v8.8h, v24.8h // ........................................................................e...'...................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ............................................e...............................'...................................... + // mul v24.8h, v11.8h, v0.h[2] // ...........................................e................................'...................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................................e.........................'...................................... + // sub v11.8h, v9.8h, v24.8h // ......................................................................e.....'...................................... 
+ // add v9.8h, v9.8h, v24.8h // ........................................................e...................'...................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ....................................................e.......................'...................................... + // mul v24.8h, v14.8h, v0.h[4] // .................................................e..........................'...................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................................e...............'...................................... + // sub v14.8h, v12.8h, v24.8h // ..................................................................e.........'...................................... + // add v12.8h, v12.8h, v24.8h // .......................................................................e....'...................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ..........................................e.................................'...................................... + // mul v24.8h, v15.8h, v0.h[4] // .........................................e..................................'...................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................e............................'...................................... + // sub v15.8h, v13.8h, v24.8h // ......................................................e.....................'...................................... + // add v13.8h, v13.8h, v24.8h // .....................................................e......................'...................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..............................................................e.............'...................................... + // mul v24.8h, v9.8h, v0.h[6] // .............................................................e..............'...................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ....................................................................e.......'...................................... + // sub v9.8h, v8.8h, v24.8h // .....~......................................................................'....*................................. + // add v8.8h, v8.8h, v24.8h // .~..........................................................................'*..................................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ...........................................................................e'...................................... + // mul v24.8h, v11.8h, v1.h[0] // ..~.........................................................................'.*.................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............~............................................................'..............*....................... + // sub v11.8h, v10.8h, v24.8h // .............................~..............................................'............................*......... + // add v10.8h, v10.8h, v24.8h // ........................~...................................................'.......................*.............. + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ................................................................e...........'...................................... + // mul v24.8h, v13.8h, v1.h[2] // ...................................................................e........'...................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................................................e..'...................................... + // sub v13.8h, v12.8h, v24.8h // ..........~.................................................................'.........*............................ + // add v12.8h, v12.8h, v24.8h // .........~..................................................................'........*............................. 
+ // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..........................................................e.................'...................................... + // mul v24.8h, v15.8h, v1.h[4] // .........................................................e..................'...................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................................e............'...................................... + // sub v15.8h, v14.8h, v24.8h // .....................................................................e......'...................................... + // add v14.8h, v14.8h, v24.8h // ....~.......................................................................'...*.................................. + // str q8, [x0], #(16) // ........~...................................................................'.......*.............................. + // str q9, [x0, #(-16 + 1*(512/8))] // ....................~.......................................................'...................*.................. + // str q10, [x0, #(-16 + 2*(512/8))] // ....................................~.......................................'...................................*.. + // str q11, [x0, #(-16 + 3*(512/8))] // ................................~...........................................'...............................*...... + // str q12, [x0, #(-16 + 4*(512/8))] // ................~...........................................................'...............*...................... + // str q13, [x0, #(-16 + 5*(512/8))] // ......................................~.....................................'.....................................* + // str q14, [x0, #(-16 + 6*(512/8))] // ..............~.............................................................'.............*........................ 
+ // str q15, [x0, #(-16 + 7*(512/8))] // ..........................................................................e.'...................................... sub count, count, #1 cbnz count, layer123_start - mul v5.8H, v30.8H, v0.H[6] // ..*...................................... - sub v9.8H, v31.8H, v13.8H // .*....................................... - // gap // ......................................... - // gap // ......................................... - sqrdmulh v13.8H, v28.8H, v0.H[5] // ...*..................................... - mul v17.8H, v29.8H, v0.H[0] // *........................................ - // gap // ......................................... - // gap // ......................................... - sqrdmulh v11.8H, v9.8H, v0.H[5] // ....*.................................... - mul v20.8H, v9.8H, v0.H[4] // .....*................................... - // gap // ......................................... - // gap // ......................................... - sqrdmulh v2.8H, v30.8H, v0.H[7] // .......*................................. - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - mls v17.8H, v6.8H, v7.H[0] // ......*.................................. - mls v8.8H, v13.8H, v7.H[0] // .........*............................... - // gap // ......................................... - // gap // ......................................... - mls v20.8H, v11.8H, v7.H[0] // ........*................................ - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - mls v5.8H, v2.8H, v7.H[0] // ..........*.............................. - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... 
- add v27.8H, v26.8H, v17.8H // ...........*............................. - sub v12.8H, v26.8H, v17.8H // ............*............................ - // gap // ......................................... - // gap // ......................................... - sub v13.8H, v25.8H, v20.8H // ..............*.......................... - // gap // ......................................... - // gap // ......................................... - add v6.8H, v25.8H, v20.8H // .............*........................... - sub v16.8H, v27.8H, v15.8H // ................*........................ - add v4.8H, v27.8H, v15.8H // ...............*......................... - // gap // ......................................... - // gap // ......................................... - mul v17.8H, v6.8H, v1.H[2] // ......................*.................. - sqrdmulh v9.8H, v6.8H, v1.H[3] // ..........................*.............. - // gap // ......................................... - // gap // ......................................... - sqrdmulh v2.8H, v13.8H, v1.H[5] // .................*....................... - mul v19.8H, v13.8H, v1.H[4] // ..................*...................... - // gap // ......................................... - // gap // ......................................... - add v21.8H, v16.8H, v22.8H // ....................*.................... - sub v25.8H, v16.8H, v22.8H // ...................*..................... - // gap // ......................................... - // gap // ......................................... - mls v17.8H, v9.8H, v7.H[0] // .................................*....... - sub v20.8H, v4.8H, v5.8H // ...................................*..... - // gap // ......................................... - // gap // ......................................... - mls v19.8H, v2.8H, v7.H[0] // ........................*................ - str q21, [x0, #128] // .......................*................. 
- add v29.8H, v12.8H, v8.8H // ..................................*...... - // gap // ......................................... - str q25, [x0, #192] // ............................*............ - add v16.8H, v4.8H, v5.8H // .....................*................... - sub v13.8H, v12.8H, v8.8H // ...........................*............. - // gap // ......................................... - sub v25.8H, v29.8H, v17.8H // ....................................*.... - add v11.8H, v29.8H, v17.8H // .......................................*. - str q20, [x0, #64] // .....................................*... - // gap // ......................................... - add v26.8H, v13.8H, v19.8H // .............................*........... - sub v8.8H, v13.8H, v19.8H // ..............................*.......... - str q16, [x0], #(16) // .........................*............... - // gap // ......................................... - str q25, [x0, #304] // ......................................*.. - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - str q8, [x0, #432] // ................................*........ - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - str q26, [x0, #368] // ...............................*......... - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... - str q11, [x0, #240] // ........................................* - // gap // ......................................... - // gap // ......................................... - // gap // ......................................... 
+ // Instructions: 17 + // Expected cycles: 10 + // Expected IPC: 1.70 + // + // Cycle bound: 10.0 + // IPC bound: 1.70 + // + // Wall time: 0.13s + // User time: 0.13s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v6.8H, v13.8H, v1.H[0] // .*............................ + add v31.8H, v22.8H, v16.8H // ..*........................... + // gap // .............................. + // gap // .............................. + sub v27.8H, v8.8H, v29.8H // .......*...................... + sub v16.8H, v19.8H, v3.8H // ...*.......................... + // gap // .............................. + // gap // .............................. + add v15.8H, v19.8H, v3.8H // *............................. + str q31, [x0, #384] // ........*..................... + add v30.8H, v25.8H, v14.8H // .....*........................ + // gap // .............................. + mls v6.8H, v11.8H, v7.H[0] // .........*.................... + str q16, [x0, #64] // ...........*.................. + // gap // .............................. + // gap // .............................. + str q15, [x0], #(16) // ....*......................... + sub v15.8H, v25.8H, v14.8H // ......*....................... + // gap // .............................. + // gap // .............................. + str q30, [x0, #240] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q15, [x0, #304] // ................*............. + add v15.8H, v27.8H, v6.8H // ............*................. + // gap // .............................. + // gap // .............................. + sub v25.8H, v27.8H, v6.8H // .............*................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q15, [x0, #112] // ...............*.............. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + str q25, [x0, #176] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. - // original source code - // mul v20.8H, v29.8H, v0.H[0] // ...*..................................... - // sub v19.8H, v31.8H, v13.8H // .*....................................... - // mul v21.8H, v30.8H, v0.H[6] // *........................................ - // sqrdmulh v2.8H, v28.8H, v0.H[5] // ..*...................................... - // sqrdmulh v5.8H, v19.8H, v0.H[5] // ....*.................................... - // mul v11.8H, v19.8H, v0.H[4] // .....*................................... - // mls v20.8H, v6.8H, v7.H[0] // .......*................................. - // sqrdmulh v9.8H, v30.8H, v0.H[7] // ......*.................................. - // mls v11.8H, v5.8H, v7.H[0] // .........*............................... - // mls v8.8H, v2.8H, v7.H[0] // ........*................................ - // mls v21.8H, v9.8H, v7.H[0] // ..........*.............................. - // add v9.8H, v26.8H, v20.8H // ...........*............................. - // sub v19.8H, v26.8H, v20.8H // ............*............................ - // add v2.8H, v25.8H, v11.8H // ..............*.......................... - // sub v11.8H, v25.8H, v11.8H // .............*........................... - // add v5.8H, v9.8H, v15.8H // ................*........................ - // sub v28.8H, v9.8H, v15.8H // ...............*......................... - // sqrdmulh v9.8H, v11.8H, v1.H[5] // ...................*..................... - // mul v11.8H, v11.8H, v1.H[4] // ....................*.................... - // sub v27.8H, v28.8H, v22.8H // ......................*.................. - // add v22.8H, v28.8H, v22.8H // .....................*................... 
- // add v18.8H, v5.8H, v21.8H // .............................*........... - // mul v23.8H, v2.8H, v1.H[2] // .................*....................... - // str q22, [x0, #128] // ..........................*.............. - // mls v11.8H, v9.8H, v7.H[0] // .........................*............... - // str q18, [x0], #(16) // ....................................*.... - // sqrdmulh v20.8H, v2.8H, v1.H[3] // ..................*...................... - // sub v10.8H, v19.8H, v8.8H // ..............................*.......... - // str q27, [x0, #176] // ............................*............ - // add v9.8H, v10.8H, v11.8H // ..................................*...... - // sub v11.8H, v10.8H, v11.8H // ...................................*..... - // str q9, [x0, #368] // .......................................*. - // str q11, [x0, #432] // ......................................*.. - // mls v23.8H, v20.8H, v7.H[0] // .......................*................. - // add v14.8H, v19.8H, v8.8H // ...........................*............. - // sub v27.8H, v5.8H, v21.8H // ........................*................ - // sub v9.8H, v14.8H, v23.8H // ...............................*......... - // str q27, [x0, #48] // .................................*....... - // str q9, [x0, #304] // .....................................*... - // add v20.8H, v14.8H, v23.8H // ................................*........ - // str q20, [x0, #240] // ........................................* + // -------- new position --------> + // 0 25 + // |------------------------|----- + // add v27.8H, v19.8H, v3.8H // ....*.......................... + // mul v6.8H, v13.8H, v1.H[0] // *.............................. + // add v15.8H, v22.8H, v16.8H // .*............................. + // sub v26.8H, v19.8H, v3.8H // ...*........................... + // str q27, [x0], #(16) // .........*..................... + // add v16.8H, v25.8H, v14.8H // ......*........................ 
+ // sub v27.8H, v25.8H, v14.8H // ..........*.................... + // sub v4.8H, v8.8H, v29.8H // ..*............................ + // str q15, [x0, #368] // .....*......................... + // mls v6.8H, v11.8H, v7.H[0] // .......*....................... + // str q16, [x0, #240] // ...........*................... + // str q26, [x0, #48] // ........*...................... + // add v11.8H, v4.8H, v6.8H // .............*................. + // sub v15.8H, v4.8H, v6.8H // ..............*................ + // str q15, [x0, #176] // ................*.............. + // str q11, [x0, #112] // ...............*............... + // str q27, [x0, #304] // ............*.................. restore inp, STACK0 mov count, #8 .p2align 2 - ldr q13, [x3], #16 // *............................ - ldr q16, [x1, #48] // .*........................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q28, [x4, #16] // ............*................ - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q5, [x1, #16] // ......*...................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q1, [x1, #32] // ..*.......................... - mul v8.8H, v16.8H, v13.H[0] // .....*....................... - sqrdmulh v26.8H, v16.8H, v13.H[1] // ....*........................ - // gap // ............................. - // gap // ............................. - ldr q22, [x4], #(6*16) // ............................* - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. 
- // gap // ............................. - mls v8.8H, v26.8H, v7.H[0] // ........*.................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - mul v14.8H, v1.8H, v13.H[0] // .......*..................... - sqrdmulh v0.8H, v1.8H, v13.H[1] // .........*................... - // gap // ............................. - // gap // ............................. - // gap // ............................. - ldr q24, [x1, #0] // ...*......................... - // gap // ............................. - // gap // ............................. - sub v12.8H, v5.8H, v8.8H // ..........*.................. - add v2.8H, v5.8H, v8.8H // ..............*.............. - // gap // ............................. - // gap // ............................. - mls v14.8H, v0.8H, v7.H[0] // ...........*................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - mul v26.8H, v12.8H, v13.H[4] // ...............*............. - sqrdmulh v12.8H, v12.8H, v13.H[5] // ..................*.......... - // gap // ............................. - // gap // ............................. - mul v8.8H, v2.8H, v13.H[2] // ................*............ - sqrdmulh v0.8H, v2.8H, v13.H[3] // ...................*......... - // gap // ............................. - // gap // ............................. - add v9.8H, v24.8H, v14.8H // .....................*....... - // gap // ............................. - // gap // ............................. - // gap // ............................. - mls v26.8H, v12.8H, v7.H[0] // ....................*........ - // gap // ............................. - // gap // ............................. - ldr q29, [x4, #-64] // .............*............... - mls v8.8H, v0.8H, v7.H[0] // ......................*...... - // gap // ............................. - // gap // ............................. 
- sub v0.8H, v24.8H, v14.8H // .................*........... - // gap // ............................. - // gap // ............................. - // gap // ............................. - // gap // ............................. - add v11.8H, v0.8H, v26.8H // ........................*.... - // gap // ............................. - // gap // ............................. - sub v20.8H, v0.8H, v26.8H // .......................*..... - sub v18.8H, v9.8H, v8.8H // ..........................*.. - // gap // ............................. - // gap // ............................. - // gap // ............................. - add v19.8H, v9.8H, v8.8H // .........................*... - // gap // ............................. - // gap // ............................. - trn2 v21.4S, v11.4S, v20.4S // ...........................*. + // Instructions: 32 + // Expected cycles: 21 + // Expected IPC: 1.52 + // + // Cycle bound: 21.0 + // IPC bound: 1.52 + // + // Wall time: 0.36s + // User time: 0.36s + // + // ------ original position ------> + // 0 25 + // |------------------------|------ + ldr q22, [x1, #48] // .*.............................. + ldr q0, [x3], #16 // *............................... + // gap // ................................ + // gap // ................................ + ldr q13, [x1, #0] // ...*............................ + ldr q26, [x1, #32] // ..*............................. + // gap // ................................ + // gap // ................................ + ldr q14, [x4], #(6*16) // ...........................*.... + ldr q6, [x1, #16] // ....*........................... + // gap // ................................ + // gap // ................................ + ldr q11, [x4, #-32] // ...............................* + ldr q31, [x4, #-16] // .........................*...... + // gap // ................................ + // gap // ................................ + sqrdmulh v25.8H, v22.8H, v0.H[1] // .....*.......................... 
+ mul v16.8H, v22.8H, v0.H[0] // ......*......................... + ldr q22, [x4, #-64] // ..........................*..... + // gap // ................................ + sqrdmulh v3.8H, v26.8H, v0.H[1] // .......*........................ + mul v19.8H, v26.8H, v0.H[0] // ........*....................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v16.8H, v25.8H, v7.H[0] // .........*...................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v19.8H, v3.8H, v7.H[0] // ..........*..................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sub v25.8H, v6.8H, v16.8H // ...........*.................... + add v16.8H, v6.8H, v16.8H // ............*................... + // gap // ................................ + // gap // ................................ + sub v6.8H, v13.8H, v19.8H // ...............*................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sqrdmulh v15.8H, v25.8H, v0.H[5] // ..............*................. + mul v25.8H, v25.8H, v0.H[4] // ................*............... + // gap // ................................ + // gap // ................................ + sqrdmulh v3.8H, v16.8H, v0.H[3] // .................*.............. + mul v16.8H, v16.8H, v0.H[2] // ...................*............ + // gap // ................................ 
+ // gap // ................................ + add v27.8H, v13.8H, v19.8H // ....................*........... + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v25.8H, v15.8H, v7.H[0] // ..................*............. + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v16.8H, v3.8H, v7.H[0] // ......................*......... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v15.8H, v6.8H, v25.8H // .......................*........ + sub v6.8H, v6.8H, v25.8H // .....................*.......... + // gap // ................................ + add v17.8H, v27.8H, v16.8H // .............................*.. + ldr q0, [x4, #-48] // .............*.................. + sub v16.8H, v27.8H, v16.8H // ........................*....... + // gap // ................................ + // gap // ................................ + ldr q27, [x4, #-80] // ............................*... + trn1 v2.4S, v15.4S, v6.4S // ..............................*. - // original source code - // ldr q15, [x3], #16 // *............................ - // ldr q26, [x1, #48] // .*........................... - // ldr q4, [x1, #32] // ....*........................ - // ldr q25, [x1, #0] // ...........*................. - // sqrdmulh v5.8H, v26.8H, v15.H[1] // ......*...................... - // mul v10.8H, v26.8H, v15.H[0] // .....*....................... - // ldr q0, [x1, #16] // ...*......................... - // mul v23.8H, v4.8H, v15.H[0] // .........*................... 
- // mls v10.8H, v5.8H, v7.H[0] // ........*.................... - // sqrdmulh v3.8H, v4.8H, v15.H[1] // ..........*.................. - // sub v14.8H, v0.8H, v10.8H // ............*................ - // mls v23.8H, v3.8H, v7.H[0] // ..............*.............. - // ldr q28, [x4, #16] // ..*.......................... - // ldr q29, [x4, #32] // .....................*....... - // add v19.8H, v0.8H, v10.8H // .............*............... - // mul v18.8H, v14.8H, v15.H[4] // ...............*............. - // mul v30.8H, v19.8H, v15.H[2] // .................*........... - // sub v26.8H, v25.8H, v23.8H // .......................*..... - // sqrdmulh v1.8H, v14.8H, v15.H[5] // ................*............ - // sqrdmulh v8.8H, v19.8H, v15.H[3] // ..................*.......... - // mls v18.8H, v1.8H, v7.H[0] // ....................*........ - // add v10.8H, v25.8H, v23.8H // ...................*......... - // mls v30.8H, v8.8H, v7.H[0] // ......................*...... - // sub v20.8H, v26.8H, v18.8H // .........................*... - // add v11.8H, v26.8H, v18.8H // ........................*.... - // add v19.8H, v10.8H, v30.8H // ...........................*. - // sub v18.8H, v10.8H, v30.8H // ..........................*.. - // trn2 v21.4S, v11.4S, v20.4S // ............................* - // ldr q22, [x4], #(6*16) // .......*..................... + // -------- new position ---------> + // 0 25 + // |------------------------|------ + // ldr q3, [x3], #16 // .*.............................. + // ldr q25, [x1, #48] // *............................... + // ldr q5, [x1, #32] // ...*............................ + // ldr q23, [x1, #0] // ..*............................. + // ldr q1, [x1, #16] // .....*.......................... + // sqrdmulh v6.8H, v25.8H, v3.H[1] // ........*....................... + // mul v10.8H, v25.8H, v3.H[0] // .........*...................... + // sqrdmulh v25.8H, v5.8H, v3.H[1] // ...........*.................... 
+ // mul v29.8H, v5.8H, v3.H[0] // ............*................... + // mls v10.8H, v6.8H, v7.H[0] // .............*.................. + // mls v29.8H, v25.8H, v7.H[0] // ..............*................. + // sub v13.8H, v1.8H, v10.8H // ...............*................ + // add v17.8H, v1.8H, v10.8H // ................*............... + // ldr q0, [x4, #48] // ............................*... + // sqrdmulh v31.8H, v13.8H, v3.H[5] // ..................*............. + // sub v22.8H, v23.8H, v29.8H // .................*.............. + // mul v26.8H, v13.8H, v3.H[4] // ...................*............ + // sqrdmulh v25.8H, v17.8H, v3.H[3] // ....................*........... + // mls v26.8H, v31.8H, v7.H[0] // .......................*........ + // mul v2.8H, v17.8H, v3.H[2] // .....................*.......... + // add v17.8H, v23.8H, v29.8H // ......................*......... + // sub v6.8H, v22.8H, v26.8H // ..........................*..... + // mls v2.8H, v25.8H, v7.H[0] // ........................*....... + // add v15.8H, v22.8H, v26.8H // .........................*...... + // sub v16.8H, v17.8H, v2.8H // .............................*.. + // ldr q31, [x4, #80] // .......*........................ + // ldr q22, [x4, #32] // ..........*..................... + // ldr q14, [x4], #(6*16) // ....*........................... + // ldr q27, [x4, #-80] // ..............................*. + // add v17.8H, v17.8H, v2.8H // ...........................*.... + // trn1 v2.4S, v15.4S, v6.4S // ...............................* + // ldr q11, [x4, #-32] // ......*......................... sub count, count, #1 layer4567_start: - trn1 v10.4S, v11.4S, v20.4S // ...........................*............................................ - ldr q15, [x3], #16 // ....e................................................................... - trn2 v2.4S, v19.4S, v18.4S // ..........................*............................................. 
- // gap // ........................................................................ - trn1 v9.4S, v19.4S, v18.4S // .........................*.............................................. - ldr q26, [x1, #112] // ...e.................................................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v30.2D, v2.2D, v21.2D // ................................*....................................... - trn2 v5.2D, v2.2D, v21.2D // ..............................*......................................... - // gap // ........................................................................ - trn2 v11.2D, v9.2D, v10.2D // .............................*.......................................... - trn1 v16.2D, v9.2D, v10.2D // ...............................*........................................ - ldr q4, [x1, #96] // ..e..................................................................... - // gap // ........................................................................ - ldr q25, [x1, #64] // e....................................................................... - // gap // ........................................................................ - sqrdmulh v9.8H, v5.8H, v28.8H // .............................................*.......................... - mul v20.8H, v5.8H, v22.8H // ............................................*........................... - sqrdmulh v5.8H, v26.8H, v15.H[1] // ...........e............................................................ - mul v10.8H, v26.8H, v15.H[0] // ..........e............................................................. - ldr q0, [x1, #80] // .e...................................................................... 
- // gap // ........................................................................ - sqrdmulh v18.8H, v11.8H, v28.8H // ........................................*............................... - // gap // ........................................................................ - // gap // ........................................................................ - mul v27.8H, v11.8H, v22.8H // .......................................*................................ - mul v23.8H, v4.8H, v15.H[0] // .....e.................................................................. - mls v20.8H, v9.8H, v7.H[0] // ..............................................*......................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v10.8H, v5.8H, v7.H[0] // ............e........................................................... - sqrdmulh v3.8H, v4.8H, v15.H[1] // ......e................................................................. - ldr q26, [x4, #-32] // .....................................*.................................. - ldr q24, [x4, #-16] // ......................................*................................. - mls v27.8H, v18.8H, v7.H[0] // .........................................*.............................. - // gap // ........................................................................ - // gap // ........................................................................ - ldr q5, [x4, #-48] // ....................................*................................... - add v2.8H, v30.8H, v20.8H // ................................................*....................... - sub v9.8H, v30.8H, v20.8H // ...............................................*........................ - // gap // ........................................................................ 
- // gap // ........................................................................ - sub v14.8H, v0.8H, v10.8H // .............e.......................................................... - mls v23.8H, v3.8H, v7.H[0] // .......e................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v11.8H, v9.8H, v26.8H // ......................................................*................. - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v9.8H, v9.8H, v24.8H // .......................................................*................ - sqrdmulh v20.8H, v2.8H, v5.8H // ..................................................*..................... - mul v2.8H, v2.8H, v29.8H // .................................................*...................... - ldr q28, [x4, #16] // ..................................e..................................... - ldr q29, [x4, #32] // ...................................e.................................... - add v19.8H, v0.8H, v10.8H // ..............e......................................................... - mul v18.8H, v14.8H, v15.H[4] // ....................e................................................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v11.8H, v9.8H, v7.H[0] // ........................................................*............... - sub v9.8H, v16.8H, v27.8H // ..........................................*............................. - // gap // ........................................................................ 
- // gap // ........................................................................ - mul v30.8H, v19.8H, v15.H[2] // ...............e........................................................ - mls v2.8H, v20.8H, v7.H[0] // ...................................................*.................... - // gap // ........................................................................ - // gap // ........................................................................ - add v22.8H, v16.8H, v27.8H // ...........................................*............................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v26.8H, v25.8H, v23.8H // ........e............................................................... - add v5.8H, v9.8H, v11.8H // ..........................................................*............. - sub v6.8H, v9.8H, v11.8H // .........................................................*.............. - // gap // ........................................................................ - // gap // ........................................................................ - sub v4.8H, v22.8H, v2.8H // ....................................................*................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - add v3.8H, v22.8H, v2.8H // .....................................................*.................. - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v2.8H, v6.8H, v7.H[1] // ....................................................................*... 
- sqdmulh v27.8H, v4.8H, v7.H[1] // ..............................................................*......... - sqrdmulh v1.8H, v14.8H, v15.H[5] // .....................e.................................................. - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v20.8H, v3.8H, v7.H[1] // ...........................................................*............ - // gap // ........................................................................ - // gap // ........................................................................ - sqdmulh v11.8H, v5.8H, v7.H[1] // .................................................................*...... - sqrdmulh v8.8H, v19.8H, v15.H[3] // ................e....................................................... - srshr v9.8H, v2.8H, #11 // .....................................................................*.. - // gap // ........................................................................ - // gap // ........................................................................ - mls v18.8H, v1.8H, v7.H[0] // ......................e................................................. - srshr v22.8H, v27.8H, #11 // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - srshr v27.8H, v20.8H, #11 // ............................................................*........... - srshr v11.8H, v11.8H, #11 // ..................................................................*..... - // gap // ........................................................................ - // gap // ........................................................................ 
- mls v6.8H, v9.8H, v7.H[0] // ......................................................................*. - add v10.8H, v25.8H, v23.8H // .........e.............................................................. - // gap // ........................................................................ - // gap // ........................................................................ - mls v30.8H, v8.8H, v7.H[0] // .................e...................................................... - mls v4.8H, v22.8H, v7.H[0] // ................................................................*....... - // gap // ........................................................................ - // gap // ........................................................................ - mls v5.8H, v11.8H, v7.H[0] // ...................................................................*.... - mls v3.8H, v27.8H, v7.H[0] // .............................................................*.......... - // gap // ........................................................................ - // gap // ........................................................................ - sub v20.8H, v26.8H, v18.8H // .......................e................................................ - add v11.8H, v26.8H, v18.8H // ........................e............................................... - // gap // ........................................................................ - // gap // ........................................................................ - add v19.8H, v10.8H, v30.8H // ...................e.................................................... - sub v18.8H, v10.8H, v30.8H // ..................e..................................................... - // gap // ........................................................................ - // gap // ........................................................................ 
- trn2 v21.4S, v11.4S, v20.4S // ............................e........................................... - st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1], #64 // .......................................................................* - // gap // ........................................................................ - ldr q22, [x4], #(6*16) // .................................e...................................... + // Instructions: 72 + // Expected cycles: 32 + // Expected IPC: 2.25 + // + // Cycle bound: 30.0 + // IPC bound: 2.40 + // + // Wall time: 3600.58s + // User time: 3600.58s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + // gap // ........................................................................ + ldr q3, [x3], #16 // ....e................................................................... + trn2 v29.4S, v17.4S, v16.4S // ..........................*............................................. + trn2 v28.4S, v15.4S, v6.4S // ............................*........................................... + ldr q25, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + trn1 v17.4S, v17.4S, v16.4S // .........................*.............................................. + // gap // ........................................................................ + ldr q5, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v6.2D, v29.2D, v28.2D // ..............................*......................................... + ldr q23, [x1, #64] // e....................................................................... 
+ // gap // ........................................................................ + trn2 v18.2D, v17.2D, v2.2D // .............................*.......................................... + trn1 v13.2D, v29.2D, v28.2D // ................................*....................................... + // gap // ........................................................................ + ldr q1, [x1, #80] // .e...................................................................... + sqrdmulh v16.8H, v6.8H, v27.8H // ............................................*........................... + mul v15.8H, v6.8H, v14.8H // .............................................*.......................... + // gap // ........................................................................ + sqrdmulh v6.8H, v25.8H, v3.H[1] // ..........e............................................................. + // gap // ........................................................................ + mul v10.8H, v25.8H, v3.H[0] // ...........e............................................................ + sqrdmulh v25.8H, v5.8H, v3.H[1] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mul v29.8H, v5.8H, v3.H[0] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.8H, v16.8H, v7.H[0] // ..............................................*......................... + mul v26.8H, v18.8H, v14.8H // ........................................*............................... + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v10.8H, v6.8H, v7.H[0] // ............e........................................................... + sqrdmulh v16.8H, v18.8H, v27.8H // .......................................*................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v25.8H, v7.H[0] // .......e................................................................ + trn1 v25.2D, v17.2D, v2.2D // ...............................*........................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v6.8H, v13.8H, v15.8H // ...............................................*........................ + add v15.8H, v13.8H, v15.8H // ................................................*....................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v13.8H, v1.8H, v10.8H // .............e.......................................................... + mls v26.8H, v16.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.8H, v15.8H, v0.8H // .................................................*...................... + mul v27.8H, v15.8H, v22.8H // ..................................................*..................... + sqrdmulh v16.8H, v6.8H, v31.8H // ......................................................*................. 
+ mul v15.8H, v6.8H, v11.8H // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + add v17.8H, v1.8H, v10.8H // ..............e......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v6.8H, v25.8H, v26.8H // ...........................................*............................ + // gap // ........................................................................ + mls v27.8H, v14.8H, v7.H[0] // ...................................................*.................... + ldr q0, [x4, #48] // ....................................e................................... + sqrdmulh v31.8H, v13.8H, v3.H[5] // ....................e................................................... + sub v14.8H, v25.8H, v26.8H // ..........................................*............................. + mls v15.8H, v16.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + sub v22.8H, v23.8H, v29.8H // ........e............................................................... + mul v26.8H, v13.8H, v3.H[4] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v11.8H, v6.8H, v27.8H // ....................................................*................... + add v10.8H, v6.8H, v27.8H // .....................................................*.................. + add v12.8H, v14.8H, v15.8H // ..........................................................*............. + sub v13.8H, v14.8H, v15.8H // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v25.8H, v17.8H, v3.H[3] // ...............e........................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v16.8H, v11.8H, v7.H[1] // ..............................................................*......... + sqdmulh v4.8H, v13.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v10.8H, v7.H[1] // ...........................................................*............ + mls v26.8H, v31.8H, v7.H[0] // ......................e................................................. + sqdmulh v15.8H, v12.8H, v7.H[1] // .................................................................*...... + // gap // ........................................................................ + // gap // ........................................................................ + mul v2.8H, v17.8H, v3.H[2] // ................e....................................................... + // gap // ........................................................................ 
+ // gap // ........................................................................ + add v17.8H, v23.8H, v29.8H // .........e.............................................................. + srshr v14.8H, v4.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + srshr v27.8H, v27.8H, #11 // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v16.8H, v16.8H, #11 // ...............................................................*........ + srshr v15.8H, v15.8H, #11 // ..................................................................*..... + sub v6.8H, v22.8H, v26.8H // .......................e................................................ + mls v2.8H, v25.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v13.8H, v14.8H, v7.H[0] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v27.8H, v7.H[0] // .............................................................*.......... + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v15.8H, v7.H[0] // ...................................................................*.... 
+ mls v11.8H, v16.8H, v7.H[0] // ................................................................*....... + add v15.8H, v22.8H, v26.8H // ........................e............................................... + sub v16.8H, v17.8H, v2.8H // ..................e..................................................... + ldr q31, [x4, #80] // ......................................e................................. + ldr q22, [x4, #32] // ...................................e.................................... + ldr q14, [x4], #(6*16) // .................................e...................................... + // gap // ........................................................................ + ldr q27, [x4, #-80] // ..................................e..................................... + add v17.8H, v17.8H, v2.8H // ...................e.................................................... + trn1 v2.4S, v15.4S, v6.4S // ...........................e............................................ + st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + ldr q11, [x4, #-32] // .....................................e.................................. - // original source code - // ldr q8, [x1, #(16*0)] // .........e.............................................................|.........e............................................................ - // ldr q9, [x1, #(16*1)] // ..............e........................................................|..............e....................................................... - // ldr q10, [x1, #(16*2)] // ........e..............................................................|........e............................................................. 
- // ldr q11, [x1, #(16*3)] // ...e...................................................................|...e.................................................................. - // ldr q0, [x3], #16 // e......................................................................|e..................................................................... - // mul v24.8h, v10.8h, v0.h[0] // .................e.....................................................|.................e.................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ....................e..................................................|....................e................................................. - // mls v24.8h, v10.8h, v7.h[0] // ............................e..........................................|............................e......................................... - // sub v10.8h, v8.8h, v24.8h // ..........................................e............................|..........................................e........................... - // add v8.8h, v8.8h, v24.8h // ...........................................................e...........|...........................................................e.......... - // mul v24.8h, v11.8h, v0.h[0] // .............e.........................................................|.............e........................................................ - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ............e..........................................................|............e......................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...................e...................................................|...................e.................................................. - // sub v11.8h, v9.8h, v24.8h // ...........................e...........................................|...........................e.......................................... 
- // add v9.8h, v9.8h, v24.8h // ...................................e...................................|...................................e.................................. - // mul v24.8h, v9.8h, v0.h[2] // .......................................e...............................|.......................................e.............................. - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ....................................................e..................|....................................................e................. - // mls v24.8h, v9.8h, v7.h[0] // ............................................................e..........|............................................................e......... - // sub v9.8h, v8.8h, v24.8h // ...................................................................e...|...................................................................e.. - // add v8.8h, v8.8h, v24.8h // ..................................................................e....|..................................................................e... - // mul v24.8h, v11.8h, v0.h[4] // ....................................e..................................|....................................e................................. - // sqrdmulh v11.8h, v11.8h, v0.h[5] // .................................................e.....................|.................................................e.................... - // mls v24.8h, v11.8h, v7.h[0] // ......................................................e................|......................................................e............... - // sub v11.8h, v10.8h, v24.8h // ................................................................e......|................................................................e..... - // add v10.8h, v10.8h, v24.8h // .................................................................e.....|.................................................................e.... 
- // trn1 v25.4s, v8.4s, v9.4s // ..*....................................................................|..*................................................................... - // trn2 v26.4s, v8.4s, v9.4s // .*.....................................................................|.*.................................................................... - // trn1 v27.4s, v10.4s, v11.4s // .......................................................................*...................................................................... - // trn2 v28.4s, v10.4s, v11.4s // ....................................................................e..|....................................................................e. - // trn2 v10.2d, v25.2d, v27.2d // ......*................................................................|......*............................................................... - // trn2 v11.2d, v26.2d, v28.2d // .....*.................................................................|.....*................................................................ - // trn1 v8.2d, v25.2d, v27.2d // .......*...............................................................|.......*.............................................................. - // trn1 v9.2d, v26.2d, v28.2d // ....*..................................................................|....*................................................................. - // ldr q0, [x4], #(6*16) // ......................................................................e|...................................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // .................................e.....................................|.................................e.................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ..................................e....................................|..................................e................................... 
- // ldr q5, [x4, #(-6*16 + 3*16)] // ........................*..............................................|........................*............................................. - // ldr q2, [x4, #(-6*16 + 4*16)] // .....................*.................................................|.....................*................................................ - // ldr q6, [x4, #(-6*16 + 5*16)] // ......................*................................................|......................*............................................... - // mul v24.8h, v10.8h, v0.8h // ................*......................................................|................*..................................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // ...............*.......................................................|...............*...................................................... - // mls v24.8h, v10.8h, v7.h[0] // .......................*...............................................|.......................*.............................................. - // sub v10.8h, v8.8h, v24.8h // ......................................*................................|......................................*............................... - // add v8.8h, v8.8h, v24.8h // .........................................*.............................|.........................................*............................ - // mul v24.8h, v11.8h, v0.8h // ...........*...........................................................|...........*.......................................................... - // sqrdmulh v11.8h, v11.8h, v4.8h // ..........*............................................................|..........*........................................................... - // mls v24.8h, v11.8h, v7.h[0] // ..................*....................................................|..................*................................................... 
- // sub v11.8h, v9.8h, v24.8h // ..........................*............................................|..........................*........................................... - // add v9.8h, v9.8h, v24.8h // .........................*.............................................|.........................*............................................ - // mul v24.8h, v9.8h, v1.8h // ................................*......................................|................................*..................................... - // sqrdmulh v9.8h, v9.8h, v5.8h // ...............................*.......................................|...............................*...................................... - // mls v24.8h, v9.8h, v7.h[0] // ........................................*..............................|........................................*............................. - // sub v9.8h, v8.8h, v24.8h // .............................................*.........................|.............................................*........................ - // add v8.8h, v8.8h, v24.8h // ..............................................*........................|..............................................*....................... - // mul v24.8h, v11.8h, v2.8h // .............................*.........................................|.............................*........................................ - // sqrdmulh v11.8h, v11.8h, v6.8h // ..............................*........................................|..............................*....................................... - // mls v24.8h, v11.8h, v7.h[0] // .....................................*.................................|.....................................*................................ - // sub v11.8h, v10.8h, v24.8h // ............................................*..........................|............................................*......................... 
- // add v10.8h, v10.8h, v24.8h // ...........................................*...........................|...........................................*.......................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................*....................|..................................................*................... - // srshr v25.8h, v25.8h, #11 // ........................................................*..............|........................................................*............. - // mls v8.8h, v25.8h, v7.h[0] // ...............................................................*.......|...............................................................*...... - // sqdmulh v25.8h, v9.8h, v7.h[1] // ................................................*......................|................................................*..................... - // srshr v25.8h, v25.8h, #11 // .......................................................*...............|.......................................................*.............. - // mls v9.8h, v25.8h, v7.h[0] // .............................................................*.........|.............................................................*........ - // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................*...................|...................................................*.................. - // srshr v25.8h, v25.8h, #11 // .........................................................*.............|.........................................................*............ - // mls v10.8h, v25.8h, v7.h[0] // ..............................................................*........|..............................................................*....... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ...............................................*.......................|...............................................*...................... 
- // srshr v25.8h, v25.8h, #11 // .....................................................*.................|.....................................................*................ - // mls v11.8h, v25.8h, v7.h[0] // ..........................................................*............|..........................................................*........... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .....................................................................*.|.....................................................................* + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // .......e................................................................'......~............................................................... + // ldr q9, [x1, #(16*1)] // ..........e.............................................................'.........~............................................................ + // ldr q10, [x1, #(16*2)] // .....e..................................................................'....~................................................................. + // ldr q11, [x1, #(16*3)] // ...e....................................................................'..~................................................................... + // ldr q0, [x3], #16 // e.......................................................................~...................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...............e........................................................'..............~....................................................... 
+ // mul v24.8h, v10.8h, v0.h[0] // ................e.......................................................'...............~...................................................... + // mls v24.8h, v27.8h, v7.h[0] // .....................e..................................................'....................~................................................. + // sub v10.8h, v8.8h, v24.8h // ......................................e.................................'.....................................~................................ + // add v8.8h, v8.8h, v24.8h // ...................................................e....................'..................................................~................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .............e..........................................................'............~......................................................... + // mul v24.8h, v11.8h, v0.h[0] // ..............e.........................................................'.............~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...................e....................................................'..................~................................................... + // sub v11.8h, v9.8h, v24.8h // .........................e..............................................'........................~............................................. + // add v9.8h, v9.8h, v24.8h // ...............................e........................................'..............................~....................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ............................................e...........................'...........................................~.......................... + // mul v24.8h, v9.8h, v0.h[2] // ..................................................e.....................'.................................................~.................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .........................................................e..............'........................................................~............. + // sub v9.8h, v8.8h, v24.8h // ...............................................................e........'..............................................................~....... + // add v8.8h, v8.8h, v24.8h // ....................................................................e...'...................................................................~.. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ...................................e....................................'..................................~................................... + // mul v24.8h, v11.8h, v0.h[4] // .......................................e................................'......................................~............................... + // mls v24.8h, v27.8h, v7.h[0] // ................................................e.......................'...............................................~...................... + // sub v11.8h, v10.8h, v24.8h // ........................................................e...............'.......................................................~.............. + // add v10.8h, v10.8h, v24.8h // ..............................................................e.........'.............................................................~........ + // trn1 v25.4s, v8.4s, v9.4s // ....~...................................................................'...*.................................................................. + // trn2 v26.4s, v8.4s, v9.4s // .~......................................................................'*..................................................................... + // trn1 v27.4s, v10.4s, v11.4s // .....................................................................e..'....................................................................~. 
+ // trn2 v28.4s, v10.4s, v11.4s // ..~.....................................................................'.*.................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ........~...............................................................'.......*.............................................................. + // trn2 v11.2d, v26.2d, v28.2d // ......~.................................................................'.....*................................................................ + // trn1 v8.2d, v25.2d, v27.2d // ......................~.................................................'.....................*................................................ + // trn1 v9.2d, v26.2d, v28.2d // .........~..............................................................'........*............................................................. + // ldr q0, [ x4], #(6*16) // ..................................................................e.....'.................................................................~.... + // ldr q4, [x4, #(-6*16 + 1*16)] // ...................................................................e....'..................................................................~... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .................................................................e......'................................................................~..... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..................................e.....................................'.................................~.................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // .......................................................................e'...................................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ................................................................e.......'...............................................................~...... 
+ // sqrdmulh v27.8h, v10.8h, v4.8h // ....................~...................................................'...................*.................................................. + // mul v24.8h, v10.8h, v0.8h // ..................~.....................................................'.................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................~.............................................'.........................*............................................ + // sub v10.8h, v8.8h, v24.8h // ....................................~...................................'...................................*.................................. + // add v8.8h, v8.8h, v24.8h // ................................~.......................................'...............................*...................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ...........~............................................................'..........*........................................................... + // mul v24.8h, v11.8h, v0.8h // ............~...........................................................'...........*.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................~......................................................'................*..................................................... + // sub v11.8h, v9.8h, v24.8h // .......................~................................................'......................*............................................... + // add v9.8h, v9.8h, v24.8h // ........................~...............................................'.......................*.............................................. + // sqrdmulh v27.8h, v9.8h, v5.8h // ...........................~............................................'..........................*........................................... 
+ // mul v24.8h, v9.8h, v1.8h // ............................~...........................................'...........................*.......................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................~......................................'................................*..................................... + // sub v9.8h, v8.8h, v24.8h // ........................................~...............................'.......................................*.............................. + // add v8.8h, v8.8h, v24.8h // .........................................~..............................'........................................*............................. + // sqrdmulh v27.8h, v11.8h, v6.8h // .............................~..........................................'............................*......................................... + // mul v24.8h, v11.8h, v2.8h // ..............................~.........................................'.............................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // .....................................~..................................'....................................*................................. + // sub v11.8h, v10.8h, v24.8h // ...........................................~............................'..........................................*........................... + // add v10.8h, v10.8h, v24.8h // ..........................................~.............................'.........................................*............................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...............................................~........................'..............................................*....................... + // srshr v25.8h, v25.8h, #11 // .....................................................~..................'....................................................*................. 
+ // mls v8.8h, v25.8h, v7.h[0] // ...........................................................~............'..........................................................*........... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .............................................~..........................'............................................*......................... + // srshr v25.8h, v25.8h, #11 // ......................................................~.................'.....................................................*................ + // mls v9.8h, v25.8h, v7.h[0] // .............................................................~..........'............................................................*......... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .................................................~......................'................................................*..................... + // srshr v25.8h, v25.8h, #11 // .......................................................~................'......................................................*............... + // mls v10.8h, v25.8h, v7.h[0] // ............................................................~...........'...........................................................*.......... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..............................................~.........................'.............................................*........................ + // srshr v25.8h, v25.8h, #11 // ....................................................~...................'...................................................*.................. + // mls v11.8h, v25.8h, v7.h[0] // ..........................................................~.............'.........................................................*............ 
+ // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................................~.'.....................................................................* sub count, count, #1 cbnz count, layer4567_start - trn1 v9.4S, v11.4S, v20.4S // *.......................................... - trn2 v20.4S, v19.4S, v18.4S // .*......................................... - ldr q2, [x4, #-32] // ............*.............................. - ldr q27, [x4, #-16] // .............*............................. - trn1 v18.4S, v19.4S, v18.4S // ..*........................................ - ldr q30, [x4, #-48] // ...............*........................... - // gap // ........................................... - // gap // ........................................... - trn1 v11.2D, v20.2D, v21.2D // ...*....................................... - trn2 v20.2D, v20.2D, v21.2D // ....*...................................... - // gap // ........................................... - // gap // ........................................... - trn2 v5.2D, v18.2D, v9.2D // .....*..................................... - trn1 v9.2D, v18.2D, v9.2D // ......*.................................... - // gap // ........................................... - // gap // ........................................... - sqrdmulh v18.8H, v20.8H, v28.8H // .......*................................... - mul v21.8H, v20.8H, v22.8H // ........*.................................. - // gap // ........................................... - // gap // ........................................... - sqrdmulh v28.8H, v5.8H, v28.8H // .........*................................. - mul v20.8H, v5.8H, v22.8H // ..........*................................ - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - mls v21.8H, v18.8H, v7.H[0] // ...........*............................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v20.8H, v28.8H, v7.H[0] // ..............*............................ - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - sub v22.8H, v11.8H, v21.8H // .................*......................... - add v11.8H, v11.8H, v21.8H // ................*.......................... - // gap // ........................................... - // gap // ........................................... - sub v18.8H, v9.8H, v20.8H // .......................*................... - add v9.8H, v9.8H, v20.8H // .........................*................. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mul v2.8H, v22.8H, v2.8H // ..................*........................ - // gap // ........................................... - sqrdmulh v22.8H, v22.8H, v27.8H // ...................*....................... - mul v20.8H, v11.8H, v29.8H // .....................*..................... - sqrdmulh v11.8H, v11.8H, v30.8H // ....................*...................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v2.8H, v22.8H, v7.H[0] // ......................*.................... - // gap // ........................................... - mls v20.8H, v11.8H, v7.H[0] // ........................*.................. - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - sub v12.8H, v18.8H, v2.8H // ...........................*............... - // gap // ........................................... - // gap // ........................................... - add v11.8H, v18.8H, v2.8H // ..........................*................ - sub v10.8H, v9.8H, v20.8H // ............................*.............. - // gap // ........................................... - // gap // ........................................... - add v9.8H, v9.8H, v20.8H // .............................*............. - sqdmulh v2.8H, v11.8H, v7.H[1] // .................................*......... - // gap // ........................................... - // gap // ........................................... - sqdmulh v20.8H, v12.8H, v7.H[1] // ..............................*............ - sqdmulh v27.8H, v9.8H, v7.H[1] // ................................*.......... - // gap // ........................................... - // gap // ........................................... - sqdmulh v22.8H, v10.8H, v7.H[1] // ...............................*........... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - srshr v2.8H, v2.8H, #11 // .....................................*..... - // gap // ........................................... - // gap // ........................................... - srshr v20.8H, v20.8H, #11 // ..................................*........ - srshr v27.8H, v27.8H, #11 // ....................................*...... - // gap // ........................................... - // gap // ........................................... - srshr v22.8H, v22.8H, #11 // ...................................*....... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - mls v11.8H, v2.8H, v7.H[0] // ........................................*.. - // gap // ........................................... - // gap // ........................................... - mls v12.8H, v20.8H, v7.H[0] // ......................................*.... - mls v9.8H, v27.8H, v7.H[0] // .........................................*. - // gap // ........................................... - // gap // ........................................... - mls v10.8H, v22.8H, v7.H[0] // .......................................*... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... - // gap // ........................................... 
- // gap // ........................................... - // gap // ........................................... - st4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1], #64 // ..........................................* + // Instructions: 40 + // Expected cycles: 31 + // Expected IPC: 1.29 + // + // Cycle bound: 31.0 + // IPC bound: 1.29 + // + // Wall time: 0.90s + // User time: 0.90s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + trn2 v15.4S, v15.4S, v6.4S // .*...................................... + trn2 v3.4S, v17.4S, v16.4S // *....................................... + // gap // ........................................ + // gap // ........................................ + trn1 v6.4S, v17.4S, v16.4S // ..*..................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v24.2D, v3.2D, v15.2D // ...*.................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v1.2D, v3.2D, v15.2D // .....*.................................. + trn2 v15.2D, v6.2D, v2.2D // ....*................................... + // gap // ........................................ + // gap // ........................................ + sqrdmulh v26.8H, v24.8H, v27.8H // ......*................................. + mul v16.8H, v24.8H, v14.8H // .......*................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v27.8H, v15.8H, v27.8H // ..........*............................. + mul v13.8H, v15.8H, v14.8H // .........*.............................. + // gap // ........................................ + // gap // ........................................ 
+ trn1 v17.2D, v6.2D, v2.2D // ...........*............................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v26.8H, v7.H[0] // ........*............................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v13.8H, v27.8H, v7.H[0] // ..............*......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v6.8H, v1.8H, v16.8H // .............*.......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v28.8H, v1.8H, v16.8H // ............*........................... + add v15.8H, v17.8H, v13.8H // ...................*.................... + // gap // ........................................ + // gap // ........................................ + sqrdmulh v14.8H, v6.8H, v0.8H // ...............*........................ + mul v16.8H, v6.8H, v22.8H // ................*....................... + // gap // ........................................ + // gap // ........................................ + sqrdmulh v25.8H, v28.8H, v31.8H // .................*...................... + mul v26.8H, v28.8H, v11.8H // ..................*..................... + // gap // ........................................ + // gap // ........................................ + sub v27.8H, v17.8H, v13.8H // .....................*.................. 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v14.8H, v7.H[0] // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v26.8H, v25.8H, v7.H[0] // ......................*................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v14.8H, v15.8H, v16.8H // .......................*................ + add v13.8H, v15.8H, v16.8H // ........................*............... + // gap // ........................................ + // gap // ........................................ + add v15.8H, v27.8H, v26.8H // .........................*.............. + sub v16.8H, v27.8H, v26.8H // ..........................*............. + // gap // ........................................ + // gap // ........................................ + sqdmulh v27.8H, v14.8H, v7.H[1] // ...........................*............ + sqdmulh v6.8H, v13.8H, v7.H[1] // .............................*.......... + // gap // ........................................ + // gap // ........................................ + sqdmulh v0.8H, v15.8H, v7.H[1] // ..............................*......... + // gap // ........................................ + // gap // ........................................ + sqdmulh v26.8H, v16.8H, v7.H[1] // ............................*........... + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + srshr v27.8H, v27.8H, #11 // .................................*...... + srshr v11.8H, v6.8H, #11 // ................................*....... + // gap // ........................................ + // gap // ........................................ + srshr v6.8H, v0.8H, #11 // ..................................*..... + srshr v26.8H, v26.8H, #11 // ...............................*........ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v14.8H, v27.8H, v7.H[0] // ......................................*. + mls v13.8H, v11.8H, v7.H[0] // ....................................*... + // gap // ........................................ + // gap // ........................................ + mls v15.8H, v6.8H, v7.H[0] // .....................................*.. + mls v16.8H, v26.8H, v7.H[0] // ...................................*.... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + st4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1], #64 // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
- // original source code - // trn1 v10.4S, v11.4S, v20.4S // *.......................................... - // trn2 v2.4S, v19.4S, v18.4S // .*......................................... - // trn1 v9.4S, v19.4S, v18.4S // ....*...................................... - // trn1 v30.2D, v2.2D, v21.2D // ......*.................................... - // trn2 v5.2D, v2.2D, v21.2D // .......*................................... - // trn2 v11.2D, v9.2D, v10.2D // ........*.................................. - // trn1 v16.2D, v9.2D, v10.2D // .........*................................. - // sqrdmulh v9.8H, v5.8H, v28.8H // ..........*................................ - // mul v20.8H, v5.8H, v22.8H // ...........*............................... - // sqrdmulh v18.8H, v11.8H, v28.8H // ............*.............................. - // mul v27.8H, v11.8H, v22.8H // .............*............................. - // mls v20.8H, v9.8H, v7.H[0] // ..............*............................ - // ldr q26, [x4, #-32] // ..*........................................ - // ldr q24, [x4, #-16] // ...*....................................... - // mls v27.8H, v18.8H, v7.H[0] // ...............*........................... - // ldr q5, [x4, #-48] // .....*..................................... - // add v2.8H, v30.8H, v20.8H // .................*......................... - // sub v9.8H, v30.8H, v20.8H // ................*.......................... - // mul v11.8H, v9.8H, v26.8H // ....................*...................... - // sqrdmulh v9.8H, v9.8H, v24.8H // .....................*..................... - // sqrdmulh v20.8H, v2.8H, v5.8H // .......................*................... - // mul v2.8H, v2.8H, v29.8H // ......................*.................... - // mls v11.8H, v9.8H, v7.H[0] // ........................*.................. - // sub v9.8H, v16.8H, v27.8H // ..................*........................ - // mls v2.8H, v20.8H, v7.H[0] // .........................*................. 
- // add v22.8H, v16.8H, v27.8H // ...................*....................... - // add v5.8H, v9.8H, v11.8H // ...........................*............... - // sub v6.8H, v9.8H, v11.8H // ..........................*................ - // sub v4.8H, v22.8H, v2.8H // ............................*.............. - // add v3.8H, v22.8H, v2.8H // .............................*............. - // sqdmulh v2.8H, v6.8H, v7.H[1] // ...............................*........... - // sqdmulh v27.8H, v4.8H, v7.H[1] // .................................*......... - // sqdmulh v20.8H, v3.8H, v7.H[1] // ................................*.......... - // sqdmulh v11.8H, v5.8H, v7.H[1] // ..............................*............ - // srshr v9.8H, v2.8H, #11 // ...................................*....... - // srshr v22.8H, v27.8H, #11 // .....................................*..... - // srshr v27.8H, v20.8H, #11 // ....................................*...... - // srshr v11.8H, v11.8H, #11 // ..................................*........ - // mls v6.8H, v9.8H, v7.H[0] // .......................................*... - // mls v4.8H, v22.8H, v7.H[0] // .........................................*. - // mls v5.8H, v11.8H, v7.H[0] // ......................................*.... - // mls v3.8H, v27.8H, v7.H[0] // ........................................*.. - // st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1], #64 // ..........................................* + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // trn2 v29.4S, v17.4S, v16.4S // .*...................................... + // trn2 v28.4S, v15.4S, v6.4S // *....................................... + // trn1 v17.4S, v17.4S, v16.4S // ..*..................................... + // trn2 v6.2D, v29.2D, v28.2D // ...*.................................... + // trn2 v18.2D, v17.2D, v2.2D // .....*.................................. + // trn1 v13.2D, v29.2D, v28.2D // ....*................................... 
+ // sqrdmulh v16.8H, v6.8H, v27.8H // ......*................................. + // mul v15.8H, v6.8H, v14.8H // .......*................................ + // mls v15.8H, v16.8H, v7.H[0] // ...........*............................ + // mul v26.8H, v18.8H, v14.8H // .........*.............................. + // sqrdmulh v16.8H, v18.8H, v27.8H // ........*............................... + // trn1 v25.2D, v17.2D, v2.2D // ..........*............................. + // sub v6.8H, v13.8H, v15.8H // ..............*......................... + // add v15.8H, v13.8H, v15.8H // .............*.......................... + // mls v26.8H, v16.8H, v7.H[0] // ............*........................... + // sqrdmulh v14.8H, v15.8H, v0.8H // ................*....................... + // mul v27.8H, v15.8H, v22.8H // .................*...................... + // sqrdmulh v16.8H, v6.8H, v31.8H // ..................*..................... + // mul v15.8H, v6.8H, v11.8H // ...................*.................... + // add v6.8H, v25.8H, v26.8H // ...............*........................ + // mls v27.8H, v14.8H, v7.H[0] // .....................*.................. + // sub v14.8H, v25.8H, v26.8H // ....................*................... + // mls v15.8H, v16.8H, v7.H[0] // ......................*................. + // sub v11.8H, v6.8H, v27.8H // .......................*................ + // add v10.8H, v6.8H, v27.8H // ........................*............... + // add v12.8H, v14.8H, v15.8H // .........................*.............. + // sub v13.8H, v14.8H, v15.8H // ..........................*............. + // sqdmulh v16.8H, v11.8H, v7.H[1] // ...........................*............ + // sqdmulh v4.8H, v13.8H, v7.H[1] // ..............................*......... + // sqdmulh v27.8H, v10.8H, v7.H[1] // ............................*........... + // sqdmulh v15.8H, v12.8H, v7.H[1] // .............................*.......... 
+ // srshr v14.8H, v4.8H, #11 // ..................................*..... + // srshr v27.8H, v27.8H, #11 // ................................*....... + // srshr v16.8H, v16.8H, #11 // ...............................*........ + // srshr v15.8H, v15.8H, #11 // .................................*...... + // mls v13.8H, v14.8H, v7.H[0] // ......................................*. + // mls v10.8H, v27.8H, v7.H[0] // ....................................*... + // mls v12.8H, v15.8H, v7.H[0] // .....................................*.. + // mls v11.8H, v16.8H, v7.H[0] // ...................................*.... + // st4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x1], #64 // .......................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a55.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a55.s index 844e2323..8daccc77 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a55.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a55.s @@ -26,46 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- xtmp0 .req x10 xtmp1 .req x11 -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold - umov \gpr_out\(), \vec_in\().d[\lane] -.endm - -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +49,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +66,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -119,43 +79,43 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro 
load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -166,7 +126,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold 
+.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -176,7 +136,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -184,7 +144,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -195,19 +155,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -220,7 +180,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_load_opt_a55 - .global _ntt_kyber_123_4567_scalar_load_opt_a55 + .global _ntt_kyber_123_4567_scalar_load .p2align 4 const_addr: .short 3329 @@ -346,1037 +306,1103 @@ _ntt_kyber_123_4567_scalar_load_opt_a55: load_roots_123 .p2align 2 - ldr x7, [x0, #448] // ..................*.................... - // gap // ....................................... - // gap // ....................................... - ldr x22, [x0, #320] // .............*......................... - // gap // ....................................... - ldr x13, [x0, #456] // .....................*................. - ldr x15, [x0, #384] // ...............*....................... - // gap // ....................................... - vins v6, x7, 0 // .......................*............... 
- ldr x21, [x0, #328] // ..............*........................ - vins v2, x22, 0 // .................*..................... - ldr x9, [x0, #200] // .....*................................. - vins v6, x13, 1 // ........................*.............. - ldr x28, [x0, #392] // ................*...................... - vins v11, x15, 0 // ....................*.................. - ldr x14, [x0, #192] // ....*.................................. - mul v15.8H, v6.8H, v0.H[0] // .........................*............. - vins v2, x21, 1 // ...................*................... - sqrdmulh v10.8H, v6.8H, v0.H[1] // ..........................*............ - ldr x15, [x0, #64] // *...................................... - sqrdmulh v6.8H, v2.8H, v0.H[1] // ...............................*....... - vins v11, x28, 1 // ......................*................ - mul v3.8H, v2.8H, v0.H[0] // ..............................*........ - ldr x21, [x0, #72] // .*..................................... - mul v4.8H, v11.8H, v0.H[0] // .................................*..... - ldr x13, [x0, #256] // ......*................................ - mls v15.8H, v10.8H, v7.H[0] // ...........................*........... - vins v10, x14, 0 // .........*............................. - sqrdmulh v24.8H, v11.8H, v0.H[1] // ..................................*.... - ldr x25, [x0, #128] // ..*.................................... - vins v10, x9, 1 // ...........*........................... - ldr x10, [x0, #136] // ...*................................... - mls v3.8H, v6.8H, v7.H[0] // ................................*...... - vins v12, x13, 0 // ..........*............................ - add v30.8H, v10.8H, v15.8H // .............................*......... - ldr x13, [x0, #264] // .......*............................... - sub v5.8H, v10.8H, v15.8H // ............................*.......... - vins v16, x25, 0 // ........*.............................. 
- mls v4.8H, v24.8H, v7.H[0] // ....................................*.. - // gap // ....................................... - mul v2.8H, v30.8H, v0.H[2] // ...................................*... - // gap // ....................................... - sqrdmulh v23.8H, v30.8H, v0.H[3] // .....................................*. - vins v12, x13, 1 // ............*.......................... - sqrdmulh v29.8H, v5.8H, v0.H[5] // ......................................* - // gap // ....................................... - - // original source code - // ldr x15, [x0, #64] // ...............*....................... || .........*............. - // ldr x21, [x0, #72] // ...................*................... || ...........*........... - // ldr x14, [x0, #128] // .........................*............. || ..............*........ - // ldr x10, [x0, #136] // ...........................*........... || ...............*....... - // ldr x13, [x0, #192] // ...........*........................... || .......*............... - // ldr x24, [x0, #200] // .......*............................... || .....*................. - // ldr x26, [x0, #256] // .....................*................. || ............*.......... - // ldr x16, [x0, #264] // ...............................*....... || .................*..... - // vins v16, x14, 0 // .................................*..... || ..................*.... - // vins v13, x13, 0 // .......................*............... || .............*......... - // vins v12, x26, 0 // .............................*......... || ................*...... - // vins v13, x24, 1 // ..........................*............ || ...............*....... - // vins v12, x16, 1 // .....................................*. || .....................*. - // ldr x13, [x0, #320] // .*..................................... || .*..................... - // ldr x14, [x0, #328] // .....*................................. || ....*.................. 
- // ldr x24, [x0, #384] // ...*................................... || ...*................... - // ldr x26, [x0, #392] // .........*............................. || ......*................ - // vins v15, x13, 0 // ......*................................ || .....*................. - // ldr x13, [x0, #448] // *...................................... || *...................... - // vins v15, x14, 1 // .............*......................... || ........*.............. - // vins v31, x24, 0 // ..........*............................ || .......*............... - // ldr x14, [x0, #456] // ..*.................................... || ..*.................... - // vins v31, x26, 1 // .................*..................... || ..........*............ - // vins v25, x13, 0 // ....*.................................. || ....*.................. - // vins v25, x14, 1 // ........*.............................. || ......*................ - // mul v10.8H, v25.8H, v0.H[0] // ............*.......................... || ........*.............. - // sqrdmulh v23.8H, v25.8H, v0.H[1] // ..............*........................ || .........*............. - // mls v10.8H, v23.8H, v7.H[0] // ......................*................ || .............*......... - // sub v5.8H, v13.8H, v10.8H // ................................*...... || ..................*.... - // add v10.8H, v13.8H, v10.8H // ..............................*........ || .................*..... - // mul v3.8H, v15.8H, v0.H[0] // ..................*.................... || ...........*........... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ................*...................... || ..........*............ - // mls v3.8H, v15.8H, v7.H[0] // ............................*.......... || ................*...... - // mul v4.8H, v31.8H, v0.H[0] // ....................*.................. || ............*.......... - // sqrdmulh v6.8H, v31.8H, v0.H[1] // ........................*.............. || ..............*........ 
- // mul v2.8H, v10.8H, v0.H[2] // ...................................*... || ....................*.. - // mls v4.8H, v6.8H, v7.H[0] // ..................................*.... || ...................*... - // sqrdmulh v23.8H, v10.8H, v0.H[3] // ....................................*.. || .....................*. - // sqrdmulh v29.8H, v5.8H, v0.H[5] // ......................................* || ......................* - + // Instructions: 9 + // Expected cycles: 15 + // Expected IPC: 0.60 + // + // Cycle bound: 15.0 + // IPC bound: 0.60 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q5, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #64] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #192] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q23, [x0, #256] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #384] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v29.8H, v23.8H, v0.H[1] // ......*....................... + // gap // .............................. + mul v10.8H, v23.8H, v0.H[0] // .......*...................... 
+ // gap // .............................. + ldr q31, [x0, #320] // .....*........................ + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q5, [x0, #448] // *.............................. + // ldr q20, [x0, #64] // .*............................. + // ldr q11, [x0, #128] // ..*............................ + // ldr q13, [x0, #192] // ...*........................... + // ldr q18, [x0, #256] // ....*.......................... + // ldr q31, [x0, #320] // ........*...................... + // sqrdmulh v29.8H, v18.8H, v0.H[1] // ......*........................ + // mul v10.8H, v18.8H, v0.H[0] // .......*....................... + // ldr q25, [x0, #384] // .....*......................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v6.8H, v12.8H, v0.H[0] // ................................*................................................................... - ldr x13, [x0, #0] // *................................................................................................... - mul v10.8H, v5.8H, v0.H[4] // ...................................................................*................................ - vins v15, x15, 0 // ......*............................................................................................. - sqrdmulh v11.8H, v12.8H, v0.H[1] // .................................*.................................................................. - vins v16, x10, 1 // ...........*........................................................................................ - mls v2.8H, v23.8H, v7.H[0] // ...........................................................*........................................ - vins v15, x21, 1 // .......*............................................................................................ 
- add v23.8H, v16.8H, v4.8H // ..............................................*..................................................... - vins v20, x13, 0 // ..*................................................................................................. - add v5.8H, v15.8H, v3.8H // .........................................*.......................................................... - ldr x13, [x0, #8] // .*.................................................................................................. - mls v6.8H, v11.8H, v7.H[0] // ..................................*................................................................. - ldr x15, [x0, #80] // ....e............................................................................................... - mul v11.8H, v23.8H, v0.H[2] // ....................................................*............................................... - ldr x21, [x0, #88] // .....e.............................................................................................. - add v13.8H, v5.8H, v2.8H // .............................................................*...................................... - ldr x14, [x0, #144] // ........e........................................................................................... - sqrdmulh v23.8H, v23.8H, v0.H[3] // .....................................................*.............................................. - vins v20, x13, 1 // ...*................................................................................................ - mls v10.8H, v29.8H, v7.H[0] // .....................................................................*.............................. - ldr x10, [x0, #152] // .........e.......................................................................................... - mul v29.8H, v13.8H, v0.H[6] // ........................................................................*........................... 
- ldr x13, [x0, #208] // ............e....................................................................................... - sqrdmulh v13.8H, v13.8H, v0.H[7] // .........................................................................*.......................... - ldr x24, [x0, #216] // .............e...................................................................................... - mls v11.8H, v23.8H, v7.H[0] // ......................................................*............................................. - ldr x26, [x0, #272] // ................e................................................................................... - add v23.8H, v20.8H, v6.8H // ....................................*............................................................... - ldr x16, [x0, #280] // .................e.................................................................................. - sub v31.8H, v16.8H, v4.8H // .............................................*...................................................... - vins v16, x14, 0 // ..........e......................................................................................... - mls v29.8H, v13.8H, v7.H[0] // ..........................................................................*......................... - vins v13, x13, 0 // ..............e..................................................................................... - add v4.8H, v23.8H, v11.8H // ........................................................*........................................... - vins v12, x26, 0 // ..................e................................................................................. - sub v2.8H, v5.8H, v2.8H // ............................................................*....................................... - vins v13, x24, 1 // ...............e.................................................................................... 
- sub v15.8H, v15.8H, v3.8H // ........................................*........................................................... - vins v12, x16, 1 // ...................e................................................................................ - sub v5.8H, v4.8H, v29.8H // ...........................................................................*........................ - ldr x13, [x0, #336] // ....................e............................................................................... - sub v6.8H, v20.8H, v6.8H // ...................................*................................................................ - ldr x14, [x0, #344] // .....................e.............................................................................. - sub v20.8H, v15.8H, v10.8H // ......................................................................*............................. - ldr x24, [x0, #400] // ........................e........................................................................... - str_vo v5, x0, 64 // .............................................................................................*...... - ldr x26, [x0, #408] // .........................e.......................................................................... - add v10.8H, v15.8H, v10.8H // .......................................................................*............................ - vins v15, x13, 0 // ......................e............................................................................. - sub v11.8H, v23.8H, v11.8H // .......................................................*............................................ - ldr x13, [x0, #464] // ............................e....................................................................... - mul v23.8H, v31.8H, v0.H[4] // ..............................................................*..................................... 
- vins v15, x14, 1 // .......................e............................................................................ - sqrdmulh v5.8H, v31.8H, v0.H[5] // ...............................................................*.................................... - vins v31, x24, 0 // ..........................e......................................................................... - add v29.8H, v4.8H, v29.8H // ............................................................................*....................... - ldr x14, [x0, #472] // .............................e...................................................................... - mul v4.8H, v2.8H, v1.H[0] // .............................................................................*...................... - vins v31, x26, 1 // ...........................e........................................................................ - sqrdmulh v2.8H, v2.8H, v1.H[1] // ..............................................................................*..................... - vins v25, x13, 0 // ..............................e..................................................................... - mls v23.8H, v5.8H, v7.H[0] // ................................................................*................................... - // gap // .................................................................................................... - mul v5.8H, v10.8H, v1.H[2] // ..................................................................................*................. - vins v25, x14, 1 // ...............................e.................................................................... - sqrdmulh v10.8H, v10.8H, v1.H[3] // ...................................................................................*................ - // gap // .................................................................................................... 
- mls v4.8H, v2.8H, v7.H[0] // ...............................................................................*.................... - // gap // .................................................................................................... - sub v2.8H, v6.8H, v23.8H // .................................................................*.................................. - // gap // .................................................................................................... - add v6.8H, v6.8H, v23.8H // ..................................................................*................................. - // gap // .................................................................................................... - mls v5.8H, v10.8H, v7.H[0] // ....................................................................................*............... - // gap // .................................................................................................... - mul v10.8H, v25.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - sqrdmulh v23.8H, v25.8H, v0.H[1] // ................................................e................................................... - // gap // .................................................................................................... - sub v25.8H, v11.8H, v4.8H // ................................................................................*................... - // gap // .................................................................................................... - add v11.8H, v11.8H, v4.8H // .................................................................................*.................. - // gap // .................................................................................................... 
- sub v4.8H, v6.8H, v5.8H // .....................................................................................*.............. - // gap // .................................................................................................... - mls v10.8H, v23.8H, v7.H[0] // .................................................e.................................................. - // gap // .................................................................................................... - add v6.8H, v6.8H, v5.8H // ......................................................................................*............. - // gap // .................................................................................................... - mul v23.8H, v20.8H, v1.H[4] // .......................................................................................*............ - // gap // .................................................................................................... - sqrdmulh v20.8H, v20.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - sub v5.8H, v13.8H, v10.8H // ..................................................e................................................. - // gap // .................................................................................................... - add v10.8H, v13.8H, v10.8H // ...................................................e................................................ - // gap // .................................................................................................... - str_vi v29, x0, 16 // ............................................................................................*....... - // gap // .................................................................................................... 
- mls v23.8H, v20.8H, v7.H[0] // .........................................................................................*.......... - // gap // .................................................................................................... - str_vo v11, x0, 112 // ..............................................................................................*..... - // gap // .................................................................................................... - mul v3.8H, v15.8H, v0.H[0] // .....................................e.............................................................. - // gap // .................................................................................................... - str_vo v25, x0, 176 // ...............................................................................................*.... - // gap // .................................................................................................... - sqrdmulh v15.8H, v15.8H, v0.H[1] // ......................................e............................................................. - // gap // .................................................................................................... - sub v11.8H, v2.8H, v23.8H // ..........................................................................................*......... - // gap // .................................................................................................... - add v2.8H, v2.8H, v23.8H // ...........................................................................................*........ - // gap // .................................................................................................... - str_vo v6, x0, 240 // ................................................................................................*... - // gap // .................................................................................................... 
- mls v3.8H, v15.8H, v7.H[0] // .......................................e............................................................ - // gap // .................................................................................................... - str_vo v4, x0, 304 // .................................................................................................*.. - // gap // .................................................................................................... - mul v4.8H, v31.8H, v0.H[0] // ..........................................e......................................................... - // gap // .................................................................................................... - sqrdmulh v6.8H, v31.8H, v0.H[1] // ...........................................e........................................................ - // gap // .................................................................................................... - str_vo v2, x0, 368 // ..................................................................................................*. - // gap // .................................................................................................... - mul v2.8H, v10.8H, v0.H[2] // .........................................................e.......................................... - // gap // .................................................................................................... - str_vo v11, x0, 432 // ...................................................................................................* - // gap // .................................................................................................... - mls v4.8H, v6.8H, v7.H[0] // ............................................e....................................................... - // gap // .................................................................................................... 
- sqrdmulh v23.8H, v10.8H, v0.H[3] // ..........................................................e......................................... - // gap // .................................................................................................... - sqrdmulh v29.8H, v5.8H, v0.H[5] // ....................................................................e............................... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ........................................................................................*............................................................................................... || ..............................................................*................................................................ - // ldr x11, [x0, #8] // ..................................................................................................*..................................................................................... || ...................................................................*........................................................... - // vins v8, x10, 0 // ................................................................................................*....................................................................................... || ..................................................................*............................................................ - // vins v8, x11, 1 // ..........................................................................................................*............................................................................. || .......................................................................*....................................................... 
- // ldr x10, [x0, #64] // e....................................................................................................................................................................................... || e.............................................................................................................................. - // ldr x11, [x0, #72] // ..e..................................................................................................................................................................................... || .e............................................................................................................................. - // vins v9, x10, 0 // ..........................................................................................*............................................................................................. || ...............................................................*............................................................... - // vins v9, x11, 1 // ..............................................................................................*......................................................................................... || .................................................................*............................................................. - // ldr x10, [x0, #128] // ....e................................................................................................................................................................................... || ..e............................................................................................................................ - // ldr x11, [x0, #136] // ........e............................................................................................................................................................................... 
|| ....e.......................................................................................................................... - // vins v10, x10, 0 // ..................e..................................................................................................................................................................... || .........e..................................................................................................................... - // vins v10, x11, 1 // ............................................................................................*........................................................................................... || ................................................................*.............................................................. - // ldr x10, [x0, #192] // ..........e............................................................................................................................................................................. || .....e......................................................................................................................... - // ldr x11, [x0, #200] // ............e........................................................................................................................................................................... || ......e........................................................................................................................ - // vins v11, x10, 0 // ....................e................................................................................................................................................................... || ..........e.................................................................................................................... 
- // vins v11, x11, 1 // ........................e............................................................................................................................................................... || ............e.................................................................................................................. - // ldr x10, [x0, #256] // ..............e......................................................................................................................................................................... || .......e....................................................................................................................... - // ldr x11, [x0, #264] // ................e....................................................................................................................................................................... || ........e...................................................................................................................... - // vins v12, x10, 0 // ......................e................................................................................................................................................................. || ...........e................................................................................................................... - // vins v12, x11, 1 // ..........................e............................................................................................................................................................. || .............e................................................................................................................. - // ldr x10, [x0, #320] // ............................e........................................................................................................................................................... 
|| ..............e................................................................................................................ - // ldr x11, [x0, #328] // ..............................e......................................................................................................................................................... || ...............e............................................................................................................... - // vins v13, x10, 0 // ....................................e................................................................................................................................................... || ..................e............................................................................................................ - // vins v13, x11, 1 // ........................................e............................................................................................................................................... || ....................e.......................................................................................................... - // ldr x10, [x0, #384] // ................................e....................................................................................................................................................... || ................e.............................................................................................................. - // ldr x11, [x0, #392] // ..................................e..................................................................................................................................................... || .................e............................................................................................................. 
- // vins v14, x10, 0 // ..........................................e............................................................................................................................................. || .....................e......................................................................................................... - // vins v14, x11, 1 // ..............................................e......................................................................................................................................... || .......................e....................................................................................................... - // ldr x10, [x0, #448] // ......................................e................................................................................................................................................. || ...................e........................................................................................................... - // ldr x11, [x0, #456] // ............................................e........................................................................................................................................... || ......................e........................................................................................................ - // vins v15, x10, 0 // ................................................e....................................................................................................................................... || ........................e...................................................................................................... - // vins v15, x11, 1 // ...................................................e.................................................................................................................................... 
|| ..........................e.................................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // .......................................................................................*................................................................................................ || ..............................................................*................................................................ - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ...........................................................................................*............................................................................................ || ................................................................*.............................................................. - // mls v24.8H, v12.8H, v7.H[0] // ...................................................................................................*.................................................................................... || ....................................................................*.......................................................... - // sub v12.8H, v8.8H, v24.8H // .................................................................................................................................*...................................................... || ...................................................................................*........................................... - // add v8.8H, v8.8H, v24.8H // ...................................................................................................................*.................................................................... || ............................................................................*.................................................. 
- // mul v24.8H, v13.8H, v0.H[0] // .......................................................................e................................................................................................................ || ..............................................e................................................................................ - // sqrdmulh v13.8H, v13.8H, v0.H[1] // .........................................................................e.............................................................................................................. || ................................................e.............................................................................. - // mls v24.8H, v13.8H, v7.H[0] // .............................................................................e.......................................................................................................... || ....................................................e.......................................................................... - // sub v13.8H, v9.8H, v24.8H // .............................................................................................................................*.......................................................... || .................................................................................*............................................. - // add v9.8H, v9.8H, v24.8H // .................................................................................................*...................................................................................... || ...................................................................*........................................................... - // mul v24.8H, v14.8H, v0.H[0] // ...............................................................................e........................................................................................................ 
|| ......................................................e........................................................................ - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ................................................................................e....................................................................................................... || .......................................................e....................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ....................................................................................e................................................................................................... || ...........................................................e................................................................... - // sub v14.8H, v10.8H, v24.8H // .....................................................................................................................*.................................................................. || .............................................................................*................................................. - // add v10.8H, v10.8H, v24.8H // ...............................................................................................*........................................................................................ || ..................................................................*............................................................ - // mul v24.8H, v15.8H, v0.H[0] // .........................................................e.............................................................................................................................. || ................................e.............................................................................................. 
- // sqrdmulh v15.8H, v15.8H, v0.H[1] // ..........................................................e............................................................................................................................. || .................................e............................................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ..............................................................e......................................................................................................................... || .....................................e......................................................................................... - // sub v15.8H, v11.8H, v24.8H // ..................................................................e..................................................................................................................... || .........................................e..................................................................................... - // add v11.8H, v11.8H, v24.8H // ...................................................................e.................................................................................................................... || ..........................................e.................................................................................... - // mul v24.8H, v10.8H, v0.H[2] // .....................................................................................................*.................................................................................. || .....................................................................*......................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .........................................................................................................*.............................................................................. 
|| .......................................................................*....................................................... - // mls v24.8H, v10.8H, v7.H[0] // .................................................................................................................*...................................................................... || ...........................................................................*................................................... - // sub v10.8H, v8.8H, v24.8H // .........................................................................................................................................*.............................................. || .......................................................................................*....................................... - // add v8.8H, v8.8H, v24.8H // .........................................................................................................................*.............................................................. || ...............................................................................*............................................... - // mul v24.8H, v11.8H, v0.H[2] // ..................................................................................e..................................................................................................... || .........................................................e..................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // .....................................................................................e.................................................................................................. || ............................................................e.................................................................. 
- // mls v24.8H, v11.8H, v7.H[0] // .............................................................................................*.......................................................................................... || .................................................................*............................................................. - // sub v11.8H, v9.8H, v24.8H // ...........................................................................................................................*............................................................ || ................................................................................*.............................................. - // add v9.8H, v9.8H, v24.8H // .......................................................................................................*................................................................................ || ......................................................................*........................................................ - // mul v24.8H, v14.8H, v0.H[4] // ...........................................................................................................................................*............................................ || ........................................................................................*...................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // .............................................................................................................................................*.......................................... || .........................................................................................*..................................... - // mls v24.8H, v14.8H, v7.H[0] // .....................................................................................................................................................*.................................. 
|| .............................................................................................*................................. - // sub v14.8H, v12.8H, v24.8H // ..........................................................................................................................................................*............................. || .................................................................................................*............................. - // add v12.8H, v12.8H, v24.8H // ...........................................................................................................................................................*............................ || ..................................................................................................*............................ - // mul v24.8H, v15.8H, v0.H[4] // .........................................................................................*.............................................................................................. || ...............................................................*............................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ......................................................................................e................................................................................................. || .............................................................e................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ...........................................................................................................*............................................................................ || ........................................................................*...................................................... 
- // sub v15.8H, v13.8H, v24.8H // ...................................................................................................................................*.................................................... || ....................................................................................*.......................................... - // add v13.8H, v13.8H, v24.8H // .......................................................................................................................................*................................................ || ......................................................................................*........................................ - // mul v24.8H, v9.8H, v0.H[6] // .............................................................................................................*.......................................................................... || .........................................................................*..................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ...............................................................................................................*........................................................................ || ..........................................................................*.................................................... - // mls v24.8H, v9.8H, v7.H[0] // .......................................................................................................................*................................................................ || ..............................................................................*................................................ - // sub v9.8H, v8.8H, v24.8H // ...............................................................................................................................*........................................................ 
|| ..................................................................................*............................................ - // add v8.8H, v8.8H, v24.8H // ...............................................................................................................................................*........................................ || ..........................................................................................*.................................... - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................................................*...................................... || ...........................................................................................*................................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ...................................................................................................................................................*.................................... || ............................................................................................*.................................. - // mls v24.8H, v11.8H, v7.H[0] // .........................................................................................................................................................*.............................. || ................................................................................................*.............................. - // sub v11.8H, v10.8H, v24.8H // ...............................................................................................................................................................*........................ || ......................................................................................................*........................ 
- // add v10.8H, v10.8H, v24.8H // ................................................................................................................................................................*....................... || .......................................................................................................*....................... - // mul v24.8H, v13.8H, v1.H[2] // ......................................................................................................................................................*................................. || ..............................................................................................*................................ - // sqrdmulh v13.8H, v13.8H, v1.H[3] // ........................................................................................................................................................*............................... || ...............................................................................................*............................... - // mls v24.8H, v13.8H, v7.H[0] // ............................................................................................................................................................*........................... || ...................................................................................................*........................... - // sub v13.8H, v12.8H, v24.8H // .................................................................................................................................................................*...................... || ........................................................................................................*...................... - // add v12.8H, v12.8H, v24.8H // ...................................................................................................................................................................*.................... 
|| ..........................................................................................................*.................... - // mul v24.8H, v15.8H, v1.H[4] // ....................................................................................................................................................................*................... || ...........................................................................................................*................... - // sqrdmulh v15.8H, v15.8H, v1.H[5] // .....................................................................................................................................................................*.................. || ............................................................................................................*.................. - // mls v24.8H, v15.8H, v7.H[0] // .........................................................................................................................................................................*.............. || ................................................................................................................*.............. - // sub v15.8H, v14.8H, v24.8H // ..............................................................................................................................................................................*......... || .....................................................................................................................*......... - // add v14.8H, v14.8H, v24.8H // ...............................................................................................................................................................................*........ || ......................................................................................................................*........ 
- // str_vi v8, x0, 16 // ........................................................................................................................................................................*............... || ...............................................................................................................*............... - // str_vo v9, x0, 48 // .....................................................................................................................................*.................................................. || .....................................................................................*......................................... - // str_vo v10, x0, 112 // ..........................................................................................................................................................................*............. || .................................................................................................................*............. - // str_vo v11, x0, 176 // ............................................................................................................................................................................*........... || ...................................................................................................................*........... - // str_vo v12, x0, 240 // ................................................................................................................................................................................*....... || .......................................................................................................................*....... - // str_vo v13, x0, 304 // ..................................................................................................................................................................................*..... 
|| .........................................................................................................................*..... - // str_vo v14, x0, 368 // .....................................................................................................................................................................................*.. || ............................................................................................................................*.. - // str_vo v15, x0, 432 // .......................................................................................................................................................................................* || ..............................................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 4.80s + // User time: 4.80s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q15, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v18.8H, v31.8H, v0.H[1] // .............*.............................................................. + // gap // ............................................................................ + mul v6.8H, v31.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ 
+ mls v10.8H, v29.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + sqrdmulh v26.8H, v25.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + mul v16.8H, v5.8H, v0.H[0] // ........................*................................................... + // gap // ............................................................................ + mls v6.8H, v18.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + mul v18.8H, v25.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + sqrdmulh v25.8H, v5.8H, v0.H[1] // .......................*.................................................... + // gap // ............................................................................ + sub v12.8H, v15.8H, v10.8H // ...........*................................................................ + // gap // ............................................................................ + sub v19.8H, v20.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + mls v18.8H, v26.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + add v6.8H, v20.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ 
+ mls v16.8H, v25.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + add v29.8H, v15.8H, v10.8H // ............*............................................................... + // gap // ............................................................................ + sub v26.8H, v11.8H, v18.8H // .....................*...................................................... + // gap // ............................................................................ + add v18.8H, v11.8H, v18.8H // ......................*..................................................... + // gap // ............................................................................ + sub v25.8H, v13.8H, v16.8H // ..........................*................................................. + // gap // ............................................................................ + sqrdmulh v31.8H, v26.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + mul v26.8H, v26.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v11.8H, v18.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + mul v18.8H, v18.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + add v16.8H, v13.8H, v16.8H // ...........................*................................................ + // gap // ............................................................................ 
+ mls v26.8H, v31.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v25.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + mls v18.8H, v11.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + mul v25.8H, v25.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + sub v11.8H, v12.8H, v26.8H // .........................................*.................................. + // gap // ............................................................................ + add v26.8H, v12.8H, v26.8H // ..........................................*................................. + // gap // ............................................................................ + sub v13.8H, v29.8H, v18.8H // ...............................*............................................ + // gap // ............................................................................ + add v18.8H, v29.8H, v18.8H // ................................*........................................... + // gap // ............................................................................ + mls v25.8H, v20.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v31.8H, v16.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ 
+ mul v16.8H, v16.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + ldr q5, [x0, #464] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v29.8H, v19.8H, v25.8H // ...............................................*............................ + // gap // ............................................................................ + mls v16.8H, v31.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + sub v20.8H, v19.8H, v25.8H // ..............................................*............................. + // gap // ............................................................................ + sqrdmulh v19.8H, v29.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + mul v8.8H, v29.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + sub v31.8H, v6.8H, v16.8H // ....................................*....................................... + // gap // ............................................................................ + add v6.8H, v6.8H, v16.8H // .....................................*...................................... + // gap // ............................................................................ 
+ mul v16.8H, v20.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v29.8H, v20.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + sqrdmulh v23.8H, v6.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + mul v6.8H, v6.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + mul v12.8H, v31.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + mls v8.8H, v19.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + sqrdmulh v20.8H, v31.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + mls v6.8H, v23.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + sub v29.8H, v26.8H, v8.8H // .............................................................*.............. + // gap // ............................................................................ 
+ mls v12.8H, v20.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + add v31.8H, v26.8H, v8.8H // ..............................................................*............. + // gap // ............................................................................ + sub v25.8H, v18.8H, v6.8H // ...................................................*........................ + // gap // ............................................................................ + add v18.8H, v18.8H, v6.8H // ....................................................*....................... + // gap // ............................................................................ + add v8.8H, v11.8H, v16.8H // ...................................................................*........ + // gap // ............................................................................ + sub v19.8H, v13.8H, v12.8H // ........................................................*................... + // gap // ............................................................................ + add v26.8H, v13.8H, v12.8H // .........................................................*.................. + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sub v6.8H, v11.8H, v16.8H // ..................................................................*......... + // gap // ............................................................................ + str q25, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ 
+ ldr q20, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q11, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q13, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q31, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q18, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + str q29, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + ldr q31, [x0, #320] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q8, [x0, #368] // ..........................................................................*. + // gap // ............................................................................ + sqrdmulh v29.8H, v18.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + str q6, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + mul v10.8H, v18.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + ldr q25, [x0, #384] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ + // -------------------------------------------------- new position ---------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x0, #0] // ..........................................*......................................................................... + // ldr q9, [x0, #(1*(512/8))] // ............................e.............'.............................................................~........... + // ldr q10, [x0, #(2*(512/8))] // ..............................e...........'...............................................................~......... + // ldr q11, [x0, #(3*(512/8))] // ................................e.........'.................................................................~....... + // ldr q12, [x0, #(4*(512/8))] // ..................................e.......'...................................................................~..... + // ldr q13, [x0, #(5*(512/8))] // ....................................e.....'.....................................................................~... + // ldr q14, [x0, #(6*(512/8))] // .........................................e'......................................................................... + // ldr q15, [x0, #(7*(512/8))] // e.........................................'.................................~....................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ......................................e...'.......................................................................~. + // mul v24.8h, v12.8h, v0.h[0] // ........................................e.'......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'..*...................................................................... 
+ // sub v12.8h, v8.8h, v24.8h // ..........................................'........*................................................................ + // add v8.8h, v8.8h, v24.8h // ..........................................'.............*........................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..........................................'*........................................................................ + // mul v24.8h, v13.8h, v0.h[0] // ..........................................'.*....................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'.....*................................................................... + // sub v13.8h, v9.8h, v24.8h // ..........................................'.........*............................................................... + // add v9.8h, v9.8h, v24.8h // ..........................................'...........*............................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..........................................'...*..................................................................... + // mul v24.8h, v14.8h, v0.h[0] // ..........................................'......*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'..........*.............................................................. + // sub v14.8h, v10.8h, v24.8h // ..........................................'..............*.......................................................... + // add v10.8h, v10.8h, v24.8h // ..........................................'...............*......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ..........................................'.......*................................................................. 
+ // mul v24.8h, v15.8h, v0.h[0] // ..........................................'....*.................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'............*............................................................ + // sub v15.8h, v11.8h, v24.8h // ..........................................'................*........................................................ + // add v11.8h, v11.8h, v24.8h // ..........................................'.....................*................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ..........................................'...................*..................................................... + // mul v24.8h, v10.8h, v0.h[2] // ..........................................'....................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'........................*................................................ + // sub v10.8h, v8.8h, v24.8h // ..........................................'............................*............................................ + // add v8.8h, v8.8h, v24.8h // ..........................................'.............................*........................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..........................................'...............................*......................................... + // mul v24.8h, v11.8h, v0.h[2] // ..........................................'................................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // ..~.......................................'...................................*..................................... + // sub v11.8h, v9.8h, v24.8h // ......~...................................'.......................................*................................. 
+ // add v9.8h, v9.8h, v24.8h // .......~..................................'........................................*................................ + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ..........................................'.................*....................................................... + // mul v24.8h, v14.8h, v0.h[4] // ..........................................'..................*...................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'......................*.................................................. + // sub v14.8h, v12.8h, v24.8h // ..........................................'..........................*.............................................. + // add v12.8h, v12.8h, v24.8h // ..........................................'...........................*............................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ..........................................'.......................*................................................. + // mul v24.8h, v15.8h, v0.h[4] // ..........................................'.........................*............................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................'..............................*.......................................... + // sub v15.8h, v13.8h, v24.8h // ...~......................................'....................................*.................................... + // add v13.8h, v13.8h, v24.8h // .~........................................'..................................*...................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..........~...............................'...........................................*............................. + // mul v24.8h, v9.8h, v0.h[6] // ...........~..............................'............................................*............................ 
+ // mls v24.8h, v27.8h, v7.h[0] // ...............~..........................'................................................*........................ + // sub v9.8h, v8.8h, v24.8h // ....................~.....................'.....................................................*................... + // add v8.8h, v8.8h, v24.8h // .....................~....................'......................................................*.................. + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ..............~...........................'...............................................*......................... + // mul v24.8h, v11.8h, v1.h[0] // ............~.............................'.............................................*........................... + // mls v24.8h, v27.8h, v7.h[0] // ..................~.......................'...................................................*..................... + // sub v11.8h, v10.8h, v24.8h // .......................~..................'........................................................*................ + // add v10.8h, v10.8h, v24.8h // ........................~.................'.........................................................*............... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ....~.....................................'.....................................*................................... + // mul v24.8h, v13.8h, v1.h[2] // .....~....................................'......................................*.................................. + // mls v24.8h, v27.8h, v7.h[0] // .............~............................'..............................................*.......................... + // sub v13.8h, v12.8h, v24.8h // .................~........................'..................................................*...................... + // add v12.8h, v12.8h, v24.8h // ...................~......................'....................................................*.................... 
+ // sqrdmulh v27.8h, v15.8h, v1.h[5] // .........~................................'..........................................*.............................. + // mul v24.8h, v15.8h, v1.h[4] // ........~.................................'.........................................*............................... + // mls v24.8h, v27.8h, v7.h[0] // ................~.........................'.................................................*....................... + // sub v15.8h, v14.8h, v24.8h // ..........................~...............'...........................................................*............. + // add v14.8h, v14.8h, v24.8h // ......................~...................'.......................................................*................. + // str q8, [x0], #(16) // .........................~................'..........................................................*.............. + // str q9, [x0, #(-16 + 1*(512/8))] // ...........................~..............'............................................................*............ + // str q10, [x0, #(-16 + 2*(512/8))] // .............................~............'..............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // ...............................~..........'................................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // .................................~........'..................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ...................................~......'....................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // .....................................~....'......................................................................*.. 
+ // str q15, [x0, #(-16 + 7*(512/8))] // .......................................~..'........................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v25.8H, v12.8H, v0.H[0] // *............................................................ - vins v16, x10, 1 // .....*....................................................... - sqrdmulh v6.8H, v12.8H, v0.H[1] // ....*........................................................ - ldr x12, [x0, #0] // .*........................................................... - sub v17.8H, v16.8H, v4.8H // ......................*...................................... - vins v9, x15, 0 // ...*......................................................... - mul v21.8H, v5.8H, v0.H[4] // ..*.......................................................... - ldr x13, [x0, #8] // ...........*................................................. - mls v2.8H, v23.8H, v7.H[0] // ......*...................................................... - vins v9, x21, 1 // .......*..................................................... - mul v30.8H, v17.8H, v0.H[4] // .................................*........................... - vins v31, x12, 0 // .........*................................................... - sqrdmulh v23.8H, v17.8H, v0.H[5] // ..................................*.......................... - // gap // ............................................................. - mls v21.8H, v29.8H, v7.H[0] // .................*........................................... - vins v31, x13, 1 // ................*............................................ - add v17.8H, v9.8H, v3.8H // ..........*.................................................. - // gap // ............................................................. - sub v10.8H, v9.8H, v3.8H // ..........................*.................................. - // gap // ............................................................. 
- mls v30.8H, v23.8H, v7.H[0] // ......................................*...................... - // gap // ............................................................. - add v19.8H, v17.8H, v2.8H // ..............*.............................................. - // gap // ............................................................. - sub v12.8H, v10.8H, v21.8H // .............................*............................... - // gap // ............................................................. - add v20.8H, v10.8H, v21.8H // ...............................*............................. - // gap // ............................................................. - mul v26.8H, v19.8H, v0.H[6] // ..................*.......................................... - // gap // ............................................................. - sqrdmulh v19.8H, v19.8H, v0.H[7] // ...................*......................................... - // gap // ............................................................. - mul v9.8H, v20.8H, v1.H[2] // .......................................*..................... - // gap // ............................................................. - add v21.8H, v16.8H, v4.8H // ........*.................................................... - // gap // ............................................................. - sqrdmulh v11.8H, v20.8H, v1.H[3] // ........................................*.................... - // gap // ............................................................. - mls v26.8H, v19.8H, v7.H[0] // .......................*..................................... - // gap // ............................................................. - mul v27.8H, v21.8H, v0.H[2] // .............*............................................... - // gap // ............................................................. - sqrdmulh v14.8H, v21.8H, v0.H[3] // ...............*............................................. 
- // gap // ............................................................. - mls v25.8H, v6.8H, v7.H[0] // ............*................................................ - // gap // ............................................................. - mls v9.8H, v11.8H, v7.H[0] // ............................................*................ - // gap // ............................................................. - sub v13.8H, v17.8H, v2.8H // .........................*................................... - // gap // ............................................................. - mls v27.8H, v14.8H, v7.H[0] // ....................*........................................ - // gap // ............................................................. - add v8.8H, v31.8H, v25.8H // .....................*....................................... - // gap // ............................................................. - sub v10.8H, v31.8H, v25.8H // ............................*................................ - // gap // ............................................................. - mul v28.8H, v13.8H, v1.H[0] // ....................................*........................ - // gap // ............................................................. - add v5.8H, v8.8H, v27.8H // ........................*.................................... - // gap // ............................................................. - sqrdmulh v15.8H, v13.8H, v1.H[1] // .....................................*....................... - // gap // ............................................................. - sub v11.8H, v8.8H, v27.8H // ................................*............................ - // gap // ............................................................. - add v22.8H, v5.8H, v26.8H // ...................................*......................... - // gap // ............................................................. 
- add v24.8H, v10.8H, v30.8H // ...........................................*................. - // gap // ............................................................. - mls v28.8H, v15.8H, v7.H[0] // .........................................*................... - // gap // ............................................................. - str_vi v22, x0, 16 // ...................................................*......... - // gap // ............................................................. - sqrdmulh v6.8H, v12.8H, v1.H[5] // ..................................................*.......... - // gap // ............................................................. - add v3.8H, v24.8H, v9.8H // ................................................*............ - // gap // ............................................................. - sub v13.8H, v11.8H, v28.8H // .............................................*............... - // gap // ............................................................. - sub v14.8H, v5.8H, v26.8H // ...........................*................................. - // gap // ............................................................. - str_vo v3, x0, 240 // .........................................................*... - // gap // ............................................................. - mul v29.8H, v12.8H, v1.H[4] // .................................................*........... - // gap // ............................................................. - str_vo v14, x0, 48 // ..............................*.............................. - // gap // ............................................................. - sub v12.8H, v24.8H, v9.8H // ...............................................*............. - // gap // ............................................................. - str_vo v13, x0, 176 // ......................................................*...... - // gap // ............................................................. 
- mls v29.8H, v6.8H, v7.H[0] // ....................................................*........ - // gap // ............................................................. - sub v31.8H, v10.8H, v30.8H // ..........................................*.................. - // gap // ............................................................. - add v10.8H, v11.8H, v28.8H // ..............................................*.............. - // gap // ............................................................. - str_vo v12, x0, 304 // ..........................................................*.. - // gap // ............................................................. - add v26.8H, v31.8H, v29.8H // ........................................................*.... - // gap // ............................................................. - str_vo v10, x0, 112 // .....................................................*....... - // gap // ............................................................. - sub v6.8H, v31.8H, v29.8H // .......................................................*..... - // gap // ............................................................. - str_vo v26, x0, 368 // ...........................................................*. - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - str_vo v6, x0, 432 // ............................................................* - // gap // ............................................................. - - // original source code - // mul v6.8H, v12.8H, v0.H[0] // *............................................................ || *...................................................... - // ldr x13, [x0, #0] // ...*......................................................... || .*..................................................... 
- // mul v10.8H, v5.8H, v0.H[4] // ......*...................................................... || ...*................................................... - // vins v15, x15, 0 // .....*....................................................... || ..*.................................................... - // sqrdmulh v11.8H, v12.8H, v0.H[1] // ..*.......................................................... || .*..................................................... - // vins v16, x10, 1 // .*........................................................... || *...................................................... - // mls v2.8H, v23.8H, v7.H[0] // ........*.................................................... || ....*.................................................. - // vins v15, x21, 1 // .........*................................................... || ....*.................................................. - // add v23.8H, v16.8H, v4.8H // ........................*.................................... || .................*..................................... - // vins v20, x13, 0 // ...........*................................................. || .....*................................................. - // add v5.8H, v15.8H, v3.8H // ...............*............................................. || ........*.............................................. - // ldr x13, [x0, #8] // .......*..................................................... || ...*................................................... - // mls v6.8H, v11.8H, v7.H[0] // .............................*............................... || ......................*................................ - // mul v11.8H, v23.8H, v0.H[2] // ...........................*................................. || ....................*.................................. - // add v13.8H, v5.8H, v2.8H // ..................*.......................................... || ...........*........................................... 
- // sqrdmulh v23.8H, v23.8H, v0.H[3] // ............................*................................ || .....................*................................. - // vins v20, x13, 1 // ..............*.............................................. || .......*............................................... - // mls v10.8H, v29.8H, v7.H[0] // .............*............................................... || .......*............................................... - // mul v29.8H, v13.8H, v0.H[6] // .....................*....................................... || ..............*........................................ - // sqrdmulh v13.8H, v13.8H, v0.H[7] // ......................*...................................... || ...............*....................................... - // mls v11.8H, v23.8H, v7.H[0] // ................................*............................ || .........................*............................. - // add v23.8H, v20.8H, v6.8H // .................................*........................... || ..........................*............................ - // sub v31.8H, v16.8H, v4.8H // ....*........................................................ || ..*.................................................... - // mls v29.8H, v13.8H, v7.H[0] // ..........................*.................................. || ...................*................................... - // add v4.8H, v23.8H, v11.8H // ....................................*........................ || .............................*......................... - // sub v2.8H, v5.8H, v2.8H // ...............................*............................. || ........................*.............................. - // sub v15.8H, v15.8H, v3.8H // ................*............................................ || .........*............................................. - // sub v5.8H, v4.8H, v29.8H // ..............................................*.............. 
|| .......................................*............... - // sub v6.8H, v20.8H, v6.8H // ..................................*.......................... || ...........................*........................... - // sub v20.8H, v15.8H, v10.8H // ...................*......................................... || ............*.......................................... - // str_vo v5, x0, 64 // .................................................*........... || ..........................................*............ - // add v10.8H, v15.8H, v10.8H // ....................*........................................ || .............*......................................... - // sub v11.8H, v23.8H, v11.8H // ......................................*...................... || ...............................*....................... - // mul v23.8H, v31.8H, v0.H[4] // ..........*.................................................. || .....*................................................. - // sqrdmulh v5.8H, v31.8H, v0.H[5] // ............*................................................ || ......*................................................ - // add v29.8H, v4.8H, v29.8H // .......................................*..................... || ................................*...................... - // mul v4.8H, v2.8H, v1.H[0] // ...................................*......................... || ............................*.......................... - // sqrdmulh v2.8H, v2.8H, v1.H[1] // .....................................*....................... || ..............................*........................ - // mls v23.8H, v5.8H, v7.H[0] // .................*........................................... || ..........*............................................ - // mul v5.8H, v10.8H, v1.H[2] // .......................*..................................... || ................*...................................... 
- // sqrdmulh v10.8H, v10.8H, v1.H[3] // .........................*................................... || ..................*.................................... - // mls v4.8H, v2.8H, v7.H[0] // .........................................*................... || ..................................*.................... - // sub v2.8H, v6.8H, v23.8H // .....................................................*....... || ..............................................*........ - // add v6.8H, v6.8H, v23.8H // ........................................*.................... || .................................*..................... - // mls v5.8H, v10.8H, v7.H[0] // ..............................*.............................. || .......................*............................... - // sub v25.8H, v11.8H, v4.8H // .............................................*............... || ......................................*................ - // add v11.8H, v11.8H, v4.8H // ......................................................*...... || ...............................................*....... - // sub v4.8H, v6.8H, v5.8H // ..................................................*.......... || ...........................................*........... - // add v6.8H, v6.8H, v5.8H // ............................................*................ || .....................................*................. - // mul v23.8H, v20.8H, v1.H[4] // ................................................*............ || .........................................*............. - // sqrdmulh v20.8H, v20.8H, v1.H[5] // ...........................................*................. || ....................................*.................. - // str_vi v29, x0, 16 // ..........................................*.................. || ...................................*................... - // mls v23.8H, v20.8H, v7.H[0] // ....................................................*........ 
|| .............................................*......... - // str_vo v11, x0, 112 // .........................................................*... || ..................................................*.... - // str_vo v25, x0, 176 // ...................................................*......... || ............................................*.......... - // sub v11.8H, v2.8H, v23.8H // ..........................................................*.. || ...................................................*... - // add v2.8H, v2.8H, v23.8H // ........................................................*.... || .................................................*..... - // str_vo v6, x0, 240 // ...............................................*............. || ........................................*.............. - // str_vo v4, x0, 304 // .......................................................*..... || ................................................*...... - // str_vo v2, x0, 368 // ...........................................................*. || ....................................................*.. - // str_vo v11, x0, 432 // ............................................................* || ......................................................* - + // Instructions: 67 + // Expected cycles: 69 + // Expected IPC: 0.97 + // + // Cycle bound: 69.0 + // IPC bound: 0.97 + // + // Wall time: 19.36s + // User time: 19.36s + // + // ----------------------- original position ------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + ldr q19, [x0, #0] // *.................................................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... 
+ sqrdmulh v21.8H, v25.8H, v0.H[1] // ....*.............................................................. + // gap // ................................................................... + mul v18.8H, v25.8H, v0.H[0] // .......*........................................................... + // gap // ................................................................... + sqrdmulh v12.8H, v31.8H, v0.H[1] // .*................................................................. + // gap // ................................................................... + mul v31.8H, v31.8H, v0.H[0] // ..*................................................................ + // gap // ................................................................... + sqrdmulh v16.8H, v5.8H, v0.H[1] // ........*.......................................................... + // gap // ................................................................... + mls v18.8H, v21.8H, v7.H[0] // ...........*....................................................... + // gap // ................................................................... + mls v10.8H, v29.8H, v7.H[0] // ...*............................................................... + // gap // ................................................................... + mul v15.8H, v5.8H, v0.H[0] // .....*............................................................. + // gap // ................................................................... + mls v31.8H, v12.8H, v7.H[0] // ......*............................................................ + // gap // ................................................................... + sub v12.8H, v11.8H, v18.8H // ...............*................................................... + // gap // ................................................................... + add v5.8H, v11.8H, v18.8H // ................*.................................................. 
+ // gap // ................................................................... + mls v15.8H, v16.8H, v7.H[0] // .............*..................................................... + // gap // ................................................................... + add v8.8H, v20.8H, v31.8H // ............*...................................................... + // gap // ................................................................... + sub v22.8H, v20.8H, v31.8H // ..........*........................................................ + // gap // ................................................................... + sqrdmulh v20.8H, v5.8H, v0.H[3] // ....................*.............................................. + // gap // ................................................................... + sub v16.8H, v13.8H, v15.8H // .................*................................................. + // gap // ................................................................... + add v14.8H, v13.8H, v15.8H // ......................*............................................ + // gap // ................................................................... + mul v6.8H, v5.8H, v0.H[2] // .....................*............................................. + // gap // ................................................................... + mul v5.8H, v16.8H, v0.H[4] // ..........................*........................................ + // gap // ................................................................... + sqrdmulh v27.8H, v16.8H, v0.H[5] // ........................*.......................................... + // gap // ................................................................... + sqrdmulh v3.8H, v12.8H, v0.H[5] // ..................*................................................ + // gap // ................................................................... + mul v25.8H, v12.8H, v0.H[4] // ...................*............................................... 
+ // gap // ................................................................... + mls v6.8H, v20.8H, v7.H[0] // .........................*......................................... + // gap // ................................................................... + mls v5.8H, v27.8H, v7.H[0] // ...............................*................................... + // gap // ................................................................... + sub v12.8H, v19.8H, v10.8H // .........*......................................................... + // gap // ................................................................... + mls v25.8H, v3.8H, v7.H[0] // .......................*........................................... + // gap // ................................................................... + mul v26.8H, v14.8H, v0.H[2] // .................................*................................. + // gap // ................................................................... + add v18.8H, v22.8H, v5.8H // ..................................*................................ + // gap // ................................................................... + sub v21.8H, v22.8H, v5.8H // ....................................*.............................. + // gap // ................................................................... + sub v11.8H, v12.8H, v25.8H // ...........................*....................................... + // gap // ................................................................... + sqrdmulh v27.8H, v18.8H, v1.H[3] // .....................................*............................. + // gap // ................................................................... + sqrdmulh v16.8H, v21.8H, v1.H[5] // ..........................................*........................ + // gap // ................................................................... + mul v29.8H, v18.8H, v1.H[2] // ......................................*............................ 
+ // gap // ................................................................... + mul v30.8H, v21.8H, v1.H[4] // .........................................*......................... + // gap // ................................................................... + add v23.8H, v12.8H, v25.8H // ............................*...................................... + // gap // ................................................................... + sqrdmulh v4.8H, v14.8H, v0.H[3] // ................................*.................................. + // gap // ................................................................... + mls v29.8H, v27.8H, v7.H[0] // ..............................................*.................... + // gap // ................................................................... + mls v30.8H, v16.8H, v7.H[0] // .................................................*................. + // gap // ................................................................... + add v27.8H, v19.8H, v10.8H // ..............*.................................................... + // gap // ................................................................... + mls v26.8H, v4.8H, v7.H[0] // ...................................*............................... + // gap // ................................................................... + add v22.8H, v23.8H, v29.8H // ....................................................*.............. + // gap // ................................................................... + sub v29.8H, v23.8H, v29.8H // ..................................................*................ + // gap // ................................................................... + sub v24.8H, v11.8H, v30.8H // ...........................................................*....... + // gap // ................................................................... + add v25.8H, v8.8H, v26.8H // ........................................*.......................... 
+ // gap // ................................................................... + add v31.8H, v27.8H, v6.8H // ..............................*.................................... + // gap // ................................................................... + sub v28.8H, v8.8H, v26.8H // .......................................*........................... + // gap // ................................................................... + sqrdmulh v4.8H, v25.8H, v0.H[7] // ...........................................*....................... + // gap // ................................................................... + mul v12.8H, v25.8H, v0.H[6] // ............................................*...................... + // gap // ................................................................... + sqrdmulh v20.8H, v28.8H, v1.H[1] // ...............................................*................... + // gap // ................................................................... + sub v17.8H, v27.8H, v6.8H // .............................*..................................... + // gap // ................................................................... + mul v19.8H, v28.8H, v1.H[0] // .............................................*..................... + // gap // ................................................................... + mls v12.8H, v4.8H, v7.H[0] // ................................................*.................. + // gap // ................................................................... + str q24, [x0, #448] // ..................................................................* + // gap // ................................................................... + add v25.8H, v11.8H, v30.8H // .......................................................*........... + // gap // ................................................................... + str q22, [x0, #256] // ...............................................................*... 
+ // gap // ................................................................... + sub v6.8H, v31.8H, v12.8H // .....................................................*............. + // gap // ................................................................... + str q25, [x0, #384] // .................................................................*. + // gap // ................................................................... + mls v19.8H, v20.8H, v7.H[0] // ...................................................*............... + // gap // ................................................................... + str q6, [x0, #64] // ............................................................*...... + // gap // ................................................................... + add v30.8H, v31.8H, v12.8H // ......................................................*............ + // gap // ................................................................... + str q29, [x0, #320] // ................................................................*.. + // gap // ................................................................... + sub v21.8H, v17.8H, v19.8H // ........................................................*.......... + // gap // ................................................................... + str q30, [x0], #(16) // ..........................................................*........ + // gap // ................................................................... + add v15.8H, v17.8H, v19.8H // .........................................................*......... + // gap // ................................................................... + str q21, [x0, #176] // ..............................................................*.... + // gap // ................................................................... + // gap // ................................................................... 
+ // gap // ................................................................... + str q15, [x0, #112] // .............................................................*..... + // gap // ................................................................... + + // -------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + // ldr q15, [x0, #0] // *.................................................................. + // sqrdmulh v18.8H, v31.8H, v0.H[1] // ...*............................................................... + // mul v6.8H, v31.8H, v0.H[0] // ....*.............................................................. + // mls v10.8H, v29.8H, v7.H[0] // .......*........................................................... + // sqrdmulh v26.8H, v25.8H, v0.H[1] // .*................................................................. + // mul v16.8H, v5.8H, v0.H[0] // ........*.......................................................... + // mls v6.8H, v18.8H, v7.H[0] // .........*......................................................... + // mul v18.8H, v25.8H, v0.H[0] // ..*................................................................ + // sqrdmulh v25.8H, v5.8H, v0.H[1] // .....*............................................................. + // sub v12.8H, v15.8H, v10.8H // .........................*......................................... + // sub v19.8H, v20.8H, v6.8H // ..............*.................................................... + // mls v18.8H, v26.8H, v7.H[0] // ......*............................................................ + // add v6.8H, v20.8H, v6.8H // .............*..................................................... + // mls v16.8H, v25.8H, v7.H[0] // ............*...................................................... + // add v29.8H, v15.8H, v10.8H // .......................................*........................... 
+ // sub v26.8H, v11.8H, v18.8H // ..........*........................................................ + // add v18.8H, v11.8H, v18.8H // ...........*....................................................... + // sub v25.8H, v13.8H, v16.8H // ................*.................................................. + // sqrdmulh v31.8H, v26.8H, v0.H[5] // .....................*............................................. + // mul v26.8H, v26.8H, v0.H[4] // ......................*............................................ + // sqrdmulh v11.8H, v18.8H, v0.H[3] // ...............*................................................... + // mul v18.8H, v18.8H, v0.H[2] // ..................*................................................ + // add v16.8H, v13.8H, v16.8H // .................*................................................. + // mls v26.8H, v31.8H, v7.H[0] // ..........................*........................................ + // sqrdmulh v20.8H, v25.8H, v0.H[5] // ....................*.............................................. + // mls v18.8H, v11.8H, v7.H[0] // .......................*........................................... + // mul v25.8H, v25.8H, v0.H[4] // ...................*............................................... + // sub v11.8H, v12.8H, v26.8H // ..............................*.................................... + // add v26.8H, v12.8H, v26.8H // ...................................*............................... + // sub v13.8H, v29.8H, v18.8H // ..................................................*................ + // add v18.8H, v29.8H, v18.8H // .............................................*..................... + // mls v25.8H, v20.8H, v7.H[0] // ........................*.......................................... + // sqrdmulh v31.8H, v16.8H, v0.H[3] // ....................................*.............................. + // mul v16.8H, v16.8H, v0.H[2] // ...........................*....................................... 
+ // add v29.8H, v19.8H, v25.8H // ............................*...................................... + // mls v16.8H, v31.8H, v7.H[0] // ........................................*.......................... + // sub v20.8H, v19.8H, v25.8H // .............................*..................................... + // sqrdmulh v19.8H, v29.8H, v1.H[3] // ...............................*................................... + // mul v8.8H, v29.8H, v1.H[2] // .................................*................................. + // sub v31.8H, v6.8H, v16.8H // ..............................................*.................... + // add v6.8H, v6.8H, v16.8H // ............................................*...................... + // mul v16.8H, v20.8H, v1.H[4] // ..................................*................................ + // sqrdmulh v29.8H, v20.8H, v1.H[5] // ................................*.................................. + // sqrdmulh v23.8H, v6.8H, v0.H[7] // ...............................................*................... + // mul v6.8H, v6.8H, v0.H[6] // ................................................*.................. + // mul v12.8H, v31.8H, v1.H[0] // ...................................................*............... + // mls v8.8H, v19.8H, v7.H[0] // .....................................*............................. + // sqrdmulh v20.8H, v31.8H, v1.H[1] // .................................................*................. + // mls v6.8H, v23.8H, v7.H[0] // ....................................................*.............. + // mls v16.8H, v29.8H, v7.H[0] // ......................................*............................ + // sub v29.8H, v26.8H, v8.8H // ..........................................*........................ + // mls v12.8H, v20.8H, v7.H[0] // ..........................................................*........ + // add v31.8H, v26.8H, v8.8H // .........................................*......................... 
+ // sub v25.8H, v18.8H, v6.8H // ........................................................*.......... + // add v18.8H, v18.8H, v6.8H // ............................................................*...... + // add v8.8H, v11.8H, v16.8H // ......................................................*............ + // sub v19.8H, v13.8H, v12.8H // ..............................................................*.... + // add v26.8H, v13.8H, v12.8H // ................................................................*.. + // str q18, [x0], #(16) // ...............................................................*... + // sub v6.8H, v11.8H, v16.8H // ...........................................*....................... + // str q25, [x0, #48] // ...........................................................*....... + // str q26, [x0, #112] // ..................................................................* + // str q19, [x0, #176] // .................................................................*. + // str q31, [x0, #240] // .......................................................*........... + // str q29, [x0, #304] // .............................................................*..... + // str q8, [x0, #368] // .........................................................*......... + // str q6, [x0, #432] // .....................................................*............. + restore inp, STACK0 mov count, #8 .p2align 2 - ldr x10, [x1, #48] // .*........................... - // gap // ............................. - ldr x13, [x3] , #16 // ..*.......................... - // gap // ............................. - ldr x26, [x1, #56] // *............................ - // gap // ............................. - ldr x21, [x3, #-8] // ...*......................... - // gap // ............................. - vins v6, x10, 0 // ....*........................ - ldr x15, [x1, #0] // ...................*......... - vins v1, x13, 0 // .....*....................... 
- ldr x11, [x4, #64] // ......................*...... - vins v6, x26, 1 // ......*...................... - ldr x13, [x1, #16] // ............*................ - vins v1, x21, 1 // .......*..................... - ldr x9, [x1, #32] // ...............*............. - // gap // ............................. - vins v29, x15, 0 // ........................*.... - sqrdmulh v15.8H, v6.8H, v1.H[1] // ..........*.................. - ldr x15, [x1, #24] // .........*................... - mul v11.8H, v6.8H, v1.H[0] // ........*.................... - vins v10, x13, 0 // .............*............... - ldr x13, [x1, #40] // ..............*.............. - vins v9, x11, 0 // ............................* - vins v6, x9, 0 // ....................*........ - // gap // ............................. - vins v10, x15, 1 // ................*............ - ldr x15, [x1, #8] // .................*........... - mls v11.8H, v15.8H, v7.H[0] // ...........*................. - // gap // ............................. - vins v6, x13, 1 // .......................*..... - // gap // ............................. - // gap // ............................. - // gap // ............................. - sqrdmulh v19.8H, v6.8H, v1.H[1] // ..........................*.. - // gap // ............................. - sub v25.8H, v10.8H, v11.8H // ..................*.......... - // gap // ............................. - add v11.8H, v10.8H, v11.8H // .....................*....... - // gap // ............................. - mul v26.8H, v6.8H, v1.H[0] // ...........................*. - // gap // ............................. - mul v6.8H, v25.8H, v1.H[4] // .........................*... - // gap // ............................. - - // original source code - // ldr x23, [x1, #56] // ..*.......................... || ..*................... - // ldr x14, [x1, #48] // *............................ || *..................... - // ldr x21, [x3] , #16 // .*........................... || .*.................... 
- // ldr x13, [x3, #-8] // ...*......................... || ...*.................. - // vins v20, x14, 0 // ....*........................ || ....*................. - // vins v1, x21, 0 // ......*...................... || .....*................ - // vins v20, x23, 1 // ........*.................... || ......*............... - // vins v1, x13, 1 // ..........*.................. || .......*.............. - // mul v19.8H, v20.8H, v1.H[0] // ...............*............. || ..........*........... - // ldr x17, [x1, #24] // ..............*.............. || .........*............ - // sqrdmulh v15.8H, v20.8H, v1.H[1] // .............*............... || .........*............ - // mls v19.8H, v15.8H, v7.H[0] // ......................*...... || ..............*....... - // ldr x20, [x1, #16] // .........*................... || ......*............... - // vins v30, x20, 0 // ................*............ || ..........*........... - // ldr x9, [x1, #40] // .................*........... || ...........*.......... - // ldr x21, [x1, #32] // ...........*................. || .......*.............. - // vins v30, x17, 1 // ....................*........ || .............*........ - // ldr x15, [x1, #8] // .....................*....... || .............*........ - // sub v25.8H, v30.8H, v19.8H // .........................*... || ..................*... - // ldr x24, [x1, #0] // .....*....................... || ....*................. - // vins v27, x21, 0 // ...................*......... || ............*......... - // add v11.8H, v30.8H, v19.8H // ..........................*.. || ...................*.. - // ldr x26, [x4, #64] // .......*..................... || .....*................ - // vins v27, x9, 1 // .......................*..... || ...............*...... - // vins v29, x24, 0 // ............*................ || ........*............. 
- // mul v6.8H, v25.8H, v1.H[4] // ............................* || .....................* - // sqrdmulh v19.8H, v27.8H, v1.H[1] // ........................*.... || .................*.... - // mul v26.8H, v27.8H, v1.H[0] // ...........................*. || ....................*. - // vins v9, x26, 0 // ..................*.......... || ...........*.......... - + // Instructions: 18 + // Expected cycles: 24 + // Expected IPC: 0.75 + // + // Cycle bound: 24.0 + // IPC bound: 0.75 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #48] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x3], #16 // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q31, [x1, #32] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.8H, v29.8H, v11.H[1] // ..*........................... + // gap // .............................. + mul v6.8H, v29.8H, v11.H[0] // ........*..................... + // gap // .............................. + sqrdmulh v3.8H, v31.8H, v11.H[1] // ....*......................... + // gap // .............................. + ldr q29, [x1, #16] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v6.8H, v18.8H, v7.H[0] // ..........*................... + // gap // .............................. + mul v1.8H, v31.8H, v11.H[0] // ......*....................... + // gap // .............................. + ldr q0, [x4, #16] // .....*........................ + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + sub v18.8H, v29.8H, v6.8H // ...........*.................. + // gap // .............................. + mls v1.8H, v3.8H, v7.H[0] // .......*...................... + // gap // .............................. + add v5.8H, v29.8H, v6.8H // ............*................. + // gap // .............................. + sqrdmulh v6.8H, v18.8H, v11.H[5] // .............*................ + // gap // .............................. + mul v25.8H, v18.8H, v11.H[4] // ...............*.............. + // gap // .............................. + sqrdmulh v23.8H, v5.8H, v11.H[3] // ..............*............... + // gap // .............................. + ldr q26, [x4], #(6*16) // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q30, [x4, #-16] // .................*............ + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q20, [x1, #48] // *.............................. + // ldr q11, [x3], #16 // .*............................. + // sqrdmulh v27.8H, v20.8H, v11.H[1] // ...*........................... + // ldr q17, [x1, #32] // ..*............................ + // sqrdmulh v5.8H, v17.8H, v11.H[1] // .....*......................... + // ldr q0, [x4, #16] // .........*..................... + // mul v1.8H, v17.8H, v11.H[0] // ........*...................... + // mls v1.8H, v5.8H, v7.H[0] // ...........*................... + // mul v6.8H, v20.8H, v11.H[0] // ....*.......................... + // ldr q30, [x1, #16] // ......*........................ + // mls v6.8H, v27.8H, v7.H[0] // .......*....................... + // sub v25.8H, v30.8H, v6.8H // ..........*.................... + // add v5.8H, v30.8H, v6.8H // ............*.................. 
+ // sqrdmulh v6.8H, v25.8H, v11.H[5] // .............*................. + // sqrdmulh v23.8H, v5.8H, v11.H[3] // ...............*............... + // mul v25.8H, v25.8H, v11.H[4] // ..............*................ + // ldr q26, [x4], #(6*16) // ................*.............. + // ldr q30, [x4, #-16] // .................*............. + sub count, count, #1 -.p2align 2 layer4567_start: - sqrdmulh v10.8H, v25.8H, v1.H[5] // ....................................*.................................................................... - ldr x23, [x1, #120] // .............e........................................................................................... - mls v26.8H, v19.8H, v7.H[0] // ......................*.................................................................................. - ldr x14, [x1, #112] // ............e............................................................................................ - mul v2.8H, v11.8H, v1.H[2] // ..............................*.......................................................................... - ldr x21, [x3] , #16 // ................e........................................................................................ - sqrdmulh v15.8H, v11.8H, v1.H[3] // ...............................*......................................................................... - vins v29, x15, 1 // ...*..................................................................................................... - mls v6.8H, v10.8H, v7.H[0] // .....................................*................................................................... - ldr x13, [x3, #-8] // .................e....................................................................................... - sub v12.8H, v29.8H, v26.8H // .......................*................................................................................. 
- vins v20, x14, 0 // ..............e.......................................................................................... - add v25.8H, v29.8H, v26.8H // ........................*................................................................................ - vins v1, x21, 0 // ..................e...................................................................................... - mls v2.8H, v15.8H, v7.H[0] // ................................*........................................................................ - vins v20, x23, 1 // ...............e......................................................................................... - sub v10.8H, v12.8H, v6.8H // ......................................*.................................................................. - vins v1, x13, 1 // ...................e..................................................................................... - add v22.8H, v12.8H, v6.8H // .......................................*................................................................. - ldr x16, [x4, #24] // .....................................................*................................................... - mul v19.8H, v20.8H, v1.H[0] // .........................e............................................................................... - ldr x24, [x4, #16] // ....................................................*.................................................... - sub v16.8H, v25.8H, v2.8H // .................................*....................................................................... - ldr x13, [x4, #80] // ....................................................................*.................................... - add v31.8H, v25.8H, v2.8H // ..................................*...................................................................... - ldr x28, [x4] , #96 // ................................................*........................................................ 
- trn1 v2.4S, v22.4S, v10.4S // ..........................................*.............................................................. - ldr x17, [x1, #88] // .....e................................................................................................... - trn2 v30.4S, v22.4S, v10.4S // ...........................................*............................................................. - vins v10, x24, 0 // ......................................................*.................................................. - trn2 v26.4S, v31.4S, v16.4S // .........................................*............................................................... - ldr x21, [x4, #-88] // .................................................*....................................................... - trn1 v14.4S, v31.4S, v16.4S // ........................................*................................................................ - ldr x23, [x4, #-40] // .............................................................*........................................... - sqrdmulh v15.8H, v20.8H, v1.H[1] // ..........................e.............................................................................. - vins v16, x28, 0 // ..................................................*...................................................... - trn2 v6.2D, v26.2D, v30.2D // .............................................*........................................................... - vins v10, x16, 1 // .......................................................*................................................. - trn2 v18.2D, v14.2D, v2.2D // ............................................*............................................................ - vins v16, x21, 1 // ...................................................*..................................................... 
- sqrdmulh v21.8H, v6.8H, v10.8H // ..............................................................................*.......................... - ldr x26, [x4, #-64] // ........................................................*................................................ - mul v6.8H, v6.8H, v16.8H // .............................................................................*........................... - ldr x15, [x4, #-48] // ............................................................*............................................ - mul v23.8H, v18.8H, v16.8H // ........................................................................*................................ - ldr x24, [x4, #-24] // .................................................................*....................................... - sqrdmulh v10.8H, v18.8H, v10.8H // .........................................................................*............................... - ldr x14, [x4, #-8] // .....................................................................*................................... - trn1 v27.2D, v26.2D, v30.2D // ...............................................*......................................................... - ldr x28, [x4, #-56] // .........................................................*............................................... - mls v6.8H, v21.8H, v7.H[0] // ...............................................................................*......................... - vins v5, x26, 0 // ..........................................................*.............................................. - mls v19.8H, v15.8H, v7.H[0] // ...........................e............................................................................. - vins v0, x13, 0 // ......................................................................*.................................. 
- mls v23.8H, v10.8H, v7.H[0] // ..........................................................................*.............................. - vins v20, x15, 0 // ..............................................................*.......................................... - trn1 v10.2D, v14.2D, v2.2D // ..............................................*.......................................................... - vins v0, x14, 1 // .......................................................................*................................. - add v17.8H, v27.8H, v6.8H // .................................................................................*....................... - ldr x20, [x1, #80] // ....e.................................................................................................... - sub v13.8H, v27.8H, v6.8H // ................................................................................*........................ - vins v20, x23, 1 // ...............................................................*......................................... - sub v15.8H, v10.8H, v23.8H // ...........................................................................*............................. - vins v5, x28, 1 // ...........................................................*............................................. - sqrdmulh v16.8H, v17.8H, v20.8H // ...................................................................................*..................... - vins v9, x24, 1 // ...................................................................*..................................... - mul v5.8H, v17.8H, v5.8H // ..................................................................................*...................... - vins v30, x20, 0 // ......e.................................................................................................. - mul v6.8H, v13.8H, v9.8H // .......................................................................................*................. 
- ldr x9, [x1, #104] // .........e............................................................................................... - sqrdmulh v2.8H, v13.8H, v0.8H // ........................................................................................*................ - ldr x21, [x1, #96] // ........e................................................................................................ - add v10.8H, v10.8H, v23.8H // ............................................................................*............................ - vins v30, x17, 1 // .......e................................................................................................. - mls v5.8H, v16.8H, v7.H[0] // ....................................................................................*.................... - ldr x15, [x1, #72] // .e....................................................................................................... - sub v25.8H, v30.8H, v19.8H // ............................e............................................................................ - ldr x24, [x1, #64] // e........................................................................................................ - mls v6.8H, v2.8H, v7.H[0] // .........................................................................................*............... - vins v27, x21, 0 // ..........e.............................................................................................. - add v11.8H, v30.8H, v19.8H // .............................e........................................................................... - ldr x26, [x4, #64] // ................................................................e........................................ - sub v3.8H, v10.8H, v5.8H // .....................................................................................*................... - vins v27, x9, 1 // ...........e............................................................................................. 
- add v2.8H, v10.8H, v5.8H // ......................................................................................*.................. - vins v29, x24, 0 // ..e...................................................................................................... - add v4.8H, v15.8H, v6.8H // ...........................................................................................*............. - // gap // ......................................................................................................... - sqdmulh v0.8H, v3.8H, v7.H[1] // ...............................................................................................*......... - // gap // ......................................................................................................... - sub v5.8H, v15.8H, v6.8H // ..........................................................................................*.............. - // gap // ......................................................................................................... - sqdmulh v21.8H, v2.8H, v7.H[1] // ............................................................................................*............ - // gap // ......................................................................................................... - sqdmulh v18.8H, v4.8H, v7.H[1] // ..................................................................................................*...... - // gap // ......................................................................................................... - sqdmulh v6.8H, v5.8H, v7.H[1] // .....................................................................................................*... - // gap // ......................................................................................................... - srshr v10.8H, v0.8H, #11 // ................................................................................................*........ 
- // gap // ......................................................................................................... - srshr v8.8H, v21.8H, #11 // .............................................................................................*........... - // gap // ......................................................................................................... - srshr v13.8H, v18.8H, #11 // ...................................................................................................*..... - // gap // ......................................................................................................... - srshr v6.8H, v6.8H, #11 // ......................................................................................................*.. - // gap // ......................................................................................................... - mls v2.8H, v8.8H, v7.H[0] // ..............................................................................................*.......... - // gap // ......................................................................................................... - mls v4.8H, v13.8H, v7.H[0] // ....................................................................................................*.... - // gap // ......................................................................................................... - mls v5.8H, v6.8H, v7.H[0] // .......................................................................................................*. - // gap // ......................................................................................................... - mls v3.8H, v10.8H, v7.H[0] // .................................................................................................*....... - // gap // ......................................................................................................... 
- mul v6.8H, v25.8H, v1.H[4] // ...................................e..................................................................... - // gap // ......................................................................................................... - sqrdmulh v19.8H, v27.8H, v1.H[1] // .....................e................................................................................... - // gap // ......................................................................................................... - mul v26.8H, v27.8H, v1.H[0] // ....................e.................................................................................... - vins v9, x26, 0 // ..................................................................e...................................... - // gap // ......................................................................................................... - st4 {v2.4S,v3.4S,v4.4S,v5.4S}, [x1], #64 // ........................................................................................................* - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- - // original source code - // ldr x10, [x1, #0] // ............................................................................e.................................................................................................................................... || ......................................e....................................................................................... - // ldr x11, [x1, #8] // ..........................................................................e...................................................................................................................................... || .....................................e........................................................................................ - // vins v8, x10, 0 // ....................................................................................e............................................................................................................................ || ..........................................e................................................................................... - // vins v8, x11, 1 // ...............................................................................................................*................................................................................................. || ....................................................................*......................................................... - // ldr x10, [x1, #16] // ..........................................................e...................................................................................................................................................... || .............................e................................................................................................ 
- // ldr x11, [x1, #24] // ..........................e...................................................................................................................................................................................... || .............e................................................................................................................ - // vins v9, x10, 0 // ..................................................................e.............................................................................................................................................. || .................................e............................................................................................ - // vins v9, x11, 1 // ........................................................................e........................................................................................................................................ || ....................................e......................................................................................... - // ldr x10, [x1, #32] // ......................................................................e.......................................................................................................................................... || ...................................e.......................................................................................... - // ldr x11, [x1, #40] // ....................................................................e............................................................................................................................................ || ..................................e........................................................................................... 
- // vins v10, x10, 0 // ..............................................................................e.................................................................................................................................. || .......................................e...................................................................................... - // vins v10, x11, 1 // ..................................................................................e.............................................................................................................................. || .........................................e.................................................................................... - // ldr x10, [x1, #48] // ..e.............................................................................................................................................................................................................. || .e............................................................................................................................ - // ldr x11, [x1, #56] // e................................................................................................................................................................................................................ || e............................................................................................................................. - // vins v11, x10, 0 // ..........e...................................................................................................................................................................................................... || .....e........................................................................................................................ 
- // vins v11, x11, 1 // ..............e.................................................................................................................................................................................................. || .......e...................................................................................................................... - // ldr x10, [x3] , #16 // ....e............................................................................................................................................................................................................ || ..e........................................................................................................................... - // ldr x11, [x3, #-8] // ........e........................................................................................................................................................................................................ || ....e......................................................................................................................... - // vins v0, x10, 0 // ............e.................................................................................................................................................................................................... || ......e....................................................................................................................... - // vins v0, x11, 1 // ................e................................................................................................................................................................................................ || ........e..................................................................................................................... 
- // mul v24.8H, v10.8H, v0.H[0] // .....................................................................................................e........................................................................................................... || ...........................................................e.................................................................. - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ....................................................................................................e............................................................................................................ || ..........................................................e................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ..........................................................................................................*...................................................................................................... || ..................................................................*........................................................... - // sub v10.8H, v8.8H, v24.8H // ..................................................................................................................*.............................................................................................. || ......................................................................*....................................................... - // add v8.8H, v8.8H, v24.8H // ....................................................................................................................*............................................................................................ || .......................................................................*...................................................... 
- // mul v24.8H, v11.8H, v0.H[0] // ...................e............................................................................................................................................................................................. || ..........e................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .................................e............................................................................................................................................................................... || .................e............................................................................................................ - // mls v24.8H, v11.8H, v7.H[0] // ...................................................e............................................................................................................................................................. || ..........................e................................................................................................... - // sub v11.8H, v9.8H, v24.8H // ...........................................................................e..................................................................................................................................... || ......................................e....................................................................................... - // add v9.8H, v9.8H, v24.8H // ...............................................................................e................................................................................................................................. || ........................................e..................................................................................... 
- // mul v24.8H, v9.8H, v0.H[2] // ............................................................................................................*.................................................................................................... || ...................................................................*.......................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ..............................................................................................................*.................................................................................................. || ....................................................................*......................................................... - // mls v24.8H, v9.8H, v7.H[0] // ......................................................................................................................*.......................................................................................... || ........................................................................*..................................................... - // sub v9.8H, v8.8H, v24.8H // ..............................................................................................................................*.................................................................................. || ............................................................................*................................................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................................................*................................................................................ || .............................................................................*................................................ 
- // mul v24.8H, v11.8H, v0.H[4] // ...................................................................................................e............................................................................................................. || .........................................................e.................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ........................................................................................................*........................................................................................................ || .................................................................*............................................................ - // mls v24.8H, v11.8H, v7.H[0] // ................................................................................................................*................................................................................................ || .....................................................................*........................................................ - // sub v11.8H, v10.8H, v24.8H // ........................................................................................................................*........................................................................................ || .........................................................................*.................................................... - // add v10.8H, v10.8H, v24.8H // ..........................................................................................................................*...................................................................................... || ..........................................................................*................................................... 
- // trn1 v25.4S, v8.4S, v9.4S // ........................................................................................................................................*........................................................................ || .................................................................................*............................................ - // trn2 v26.4S, v8.4S, v9.4S // ......................................................................................................................................*.......................................................................... || ................................................................................*............................................. - // trn1 v27.4S, v10.4S, v11.4S // ..................................................................................................................................*.............................................................................. || ..............................................................................*............................................... - // trn2 v28.4S, v10.4S, v11.4S // ....................................................................................................................................*............................................................................ || ...............................................................................*.............................................. - // trn2 v10.2D, v25.2D, v27.2D // ..............................................................................................................................................*.................................................................. || ....................................................................................*......................................... 
- // trn2 v11.2D, v26.2D, v28.2D // ............................................................................................................................................*.................................................................... || ...................................................................................*.......................................... - // trn1 v8.2D, v25.2D, v27.2D // ................................................................................................................................................................*................................................ || .............................................................................................*................................ - // trn1 v9.2D, v26.2D, v28.2D // ........................................................................................................................................................*........................................................ || .........................................................................................*.................................... - // ldr x10, [x4] , #96 // .................................................................................................................................*............................................................................... || .............................................................................*................................................ - // ldr x11, [x4, #-88] // .......................................................................................................................................*......................................................................... || ................................................................................*............................................. 
- // vins v0, x10, 0 // ...........................................................................................................................................*..................................................................... || ..................................................................................*........................................... - // vins v0, x11, 1 // ...............................................................................................................................................*................................................................. || ....................................................................................*......................................... - // ldr x10, [x4, #-80] // .............................................................................................................................*................................................................................... || ...........................................................................*.................................................. - // ldr x11, [x4, #-72] // ...........................................................................................................................*..................................................................................... || ..........................................................................*................................................... - // vins v4, x10, 0 // .....................................................................................................................................*........................................................................... || ...............................................................................*.............................................. 
- // vins v4, x11, 1 // .............................................................................................................................................*................................................................... || ...................................................................................*.......................................... - // ldr x10, [x4, #-64] // .................................................................................................................................................*............................................................... || .....................................................................................*........................................ - // ldr x11, [x4, #-56] // .........................................................................................................................................................*....................................................... || .........................................................................................*.................................... - // vins v1, x10, 0 // ...........................................................................................................................................................*..................................................... || ..........................................................................................*................................... - // vins v1, x11, 1 // .......................................................................................................................................................................*......................................... || ................................................................................................*............................. 
- // ldr x10, [x4, #-48] // ...................................................................................................................................................*............................................................. || ......................................................................................*....................................... - // ldr x11, [x4, #-40] // .........................................................................................................................................*....................................................................... || .................................................................................*............................................ - // vins v5, x10, 0 // ...............................................................................................................................................................*................................................. || ............................................................................................*................................. - // vins v5, x11, 1 // .....................................................................................................................................................................*........................................... || ...............................................................................................*.............................. - // ldr x10, [x4, #-32] // ................................................................................e................................................................................................................................ || ........................................e..................................................................................... 
- // ldr x11, [x4, #-24] // .....................................................................................................................................................*........................................................... || .......................................................................................*...................................... - // vins v2, x10, 0 // ......................................................................................................e.......................................................................................................... || ...........................................................e.................................................................. - // vins v2, x11, 1 // .........................................................................................................................................................................*....................................... || .................................................................................................*............................ - // ldr x10, [x4, #-16] // ...............................................................................................................................*................................................................................. || ............................................................................*................................................. - // ldr x11, [x4, #-8] // .......................................................................................................................................................*......................................................... || ........................................................................................*..................................... 
- // vins v6, x10, 0 // .............................................................................................................................................................*................................................... || ...........................................................................................*.................................. - // vins v6, x11, 1 // .................................................................................................................................................................*............................................... || .............................................................................................*................................ - // mul v24.8H, v10.8H, v0.8H // ....................................................................................................................................................*............................................................ || .......................................................................................*...................................... - // sqrdmulh v10.8H, v10.8H, v4.8H // ......................................................................................................................................................*.......................................................... || ........................................................................................*..................................... - // mls v24.8H, v10.8H, v7.H[0] // ..............................................................................................................................................................*.................................................. || ............................................................................................*................................. 
- // sub v10.8H, v8.8H, v24.8H // ......................................................................................................................................................................*.......................................... || ................................................................................................*............................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................................................................................................*................................ || .....................................................................................................*........................ - // mul v24.8H, v11.8H, v0.8H // ..................................................................................................................................................*.............................................................. || ......................................................................................*....................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // ................................................................................................................................................*................................................................ || .....................................................................................*........................................ - // mls v24.8H, v11.8H, v7.H[0] // ..........................................................................................................................................................*...................................................... || ..........................................................................................*................................... 
- // sub v11.8H, v9.8H, v24.8H // ....................................................................................................................................................................*............................................ || ...............................................................................................*.............................. - // add v9.8H, v9.8H, v24.8H // ..................................................................................................................................................................*.............................................. || ..............................................................................................*............................... - // mul v24.8H, v9.8H, v1.8H // ..........................................................................................................................................................................*...................................... || ..................................................................................................*........................... - // sqrdmulh v9.8H, v9.8H, v5.8H // ........................................................................................................................................................................*........................................ || .................................................................................................*............................ - // mls v24.8H, v9.8H, v7.H[0] // ..................................................................................................................................................................................*.............................. || ......................................................................................................*....................... 
- // sub v9.8H, v8.8H, v24.8H // ..........................................................................................................................................................................................*...................... || ..........................................................................................................*................... - // add v8.8H, v8.8H, v24.8H // ............................................................................................................................................................................................*.................... || ...........................................................................................................*.................. - // mul v24.8H, v11.8H, v2.8H // ............................................................................................................................................................................*.................................... || ...................................................................................................*.......................... - // sqrdmulh v11.8H, v11.8H, v6.8H // ..............................................................................................................................................................................*.................................. || ....................................................................................................*......................... - // mls v24.8H, v11.8H, v7.H[0] // ......................................................................................................................................................................................*.......................... || ........................................................................................................*..................... 
- // sub v11.8H, v10.8H, v24.8H // ................................................................................................................................................................................................*................ || ..............................................................................................................*............... - // add v10.8H, v10.8H, v24.8H // ..............................................................................................................................................................................................*.................. || ............................................................................................................*................. - // sqdmulh v25.8H, v8.8H, v7.H[1] // .................................................................................................................................................................................................*............... || ...............................................................................................................*.............. - // srshr v25.8H, v25.8H, #11 // .....................................................................................................................................................................................................*........... || ...................................................................................................................*.......... - // mls v8.8H, v25.8H, v7.H[0] // ........................................................................................................................................................................................................*........ || ......................................................................................................................*....... 
- // sqdmulh v25.8H, v9.8H, v7.H[1] // ...............................................................................................................................................................................................*................. || .............................................................................................................*................ - // srshr v25.8H, v25.8H, #11 // ....................................................................................................................................................................................................*............ || ..................................................................................................................*........... - // mls v9.8H, v25.8H, v7.H[0] // ...........................................................................................................................................................................................................*..... || .........................................................................................................................*.... - // sqdmulh v25.8H, v10.8H, v7.H[1] // ..................................................................................................................................................................................................*.............. || ................................................................................................................*............. - // srshr v25.8H, v25.8H, #11 // ......................................................................................................................................................................................................*.......... || ....................................................................................................................*......... 
- // mls v10.8H, v25.8H, v7.H[0] // .........................................................................................................................................................................................................*....... || .......................................................................................................................*...... - // sqdmulh v25.8H, v11.8H, v7.H[1] // ...................................................................................................................................................................................................*............. || .................................................................................................................*............ - // srshr v25.8H, v25.8H, #11 // .......................................................................................................................................................................................................*......... || .....................................................................................................................*........ - // mls v11.8H, v25.8H, v7.H[0] // ..........................................................................................................................................................................................................*...... || ........................................................................................................................*..... 
- // st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // ................................................................................................................................................................................................................* || .............................................................................................................................* - - subs count, count, #1 + // Instructions: 72 + // Expected cycles: 87 + // Expected IPC: 0.83 + // + // Cycle bound: 87.0 + // IPC bound: 0.83 + // + // Wall time: 204.28s + // User time: 204.28s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q29, [x1, #0] // *....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v25.8H, v6.8H, v7.H[0] // ......................*................................................. + // gap // ........................................................................ + mul v15.8H, v5.8H, v11.H[2] // ................*....................................................... + // gap // ........................................................................ + sub v31.8H, v29.8H, v1.8H // ........*............................................................... + // gap // ........................................................................ + ldr q20, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + sub v6.8H, v31.8H, v25.8H // .......................*................................................ + // gap // ........................................................................ + add v31.8H, v31.8H, v25.8H // ........................*............................................... + // gap // ........................................................................ + mls v15.8H, v23.8H, v7.H[0] // .................*...................................................... + // gap // ........................................................................ + add v29.8H, v29.8H, v1.8H // .........*.............................................................. + // gap // ........................................................................ + ldr q11, [x3], #16 // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v29.8H, v15.8H // ...................*.................................................... + // gap // ........................................................................ + sub v25.8H, v29.8H, v15.8H // ..................*..................................................... + // gap // ........................................................................ + trn1 v29.4S, v31.4S, v6.4S // ...........................*............................................ + // gap // ........................................................................ + trn2 v6.4S, v31.4S, v6.4S // ............................*........................................... + // gap // ........................................................................ 
+ trn1 v31.4S, v16.4S, v25.4S // .........................*.............................................. + // gap // ........................................................................ + trn2 v15.4S, v16.4S, v25.4S // ..........................*............................................. + // gap // ........................................................................ + sqrdmulh v27.8H, v20.8H, v11.H[1] // ..........e............................................................. + // gap // ........................................................................ + trn2 v10.2D, v31.2D, v29.2D // .............................*.......................................... + // gap // ........................................................................ + trn2 v12.2D, v15.2D, v6.2D // ..............................*......................................... + // gap // ........................................................................ + mul v19.8H, v10.8H, v26.8H // ........................................*............................... + // gap // ........................................................................ + mul v26.8H, v12.8H, v26.8H // .............................................*.......................... + // gap // ........................................................................ + ldr q17, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v16.2D, v15.2D, v6.2D // ................................*....................................... + // gap // ........................................................................ + sqrdmulh v6.8H, v10.8H, v0.8H // .......................................*................................ 
+ // gap // ........................................................................ + ldr q25, [x4, #-64] // ...................................*.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v29.2D, v31.2D, v29.2D // ...............................*........................................ + // gap // ........................................................................ + mls v19.8H, v6.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + ldr q6, [x4, #-48] // ....................................*................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v31.8H, v12.8H, v0.8H // ............................................*........................... + // gap // ........................................................................ + sqrdmulh v5.8H, v17.8H, v11.H[1] // .....e.................................................................. + // gap // ........................................................................ + ldr q0, [x4, #16] // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v26.8H, v31.8H, v7.H[0] // ..............................................*......................... + // gap // ........................................................................ + sub v31.8H, v29.8H, v19.8H // ..........................................*............................. + // gap // ........................................................................ + add v29.8H, v29.8H, v19.8H // ...........................................*............................ + // gap // ........................................................................ + mul v1.8H, v17.8H, v11.H[0] // ......e................................................................. + // gap // ........................................................................ + add v8.8H, v16.8H, v26.8H // ................................................*....................... + // gap // ........................................................................ + sub v18.8H, v16.8H, v26.8H // ...............................................*........................ + // gap // ........................................................................ + ldr q26, [x4, #-32] // .....................................*.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v6.8H, v8.8H, v6.8H // .................................................*...................... + // gap // ........................................................................ + mul v12.8H, v8.8H, v25.8H // ..................................................*..................... + // gap // ........................................................................ + sqrdmulh v25.8H, v18.8H, v30.8H // ......................................................*................. 
+ // gap // ........................................................................ + mul v26.8H, v18.8H, v26.8H // .......................................................*................ + // gap // ........................................................................ + mls v1.8H, v5.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + mls v12.8H, v6.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + mul v6.8H, v20.8H, v11.H[0] // ...........e............................................................ + // gap // ........................................................................ + mls v26.8H, v25.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + ldr q30, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v6.8H, v27.8H, v7.H[0] // ............e........................................................... + // gap // ........................................................................ + sub v15.8H, v29.8H, v12.8H // ....................................................*................... + // gap // ........................................................................ + add v14.8H, v29.8H, v12.8H // .....................................................*.................. + // gap // ........................................................................ 
+ sub v17.8H, v31.8H, v26.8H // .........................................................*.............. + // gap // ........................................................................ + add v16.8H, v31.8H, v26.8H // ..........................................................*............. + // gap // ........................................................................ + sqdmulh v27.8H, v14.8H, v7.H[1] // ...........................................................*............ + // gap // ........................................................................ + sqdmulh v29.8H, v15.8H, v7.H[1] // ..............................................................*......... + // gap // ........................................................................ + sqdmulh v31.8H, v16.8H, v7.H[1] // .................................................................*...... + // gap // ........................................................................ + sqdmulh v19.8H, v17.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + srshr v18.8H, v27.8H, #11 // ............................................................*........... + // gap // ........................................................................ + srshr v29.8H, v29.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + srshr v21.8H, v31.8H, #11 // ..................................................................*..... + // gap // ........................................................................ + mls v14.8H, v18.8H, v7.H[0] // .............................................................*.......... + // gap // ........................................................................ 
+ mls v15.8H, v29.8H, v7.H[0] // ................................................................*....... + // gap // ........................................................................ + mls v16.8H, v21.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + srshr v23.8H, v19.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + sub v25.8H, v30.8H, v6.8H // .............e.......................................................... + // gap // ........................................................................ + add v5.8H, v30.8H, v6.8H // ..............e......................................................... + // gap // ........................................................................ + mls v17.8H, v23.8H, v7.H[0] // ......................................................................*. + // gap // ........................................................................ + sqrdmulh v6.8H, v25.8H, v11.H[5] // ....................e................................................... + // gap // ........................................................................ + sqrdmulh v23.8H, v5.8H, v11.H[3] // ...............e........................................................ + // gap // ........................................................................ + mul v25.8H, v25.8H, v11.H[4] // .....................e.................................................. + // gap // ........................................................................ + ldr q26, [x4], #(6*16) // .................................e...................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + ldr q30, [x4, #-16] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + + // -------------------------------------------------------------- new position ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q8, [x1, #(16*0)] // ....................................................................*....................................................................... + // ldr q9, [x1, #(16*1)] // ..........................................e.........................'.............................................~......................... 
+ // ldr q10, [x1, #(16*2)] // .................e..................................................'....................~.................................................. + // ldr q11, [x1, #(16*3)] // e...................................................................'...~................................................................... + // ldr q0, [x3], #16 // .....e..............................................................'........~.............................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .........................e..........................................'............................~.......................................... + // mul v24.8h, v10.8h, v0.h[0] // ..............................e.....................................'.................................~..................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................e.............................'.........................................~............................. + // sub v10.8h, v8.8h, v24.8h // ....................................................................'..*.................................................................... + // add v8.8h, v8.8h, v24.8h // ....~...............................................................'.......*............................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ............e.......................................................'...............~....................................................... + // mul v24.8h, v11.8h, v0.h[0] // ........................................e...........................'...........................................~........................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................................e........................'..............................................~........................ 
+ // sub v11.8h, v9.8h, v24.8h // ...........................................................e........'..............................................................~........ + // add v9.8h, v9.8h, v24.8h // ............................................................e.......'...............................................................~....... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...............................................................e....'..................................................................~.... + // mul v24.8h, v9.8h, v0.h[2] // ....................................................................'.*..................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...~................................................................'......*................................................................ + // sub v9.8h, v8.8h, v24.8h // .......~............................................................'..........*............................................................ + // add v8.8h, v8.8h, v24.8h // ......~.............................................................'.........*............................................................. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..............................................................e.....'.................................................................~..... + // mul v24.8h, v11.8h, v0.h[4] // ................................................................e...'...................................................................~... + // mls v24.8h, v27.8h, v7.h[0] // ....................................................................'*...................................................................... + // sub v11.8h, v10.8h, v24.8h // .~..................................................................'....*.................................................................. 
+ // add v10.8h, v10.8h, v24.8h // ..~.................................................................'.....*................................................................. + // trn1 v25.4s, v8.4s, v9.4s // ..........~.........................................................'.............*......................................................... + // trn2 v26.4s, v8.4s, v9.4s // ...........~........................................................'..............*........................................................ + // trn1 v27.4s, v10.4s, v11.4s // ........~...........................................................'...........*........................................................... + // trn2 v28.4s, v10.4s, v11.4s // .........~..........................................................'............*.......................................................... + // trn2 v10.2d, v25.2d, v27.2d // .............~......................................................'................*...................................................... + // trn2 v11.2d, v26.2d, v28.2d // ..............~.....................................................'.................*..................................................... + // trn1 v8.2d, v25.2d, v27.2d // .....................~..............................................'........................*.............................................. + // trn1 v9.2d, v26.2d, v28.2d // ..................~.................................................'.....................*................................................. + // ldr q0, [ x4], #(6*16) // .................................................................e..'....................................................................~.. + // ldr q4, [x4, #(-6*16 + 1*16)] // ..........................e.........................................'.............................~......................................... 
+ // ldr q1, [ x4, #(-6*16 + 2*16)] // ....................~...............................................'.......................*............................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .......................~............................................'..........................*............................................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // .................................~..................................'....................................*.................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // ..................................................................e.'.....................................................................~. + // sqrdmulh v27.8h, v10.8h, v4.8h // ...................~................................................'......................*................................................ + // mul v24.8h, v10.8h, v0.8h // ...............~....................................................'..................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................~.............................................'.........................*............................................. + // sub v10.8h, v8.8h, v24.8h // ............................~.......................................'...............................*....................................... + // add v8.8h, v8.8h, v24.8h // .............................~......................................'................................*...................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ........................~...........................................'...........................*........................................... + // mul v24.8h, v11.8h, v0.8h // ................~...................................................'...................*................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ...........................~........................................'..............................*........................................ + // sub v11.8h, v9.8h, v24.8h // ................................~...................................'...................................*................................... + // add v9.8h, v9.8h, v24.8h // ...............................~....................................'..................................*.................................... + // sqrdmulh v27.8h, v9.8h, v5.8h // ..................................~.................................'.....................................*................................. + // mul v24.8h, v9.8h, v1.8h // ...................................~................................'......................................*................................ + // mls v24.8h, v27.8h, v7.h[0] // .......................................~............................'..........................................*............................ + // sub v9.8h, v8.8h, v24.8h // ............................................~.......................'...............................................*....................... + // add v8.8h, v8.8h, v24.8h // .............................................~......................'................................................*...................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ....................................~...............................'.......................................*............................... + // mul v24.8h, v11.8h, v2.8h // .....................................~..............................'........................................*.............................. + // mls v24.8h, v27.8h, v7.h[0] // .........................................~..........................'............................................*.......................... 
+ // sub v11.8h, v10.8h, v24.8h // ..............................................~.....................'.................................................*..................... + // add v10.8h, v10.8h, v24.8h // ...............................................~....................'..................................................*.................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ................................................~...................'...................................................*................... + // srshr v25.8h, v25.8h, #11 // ....................................................~...............'.......................................................*............... + // mls v8.8h, v25.8h, v7.h[0] // .......................................................~............'..........................................................*............ + // sqdmulh v25.8h, v9.8h, v7.h[1] // .................................................~..................'....................................................*.................. + // srshr v25.8h, v25.8h, #11 // .....................................................~..............'........................................................*.............. + // mls v9.8h, v25.8h, v7.h[0] // ........................................................~...........'...........................................................*........... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................................~.................'.....................................................*................. + // srshr v25.8h, v25.8h, #11 // ......................................................~.............'.........................................................*............. + // mls v10.8h, v25.8h, v7.h[0] // .........................................................~..........'............................................................*.......... 
+ // sqdmulh v25.8h, v11.8h, v7.h[1] // ...................................................~................'......................................................*................ + // srshr v25.8h, v25.8h, #11 // ..........................................................~.........'.............................................................*......... + // mls v11.8h, v25.8h, v7.h[0] // .............................................................~......'................................................................*...... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ...................................................................~'......................................................................* + + sub count, count, #1 cbnz count, layer4567_start - sqrdmulh v10.8H, v25.8H, v1.H[5] // *........................................................................... - vins v29, x15, 1 // ....*....................................................................... - mls v26.8H, v19.8H, v7.H[0] // .*.......................................................................... - ldr x13, [x4, #72] // ..................................*......................................... - mul v15.8H, v11.8H, v1.H[2] // ..*......................................................................... - ldr x15, [x4, #24] // ...........*................................................................ - sqrdmulh v11.8H, v11.8H, v1.H[3] // ...*........................................................................ - ldr x21, [x4, #16] // ............*............................................................... - mls v6.8H, v10.8H, v7.H[0] // .....*...................................................................... - ldr x14, [x4, #80] // ..............*............................................................. - sub v10.8H, v29.8H, v26.8H // ......*..................................................................... 
- vins v9, x13, 1 // ....................................................*....................... - add v16.8H, v29.8H, v26.8H // .......*.................................................................... - ldr x13, [x4] , #96 // ................*........................................................... - mls v15.8H, v11.8H, v7.H[0] // ........*................................................................... - vins v11, x21, 0 // ...................*........................................................ - add v2.8H, v10.8H, v6.8H // ..........*................................................................. - ldr x21, [x4, #-88] // .....................*...................................................... - vins v11, x15, 1 // ..........................*................................................. - ldr x15, [x4, #-40] // .......................*.................................................... - sub v6.8H, v10.8H, v6.8H // .........*.................................................................. - vins v10, x14, 0 // .........................................*.................................. - sub v23.8H, v16.8H, v15.8H // .............*.............................................................. - vins v20, x13, 0 // ........................*................................................... - add v15.8H, v16.8H, v15.8H // ...............*............................................................ - ldr x13, [x4, #-64] // ..............................*............................................. - trn1 v16.4S, v2.4S, v6.4S // .................*.......................................................... - vins v20, x21, 1 // ............................*............................................... - trn2 v6.4S, v2.4S, v6.4S // ..................*......................................................... - ldr x21, [x4, #-48] // ................................*........................................... 
- trn2 v2.4S, v15.4S, v23.4S // ....................*....................................................... - ldr x14, [x4, #-8] // ....................................*....................................... - trn1 v15.4S, v15.4S, v23.4S // ......................*..................................................... - ldr x10, [x4, #-56] // ......................................*..................................... - vins v23, x13, 0 // ........................................*................................... - // gap // ............................................................................ - trn2 v5.2D, v2.2D, v6.2D // .........................*.................................................. - vins v13, x21, 0 // ...........................................*................................ - trn2 v29.2D, v15.2D, v16.2D // ...........................*................................................ - vins v10, x14, 1 // .............................................*.............................. - sqrdmulh v0.8H, v5.8H, v11.8H // .............................*.............................................. - vins v13, x15, 1 // ................................................*........................... - mul v5.8H, v5.8H, v20.8H // ...............................*............................................ - vins v23, x10, 1 // ..................................................*......................... - sqrdmulh v11.8H, v29.8H, v11.8H // ...................................*........................................ - // gap // ............................................................................ - mul v20.8H, v29.8H, v20.8H // .................................*.......................................... - // gap // ............................................................................ - trn1 v6.2D, v2.2D, v6.2D // .....................................*...................................... 
- // gap // ............................................................................ - mls v5.8H, v0.8H, v7.H[0] // .......................................*.................................... - // gap // ............................................................................ - trn1 v15.2D, v15.2D, v16.2D // ............................................*............................... - // gap // ............................................................................ - mls v20.8H, v11.8H, v7.H[0] // ..........................................*................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v11.8H, v6.8H, v5.8H // ...............................................*............................ - // gap // ............................................................................ - add v6.8H, v6.8H, v5.8H // ..............................................*............................. - // gap // ............................................................................ - sub v16.8H, v15.8H, v20.8H // .................................................*.......................... - // gap // ............................................................................ - mul v2.8H, v11.8H, v9.8H // ......................................................*..................... - // gap // ............................................................................ - sqrdmulh v10.8H, v11.8H, v10.8H // .......................................................*.................... - // gap // ............................................................................ - mul v11.8H, v6.8H, v23.8H // .....................................................*...................... 
- // gap // ............................................................................ - sqrdmulh v6.8H, v6.8H, v13.8H // ...................................................*........................ - // gap // ............................................................................ - add v15.8H, v15.8H, v20.8H // ........................................................*................... - // gap // ............................................................................ - mls v2.8H, v10.8H, v7.H[0] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v11.8H, v6.8H, v7.H[0] // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v23.8H, v16.8H, v2.8H // .............................................................*.............. - // gap // ............................................................................ - sub v24.8H, v16.8H, v2.8H // ...............................................................*............ - // gap // ............................................................................ - sub v22.8H, v15.8H, v11.8H // ...........................................................*................ - // gap // ............................................................................ - add v21.8H, v15.8H, v11.8H // ............................................................*............... 
- // gap // ............................................................................ - sqdmulh v6.8H, v23.8H, v7.H[1] // .................................................................*.......... - // gap // ............................................................................ - sqdmulh v10.8H, v22.8H, v7.H[1] // ..............................................................*............. - // gap // ............................................................................ - sqdmulh v15.8H, v21.8H, v7.H[1] // ................................................................*........... - // gap // ............................................................................ - sqdmulh v11.8H, v24.8H, v7.H[1] // ..................................................................*......... - // gap // ............................................................................ - srshr v6.8H, v6.8H, #11 // .....................................................................*...... - // gap // ............................................................................ - srshr v10.8H, v10.8H, #11 // ...................................................................*........ - // gap // ............................................................................ - srshr v15.8H, v15.8H, #11 // ....................................................................*....... - // gap // ............................................................................ - srshr v11.8H, v11.8H, #11 // ......................................................................*..... - // gap // ............................................................................ - mls v23.8H, v6.8H, v7.H[0] // ........................................................................*... - // gap // ............................................................................ - mls v21.8H, v15.8H, v7.H[0] // .......................................................................*.... 
- // gap // ............................................................................ - mls v24.8H, v11.8H, v7.H[0] // .........................................................................*.. - // gap // ............................................................................ - mls v22.8H, v10.8H, v7.H[0] // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - st4 {v21.4S,v22.4S,v23.4S,v24.4S}, [x1], #64 // ...........................................................................* - // gap // ............................................................................ - - // original source code - // sqrdmulh v10.8H, v25.8H, v1.H[5] // *........................................................................... || *............................................................ - // mls v26.8H, v19.8H, v7.H[0] // ..*......................................................................... || .*........................................................... - // mul v2.8H, v11.8H, v1.H[2] // ....*....................................................................... || ..*.......................................................... - // sqrdmulh v15.8H, v11.8H, v1.H[3] // ......*..................................................................... || ...*......................................................... 
- // vins v29, x15, 1 // .*.......................................................................... || *............................................................ - // mls v6.8H, v10.8H, v7.H[0] // ........*................................................................... || ....*........................................................ - // sub v12.8H, v29.8H, v26.8H // ..........*................................................................. || .....*....................................................... - // add v25.8H, v29.8H, v26.8H // ............*............................................................... || ......*...................................................... - // mls v2.8H, v15.8H, v7.H[0] // ..............*............................................................. || .......*..................................................... - // sub v10.8H, v12.8H, v6.8H // ....................*....................................................... || ..........*.................................................. - // add v22.8H, v12.8H, v6.8H // ................*........................................................... || ........*.................................................... - // ldr x16, [x4, #24] // .....*...................................................................... || ..*.......................................................... - // ldr x24, [x4, #16] // .......*.................................................................... || ...*......................................................... - // sub v16.8H, v25.8H, v2.8H // ......................*..................................................... || ...........*................................................. - // ldr x13, [x4, #80] // .........*.................................................................. || ....*........................................................ 
- // add v31.8H, v25.8H, v2.8H // ........................*................................................... || ............*................................................ - // ldr x28, [x4] , #96 // .............*.............................................................. || ......*...................................................... - // trn1 v2.4S, v22.4S, v10.4S // ..........................*................................................. || .............*............................................... - // trn2 v30.4S, v22.4S, v10.4S // ............................*............................................... || ..............*.............................................. - // vins v10, x24, 0 // ...............*............................................................ || .......*..................................................... - // trn2 v26.4S, v31.4S, v16.4S // ..............................*............................................. || ...............*............................................. - // ldr x21, [x4, #-88] // .................*.......................................................... || ........*.................................................... - // trn1 v14.4S, v31.4S, v16.4S // ................................*........................................... || ................*............................................ - // ldr x23, [x4, #-40] // ...................*........................................................ || .........*................................................... - // vins v16, x28, 0 // .......................*.................................................... || ...........*................................................. - // trn2 v6.2D, v26.2D, v30.2D // ...................................*........................................ || ..................*.......................................... 
- // vins v10, x16, 1 // ..................*......................................................... || .........*................................................... - // trn2 v18.2D, v14.2D, v2.2D // .....................................*...................................... || ...................*......................................... - // vins v16, x21, 1 // ...........................*................................................ || .............*............................................... - // sqrdmulh v21.8H, v6.8H, v10.8H // .......................................*.................................... || ....................*........................................ - // ldr x26, [x4, #-64] // .........................*.................................................. || ............*................................................ - // mul v6.8H, v6.8H, v16.8H // .........................................*.................................. || .....................*....................................... - // ldr x15, [x4, #-48] // .............................*.............................................. || ..............*.............................................. - // mul v23.8H, v18.8H, v16.8H // ............................................*............................... || .......................*..................................... - // ldr x24, [x4, #-24] // ...*........................................................................ || .*........................................................... - // sqrdmulh v10.8H, v18.8H, v10.8H // ...........................................*................................ || ......................*...................................... - // ldr x14, [x4, #-8] // ...............................*............................................ || ...............*............................................. 
- // trn1 v27.2D, v26.2D, v30.2D // .............................................*.............................. || ........................*.................................... - // ldr x28, [x4, #-56] // .................................*.......................................... || ................*............................................ - // mls v6.8H, v21.8H, v7.H[0] // ..............................................*............................. || .........................*................................... - // vins v5, x26, 0 // ..................................*......................................... || .................*........................................... - // vins v0, x13, 0 // .....................*...................................................... || ..........*.................................................. - // mls v23.8H, v10.8H, v7.H[0] // ................................................*........................... || ...........................*................................. - // vins v20, x15, 0 // ....................................*....................................... || ..................*.......................................... - // trn1 v10.2D, v14.2D, v2.2D // ...............................................*............................ || ..........................*.................................. - // vins v0, x14, 1 // ......................................*..................................... || ...................*......................................... - // add v17.8H, v27.8H, v6.8H // ..................................................*......................... || ..............................*.............................. - // sub v13.8H, v27.8H, v6.8H // .................................................*.......................... || .............................*............................... - // vins v20, x23, 1 // ........................................*................................... 
|| ....................*........................................ - // sub v15.8H, v10.8H, v23.8H // ...................................................*........................ || ...............................*............................. - // vins v5, x28, 1 // ..........................................*................................. || .....................*....................................... - // sqrdmulh v16.8H, v17.8H, v20.8H // .......................................................*.................... || ...................................*......................... - // vins v9, x24, 1 // ...........*................................................................ || .....*....................................................... - // mul v5.8H, v17.8H, v5.8H // ......................................................*..................... || ..................................*.......................... - // mul v6.8H, v13.8H, v9.8H // ....................................................*....................... || ................................*............................ - // sqrdmulh v2.8H, v13.8H, v0.8H // .....................................................*...................... || .................................*........................... - // add v10.8H, v10.8H, v23.8H // ........................................................*................... || ....................................*........................ - // mls v5.8H, v16.8H, v7.H[0] // ..........................................................*................. || .......................................*..................... - // mls v6.8H, v2.8H, v7.H[0] // .........................................................*.................. || .....................................*....................... - // sub v3.8H, v10.8H, v5.8H // .............................................................*.............. || ...........................................*................. 
- // add v2.8H, v10.8H, v5.8H // ..............................................................*............. || ............................................*................ - // add v4.8H, v15.8H, v6.8H // ...........................................................*................ || .........................................*................... - // sqdmulh v0.8H, v3.8H, v7.H[1] // ................................................................*........... || ..............................................*.............. - // sub v5.8H, v15.8H, v6.8H // ............................................................*............... || ..........................................*.................. - // sqdmulh v21.8H, v2.8H, v7.H[1] // .................................................................*.......... || ...............................................*............. - // sqdmulh v18.8H, v4.8H, v7.H[1] // ...............................................................*............ || .............................................*............... - // sqdmulh v6.8H, v5.8H, v7.H[1] // ..................................................................*......... || ................................................*............ - // srshr v10.8H, v0.8H, #11 // ....................................................................*....... || ..................................................*.......... - // srshr v8.8H, v21.8H, #11 // .....................................................................*...... || ...................................................*......... - // srshr v13.8H, v18.8H, #11 // ...................................................................*........ || .................................................*........... - // srshr v6.8H, v6.8H, #11 // ......................................................................*..... || ....................................................*........ 
- // mls v2.8H, v8.8H, v7.H[0] // ........................................................................*... || ......................................................*...... - // mls v4.8H, v13.8H, v7.H[0] // .......................................................................*.... || .....................................................*....... - // mls v5.8H, v6.8H, v7.H[0] // .........................................................................*.. || .......................................................*..... - // mls v3.8H, v10.8H, v7.H[0] // ..........................................................................*. || ........................................................*.... - // st4 {v2.4S,v3.4S,v4.4S,v5.4S}, [x1], #64 // ...........................................................................* || ............................................................* - + // Instructions: 54 + // Expected cycles: 61 + // Expected IPC: 0.89 + // + // Cycle bound: 61.0 + // IPC bound: 0.89 + // + // Wall time: 12.27s + // User time: 12.27s + // + // ----------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|--- + ldr q17, [x1, #0] // *..................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v3.8H, v5.8H, v11.H[2] // ..*................................................... + // gap // ...................................................... + mls v25.8H, v6.8H, v7.H[0] // .*.................................................... + // gap // ...................................................... + add v31.8H, v17.8H, v1.8H // .......*.............................................. + // gap // ...................................................... 
+ sub v12.8H, v17.8H, v1.8H // ...*.................................................. + // gap // ...................................................... + mls v3.8H, v23.8H, v7.H[0] // ......*............................................... + // gap // ...................................................... + ldr q21, [x4, #-48] // .......................*.............................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v11.8H, v12.8H, v25.8H // ....*................................................. + // gap // ...................................................... + add v28.8H, v12.8H, v25.8H // .....*................................................ + // gap // ...................................................... + add v5.8H, v31.8H, v3.8H // ........*............................................. + // gap // ...................................................... + sub v19.8H, v31.8H, v3.8H // .........*............................................ + // gap // ...................................................... + trn2 v27.4S, v28.4S, v11.4S // ...........*.......................................... + // gap // ...................................................... + trn1 v17.4S, v28.4S, v11.4S // ..........*........................................... + // gap // ...................................................... + trn2 v3.4S, v5.4S, v19.4S // .............*........................................ + // gap // ...................................................... + ldr q4, [x4, #-64] // ....................*................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ trn2 v16.2D, v3.2D, v27.2D // ...............*...................................... + // gap // ...................................................... + trn1 v14.4S, v5.4S, v19.4S // ............*......................................... + // gap // ...................................................... + mul v19.8H, v16.8H, v26.8H // .................*.................................... + // gap // ...................................................... + sqrdmulh v2.8H, v16.8H, v0.8H // ........................*............................. + // gap // ...................................................... + trn2 v20.2D, v14.2D, v17.2D // ..............*....................................... + // gap // ...................................................... + trn1 v1.2D, v3.2D, v27.2D // ..................*................................... + // gap // ...................................................... + sqrdmulh v13.8H, v20.8H, v0.8H // ...................*.................................. + // gap // ...................................................... + mls v19.8H, v2.8H, v7.H[0] // .........................*............................ + // gap // ...................................................... + mul v29.8H, v20.8H, v26.8H // ................*..................................... + // gap // ...................................................... + ldr q6, [x4, #-32] // ..............................*....................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v11.8H, v1.8H, v19.8H // ............................*......................... + // gap // ...................................................... + sub v18.8H, v1.8H, v19.8H // .............................*........................ + // gap // ...................................................... 
+ mls v29.8H, v13.8H, v7.H[0] // ......................*............................... + // gap // ...................................................... + mul v31.8H, v11.8H, v4.8H // ................................*..................... + // gap // ...................................................... + sqrdmulh v19.8H, v11.8H, v21.8H // ...............................*...................... + // gap // ...................................................... + mul v9.8H, v18.8H, v6.8H // ..................................*................... + // gap // ...................................................... + sqrdmulh v10.8H, v18.8H, v30.8H // .................................*.................... + // gap // ...................................................... + trn1 v15.2D, v14.2D, v17.2D // .....................*................................ + // gap // ...................................................... + mls v31.8H, v19.8H, v7.H[0] // ...................................*.................. + // gap // ...................................................... + add v1.8H, v15.8H, v29.8H // ...........................*.......................... + // gap // ...................................................... + mls v9.8H, v10.8H, v7.H[0] // ....................................*................. + // gap // ...................................................... + sub v0.8H, v15.8H, v29.8H // ..........................*........................... + // gap // ...................................................... + add v3.8H, v1.8H, v31.8H // ......................................*............... + // gap // ...................................................... + sub v4.8H, v1.8H, v31.8H // .....................................*................ + // gap // ...................................................... + sub v6.8H, v0.8H, v9.8H // .......................................*.............. 
+ // gap // ...................................................... + sqdmulh v12.8H, v3.8H, v7.H[1] // .........................................*............ + // gap // ...................................................... + sqdmulh v17.8H, v4.8H, v7.H[1] // ..........................................*........... + // gap // ...................................................... + add v5.8H, v0.8H, v9.8H // ........................................*............. + // gap // ...................................................... + sqdmulh v14.8H, v6.8H, v7.H[1] // ............................................*......... + // gap // ...................................................... + srshr v31.8H, v12.8H, #11 // .............................................*........ + // gap // ...................................................... + sqdmulh v13.8H, v5.8H, v7.H[1] // ...........................................*.......... + // gap // ...................................................... + srshr v28.8H, v17.8H, #11 // ..............................................*....... + // gap // ...................................................... + srshr v15.8H, v14.8H, #11 // ...................................................*.. + // gap // ...................................................... + mls v3.8H, v31.8H, v7.H[0] // ................................................*..... + // gap // ...................................................... + srshr v27.8H, v13.8H, #11 // ...............................................*...... + // gap // ...................................................... + mls v6.8H, v15.8H, v7.H[0] // ....................................................*. + // gap // ...................................................... + mls v4.8H, v28.8H, v7.H[0] // .................................................*.... + // gap // ...................................................... 
+ mls v5.8H, v27.8H, v7.H[0] // ..................................................*... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + st4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1], #64 // .....................................................* + // gap // ...................................................... + + // ------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|--- + // ldr q29, [x1, #0] // *..................................................... + // mls v25.8H, v6.8H, v7.H[0] // ..*................................................... + // mul v15.8H, v5.8H, v11.H[2] // .*.................................................... + // sub v31.8H, v29.8H, v1.8H // ....*................................................. + // sub v6.8H, v31.8H, v25.8H // .......*.............................................. + // add v31.8H, v31.8H, v25.8H // ........*............................................. + // mls v15.8H, v23.8H, v7.H[0] // .....*................................................ + // add v29.8H, v29.8H, v1.8H // ...*.................................................. + // add v16.8H, v29.8H, v15.8H // .........*............................................ + // sub v25.8H, v29.8H, v15.8H // ..........*........................................... + // trn1 v29.4S, v31.4S, v6.4S // ............*......................................... + // trn2 v6.4S, v31.4S, v6.4S // ...........*.......................................... 
+ // trn1 v31.4S, v16.4S, v25.4S // ................*..................................... + // trn2 v15.4S, v16.4S, v25.4S // .............*........................................ + // trn2 v10.2D, v31.2D, v29.2D // ...................*.................................. + // trn2 v12.2D, v15.2D, v6.2D // ...............*...................................... + // mul v19.8H, v10.8H, v26.8H // .......................*.............................. + // mul v26.8H, v12.8H, v26.8H // .................*.................................... + // trn1 v16.2D, v15.2D, v6.2D // ....................*................................. + // sqrdmulh v6.8H, v10.8H, v0.8H // .....................*................................ + // ldr q25, [x4, #-64] // ..............*....................................... + // trn1 v29.2D, v31.2D, v29.2D // ................................*..................... + // mls v19.8H, v6.8H, v7.H[0] // ...........................*.......................... + // ldr q6, [x4, #-48] // ......*............................................... + // sqrdmulh v31.8H, v12.8H, v0.8H // ..................*................................... + // mls v26.8H, v31.8H, v7.H[0] // ......................*............................... + // sub v31.8H, v29.8H, v19.8H // ....................................*................. + // add v29.8H, v29.8H, v19.8H // ..................................*................... + // add v8.8H, v16.8H, v26.8H // .........................*............................ + // sub v18.8H, v16.8H, v26.8H // ..........................*........................... + // ldr q26, [x4, #-32] // ........................*............................. + // sqrdmulh v6.8H, v8.8H, v6.8H // .............................*........................ + // mul v12.8H, v8.8H, v25.8H // ............................*......................... + // sqrdmulh v25.8H, v18.8H, v30.8H // ...............................*...................... 
+ // mul v26.8H, v18.8H, v26.8H // ..............................*....................... + // mls v12.8H, v6.8H, v7.H[0] // .................................*.................... + // mls v26.8H, v25.8H, v7.H[0] // ...................................*.................. + // sub v15.8H, v29.8H, v12.8H // ......................................*............... + // add v14.8H, v29.8H, v12.8H // .....................................*................ + // sub v17.8H, v31.8H, v26.8H // .......................................*.............. + // add v16.8H, v31.8H, v26.8H // ..........................................*........... + // sqdmulh v27.8H, v14.8H, v7.H[1] // ........................................*............. + // sqdmulh v29.8H, v15.8H, v7.H[1] // .........................................*............ + // sqdmulh v31.8H, v16.8H, v7.H[1] // .............................................*........ + // sqdmulh v19.8H, v17.8H, v7.H[1] // ...........................................*.......... + // srshr v18.8H, v27.8H, #11 // ............................................*......... + // srshr v29.8H, v29.8H, #11 // ..............................................*....... + // srshr v21.8H, v31.8H, #11 // .................................................*.... + // mls v14.8H, v18.8H, v7.H[0] // ................................................*..... + // mls v15.8H, v29.8H, v7.H[0] // ...................................................*.. + // mls v16.8H, v21.8H, v7.H[0] // ....................................................*. + // srshr v23.8H, v19.8H, #11 // ...............................................*...... + // mls v17.8H, v23.8H, v7.H[0] // ..................................................*... 
+ // st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // .....................................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a72.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a72.s index 13b9338d..619a4853 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_a72.s @@ -26,46 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold - umov \gpr_out\(), \vec_in\().d[\lane] -.endm - -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +49,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, 
consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +66,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -114,48 +74,48 @@ xtmp1 .req x11 .macro barrett_reduce a vqdmulhq t0, \a, consts, 1 - srshr t0.8H, t0.8H, #11 + srshr t0.8h, t0.8h, #11 vmlsq \a, t0, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 
\data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -166,7 +126,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -176,7 +136,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -184,7 +144,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -195,19 +155,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro 
push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -220,7 +180,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_load_opt_a72 - .global _ntt_kyber_123_4567_scalar_load_opt_a72 + .global _ntt_kyber_123_4567_scalar_load_opt_a72 .p2align 4 const_addr: .short 3329 @@ -346,1451 +306,1347 @@ _ntt_kyber_123_4567_scalar_load_opt_a72: load_roots_123 .p2align 2 - ldr x25, [x0, #256] // ....*................... - ldr x21, [x0, #192] // ..*..................... - // gap // ........................ - ldr x26, [x0, #320] // ......*................. - ldr x23, [x0, #448] // *....................... - // gap // ........................ - ldr x16, [x0, #200] // ...*.................... - ldr x20, [x0, #64] // .*...................... - // gap // ........................ - ldr x24, [x0, #264] // .......*................ - ldr x18, [x0, #328] // ........*............... - // gap // ........................ - vins v25, x21, 0 // .............*.......... - vins v26, x25, 0 // ..............*......... - ldr x21, [x0, #456] // .........*.............. - vins v18, x26, 0 // ...............*........ - vins v2, x23, 0 // .....*.................. - ldr x25, [x0, #384] // ..........*............. - ldr x26, [x0, #392] // ...........*............ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - vins v10, x25, 0 // .................*...... - // gap // ........................ - // gap // ........................ - vins v26, x24, 1 // ................*....... - // gap // ........................ - // gap // ........................ - vins v2, x21, 1 // ............*...........
- // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - vins v10, x26, 1 // ..................*..... - // gap // ........................ - // gap // ........................ - sqrdmulh v6.8H, v26.8H, v0.H[1] // .....................*.. - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v16.8H, v2.8H, v0.H[1] // ....................*... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mul v15.8H, v2.8H, v0.H[0] // ...................*.... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v4.8H, v10.8H, v0.H[1] // ......................*. - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v15.8H, v16.8H, v7.H[0] // .......................* - // gap // ........................ - // gap // ........................ - - // original source code - // ldr x24, [x0, #448] // ...*.................... || .*....................... - // ldr x20, [x0, #64] // .....*.................. || ..*...................... - // ldr x21, [x0, #192] // .*...................... || *........................ - // ldr x16, [x0, #200] // ....*................... || ..*...................... 
- // ldr x26, [x0, #256] // *....................... || *........................ - // vins v6, x24, 0 // ............*........... || .....*................... - // ldr x24, [x0, #320] // ..*..................... || .*....................... - // ldr x9, [x0, #264] // ......*................. || ...*..................... - // ldr x18, [x0, #328] // .......*................ || ...*..................... - // ldr x23, [x0, #456] // ..........*............. || ....*.................... - // ldr x12, [x0, #384] // .............*.......... || .....*................... - // ldr x14, [x0, #392] // ..............*......... || ......*.................. - // vins v6, x23, 1 // .................*...... || ...........*............. - // vins v25, x21, 0 // ........*............... || ....*.................... - // vins v26, x26, 0 // .........*.............. || ....*.................... - // vins v18, x24, 0 // ...........*............ || .....*................... - // vins v26, x9, 1 // ................*....... || ..........*.............. - // vins v10, x12, 0 // ...............*........ || .........*............... - // vins v10, x14, 1 // ..................*..... || ...............*......... - // mul v15.8H, v6.8H, v0.H[0] // .....................*.. || ....................*.... - // sqrdmulh v13.8H, v6.8H, v0.H[1] // ....................*... || ..................*...... - // sqrdmulh v6.8H, v26.8H, v0.H[1] // ...................*.... || ................*........ - // sqrdmulh v4.8H, v10.8H, v0.H[1] // ......................*. || ......................*.. - // mls v15.8H, v13.8H, v7.H[0] // .......................* || ........................* - + // Instructions: 6 + // Expected cycles: 11 + // Expected IPC: 0.55 + // + // Cycle bound: 11.0 + // IPC bound: 0.55 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q15, [x0, #448] // *............................. 
+ ldr q6, [x0, #384] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v16.8H, v15.8H, v0.H[1] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v15.8H, v15.8H, v0.H[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v14.8H, v6.8H, v0.H[1] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v15.8H, v16.8H, v7.H[0] // .....*........................ + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q19, [x0, #448] // *.............................. + // ldr q6, [x0, #384] // .*............................. + // sqrdmulh v16.8H, v19.8H, v0.H[1] // ..*............................ + // sqrdmulh v14.8H, v6.8H, v0.H[1] // ....*.......................... + // mul v15.8H, v19.8H, v0.H[0] // ...*........................... + // mls v15.8H, v16.8H, v7.H[0] // .....*......................... 
+ sub count, count, #1 -.p2align 2 layer123_start: - ldr x21, [x0, #0] // *................................................................................................... - vins v2, x20, 0 // ......*............................................................................................. - ldr x25, [x0, #8] // .*.................................................................................................. - mul v16.8H, v26.8H, v0.H[0] // ................................*................................................................... - ldr x26, [x0, #128] // ........*........................................................................................... - ldr x23, [x0, #72] // .....*.............................................................................................. - ldr x24, [x0, #464] // ............................e....................................................................... - vins v25, x16, 1 // ...............*.................................................................................... - ldr x27, [x0, #136] // .........*.......................................................................................... - mls v16.8H, v6.8H, v7.H[0] // ..................................*................................................................. - vins v18, x18, 1 // .......................*............................................................................ - ldr x20, [x0, #80] // ....e............................................................................................... - vins v11, x21, 0 // ..*................................................................................................. - ldr x21, [x0, #208] // ............e....................................................................................... - ldr x16, [x0, #216] // .............e...................................................................................... 
- vins v13, x26, 0 // ..........*......................................................................................... - mul v29.8H, v10.8H, v0.H[0] // ..........................................*......................................................... - ldr x26, [x0, #272] // ................e................................................................................... - vins v6, x24, 0 // ..............................e..................................................................... - ldr x24, [x0, #336] // ....................e............................................................................... - ldr x9, [x0, #280] // .................e.................................................................................. - vins v2, x23, 1 // .......*............................................................................................ - mls v29.8H, v4.8H, v7.H[0] // ............................................*....................................................... - ldr x18, [x0, #344] // .....................e.............................................................................. - ldr x23, [x0, #472] // .............................e...................................................................... - ldr x12, [x0, #400] // ........................e........................................................................... - sub v4.8H, v25.8H, v15.8H // ..................................................*................................................. - add v15.8H, v25.8H, v15.8H // ...................................................*................................................ - mul v5.8H, v18.8H, v0.H[0] // .....................................*.............................................................. - ldr x14, [x0, #408] // .........................e.......................................................................... 
- vins v11, x25, 1 // ...*................................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v13, x27, 1 // ...........*........................................................................................ - sqrdmulh v10.8H, v18.8H, v0.H[1] // ......................................*............................................................. - // gap // .................................................................................................... - vins v6, x23, 1 // ...............................e.................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v25, x21, 0 // ..............e..................................................................................... - mul v21.8H, v15.8H, v0.H[2] // .........................................................*.......................................... - // gap // .................................................................................................... - vins v26, x26, 0 // ..................e................................................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v18, x24, 0 // ......................e............................................................................. 
- sqrdmulh v15.8H, v15.8H, v0.H[3] // ..........................................................*......................................... - // gap // .................................................................................................... - sub v8.8H, v11.8H, v16.8H // ...................................*................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v16.8H, v11.8H, v16.8H // ....................................*............................................................... - mls v5.8H, v10.8H, v7.H[0] // .......................................*............................................................ - // gap // .................................................................................................... - sub v11.8H, v13.8H, v29.8H // .............................................*...................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v13.8H, v13.8H, v29.8H // ..............................................*..................................................... - mul v29.8H, v4.8H, v0.H[4] // ...................................................................*................................ - // gap // .................................................................................................... - vins v26, x9, 1 // ...................e................................................................................ - // gap // .................................................................................................... 
- // gap // .................................................................................................... - vins v10, x12, 0 // ..........................e......................................................................... - mls v21.8H, v15.8H, v7.H[0] // ...........................................................*........................................ - // gap // .................................................................................................... - sub v15.8H, v2.8H, v5.8H // ........................................*........................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v2.8H, v2.8H, v5.8H // .........................................*.......................................................... - mul v5.8H, v13.8H, v0.H[2] // ....................................................*............................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v13.8H, v13.8H, v0.H[3] // .....................................................*.............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... 
- sub v30.8H, v2.8H, v21.8H // ............................................................*....................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v4.8H, v4.8H, v0.H[5] // ....................................................................*............................... - add v2.8H, v2.8H, v21.8H // .............................................................*...................................... - // gap // .................................................................................................... - vins v10, x14, 1 // ...........................e........................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v21.8H, v11.8H, v0.H[4] // ..............................................................*..................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v5.8H, v13.8H, v7.H[0] // ......................................................*............................................. 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v11.8H, v11.8H, v0.H[5] // ...............................................................*.................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v29.8H, v4.8H, v7.H[0] // .....................................................................*.............................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v13.8H, v16.8H, v5.8H // .......................................................*............................................ - // gap // .................................................................................................... - // gap // .................................................................................................... 
- add v16.8H, v16.8H, v5.8H // ........................................................*........................................... - mul v4.8H, v2.8H, v0.H[6] // ........................................................................*........................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v21.8H, v11.8H, v7.H[0] // ................................................................*................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v11.8H, v15.8H, v29.8H // ......................................................................*............................. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v29.8H, v15.8H, v29.8H // .......................................................................*............................ - sqrdmulh v2.8H, v2.8H, v0.H[7] // .........................................................................*.......................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - mul v15.8H, v30.8H, v1.H[0] // .............................................................................*...................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v5.8H, v8.8H, v21.8H // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v21.8H, v8.8H, v21.8H // ..................................................................*................................. - sqrdmulh v8.8H, v30.8H, v1.H[1] // ..............................................................................*..................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v4.8H, v2.8H, v7.H[0] // ..........................................................................*......................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v2.8H, v29.8H, v1.H[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v15.8H, v8.8H, v7.H[0] // ...............................................................................*.................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v8.8H, v16.8H, v4.8H // ...........................................................................*........................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v16.8H, v16.8H, v4.8H // ............................................................................*....................... 
- sqrdmulh v29.8H, v29.8H, v1.H[3] // ...................................................................................*................ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - str_vo v8, x0, 64 // .............................................................................................*...... - sqrdmulh v4.8H, v11.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - str_vi v16, x0, 16 // ............................................................................................*....... - sub v16.8H, v13.8H, v15.8H // ................................................................................*................... - // gap // .................................................................................................... - add v13.8H, v13.8H, v15.8H // .................................................................................*.................. - mul v11.8H, v11.8H, v1.H[4] // .......................................................................................*............ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - mls v2.8H, v29.8H, v7.H[0] // ....................................................................................*............... - str_vo v16, x0, 176 // ...............................................................................................*.... - // gap // .................................................................................................... - str_vo v13, x0, 112 // ..............................................................................................*..... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v11.8H, v4.8H, v7.H[0] // .........................................................................................*.......... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v15.8H, v6.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- sub v16.8H, v21.8H, v2.8H // .....................................................................................*.............. - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v13.8H, v6.8H, v0.H[1] // ................................................e................................................... - add v2.8H, v21.8H, v2.8H // ......................................................................................*............. - // gap // .................................................................................................... - sub v29.8H, v5.8H, v11.8H // ..........................................................................................*......... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v11.8H, v5.8H, v11.8H // ...........................................................................................*........ - str_vo v16, x0, 304 // .................................................................................................*.. - sqrdmulh v6.8H, v26.8H, v0.H[1] // .................................e.................................................................. - str_vo v2, x0, 240 // ................................................................................................*... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- str_vo v29, x0, 432 // ...................................................................................................* - sqrdmulh v4.8H, v10.8H, v0.H[1] // ...........................................e........................................................ - // gap // .................................................................................................... - str_vo v11, x0, 368 // ..................................................................................................*. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v15.8H, v13.8H, v7.H[0] // .................................................e.................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ..............................................................................................*.................................................................................................. || ......................................................................*...................................................................... - // ldr x11, [x0, #8] // ................................................................................................*................................................................................................ || ......................................................................*...................................................................... 
- // vins v8, x10, 0 // ..........................................................................................................*...................................................................................... || ..........................................................................*.................................................................. - // vins v8, x11, 1 // ............................................................................................................................*.................................................................... || ................................................................................*............................................................ - // ldr x10, [x0, #64] // .....e........................................................................................................................................................................................... || .e........................................................................................................................................... - // ldr x11, [x0, #72] // ...................................................................................................*............................................................................................. || .......................................................................*..................................................................... - // vins v9, x10, 0 // ...............................................................................................*................................................................................................. || ......................................................................*...................................................................... 
- // vins v9, x11, 1 // ...................................................................................................................*............................................................................. || .............................................................................*............................................................... - // ldr x10, [x0, #128] // ..................................................................................................*.............................................................................................. || .......................................................................*..................................................................... - // ldr x11, [x0, #136] // ......................................................................................................*.......................................................................................... || ........................................................................*.................................................................... - // vins v10, x10, 0 // .............................................................................................................*................................................................................... || ...........................................................................*................................................................. - // vins v10, x11, 1 // .............................................................................................................................*................................................................... || .................................................................................*........................................................... 
- // ldr x10, [x0, #192] // .......e......................................................................................................................................................................................... || ..e.......................................................................................................................................... - // ldr x11, [x0, #200] // ........e........................................................................................................................................................................................ || ..e.......................................................................................................................................... - // vins v11, x10, 0 // ............................e.................................................................................................................................................................... || ...........e................................................................................................................................. - // vins v11, x11, 1 // .....................................................................................................*........................................................................................... || ........................................................................*.................................................................... - // ldr x10, [x0, #256] // ...........e..................................................................................................................................................................................... || ...e......................................................................................................................................... 
- // ldr x11, [x0, #264] // ..............e.................................................................................................................................................................................. || ....e........................................................................................................................................ - // vins v12, x10, 0 // ..............................e.................................................................................................................................................................. || ............e................................................................................................................................ - // vins v12, x11, 1 // .......................................e......................................................................................................................................................... || ..................e.......................................................................................................................... - // ldr x10, [x0, #320] // .............e................................................................................................................................................................................... || ....e........................................................................................................................................ - // ldr x11, [x0, #328] // .................e............................................................................................................................................................................... || .....e....................................................................................................................................... 
- // vins v13, x10, 0 // ...............................e................................................................................................................................................................. || .............e............................................................................................................................... - // vins v13, x11, 1 // ........................................................................................................*........................................................................................ || .........................................................................*................................................................... - // ldr x10, [x0, #384] // ...................e............................................................................................................................................................................. || ......e...................................................................................................................................... - // ldr x11, [x0, #392] // .......................e......................................................................................................................................................................... || .......e..................................................................................................................................... - // vins v14, x10, 0 // ........................................e........................................................................................................................................................ || ...................e......................................................................................................................... 
- // vins v14, x11, 1 // .................................................e............................................................................................................................................... || ..........................e.................................................................................................................. - // ldr x10, [x0, #448] // e................................................................................................................................................................................................ || e............................................................................................................................................ - // ldr x11, [x0, #456] // ..................e.............................................................................................................................................................................. || ......e...................................................................................................................................... - // vins v15, x10, 0 // ............e.................................................................................................................................................................................... || ....e........................................................................................................................................ - // vins v15, x11, 1 // ...........................e..................................................................................................................................................................... || ..........e.................................................................................................................................. 
- // mul v24.8H, v12.8H, v0.H[0] // .................................................................................................*............................................................................................... || .......................................................................*..................................................................... - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ........................................................................................e........................................................................................................ || .................................................................e........................................................................... - // mls v24.8H, v12.8H, v7.H[0] // .......................................................................................................*......................................................................................... || .........................................................................*................................................................... - // sub v12.8H, v8.8H, v24.8H // .....................................................................................................................................*........................................................... || ......................................................................................*...................................................... - // add v8.8H, v8.8H, v24.8H // ......................................................................................................................................*.......................................................... || .......................................................................................*..................................................... 
- // mul v24.8H, v13.8H, v0.H[0] // ..........................................................................................................................*...................................................................... || ...............................................................................*............................................................. - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ..............................................................................................................................*.................................................................. || .................................................................................*........................................................... - // mls v24.8H, v13.8H, v7.H[0] // .......................................................................................................................................*......................................................... || .......................................................................................*..................................................... - // sub v13.8H, v9.8H, v24.8H // ..............................................................................................................................................*.................................................. || ............................................................................................*................................................ - // add v9.8H, v9.8H, v24.8H // ...............................................................................................................................................*................................................. || .............................................................................................*............................................... 
- // mul v24.8H, v14.8H, v0.H[0] // ..............................................................................................................*.................................................................................. || ...........................................................................*................................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ...........................................................................................e..................................................................................................... || ...................................................................e......................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ....................................................................................................................*............................................................................ || .............................................................................*............................................................... - // sub v14.8H, v10.8H, v24.8H // ........................................................................................................................................*........................................................ || ........................................................................................*.................................................... - // add v10.8H, v10.8H, v24.8H // .........................................................................................................................................*....................................................... || .........................................................................................*................................................... 
- // mul v24.8H, v15.8H, v0.H[0] // .................................................................................e............................................................................................................... || .............................................................e............................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ...................................................................................e............................................................................................................. || ...............................................................e............................................................................. - // mls v24.8H, v15.8H, v7.H[0] // .............................................................................................e................................................................................................... || .....................................................................e....................................................................... - // sub v15.8H, v11.8H, v24.8H // ........................................................................................................................*........................................................................ || ..............................................................................*.............................................................. - // add v11.8H, v11.8H, v24.8H // .........................................................................................................................*....................................................................... || ...............................................................................*............................................................. 
- // mul v24.8H, v10.8H, v0.H[2] // ................................................................................................................................................*................................................ || .............................................................................................*............................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .................................................................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.8H, v10.8H, v7.H[0] // .......................................................................................................................................................*......................................... || .....................................................................................................*....................................... - // sub v10.8H, v8.8H, v24.8H // ..........................................................................................................................................................*...................................... || ..........................................................................................................*.................................. - // add v8.8H, v8.8H, v24.8H // ...........................................................................................................................................................*..................................... || ...........................................................................................................*................................. 
- // mul v24.8H, v11.8H, v0.H[2] // .................................................................................................................................*............................................................... || ...................................................................................*......................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // ....................................................................................................................................*............................................................ || .....................................................................................*....................................................... - // mls v24.8H, v11.8H, v7.H[0] // .............................................................................................................................................*................................................... || ...........................................................................................*................................................. - // sub v11.8H, v9.8H, v24.8H // ..................................................................................................................................................*.............................................. || ................................................................................................*............................................ - // add v9.8H, v9.8H, v24.8H // ....................................................................................................................................................*............................................ || .................................................................................................*........................................... 
- // mul v24.8H, v14.8H, v0.H[4] // ......................................................................................................................................................*.......................................... || ...................................................................................................*......................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ........................................................................................................................................................*........................................ || .......................................................................................................*..................................... - // mls v24.8H, v14.8H, v7.H[0] // .............................................................................................................................................................*................................... || .............................................................................................................*............................... - // sub v14.8H, v12.8H, v24.8H // ..................................................................................................................................................................*.............................. || ..................................................................................................................*.......................... - // add v12.8H, v12.8H, v24.8H // ...................................................................................................................................................................*............................. || ...................................................................................................................*......................... 
- // mul v24.8H, v15.8H, v0.H[4] // ..........................................................................................................................................*...................................................... || .........................................................................................*................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ...................................................................................................................................................*............................................. || .................................................................................................*........................................... - // mls v24.8H, v15.8H, v7.H[0] // .........................................................................................................................................................*....................................... || .........................................................................................................*................................... - // sub v15.8H, v13.8H, v24.8H // ..............................................................................................................................................................*.................................. || ..............................................................................................................*.............................. - // add v13.8H, v13.8H, v24.8H // ...............................................................................................................................................................*................................. || ...............................................................................................................*............................. 
- // mul v24.8H, v9.8H, v0.H[6] // ............................................................................................................................................................*.................................... || ...........................................................................................................*................................. - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ................................................................................................................................................................*................................ || ...............................................................................................................*............................. - // mls v24.8H, v9.8H, v7.H[0] // .....................................................................................................................................................................*........................... || .....................................................................................................................*....................... - // sub v9.8H, v8.8H, v24.8H // ........................................................................................................................................................................*........................ || ..........................................................................................................................*.................. - // add v8.8H, v8.8H, v24.8H // .........................................................................................................................................................................*....................... || ...........................................................................................................................*................. 
- // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................................................................*............................... || .................................................................................................................*........................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ....................................................................................................................................................................*............................ || ...................................................................................................................*......................... - // mls v24.8H, v11.8H, v7.H[0] // .......................................................................................................................................................................*......................... || .........................................................................................................................*................... - // sub v11.8H, v10.8H, v24.8H // ..............................................................................................................................................................................*.................. || ..............................................................................................................................*.............. - // add v10.8H, v10.8H, v24.8H // ...............................................................................................................................................................................*................. || ...............................................................................................................................*............. 
- // mul v24.8H, v13.8H, v1.H[2] // ......................................................................................................................................................................*.......................... || .......................................................................................................................*..................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // ..........................................................................................................................................................................*...................... || ...........................................................................................................................*................. - // mls v24.8H, v13.8H, v7.H[0] // .................................................................................................................................................................................*............... || .................................................................................................................................*........... - // sub v13.8H, v12.8H, v24.8H // ......................................................................................................................................................................................*.......... || ......................................................................................................................................*...... - // add v12.8H, v12.8H, v24.8H // ........................................................................................................................................................................................*........ || .......................................................................................................................................*..... 
- // mul v24.8H, v15.8H, v1.H[4] // ................................................................................................................................................................................*................ || ...............................................................................................................................*............. - // sqrdmulh v15.8H, v15.8H, v1.H[5] // ............................................................................................................................................................................*.................... || .............................................................................................................................*............... - // mls v24.8H, v15.8H, v7.H[0] // ....................................................................................................................................................................................*............ || ...................................................................................................................................*......... - // sub v15.8H, v14.8H, v24.8H // .........................................................................................................................................................................................*....... || ........................................................................................................................................*.... - // add v14.8H, v14.8H, v24.8H // ..........................................................................................................................................................................................*...... || .........................................................................................................................................*... 
- // str_vi v8, x0, 16 // .............................................................................................................................................................................*................... || ..............................................................................................................................*.............. - // str_vo v9, x0, 48 // ...........................................................................................................................................................................*..................... || .............................................................................................................................*............... - // str_vo v10, x0, 112 // ...................................................................................................................................................................................*............. || ..................................................................................................................................*.......... - // str_vo v11, x0, 176 // ..................................................................................................................................................................................*.............. || .................................................................................................................................*........... - // str_vo v12, x0, 240 // .............................................................................................................................................................................................*... || ..........................................................................................................................................*.. 
- // str_vo v13, x0, 304 // ...........................................................................................................................................................................................*..... || .........................................................................................................................................*... - // str_vo v14, x0, 368 // ................................................................................................................................................................................................* || ............................................................................................................................................* - // str_vo v15, x0, 432 // ..............................................................................................................................................................................................*.. || ...........................................................................................................................................*. - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 29.38s + // User time: 29.38s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q12, [x0, #64] // .*.......................................................................... + ldr q22, [x0, #320] // .....*...................................................................... + mul v27.8H, v6.8H, v0.H[0] // ...................*........................................................ + ldr q11, [x0, #192] // ...*........................................................................ + ldr q19, [x0, #464] // .......e.................................................................... 
+ // gap // ............................................................................ + mls v27.8H, v14.8H, v7.H[0] // ....................*....................................................... + ldr q29, [x0, #256] // ....*....................................................................... + ldr q5, [x0, #128] // ..*......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.8H, v22.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v26.8H, v11.8H, v15.8H // ...........................*................................................ + sqrdmulh v28.8H, v22.8H, v0.H[1] // .............*.............................................................. + sub v11.8H, v11.8H, v15.8H // ..........................*................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sqrdmulh v25.8H, v29.8H, v0.H[1] // ........*................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v16.8H, v11.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.8H, v28.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v15.8H, v11.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v23.8H, v5.8H, v27.8H // .....................*...................................................... + mls v15.8H, v16.8H, v7.H[0] // .............................................*.............................. + add v16.8H, v5.8H, v27.8H // ......................*..................................................... + // gap // ............................................................................ + add v17.8H, v12.8H, v13.8H // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v21.8H, v23.8H, v0.H[4] // .......................................*.................................... + sub v31.8H, v12.8H, v13.8H // ................*........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v4.8H, v26.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.8H, v31.8H, v15.8H // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ 
+ add v15.8H, v31.8H, v15.8H // ...............................................*............................ + // gap // ............................................................................ + sqrdmulh v6.8H, v16.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v28.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v3.8H, v15.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.8H, v15.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v31.8H, v29.8H, v0.H[0] // .........*.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v10.8H, v16.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v16.8H, v26.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + ldr q30, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v3.8H, v15.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.8H, v23.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v4.8H, v16.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v31.8H, v25.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v21.8H, v15.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v17.8H, v4.8H // ....................................*....................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v9.8H, v17.8H, v4.8H // .....................................*...................................... + mul v15.8H, v28.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v17.8H, v30.8H, v31.8H // ...........*................................................................ + mls v15.8H, v22.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v10.8H, v6.8H, v7.H[0] // ..............................*............................................. + sub v20.8H, v17.8H, v21.8H // .........................................*.................................. + ldr q6, [x0, #400] // ......e..................................................................... + add v14.8H, v17.8H, v21.8H // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v21.8H, v30.8H, v31.8H // ............*............................................................... + // gap // ............................................................................ + sqrdmulh v24.8H, v9.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.8H, v20.8H, v15.8H // ..................................................................*......... + mul v5.8H, v9.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + sub v9.8H, v14.8H, v3.8H // .............................................................*.............. + sub v27.8H, v21.8H, v10.8H // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v16.8H, v21.8H, v10.8H // ................................*........................................... 
+ str q28, [x0, #448] // ...........................................................................* + sqrdmulh v28.8H, v26.8H, v1.H[1] // .....................................................*...................... + add v30.8H, v14.8H, v3.8H // ..............................................................*............. + str q9, [x0, #320] // .........................................................................*.. + // gap // ............................................................................ + mls v5.8H, v24.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v31.8H, v20.8H, v15.8H // ...................................................................*........ + mul v15.8H, v26.8H, v1.H[0] // ......................................................*..................... + str q30, [x0, #256] // ........................................................................*... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v15.8H, v28.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q31, [x0, #384] // ..........................................................................*. + // gap // ............................................................................ + sub v20.8H, v16.8H, v5.8H // ...................................................*........................ + add v2.8H, v16.8H, v5.8H // ....................................................*....................... + sqrdmulh v16.8H, v19.8H, v0.H[1] // .......................e.................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q20, [x0, #64] // .....................................................................*...... + sqrdmulh v14.8H, v6.8H, v0.H[1] // ..................e......................................................... + // gap // ............................................................................ + str q2, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + add v28.8H, v27.8H, v15.8H // .........................................................*.................. + sub v21.8H, v27.8H, v15.8H // ........................................................*................... + mul v15.8H, v19.8H, v0.H[0] // ........................e................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v15.8H, v16.8H, v7.H[0] // .........................e.................................................. + str q28, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + str q21, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + // gap // ............................................................................ + + // ------------------------------------------------------------------ new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // ............................~...........................................'...............................*........................................... + // ldr q9, [x0, #(1*(512/8))] // ........................................................................*........................................................................... + // ldr q10, [x0, #(2*(512/8))] // ...~....................................................................'......*.................................................................... + // ldr q11, [x0, #(3*(512/8))] // ........................................................................'..*........................................................................ + // ldr q12, [x0, #(4*(512/8))] // ..~.....................................................................'.....*..................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // ........................................................................'*.......................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........................................e..............................'............................................~.............................. + // ldr q15, [x0, #(7*(512/8))] // e.......................................................................'...~....................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ........~...............................................................'...........*............................................................... + // mul v24.8h, v12.8h, v0.h[0] // .........................~..............................................'............................*.............................................. + // mls v24.8h, v27.8h, v7.h[0] // ................................~.......................................'...................................*....................................... + // sub v12.8h, v8.8h, v24.8h // .....................................~..................................'........................................*.................................. + // add v8.8h, v8.8h, v24.8h // ...........................................~............................'..............................................*............................ + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ......~.................................................................'.........*................................................................. + // mul v24.8h, v13.8h, v0.h[0] // ....~...................................................................'.......*................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..........~.............................................................'.............*............................................................. + // sub v13.8h, v9.8h, v24.8h // .................~......................................................'....................*...................................................... + // add v9.8h, v9.8h, v24.8h // ...............~........................................................'..................*........................................................ + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ................................................................e.......'...................................................................~....... + // mul v24.8h, v14.8h, v0.h[0] // ........................................................................'.*......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .~......................................................................'....*...................................................................... + // sub v14.8h, v10.8h, v24.8h // ............~...........................................................'...............*........................................................... + // add v10.8h, v10.8h, v24.8h // ..............~.........................................................'.................*......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ..............................................................e.........'.................................................................~......... + // mul v24.8h, v15.8h, v0.h[0] // ....................................................................e...'.......................................................................~... 
+ // mls v24.8h, v27.8h, v7.h[0] // .....................................................................e..'........................................................................~.. + // sub v15.8h, v11.8h, v24.8h // .......~................................................................'..........*................................................................ + // add v11.8h, v11.8h, v24.8h // .....~..................................................................'........*.................................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....................~..................................................'........................*.................................................. + // mul v24.8h, v10.8h, v0.h[2] // ..........................~.............................................'.............................*............................................. + // mls v24.8h, v27.8h, v7.h[0] // .......................................~................................'..........................................*................................ + // sub v10.8h, v8.8h, v24.8h // ................................................~.......................'...................................................*....................... + // add v8.8h, v8.8h, v24.8h // .................................................~......................'....................................................*...................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ...........................~............................................'..............................*............................................ + // mul v24.8h, v11.8h, v0.h[2] // ..................~.....................................................'.....................*..................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ...............................~........................................'..................................*........................................ + // sub v11.8h, v9.8h, v24.8h // ..................................~.....................................'.....................................*..................................... + // add v9.8h, v9.8h, v24.8h // ...................................~....................................'......................................*.................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ..............................~.........................................'.................................*......................................... + // mul v24.8h, v14.8h, v0.h[4] // ................~.......................................................'...................*....................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................~......................................'....................................*...................................... + // sub v14.8h, v12.8h, v24.8h // ........................................~...............................'...........................................*............................... + // add v12.8h, v12.8h, v24.8h // ..........................................~.............................'.............................................*............................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .........~..............................................................'............*.............................................................. + // mul v24.8h, v15.8h, v0.h[4] // ...........~............................................................'..............*............................................................ 
+ // mls v24.8h, v27.8h, v7.h[0] // .............~..........................................................'................*.......................................................... + // sub v15.8h, v13.8h, v24.8h // ...................~....................................................'......................*.................................................... + // add v13.8h, v13.8h, v24.8h // ....................~...................................................'.......................*................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ............................................~...........................'...............................................*........................... + // mul v24.8h, v9.8h, v0.h[6] // ..............................................~.........................'.................................................*......................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................~.................'.........................................................*................. + // sub v9.8h, v8.8h, v24.8h // ............................................................~...........'...............................................................*........... + // add v8.8h, v8.8h, v24.8h // .............................................................~..........'................................................................*.......... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ...................................................~....................'......................................................*.................... + // mul v24.8h, v11.8h, v1.h[0] // ........................................................~...............'...........................................................*............... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..........................................................~.............'.............................................................*............. + // sub v11.8h, v10.8h, v24.8h // ...................................................................~....'......................................................................*.... + // add v10.8h, v10.8h, v24.8h // ..................................................................~.....'.....................................................................*..... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ........................~...............................................'...........................*............................................... + // mul v24.8h, v13.8h, v1.h[2] // .......................~................................................'..........................*................................................ + // mls v24.8h, v27.8h, v7.h[0] // .............................~..........................................'................................*.......................................... + // sub v13.8h, v12.8h, v24.8h // ...............................................~........................'..................................................*........................ + // add v12.8h, v12.8h, v24.8h // ....................................................~...................'.......................................................*................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ......................~.................................................'.........................*................................................. + // mul v24.8h, v15.8h, v1.h[4] // ....................................~...................................'.......................................*................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ......................................~.................................'.........................................*................................. + // sub v15.8h, v14.8h, v24.8h // .............................................~..........................'................................................*.......................... + // add v14.8h, v14.8h, v24.8h // .......................................................~................'..........................................................*................ + // str q8, [x0], #(16) // .................................................................~......'....................................................................*...... + // str q9, [x0, #(-16 + 1*(512/8))] // ...............................................................~........'..................................................................*........ + // str q10, [x0, #(-16 + 2*(512/8))] // ......................................................................~.'.........................................................................*. + // str q11, [x0, #(-16 + 3*(512/8))] // .......................................................................~'..........................................................................* + // str q12, [x0, #(-16 + 4*(512/8))] // .........................................................~..............'............................................................*.............. + // str q13, [x0, #(-16 + 5*(512/8))] // .....................................................~..................'........................................................*.................. + // str q14, [x0, #(-16 + 6*(512/8))] // ...........................................................~............'..............................................................*............ 
+ // str q15, [x0, #(-16 + 7*(512/8))] // ..................................................~.....................'.....................................................*..................... + + sub count, count, #1 cbnz count, layer123_start - vins v2, x20, 0 // .*.......................................................................... - mul v16.8H, v26.8H, v0.H[0] // ...*........................................................................ - ldr x21, [x0, #0] // *........................................................................... - vins v25, x16, 1 // ......*..................................................................... - ldr x25, [x0, #128] // ....*....................................................................... - ldr x26, [x0, #8] // ..*......................................................................... - mls v16.8H, v6.8H, v7.H[0] // ........*................................................................... - vins v18, x18, 1 // .........*.................................................................. - ldr x23, [x0, #72] // .....*...................................................................... - ldr x24, [x0, #136] // .......*.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - mul v11.8H, v10.8H, v0.H[0] // ............*............................................................... - vins v13, x21, 0 // ..........*................................................................. - // gap // ............................................................................ - vins v29, x25, 0 // ...........*................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - mls v11.8H, v4.8H, v7.H[0] // ..............*............................................................. - vins v2, x23, 1 // .............*.............................................................. - // gap // ............................................................................ - sub v6.8H, v25.8H, v15.8H // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v25.8H, v25.8H, v15.8H // ................*........................................................... - mul v4.8H, v18.8H, v0.H[0] // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - vins v13, x26, 1 // ..................*......................................................... - sqrdmulh v18.8H, v18.8H, v0.H[1] // ....................*....................................................... - // gap // ............................................................................ - vins v29, x24, 1 // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v15.8H, v25.8H, v0.H[2] // .....................*...................................................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.8H, v25.8H, v0.H[3] // ......................*..................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v4.8H, v18.8H, v7.H[0] // .........................*.................................................. - sub v18.8H, v13.8H, v16.8H // .......................*.................................................... - // gap // ............................................................................ - add v16.8H, v13.8H, v16.8H // ........................*................................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v13.8H, v29.8H, v11.8H // ..........................*................................................. - mul v5.8H, v6.8H, v0.H[4] // ............................*............................................... - // gap // ............................................................................ 
- add v11.8H, v29.8H, v11.8H // ...........................*................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v15.8H, v25.8H, v7.H[0] // .............................*.............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v25.8H, v2.8H, v4.8H // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v2.8H, v2.8H, v4.8H // ...............................*............................................ - sqrdmulh v29.8H, v6.8H, v0.H[5] // ...................................*........................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v6.8H, v11.8H, v0.H[2] // ................................*........................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v4.8H, v2.8H, v15.8H // ..................................*......................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - sqrdmulh v11.8H, v11.8H, v0.H[3] // .................................*.......................................... - add v2.8H, v2.8H, v15.8H // ....................................*....................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v15.8H, v13.8H, v0.H[4] // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v13.8H, v13.8H, v0.H[5] // .......................................*.................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v6.8H, v11.8H, v7.H[0] // ......................................*..................................... - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v29.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v11.8H, v2.8H, v0.H[6] // ...........................................*................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v29.8H, v16.8H, v6.8H // .........................................*.................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v16.8H, v16.8H, v6.8H // ..........................................*................................. - mls v15.8H, v13.8H, v7.H[0] // ............................................*............................... - // gap // ............................................................................ - sub v13.8H, v25.8H, v5.8H // .............................................*.............................. - // gap // ............................................................................ 
- // gap // ............................................................................ - add v25.8H, v25.8H, v5.8H // ..............................................*............................. - sqrdmulh v2.8H, v2.8H, v0.H[7] // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v6.8H, v4.8H, v1.H[0] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v5.8H, v18.8H, v15.8H // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - add v18.8H, v18.8H, v15.8H // ..................................................*......................... - sqrdmulh v4.8H, v4.8H, v1.H[1] // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v11.8H, v2.8H, v7.H[0] // ....................................................*....................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v2.8H, v25.8H, v1.H[2] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v6.8H, v4.8H, v7.H[0] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - add v4.8H, v16.8H, v11.8H // ........................................................*................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v25.8H, v25.8H, v1.H[3] // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - str_vi v4, x0, 16 // ............................................................*............... - sqrdmulh v4.8H, v13.8H, v1.H[5] // ...........................................................*................ - // gap // ............................................................................ - sub v15.8H, v29.8H, v6.8H // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v29.8H, v29.8H, v6.8H // ..............................................................*............. - mul v13.8H, v13.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v2.8H, v25.8H, v7.H[0] // ................................................................*........... - str_vo v15, x0, 176 // .................................................................*.......... - // gap // ............................................................................ - str_vo v29, x0, 112 // ..................................................................*......... - // gap // ............................................................................ - // gap // ............................................................................ 
- mls v13.8H, v4.8H, v7.H[0] // ...................................................................*........ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v16.8H, v16.8H, v11.8H // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v25.8H, v18.8H, v2.8H // ....................................................................*....... - add v2.8H, v18.8H, v2.8H // .....................................................................*...... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v18.8H, v5.8H, v13.8H // ......................................................................*..... - add v11.8H, v5.8H, v13.8H // .......................................................................*.... - str_vo v16, x0, 48 // ..........................................................*................. - str_vo v25, x0, 304 // ........................................................................*... - str_vo v2, x0, 240 // .........................................................................*.. 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v18, x0, 432 // ..........................................................................*. - str_vo v11, x0, 368 // ...........................................................................* - // gap // ............................................................................ - - // original source code - // ldr x21, [x0, #0] // ..*......................................................................... || *.................................................................... - // vins v2, x20, 0 // *........................................................................... || *.................................................................... - // ldr x25, [x0, #8] // .....*...................................................................... || .*................................................................... - // mul v16.8H, v26.8H, v0.H[0] // .*.......................................................................... || *.................................................................... - // ldr x26, [x0, #128] // ....*....................................................................... || .*................................................................... - // ldr x23, [x0, #72] // ........*................................................................... || ..*.................................................................. - // vins v25, x16, 1 // ...*........................................................................ || .*................................................................... 
- // ldr x27, [x0, #136] // .........*.................................................................. || ...*................................................................. - // mls v16.8H, v6.8H, v7.H[0] // ......*..................................................................... || ..*.................................................................. - // vins v18, x18, 1 // .......*.................................................................... || ..*.................................................................. - // vins v11, x21, 0 // ...........*................................................................ || ....*................................................................ - // vins v13, x26, 0 // ............*............................................................... || .....*............................................................... - // mul v29.8H, v10.8H, v0.H[0] // ..........*................................................................. || ....*................................................................ - // vins v2, x23, 1 // ..............*............................................................. || ......*.............................................................. - // mls v29.8H, v4.8H, v7.H[0] // .............*.............................................................. || ......*.............................................................. - // sub v4.8H, v25.8H, v15.8H // ...............*............................................................ || .......*............................................................. - // add v15.8H, v25.8H, v15.8H // ................*........................................................... || ........*............................................................ - // mul v5.8H, v18.8H, v0.H[0] // .................*.......................................................... || ........*............................................................ 
- // vins v11, x25, 1 // ..................*......................................................... || ..........*.......................................................... - // vins v13, x27, 1 // ....................*....................................................... || ...........*......................................................... - // sqrdmulh v10.8H, v18.8H, v0.H[1] // ...................*........................................................ || ..........*.......................................................... - // mul v21.8H, v15.8H, v0.H[2] // .....................*...................................................... || ............*........................................................ - // sqrdmulh v15.8H, v15.8H, v0.H[3] // ......................*..................................................... || ..............*...................................................... - // sub v8.8H, v11.8H, v16.8H // ........................*................................................... || ................*.................................................... - // add v16.8H, v11.8H, v16.8H // .........................*.................................................. || .................*................................................... - // mls v5.8H, v10.8H, v7.H[0] // .......................*.................................................... || ................*.................................................... - // sub v11.8H, v13.8H, v29.8H // ..........................*................................................. || ..................*.................................................. - // add v13.8H, v13.8H, v29.8H // ............................*............................................... || ...................*................................................. - // mul v29.8H, v4.8H, v0.H[4] // ...........................*................................................ 
|| ..................*.................................................. - // mls v21.8H, v15.8H, v7.H[0] // .............................*.............................................. || ....................*................................................ - // sub v15.8H, v2.8H, v5.8H // ..............................*............................................. || .....................*............................................... - // add v2.8H, v2.8H, v5.8H // ...............................*............................................ || ......................*.............................................. - // mul v5.8H, v13.8H, v0.H[2] // .................................*.......................................... || ........................*............................................ - // sqrdmulh v13.8H, v13.8H, v0.H[3] // ...................................*........................................ || ..........................*.......................................... - // sub v30.8H, v2.8H, v21.8H // ..................................*......................................... || .........................*........................................... - // sqrdmulh v4.8H, v4.8H, v0.H[5] // ................................*........................................... || ......................*.............................................. - // add v2.8H, v2.8H, v21.8H // ....................................*....................................... || ..........................*.......................................... - // mul v21.8H, v11.8H, v0.H[4] // .....................................*...................................... || ............................*........................................ - // mls v5.8H, v13.8H, v7.H[0] // .......................................*.................................... || ................................*.................................... 
- // sqrdmulh v11.8H, v11.8H, v0.H[5] // ......................................*..................................... || ..............................*...................................... - // mls v29.8H, v4.8H, v7.H[0] // ........................................*................................... || ..................................*.................................. - // sub v13.8H, v16.8H, v5.8H // ..........................................*................................. || .....................................*............................... - // add v16.8H, v16.8H, v5.8H // ...........................................*................................ || ......................................*.............................. - // mul v4.8H, v2.8H, v0.H[6] // .........................................*.................................. || ....................................*................................ - // mls v21.8H, v11.8H, v7.H[0] // ............................................*............................... || ......................................*.............................. - // sub v11.8H, v15.8H, v29.8H // .............................................*.............................. || .......................................*............................. - // add v29.8H, v15.8H, v29.8H // ..............................................*............................. || ........................................*............................ - // sqrdmulh v2.8H, v2.8H, v0.H[7] // ...............................................*............................ || ........................................*............................ - // mul v15.8H, v30.8H, v1.H[0] // ................................................*........................... || ..........................................*.......................... - // sub v5.8H, v8.8H, v21.8H // .................................................*.......................... 
|| ...........................................*......................... - // add v21.8H, v8.8H, v21.8H // ..................................................*......................... || ............................................*........................ - // sqrdmulh v8.8H, v30.8H, v1.H[1] // ...................................................*........................ || ............................................*........................ - // mls v4.8H, v2.8H, v7.H[0] // ....................................................*....................... || ..............................................*...................... - // mul v2.8H, v29.8H, v1.H[2] // .....................................................*...................... || ................................................*.................... - // mls v15.8H, v8.8H, v7.H[0] // ......................................................*..................... || ..................................................*.................. - // sub v8.8H, v16.8H, v4.8H // ..................................................................*......... || ..............................................................*...... - // add v16.8H, v16.8H, v4.8H // .......................................................*.................... || ...................................................*................. - // sqrdmulh v29.8H, v29.8H, v1.H[3] // ........................................................*................... || ....................................................*................ - // str_vo v8, x0, 64 // .......................................................................*.... || .................................................................*... - // sqrdmulh v4.8H, v11.8H, v1.H[5] // ..........................................................*................. || ......................................................*.............. 
- // str_vi v16, x0, 16 // .........................................................*.................. || ......................................................*.............. - // sub v16.8H, v13.8H, v15.8H // ...........................................................*................ || .......................................................*............. - // add v13.8H, v13.8H, v15.8H // ............................................................*............... || ........................................................*............ - // mul v11.8H, v11.8H, v1.H[4] // .............................................................*.............. || ........................................................*............ - // mls v2.8H, v29.8H, v7.H[0] // ..............................................................*............. || ..........................................................*.......... - // str_vo v16, x0, 176 // ...............................................................*............ || ..........................................................*.......... - // str_vo v13, x0, 112 // ................................................................*........... || ...........................................................*......... - // mls v11.8H, v4.8H, v7.H[0] // .................................................................*.......... || ............................................................*........ - // sub v16.8H, v21.8H, v2.8H // ...................................................................*........ || ...............................................................*..... - // add v2.8H, v21.8H, v2.8H // ....................................................................*....... || ...............................................................*..... - // sub v29.8H, v5.8H, v11.8H // .....................................................................*...... 
|| .................................................................*... - // add v11.8H, v5.8H, v11.8H // ......................................................................*..... || .................................................................*... - // str_vo v16, x0, 304 // ........................................................................*... || ..................................................................*.. - // str_vo v2, x0, 240 // .........................................................................*.. || ..................................................................*.. - // str_vo v29, x0, 432 // ..........................................................................*. || ....................................................................* - // str_vo v11, x0, 368 // ...........................................................................* || ....................................................................* - + // Instructions: 70 + // Expected cycles: 71 + // Expected IPC: 0.99 + // + // Cycle bound: 71.0 + // IPC bound: 0.99 + // + // Wall time: 1.32s + // User time: 1.32s + // + // ------------------------- original position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------- + mul v11.8H, v6.8H, v0.H[0] // ..*................................................................... + // gap // ...................................................................... + ldr q31, [x0, #320] // .*.................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + ldr q3, [x0, #64] // *..................................................................... + mls v11.8H, v14.8H, v7.H[0] // ....*................................................................. 
+ ldr q16, [x0, #192] // ...*.................................................................. + // gap // ...................................................................... + ldr q22, [x0, #256] // .....*................................................................ + // gap // ...................................................................... + // gap // ...................................................................... + ldr q10, [x0, #128] // ......*............................................................... + mul v26.8H, v31.8H, v0.H[0] // .......*.............................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v14.8H, v31.8H, v0.H[1] // .........*............................................................ + add v6.8H, v16.8H, v15.8H // ........*............................................................. + // gap // ...................................................................... + sub v16.8H, v16.8H, v15.8H // ..........*........................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v31.8H, v10.8H, v11.8H // ...............*...................................................... + // gap // ...................................................................... + sqrdmulh v25.8H, v22.8H, v0.H[1] // ...........*.......................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v15.8H, v16.8H, v0.H[5] // ............*......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v26.8H, v14.8H, v7.H[0] // .............*........................................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v27.8H, v16.8H, v0.H[4] // ..............*....................................................... + add v16.8H, v10.8H, v11.8H // .................*.................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v27.8H, v15.8H, v7.H[0] // ................*..................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + add v13.8H, v3.8H, v26.8H // ..................*................................................... + sub v15.8H, v3.8H, v26.8H // ....................*................................................. + mul v11.8H, v31.8H, v0.H[4] // ...................*.................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v2.8H, v6.8H, v0.H[2] // .....................*................................................ + // gap // ...................................................................... + // gap // ...................................................................... + sub v8.8H, v15.8H, v27.8H // ......................*............................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v26.8H, v16.8H, v0.H[3] // ........................*............................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v14.8H, v22.8H, v0.H[0] // ............................*......................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v22.8H, v16.8H, v0.H[2] // .............................*........................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v16.8H, v6.8H, v0.H[3] // ..............................*....................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v15.8H, v15.8H, v27.8H // .......................*.............................................. + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v6.8H, v8.8H, v1.H[5] // .........................*............................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + mul v3.8H, v15.8H, v1.H[2] // ..........................*........................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v27.8H, v15.8H, v1.H[3] // ...........................*.......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v15.8H, v31.8H, v0.H[5] // .................................*.................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v2.8H, v16.8H, v7.H[0] // ..................................*................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + mls v3.8H, v27.8H, v7.H[0] // ................................*..................................... + ldr q27, [x0, #0] // ...............................*...................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v14.8H, v25.8H, v7.H[0] // ...................................*.................................. + // gap // ...................................................................... + // gap // ...................................................................... + sub v25.8H, v13.8H, v2.8H // .....................................*................................ + // gap // ...................................................................... + // gap // ...................................................................... + add v16.8H, v13.8H, v2.8H // ......................................*............................... + mls v11.8H, v15.8H, v7.H[0] // ....................................*................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v31.8H, v8.8H, v1.H[4] // .......................................*.............................. + // gap // ...................................................................... 
+ // gap // ...................................................................... + sub v15.8H, v27.8H, v14.8H // ........................................*............................. + // gap // ...................................................................... + // gap // ...................................................................... + mls v31.8H, v6.8H, v7.H[0] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v13.8H, v15.8H, v11.8H // ...........................................*.......................... + mls v22.8H, v26.8H, v7.H[0] // ..........................................*........................... + // gap // ...................................................................... + add v11.8H, v15.8H, v11.8H // ............................................*......................... + // gap // ...................................................................... + // gap // ...................................................................... + add v14.8H, v27.8H, v14.8H // .............................................*........................ + sqrdmulh v27.8H, v16.8H, v0.H[7] // ..............................................*....................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v15.8H, v13.8H, v31.8H // ...............................................*...................... 
+ mul v26.8H, v16.8H, v0.H[6] // ................................................*..................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q15, [x0, #448] // ....................................................*................. + sqrdmulh v16.8H, v25.8H, v1.H[1] // .....................................................*................ + sub v15.8H, v11.8H, v3.8H // .................................................*.................... + sub v6.8H, v14.8H, v22.8H // ..................................................*................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v26.8H, v27.8H, v7.H[0] // ........................................................*............. + add v14.8H, v14.8H, v22.8H // ...................................................*.................. + // gap // ...................................................................... + str q15, [x0, #320] // .......................................................*.............. + add v15.8H, v11.8H, v3.8H // ......................................................*............... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v27.8H, v25.8H, v1.H[0] // ..........................................................*........... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v27.8H, v16.8H, v7.H[0] // ............................................................*......... + str q15, [x0, #256] // ...........................................................*.......... + add v15.8H, v13.8H, v31.8H // .........................................................*............ + sub v16.8H, v14.8H, v26.8H // ..............................................................*....... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q15, [x0, #384] // .............................................................*........ + // gap // ...................................................................... + add v15.8H, v14.8H, v26.8H // ...............................................................*...... + str q16, [x0, #64] // ................................................................*..... + // gap // ...................................................................... + // gap // ...................................................................... + add v16.8H, v6.8H, v27.8H // ..................................................................*... + // gap // ...................................................................... + sub v28.8H, v6.8H, v27.8H // ...................................................................*.. + str q15, [x0], #(16) // .................................................................*.... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q16, [x0, #112] // ....................................................................*. + // gap // ...................................................................... + str q28, [x0, #176] // .....................................................................* + + // --------------------------- new position ----------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------- + // ldr q12, [x0, #64] // ..*................................................................... + // ldr q22, [x0, #320] // .*.................................................................... + // mul v27.8H, v6.8H, v0.H[0] // *..................................................................... + // ldr q11, [x0, #192] // ....*................................................................. + // mls v27.8H, v14.8H, v7.H[0] // ...*.................................................................. + // ldr q29, [x0, #256] // .....*................................................................ + // ldr q5, [x0, #128] // ......*............................................................... + // mul v13.8H, v22.8H, v0.H[0] // .......*.............................................................. + // add v26.8H, v11.8H, v15.8H // .........*............................................................ + // sqrdmulh v28.8H, v22.8H, v0.H[1] // ........*............................................................. + // sub v11.8H, v11.8H, v15.8H // ..........*........................................................... 
+ // sqrdmulh v25.8H, v29.8H, v0.H[1] // ............*......................................................... + // sqrdmulh v16.8H, v11.8H, v0.H[5] // .............*........................................................ + // mls v13.8H, v28.8H, v7.H[0] // ..............*....................................................... + // mul v15.8H, v11.8H, v0.H[4] // ...............*...................................................... + // sub v23.8H, v5.8H, v27.8H // ...........*.......................................................... + // mls v15.8H, v16.8H, v7.H[0] // .................*.................................................... + // add v16.8H, v5.8H, v27.8H // ................*..................................................... + // add v17.8H, v12.8H, v13.8H // ..................*................................................... + // mul v21.8H, v23.8H, v0.H[4] // ....................*................................................. + // sub v31.8H, v12.8H, v13.8H // ...................*.................................................. + // mul v4.8H, v26.8H, v0.H[2] // .....................*................................................ + // sub v28.8H, v31.8H, v15.8H // ......................*............................................... + // add v15.8H, v31.8H, v15.8H // ...........................*.......................................... + // sqrdmulh v6.8H, v16.8H, v0.H[3] // .......................*.............................................. + // sqrdmulh v22.8H, v28.8H, v1.H[5] // ............................*......................................... + // mul v3.8H, v15.8H, v1.H[2] // .............................*........................................ + // sqrdmulh v15.8H, v15.8H, v1.H[3] // ..............................*....................................... + // mul v31.8H, v29.8H, v0.H[0] // ........................*............................................. 
+ // mul v10.8H, v16.8H, v0.H[2] // .........................*............................................ + // sqrdmulh v16.8H, v26.8H, v0.H[3] // ..........................*........................................... + // ldr q30, [x0, #0] // ..................................*................................... + // mls v3.8H, v15.8H, v7.H[0] // .................................*.................................... + // sqrdmulh v15.8H, v23.8H, v0.H[5] // ...............................*...................................... + // mls v4.8H, v16.8H, v7.H[0] // ................................*..................................... + // mls v31.8H, v25.8H, v7.H[0] // ...................................*.................................. + // mls v21.8H, v15.8H, v7.H[0] // ......................................*............................... + // sub v26.8H, v17.8H, v4.8H // ....................................*................................. + // add v9.8H, v17.8H, v4.8H // .....................................*................................ + // mul v15.8H, v28.8H, v1.H[4] // .......................................*.............................. + // sub v17.8H, v30.8H, v31.8H // ........................................*............................. + // mls v15.8H, v22.8H, v7.H[0] // .........................................*............................ + // mls v10.8H, v6.8H, v7.H[0] // ...........................................*.......................... + // sub v20.8H, v17.8H, v21.8H // ..........................................*........................... + // add v14.8H, v17.8H, v21.8H // ............................................*......................... + // add v21.8H, v30.8H, v31.8H // .............................................*........................ + // sqrdmulh v24.8H, v9.8H, v0.H[7] // ..............................................*....................... 
+ // sub v28.8H, v20.8H, v15.8H // ...............................................*...................... + // mul v5.8H, v9.8H, v0.H[6] // ................................................*..................... + // sub v9.8H, v14.8H, v3.8H // ...................................................*.................. + // sub v27.8H, v21.8H, v10.8H // ....................................................*................. + // add v16.8H, v21.8H, v10.8H // ......................................................*............... + // str q28, [x0, #448] // .................................................*.................... + // sqrdmulh v28.8H, v26.8H, v1.H[1] // ..................................................*................... + // add v30.8H, v14.8H, v3.8H // ........................................................*............. + // str q9, [x0, #320] // .......................................................*.............. + // mls v5.8H, v24.8H, v7.H[0] // .....................................................*................ + // add v31.8H, v20.8H, v15.8H // ............................................................*......... + // mul v15.8H, v26.8H, v1.H[0] // .........................................................*............ + // str q30, [x0, #256] // ...........................................................*.......... + // mls v15.8H, v28.8H, v7.H[0] // ..........................................................*........... + // str q31, [x0, #384] // ..............................................................*....... + // sub v20.8H, v16.8H, v5.8H // .............................................................*........ + // add v2.8H, v16.8H, v5.8H // ...............................................................*...... + // str q20, [x0, #64] // ................................................................*..... + // str q2, [x0], #(16) // ...................................................................*.. 
+ // add v28.8H, v27.8H, v15.8H // .................................................................*.... + // sub v21.8H, v27.8H, v15.8H // ..................................................................*... + // str q28, [x0, #112] // ....................................................................*. + // str q21, [x0, #176] // .....................................................................* + restore inp, STACK0 mov count, #8 .p2align 2 - ldr x21, [x1, #48] // *....................................................................... - // gap // ........................................................................ - ldr x22, [x3] , #16 // .*...................................................................... - ldr x7, [x1, #32] // ........*............................................................... - ldr x13, [x3, #-8] // ..*..................................................................... - // gap // ........................................................................ - ldr x18, [x4, #72] // ............................*........................................... - ldr x20, [x1, #56] // .........*.............................................................. - // gap // ........................................................................ - ldr x9, [x4, #56] // ........................*............................................... - // gap // ........................................................................ - // gap // ........................................................................ - vins v16, x21, 0 // .....*.................................................................. - ldr x26, [x1, #24] // ..........*............................................................. - vins v0, x22, 0 // .......*................................................................ - ldr x21, [x1, #0] // ...*.................................................................... 
- vins v25, x7, 0 // .............*.......................................................... - // gap // ........................................................................ - ldr x7, [x1, #16] // ......*................................................................. - // gap // ........................................................................ - // gap // ........................................................................ - ldr x12, [x4, #64] // ..........................*............................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - vins v24, x21, 0 // ...................*.................................................... - ldr x21, [x1, #40] // ...........*............................................................ - // gap // ........................................................................ - vins v0, x13, 1 // .................*...................................................... - vins v16, x20, 1 // ...............*........................................................ - // gap // ........................................................................ - vins v18, x12, 0 // ..........................................*............................. - ldr x12, [x4, #24] // ..................*..................................................... - // gap // ........................................................................ - vins v23, x7, 0 // .....................*.................................................. - // gap // ........................................................................ 
- // gap // ........................................................................ - vins v25, x21, 1 // .........................*.............................................. - ldr x21, [x4, #80] // ...............................*........................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v2.8H, v16.8H, v0.H[1] // ..............................*......................................... - // gap // ........................................................................ - // gap // ........................................................................ - vins v5, x21, 0 // ..............................................*......................... - ldr x21, [x4, #32] // ....................*................................................... - // gap // ........................................................................ - vins v23, x26, 1 // ...................................*.................................... - mul v6.8H, v16.8H, v0.H[0] // ..................................*..................................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- sqrdmulh v10.8H, v25.8H, v0.H[1] // ....................................*................................... - // gap // ........................................................................ - // gap // ........................................................................ - vins v29, x21, 0 // .............................*.......................................... - ldr x21, [x4, #16] // ................*....................................................... - // gap // ........................................................................ - mls v6.8H, v2.8H, v7.H[0] // .....................................*.................................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v13.8H, v25.8H, v0.H[0] // .........................................*.............................. - // gap // ........................................................................ - // gap // ........................................................................ - vins v27, x21, 0 // ...........................*............................................ - // gap // ........................................................................ - // gap // ........................................................................ - ldr x21, [x1, #8] // ....*................................................................... - mls v13.8H, v10.8H, v7.H[0] // .............................................*.......................... - // gap // ........................................................................ 
- sub v28.8H, v23.8H, v6.8H // ...........................................*............................ - // gap // ........................................................................ - // gap // ........................................................................ - add v23.8H, v23.8H, v6.8H // ............................................*........................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - vins v24, x21, 1 // ................................*....................................... - ldr x21, [x4] , #96 // ............*........................................................... - sqrdmulh v1.8H, v28.8H, v0.H[5] // ...................................................*.................... - vins v27, x12, 1 // .......................................*................................ - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v4.8H, v23.8H, v0.H[3] // ................................................*....................... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- vins v14, x21, 0 // .......................*................................................ - ldr x21, [x4, #-88] // ..............*......................................................... - mul v20.8H, v28.8H, v0.H[4] // ...............................................*........................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v30.8H, v23.8H, v0.H[2] // ....................................................*................... - add v26.8H, v24.8H, v13.8H // ..................................................*..................... - // gap // ........................................................................ - sub v28.8H, v24.8H, v13.8H // .................................................*...................... - // gap // ........................................................................ - // gap // ........................................................................ - mls v30.8H, v4.8H, v7.H[0] // ......................................................*................. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v20.8H, v1.8H, v7.H[0] // .......................................................*................ - vins v14, x21, 1 // ......................................*................................. - ldr x21, [x4, #-48] // ......................*................................................. 
- // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - sub v0.8H, v26.8H, v30.8H // ........................................................*............... - add v9.8H, v26.8H, v30.8H // .........................................................*.............. - // gap // ........................................................................ - vins v2, x21, 0 // ........................................*............................... - ldr x21, [x4, #-8] // .................................*...................................... - // gap // ........................................................................ - sub v19.8H, v28.8H, v20.8H // ..........................................................*............. - add v24.8H, v28.8H, v20.8H // ...........................................................*............ - // gap // ........................................................................ - trn2 v21.4S, v9.4S, v0.4S // ............................................................*........... - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v13.4S, v9.4S, v0.4S // ..............................................................*......... - // gap // ........................................................................ - // gap // ........................................................................ 
- trn2 v23.4S, v24.4S, v19.4S // .............................................................*.......... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn1 v16.4S, v24.4S, v19.4S // ................................................................*....... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - vins v5, x21, 1 // .....................................................*.................. - trn2 v11.2D, v21.2D, v23.2D // ...............................................................*........ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - trn2 v24.2D, v13.2D, v16.2D // ...................................................................*.... - // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v0.8H, v11.8H, v27.8H // ..................................................................*..... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ 
- // gap // ........................................................................ - // gap // ........................................................................ - sqrdmulh v4.8H, v24.8H, v27.8H // .....................................................................*.. - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v15.8H, v11.8H, v14.8H // ....................................................................*... - trn1 v11.2D, v21.2D, v23.2D // .................................................................*...... - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mul v25.8H, v24.8H, v14.8H // .......................................................................* - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - // gap // ........................................................................ - mls v15.8H, v0.8H, v7.H[0] // ......................................................................*. - // gap // ........................................................................ 
- // gap // ........................................................................ - - // original source code - // ldr x26, [x1, #48] // *....................................................................... || *.............................................................. - // ldr x25, [x3] , #16 // .*...................................................................... || *.............................................................. - // ldr x23, [x3, #-8] // ...*.................................................................... || .*............................................................. - // ldr x24, [x1, #0] // ..........*............................................................. || .....*......................................................... - // ldr x27, [x1, #8] // ..................................*..................................... || ..........................*.................................... - // vins v13, x26, 0 // .......*................................................................ || ....*.......................................................... - // ldr x21, [x1, #16] // ............*........................................................... || ......*........................................................ - // vins v6, x25, 0 // .........*.............................................................. || ....*.......................................................... - // ldr x25, [x1, #32] // ..*..................................................................... || .*............................................................. - // ldr x26, [x1, #56] // .....*.................................................................. || ..*............................................................ - // ldr x20, [x1, #24] // ........*............................................................... || ....*.......................................................... 
- // ldr x16, [x1, #40] // ...............*........................................................ || .........*..................................................... - // ldr x9, [x4] , #96 // .......................................*................................ || ..............................*................................ - // vins v25, x25, 0 // ...........*............................................................ || .....*......................................................... - // ldr x25, [x4, #-88] // ............................................*........................... || ..................................*............................ - // vins v13, x26, 1 // .................*...................................................... || ..........*.................................................... - // ldr x26, [x4, #-80] // ..............................*......................................... || .....................*......................................... - // vins v6, x23, 1 // ................*....................................................... || ..........*.................................................... - // ldr x23, [x4, #-72] // ...................*.................................................... || ...........*................................................... - // vins v11, x24, 0 // ..............*......................................................... || .........*..................................................... - // ldr x24, [x4, #-64] // .........................*.............................................. || .................*............................................. - // vins v5, x21, 0 // ....................*................................................... || ............*.................................................. - // ldr x21, [x4, #-48] // ....................................................*................... || ........................................*...................... 
- // vins v10, x9, 0 // ...........................................*............................ || ..................................*............................ - // ldr x9, [x4, #-40] // ......*................................................................. || ...*........................................................... - // vins v25, x16, 1 // .....................*.................................................. || .............*................................................. - // ldr x16, [x4, #-32] // .............*.......................................................... || .......*....................................................... - // vins v4, x26, 0 // .................................*...................................... || .........................*..................................... - // ldr x18, [x4, #-24] // ....*................................................................... || ..*............................................................ - // vins v29, x24, 0 // .............................*.......................................... || .....................*......................................... - // sqrdmulh v2.8H, v13.8H, v6.H[1] // .......................*................................................ || ................*.............................................. - // ldr x26, [x4, #-16] // ......................*................................................. || .............*................................................. - // vins v11, x27, 1 // ......................................*................................. || ..............................*................................ - // ldr x24, [x4, #-8] // ........................................................*............... || ............................................*.................. - // mul v13.8H, v13.8H, v6.H[0] // ...........................*............................................ 
|| ..................*............................................ - // vins v5, x20, 1 // ..........................*............................................. || ..................*............................................ - // sqrdmulh v0.8H, v25.8H, v6.H[1] // ............................*........................................... || ....................*.......................................... - // mls v13.8H, v2.8H, v7.H[0] // ...............................*........................................ || ......................*........................................ - // vins v10, x25, 1 // ...................................................*.................... || ........................................*...................... - // vins v4, x23, 1 // .........................................*.............................. || ...............................*............................... - // vins v2, x21, 0 // .......................................................*................ || ............................................*.................. - // mul v25.8H, v25.8H, v6.H[0] // ................................*....................................... || ........................*...................................... - // vins v18, x16, 0 // ..................*..................................................... || ...........*................................................... - // sub v15.8H, v5.8H, v13.8H // ....................................*................................... || ...........................*................................... - // add v13.8H, v5.8H, v13.8H // .....................................*.................................. || ............................*.................................. - // mls v25.8H, v0.8H, v7.H[0] // ...................................*.................................... || ..........................*.................................... 
- // vins v5, x26, 0 // ........................*............................................... || .................*............................................. - // mul v16.8H, v15.8H, v6.H[4] // .............................................*.......................... || ..................................*............................ - // sqrdmulh v26.8H, v13.8H, v6.H[3] // ..........................................*............................. || ................................*.............................. - // sub v8.8H, v11.8H, v25.8H // ................................................*....................... || .....................................*......................... - // add v25.8H, v11.8H, v25.8H // ...............................................*........................ || ....................................*.......................... - // sqrdmulh v11.8H, v15.8H, v6.H[5] // ........................................*............................... || ..............................*................................ - // mul v13.8H, v13.8H, v6.H[2] // ..............................................*......................... || ....................................*.......................... - // vins v5, x24, 1 // ...............................................................*........ || ...................................................*........... - // mls v13.8H, v26.8H, v7.H[0] // .................................................*...................... || ......................................*........................ - // mls v16.8H, v11.8H, v7.H[0] // ..................................................*..................... || ........................................*...................... - // sub v6.8H, v25.8H, v13.8H // .....................................................*.................. || ...........................................*................... 
- // add v25.8H, v25.8H, v13.8H // ......................................................*................. || ...........................................*................... - // sub v15.8H, v8.8H, v16.8H // .........................................................*.............. || .............................................*................. - // add v16.8H, v8.8H, v16.8H // ..........................................................*............. || .............................................*................. - // trn2 v0.4S, v25.4S, v6.4S // ...........................................................*............ || ..............................................*................ - // trn2 v11.4S, v16.4S, v15.4S // .............................................................*.......... || ................................................*.............. - // trn1 v13.4S, v25.4S, v6.4S // ............................................................*........... || ...............................................*............... - // trn2 v25.2D, v0.2D, v11.2D // ................................................................*....... || ...................................................*........... - // trn1 v16.4S, v16.4S, v15.4S // ..............................................................*......... || .................................................*............. - // trn1 v11.2D, v0.2D, v11.2D // .....................................................................*.. || ..........................................................*.... - // sqrdmulh v0.8H, v25.8H, v4.8H // ..................................................................*..... || ......................................................*........ - // trn2 v6.2D, v13.2D, v16.2D // .................................................................*...... || .....................................................*......... 
- // mul v15.8H, v25.8H, v10.8H // ....................................................................*... || ..........................................................*.... - // sqrdmulh v4.8H, v6.8H, v4.8H // ...................................................................*.... || ........................................................*...... - // mls v15.8H, v0.8H, v7.H[0] // .......................................................................* || ..............................................................* - // mul v25.8H, v6.8H, v10.8H // ......................................................................*. || ............................................................*.. - + // Instructions: 38 + // Expected cycles: 45 + // Expected IPC: 0.84 + // + // Cycle bound: 45.0 + // IPC bound: 0.84 + // + // Wall time: 0.69s + // User time: 0.69s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + ldr q15, [x3], #16 // ...*.................................. + ldr q18, [x1, #48] // ......*............................... + // gap // ...................................... + ldr q25, [x4, #48] // .....................................* + ldr q20, [x1, #32] // *..................................... + // gap // ...................................... + ldr q3, [x1, #0] // .*.................................... + ldr q27, [x1, #16] // ..*................................... + // gap // ...................................... + ldr q6, [x4, #16] // ...............*...................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v16.8H, v18.8H, v15.H[1] // ........*............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + mul v14.8H, v18.8H, v15.H[0] // .........*............................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v14.8H, v16.8H, v7.H[0] // ............*......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v30.8H, v20.8H, v15.H[1] // ....*................................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v11.8H, v20.8H, v15.H[0] // .....*................................ + // gap // ...................................... + // gap // ...................................... + add v21.8H, v27.8H, v14.8H // ..............*....................... + // gap // ...................................... + // gap // ...................................... + sub v19.8H, v27.8H, v14.8H // .............*........................ + // gap // ...................................... + // gap // ...................................... + mls v11.8H, v30.8H, v7.H[0] // .......*.............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + sqrdmulh v20.8H, v21.8H, v15.H[3] // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v10.8H, v21.8H, v15.H[2] // .................*.................... + // gap // ...................................... + // gap // ...................................... + add v28.8H, v3.8H, v11.8H // ...........*.......................... + // gap // ...................................... + // gap // ...................................... + sub v0.8H, v3.8H, v11.8H // ..........*........................... + sqrdmulh v21.8H, v19.8H, v15.H[5] // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v10.8H, v20.8H, v7.H[0] // ....................*................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v12.8H, v19.8H, v15.H[4] // ................*..................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v12.8H, v21.8H, v7.H[0] // .....................*................ + // gap // ...................................... + // gap // ...................................... + sub v26.8H, v28.8H, v10.8H // .......................*.............. 
+ // gap // ...................................... + // gap // ...................................... + add v15.8H, v28.8H, v10.8H // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v24.8H, v0.8H, v12.8H // .........................*............ + add v17.8H, v0.8H, v12.8H // ..........................*........... + ldr q0, [x4], #(6*16) // .............................*........ + trn2 v28.4S, v15.4S, v26.4S // ...........................*.......... + trn1 v15.4S, v15.4S, v26.4S // ............................*......... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn1 v12.4S, v17.4S, v24.4S // ..............................*....... + // gap // ...................................... + // gap // ...................................... + trn2 v31.4S, v17.4S, v24.4S // ...............................*...... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn2 v17.2D, v15.2D, v12.2D // ................................*..... + // gap // ...................................... + // gap // ...................................... + trn2 v1.2D, v28.2D, v31.2D // .................................*.... + // gap // ...................................... + // gap // ...................................... 
+ trn1 v22.2D, v28.2D, v31.2D // ..................................*... + ldr q28, [x4, #-64] // ......................*............... + // gap // ...................................... + mul v16.8H, v17.8H, v0.8H // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v3.8H, v1.8H, v6.8H // ....................................*. + + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // ldr q17, [x1, #32] // ...*.................................. + // ldr q27, [x1, #0] // ....*................................. + // ldr q9, [x1, #16] // .....*................................ + // ldr q2, [x3], #16 // *..................................... + // sqrdmulh v11.8H, v17.8H, v2.H[1] // ..........*........................... + // mul v15.8H, v17.8H, v2.H[0] // ...........*.......................... + // ldr q16, [x1, #48] // .*.................................... + // mls v15.8H, v11.8H, v7.H[0] // ..............*....................... + // sqrdmulh v8.8H, v16.8H, v2.H[1] // .......*.............................. + // mul v16.8H, v16.8H, v2.H[0] // ........*............................. + // sub v18.8H, v27.8H, v15.8H // ..................*................... + // add v30.8H, v27.8H, v15.8H // .................*.................... + // mls v16.8H, v8.8H, v7.H[0] // .........*............................ + // sub v1.8H, v9.8H, v16.8H // .............*........................ + // add v16.8H, v9.8H, v16.8H // ............*......................... + // ldr q6, [x4, #16] // ......*............................... 
+ // mul v21.8H, v1.8H, v2.H[4] // .....................*................ + // mul v14.8H, v16.8H, v2.H[2] // ................*..................... + // sqrdmulh v13.8H, v16.8H, v2.H[3] // ...............*...................... + // sqrdmulh v28.8H, v1.8H, v2.H[5] // ...................*.................. + // mls v14.8H, v13.8H, v7.H[0] // ....................*................. + // mls v21.8H, v28.8H, v7.H[0] // ......................*............... + // ldr q28, [x4, #32] // ...................................*.. + // sub v0.8H, v30.8H, v14.8H // .......................*.............. + // add v15.8H, v30.8H, v14.8H // ........................*............. + // sub v31.8H, v18.8H, v21.8H // .........................*............ + // add v13.8H, v18.8H, v21.8H // ..........................*........... + // trn2 v30.4S, v15.4S, v0.4S // ............................*......... + // trn1 v15.4S, v15.4S, v0.4S // .............................*........ + // ldr q0, [x4], #(6*16) // ...........................*.......... + // trn1 v12.4S, v13.4S, v31.4S // ..............................*....... + // trn2 v13.4S, v13.4S, v31.4S // ...............................*...... + // trn2 v17.2D, v15.2D, v12.2D // ................................*..... + // trn2 v1.2D, v30.2D, v13.2D // .................................*.... + // trn1 v22.2D, v30.2D, v13.2D // ..................................*... + // mul v16.8H, v17.8H, v0.8H // ....................................*. + // sqrdmulh v3.8H, v1.8H, v6.8H // .....................................* + // ldr q25, [x4, #-48] // ..*................................... + sub count, count, #1 -.p2align 2 layer4567_start: - trn1 v0.2D, v13.2D, v16.2D // ..............................................*.......................................................... - ldr x26, [x1, #112] // ............e............................................................................................ 
- ldr x21, [x4, #-56] // .........................................................*............................................... - ldr x25, [x3] , #16 // ................e........................................................................................ - vins v2, x9, 1 // ...............................................................*......................................... - sub v16.8H, v11.8H, v15.8H // ................................................................................*........................ - mls v25.8H, v4.8H, v7.H[0] // ..........................................................................*.............................. - ldr x23, [x3, #-8] // .................e....................................................................................... - vins v18, x18, 1 // ...................................................................*..................................... - add v11.8H, v11.8H, v15.8H // .................................................................................*....................... - ldr x24, [x1, #64] // e........................................................................................................ - ldr x27, [x1, #72] // .e....................................................................................................... - vins v13, x26, 0 // ..............e.......................................................................................... - vins v29, x21, 1 // ...........................................................*............................................. - ldr x21, [x1, #80] // ....e.................................................................................................... - vins v6, x25, 0 // ..................e...................................................................................... - ldr x25, [x1, #96] // ........e................................................................................................ 
- sqrdmulh v4.8H, v16.8H, v5.8H // ........................................................................................*................ - ldr x26, [x1, #120] // .............e........................................................................................... - ldr x20, [x1, #88] // .....e................................................................................................... - // gap // ......................................................................................................... - sqrdmulh v2.8H, v11.8H, v2.8H // ...................................................................................*..................... - ldr x16, [x1, #104] // .........e............................................................................................... - sub v15.8H, v0.8H, v25.8H // ...........................................................................*............................. - add v0.8H, v0.8H, v25.8H // ............................................................................*............................ - ldr x9, [x4] , #96 // ................................................e........................................................ - // gap // ......................................................................................................... - mul v16.8H, v16.8H, v18.8H // .......................................................................................*................. - vins v25, x25, 0 // ..........e.............................................................................................. - ldr x25, [x4, #-88] // .................................................e....................................................... - vins v13, x26, 1 // ...............e......................................................................................... - ldr x26, [x4, #-80] // ....................................................e.................................................... 
- // gap // ......................................................................................................... - vins v6, x23, 1 // ...................e..................................................................................... - mul v18.8H, v11.8H, v29.8H // ..................................................................................*...................... - ldr x23, [x4, #-72] // .....................................................e................................................... - vins v11, x24, 0 // ..e...................................................................................................... - ldr x24, [x4, #-64] // ........................................................e................................................ - // gap // ......................................................................................................... - vins v5, x21, 0 // ......e.................................................................................................. - mls v18.8H, v2.8H, v7.H[0] // ....................................................................................*.................... - ldr x21, [x4, #-48] // ............................................................e............................................ - vins v10, x9, 0 // ..................................................e...................................................... - ldr x9, [x4, #-40] // .............................................................e........................................... - // gap // ......................................................................................................... - mls v16.8H, v4.8H, v7.H[0] // .........................................................................................*............... - vins v25, x16, 1 // ...........e............................................................................................. 
- ldr x16, [x4, #-32] // ................................................................e........................................ - vins v4, x26, 0 // ......................................................e.................................................. - ldr x18, [x4, #-24] // .................................................................e....................................... - // gap // ......................................................................................................... - vins v29, x24, 0 // ..........................................................e.............................................. - sqrdmulh v2.8H, v13.8H, v6.H[1] // ..........................e.............................................................................. - ldr x26, [x4, #-16] // ....................................................................e.................................... - vins v11, x27, 1 // ...e..................................................................................................... - ldr x24, [x4, #-8] // .....................................................................e................................... - // gap // ......................................................................................................... - mul v13.8H, v13.8H, v6.H[0] // .........................e............................................................................... - sub v21.8H, v0.8H, v18.8H // .....................................................................................*................... - // gap // ......................................................................................................... - vins v5, x20, 1 // .......e................................................................................................. - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - add v20.8H, v0.8H, v18.8H // ......................................................................................*.................. - sqrdmulh v0.8H, v25.8H, v6.H[1] // .....................e................................................................................... - // gap // ......................................................................................................... - sub v23.8H, v15.8H, v16.8H // ..........................................................................................*.............. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v22.8H, v15.8H, v16.8H // ...........................................................................................*............. - mls v13.8H, v2.8H, v7.H[0] // ...........................e............................................................................. - // gap // ......................................................................................................... - vins v10, x25, 1 // ...................................................e..................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - vins v4, x23, 1 // .......................................................e................................................. - sqdmulh v16.8H, v21.8H, v7.H[1] // ...............................................................................................*......... 
- // gap // ......................................................................................................... - vins v2, x21, 0 // ..............................................................e.......................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v25.8H, v25.8H, v6.H[0] // ....................e.................................................................................... - vins v18, x16, 0 // ..................................................................e...................................... - // gap // ......................................................................................................... - sub v15.8H, v5.8H, v13.8H // ............................e............................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v13.8H, v5.8H, v13.8H // .............................e........................................................................... - mls v25.8H, v0.8H, v7.H[0] // ......................e.................................................................................. - // gap // ......................................................................................................... - srshr v0.8H, v16.8H, #11 // ................................................................................................*........ - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - vins v5, x26, 0 // ......................................................................e.................................. - mul v16.8H, v15.8H, v6.H[4] // ...................................e..................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqrdmulh v26.8H, v13.8H, v6.H[3] // ...............................e......................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v8.8H, v11.8H, v25.8H // .......................e................................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v25.8H, v11.8H, v25.8H // ........................e................................................................................ - sqrdmulh v11.8H, v15.8H, v6.H[5] // ....................................e.................................................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v13.8H, v13.8H, v6.H[2] // ..............................e.......................................................................... - vins v5, x24, 1 // .......................................................................e................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v13.8H, v26.8H, v7.H[0] // ................................e........................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- mls v16.8H, v11.8H, v7.H[0] // .....................................e................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqdmulh v11.8H, v20.8H, v7.H[1] // ............................................................................................*............ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v6.8H, v25.8H, v13.8H // .................................e....................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v25.8H, v25.8H, v13.8H // ..................................e...................................................................... - sqdmulh v13.8H, v22.8H, v7.H[1] // ..................................................................................................*...... - // gap // ......................................................................................................... 
- sub v15.8H, v8.8H, v16.8H // ......................................e.................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v16.8H, v8.8H, v16.8H // .......................................e................................................................. - sqdmulh v26.8H, v23.8H, v7.H[1] // .....................................................................................................*... - // gap // ......................................................................................................... - srshr v11.8H, v11.8H, #11 // .............................................................................................*........... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v21.8H, v0.8H, v7.H[0] // .................................................................................................*....... - trn2 v0.4S, v25.4S, v6.4S // .........................................e............................................................... - // gap // ......................................................................................................... - srshr v13.8H, v13.8H, #11 // ...................................................................................................*..... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- mls v20.8H, v11.8H, v7.H[0] // ..............................................................................................*.......... - trn2 v11.4S, v16.4S, v15.4S // ...........................................e............................................................. - // gap // ......................................................................................................... - srshr v26.8H, v26.8H, #11 // ......................................................................................................*.. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v22.8H, v13.8H, v7.H[0] // ....................................................................................................*.... - trn1 v13.4S, v25.4S, v6.4S // ........................................e................................................................ - // gap // ......................................................................................................... - trn2 v25.2D, v0.2D, v11.2D // .............................................e........................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v23.8H, v26.8H, v7.H[0] // .......................................................................................................*. - trn1 v16.4S, v16.4S, v15.4S // ..........................................e.............................................................. - // gap // ......................................................................................................... 
- trn1 v11.2D, v0.2D, v11.2D // ...............................................e......................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqrdmulh v0.8H, v25.8H, v4.8H // ..............................................................................e.......................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - trn2 v6.2D, v13.2D, v16.2D // ............................................e............................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v15.8H, v25.8H, v10.8H // .............................................................................e........................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - st4 {v20.4S,v21.4S,v22.4S,v23.4S}, [x1], #64 // ........................................................................................................* - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- sqrdmulh v4.8H, v6.8H, v4.8H // .........................................................................e............................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v15.8H, v0.8H, v7.H[0] // ...............................................................................e......................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v25.8H, v6.8H, v10.8H // ........................................................................e................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - - // original source code - // ldr x10, [x1, #0] // .........e.................................................................................................................................................................................................... || ...e............................................................................................................................ - // ldr x11, [x1, #8] // ..........e................................................................................................................................................................................................... || ...e............................................................................................................................ - // vins v8, x10, 0 // ................................e............................................................................................................................................................................. || ............e................................................................................................................... - // vins v8, x11, 1 // ...............................................e.............................................................................................................................................................. || ..................e............................................................................................................. - // ldr x10, [x1, #16] // .............e................................................................................................................................................................................................ 
|| ....e........................................................................................................................... - // ldr x11, [x1, #24] // ..................e........................................................................................................................................................................................... || ......e......................................................................................................................... - // vins v9, x10, 0 // ..................................e........................................................................................................................................................................... || .............e.................................................................................................................. - // vins v9, x11, 1 // ...................................................e.......................................................................................................................................................... || ....................e........................................................................................................... - // ldr x10, [x1, #32] // ...............e.............................................................................................................................................................................................. || .....e.......................................................................................................................... - // ldr x11, [x1, #40] // ....................e......................................................................................................................................................................................... || .......e........................................................................................................................ 
- // vins v10, x10, 0 // .........................e.................................................................................................................................................................................... || .........e...................................................................................................................... - // vins v10, x11, 1 // ........................................e..................................................................................................................................................................... || ...............e................................................................................................................ - // ldr x10, [x1, #48] // e............................................................................................................................................................................................................. || e............................................................................................................................... - // ldr x11, [x1, #56] // .................e............................................................................................................................................................................................ || ......e......................................................................................................................... - // vins v11, x10, 0 // ...........e.................................................................................................................................................................................................. || ....e........................................................................................................................... 
- // vins v11, x11, 1 // ...........................e.................................................................................................................................................................................. || ..........e..................................................................................................................... - // ldr x10, [x3] , #16 // ..e........................................................................................................................................................................................................... || .e.............................................................................................................................. - // ldr x11, [x3, #-8] // ......e....................................................................................................................................................................................................... || ..e............................................................................................................................. - // vins v0, x10, 0 // ..............e............................................................................................................................................................................................... || .....e.......................................................................................................................... - // vins v0, x11, 1 // .............................e................................................................................................................................................................................ || ...........e.................................................................................................................... 
- // mul v24.8H, v10.8H, v0.H[0] // .............................................................e................................................................................................................................................ || ...........................e.................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // .....................................................e........................................................................................................................................................ || .....................e.......................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // .................................................................e............................................................................................................................................ || .............................e.................................................................................................. - // sub v10.8H, v8.8H, v24.8H // ......................................................................e....................................................................................................................................... || ..................................e............................................................................................. - // add v8.8H, v8.8H, v24.8H // .......................................................................e...................................................................................................................................... || ...................................e............................................................................................ 
- // mul v24.8H, v11.8H, v0.H[0] // .................................................e............................................................................................................................................................ || ...................e............................................................................................................ - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .............................................e................................................................................................................................................................ || .................e.............................................................................................................. - // mls v24.8H, v11.8H, v7.H[0] // ........................................................e..................................................................................................................................................... || .......................e........................................................................................................ - // sub v11.8H, v9.8H, v24.8H // ...............................................................e.............................................................................................................................................. || ............................e................................................................................................... - // add v9.8H, v9.8H, v24.8H // ................................................................e............................................................................................................................................. || .............................e.................................................................................................. 
- // mul v24.8H, v9.8H, v0.H[2] // .........................................................................e.................................................................................................................................... || .....................................e.......................................................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // .....................................................................e........................................................................................................................................ || .................................e.............................................................................................. - // mls v24.8H, v9.8H, v7.H[0] // ...........................................................................e.................................................................................................................................. || .......................................e........................................................................................ - // sub v9.8H, v8.8H, v24.8H // ..............................................................................e............................................................................................................................... || ............................................e................................................................................... - // add v8.8H, v8.8H, v24.8H // ...............................................................................e.............................................................................................................................. || .............................................e.................................................................................. 
- // mul v24.8H, v11.8H, v0.H[4] // ....................................................................e......................................................................................................................................... || ...............................e................................................................................................ - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ........................................................................e..................................................................................................................................... || ...................................e............................................................................................ - // mls v24.8H, v11.8H, v7.H[0] // ............................................................................e................................................................................................................................. || .........................................e...................................................................................... - // sub v11.8H, v10.8H, v24.8H // .................................................................................e............................................................................................................................ || ..............................................e................................................................................. - // add v10.8H, v10.8H, v24.8H // ..................................................................................e........................................................................................................................... || ...............................................e................................................................................ 
- // trn1 v25.4S, v8.4S, v9.4S // ............................................................................................e................................................................................................................. || .....................................................e.......................................................................... - // trn2 v26.4S, v8.4S, v9.4S // ......................................................................................e....................................................................................................................... || .................................................e.............................................................................. - // trn1 v27.4S, v10.4S, v11.4S // ...............................................................................................e.............................................................................................................. || .......................................................e........................................................................ - // trn2 v28.4S, v10.4S, v11.4S // .........................................................................................e.................................................................................................................... || ...................................................e............................................................................ - // trn2 v10.2D, v25.2D, v27.2D // ..................................................................................................e........................................................................................................... || ..........................................................e..................................................................... 
- // trn2 v11.2D, v26.2D, v28.2D // .............................................................................................e................................................................................................................ || ......................................................e......................................................................... - // trn1 v8.2D, v25.2D, v27.2D // ........................................................................................................*..................................................................................................... || ...................................................................*............................................................ - // trn1 v9.2D, v26.2D, v28.2D // ................................................................................................e............................................................................................................. || ........................................................e....................................................................... - // ldr x10, [x4] , #96 // .......................e...................................................................................................................................................................................... || ........e....................................................................................................................... - // ldr x11, [x4, #-88] // ..........................e................................................................................................................................................................................... || .........e...................................................................................................................... 
- // vins v0, x10, 0 // .....................................e........................................................................................................................................................................ || ..............e................................................................................................................. - // vins v0, x11, 1 // .........................................................e.................................................................................................................................................... || ........................e....................................................................................................... - // ldr x10, [x4, #-80] // ............................e................................................................................................................................................................................. || ..........e..................................................................................................................... - // ldr x11, [x4, #-72] // ...............................e.............................................................................................................................................................................. || ...........e.................................................................................................................... - // vins v4, x10, 0 // ..........................................e................................................................................................................................................................... || ................e............................................................................................................... 
- // vins v4, x11, 1 // ..........................................................e................................................................................................................................................... || .........................e...................................................................................................... - // ldr x10, [x4, #-64] // .................................e............................................................................................................................................................................ || ............e................................................................................................................... - // ldr x11, [x4, #-56] // ..........................................................................................................*................................................................................................... || ...................................................................*............................................................ - // vins v1, x10, 0 // ............................................e................................................................................................................................................................. || .................e.............................................................................................................. - // vins v1, x11, 1 // .....................................................................................................................*........................................................................................ || .......................................................................*........................................................ 
- // ldr x10, [x4, #-48] // ....................................e......................................................................................................................................................................... || .............e.................................................................................................................. - // ldr x11, [x4, #-40] // ......................................e....................................................................................................................................................................... || ..............e................................................................................................................. - // vins v5, x10, 0 // ............................................................e................................................................................................................................................. || ..........................e..................................................................................................... - // vins v5, x11, 1 // ............................................................................................................*................................................................................................. || ....................................................................*........................................................... - // ldr x10, [x4, #-32] // .........................................e.................................................................................................................................................................... || ...............e................................................................................................................ 
- // ldr x11, [x4, #-24] // ...........................................e.................................................................................................................................................................. || ................e............................................................................................................... - // vins v2, x10, 0 // ..............................................................e............................................................................................................................................... || ...........................e.................................................................................................... - // vins v2, x11, 1 // ................................................................................................................*............................................................................................. || .....................................................................*.......................................................... - // ldr x10, [x4, #-16] // ..............................................e............................................................................................................................................................... || .................e.............................................................................................................. - // ldr x11, [x4, #-8] // ................................................e............................................................................................................................................................. || ..................e............................................................................................................. 
- // vins v6, x10, 0 // ...................................................................e.......................................................................................................................................... || ...............................e................................................................................................ - // vins v6, x11, 1 // ..........................................................................e................................................................................................................................... || .....................................e.......................................................................................... - // mul v24.8H, v10.8H, v0.8H // .......................................................................................................e...................................................................................................... || .................................................................e.............................................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // .....................................................................................................e........................................................................................................ || .............................................................e.................................................................. - // mls v24.8H, v10.8H, v7.H[0] // ..............................................................................................................*............................................................................................... || .....................................................................*.......................................................... 
- // sub v10.8H, v8.8H, v24.8H // ..............................................................................................................................*............................................................................... || ..........................................................................*..................................................... - // add v8.8H, v8.8H, v24.8H // ...............................................................................................................................*.............................................................................. || ...........................................................................*.................................................... - // mul v24.8H, v11.8H, v0.8H // ...................................................................................................e.......................................................................................................... || ...........................................................e.................................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // .................................................................................................e............................................................................................................ || .........................................................e...................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ......................................................................................................e....................................................................................................... || ...............................................................e................................................................ 
- // sub v11.8H, v9.8H, v24.8H // .............................................................................................................*................................................................................................ || ....................................................................*........................................................... - // add v9.8H, v9.8H, v24.8H // .................................................................................................................*............................................................................................ || ......................................................................*......................................................... - // mul v24.8H, v9.8H, v1.8H // .......................................................................................................................................*...................................................................... || ..............................................................................*................................................. - // sqrdmulh v9.8H, v9.8H, v5.8H // ............................................................................................................................*................................................................................. || ..........................................................................*..................................................... - // mls v24.8H, v9.8H, v7.H[0] // ............................................................................................................................................*................................................................. || ................................................................................*............................................... 
- // sub v9.8H, v8.8H, v24.8H // ...........................................................................................................................................................*.................................................. || ......................................................................................*......................................... - // add v8.8H, v8.8H, v24.8H // .............................................................................................................................................................*................................................ || ........................................................................................*....................................... - // mul v24.8H, v11.8H, v2.8H // .................................................................................................................................*............................................................................ || ............................................................................*................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // .........................................................................................................................*.................................................................................... || ........................................................................*....................................................... - // mls v24.8H, v11.8H, v7.H[0] // ................................................................................................................................................*............................................................. || ..................................................................................*............................................. 
- // sub v11.8H, v10.8H, v24.8H // ...............................................................................................................................................................*.............................................. || .........................................................................................*...................................... - // add v10.8H, v10.8H, v24.8H // ................................................................................................................................................................*............................................. || ..........................................................................................*..................................... - // sqdmulh v25.8H, v8.8H, v7.H[1] // ......................................................................................................................................................................................*....................... || ..............................................................................................................*................. - // srshr v25.8H, v25.8H, #11 // .............................................................................................................................................................................................*................ || ...................................................................................................................*............ - // mls v8.8H, v25.8H, v7.H[0] // .................................................................................................................................................................................................*............ || ......................................................................................................................*......... 
- // sqdmulh v25.8H, v9.8H, v7.H[1] // ....................................................................................................................................................................*......................................... || ............................................................................................*................................... - // srshr v25.8H, v25.8H, #11 // ...........................................................................................................................................................................*.................................. || .................................................................................................*.............................. - // mls v9.8H, v25.8H, v7.H[0] // ..............................................................................................................................................................................................*............... || ....................................................................................................................*........... - // sqdmulh v25.8H, v10.8H, v7.H[1] // .........................................................................................................................................................................................*.................... || ................................................................................................................*............... - // srshr v25.8H, v25.8H, #11 // ................................................................................................................................................................................................*............. || .....................................................................................................................*.......... 
- // mls v10.8H, v25.8H, v7.H[0] // ....................................................................................................................................................................................................*......... || ........................................................................................................................*....... - // sqdmulh v25.8H, v11.8H, v7.H[1] // ............................................................................................................................................................................................*................. || ..................................................................................................................*............. - // srshr v25.8H, v25.8H, #11 // ...................................................................................................................................................................................................*.......... || .......................................................................................................................*........ - // mls v11.8H, v25.8H, v7.H[0] // .......................................................................................................................................................................................................*...... || ..........................................................................................................................*..... 
- // st4 {v8.4S,v9.4S,v10.4S,v11.4S}, [x1], #64 // .............................................................................................................................................................................................................* || ...............................................................................................................................* - - subs count, count, #1 + // Instructions: 72 + // Expected cycles: 64 + // Expected IPC: 1.12 + // + // Cycle bound: 64.0 + // IPC bound: 1.12 + // + // Wall time: 55.77s + // User time: 55.77s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + sqrdmulh v31.8H, v17.8H, v6.8H // .......................................*................................ + ldr q17, [x1, #96] // ..e..................................................................... + ldr q27, [x1, #64] // e....................................................................... + ldr q9, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + ldr q2, [x3], #16 // ....e................................................................... + // gap // ........................................................................ + mul v0.8H, v1.8H, v0.8H // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v3.8H, v7.H[0] // ..............................................*......................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v16.8H, v31.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v31.2D, v15.2D, v12.2D // ...............................*........................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v11.8H, v17.8H, v2.H[1] // .....e.................................................................. + // gap // ........................................................................ + ldr q20, [x4, #-16] // ......................................*................................. + sub v24.8H, v22.8H, v0.8H // ...............................................*........................ + // gap // ........................................................................ + // gap // ........................................................................ + add v26.8H, v22.8H, v0.8H // ................................................*....................... + mul v15.8H, v17.8H, v2.H[0] // ......e................................................................. + // gap // ........................................................................ + add v10.8H, v31.8H, v16.8H // ...........................................*............................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + sub v19.8H, v31.8H, v16.8H // ..........................................*............................. + sqrdmulh v20.8H, v24.8H, v20.8H // ......................................................*................. + ldr q16, [x1, #112] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.8H, v11.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v8.8H, v16.8H, v2.H[1] // ..........e............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v16.8H, v16.8H, v2.H[0] // ...........e............................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + sub v18.8H, v27.8H, v15.8H // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v30.8H, v27.8H, v15.8H // .........e.............................................................. + ldr q27, [x4, #-32] // .....................................*.................................. + sqrdmulh v6.8H, v26.8H, v25.8H // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v16.8H, v8.8H, v7.H[0] // ............e........................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v27.8H, v24.8H, v27.8H // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mul v23.8H, v26.8H, v28.8H // ..................................................*..................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v1.8H, v9.8H, v16.8H // .............e.......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v9.8H, v16.8H // ..............e......................................................... + mls v23.8H, v6.8H, v7.H[0] // ...................................................*.................... + ldr q6, [x4, #16] // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v21.8H, v1.8H, v2.H[4] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.8H, v16.8H, v2.H[2] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ sub v25.8H, v10.8H, v23.8H // ....................................................*................... + // gap // ........................................................................ + // gap // ........................................................................ + add v24.8H, v10.8H, v23.8H // .....................................................*.................. + sqrdmulh v13.8H, v16.8H, v2.H[3] // ...............e........................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v28.8H, v1.8H, v2.H[5] // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v20.8H, v7.H[0] // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v14.8H, v13.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v21.8H, v28.8H, v7.H[0] // ......................e................................................. + ldr q28, [x4, #32] // ...................................e.................................... + // gap // ........................................................................ + add v26.8H, v19.8H, v27.8H // ..........................................................*............. + // gap // ........................................................................ + // gap // ........................................................................ + sub v27.8H, v19.8H, v27.8H // .........................................................*.............. + // gap // ........................................................................ + sqdmulh v5.8H, v24.8H, v7.H[1] // ...........................................................*............ + sub v0.8H, v30.8H, v14.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v15.8H, v30.8H, v14.8H // ...................e.................................................... + // gap // ........................................................................ + sqdmulh v16.8H, v25.8H, v7.H[1] // ..............................................................*......... 
+ sub v31.8H, v18.8H, v21.8H // .......................e................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v13.8H, v18.8H, v21.8H // ........................e............................................... + // gap // ........................................................................ + sqdmulh v20.8H, v26.8H, v7.H[1] // .................................................................*...... + srshr v23.8H, v5.8H, #11 // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v11.8H, v27.8H, v7.H[1] // ....................................................................*... + trn2 v30.4S, v15.4S, v0.4S // ..........................e............................................. + // gap // ........................................................................ + srshr v16.8H, v16.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + mls v24.8H, v23.8H, v7.H[0] // .............................................................*.......... + trn1 v15.4S, v15.4S, v0.4S // .........................e.............................................. + ldr q0, [x4], #(6*16) // .................................e...................................... + srshr v20.8H, v20.8H, #11 // ..................................................................*..... + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v25.8H, v16.8H, v7.H[0] // ................................................................*....... + trn1 v12.4S, v13.4S, v31.4S // ...........................e............................................ + // gap // ........................................................................ + srshr v16.8H, v11.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v20.8H, v7.H[0] // ...................................................................*.... + trn2 v13.4S, v13.4S, v31.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v17.2D, v15.2D, v12.2D // .............................e.......................................... + mls v27.8H, v16.8H, v7.H[0] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v1.2D, v30.2D, v13.2D // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v22.2D, v30.2D, v13.2D // ................................e....................................... + mul v16.8H, v17.8H, v0.8H // ........................................e............................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v3.8H, v1.8H, v6.8H // ............................................e........................... + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // .......................................................................* + ldr q25, [x4, #-48] // ....................................e................................... + // gap // ........................................................................ + + // --------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q8, [x1, #(16*0)] // .e.....................................................................'.~.................................................................... + // ldr q9, [x1, #(16*1)] // ..e....................................................................'..~................................................................... + // ldr q10, [x1, #(16*2)] // e......................................................................'~..................................................................... + // ldr q11, [x1, #(16*3)] // ................e......................................................'................~..................................................... 
+ // ldr q0, [x3], #16 // ...e...................................................................'...~.................................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ........e..............................................................'........~............................................................. + // mul v24.8h, v10.8h, v0.h[0] // ............e..........................................................'............~......................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................e.....................................................'.................~.................................................... + // sub v10.8h, v8.8h, v24.8h // ....................e..................................................'....................~................................................. + // add v8.8h, v8.8h, v24.8h // .....................e.................................................'.....................~................................................ + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ..................e....................................................'..................~................................................... + // mul v24.8h, v11.8h, v0.h[0] // ...................e...................................................'...................~.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ........................e..............................................'........................~............................................. + // sub v11.8h, v9.8h, v24.8h // ...........................e...........................................'...........................~.......................................... + // add v9.8h, v9.8h, v24.8h // ............................e..........................................'............................~......................................... 
+ // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...................................e...................................'...................................~.................................. + // mul v24.8h, v9.8h, v0.h[2] // ................................e......................................'................................~..................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................e................................'......................................~............................... + // sub v9.8h, v8.8h, v24.8h // ............................................e..........................'............................................~......................... + // add v8.8h, v8.8h, v24.8h // .............................................e.........................'.............................................~........................ + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ....................................e..................................'....................................~................................. + // mul v24.8h, v11.8h, v0.h[4] // ...............................e.......................................'...............................~...................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................e...............................'.......................................~.............................. + // sub v11.8h, v10.8h, v24.8h // ...............................................e.......................'...............................................~...................... + // add v10.8h, v10.8h, v24.8h // ................................................e......................'................................................~..................... + // trn1 v25.4s, v8.4s, v9.4s // .......................................................e...............'.......................................................~.............. 
+ // trn2 v26.4s, v8.4s, v9.4s // ....................................................e..................'....................................................~................. + // trn1 v27.4s, v10.4s, v11.4s // ...........................................................e...........'...........................................................~.......... + // trn2 v28.4s, v10.4s, v11.4s // ..............................................................e........'..............................................................~....... + // trn2 v10.2d, v25.2d, v27.2d // ...............................................................e.......'...............................................................~...... + // trn2 v11.2d, v26.2d, v28.2d // .................................................................e.....'.................................................................~.... + // trn1 v8.2d, v25.2d, v27.2d // .......~...............................................................'.......*.............................................................. + // trn1 v9.2d, v26.2d, v28.2d // ..................................................................e....'..................................................................~... + // ldr q0, [ x4], #(6*16) // ........................................................e..............'........................................................~............. + // ldr q4, [x4, #(-6*16 + 1*16)] // ..............................e........................................'..............................~....................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ........................................e..............................'........................................~............................. + // ldr q5, [x4, #(-6*16 + 3*16)] // ......................................................................e'...................................................................... 
+ // ldr q2, [ x4, #(-6*16 + 4*16)] // ......................~................................................'......................*............................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .........~.............................................................'.........*............................................................ + // sqrdmulh v27.8h, v10.8h, v4.8h // .......................................................................*...................................................................... + // mul v24.8h, v10.8h, v0.8h // ...................................................................e...'...................................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // ......~................................................................'......*............................................................... + // sub v10.8h, v8.8h, v24.8h // ..............~........................................................'..............*....................................................... + // add v8.8h, v8.8h, v24.8h // .............~.........................................................'.............*........................................................ + // sqrdmulh v27.8h, v11.8h, v4.8h // ....................................................................e..'....................................................................~. + // mul v24.8h, v11.8h, v0.8h // ....~..................................................................'....*................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .....~.................................................................'.....*................................................................ + // sub v11.8h, v9.8h, v24.8h // ..........~............................................................'..........*........................................................... 
+ // add v9.8h, v9.8h, v24.8h // ...........~...........................................................'...........*.......................................................... + // sqrdmulh v27.8h, v9.8h, v5.8h // .......................~...............................................'.......................*.............................................. + // mul v24.8h, v9.8h, v1.8h // ..........................~............................................'..........................*........................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................~.........................................'.............................*........................................ + // sub v9.8h, v8.8h, v24.8h // .................................~.....................................'.................................*.................................... + // add v8.8h, v8.8h, v24.8h // ..................................~....................................'..................................*................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ...............~.......................................................'...............*...................................................... + // mul v24.8h, v11.8h, v2.8h // .........................~.............................................'.........................*............................................ + // mls v24.8h, v27.8h, v7.h[0] // .....................................~.................................'.....................................*................................ + // sub v11.8h, v10.8h, v24.8h // ..........................................~............................'..........................................*........................... + // add v10.8h, v10.8h, v24.8h // .........................................~.............................'.........................................*............................ 
+ // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................~...........................'...........................................*.......................... + // srshr v25.8h, v25.8h, #11 // ..................................................~....................'..................................................*................... + // mls v8.8h, v25.8h, v7.h[0] // ......................................................~................'......................................................*............... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ..............................................~........................'..............................................*....................... + // srshr v25.8h, v25.8h, #11 // .....................................................~.................'.....................................................*................ + // mls v9.8h, v25.8h, v7.h[0] // ..........................................................~............'..........................................................*........... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .................................................~.....................'.................................................*.................... + // srshr v25.8h, v25.8h, #11 // .........................................................~.............'.........................................................*............ + // mls v10.8h, v25.8h, v7.h[0] // .............................................................~.........'.............................................................*........ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ...................................................~...................'...................................................*.................. + // srshr v25.8h, v25.8h, #11 // ............................................................~..........'............................................................*......... 
+ // mls v11.8h, v25.8h, v7.h[0] // ................................................................~......'................................................................*..... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .....................................................................~.'.....................................................................* + + sub count, count, #1 cbnz count, layer4567_start - sub v24.8H, v11.8H, v15.8H // ...*............................. - vins v18, x18, 1 // .....*........................... - ldr x21, [x4, #-56] // .*............................... - vins v2, x9, 1 // ..*.............................. - mls v25.8H, v4.8H, v7.H[0] // ....*............................ - // gap // ................................. - add v11.8H, v11.8H, v15.8H // ......*.......................... - // gap // ................................. - // gap // ................................. - sqrdmulh v21.8H, v24.8H, v5.8H // ........*........................ - // gap // ................................. - // gap // ................................. - vins v29, x21, 1 // .......*......................... - // gap // ................................. - // gap // ................................. - trn1 v0.2D, v13.2D, v16.2D // *................................ - // gap // ................................. - // gap // ................................. - mul v22.8H, v24.8H, v18.8H // ............*.................... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sqrdmulh v13.8H, v11.8H, v2.8H // .........*....................... - sub v16.8H, v0.8H, v25.8H // ..........*...................... - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - mls v22.8H, v21.8H, v7.H[0] // ...............*................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - mul v17.8H, v11.8H, v29.8H // .............*................... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - mls v17.8H, v13.8H, v7.H[0] // ..............*.................. - // gap // ................................. - // gap // ................................. - sub v30.8H, v16.8H, v22.8H // ..................*.............. - // gap // ................................. - // gap // ................................. - add v29.8H, v16.8H, v22.8H // ...................*............. - add v0.8H, v0.8H, v25.8H // ...........*..................... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sqdmulh v24.8H, v30.8H, v7.H[1] // ........................*........ - // gap // ................................. - // gap // ................................. - sub v28.8H, v0.8H, v17.8H // ................*................ - // gap // ................................. - // gap // ................................. - add v27.8H, v0.8H, v17.8H // .................*............... - sqdmulh v8.8H, v29.8H, v7.H[1] // .......................*......... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - sqdmulh v20.8H, v28.8H, v7.H[1] // ....................*............ 
- // gap // ................................. - // gap // ................................. - srshr v23.8H, v24.8H, #11 // .............................*... - // gap // ................................. - // gap // ................................. - sqdmulh v0.8H, v27.8H, v7.H[1] // ......................*.......... - // gap // ................................. - // gap // ................................. - srshr v2.8H, v8.8H, #11 // ...........................*..... - // gap // ................................. - // gap // ................................. - mls v30.8H, v23.8H, v7.H[0] // ...............................*. - // gap // ................................. - // gap // ................................. - srshr v13.8H, v20.8H, #11 // .....................*........... - // gap // ................................. - // gap // ................................. - mls v29.8H, v2.8H, v7.H[0] // ..............................*.. - // gap // ................................. - // gap // ................................. - srshr v24.8H, v0.8H, #11 // .........................*....... - // gap // ................................. - // gap // ................................. - mls v28.8H, v13.8H, v7.H[0] // ..........................*...... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - mls v27.8H, v24.8H, v7.H[0] // ............................*.... - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. 
- // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - // gap // ................................. - st4 {v27.4S,v28.4S,v29.4S,v30.4S}, [x1], #64 // ................................* - // gap // ................................. - // gap // ................................. - - // original source code - // trn1 v0.2D, v13.2D, v16.2D // ........*........................ || .....*................................ - // ldr x21, [x4, #-56] // ..*.............................. || *..................................... - // vins v2, x9, 1 // ...*............................. || .*.................................... - // sub v16.8H, v11.8H, v15.8H // *................................ || *..................................... - // mls v25.8H, v4.8H, v7.H[0] // ....*............................ || .*.................................... - // vins v18, x18, 1 // .*............................... || *..................................... - // add v11.8H, v11.8H, v15.8H // .....*........................... || ..*................................... - // vins v29, x21, 1 // .......*......................... || ....*................................. - // sqrdmulh v4.8H, v16.8H, v5.8H // ......*.......................... || ...*.................................. - // sqrdmulh v2.8H, v11.8H, v2.8H // ..........*...................... || ........*............................. - // sub v15.8H, v0.8H, v25.8H // ...........*..................... || ........*............................. - // add v0.8H, v0.8H, v25.8H // .................*............... || ................*..................... - // mul v16.8H, v16.8H, v18.8H // .........*....................... || ......*............................... - // mul v18.8H, v11.8H, v29.8H // .............*................... || ............*......................... 
- // mls v18.8H, v2.8H, v7.H[0] // ..............*.................. || ..............*....................... - // mls v16.8H, v4.8H, v7.H[0] // ............*.................... || ..........*........................... - // sub v21.8H, v0.8H, v18.8H // ...................*............. || ...................*.................. - // add v20.8H, v0.8H, v18.8H // ....................*............ || ....................*................. - // sub v23.8H, v15.8H, v16.8H // ...............*................. || ...............*...................... - // add v22.8H, v15.8H, v16.8H // ................*................ || ................*..................... - // sqdmulh v16.8H, v21.8H, v7.H[1] // ......................*.......... || ......................*............... - // srshr v0.8H, v16.8H, #11 // ...........................*..... || ...........................*.......... - // sqdmulh v11.8H, v20.8H, v7.H[1] // ........................*........ || ........................*............. - // sqdmulh v13.8H, v22.8H, v7.H[1] // .....................*........... || ....................*................. - // sqdmulh v26.8H, v23.8H, v7.H[1] // ..................*.............. || ..................*................... - // srshr v11.8H, v11.8H, #11 // .............................*... || .............................*........ - // mls v21.8H, v0.8H, v7.H[0] // ..............................*.. || ..............................*....... - // srshr v13.8H, v13.8H, #11 // .........................*....... || .........................*............ - // mls v20.8H, v11.8H, v7.H[0] // ...............................*. || ................................*..... - // srshr v26.8H, v26.8H, #11 // .......................*......... || .......................*.............. - // mls v22.8H, v13.8H, v7.H[0] // ............................*.... || ............................*......... - // mls v23.8H, v26.8H, v7.H[0] // ..........................*...... 
|| ..........................*........... - // st4 {v20.4S,v21.4S,v22.4S,v23.4S}, [x1], #64 // ................................* || .....................................* - + // Instructions: 34 + // Expected cycles: 45 + // Expected IPC: 0.76 + // + // Cycle bound: 45.0 + // IPC bound: 0.76 + // + // Wall time: 0.59s + // User time: 0.59s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + mul v26.8H, v1.8H, v0.8H // .*................................ + trn1 v11.2D, v15.2D, v12.2D // ....*............................. + ldr q0, [x4, #-32] // ...........*...................... + ldr q14, [x4, #-16] // .....*............................ + // gap // .................................. + // gap // .................................. + mls v26.8H, v3.8H, v7.H[0] // ..*............................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v12.8H, v17.8H, v6.8H // *................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v27.8H, v22.8H, v26.8H // .......*.......................... + sub v1.8H, v22.8H, v26.8H // ......*........................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v16.8H, v12.8H, v7.H[0] // ...*.............................. + // gap // .................................. 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v26.8H, v27.8H, v25.8H // ............*..................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v15.8H, v27.8H, v28.8H // ..............*................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v14.8H, v1.8H, v14.8H // ..........*....................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v15.8H, v26.8H, v7.H[0] // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v6.8H, v1.8H, v0.8H // .............*.................... + add v0.8H, v11.8H, v16.8H // ........*......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v6.8H, v14.8H, v7.H[0] // ..................*............... + // gap // .................................. + // gap // .................................. + sub v2.8H, v0.8H, v15.8H // ................*................. + // gap // .................................. 
+ // gap // .................................. + add v1.8H, v0.8H, v15.8H // .................*................ + sub v15.8H, v11.8H, v16.8H // .........*........................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqdmulh v16.8H, v2.8H, v7.H[1] // ......................*........... + add v3.8H, v15.8H, v6.8H // ...................*.............. + // gap // .................................. + // gap // .................................. + sub v4.8H, v15.8H, v6.8H // ....................*............. + sqdmulh v14.8H, v1.8H, v7.H[1] // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqdmulh v15.8H, v3.8H, v7.H[1] // .......................*.......... + // gap // .................................. + // gap // .................................. + srshr v27.8H, v16.8H, #11 // ..........................*....... + // gap // .................................. + // gap // .................................. + sqdmulh v16.8H, v4.8H, v7.H[1] // .........................*........ + // gap // .................................. + // gap // .................................. + srshr v26.8H, v14.8H, #11 // ........................*......... + // gap // .................................. + // gap // .................................. + mls v2.8H, v27.8H, v7.H[0] // .............................*.... + // gap // .................................. + // gap // .................................. + srshr v15.8H, v15.8H, #11 // ............................*..... + // gap // .................................. + // gap // .................................. 
+ mls v1.8H, v26.8H, v7.H[0] // ...........................*...... + // gap // .................................. + // gap // .................................. + srshr v27.8H, v16.8H, #11 // ..............................*... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v3.8H, v15.8H, v7.H[0] // ...............................*.. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v4.8H, v27.8H, v7.H[0] // ................................*. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + st4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x1], #64 // .................................* + // gap // .................................. + // gap // .................................. + + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // sqrdmulh v31.8H, v17.8H, v6.8H // .....*............................ + // mul v0.8H, v1.8H, v0.8H // *................................. + // mls v0.8H, v3.8H, v7.H[0] // ....*............................. + // mls v16.8H, v31.8H, v7.H[0] // ........*......................... + // trn1 v31.2D, v15.2D, v12.2D // .*................................ 
+ // ldr q20, [x4, #-16] // ...*.............................. + // sub v24.8H, v22.8H, v0.8H // .......*.......................... + // add v26.8H, v22.8H, v0.8H // ......*........................... + // add v10.8H, v31.8H, v16.8H // ..............*................... + // sub v19.8H, v31.8H, v16.8H // ..................*............... + // sqrdmulh v20.8H, v24.8H, v20.8H // ...........*...................... + // ldr q27, [x4, #-32] // ..*............................... + // sqrdmulh v6.8H, v26.8H, v25.8H // .........*........................ + // mul v27.8H, v24.8H, v27.8H // .............*.................... + // mul v23.8H, v26.8H, v28.8H // ..........*....................... + // mls v23.8H, v6.8H, v7.H[0] // ............*..................... + // sub v25.8H, v10.8H, v23.8H // ................*................. + // add v24.8H, v10.8H, v23.8H // .................*................ + // mls v27.8H, v20.8H, v7.H[0] // ...............*.................. + // add v26.8H, v19.8H, v27.8H // ....................*............. + // sub v27.8H, v19.8H, v27.8H // .....................*............ + // sqdmulh v5.8H, v24.8H, v7.H[1] // ......................*........... + // sqdmulh v16.8H, v25.8H, v7.H[1] // ...................*.............. + // sqdmulh v20.8H, v26.8H, v7.H[1] // .......................*.......... + // srshr v23.8H, v5.8H, #11 // ..........................*....... + // sqdmulh v11.8H, v27.8H, v7.H[1] // .........................*........ + // srshr v16.8H, v16.8H, #11 // ........................*......... + // mls v24.8H, v23.8H, v7.H[0] // .............................*.... + // srshr v20.8H, v20.8H, #11 // ............................*..... + // mls v25.8H, v16.8H, v7.H[0] // ...........................*...... + // srshr v16.8H, v11.8H, #11 // ..............................*... + // mls v26.8H, v20.8H, v7.H[0] // ...............................*.. + // mls v27.8H, v16.8H, v7.H[0] // ................................*. 
+ // st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // .................................* + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_firestorm.s index 9b82d465..381e3a70 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_firestorm.s @@ -26,42 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -79,15 +49,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, 
\const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -96,12 +66,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -115,21 +79,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -151,7 +115,7 @@ xtmp1 .req x11 trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -162,7 +126,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -172,7 +136,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, 
#16*1] @@ -180,7 +144,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -191,19 +155,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -342,1502 +306,1414 @@ _ntt_kyber_123_4567_scalar_load_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr x21, [x0, #320] // ....*......................................... - ldr x14, [x0, #64] // .........*.................................... - ldr x24, [x0, #448] // *............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ldr x10, [x0, #384] // .*............................................ - ldr x29, [x0, #264] // ..................*........................... - ldr x20, [x0, #328] // .......*...................................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ldr x23, [x0, #456] // ...*.......................................... 
- ldr x28, [x0, #72] // .....................*........................ - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v26.d[0], x14 // ....................................*......... - ins v8.d[0], x24 // ...........*.................................. - ldr x24, [x0, #0] // .................................*............ - ldr x15, [x0, #392] // ................*............................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v27.d[0], x10 // ..........*................................... - ldr x7, [x0, #128] // ..*........................................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v8.d[1], x23 // .................*............................ - ldr x17, [x0, #256] // .....*........................................ - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. 
- ins v21.d[0], x24 // .........................................*.... - ins v27.d[1], x15 // ........................*..................... - ins v23.d[0], x21 // .............*................................ - ldr x24, [x0, #192] // ......*....................................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - mul v3.8H, v8.8H, v0.H[0] // ....................*......................... - sqrdmulh v4.8H, v8.8H, v0.H[1] // ......................*....................... - ldr x25, [x0, #200] // ........*..................................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - sqrdmulh v25.8H, v27.8H, v0.H[1] // ..........................*................... - mul v16.8H, v27.8H, v0.H[0] // ............................*................. - ins v23.d[1], x20 // ...................*.......................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v22.d[0], x7 // ..............................*............... - ins v13.d[0], x24 // ...............*.............................. - ldr x24, [x0, #136] // ..............*............................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. 
- // gap // .............................................. - // gap // .............................................. - ins v5.d[0], x17 // ..................................*........... - mls v3.8H, v4.8H, v7.H[0] // ...........................*.................. - mul v10.8H, v23.8H, v0.H[0] // .......................*...................... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v13.d[1], x25 // .............................*................ - mls v16.8H, v25.8H, v7.H[0] // ................................*............. - sqrdmulh v24.8H, v23.8H, v0.H[1] // .........................*.................... - ldr x25, [x0, #8] // ............*................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v22.d[1], x24 // .....................................*........ - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - add v12.8H, v13.8H, v3.8H // ...............................*.............. - sub v2.8H, v13.8H, v3.8H // ...................................*.......... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. 
- // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - ins v5.d[1], x29 // ......................................*....... - mls v10.8H, v24.8H, v7.H[0] // ............................................*. - add v23.8H, v22.8H, v16.8H // .............................................* - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - sqrdmulh v15.8H, v2.8H, v0.H[5] // ..........................................*... - mul v11.8H, v2.8H, v0.H[4] // ...........................................*.. - mul v6.8H, v12.8H, v0.H[2] // .......................................*...... - sqrdmulh v2.8H, v12.8H, v0.H[3] // ........................................*..... - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. - // gap // .............................................. + // Instructions: 38 + // Expected cycles: 16 + // Expected IPC: 2.38 + // + // Cycle bound: 16.0 + // IPC bound: 2.38 + // + // Wall time: 0.62s + // User time: 0.62s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + ldr q15, [x0, #448] // *..................................... + ldr q8, [x0, #128] // ....*................................. + ldr q11, [x0, #384] // .*.................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ ldr q10, [x0, #320] // ......*............................... + ldr q9, [x0, #0] // ...*.................................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + ldr q14, [x0, #192] // ..............*....................... + ldr q24, [x0, #64] // .....*................................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v26.8H, v15.8H, v0.H[0] // .......*.............................. + sqrdmulh v3.8H, v11.8H, v0.H[1] // ..........*........................... + sqrdmulh v25.8H, v15.8H, v0.H[1] // ........*............................. + ldr q30, [x0, #256] // ..*................................... + // gap // ...................................... + mul v18.8H, v11.8H, v0.H[0] // .........*............................ + // gap // ...................................... + // gap // ...................................... + mul v21.8H, v10.8H, v0.H[0] // ............*......................... + sqrdmulh v13.8H, v10.8H, v0.H[1] // .............*........................ + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v18.8H, v3.8H, v7.H[0] // .................*.................... + mls v26.8H, v25.8H, v7.H[0] // ................*..................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v21.8H, v13.8H, v7.H[0] // ..................*................... + mul v22.8H, v30.8H, v0.H[0] // ...............*...................... + sqrdmulh v19.8H, v30.8H, v0.H[1] // ...........*.......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ add v27.8H, v8.8H, v18.8H // ......................*............... + sub v16.8H, v8.8H, v18.8H // ...........................*.......... + sub v17.8H, v14.8H, v26.8H // ....................*................. + add v30.8H, v14.8H, v26.8H // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v22.8H, v19.8H, v7.H[0] // .....................*................ + add v19.8H, v24.8H, v21.8H // .............................*........ + sub v18.8H, v24.8H, v21.8H // ....................................*. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v6.8H, v30.8H, v0.H[2] // ........................*............. + sqrdmulh v29.8H, v17.8H, v0.H[5] // ..........................*........... + mul v21.8H, v17.8H, v0.H[4] // .........................*............ + sqrdmulh v24.8H, v30.8H, v0.H[3] // .......................*.............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v14.8H, v16.8H, v0.H[5] // ..................................*... + sqrdmulh v31.8H, v27.8H, v0.H[3] // .....................................* + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v20.8H, v9.8H, v22.8H // ...............................*...... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v3.8H, v16.8H, v0.H[4] // .................................*.... + mul v26.8H, v27.8H, v0.H[2] // ............................*......... + add v4.8H, v9.8H, v22.8H // ..............................*....... + mls v6.8H, v24.8H, v7.H[0] // ................................*..... + mls v21.8H, v29.8H, v7.H[0] // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... - // original source code - // ldr x13, [x0, #448] // ..*........................................... - // ldr x15, [x0, #384] // ...*.......................................... - // ldr x20, [x0, #128] // .............*................................ - // ldr x24, [x0, #456] // ......*....................................... - // ldr x8, [x0, #320] // *............................................. - // ldr x14, [x0, #256] // ...............*.............................. - // ldr x7, [x0, #192] // ...................*.......................... - // ldr x21, [x0, #328] // .....*........................................ - // ldr x17, [x0, #200] // ......................*....................... - // ldr x22, [x0, #64] // .*............................................ - // ins v15.d[0], x15 // ............*................................. - // ins v25.d[0], x13 // .........*.................................... - // ldr x25, [x0, #8] // ...................................*.......... - // ins v20.d[0], x8 // ..................*........................... 
- // ldr x15, [x0, #136] // ............................*................. - // ins v14.d[0], x7 // ...........................*.................. - // ldr x12, [x0, #392] // ...........*.................................. - // ins v25.d[1], x24 // ..............*............................... - // ldr x26, [x0, #264] // ....*......................................... - // ins v20.d[1], x21 // .........................*.................... - // mul v11.8H, v25.8H, v0.H[0] // ....................*......................... - // ldr x28, [x0, #72] // .......*...................................... - // sqrdmulh v12.8H, v25.8H, v0.H[1] // .....................*........................ - // mul v10.8H, v20.8H, v0.H[0] // ...............................*.............. - // ins v15.d[1], x12 // .................*............................ - // sqrdmulh v13.8H, v20.8H, v0.H[1] // ..................................*........... - // sqrdmulh v28.8H, v15.8H, v0.H[1] // .......................*...................... - // mls v11.8H, v12.8H, v7.H[0] // ..............................*............... - // mul v16.8H, v15.8H, v0.H[0] // ........................*..................... - // ins v14.d[1], x17 // ................................*............. - // ins v22.d[0], x20 // ..........................*................... - // add v30.8H, v14.8H, v11.8H // .....................................*........ - // mls v16.8H, v28.8H, v7.H[0] // .................................*............ - // ldr x16, [x0, #0] // ..........*................................... - // ins v5.d[0], x14 // .............................*................ - // sub v23.8H, v14.8H, v11.8H // ......................................*....... - // ins v26.d[0], x22 // ........*..................................... - // ins v22.d[1], x15 // ....................................*......... - // ins v5.d[1], x26 // .......................................*...... 
- // mul v6.8H, v30.8H, v0.H[2] // ............................................*. - // sqrdmulh v2.8H, v30.8H, v0.H[3] // .............................................* - // ins v21.d[0], x16 // ................*............................. - // sqrdmulh v15.8H, v23.8H, v0.H[5] // ..........................................*... - // mul v11.8H, v23.8H, v0.H[4] // ...........................................*.. - // mls v10.8H, v13.8H, v7.H[0] // ........................................*..... - // add v23.8H, v22.8H, v16.8H // .........................................*.... + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // ldr q17, [x0, #448] // *..................................... + // ldr q23, [x0, #384] // ..*................................... + // ldr q25, [x0, #256] // ..........*........................... + // ldr q30, [x0, #0] // ....*................................. + // ldr q10, [x0, #128] // .*.................................... + // ldr q24, [x0, #64] // ......*............................... + // ldr q2, [x0, #320] // ...*.................................. + // mul v29.8H, v17.8H, v0.H[0] // .......*.............................. + // sqrdmulh v17.8H, v17.8H, v0.H[1] // .........*............................ + // mul v19.8H, v23.8H, v0.H[0] // ...........*.......................... + // sqrdmulh v23.8H, v23.8H, v0.H[1] // ........*............................. + // sqrdmulh v21.8H, v25.8H, v0.H[1] // ..................*................... + // mul v8.8H, v2.8H, v0.H[0] // ............*......................... + // sqrdmulh v31.8H, v2.8H, v0.H[1] // .............*........................ + // ldr q9, [x0, #192] // .....*................................ + // mul v5.8H, v25.8H, v0.H[0] // .................*.................... + // mls v29.8H, v17.8H, v7.H[0] // ...............*...................... + // mls v19.8H, v23.8H, v7.H[0] // ..............*....................... 
+ // mls v8.8H, v31.8H, v7.H[0] // ................*..................... + // add v31.8H, v9.8H, v29.8H // ......................*............... + // sub v9.8H, v9.8H, v29.8H // .....................*................ + // mls v5.8H, v21.8H, v7.H[0] // .......................*.............. + // add v17.8H, v10.8H, v19.8H // ...................*.................. + // sqrdmulh v11.8H, v31.8H, v0.H[3] // .............................*........ + // mul v6.8H, v31.8H, v0.H[2] // ..........................*........... + // mul v21.8H, v9.8H, v0.H[4] // ............................*......... + // sqrdmulh v31.8H, v9.8H, v0.H[5] // ...........................*.......... + // sub v29.8H, v10.8H, v19.8H // ....................*................. + // mul v26.8H, v17.8H, v0.H[2] // ..................................*... + // add v19.8H, v24.8H, v8.8H // ........................*............. + // add v4.8H, v30.8H, v5.8H // ...................................*.. + // sub v20.8H, v30.8H, v5.8H // ................................*..... + // mls v6.8H, v11.8H, v7.H[0] // ....................................*. + // mul v3.8H, v29.8H, v0.H[4] // .................................*.... + // sqrdmulh v14.8H, v29.8H, v0.H[5] // ..............................*....... + // mls v21.8H, v31.8H, v7.H[0] // .....................................* + // sub v18.8H, v24.8H, v8.8H // .........................*............ + // sqrdmulh v31.8H, v17.8H, v0.H[3] // ...............................*...... sub count, count, #1 layer123_start: - // gap // .................................................................................................... - mul v27.8H, v5.8H, v0.H[0] // ................................*................................................................... - ldr x13, [x0, #464] // ............................e....................................................................... 
- ldr x15, [x0, #400] // ........................e........................................................................... - sqrdmulh v28.8H, v5.8H, v0.H[1] // .................................*.................................................................. - ldr x20, [x0, #144] // ........e........................................................................................... - sub v13.8H, v22.8H, v16.8H // .............................................*...................................................... - ins v26.d[1], x28 // .......*............................................................................................ - mul v9.8H, v23.8H, v0.H[2] // ....................................................*............................................... - // gap // .................................................................................................... - sqrdmulh v23.8H, v23.8H, v0.H[3] // .....................................................*.............................................. - mls v6.8H, v2.8H, v7.H[0] // ...........................................................*........................................ - // gap // .................................................................................................... - ldr x24, [x0, #472] // .............................e...................................................................... - ldr x8, [x0, #336] // ....................e............................................................................... - ins v21.d[1], x25 // ...*................................................................................................ - add v30.8H, v26.8H, v10.8H // .........................................*.......................................................... - ldr x14, [x0, #272] // ................e................................................................................... 
- ldr x7, [x0, #208] // ............e....................................................................................... - sub v19.8H, v26.8H, v10.8H // ........................................*........................................................... - ldr x21, [x0, #344] // .....................e.............................................................................. - mls v11.8H, v15.8H, v7.H[0] // .....................................................................*.............................. - mul v8.8H, v13.8H, v0.H[4] // ..............................................................*..................................... - // gap // .................................................................................................... - ldr x17, [x0, #216] // .............e...................................................................................... - // gap // .................................................................................................... - mls v27.8H, v28.8H, v7.H[0] // ..................................*................................................................. - ldr x22, [x0, #80] // ....e............................................................................................... - // gap // .................................................................................................... - ins v15.d[0], x15 // ..........................e......................................................................... - sqrdmulh v18.8H, v13.8H, v0.H[5] // ...............................................................*.................................... - ins v25.d[0], x13 // ..............................e..................................................................... - add v24.8H, v30.8H, v6.8H // .............................................................*...................................... 
- sub v31.8H, v30.8H, v6.8H // ............................................................*....................................... - mls v9.8H, v23.8H, v7.H[0] // ......................................................*............................................. - ldr x25, [x0, #24] // .e.................................................................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v20.d[0], x8 // ......................e............................................................................. - // gap // .................................................................................................... - ldr x15, [x0, #152] // .........e.......................................................................................... - sub v5.8H, v19.8H, v11.8H // ......................................................................*............................. - add v29.8H, v19.8H, v11.8H // .......................................................................*............................ - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v14.d[0], x7 // ..............e..................................................................................... - ldr x12, [x0, #408] // .........................e.......................................................................... - ins v25.d[1], x24 // ...............................e.................................................................... - ldr x26, [x0, #280] // .................e.................................................................................. 
- // gap // .................................................................................................... - // gap // .................................................................................................... - ins v20.d[1], x21 // .......................e............................................................................ - sqrdmulh v30.8H, v31.8H, v1.H[1] // ..............................................................................*..................... - // gap // .................................................................................................... - mls v8.8H, v18.8H, v7.H[0] // ................................................................*................................... - add v2.8H, v21.8H, v27.8H // ....................................*............................................................... - mul v17.8H, v29.8H, v1.H[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v11.8H, v25.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - mul v18.8H, v31.8H, v1.H[0] // .............................................................................*...................... - // gap // .................................................................................................... - sqrdmulh v4.8H, v29.8H, v1.H[3] // ...................................................................................*................ - sub v23.8H, v2.8H, v9.8H // .......................................................*............................................ 
- ldr x28, [x0, #88] // .....e.............................................................................................. - sqrdmulh v12.8H, v25.8H, v0.H[1] // ................................................e................................................... - mul v10.8H, v20.8H, v0.H[0] // .....................................e.............................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v15.d[1], x12 // ...........................e........................................................................ - sub v6.8H, v21.8H, v27.8H // ...................................*................................................................ - sqrdmulh v13.8H, v20.8H, v0.H[1] // ......................................e............................................................. - sqrdmulh v29.8H, v5.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - mul v25.8H, v5.8H, v1.H[4] // .......................................................................................*............ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v21.8H, v24.8H, v0.H[6] // ........................................................................*........................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v17.8H, v4.8H, v7.H[0] // ....................................................................................*............... - sqrdmulh v28.8H, v15.8H, v0.H[1] // ...........................................e........................................................ - sqrdmulh v4.8H, v24.8H, v0.H[7] // .........................................................................*.......................... - // gap // .................................................................................................... - mls v11.8H, v12.8H, v7.H[0] // .................................................e.................................................. - mls v18.8H, v30.8H, v7.H[0] // ...............................................................................*.................... - add v26.8H, v6.8H, v8.8H // ..................................................................*................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v16.8H, v15.8H, v0.H[0] // ..........................................e......................................................... - // gap // .................................................................................................... - sub v31.8H, v6.8H, v8.8H // .................................................................*.................................. 
- add v24.8H, v2.8H, v9.8H // ........................................................*........................................... - mls v25.8H, v29.8H, v7.H[0] // .........................................................................................*.......... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v14.d[1], x17 // ...............e.................................................................................... - // gap // .................................................................................................... - ins v22.d[0], x20 // ..........e......................................................................................... - // gap // .................................................................................................... - mls v21.8H, v4.8H, v7.H[0] // ..........................................................................*......................... - sub v29.8H, v26.8H, v17.8H // .....................................................................................*.............. - add v12.8H, v26.8H, v17.8H // ......................................................................................*............. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v30.8H, v14.8H, v11.8H // ...................................................e................................................ 
- mls v16.8H, v28.8H, v7.H[0] // ............................................e....................................................... - // gap // .................................................................................................... - add v2.8H, v23.8H, v18.8H // .................................................................................*.................. - ldr x16, [x0, #16] // e................................................................................................... - sub v20.8H, v23.8H, v18.8H // ................................................................................*................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v5.d[0], x14 // ..................e................................................................................. - sub v23.8H, v14.8H, v11.8H // ..................................................e................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v18.8H, v31.8H, v25.8H // ..........................................................................................*......... - str q12, [x0, #256] // ................................................................................................*... - str q29, [x0, #320] // .................................................................................................*.. - add v29.8H, v31.8H, v25.8H // ...........................................................................................*........ - // gap // .................................................................................................... 
- add v4.8H, v24.8H, v21.8H // ............................................................................*....................... - ins v26.d[0], x22 // ......e............................................................................................. - // gap // .................................................................................................... - sub v3.8H, v24.8H, v21.8H // ...........................................................................*........................ - str q2, [x0, #128] // ..............................................................................................*..... - str q20, [x0, #192] // ...............................................................................................*.... - ins v22.d[1], x15 // ...........e........................................................................................ - ins v5.d[1], x26 // ...................e................................................................................ - mul v6.8H, v30.8H, v0.H[2] // .........................................................e.......................................... - sqrdmulh v2.8H, v30.8H, v0.H[3] // ..........................................................e......................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - str q18, [x0, #448] // ...................................................................................................* - ins v21.d[0], x16 // ..e................................................................................................. - str q29, [x0, #384] // ..................................................................................................*. - str q4, [x0], #(16) // ............................................................................................*....... 
- sqrdmulh v15.8H, v23.8H, v0.H[5] // ....................................................................e............................... - str q3, [x0, #48] // .............................................................................................*...... - mul v11.8H, v23.8H, v0.H[4] // ...................................................................e................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v10.8H, v13.8H, v7.H[0] // .......................................e............................................................ - add v23.8H, v22.8H, v16.8H // ..............................................e..................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Cycle bound: 16.0 + // IPC bound: 4.75 + // + // Wall time: 3600.42s + // User time: 3600.42s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q17, [x0, #464] // .......e.................................................................... + ldr q23, [x0, #400] // ......e..................................................................... + ldr q25, [x0, #272] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + add v27.8H, v18.8H, v21.8H // ...............................................*............................ + ldr q30, [x0, #16] // e........................................................................... + ldr q10, [x0, #144] // ..e......................................................................... + add v22.8H, v19.8H, v6.8H // .....................................*...................................... + sub v16.8H, v18.8H, v21.8H // ..............................................*............................. + mls v3.8H, v14.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + ldr q24, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q2, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ + sub v13.8H, v19.8H, v6.8H // ....................................*....................................... + mls v26.8H, v31.8H, v7.H[0] // ..............................*............................................. + mul v14.8H, v16.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + mul v15.8H, v27.8H, v1.H[2] // ...........................................................*................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v18.8H, v22.8H, v0.H[6] // .................................................*.......................... + sqrdmulh v22.8H, v22.8H, v0.H[7] // ................................................*........................... + mul v12.8H, v13.8H, v1.H[0] // ......................................................*..................... + mul v29.8H, v17.8H, v0.H[0] // ........................e................................................... + // gap // ............................................................................ + sqrdmulh v17.8H, v17.8H, v0.H[1] // .......................e.................................................... + sqrdmulh v27.8H, v27.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v16.8H, v16.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + mul v19.8H, v23.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v23.8H, v23.8H, v0.H[1] // ..................e......................................................... + sqrdmulh v21.8H, v25.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v8.8H, v2.8H, v0.H[0] // ..............e............................................................. + sqrdmulh v28.8H, v13.8H, v1.H[1] // .....................................................*...................... + mls v18.8H, v22.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + sqrdmulh v31.8H, v2.8H, v0.H[1] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + ldr q9, [x0, #208] // ...e........................................................................ + mul v5.8H, v25.8H, v0.H[0] // .........e.................................................................. + mls v15.8H, v27.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v2.8H, v4.8H, v26.8H // ................................*........................................... + mls v29.8H, v17.8H, v7.H[0] // .........................e.................................................. + mls v14.8H, v16.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ 
+ add v13.8H, v20.8H, v3.8H // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.8H, v23.8H, v7.H[0] // ....................e....................................................... + sub v4.8H, v4.8H, v26.8H // ...............................*............................................ + sub v22.8H, v2.8H, v18.8H // ...................................................*........................ + mls v8.8H, v31.8H, v7.H[0] // ...............e............................................................ + mls v12.8H, v28.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v25.8H, v20.8H, v3.8H // .........................................*.................................. + sub v3.8H, v13.8H, v15.8H // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v31.8H, v9.8H, v29.8H // ...........................e................................................ + sub v9.8H, v9.8H, v29.8H // ..........................e................................................. 
+ add v27.8H, v13.8H, v15.8H // ..............................................................*............. + // gap // ............................................................................ + add v15.8H, v25.8H, v14.8H // ...................................................................*........ + mls v5.8H, v21.8H, v7.H[0] // ..........e................................................................. + sub v16.8H, v25.8H, v14.8H // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v17.8H, v10.8H, v19.8H // ......................e..................................................... + // gap // ............................................................................ + str q22, [x0, #64] // .....................................................................*...... + sqrdmulh v11.8H, v31.8H, v0.H[3] // .................................e.......................................... + mul v6.8H, v31.8H, v0.H[2] // ..................................e......................................... + // gap // ............................................................................ + mul v21.8H, v9.8H, v0.H[4] // ............................................e............................... + sqrdmulh v31.8H, v9.8H, v0.H[5] // ...........................................e................................ + // gap // ............................................................................ + str q27, [x0, #256] // ........................................................................*... 
+ str q16, [x0, #448] // ...........................................................................* + sub v25.8H, v4.8H, v12.8H // ........................................................*................... + add v23.8H, v4.8H, v12.8H // .........................................................*.................. + str q15, [x0, #384] // ..........................................................................*. + sub v29.8H, v10.8H, v19.8H // .....................e...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v26.8H, v17.8H, v0.H[2] // .............................e.............................................. + add v19.8H, v24.8H, v8.8H // .................e.......................................................... + add v4.8H, v30.8H, v5.8H // ............e............................................................... + str q3, [x0, #320] // .........................................................................*.. + add v13.8H, v2.8H, v18.8H // ....................................................*....................... + sub v20.8H, v30.8H, v5.8H // ...........e................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q23, [x0, #128] // ......................................................................*..... + str q25, [x0, #192] // .......................................................................*.... + mls v6.8H, v11.8H, v7.H[0] // ...................................e........................................ 
+ // gap // ............................................................................ + mul v3.8H, v29.8H, v0.H[4] // .......................................e.................................... + // gap // ............................................................................ + sqrdmulh v14.8H, v29.8H, v0.H[5] // ......................................e..................................... + mls v21.8H, v31.8H, v7.H[0] // .............................................e.............................. + // gap // ............................................................................ + sub v18.8H, v24.8H, v8.8H // ................e........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q13, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sqrdmulh v31.8H, v17.8H, v0.H[3] // ............................e............................................... - // original source code - // ldr x10, [x0, #0] // .........................................................................e.........................|.........................................................................e...................... - // ldr x11, [x0, #(0+8)] // ............................e......................................................................|............................e................................................................... - // ins v8.d[0], x10 // ...........................................................................................e.......|...........................................................................................e.... 
- // ins v8.d[1], x11 // ...........*.......................................................................................|...........*.................................................................................... - // ldr x10, [x0, #(1*(512/8))] // .....................e.............................................................................|.....................e.......................................................................... - // ldr x11, [x0, #((1*(512/8))+8)] // ..............................................e....................................................|..............................................e................................................. - // ins v9.d[0], x10 // ..................................................................................e................|..................................................................................e............. - // ins v9.d[1], x11 // .....*.............................................................................................|.....*.......................................................................................... - // ldr x10, [x0, #(2*(512/8))] // ...e...............................................................................................|...e............................................................................................ - // ldr x11, [x0, #((2*(512/8))+8)] // ..............................e....................................................................|..............................e................................................................. - // ins v10.d[0], x10 // ..................................................................e................................|..................................................................e............................. 
- // ins v10.d[1], x11 // ......................................................................................e............|......................................................................................e......... - // ldr x10, [x0, #(3*(512/8))] // ..............e....................................................................................|..............e................................................................................. - // ldr x11, [x0, #((3*(512/8))+8)] // ...................e...............................................................................|...................e............................................................................ - // ins v11.d[0], x10 // .................................e.................................................................|.................................e.............................................................. - // ins v11.d[1], x11 // .................................................................e.................................|.................................................................e.............................. - // ldr x10, [x0, #(4*(512/8))] // .............e.....................................................................................|.............e.................................................................................. - // ldr x11, [x0, #((4*(512/8))+8)] // ....................................e..............................................................|....................................e........................................................... - // ins v12.d[0], x10 // ...........................................................................e.......................|...........................................................................e.................... 
- // ins v12.d[1], x11 // .......................................................................................e...........|.......................................................................................e........ - // ldr x10, [x0, #(5*(512/8))] // ..........e........................................................................................|..........e..................................................................................... - // ldr x11, [x0, #((5*(512/8))+8)] // ................e..................................................................................|................e............................................................................... - // ins v13.d[0], x10 // .............................e.....................................................................|.............................e.................................................................. - // ins v13.d[1], x11 // .....................................e.............................................................|.....................................e.......................................................... - // ldr x10, [x0, #(6*(512/8))] // .e.................................................................................................|.e.............................................................................................. - // ldr x11, [x0, #((6*(512/8))+8)] // ..................................e................................................................|..................................e............................................................. - // ins v14.d[0], x10 // ......................e............................................................................|......................e......................................................................... 
- // ins v14.d[1], x11 // .................................................e.................................................|.................................................e.............................................. - // ldr x10, [x0, #(7*(512/8))] // e..................................................................................................|e............................................................................................... - // ldr x11, [x0, #((7*(512/8))+8)] // .........e.........................................................................................|.........e...................................................................................... - // ins v15.d[0], x10 // ........................e..........................................................................|........................e....................................................................... - // ins v15.d[1], x11 // ...................................e...............................................................|...................................e............................................................ - // mul v24.8h, v12.8h, v0.h[0] // ...................................................................................................*................................................................................................ - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ..*................................................................................................|..*............................................................................................. - // mls v24.8h, v12.8h, v7.h[0] // ....................*..............................................................................|....................*........................................................................... 
- // sub v12.8h, v8.8h, v24.8h // ..................................................*................................................|..................................................*............................................. - // add v8.8h, v8.8h, v24.8h // ........................................*..........................................................|........................................*....................................................... - // mul v24.8h, v13.8h, v0.h[0] // ................................................e..................................................|................................................e............................................... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ...................................................e...............................................|...................................................e............................................ - // mls v24.8h, v13.8h, v7.h[0] // .................................................................................................e.|................................................................................................ - // sub v13.8h, v9.8h, v24.8h // ...............*...................................................................................|...............*................................................................................ - // add v9.8h, v9.8h, v24.8h // ............*......................................................................................|............*................................................................................... - // mul v24.8h, v14.8h, v0.h[0] // .............................................................e.....................................|.............................................................e.................................. 
- // sqrdmulh v14.8h, v14.8h, v0.h[1] // ........................................................e..........................................|........................................................e....................................... - // mls v24.8h, v14.8h, v7.h[0] // .......................................................................e...........................|.......................................................................e........................ - // sub v14.8h, v10.8h, v24.8h // ....*..............................................................................................|....*........................................................................................... - // add v10.8h, v10.8h, v24.8h // ..................................................................................................e|................................................................................................ - // mul v24.8h, v15.8h, v0.h[0] // ..........................................e........................................................|..........................................e..................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[1] // ...............................................e...................................................|...............................................e................................................ - // mls v24.8h, v15.8h, v7.h[0] // ..........................................................e........................................|..........................................................e..................................... - // sub v15.8h, v11.8h, v24.8h // ............................................................................e......................|............................................................................e................... 
- // add v11.8h, v11.8h, v24.8h // ......................................................................e............................|......................................................................e......................... - // mul v24.8h, v10.8h, v0.h[2] // ......*............................................................................................|......*......................................................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // .......*...........................................................................................|.......*........................................................................................ - // mls v24.8h, v10.8h, v7.h[0] // ...........................*.......................................................................|...........................*.................................................................... - // sub v10.8h, v8.8h, v24.8h // .............................................*.....................................................|.............................................*.................................................. - // add v8.8h, v8.8h, v24.8h // ...............................................................*...................................|...............................................................*................................ - // mul v24.8h, v11.8h, v0.h[2] // ........................................................................................e..........|........................................................................................e....... - // sqrdmulh v11.8h, v11.8h, v0.h[3] // .........................................................................................e.........|.........................................................................................e...... 
- // mls v24.8h, v11.8h, v7.h[0] // ........*..........................................................................................|........*....................................................................................... - // sub v11.8h, v9.8h, v24.8h // ..........................*........................................................................|..........................*..................................................................... - // add v9.8h, v9.8h, v24.8h // .........................*.........................................................................|.........................*...................................................................... - // mul v24.8h, v14.8h, v0.h[4] // ..................*................................................................................|..................*............................................................................. - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .......................*...........................................................................|.......................*........................................................................ - // mls v24.8h, v14.8h, v7.h[0] // .......................................*...........................................................|.......................................*........................................................ - // sub v14.8h, v12.8h, v24.8h // ..............................................................*....................................|..............................................................*................................. - // add v12.8h, v12.8h, v24.8h // ............................................................*......................................|............................................................*................................... 
- // mul v24.8h, v15.8h, v0.h[4] // ................................................................................................e..|................................................................................................ - // sqrdmulh v15.8h, v15.8h, v0.h[5] // ..............................................................................................e....|..............................................................................................e. - // mls v24.8h, v15.8h, v7.h[0] // .................*.................................................................................|.................*.............................................................................. - // sub v15.8h, v13.8h, v24.8h // ...............................*...................................................................|...............................*................................................................ - // add v13.8h, v13.8h, v24.8h // ................................*..................................................................|................................*............................................................... - // mul v24.8h, v9.8h, v0.h[6] // ......................................................*............................................|......................................................*......................................... - // sqrdmulh v9.8h, v9.8h, v0.h[7] // .........................................................*.........................................|.........................................................*...................................... - // mls v24.8h, v9.8h, v7.h[0] // ...................................................................*...............................|...................................................................*............................ 
- // sub v9.8h, v8.8h, v24.8h // ...................................................................................*...............|...................................................................................*............ - // add v8.8h, v8.8h, v24.8h // .................................................................................*.................|.................................................................................*.............. - // mul v24.8h, v11.8h, v1.h[0] // ...........................................*.......................................................|...........................................*.................................................... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ......................................*............................................................|......................................*......................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...........................................................*.......................................|...........................................................*.................................... - // sub v11.8h, v10.8h, v24.8h // ..........................................................................*........................|..........................................................................*..................... - // add v10.8h, v10.8h, v24.8h // ........................................................................*..........................|........................................................................*....................... - // mul v24.8h, v13.8h, v1.h[2] // .........................................*.........................................................|.........................................*...................................................... 
- // sqrdmulh v13.8h, v13.8h, v1.h[3] // ............................................*......................................................|............................................*................................................... - // mls v24.8h, v13.8h, v7.h[0] // .......................................................*...........................................|.......................................................*........................................ - // sub v13.8h, v12.8h, v24.8h // ....................................................................*..............................|....................................................................*........................... - // add v12.8h, v12.8h, v24.8h // .....................................................................*.............................|.....................................................................*.......................... - // mul v24.8h, v15.8h, v1.h[4] // .....................................................*.............................................|.....................................................*.......................................... - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ....................................................*..............................................|....................................................*........................................... - // mls v24.8h, v15.8h, v7.h[0] // ................................................................*..................................|................................................................*............................... - // sub v15.8h, v14.8h, v24.8h // .............................................................................*.....................|.............................................................................*.................. 
- // add v14.8h, v14.8h, v24.8h // ................................................................................*..................|................................................................................*............... - // str q8, [x0], #(16) // .............................................................................................*.....|.............................................................................................*.. - // str q9, [x0, #(-16 + 1*(512/8))] // ...............................................................................................*...|...............................................................................................* - // str q10, [x0, #(-16 + 2*(512/8))] // ....................................................................................*..............|....................................................................................*........... - // str q11, [x0, #(-16 + 3*(512/8))] // .....................................................................................*.............|.....................................................................................*.......... - // str q12, [x0, #(-16 + 4*(512/8))] // ..............................................................................*....................|..............................................................................*................. - // str q13, [x0, #(-16 + 5*(512/8))] // ...............................................................................*...................|...............................................................................*................ - // str q14, [x0, #(-16 + 6*(512/8))] // ............................................................................................*......|............................................................................................*... 
- // str q15, [x0, #(-16 + 7*(512/8))] // ..........................................................................................*........|..........................................................................................*..... + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #0] // ....e.......................................................................'...~...................................................................... + // ldr q9, [x0, #(1*(512/8))] // .........e..................................................................'........~................................................................. + // ldr q10, [x0, #(2*(512/8))] // .....e......................................................................'....~..................................................................... + // ldr q11, [x0, #(3*(512/8))] // .............................e..............................................'............................~............................................. + // ldr q12, [x0, #(4*(512/8))] // ..e.........................................................................'.~........................................................................ + // ldr q13, [x0, #(5*(512/8))] // ..........e.................................................................'.........~................................................................ + // ldr q14, [x0, #(6*(512/8))] // .e..........................................................................'~......................................................................... 
+ // ldr q15, [x0, #(7*(512/8))] // e...........................................................................~.......................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ........................e...................................................'.......................~.................................................. + // mul v24.8h, v12.8h, v0.h[0] // ..............................e.............................................'.............................~............................................ + // mls v24.8h, v27.8h, v7.h[0] // ...............................................e............................'..............................................~........................... + // sub v12.8h, v8.8h, v24.8h // ..................................................................e.........'.................................................................~........ + // add v8.8h, v8.8h, v24.8h // ...............................................................e............'..............................................................~........... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ............................e...............................................'...........................~.............................................. + // mul v24.8h, v13.8h, v0.h[0] // .........................e..................................................'........................~................................................. + // mls v24.8h, v27.8h, v7.h[0] // .......................................e....................................'......................................~................................... + // sub v13.8h, v9.8h, v24.8h // .........................................................................e..'........................................................................~. 
+ // add v9.8h, v9.8h, v24.8h // ..............................................................e.............'.............................................................~............ + // sqrdmulh v27.8h, v14.8h, v0.h[1] // .......................e....................................................'......................~................................................... + // mul v24.8h, v14.8h, v0.h[0] // ......................e.....................................................'.....................~.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................................e.......................................'...................................~...................................... + // sub v14.8h, v10.8h, v24.8h // ............................................................e...............'...........................................................~.............. + // add v10.8h, v10.8h, v24.8h // .................................................e..........................'................................................~......................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ...................e........................................................'..................~....................................................... + // mul v24.8h, v15.8h, v0.h[0] // ..................e.........................................................'.................~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // .................................e..........................................'................................~......................................... + // sub v15.8h, v11.8h, v24.8h // ............................................e...............................'...........................................~.............................. 
+ // add v11.8h, v11.8h, v24.8h // ...........................................e................................'..........................................~............................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ...........................................................................e'.......................................................................... + // mul v24.8h, v10.8h, v0.h[2] // .............................................................e..............'............................................................~............. + // mls v24.8h, v27.8h, v7.h[0] // ............~...............................................................'...........*.............................................................. + // sub v10.8h, v8.8h, v24.8h // .....................................~......................................'....................................*..................................... + // add v8.8h, v8.8h, v24.8h // ................................~...........................................'...............................*.......................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ...................................................e........................'..................................................~....................... + // mul v24.8h, v11.8h, v0.h[2] // ....................................................e.......................'...................................................~...................... + // mls v24.8h, v27.8h, v7.h[0] // .....................................................................e......'....................................................................~..... + // sub v11.8h, v9.8h, v24.8h // ...........~................................................................'..........*............................................................... 
+ // add v9.8h, v9.8h, v24.8h // ......~.....................................................................'.....*.................................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // .......................................................................e....'......................................................................~... + // mul v24.8h, v14.8h, v0.h[4] // ......................................................................e.....'.....................................................................~.... + // mls v24.8h, v27.8h, v7.h[0] // ........~...................................................................'.......*.................................................................. + // sub v14.8h, v12.8h, v24.8h // .........................................~..................................'........................................*................................. + // add v12.8h, v12.8h, v24.8h // ...................................~........................................'..................................*....................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ......................................................e.....................'.....................................................~.................... + // mul v24.8h, v15.8h, v0.h[4] // .....................................................e......................'....................................................~..................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................................................e...'.......................................................................~.. + // sub v15.8h, v13.8h, v24.8h // .......~....................................................................'......*................................................................... 
+ // add v13.8h, v13.8h, v24.8h // ...~........................................................................'..*....................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ................~...........................................................'...............*.......................................................... + // mul v24.8h, v9.8h, v0.h[6] // ...............~............................................................'..............*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................~................................................'..........................*............................................... + // sub v9.8h, v8.8h, v24.8h // ......................................~.....................................'.....................................*.................................... + // add v8.8h, v8.8h, v24.8h // .................................................................~..........'................................................................*......... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ..........................~.................................................'.........................*................................................ + // mul v24.8h, v11.8h, v1.h[0] // .................~..........................................................'................*......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................~...................................'.......................................*.................................. + // sub v11.8h, v10.8h, v24.8h // .........................................................~..................'........................................................*................. 
+ // add v10.8h, v10.8h, v24.8h // ..........................................................~.................'.........................................................*................ + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ....................~.......................................................'...................*...................................................... + // mul v24.8h, v13.8h, v1.h[2] // ..............~.............................................................'.............*............................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...............................~............................................'..............................*........................................... + // sub v13.8h, v12.8h, v24.8h // ..........................................~.................................'.........................................*................................ + // add v12.8h, v12.8h, v24.8h // .............................................~..............................'............................................*............................. + // sqrdmulh v27.8h, v15.8h, v1.h[5] // .....................~......................................................'....................*..................................................... + // mul v24.8h, v15.8h, v1.h[4] // .............~..............................................................'............*............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................~.........................................'.................................*........................................ + // sub v15.8h, v14.8h, v24.8h // ................................................~...........................'...............................................*.......................... 
+ // add v14.8h, v14.8h, v24.8h // ..............................................~.............................'.............................................*............................ + // str q8, [x0], #(16) // ..........................................................................~.'.........................................................................* + // str q9, [x0, #(-16 + 1*(512/8))] // ..................................................~.........................'.................................................*........................ + // str q10, [x0, #(-16 + 2*(512/8))] // ...................................................................~........'..................................................................*....... + // str q11, [x0, #(-16 + 3*(512/8))] // ....................................................................~.......'...................................................................*...... + // str q12, [x0, #(-16 + 4*(512/8))] // .......................................................~....................'......................................................*................... + // str q13, [x0, #(-16 + 5*(512/8))] // ................................................................~...........'...............................................................*.......... + // str q14, [x0, #(-16 + 6*(512/8))] // ...........................................................~................'..........................................................*............... + // str q15, [x0, #(-16 + 7*(512/8))] // ........................................................~...................'.......................................................*.................. sub count, count, #1 cbnz count, layer123_start - mul v12.8H, v5.8H, v0.H[0] // *..................................................... - sqrdmulh v31.8H, v5.8H, v0.H[1] // .*.................................................... 
- sub v16.8H, v22.8H, v16.8H // ..*................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - ins v26.d[1], x28 // ...*.................................................. - sqrdmulh v9.8H, v23.8H, v0.H[3] // .....*................................................ - mul v29.8H, v23.8H, v0.H[2] // ....*................................................. - mls v11.8H, v15.8H, v7.H[0] // ..........*........................................... - mls v6.8H, v2.8H, v7.H[0] // ......*............................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v24.8H, v26.8H, v10.8H // .........*............................................ - add v26.8H, v26.8H, v10.8H // ........*............................................. - mul v27.8H, v16.8H, v0.H[4] // ...........*.......................................... - sqrdmulh v25.8H, v16.8H, v0.H[5] // .............*........................................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - ins v21.d[1], x25 // .......*.............................................. - mls v12.8H, v31.8H, v7.H[0] // ............*......................................... - // gap // ...................................................... - // gap // ...................................................... 
- // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v31.8H, v24.8H, v11.8H // .................*.................................... - add v23.8H, v24.8H, v11.8H // ..................*................................... - sub v10.8H, v26.8H, v6.8H // ...............*...................................... - add v11.8H, v26.8H, v6.8H // ..............*....................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v27.8H, v25.8H, v7.H[0] // ....................*................................. - mls v29.8H, v9.8H, v7.H[0] // ................*..................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sqrdmulh v26.8H, v31.8H, v1.H[5] // ...........................*.......................... - mul v18.8H, v23.8H, v1.H[2] // ......................*............................... - sub v20.8H, v21.8H, v12.8H // ..........................*........................... - sqrdmulh v4.8H, v23.8H, v1.H[3] // ........................*............................. - // gap // ...................................................... - // gap // ...................................................... 
- // gap // ...................................................... - // gap // ...................................................... - add v30.8H, v21.8H, v12.8H // .....................*................................ - sqrdmulh v23.8H, v10.8H, v1.H[1] // ...................*.................................. - mul v14.8H, v31.8H, v1.H[4] // ............................*......................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v22.8H, v20.8H, v27.8H // ..................................*................... - add v27.8H, v20.8H, v27.8H // .................................*.................... - mul v13.8H, v10.8H, v1.H[0] // .......................*.............................. - mul v31.8H, v11.8H, v0.H[6] // .............................*........................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - add v28.8H, v30.8H, v29.8H // ...................................*.................. - sub v25.8H, v30.8H, v29.8H // .........................*............................ - mls v18.8H, v4.8H, v7.H[0] // ..............................*....................... - sqrdmulh v2.8H, v11.8H, v0.H[7] // ...............................*...................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- // gap // ...................................................... - mls v14.8H, v26.8H, v7.H[0] // ....................................*................. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - mls v13.8H, v23.8H, v7.H[0] // ................................*..................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v12.8H, v27.8H, v18.8H // ......................................*............... - add v19.8H, v27.8H, v18.8H // .......................................*.............. - mls v31.8H, v2.8H, v7.H[0] // .....................................*................ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - add v9.8H, v22.8H, v14.8H // .............................................*........ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v5.8H, v22.8H, v14.8H // ..........................................*........... - str q19, [x0, #256] // ...........................................*.......... - add v3.8H, v25.8H, v13.8H // ........................................*............. - str q12, [x0, #320] // ............................................*......... - sub v21.8H, v25.8H, v13.8H // .........................................*............ - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - sub v29.8H, v28.8H, v31.8H // ...............................................*...... - str q9, [x0, #384] // ...................................................*.. - add v9.8H, v28.8H, v31.8H // ..............................................*....... - str q5, [x0, #448] // ..................................................*... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - str q3, [x0, #128] // ................................................*..... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... 
- str q21, [x0, #192] // .................................................*.... - str q29, [x0, #64] // .....................................................* - str q9, [x0], #(16) // ....................................................*. - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... - // gap // ...................................................... + // Instructions: 38 + // Expected cycles: 14 + // Expected IPC: 2.71 + // + // Cycle bound: 14.0 + // IPC bound: 2.71 + // + // Wall time: 0.62s + // User time: 0.62s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + add v30.8H, v18.8H, v21.8H // *..................................... + sub v13.8H, v18.8H, v21.8H // ..*................................... + mls v26.8H, v31.8H, v7.H[0] // .....*................................ + mls v3.8H, v14.8H, v7.H[0] // ...*.................................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v2.8H, v19.8H, v6.8H // .*.................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v22.8H, v19.8H, v6.8H // ....*................................. + sqrdmulh v8.8H, v30.8H, v1.H[3] // ...........*.......................... 
+ sqrdmulh v15.8H, v13.8H, v1.H[5] // ............*......................... + mul v17.8H, v30.8H, v1.H[2] // .......*.............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v30.8H, v20.8H, v3.8H // ..................*................... + sub v19.8H, v4.8H, v26.8H // ...................*.................. + mul v21.8H, v13.8H, v1.H[4] // ......*............................... + mul v10.8H, v2.8H, v0.H[6] // ........*............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v9.8H, v2.8H, v0.H[7] // .........*............................ + sub v24.8H, v20.8H, v3.8H // ......................*............... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v20.8H, v22.8H, v1.H[0] // ..........*........................... + mls v17.8H, v8.8H, v7.H[0] // ...............*...................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v28.8H, v22.8H, v1.H[1] // .............*........................ + add v11.8H, v4.8H, v26.8H // ................*..................... + mls v21.8H, v15.8H, v7.H[0] // .................*.................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v10.8H, v9.8H, v7.H[0] // ..............*....................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v29.8H, v30.8H, v17.8H // .......................*.............. + mls v20.8H, v28.8H, v7.H[0] // .....................*................ + add v17.8H, v30.8H, v17.8H // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v12.8H, v24.8H, v21.8H // .........................*............ + sub v18.8H, v24.8H, v21.8H // ..........................*........... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q17, [x0, #256] // ............................*......... + str q29, [x0, #320] // .................................*.... + sub v5.8H, v11.8H, v10.8H // ....................*................. + add v28.8H, v11.8H, v10.8H // ..................................*... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + str q12, [x0, #384] // ................................*..... + sub v30.8H, v19.8H, v20.8H // ..............................*....... + add v14.8H, v19.8H, v20.8H // ...............................*...... + str q18, [x0, #448] // .............................*........ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q5, [x0, #64] // ...........................*.......... + str q28, [x0], #(16) // .....................................* + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q30, [x0, #176] // ....................................*. + str q14, [x0, #112] // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... - // original source code - // mul v27.8H, v5.8H, v0.H[0] // *..................................................... - // sqrdmulh v28.8H, v5.8H, v0.H[1] // .*.................................................... - // sub v13.8H, v22.8H, v16.8H // ..*................................................... - // ins v26.d[1], x28 // ...*.................................................. - // mul v9.8H, v23.8H, v0.H[2] // .....*................................................ 
- // sqrdmulh v23.8H, v23.8H, v0.H[3] // ....*................................................. - // mls v6.8H, v2.8H, v7.H[0] // .......*.............................................. - // ins v21.d[1], x25 // ............*......................................... - // add v30.8H, v26.8H, v10.8H // .........*............................................ - // sub v19.8H, v26.8H, v10.8H // ........*............................................. - // mls v11.8H, v15.8H, v7.H[0] // ......*............................................... - // mul v8.8H, v13.8H, v0.H[4] // ..........*........................................... - // mls v27.8H, v28.8H, v7.H[0] // .............*........................................ - // sqrdmulh v18.8H, v13.8H, v0.H[5] // ...........*.......................................... - // add v24.8H, v30.8H, v6.8H // .................*.................................... - // sub v31.8H, v30.8H, v6.8H // ................*..................................... - // mls v9.8H, v23.8H, v7.H[0] // ...................*.................................. - // sub v5.8H, v19.8H, v11.8H // ..............*....................................... - // add v29.8H, v19.8H, v11.8H // ...............*...................................... - // sqrdmulh v30.8H, v31.8H, v1.H[1] // .........................*............................ - // mls v8.8H, v18.8H, v7.H[0] // ..................*................................... - // add v2.8H, v21.8H, v27.8H // ........................*............................. - // mul v17.8H, v29.8H, v1.H[2] // .....................*................................ - // mul v18.8H, v31.8H, v1.H[0] // .............................*........................ - // sqrdmulh v4.8H, v29.8H, v1.H[3] // .......................*.............................. - // sub v23.8H, v2.8H, v9.8H // ................................*..................... - // sub v6.8H, v21.8H, v27.8H // ......................*............................... 
- // sqrdmulh v29.8H, v5.8H, v1.H[5] // ....................*................................. - // mul v25.8H, v5.8H, v1.H[4] // ..........................*........................... - // mul v21.8H, v24.8H, v0.H[6] // ..............................*....................... - // mls v17.8H, v4.8H, v7.H[0] // .................................*.................... - // sqrdmulh v4.8H, v24.8H, v0.H[7] // ..................................*................... - // mls v18.8H, v30.8H, v7.H[0] // ....................................*................. - // add v26.8H, v6.8H, v8.8H // ............................*......................... - // sub v31.8H, v6.8H, v8.8H // ...........................*.......................... - // add v24.8H, v2.8H, v9.8H // ...............................*...................... - // mls v25.8H, v29.8H, v7.H[0] // ...................................*.................. - // mls v21.8H, v4.8H, v7.H[0] // .......................................*.............. - // sub v29.8H, v26.8H, v17.8H // .....................................*................ - // add v12.8H, v26.8H, v17.8H // ......................................*............... - // add v2.8H, v23.8H, v18.8H // ...........................................*.......... - // sub v20.8H, v23.8H, v18.8H // .............................................*........ - // sub v18.8H, v31.8H, v25.8H // .........................................*............ - // str q12, [x0, #256] // ..........................................*........... - // str q29, [x0, #320] // ............................................*......... - // add v29.8H, v31.8H, v25.8H // ........................................*............. - // add v4.8H, v24.8H, v21.8H // ................................................*..... - // sub v3.8H, v24.8H, v21.8H // ..............................................*....... - // str q2, [x0, #128] // ..................................................*... 
- // str q20, [x0, #192] // ...................................................*.. - // str q18, [x0, #448] // .................................................*.... - // str q29, [x0, #384] // ...............................................*...... - // str q4, [x0], #(16) // .....................................................* - // str q3, [x0, #48] // ....................................................*. + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // add v27.8H, v18.8H, v21.8H // *..................................... + // add v22.8H, v19.8H, v6.8H // ....*................................. + // sub v16.8H, v18.8H, v21.8H // .*.................................... + // mls v3.8H, v14.8H, v7.H[0] // ...*.................................. + // sub v13.8H, v19.8H, v6.8H // .....*................................ + // mls v26.8H, v31.8H, v7.H[0] // ..*................................... + // mul v14.8H, v16.8H, v1.H[4] // ...........*.......................... + // mul v15.8H, v27.8H, v1.H[2] // ........*............................. + // mul v18.8H, v22.8H, v0.H[6] // ............*......................... + // sqrdmulh v22.8H, v22.8H, v0.H[7] // .............*........................ + // mul v12.8H, v13.8H, v1.H[0] // ...............*...................... + // sqrdmulh v27.8H, v27.8H, v1.H[3] // ......*............................... + // sqrdmulh v16.8H, v16.8H, v1.H[5] // .......*.............................. + // sqrdmulh v28.8H, v13.8H, v1.H[1] // .................*.................... + // mls v18.8H, v22.8H, v7.H[0] // ....................*................. + // mls v15.8H, v27.8H, v7.H[0] // ................*..................... + // add v2.8H, v4.8H, v26.8H // ..................*................... + // mls v14.8H, v16.8H, v7.H[0] // ...................*.................. + // add v13.8H, v20.8H, v3.8H // .........*............................ 
+ // sub v4.8H, v4.8H, v26.8H // ..........*........................... + // sub v22.8H, v2.8H, v18.8H // ............................*......... + // mls v12.8H, v28.8H, v7.H[0] // ......................*............... + // sub v25.8H, v20.8H, v3.8H // ..............*....................... + // sub v3.8H, v13.8H, v15.8H // .....................*................ + // add v27.8H, v13.8H, v15.8H // .......................*.............. + // add v15.8H, v25.8H, v14.8H // ........................*............. + // sub v16.8H, v25.8H, v14.8H // .........................*............ + // str q22, [x0, #64] // ..................................*... + // str q27, [x0, #256] // ..........................*........... + // str q16, [x0, #448] // .................................*.... + // sub v25.8H, v4.8H, v12.8H // ...............................*...... + // add v23.8H, v4.8H, v12.8H // ................................*..... + // str q15, [x0, #384] // ..............................*....... + // str q3, [x0, #320] // ...........................*.......... + // add v13.8H, v2.8H, v18.8H // .............................*........ + // str q23, [x0, #128] // .....................................* + // str q25, [x0, #192] // ....................................*. + // str q13, [x0], #(16) // ...................................*.. restore inp, STACK0 mov count, #8 .p2align 2 - ldr x26, [x1, #48] // *................................................................ - ldr x20, [x3], #16 // ..*.............................................................. - ldr x15, [x4, #64] // ..............................................*.................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - ldr x12, [x4, #72] // .......................................................*......... - ldr x16, [x4, #16] // ............*.................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ldr x14, [x3, #-8] // ......*.......................................................... - ldr x24, [x1, #56] // ...*............................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v12.d[0], x15 // ..................................................*.............. - ins v4.d[0], x20 // ..........*...................................................... - ins v23.d[0], x26 // .........*....................................................... - ldr x21, [x1, #16] // ....................*............................................ - ldr x22, [x4, #24] // .........................*....................................... - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - ins v29.d[0], x16 // ...................*............................................. - ldr x16, [x1, #8] // .......*......................................................... - ldr x13, [x1, #32] // .*............................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v12.d[1], x12 // ............................................................*.... - ins v4.d[1], x14 // ..............*.................................................. - ins v23.d[1], x24 // .............*................................................... - ldr x24, [x1, #24] // ..........................*...................................... - ldr x20, [x1, #0] // ................*................................................ - ldr x25, [x1, #40] // .....*........................................................... - // gap // ................................................................. - // gap // ................................................................. - ins v29.d[1], x22 // .................................*............................... - ldr x14, [x4, #80] // .................*............................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - mul v26.8H, v23.8H, v4.H[0] // .......................*......................................... - sqrdmulh v22.8H, v23.8H, v4.H[1] // ........................*........................................ - ins v3.d[0], x13 // ........*........................................................ - ldr x11, [x4, #8] // ...........*..................................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ldr x10, [x4], #(6*16) // ....*............................................................ - ins v23.d[0], x20 // ......................*.......................................... - ldr x15, [x4, #-8] // ..............................*.................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v3.d[1], x25 // ...............*................................................. - ins v17.d[0], x21 // .............................*................................... - ins v27.d[0], x14 // ................................*................................ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - // gap // ................................................................. - ins v23.d[1], x16 // ......................................*.......................... - mls v26.8H, v22.8H, v7.H[0] // ...............................*................................. - ins v14.d[0], x10 // ..................*.............................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v27.d[1], x15 // ....................................*............................ - ins v17.d[1], x24 // ..................................*.............................. - sqrdmulh v5.8H, v3.8H, v4.H[1] // .....................*........................................... - mul v22.8H, v3.8H, v4.H[0] // ...........................*..................................... - ldr x24, [x4, #-64] // ............................*.................................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v14.d[1], x11 // ................................................*................ - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v15.8H, v17.8H, v26.8H // .....................................*........................... - add v13.8H, v17.8H, v26.8H // .......................................*......................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v22.8H, v5.8H, v7.H[0] // ...................................*............................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sqrdmulh v17.8H, v15.8H, v4.H[5] // .........................................*....................... - mul v19.8H, v15.8H, v4.H[4] // ..........................................*...................... - mul v1.8H, v13.8H, v4.H[2] // ........................................*........................ - sqrdmulh v18.8H, v13.8H, v4.H[3] // ...........................................*..................... 
- // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - ins v10.d[0], x24 // ..............................................................*.. - ldr x24, [x4, #-40] // .............................................................*... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - add v3.8H, v23.8H, v22.8H // ............................................*.................... - sub v22.8H, v23.8H, v22.8H // .............................................*................... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - mls v19.8H, v17.8H, v7.H[0] // .................................................*............... - mls v1.8H, v18.8H, v7.H[0] // ...............................................*................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v18.8H, v3.8H, v1.8H // ...................................................*............. - add v23.8H, v3.8H, v1.8H // ....................................................*............ - add v20.8H, v22.8H, v19.8H // .....................................................*........... - // gap // ................................................................. 
- // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - sub v13.8H, v22.8H, v19.8H // ......................................................*.......... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn2 v19.4S, v23.4S, v18.4S // .........................................................*....... - trn1 v3.4S, v23.4S, v18.4S // ........................................................*........ - trn2 v26.4S, v20.4S, v13.4S // ...........................................................*..... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn1 v22.4S, v20.4S, v13.4S // ..........................................................*...... - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. 
- // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn2 v23.2D, v19.2D, v26.2D // ................................................................* - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - // gap // ................................................................. - trn2 v4.2D, v3.2D, v22.2D // ...............................................................*. + // Instructions: 40 + // Expected cycles: 25 + // Expected IPC: 1.60 + // + // Cycle bound: 25.0 + // IPC bound: 1.60 + // + // Wall time: 0.58s + // User time: 0.58s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + ldr q16, [x1, #32] // .*...................................... + ldr q14, [x3], #16 // *....................................... + ldr q28, [x1, #48] // ..*..................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q27, [x1, #16] // .....*.................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q13, [x4, #64] // ............*........................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v6.8H, v28.8H, v14.H[1] // ........*............................... + mul v26.8H, v28.8H, v14.H[0] // .........*.............................. + sqrdmulh v15.8H, v16.8H, v14.H[1] // ......*................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v26.8H, v6.8H, v7.H[0] // ..........*............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v16.8H, v16.8H, v14.H[0] // .......*................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v6.8H, v27.8H, v26.8H // ...............*........................ + sub v27.8H, v27.8H, v26.8H // ..............*......................... + ldr q26, [x1, #0] // ...*.................................... + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v15.8H, v7.H[0] // ...........*............................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v31.8H, v27.8H, v14.H[4] // ...................*.................... + sqrdmulh v23.8H, v27.8H, v14.H[5] // ....................*................... + mul v27.8H, v6.8H, v14.H[2] // .................*...................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v14.8H, v6.8H, v14.H[3] // ..................*..................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v11.8H, v26.8H, v16.8H // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ mls v31.8H, v23.8H, v7.H[0] // ......................*................. + mls v27.8H, v14.8H, v7.H[0] // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v0.8H, v26.8H, v16.8H // .......................*................ + ldr q21, [x4, #16] // ....*................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v15.8H, v11.8H, v31.8H // ..........................*............. + add v6.8H, v11.8H, v31.8H // ...........................*............ + add v16.8H, v0.8H, v27.8H // .........................*.............. + sub v14.8H, v0.8H, v27.8H // ............................*........... + ldr q31, [x4, #80] // .............*.......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v0.4S, v16.4S, v14.4S // .............................*.......... + trn1 v3.4S, v6.4S, v15.4S // ...............................*........ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v27.4S, v6.4S, v15.4S // ................................*....... + trn2 v15.4S, v16.4S, v14.4S // ..............................*......... + ldr q16, [x4, #48] // .....................*.................. + ldr q14, [x4], #(6*16) // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v25.2D, v0.2D, v3.2D // ...................................*.... + trn2 v0.2D, v0.2D, v3.2D // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v6.2D, v15.2D, v27.2D // .....................................*.. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v22.2D, v15.2D, v27.2D // ......................................*. + ldr q27, [x4, #-64] // ..................................*..... + sqrdmulh v11.8H, v0.8H, v21.8H // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ - // original source code - // ldr x10, [x1, #48] // *................................................................ - // ldr x19, [x1, #32] // ..............*.................................................. - // ldr x16, [x3], #16 // .*............................................................... - // ldr x25, [x1, #56] // ......*.......................................................... - // ldr x17, [x4], #(6*16) // ...........................*..................................... - // ldr x13, [x1, #40] // ....................*............................................ - // ldr x20, [x3, #-8] // .....*........................................................... - // ldr x11, [x1, #8] // .............*................................................... - // ins v28.d[0], x19 // .........................*....................................... - // ins v6.d[0], x10 // .........*....................................................... - // ins v1.d[0], x16 // ........*........................................................ - // ldr x27, [x4, #-88] // ..........................*...................................... - // ldr x14, [x4, #-80] // ....*............................................................ 
- // ins v6.d[1], x25 // .................*............................................... - // ins v1.d[1], x20 // ................*................................................ - // ins v28.d[1], x13 // ..............................*.................................. - // ldr x16, [x1, #0] // ...................*............................................. - // ldr x23, [x4, #-16] // ......................*.......................................... - // ins v14.d[0], x17 // ...................................*............................. - // ins v29.d[0], x14 // ............*.................................................... - // ldr x13, [x1, #16] // ..........*...................................................... - // sqrdmulh v15.8H, v28.8H, v1.H[1] // ......................................*.......................... - // ins v16.d[0], x16 // ............................*.................................... - // mul v19.8H, v6.8H, v1.H[0] // .......................*......................................... - // sqrdmulh v11.8H, v6.8H, v1.H[1] // ........................*........................................ - // ldr x15, [x4, #-72] // ...........*..................................................... - // ldr x26, [x1, #24] // ..................*.............................................. - // mul v2.8H, v28.8H, v1.H[0] // .......................................*......................... - // ldr x12, [x4, #-64] // ........................................*........................ - // ins v8.d[0], x13 // ...............................*................................. - // ldr x24, [x4, #-8] // .............................*................................... - // mls v19.8H, v11.8H, v7.H[0] // ..................................*.............................. - // ins v27.d[0], x23 // ................................*................................ - // ins v29.d[1], x15 // .....................*........................................... 
- // ins v8.d[1], x26 // .....................................*........................... - // mls v2.8H, v15.8H, v7.H[0] // ............................................*.................... - // ins v27.d[1], x24 // ....................................*............................ - // sub v17.8H, v8.8H, v19.8H // ..........................................*...................... - // ins v16.d[1], x11 // .................................*............................... - // add v21.8H, v8.8H, v19.8H // ...........................................*..................... - // mul v19.8H, v21.8H, v1.H[2] // ...............................................*................. - // sqrdmulh v0.8H, v17.8H, v1.H[5] // .............................................*................... - // mul v26.8H, v17.8H, v1.H[4] // ..............................................*.................. - // sqrdmulh v12.8H, v21.8H, v1.H[3] // ................................................*................ - // add v28.8H, v16.8H, v2.8H // ...................................................*............. - // sub v2.8H, v16.8H, v2.8H // ....................................................*............ - // ldr x24, [x4, #-32] // ..*.............................................................. - // mls v19.8H, v12.8H, v7.H[0] // ......................................................*.......... - // ins v14.d[1], x27 // .........................................*....................... - // mls v26.8H, v0.8H, v7.H[0] // .....................................................*........... - // ins v12.d[0], x24 // .......*......................................................... - // sub v15.8H, v28.8H, v19.8H // .......................................................*......... - // add v19.8H, v28.8H, v19.8H // ........................................................*........ - // add v0.8H, v2.8H, v26.8H // .........................................................*....... 
- // sub v26.8H, v2.8H, v26.8H // ..........................................................*...... - // ldr x24, [x4, #-24] // ...*............................................................. - // trn1 v3.4S, v19.4S, v15.4S // ............................................................*.... - // trn2 v19.4S, v19.4S, v15.4S // ...........................................................*..... - // trn1 v22.4S, v0.4S, v26.4S // ..............................................................*.. - // trn2 v26.4S, v0.4S, v26.4S // .............................................................*... - // ins v12.d[1], x24 // ...............*................................................. - // ldr x24, [x4, #-40] // ..................................................*.............. - // ins v10.d[0], x12 // .................................................*............... - // trn2 v4.2D, v3.2D, v22.2D // ................................................................* - // trn2 v23.2D, v19.2D, v26.2D // ...............................................................*. + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // ldr q8, [x3], #16 // .*...................................... + // ldr q1, [x1, #32] // *....................................... + // ldr q28, [x1, #48] // ..*..................................... + // ldr q17, [x1, #0] // ............*........................... + // ldr q21, [x4, #16] // ......................*................. + // ldr q22, [x1, #16] // ...*.................................... + // sqrdmulh v14.8H, v1.8H, v8.H[1] // .......*................................ + // mul v20.8H, v1.8H, v8.H[0] // .........*.............................. + // sqrdmulh v2.8H, v28.8H, v8.H[1] // .....*.................................. + // mul v11.8H, v28.8H, v8.H[0] // ......*................................. + // mls v11.8H, v2.8H, v7.H[0] // ........*............................... 
+ // mls v20.8H, v14.8H, v7.H[0] // .............*.......................... + // ldr q13, [x4, #64] // ....*................................... + // ldr q31, [x4, #80] // ...........................*............ + // sub v24.8H, v22.8H, v11.8H // ...........*............................ + // add v27.8H, v22.8H, v11.8H // ..........*............................. + // sub v12.8H, v17.8H, v20.8H // ..................*..................... + // mul v26.8H, v27.8H, v8.H[2] // ................*....................... + // sqrdmulh v27.8H, v27.8H, v8.H[3] // .................*...................... + // mul v30.8H, v24.8H, v8.H[4] // ..............*......................... + // sqrdmulh v11.8H, v24.8H, v8.H[5] // ...............*........................ + // ldr q16, [x4, #48] // ................................*....... + // mls v30.8H, v11.8H, v7.H[0] // ...................*.................... + // add v17.8H, v17.8H, v20.8H // .....................*.................. + // mls v26.8H, v27.8H, v7.H[0] // ....................*................... + // add v6.8H, v17.8H, v26.8H // .........................*.............. + // sub v28.8H, v12.8H, v30.8H // .......................*................ + // add v22.8H, v12.8H, v30.8H // ........................*............... + // sub v17.8H, v17.8H, v26.8H // ..........................*............. + // trn1 v30.4S, v6.4S, v17.4S // ............................*........... + // trn2 v8.4S, v6.4S, v17.4S // ...............................*........ + // trn1 v0.4S, v22.4S, v28.4S // .............................*.......... + // trn2 v1.4S, v22.4S, v28.4S // ..............................*......... + // ldr q14, [x4], #(6*16) // .................................*...... + // ldr q27, [x4, #-64] // ......................................*. + // trn1 v25.2D, v30.2D, v0.2D // ..................................*..... + // trn2 v0.2D, v30.2D, v0.2D // ...................................*.... 
+ // trn1 v6.2D, v8.2D, v1.2D // ....................................*... + // trn2 v22.2D, v8.2D, v1.2D // .....................................*.. + // sqrdmulh v11.8H, v0.8H, v21.8H // .......................................* sub count, count, #1 layer4567_start: - trn1 v0.2D, v19.2D, v26.2D // ...............................................*......................................................... - trn1 v20.2D, v3.2D, v22.2D // ..............................................*.......................................................... - ldr x10, [x1, #112] // ............e............................................................................................ - ldr x19, [x1, #96] // ........e................................................................................................ - ldr x16, [x3], #16 // ................e........................................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x15, [x4, #-48] // ............................................................*............................................ - mul v18.8H, v4.8H, v14.8H // ........................................................................*................................ - sqrdmulh v4.8H, v4.8H, v29.8H // .........................................................................*............................... - ldr x21, [x4, #-56] // .........................................................*............................................... - mul v22.8H, v23.8H, v14.8H // .............................................................................*........................... 
- sqrdmulh v23.8H, v23.8H, v29.8H // ..............................................................................*.......................... - ldr x25, [x1, #120] // .............e........................................................................................... - // gap // ......................................................................................................... - ldr x17, [x4], #(6*16) // ................................................e........................................................ - ldr x13, [x1, #104] // .........e............................................................................................... - ldr x20, [x3, #-8] // .................e....................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x11, [x1, #72] // .e....................................................................................................... - ins v28.d[0], x19 // ..........e.............................................................................................. - ins v6.d[0], x10 // ..............e.......................................................................................... - ins v1.d[0], x16 // ..................e...................................................................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x27, [x4, #-88] // .................................................e....................................................... - mls v18.8H, v4.8H, v7.H[0] // ..........................................................................*.............................. - ins v4.d[0], x15 // ..............................................................*.......................................... - ins v10.d[1], x21 // ...........................................................*............................................. - mls v22.8H, v23.8H, v7.H[0] // ...............................................................................*......................... - ldr x14, [x4, #-80] // ....................................................e.................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v6.d[1], x25 // ...............e......................................................................................... - ins v1.d[1], x20 // ...................e..................................................................................... - ins v28.d[1], x13 // ...........e............................................................................................. 
- ldr x16, [x1, #64] // e........................................................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x23, [x4, #-16] // ....................................................................e.................................... - ins v14.d[0], x17 // ..................................................e...................................................... - ins v4.d[1], x24 // ...............................................................*......................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v23.8H, v0.8H, v22.8H // .................................................................................*....................... - sub v22.8H, v0.8H, v22.8H // ................................................................................*........................ - ins v29.d[0], x14 // ......................................................e.................................................. 
- ldr x13, [x1, #80] // ....e.................................................................................................... - sqrdmulh v15.8H, v28.8H, v1.H[1] // .....................e................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v16.d[0], x16 // ..e...................................................................................................... - mul v19.8H, v6.8H, v1.H[0] // .........................e............................................................................... - sqrdmulh v11.8H, v6.8H, v1.H[1] // ..........................e.............................................................................. - ldr x15, [x4, #-72] // .....................................................e................................................... - add v0.8H, v20.8H, v18.8H // ............................................................................*............................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v3.8H, v23.8H, v10.8H // ..................................................................................*...................... - sqrdmulh v4.8H, v23.8H, v4.8H // ...................................................................................*..................... 
- sqrdmulh v23.8H, v22.8H, v27.8H // ........................................................................................*................ - mul v22.8H, v22.8H, v12.8H // .......................................................................................*................. - ldr x26, [x1, #88] // .....e................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v26.8H, v20.8H, v18.8H // ...........................................................................*............................. - mul v2.8H, v28.8H, v1.H[0] // ....................e.................................................................................... - ldr x12, [x4, #-64] // ........................................................e................................................ - ins v8.d[0], x13 // ......e.................................................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x24, [x4, #-8] // .....................................................................e................................... - mls v19.8H, v11.8H, v7.H[0] // ...........................e............................................................................. 
- ins v27.d[0], x23 // ......................................................................e.................................. - ins v29.d[1], x15 // .......................................................e................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v22.8H, v23.8H, v7.H[0] // .........................................................................................*............... - ins v8.d[1], x26 // .......e................................................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v3.8H, v4.8H, v7.H[0] // ....................................................................................*.................... - mls v2.8H, v15.8H, v7.H[0] // ......................e.................................................................................. 
- ins v27.d[1], x24 // .......................................................................e................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v17.8H, v8.8H, v19.8H // ............................e............................................................................ - ins v16.d[1], x11 // ...e..................................................................................................... - add v21.8H, v8.8H, v19.8H // .............................e........................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v9.8H, v0.8H, v3.8H // .....................................................................................*................... 
- add v8.8H, v0.8H, v3.8H // ......................................................................................*.................. - sub v11.8H, v26.8H, v22.8H // ..........................................................................................*.............. - add v10.8H, v26.8H, v22.8H // ...........................................................................................*............. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v19.8H, v21.8H, v1.H[2] // ..............................e.......................................................................... - sqrdmulh v0.8H, v17.8H, v1.H[5] // ....................................e.................................................................... - mul v26.8H, v17.8H, v1.H[4] // ...................................e..................................................................... - sqrdmulh v12.8H, v21.8H, v1.H[3] // ...............................e......................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- sqdmulh v3.8H, v8.8H, v7.H[1] // ............................................................................................*............ - sqdmulh v4.8H, v9.8H, v7.H[1] // ...............................................................................................*......... - sqdmulh v22.8H, v10.8H, v7.H[1] // ..................................................................................................*...... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqdmulh v23.8H, v11.8H, v7.H[1] // .....................................................................................................*... - add v28.8H, v16.8H, v2.8H // ........................e................................................................................ - sub v2.8H, v16.8H, v2.8H // .......................e................................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- ldr x24, [x4, #-32] // ................................................................e........................................ - mls v19.8H, v12.8H, v7.H[0] // ................................e........................................................................ - ins v14.d[1], x27 // ...................................................e..................................................... - mls v26.8H, v0.8H, v7.H[0] // .....................................e................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - srshr v4.8H, v4.8H, #11 // ................................................................................................*........ - srshr v22.8H, v22.8H, #11 // ...................................................................................................*..... - srshr v23.8H, v23.8H, #11 // ......................................................................................................*.. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - srshr v3.8H, v3.8H, #11 // .............................................................................................*........... - ins v12.d[0], x24 // ..................................................................e...................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v15.8H, v28.8H, v19.8H // .................................e....................................................................... - add v19.8H, v28.8H, v19.8H // ..................................e...................................................................... - add v0.8H, v2.8H, v26.8H // .......................................e................................................................. - sub v26.8H, v2.8H, v26.8H // ......................................e.................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - ldr x24, [x4, #-24] // .................................................................e....................................... - mls v9.8H, v4.8H, v7.H[0] // .................................................................................................*....... - mls v11.8H, v23.8H, v7.H[0] // .......................................................................................................*. - mls v10.8H, v22.8H, v7.H[0] // ....................................................................................................*.... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v8.8H, v3.8H, v7.H[0] // ..............................................................................................*.......... - trn1 v3.4S, v19.4S, v15.4S // ........................................e................................................................ - trn2 v19.4S, v19.4S, v15.4S // .........................................e............................................................... - trn1 v22.4S, v0.4S, v26.4S // ..........................................e.............................................................. - trn2 v26.4S, v0.4S, v26.4S // ...........................................e............................................................. - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v12.d[1], x24 // ...................................................................e..................................... - ldr x24, [x4, #-40] // .............................................................e........................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ........................................................................................................* - ins v10.d[0], x12 // ..........................................................e.............................................. - trn2 v4.2D, v3.2D, v22.2D // ............................................e............................................................ - trn2 v23.2D, v19.2D, v26.2D // .............................................e........................................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... + // Instructions: 72 + // Expected cycles: 26 + // Expected IPC: 2.77 + // + // Cycle bound: 26.0 + // IPC bound: 2.77 + // + // Wall time: 44.82s + // User time: 44.82s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q8, [x3], #16 // ....e................................................................... + mul v0.8H, v0.8H, v14.8H // ........................................*............................... + ldr q1, [x1, #96] // ..e..................................................................... + ldr q28, [x1, #112] // ...e.................................................................... + mul v15.8H, v22.8H, v14.8H // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v26.8H, v22.8H, v21.8H // ............................................*........................... + ldr q17, [x1, #64] // e....................................................................... + ldr q21, [x4, #16] // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q22, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.8H, v26.8H, v7.H[0] // ..............................................*......................... + mls v0.8H, v11.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.8H, v1.8H, v8.H[1] // .....e.................................................................. + mul v20.8H, v1.8H, v8.H[0] // ......e................................................................. 
+ sqrdmulh v2.8H, v28.8H, v8.H[1] // ..........e............................................................. + mul v11.8H, v28.8H, v8.H[0] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v26.8H, v6.8H, v15.8H // ...............................................*........................ + add v15.8H, v6.8H, v15.8H // ................................................*....................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mls v11.8H, v2.8H, v7.H[0] // ............e........................................................... + mls v20.8H, v14.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.8H, v15.8H, v27.8H // ..................................................*..................... + sqrdmulh v6.8H, v15.8H, v16.8H // .................................................*...................... + mul v15.8H, v26.8H, v13.8H // .......................................................*................ + sqrdmulh v16.8H, v26.8H, v31.8H // ......................................................*................. + ldr q13, [x4, #64] // .....................................e.................................. + ldr q31, [x4, #80] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v24.8H, v22.8H, v11.8H // .............e.......................................................... + add v27.8H, v22.8H, v11.8H // ..............e......................................................... + sub v12.8H, v17.8H, v20.8H // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v14.8H, v6.8H, v7.H[0] // ...................................................*.................... + sub v6.8H, v25.8H, v0.8H // ..........................................*............................. + mls v15.8H, v16.8H, v7.H[0] // ........................................................*............... + add v16.8H, v25.8H, v0.8H // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v26.8H, v27.8H, v8.H[2] // ................e....................................................... + sqrdmulh v27.8H, v27.8H, v8.H[3] // ...............e........................................................ 
+ mul v30.8H, v24.8H, v8.H[4] // .....................e.................................................. + sqrdmulh v11.8H, v24.8H, v8.H[5] // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v4.8H, v6.8H, v15.8H // ..........................................................*............. + sub v5.8H, v6.8H, v15.8H // .........................................................*.............. + sub v3.8H, v16.8H, v14.8H // ....................................................*................... + add v2.8H, v16.8H, v14.8H // .....................................................*.................. + ldr q16, [x4, #48] // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v30.8H, v11.8H, v7.H[0] // ......................e................................................. + add v17.8H, v17.8H, v20.8H // .........e.............................................................. + mls v26.8H, v27.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v27.8H, v3.8H, v7.H[1] // ..............................................................*......... + sqdmulh v14.8H, v2.8H, v7.H[1] // ...........................................................*............ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v10.8H, v4.8H, v7.H[1] // .................................................................*...... + sqdmulh v15.8H, v5.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v6.8H, v17.8H, v26.8H // ...................e.................................................... + sub v28.8H, v12.8H, v30.8H // .......................e................................................ + add v22.8H, v12.8H, v30.8H // ........................e............................................... + sub v17.8H, v17.8H, v26.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v10.8H, v10.8H, #11 // ..................................................................*..... + srshr v27.8H, v27.8H, #11 // ...............................................................*........ + srshr v14.8H, v14.8H, #11 // ............................................................*........... + srshr v15.8H, v15.8H, #11 // .....................................................................*.. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v30.4S, v6.4S, v17.4S // .........................e.............................................. 
+ trn2 v8.4S, v6.4S, v17.4S // ..........................e............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v0.4S, v22.4S, v28.4S // ...........................e............................................ + trn2 v1.4S, v22.4S, v28.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v3.8H, v27.8H, v7.H[0] // ................................................................*....... + mls v5.8H, v15.8H, v7.H[0] // ......................................................................*. + mls v2.8H, v14.8H, v7.H[0] // .............................................................*.......... + mls v4.8H, v10.8H, v7.H[0] // ...................................................................*.... + ldr q14, [x4], #(6*16) // .................................e...................................... + ldr q27, [x4, #-64] // ...................................e.................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + trn1 v25.2D, v30.2D, v0.2D // ...............................e........................................ + trn2 v0.2D, v30.2D, v0.2D // .............................e.......................................... + trn1 v6.2D, v8.2D, v1.2D // ................................e....................................... + trn2 v22.2D, v8.2D, v1.2D // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // .......................................................................* + sqrdmulh v11.8H, v0.8H, v21.8H // .......................................e................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ - // original source code - // ldr x10, [x1, #(16*0)] // ..........................e............................................................................|...........................e......................................................................... - // ldr x11, [x1, #((16*0)+8)] // .............e.........................................................................................|..............e...................................................................................... - // ins v8.d[0], x10 // ...................................e...................................................................|....................................e................................................................ - // ins v8.d[1], x11 // ...........................................................e...........................................|............................................................e........................................ - // ldr x10, [x1, #(16*1)] // .................................e.....................................................................|..................................e.................................................................. - // ldr x11, [x1, #((16*1)+8)] // ............................................e..........................................................|.............................................e....................................................... 
- // ins v9.d[0], x10 // ................................................e......................................................|.................................................e................................................... - // ins v9.d[1], x11 // ......................................................e................................................|.......................................................e............................................. - // ldr x10, [x1, #(16*2)] // .e.....................................................................................................|..e.................................................................................................. - // ldr x11, [x1, #((16*2)+8)] // ...........e...........................................................................................|............e........................................................................................ - // ins v10.d[0], x10 // ..............e........................................................................................|...............e..................................................................................... - // ins v10.d[1], x11 // .........................e.............................................................................|..........................e.......................................................................... - // ldr x10, [x1, #(16*3)] // e......................................................................................................|.e................................................................................................... - // ldr x11, [x1, #((16*3)+8)] // .........e.............................................................................................|..........e.......................................................................................... 
- // ins v11.d[0], x10 // ...............e.......................................................................................|................e.................................................................................... - // ins v11.d[1], x11 // .......................e...............................................................................|........................e............................................................................ - // ldr x10, [x3], #16 // ..e....................................................................................................|...e................................................................................................. - // ldr x11, [x3, #(-16+8)] // ............e..........................................................................................|.............e....................................................................................... - // ins v0.d[0], x10 // ................e......................................................................................|.................e................................................................................... - // ins v0.d[1], x11 // ........................e..............................................................................|.........................e........................................................................... - // mul v24.8h, v10.8h, v0.h[0] // ..............................................e........................................................|...............................................e..................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ..................................e....................................................................|...................................e................................................................. 
- // mls v24.8h, v10.8h, v7.h[0] // ........................................................e..............................................|.........................................................e........................................... - // sub v10.8h, v8.8h, v24.8h // ..........................................................................e............................|...........................................................................e......................... - // add v8.8h, v8.8h, v24.8h // .........................................................................e.............................|..........................................................................e.......................... - // mul v24.8h, v11.8h, v0.h[0] // ....................................e..................................................................|.....................................e............................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[1] // .....................................e.................................................................|......................................e.............................................................. - // mls v24.8h, v11.8h, v7.h[0] // ..................................................e....................................................|...................................................e................................................. - // sub v11.8h, v9.8h, v24.8h // ..........................................................e............................................|...........................................................e......................................... - // add v9.8h, v9.8h, v24.8h // ............................................................e..........................................|.............................................................e....................................... 
- // mul v24.8h, v9.8h, v0.h[2] // .................................................................e.....................................|..................................................................e.................................. - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ....................................................................e..................................|.....................................................................e............................... - // mls v24.8h, v9.8h, v7.h[0] // ............................................................................e..........................|.............................................................................e....................... - // sub v9.8h, v8.8h, v24.8h // ....................................................................................e..................|.....................................................................................e............... - // add v8.8h, v8.8h, v24.8h // .....................................................................................e.................|......................................................................................e.............. - // mul v24.8h, v11.8h, v0.h[4] // ...................................................................e...................................|....................................................................e................................ - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ..................................................................e....................................|...................................................................e................................. - // mls v24.8h, v11.8h, v7.h[0] // ..............................................................................e........................|...............................................................................e..................... 
- // sub v11.8h, v10.8h, v24.8h // .......................................................................................e...............|........................................................................................e............ - // add v10.8h, v10.8h, v24.8h // ......................................................................................e................|.......................................................................................e............. - // trn1 v25.4s, v8.4s, v9.4s // .............................................................................................e.........|..............................................................................................e...... - // trn2 v26.4s, v8.4s, v9.4s // ..............................................................................................e........|...............................................................................................e..... - // trn1 v27.4s, v10.4s, v11.4s // ...............................................................................................e.......|................................................................................................e.... - // trn2 v28.4s, v10.4s, v11.4s // ................................................................................................e......|.................................................................................................e... - // trn2 v10.2d, v25.2d, v27.2d // .....................................................................................................e.|..................................................................................................... - // trn2 v11.2d, v26.2d, v28.2d // ......................................................................................................e|..................................................................................................... 
- // trn1 v8.2d, v25.2d, v27.2d // .......................................................................................................|*.................................................................................................... - // trn1 v9.2d, v26.2d, v28.2d // .......................................................................................................*..................................................................................................... - // ldr x10, [x4], #(6*16) // ..........e............................................................................................|...........e......................................................................................... - // ldr x11, [x4, #(-(6*16)+8)] // .................e.....................................................................................|..................e.................................................................................. - // ins v0.d[0], x10 // ............................e..........................................................................|.............................e....................................................................... - // ins v0.d[1], x11 // .............................................................................e.........................|..............................................................................e...................... - // ldr x10, [x4, #(-6*16 + 1*16)] // ......................e................................................................................|.......................e............................................................................. - // ldr x11, [x4, #((-6*16 + 1*16)+8)] // ......................................e................................................................|.......................................e............................................................. 
- // ins v4.d[0], x10 // ................................e......................................................................|.................................e................................................................... - // ins v4.d[1], x11 // ....................................................e..................................................|.....................................................e............................................... - // ldr x10, [x4, #(-6*16 + 2*16)] // ...............................................e.......................................................|................................................e.................................................... - // ldr x11, [x4, #((-6*16 + 2*16)+8)] // ......*................................................................................................|.......*............................................................................................. - // ins v1.d[0], x10 // ....................................................................................................e..|..................................................................................................... - // ins v1.d[1], x11 // ....................*..................................................................................|.....................*............................................................................... - // ldr x10, [x4, #(-6*16 + 3*16)] // ...*...................................................................................................|....*................................................................................................ - // ldr x11, [x4, #((-6*16 + 3*16)+8)] // ..................................................................................................e....|...................................................................................................e. 
- // ins v5.d[0], x10 // ...................*...................................................................................|....................*................................................................................ - // ins v5.d[1], x11 // .............................*.........................................................................|..............................*...................................................................... - // ldr x10, [x4, #(-6*16 + 4*16)] // ...........................................................................e...........................|............................................................................e........................ - // ldr x11, [x4, #((-6*16 + 4*16)+8)] // ........................................................................................e..............|.........................................................................................e........... - // ins v2.d[0], x10 // ...................................................................................e...................|....................................................................................e................ - // ins v2.d[1], x11 // .................................................................................................e.....|..................................................................................................e.. - // ldr x10, [x4, #(-6*16 + 5*16)] // ...........................e...........................................................................|............................e........................................................................ - // ldr x11, [x4, #((-6*16 + 5*16)+8)] // .................................................e.....................................................|..................................................e.................................................. 
- // ins v6.d[0], x10 // ...................................................e...................................................|....................................................e................................................ - // ins v6.d[1], x11 // .........................................................e.............................................|..........................................................e.......................................... - // mul v24.8h, v10.8h, v0.8h // ....*..................................................................................................|.....*............................................................................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // .....*.................................................................................................|......*.............................................................................................. - // mls v24.8h, v10.8h, v7.h[0] // ..................*....................................................................................|...................*................................................................................. - // sub v10.8h, v8.8h, v24.8h // .............................................*.........................................................|..............................................*...................................................... - // add v8.8h, v8.8h, v24.8h // .......................................*...............................................................|........................................*............................................................ - // mul v24.8h, v11.8h, v0.8h // .......*...............................................................................................|........*............................................................................................ 
- // sqrdmulh v11.8h, v11.8h, v4.8h // ........*..............................................................................................|.........*........................................................................................... - // mls v24.8h, v11.8h, v7.h[0] // .....................*.................................................................................|......................*.............................................................................. - // sub v11.8h, v9.8h, v24.8h // ...............................*.......................................................................|................................*.................................................................... - // add v9.8h, v9.8h, v24.8h // ..............................*........................................................................|...............................*..................................................................... - // mul v24.8h, v9.8h, v1.8h // ........................................*..............................................................|.........................................*........................................................... - // sqrdmulh v9.8h, v9.8h, v5.8h // .........................................*.............................................................|..........................................*.......................................................... - // mls v24.8h, v9.8h, v7.h[0] // .......................................................*...............................................|........................................................*............................................ - // sub v9.8h, v8.8h, v24.8h // .............................................................*.........................................|..............................................................*...................................... 
- // add v8.8h, v8.8h, v24.8h // ..............................................................*........................................|...............................................................*..................................... - // mul v24.8h, v11.8h, v2.8h // ...........................................*...........................................................|............................................*........................................................ - // sqrdmulh v11.8h, v11.8h, v6.8h // ..........................................*............................................................|...........................................*......................................................... - // mls v24.8h, v11.8h, v7.h[0] // .....................................................*.................................................|......................................................*.............................................. - // sub v11.8h, v10.8h, v24.8h // ...............................................................*.......................................|................................................................*.................................... - // add v10.8h, v10.8h, v24.8h // ................................................................*......................................|.................................................................*................................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // .....................................................................*.................................|......................................................................*.............................. - // srshr v25.8h, v25.8h, #11 // ..................................................................................*....................|...................................................................................*................. 
- // mls v8.8h, v25.8h, v7.h[0] // ............................................................................................*..........|.............................................................................................*....... - // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................................................*................................|.......................................................................*............................. - // srshr v25.8h, v25.8h, #11 // ...............................................................................*.......................|................................................................................*.................... - // mls v9.8h, v25.8h, v7.h[0] // .........................................................................................*.............|..........................................................................................*.......... - // sqdmulh v25.8h, v10.8h, v7.h[1] // .......................................................................*...............................|........................................................................*............................ - // srshr v25.8h, v25.8h, #11 // ................................................................................*......................|.................................................................................*................... - // mls v10.8h, v25.8h, v7.h[0] // ...........................................................................................*...........|............................................................................................*........ - // sqdmulh v25.8h, v11.8h, v7.h[1] // ........................................................................*..............................|.........................................................................*........................... 
- // srshr v25.8h, v25.8h, #11 // .................................................................................*.....................|..................................................................................*.................. - // mls v11.8h, v25.8h, v7.h[0] // ..........................................................................................*............|...........................................................................................*......... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ...................................................................................................*...|....................................................................................................* + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // ......e.................................................................'.....~................................................................ + // ldr q9, [x1, #(16*1)] // ........e...............................................................'.......~.............................................................. + // ldr q10, [x1, #(16*2)] // ..e.....................................................................'.~.................................................................... + // ldr q11, [x1, #(16*3)] // ...e....................................................................'..~................................................................... + // ldr q0, [x3], #16 // e.......................................................................~...................................................................... 
+ // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...........e............................................................'..........~........................................................... + // mul v24.8h, v10.8h, v0.h[0] // ............e...........................................................'...........~.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................e.....................................................'.................~.................................................... + // sub v10.8h, v8.8h, v24.8h // ...........................e............................................'..........................~........................................... + // add v8.8h, v8.8h, v24.8h // ..........................................e.............................'.........................................~............................ + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .............e..........................................................'............~......................................................... + // mul v24.8h, v11.8h, v0.h[0] // ..............e.........................................................'.............~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // .................e......................................................'................~..................................................... + // sub v11.8h, v9.8h, v24.8h // .........................e..............................................'........................~............................................. + // add v9.8h, v9.8h, v24.8h // ..........................e.............................................'.........................~............................................ + // sqrdmulh v27.8h, v9.8h, v0.h[3] // .................................e......................................'................................~..................................... 
+ // mul v24.8h, v9.8h, v0.h[2] // ................................e.......................................'...............................~...................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................................e............................'..........................................~........................... + // sub v9.8h, v8.8h, v24.8h // ...................................................e....................'..................................................~................... + // add v8.8h, v8.8h, v24.8h // ................................................e.......................'...............................................~...................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ...................................e....................................'..................................~................................... + // mul v24.8h, v11.8h, v0.h[4] // ..................................e.....................................'.................................~.................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................e..............................'........................................~............................. + // sub v11.8h, v10.8h, v24.8h // .................................................e......................'................................................~..................... + // add v10.8h, v10.8h, v24.8h // ..................................................e.....................'.................................................~.................... + // trn1 v25.4s, v8.4s, v9.4s // ........................................................e...............'.......................................................~.............. + // trn2 v26.4s, v8.4s, v9.4s // .........................................................e..............'........................................................~............. 
+ // trn1 v27.4s, v10.4s, v11.4s // ..........................................................e.............'.........................................................~............ + // trn2 v28.4s, v10.4s, v11.4s // ...........................................................e............'..........................................................~........... + // trn2 v10.2d, v25.2d, v27.2d // ...................................................................e....'..................................................................~... + // trn2 v11.2d, v26.2d, v28.2d // .....................................................................e..'....................................................................~. + // trn1 v8.2d, v25.2d, v27.2d // ..................................................................e.....'.................................................................~.... + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e...'...................................................................~.. + // ldr q0, [ x4], #(6*16) // ................................................................e.......'...............................................................~...... + // ldr q4, [x4, #(-6*16 + 1*16)] // .......e................................................................'......~............................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // .................................................................e......'................................................................~..... + // ldr q5, [x4, #(-6*16 + 3*16)] // ........................................e...............................'.......................................~.............................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // .......................e................................................'......................~............................................... 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // ........................e...............................................'.......................~.............................................. + // sqrdmulh v27.8h, v10.8h, v4.8h // .......................................................................e'...................................................................... + // mul v24.8h, v10.8h, v0.8h // .~......................................................................'*..................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........~.............................................................'.........*............................................................ + // sub v10.8h, v8.8h, v24.8h // .............................~..........................................'............................*......................................... + // add v8.8h, v8.8h, v24.8h // ...............................~........................................'..............................*....................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // .....~..................................................................'....*................................................................. + // mul v24.8h, v11.8h, v0.8h // ....~...................................................................'...*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .........~..............................................................'........*............................................................. + // sub v11.8h, v9.8h, v24.8h // ...............~........................................................'..............*....................................................... + // add v9.8h, v9.8h, v24.8h // ................~.......................................................'...............*...................................................... 
+ // sqrdmulh v27.8h, v9.8h, v5.8h // ....................~...................................................'...................*.................................................. + // mul v24.8h, v9.8h, v1.8h // ...................~....................................................'..................*................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................~...........................................'...........................*.......................................... + // sub v9.8h, v8.8h, v24.8h // ......................................~.................................'.....................................*................................ + // add v8.8h, v8.8h, v24.8h // .......................................~................................'......................................*............................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ......................~.................................................'.....................*................................................ + // mul v24.8h, v11.8h, v2.8h // .....................~..................................................'....................*................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..............................~.........................................'.............................*........................................ + // sub v11.8h, v10.8h, v24.8h // .....................................~..................................'....................................*................................. + // add v10.8h, v10.8h, v24.8h // ....................................~...................................'...................................*.................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // .............................................~..........................'............................................*......................... 
+ // srshr v25.8h, v25.8h, #11 // ......................................................~.................'.....................................................*................ + // mls v8.8h, v25.8h, v7.h[0] // ..............................................................~.........'.............................................................*........ + // sqdmulh v25.8h, v9.8h, v7.h[1] // ............................................~...........................'...........................................*.......................... + // srshr v25.8h, v25.8h, #11 // .....................................................~..................'....................................................*................. + // mls v9.8h, v25.8h, v7.h[0] // ............................................................~...........'...........................................................*.......... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..............................................~.........................'.............................................*........................ + // srshr v25.8h, v25.8h, #11 // ....................................................~...................'...................................................*.................. + // mls v10.8h, v25.8h, v7.h[0] // ...............................................................~........'..............................................................*....... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ...............................................~........................'..............................................*....................... + // srshr v25.8h, v25.8h, #11 // .......................................................~................'......................................................*............... + // mls v11.8h, v25.8h, v7.h[0] // .............................................................~..........'............................................................*......... 
+ // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................................~.'.....................................................................* sub count, count, #1 cbnz count, layer4567_start - mul v5.8H, v23.8H, v14.8H // ......*................................. - sqrdmulh v9.8H, v23.8H, v29.8H // .......*................................ - ldr x11, [x4, #-48] // ..*..................................... - ldr x9, [x4, #-56] // .....*.................................. - mul v11.8H, v4.8H, v14.8H // ...*.................................... - sqrdmulh v0.8H, v4.8H, v29.8H // ....*................................... - // gap // ........................................ - // gap // ........................................ - trn1 v2.2D, v3.2D, v22.2D // .*...................................... - trn1 v13.2D, v19.2D, v26.2D // *....................................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v11.8H, v0.8H, v7.H[0] // ........*............................... - mls v5.8H, v9.8H, v7.H[0] // ...........*............................ - ins v10.d[1], x9 // ..........*............................. - ins v8.d[0], x11 // .........*.............................. - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - ins v8.d[1], x24 // ............*........................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v4.8H, v13.8H, v5.8H // .............*.......................... - sub v29.8H, v13.8H, v5.8H // ..............*......................... - add v20.8H, v2.8H, v11.8H // ...............*........................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v31.8H, v2.8H, v11.8H // ....................*................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- sqrdmulh v13.8H, v29.8H, v27.8H // ..................*..................... - mul v30.8H, v4.8H, v10.8H // ................*....................... - sqrdmulh v25.8H, v4.8H, v8.8H // .................*...................... - mul v23.8H, v29.8H, v12.8H // ...................*.................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v30.8H, v25.8H, v7.H[0] // ......................*................. - mls v23.8H, v13.8H, v7.H[0] // .....................*.................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v22.8H, v31.8H, v23.8H // ..........................*............. - sub v23.8H, v31.8H, v23.8H // .........................*.............. - sub v21.8H, v20.8H, v30.8H // .......................*................ - add v20.8H, v20.8H, v30.8H // ........................*............... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sqdmulh v29.8H, v23.8H, v7.H[1] // ..............................*......... - sqdmulh v30.8H, v21.8H, v7.H[1] // ............................*........... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- sqdmulh v17.8H, v20.8H, v7.H[1] // ...........................*............ - sqdmulh v4.8H, v22.8H, v7.H[1] // .............................*.......... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - srshr v26.8H, v17.8H, #11 // ..................................*..... - srshr v17.8H, v29.8H, #11 // .................................*...... - srshr v16.8H, v30.8H, #11 // ...............................*........ - srshr v12.8H, v4.8H, #11 // ................................*....... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v20.8H, v26.8H, v7.H[0] // ......................................*. - mls v23.8H, v17.8H, v7.H[0] // ....................................*... - mls v21.8H, v16.8H, v7.H[0] // ...................................*.... - mls v22.8H, v12.8H, v7.H[0] // .....................................*.. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 // .......................................* - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ + // Instructions: 32 + // Expected cycles: 26 + // Expected IPC: 1.23 + // + // Cycle bound: 26.0 + // IPC bound: 1.23 + // + // Wall time: 0.49s + // User time: 0.49s + // + // ------ original position ------> + // 0 25 + // |------------------------|------ + mul v15.8H, v0.8H, v14.8H // *............................... + mul v14.8H, v22.8H, v14.8H // .*.............................. + sqrdmulh v26.8H, v22.8H, v21.8H // ..*............................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v15.8H, v11.8H, v7.H[0] // ....*........................... + mls v14.8H, v26.8H, v7.H[0] // ...*............................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v26.8H, v6.8H, v14.8H // ......*......................... + sub v14.8H, v6.8H, v14.8H // .....*.......................... + sub v0.8H, v25.8H, v15.8H // ............*................... + add v6.8H, v25.8H, v15.8H // ..............*................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v15.8H, v26.8H, v27.8H // .......*........................ + sqrdmulh v16.8H, v26.8H, v16.8H // ........*....................... + mul v27.8H, v14.8H, v13.8H // .........*...................... + sqrdmulh v14.8H, v14.8H, v31.8H // ..........*..................... + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v15.8H, v16.8H, v7.H[0] // ...........*.................... + mls v27.8H, v14.8H, v7.H[0] // .............*.................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v16.8H, v0.8H, v27.8H // ...............*................ 
+ sub v17.8H, v0.8H, v27.8H // ................*............... + add v14.8H, v6.8H, v15.8H // ..................*............. + sub v15.8H, v6.8H, v15.8H // .................*.............. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sqdmulh v27.8H, v15.8H, v7.H[1] // ...................*............ + sqdmulh v6.8H, v14.8H, v7.H[1] // ....................*........... + sqdmulh v26.8H, v16.8H, v7.H[1] // .....................*.......... + sqdmulh v0.8H, v17.8H, v7.H[1] // ......................*......... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + srshr v27.8H, v27.8H, #11 // ........................*....... 
+ srshr v6.8H, v6.8H, #11 // .........................*...... + srshr v26.8H, v26.8H, #11 // .......................*........ + srshr v0.8H, v0.8H, #11 // ..........................*..... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v16.8H, v26.8H, v7.H[0] // ..............................*. + mls v17.8H, v0.8H, v7.H[0] // ............................*... + mls v14.8H, v6.8H, v7.H[0] // .............................*.. + mls v15.8H, v27.8H, v7.H[0] // ...........................*.... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + st4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1], #64 // ...............................* + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ - // original source code - // trn1 v0.2D, v19.2D, v26.2D // .......*................................ - // trn1 v20.2D, v3.2D, v22.2D // ......*................................. - // ldr x15, [x4, #-48] // ..*..................................... - // mul v18.8H, v4.8H, v14.8H // ....*................................... - // sqrdmulh v4.8H, v4.8H, v29.8H // .....*.................................. - // ldr x21, [x4, #-56] // ...*.................................... - // mul v22.8H, v23.8H, v14.8H // *....................................... - // sqrdmulh v23.8H, v23.8H, v29.8H // .*...................................... - // mls v18.8H, v4.8H, v7.H[0] // ........*............................... - // ins v4.d[0], x15 // ...........*............................ - // ins v10.d[1], x21 // ..........*............................. - // mls v22.8H, v23.8H, v7.H[0] // .........*.............................. - // ins v4.d[1], x24 // ............*........................... - // add v23.8H, v0.8H, v22.8H // .............*.......................... - // sub v22.8H, v0.8H, v22.8H // ..............*......................... - // add v0.8H, v20.8H, v18.8H // ...............*........................ - // mul v3.8H, v23.8H, v10.8H // ..................*..................... 
- // sqrdmulh v4.8H, v23.8H, v4.8H // ...................*.................... - // sqrdmulh v23.8H, v22.8H, v27.8H // .................*...................... - // mul v22.8H, v22.8H, v12.8H // ....................*................... - // sub v26.8H, v20.8H, v18.8H // ................*....................... - // mls v22.8H, v23.8H, v7.H[0] // ......................*................. - // mls v3.8H, v4.8H, v7.H[0] // .....................*.................. - // sub v9.8H, v0.8H, v3.8H // .........................*.............. - // add v8.8H, v0.8H, v3.8H // ..........................*............. - // sub v11.8H, v26.8H, v22.8H // ........................*............... - // add v10.8H, v26.8H, v22.8H // .......................*................ - // sqdmulh v3.8H, v8.8H, v7.H[1] // .............................*.......... - // sqdmulh v4.8H, v9.8H, v7.H[1] // ............................*........... - // sqdmulh v22.8H, v10.8H, v7.H[1] // ..............................*......... - // sqdmulh v23.8H, v11.8H, v7.H[1] // ...........................*............ - // srshr v4.8H, v4.8H, #11 // .................................*...... - // srshr v22.8H, v22.8H, #11 // ..................................*..... - // srshr v23.8H, v23.8H, #11 // ................................*....... - // srshr v3.8H, v3.8H, #11 // ...............................*........ - // mls v9.8H, v4.8H, v7.H[0] // .....................................*.. - // mls v11.8H, v23.8H, v7.H[0] // ....................................*... - // mls v10.8H, v22.8H, v7.H[0] // ......................................*. - // mls v8.8H, v3.8H, v7.H[0] // ...................................*.... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .......................................* + // -------- new position ---------> + // 0 25 + // |------------------------|------ + // mul v0.8H, v0.8H, v14.8H // *............................... + // mul v15.8H, v22.8H, v14.8H // .*.............................. 
+ // sqrdmulh v26.8H, v22.8H, v21.8H // ..*............................. + // mls v15.8H, v26.8H, v7.H[0] // ....*........................... + // mls v0.8H, v11.8H, v7.H[0] // ...*............................ + // sub v26.8H, v6.8H, v15.8H // ......*......................... + // add v15.8H, v6.8H, v15.8H // .....*.......................... + // mul v14.8H, v15.8H, v27.8H // .........*...................... + // sqrdmulh v6.8H, v15.8H, v16.8H // ..........*..................... + // mul v15.8H, v26.8H, v13.8H // ...........*.................... + // sqrdmulh v16.8H, v26.8H, v31.8H // ............*................... + // mls v14.8H, v6.8H, v7.H[0] // .............*.................. + // sub v6.8H, v25.8H, v0.8H // .......*........................ + // mls v15.8H, v16.8H, v7.H[0] // ..............*................. + // add v16.8H, v25.8H, v0.8H // ........*....................... + // add v4.8H, v6.8H, v15.8H // ...............*................ + // sub v5.8H, v6.8H, v15.8H // ................*............... + // sub v3.8H, v16.8H, v14.8H // ..................*............. + // add v2.8H, v16.8H, v14.8H // .................*.............. + // sqdmulh v27.8H, v3.8H, v7.H[1] // ...................*............ + // sqdmulh v14.8H, v2.8H, v7.H[1] // ....................*........... + // sqdmulh v10.8H, v4.8H, v7.H[1] // .....................*.......... + // sqdmulh v15.8H, v5.8H, v7.H[1] // ......................*......... + // srshr v10.8H, v10.8H, #11 // .........................*...... + // srshr v27.8H, v27.8H, #11 // .......................*........ + // srshr v14.8H, v14.8H, #11 // ........................*....... + // srshr v15.8H, v15.8H, #11 // ..........................*..... + // mls v3.8H, v27.8H, v7.H[0] // ..............................*. + // mls v5.8H, v15.8H, v7.H[0] // ............................*... + // mls v2.8H, v14.8H, v7.H[0] // .............................*.. + // mls v4.8H, v10.8H, v7.H[0] // ...........................*.... 
+ // st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // ...............................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_icestorm.s index 69868d09..2cf9168f 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_opt_m1_icestorm.s @@ -26,42 +26,12 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -79,15 +49,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + 
vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -96,12 +66,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -115,21 +79,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -151,7 +115,7 @@ xtmp1 .req x11 trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -162,7 +126,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -172,7 +136,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -180,7 +144,7 @@ xtmp1 .req x11 stp d14, 
d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -191,19 +155,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -342,1294 +306,1034 @@ _ntt_kyber_123_4567_scalar_load_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr x9, [x0, #128] // *................. - ldr x15, [x0, #384] // ......*........... - // gap // .................. - // gap // .................. - ldr x20, [x0, #256] // .*................ - // gap // .................. - // gap // .................. - // gap // .................. - ldr x10, [x0, #320] // ........*......... - // gap // .................. - // gap // .................. - // gap // .................. - ins v4.d[0], x15 // ............*..... - ldr x28, [x0, #392] // ....*............. - // gap // .................. - // gap // .................. - ins v29.d[0], x20 // ..........*....... - ldr x19, [x0, #328] // .....*............ - // gap // .................. - // gap // .................. - ins v5.d[0], x10 // ...........*...... - ldr x13, [x0, #264] // ..*............... - // gap // .................. - // gap // .................. - ins v4.d[1], x28 // .............*.... - ldr x11, [x0, #64] // ...*.............. - // gap // .................. - // gap // .................. - ldr x27, [x0, #448] // .......*.......... - ins v5.d[1], x19 // ..............*... - // gap // .................. 
- // gap // .................. - ldr x17, [x0, #456] // .........*........ - sqrdmulh v20.8H, v4.8H, v0.H[1] // ...............*.. - // gap // .................. - // gap // .................. - ins v29.d[1], x13 // ................*. - sqrdmulh v27.8H, v5.8H, v0.H[1] // .................* - // gap // .................. - // gap // .................. + // Instructions: 18 + // Expected cycles: 11 + // Expected IPC: 1.64 + // + // Cycle bound: 11.0 + // IPC bound: 1.64 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q14, [x0, #192] // ....*......................... + ldr q15, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #128] // .....*........................ + ldr q16, [x0, #384] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q27, [x0, #256] // ...*.......................... + ldr q9, [x0, #320] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q25, [x0, #0] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v6.8H, v15.8H, v0.H[0] // ........*..................... + sqrdmulh v15.8H, v15.8H, v0.H[1] // .........*.................... + // gap // .............................. + // gap // .............................. + sqrdmulh v11.8H, v16.8H, v0.H[1] // .......*...................... + mul v26.8H, v16.8H, v0.H[0] // ..........*................... + // gap // .............................. + // gap // .............................. + sqrdmulh v3.8H, v27.8H, v0.H[1] // ...........*.................. + mul v16.8H, v27.8H, v0.H[0] // .............*................ + // gap // .............................. 
+ // gap // .............................. + mls v6.8H, v15.8H, v7.H[0] // ............*................. + sqrdmulh v30.8H, v9.8H, v0.H[1] // ...............*.............. + // gap // .............................. + // gap // .............................. + mls v26.8H, v11.8H, v7.H[0] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v15.8H, v14.8H, v6.8H // ................*............. + sub v14.8H, v14.8H, v6.8H // .................*............ + // gap // .............................. + // gap // .............................. - // original source code - // ldr x9, [x0, #128] // *................. - // ldr x22, [x0, #256] // ..*............... - // ldr x8, [x0, #264] // .........*........ - // ldr x11, [x0, #64] // ...........*...... - // ldr x23, [x0, #392] // .....*............ - // ldr x20, [x0, #328] // .......*.......... - // ldr x13, [x0, #384] // .*................ - // ldr x27, [x0, #448] // ............*..... - // ldr x10, [x0, #320] // ...*.............. - // ldr x17, [x0, #456] // ..............*... - // ins v29.d[0], x22 // ......*........... - // ins v5.d[0], x10 // ........*......... - // ins v4.d[0], x13 // ....*............. - // ins v4.d[1], x23 // ..........*....... - // ins v5.d[1], x20 // .............*.... - // sqrdmulh v20.8H, v4.8H, v0.H[1] // ...............*.. - // ins v29.d[1], x8 // ................*. - // sqrdmulh v27.8H, v5.8H, v0.H[1] // .................* + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q12, [x0, #448] // .*............................. + // ldr q19, [x0, #384] // ...*........................... + // ldr q9, [x0, #320] // .....*......................... 
+ // ldr q24, [x0, #256] // ....*.......................... + // ldr q20, [x0, #192] // *.............................. + // ldr q13, [x0, #128] // ..*............................ + // ldr q25, [x0, #0] // ......*........................ + // sqrdmulh v23.8H, v19.8H, v0.H[1] // .........*..................... + // mul v31.8H, v12.8H, v0.H[0] // .......*....................... + // sqrdmulh v22.8H, v12.8H, v0.H[1] // ........*...................... + // mul v26.8H, v19.8H, v0.H[0] // ..........*.................... + // sqrdmulh v3.8H, v24.8H, v0.H[1] // ...........*................... + // mls v31.8H, v22.8H, v7.H[0] // .............*................. + // mul v16.8H, v24.8H, v0.H[0] // ............*.................. + // mls v26.8H, v23.8H, v7.H[0] // ...............*............... + // sqrdmulh v30.8H, v9.8H, v0.H[1] // ..............*................ + // add v15.8H, v20.8H, v31.8H // ................*.............. + // sub v14.8H, v20.8H, v31.8H // .................*............. sub count, count, #1 layer123_start: - mul v9.8H, v29.8H, v0.H[0] // ................................*................................................................... - ins v30.d[0], x9 // ..........*......................................................................................... - ldr x12, [x0, #72] // .....*.............................................................................................. - ldr x13, [x0, #0] // *................................................................................................... - mul v24.8H, v5.8H, v0.H[0] // .....................................*.............................................................. - ldr x9, [x0, #144] // ........e........................................................................................... - // gap // .................................................................................................... 
- ldr x22, [x0, #272] // ................e................................................................................... - mul v22.8H, v4.8H, v0.H[0] // ..........................................*......................................................... - sqrdmulh v11.8H, v29.8H, v0.H[1] // .................................*.................................................................. - ldr x8, [x0, #280] // .................e.................................................................................. - ldr x19, [x0, #192] // ............*....................................................................................... - ins v19.d[0], x11 // ......*............................................................................................. - ldr x11, [x0, #80] // ....e............................................................................................... - ins v31.d[0], x13 // ..*................................................................................................. - ldr x24, [x0, #8] // .*.................................................................................................. - ldr x23, [x0, #408] // .........................e.......................................................................... - ldr x25, [x0, #136] // .........*.......................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v9.8H, v11.8H, v7.H[0] // ..................................*................................................................. - ins v5.d[0], x27 // ..............................*..................................................................... 
- ldr x20, [x0, #344] // .....................e.............................................................................. - ldr x10, [x0, #200] // .............*...................................................................................... - mls v22.8H, v20.8H, v7.H[0] // ............................................*....................................................... - ldr x13, [x0, #400] // ........................e........................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v19.d[1], x12 // .......*............................................................................................ - ins v31.d[1], x24 // ...*................................................................................................ - // gap // .................................................................................................... - ldr x27, [x0, #464] // ............................e....................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v10.8H, v31.8H, v9.8H // ....................................*............................................................... - sub v20.8H, v31.8H, v9.8H // ...................................*................................................................ 
- // gap // .................................................................................................... - // gap // .................................................................................................... - ins v9.d[0], x19 // ..............*..................................................................................... - ins v30.d[1], x25 // ...........*........................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v9.d[1], x10 // ...............*.................................................................................... - ldr x10, [x0, #336] // ....................e............................................................................... - ins v5.d[1], x17 // ...............................*.................................................................... - ldr x17, [x0, #472] // .............................e...................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - sqrdmulh v14.8H, v5.8H, v0.H[1] // ................................................*................................................... - mul v31.8H, v5.8H, v0.H[0] // ...............................................*.................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v24.8H, v27.8H, v7.H[0] // .......................................*............................................................ - sub v26.8H, v30.8H, v22.8H // .............................................*...................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v29.d[0], x22 // ..................e................................................................................. - add v22.8H, v30.8H, v22.8H // ..............................................*..................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v31.8H, v14.8H, v7.H[0] // .................................................*.................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - mul v25.8H, v26.8H, v0.H[4] // ..............................................................*..................................... - sqrdmulh v2.8H, v26.8H, v0.H[5] // ...............................................................*.................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v28.8H, v22.8H, v0.H[2] // ....................................................*............................................... - sub v18.8H, v19.8H, v24.8H // ........................................*........................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v27.8H, v22.8H, v0.H[3] // .....................................................*.............................................. - sub v6.8H, v9.8H, v31.8H // ..................................................*................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v25.8H, v2.8H, v7.H[0] // ................................................................*................................... - add v12.8H, v9.8H, v31.8H // ...................................................*................................................ - // gap // .................................................................................................... 
- // gap // .................................................................................................... - sqrdmulh v8.8H, v6.8H, v0.H[5] // ....................................................................*............................... - mul v2.8H, v6.8H, v0.H[4] // ...................................................................*................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v22.8H, v12.8H, v0.H[3] // ..........................................................*......................................... - mul v21.8H, v12.8H, v0.H[2] // .........................................................*.......................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v28.8H, v27.8H, v7.H[0] // ......................................................*............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v3.8H, v20.8H, v25.8H // ..................................................................*................................. - ins v5.d[0], x10 // ......................e............................................................................. - mls v2.8H, v8.8H, v7.H[0] // .....................................................................*.............................. - // gap // .................................................................................................... 
- // gap // .................................................................................................... - mls v21.8H, v22.8H, v7.H[0] // ...........................................................*........................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v16.8H, v10.8H, v28.8H // .......................................................*............................................ - add v22.8H, v19.8H, v24.8H // .........................................*.......................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v27.8H, v18.8H, v2.8H // ......................................................................*............................. - add v9.8H, v18.8H, v2.8H // .......................................................................*............................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v11.8H, v22.8H, v21.8H // .............................................................*...................................... - sub v8.8H, v22.8H, v21.8H // ............................................................*....................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - sqrdmulh v2.8H, v9.8H, v1.H[3] // ...................................................................................*................ - mul v31.8H, v9.8H, v1.H[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v17.8H, v11.8H, v0.H[7] // .........................................................................*.......................... - // gap // .................................................................................................... - mul v18.8H, v11.8H, v0.H[6] // ........................................................................*........................... - // gap // .................................................................................................... - sqrdmulh v11.8H, v27.8H, v1.H[5] // ........................................................................................*........... - sqrdmulh v12.8H, v8.8H, v1.H[1] // ..............................................................................*..................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v15.8H, v27.8H, v1.H[4] // .......................................................................................*............ - mls v31.8H, v2.8H, v7.H[0] // ....................................................................................*............... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - mul v21.8H, v8.8H, v1.H[0] // .............................................................................*...................... - mls v18.8H, v17.8H, v7.H[0] // ..........................................................................*......................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v2.8H, v10.8H, v28.8H // ........................................................*........................................... - sub v28.8H, v20.8H, v25.8H // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v15.8H, v11.8H, v7.H[0] // .........................................................................................*.......... - add v30.8H, v3.8H, v31.8H // ......................................................................................*............. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v21.8H, v12.8H, v7.H[0] // ...............................................................................*.................... - add v27.8H, v2.8H, v18.8H // ............................................................................*....................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - ins v4.d[0], x13 // ..........................e......................................................................... - str q30, [x0, #256] // ................................................................................................*... - sub v12.8H, v2.8H, v18.8H // ...........................................................................*........................ - // gap // .................................................................................................... - str q27, [x0], #(16) // ............................................................................................*....... - add v20.8H, v28.8H, v15.8H // ...........................................................................................*........ - // gap // .................................................................................................... - // gap // .................................................................................................... - str q12, [x0, #48] // .............................................................................................*...... - sub v9.8H, v3.8H, v31.8H // .....................................................................................*.............. - sub v8.8H, v16.8H, v21.8H // ................................................................................*................... - // gap // .................................................................................................... - str q20, [x0, #368] // ..................................................................................................*. - sub v11.8H, v28.8H, v15.8H // ..........................................................................................*......... - add v18.8H, v16.8H, v21.8H // .................................................................................*.................. 
- // gap // .................................................................................................... - ins v4.d[1], x23 // ...........................e........................................................................ - ins v5.d[1], x20 // .......................e............................................................................ - str q8, [x0, #176] // ...............................................................................................*.... - // gap // .................................................................................................... - str q9, [x0, #304] // .................................................................................................*.. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v20.8H, v4.8H, v0.H[1] // ...........................................e........................................................ - ins v29.d[1], x8 // ...................e................................................................................ - str q18, [x0, #112] // ..............................................................................................*..... - // gap // .................................................................................................... - sqrdmulh v27.8H, v5.8H, v0.H[1] // ......................................e............................................................. - str q11, [x0, #432] // ...................................................................................................* - // gap // .................................................................................................... 
- // gap // .................................................................................................... + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Cycle bound: 30.0 + // IPC bound: 2.53 + // + // Wall time: 62.20s + // User time: 62.20s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + mul v6.8H, v9.8H, v0.H[0] // ..............*............................................................. + ldr q12, [x0, #464] // .......e.................................................................... + ldr q19, [x0, #400] // ......e..................................................................... + mls v16.8H, v3.8H, v7.H[0] // ..........*................................................................. + sub v22.8H, v13.8H, v26.8H // .....................*...................................................... + ldr q9, [x0, #336] // .....e...................................................................... + ldr q24, [x0, #272] // ....e....................................................................... + add v11.8H, v13.8H, v26.8H // ......................*..................................................... + mul v26.8H, v15.8H, v0.H[2] // ..................................*......................................... + sqrdmulh v2.8H, v14.8H, v0.H[5] // ...........................................*................................ + ldr q20, [x0, #208] // ...e........................................................................ + // gap // ............................................................................ + add v3.8H, v25.8H, v16.8H // ............*............................................................... + mul v27.8H, v14.8H, v0.H[4] // ............................................*............................... 
+ ldr q13, [x0, #144] // ..e......................................................................... + ldr q5, [x0, #64] // .*.......................................................................... + sub v10.8H, v25.8H, v16.8H // ...........*................................................................ + sqrdmulh v16.8H, v22.8H, v0.H[5] // ......................................*..................................... + ldr q25, [x0, #16] // e........................................................................... + // gap // ............................................................................ + sqrdmulh v15.8H, v15.8H, v0.H[3] // .................................*.......................................... + mls v6.8H, v30.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v27.8H, v2.8H, v7.H[0] // .............................................*.............................. + mul v2.8H, v22.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v14.8H, v11.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v29.8H, v11.8H, v0.H[2] // .............................*.............................................. + mls v26.8H, v15.8H, v7.H[0] // ...................................*........................................ 
+ sub v11.8H, v5.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v2.8H, v16.8H, v7.H[0] // ........................................*................................... + add v15.8H, v5.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v29.8H, v14.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + // gap // ............................................................................ + sub v17.8H, v11.8H, v27.8H // ..............................................*............................. + add v16.8H, v15.8H, v26.8H // .....................................*...................................... + sqrdmulh v23.8H, v19.8H, v0.H[1] // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.8H, v15.8H, v26.8H // ....................................*....................................... + add v26.8H, v11.8H, v27.8H // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v4.8H, v16.8H, v0.H[7] // ................................................*........................... 
+ mul v16.8H, v16.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.8H, v28.8H, v1.H[0] // ......................................................*..................... + sqrdmulh v31.8H, v28.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.8H, v17.8H, v1.H[5] // ...............................................................*............ + mul v27.8H, v17.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v4.8H, v7.H[0] // ..................................................*......................... + mul v14.8H, v26.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v11.8H, v26.8H, v1.H[3] // ..........................................................*................. + add v22.8H, v3.8H, v29.8H // ................................*........................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v6.8H, v31.8H, v7.H[0] // .......................................................*.................... 
+ mls v27.8H, v15.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + sub v15.8H, v22.8H, v16.8H // ...................................................*........................ + add v16.8H, v22.8H, v16.8H // ....................................................*....................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v10.8H, v2.8H // .........................................*.................................. + sub v3.8H, v3.8H, v29.8H // ...............................*............................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q15, [x0, #64] // .....................................................................*...... + mls v14.8H, v11.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + mul v31.8H, v12.8H, v0.H[0] // ........................e................................................... + str q16, [x0], #(16) // ....................................................................*....... + add v15.8H, v3.8H, v6.8H // .........................................................*.................. + sub v28.8H, v26.8H, v27.8H // ..................................................................*......... + // gap // ............................................................................ 
+ sub v6.8H, v3.8H, v6.8H // ........................................................*................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v12.8H, v0.H[1] // .......................e.................................................... + str q15, [x0, #112] // ......................................................................*..... + add v11.8H, v10.8H, v2.8H // ..........................................*................................. + add v16.8H, v26.8H, v27.8H // ...................................................................*........ + // gap // ............................................................................ + mul v26.8H, v19.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v3.8H, v24.8H, v0.H[1] // ........e................................................................... + str q6, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + str q16, [x0, #368] // ..........................................................................*. + add v15.8H, v11.8H, v14.8H // ..............................................................*............. + mls v31.8H, v22.8H, v7.H[0] // .........................e.................................................. + // gap // ............................................................................ + sub v27.8H, v11.8H, v14.8H // .............................................................*.............. + mul v16.8H, v24.8H, v0.H[0] // .........e.................................................................. 
+ str q28, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + mls v26.8H, v23.8H, v7.H[0] // ....................e....................................................... + sqrdmulh v30.8H, v9.8H, v0.H[1] // .............e.............................................................. + str q15, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + add v15.8H, v20.8H, v31.8H // ...........................e................................................ + sub v14.8H, v20.8H, v31.8H // ..........................e................................................. + str q27, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ - // original source code - // ldr x10, [x0, #0] // ...............................................................................................|..*................................................................................................ - // ldr x11, [x0, #(0+8)] // .........*.....................................................................................|.............*..................................................................................... - // ins v8.d[0], x10 // ........*......................................................................................|............*...................................................................................... - // ins v8.d[1], x11 // ...................*...........................................................................|.......................*........................................................................... 
- // ldr x10, [x0, #(1*(512/8))] // .......e.......................................................................................|...........e....................................................................................... - // ldr x11, [x0, #((1*(512/8))+8)] // ...............................................................................................|.*................................................................................................. - // ins v9.d[0], x10 // ......*........................................................................................|..........*........................................................................................ - // ins v9.d[1], x11 // ..................*............................................................................|......................*............................................................................ - // ldr x10, [x0, #(2*(512/8))] // e..............................................................................................|....e.............................................................................................. - // ldr x11, [x0, #((2*(512/8))+8)] // ...........*...................................................................................|...............*................................................................................... - // ins v10.d[0], x10 // ...............................................................................................|*.................................................................................................. - // ins v10.d[1], x11 // ........................*......................................................................|............................*...................................................................... 
- // ldr x10, [x0, #(3*(512/8))] // .....*.........................................................................................|.........*......................................................................................... - // ldr x11, [x0, #((3*(512/8))+8)] // ...............*...............................................................................|...................*............................................................................... - // ins v11.d[0], x10 // .......................*.......................................................................|...........................*....................................................................... - // ins v11.d[1], x11 // .........................*.....................................................................|.............................*..................................................................... - // ldr x10, [x0, #(4*(512/8))] // .e.............................................................................................|.....e............................................................................................. - // ldr x11, [x0, #((4*(512/8))+8)] // ....e..........................................................................................|........e.......................................................................................... - // ins v12.d[0], x10 // .................................e.............................................................|.....................................e............................................................. - // ins v12.d[1], x11 // ...........................................................................................e...|...............................................................................................e... 
- // ldr x10, [x0, #(5*(512/8))] // ..........................e....................................................................|..............................e.................................................................... - // ldr x11, [x0, #((5*(512/8))+8)] // ..............e................................................................................|..................e................................................................................ - // ins v13.d[0], x10 // ..................................................e............................................|......................................................e............................................ - // ins v13.d[1], x11 // .......................................................................................e.......|...........................................................................................e....... - // ldr x10, [x0, #(6*(512/8))] // .................e.............................................................................|.....................e............................................................................. - // ldr x11, [x0, #((6*(512/8))+8)] // ..........e....................................................................................|..............e.................................................................................... - // ins v14.d[0], x10 // ...........................................................................e...................|...............................................................................e................... - // ins v14.d[1], x11 // ......................................................................................e........|..........................................................................................e........ 
- // ldr x10, [x0, #(7*(512/8))] // ....................e..........................................................................|........................e.......................................................................... - // ldr x11, [x0, #((7*(512/8))+8)] // ............................e..................................................................|................................e.................................................................. - // ins v15.d[0], x10 // .............*.................................................................................|.................*................................................................................. - // ins v15.d[1], x11 // ...........................*...................................................................|...............................*................................................................... - // mul v24.8h, v12.8h, v0.h[0] // ...............................................................................................*................................................................................................... - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ...*...........................................................................................|.......*........................................................................................... - // mls v24.8h, v12.8h, v7.h[0] // ............*..................................................................................|................*.................................................................................. - // sub v12.8h, v8.8h, v24.8h // ......................*........................................................................|..........................*........................................................................ 
- // add v8.8h, v8.8h, v24.8h // .....................*.........................................................................|.........................*......................................................................... - // mul v24.8h, v13.8h, v0.h[0] // ...............................................................................................|...*............................................................................................... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // .............................................................................................e.|.................................................................................................e. - // mls v24.8h, v13.8h, v7.h[0] // ...............................*...............................................................|...................................*............................................................... - // sub v13.8h, v9.8h, v24.8h // .......................................*.......................................................|...........................................*....................................................... - // add v9.8h, v9.8h, v24.8h // ......................................................*........................................|..........................................................*........................................ - // mul v24.8h, v14.8h, v0.h[0] // ..*............................................................................................|......*............................................................................................ - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ..........................................................................................e....|..............................................................................................e.... 
- // mls v24.8h, v14.8h, v7.h[0] // ................*..............................................................................|....................*.............................................................................. - // sub v14.8h, v10.8h, v24.8h // ................................*..............................................................|....................................*.............................................................. - // add v10.8h, v10.8h, v24.8h // ..................................*............................................................|......................................*............................................................ - // mul v24.8h, v15.8h, v0.h[0] // ..............................*................................................................|..................................*................................................................ - // sqrdmulh v15.8h, v15.8h, v0.h[1] // .............................*.................................................................|.................................*................................................................. - // mls v24.8h, v15.8h, v7.h[0] // ...................................*...........................................................|.......................................*........................................................... - // sub v15.8h, v11.8h, v24.8h // .........................................*.....................................................|.............................................*..................................................... - // add v11.8h, v11.8h, v24.8h // ...........................................*...................................................|...............................................*................................................... 
- // mul v24.8h, v10.8h, v0.h[2] // ......................................*........................................................|..........................................*........................................................ - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ........................................*......................................................|............................................*...................................................... - // mls v24.8h, v10.8h, v7.h[0] // ................................................*..............................................|....................................................*.............................................. - // sub v10.8h, v8.8h, v24.8h // .....................................................*.........................................|.........................................................*......................................... - // add v8.8h, v8.8h, v24.8h // .....................................................................*.........................|.........................................................................*......................... - // mul v24.8h, v11.8h, v0.h[2] // ...............................................*...............................................|...................................................*............................................... - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ..............................................*................................................|..................................................*................................................ - // mls v24.8h, v11.8h, v7.h[0] // ....................................................*..........................................|........................................................*.......................................... 
- // sub v11.8h, v9.8h, v24.8h // ..........................................................*....................................|..............................................................*.................................... - // add v9.8h, v9.8h, v24.8h // .........................................................*.....................................|.............................................................*..................................... - // mul v24.8h, v14.8h, v0.h[4] // ....................................*..........................................................|........................................*.......................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .....................................*.........................................................|.........................................*......................................................... - // mls v24.8h, v14.8h, v7.h[0] // ..........................................*....................................................|..............................................*.................................................... - // sub v14.8h, v12.8h, v24.8h // ......................................................................*........................|..........................................................................*........................ - // add v12.8h, v12.8h, v24.8h // .................................................*.............................................|.....................................................*............................................. - // mul v24.8h, v15.8h, v0.h[4] // .............................................*.................................................|.................................................*................................................. 
- // sqrdmulh v15.8h, v15.8h, v0.h[5] // ............................................*..................................................|................................................*.................................................. - // mls v24.8h, v15.8h, v7.h[0] // ...................................................*...........................................|.......................................................*........................................... - // sub v15.8h, v13.8h, v24.8h // .......................................................*.......................................|...........................................................*....................................... - // add v13.8h, v13.8h, v24.8h // ........................................................*......................................|............................................................*...................................... - // mul v24.8h, v9.8h, v0.h[6] // ..............................................................*................................|..................................................................*................................ - // sqrdmulh v9.8h, v9.8h, v0.h[7] // .............................................................*.................................|.................................................................*................................. - // mls v24.8h, v9.8h, v7.h[0] // ....................................................................*..........................|........................................................................*.......................... - // sub v9.8h, v8.8h, v24.8h // .............................................................................*.................|.................................................................................*................. 
- // add v8.8h, v8.8h, v24.8h // ..........................................................................*....................|..............................................................................*.................... - // mul v24.8h, v11.8h, v1.h[0] // ...................................................................*...........................|.......................................................................*........................... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ................................................................*..............................|....................................................................*.............................. - // mls v24.8h, v11.8h, v7.h[0] // .........................................................................*.....................|.............................................................................*..................... - // sub v11.8h, v10.8h, v24.8h // ..................................................................................*............|......................................................................................*............ - // add v10.8h, v10.8h, v24.8h // .....................................................................................*.........|.........................................................................................*......... - // mul v24.8h, v13.8h, v1.h[2] // ............................................................*..................................|................................................................*.................................. - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ...........................................................*...................................|...............................................................*................................... 
- // mls v24.8h, v13.8h, v7.h[0] // ..................................................................*............................|......................................................................*............................ - // sub v13.8h, v12.8h, v24.8h // .................................................................................*.............|.....................................................................................*............. - // add v12.8h, v12.8h, v24.8h // ........................................................................*......................|............................................................................*...................... - // mul v24.8h, v15.8h, v1.h[4] // .................................................................*.............................|.....................................................................*............................. - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ...............................................................*...............................|...................................................................*............................... - // mls v24.8h, v15.8h, v7.h[0] // .......................................................................*.......................|...........................................................................*....................... - // sub v15.8h, v14.8h, v24.8h // ....................................................................................*..........|........................................................................................*.......... - // add v14.8h, v14.8h, v24.8h // ...............................................................................*...............|...................................................................................*............... 
- // str q8, [x0], #(16) // ..............................................................................*................|..................................................................................*................ - // str q9, [x0, #(-16 + 1*(512/8))] // ................................................................................*..............|....................................................................................*.............. - // str q10, [x0, #(-16 + 2*(512/8))] // ............................................................................................*..|................................................................................................*.. - // str q11, [x0, #(-16 + 3*(512/8))] // ........................................................................................*......|............................................................................................*...... - // str q12, [x0, #(-16 + 4*(512/8))] // ............................................................................*..................|................................................................................*.................. - // str q13, [x0, #(-16 + 5*(512/8))] // .........................................................................................*.....|.............................................................................................*..... - // str q14, [x0, #(-16 + 6*(512/8))] // ...................................................................................*...........|.......................................................................................*........... 
- // str q15, [x0, #(-16 + 7*(512/8))] // ..............................................................................................*|..................................................................................................* + // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #0] // ................e..........................................................'................~.......................................................... + // ldr q9, [x0, #(1*(512/8))] // .............~.............................................................'.............*............................................................. + // ldr q10, [x0, #(2*(512/8))] // ............e..............................................................'............~.............................................................. + // ldr q11, [x0, #(3*(512/8))] // .........e.................................................................'.........~................................................................. + // ldr q12, [x0, #(4*(512/8))] // .....e.....................................................................'.....~..................................................................... + // ldr q13, [x0, #(5*(512/8))] // ....e......................................................................'....~...................................................................... + // ldr q14, [x0, #(6*(512/8))] // .e.........................................................................'.~......................................................................... 
+ // ldr q15, [x0, #(7*(512/8))] // e..........................................................................'~.......................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .............................................................e.............'.............................................................~............. + // mul v24.8h, v12.8h, v0.h[0] // ...................................................................e.......'...................................................................~....... + // mls v24.8h, v27.8h, v7.h[0] // ..~........................................................................'..*........................................................................ + // sub v12.8h, v8.8h, v24.8h // ..............~............................................................'..............*............................................................ + // add v8.8h, v8.8h, v24.8h // ..........~................................................................'..........*................................................................ + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ......................................................................e....'......................................................................~.... + // mul v24.8h, v13.8h, v0.h[0] // ...........................................................................*........................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................~........................................................'..................*........................................................ + // sub v13.8h, v9.8h, v24.8h // ........................~..................................................'........................*.................................................. 
+ // add v9.8h, v9.8h, v24.8h // ..........................~................................................'..........................*................................................ + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..............................e............................................'..............................~............................................ + // mul v24.8h, v14.8h, v0.h[0] // ............................................................e..............'............................................................~.............. + // mls v24.8h, v27.8h, v7.h[0] // .....................................................................e.....'.....................................................................~..... + // sub v14.8h, v10.8h, v24.8h // ...~.......................................................................'...*....................................................................... + // add v10.8h, v10.8h, v24.8h // ......~....................................................................'......*.................................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ........................................................e..................'........................................................~.................. + // mul v24.8h, v15.8h, v0.h[0] // ...................................................e.......................'...................................................~....................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e.........'.................................................................~......... + // sub v15.8h, v11.8h, v24.8h // .........................................................................e.'.........................................................................~. 
+ // add v11.8h, v11.8h, v24.8h // ........................................................................e..'........................................................................~.. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .....................~.....................................................'.....................*..................................................... + // mul v24.8h, v10.8h, v0.h[2] // ......................~....................................................'......................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................~...............................................'...........................*............................................... + // sub v10.8h, v8.8h, v24.8h // ................................................~..........................'................................................*.......................... + // add v8.8h, v8.8h, v24.8h // ..........................................~................................'..........................................*................................ + // sqrdmulh v27.8h, v11.8h, v0.h[3] // .................~.........................................................'.................*......................................................... + // mul v24.8h, v11.8h, v0.h[2] // .......~...................................................................'.......*................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................~...................................................'.......................*................................................... + // sub v11.8h, v9.8h, v24.8h // ...............................~...........................................'...............................*........................................... 
+ // add v9.8h, v9.8h, v24.8h // .............................~.............................................'.............................*............................................. + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ...............~...........................................................'...............*........................................................... + // mul v24.8h, v14.8h, v0.h[4] // ....................~......................................................'....................*...................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................~.................................................'.........................*................................................. + // sub v14.8h, v12.8h, v24.8h // ...............................................~...........................'...............................................*........................... + // add v12.8h, v12.8h, v24.8h // ..........................................................~................'..........................................................*................ + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ........~..................................................................'........*.................................................................. + // mul v24.8h, v15.8h, v0.h[4] // ...........~...............................................................'...........*............................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................~.......................................................'...................*....................................................... + // sub v15.8h, v13.8h, v24.8h // ............................~..............................................'............................*.............................................. 
+ // add v13.8h, v13.8h, v24.8h // ................................~..........................................'................................*.......................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // .................................~.........................................'.................................*......................................... + // mul v24.8h, v9.8h, v0.h[6] // ..................................~........................................'..................................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // .......................................~...................................'.......................................*................................... + // sub v9.8h, v8.8h, v24.8h // .............................................~.............................'.............................................*............................. + // add v8.8h, v8.8h, v24.8h // ..............................................~............................'..............................................*............................ + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ....................................~......................................'....................................*...................................... + // mul v24.8h, v11.8h, v1.h[0] // ...................................~.......................................'...................................*....................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................................~...............................'...........................................*............................... + // sub v11.8h, v10.8h, v24.8h // .......................................................~...................'.......................................................*................... 
+ // add v10.8h, v10.8h, v24.8h // .....................................................~.....................'.....................................................*..................... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // .........................................~.................................'.........................................*................................. + // mul v24.8h, v13.8h, v1.h[2] // ........................................~..................................'........................................*.................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................................~........................'..................................................*........................ + // sub v13.8h, v12.8h, v24.8h // ..................................................................~........'..................................................................*........ + // add v12.8h, v12.8h, v24.8h // ................................................................~..........'................................................................*.......... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // .....................................~.....................................'.....................................*..................................... + // mul v24.8h, v15.8h, v1.h[4] // ......................................~....................................'......................................*.................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................~..............................'............................................*.............................. + // sub v15.8h, v14.8h, v24.8h // ......................................................~....................'......................................................*.................... 
+ // add v14.8h, v14.8h, v24.8h // ...........................................................~...............'...........................................................*............... + // str q8, [x0], #(16) // ....................................................~......................'....................................................*...................... + // str q9, [x0, #(-16 + 1*(512/8))] // .................................................~.........................'.................................................*......................... + // str q10, [x0, #(-16 + 2*(512/8))] // .........................................................~.................'.........................................................*................. + // str q11, [x0, #(-16 + 3*(512/8))] // ..............................................................~............'..............................................................*............ + // str q12, [x0, #(-16 + 4*(512/8))] // .......................................................................~...'.......................................................................*... + // str q13, [x0, #(-16 + 5*(512/8))] // ..........................................................................~'..........................................................................* + // str q14, [x0, #(-16 + 6*(512/8))] // ...............................................................~...........'...............................................................*........... + // str q15, [x0, #(-16 + 7*(512/8))] // ....................................................................~......'....................................................................*...... sub count, count, #1 cbnz count, layer123_start - ldr x21, [x0, #8] // ..........*....................................................................... - mul v31.8H, v29.8H, v0.H[0] // *................................................................................. 
- ins v6.d[0], x27 // .............*.................................................................... - ldr x15, [x0, #136] // ...........*...................................................................... - sqrdmulh v3.8H, v29.8H, v0.H[1] // ......*........................................................................... - ldr x19, [x0, #192] // .......*.......................................................................... - ldr x29, [x0, #200] // ..............*................................................................... - // gap // .................................................................................. - ins v19.d[0], x11 // ........*......................................................................... - ins v6.d[1], x17 // .......................*.......................................................... - ldr x23, [x0, #72] // ..*............................................................................... - ldr x14, [x0, #0] // ...*.............................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v29.8H, v6.8H, v0.H[1] // ........................*......................................................... - mul v24.8H, v6.8H, v0.H[0] // .........................*........................................................ - // gap // .................................................................................. - // gap // .................................................................................. - ins v25.d[0], x9 // .*................................................................................ 
- ins v22.d[0], x19 // ....................*............................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - mls v24.8H, v29.8H, v7.H[0] // .............................*.................................................... - // gap // .................................................................................. - // gap // .................................................................................. - mul v23.8H, v5.8H, v0.H[0] // ....*............................................................................. - ins v19.d[1], x23 // ................*................................................................. - ins v22.d[1], x29 // ......................*........................................................... - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - add v9.8H, v22.8H, v24.8H // .....................................*............................................ 
- sub v22.8H, v22.8H, v24.8H // ...................................*.............................................. - // gap // .................................................................................. - // gap // .................................................................................. - mul v2.8H, v4.8H, v0.H[0] // .....*............................................................................ - mls v23.8H, v27.8H, v7.H[0] // ..........................*....................................................... - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v4.8H, v9.8H, v0.H[3] // ........................................*......................................... - mul v24.8H, v9.8H, v0.H[2] // .........................................*........................................ - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v15.8H, v22.8H, v0.H[5] // ......................................*........................................... - mul v28.8H, v22.8H, v0.H[4] // .......................................*.......................................... - // gap // .................................................................................. - // gap // .................................................................................. - sub v29.8H, v19.8H, v23.8H // .................................*................................................ - mls v2.8H, v20.8H, v7.H[0] // ...............*.................................................................. - // gap // .................................................................................. - // gap // .................................................................................. 
- add v9.8H, v19.8H, v23.8H // ...............................................*.................................. - mls v24.8H, v4.8H, v7.H[0] // .............................................*.................................... - // gap // .................................................................................. - // gap // .................................................................................. - mls v28.8H, v15.8H, v7.H[0] // ............................................*..................................... - ins v4.d[0], x14 // .........*........................................................................ - // gap // .................................................................................. - // gap // .................................................................................. - ins v25.d[1], x15 // .....................*............................................................ - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - add v26.8H, v9.8H, v24.8H // ..................................................*............................... - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - sub v15.8H, v25.8H, v2.8H // ...........................*...................................................... - mls v31.8H, v3.8H, v7.H[0] // ............*..................................................................... - // gap // .................................................................................. 
- // gap // .................................................................................. - add v22.8H, v25.8H, v2.8H // ............................*..................................................... - add v2.8H, v29.8H, v28.8H // .................................................*................................ - // gap // .................................................................................. - // gap // .................................................................................. - mul v19.8H, v15.8H, v0.H[4] // ..............................*................................................... - sqrdmulh v11.8H, v15.8H, v0.H[5] // ...............................*.................................................. - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v12.8H, v2.8H, v1.H[3] // ....................................................*............................. - mul v3.8H, v2.8H, v1.H[2] // .....................................................*............................ - // gap // .................................................................................. - // gap // .................................................................................. - ins v4.d[1], x21 // .................*................................................................ - sub v2.8H, v29.8H, v28.8H // ................................................*................................. - // gap // .................................................................................. - // gap // .................................................................................. - mls v19.8H, v11.8H, v7.H[0] // ....................................*............................................. - // gap // .................................................................................. 
- // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v8.8H, v2.8H, v1.H[5] // ........................................................*......................... - sub v29.8H, v4.8H, v31.8H // ...................*.............................................................. - // gap // .................................................................................. - // gap // .................................................................................. - mls v3.8H, v12.8H, v7.H[0] // ...........................................................*...................... - mul v15.8H, v2.8H, v1.H[4] // ..........................................................*....................... - // gap // .................................................................................. - // gap // .................................................................................. - add v30.8H, v29.8H, v19.8H // ...........................................*...................................... - mul v28.8H, v22.8H, v0.H[2] // ................................*................................................. - // gap // .................................................................................. - // gap // .................................................................................. - sub v9.8H, v9.8H, v24.8H // ...................................................*.............................. - sub v11.8H, v29.8H, v19.8H // ...............................................................*.................. - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v22.8H, v22.8H, v0.H[3] // ..................................*............................................... 
- // gap // .................................................................................. - // gap // .................................................................................. - sub v12.8H, v30.8H, v3.8H // .........................................................................*........ - mls v15.8H, v8.8H, v7.H[0] // ................................................................*................. - // gap // .................................................................................. - // gap // .................................................................................. - sqrdmulh v21.8H, v9.8H, v1.H[1] // .........................................................*........................ - str q12, [x0, #320] // ...............................................................................*.. - add v6.8H, v4.8H, v31.8H // ..................*............................................................... - mul v20.8H, v9.8H, v1.H[0] // ............................................................*..................... - // gap // .................................................................................. - mls v28.8H, v22.8H, v7.H[0] // ..........................................*....................................... - sqrdmulh v18.8H, v26.8H, v0.H[7] // ......................................................*........................... - // gap // .................................................................................. - // gap // .................................................................................. - mul v27.8H, v26.8H, v0.H[6] // .......................................................*.......................... - sub v9.8H, v11.8H, v15.8H // ............................................................................*..... - // gap // .................................................................................. 
- // gap // .................................................................................. - mls v20.8H, v21.8H, v7.H[0] // ..................................................................*............... - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - str q9, [x0, #448] // .................................................................................* - add v9.8H, v30.8H, v3.8H // .................................................................*................ - sub v12.8H, v6.8H, v28.8H // ..............................................*................................... - // gap // .................................................................................. - mls v27.8H, v18.8H, v7.H[0] // .............................................................*.................... - add v29.8H, v11.8H, v15.8H // .......................................................................*.......... - // gap // .................................................................................. - // gap // .................................................................................. - add v2.8H, v6.8H, v28.8H // ..............................................................*................... - str q9, [x0, #256] // ....................................................................*............. - // gap // .................................................................................. - sub v31.8H, v12.8H, v20.8H // ..........................................................................*....... - add v22.8H, v12.8H, v20.8H // .............................................................................*.... - str q29, [x0, #384] // ...........................................................................*...... 
- // gap // .................................................................................. - // gap // .................................................................................. - str q31, [x0, #192] // ..............................................................................*... - add v9.8H, v2.8H, v27.8H // ...................................................................*.............. - // gap // .................................................................................. - // gap // .................................................................................. - sub v26.8H, v2.8H, v27.8H // .....................................................................*............ - str q22, [x0, #128] // ................................................................................*. - // gap // .................................................................................. - // gap // .................................................................................. - str q9, [x0], #(16) // ......................................................................*........... - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. - str q26, [x0, #48] // ........................................................................*......... - // gap // .................................................................................. - // gap // .................................................................................. - // gap // .................................................................................. 
+ // Instructions: 58 + // Expected cycles: 28 + // Expected IPC: 2.07 + // + // Cycle bound: 27.0 + // IPC bound: 2.15 + // + // Wall time: 3600.22s + // User time: 3600.22s + // + // ------------------- original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------- + mul v8.8H, v9.8H, v0.H[0] // *......................................................... + mls v16.8H, v3.8H, v7.H[0] // .*........................................................ + ldr q11, [x0, #64] // ........*................................................. + // gap // .......................................................... + sub v12.8H, v13.8H, v26.8H // ..*....................................................... + add v27.8H, v13.8H, v26.8H // ...*...................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v6.8H, v15.8H, v0.H[3] // ...........*.............................................. + mul v9.8H, v15.8H, v0.H[2] // ....*..................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v8.8H, v30.8H, v7.H[0] // ............*............................................. + sqrdmulh v29.8H, v14.8H, v0.H[5] // .....*.................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v31.8H, v14.8H, v0.H[4] // .......*.................................................. + add v23.8H, v25.8H, v16.8H // ......*................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ mls v9.8H, v6.8H, v7.H[0] // .................*........................................ + mul v22.8H, v12.8H, v0.H[4] // ..............*........................................... + // gap // .......................................................... + // gap // .......................................................... + add v13.8H, v11.8H, v8.8H // ....................*..................................... + sqrdmulh v15.8H, v12.8H, v0.H[5] // ..........*............................................... + // gap // .......................................................... + // gap // .......................................................... + mls v31.8H, v29.8H, v7.H[0] // .............*............................................ + mul v6.8H, v27.8H, v0.H[2] // ................*......................................... + // gap // .......................................................... + // gap // .......................................................... + sub v14.8H, v11.8H, v8.8H // ..................*....................................... + add v26.8H, v13.8H, v9.8H // .......................*.................................. + // gap // .......................................................... + // gap // .......................................................... + sub v2.8H, v25.8H, v16.8H // .........*................................................ + mls v22.8H, v15.8H, v7.H[0] // ...................*...................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v29.8H, v26.8H, v0.H[7] // ..........................*............................... + add v25.8H, v14.8H, v31.8H // .........................*................................ + // gap // .......................................................... + // gap // .......................................................... 
+ sub v3.8H, v14.8H, v31.8H // ......................*................................... + mul v8.8H, v26.8H, v0.H[6] // ...........................*.............................. + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v11.8H, v25.8H, v1.H[3] // ..................................*....................... + mul v17.8H, v25.8H, v1.H[2] // .................................*........................ + // gap // .......................................................... + // gap // .......................................................... + mul v15.8H, v3.8H, v1.H[4] // ...............................*.......................... + sub v21.8H, v13.8H, v9.8H // ........................*................................. + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v20.8H, v3.8H, v1.H[5] // ..............................*........................... + sqrdmulh v16.8H, v27.8H, v0.H[3] // ...............*.......................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v13.8H, v21.8H, v1.H[1] // .............................*............................ + mls v17.8H, v11.8H, v7.H[0] // ...........................................*.............. + // gap // .......................................................... + // gap // .......................................................... + mul v31.8H, v21.8H, v1.H[0] // ............................*............................. + add v28.8H, v2.8H, v22.8H // .................................................*........ + // gap // .......................................................... + // gap // .......................................................... 
+ mls v6.8H, v16.8H, v7.H[0] // .....................*.................................... + mls v15.8H, v20.8H, v7.H[0] // .....................................*.................... + // gap // .......................................................... + // gap // .......................................................... + sub v14.8H, v2.8H, v22.8H // ........................................*................. + add v16.8H, v28.8H, v17.8H // .....................................................*.... + // gap // .......................................................... + // gap // .......................................................... + sub v12.8H, v28.8H, v17.8H // ......................................................*... + mls v8.8H, v29.8H, v7.H[0] // ................................*......................... + // gap // .......................................................... + // gap // .......................................................... + str q16, [x0, #256] // ........................................................*. + sub v27.8H, v14.8H, v15.8H // ..............................................*........... + add v26.8H, v23.8H, v6.8H // ...................................*...................... + // gap // .......................................................... + add v16.8H, v14.8H, v15.8H // ..................................................*....... + str q12, [x0, #320] // .........................................................* + mls v31.8H, v13.8H, v7.H[0] // ....................................*..................... + // gap // .......................................................... + str q27, [x0, #448] // .......................................................*.. + sub v27.8H, v23.8H, v6.8H // .........................................*................ + add v6.8H, v26.8H, v8.8H // .......................................*.................. + // gap // .......................................................... 
+ str q16, [x0, #384] // ....................................................*..... + sub v26.8H, v26.8H, v8.8H // ......................................*................... + // gap // .......................................................... + // gap // .......................................................... + str q6, [x0], #(16) // ............................................*............. + sub v21.8H, v27.8H, v31.8H // ...............................................*.......... + add v27.8H, v27.8H, v31.8H // .............................................*............ + // gap // .......................................................... + str q26, [x0, #48] // ..........................................*............... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + str q21, [x0, #176] // ...................................................*...... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + str q27, [x0, #112] // ................................................*......... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... - // original source code - // mul v9.8H, v29.8H, v0.H[0] // .*................................................................................ - // ins v30.d[0], x9 // .............*.................................................................... - // ldr x12, [x0, #72] // .........*........................................................................ 
- // ldr x13, [x0, #0] // ..........*....................................................................... - // mul v24.8H, v5.8H, v0.H[0] // ................*................................................................. - // mul v22.8H, v4.8H, v0.H[0] // .....................*............................................................ - // sqrdmulh v11.8H, v29.8H, v0.H[1] // ....*............................................................................. - // ldr x19, [x0, #192] // .....*............................................................................ - // ins v19.d[0], x11 // .......*.......................................................................... - // ins v31.d[0], x13 // ................................*................................................. - // ldr x24, [x0, #8] // *................................................................................. - // ldr x25, [x0, #136] // ...*.............................................................................. - // mls v9.8H, v11.8H, v7.H[0] // ....................................*............................................. - // ins v5.d[0], x27 // ..*............................................................................... - // ldr x10, [x0, #200] // ......*........................................................................... - // mls v22.8H, v20.8H, v7.H[0] // ............................*..................................................... - // ins v19.d[1], x12 // .................*................................................................ - // ins v31.d[1], x24 // ...........................................*...................................... - // add v10.8H, v31.8H, v9.8H // ...........................................................*...................... - // sub v20.8H, v31.8H, v9.8H // ...............................................*.................................. 
- // ins v9.d[0], x19 // ..............*................................................................... - // ins v30.d[1], x25 // .................................*................................................ - // ins v9.d[1], x10 // ..................*............................................................... - // ins v5.d[1], x17 // ........*......................................................................... - // sqrdmulh v14.8H, v5.8H, v0.H[1] // ...........*...................................................................... - // mul v31.8H, v5.8H, v0.H[0] // ............*..................................................................... - // mls v24.8H, v27.8H, v7.H[0] // ......................*........................................................... - // sub v26.8H, v30.8H, v22.8H // ...................................*.............................................. - // add v22.8H, v30.8H, v22.8H // .....................................*............................................ - // mls v31.8H, v14.8H, v7.H[0] // ...............*.................................................................. - // mul v25.8H, v26.8H, v0.H[4] // .......................................*.......................................... - // sqrdmulh v2.8H, v26.8H, v0.H[5] // ........................................*......................................... - // mul v28.8H, v22.8H, v0.H[2] // ...................................................*.............................. - // sub v18.8H, v19.8H, v24.8H // ...........................*...................................................... - // sqrdmulh v27.8H, v22.8H, v0.H[3] // ......................................................*........................... - // sub v6.8H, v9.8H, v31.8H // ....................*............................................................. - // mls v25.8H, v2.8H, v7.H[0] // .............................................*.................................... 
- // add v12.8H, v9.8H, v31.8H // ...................*.............................................................. - // sqrdmulh v8.8H, v6.8H, v0.H[5] // .........................*........................................................ - // mul v2.8H, v6.8H, v0.H[4] // ..........................*....................................................... - // sqrdmulh v22.8H, v12.8H, v0.H[3] // .......................*.......................................................... - // mul v21.8H, v12.8H, v0.H[2] // ........................*......................................................... - // mls v28.8H, v27.8H, v7.H[0] // .............................................................*.................... - // add v3.8H, v20.8H, v25.8H // ..................................................*............................... - // mls v2.8H, v8.8H, v7.H[0] // ...............................*.................................................. - // mls v21.8H, v22.8H, v7.H[0] // ..............................*................................................... - // sub v16.8H, v10.8H, v28.8H // ....................................................................*............. - // add v22.8H, v19.8H, v24.8H // .............................*.................................................... - // sub v27.8H, v18.8H, v2.8H // ............................................*..................................... - // add v9.8H, v18.8H, v2.8H // ......................................*........................................... - // add v11.8H, v22.8H, v21.8H // ..................................*............................................... - // sub v8.8H, v22.8H, v21.8H // ....................................................*............................. - // sqrdmulh v2.8H, v9.8H, v1.H[3] // .........................................*........................................ 
- // mul v31.8H, v9.8H, v1.H[2] // ..........................................*....................................... - // sqrdmulh v17.8H, v11.8H, v0.H[7] // ..............................................................*................... - // mul v18.8H, v11.8H, v0.H[6] // ...............................................................*.................. - // sqrdmulh v11.8H, v27.8H, v1.H[5] // ..............................................*................................... - // sqrdmulh v12.8H, v8.8H, v1.H[1] // .........................................................*........................ - // mul v15.8H, v27.8H, v1.H[4] // .................................................*................................ - // mls v31.8H, v2.8H, v7.H[0] // ................................................*................................. - // mul v21.8H, v8.8H, v1.H[0] // ............................................................*..................... - // mls v18.8H, v17.8H, v7.H[0] // .....................................................................*............ - // add v2.8H, v10.8H, v28.8H // .......................................................................*.......... - // sub v28.8H, v20.8H, v25.8H // .....................................................*............................ - // mls v15.8H, v11.8H, v7.H[0] // ........................................................*......................... - // add v30.8H, v3.8H, v31.8H // ...................................................................*.............. - // mls v21.8H, v12.8H, v7.H[0] // .................................................................*................ - // add v27.8H, v2.8H, v18.8H // .............................................................................*.... - // str q30, [x0, #256] // ........................................................................*......... 
- // sub v12.8H, v2.8H, v18.8H // ..............................................................................*... - // str q27, [x0], #(16) // ................................................................................*. - // add v20.8H, v28.8H, v15.8H // ......................................................................*........... - // str q12, [x0, #48] // .................................................................................* - // sub v9.8H, v3.8H, v31.8H // .......................................................*.......................... - // sub v8.8H, v16.8H, v21.8H // .........................................................................*........ - // str q20, [x0, #368] // ...........................................................................*...... - // sub v11.8H, v28.8H, v15.8H // ................................................................*................. - // add v18.8H, v16.8H, v21.8H // ..........................................................................*....... - // str q8, [x0, #176] // ............................................................................*..... - // str q9, [x0, #304] // ..........................................................*....................... - // str q18, [x0, #112] // ...............................................................................*.. - // str q11, [x0, #432] // ..................................................................*............... + // --------------------- new position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------- + // mul v6.8H, v9.8H, v0.H[0] // *......................................................... + // mls v16.8H, v3.8H, v7.H[0] // .*........................................................ + // sub v22.8H, v13.8H, v26.8H // ...*...................................................... + // add v11.8H, v13.8H, v26.8H // ....*..................................................... 
+ // mul v26.8H, v15.8H, v0.H[2] // ......*................................................... + // sqrdmulh v2.8H, v14.8H, v0.H[5] // ........*................................................. + // add v3.8H, v25.8H, v16.8H // ..........*............................................... + // mul v27.8H, v14.8H, v0.H[4] // .........*................................................ + // ldr q5, [x0, #64] // ..*....................................................... + // sub v10.8H, v25.8H, v16.8H // ...................*...................................... + // sqrdmulh v16.8H, v22.8H, v0.H[5] // ..............*........................................... + // sqrdmulh v15.8H, v15.8H, v0.H[3] // .....*.................................................... + // mls v6.8H, v30.8H, v7.H[0] // .......*.................................................. + // mls v27.8H, v2.8H, v7.H[0] // ...............*.......................................... + // mul v2.8H, v22.8H, v0.H[4] // ............*............................................. + // sqrdmulh v14.8H, v11.8H, v0.H[3] // ..............................*........................... + // mul v29.8H, v11.8H, v0.H[2] // ................*......................................... + // mls v26.8H, v15.8H, v7.H[0] // ...........*.............................................. + // sub v11.8H, v5.8H, v6.8H // .................*........................................ + // mls v2.8H, v16.8H, v7.H[0] // ....................*..................................... + // add v15.8H, v5.8H, v6.8H // .............*............................................ + // mls v29.8H, v14.8H, v7.H[0] // ...................................*...................... + // sub v17.8H, v11.8H, v27.8H // .......................*.................................. + // add v16.8H, v15.8H, v26.8H // ..................*....................................... + // sub v28.8H, v15.8H, v26.8H // ............................*............................. 
+ // add v26.8H, v11.8H, v27.8H // ......................*................................... + // sqrdmulh v4.8H, v16.8H, v0.H[7] // .....................*.................................... + // mul v16.8H, v16.8H, v0.H[6] // ........................*................................. + // mul v6.8H, v28.8H, v1.H[0] // .................................*........................ + // sqrdmulh v31.8H, v28.8H, v1.H[1] // ...............................*.......................... + // sqrdmulh v15.8H, v17.8H, v1.H[5] // .............................*............................ + // mul v27.8H, v17.8H, v1.H[4] // ...........................*.............................. + // mls v16.8H, v4.8H, v7.H[0] // ........................................*................. + // mul v14.8H, v26.8H, v1.H[2] // ..........................*............................... + // sqrdmulh v11.8H, v26.8H, v1.H[3] // .........................*................................ + // add v22.8H, v3.8H, v29.8H // ...........................................*.............. + // mls v6.8H, v31.8H, v7.H[0] // ..............................................*........... + // mls v27.8H, v15.8H, v7.H[0] // ....................................*..................... + // sub v15.8H, v22.8H, v16.8H // ...................................................*...... + // add v16.8H, v22.8H, v16.8H // .................................................*........ + // sub v26.8H, v10.8H, v2.8H // .....................................*.................... + // sub v3.8H, v3.8H, v29.8H // ................................................*......... + // str q15, [x0, #64] // .......................................................*.. + // mls v14.8H, v11.8H, v7.H[0] // ................................*......................... + // str q16, [x0], #(16) // ....................................................*..... + // add v15.8H, v3.8H, v6.8H // ......................................................*... 
+ // sub v28.8H, v26.8H, v27.8H // ..........................................*............... + // sub v6.8H, v3.8H, v6.8H // .....................................................*.... + // str q15, [x0, #112] // .........................................................* + // add v11.8H, v10.8H, v2.8H // ..................................*....................... + // add v16.8H, v26.8H, v27.8H // ............................................*............. + // str q6, [x0, #176] // ........................................................*. + // str q16, [x0, #368] // ..................................................*....... + // add v15.8H, v11.8H, v14.8H // ......................................*................... + // sub v27.8H, v11.8H, v14.8H // .......................................*.................. + // str q28, [x0, #432] // ...............................................*.......... + // str q15, [x0, #240] // .........................................*................ + // str q27, [x0, #304] // .............................................*............ restore inp, STACK0 mov count, #8 .p2align 2 - ldr x19, [x4], #(6*16) // ...*...................... - ldr x15, [x1, #48] // *......................... - // gap // .......................... - // gap // .......................... - ldr x12, [x3], #16 // .......*.................. - ldr x16, [x1, #8] // ..............*........... - // gap // .......................... - // gap // .......................... - ldr x29, [x1, #16] // .....*.................... - ldr x21, [x4, #-80] // ........*................. - // gap // .......................... - // gap // .......................... - ins v20.d[0], x19 // .....................*.... - ins v27.d[0], x15 // ..................*....... - ldr x26, [x3, #-8] // ....*..................... - ldr x10, [x1, #56] // ......*................... - ldr x28, [x4, #-32] // ..........*............... - ldr x15, [x1, #0] // ............*............. 
- // gap // .......................... - // gap // .......................... - ldr x8, [x4, #-40] // ...........*.............. - ins v2.d[0], x21 // .........................* - ins v3.d[0], x12 // .............*............ - // gap // .......................... - ldr x19, [x1, #32] // .........*................ - // gap // .......................... - // gap // .......................... - // gap // .......................... - ins v3.d[1], x26 // .................*........ - ins v27.d[1], x10 // ....................*..... - // gap // .......................... - // gap // .......................... - // gap // .......................... - ldr x10, [x4, #-88] // ...............*.......... - // gap // .......................... - // gap // .......................... - ldr x27, [x1, #40] // .*........................ - mul v22.8H, v27.8H, v3.H[0] // ......................*... - sqrdmulh v18.8H, v27.8H, v3.H[1] // .......................*.. - // gap // .......................... - ldr x20, [x4, #-48] // ................*......... - ins v5.d[0], x15 // ...................*...... - ins v27.d[0], x19 // ........................*. - ldr x15, [x1, #24] // ..*....................... + // Instructions: 32 + // Expected cycles: 21 + // Expected IPC: 1.52 + // + // Cycle bound: 21.0 + // IPC bound: 1.52 + // + // Wall time: 0.33s + // User time: 0.33s + // + // ------ original position ------> + // 0 25 + // |------------------------|------ + ldr q12, [x1, #48] // *............................... + ldr q15, [x3], #16 // .*.............................. + // gap // ................................ + // gap // ................................ + ldr q28, [x1, #16] // ......*......................... + ldr q13, [x4, #16] // ....................*........... + // gap // ................................ + // gap // ................................ + ldr q11, [x4, #32] // ............................*... + // gap // ................................ 
+ // gap // ................................ + // gap // ................................ + ldr q9, [x1, #32] // ..*............................. + // gap // ................................ + // gap // ................................ + // gap // ................................ + mul v30.8H, v12.8H, v15.H[0] // ....*........................... + sqrdmulh v16.8H, v12.8H, v15.H[1] // .....*.......................... + ldr q6, [x4, #48] // .............................*.. + // gap // ................................ + ldr q14, [x4], #(6*16) // ...............................* + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v30.8H, v16.8H, v7.H[0] // .......*........................ + mul v4.8H, v9.8H, v15.H[0] // ........*....................... + // gap // ................................ + // gap // ................................ + sqrdmulh v0.8H, v9.8H, v15.H[1] // .........*...................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + sub v3.8H, v28.8H, v30.8H // ..........*..................... + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v10.8H, v28.8H, v30.8H // ...........*.................... + mls v4.8H, v0.8H, v7.H[0] // ............*................... + // gap // ................................ + // gap // ................................ + sqrdmulh v26.8H, v3.8H, v15.H[5] // .............*.................. 
+ mul v16.8H, v3.8H, v15.H[4] // ..............*................. + ldr q1, [x1, #0] // ...*............................ + // gap // ................................ + sqrdmulh v20.8H, v10.8H, v15.H[3] // ..................*............. + // gap // ................................ + // gap // ................................ + mul v28.8H, v10.8H, v15.H[2] // .................*.............. + // gap // ................................ + // gap // ................................ + // gap // ................................ + // gap // ................................ + mls v16.8H, v26.8H, v7.H[0] // ................*............... + ldr q0, [x4, #-16] // ..........................*..... + // gap // ................................ + // gap // ................................ + mls v28.8H, v20.8H, v7.H[0] // ......................*......... + sub v30.8H, v1.8H, v4.8H // ...............*................ + ldr q20, [x4, #-32] // .........................*...... + // gap // ................................ + add v8.8H, v1.8H, v4.8H // ...................*............ + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v21.8H, v30.8H, v16.8H // .....................*.......... + sub v15.8H, v30.8H, v16.8H // .......................*........ + // gap // ................................ + // gap // ................................ + sub v31.8H, v8.8H, v28.8H // ........................*....... + // gap // ................................ + // gap // ................................ + // gap // ................................ + add v2.8H, v8.8H, v28.8H // ...........................*.... + trn1 v22.4S, v21.4S, v15.4S // ..............................*. + // gap // ................................ + // gap // ................................ - // original source code - // ldr x19, [x1, #48] // .*........................ - // ldr x27, [x1, #40] // ...................*...... 
- // ldr x15, [x1, #24] // .........................* - // ldr x14, [x4], #(6*16) // *......................... - // ldr x21, [x3, #8] // ........*................. - // ldr x29, [x1, #16] // ....*..................... - // ldr x24, [x1, #56] // .........*................ - // ldr x16, [x3], #16 // ..*....................... - // ldr x17, [x4, #-80] // .....*.................... - // ldr x22, [x1, #32] // ...............*.......... - // ldr x28, [x4, #-32] // ..........*............... - // ldr x8, [x4, #-40] // ............*............. - // ldr x12, [x1, #0] // ...........*.............. - // ins v3.d[0], x16 // ..............*........... - // ldr x16, [x1, #8] // ...*...................... - // ldr x10, [x4, #-88] // ..................*....... - // ldr x20, [x4, #-48] // ......................*... - // ins v3.d[1], x21 // ................*......... - // ins v27.d[0], x19 // .......*.................. - // ins v5.d[0], x12 // .......................*.. - // ins v27.d[1], x24 // .................*........ - // ins v20.d[0], x14 // ......*................... - // mul v22.8H, v27.8H, v3.H[0] // ....................*..... - // sqrdmulh v18.8H, v27.8H, v3.H[1] // .....................*.... - // ins v27.d[0], x22 // ........................*. - // ins v2.d[0], x17 // .............*............ + // -------- new position ---------> + // 0 25 + // |------------------------|------ + // ldr q27, [x1, #48] // *............................... + // ldr q2, [x3], #16 // .*.............................. + // ldr q8, [x1, #32] // .....*.......................... + // ldr q19, [x1, #0] // ..................*............. + // mul v3.8H, v27.8H, v2.H[0] // ......*......................... + // sqrdmulh v17.8H, v27.8H, v2.H[1] // .......*........................ + // ldr q29, [x1, #16] // ..*............................. + // mls v3.8H, v17.8H, v7.H[0] // ..........*..................... + // mul v23.8H, v8.8H, v2.H[0] // ...........*.................... 
+ // sqrdmulh v25.8H, v8.8H, v2.H[1] // ............*................... + // sub v31.8H, v29.8H, v3.8H // .............*.................. + // add v29.8H, v29.8H, v3.8H // ..............*................. + // mls v23.8H, v25.8H, v7.H[0] // ...............*................ + // sqrdmulh v6.8H, v31.8H, v2.H[5] // ................*............... + // mul v10.8H, v31.8H, v2.H[4] // .................*.............. + // sub v12.8H, v19.8H, v23.8H // ........................*....... + // mls v10.8H, v6.8H, v7.H[0] // .....................*.......... + // mul v22.8H, v29.8H, v2.H[2] // ....................*........... + // sqrdmulh v0.8H, v29.8H, v2.H[3] // ...................*............ + // add v3.8H, v19.8H, v23.8H // ..........................*..... + // ldr q13, [x4, #16] // ...*............................ + // add v21.8H, v12.8H, v10.8H // ...........................*.... + // mls v22.8H, v0.8H, v7.H[0] // .......................*........ + // sub v15.8H, v12.8H, v10.8H // ............................*... + // sub v31.8H, v3.8H, v22.8H // .............................*.. + // ldr q20, [x4, #64] // .........................*...... + // ldr q0, [x4, #80] // ......................*......... + // add v2.8H, v3.8H, v22.8H // ..............................*. + // ldr q11, [x4, #32] // ....*........................... + // ldr q6, [x4, #48] // ........*....................... + // trn1 v22.4S, v21.4S, v15.4S // ...............................* + // ldr q14, [x4], #(6*16) // .........*...................... sub count, count, #1 layer4567_start: - ins v27.d[1], x27 // ...........*............................................................................................. - // gap // ......................................................................................................... - ldr x11, [x4, #-56] // .........................................................*............................................... 
- ins v6.d[0], x29 // ......*.................................................................................................. - ldr x19, [x1, #112] // ............e............................................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x27, [x1, #104] // .........e............................................................................................... - // gap // ......................................................................................................... - mls v22.8H, v18.8H, v7.H[0] // ...........................*............................................................................. - mul v18.8H, v27.8H, v3.H[0] // ....................*.................................................................................... - ldr x12, [x4, #-16] // ....................................................................*.................................... - sqrdmulh v9.8H, v27.8H, v3.H[1] // .....................*................................................................................... - // gap // ......................................................................................................... - ins v6.d[1], x15 // .......*................................................................................................. - ins v20.d[1], x10 // ...................................................*..................................................... - ldr x10, [x4, #-72] // .....................................................*................................................... 
- ldr x15, [x1, #88] // .....e................................................................................................... - // gap // ......................................................................................................... - ldr x14, [x4], #(6*16) // ................................................e........................................................ - ldr x21, [x3, #8] // .................e....................................................................................... - // gap // ......................................................................................................... - add v11.8H, v6.8H, v22.8H // .............................*........................................................................... - ldr x29, [x1, #80] // ....e.................................................................................................... - mls v18.8H, v9.8H, v7.H[0] // ......................*.................................................................................. - sub v24.8H, v6.8H, v22.8H // ............................*............................................................................ - // gap // ......................................................................................................... - ins v2.d[1], x10 // .......................................................*................................................. - ins v5.d[1], x16 // ...*..................................................................................................... - ldr x10, [x4, #-160] // ........................................................*................................................ - ldr x24, [x1, #120] // .............e........................................................................................... - ldr x23, [x4, #-104] // .....................................................................*................................... 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x16, [x3], #16 // ................e........................................................................................ - // gap // ......................................................................................................... - ins v31.d[0], x12 // ......................................................................*.................................. - sqrdmulh v9.8H, v11.8H, v3.H[3] // ...............................*......................................................................... - mul v13.8H, v24.8H, v3.H[4] // ...................................*..................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x12, [x4, #-120] // .................................................................*....................................... - ldr x17, [x4, #-80] // ....................................................e.................................................... - mul v11.8H, v11.8H, v3.H[2] // ..............................*.......................................................................... - // gap // ......................................................................................................... - add v27.8H, v5.8H, v18.8H // ........................*................................................................................ 
- ins v31.d[1], x23 // .......................................................................*................................. - sqrdmulh v3.8H, v24.8H, v3.H[5] // ....................................*.................................................................... - ldr x22, [x1, #96] // ........e................................................................................................ - // gap // ......................................................................................................... - ins v8.d[0], x20 // ..............................................................*.......................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v11.8H, v9.8H, v7.H[0] // ................................*........................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v13.8H, v3.8H, v7.H[0] // .....................................*................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v28.d[0], x28 // ..................................................................*...................................... 
- ldr x28, [x4, #-32] // ................................................................e........................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v10.8H, v5.8H, v18.8H // .......................*................................................................................. - sub v22.8H, v27.8H, v11.8H // .................................*....................................................................... - add v9.8H, v27.8H, v11.8H // ..................................*...................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v21.8H, v10.8H, v13.8H // .......................................*................................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v1.8H, v10.8H, v13.8H // ......................................*.................................................................. - trn2 v11.4S, v9.4S, v22.4S // .........................................*............................................................... - trn1 v9.4S, v9.4S, v22.4S // ........................................*................................................................ - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - ins v28.d[1], x12 // ...................................................................*..................................... - trn2 v5.4S, v21.4S, v1.4S // ...........................................*............................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - trn1 v6.4S, v21.4S, v1.4S // ..........................................*.............................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - trn2 v24.2D, v11.2D, v5.2D // .............................................*........................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - trn1 v21.2D, v11.2D, v5.2D // ...............................................*......................................................... - trn1 v30.2D, v9.2D, v6.2D // ..............................................*.......................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- trn2 v1.2D, v9.2D, v6.2D // ............................................*............................................................ - sqrdmulh v9.8H, v24.8H, v2.8H // ..............................................................................*.......................... - mul v22.8H, v24.8H, v20.8H // .............................................................................*........................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v8.d[1], x8 // ...............................................................*......................................... - ldr x8, [x4, #-40] // .............................................................e........................................... - ins v11.d[0], x10 // ..........................................................*.............................................. - // gap // ......................................................................................................... - ldr x12, [x1, #64] // e........................................................................................................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v22.8H, v9.8H, v7.H[0] // ...............................................................................*......................... - mul v18.8H, v1.8H, v20.8H // ........................................................................*................................ 
- // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqrdmulh v27.8H, v1.8H, v2.8H // .........................................................................*............................... - ins v11.d[1], x11 // ...........................................................*............................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v3.d[0], x16 // ..................e...................................................................................... - ldr x16, [x1, #72] // .e....................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ldr x10, [x4, #-88] // .................................................e....................................................... - add v29.8H, v21.8H, v22.8H // .................................................................................*....................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v18.8H, v27.8H, v7.H[0] // ..........................................................................*.............................. 
- sub v9.8H, v21.8H, v22.8H // ................................................................................*........................ - ldr x20, [x4, #-48] // ............................................................e............................................ - // gap // ......................................................................................................... - sqrdmulh v20.8H, v29.8H, v8.8H // ...................................................................................*..................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mul v22.8H, v29.8H, v11.8H // ..................................................................................*...................... - mul v11.8H, v9.8H, v28.8H // .......................................................................................*................. - sqrdmulh v9.8H, v9.8H, v31.8H // ........................................................................................*................ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v2.8H, v30.8H, v18.8H // ............................................................................*............................ - // gap // ......................................................................................................... - ins v3.d[1], x21 // ...................e..................................................................................... - // gap // ......................................................................................................... 
- mls v22.8H, v20.8H, v7.H[0] // ....................................................................................*.................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v27.d[0], x19 // ..............e.......................................................................................... - mls v11.8H, v9.8H, v7.H[0] // .........................................................................................*............... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sub v20.8H, v30.8H, v18.8H // ...........................................................................*............................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - add v8.8H, v2.8H, v22.8H // ......................................................................................*.................. - sub v9.8H, v2.8H, v22.8H // .....................................................................................*................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- add v10.8H, v20.8H, v11.8H // ...........................................................................................*............. - sub v11.8H, v20.8H, v11.8H // ..........................................................................................*.............. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqdmulh v20.8H, v9.8H, v7.H[1] // ...............................................................................................*......... - sqdmulh v2.8H, v8.8H, v7.H[1] // ............................................................................................*............ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - ins v5.d[0], x12 // ..e...................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqdmulh v22.8H, v10.8H, v7.H[1] // ..................................................................................................*...... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... 
- sqdmulh v29.8H, v11.8H, v7.H[1] // .....................................................................................................*... - ins v27.d[1], x24 // ...............e......................................................................................... - srshr v30.8H, v2.8H, #11 // .............................................................................................*........... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - srshr v18.8H, v22.8H, #11 // ...................................................................................................*..... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - srshr v22.8H, v20.8H, #11 // ................................................................................................*........ - // gap // ......................................................................................................... - // gap // ......................................................................................................... - srshr v2.8H, v29.8H, #11 // ......................................................................................................*.. - mls v8.8H, v30.8H, v7.H[0] // ..............................................................................................*.......... - // gap // ......................................................................................................... 
- // gap // ......................................................................................................... - ins v20.d[0], x14 // ..................................................e...................................................... - mls v10.8H, v18.8H, v7.H[0] // ....................................................................................................*.... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v9.8H, v22.8H, v7.H[0] // .................................................................................................*....... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - mls v11.8H, v2.8H, v7.H[0] // .......................................................................................................*. - mul v22.8H, v27.8H, v3.H[0] // .........................e............................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... - sqrdmulh v18.8H, v27.8H, v3.H[1] // ..........................e.............................................................................. - ins v27.d[0], x22 // ..........e.............................................................................................. 
- ins v2.d[0], x17 // ......................................................e.................................................. - // gap // ......................................................................................................... - // gap // ......................................................................................................... - st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ........................................................................................................* - // gap // ......................................................................................................... - // gap // ......................................................................................................... - // gap // ......................................................................................................... + // Instructions: 72 + // Expected cycles: 32 + // Expected IPC: 2.25 + // + // Cycle bound: 30.0 + // IPC bound: 2.40 + // + // Wall time: 3600.58s + // User time: 3600.58s + // + // -------------------------- original position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + ldr q27, [x1, #112] // ...e.................................................................... + trn2 v15.4S, v21.4S, v15.4S // ............................*........................................... + // gap // ........................................................................ + trn2 v16.4S, v2.4S, v31.4S // ..........................*............................................. + trn1 v25.4S, v2.4S, v31.4S // .........................*.............................................. + ldr q2, [x3], #16 // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn2 v10.2D, v16.2D, v15.2D // ..............................*......................................... + ldr q8, [x1, #96] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v26.2D, v16.2D, v15.2D // ................................*....................................... + trn2 v31.2D, v25.2D, v22.2D // .............................*.......................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q19, [x1, #64] // e....................................................................... + sqrdmulh v16.8H, v10.8H, v13.8H // ............................................*........................... + mul v15.8H, v10.8H, v14.8H // .............................................*.......................... + mul v3.8H, v27.8H, v2.H[0] // ...........e............................................................ + sqrdmulh v17.8H, v27.8H, v2.H[1] // ..........e............................................................. + ldr q29, [x1, #80] // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v27.8H, v31.8H, v13.8H // .......................................*................................ + mul v14.8H, v31.8H, v14.8H // ........................................*............................... 
+ trn1 v13.2D, v25.2D, v22.2D // ...............................*........................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v15.8H, v16.8H, v7.H[0] // ..............................................*......................... + mls v3.8H, v17.8H, v7.H[0] // ............e........................................................... + mul v23.8H, v8.8H, v2.H[0] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v25.8H, v8.8H, v2.H[1] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v14.8H, v27.8H, v7.H[0] // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v27.8H, v26.8H, v15.8H // ...............................................*........................ + add v15.8H, v26.8H, v15.8H // ................................................*....................... + sub v31.8H, v29.8H, v3.8H // .............e.......................................................... + // gap // ........................................................................ + add v29.8H, v29.8H, v3.8H // ..............e......................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + sqrdmulh v16.8H, v15.8H, v6.8H // .................................................*...................... + mul v8.8H, v15.8H, v11.8H // ..................................................*..................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v30.8H, v27.8H, v0.8H // ......................................................*................. + mul v27.8H, v27.8H, v20.8H // .......................................................*................ + mls v23.8H, v25.8H, v7.H[0] // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v6.8H, v31.8H, v2.H[5] // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mul v10.8H, v31.8H, v2.H[4] // .....................e.................................................. + mls v8.8H, v16.8H, v7.H[0] // ...................................................*.................... + add v15.8H, v13.8H, v14.8H // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v30.8H, v7.H[0] // ........................................................*............... + sub v12.8H, v19.8H, v23.8H // ........e............................................................... 
+ sub v16.8H, v13.8H, v14.8H // ..........................................*............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v6.8H, v7.H[0] // ......................e................................................. + add v24.8H, v15.8H, v8.8H // .....................................................*.................. + add v26.8H, v16.8H, v27.8H // ..........................................................*............. + // gap // ........................................................................ + // gap // ........................................................................ + sub v25.8H, v15.8H, v8.8H // ....................................................*................... + // gap // ........................................................................ + sub v27.8H, v16.8H, v27.8H // .........................................................*.............. + sqdmulh v17.8H, v24.8H, v7.H[1] // ...........................................................*............ + // gap // ........................................................................ + sqdmulh v15.8H, v26.8H, v7.H[1] // .................................................................*...... + sqdmulh v16.8H, v25.8H, v7.H[1] // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + mul v22.8H, v29.8H, v2.H[2] // ................e....................................................... 
+ sqdmulh v11.8H, v27.8H, v7.H[1] // ....................................................................*... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v0.8H, v29.8H, v2.H[3] // ...............e........................................................ + srshr v14.8H, v17.8H, #11 // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + add v3.8H, v19.8H, v23.8H // .........e.............................................................. + srshr v15.8H, v15.8H, #11 // ..................................................................*..... + // gap // ........................................................................ + ldr q13, [x4, #16] // ..................................e..................................... + srshr v6.8H, v11.8H, #11 // .....................................................................*.. + srshr v16.8H, v16.8H, #11 // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + add v21.8H, v12.8H, v10.8H // ........................e............................................... + mls v22.8H, v0.8H, v7.H[0] // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v15.8H, v7.H[0] // ...................................................................*.... 
+ mls v24.8H, v14.8H, v7.H[0] // .............................................................*.......... + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v6.8H, v7.H[0] // ......................................................................*. + mls v25.8H, v16.8H, v7.H[0] // ................................................................*....... + // gap // ........................................................................ + // gap // ........................................................................ + sub v15.8H, v12.8H, v10.8H // .......................e................................................ + sub v31.8H, v3.8H, v22.8H // ..................e..................................................... + ldr q20, [x4, #64] // .....................................e.................................. + ldr q0, [x4, #80] // ......................................e................................. + add v2.8H, v3.8H, v22.8H // ...................e.................................................... + // gap // ........................................................................ + ldr q11, [x4, #32] // ...................................e.................................... + ldr q6, [x4, #48] // ....................................e................................... + trn1 v22.4S, v21.4S, v15.4S // ...........................e............................................ + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // .......................................................................* + // gap // ........................................................................ + ldr q14, [x4], #(6*16) // .................................e...................................... 
- // original source code - // ldr x10, [x1, #(16*0)] // ........................................................e.............................................|..........................................................e............................................. - // ldr x11, [x1, #((16*0)+8)] // ..............................................................e.......................................|................................................................e....................................... - // ins v8.d[0], x10 // ....................................................................................e.................|......................................................................................e................. - // ins v8.d[1], x11 // .................*....................................................................................|...................*.................................................................................... - // ldr x10, [x1, #(16*1)] // .............e........................................................................................|...............e........................................................................................ - // ldr x11, [x1, #((16*1)+8)] // .........e............................................................................................|...........e............................................................................................ - // ins v9.d[0], x10 // ......................................................................................................|.*...................................................................................................... - // ins v9.d[1], x11 // ......*...............................................................................................|........*............................................................................................... 
- // ldr x10, [x1, #(16*2)] // ...............................e......................................................................|.................................e...................................................................... - // ldr x11, [x1, #((16*2)+8)] // .e....................................................................................................|...e.................................................................................................... - // ins v10.d[0], x10 // ...................................................................................................e..|.....................................................................................................e.. - // ins v10.d[1], x11 // ......................................................................................................*........................................................................................................ - // ldr x10, [x1, #(16*3)] // e.....................................................................................................|..e..................................................................................................... - // ldr x11, [x1, #((16*3)+8)] // ...................e..................................................................................|.....................e.................................................................................. - // ins v11.d[0], x10 // ...........................................................................e..........................|.............................................................................e.......................... - // ins v11.d[1], x11 // .......................................................................................e..............|.........................................................................................e.............. 
- // ldr x10, [x3], #16 // .....................e................................................................................|.......................e................................................................................ - // ldr x11, [x3, #(-16+8)] // ...........e..........................................................................................|.............e.......................................................................................... - // ins v0.d[0], x10 // .............................................................e........................................|...............................................................e........................................ - // ins v0.d[1], x11 // .........................................................................e............................|...........................................................................e............................ - // mul v24.8h, v10.8h, v0.h[0] // ...*..................................................................................................|.....*.................................................................................................. - // sqrdmulh v10.8h, v10.8h, v0.h[1] // .....*................................................................................................|.......*................................................................................................ - // mls v24.8h, v10.8h, v7.h[0] // ..............*.......................................................................................|................*....................................................................................... - // sub v10.8h, v8.8h, v24.8h // .....................................*................................................................|.......................................*................................................................ 
- // add v8.8h, v8.8h, v24.8h // ............................*.........................................................................|..............................*......................................................................... - // mul v24.8h, v11.8h, v0.h[0] // .................................................................................................e....|...................................................................................................e.... - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..................................................................................................e...|....................................................................................................e... - // mls v24.8h, v11.8h, v7.h[0] // ..*...................................................................................................|....*................................................................................................... - // sub v11.8h, v9.8h, v24.8h // ...............*......................................................................................|.................*...................................................................................... - // add v9.8h, v9.8h, v24.8h // ............*.........................................................................................|..............*......................................................................................... - // mul v24.8h, v9.8h, v0.h[2] // ...........................*..........................................................................|.............................*.......................................................................... - // sqrdmulh v9.8h, v9.8h, v0.h[3] // .......................*..............................................................................|.........................*.............................................................................. 
- // mls v24.8h, v9.8h, v7.h[0] // .................................*....................................................................|...................................*.................................................................... - // sub v9.8h, v8.8h, v24.8h // ......................................*...............................................................|........................................*............................................................... - // add v8.8h, v8.8h, v24.8h // .......................................*..............................................................|.........................................*.............................................................. - // mul v24.8h, v11.8h, v0.h[4] // ........................*.............................................................................|..........................*............................................................................. - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ..............................*.......................................................................|................................*....................................................................... - // mls v24.8h, v11.8h, v7.h[0] // ..................................*...................................................................|....................................*................................................................... - // sub v11.8h, v10.8h, v24.8h // .........................................*............................................................|...........................................*............................................................ - // add v10.8h, v10.8h, v24.8h // ........................................*.............................................................|..........................................*............................................................. 
- // trn1 v25.4s, v8.4s, v9.4s // ...........................................*..........................................................|.............................................*.......................................................... - // trn2 v26.4s, v8.4s, v9.4s // ..........................................*...........................................................|............................................*........................................................... - // trn1 v27.4s, v10.4s, v11.4s // ..............................................*.......................................................|................................................*....................................................... - // trn2 v28.4s, v10.4s, v11.4s // .............................................*........................................................|...............................................*........................................................ - // trn2 v10.2d, v25.2d, v27.2d // ..................................................*...................................................|....................................................*................................................... - // trn2 v11.2d, v26.2d, v28.2d // ...............................................*......................................................|.................................................*...................................................... - // trn1 v8.2d, v25.2d, v27.2d // .................................................*....................................................|...................................................*.................................................... - // trn1 v9.2d, v26.2d, v28.2d // ................................................*.....................................................|..................................................*..................................................... 
- // ldr x10, [x4], #(6*16) // ..........e...........................................................................................|............e........................................................................................... - // ldr x11, [x4, #(-(6*16)+8)] // ...............................................................e......................................|.................................................................e...................................... - // ins v0.d[0], x10 // .............................................................................................e........|...............................................................................................e........ - // ins v0.d[1], x11 // .......*..............................................................................................|.........*.............................................................................................. - // ldr x10, [x4, #(-6*16 + 1*16)] // ..........................e...........................................................................|............................e........................................................................... - // ldr x11, [x4, #((-6*16 + 1*16)+8)] // ........*.............................................................................................|..........*............................................................................................. - // ins v4.d[0], x10 // ....................................................................................................e.|......................................................................................................e. - // ins v4.d[1], x11 // ................*.....................................................................................|..................*..................................................................................... 
- // ldr x10, [x4, #(-6*16 + 2*16)] // ..................*...................................................................................|....................*................................................................................... - // ldr x11, [x4, #((-6*16 + 2*16)+8)] // ......................................................................................................|*....................................................................................................... - // ins v1.d[0], x10 // .......................................................*..............................................|.........................................................*.............................................. - // ins v1.d[1], x11 // ............................................................*.........................................|..............................................................*......................................... - // ldr x10, [x4, #(-6*16 + 3*16)] // ...................................................................e..................................|.....................................................................e.................................. - // ldr x11, [x4, #((-6*16 + 3*16)+8)] // ......................................................e...............................................|........................................................e............................................... - // ins v5.d[0], x10 // ................................*.....................................................................|..................................*..................................................................... - // ins v5.d[1], x11 // .....................................................*................................................|.......................................................*................................................ 
- // ldr x10, [x4, #(-6*16 + 4*16)] // ....................................e.................................................................|......................................e................................................................. - // ldr x11, [x4, #((-6*16 + 4*16)+8)] // .........................*............................................................................|...........................*............................................................................ - // ins v2.d[0], x10 // ...................................*..................................................................|.....................................*.................................................................. - // ins v2.d[1], x11 // ............................................*.........................................................|..............................................*......................................................... - // ldr x10, [x4, #(-6*16 + 5*16)] // ....*.................................................................................................|......*................................................................................................. - // ldr x11, [x4, #((-6*16 + 5*16)+8)] // ....................*.................................................................................|......................*................................................................................. - // ins v6.d[0], x10 // ......................*...............................................................................|........................*............................................................................... - // ins v6.d[1], x11 // .............................*........................................................................|...............................*........................................................................ 
- // mul v24.8h, v10.8h, v0.8h // ..........................................................*...........................................|............................................................*........................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // ...........................................................*..........................................|.............................................................*.......................................... - // mls v24.8h, v10.8h, v7.h[0] // .................................................................*....................................|...................................................................*.................................... - // sub v10.8h, v8.8h, v24.8h // .............................................................................*........................|...............................................................................*........................ - // add v8.8h, v8.8h, v24.8h // ........................................................................*.............................|..........................................................................*............................. - // mul v24.8h, v11.8h, v0.8h // ....................................................*.................................................|......................................................*................................................. - // sqrdmulh v11.8h, v11.8h, v4.8h // ...................................................*..................................................|.....................................................*.................................................. - // mls v24.8h, v11.8h, v7.h[0] // .........................................................*............................................|...........................................................*............................................ 
- // sub v11.8h, v9.8h, v24.8h // ..................................................................*...................................|....................................................................*................................... - // add v9.8h, v9.8h, v24.8h // ................................................................*.....................................|..................................................................*..................................... - // mul v24.8h, v9.8h, v1.8h // .....................................................................*................................|.......................................................................*................................ - // sqrdmulh v9.8h, v9.8h, v5.8h // ....................................................................*.................................|......................................................................*................................. - // mls v24.8h, v9.8h, v7.h[0] // ..........................................................................*...........................|............................................................................*........................... - // sub v9.8h, v8.8h, v24.8h // ...............................................................................*......................|.................................................................................*...................... - // add v8.8h, v8.8h, v24.8h // ..............................................................................*.......................|................................................................................*....................... - // mul v24.8h, v11.8h, v2.8h // ......................................................................*...............................|........................................................................*............................... 
- // sqrdmulh v11.8h, v11.8h, v6.8h // .......................................................................*..............................|.........................................................................*.............................. - // mls v24.8h, v11.8h, v7.h[0] // ............................................................................*.........................|..............................................................................*......................... - // sub v11.8h, v10.8h, v24.8h // .................................................................................*....................|...................................................................................*.................... - // add v10.8h, v10.8h, v24.8h // ................................................................................*.....................|..................................................................................*..................... - // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................................................................................*..................|.....................................................................................*.................. - // srshr v25.8h, v25.8h, #11 // ........................................................................................*.............|..........................................................................................*............. - // mls v8.8h, v25.8h, v7.h[0] // ............................................................................................*.........|..............................................................................................*......... - // sqdmulh v25.8h, v9.8h, v7.h[1] // ..................................................................................*...................|....................................................................................*................... 
- // srshr v25.8h, v25.8h, #11 // ..........................................................................................*...........|............................................................................................*........... - // mls v9.8h, v25.8h, v7.h[0] // ...............................................................................................*......|.................................................................................................*...... - // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................................................*................|.......................................................................................*................ - // srshr v25.8h, v25.8h, #11 // .........................................................................................*............|...........................................................................................*............ - // mls v10.8h, v25.8h, v7.h[0] // ..............................................................................................*.......|................................................................................................*....... - // sqdmulh v25.8h, v11.8h, v7.h[1] // ......................................................................................*...............|........................................................................................*............... - // srshr v25.8h, v25.8h, #11 // ...........................................................................................*..........|.............................................................................................*.......... - // mls v11.8h, v25.8h, v7.h[0] // ................................................................................................*.....|..................................................................................................*..... 
- // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .....................................................................................................*|.......................................................................................................* + // ---------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x1, #(16*0)] // .........e..............................................................'........~............................................................. + // ldr q9, [x1, #(16*1)] // ..............e.........................................................'.............~........................................................ + // ldr q10, [x1, #(16*2)] // ......e.................................................................'.....~................................................................ + // ldr q11, [x1, #(16*3)] // e.......................................................................~...................................................................... + // ldr q0, [x3], #16 // ....e...................................................................'...~.................................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .....................e..................................................'....................~................................................. + // mul v24.8h, v10.8h, v0.h[0] // ....................e...................................................'...................~.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...............................e........................................'..............................~....................................... 
+ // sub v10.8h, v8.8h, v24.8h // .....................................e..................................'....................................~................................. + // add v8.8h, v8.8h, v24.8h // ...................................................e....................'..................................................~................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .............e..........................................................'............~......................................................... + // mul v24.8h, v11.8h, v0.h[0] // ............e...........................................................'...........~.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................e....................................................'..................~................................................... + // sub v11.8h, v9.8h, v24.8h // .........................e..............................................'........................~............................................. + // add v9.8h, v9.8h, v24.8h // ..........................e.............................................'.........................~............................................ + // sqrdmulh v27.8h, v9.8h, v0.h[3] // .................................................e......................'................................................~..................... + // mul v24.8h, v9.8h, v0.h[2] // ...............................................e........................'..............................................~....................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................................e..............'........................................................~............. + // sub v9.8h, v8.8h, v24.8h // ...............................................................e........'..............................................................~....... 
+ // add v8.8h, v8.8h, v24.8h // ..................................................................e.....'.................................................................~.... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ................................e.......................................'...............................~...................................... + // mul v24.8h, v11.8h, v0.h[4] // .................................e......................................'................................~..................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................e................................'......................................~............................... + // sub v11.8h, v10.8h, v24.8h // ..............................................................e.........'.............................................................~........ + // add v10.8h, v10.8h, v24.8h // ........................................................e...............'.......................................................~.............. + // trn1 v25.4s, v8.4s, v9.4s // ...~....................................................................'..*................................................................... + // trn2 v26.4s, v8.4s, v9.4s // ..~.....................................................................'.*.................................................................... + // trn1 v27.4s, v10.4s, v11.4s // .....................................................................e..'....................................................................~. + // trn2 v28.4s, v10.4s, v11.4s // .~......................................................................'*..................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ........~...............................................................'.......*.............................................................. 
+ // trn2 v11.2d, v26.2d, v28.2d // .....~..................................................................'....*................................................................. + // trn1 v8.2d, v25.2d, v27.2d // .................~......................................................'................*..................................................... + // trn1 v9.2d, v26.2d, v28.2d // .......~................................................................'......*............................................................... + // ldr q0, [ x4], #(6*16) // .......................................................................e'...................................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .....................................................e..................'....................................................~................. + // ldr q1, [ x4, #(-6*16 + 2*16)] // ...................................................................e....'..................................................................~... + // ldr q5, [x4, #(-6*16 + 3*16)] // ....................................................................e...'...................................................................~.. + // ldr q2, [ x4, #(-6*16 + 4*16)] // ................................................................e.......'...............................................................~...... + // ldr q6, [x4, #(-6*16 + 5*16)] // .................................................................e......'................................................................~..... + // sqrdmulh v27.8h, v10.8h, v4.8h // ...............~........................................................'..............*....................................................... + // mul v24.8h, v10.8h, v0.8h // ................~.......................................................'...............*...................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ......................~.................................................'.....................*................................................ + // sub v10.8h, v8.8h, v24.8h // ......................................~.................................'.....................................*................................ + // add v8.8h, v8.8h, v24.8h // ...................................~....................................'..................................*................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ..........~.............................................................'.........*............................................................ + // mul v24.8h, v11.8h, v0.8h // ...........~............................................................'..........*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................~.....................................................'.................*.................................................... + // sub v11.8h, v9.8h, v24.8h // .......................~................................................'......................*............................................... + // add v9.8h, v9.8h, v24.8h // ........................~...............................................'.......................*.............................................. + // sqrdmulh v27.8h, v9.8h, v5.8h // ...........................~............................................'..........................*........................................... + // mul v24.8h, v9.8h, v1.8h // ............................~...........................................'...........................*.......................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................~.....................................'.................................*.................................... 
+ // sub v9.8h, v8.8h, v24.8h // ..........................................~.............................'.........................................*............................ + // add v8.8h, v8.8h, v24.8h // ........................................~...............................'.......................................*.............................. + // sqrdmulh v27.8h, v11.8h, v6.8h // .............................~..........................................'............................*......................................... + // mul v24.8h, v11.8h, v2.8h // ..............................~.........................................'.............................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // ....................................~...................................'...................................*.................................. + // sub v11.8h, v10.8h, v24.8h // ...........................................~............................'..........................................*........................... + // add v10.8h, v10.8h, v24.8h // .........................................~..............................'........................................*............................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ............................................~...........................'...........................................*.......................... + // srshr v25.8h, v25.8h, #11 // ..................................................~.....................'.................................................*.................... + // mls v8.8h, v25.8h, v7.h[0] // ...........................................................~............'..........................................................*........... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ..............................................~.........................'.............................................*........................ 
+ // srshr v25.8h, v25.8h, #11 // .......................................................~................'......................................................*............... + // mls v9.8h, v25.8h, v7.h[0] // .............................................................~..........'............................................................*......... + // sqdmulh v25.8h, v10.8h, v7.h[1] // .............................................~..........................'............................................*......................... + // srshr v25.8h, v25.8h, #11 // ....................................................~...................'...................................................*.................. + // mls v10.8h, v25.8h, v7.h[0] // ..........................................................~.............'.........................................................*............ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ................................................~.......................'...............................................*...................... + // srshr v25.8h, v25.8h, #11 // ......................................................~.................'.....................................................*................ + // mls v11.8h, v25.8h, v7.h[0] // ............................................................~...........'...........................................................*.......... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ......................................................................~.'.....................................................................* sub count, count, #1 cbnz count, layer4567_start - mls v22.8H, v18.8H, v7.H[0] // ...*........................................................................... - ins v27.d[1], x27 // *.............................................................................. 
- ldr x22, [x4, #-24] // ....................*.......................................................... - ldr x24, [x4, #-16] // .....*......................................................................... - ins v9.d[0], x29 // ..*............................................................................ - ldr x19, [x4, #-64] // ...............*............................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v26.8H, v27.8H, v3.H[1] // ......*........................................................................ - ldr x21, [x4, #-72] // .........*..................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v5.d[1], x16 // ..............*................................................................ - mul v18.8H, v27.8H, v3.H[0] // ....*.......................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v9.d[1], x15 // .......*....................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ldr x26, [x4, #-56] // .*............................................................................. - ins v15.d[0], x19 // ..............................................*................................ - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sub v11.8H, v9.8H, v22.8H // ............*.................................................................. - ins v15.d[1], x26 // ..................................................*............................ - add v6.8H, v9.8H, v22.8H // ..........*.................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v18.8H, v26.8H, v7.H[0] // ...........*................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v8.8H, v11.8H, v3.H[4] // ...................*........................................................... - sqrdmulh v14.8H, v11.8H, v3.H[5] // ........................*...................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mul v30.8H, v6.8H, v3.H[2] // .....................*......................................................... - sqrdmulh v0.8H, v6.8H, v3.H[3] // ..................*............................................................ 
- // gap // ............................................................................... - // gap // ............................................................................... - sub v24.8H, v5.8H, v18.8H // .............................*................................................. - // gap // ............................................................................... - // gap // ............................................................................... - ins v2.d[1], x21 // .............*................................................................. - mls v8.8H, v14.8H, v7.H[0] // ...........................*................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v20.d[1], x10 // ........*...................................................................... - mls v30.8H, v0.8H, v7.H[0] // ..........................*.................................................... - // gap // ............................................................................... - // gap // ............................................................................... - add v11.8H, v5.8H, v18.8H // ......................*........................................................ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sub v16.8H, v24.8H, v8.8H // .................................*............................................. - add v26.8H, v24.8H, v8.8H // ................................*.............................................. 
- // gap // ............................................................................... - // gap // ............................................................................... - sub v18.8H, v11.8H, v30.8H // ..............................*................................................ - add v4.8H, v11.8H, v30.8H // ...............................*............................................... - // gap // ............................................................................... - // gap // ............................................................................... - trn1 v29.4S, v26.4S, v16.4S // ......................................*........................................ - trn2 v25.4S, v26.4S, v16.4S // .....................................*......................................... - // gap // ............................................................................... - // gap // ............................................................................... - trn2 v12.4S, v4.4S, v18.4S // ..................................*............................................ - trn1 v31.4S, v4.4S, v18.4S // ...................................*........................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v28.d[0], x20 // .........................*..................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - trn2 v6.2D, v12.2D, v25.2D // .......................................*....................................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - trn1 v13.2D, v12.2D, v25.2D // ........................................*...................................... - trn2 v10.2D, v31.2D, v29.2D // ..........................................*.................................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v9.8H, v6.8H, v2.8H // ...........................................*................................... - mul v14.8H, v6.8H, v20.8H // ............................................*.................................. - // gap // ............................................................................... - ldr x15, [x4, #-8] // ................*.............................................................. - ins v28.d[1], x8 // .............................................*................................. - ins v16.d[0], x24 // .................*............................................................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v16.d[1], x15 // .......................*....................................................... - mls v14.8H, v9.8H, v7.H[0] // ...............................................*............................... 
- // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v26.8H, v10.8H, v2.8H // .................................................*............................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v21.d[0], x28 // ............................*.................................................. - mul v20.8H, v10.8H, v20.8H // ................................................*.............................. - // gap // ............................................................................... - // gap // ............................................................................... - sub v8.8H, v13.8H, v14.8H // .....................................................*......................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - ins v21.d[1], x22 // ....................................*.......................................... - add v0.8H, v13.8H, v14.8H // ...................................................*........................... - // gap // ............................................................................... - // gap // ............................................................................... - sqrdmulh v9.8H, v8.8H, v16.8H // .........................................................*..................... - // gap // ............................................................................... 
- // gap // ............................................................................... - // gap // ............................................................................... - mul v13.8H, v8.8H, v21.8H // ........................................................*...................... - sqrdmulh v19.8H, v0.8H, v28.8H // ......................................................*........................ - // gap // ............................................................................... - // gap // ............................................................................... - mls v20.8H, v26.8H, v7.H[0] // ....................................................*.......................... - mul v11.8H, v0.8H, v15.8H // .......................................................*....................... - // gap // ............................................................................... - // gap // ............................................................................... - trn1 v27.2D, v31.2D, v29.2D // .........................................*..................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v13.8H, v9.8H, v7.H[0] // ............................................................*.................. - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - mls v11.8H, v19.8H, v7.H[0] // ...........................................................*................... 
- sub v12.8H, v27.8H, v20.8H // .............................................................*................. - // gap // ............................................................................... - // gap // ............................................................................... - add v3.8H, v27.8H, v20.8H // ..........................................................*.................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - sub v30.8H, v12.8H, v13.8H // .................................................................*............. - add v29.8H, v12.8H, v13.8H // ................................................................*.............. - // gap // ............................................................................... - // gap // ............................................................................... - sub v28.8H, v3.8H, v11.8H // ...............................................................*............... - // gap // ............................................................................... - // gap // ............................................................................... - add v27.8H, v3.8H, v11.8H // ..............................................................*................ - sqdmulh v11.8H, v29.8H, v7.H[1] // ....................................................................*.......... - // gap // ............................................................................... - // gap // ............................................................................... - sqdmulh v9.8H, v30.8H, v7.H[1] // .....................................................................*......... 
- sqdmulh v2.8H, v27.8H, v7.H[1] // ...................................................................*........... - // gap // ............................................................................... - // gap // ............................................................................... - sqdmulh v22.8H, v28.8H, v7.H[1] // ..................................................................*............ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - srshr v20.8H, v11.8H, #11 // .......................................................................*....... - srshr v9.8H, v9.8H, #11 // .........................................................................*..... - // gap // ............................................................................... - // gap // ............................................................................... - srshr v11.8H, v22.8H, #11 // ........................................................................*...... - srshr v2.8H, v2.8H, #11 // ......................................................................*........ - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
- mls v29.8H, v20.8H, v7.H[0] // ...........................................................................*... - // gap // ............................................................................... - // gap // ............................................................................... - mls v30.8H, v9.8H, v7.H[0] // .............................................................................*. - mls v28.8H, v11.8H, v7.H[0] // ............................................................................*.. - mls v27.8H, v2.8H, v7.H[0] // ..........................................................................*.... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... - st4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1], #64 // ..............................................................................* - // gap // ............................................................................... - // gap // ............................................................................... - // gap // ............................................................................... 
+ // Instructions: 40 + // Expected cycles: 31 + // Expected IPC: 1.29 + // + // Cycle bound: 31.0 + // IPC bound: 1.29 + // + // Wall time: 0.99s + // User time: 0.99s + // + // ---------- original position ----------> + // 0 25 + // |------------------------|-------------- + trn2 v28.4S, v2.4S, v31.4S // .*...................................... + // gap // ........................................ + // gap // ........................................ + trn2 v1.4S, v21.4S, v15.4S // *....................................... + trn1 v27.4S, v2.4S, v31.4S // ..*..................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v24.2D, v28.2D, v1.2D // ...*.................................... + trn1 v5.2D, v28.2D, v1.2D // ....*................................... + // gap // ........................................ + // gap // ........................................ + trn2 v8.2D, v27.2D, v22.2D // .....*.................................. + trn1 v9.2D, v27.2D, v22.2D // ..........*............................. + // gap // ........................................ + // gap // ........................................ + sqrdmulh v28.8H, v24.8H, v13.8H // ......*................................. + mul v30.8H, v24.8H, v14.8H // .......*................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v15.8H, v8.8H, v13.8H // ........*............................... + mul v16.8H, v8.8H, v14.8H // .........*.............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ mls v30.8H, v28.8H, v7.H[0] // ...........*............................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v15.8H, v7.H[0] // ............*........................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v18.8H, v5.8H, v30.8H // .............*.......................... + add v15.8H, v5.8H, v30.8H // ..............*......................... + // gap // ........................................ + // gap // ........................................ + add v25.8H, v9.8H, v16.8H // ....................*................... + sub v9.8H, v9.8H, v16.8H // ......................*................. + // gap // ........................................ + // gap // ........................................ + sqrdmulh v0.8H, v18.8H, v0.8H // .................*...................... + mul v26.8H, v18.8H, v20.8H // ..................*..................... + // gap // ........................................ + // gap // ........................................ + mul v1.8H, v15.8H, v11.8H // ................*....................... + sqrdmulh v6.8H, v15.8H, v6.8H // ...............*........................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v26.8H, v0.8H, v7.H[0] // .....................*.................. 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v1.8H, v6.8H, v7.H[0] // ...................*.................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v28.8H, v9.8H, v26.8H // ........................*............... + sub v29.8H, v9.8H, v26.8H // ..........................*............. + // gap // ........................................ + // gap // ........................................ + sub v27.8H, v25.8H, v1.8H // .........................*.............. + add v26.8H, v25.8H, v1.8H // .......................*................ + // gap // ........................................ + // gap // ........................................ + sqdmulh v15.8H, v28.8H, v7.H[1] // ............................*........... + sqdmulh v6.8H, v29.8H, v7.H[1] // ..............................*......... + // gap // ........................................ + // gap // ........................................ + sqdmulh v16.8H, v26.8H, v7.H[1] // ...........................*............ + sqdmulh v21.8H, v27.8H, v7.H[1] // .............................*.......... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + srshr v19.8H, v15.8H, #11 // ................................*....... + srshr v14.8H, v6.8H, #11 // .................................*...... 
+ // gap // ........................................ + // gap // ........................................ + srshr v15.8H, v16.8H, #11 // ...............................*........ + srshr v1.8H, v21.8H, #11 // ..................................*..... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v29.8H, v14.8H, v7.H[0] // .....................................*.. + mls v28.8H, v19.8H, v7.H[0] // ...................................*.... + // gap // ........................................ + // gap // ........................................ + mls v27.8H, v1.8H, v7.H[0] // ......................................*. + mls v26.8H, v15.8H, v7.H[0] // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + st4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x1], #64 // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ - // original source code - // ins v27.d[1], x27 // .*............................................................................. - // ldr x11, [x4, #-56] // ...........*................................................................... 
- // ins v6.d[0], x29 // ....*.......................................................................... - // mls v22.8H, v18.8H, v7.H[0] // *.............................................................................. - // mul v18.8H, v27.8H, v3.H[0] // .........*..................................................................... - // ldr x12, [x4, #-16] // ...*........................................................................... - // sqrdmulh v9.8H, v27.8H, v3.H[1] // ......*........................................................................ - // ins v6.d[1], x15 // ..........*.................................................................... - // ins v20.d[1], x10 // ........................*...................................................... - // ldr x10, [x4, #-72] // .......*....................................................................... - // add v11.8H, v6.8H, v22.8H // ...............*............................................................... - // mls v18.8H, v9.8H, v7.H[0] // ................*.............................................................. - // sub v24.8H, v6.8H, v22.8H // .............*................................................................. - // ins v2.d[1], x10 // ......................*........................................................ - // ins v5.d[1], x16 // ........*...................................................................... - // ldr x10, [x4, #-64] // .....*......................................................................... - // ldr x23, [x4, #-8] // .........................................*..................................... - // ins v31.d[0], x12 // ...........................................*................................... - // sqrdmulh v9.8H, v11.8H, v3.H[3] // ....................*.......................................................... - // mul v13.8H, v24.8H, v3.H[4] // .................*............................................................. 
- // ldr x12, [x4, #-24] // ..*............................................................................ - // mul v11.8H, v11.8H, v3.H[2] // ...................*........................................................... - // add v27.8H, v5.8H, v18.8H // ..........................*.................................................... - // ins v31.d[1], x23 // ............................................*.................................. - // sqrdmulh v3.8H, v24.8H, v3.H[5] // ..................*............................................................ - // ins v8.d[0], x20 // ...................................*........................................... - // mls v11.8H, v9.8H, v7.H[0] // .........................*..................................................... - // mls v13.8H, v3.8H, v7.H[0] // .......................*....................................................... - // ins v28.d[0], x28 // ...............................................*............................... - // sub v10.8H, v5.8H, v18.8H // .....................*......................................................... - // sub v22.8H, v27.8H, v11.8H // .............................*................................................. - // add v9.8H, v27.8H, v11.8H // ..............................*................................................ - // add v21.8H, v10.8H, v13.8H // ............................*.................................................. - // sub v1.8H, v10.8H, v13.8H // ...........................*................................................... - // trn2 v11.4S, v9.4S, v22.4S // .................................*............................................. - // trn1 v9.4S, v9.4S, v22.4S // ..................................*............................................ - // ins v28.d[1], x12 // ..................................................*............................ 
- // trn2 v5.4S, v21.4S, v1.4S // ................................*.............................................. - // trn1 v6.4S, v21.4S, v1.4S // ...............................*............................................... - // trn2 v24.2D, v11.2D, v5.2D // ....................................*.......................................... - // trn1 v21.2D, v11.2D, v5.2D // .....................................*......................................... - // trn1 v30.2D, v9.2D, v6.2D // .........................................................*..................... - // trn2 v1.2D, v9.2D, v6.2D // ......................................*........................................ - // sqrdmulh v9.8H, v24.8H, v2.8H // .......................................*....................................... - // mul v22.8H, v24.8H, v20.8H // ........................................*...................................... - // ins v8.d[1], x8 // ..........................................*.................................... - // ins v11.d[0], x10 // ............*.................................................................. - // mls v22.8H, v9.8H, v7.H[0] // .............................................*................................. - // mul v18.8H, v1.8H, v20.8H // ................................................*.............................. - // sqrdmulh v27.8H, v1.8H, v2.8H // ..............................................*................................ - // ins v11.d[1], x11 // ..............*................................................................ - // add v29.8H, v21.8H, v22.8H // ...................................................*........................... - // mls v18.8H, v27.8H, v7.H[0] // .......................................................*....................... - // sub v9.8H, v21.8H, v22.8H // .................................................*............................. 
- // sqrdmulh v20.8H, v29.8H, v8.8H // ......................................................*........................ - // mul v22.8H, v29.8H, v11.8H // ........................................................*...................... - // mul v11.8H, v9.8H, v28.8H // .....................................................*......................... - // sqrdmulh v9.8H, v9.8H, v31.8H // ....................................................*.......................... - // add v2.8H, v30.8H, v18.8H // .............................................................*................. - // mls v22.8H, v20.8H, v7.H[0] // ...........................................................*................... - // mls v11.8H, v9.8H, v7.H[0] // ..........................................................*.................... - // sub v20.8H, v30.8H, v18.8H // ............................................................*.................. - // add v8.8H, v2.8H, v22.8H // .................................................................*............. - // sub v9.8H, v2.8H, v22.8H // ................................................................*.............. - // add v10.8H, v20.8H, v11.8H // ...............................................................*............... - // sub v11.8H, v20.8H, v11.8H // ..............................................................*................ - // sqdmulh v20.8H, v9.8H, v7.H[1] // .....................................................................*......... - // sqdmulh v2.8H, v8.8H, v7.H[1] // ....................................................................*.......... - // sqdmulh v22.8H, v10.8H, v7.H[1] // ..................................................................*............ - // sqdmulh v29.8H, v11.8H, v7.H[1] // ...................................................................*........... - // srshr v30.8H, v2.8H, #11 // .........................................................................*..... 
- // srshr v18.8H, v22.8H, #11 // ......................................................................*........ - // srshr v22.8H, v20.8H, #11 // ........................................................................*...... - // srshr v2.8H, v29.8H, #11 // .......................................................................*....... - // mls v8.8H, v30.8H, v7.H[0] // .............................................................................*. - // mls v10.8H, v18.8H, v7.H[0] // ..........................................................................*.... - // mls v9.8H, v22.8H, v7.H[0] // ............................................................................*.. - // mls v11.8H, v2.8H, v7.H[0] // ...........................................................................*... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // ..............................................................................* + // ------------ new position -------------> + // 0 25 + // |------------------------|-------------- + // trn2 v15.4S, v21.4S, v15.4S // .*...................................... + // trn2 v16.4S, v2.4S, v31.4S // *....................................... + // trn1 v25.4S, v2.4S, v31.4S // ..*..................................... + // trn2 v10.2D, v16.2D, v15.2D // ...*.................................... + // trn1 v26.2D, v16.2D, v15.2D // ....*................................... + // trn2 v31.2D, v25.2D, v22.2D // .....*.................................. + // sqrdmulh v16.8H, v10.8H, v13.8H // .......*................................ + // mul v15.8H, v10.8H, v14.8H // ........*............................... + // sqrdmulh v27.8H, v31.8H, v13.8H // .........*.............................. + // mul v14.8H, v31.8H, v14.8H // ..........*............................. + // trn1 v13.2D, v25.2D, v22.2D // ......*................................. + // mls v15.8H, v16.8H, v7.H[0] // ...........*............................ 
+ // mls v14.8H, v27.8H, v7.H[0] // ............*........................... + // sub v27.8H, v26.8H, v15.8H // .............*.......................... + // add v15.8H, v26.8H, v15.8H // ..............*......................... + // sqrdmulh v16.8H, v15.8H, v6.8H // ....................*................... + // mul v8.8H, v15.8H, v11.8H // ...................*.................... + // sqrdmulh v30.8H, v27.8H, v0.8H // .................*...................... + // mul v27.8H, v27.8H, v20.8H // ..................*..................... + // mls v8.8H, v16.8H, v7.H[0] // ......................*................. + // add v15.8H, v13.8H, v14.8H // ...............*........................ + // mls v27.8H, v30.8H, v7.H[0] // .....................*.................. + // sub v16.8H, v13.8H, v14.8H // ................*....................... + // add v24.8H, v15.8H, v8.8H // ..........................*............. + // add v26.8H, v16.8H, v27.8H // .......................*................ + // sub v25.8H, v15.8H, v8.8H // .........................*.............. + // sub v27.8H, v16.8H, v27.8H // ........................*............... + // sqdmulh v17.8H, v24.8H, v7.H[1] // .............................*.......... + // sqdmulh v15.8H, v26.8H, v7.H[1] // ...........................*............ + // sqdmulh v16.8H, v25.8H, v7.H[1] // ..............................*......... + // sqdmulh v11.8H, v27.8H, v7.H[1] // ............................*........... + // srshr v14.8H, v17.8H, #11 // .................................*...... + // srshr v15.8H, v15.8H, #11 // ...............................*........ + // srshr v6.8H, v11.8H, #11 // ................................*....... + // srshr v16.8H, v16.8H, #11 // ..................................*..... + // mls v26.8H, v15.8H, v7.H[0] // ....................................*... + // mls v24.8H, v14.8H, v7.H[0] // ......................................*. + // mls v27.8H, v6.8H, v7.H[0] // ...................................*.... 
+ // mls v25.8H, v16.8H, v7.H[0] // .....................................*.. + // st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 // .......................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a55.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a55.s index ae25efa4..e44f61cc 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a55.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a55.s @@ -26,46 +26,16 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- xtmp0 .req x10 xtmp1 .req x11 -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +53,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +70,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -119,40 +83,40 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 
.endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm .macro vec_to_scalar_matrix out, in @@ -178,7 +142,7 @@ xtmp1 .req x11 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -189,7 
+153,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -199,7 +163,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -207,7 +171,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -218,19 +182,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -243,7 +207,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_load_store_opt_a55 - .global _ntt_kyber_123_4567_scalar_load_store_opt_a55 + .global _ntt_kyber_123_4567_scalar_load_store .p2align 4 const_addr: .short 3329 @@ -369,1097 +333,1171 @@ _ntt_kyber_123_4567_scalar_load_store_opt_a55: load_roots_123 .p2align 2 - ldr x12, [x0, #448] // ..................*.................... - // gap // ....................................... - // gap // ....................................... - ldr x7, [x0, #320] // .............*......................... - // gap // ....................................... - ldr x27, [x0, #456] // .....................*................. - ldr x18, [x0, #384] // ...............*....................... 
- // gap // ....................................... - vins v12, x12, 0 // .......................*............... - ldr x29, [x0, #328] // ..............*........................ - vins v5, x7, 0 // .................*..................... - ldr x16, [x0, #200] // .....*................................. - vins v12, x27, 1 // ........................*.............. - ldr x28, [x0, #392] // ................*...................... - vins v17, x18, 0 // ....................*.................. - ldr x13, [x0, #192] // ....*.................................. - mul v6.8H, v12.8H, v0.H[0] // .........................*............. - vins v5, x29, 1 // ...................*................... - sqrdmulh v24.8H, v12.8H, v0.H[1] // ..........................*............ - ldr x18, [x0, #64] // *...................................... - sqrdmulh v12.8H, v5.8H, v0.H[1] // ...............................*....... - vins v17, x28, 1 // ......................*................ - mul v23.8H, v5.8H, v0.H[0] // ..............................*........ - ldr x29, [x0, #72] // .*..................................... - mul v15.8H, v17.8H, v0.H[0] // .................................*..... - ldr x27, [x0, #256] // ......*................................ - mls v6.8H, v24.8H, v7.H[0] // ...........................*........... - vins v24, x13, 0 // .........*............................. - sqrdmulh v11.8H, v17.8H, v0.H[1] // ..................................*.... - ldr x20, [x0, #128] // ..*.................................... - vins v24, x16, 1 // ...........*........................... - ldr x9, [x0, #136] // ...*................................... - mls v23.8H, v12.8H, v7.H[0] // ................................*...... - vins v8, x27, 0 // ..........*............................ - add v31.8H, v24.8H, v6.8H // .............................*......... - ldr x27, [x0, #264] // .......*............................... - sub v25.8H, v24.8H, v6.8H // ............................*.......... 
- vins v4, x20, 0 // ........*.............................. - mls v15.8H, v11.8H, v7.H[0] // ....................................*.. - // gap // ....................................... - mul v5.8H, v31.8H, v0.H[2] // ...................................*... - // gap // ....................................... - sqrdmulh v9.8H, v31.8H, v0.H[3] // .....................................*. - vins v8, x27, 1 // ............*.......................... - sqrdmulh v27.8H, v25.8H, v0.H[5] // ......................................* - // gap // ....................................... - - // original source code - // ldr x18, [x0, #64] // ...............*....................... || .........*............. - // ldr x29, [x0, #72] // ...................*................... || ...........*........... - // ldr x13, [x0, #128] // .........................*............. || ..............*........ - // ldr x9, [x0, #136] // ...........................*........... || ...............*....... - // ldr x27, [x0, #192] // ...........*........................... || .......*............... - // ldr x19, [x0, #200] // .......*............................... || .....*................. - // ldr x8, [x0, #256] // .....................*................. || ............*.......... - // ldr x15, [x0, #264] // ...............................*....... || .................*..... - // vins v4, x13, 0 // .................................*..... || ..................*.... - // vins v28, x27, 0 // .......................*............... || .............*......... - // vins v8, x8, 0 // .............................*......... || ................*...... - // vins v28, x19, 1 // ..........................*............ || ...............*....... - // vins v8, x15, 1 // .....................................*. || .....................*. - // ldr x27, [x0, #320] // .*..................................... || .*..................... - // ldr x13, [x0, #328] // .....*................................. 
|| ....*.................. - // ldr x19, [x0, #384] // ...*................................... || ...*................... - // ldr x8, [x0, #392] // .........*............................. || ......*................ - // vins v6, x27, 0 // ......*................................ || .....*................. - // ldr x27, [x0, #448] // *...................................... || *...................... - // vins v6, x13, 1 // .............*......................... || ........*.............. - // vins v16, x19, 0 // ..........*............................ || .......*............... - // ldr x13, [x0, #456] // ..*.................................... || ..*.................... - // vins v16, x8, 1 // .................*..................... || ..........*............ - // vins v22, x27, 0 // ....*.................................. || ....*.................. - // vins v22, x13, 1 // ........*.............................. || ......*................ - // mul v24.8H, v22.8H, v0.H[0] // ............*.......................... || ........*.............. - // sqrdmulh v9.8H, v22.8H, v0.H[1] // ..............*........................ || .........*............. - // mls v24.8H, v9.8H, v7.H[0] // ......................*................ || .............*......... - // sub v25.8H, v28.8H, v24.8H // ................................*...... || ..................*.... - // add v24.8H, v28.8H, v24.8H // ..............................*........ || .................*..... - // mul v23.8H, v6.8H, v0.H[0] // ..................*.................... || ...........*........... - // sqrdmulh v6.8H, v6.8H, v0.H[1] // ................*...................... || ..........*............ - // mls v23.8H, v6.8H, v7.H[0] // ............................*.......... || ................*...... - // mul v15.8H, v16.8H, v0.H[0] // ....................*.................. || ............*.......... - // sqrdmulh v12.8H, v16.8H, v0.H[1] // ........................*.............. || ..............*........ 
- // mul v5.8H, v24.8H, v0.H[2] // ...................................*... || ....................*.. - // mls v15.8H, v12.8H, v7.H[0] // ..................................*.... || ...................*... - // sqrdmulh v9.8H, v24.8H, v0.H[3] // ....................................*.. || .....................*. - // sqrdmulh v27.8H, v25.8H, v0.H[5] // ......................................* || ......................* - + // Instructions: 10 + // Expected cycles: 16 + // Expected IPC: 0.62 + // + // Cycle bound: 16.0 + // IPC bound: 0.62 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q10, [x0, #320] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #128] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x0, #256] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #192] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v18.8H, v29.8H, v0.H[1] // .....*........................ + // gap // .............................. + mul v26.8H, v29.8H, v0.H[0] // ......*....................... + // gap // .............................. + ldr q25, [x0, #384] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x0, #448] // .........*.................... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + mls v26.8H, v18.8H, v7.H[0] // ........*..................... + // gap // .............................. + ldr q20, [x0, #64] // ....*......................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q10, [x0, #320] // *.............................. + // ldr q11, [x0, #128] // .*............................. + // ldr q13, [x0, #192] // ...*........................... + // ldr q18, [x0, #256] // ..*............................ + // ldr q20, [x0, #64] // .........*..................... + // sqrdmulh v29.8H, v18.8H, v0.H[1] // ....*.......................... + // mul v26.8H, v18.8H, v0.H[0] // .....*......................... + // ldr q25, [x0, #384] // ......*........................ + // mls v26.8H, v29.8H, v7.H[0] // ........*...................... + // ldr q16, [x0, #448] // .......*....................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v12.8H, v8.8H, v0.H[0] // ................................*................................................................... - ldr x27, [x0, #0] // *................................................................................................... - mul v24.8H, v25.8H, v0.H[4] // ...................................................................*................................ - vins v6, x18, 0 // ......*............................................................................................. - sqrdmulh v17.8H, v8.8H, v0.H[1] // .................................*.................................................................. - vins v4, x9, 1 // ...........*........................................................................................ - mls v5.8H, v9.8H, v7.H[0] // ...........................................................*........................................ 
- vins v6, x29, 1 // .......*............................................................................................ - add v9.8H, v4.8H, v15.8H // ..............................................*..................................................... - vins v14, x27, 0 // ..*................................................................................................. - add v25.8H, v6.8H, v23.8H // .........................................*.......................................................... - ldr x27, [x0, #8] // .*.................................................................................................. - mls v12.8H, v17.8H, v7.H[0] // ..................................*................................................................. - ldr x18, [x0, #80] // ....e............................................................................................... - mul v17.8H, v9.8H, v0.H[2] // ....................................................*............................................... - ldr x29, [x0, #88] // .....e.............................................................................................. - add v28.8H, v25.8H, v5.8H // .............................................................*...................................... - ldr x13, [x0, #144] // ........e........................................................................................... - sqrdmulh v9.8H, v9.8H, v0.H[3] // .....................................................*.............................................. - vins v14, x27, 1 // ...*................................................................................................ - mls v24.8H, v27.8H, v7.H[0] // .....................................................................*.............................. - ldr x9, [x0, #152] // .........e.......................................................................................... 
- mul v27.8H, v28.8H, v0.H[6] // ........................................................................*........................... - ldr x27, [x0, #208] // ............e....................................................................................... - sqrdmulh v28.8H, v28.8H, v0.H[7] // .........................................................................*.......................... - ldr x19, [x0, #216] // .............e...................................................................................... - mls v17.8H, v9.8H, v7.H[0] // ......................................................*............................................. - ldr x8, [x0, #272] // ................e................................................................................... - add v9.8H, v14.8H, v12.8H // ....................................*............................................................... - ldr x15, [x0, #280] // .................e.................................................................................. - sub v16.8H, v4.8H, v15.8H // .............................................*...................................................... - vins v4, x13, 0 // ..........e......................................................................................... - mls v27.8H, v28.8H, v7.H[0] // ..........................................................................*......................... - vins v28, x27, 0 // ..............e..................................................................................... - add v15.8H, v9.8H, v17.8H // ........................................................*........................................... - vins v8, x8, 0 // ..................e................................................................................. - sub v5.8H, v25.8H, v5.8H // ............................................................*....................................... 
- vins v28, x19, 1 // ...............e.................................................................................... - sub v6.8H, v6.8H, v23.8H // ........................................*........................................................... - vins v8, x15, 1 // ...................e................................................................................ - sub v25.8H, v15.8H, v27.8H // ...........................................................................*........................ - ldr x27, [x0, #336] // ....................e............................................................................... - sub v12.8H, v14.8H, v12.8H // ...................................*................................................................ - ldr x13, [x0, #344] // .....................e.............................................................................. - sub v14.8H, v6.8H, v24.8H // ......................................................................*............................. - ldr x19, [x0, #400] // ........................e........................................................................... - str_vo v25, x0, 64 // .............................................................................................*...... - ldr x8, [x0, #408] // .........................e.......................................................................... - add v24.8H, v6.8H, v24.8H // .......................................................................*............................ - vins v6, x27, 0 // ......................e............................................................................. - sub v17.8H, v9.8H, v17.8H // .......................................................*............................................ - ldr x27, [x0, #464] // ............................e....................................................................... 
- mul v9.8H, v16.8H, v0.H[4] // ..............................................................*..................................... - vins v6, x13, 1 // .......................e............................................................................ - sqrdmulh v25.8H, v16.8H, v0.H[5] // ...............................................................*.................................... - vins v16, x19, 0 // ..........................e......................................................................... - add v27.8H, v15.8H, v27.8H // ............................................................................*....................... - ldr x13, [x0, #472] // .............................e...................................................................... - mul v15.8H, v5.8H, v1.H[0] // .............................................................................*...................... - vins v16, x8, 1 // ...........................e........................................................................ - sqrdmulh v5.8H, v5.8H, v1.H[1] // ..............................................................................*..................... - vins v22, x27, 0 // ..............................e..................................................................... - mls v9.8H, v25.8H, v7.H[0] // ................................................................*................................... - // gap // .................................................................................................... - mul v25.8H, v24.8H, v1.H[2] // ..................................................................................*................. - vins v22, x13, 1 // ...............................e.................................................................... - sqrdmulh v24.8H, v24.8H, v1.H[3] // ...................................................................................*................ 
- // gap // .................................................................................................... - mls v15.8H, v5.8H, v7.H[0] // ...............................................................................*.................... - // gap // .................................................................................................... - sub v5.8H, v12.8H, v9.8H // .................................................................*.................................. - // gap // .................................................................................................... - add v12.8H, v12.8H, v9.8H // ..................................................................*................................. - // gap // .................................................................................................... - mls v25.8H, v24.8H, v7.H[0] // ....................................................................................*............... - // gap // .................................................................................................... - mul v24.8H, v22.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - sqrdmulh v9.8H, v22.8H, v0.H[1] // ................................................e................................................... - // gap // .................................................................................................... - sub v22.8H, v17.8H, v15.8H // ................................................................................*................... - // gap // .................................................................................................... - add v17.8H, v17.8H, v15.8H // .................................................................................*.................. 
- // gap // .................................................................................................... - sub v15.8H, v12.8H, v25.8H // .....................................................................................*.............. - // gap // .................................................................................................... - mls v24.8H, v9.8H, v7.H[0] // .................................................e.................................................. - // gap // .................................................................................................... - add v12.8H, v12.8H, v25.8H // ......................................................................................*............. - // gap // .................................................................................................... - mul v9.8H, v14.8H, v1.H[4] // .......................................................................................*............ - // gap // .................................................................................................... - sqrdmulh v14.8H, v14.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - sub v25.8H, v28.8H, v24.8H // ..................................................e................................................. - // gap // .................................................................................................... - add v24.8H, v28.8H, v24.8H // ...................................................e................................................ - // gap // .................................................................................................... - str_vi v27, x0, 16 // ............................................................................................*....... 
- // gap // .................................................................................................... - mls v9.8H, v14.8H, v7.H[0] // .........................................................................................*.......... - // gap // .................................................................................................... - str_vo v17, x0, 112 // ..............................................................................................*..... - // gap // .................................................................................................... - mul v23.8H, v6.8H, v0.H[0] // .....................................e.............................................................. - // gap // .................................................................................................... - str_vo v22, x0, 176 // ...............................................................................................*.... - // gap // .................................................................................................... - sqrdmulh v6.8H, v6.8H, v0.H[1] // ......................................e............................................................. - // gap // .................................................................................................... - sub v17.8H, v5.8H, v9.8H // ..........................................................................................*......... - // gap // .................................................................................................... - add v5.8H, v5.8H, v9.8H // ...........................................................................................*........ - // gap // .................................................................................................... - str_vo v12, x0, 240 // ................................................................................................*... 
- // gap // .................................................................................................... - mls v23.8H, v6.8H, v7.H[0] // .......................................e............................................................ - // gap // .................................................................................................... - str_vo v15, x0, 304 // .................................................................................................*.. - // gap // .................................................................................................... - mul v15.8H, v16.8H, v0.H[0] // ..........................................e......................................................... - // gap // .................................................................................................... - sqrdmulh v12.8H, v16.8H, v0.H[1] // ...........................................e........................................................ - // gap // .................................................................................................... - str_vo v5, x0, 368 // ..................................................................................................*. - // gap // .................................................................................................... - mul v5.8H, v24.8H, v0.H[2] // .........................................................e.......................................... - // gap // .................................................................................................... - str_vo v17, x0, 432 // ...................................................................................................* - // gap // .................................................................................................... - mls v15.8H, v12.8H, v7.H[0] // ............................................e....................................................... 
- // gap // .................................................................................................... - sqrdmulh v9.8H, v24.8H, v0.H[3] // ..........................................................e......................................... - // gap // .................................................................................................... - sqrdmulh v27.8H, v25.8H, v0.H[5] // ....................................................................e............................... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ........................................................................................*............................................................................................... || ..............................................................*................................................................ - // ldr x11, [x0, #8] // ..................................................................................................*..................................................................................... || ...................................................................*........................................................... - // vins v8, x10, 0 // ................................................................................................*....................................................................................... || ..................................................................*............................................................ - // vins v8, x11, 1 // ..........................................................................................................*............................................................................. 
|| .......................................................................*....................................................... - // ldr x10, [x0, #64] // e....................................................................................................................................................................................... || e.............................................................................................................................. - // ldr x11, [x0, #72] // ..e..................................................................................................................................................................................... || .e............................................................................................................................. - // vins v9, x10, 0 // ..........................................................................................*............................................................................................. || ...............................................................*............................................................... - // vins v9, x11, 1 // ..............................................................................................*......................................................................................... || .................................................................*............................................................. - // ldr x10, [x0, #128] // ....e................................................................................................................................................................................... || ..e............................................................................................................................ 
- // ldr x11, [x0, #136] // ........e............................................................................................................................................................................... || ....e.......................................................................................................................... - // vins v10, x10, 0 // ..................e..................................................................................................................................................................... || .........e..................................................................................................................... - // vins v10, x11, 1 // ............................................................................................*........................................................................................... || ................................................................*.............................................................. - // ldr x10, [x0, #192] // ..........e............................................................................................................................................................................. || .....e......................................................................................................................... - // ldr x11, [x0, #200] // ............e........................................................................................................................................................................... || ......e........................................................................................................................ - // vins v11, x10, 0 // ....................e................................................................................................................................................................... 
|| ..........e.................................................................................................................... - // vins v11, x11, 1 // ........................e............................................................................................................................................................... || ............e.................................................................................................................. - // ldr x10, [x0, #256] // ..............e......................................................................................................................................................................... || .......e....................................................................................................................... - // ldr x11, [x0, #264] // ................e....................................................................................................................................................................... || ........e...................................................................................................................... - // vins v12, x10, 0 // ......................e................................................................................................................................................................. || ...........e................................................................................................................... - // vins v12, x11, 1 // ..........................e............................................................................................................................................................. || .............e................................................................................................................. 
- // ldr x10, [x0, #320] // ............................e........................................................................................................................................................... || ..............e................................................................................................................ - // ldr x11, [x0, #328] // ..............................e......................................................................................................................................................... || ...............e............................................................................................................... - // vins v13, x10, 0 // ....................................e................................................................................................................................................... || ..................e............................................................................................................ - // vins v13, x11, 1 // ........................................e............................................................................................................................................... || ....................e.......................................................................................................... - // ldr x10, [x0, #384] // ................................e....................................................................................................................................................... || ................e.............................................................................................................. - // ldr x11, [x0, #392] // ..................................e..................................................................................................................................................... 
|| .................e............................................................................................................. - // vins v14, x10, 0 // ..........................................e............................................................................................................................................. || .....................e......................................................................................................... - // vins v14, x11, 1 // ..............................................e......................................................................................................................................... || .......................e....................................................................................................... - // ldr x10, [x0, #448] // ......................................e................................................................................................................................................. || ...................e........................................................................................................... - // ldr x11, [x0, #456] // ............................................e........................................................................................................................................... || ......................e........................................................................................................ - // vins v15, x10, 0 // ................................................e....................................................................................................................................... || ........................e...................................................................................................... 
- // vins v15, x11, 1 // ...................................................e.................................................................................................................................... || ..........................e.................................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // .......................................................................................*................................................................................................ || ..............................................................*................................................................ - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ...........................................................................................*............................................................................................ || ................................................................*.............................................................. - // mls v24.8H, v12.8H, v7.H[0] // ...................................................................................................*.................................................................................... || ....................................................................*.......................................................... - // sub v12.8H, v8.8H, v24.8H // .................................................................................................................................*...................................................... || ...................................................................................*........................................... - // add v8.8H, v8.8H, v24.8H // ...................................................................................................................*.................................................................... 
|| ............................................................................*.................................................. - // mul v24.8H, v13.8H, v0.H[0] // .......................................................................e................................................................................................................ || ..............................................e................................................................................ - // sqrdmulh v13.8H, v13.8H, v0.H[1] // .........................................................................e.............................................................................................................. || ................................................e.............................................................................. - // mls v24.8H, v13.8H, v7.H[0] // .............................................................................e.......................................................................................................... || ....................................................e.......................................................................... - // sub v13.8H, v9.8H, v24.8H // .............................................................................................................................*.......................................................... || .................................................................................*............................................. - // add v9.8H, v9.8H, v24.8H // .................................................................................................*...................................................................................... || ...................................................................*........................................................... 
- // mul v24.8H, v14.8H, v0.H[0] // ...............................................................................e........................................................................................................ || ......................................................e........................................................................ - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ................................................................................e....................................................................................................... || .......................................................e....................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ....................................................................................e................................................................................................... || ...........................................................e................................................................... - // sub v14.8H, v10.8H, v24.8H // .....................................................................................................................*.................................................................. || .............................................................................*................................................. - // add v10.8H, v10.8H, v24.8H // ...............................................................................................*........................................................................................ || ..................................................................*............................................................ - // mul v24.8H, v15.8H, v0.H[0] // .........................................................e.............................................................................................................................. 
|| ................................e.............................................................................................. - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ..........................................................e............................................................................................................................. || .................................e............................................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ..............................................................e......................................................................................................................... || .....................................e......................................................................................... - // sub v15.8H, v11.8H, v24.8H // ..................................................................e..................................................................................................................... || .........................................e..................................................................................... - // add v11.8H, v11.8H, v24.8H // ...................................................................e.................................................................................................................... || ..........................................e.................................................................................... - // mul v24.8H, v10.8H, v0.H[2] // .....................................................................................................*.................................................................................. || .....................................................................*......................................................... 
- // sqrdmulh v10.8H, v10.8H, v0.H[3] // .........................................................................................................*.............................................................................. || .......................................................................*....................................................... - // mls v24.8H, v10.8H, v7.H[0] // .................................................................................................................*...................................................................... || ...........................................................................*................................................... - // sub v10.8H, v8.8H, v24.8H // .........................................................................................................................................*.............................................. || .......................................................................................*....................................... - // add v8.8H, v8.8H, v24.8H // .........................................................................................................................*.............................................................. || ...............................................................................*............................................... - // mul v24.8H, v11.8H, v0.H[2] // ..................................................................................e..................................................................................................... || .........................................................e..................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // .....................................................................................e.................................................................................................. 
|| ............................................................e.................................................................. - // mls v24.8H, v11.8H, v7.H[0] // .............................................................................................*.......................................................................................... || .................................................................*............................................................. - // sub v11.8H, v9.8H, v24.8H // ...........................................................................................................................*............................................................ || ................................................................................*.............................................. - // add v9.8H, v9.8H, v24.8H // .......................................................................................................*................................................................................ || ......................................................................*........................................................ - // mul v24.8H, v14.8H, v0.H[4] // ...........................................................................................................................................*............................................ || ........................................................................................*...................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // .............................................................................................................................................*.......................................... || .........................................................................................*..................................... 
- // mls v24.8H, v14.8H, v7.H[0] // .....................................................................................................................................................*.................................. || .............................................................................................*................................. - // sub v14.8H, v12.8H, v24.8H // ..........................................................................................................................................................*............................. || .................................................................................................*............................. - // add v12.8H, v12.8H, v24.8H // ...........................................................................................................................................................*............................ || ..................................................................................................*............................ - // mul v24.8H, v15.8H, v0.H[4] // .........................................................................................*.............................................................................................. || ...............................................................*............................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ......................................................................................e................................................................................................. || .............................................................e................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ...........................................................................................................*............................................................................ 
|| ........................................................................*...................................................... - // sub v15.8H, v13.8H, v24.8H // ...................................................................................................................................*.................................................... || ....................................................................................*.......................................... - // add v13.8H, v13.8H, v24.8H // .......................................................................................................................................*................................................ || ......................................................................................*........................................ - // mul v24.8H, v9.8H, v0.H[6] // .............................................................................................................*.......................................................................... || .........................................................................*..................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ...............................................................................................................*........................................................................ || ..........................................................................*.................................................... - // mls v24.8H, v9.8H, v7.H[0] // .......................................................................................................................*................................................................ || ..............................................................................*................................................ 
- // sub v9.8H, v8.8H, v24.8H // ...............................................................................................................................*........................................................ || ..................................................................................*............................................ - // add v8.8H, v8.8H, v24.8H // ...............................................................................................................................................*........................................ || ..........................................................................................*.................................... - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................................................*...................................... || ...........................................................................................*................................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ...................................................................................................................................................*.................................... || ............................................................................................*.................................. - // mls v24.8H, v11.8H, v7.H[0] // .........................................................................................................................................................*.............................. || ................................................................................................*.............................. - // sub v11.8H, v10.8H, v24.8H // ...............................................................................................................................................................*........................ 
|| ......................................................................................................*........................ - // add v10.8H, v10.8H, v24.8H // ................................................................................................................................................................*....................... || .......................................................................................................*....................... - // mul v24.8H, v13.8H, v1.H[2] // ......................................................................................................................................................*................................. || ..............................................................................................*................................ - // sqrdmulh v13.8H, v13.8H, v1.H[3] // ........................................................................................................................................................*............................... || ...............................................................................................*............................... - // mls v24.8H, v13.8H, v7.H[0] // ............................................................................................................................................................*........................... || ...................................................................................................*........................... - // sub v13.8H, v12.8H, v24.8H // .................................................................................................................................................................*...................... || ........................................................................................................*...................... 
- // add v12.8H, v12.8H, v24.8H // ...................................................................................................................................................................*.................... || ..........................................................................................................*.................... - // mul v24.8H, v15.8H, v1.H[4] // ....................................................................................................................................................................*................... || ...........................................................................................................*................... - // sqrdmulh v15.8H, v15.8H, v1.H[5] // .....................................................................................................................................................................*.................. || ............................................................................................................*.................. - // mls v24.8H, v15.8H, v7.H[0] // .........................................................................................................................................................................*.............. || ................................................................................................................*.............. - // sub v15.8H, v14.8H, v24.8H // ..............................................................................................................................................................................*......... || .....................................................................................................................*......... - // add v14.8H, v14.8H, v24.8H // ...............................................................................................................................................................................*........ 
|| ......................................................................................................................*........ - // str_vi v8, x0, 16 // ........................................................................................................................................................................*............... || ...............................................................................................................*............... - // str_vo v9, x0, 48 // .....................................................................................................................................*.................................................. || .....................................................................................*......................................... - // str_vo v10, x0, 112 // ..........................................................................................................................................................................*............. || .................................................................................................................*............. - // str_vo v11, x0, 176 // ............................................................................................................................................................................*........... || ...................................................................................................................*........... - // str_vo v12, x0, 240 // ................................................................................................................................................................................*....... || .......................................................................................................................*....... 
- // str_vo v13, x0, 304 // ..................................................................................................................................................................................*..... || .........................................................................................................................*..... - // str_vo v14, x0, 368 // .....................................................................................................................................................................................*.. || ............................................................................................................................*.. - // str_vo v15, x0, 432 // .......................................................................................................................................................................................* || ..............................................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 6.49s + // User time: 6.49s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q29, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v18.8H, v10.8H, v0.H[1] // .............*.............................................................. + // gap // ............................................................................ 
+ mul v6.8H, v10.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ + sub v31.8H, v29.8H, v26.8H // ...........*................................................................ + // gap // ............................................................................ + add v12.8H, v29.8H, v26.8H // ............*............................................................... + // gap // ............................................................................ + sqrdmulh v26.8H, v25.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + mls v6.8H, v18.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + mul v18.8H, v25.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + sqrdmulh v25.8H, v16.8H, v0.H[1] // .......................*.................................................... + // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[0] // ........................*................................................... + // gap // ............................................................................ + sub v19.8H, v20.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + mls v18.8H, v26.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ 
+ add v6.8H, v20.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ + mls v16.8H, v25.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + ldr q10, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v11.8H, v18.8H // .....................*...................................................... + // gap // ............................................................................ + add v18.8H, v11.8H, v18.8H // ......................*..................................................... + // gap // ............................................................................ + sub v25.8H, v13.8H, v16.8H // ..........................*................................................. + // gap // ............................................................................ + sqrdmulh v9.8H, v26.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + mul v26.8H, v26.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v11.8H, v18.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ 
+ mul v18.8H, v18.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + add v16.8H, v13.8H, v16.8H // ...........................*................................................ + // gap // ............................................................................ + mls v26.8H, v9.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v25.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + mls v18.8H, v11.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + mul v25.8H, v25.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + sub v11.8H, v31.8H, v26.8H // .........................................*.................................. + // gap // ............................................................................ + add v8.8H, v31.8H, v26.8H // ..........................................*................................. + // gap // ............................................................................ + add v30.8H, v12.8H, v18.8H // ................................*........................................... + // gap // ............................................................................ + mls v25.8H, v20.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ 
+ sqrdmulh v29.8H, v16.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + sub v26.8H, v12.8H, v18.8H // ...............................*............................................ + // gap // ............................................................................ + sub v20.8H, v19.8H, v25.8H // ..............................................*............................. + // gap // ............................................................................ + add v25.8H, v19.8H, v25.8H // ...............................................*............................ + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v29.8H, v20.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + sqrdmulh v19.8H, v25.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + mul v25.8H, v25.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + sub v12.8H, v6.8H, v16.8H // ....................................*....................................... + // gap // ............................................................................ 
+ add v6.8H, v6.8H, v16.8H // .....................................*...................................... + // gap // ............................................................................ + mul v16.8H, v20.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v18.8H, v12.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + sqrdmulh v31.8H, v6.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + mul v6.8H, v6.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + mul v12.8H, v12.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + mls v25.8H, v19.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + mls v6.8H, v31.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + mls v12.8H, v18.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ 
+ add v31.8H, v8.8H, v25.8H // ..............................................................*............. + // gap // ............................................................................ + sub v29.8H, v8.8H, v25.8H // .............................................................*.............. + // gap // ............................................................................ + sub v25.8H, v30.8H, v6.8H // ...................................................*........................ + // gap // ............................................................................ + add v18.8H, v30.8H, v6.8H // ....................................................*....................... + // gap // ............................................................................ + sub v6.8H, v11.8H, v16.8H // ..................................................................*......... + // gap // ............................................................................ + add v16.8H, v11.8H, v16.8H // ...................................................................*........ + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sub v19.8H, v26.8H, v12.8H // ........................................................*................... + // gap // ............................................................................ + add v26.8H, v26.8H, v12.8H // .........................................................*.................. + // gap // ............................................................................ + str q25, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ 
+ ldr q11, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q13, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q18, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q31, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q20, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + str q29, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + sqrdmulh v29.8H, v18.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + str q16, [x0, #368] // ..........................................................................*. + // gap // ............................................................................ + mul v26.8H, v18.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + str q6, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + ldr q25, [x0, #384] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.8H, v29.8H, v7.H[0] // ..........e................................................................. + // gap // ............................................................................ + ldr q16, [x0, #448] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + + // ------------------------------------------------------------ new position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|--------- + // ldr q8, [x0, #0] // ..............................................................*........................................................................ + // ldr q9, [x0, #(1*(512/8))] // .....................................................e........'..................................................................~..... + // ldr q10, [x0, #(2*(512/8))] // ...............................................e..............'............................................................~........... + // ldr q11, [x0, #(3*(512/8))] // .................................................e............'..............................................................~......... + // ldr q12, [x0, #(4*(512/8))] // ...................................................e..........'................................................................~....... + // ldr q13, [x0, #(5*(512/8))] // e.............................................................'.............~.......................................................... + // ldr q14, [x0, #(6*(512/8))] // ...........................................................e..'........................................................................ + // ldr q15, [x0, #(7*(512/8))] // .............................................................e'........................................................................ + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .......................................................e......'....................................................................~... 
+ // mul v24.8h, v12.8h, v0.h[0] // .........................................................e....'......................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // ............................................................e.'........................................................................ + // sub v12.8h, v8.8h, v24.8h // ..............................................................'..*..................................................................... + // add v8.8h, v8.8h, v24.8h // ..............................................................'...*.................................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..............................................................'*....................................................................... + // mul v24.8h, v13.8h, v0.h[0] // ..............................................................'.*...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................................'.....*.................................................................. + // sub v13.8h, v9.8h, v24.8h // ..............................................................'.........*.............................................................. + // add v9.8h, v9.8h, v24.8h // ..............................................................'...........*............................................................ + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..............................................................'....*................................................................... + // mul v24.8h, v14.8h, v0.h[0] // ..............................................................'......*................................................................. 
+ // mls v24.8h, v27.8h, v7.h[0] // ..............................................................'..........*............................................................. + // sub v14.8h, v10.8h, v24.8h // .~............................................................'..............*......................................................... + // add v10.8h, v10.8h, v24.8h // ..~...........................................................'...............*........................................................ + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ..............................................................'.......*................................................................ + // mul v24.8h, v15.8h, v0.h[0] // ..............................................................'........*............................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................................'............*........................................................... + // sub v15.8h, v11.8h, v24.8h // ...~..........................................................'................*....................................................... + // add v11.8h, v11.8h, v24.8h // ........~.....................................................'.....................*.................................................. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ......~.......................................................'...................*.................................................... + // mul v24.8h, v10.8h, v0.h[2] // .......~......................................................'....................*................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........~..................................................'........................*............................................... 
+ // sub v10.8h, v8.8h, v24.8h // ...................~..........................................'................................*....................................... + // add v8.8h, v8.8h, v24.8h // ...............~..............................................'............................*........................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // .................~............................................'..............................*......................................... + // mul v24.8h, v11.8h, v0.h[2] // ..................~...........................................'...............................*........................................ + // mls v24.8h, v27.8h, v7.h[0] // ......................~.......................................'...................................*.................................... + // sub v11.8h, v9.8h, v24.8h // ..........................~...................................'.......................................*................................ + // add v9.8h, v9.8h, v24.8h // ...........................~..................................'........................................*............................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ....~.........................................................'.................*...................................................... + // mul v24.8h, v14.8h, v0.h[4] // .....~........................................................'..................*..................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........~....................................................'......................*................................................. + // sub v14.8h, v12.8h, v24.8h // .............~................................................'..........................*............................................. 
+ // add v12.8h, v12.8h, v24.8h // ..............~...............................................'...........................*............................................ + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ..........~...................................................'.......................*................................................ + // mul v24.8h, v15.8h, v0.h[4] // ............~.................................................'.........................*.............................................. + // mls v24.8h, v27.8h, v7.h[0] // ................~.............................................'.............................*.......................................... + // sub v15.8h, v13.8h, v24.8h // ....................~.........................................'.................................*...................................... + // add v13.8h, v13.8h, v24.8h // .....................~........................................'..................................*..................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..............................~...............................'...........................................*............................ + // mul v24.8h, v9.8h, v0.h[6] // ...............................~..............................'............................................*........................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................~..........................'................................................*....................... + // sub v9.8h, v8.8h, v24.8h // .......................................~......................'....................................................*................... + // add v8.8h, v8.8h, v24.8h // ........................................~.....................'.....................................................*.................. 
+ // sqrdmulh v27.8h, v11.8h, v1.h[1] // .............................~................................'..........................................*............................. + // mul v24.8h, v11.8h, v1.h[0] // ................................~.............................'.............................................*.......................... + // mls v24.8h, v27.8h, v7.h[0] // ....................................~.........................'.................................................*...................... + // sub v11.8h, v10.8h, v24.8h // ............................................~.................'.........................................................*.............. + // add v10.8h, v10.8h, v24.8h // .............................................~................'..........................................................*............. + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ........................~.....................................'.....................................*.................................. + // mul v24.8h, v13.8h, v1.h[2] // .........................~....................................'......................................*................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................~............................'..............................................*......................... + // sub v13.8h, v12.8h, v24.8h // ......................................~.......................'...................................................*.................... + // add v12.8h, v12.8h, v24.8h // .....................................~........................'..................................................*..................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // .......................~......................................'....................................*................................... 
+ // mul v24.8h, v15.8h, v1.h[4] // ............................~.................................'.........................................*.............................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................~...........................'...............................................*........................ + // sub v15.8h, v14.8h, v24.8h // .........................................~....................'......................................................*................. + // add v14.8h, v14.8h, v24.8h // ..........................................~...................'.......................................................*................ + // str q8, [x0], #(16) // ...........................................~..................'........................................................*............... + // str q9, [x0, #(-16 + 1*(512/8))] // ..............................................~...............'...........................................................*............ + // str q10, [x0, #(-16 + 2*(512/8))] // ................................................~.............'.............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // ..................................................~...........'...............................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // ....................................................~.........'.................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ......................................................~.......'...................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // ........................................................~.....'.....................................................................*.. 
+ // str q15, [x0, #(-16 + 7*(512/8))] // ..........................................................~...'.......................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v3.8H, v8.8H, v0.H[0] // *............................................................ - vins v20, x18, 0 // ...*......................................................... - mul v6.8H, v25.8H, v0.H[4] // ..*.......................................................... - vins v4, x9, 1 // .....*....................................................... - mls v5.8H, v9.8H, v7.H[0] // ......*...................................................... - vins v20, x29, 1 // .......*..................................................... - add v14.8H, v4.8H, v15.8H // ........*.................................................... - ldr x27, [x0, #0] // .*........................................................... - add v2.8H, v20.8H, v23.8H // ..........*.................................................. - ldr x16, [x0, #8] // ...........*................................................. - sqrdmulh v8.8H, v8.8H, v0.H[1] // ....*........................................................ - // gap // ............................................................. - mls v6.8H, v27.8H, v7.H[0] // .................*........................................... - // gap // ............................................................. - sub v9.8H, v20.8H, v23.8H // ..........................*.................................. - vins v24, x27, 0 // .........*................................................... - sub v12.8H, v2.8H, v5.8H // .........................*................................... - // gap // ............................................................. - sqrdmulh v17.8H, v14.8H, v0.H[3] // ...............*............................................. - vins v24, x16, 1 // ................*............................................ 
- add v21.8H, v9.8H, v6.8H // ...............................*............................. - // gap // ............................................................. - mul v20.8H, v12.8H, v1.H[0] // ....................................*........................ - // gap // ............................................................. - sqrdmulh v18.8H, v12.8H, v1.H[1] // .....................................*....................... - // gap // ............................................................. - sub v28.8H, v9.8H, v6.8H // .............................*............................... - // gap // ............................................................. - add v10.8H, v2.8H, v5.8H // ..............*.............................................. - // gap // ............................................................. - sqrdmulh v9.8H, v21.8H, v1.H[3] // ........................................*.................... - // gap // ............................................................. - mul v13.8H, v14.8H, v0.H[2] // .............*............................................... - // gap // ............................................................. - mls v3.8H, v8.8H, v7.H[0] // ............*................................................ - // gap // ............................................................. - mul v25.8H, v10.8H, v0.H[6] // ..................*.......................................... - // gap // ............................................................. - sqrdmulh v16.8H, v10.8H, v0.H[7] // ...................*......................................... - // gap // ............................................................. - mls v13.8H, v17.8H, v7.H[0] // ....................*........................................ - // gap // ............................................................. - add v6.8H, v24.8H, v3.8H // .....................*....................................... 
- // gap // ............................................................. - sqrdmulh v12.8H, v28.8H, v1.H[5] // ..................................................*.......... - // gap // ............................................................. - mls v25.8H, v16.8H, v7.H[0] // .......................*..................................... - // gap // ............................................................. - add v22.8H, v6.8H, v13.8H // ........................*.................................... - // gap // ............................................................. - mul v26.8H, v28.8H, v1.H[4] // .................................................*........... - // gap // ............................................................. - sub v24.8H, v24.8H, v3.8H // ............................*................................ - // gap // ............................................................. - add v17.8H, v22.8H, v25.8H // ...................................*......................... - // gap // ............................................................. - sub v29.8H, v4.8H, v15.8H // ......................*...................................... - // gap // ............................................................. - mls v26.8H, v12.8H, v7.H[0] // ....................................................*........ - // gap // ............................................................. - str_vi v17, x0, 16 // ...................................................*......... - // gap // ............................................................. - mul v11.8H, v29.8H, v0.H[4] // .................................*........................... - // gap // ............................................................. - sqrdmulh v23.8H, v29.8H, v0.H[5] // ..................................*.......................... - // gap // ............................................................. 
- mul v30.8H, v21.8H, v1.H[2] // .......................................*..................... - // gap // ............................................................. - sub v3.8H, v22.8H, v25.8H // ...........................*................................. - // gap // ............................................................. - mls v20.8H, v18.8H, v7.H[0] // .........................................*................... - // gap // ............................................................. - mls v11.8H, v23.8H, v7.H[0] // ......................................*...................... - // gap // ............................................................. - str_vo v3, x0, 48 // ..............................*.............................. - // gap // ............................................................. - sub v31.8H, v6.8H, v13.8H // ................................*............................ - // gap // ............................................................. - mls v30.8H, v9.8H, v7.H[0] // ............................................*................ - // gap // ............................................................. - add v13.8H, v24.8H, v11.8H // ...........................................*................. - // gap // ............................................................. - sub v12.8H, v31.8H, v20.8H // .............................................*............... - // gap // ............................................................. - sub v21.8H, v24.8H, v11.8H // ..........................................*.................. - // gap // ............................................................. - sub v8.8H, v13.8H, v30.8H // ...............................................*............. - // gap // ............................................................. - str_vo v12, x0, 176 // ......................................................*...... - // gap // ............................................................. 
- add v29.8H, v21.8H, v26.8H // ........................................................*.... - // gap // ............................................................. - str_vo v8, x0, 304 // ..........................................................*.. - // gap // ............................................................. - add v10.8H, v31.8H, v20.8H // ..............................................*.............. - // gap // ............................................................. - str_vo v29, x0, 368 // ...........................................................*. - // gap // ............................................................. - sub v6.8H, v21.8H, v26.8H // .......................................................*..... - // gap // ............................................................. - str_vo v10, x0, 112 // .....................................................*....... - // gap // ............................................................. - add v14.8H, v13.8H, v30.8H // ................................................*............ - // gap // ............................................................. - str_vo v6, x0, 432 // ............................................................* - // gap // ............................................................. - // gap // ............................................................. - // gap // ............................................................. - str_vo v14, x0, 240 // .........................................................*... - // gap // ............................................................. - - // original source code - // mul v12.8H, v8.8H, v0.H[0] // *............................................................ || *...................................................... - // ldr x27, [x0, #0] // .......*..................................................... || ...*................................................... 
- // mul v24.8H, v25.8H, v0.H[4] // ..*.......................................................... || .*..................................................... - // vins v6, x18, 0 // .*........................................................... || *...................................................... - // sqrdmulh v17.8H, v8.8H, v0.H[1] // ..........*.................................................. || .....*................................................. - // vins v4, x9, 1 // ...*......................................................... || .*..................................................... - // mls v5.8H, v9.8H, v7.H[0] // ....*........................................................ || ..*.................................................... - // vins v6, x29, 1 // .....*....................................................... || ..*.................................................... - // add v9.8H, v4.8H, v15.8H // ......*...................................................... || ...*................................................... - // vins v14, x27, 0 // .............*............................................... || .......*............................................... - // add v25.8H, v6.8H, v23.8H // ........*.................................................... || ....*.................................................. - // ldr x27, [x0, #8] // .........*................................................... || ....*.................................................. - // mls v12.8H, v17.8H, v7.H[0] // ........................*.................................... || .................*..................................... - // mul v17.8H, v9.8H, v0.H[2] // .......................*..................................... || ................*...................................... - // add v28.8H, v25.8H, v5.8H // .....................*....................................... || ..............*........................................ 
- // sqrdmulh v9.8H, v9.8H, v0.H[3] // ...............*............................................. || .........*............................................. - // vins v14, x27, 1 // ................*............................................ || .........*............................................. - // mls v24.8H, v27.8H, v7.H[0] // ...........*................................................. || ......*................................................ - // mul v27.8H, v28.8H, v0.H[6] // .........................*................................... || ..................*.................................... - // sqrdmulh v28.8H, v28.8H, v0.H[7] // ..........................*.................................. || ...................*................................... - // mls v17.8H, v9.8H, v7.H[0] // ...........................*................................. || ....................*.................................. - // add v9.8H, v14.8H, v12.8H // ............................*................................ || .....................*................................. - // sub v16.8H, v4.8H, v15.8H // ...................................*......................... || ............................*.......................... - // mls v27.8H, v28.8H, v7.H[0] // ..............................*.............................. || .......................*............................... - // add v15.8H, v9.8H, v17.8H // ...............................*............................. || ........................*.............................. - // sub v5.8H, v25.8H, v5.8H // ..............*.............................................. || ........*.............................................. - // sub v6.8H, v6.8H, v23.8H // ............*................................................ || .......*............................................... - // sub v25.8H, v15.8H, v27.8H // .........................................*................... 
|| ..................................*.................... - // sub v12.8H, v14.8H, v12.8H // .................................*........................... || ..........................*............................ - // sub v14.8H, v6.8H, v24.8H // ....................*........................................ || .............*......................................... - // str_vo v25, x0, 64 // ............................................*................ || .....................................*................. - // add v24.8H, v6.8H, v24.8H // .................*........................................... || ..........*............................................ - // sub v17.8H, v9.8H, v17.8H // .............................................*............... || ......................................*................ - // mul v9.8H, v16.8H, v0.H[4] // ......................................*...................... || ...............................*....................... - // sqrdmulh v25.8H, v16.8H, v0.H[5] // .......................................*..................... || ................................*...................... - // add v27.8H, v15.8H, v27.8H // ..................................*.......................... || ...........................*........................... - // mul v15.8H, v5.8H, v1.H[0] // ..................*.......................................... || ...........*........................................... - // sqrdmulh v5.8H, v5.8H, v1.H[1] // ...................*......................................... || ............*.......................................... - // mls v9.8H, v25.8H, v7.H[0] // ...........................................*................. || ....................................*.................. - // mul v25.8H, v24.8H, v1.H[2] // ........................................*.................... || .................................*..................... 
- // sqrdmulh v24.8H, v24.8H, v1.H[3] // ......................*...................................... || ...............*....................................... - // mls v15.8H, v5.8H, v7.H[0] // ..........................................*.................. || ...................................*................... - // sub v5.8H, v12.8H, v9.8H // .................................................*........... || ..........................................*............ - // add v12.8H, v12.8H, v9.8H // ...............................................*............. || ........................................*.............. - // mls v25.8H, v24.8H, v7.H[0] // ..............................................*.............. || .......................................*............... - // sub v22.8H, v17.8H, v15.8H // ................................................*............ || .........................................*............. - // add v17.8H, v17.8H, v15.8H // ......................................................*...... || ...............................................*....... - // sub v15.8H, v12.8H, v25.8H // ..................................................*.......... || ...........................................*........... - // add v12.8H, v12.8H, v25.8H // ..........................................................*.. || ...................................................*... - // mul v9.8H, v14.8H, v1.H[4] // ................................*............................ || .........................*............................. - // sqrdmulh v14.8H, v14.8H, v1.H[5] // .............................*............................... || ......................*................................ - // str_vi v27, x0, 16 // .....................................*....................... || ..............................*........................ - // mls v9.8H, v14.8H, v7.H[0] // ....................................*........................ 
|| .............................*......................... - // str_vo v17, x0, 112 // .........................................................*... || ..................................................*.... - // str_vo v22, x0, 176 // ...................................................*......... || ............................................*.......... - // sub v17.8H, v5.8H, v9.8H // ........................................................*.... || .................................................*..... - // add v5.8H, v5.8H, v9.8H // ....................................................*........ || .............................................*......... - // str_vo v12, x0, 240 // ............................................................* || ......................................................* - // str_vo v15, x0, 304 // .....................................................*....... || ..............................................*........ - // str_vo v5, x0, 368 // .......................................................*..... || ................................................*...... - // str_vo v17, x0, 432 // ...........................................................*. || ....................................................*.. - + // Instructions: 66 + // Expected cycles: 68 + // Expected IPC: 0.97 + // + // Cycle bound: 68.0 + // IPC bound: 0.97 + // + // Wall time: 17.32s + // User time: 17.32s + // + // ----------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + ldr q8, [x0, #0] // *................................................................. + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. 
+ mul v15.8H, v16.8H, v0.H[0] // .........*........................................................ + // gap // .................................................................. + sqrdmulh v28.8H, v16.8H, v0.H[1] // ........*......................................................... + // gap // .................................................................. + add v21.8H, v8.8H, v26.8H // ....*............................................................. + // gap // .................................................................. + mul v16.8H, v10.8H, v0.H[0] // ..*............................................................... + // gap // .................................................................. + sqrdmulh v29.8H, v10.8H, v0.H[1] // .*................................................................ + // gap // .................................................................. + mul v18.8H, v25.8H, v0.H[0] // .......*.......................................................... + // gap // .................................................................. + mls v15.8H, v28.8H, v7.H[0] // .............*.................................................... + // gap // .................................................................. + sub v10.8H, v8.8H, v26.8H // ...*.............................................................. + // gap // .................................................................. + sqrdmulh v8.8H, v25.8H, v0.H[1] // .....*............................................................ + // gap // .................................................................. + mls v16.8H, v29.8H, v7.H[0] // ......*........................................................... + // gap // .................................................................. + sub v4.8H, v13.8H, v15.8H // ................*................................................. + // gap // .................................................................. 
+ add v9.8H, v13.8H, v15.8H // .....................*............................................ + // gap // .................................................................. + mls v18.8H, v8.8H, v7.H[0] // ...........*...................................................... + // gap // .................................................................. + sqrdmulh v31.8H, v4.8H, v0.H[5] // .......................*.......................................... + // gap // .................................................................. + mul v19.8H, v4.8H, v0.H[4] // .........................*........................................ + // gap // .................................................................. + sqrdmulh v5.8H, v9.8H, v0.H[3] // ..............................*................................... + // gap // .................................................................. + mul v12.8H, v9.8H, v0.H[2] // ...............................*.................................. + // gap // .................................................................. + sub v28.8H, v11.8H, v18.8H // ..............*................................................... + // gap // .................................................................. + mls v19.8H, v31.8H, v7.H[0] // .............................*.................................... + // gap // .................................................................. + sub v13.8H, v20.8H, v16.8H // ..........*....................................................... + // gap // .................................................................. + mul v25.8H, v28.8H, v0.H[4] // ..................*............................................... + // gap // .................................................................. + sqrdmulh v2.8H, v28.8H, v0.H[5] // .................*................................................ + // gap // .................................................................. 
+ add v29.8H, v13.8H, v19.8H // ..................................*............................... + // gap // .................................................................. + sub v31.8H, v13.8H, v19.8H // .................................*................................ + // gap // .................................................................. + add v19.8H, v11.8H, v18.8H // ...............*.................................................. + // gap // .................................................................. + mul v13.8H, v29.8H, v1.H[2] // ......................................*........................... + // gap // .................................................................. + sqrdmulh v4.8H, v29.8H, v1.H[3] // .....................................*............................ + // gap // .................................................................. + mls v25.8H, v2.8H, v7.H[0] // ......................*........................................... + // gap // .................................................................. + mul v6.8H, v31.8H, v1.H[4] // .........................................*........................ + // gap // .................................................................. + sqrdmulh v31.8H, v31.8H, v1.H[5] // ....................................*............................. + // gap // .................................................................. + mls v13.8H, v4.8H, v7.H[0] // ..............................................*................... + // gap // .................................................................. + add v27.8H, v10.8H, v25.8H // ...........................*...................................... + // gap // .................................................................. + sub v30.8H, v10.8H, v25.8H // ..........................*....................................... + // gap // .................................................................. 
+ mls v6.8H, v31.8H, v7.H[0] // ...............................................*.................. + // gap // .................................................................. + add v4.8H, v27.8H, v13.8H // ..................................................*............... + // gap // .................................................................. + sqrdmulh v9.8H, v19.8H, v0.H[3] // ...................*.............................................. + // gap // .................................................................. + mul v26.8H, v19.8H, v0.H[2] // ....................*............................................. + // gap // .................................................................. + str q4, [x0, #256] // ..............................................................*... + // gap // .................................................................. + mls v12.8H, v5.8H, v7.H[0] // ...................................*.............................. + // gap // .................................................................. + add v17.8H, v20.8H, v16.8H // ............*..................................................... + // gap // .................................................................. + mls v26.8H, v9.8H, v7.H[0] // ........................*......................................... + // gap // .................................................................. + sub v22.8H, v27.8H, v13.8H // ...................................................*.............. + // gap // .................................................................. + sub v5.8H, v17.8H, v12.8H // .......................................*.......................... + // gap // .................................................................. + add v24.8H, v17.8H, v12.8H // ........................................*......................... + // gap // .................................................................. 
+ sub v17.8H, v30.8H, v6.8H // ......................................................*........... + // gap // .................................................................. + sub v3.8H, v21.8H, v26.8H // ................................*................................. + // gap // .................................................................. + mul v15.8H, v5.8H, v1.H[0] // .............................................*.................... + // gap // .................................................................. + str q17, [x0, #448] // .................................................................* + // gap // .................................................................. + sqrdmulh v17.8H, v5.8H, v1.H[1] // ..........................................*....................... + // gap // .................................................................. + add v9.8H, v21.8H, v26.8H // ............................*..................................... + // gap // .................................................................. + sqrdmulh v2.8H, v24.8H, v0.H[7] // ...........................................*...................... + // gap // .................................................................. + mul v10.8H, v24.8H, v0.H[6] // ............................................*..................... + // gap // .................................................................. + mls v15.8H, v17.8H, v7.H[0] // .................................................*................ + // gap // .................................................................. + str q22, [x0, #320] // ...............................................................*.. + // gap // .................................................................. + add v29.8H, v30.8H, v6.8H // .......................................................*.......... + // gap // .................................................................. 
+ mls v10.8H, v2.8H, v7.H[0] // ................................................*................. + // gap // .................................................................. + sub v2.8H, v3.8H, v15.8H // .........................................................*........ + // gap // .................................................................. + str q29, [x0, #384] // ................................................................*. + // gap // .................................................................. + add v19.8H, v3.8H, v15.8H // ..........................................................*....... + // gap // .................................................................. + str q2, [x0, #192] // .............................................................*.... + // gap // .................................................................. + sub v24.8H, v9.8H, v10.8H // ....................................................*............. + // gap // .................................................................. + str q19, [x0, #128] // ............................................................*..... + // gap // .................................................................. + add v9.8H, v9.8H, v10.8H // .....................................................*............ + // gap // .................................................................. + str q24, [x0, #64] // ...........................................................*...... + // gap // .................................................................. + // gap // .................................................................. + // gap // .................................................................. + str q9, [x0], #(16) // ........................................................*......... + // gap // .................................................................. 
+ + // ------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // ldr q29, [x0, #0] // *................................................................. + // sqrdmulh v18.8H, v10.8H, v0.H[1] // .....*............................................................ + // mul v6.8H, v10.8H, v0.H[0] // ....*............................................................. + // sub v31.8H, v29.8H, v26.8H // ........*......................................................... + // add v12.8H, v29.8H, v26.8H // ...*.............................................................. + // sqrdmulh v26.8H, v25.8H, v0.H[1] // .........*........................................................ + // mls v6.8H, v18.8H, v7.H[0] // ..........*....................................................... + // mul v18.8H, v25.8H, v0.H[0] // ......*........................................................... + // sqrdmulh v25.8H, v16.8H, v0.H[1] // ..*............................................................... + // mul v16.8H, v16.8H, v0.H[0] // .*................................................................ + // sub v19.8H, v20.8H, v6.8H // ....................*............................................. + // mls v18.8H, v26.8H, v7.H[0] // .............*.................................................... + // add v6.8H, v20.8H, v6.8H // ........................................*......................... + // mls v16.8H, v25.8H, v7.H[0] // .......*.......................................................... + // sub v26.8H, v11.8H, v18.8H // ..................*............................................... + // add v18.8H, v11.8H, v18.8H // .........................*........................................ + // sub v25.8H, v13.8H, v16.8H // ...........*...................................................... 
+ // sqrdmulh v9.8H, v26.8H, v0.H[5] // ......................*........................................... + // mul v26.8H, v26.8H, v0.H[4] // .....................*............................................ + // sqrdmulh v11.8H, v18.8H, v0.H[3] // ....................................*............................. + // mul v18.8H, v18.8H, v0.H[2] // .....................................*............................ + // add v16.8H, v13.8H, v16.8H // ............*..................................................... + // mls v26.8H, v9.8H, v7.H[0] // ............................*..................................... + // sqrdmulh v20.8H, v25.8H, v0.H[5] // ..............*................................................... + // mls v18.8H, v11.8H, v7.H[0] // .........................................*........................ + // mul v25.8H, v25.8H, v0.H[4] // ...............*.................................................. + // sub v11.8H, v31.8H, v26.8H // .................................*................................ + // add v8.8H, v31.8H, v26.8H // ................................*................................. + // add v30.8H, v12.8H, v18.8H // ..................................................*............... + // mls v25.8H, v20.8H, v7.H[0] // ...................*.............................................. + // sqrdmulh v29.8H, v16.8H, v0.H[3] // ................*................................................. + // mul v16.8H, v16.8H, v0.H[2] // .................*................................................ + // sub v26.8H, v12.8H, v18.8H // ..............................................*................... + // sub v20.8H, v19.8H, v25.8H // ........................*......................................... + // add v25.8H, v19.8H, v25.8H // .......................*.......................................... + // mls v16.8H, v29.8H, v7.H[0] // .......................................*.......................... 
+ // sqrdmulh v29.8H, v20.8H, v1.H[5] // ..............................*................................... + // sqrdmulh v19.8H, v25.8H, v1.H[3] // ...........................*...................................... + // mul v25.8H, v25.8H, v1.H[2] // ..........................*....................................... + // sub v12.8H, v6.8H, v16.8H // ...........................................*...................... + // add v6.8H, v6.8H, v16.8H // ............................................*..................... + // mul v16.8H, v20.8H, v1.H[4] // .............................*.................................... + // sqrdmulh v18.8H, v12.8H, v1.H[1] // .................................................*................ + // sqrdmulh v31.8H, v6.8H, v0.H[7] // ...................................................*.............. + // mul v6.8H, v6.8H, v0.H[6] // ....................................................*............. + // mul v12.8H, v12.8H, v1.H[0] // ...............................................*.................. + // mls v25.8H, v19.8H, v7.H[0] // ...............................*.................................. + // mls v16.8H, v29.8H, v7.H[0] // ..................................*............................... + // mls v6.8H, v31.8H, v7.H[0] // ........................................................*......... + // mls v12.8H, v18.8H, v7.H[0] // .....................................................*............ + // add v31.8H, v8.8H, v25.8H // ...................................*.............................. + // sub v29.8H, v8.8H, v25.8H // ..........................................*....................... + // sub v25.8H, v30.8H, v6.8H // .............................................................*.... + // add v18.8H, v30.8H, v6.8H // ...............................................................*.. + // sub v6.8H, v11.8H, v16.8H // .............................................*.................... 
+ // add v16.8H, v11.8H, v16.8H // .......................................................*.......... + // str q18, [x0], #(16) // .................................................................* + // sub v19.8H, v26.8H, v12.8H // .........................................................*........ + // add v26.8H, v26.8H, v12.8H // ...........................................................*...... + // str q25, [x0, #48] // ................................................................*. + // str q26, [x0, #112] // ..............................................................*... + // str q19, [x0, #176] // ............................................................*..... + // str q31, [x0, #240] // ......................................*........................... + // str q29, [x0, #304] // ......................................................*........... + // str q16, [x0, #368] // ..........................................................*....... + // str q6, [x0, #432] // ................................................*................. + restore inp, STACK0 mov count, #8 .p2align 2 - ldr x27, [x1, #48] // *........................................................................................... - // gap // ............................................................................................ - ldr x18, [x3] , #16 // .*.......................................................................................... - // gap // ............................................................................................ - ldr x29, [x3, #-8] // ..*......................................................................................... - // gap // ............................................................................................ - ldr x9, [x1, #56] // ...*........................................................................................ - // gap // ............................................................................................ 
- vins v12, x27, 0 // ......*..................................................................................... - ldr x27, [x1, #32] // ....*....................................................................................... - vins v6, x18, 0 // .....*...................................................................................... - ldr x18, [x1, #40] // .........*.................................................................................. - ldr x19, [x1, #24] // ..........*................................................................................. - // gap // ............................................................................................ - vins v6, x29, 1 // .......*.................................................................................... - vins v12, x9, 1 // ........*................................................................................... - vins v24, x27, 0 // ............*............................................................................... - ldr x27, [x1, #16] // ..............*............................................................................. - mul v17.8H, v12.8H, v6.H[0] // ...........*................................................................................ - ldr x29, [x1, #0] // ................*........................................................................... - sqrdmulh v12.8H, v12.8H, v6.H[1] // .............*.............................................................................. - vins v24, x18, 1 // ...............*............................................................................ - ldr x18, [x1, #8] // ..................*......................................................................... - // gap // ............................................................................................ - sqrdmulh v4.8H, v24.8H, v6.H[1] // .................*.......................................................................... 
- vins v5, x27, 0 // ....................*....................................................................... - mul v24.8H, v24.8H, v6.H[0] // .....................*...................................................................... - vins v9, x29, 0 // .........................*.................................................................. - mls v17.8H, v12.8H, v7.H[0] // ...................*........................................................................ - vins v5, x19, 1 // ......................*..................................................................... - vins v9, x18, 1 // .............................*.............................................................. - ldr x27, [x4] , #96 // ...........................*................................................................ - ldr x18, [x4, #-88] // ...............................*............................................................ - // gap // ............................................................................................ - mls v24.8H, v4.8H, v7.H[0] // ..........................*................................................................. - ldr x29, [x4, #-80] // .................................*.......................................................... - sub v12.8H, v5.8H, v17.8H // .......................*.................................................................... - ldr x9, [x4, #-72] // ...................................*........................................................ - add v17.8H, v5.8H, v17.8H // ........................*................................................................... - ldr x19, [x4, #-64] // ....................................*....................................................... - vins v4, x27, 0 // ...........................................*................................................ 
- ldr x27, [x4, #-48] // ......................................*..................................................... - mul v5.8H, v12.8H, v6.H[4] // ............................*............................................................... - ldr x8, [x4, #-8] // ........................................*................................................... - sqrdmulh v12.8H, v12.8H, v6.H[5] // ..............................*............................................................. - vins v4, x18, 1 // ...............................................*............................................ - mul v14.8H, v17.8H, v6.H[2] // ................................*........................................................... - vins v25, x19, 0 // .............................................*.............................................. - sqrdmulh v6.8H, v17.8H, v6.H[3] // ..................................*......................................................... - ldr x18, [x4, #-56] // .................................................*.......................................... - sub v17.8H, v9.8H, v24.8H // .......................................*.................................................... - ldr x19, [x4, #-24] // ...................................................*........................................ - mls v5.8H, v12.8H, v7.H[0] // .....................................*...................................................... - vins v12, x29, 0 // ........................................................*................................... - add v24.8H, v9.8H, v24.8H // ..........................................*................................................. - ldr x29, [x4, #-40] // ..............................................................*............................. - mls v14.8H, v6.8H, v7.H[0] // .........................................*.................................................. 
- vins v25, x18, 1 // ......................................................*..................................... - vins v12, x9, 1 // ..........................................................*................................. - vins v6, x27, 0 // ................................................................*........................... - add v9.8H, v17.8H, v5.8H // ..............................................*............................................. - ldr x27, [x4, #-32] // ....................................................................*....................... - sub v17.8H, v17.8H, v5.8H // ............................................*............................................... - ldr x24, [x4, #-16] // ......................................................................*..................... - sub v5.8H, v24.8H, v14.8H // ................................................*........................................... - vins v6, x29, 1 // ........................................................................*................... - add v24.8H, v24.8H, v14.8H // ..................................................*......................................... - // gap // ............................................................................................ - trn2 v14.4S, v9.4S, v17.4S // ....................................................*....................................... - vins v28, x27, 0 // ...........................................................................*................ - trn1 v17.4S, v9.4S, v17.4S // .......................................................*.................................... - vins v9, x24, 0 // .............................................................................*.............. - trn2 v27.4S, v24.4S, v5.4S // .....................................................*...................................... 
- vins v28, x19, 1 // ...............................................................................*............ - trn1 v24.4S, v24.4S, v5.4S // ...........................................................*................................ - vins v9, x8, 1 // .................................................................................*.......... - // gap // ............................................................................................ - // gap // ............................................................................................ - trn2 v5.2D, v27.2D, v14.2D // .........................................................*.................................. - // gap // ............................................................................................ - trn2 v16.2D, v24.2D, v17.2D // ...............................................................*............................ - // gap // ............................................................................................ - mul v15.8H, v5.8H, v4.8H // ............................................................*............................... - // gap // ............................................................................................ - sqrdmulh v5.8H, v5.8H, v12.8H // .............................................................*.............................. - // gap // ............................................................................................ - trn1 v14.2D, v27.2D, v14.2D // .................................................................*.......................... - // gap // ............................................................................................ - mul v4.8H, v16.8H, v4.8H // ...................................................................*........................ - // gap // ............................................................................................ 
- sqrdmulh v12.8H, v16.8H, v12.8H // .....................................................................*...................... - // gap // ............................................................................................ - mls v15.8H, v5.8H, v7.H[0] // ..................................................................*......................... - // gap // ............................................................................................ - trn1 v24.2D, v24.2D, v17.2D // .......................................................................*.................... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mls v4.8H, v12.8H, v7.H[0] // ............................................................................*............... - // gap // ............................................................................................ - add v12.8H, v14.8H, v15.8H // .........................................................................*.................. - // gap // ............................................................................................ - sub v17.8H, v14.8H, v15.8H // ..........................................................................*................. - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ - mul v5.8H, v12.8H, v25.8H // ..............................................................................*............. 
- // gap // ............................................................................................ - sqrdmulh v12.8H, v12.8H, v6.8H // ................................................................................*........... - // gap // ............................................................................................ - mul v6.8H, v17.8H, v28.8H // ..................................................................................*......... - // gap // ............................................................................................ - sqrdmulh v17.8H, v17.8H, v9.8H // ...................................................................................*........ - // gap // ............................................................................................ - add v9.8H, v24.8H, v4.8H // .....................................................................................*...... - // gap // ............................................................................................ - mls v5.8H, v12.8H, v7.H[0] // ....................................................................................*....... - // gap // ............................................................................................ - sub v12.8H, v24.8H, v4.8H // .......................................................................................*.... - // gap // ............................................................................................ - mls v6.8H, v17.8H, v7.H[0] // ......................................................................................*..... - // gap // ............................................................................................ - // gap // ............................................................................................ - // gap // ............................................................................................ 
- add v18.8H, v9.8H, v5.8H // ........................................................................................*... - // gap // ............................................................................................ - sub v30.8H, v9.8H, v5.8H // .........................................................................................*.. - // gap // ............................................................................................ - add v29.8H, v12.8H, v6.8H // ..........................................................................................*. - // gap // ............................................................................................ - sub v31.8H, v12.8H, v6.8H // ...........................................................................................* - // gap // ............................................................................................ - - // original source code - // ldr x25, [x1, #48] // *........................................................................................... || *................................................................. - // ldr x21, [x3] , #16 // .*.......................................................................................... || .*................................................................ - // ldr x28, [x3, #-8] // ..*......................................................................................... || ..*............................................................... - // ldr x18, [x1, #56] // ...*........................................................................................ || ...*.............................................................. - // ldr x19, [x1, #32] // .....*...................................................................................... || ....*............................................................. 
- // vins v6, x21, 0 // ......*..................................................................................... || .....*............................................................ - // vins v29, x25, 0 // ....*....................................................................................... || ....*............................................................. - // vins v6, x28, 1 // .........*.................................................................................. || .......*.......................................................... - // vins v29, x18, 1 // ..........*................................................................................. || .......*.......................................................... - // ldr x23, [x1, #40] // .......*.................................................................................... || .....*............................................................ - // ldr x27, [x1, #24] // ........*................................................................................... || ......*........................................................... - // mul v19.8H, v29.8H, v6.H[0] // .............*.............................................................................. || .........*........................................................ - // vins v1, x19, 0 // ...........*................................................................................ || ........*......................................................... - // sqrdmulh v25.8H, v29.8H, v6.H[1] // ...............*............................................................................ || ..........*....................................................... - // ldr x28, [x1, #16] // ............*............................................................................... || ........*......................................................... 
- // vins v1, x23, 1 // ................*........................................................................... || ..........*....................................................... - // ldr x23, [x1, #0] // ..............*............................................................................. || .........*........................................................ - // sqrdmulh v13.8H, v1.8H, v6.H[1] // ..................*......................................................................... || ............*..................................................... - // ldr x29, [x1, #8] // .................*.......................................................................... || ...........*...................................................... - // mls v19.8H, v25.8H, v7.H[0] // ......................*..................................................................... || ..............*................................................... - // vins v16, x28, 0 // ...................*........................................................................ || ............*..................................................... - // mul v2.8H, v1.8H, v6.H[0] // ....................*....................................................................... || .............*.................................................... - // vins v16, x27, 1 // .......................*.................................................................... || ..............*................................................... - // sub v31.8H, v16.8H, v19.8H // .............................*.............................................................. || ..................*............................................... - // add v19.8H, v16.8H, v19.8H // ...............................*............................................................ || ...................*.............................................. 
- // vins v8, x23, 0 // .....................*...................................................................... || .............*.................................................... - // mls v2.8H, v13.8H, v7.H[0] // ...........................*................................................................ || .................*................................................ - // ldr x21, [x4] , #96 // .........................*.................................................................. || ...............*.................................................. - // mul v29.8H, v31.8H, v6.H[4] // ...................................*........................................................ || .....................*............................................ - // vins v8, x29, 1 // ........................*................................................................... || ...............*.................................................. - // sqrdmulh v30.8H, v31.8H, v6.H[5] // .....................................*...................................................... || ......................*........................................... - // ldr x28, [x4, #-88] // ..........................*................................................................. || ................*................................................. - // mul v31.8H, v19.8H, v6.H[2] // .......................................*.................................................... || .......................*.......................................... - // ldr x23, [x4, #-80] // ............................*............................................................... || .................*................................................ - // sqrdmulh v16.8H, v19.8H, v6.H[3] // .........................................*.................................................. || ........................*......................................... 
- // ldr x26, [x4, #-72] // ..............................*............................................................. || ..................*............................................... - // ldr x10, [x4, #-64] // ................................*........................................................... || ...................*.............................................. - // mls v29.8H, v30.8H, v7.H[0] // .............................................*.............................................. || ..........................*....................................... - // ldr x24, [x4, #-48] // ..................................*......................................................... || ....................*............................................. - // sub v30.8H, v8.8H, v2.8H // ...........................................*................................................ || .........................*........................................ - // ldr x8, [x4, #-8] // ....................................*....................................................... || .....................*............................................ - // mls v31.8H, v16.8H, v7.H[0] // .................................................*.......................................... || ............................*..................................... - // add v24.8H, v8.8H, v2.8H // ...............................................*............................................ || ...........................*...................................... - // vins v12, x21, 0 // .................................*.......................................................... || ....................*............................................. - // sub v2.8H, v30.8H, v29.8H // .......................................................*.................................... || ...............................*.................................. 
- // vins v18, x10, 0 // ........................................*................................................... || .......................*.......................................... - // add v30.8H, v30.8H, v29.8H // .....................................................*...................................... || ..............................*................................... - // vins v12, x28, 1 // ......................................*..................................................... || ......................*........................................... - // sub v13.8H, v24.8H, v31.8H // .........................................................*.................................. || ................................*................................. - // ldr x28, [x4, #-56] // ..........................................*................................................. || ........................*......................................... - // add v25.8H, v24.8H, v31.8H // ...........................................................*................................ || .................................*................................ - // ldr x10, [x4, #-24] // ............................................*............................................... || .........................*........................................ - // trn2 v24.4S, v30.4S, v2.4S // ............................................................*............................... || ..................................*............................... - // trn2 v22.4S, v25.4S, v13.4S // ................................................................*........................... || ....................................*............................. - // vins v18, x28, 1 // ..................................................*......................................... || ............................*..................................... 
- // trn1 v6.4S, v30.4S, v2.4S // ..............................................................*............................. || ...................................*.............................. - // vins v30, x23, 0 // ..............................................*............................................. || ..........................*....................................... - // trn2 v0.2D, v22.2D, v24.2D // ....................................................................*....................... || .......................................*.......................... - // vins v30, x26, 1 // ...................................................*........................................ || .............................*.................................... - // trn1 v17.4S, v25.4S, v13.4S // ..................................................................*......................... || .....................................*............................ - // mul v31.8H, v0.8H, v12.8H // ......................................................................*..................... || .........................................*........................ - // sqrdmulh v2.8H, v0.8H, v30.8H // .......................................................................*.................... || ..........................................*....................... - // ldr x23, [x4, #-40] // ................................................*........................................... || ...........................*...................................... - // trn2 v15.2D, v17.2D, v6.2D // .....................................................................*...................... || ........................................*......................... - // vins v11, x24, 0 // ....................................................*....................................... || .............................*.................................... 
- // trn1 v4.2D, v22.2D, v24.2D // ........................................................................*................... || ...........................................*...................... - // mls v31.8H, v2.8H, v7.H[0] // ...........................................................................*................ || ..............................................*................... - // mul v10.8H, v15.8H, v12.8H // .........................................................................*.................. || ............................................*..................... - // ldr x28, [x4, #-32] // ......................................................*..................................... || ..............................*................................... - // sqrdmulh v2.8H, v15.8H, v30.8H // ..........................................................................*................. || .............................................*.................... - // ldr x24, [x4, #-16] // ........................................................*................................... || ...............................*.................................. - // trn1 v19.2D, v17.2D, v6.2D // ............................................................................*............... || ...............................................*.................. - // vins v11, x23, 1 // ..........................................................*................................. || ................................*................................. - // add v30.8H, v4.8H, v31.8H // ..............................................................................*............. || ..................................................*............... - // sub v24.8H, v4.8H, v31.8H // ...............................................................................*............ || ...................................................*.............. 
- // vins v31, x28, 0 // .............................................................*.............................. || ..................................*............................... - // mls v10.8H, v2.8H, v7.H[0] // .............................................................................*.............. || .................................................*................ - // vins v2, x24, 0 // ...............................................................*............................ || ...................................*.............................. - // mul v12.8H, v30.8H, v18.8H // ................................................................................*........... || .....................................................*............ - // vins v31, x10, 1 // .................................................................*.......................... || ....................................*............................. - // sqrdmulh v8.8H, v30.8H, v11.8H // .................................................................................*.......... || ......................................................*........... - // vins v2, x8, 1 // ...................................................................*........................ || .....................................*............................ - // mul v31.8H, v24.8H, v31.8H // ..................................................................................*......... || .......................................................*.......... - // sqrdmulh v15.8H, v24.8H, v2.8H // ...................................................................................*........ || ........................................................*......... - // mls v12.8H, v8.8H, v7.H[0] // .....................................................................................*...... || ..........................................................*....... 
- // add v30.8H, v19.8H, v10.8H // ....................................................................................*....... || .........................................................*........ - // mls v31.8H, v15.8H, v7.H[0] // .......................................................................................*.... || ............................................................*..... - // sub v5.8H, v19.8H, v10.8H // ......................................................................................*..... || ...........................................................*...... - // add v18.8H, v30.8H, v12.8H // ........................................................................................*... || ..............................................................*... - // sub v30.8H, v30.8H, v12.8H // .........................................................................................*.. || ...............................................................*.. - // add v29.8H, v5.8H, v31.8H // ..........................................................................................*. || ................................................................*. - // sub v31.8H, v5.8H, v31.8H // ...........................................................................................* || .................................................................* - + // Instructions: 75 + // Expected cycles: 84 + // Expected IPC: 0.89 + // + // Cycle bound: 84.0 + // IPC bound: 0.89 + // + // Wall time: 19.81s + // User time: 19.81s + // + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q21, [x1, #48] // *.......................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ // gap // ........................................................................... + ldr q10, [x3], #16 // .*......................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + ldr q26, [x1, #32] // ..*........................................................................ + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v3.8H, v21.8H, v10.H[0] // ...*....................................................................... + // gap // ........................................................................... + sqrdmulh v27.8H, v21.8H, v10.H[1] // ....*...................................................................... + // gap // ........................................................................... + ldr q15, [x1, #16] // ......*.................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + mul v12.8H, v26.8H, v10.H[0] // .....*..................................................................... + // gap // ........................................................................... + mls v3.8H, v27.8H, v7.H[0] // .......*................................................................... + // gap // ........................................................................... 
+ sqrdmulh v4.8H, v26.8H, v10.H[1] // ........*.................................................................. + // gap // ........................................................................... + ldr q26, [x1, #0] // .........*................................................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v27.8H, v15.8H, v3.8H // ..........*................................................................ + // gap // ........................................................................... + add v30.8H, v15.8H, v3.8H // ...........*............................................................... + // gap // ........................................................................... + mls v12.8H, v4.8H, v7.H[0] // ............*.............................................................. + // gap // ........................................................................... + mul v31.8H, v27.8H, v10.H[4] // .............*............................................................. + // gap // ........................................................................... + sqrdmulh v27.8H, v27.8H, v10.H[5] // ..............*............................................................ + // gap // ........................................................................... + sqrdmulh v28.8H, v30.8H, v10.H[3] // ...............*........................................................... + // gap // ........................................................................... + mul v25.8H, v30.8H, v10.H[2] // ................*.......................................................... + // gap // ........................................................................... 
+ sub v18.8H, v26.8H, v12.8H // ...................*....................................................... + // gap // ........................................................................... + mls v31.8H, v27.8H, v7.H[0] // ..................*........................................................ + // gap // ........................................................................... + add v15.8H, v26.8H, v12.8H // .................*......................................................... + // gap // ........................................................................... + mls v25.8H, v28.8H, v7.H[0] // ....................*...................................................... + // gap // ........................................................................... + ldr q12, [x4, #16] // .............................*............................................. + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v22.8H, v18.8H, v31.8H // .......................*................................................... + // gap // ........................................................................... + sub v13.8H, v18.8H, v31.8H // ......................*.................................................... + // gap // ........................................................................... + add v31.8H, v15.8H, v25.8H // .........................*................................................. + // gap // ........................................................................... + sub v28.8H, v15.8H, v25.8H // ........................*.................................................. + // gap // ........................................................................... 
+ trn2 v16.4S, v22.4S, v13.4S // ...........................*............................................... + // gap // ........................................................................... + ldr q0, [x4], #(6*16) // .....................*..................................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + trn2 v21.4S, v31.4S, v28.4S // ............................*.............................................. + // gap // ........................................................................... + trn1 v26.4S, v31.4S, v28.4S // ...............................*........................................... + // gap // ........................................................................... + trn1 v3.4S, v22.4S, v13.4S // ..........................*................................................ + // gap // ........................................................................... + trn2 v31.2D, v21.2D, v16.2D // ..............................*............................................ + // gap // ........................................................................... + trn1 v8.2D, v21.2D, v16.2D // ...................................*....................................... + // gap // ........................................................................... + mul v17.8H, v31.8H, v0.8H // .................................*......................................... + // gap // ........................................................................... + sqrdmulh v4.8H, v31.8H, v12.8H // ................................*.......................................... + // gap // ........................................................................... 
+ trn2 v28.2D, v26.2D, v3.2D // .....................................*..................................... + // gap // ........................................................................... + trn1 v5.2D, v26.2D, v3.2D // ..............................................*............................ + // gap // ........................................................................... + mul v19.8H, v28.8H, v0.8H // .........................................*................................. + // gap // ........................................................................... + mls v17.8H, v4.8H, v7.H[0] // ....................................*...................................... + // gap // ........................................................................... + sqrdmulh v26.8H, v28.8H, v12.8H // ........................................*.................................. + // gap // ........................................................................... + ldr q0, [x4, #-32] // ..................................*........................................ + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v1.8H, v8.8H, v17.8H // .......................................*................................... + // gap // ........................................................................... + ldr q23, [x4, #-16] // ......................................*.................................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... 
+ mls v19.8H, v26.8H, v7.H[0] // .............................................*............................. + // gap // ........................................................................... + mul v31.8H, v1.8H, v0.8H // ...........................................*............................... + // gap // ........................................................................... + sqrdmulh v12.8H, v1.8H, v23.8H // ..........................................*................................ + // gap // ........................................................................... + ldr q27, [x4, #-48] // ....................................................*...................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + sub v3.8H, v5.8H, v19.8H // ................................................*.......................... + // gap // ........................................................................... + mls v31.8H, v12.8H, v7.H[0] // ...............................................*........................... + // gap // ........................................................................... + ldr q28, [x4, #-64] // .................................................*......................... + // gap // ........................................................................... + // gap // ........................................................................... + // gap // ........................................................................... + add v23.8H, v8.8H, v17.8H // ............................................*.............................. + // gap // ........................................................................... 
+ add v10.8H, v3.8H, v31.8H // ...................................................*....................... + // gap // ........................................................................... + sub v31.8H, v3.8H, v31.8H // ..................................................*........................ + // gap // ........................................................................... + sqrdmulh v27.8H, v23.8H, v27.8H // ..........................................................*................ + // gap // ........................................................................... + mul v23.8H, v23.8H, v28.8H // .........................................................*................. + // gap // ........................................................................... + trn1 v16.4S, v10.4S, v31.4S // ......................................................*.................... + // gap // ........................................................................... + trn2 v9.4S, v10.4S, v31.4S // .....................................................*..................... + // gap // ........................................................................... + sqdmulh v20.8H, v16.8H, v7.H[1] // ........................................................*.................. + // gap // ........................................................................... + sqdmulh v28.8H, v9.8H, v7.H[1] // .......................................................*................... + // gap // ........................................................................... + mls v23.8H, v27.8H, v7.H[0] // ...............................................................*........... + // gap // ........................................................................... + add v26.8H, v5.8H, v19.8H // ................................................................*.......... + // gap // ........................................................................... 
+ srshr v5.8H, v20.8H, #11 // ............................................................*.............. + // gap // ........................................................................... + srshr v21.8H, v28.8H, #11 // ...........................................................*............... + // gap // ........................................................................... + sub v25.8H, v26.8H, v23.8H // .................................................................*......... + // gap // ........................................................................... + add v19.8H, v26.8H, v23.8H // ..................................................................*........ + // gap // ........................................................................... + mls v16.8H, v5.8H, v7.H[0] // ..............................................................*............ + // gap // ........................................................................... + mls v9.8H, v21.8H, v7.H[0] // .............................................................*............. + // gap // ........................................................................... + trn2 v29.4S, v19.4S, v25.4S // .......................................................................*... + // gap // ........................................................................... + trn1 v26.4S, v19.4S, v25.4S // .....................................................................*..... + // gap // ........................................................................... + sqdmulh v11.8H, v29.8H, v7.H[1] // ..........................................................................* + // gap // ........................................................................... + umov x14, v9.d[0] // .........................................................................*. + umov x19, v9.d[1] // ....................................................................*...... 
+ umov x29, v16.d[1] // ........................................................................*.. + umov x23, v16.d[0] // ...................................................................*....... + sqdmulh v6.8H, v26.8H, v7.H[1] // ......................................................................*.... + // gap // ........................................................................... + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q31, [x1, #48] // *.......................................................................... + // ldr q8, [x3], #16 // .*......................................................................... + // ldr q13, [x1, #32] // ..*........................................................................ + // mul v25.8H, v31.8H, v8.H[0] // ...*....................................................................... + // sqrdmulh v14.8H, v31.8H, v8.H[1] // ....*...................................................................... + // mul v15.8H, v13.8H, v8.H[0] // ......*.................................................................... + // ldr q18, [x1, #16] // .....*..................................................................... + // mls v25.8H, v14.8H, v7.H[0] // .......*................................................................... + // sqrdmulh v16.8H, v13.8H, v8.H[1] // ........*.................................................................. + // ldr q19, [x1, #0] // .........*................................................................. + // sub v31.8H, v18.8H, v25.8H // ..........*................................................................ + // add v18.8H, v18.8H, v25.8H // ...........*............................................................... + // mls v15.8H, v16.8H, v7.H[0] // ............*.............................................................. 
+ // mul v2.8H, v31.8H, v8.H[4] // .............*............................................................. + // sqrdmulh v31.8H, v31.8H, v8.H[5] // ..............*............................................................ + // sqrdmulh v16.8H, v18.8H, v8.H[3] // ...............*........................................................... + // mul v18.8H, v18.8H, v8.H[2] // ................*.......................................................... + // add v24.8H, v19.8H, v15.8H // ...................*....................................................... + // mls v2.8H, v31.8H, v7.H[0] // ..................*........................................................ + // sub v25.8H, v19.8H, v15.8H // .................*......................................................... + // mls v18.8H, v16.8H, v7.H[0] // ....................*...................................................... + // ldr q31, [x4], #(6*16) // ...........................*............................................... + // sub v5.8H, v25.8H, v2.8H // .......................*................................................... + // add v13.8H, v25.8H, v2.8H // ......................*.................................................... + // sub v16.8H, v24.8H, v18.8H // .........................*................................................. + // add v14.8H, v24.8H, v18.8H // ........................*.................................................. + // trn1 v10.4S, v13.4S, v5.4S // ..............................*............................................ + // trn2 v30.4S, v13.4S, v5.4S // ..........................*................................................ + // trn2 v18.4S, v14.4S, v16.4S // ............................*.............................................. + // ldr q8, [x4, #-80] // .....................*..................................................... + // trn2 v20.2D, v18.2D, v30.2D // ...............................*........................................... 
+ // trn1 v25.4S, v14.4S, v16.4S // .............................*............................................. + // sqrdmulh v19.8H, v20.8H, v8.8H // ..................................*........................................ + // mul v12.8H, v20.8H, v31.8H // .................................*......................................... + // ldr q27, [x4, #-32] // ........................................*.................................. + // trn1 v18.2D, v18.2D, v30.2D // ................................*.......................................... + // mls v12.8H, v19.8H, v7.H[0] // ......................................*.................................... + // trn2 v19.2D, v25.2D, v10.2D // ...................................*....................................... + // ldr q30, [x4, #-16] // ..........................................*................................ + // sub v20.8H, v18.8H, v12.8H // .........................................*................................. + // sqrdmulh v14.8H, v19.8H, v8.8H // .......................................*................................... + // mul v8.8H, v19.8H, v31.8H // .....................................*..................................... + // sqrdmulh v15.8H, v20.8H, v30.8H // .............................................*............................. + // mul v31.8H, v20.8H, v27.8H // ............................................*.............................. + // add v12.8H, v18.8H, v12.8H // ..................................................*........................ + // mls v8.8H, v14.8H, v7.H[0] // ...........................................*............................... + // trn1 v30.2D, v25.2D, v10.2D // ....................................*...................................... + // mls v31.8H, v15.8H, v7.H[0] // ................................................*.......................... + // sub v6.8H, v30.8H, v8.8H // ...............................................*........................... 
+ // ldr q4, [x4, #-64] // .................................................*......................... + // sub v25.8H, v6.8H, v31.8H // ....................................................*...................... + // add v6.8H, v6.8H, v31.8H // ...................................................*....................... + // ldr q18, [x4, #-48] // ..............................................*............................ + // trn2 v31.4S, v6.4S, v25.4S // ........................................................*.................. + // trn1 v25.4S, v6.4S, v25.4S // .......................................................*................... + // sqdmulh v19.8H, v31.8H, v7.H[1] // ..........................................................*................ + // sqdmulh v16.8H, v25.8H, v7.H[1] // .........................................................*................. + // mul v20.8H, v12.8H, v4.8H // ......................................................*.................... + // sqrdmulh v12.8H, v12.8H, v18.8H // .....................................................*..................... + // srshr v18.8H, v19.8H, #11 // ..............................................................*............ + // srshr v16.8H, v16.8H, #11 // .............................................................*............. + // mls v31.8H, v18.8H, v7.H[0] // ..................................................................*........ + // mls v25.8H, v16.8H, v7.H[0] // .................................................................*......... + // mls v20.8H, v12.8H, v7.H[0] // ...........................................................*............... + // add v8.8H, v30.8H, v8.8H // ............................................................*.............. + // sub v17.8H, v8.8H, v20.8H // ...............................................................*........... + // add v18.8H, v8.8H, v20.8H // ................................................................*.......... 
+ // umov x23, v25.d[0] // .........................................................................*. + // umov x19, v31.d[1] // .......................................................................*... + // trn1 v26.4S, v18.4S, v17.4S // ....................................................................*...... + // sqdmulh v6.8H, v26.8H, v7.H[1] // ..........................................................................* + // trn2 v29.4S, v18.4S, v17.4S // ...................................................................*....... + // umov x29, v25.d[1] // ........................................................................*.. + // umov x14, v31.d[0] // ......................................................................*.... + // sqdmulh v11.8H, v29.8H, v7.H[1] // .....................................................................*..... + sub count, count, #1 -.p2align 2 layer4567_start: - trn2 v10.4S, v18.4S, v30.4S // .............................................................................................*.............................. - ldr x25, [x1, #112] // ............e............................................................................................................... - trn1 v2.4S, v18.4S, v30.4S // ............................................................................................*............................... - ldr x21, [x3] , #16 // ................e........................................................................................................... - sqdmulh v12.8H, v10.8H, v7.H[1] // ...................................................................................................*........................ - ldr x28, [x3, #-8] // .................e.......................................................................................................... 
- trn1 v20.4S, v29.4S, v31.4S // ..............................................................................................*............................. - ldr x18, [x1, #120] // .............e.............................................................................................................. - trn2 v11.4S, v29.4S, v31.4S // ...............................................................................................*............................ - ldr x19, [x1, #96] // ........e................................................................................................................... - sqdmulh v19.8H, v2.8H, v7.H[1] // ................................................................................................*........................... - vins v6, x21, 0 // ..................e......................................................................................................... - srshr v31.8H, v12.8H, #11 // ....................................................................................................*....................... - vins v29, x25, 0 // ..............e............................................................................................................. - sqdmulh v18.8H, v11.8H, v7.H[1] // .........................................................................................................*.................. - vins v6, x28, 1 // ...................e........................................................................................................ - sqdmulh v30.8H, v20.8H, v7.H[1] // ......................................................................................................*..................... - vins v29, x18, 1 // ...............e............................................................................................................ - mls v10.8H, v31.8H, v7.H[0] // .....................................................................................................*...................... 
- ldr x23, [x1, #104] // .........e.................................................................................................................. - srshr v31.8H, v19.8H, #11 // .................................................................................................*.......................... - ldr x27, [x1, #88] // .....e...................................................................................................................... - mul v19.8H, v29.8H, v6.H[0] // .........................e.................................................................................................. - vins v1, x19, 0 // ..........e................................................................................................................. - sqrdmulh v25.8H, v29.8H, v6.H[1] // ..........................e................................................................................................. - ldr x28, [x1, #80] // ....e....................................................................................................................... - mls v2.8H, v31.8H, v7.H[0] // ..................................................................................................*......................... - vins v1, x23, 1 // ...........e................................................................................................................ - srshr v31.8H, v30.8H, #11 // .......................................................................................................*.................... - ldr x23, [x1, #64] // e........................................................................................................................... - sqrdmulh v13.8H, v1.8H, v6.H[1] // .....................e...................................................................................................... - ldr x29, [x1, #72] // .e.......................................................................................................................... 
- mls v19.8H, v25.8H, v7.H[0] // ...........................e................................................................................................ - vins v16, x28, 0 // ......e..................................................................................................................... - vext x24, v2, 0 // ............................................................................................................*............... - vext x8, v2, 1 // .............................................................................................................*.............. - mul v2.8H, v1.8H, v6.H[0] // ....................e....................................................................................................... - vins v16, x27, 1 // .......e.................................................................................................................... - mls v20.8H, v31.8H, v7.H[0] // ........................................................................................................*................... - str x24, [x1] , #64 // ....................................................................................................................*....... - sub v31.8H, v16.8H, v19.8H // ............................e............................................................................................... - str x8, [x1, #-32] // ........................................................................................................................*... - add v19.8H, v16.8H, v19.8H // .............................e.............................................................................................. - vins v8, x23, 0 // ..e......................................................................................................................... - mls v2.8H, v13.8H, v7.H[0] // ......................e..................................................................................................... 
- ldr x21, [x4] , #96 // ................................................e........................................................................... - mul v29.8H, v31.8H, v6.H[4] // ...................................e........................................................................................ - vins v8, x29, 1 // ...e........................................................................................................................ - sqrdmulh v30.8H, v31.8H, v6.H[5] // ....................................e....................................................................................... - ldr x28, [x4, #-88] // .................................................e.......................................................................... - mul v31.8H, v19.8H, v6.H[2] // ..............................e............................................................................................. - ldr x23, [x4, #-80] // ....................................................e....................................................................... - sqrdmulh v16.8H, v19.8H, v6.H[3] // ...............................e............................................................................................ - ldr x26, [x4, #-72] // .....................................................e...................................................................... - srshr v27.8H, v18.8H, #11 // ..........................................................................................................*................. - ldr x10, [x4, #-64] // ........................................................e................................................................... - mls v29.8H, v30.8H, v7.H[0] // .....................................e...................................................................................... 
- ldr x24, [x4, #-48] // ............................................................e............................................................... - sub v30.8H, v8.8H, v2.8H // .......................e.................................................................................................... - ldr x8, [x4, #-8] // .....................................................................e...................................................... - mls v31.8H, v16.8H, v7.H[0] // ................................e........................................................................................... - // gap // ............................................................................................................................ - add v24.8H, v8.8H, v2.8H // ........................e................................................................................................... - vins v12, x21, 0 // ..................................................e......................................................................... - sub v2.8H, v30.8H, v29.8H // ......................................e..................................................................................... - vins v18, x10, 0 // ..........................................................e................................................................. - add v30.8H, v30.8H, v29.8H // .......................................e.................................................................................... - vins v12, x28, 1 // ...................................................e........................................................................ - sub v13.8H, v24.8H, v31.8H // .................................e.......................................................................................... - ldr x28, [x4, #-56] // .........................................................e.................................................................. 
- add v25.8H, v24.8H, v31.8H // ..................................e......................................................................................... - ldr x10, [x4, #-24] // .................................................................e.......................................................... - mls v11.8H, v27.8H, v7.H[0] // ...........................................................................................................*................ - // gap // ............................................................................................................................ - trn2 v24.4S, v30.4S, v2.4S // ...........................................e................................................................................ - // gap // ............................................................................................................................ - trn2 v22.4S, v25.4S, v13.4S // .........................................e.................................................................................. - vins v18, x28, 1 // ...........................................................e................................................................ - trn1 v6.4S, v30.4S, v2.4S // ..........................................e................................................................................. - vins v30, x23, 0 // ......................................................e..................................................................... - vext x16, v11, 0 // ..................................................................................................................*......... - vext x12, v10, 0 // ..............................................................................................................*............. - trn2 v0.2D, v22.2D, v24.2D // .............................................e.............................................................................. 
- vins v30, x26, 1 // .......................................................e.................................................................... - trn1 v17.4S, v25.4S, v13.4S // ........................................e................................................................................... - str x12, [x1, #-48] // ......................................................................................................................*..... - mul v31.8H, v0.8H, v12.8H // .............................................................................e.............................................. - str x16, [x1, #-40] // .......................................................................................................................*.... - sqrdmulh v2.8H, v0.8H, v30.8H // ..............................................................................e............................................. - ldr x23, [x4, #-40] // .............................................................e.............................................................. - vext x13, v10, 1 // ...............................................................................................................*............ - vext x17, v11, 1 // ...................................................................................................................*........ - trn2 v15.2D, v17.2D, v6.2D // ............................................e............................................................................... - vins v11, x24, 0 // ..............................................................e............................................................. - trn1 v4.2D, v22.2D, v24.2D // ...............................................e............................................................................ - str x13, [x1, #-16] // ..........................................................................................................................*. 
- mls v31.8H, v2.8H, v7.H[0] // ...............................................................................e............................................ - str x17, [x1, #-8] // ...........................................................................................................................* - mul v10.8H, v15.8H, v12.8H // ........................................................................e................................................... - ldr x28, [x4, #-32] // ................................................................e........................................................... - sqrdmulh v2.8H, v15.8H, v30.8H // .........................................................................e.................................................. - ldr x24, [x4, #-16] // ....................................................................e....................................................... - trn1 v19.2D, v17.2D, v6.2D // ..............................................e............................................................................. - vins v11, x23, 1 // ...............................................................e............................................................ - add v30.8H, v4.8H, v31.8H // .................................................................................e.......................................... - // gap // ............................................................................................................................ - sub v24.8H, v4.8H, v31.8H // ................................................................................e........................................... - vins v31, x28, 0 // ..................................................................e......................................................... - mls v10.8H, v2.8H, v7.H[0] // ..........................................................................e................................................. 
- vins v2, x24, 0 // ......................................................................e..................................................... - mul v12.8H, v30.8H, v18.8H // ..................................................................................e......................................... - vins v31, x10, 1 // ...................................................................e........................................................ - sqrdmulh v8.8H, v30.8H, v11.8H // ...................................................................................e........................................ - vins v2, x8, 1 // .......................................................................e.................................................... - mul v31.8H, v24.8H, v31.8H // .......................................................................................e.................................... - // gap // ............................................................................................................................ - sqrdmulh v15.8H, v24.8H, v2.8H // ........................................................................................e................................... - // gap // ............................................................................................................................ - vext x14, v20, 0 // ................................................................................................................*........... - vext x15, v20, 1 // .................................................................................................................*.......... - mls v12.8H, v8.8H, v7.H[0] // ....................................................................................e....................................... - // gap // ............................................................................................................................ 
- add v30.8H, v19.8H, v10.8H // ............................................................................e............................................... - str x14, [x1, #-56] // .....................................................................................................................*...... - mls v31.8H, v15.8H, v7.H[0] // .........................................................................................e.................................. - str x15, [x1, #-24] // .........................................................................................................................*.. - sub v5.8H, v19.8H, v10.8H // ...........................................................................e................................................ - // gap // ............................................................................................................................ - add v18.8H, v30.8H, v12.8H // ......................................................................................e..................................... - // gap // ............................................................................................................................ - sub v30.8H, v30.8H, v12.8H // .....................................................................................e...................................... - // gap // ............................................................................................................................ - add v29.8H, v5.8H, v31.8H // ...........................................................................................e................................ - // gap // ............................................................................................................................ - sub v31.8H, v5.8H, v31.8H // ..........................................................................................e................................. 
- // gap // ............................................................................................................................ - - // original source code - // ldr x10, [x1, #0] // ............................e..................................................................................................................................................................................................................... || ..............e.................................................................................................................... - // ldr x11, [x1, #8] // ..............................e................................................................................................................................................................................................................... || ...............e................................................................................................................... - // vins v8, x10, 0 // ..........................................e....................................................................................................................................................................................................... || .....................e............................................................................................................. - // vins v8, x11, 1 // ..............................................e................................................................................................................................................................................................... || .......................e........................................................................................................... 
- // ldr x10, [x1, #16] // ........................e......................................................................................................................................................................................................................... || ............e...................................................................................................................... - // ldr x11, [x1, #24] // ....................e............................................................................................................................................................................................................................. || ..........e........................................................................................................................ - // vins v9, x10, 0 // ................................e................................................................................................................................................................................................................. || ................e.................................................................................................................. - // vins v9, x11, 1 // ....................................e............................................................................................................................................................................................................. || ..................e................................................................................................................ - // ldr x10, [x1, #32] // ........e......................................................................................................................................................................................................................................... 
|| ....e.............................................................................................................................. - // ldr x11, [x1, #40] // ..................e............................................................................................................................................................................................................................... || .........e......................................................................................................................... - // vins v10, x10, 0 // ......................e........................................................................................................................................................................................................................... || ...........e....................................................................................................................... - // vins v10, x11, 1 // ..........................e....................................................................................................................................................................................................................... || .............e..................................................................................................................... - // ldr x10, [x1, #48] // e................................................................................................................................................................................................................................................. || e.................................................................................................................................. 
- // ldr x11, [x1, #56] // ......e........................................................................................................................................................................................................................................... || ...e............................................................................................................................... - // vins v11, x10, 0 // ............e..................................................................................................................................................................................................................................... || ......e............................................................................................................................ - // vins v11, x11, 1 // ................e................................................................................................................................................................................................................................. || ........e.......................................................................................................................... - // ldr x10, [x3] , #16 // ..e............................................................................................................................................................................................................................................... || .e................................................................................................................................. - // ldr x11, [x3, #-8] // ....e............................................................................................................................................................................................................................................. 
|| ..e................................................................................................................................ - // vins v0, x10, 0 // ..........e....................................................................................................................................................................................................................................... || .....e............................................................................................................................. - // vins v0, x11, 1 // ..............e................................................................................................................................................................................................................................... || .......e........................................................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // ...................................e.............................................................................................................................................................................................................. || ..................e................................................................................................................ - // sqrdmulh v10.8H, v10.8H, v0.H[1] // .............................e.................................................................................................................................................................................................................... || ...............e................................................................................................................... 
- // mls v24.8H, v10.8H, v7.H[0] // ...........................................e...................................................................................................................................................................................................... || ......................e............................................................................................................ - // sub v10.8H, v8.8H, v24.8H // .........................................................e........................................................................................................................................................................................ || .............................e..................................................................................................... - // add v8.8H, v8.8H, v24.8H // ............................................................e..................................................................................................................................................................................... || ...............................e................................................................................................... - // mul v24.8H, v11.8H, v0.H[0] // .....................e............................................................................................................................................................................................................................ || ...........e....................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .......................e.......................................................................................................................................................................................................................... 
|| ............e...................................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...............................e.................................................................................................................................................................................................................. || ................e.................................................................................................................. - // sub v11.8H, v9.8H, v24.8H // .......................................e.......................................................................................................................................................................................................... || ....................e.............................................................................................................. - // add v9.8H, v9.8H, v24.8H // .........................................e........................................................................................................................................................................................................ || .....................e............................................................................................................. - // mul v24.8H, v9.8H, v0.H[2] // .................................................e................................................................................................................................................................................................ || .........................e......................................................................................................... 
- // sqrdmulh v9.8H, v9.8H, v0.H[3] // ...................................................e.............................................................................................................................................................................................. || ..........................e........................................................................................................ - // mls v24.8H, v9.8H, v7.H[0] // ...........................................................e...................................................................................................................................................................................... || ..............................e.................................................................................................... - // sub v9.8H, v8.8H, v24.8H // ..................................................................e............................................................................................................................................................................... || ..................................e................................................................................................ - // add v8.8H, v8.8H, v24.8H // ....................................................................e............................................................................................................................................................................. || ...................................e............................................................................................... - // mul v24.8H, v11.8H, v0.H[4] // .............................................e.................................................................................................................................................................................................... 
|| .......................e........................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ...............................................e.................................................................................................................................................................................................. || ........................e.......................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // .......................................................e.......................................................................................................................................................................................... || ............................e...................................................................................................... - // sub v11.8H, v10.8H, v24.8H // ..............................................................e................................................................................................................................................................................... || ................................e.................................................................................................. - // add v10.8H, v10.8H, v24.8H // ................................................................e................................................................................................................................................................................. || .................................e................................................................................................. 
- // trn1 v25.4S, v8.4S, v9.4S // ................................................................................e................................................................................................................................................................. || ..........................................e........................................................................................ - // trn2 v26.4S, v8.4S, v9.4S // ........................................................................e......................................................................................................................................................................... || ......................................e............................................................................................ - // trn1 v27.4S, v10.4S, v11.4S // ..........................................................................e....................................................................................................................................................................... || .......................................e........................................................................................... - // trn2 v28.4S, v10.4S, v11.4S // .......................................................................e.......................................................................................................................................................................... || .....................................e............................................................................................. - // trn2 v10.2D, v25.2D, v27.2D // ........................................................................................e......................................................................................................................................................... 
|| ..............................................e.................................................................................... - // trn2 v11.2D, v26.2D, v28.2D // ..............................................................................e................................................................................................................................................................... || .........................................e......................................................................................... - // trn1 v8.2D, v25.2D, v27.2D // ..................................................................................................e............................................................................................................................................... || ...................................................e............................................................................... - // trn1 v9.2D, v26.2D, v28.2D // ..........................................................................................e....................................................................................................................................................... || ...............................................e................................................................................... - // ldr x10, [x4] , #96 // ............................................e..................................................................................................................................................................................................... || ......................e............................................................................................................ 
- // ldr x11, [x4, #-88] // ................................................e................................................................................................................................................................................................. || ........................e.......................................................................................................... - // vins v0, x10, 0 // .............................................................e.................................................................................................................................................................................... || ...............................e................................................................................................... - // vins v0, x11, 1 // .................................................................e................................................................................................................................................................................ || .................................e................................................................................................. - // ldr x10, [x4, #-80] // ..................................................e............................................................................................................................................................................................... || .........................e......................................................................................................... - // ldr x11, [x4, #-72] // ....................................................e............................................................................................................................................................................................. 
|| ..........................e........................................................................................................ - // vins v4, x10, 0 // ...........................................................................e...................................................................................................................................................................... || .......................................e........................................................................................... - // vins v4, x11, 1 // ...............................................................................e.................................................................................................................................................................. || .........................................e......................................................................................... - // ldr x10, [x4, #-64] // ......................................................e........................................................................................................................................................................................... || ...........................e....................................................................................................... - // ldr x11, [x4, #-56] // ...................................................................e.............................................................................................................................................................................. || ..................................e................................................................................................ 
- // vins v1, x10, 0 // ...............................................................e.................................................................................................................................................................................. || ................................e.................................................................................................. - // vins v1, x11, 1 // .........................................................................e........................................................................................................................................................................ || ......................................e............................................................................................ - // ldr x10, [x4, #-48] // ........................................................e......................................................................................................................................................................................... || ............................e...................................................................................................... - // ldr x11, [x4, #-40] // .....................................................................................e............................................................................................................................................................ || ............................................e...................................................................................... - // vins v5, x10, 0 // .........................................................................................e........................................................................................................................................................ 
|| ..............................................e.................................................................................... - // vins v5, x11, 1 // ...................................................................................................e.............................................................................................................................................. || ...................................................e............................................................................... - // ldr x10, [x4, #-32] // ...............................................................................................e.................................................................................................................................................. || .................................................e................................................................................. - // ldr x11, [x4, #-24] // .....................................................................e............................................................................................................................................................................ || ...................................e............................................................................................... - // vins v2, x10, 0 // ......................................................................................................e........................................................................................................................................... || .....................................................e............................................................................. 
- // vins v2, x11, 1 // ..........................................................................................................e....................................................................................................................................... || .......................................................e........................................................................... - // ldr x10, [x4, #-16] // .................................................................................................e................................................................................................................................................ || ..................................................e................................................................................ - // ldr x11, [x4, #-8] // ..........................................................e....................................................................................................................................................................................... || .............................e..................................................................................................... - // vins v6, x10, 0 // ........................................................................................................e......................................................................................................................................... || ......................................................e............................................................................ - // vins v6, x11, 1 // ............................................................................................................e..................................................................................................................................... 
|| ........................................................e.......................................................................... - // mul v24.8H, v10.8H, v0.8H // ..............................................................................................e................................................................................................................................................... || .................................................e................................................................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // ................................................................................................e................................................................................................................................................. || ..................................................e................................................................................ - // mls v24.8H, v10.8H, v7.H[0] // .......................................................................................................e.......................................................................................................................................... || ......................................................e............................................................................ - // sub v10.8H, v8.8H, v24.8H // ......................................................................................................................e........................................................................................................................... || ...............................................................e................................................................... 
- // add v8.8H, v8.8H, v24.8H // ..................................................................................................................e............................................................................................................................... || .............................................................e..................................................................... - // mul v24.8H, v11.8H, v0.8H // ..................................................................................e............................................................................................................................................................... || ...........................................e....................................................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // ....................................................................................e............................................................................................................................................................. || ............................................e...................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ............................................................................................e..................................................................................................................................................... || ................................................e.................................................................................. - // sub v11.8H, v9.8H, v24.8H // .....................................................................................................e............................................................................................................................................ 
|| .....................................................e............................................................................. - // add v9.8H, v9.8H, v24.8H // ....................................................................................................e............................................................................................................................................. || ....................................................e.............................................................................. - // mul v24.8H, v9.8H, v1.8H // .........................................................................................................e........................................................................................................................................ || .......................................................e........................................................................... - // sqrdmulh v9.8H, v9.8H, v5.8H // ...........................................................................................................e...................................................................................................................................... || ........................................................e.......................................................................... - // mls v24.8H, v9.8H, v7.H[0] // .................................................................................................................e................................................................................................................................ || ............................................................e...................................................................... 
- // sub v9.8H, v8.8H, v24.8H // ........................................................................................................................e......................................................................................................................... || .................................................................e................................................................. - // add v8.8H, v8.8H, v24.8H // .......................................................................................................................e.......................................................................................................................... || ................................................................e.................................................................. - // mul v24.8H, v11.8H, v2.8H // .............................................................................................................e.................................................................................................................................... || .........................................................e......................................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // ..............................................................................................................e................................................................................................................................... || ..........................................................e........................................................................ - // mls v24.8H, v11.8H, v7.H[0] // ....................................................................................................................e............................................................................................................................. 
|| ..............................................................e.................................................................... - // sub v11.8H, v10.8H, v24.8H // ..........................................................................................................................e....................................................................................................................... || ...................................................................e............................................................... - // add v10.8H, v10.8H, v24.8H // .........................................................................................................................e........................................................................................................................ || ..................................................................e................................................................ - // trn1 v25.4S, v8.4S, v9.4S // .............................................................................................................................*.................................................................................................................... || .....................................................................*............................................................. - // trn2 v26.4S, v8.4S, v9.4S // ...........................................................................................................................*...................................................................................................................... || ....................................................................*.............................................................. 
- // trn1 v27.4S, v10.4S, v11.4S // .................................................................................................................................*................................................................................................................ || .......................................................................*........................................................... - // trn2 v28.4S, v10.4S, v11.4S // ...................................................................................................................................*.............................................................................................................. || ........................................................................*.......................................................... - // sqdmulh v24.8H, v25.8H, v7.H[1] // .....................................................................................................................................*............................................................................................................ || .........................................................................*......................................................... - // srshr v24.8H, v24.8H, #11 // ...............................................................................................................................................*.................................................................................................. || ..............................................................................*.................................................... - // mls v25.8H, v24.8H, v7.H[0] // .....................................................................................................................................................*............................................................................................ 
|| .................................................................................*................................................. - // sqdmulh v24.8H, v26.8H, v7.H[1] // ...............................................................................................................................*.................................................................................................................. || ......................................................................*............................................................ - // srshr v24.8H, v24.8H, #11 // .......................................................................................................................................*.......................................................................................................... || ..........................................................................*........................................................ - // mls v26.8H, v24.8H, v7.H[0] // .............................................................................................................................................*.................................................................................................... || .............................................................................*..................................................... - // sqdmulh v24.8H, v27.8H, v7.H[1] // ...........................................................................................................................................*...................................................................................................... || ............................................................................*...................................................... 
- // srshr v24.8H, v24.8H, #11 // .......................................................................................................................................................*.......................................................................................... || ..................................................................................*................................................ - // mls v27.8H, v24.8H, v7.H[0] // .................................................................................................................................................................*................................................................................ || .......................................................................................*........................................... - // sqdmulh v24.8H, v28.8H, v7.H[1] // .........................................................................................................................................*........................................................................................................ || ...........................................................................*....................................................... - // srshr v24.8H, v24.8H, #11 // .................................................................................................................................................................................*................................................................ || ...............................................................................................*................................... - // mls v28.8H, v24.8H, v7.H[0] // ..................................................................................................................................................................................................*............................................... 
|| ........................................................................................................*.......................... - // vext x10, v25, 0 // .............................................................................................................................................................*.................................................................................... || .....................................................................................*............................................. - // vext x11, v25, 1 // ..............................................................................................................................................................*................................................................................... || .....................................................................................*............................................. - // vext x12, v26, 0 // .........................................................................................................................................................................................................*........................................ || ............................................................................................................*...................... - // vext x13, v26, 1 // ..................................................................................................................................................................................................................*............................... || .................................................................................................................*................. 
- // vext x14, v27, 0 // ...........................................................................................................................................................................................................................................*...... || ...............................................................................................................................*... - // vext x15, v27, 1 // ............................................................................................................................................................................................................................................*..... || ...............................................................................................................................*... - // vext x16, v28, 0 // ........................................................................................................................................................................................................*......................................... || ............................................................................................................*...................... - // vext x17, v28, 1 // ...................................................................................................................................................................................................................*.............................. || .................................................................................................................*................. - // str x10, [x1] , #64 // ..................................................................................................................................................................*............................................................................... 
|| .......................................................................................*........................................... - // str x14, [x1, #-56] // ...............................................................................................................................................................................................................................................*.. || .................................................................................................................................*. - // str x12, [x1, #-48] // .............................................................................................................................................................................................................*.................................... || ..............................................................................................................*.................... - // str x16, [x1, #-40] // ...............................................................................................................................................................................................................*.................................. || ...............................................................................................................*................... - // str x11, [x1, #-32] // ....................................................................................................................................................................*............................................................................. || ........................................................................................*.......................................... 
- // str x15, [x1, #-24] // .................................................................................................................................................................................................................................................* || ..................................................................................................................................* - // str x13, [x1, #-16] // .......................................................................................................................................................................................................................*.......................... || ...................................................................................................................*............... - // str x17, [x1, #-8] // .........................................................................................................................................................................................................................*........................ || ....................................................................................................................*.............. - - subs count, count, #1 + // Instructions: 91 + // Expected cycles: 90 + // Expected IPC: 1.01 + // + // Cycle bound: 90.0 + // IPC bound: 1.01 + // + // Wall time: 62.43s + // User time: 62.43s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + ldr q31, [x1, #112] // ...e....................................................................................... + str x19, [x1, #56] // ..........................................................................................* + str x29, [x1, #40] // ........................................................................................*.. 
+ // gap // ........................................................................................... + ldr q8, [x3], #16 // ....e...................................................................................... + str x14, [x1, #24] // ......................................................................................*.... + str x23, [x1, #8] // ....................................................................................*...... + // gap // ........................................................................................... + ldr q13, [x1, #96] // ..e........................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v25.8H, v31.8H, v8.H[0] // ...........e............................................................................... + // gap // ........................................................................................... + sqrdmulh v14.8H, v31.8H, v8.H[1] // ..........e................................................................................ + // gap // ........................................................................................... + mul v15.8H, v13.8H, v8.H[0] // ......e.................................................................................... + // gap // ........................................................................................... + ldr q18, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + mls v25.8H, v14.8H, v7.H[0] // ............e.............................................................................. + // gap // ........................................................................................... + sqrdmulh v16.8H, v13.8H, v8.H[1] // .....e..................................................................................... + // gap // ........................................................................................... + ldr q19, [x1, #64] // e.......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v31.8H, v18.8H, v25.8H // .............e............................................................................. + // gap // ........................................................................................... + add v18.8H, v18.8H, v25.8H // ..............e............................................................................ + // gap // ........................................................................................... + mls v15.8H, v16.8H, v7.H[0] // .......e................................................................................... + // gap // ........................................................................................... + mul v2.8H, v31.8H, v8.H[4] // .....................e..................................................................... + // gap // ........................................................................................... 
+ sqrdmulh v31.8H, v31.8H, v8.H[5] // ....................e...................................................................... + // gap // ........................................................................................... + sqrdmulh v16.8H, v18.8H, v8.H[3] // ...............e........................................................................... + // gap // ........................................................................................... + mul v18.8H, v18.8H, v8.H[2] // ................e.......................................................................... + // gap // ........................................................................................... + add v24.8H, v19.8H, v15.8H // .........e................................................................................. + // gap // ........................................................................................... + mls v2.8H, v31.8H, v7.H[0] // ......................e.................................................................... + // gap // ........................................................................................... + sub v25.8H, v19.8H, v15.8H // ........e.................................................................................. + // gap // ........................................................................................... + mls v18.8H, v16.8H, v7.H[0] // .................e......................................................................... + // gap // ........................................................................................... + ldr q31, [x4], #(6*16) // .................................e......................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + sub v5.8H, v25.8H, v2.8H // .......................e................................................................... + // gap // ........................................................................................... + add v13.8H, v25.8H, v2.8H // ........................e.................................................................. + // gap // ........................................................................................... + sub v16.8H, v24.8H, v18.8H // ..................e........................................................................ + // gap // ........................................................................................... + add v14.8H, v24.8H, v18.8H // ...................e....................................................................... + // gap // ........................................................................................... + trn1 v10.4S, v13.4S, v5.4S // ...........................e............................................................... + // gap // ........................................................................................... + trn2 v30.4S, v13.4S, v5.4S // ............................e.............................................................. + // gap // ........................................................................................... + trn2 v18.4S, v14.4S, v16.4S // ..........................e................................................................ + // gap // ........................................................................................... + ldr q8, [x4, #-80] // ..................................e........................................................ + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v20.2D, v18.2D, v30.2D // ..............................e............................................................ + // gap // ........................................................................................... + trn1 v25.4S, v14.4S, v16.4S // .........................e................................................................. + // gap // ........................................................................................... + sqrdmulh v19.8H, v20.8H, v8.8H // ............................................e.............................................. + // gap // ........................................................................................... + mul v12.8H, v20.8H, v31.8H // .............................................e............................................. + // gap // ........................................................................................... + ldr q27, [x4, #-32] // .....................................e..................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v18.2D, v18.2D, v30.2D // ................................e.......................................................... + // gap // ........................................................................................... + mls v12.8H, v19.8H, v7.H[0] // ..............................................e............................................ 
+ // gap // ........................................................................................... + trn2 v19.2D, v25.2D, v10.2D // .............................e............................................................. + // gap // ........................................................................................... + ldr q30, [x4, #-16] // ......................................e.................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v20.8H, v18.8H, v12.8H // ...............................................e........................................... + // gap // ........................................................................................... + sqrdmulh v14.8H, v19.8H, v8.8H // .......................................e................................................... + // gap // ........................................................................................... + mul v8.8H, v19.8H, v31.8H // ........................................e.................................................. + // gap // ........................................................................................... + sqrdmulh v15.8H, v20.8H, v30.8H // ......................................................e.................................... + // gap // ........................................................................................... + mul v31.8H, v20.8H, v27.8H // .......................................................e................................... + // gap // ........................................................................................... 
+ add v12.8H, v18.8H, v12.8H // ................................................e.......................................... + // gap // ........................................................................................... + mls v8.8H, v14.8H, v7.H[0] // .........................................e................................................. + // gap // ........................................................................................... + trn1 v30.2D, v25.2D, v10.2D // ...............................e........................................................... + // gap // ........................................................................................... + mls v31.8H, v15.8H, v7.H[0] // ........................................................e.................................. + // gap // ........................................................................................... + srshr v15.8H, v6.8H, #11 // ................................................................*.......................... + // gap // ........................................................................................... + sub v6.8H, v30.8H, v8.8H // ..........................................e................................................ + // gap // ........................................................................................... + ldr q4, [x4, #-64] // ...................................e....................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v25.8H, v6.8H, v31.8H // .........................................................e................................. 
+ // gap // ........................................................................................... + add v6.8H, v6.8H, v31.8H // ..........................................................e................................ + // gap // ........................................................................................... + ldr q18, [x4, #-48] // ....................................e...................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v31.4S, v6.4S, v25.4S // ..............................................................e............................ + // gap // ........................................................................................... + trn1 v25.4S, v6.4S, v25.4S // .............................................................e............................. + // gap // ........................................................................................... + sqdmulh v19.8H, v31.8H, v7.H[1] // ........................................................................e.................. + // gap // ........................................................................................... + sqdmulh v16.8H, v25.8H, v7.H[1] // .....................................................................e..................... + // gap // ........................................................................................... + mul v20.8H, v12.8H, v4.8H // ..................................................e........................................ + // gap // ........................................................................................... 
+ sqrdmulh v12.8H, v12.8H, v18.8H // .................................................e......................................... + // gap // ........................................................................................... + srshr v18.8H, v19.8H, #11 // .........................................................................e................. + // gap // ........................................................................................... + srshr v16.8H, v16.8H, #11 // ......................................................................e.................... + // gap // ........................................................................................... + srshr v6.8H, v11.8H, #11 // ...................................................................*....................... + // gap // ........................................................................................... + mls v31.8H, v18.8H, v7.H[0] // ..........................................................................e................ + // gap // ........................................................................................... + mls v25.8H, v16.8H, v7.H[0] // .......................................................................e................... + // gap // ........................................................................................... + mls v20.8H, v12.8H, v7.H[0] // ...................................................e....................................... + // gap // ........................................................................................... + add v8.8H, v30.8H, v8.8H // ...........................................e............................................... + // gap // ........................................................................................... + mls v26.8H, v15.8H, v7.H[0] // .................................................................*......................... 
+ // gap // ........................................................................................... + mls v29.8H, v6.8H, v7.H[0] // ....................................................................*...................... + // gap // ........................................................................................... + sub v17.8H, v8.8H, v20.8H // ....................................................e...................................... + // gap // ........................................................................................... + add v18.8H, v8.8H, v20.8H // .....................................................e..................................... + // gap // ........................................................................................... + umov x29, v26.d[1] // ............................................................................*.............. + umov x23, v25.d[0] // ...............................................................................e........... + umov x19, v31.d[1] // ..................................................................................e........ + umov x18, v26.d[0] // ...........................................................................*............... + trn1 v26.4S, v18.4S, v17.4S // ...........................................................e............................... + str x29, [x1, #32] // .......................................................................................*... + umov x21, v29.d[1] // ..............................................................................*............ + umov x29, v29.d[0] // .............................................................................*............. + sqdmulh v6.8H, v26.8H, v7.H[1] // ...............................................................e........................... + str x18, [x1], #( 16*4) // ...................................................................................*....... 
+ trn2 v29.4S, v18.4S, v17.4S // ............................................................e.............................. + str x29, [x1, #-48] // .....................................................................................*..... + umov x29, v25.d[1] // ................................................................................e.......... + umov x14, v31.d[0] // .................................................................................e......... + sqdmulh v11.8H, v29.8H, v7.H[1] // ..................................................................e........................ + str x21, [x1, #-16] // .........................................................................................*. + + // ----------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q8, [x1, #(16*0)] // .............e.............................................................................'............~............................................................................. + // ldr q9, [x1, #(16*1)] // ..........e................................................................................'.........~................................................................................ + // ldr q10, [x1, #(16*2)] // ......e....................................................................................'.....~.................................................................................... + // ldr q11, [x1, #(16*3)] // e..........................................................................................~.......................................................................................... 
+ // ldr q0, [x3], #16 // ...e.......................................................................................'..~....................................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ............e..............................................................................'...........~.............................................................................. + // mul v24.8h, v10.8h, v0.h[0] // .........e.................................................................................'........~................................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ................e..........................................................................'...............~.......................................................................... + // sub v10.8h, v8.8h, v24.8h // .......................e...................................................................'......................~................................................................... + // add v8.8h, v8.8h, v24.8h // .....................e.....................................................................'....................~..................................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ........e..................................................................................'.......~.................................................................................. + // mul v24.8h, v11.8h, v0.h[0] // .......e...................................................................................'......~................................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........e...............................................................................'..........~............................................................................... 
+ // sub v11.8h, v9.8h, v24.8h // ..............e............................................................................'.............~............................................................................ + // add v9.8h, v9.8h, v24.8h // ...............e...........................................................................'..............~........................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...................e.......................................................................'..................~....................................................................... + // mul v24.8h, v9.8h, v0.h[2] // ....................e......................................................................'...................~...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................e..................................................................'.......................~.................................................................. + // sub v9.8h, v8.8h, v24.8h // ............................e..............................................................'...........................~.............................................................. + // add v8.8h, v8.8h, v24.8h // .............................e.............................................................'............................~............................................................. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..................e........................................................................'.................~........................................................................ + // mul v24.8h, v11.8h, v0.h[4] // .................e.........................................................................'................~......................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ......................e....................................................................'.....................~.................................................................... + // sub v11.8h, v10.8h, v24.8h // ..........................e................................................................'.........................~................................................................ + // add v10.8h, v10.8h, v24.8h // ...........................e...............................................................'..........................~............................................................... + // trn1 v25.4s, v8.4s, v9.4s // ...................................e.......................................................'..................................~....................................................... + // trn2 v26.4s, v8.4s, v9.4s // ................................e..........................................................'...............................~.......................................................... + // trn1 v27.4s, v10.4s, v11.4s // ..............................e............................................................'.............................~............................................................ + // trn2 v28.4s, v10.4s, v11.4s // ...............................e...........................................................'..............................~........................................................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e.................................................'........................................~................................................. + // trn2 v11.2d, v26.2d, v28.2d // ..................................e........................................................'.................................~........................................................ 
+ // trn1 v8.2d, v25.2d, v27.2d // ..................................................e........................................'.................................................~........................................ + // trn1 v9.2d, v26.2d, v28.2d // .......................................e...................................................'......................................~................................................... + // ldr q0, [ x4], #(6*16) // .........................e.................................................................'........................~................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .................................e.........................................................'................................~......................................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ......................................................e....................................'.....................................................~.................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .........................................................e.................................'........................................................~................................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // ......................................e....................................................'.....................................~.................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........................................e................................................'.........................................~................................................ + // sqrdmulh v27.8h, v10.8h, v4.8h // ............................................e..............................................'...........................................~.............................................. 
+ // mul v24.8h, v10.8h, v0.8h // .............................................e.............................................'............................................~............................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................................e.........................................'................................................~......................................... + // sub v10.8h, v8.8h, v24.8h // .....................................................e.....................................'....................................................~..................................... + // add v8.8h, v8.8h, v24.8h // ......................................................................e....................'.....................................................................~.................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ....................................e......................................................'...................................~...................................................... + // mul v24.8h, v11.8h, v0.8h // .....................................e.....................................................'....................................~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................e..................................................'.......................................~.................................................. + // sub v11.8h, v9.8h, v24.8h // ...........................................e...............................................'..........................................~............................................... + // add v9.8h, v9.8h, v24.8h // ................................................e..........................................'...............................................~.......................................... 
+ // sqrdmulh v27.8h, v9.8h, v5.8h // ...............................................................e...........................'..............................................................~........................... + // mul v24.8h, v9.8h, v1.8h // ..............................................................e............................'.............................................................~............................ + // mls v24.8h, v27.8h, v7.h[0] // .....................................................................e.....................'....................................................................~..................... + // sub v9.8h, v8.8h, v24.8h // .........................................................................e.................'........................................................................~................. + // add v8.8h, v8.8h, v24.8h // ..........................................................................e................'.........................................................................~................ + // sqrdmulh v27.8h, v11.8h, v6.8h // ..............................................e............................................'.............................................~............................................ + // mul v24.8h, v11.8h, v2.8h // ...............................................e...........................................'..............................................~........................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................e.......................................'..................................................~....................................... + // sub v11.8h, v10.8h, v24.8h // .......................................................e...................................'......................................................~................................... 
+ // add v10.8h, v10.8h, v24.8h // ........................................................e..................................'.......................................................~.................................. + // trn1 v25.4s, v8.4s, v9.4s // ...............................................................................e...........'..............................................................................~........... + // trn2 v26.4s, v8.4s, v9.4s // .....................................................................................e.....'....................................................................................~..... + // trn1 v27.4s, v10.4s, v11.4s // ...........................................................e...............................'..........................................................~............................... + // trn2 v28.4s, v10.4s, v11.4s // ..........................................................e................................'.........................................................~................................ + // sqdmulh v24.8h, v25.8h, v7.h[1] // ...................................................................................e.......'..................................................................................~....... + // srshr v24.8h, v24.8h, #11 // ....................................................~......................................'...................................................*...................................... + // mls v25.8h, v24.8h, v7.h[0] // .......................................................................~...................'......................................................................*................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // .........................................................................................e.'........................................................................................~. 
+ // srshr v24.8h, v24.8h, #11 // ..................................................................~........................'.................................................................*........................ + // mls v26.8h, v24.8h, v7.h[0] // ........................................................................~..................'.......................................................................*.................. + // sqdmulh v24.8h, v27.8h, v7.h[1] // .............................................................e.............................'............................................................~............................. + // srshr v24.8h, v24.8h, #11 // .................................................................e.........................'................................................................~......................... + // mls v27.8h, v24.8h, v7.h[0] // ....................................................................e......................'...................................................................~...................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // ............................................................e..............................'...........................................................~.............................. + // srshr v24.8h, v24.8h, #11 // ................................................................e..........................'...............................................................~.......................... + // mls v28.8h, v24.8h, v7.h[0] // ...................................................................e.......................'..................................................................~....................... + // umov x10, v25.d[0] // ..............................................................................~............'.............................................................................*............ 
+ // umov x11, v25.d[1] // ...........................................................................~...............'..........................................................................*............... + // umov x12, v26.d[0] // ..................................................................................~........'.................................................................................*........ + // umov x13, v26.d[1] // .................................................................................~.........'................................................................................*......... + // umov x14, v27.d[0] // ............................................................................e..............'...........................................................................~.............. + // umov x15, v27.d[1] // .......................................................................................e...'......................................................................................~... + // umov x16, v28.d[0] // ........................................................................................e..'.......................................................................................~.. + // umov x17, v28.d[1] // .............................................................................e.............'............................................................................~............. + // str x10, [x1], #( 16*4) // ....................................................................................~......'...................................................................................*...... + // str x14, [x1, #(-16*4 + 8*1)] // .....~.....................................................................................'....*..................................................................................... 
+ // str x12, [x1, #(-16*4 + 8*2)] // ......................................................................................~....'.....................................................................................*.... + // str x16, [x1, #(-16*4 + 8*3)] // ....~......................................................................................'...*...................................................................................... + // str x11, [x1, #(-16*4 + 8*4)] // ................................................................................~..........'...............................................................................*.......... + // str x15, [x1, #(-16*4 + 8*5)] // ..~........................................................................................'.*........................................................................................ + // str x13, [x1, #(-16*4 + 8*6)] // ..........................................................................................~'.........................................................................................* + // str x17, [x1, #(-16*4 + 8*7)] // .~.........................................................................................'*......................................................................................... + + sub count, count, #1 cbnz count, layer4567_start - trn2 v12.4S, v18.4S, v30.4S // *............................... - // gap // ................................ - trn1 v24.4S, v18.4S, v30.4S // .*.............................. - // gap // ................................ - trn1 v6.4S, v29.4S, v31.4S // ...*............................ - // gap // ................................ - trn2 v17.4S, v29.4S, v31.4S // ....*........................... - // gap // ................................ - sqdmulh v4.8H, v24.8H, v7.H[1] // .....*.......................... - // gap // ................................ 
- sqdmulh v5.8H, v17.8H, v7.H[1] // .......*........................ - // gap // ................................ - sqdmulh v9.8H, v12.8H, v7.H[1] // ..*............................. - // gap // ................................ - sqdmulh v14.8H, v6.8H, v7.H[1] // ........*....................... - // gap // ................................ - srshr v4.8H, v4.8H, #11 // ..........*..................... - // gap // ................................ - srshr v5.8H, v5.8H, #11 // ..................*............. - // gap // ................................ - srshr v9.8H, v9.8H, #11 // ......*......................... - // gap // ................................ - mls v24.8H, v4.8H, v7.H[0] // ...........*.................... - // gap // ................................ - mls v17.8H, v5.8H, v7.H[0] // ...................*............ - // gap // ................................ - mls v12.8H, v9.8H, v7.H[0] // .........*...................... - // gap // ................................ - srshr v4.8H, v14.8H, #11 // ............*................... - // gap // ................................ - vext x24, v24, 0 // .............*.................. - vext x8, v24, 1 // ..............*................. - vext x16, v17, 0 // ....................*........... - vext x17, v17, 1 // .........................*...... - mls v6.8H, v4.8H, v7.H[0] // ...............*................ - str x24, [x1] , #64 // ................*............... - vext x12, v12, 0 // .....................*.......... - str x8, [x1, #-32] // .................*.............. - vext x13, v12, 1 // ........................*....... - str x16, [x1, #-40] // .......................*........ - str x12, [x1, #-48] // ......................*......... - // gap // ................................ - vext x14, v6, 0 // ............................*... - str x13, [x1, #-16] // ..........................*..... - vext x15, v6, 1 // .............................*.. - str x17, [x1, #-8] // ...........................*.... 
- str x14, [x1, #-56] // ..............................*. - // gap // ................................ - str x15, [x1, #-24] // ...............................* - // gap // ................................ - - // original source code - // trn2 v10.4S, v18.4S, v30.4S // *............................... || *........................ - // trn1 v2.4S, v18.4S, v30.4S // .*.............................. || .*....................... - // sqdmulh v12.8H, v10.8H, v7.H[1] // ......*......................... || ......*.................. - // trn1 v20.4S, v29.4S, v31.4S // ..*............................. || ..*...................... - // trn2 v11.4S, v29.4S, v31.4S // ...*............................ || ...*..................... - // sqdmulh v19.8H, v2.8H, v7.H[1] // ....*........................... || ....*.................... - // srshr v31.8H, v12.8H, #11 // ..........*..................... || ..........*.............. - // sqdmulh v18.8H, v11.8H, v7.H[1] // .....*.......................... || .....*................... - // sqdmulh v30.8H, v20.8H, v7.H[1] // .......*........................ || .......*................. - // mls v10.8H, v31.8H, v7.H[0] // .............*.................. || .............*........... - // srshr v31.8H, v19.8H, #11 // ........*....................... || ........*................ - // mls v2.8H, v31.8H, v7.H[0] // ...........*.................... || ...........*............. - // srshr v31.8H, v30.8H, #11 // ..............*................. || ..............*.......... - // vext x24, v2, 0 // ...............*................ || ...............*......... - // vext x8, v2, 1 // ................*............... || ...............*......... - // mls v20.8H, v31.8H, v7.H[0] // ...................*............ || .................*....... - // str x24, [x1] , #64 // ....................*........... || .................*....... - // str x8, [x1, #-32] // ......................*......... || ..................*...... 
- // srshr v27.8H, v18.8H, #11 // .........*...................... || .........*............... - // mls v11.8H, v27.8H, v7.H[0] // ............*................... || ............*............ - // vext x16, v11, 0 // .................*.............. || ................*........ - // vext x12, v10, 0 // .....................*.......... || ..................*...... - // str x12, [x1, #-48] // .........................*...... || ....................*.... - // str x16, [x1, #-40] // ........................*....... || ...................*..... - // vext x13, v10, 1 // .......................*........ || ...................*..... - // vext x17, v11, 1 // ..................*............. || ................*........ - // str x13, [x1, #-16] // ...........................*.... || .....................*... - // str x17, [x1, #-8] // .............................*.. || ......................*.. - // vext x14, v20, 0 // ..........................*..... || .....................*... - // vext x15, v20, 1 // ............................*... || ......................*.. - // str x14, [x1, #-56] // ..............................*. || .......................*. - // str x15, [x1, #-24] // ...............................* || ........................* - + // Instructions: 16 + // Expected cycles: 13 + // Expected IPC: 1.23 + // + // Cycle bound: 13.0 + // IPC bound: 1.23 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + srshr v1.8H, v11.8H, #11 // .....*........................ + str x14, [x1, #24] // ..*........................... + srshr v15.8H, v6.8H, #11 // ....*......................... + str x19, [x1, #56] // *............................. + str x29, [x1, #40] // .*............................ + // gap // .............................. + mls v29.8H, v1.8H, v7.H[0] // .......*...................... + str x23, [x1, #8] // ...*.......................... 
+ mls v26.8H, v15.8H, v7.H[0] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + umov x10, v29.d[1] // ...........*.................. + // gap // .............................. + // gap // .............................. + umov x7, v26.d[0] // .........*.................... + str x10, [x1, #48] // ...............*.............. + umov x15, v29.d[0] // ............*................. + str x7, [x1], #( 16*4) // .............*................ + umov x7, v26.d[1] // ........*..................... + str x15, [x1, #-48] // ..............*............... + // gap // .............................. + str x7, [x1, #-32] // ..........*................... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // str x19, [x1, #56] // ...*........................... + // str x29, [x1, #40] // ....*.......................... + // str x14, [x1, #24] // .*............................. + // str x23, [x1, #8] // ......*........................ + // srshr v15.8H, v6.8H, #11 // ..*............................ + // srshr v6.8H, v11.8H, #11 // *.............................. + // mls v26.8H, v15.8H, v7.H[0] // .......*....................... + // mls v29.8H, v6.8H, v7.H[0] // .....*......................... + // umov x29, v26.d[1] // .............*................. + // umov x18, v26.d[0] // .........*..................... + // str x29, [x1, #32] // ...............*............... + // umov x21, v29.d[1] // ........*...................... + // umov x29, v29.d[0] // ...........*................... + // str x18, [x1], #( 16*4) // ............*.................. + // str x29, [x1, #-48] // ..............*................ + // str x21, [x1, #-16] // ..........*.................... 
+ pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a72.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a72.s index 6f03917a..5460faba 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_a72.s @@ -26,46 +26,16 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 -.macro vins vec_out, gpr_in, lane // slothy:no-unfold +.macro vins vec_out, gpr_in, lane ins \vec_out\().d[\lane], \gpr_in .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +53,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, 
src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +70,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -114,45 +78,45 @@ xtmp1 .req x11 .macro barrett_reduce a vqdmulhq tmp, \a, consts, 1 - srshr tmp.8H, tmp.8H, #11 + srshr tmp.8h, tmp.8h, #11 vmlsq \a, tmp, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, 
t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm .macro vec_to_scalar_matrix out, in @@ -178,7 +142,7 @@ xtmp1 .req x11 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -189,7 +153,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -199,7 +163,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -207,7 +171,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -218,19 +182,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // 
slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -243,7 +207,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_load_store_opt_a72 - .global _ntt_kyber_123_4567_scalar_load_store_opt_a72 + .global _ntt_kyber_123_4567_scalar_load_store .p2align 4 const_addr: .short 3329 @@ -369,1561 +333,1421 @@ _ntt_kyber_123_4567_scalar_load_store_opt_a72: load_roots_123 .p2align 2 - ldr x10, [x0, #256] // ....*................... - ldr x15, [x0, #192] // ..*..................... - // gap // ........................ - ldr x29, [x0, #320] // ......*................. - ldr x18, [x0, #448] // *....................... - // gap // ........................ - ldr x27, [x0, #200] // ...*.................... - ldr x26, [x0, #64] // .*...................... - // gap // ........................ - ldr x22, [x0, #264] // .......*................ - ldr x23, [x0, #328] // ........*............... - // gap // ........................ - vins v24, x15, 0 // .............*.......... - vins v20, x10, 0 // ..............*......... - ldr x15, [x0, #456] // .........*.............. - vins v4, x29, 0 // ...............*........ - vins v2, x18, 0 // .....*.................. - ldr x10, [x0, #384] // ..........*............. - ldr x29, [x0, #392] // ...........*............ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - vins v23, x10, 0 // .................*...... - // gap // ........................ - // gap // ........................ - vins v20, x22, 1 // ................*....... - // gap // ........................ - // gap // ........................ 
- vins v2, x15, 1 // ............*........... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - vins v23, x29, 1 // ..................*..... - // gap // ........................ - // gap // ........................ - sqrdmulh v22.8H, v20.8H, v0.H[1] // .....................*.. - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v29.8H, v2.8H, v0.H[1] // ....................*... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mul v15.8H, v2.8H, v0.H[0] // ...................*.... - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - sqrdmulh v11.8H, v23.8H, v0.H[1] // ......................*. - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - // gap // ........................ - mls v15.8H, v29.8H, v7.H[0] // .......................* - // gap // ........................ - // gap // ........................ - - // original source code - // ldr x22, [x0, #448] // ...*.................... || .*....................... - // ldr x26, [x0, #64] // .....*.................. || ..*...................... - // ldr x15, [x0, #192] // .*...................... || *........................ 
- // ldr x27, [x0, #200] // ....*................... || ..*...................... - // ldr x29, [x0, #256] // *....................... || *........................ - // vins v22, x22, 0 // ............*........... || .....*................... - // ldr x22, [x0, #320] // ..*..................... || .*....................... - // ldr x7, [x0, #264] // ......*................. || ...*..................... - // ldr x23, [x0, #328] // .......*................ || ...*..................... - // ldr x18, [x0, #456] // ..........*............. || ....*.................... - // ldr x13, [x0, #384] // .............*.......... || .....*................... - // ldr x20, [x0, #392] // ..............*......... || ......*.................. - // vins v22, x18, 1 // .................*...... || ...........*............. - // vins v24, x15, 0 // ........*............... || ....*.................... - // vins v20, x29, 0 // .........*.............. || ....*.................... - // vins v4, x22, 0 // ...........*............ || .....*................... - // vins v20, x7, 1 // ................*....... || ..........*.............. - // vins v23, x13, 0 // ...............*........ || .........*............... - // vins v23, x20, 1 // ..................*..... || ...............*......... - // mul v15.8H, v22.8H, v0.H[0] // .....................*.. || ....................*.... - // sqrdmulh v9.8H, v22.8H, v0.H[1] // ....................*... || ..................*...... - // sqrdmulh v22.8H, v20.8H, v0.H[1] // ...................*.... || ................*........ - // sqrdmulh v11.8H, v23.8H, v0.H[1] // ......................*. || ......................*.. 
- // mls v15.8H, v9.8H, v7.H[0] // .......................* || ........................* - + // Instructions: 18 + // Expected cycles: 20 + // Expected IPC: 0.90 + // + // Cycle bound: 20.0 + // IPC bound: 0.90 + // + // Wall time: 0.09s + // User time: 0.09s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q18, [x0, #0] // *............................. + ldr q16, [x0, #448] // .........*.................... + // gap // .............................. + ldr q27, [x0, #256] // ....*......................... + // gap // .............................. + // gap // .............................. + ldr q5, [x0, #64] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q4, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + sqrdmulh v15.8H, v16.8H, v0.H[1] // ..........*................... + ldr q17, [x0, #320] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q28, [x0, #192] // ...*.......................... + mul v19.8H, v16.8H, v0.H[0] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v16.8H, v27.8H, v0.H[1] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v19.8H, v15.8H, v7.H[0] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mul v23.8H, v27.8H, v0.H[0] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v23.8H, v16.8H, v7.H[0] // .............*................ + ldr q16, [x0, #384] // ........*..................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v27.8H, v28.8H, v19.8H // ................*............. + sqrdmulh v6.8H, v17.8H, v0.H[1] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v11.8H, v16.8H, v0.H[0] // ..............*............... + // gap // .............................. + // gap // .............................. + sub v13.8H, v18.8H, v23.8H // .................*............ + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q18, [x0, #0] // *.............................. + // ldr q5, [x0, #64] // ...*........................... + // ldr q4, [x0, #128] // ....*.......................... + // ldr q28, [x0, #192] // .......*....................... + // ldr q6, [x0, #256] // ..*............................ + // sqrdmulh v27.8H, v6.8H, v0.H[1] // .........*..................... + // mul v23.8H, v6.8H, v0.H[0] // ...........*................... + // ldr q17, [x0, #320] // ......*........................ + // ldr q16, [x0, #384] // .............*................. + // ldr q19, [x0, #448] // .*............................. 
+ // sqrdmulh v30.8H, v19.8H, v0.H[1] // .....*......................... + // mul v19.8H, v19.8H, v0.H[0] // ........*...................... + // mls v19.8H, v30.8H, v7.H[0] // ..........*.................... + // mls v23.8H, v27.8H, v7.H[0] // ............*.................. + // mul v11.8H, v16.8H, v0.H[0] // ................*.............. + // sqrdmulh v6.8H, v17.8H, v0.H[1] // ...............*............... + // sub v27.8H, v28.8H, v19.8H // ..............*................ + // sub v13.8H, v18.8H, v23.8H // .................*............. + sub count, count, #1 -.p2align 2 layer123_start: - ldr x15, [x0, #0] // *................................................................................................... - vins v2, x26, 0 // ......*............................................................................................. - ldr x10, [x0, #8] // .*.................................................................................................. - mul v29.8H, v20.8H, v0.H[0] // ................................*................................................................... - ldr x29, [x0, #128] // ........*........................................................................................... - ldr x18, [x0, #72] // .....*.............................................................................................. - ldr x22, [x0, #464] // ............................e....................................................................... - vins v24, x27, 1 // ...............*.................................................................................... - ldr x25, [x0, #136] // .........*.......................................................................................... - mls v29.8H, v22.8H, v7.H[0] // ..................................*................................................................. - vins v4, x23, 1 // .......................*............................................................................ 
- ldr x26, [x0, #80] // ....e............................................................................................... - vins v5, x15, 0 // ..*................................................................................................. - ldr x15, [x0, #208] // ............e....................................................................................... - ldr x27, [x0, #216] // .............e...................................................................................... - vins v9, x29, 0 // ..........*......................................................................................... - mul v30.8H, v23.8H, v0.H[0] // ..........................................*......................................................... - ldr x29, [x0, #272] // ................e................................................................................... - vins v22, x22, 0 // ..............................e..................................................................... - ldr x22, [x0, #336] // ....................e............................................................................... - ldr x7, [x0, #280] // .................e.................................................................................. - vins v2, x18, 1 // .......*............................................................................................ - mls v30.8H, v11.8H, v7.H[0] // ............................................*....................................................... - ldr x23, [x0, #344] // .....................e.............................................................................. - ldr x18, [x0, #472] // .............................e...................................................................... - ldr x13, [x0, #400] // ........................e........................................................................... 
- sub v11.8H, v24.8H, v15.8H // ..................................................*................................................. - add v15.8H, v24.8H, v15.8H // ...................................................*................................................ - mul v12.8H, v4.8H, v0.H[0] // .....................................*.............................................................. - ldr x20, [x0, #408] // .........................e.......................................................................... - vins v5, x10, 1 // ...*................................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v9, x25, 1 // ...........*........................................................................................ - sqrdmulh v23.8H, v4.8H, v0.H[1] // ......................................*............................................................. - // gap // .................................................................................................... - vins v22, x18, 1 // ...............................e.................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v24, x15, 0 // ..............e..................................................................................... - mul v8.8H, v15.8H, v0.H[2] // .........................................................*.......................................... - // gap // .................................................................................................... 
- vins v20, x29, 0 // ..................e................................................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v4, x22, 0 // ......................e............................................................................. - sqrdmulh v15.8H, v15.8H, v0.H[3] // ..........................................................*......................................... - // gap // .................................................................................................... - sub v18.8H, v5.8H, v29.8H // ...................................*................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v29.8H, v5.8H, v29.8H // ....................................*............................................................... - mls v12.8H, v23.8H, v7.H[0] // .......................................*............................................................ - // gap // .................................................................................................... - sub v5.8H, v9.8H, v30.8H // .............................................*...................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v9.8H, v9.8H, v30.8H // ..............................................*..................................................... 
- mul v30.8H, v11.8H, v0.H[4] // ...................................................................*................................ - // gap // .................................................................................................... - vins v20, x7, 1 // ...................e................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - vins v23, x13, 0 // ..........................e......................................................................... - mls v8.8H, v15.8H, v7.H[0] // ...........................................................*........................................ - // gap // .................................................................................................... - sub v15.8H, v2.8H, v12.8H // ........................................*........................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v2.8H, v2.8H, v12.8H // .........................................*.......................................................... - mul v12.8H, v9.8H, v0.H[2] // ....................................................*............................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - sqrdmulh v9.8H, v9.8H, v0.H[3] // .....................................................*.............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v3.8H, v2.8H, v8.8H // ............................................................*....................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v11.8H, v11.8H, v0.H[5] // ....................................................................*............................... - add v2.8H, v2.8H, v8.8H // .............................................................*...................................... - // gap // .................................................................................................... - vins v23, x20, 1 // ...........................e........................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v8.8H, v5.8H, v0.H[4] // ..............................................................*..................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v12.8H, v9.8H, v7.H[0] // ......................................................*............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v5.8H, v5.8H, v0.H[5] // ...............................................................*.................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v30.8H, v11.8H, v7.H[0] // .....................................................................*.............................. - // gap // .................................................................................................... 
- // gap // .................................................................................................... - sub v9.8H, v29.8H, v12.8H // .......................................................*............................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v29.8H, v29.8H, v12.8H // ........................................................*........................................... - mul v11.8H, v2.8H, v0.H[6] // ........................................................................*........................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v8.8H, v5.8H, v7.H[0] // ................................................................*................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v5.8H, v15.8H, v30.8H // ......................................................................*............................. - // gap // .................................................................................................... - // gap // .................................................................................................... 
- add v30.8H, v15.8H, v30.8H // .......................................................................*............................ - sqrdmulh v2.8H, v2.8H, v0.H[7] // .........................................................................*.......................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v15.8H, v3.8H, v1.H[0] // .............................................................................*...................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v12.8H, v18.8H, v8.8H // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v8.8H, v18.8H, v8.8H // ..................................................................*................................. - sqrdmulh v18.8H, v3.8H, v1.H[1] // ..............................................................................*..................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - mls v11.8H, v2.8H, v7.H[0] // ..........................................................................*......................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v2.8H, v30.8H, v1.H[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v15.8H, v18.8H, v7.H[0] // ...............................................................................*.................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- sub v18.8H, v29.8H, v11.8H // ...........................................................................*........................ - // gap // .................................................................................................... - // gap // .................................................................................................... - add v29.8H, v29.8H, v11.8H // ............................................................................*....................... - sqrdmulh v30.8H, v30.8H, v1.H[3] // ...................................................................................*................ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - str_vo v18, x0, 64 // .............................................................................................*...... - sqrdmulh v11.8H, v5.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - str_vi v29, x0, 16 // ............................................................................................*....... - sub v29.8H, v9.8H, v15.8H // ................................................................................*................... - // gap // .................................................................................................... - add v9.8H, v9.8H, v15.8H // .................................................................................*.................. 
- mul v5.8H, v5.8H, v1.H[4] // .......................................................................................*............ - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v2.8H, v30.8H, v7.H[0] // ....................................................................................*............... - str_vo v29, x0, 176 // ...............................................................................................*.... - // gap // .................................................................................................... - str_vo v9, x0, 112 // ..............................................................................................*..... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v5.8H, v11.8H, v7.H[0] // .........................................................................................*.......... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - mul v15.8H, v22.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v29.8H, v8.8H, v2.8H // .....................................................................................*.............. - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v9.8H, v22.8H, v0.H[1] // ................................................e................................................... - add v2.8H, v8.8H, v2.8H // ......................................................................................*............. - // gap // .................................................................................................... - sub v30.8H, v12.8H, v5.8H // ..........................................................................................*......... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v5.8H, v12.8H, v5.8H // ...........................................................................................*........ - str_vo v29, x0, 304 // .................................................................................................*.. - sqrdmulh v22.8H, v20.8H, v0.H[1] // .................................e.................................................................. 
- str_vo v2, x0, 240 // ................................................................................................*... - // gap // .................................................................................................... - // gap // .................................................................................................... - str_vo v30, x0, 432 // ...................................................................................................* - sqrdmulh v11.8H, v23.8H, v0.H[1] // ...........................................e........................................................ - // gap // .................................................................................................... - str_vo v5, x0, 368 // ..................................................................................................*. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v15.8H, v9.8H, v7.H[0] // .................................................e.................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - - // original source code - // ldr x10, [x0, #0] // ..............................................................................................*.................................................................................................. || ......................................................................*...................................................................... 
- // ldr x11, [x0, #8] // ................................................................................................*................................................................................................ || ......................................................................*...................................................................... - // vins v8, x10, 0 // ..........................................................................................................*...................................................................................... || ..........................................................................*.................................................................. - // vins v8, x11, 1 // ............................................................................................................................*.................................................................... || ................................................................................*............................................................ - // ldr x10, [x0, #64] // .....e........................................................................................................................................................................................... || .e........................................................................................................................................... - // ldr x11, [x0, #72] // ...................................................................................................*............................................................................................. || .......................................................................*..................................................................... 
- // vins v9, x10, 0 // ...............................................................................................*................................................................................................. || ......................................................................*...................................................................... - // vins v9, x11, 1 // ...................................................................................................................*............................................................................. || .............................................................................*............................................................... - // ldr x10, [x0, #128] // ..................................................................................................*.............................................................................................. || .......................................................................*..................................................................... - // ldr x11, [x0, #136] // ......................................................................................................*.......................................................................................... || ........................................................................*.................................................................... - // vins v10, x10, 0 // .............................................................................................................*................................................................................... || ...........................................................................*................................................................. 
- // vins v10, x11, 1 // .............................................................................................................................*................................................................... || .................................................................................*........................................................... - // ldr x10, [x0, #192] // .......e......................................................................................................................................................................................... || ..e.......................................................................................................................................... - // ldr x11, [x0, #200] // ........e........................................................................................................................................................................................ || ..e.......................................................................................................................................... - // vins v11, x10, 0 // ............................e.................................................................................................................................................................... || ...........e................................................................................................................................. - // vins v11, x11, 1 // .....................................................................................................*........................................................................................... || ........................................................................*.................................................................... 
- // ldr x10, [x0, #256] // ...........e..................................................................................................................................................................................... || ...e......................................................................................................................................... - // ldr x11, [x0, #264] // ..............e.................................................................................................................................................................................. || ....e........................................................................................................................................ - // vins v12, x10, 0 // ..............................e.................................................................................................................................................................. || ............e................................................................................................................................ - // vins v12, x11, 1 // .......................................e......................................................................................................................................................... || ..................e.......................................................................................................................... - // ldr x10, [x0, #320] // .............e................................................................................................................................................................................... || ....e........................................................................................................................................ 
- // ldr x11, [x0, #328] // .................e............................................................................................................................................................................... || .....e....................................................................................................................................... - // vins v13, x10, 0 // ...............................e................................................................................................................................................................. || .............e............................................................................................................................... - // vins v13, x11, 1 // ........................................................................................................*........................................................................................ || .........................................................................*................................................................... - // ldr x10, [x0, #384] // ...................e............................................................................................................................................................................. || ......e...................................................................................................................................... - // ldr x11, [x0, #392] // .......................e......................................................................................................................................................................... || .......e..................................................................................................................................... 
- // vins v14, x10, 0 // ........................................e........................................................................................................................................................ || ...................e......................................................................................................................... - // vins v14, x11, 1 // .................................................e............................................................................................................................................... || ..........................e.................................................................................................................. - // ldr x10, [x0, #448] // e................................................................................................................................................................................................ || e............................................................................................................................................ - // ldr x11, [x0, #456] // ..................e.............................................................................................................................................................................. || ......e...................................................................................................................................... - // vins v15, x10, 0 // ............e.................................................................................................................................................................................... || ....e........................................................................................................................................ 
- // vins v15, x11, 1 // ...........................e..................................................................................................................................................................... || ..........e.................................................................................................................................. - // mul v24.8H, v12.8H, v0.H[0] // .................................................................................................*............................................................................................... || .......................................................................*..................................................................... - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ........................................................................................e........................................................................................................ || .................................................................e........................................................................... - // mls v24.8H, v12.8H, v7.H[0] // .......................................................................................................*......................................................................................... || .........................................................................*................................................................... - // sub v12.8H, v8.8H, v24.8H // .....................................................................................................................................*........................................................... || ......................................................................................*...................................................... 
- // add v8.8H, v8.8H, v24.8H // ......................................................................................................................................*.......................................................... || .......................................................................................*..................................................... - // mul v24.8H, v13.8H, v0.H[0] // ..........................................................................................................................*...................................................................... || ...............................................................................*............................................................. - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ..............................................................................................................................*.................................................................. || .................................................................................*........................................................... - // mls v24.8H, v13.8H, v7.H[0] // .......................................................................................................................................*......................................................... || .......................................................................................*..................................................... - // sub v13.8H, v9.8H, v24.8H // ..............................................................................................................................................*.................................................. || ............................................................................................*................................................ 
- // add v9.8H, v9.8H, v24.8H // ...............................................................................................................................................*................................................. || .............................................................................................*............................................... - // mul v24.8H, v14.8H, v0.H[0] // ..............................................................................................................*.................................................................................. || ...........................................................................*................................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ...........................................................................................e..................................................................................................... || ...................................................................e......................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ....................................................................................................................*............................................................................ || .............................................................................*............................................................... - // sub v14.8H, v10.8H, v24.8H // ........................................................................................................................................*........................................................ || ........................................................................................*.................................................... 
- // add v10.8H, v10.8H, v24.8H // .........................................................................................................................................*....................................................... || .........................................................................................*................................................... - // mul v24.8H, v15.8H, v0.H[0] // .................................................................................e............................................................................................................... || .............................................................e............................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ...................................................................................e............................................................................................................. || ...............................................................e............................................................................. - // mls v24.8H, v15.8H, v7.H[0] // .............................................................................................e................................................................................................... || .....................................................................e....................................................................... - // sub v15.8H, v11.8H, v24.8H // ........................................................................................................................*........................................................................ || ..............................................................................*.............................................................. 
- // add v11.8H, v11.8H, v24.8H // .........................................................................................................................*....................................................................... || ...............................................................................*............................................................. - // mul v24.8H, v10.8H, v0.H[2] // ................................................................................................................................................*................................................ || .............................................................................................*............................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .................................................................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.8H, v10.8H, v7.H[0] // .......................................................................................................................................................*......................................... || .....................................................................................................*....................................... - // sub v10.8H, v8.8H, v24.8H // ..........................................................................................................................................................*...................................... || ..........................................................................................................*.................................. 
- // add v8.8H, v8.8H, v24.8H // ...........................................................................................................................................................*..................................... || ...........................................................................................................*................................. - // mul v24.8H, v11.8H, v0.H[2] // .................................................................................................................................*............................................................... || ...................................................................................*......................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // ....................................................................................................................................*............................................................ || .....................................................................................*....................................................... - // mls v24.8H, v11.8H, v7.H[0] // .............................................................................................................................................*................................................... || ...........................................................................................*................................................. - // sub v11.8H, v9.8H, v24.8H // ..................................................................................................................................................*.............................................. || ................................................................................................*............................................ 
- // add v9.8H, v9.8H, v24.8H // ....................................................................................................................................................*............................................ || .................................................................................................*........................................... - // mul v24.8H, v14.8H, v0.H[4] // ......................................................................................................................................................*.......................................... || ...................................................................................................*......................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ........................................................................................................................................................*........................................ || .......................................................................................................*..................................... - // mls v24.8H, v14.8H, v7.H[0] // .............................................................................................................................................................*................................... || .............................................................................................................*............................... - // sub v14.8H, v12.8H, v24.8H // ..................................................................................................................................................................*.............................. || ..................................................................................................................*.......................... 
- // add v12.8H, v12.8H, v24.8H // ...................................................................................................................................................................*............................. || ...................................................................................................................*......................... - // mul v24.8H, v15.8H, v0.H[4] // ..........................................................................................................................................*...................................................... || .........................................................................................*................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ...................................................................................................................................................*............................................. || .................................................................................................*........................................... - // mls v24.8H, v15.8H, v7.H[0] // .........................................................................................................................................................*....................................... || .........................................................................................................*................................... - // sub v15.8H, v13.8H, v24.8H // ..............................................................................................................................................................*.................................. || ..............................................................................................................*.............................. 
- // add v13.8H, v13.8H, v24.8H // ...............................................................................................................................................................*................................. || ...............................................................................................................*............................. - // mul v24.8H, v9.8H, v0.H[6] // ............................................................................................................................................................*.................................... || ...........................................................................................................*................................. - // sqrdmulh v9.8H, v9.8H, v0.H[7] // ................................................................................................................................................................*................................ || ...............................................................................................................*............................. - // mls v24.8H, v9.8H, v7.H[0] // .....................................................................................................................................................................*........................... || .....................................................................................................................*....................... - // sub v9.8H, v8.8H, v24.8H // ........................................................................................................................................................................*........................ || ..........................................................................................................................*.................. 
- // add v8.8H, v8.8H, v24.8H // .........................................................................................................................................................................*....................... || ...........................................................................................................................*................. - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................................................................*............................... || .................................................................................................................*........................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ....................................................................................................................................................................*............................ || ...................................................................................................................*......................... - // mls v24.8H, v11.8H, v7.H[0] // .......................................................................................................................................................................*......................... || .........................................................................................................................*................... - // sub v11.8H, v10.8H, v24.8H // ..............................................................................................................................................................................*.................. || ..............................................................................................................................*.............. 
- // add v10.8H, v10.8H, v24.8H // ...............................................................................................................................................................................*................. || ...............................................................................................................................*............. - // mul v24.8H, v13.8H, v1.H[2] // ......................................................................................................................................................................*.......................... || .......................................................................................................................*..................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // ..........................................................................................................................................................................*...................... || ...........................................................................................................................*................. - // mls v24.8H, v13.8H, v7.H[0] // .................................................................................................................................................................................*............... || .................................................................................................................................*........... - // sub v13.8H, v12.8H, v24.8H // ......................................................................................................................................................................................*.......... || ......................................................................................................................................*...... 
- // add v12.8H, v12.8H, v24.8H // ........................................................................................................................................................................................*........ || .......................................................................................................................................*..... - // mul v24.8H, v15.8H, v1.H[4] // ................................................................................................................................................................................*................ || ...............................................................................................................................*............. - // sqrdmulh v15.8H, v15.8H, v1.H[5] // ............................................................................................................................................................................*.................... || .............................................................................................................................*............... - // mls v24.8H, v15.8H, v7.H[0] // ....................................................................................................................................................................................*............ || ...................................................................................................................................*......... - // sub v15.8H, v14.8H, v24.8H // .........................................................................................................................................................................................*....... || ........................................................................................................................................*.... 
- // add v14.8H, v14.8H, v24.8H // ..........................................................................................................................................................................................*...... || .........................................................................................................................................*... - // str_vi v8, x0, 16 // .............................................................................................................................................................................*................... || ..............................................................................................................................*.............. - // str_vo v9, x0, 48 // ...........................................................................................................................................................................*..................... || .............................................................................................................................*............... - // str_vo v10, x0, 112 // ...................................................................................................................................................................................*............. || ..................................................................................................................................*.......... - // str_vo v11, x0, 176 // ..................................................................................................................................................................................*.............. || .................................................................................................................................*........... 
- // str_vo v12, x0, 240 // .............................................................................................................................................................................................*... || ..........................................................................................................................................*.. - // str_vo v13, x0, 304 // ...........................................................................................................................................................................................*..... || .........................................................................................................................................*... - // str_vo v14, x0, 368 // ................................................................................................................................................................................................* || ............................................................................................................................................* - // str_vo v15, x0, 432 // ..............................................................................................................................................................................................*.. || ...........................................................................................................................................*. - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 50.68s + // User time: 50.68s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + // gap // ............................................................................ 
+ // gap // ............................................................................ + sqrdmulh v29.8H, v16.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v26.8H, v17.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.8H, v27.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v11.8H, v29.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mls v26.8H, v6.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v25.8H, v27.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.8H, v4.8H, v11.8H // .....................*...................................................... + mls v25.8H, v15.8H, v7.H[0] // .............................................*.............................. + add v29.8H, v18.8H, v23.8H // ............*............................................................... + ldr q18, [x0, #16] // e........................................................................... + sub v31.8H, v5.8H, v26.8H // ................*........................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ add v14.8H, v5.8H, v26.8H // .................*.......................................................... + sqrdmulh v21.8H, v16.8H, v0.H[5] // ......................................*..................................... + ldr q5, [x0, #80] // .e.......................................................................... + add v2.8H, v28.8H, v19.8H // ...........................*................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v11.8H, v4.8H, v11.8H // ......................*..................................................... + mul v26.8H, v16.8H, v0.H[4] // .......................................*.................................... + ldr q4, [x0, #144] // ..e......................................................................... + ldr q28, [x0, #208] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v9.8H, v2.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v19.8H, v11.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v16.8H, v11.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v11.8H, v2.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v19.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + ldr q6, [x0, #272] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v11.8H, v9.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.8H, v6.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v15.8H, v29.8H, v16.8H // ...............................*............................................ + add v3.8H, v29.8H, v16.8H // ................................*........................................... + mls v26.8H, v21.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sub v22.8H, v14.8H, v11.8H // ....................................*....................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v21.8H, v14.8H, v11.8H // .....................................*...................................... 
+ mul v23.8H, v6.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.8H, v22.8H, v1.H[1] // .....................................................*...................... + sub v9.8H, v13.8H, v26.8H // .........................................*.................................. + // gap // ............................................................................ + // gap // ............................................................................ + mul v20.8H, v22.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + // gap // ............................................................................ + add v22.8H, v13.8H, v26.8H // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v13.8H, v31.8H, v25.8H // ..............................................*............................. + sqrdmulh v17.8H, v21.8H, v0.H[7] // ................................................*........................... 
+ add v26.8H, v31.8H, v25.8H // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v31.8H, v21.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v12.8H, v26.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v20.8H, v2.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v31.8H, v17.8H, v7.H[0] // ..................................................*......................... + ldr q17, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ + ldr q16, [x0, #400] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v29.8H, v26.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + ldr q19, [x0, #464] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v29.8H, v12.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v26.8H, v3.8H, v31.8H // ...................................................*........................ + // gap // ............................................................................ 
+ sqrdmulh v12.8H, v13.8H, v1.H[5] // ...............................................................*............ + add v30.8H, v3.8H, v31.8H // ....................................................*....................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v14.8H, v13.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q30, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sqrdmulh v30.8H, v19.8H, v0.H[1] // .......................e.................................................... + str q26, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + // gap // ............................................................................ + mls v14.8H, v12.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v19.8H, v19.8H, v0.H[0] // ........................e................................................... + // gap // ............................................................................ + sub v24.8H, v15.8H, v20.8H // ........................................................*................... + add v10.8H, v15.8H, v20.8H // .........................................................*.................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.8H, v30.8H, v7.H[0] // .........................e.................................................. + // gap // ............................................................................ + sub v25.8H, v22.8H, v29.8H // .............................................................*.............. + add v2.8H, v22.8H, v29.8H // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + mls v23.8H, v27.8H, v7.H[0] // ..........e................................................................. + str q10, [x0, #112] // ......................................................................*..... + sub v10.8H, v9.8H, v14.8H // ..................................................................*......... + add v6.8H, v9.8H, v14.8H // ...................................................................*........ + str q24, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ 
+ str q2, [x0, #240] // ........................................................................*... + mul v11.8H, v16.8H, v0.H[0] // ...................e........................................................ + // gap // ............................................................................ + str q25, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + str q6, [x0, #368] // ..........................................................................*. + sqrdmulh v6.8H, v17.8H, v0.H[1] // .............e.............................................................. + sub v27.8H, v28.8H, v19.8H // ..........................e................................................. + str q10, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + sub v13.8H, v18.8H, v23.8H // ...........e................................................................ + + // --------------------------------------------------------------- new position ----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q8, [x0, #0] // e..................................................................'........~................................................................. + // ldr q9, [x0, #(1*(512/8))] // ....e..............................................................'............~............................................................. 
+ // ldr q10, [x0, #(2*(512/8))] // ........e..........................................................'................~......................................................... + // ldr q11, [x0, #(3*(512/8))] // .........e.........................................................'.................~........................................................ + // ldr q12, [x0, #(4*(512/8))] // ...............e...................................................'.......................~.................................................. + // ldr q13, [x0, #(5*(512/8))] // ...................................e...............................'...........................................~.............................. + // ldr q14, [x0, #(6*(512/8))] // ....................................e..............................'............................................~............................. + // ldr q15, [x0, #(7*(512/8))] // ......................................e............................'..............................................~........................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .................e.................................................'.........................~................................................ + // mul v24.8h, v12.8h, v0.h[0] // .......................e...........................................'...............................~.......................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................e............'..............................................................~........... + // sub v12.8h, v8.8h, v24.8h // ..................................................................e'.......................................................................... + // add v8.8h, v8.8h, v24.8h // ...................................................................'.......*.................................................................. 
+ // sqrdmulh v27.8h, v13.8h, v0.h[1] // ...............................................................e...'.......................................................................~.. + // mul v24.8h, v13.8h, v0.h[0] // ...................................................................'*......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................'...*...................................................................... + // sub v13.8h, v9.8h, v24.8h // .~.................................................................'.........*................................................................ + // add v9.8h, v9.8h, v24.8h // ..~................................................................'..........*............................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ...................................................................*.......................................................................... + // mul v24.8h, v14.8h, v0.h[0] // ............................................................e......'....................................................................~..... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................'..*....................................................................... + // sub v14.8h, v10.8h, v24.8h // ...................................................................'.....*.................................................................... + // add v10.8h, v10.8h, v24.8h // ......~............................................................'..............*........................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .............................................e.....................'.....................................................~.................... 
+ // mul v24.8h, v15.8h, v0.h[0] // ................................................e..................'........................................................~................. + // mls v24.8h, v27.8h, v7.h[0] // ...................................................e...............'...........................................................~.............. + // sub v15.8h, v11.8h, v24.8h // ................................................................e..'........................................................................~. + // add v11.8h, v11.8h, v24.8h // .....~.............................................................'.............*............................................................ + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ...........~.......................................................'...................*...................................................... + // mul v24.8h, v10.8h, v0.h[2] // ............~......................................................'....................*..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............~....................................................'......................*................................................... + // sub v10.8h, v8.8h, v24.8h // ..................~................................................'..........................*............................................... + // add v8.8h, v8.8h, v24.8h // ...................~...............................................'...........................*.............................................. + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ..........~........................................................'..................*....................................................... + // mul v24.8h, v11.8h, v0.h[2] // .............~.....................................................'.....................*.................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ................~..................................................'........................*................................................. + // sub v11.8h, v9.8h, v24.8h // .....................~.............................................'.............................*............................................ + // add v9.8h, v9.8h, v24.8h // ......................~............................................'..............................*........................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ...~...............................................................'...........*.............................................................. + // mul v24.8h, v14.8h, v0.h[4] // .......~...........................................................'...............*.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................~..............................................'............................*............................................. + // sub v14.8h, v12.8h, v24.8h // .........................~.........................................'.................................*........................................ + // add v12.8h, v12.8h, v24.8h // ...........................~.......................................'...................................*...................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ...................................................................'.*........................................................................ + // mul v24.8h, v15.8h, v0.h[4] // ...................................................................'....*..................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................'......*................................................................... 
+ // sub v15.8h, v13.8h, v24.8h // ............................~......................................'....................................*..................................... + // add v13.8h, v13.8h, v24.8h // ..............................~....................................'......................................*................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // .............................~.....................................'.....................................*.................................... + // mul v24.8h, v9.8h, v0.h[6] // ...............................~...................................'.......................................*.................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................~................................'..........................................*............................... + // sub v9.8h, v8.8h, v24.8h // ........................................~..........................'................................................*......................... + // add v8.8h, v8.8h, v24.8h // ..........................................~........................'..................................................*....................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ........................~..........................................'................................*......................................... + // mul v24.8h, v11.8h, v1.h[0] // ..........................~........................................'..................................*....................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................~.................................'.........................................*................................ + // sub v11.8h, v10.8h, v24.8h // .................................................~.................'.........................................................*................ 
+ // add v10.8h, v10.8h, v24.8h // ..................................................~................'..........................................................*............... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ................................~..................................'........................................*................................. + // mul v24.8h, v13.8h, v1.h[2] // .....................................~.............................'.............................................*............................ + // mls v24.8h, v27.8h, v7.h[0] // .......................................~...........................'...............................................*.......................... + // sub v13.8h, v12.8h, v24.8h // ....................................................~..............'............................................................*............. + // add v12.8h, v12.8h, v24.8h // .....................................................~.............'.............................................................*............ + // sqrdmulh v27.8h, v15.8h, v1.h[5] // .........................................~.........................'.................................................*........................ + // mul v24.8h, v15.8h, v1.h[4] // ...........................................~.......................'...................................................*...................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................~...................'.......................................................*.................. + // sub v15.8h, v14.8h, v24.8h // ........................................................~..........'................................................................*......... + // add v14.8h, v14.8h, v24.8h // .........................................................~.........'.................................................................*........ 
+ // str q8, [x0], #(16) // ............................................~......................'....................................................*..................... + // str q9, [x0, #(-16 + 1*(512/8))] // ..............................................~....................'......................................................*................... + // str q10, [x0, #(-16 + 2*(512/8))] // .......................................................~...........'...............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // ..........................................................~........'..................................................................*....... + // str q12, [x0, #(-16 + 4*(512/8))] // ...........................................................~.......'...................................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // .............................................................~.....'.....................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // ..............................................................~....'......................................................................*... + // str q15, [x0, #(-16 + 7*(512/8))] // .................................................................~.'.........................................................................* + + sub count, count, #1 cbnz count, layer123_start - vins v2, x26, 0 // .*.......................................................................... - mul v29.8H, v20.8H, v0.H[0] // ...*........................................................................ - ldr x15, [x0, #0] // *........................................................................... - vins v24, x27, 1 // ......*..................................................................... 
- ldr x10, [x0, #128] // ....*....................................................................... - ldr x29, [x0, #8] // ..*......................................................................... - mls v29.8H, v22.8H, v7.H[0] // ........*................................................................... - vins v4, x23, 1 // .........*.................................................................. - ldr x18, [x0, #72] // .....*...................................................................... - ldr x22, [x0, #136] // .......*.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - mul v5.8H, v23.8H, v0.H[0] // ............*............................................................... - vins v9, x15, 0 // ..........*................................................................. - // gap // ............................................................................ - vins v30, x10, 0 // ...........*................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v11.8H, v7.H[0] // ..............*............................................................. - vins v2, x18, 1 // .............*.............................................................. - // gap // ............................................................................ - sub v22.8H, v24.8H, v15.8H // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- add v24.8H, v24.8H, v15.8H // ................*........................................................... - mul v11.8H, v4.8H, v0.H[0] // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - vins v9, x29, 1 // ..................*......................................................... - sqrdmulh v4.8H, v4.8H, v0.H[1] // ....................*....................................................... - // gap // ............................................................................ - vins v30, x22, 1 // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v15.8H, v24.8H, v0.H[2] // .....................*...................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.8H, v24.8H, v0.H[3] // ......................*..................................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v11.8H, v4.8H, v7.H[0] // .........................*.................................................. - sub v4.8H, v9.8H, v29.8H // .......................*.................................................... - // gap // ............................................................................ - add v29.8H, v9.8H, v29.8H // ........................*................................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v9.8H, v30.8H, v5.8H // ..........................*................................................. - mul v12.8H, v22.8H, v0.H[4] // ............................*............................................... - // gap // ............................................................................ - add v5.8H, v30.8H, v5.8H // ...........................*................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v15.8H, v24.8H, v7.H[0] // .............................*.............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v24.8H, v2.8H, v11.8H // ..............................*............................................. - // gap // ............................................................................ 
- // gap // ............................................................................ - add v2.8H, v2.8H, v11.8H // ...............................*............................................ - sqrdmulh v30.8H, v22.8H, v0.H[5] // ...................................*........................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v22.8H, v5.8H, v0.H[2] // ................................*........................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v11.8H, v2.8H, v15.8H // ..................................*......................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v5.8H, v5.8H, v0.H[3] // .................................*.......................................... - add v2.8H, v2.8H, v15.8H // ....................................*....................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v15.8H, v9.8H, v0.H[4] // .....................................*...................................... 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v9.8H, v9.8H, v0.H[5] // .......................................*.................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v22.8H, v5.8H, v7.H[0] // ......................................*..................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v12.8H, v30.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - mul v5.8H, v2.8H, v0.H[6] // ...........................................*................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v30.8H, v29.8H, v22.8H // .........................................*.................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v29.8H, v29.8H, v22.8H // ..........................................*................................. - mls v15.8H, v9.8H, v7.H[0] // ............................................*............................... - // gap // ............................................................................ - sub v9.8H, v24.8H, v12.8H // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - add v24.8H, v24.8H, v12.8H // ..............................................*............................. - sqrdmulh v2.8H, v2.8H, v0.H[7] // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- mul v22.8H, v11.8H, v1.H[0] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v12.8H, v4.8H, v15.8H // .................................................*.......................... - // gap // ............................................................................ - // gap // ............................................................................ - add v4.8H, v4.8H, v15.8H // ..................................................*......................... - sqrdmulh v11.8H, v11.8H, v1.H[1] // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v2.8H, v7.H[0] // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v2.8H, v24.8H, v1.H[2] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v22.8H, v11.8H, v7.H[0] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - add v11.8H, v29.8H, v5.8H // ........................................................*................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.8H, v24.8H, v1.H[3] // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vi v11, x0, 16 // ............................................................*............... - sqrdmulh v11.8H, v9.8H, v1.H[5] // ...........................................................*................ - // gap // ............................................................................ - sub v15.8H, v30.8H, v22.8H // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ 
- add v30.8H, v30.8H, v22.8H // ..............................................................*............. - mul v9.8H, v9.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v2.8H, v24.8H, v7.H[0] // ................................................................*........... - str_vo v15, x0, 176 // .................................................................*.......... - // gap // ............................................................................ - str_vo v30, x0, 112 // ..................................................................*......... - // gap // ............................................................................ - // gap // ............................................................................ - mls v9.8H, v11.8H, v7.H[0] // ...................................................................*........ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v29.8H, v29.8H, v5.8H // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ 
- sub v24.8H, v4.8H, v2.8H // ....................................................................*....... - add v2.8H, v4.8H, v2.8H // .....................................................................*...... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v4.8H, v12.8H, v9.8H // ......................................................................*..... - add v5.8H, v12.8H, v9.8H // .......................................................................*.... - str_vo v29, x0, 48 // ..........................................................*................. - str_vo v24, x0, 304 // ........................................................................*... - str_vo v2, x0, 240 // .........................................................................*.. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v4, x0, 432 // ..........................................................................*. - str_vo v5, x0, 368 // ...........................................................................* - // gap // ............................................................................ - - // original source code - // ldr x15, [x0, #0] // ..*......................................................................... || *.................................................................... 
- // vins v2, x26, 0 // *........................................................................... || *.................................................................... - // ldr x10, [x0, #8] // .....*...................................................................... || .*................................................................... - // mul v29.8H, v20.8H, v0.H[0] // .*.......................................................................... || *.................................................................... - // ldr x29, [x0, #128] // ....*....................................................................... || .*................................................................... - // ldr x18, [x0, #72] // ........*................................................................... || ..*.................................................................. - // vins v24, x27, 1 // ...*........................................................................ || .*................................................................... - // ldr x25, [x0, #136] // .........*.................................................................. || ...*................................................................. - // mls v29.8H, v22.8H, v7.H[0] // ......*..................................................................... || ..*.................................................................. - // vins v4, x23, 1 // .......*.................................................................... || ..*.................................................................. - // vins v5, x15, 0 // ...........*................................................................ || ....*................................................................ - // vins v9, x29, 0 // ............*............................................................... || .....*............................................................... 
- // mul v30.8H, v23.8H, v0.H[0] // ..........*................................................................. || ....*................................................................ - // vins v2, x18, 1 // ..............*............................................................. || ......*.............................................................. - // mls v30.8H, v11.8H, v7.H[0] // .............*.............................................................. || ......*.............................................................. - // sub v11.8H, v24.8H, v15.8H // ...............*............................................................ || .......*............................................................. - // add v15.8H, v24.8H, v15.8H // ................*........................................................... || ........*............................................................ - // mul v12.8H, v4.8H, v0.H[0] // .................*.......................................................... || ........*............................................................ - // vins v5, x10, 1 // ..................*......................................................... || ..........*.......................................................... - // vins v9, x25, 1 // ....................*....................................................... || ...........*......................................................... - // sqrdmulh v23.8H, v4.8H, v0.H[1] // ...................*........................................................ || ..........*.......................................................... - // mul v8.8H, v15.8H, v0.H[2] // .....................*...................................................... || ............*........................................................ - // sqrdmulh v15.8H, v15.8H, v0.H[3] // ......................*..................................................... 
|| ..............*...................................................... - // sub v18.8H, v5.8H, v29.8H // ........................*................................................... || ................*.................................................... - // add v29.8H, v5.8H, v29.8H // .........................*.................................................. || .................*................................................... - // mls v12.8H, v23.8H, v7.H[0] // .......................*.................................................... || ................*.................................................... - // sub v5.8H, v9.8H, v30.8H // ..........................*................................................. || ..................*.................................................. - // add v9.8H, v9.8H, v30.8H // ............................*............................................... || ...................*................................................. - // mul v30.8H, v11.8H, v0.H[4] // ...........................*................................................ || ..................*.................................................. - // mls v8.8H, v15.8H, v7.H[0] // .............................*.............................................. || ....................*................................................ - // sub v15.8H, v2.8H, v12.8H // ..............................*............................................. || .....................*............................................... - // add v2.8H, v2.8H, v12.8H // ...............................*............................................ || ......................*.............................................. - // mul v12.8H, v9.8H, v0.H[2] // .................................*.......................................... || ........................*............................................ 
- // sqrdmulh v9.8H, v9.8H, v0.H[3] // ...................................*........................................ || ..........................*.......................................... - // sub v3.8H, v2.8H, v8.8H // ..................................*......................................... || .........................*........................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ................................*........................................... || ......................*.............................................. - // add v2.8H, v2.8H, v8.8H // ....................................*....................................... || ..........................*.......................................... - // mul v8.8H, v5.8H, v0.H[4] // .....................................*...................................... || ............................*........................................ - // mls v12.8H, v9.8H, v7.H[0] // .......................................*.................................... || ................................*.................................... - // sqrdmulh v5.8H, v5.8H, v0.H[5] // ......................................*..................................... || ..............................*...................................... - // mls v30.8H, v11.8H, v7.H[0] // ........................................*................................... || ..................................*.................................. - // sub v9.8H, v29.8H, v12.8H // ..........................................*................................. || .....................................*............................... - // add v29.8H, v29.8H, v12.8H // ...........................................*................................ || ......................................*.............................. - // mul v11.8H, v2.8H, v0.H[6] // .........................................*.................................. 
|| ....................................*................................ - // mls v8.8H, v5.8H, v7.H[0] // ............................................*............................... || ......................................*.............................. - // sub v5.8H, v15.8H, v30.8H // .............................................*.............................. || .......................................*............................. - // add v30.8H, v15.8H, v30.8H // ..............................................*............................. || ........................................*............................ - // sqrdmulh v2.8H, v2.8H, v0.H[7] // ...............................................*............................ || ........................................*............................ - // mul v15.8H, v3.8H, v1.H[0] // ................................................*........................... || ..........................................*.......................... - // sub v12.8H, v18.8H, v8.8H // .................................................*.......................... || ...........................................*......................... - // add v8.8H, v18.8H, v8.8H // ..................................................*......................... || ............................................*........................ - // sqrdmulh v18.8H, v3.8H, v1.H[1] // ...................................................*........................ || ............................................*........................ - // mls v11.8H, v2.8H, v7.H[0] // ....................................................*....................... || ..............................................*...................... - // mul v2.8H, v30.8H, v1.H[2] // .....................................................*...................... || ................................................*.................... 
- // mls v15.8H, v18.8H, v7.H[0] // ......................................................*..................... || ..................................................*.................. - // sub v18.8H, v29.8H, v11.8H // ..................................................................*......... || ..............................................................*...... - // add v29.8H, v29.8H, v11.8H // .......................................................*.................... || ...................................................*................. - // sqrdmulh v30.8H, v30.8H, v1.H[3] // ........................................................*................... || ....................................................*................ - // str_vo v18, x0, 64 // .......................................................................*.... || .................................................................*... - // sqrdmulh v11.8H, v5.8H, v1.H[5] // ..........................................................*................. || ......................................................*.............. - // str_vi v29, x0, 16 // .........................................................*.................. || ......................................................*.............. - // sub v29.8H, v9.8H, v15.8H // ...........................................................*................ || .......................................................*............. - // add v9.8H, v9.8H, v15.8H // ............................................................*............... || ........................................................*............ - // mul v5.8H, v5.8H, v1.H[4] // .............................................................*.............. || ........................................................*............ - // mls v2.8H, v30.8H, v7.H[0] // ..............................................................*............. 
|| ..........................................................*.......... - // str_vo v29, x0, 176 // ...............................................................*............ || ..........................................................*.......... - // str_vo v9, x0, 112 // ................................................................*........... || ...........................................................*......... - // mls v5.8H, v11.8H, v7.H[0] // .................................................................*.......... || ............................................................*........ - // sub v29.8H, v8.8H, v2.8H // ...................................................................*........ || ...............................................................*..... - // add v2.8H, v8.8H, v2.8H // ....................................................................*....... || ...............................................................*..... - // sub v30.8H, v12.8H, v5.8H // .....................................................................*...... || .................................................................*... - // add v5.8H, v12.8H, v5.8H // ......................................................................*..... || .................................................................*... - // str_vo v29, x0, 304 // ........................................................................*... || ..................................................................*.. - // str_vo v2, x0, 240 // .........................................................................*.. || ..................................................................*.. - // str_vo v30, x0, 432 // ..........................................................................*. 
|| ....................................................................* - // str_vo v5, x0, 368 // ...........................................................................* || ....................................................................* - + // Instructions: 58 + // Expected cycles: 63 + // Expected IPC: 0.92 + // + // Cycle bound: 63.0 + // IPC bound: 0.92 + // + // Wall time: 1.91s + // User time: 1.91s + // + // ------------------- original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------- + add v9.8H, v18.8H, v23.8H // ........*................................................. + sqrdmulh v3.8H, v27.8H, v0.H[5] // ..*....................................................... + // gap // .......................................................... + add v23.8H, v28.8H, v19.8H // ............*............................................. + // gap // .......................................................... + // gap // .......................................................... + mul v29.8H, v27.8H, v0.H[4] // .....*.................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v2.8H, v23.8H, v0.H[3] // ...............*.......................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + sqrdmulh v30.8H, v16.8H, v0.H[1] // *......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v22.8H, v23.8H, v0.H[2] // ..................*....................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v29.8H, v3.8H, v7.H[0] // .......*.................................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v11.8H, v30.8H, v7.H[0] // ...*...................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v3.8H, v17.8H, v0.H[0] // .*........................................................ + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v3.8H, v6.8H, v7.H[0] // ....*..................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v23.8H, v4.8H, v11.8H // ......*................................................... + // gap // .......................................................... + mls v22.8H, v2.8H, v7.H[0] // ....................*..................................... + // gap // .......................................................... + // gap // .......................................................... + add v14.8H, v4.8H, v11.8H // .............*............................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v20.8H, v23.8H, v0.H[4] // ..............*........................................... + // gap // .......................................................... + sub v16.8H, v5.8H, v3.8H // .........*................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v23.8H, v23.8H, v0.H[5] // ...........*.............................................. + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v11.8H, v14.8H, v0.H[3] // ................*......................................... + add v6.8H, v16.8H, v29.8H // ................................*......................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v27.8H, v14.8H, v0.H[2] // .................*........................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v19.8H, v16.8H, v29.8H // ..............................*........................... + // gap // .......................................................... + mls v20.8H, v23.8H, v7.H[0] // .......................*.................................. + add v29.8H, v5.8H, v3.8H // ..........*............................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v27.8H, v11.8H, v7.H[0] // ...................*...................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ add v8.8H, v29.8H, v22.8H // .........................*................................ + mul v10.8H, v19.8H, v1.H[4] // ..........................................*............... + // gap // .......................................................... + // gap // .......................................................... + add v18.8H, v13.8H, v20.8H // .............................*............................ + // gap // .......................................................... + sub v17.8H, v29.8H, v22.8H // ........................*................................. + sqrdmulh v16.8H, v6.8H, v1.H[3] // ..................................*....................... + // gap // .......................................................... + sub v26.8H, v13.8H, v20.8H // ...........................*.............................. + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v20.8H, v8.8H, v0.H[7] // ...............................*.......................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v14.8H, v17.8H, v1.H[1] // ..........................*............................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v23.8H, v8.8H, v0.H[6] // .................................*........................ 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v28.8H, v9.8H, v27.8H // .....................*.................................... + mls v23.8H, v20.8H, v7.H[0] // ....................................*..................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v12.8H, v17.8H, v1.H[0] // ............................*............................. + add v17.8H, v9.8H, v27.8H // ......................*................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v12.8H, v14.8H, v7.H[0] // ...................................*...................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v5.8H, v17.8H, v23.8H // .........................................*................ + sub v14.8H, v17.8H, v23.8H // .......................................*.................. + // gap // .......................................................... 
+ sqrdmulh v23.8H, v19.8H, v1.H[5] // ........................................*................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v19.8H, v6.8H, v1.H[2] // .....................................*.................... + // gap // .......................................................... + // gap // .......................................................... + str q14, [x0, #64] // ............................................*............. + sub v6.8H, v28.8H, v12.8H // ..............................................*........... + // gap // .......................................................... + mls v19.8H, v16.8H, v7.H[0] // ......................................*................... + // gap // .......................................................... + add v31.8H, v28.8H, v12.8H // ...............................................*.......... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + str q6, [x0, #192] // .....................................................*.... + mls v10.8H, v23.8H, v7.H[0] // .............................................*............ + // gap // .......................................................... + str q31, [x0, #128] // ..................................................*....... + // gap // .......................................................... + // gap // .......................................................... + str q5, [x0], #(16) // ...........................................*.............. + // gap // .......................................................... + // gap // .......................................................... 
+ sub v21.8H, v18.8H, v19.8H // ................................................*......... + add v12.8H, v18.8H, v19.8H // .................................................*........ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + add v13.8H, v26.8H, v10.8H // ....................................................*..... + // gap // .......................................................... + sub v17.8H, v26.8H, v10.8H // ...................................................*...... + str q21, [x0, #304] // .......................................................*.. + str q12, [x0, #240] // ......................................................*... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + str q13, [x0, #368] // ........................................................*. + // gap // .......................................................... + str q17, [x0, #432] // .........................................................* + + // --------------------- new position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------- + // sqrdmulh v29.8H, v16.8H, v0.H[1] // .....*.................................................... + // mul v26.8H, v17.8H, v0.H[0] // .........*................................................ + // sqrdmulh v15.8H, v27.8H, v0.H[5] // .*........................................................ + // mls v11.8H, v29.8H, v7.H[0] // ........*................................................. 
+ // mls v26.8H, v6.8H, v7.H[0] // ..........*............................................... + // mul v25.8H, v27.8H, v0.H[4] // ...*...................................................... + // sub v16.8H, v4.8H, v11.8H // ...........*.............................................. + // mls v25.8H, v15.8H, v7.H[0] // .......*.................................................. + // add v29.8H, v18.8H, v23.8H // *......................................................... + // sub v31.8H, v5.8H, v26.8H // ...............*.......................................... + // add v14.8H, v5.8H, v26.8H // ......................*................................... + // sqrdmulh v21.8H, v16.8H, v0.H[5] // ................*......................................... + // add v2.8H, v28.8H, v19.8H // ..*....................................................... + // add v11.8H, v4.8H, v11.8H // .............*............................................ + // mul v26.8H, v16.8H, v0.H[4] // ..............*........................................... + // sqrdmulh v9.8H, v2.8H, v0.H[3] // ....*..................................................... + // sqrdmulh v19.8H, v11.8H, v0.H[3] // .................*........................................ + // mul v16.8H, v11.8H, v0.H[2] // ...................*...................................... + // mul v11.8H, v2.8H, v0.H[2] // ......*................................................... + // mls v16.8H, v19.8H, v7.H[0] // .......................*.................................. + // mls v11.8H, v9.8H, v7.H[0] // ............*............................................. + // sub v15.8H, v29.8H, v16.8H // .................................*........................ + // add v3.8H, v29.8H, v16.8H // ....................................*..................... + // mls v26.8H, v21.8H, v7.H[0] // .....................*.................................... + // sub v22.8H, v14.8H, v11.8H // ...........................*.............................. 
+ // add v21.8H, v14.8H, v11.8H // ........................*................................. + // sqrdmulh v2.8H, v22.8H, v1.H[1] // ...............................*.......................... + // sub v9.8H, v13.8H, v26.8H // .............................*............................ + // mul v20.8H, v22.8H, v1.H[0] // ...................................*...................... + // add v22.8H, v13.8H, v26.8H // ..........................*............................... + // sub v13.8H, v31.8H, v25.8H // ....................*..................................... + // sqrdmulh v17.8H, v21.8H, v0.H[7] // ..............................*........................... + // add v26.8H, v31.8H, v25.8H // ..................*....................................... + // mul v31.8H, v21.8H, v0.H[6] // ................................*......................... + // sqrdmulh v12.8H, v26.8H, v1.H[3] // ............................*............................. + // mls v20.8H, v2.8H, v7.H[0] // .....................................*.................... + // mls v31.8H, v17.8H, v7.H[0] // ..................................*....................... + // mul v29.8H, v26.8H, v1.H[2] // .........................................*................ + // mls v29.8H, v12.8H, v7.H[0] // ............................................*............. + // sub v26.8H, v3.8H, v31.8H // .......................................*.................. + // sqrdmulh v12.8H, v13.8H, v1.H[5] // ........................................*................. + // add v30.8H, v3.8H, v31.8H // ......................................*................... + // mul v14.8H, v13.8H, v1.H[4] // .........................*................................ + // str q30, [x0], #(16) // .................................................*........ + // str q26, [x0, #48] // ..........................................*............... + // mls v14.8H, v12.8H, v7.H[0] // ...............................................*.......... 
+ // sub v24.8H, v15.8H, v20.8H // ...........................................*.............. + // add v10.8H, v15.8H, v20.8H // .............................................*............ + // sub v25.8H, v22.8H, v29.8H // ..................................................*....... + // add v2.8H, v22.8H, v29.8H // ...................................................*...... + // str q10, [x0, #112] // ................................................*......... + // sub v10.8H, v9.8H, v14.8H // .....................................................*.... + // add v6.8H, v9.8H, v14.8H // ....................................................*..... + // str q24, [x0, #176] // ..............................................*........... + // str q2, [x0, #240] // .......................................................*.. + // str q25, [x0, #304] // ......................................................*... + // str q6, [x0, #368] // ........................................................*. + // str q10, [x0, #432] // .........................................................* + restore inp, STACK0 mov count, #8 .p2align 2 - // gap // ........................................................................................ - ldr x18, [x3] , #16 // *....................................................................................... - ldr x29, [x1, #48] // .*...................................................................................... - ldr x23, [x1, #56] // ......*................................................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr x7, [x1, #32] // .....*.................................................................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - ldr x26, [x1, #40] // .......*................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr x8, [x1, #16] // ...*.................................................................................... - vins v4, x29, 0 // ..........*............................................................................. - vins v6, x18, 0 // .........*.............................................................................. - ldr x24, [x4, #32] // ...........................*............................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v13, x7, 0 // ..............*......................................................................... - ldr x25, [x3, #-8] // .............*.......................................................................... - // gap // ........................................................................................ - ldr x7, [x4] , #96 // .........................*.............................................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v5, x8, 0 // ...........*............................................................................ - ldr x11, [x4, #-8] // ...................................................................*.................... 
- // gap // ........................................................................................ - ldr x9, [x4, #-32] // ..............................................*......................................... - vins v31, x24, 0 // ..............................*......................................................... - // gap // ........................................................................................ - vins v4, x23, 1 // .....................*.................................................................. - vins v6, x25, 1 // ....................*................................................................... - ldr x10, [x1, #24] // ...............*........................................................................ - ldr x8, [x4, #-16] // .................................................................*...................... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v13, x26, 1 // .......................*................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- vins v5, x10, 1 // ......................*................................................................. - ldr x10, [x1, #0] // ....*................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v17.8H, v4.8H, v6.H[1] // ............................*........................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v9.8H, v13.8H, v6.H[1] // .............................*.......................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v29, x10, 0 // ...................*.................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v10.8H, v4.8H, v6.H[0] // ................................*....................................................... - // gap // ........................................................................................ 
- // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v10.8H, v17.8H, v7.H[0] // .................................*...................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v16.8H, v13.8H, v6.H[0] // ..................................*..................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- mls v16.8H, v9.8H, v7.H[0] // .....................................*.................................................. - add v30.8H, v5.8H, v10.8H // ...................................*.................................................... - // gap // ........................................................................................ - ldr x10, [x1, #8] // ................*....................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v24.8H, v5.8H, v10.8H // ....................................*................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v14.8H, v30.8H, v6.H[2] // ...........................................*............................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - ldr x23, [x4, #-80] // ..*..................................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v29, x10, 1 // ..........................*............................................................. - sqrdmulh v4.8H, v30.8H, v6.H[3] // ......................................*................................................. - // gap // ........................................................................................ 
- // gap // ........................................................................................ - ldr x10, [x4, #-56] // ........*............................................................................... - // gap // ........................................................................................ - sqrdmulh v26.8H, v24.8H, v6.H[5] // ........................................*............................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v10, x23, 0 // .................*...................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v30.8H, v24.8H, v6.H[4] // ................................................*....................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v14.8H, v4.8H, v7.H[0] // .............................................*.......................................... - sub v0.8H, v29.8H, v16.8H // .........................................*.............................................. - // gap // ........................................................................................ 
- add v16.8H, v29.8H, v16.8H // ..........................................*............................................. - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v30.8H, v26.8H, v7.H[0] // .................................................*...................................... - vins v24, x7, 0 // .......................................*................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v31, x10, 1 // ...................................................*.................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - add v2.8H, v16.8H, v14.8H // ..................................................*..................................... - sub v12.8H, v16.8H, v14.8H // ....................................................*................................... - ldr x10, [x4, #-72] // ............*........................................................................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- sub v27.8H, v0.8H, v30.8H // .....................................................*.................................. - add v23.8H, v0.8H, v30.8H // ......................................................*................................. - // gap // ........................................................................................ - trn2 v5.4S, v2.4S, v12.4S // ........................................................*............................... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v10, x10, 1 // ........................*............................................................... - ldr x10, [x4, #-88] // ..................*..................................................................... - // gap // ........................................................................................ - trn2 v4.4S, v23.4S, v27.4S // .........................................................*.............................. - // gap // ........................................................................................ - // gap // ........................................................................................ - trn1 v3.4S, v23.4S, v27.4S // ..........................................................*............................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- trn2 v21.2D, v5.2D, v4.2D // ...........................................................*............................ - vins v24, x10, 1 // ...............................................*........................................ - ldr x10, [x4, #-48] // ...............................*........................................................ - trn1 v29.2D, v5.2D, v4.2D // ............................................................*........................... - // gap // ........................................................................................ - // gap // ........................................................................................ - trn1 v2.4S, v2.4S, v12.4S // .......................................................*................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v15.8H, v21.8H, v10.8H // ...............................................................*........................ - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v30, x10, 0 // ............................................*........................................... - ldr x10, [x4, #-40] // ..............................................................*......................... - // gap // ........................................................................................ - trn2 v28.2D, v2.2D, v3.2D // .............................................................*.......................... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- trn1 v1.2D, v2.2D, v3.2D // ................................................................*....................... - mul v16.8H, v21.8H, v24.8H // ......................................................................*................. - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v16.8H, v15.8H, v7.H[0] // ........................................................................*............... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v30, x10, 1 // ....................................................................*................... - ldr x10, [x4, #-24] // .....................................................................*.................. - sqrdmulh v12.8H, v28.8H, v10.8H // ..................................................................*..................... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- mul v15.8H, v28.8H, v24.8H // ..........................................................................*............. - vins v24, x9, 0 // .........................................................................*.............. - // gap // ........................................................................................ - add v14.8H, v29.8H, v16.8H // ...........................................................................*............ - // gap // ........................................................................................ - // gap // ........................................................................................ - sub v8.8H, v29.8H, v16.8H // ...............................................................................*........ - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v15.8H, v12.8H, v7.H[0] // ............................................................................*........... - vins v16, x8, 0 // .......................................................................*................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v2.8H, v14.8H, v30.8H // ................................................................................*....... - // gap // ........................................................................................ - // gap // ........................................................................................ 
- vins v24, x10, 1 // .............................................................................*.......... - // gap // ........................................................................................ - // gap // ........................................................................................ - mul v29.8H, v14.8H, v31.8H // .................................................................................*...... - // gap // ........................................................................................ - // gap // ........................................................................................ - add v10.8H, v1.8H, v15.8H // ...................................................................................*.... - // gap // ........................................................................................ - // gap // ........................................................................................ - vins v16, x11, 1 // ..............................................................................*......... - // gap // ........................................................................................ - // gap // ........................................................................................ - mls v29.8H, v2.8H, v7.H[0] // ....................................................................................*... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ 
- mul v25.8H, v8.8H, v24.8H // ..................................................................................*..... - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - // gap // ........................................................................................ - sqrdmulh v19.8H, v8.8H, v16.8H // .....................................................................................*.. - sub v6.8H, v10.8H, v29.8H // ......................................................................................*. - // gap // ........................................................................................ - add v22.8H, v10.8H, v29.8H // .......................................................................................* - // gap // ........................................................................................ - // gap // ........................................................................................ - - // original source code - // ldr x24, [x3] , #16 // *....................................................................................... || *............................................................................... - // ldr x29, [x1, #48] // .*...................................................................................... 
|| *............................................................................... - // ldr x25, [x4, #16] // ..................................*..................................................... || ...............................*................................................ - // ldr x23, [x1, #16] // .....*.................................................................................. || ....*........................................................................... - // ldr x21, [x1, #0] // ......................*................................................................. || ...............*................................................................ - // ldr x26, [x1, #32] // ...*.................................................................................... || ..*............................................................................. - // ldr x22, [x1, #56] // ..*..................................................................................... || .*.............................................................................. - // ldr x9, [x1, #40] // ....*................................................................................... || ...*............................................................................ - // ldr x10, [x4, #40] // .....................................*.................................................. || .................................*.............................................. - // vins v5, x24, 0 // .......*................................................................................ || ....*........................................................................... - // vins v26, x29, 0 // ......*................................................................................. || ....*........................................................................... - // vins v17, x23, 0 // ............*........................................................................... 
|| ........*....................................................................... - // ldr x28, [x4, #24] // .................................................*...................................... || ...........................................*.................................... - // ldr x29, [x3, #-8] // ..........*............................................................................. || ......*......................................................................... - // vins v23, x26, 0 // .........*.............................................................................. || ......*......................................................................... - // ldr x20, [x1, #24] // ..................*..................................................................... || ..........*..................................................................... - // ldr x24, [x1, #8] // ...............................*........................................................ || ............................*................................................... - // vins v11, x25, 0 // .......................................*................................................ || ...................................*............................................ - // ldr x26, [x4, #8] // ......................................................*................................. || ...............................................*................................ - // vins v27, x21, 0 // .........................*.............................................................. || ...................*............................................................ - // vins v5, x29, 1 // .................*...................................................................... || ..........*..................................................................... - // vins v26, x22, 1 // ................*....................................................................... 
|| ..........*..................................................................... - // vins v17, x20, 1 // .....................*.................................................................. || ..............*................................................................. - // vins v23, x9, 1 // ....................*................................................................... || ............*................................................................... - // vins v11, x28, 1 // .....................................................*.................................. || ...............................................*................................ - // ldr x19, [x4] , #96 // ...........*............................................................................ || .......*........................................................................ - // vins v27, x24, 1 // ...................................*.................................................... || ................................*............................................... - // ldr x24, [x4, #-64] // ........*............................................................................... || .....*.......................................................................... - // sqrdmulh v10.8H, v26.8H, v5.H[1] // .......................*................................................................ || ................*............................................................... - // sqrdmulh v2.8H, v23.8H, v5.H[1] // ........................*............................................................... || ..................*............................................................. - // vins v8, x24, 0 // ...............*........................................................................ || .........*...................................................................... 
- // ldr x24, [x4, #-48] // ...........................................................*............................ || ...................................................*............................ - // mul v6.8H, v26.8H, v5.H[0] // ..........................*............................................................. || ....................*........................................................... - // mls v6.8H, v10.8H, v7.H[0] // ...........................*............................................................ || ......................*......................................................... - // mul v19.8H, v23.8H, v5.H[0] // ............................*........................................................... || ........................*....................................................... - // add v26.8H, v17.8H, v6.8H // ..............................*......................................................... || ...........................*.................................................... - // sub v3.8H, v17.8H, v6.8H // ................................*....................................................... || .............................*.................................................. - // mls v19.8H, v2.8H, v7.H[0] // .............................*.......................................................... || ...........................*.................................................... - // sqrdmulh v16.8H, v26.8H, v5.H[3] // ....................................*................................................... || ................................*............................................... - // vins v2, x19, 0 // .............................................*.......................................... || ........................................*....................................... - // sqrdmulh v15.8H, v3.8H, v5.H[5] // ......................................*................................................. 
|| ..................................*............................................. - // sub v21.8H, v27.8H, v19.8H // ..........................................*............................................. || ......................................*......................................... - // add v10.8H, v27.8H, v19.8H // ...........................................*............................................ || .......................................*........................................ - // mul v6.8H, v26.8H, v5.H[2] // .................................*...................................................... || ..............................*................................................. - // vins v23, x24, 0 // ...............................................................*........................ || .......................................................*........................ - // mls v6.8H, v16.8H, v7.H[0] // .........................................*.............................................. || ......................................*......................................... - // ldr x24, [x4, #-32] // ..............*......................................................................... || .........*...................................................................... - // vins v2, x26, 1 // ..........................................................*............................. || ...................................................*............................ - // mul v24.8H, v3.8H, v5.H[4] // ........................................*............................................... || ....................................*........................................... - // mls v24.8H, v15.8H, v7.H[0] // ............................................*........................................... || ........................................*....................................... 
- // add v15.8H, v10.8H, v6.8H // ...............................................*........................................ || ...........................................*.................................... - // vins v8, x10, 1 // ..............................................*......................................... || ..........................................*..................................... - // sub v31.8H, v10.8H, v6.8H // ................................................*....................................... || ...........................................*.................................... - // sub v27.8H, v21.8H, v24.8H // ..................................................*..................................... || .............................................*.................................. - // add v24.8H, v21.8H, v24.8H // ...................................................*.................................... || .............................................*.................................. - // trn1 v18.4S, v15.4S, v31.4S // .............................................................*.......................... || .....................................................*.......................... - // trn2 v12.4S, v15.4S, v31.4S // ....................................................*................................... || ..............................................*................................. - // trn2 v3.4S, v24.4S, v27.4S // .......................................................*................................ || ................................................*............................... - // trn1 v0.4S, v24.4S, v27.4S // ........................................................*............................... || .................................................*.............................. - // trn2 v20.2D, v12.2D, v3.2D // .........................................................*.............................. 
|| ...................................................*............................ - // trn1 v9.2D, v12.2D, v3.2D // ............................................................*........................... || ....................................................*........................... - // trn2 v19.2D, v18.2D, v0.2D // .................................................................*...................... || ........................................................*....................... - // ldr x10, [x4, #-40] // ................................................................*....................... || .......................................................*........................ - // sqrdmulh v31.8H, v20.8H, v11.8H // ..............................................................*......................... || ......................................................*......................... - // trn1 v1.2D, v18.2D, v0.2D // ..................................................................*..................... || .........................................................*...................... - // ldr x8, [x4, #-16] // ...................*.................................................................... || ...........*.................................................................... - // sqrdmulh v28.8H, v19.8H, v11.8H // .......................................................................*................ || .............................................................*.................. - // ldr x11, [x4, #-8] // .............*.......................................................................... || ........*....................................................................... - // vins v23, x10, 1 // .....................................................................*.................. || .............................................................*.................. 
- // ldr x10, [x4, #-24] // ......................................................................*................. || .............................................................*.................. - // mul v30.8H, v20.8H, v2.8H // ...................................................................*.................... || .........................................................*...................... - // vins v17, x8, 0 // .............................................................................*.......... || ..................................................................*............. - // mls v30.8H, v31.8H, v7.H[0] // ....................................................................*................... || ...........................................................*.................... - // vins v31, x24, 0 // .........................................................................*.............. || ...............................................................*................ - // mul v15.8H, v19.8H, v2.8H // ........................................................................*............... || ...............................................................*................ - // add v10.8H, v9.8H, v30.8H // ..........................................................................*............. || ................................................................*............... - // mls v15.8H, v28.8H, v7.H[0] // ............................................................................*........... || ..................................................................*............. - // vins v31, x10, 1 // ...............................................................................*........ || .....................................................................*.......... - // vins v17, x11, 1 // ..................................................................................*..... 
|| ........................................................................*....... - // sub v13.8H, v9.8H, v30.8H // ...........................................................................*............ || .................................................................*.............. - // sqrdmulh v3.8H, v10.8H, v23.8H // ..............................................................................*......... || ....................................................................*........... - // mul v9.8H, v10.8H, v8.8H // ................................................................................*....... || ......................................................................*......... - // mul v25.8H, v13.8H, v31.8H // ....................................................................................*... || ...........................................................................*.... - // add v31.8H, v1.8H, v15.8H // .................................................................................*...... || .......................................................................*........ - // mls v9.8H, v3.8H, v7.H[0] // ...................................................................................*.... || .........................................................................*...... - // sqrdmulh v19.8H, v13.8H, v17.8H // .....................................................................................*.. || ..............................................................................*. - // sub v6.8H, v31.8H, v9.8H // ......................................................................................*. || ..............................................................................*. 
- // add v22.8H, v31.8H, v9.8H // .......................................................................................* || ...............................................................................* - + // Instructions: 37 + // Expected cycles: 49 + // Expected IPC: 0.76 + // + // Cycle bound: 49.0 + // IPC bound: 0.76 + // + // Wall time: 0.65s + // User time: 0.65s + // + // -------- original position ---------> + // 0 25 + // |------------------------|----------- + ldr q22, [x1, #48] // ....*................................ + ldr q15, [x3], #16 // .*................................... + // gap // ..................................... + ldr q16, [x1, #32] // *.................................... + ldr q23, [x1, #16] // ...*................................. + // gap // ..................................... + ldr q26, [x1, #0] // ..*.................................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v20.8H, v22.8H, v15.H[1] // ......*.............................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v30.8H, v22.8H, v15.H[0] // ........*............................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v8.8H, v16.8H, v15.H[1] // .......*............................. + // gap // ..................................... + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v30.8H, v20.8H, v7.H[0] // .........*........................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v2.8H, v16.8H, v15.H[0] // .....*............................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v2.8H, v8.8H, v7.H[0] // ..........*.......................... + // gap // ..................................... + // gap // ..................................... + sub v20.8H, v23.8H, v30.8H // ...........*......................... + // gap // ..................................... + // gap // ..................................... + add v28.8H, v23.8H, v30.8H // ............*........................ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v6.8H, v20.8H, v15.H[5] // .................*................... + // gap // ..................................... + // gap // ..................................... + sub v29.8H, v26.8H, v2.8H // .............*....................... + // gap // ..................................... + // gap // ..................................... + add v23.8H, v26.8H, v2.8H // ...............*..................... + mul v4.8H, v20.8H, v15.H[4] // ..............*...................... + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v21.8H, v28.8H, v15.H[3] // ................*.................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v4.8H, v6.8H, v7.H[0] // ..................*.................. + ldr q6, [x4], #(6*16) // .............................*....... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mul v13.8H, v28.8H, v15.H[2] // ...................*................. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v13.8H, v21.8H, v7.H[0] // ....................*................ + // gap // ..................................... + // gap // ..................................... + add v30.8H, v29.8H, v4.8H // ......................*.............. + // gap // ..................................... + // gap // ..................................... + sub v10.8H, v29.8H, v4.8H // .....................*............... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... 
+ sub v31.8H, v23.8H, v13.8H // .......................*............. + add v3.8H, v23.8H, v13.8H // ........................*............ + ldr q23, [x4, #-80] // .........................*........... + trn1 v18.4S, v30.4S, v10.4S // ..........................*.......... + trn2 v0.4S, v30.4S, v10.4S // ...............................*..... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn1 v4.4S, v3.4S, v31.4S // ...........................*......... + trn2 v28.4S, v3.4S, v31.4S // ............................*........ + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + trn2 v15.2D, v4.2D, v18.2D // ..............................*...... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v19.8H, v15.8H, v23.8H // ................................*.... + // gap // ..................................... + // gap // ..................................... + trn2 v27.2D, v28.2D, v0.2D // .................................*... + // gap // ..................................... + // gap // ..................................... + mul v30.8H, v15.8H, v6.8H // ..................................*.. + // gap // ..................................... 
+ // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + sqrdmulh v29.8H, v27.8H, v23.8H // ...................................*. + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + // gap // ..................................... + mls v30.8H, v19.8H, v7.H[0] // ....................................* + // gap // ..................................... + // gap // ..................................... + + // ----------- new position -----------> + // 0 25 + // |------------------------|----------- + // ldr q26, [x1, #32] // ..*.................................. + // ldr q12, [x3], #16 // .*................................... + // ldr q10, [x1, #0] // ....*................................ + // ldr q31, [x1, #16] // ...*................................. + // ldr q1, [x1, #48] // *.................................... + // mul v15.8H, v26.8H, v12.H[0] // .........*........................... + // sqrdmulh v0.8H, v1.8H, v12.H[1] // .....*............................... + // sqrdmulh v17.8H, v26.8H, v12.H[1] // .......*............................. + // mul v6.8H, v1.8H, v12.H[0] // ......*.............................. + // mls v6.8H, v0.8H, v7.H[0] // ........*............................ + // mls v15.8H, v17.8H, v7.H[0] // ..........*.......................... + // sub v9.8H, v31.8H, v6.8H // ...........*......................... + // add v23.8H, v31.8H, v6.8H // ............*........................ + // sub v6.8H, v10.8H, v15.8H // ..............*...................... + // mul v3.8H, v9.8H, v12.H[4] // ................*.................... + // add v21.8H, v10.8H, v15.8H // ...............*..................... 
+ // sqrdmulh v13.8H, v23.8H, v12.H[3] // .................*................... + // sqrdmulh v1.8H, v9.8H, v12.H[5] // .............*....................... + // mls v3.8H, v1.8H, v7.H[0] // ..................*.................. + // mul v14.8H, v23.8H, v12.H[2] // ....................*................ + // mls v14.8H, v13.8H, v7.H[0] // .....................*............... + // sub v17.8H, v6.8H, v3.8H // .......................*............. + // add v3.8H, v6.8H, v3.8H // ......................*.............. + // sub v24.8H, v21.8H, v14.8H // ........................*............ + // add v9.8H, v21.8H, v14.8H // .........................*........... + // ldr q14, [x4, #16] // ..........................*.......... + // trn1 v18.4S, v3.4S, v17.4S // ...........................*......... + // trn1 v4.4S, v9.4S, v24.4S // .............................*....... + // trn2 v28.4S, v9.4S, v24.4S // ..............................*...... + // ldr q6, [x4], #(6*16) // ...................*................. + // trn2 v2.2D, v4.2D, v18.2D // ...............................*..... + // trn2 v0.4S, v3.4S, v17.4S // ............................*........ + // sqrdmulh v15.8H, v2.8H, v14.8H // ................................*.... + // trn2 v27.2D, v28.2D, v0.2D // .................................*... + // mul v30.8H, v2.8H, v6.8H // ..................................*.. + // sqrdmulh v29.8H, v27.8H, v14.8H // ...................................*. + // mls v30.8H, v15.8H, v7.H[0] // ....................................* + sub count, count, #1 -.p2align 2 layer4567_start: - mls v25.8H, v19.8H, v7.H[0] // .........................................................................................*.................................. - ldr x24, [x3] , #16 // ................e........................................................................................................... 
- ldr x29, [x1, #112] // ............e............................................................................................................... - trn2 v29.4S, v22.4S, v6.4S // .............................................................................................*.............................. - ldr x25, [x4, #16] // ....................................................e....................................................................... - ldr x23, [x1, #80] // ....e....................................................................................................................... - trn1 v30.4S, v22.4S, v6.4S // ............................................................................................*............................... - ldr x21, [x1, #64] // e........................................................................................................................... - ldr x26, [x1, #96] // ........e................................................................................................................... - ldr x22, [x1, #120] // .............e.............................................................................................................. - ldr x9, [x1, #104] // .........e.................................................................................................................. - // gap // ............................................................................................................................ - ldr x10, [x4, #40] // .........................................................e.................................................................. - vins v5, x24, 0 // ..................e......................................................................................................... - vins v26, x29, 0 // ..............e............................................................................................................. 
- vins v17, x23, 0 // ......e..................................................................................................................... - sqdmulh v18.8H, v29.8H, v7.H[1] // ...................................................................................................*........................ - ldr x28, [x4, #24] // .....................................................e...................................................................... - ldr x29, [x3, #-8] // .................e.......................................................................................................... - vins v23, x26, 0 // ..........e................................................................................................................. - ldr x20, [x1, #88] // .....e...................................................................................................................... - sqdmulh v13.8H, v30.8H, v7.H[1] // ................................................................................................*........................... - ldr x24, [x1, #72] // .e.......................................................................................................................... - // gap // ............................................................................................................................ - vins v11, x25, 0 // ......................................................e..................................................................... - ldr x26, [x4, #8] // .................................................e.......................................................................... - // gap // ............................................................................................................................ - vins v27, x21, 0 // ..e......................................................................................................................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vins v5, x29, 1 // ...................e........................................................................................................ - vins v26, x22, 1 // ...............e............................................................................................................ - // gap // ............................................................................................................................ - vins v17, x20, 1 // .......e.................................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vins v23, x9, 1 // ...........e................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v14.8H, v1.8H, v15.8H // ...........................................................................*................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - vins v11, x28, 1 // .......................................................e.................................................................... - ldr x19, [x4] , #96 // ................................................e........................................................................... - vins v27, x24, 1 // ...e........................................................................................................................ - ldr x24, [x4, #-64] // ........................................................e................................................................... - // gap // ............................................................................................................................ - srshr v28.8H, v13.8H, #11 // .................................................................................................*.......................... - sqrdmulh v10.8H, v26.8H, v5.H[1] // ..........................e................................................................................................. - // gap // ............................................................................................................................ - srshr v0.8H, v18.8H, #11 // ....................................................................................................*....................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqrdmulh v2.8H, v23.8H, v5.H[1] // .....................e...................................................................................................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vins v8, x24, 0 // ..........................................................e................................................................. - ldr x24, [x4, #-48] // ............................................................e............................................................... - // gap // ............................................................................................................................ - mul v6.8H, v26.8H, v5.H[0] // .........................e.................................................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v6.8H, v10.8H, v7.H[0] // ...........................e................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v19.8H, v23.8H, v5.H[0] // ....................e....................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v12.8H, v14.8H, v25.8H // ..........................................................................................*................................. - add v4.8H, v14.8H, v25.8H // ...........................................................................................*................................ - // gap // ............................................................................................................................ - add v26.8H, v17.8H, v6.8H // .............................e.............................................................................................. 
- sub v3.8H, v17.8H, v6.8H // ............................e............................................................................................... - // gap // ............................................................................................................................ - mls v19.8H, v2.8H, v7.H[0] // ......................e..................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v14.4S, v4.4S, v12.4S // ..............................................................................................*............................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v4.4S, v4.4S, v12.4S // ...............................................................................................*............................ - sqrdmulh v16.8H, v26.8H, v5.H[3] // ...............................e............................................................................................ - // gap // ............................................................................................................................ - vins v2, x19, 0 // ..................................................e......................................................................... - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - sqrdmulh v15.8H, v3.8H, v5.H[5] // ....................................e....................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v21.8H, v27.8H, v19.8H // .......................e.................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v10.8H, v27.8H, v19.8H // ........................e................................................................................................... - mul v6.8H, v26.8H, v5.H[2] // ..............................e............................................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- vins v23, x24, 0 // ..............................................................e............................................................. - mls v6.8H, v16.8H, v7.H[0] // ................................e........................................................................................... - ldr x24, [x4, #-32] // ................................................................e........................................................... - vins v2, x26, 1 // ...................................................e........................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v24.8H, v3.8H, v5.H[4] // ...................................e........................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v24.8H, v15.8H, v7.H[0] // .....................................e...................................................................................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v15.8H, v10.8H, v6.8H // ..................................e......................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vins v8, x10, 1 // ...........................................................e................................................................ - sub v31.8H, v10.8H, v6.8H // .................................e.......................................................................................... - // gap // ............................................................................................................................ - sqdmulh v9.8H, v4.8H, v7.H[1] // .........................................................................................................*.................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- sub v27.8H, v21.8H, v24.8H // ......................................e..................................................................................... - add v24.8H, v21.8H, v24.8H // .......................................e.................................................................................... - // gap // ............................................................................................................................ - mls v30.8H, v28.8H, v7.H[0] // ..................................................................................................*......................... - trn1 v18.4S, v15.4S, v31.4S // ........................................e................................................................................... - // gap // ............................................................................................................................ - trn2 v12.4S, v15.4S, v31.4S // .........................................e.................................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v3.4S, v24.4S, v27.4S // ...........................................e................................................................................ - mls v29.8H, v0.8H, v7.H[0] // .....................................................................................................*...................... - // gap // ............................................................................................................................ - srshr v17.8H, v9.8H, #11 // ..........................................................................................................*................. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v0.4S, v24.4S, v27.4S // ..........................................e................................................................................. - sqdmulh v27.8H, v14.8H, v7.H[1] // ......................................................................................................*..................... - // gap // ............................................................................................................................ - vext x8, v30, 0 // ............................................................................................................*............... - trn2 v20.2D, v12.2D, v3.2D // .............................................e.............................................................................. - // gap // ............................................................................................................................ - mls v4.8H, v17.8H, v7.H[0] // ...........................................................................................................*................ - vext x11, v30, 1 // .............................................................................................................*.............. - trn1 v9.2D, v12.2D, v3.2D // ...............................................e............................................................................ - trn2 v19.2D, v18.2D, v0.2D // ............................................e............................................................................... - ldr x10, [x4, #-40] // .............................................................e.............................................................. 
- // gap // ............................................................................................................................ - sqrdmulh v31.8H, v20.8H, v11.8H // ..............................................................................e............................................. - trn1 v1.2D, v18.2D, v0.2D // ..............................................e............................................................................. - // gap // ............................................................................................................................ - srshr v6.8H, v27.8H, #11 // .......................................................................................................*.................... - str x8, [x1] , #64 // ....................................................................................................................*....... - ldr x8, [x4, #-16] // ....................................................................e....................................................... - str x11, [x1, #-32] // ........................................................................................................................*... - sqrdmulh v28.8H, v19.8H, v11.8H // .........................................................................e.................................................. - ldr x11, [x4, #-8] // .....................................................................e...................................................... - vins v23, x10, 1 // ...............................................................e............................................................ - ldr x10, [x4, #-24] // .................................................................e.......................................................... - // gap // ............................................................................................................................ 
- mul v30.8H, v20.8H, v2.8H // .............................................................................e.............................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vins v17, x8, 0 // ......................................................................e..................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v30.8H, v31.8H, v7.H[0] // ...............................................................................e............................................ - vins v31, x24, 0 // ..................................................................e......................................................... - vext x12, v29, 0 // ..............................................................................................................*............. - // gap // ............................................................................................................................ - vext x13, v29, 1 // ...............................................................................................................*............ - // gap // ............................................................................................................................ - mls v14.8H, v6.8H, v7.H[0] // ........................................................................................................*................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v15.8H, v19.8H, v2.8H // ........................................................................e................................................... - str x12, [x1, #-48] // ......................................................................................................................*..... - // gap // ............................................................................................................................ - add v10.8H, v9.8H, v30.8H // .................................................................................e.......................................... - str x13, [x1, #-16] // ..........................................................................................................................*. - // gap // ............................................................................................................................ - mls v15.8H, v28.8H, v7.H[0] // ..........................................................................e................................................. - vins v31, x10, 1 // ...................................................................e........................................................ 
- // gap // ............................................................................................................................ - vins v17, x11, 1 // .......................................................................e.................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v13.8H, v9.8H, v30.8H // ................................................................................e........................................... - sqrdmulh v3.8H, v10.8H, v23.8H // ...................................................................................e........................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v9.8H, v10.8H, v8.8H // ..................................................................................e......................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - vext x15, v14, 1 // .................................................................................................................*.......... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v25.8H, v13.8H, v31.8H // .......................................................................................e.................................... - add v31.8H, v1.8H, v15.8H // ............................................................................e............................................... - vext x14, v14, 0 // ................................................................................................................*........... - vext x17, v4, 1 // ...................................................................................................................*........ - vext x16, v4, 0 // ..................................................................................................................*......... - // gap // ............................................................................................................................ - mls v9.8H, v3.8H, v7.H[0] // ....................................................................................e....................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - str x15, [x1, #-24] // .........................................................................................................................*.. - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - sqrdmulh v19.8H, v13.8H, v17.8H // ........................................................................................e................................... - str x14, [x1, #-56] // .....................................................................................................................*...... - // gap // ............................................................................................................................ - str x16, [x1, #-40] // .......................................................................................................................*.... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - str x17, [x1, #-8] // ...........................................................................................................................* - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v6.8H, v31.8H, v9.8H // .....................................................................................e...................................... - add v22.8H, v31.8H, v9.8H // ......................................................................................e..................................... - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - - // original source code - // ldr x10, [x1, #0] // ......e.............................................................................................................................................................................................................................................. || ..e............................................................................................................................................................. - // ldr x11, [x1, #8] // ....................e................................................................................................................................................................................................................................ || .......e........................................................................................................................................................ - // vins v8, x10, 0 // .......................e............................................................................................................................................................................................................................. || .........e...................................................................................................................................................... 
- // vins v8, x11, 1 // ...............................e..................................................................................................................................................................................................................... || ...............e................................................................................................................................................ - // ldr x10, [x1, #16] // ....e................................................................................................................................................................................................................................................ || .e.............................................................................................................................................................. - // ldr x11, [x1, #24] // ..................e.................................................................................................................................................................................................................................. || ......e......................................................................................................................................................... - // vins v9, x10, 0 // .............e....................................................................................................................................................................................................................................... || .....e.......................................................................................................................................................... 
- // vins v9, x11, 1 // ..........................e.......................................................................................................................................................................................................................... || ...........e.................................................................................................................................................... - // ldr x10, [x1, #32] // .......e............................................................................................................................................................................................................................................. || ..e............................................................................................................................................................. - // ldr x11, [x1, #40] // .........e........................................................................................................................................................................................................................................... || ...e............................................................................................................................................................ - // vins v10, x10, 0 // .................e................................................................................................................................................................................................................................... || ......e......................................................................................................................................................... 
- // vins v10, x11, 1 // ...........................e......................................................................................................................................................................................................................... || ............e................................................................................................................................................... - // ldr x10, [x1, #48] // .e................................................................................................................................................................................................................................................... || e............................................................................................................................................................... - // ldr x11, [x1, #56] // ........e............................................................................................................................................................................................................................................ || ...e............................................................................................................................................................ - // vins v11, x10, 0 // ............e........................................................................................................................................................................................................................................ || ....e........................................................................................................................................................... 
- // vins v11, x11, 1 // .........................e........................................................................................................................................................................................................................... || ..........e..................................................................................................................................................... - // ldr x10, [x3] , #16 // e.................................................................................................................................................................................................................................................... || e............................................................................................................................................................... - // ldr x11, [x3, #-8] // ................e.................................................................................................................................................................................................................................... || ......e......................................................................................................................................................... - // vins v0, x10, 0 // ...........e......................................................................................................................................................................................................................................... || ....e........................................................................................................................................................... 
- // vins v0, x11, 1 // ........................e............................................................................................................................................................................................................................ || ..........e..................................................................................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // .........................................e........................................................................................................................................................................................................... || ........................e....................................................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ....................................e................................................................................................................................................................................................................ || ..................e............................................................................................................................................. - // mls v24.8H, v10.8H, v7.H[0] // ..............................................e...................................................................................................................................................................................................... || ............................e................................................................................................................................... 
- // sub v10.8H, v8.8H, v24.8H // ....................................................e................................................................................................................................................................................................ || .................................e.............................................................................................................................. - // add v8.8H, v8.8H, v24.8H // .....................................................e............................................................................................................................................................................................... || ..................................e............................................................................................................................. - // mul v24.8H, v11.8H, v0.H[0] // .......................................e............................................................................................................................................................................................................. || ....................e........................................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // ..................................e.................................................................................................................................................................................................................. || ................e............................................................................................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ........................................e............................................................................................................................................................................................................ || ......................e......................................................................................................................................... - // sub v11.8H, v9.8H, v24.8H // .............................................e....................................................................................................................................................................................................... || ...........................e.................................................................................................................................... - // add v9.8H, v9.8H, v24.8H // ............................................e........................................................................................................................................................................................................ || ...........................e.................................................................................................................................... - // mul v24.8H, v9.8H, v0.H[2] // ......................................................e.............................................................................................................................................................................................. || ..................................e............................................................................................................................. 
- // sqrdmulh v9.8H, v9.8H, v0.H[3] // .................................................e................................................................................................................................................................................................... || ..............................e................................................................................................................................. - // mls v24.8H, v9.8H, v7.H[0] // ........................................................e............................................................................................................................................................................................ || ....................................e........................................................................................................................... - // sub v9.8H, v8.8H, v24.8H // ...............................................................e..................................................................................................................................................................................... || ..........................................e..................................................................................................................... - // add v8.8H, v8.8H, v24.8H // .............................................................e....................................................................................................................................................................................... || .........................................e...................................................................................................................... 
- // mul v24.8H, v11.8H, v0.H[4] // ...........................................................e......................................................................................................................................................................................... || ......................................e......................................................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ...................................................e................................................................................................................................................................................................. || ................................e............................................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ............................................................e........................................................................................................................................................................................ || ........................................e....................................................................................................................... - // sub v11.8H, v10.8H, v24.8H // .................................................................e................................................................................................................................................................................... || .............................................e.................................................................................................................. 
- // add v10.8H, v10.8H, v24.8H // ..................................................................e.................................................................................................................................................................................. || .............................................e.................................................................................................................. - // trn1 v25.4S, v8.4S, v9.4S // ....................................................................e................................................................................................................................................................................ || ..............................................e................................................................................................................. - // trn2 v26.4S, v8.4S, v9.4S // .....................................................................e............................................................................................................................................................................... || ...............................................e................................................................................................................ - // trn1 v27.4S, v10.4S, v11.4S // .........................................................................e........................................................................................................................................................................... || ..................................................e............................................................................................................. 
- // trn2 v28.4S, v10.4S, v11.4S // ......................................................................e.............................................................................................................................................................................. || ................................................e............................................................................................................... - // trn2 v10.2D, v25.2D, v27.2D // ................................................................................e.................................................................................................................................................................... || .....................................................e.......................................................................................................... - // trn2 v11.2D, v26.2D, v28.2D // ............................................................................e........................................................................................................................................................................ || ...................................................e............................................................................................................ - // trn1 v8.2D, v25.2D, v27.2D // ...................................................................................e................................................................................................................................................................. || ......................................................e......................................................................................................... 
- // trn1 v9.2D, v26.2D, v28.2D // ...............................................................................e..................................................................................................................................................................... || ....................................................e........................................................................................................... - // ldr x10, [x4] , #96 // ..............................e...................................................................................................................................................................................................................... || ..............e................................................................................................................................................. - // ldr x11, [x4, #-88] // ......................e.............................................................................................................................................................................................................................. || ........e....................................................................................................................................................... - // vins v0, x10, 0 // ..................................................e.................................................................................................................................................................................................. || ...............................e................................................................................................................................ 
- // vins v0, x11, 1 // ..........................................................e.......................................................................................................................................................................................... || .....................................e.......................................................................................................................... - // ldr x10, [x4, #-80] // ...e................................................................................................................................................................................................................................................. || .e.............................................................................................................................................................. - // ldr x11, [x4, #-72] // ...............e..................................................................................................................................................................................................................................... || .....e.......................................................................................................................................................... - // vins v4, x10, 0 // .....................e............................................................................................................................................................................................................................... || ........e....................................................................................................................................................... 
- // vins v4, x11, 1 // .............................e....................................................................................................................................................................................................................... || ..............e................................................................................................................................................. - // ldr x10, [x4, #-64] // ................................e.................................................................................................................................................................................................................... || ...............e................................................................................................................................................ - // ldr x11, [x4, #-56] // ..........e.......................................................................................................................................................................................................................................... || ....e........................................................................................................................................................... - // vins v1, x10, 0 // .....................................e............................................................................................................................................................................................................... || ...................e............................................................................................................................................ 
- // vins v1, x11, 1 // ..............................................................e...................................................................................................................................................................................... || ..........................................e..................................................................................................................... - // ldr x10, [x4, #-48] // ......................................e.............................................................................................................................................................................................................. || ...................e............................................................................................................................................ - // ldr x11, [x4, #-40] // .................................................................................e................................................................................................................................................................... || .....................................................e.......................................................................................................... - // vins v5, x10, 0 // .......................................................e............................................................................................................................................................................................. || ....................................e........................................................................................................................... 
- // vins v5, x11, 1 // ..........................................................................................e.......................................................................................................................................................... || .........................................................e...................................................................................................... - // ldr x10, [x4, #-32] // .........................................................e........................................................................................................................................................................................... || ....................................e........................................................................................................................... - // ldr x11, [x4, #-24] // ...........................................................................................e......................................................................................................................................................... || .........................................................e...................................................................................................... - // vins v2, x10, 0 // ...............................................................................................e..................................................................................................................................................... || ............................................................e................................................................................................... 
- // vins v2, x11, 1 // ........................................................................................................e............................................................................................................................................ || ..................................................................e............................................................................................. - // ldr x10, [x4, #-16] // ......................................................................................e.............................................................................................................................................................. || .......................................................e........................................................................................................ - // ldr x11, [x4, #-8] // .........................................................................................e........................................................................................................................................................... || ........................................................e....................................................................................................... - // vins v6, x10, 0 // .............................................................................................e....................................................................................................................................................... || ...........................................................e.................................................................................................... 
- // vins v6, x11, 1 // .........................................................................................................e........................................................................................................................................... || ...................................................................e............................................................................................ - // mul v24.8H, v10.8H, v0.8H // ...................................................................................................e................................................................................................................................................. || ................................................................e............................................................................................... - // sqrdmulh v10.8H, v10.8H, v4.8H // ........................................................................................e............................................................................................................................................................ || ........................................................e....................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // .......................................................................................................e............................................................................................................................................. || ..................................................................e............................................................................................. 
- // sub v10.8H, v8.8H, v24.8H // ........................................................................................................................................................*............................................................................................ || ..............................................................................................*................................................................. - // add v8.8H, v8.8H, v24.8H // ...............................................................................................................e..................................................................................................................................... || ........................................................................e....................................................................................... - // mul v24.8H, v11.8H, v0.8H // ............................................................................................e........................................................................................................................................................ || ..........................................................e..................................................................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // ..................................................................................e.................................................................................................................................................................. || ......................................................e......................................................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ..............................................................................................e...................................................................................................................................................... || ............................................................e................................................................................................... - // sub v11.8H, v9.8H, v24.8H // ..........................................................................................................e.......................................................................................................................................... || ....................................................................e........................................................................................... - // add v9.8H, v9.8H, v24.8H // .....................................................................................................e............................................................................................................................................... || .................................................................e.............................................................................................. - // mul v24.8H, v9.8H, v1.8H // ............................................................................................................e........................................................................................................................................ || ......................................................................e......................................................................................... 
- // sqrdmulh v9.8H, v9.8H, v5.8H // ...........................................................................................................e......................................................................................................................................... || ....................................................................e........................................................................................... - // mls v24.8H, v9.8H, v7.H[0] // ...................................................................................................................e................................................................................................................................. || ..........................................................................e..................................................................................... - // sub v9.8H, v8.8H, v24.8H // .........................................................................................................................e........................................................................................................................... || ...............................................................................e................................................................................ - // add v8.8H, v8.8H, v24.8H // ..........................................................................................................................e.......................................................................................................................... || ...............................................................................e................................................................................ 
- // mul v24.8H, v11.8H, v2.8H // ..............................................................................................................e...................................................................................................................................... || ........................................................................e....................................................................................... - // sqrdmulh v11.8H, v11.8H, v6.8H // .....................................................................................................................e............................................................................................................................... || ............................................................................e................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...........................................................................................................................*......................................................................................................................... || .................................................................................*.............................................................................. - // sub v11.8H, v10.8H, v24.8H // ......................................................................................................................................................................*.............................................................................. || ...........................................................................................................*.................................................... 
- // add v10.8H, v10.8H, v24.8H // .......................................................................................................................................................................*............................................................................. || ...........................................................................................................*.................................................... - // trn1 v25.4S, v8.4S, v9.4S // .................................................................................................................................*................................................................................................................... || ...................................................................................*............................................................................ - // trn2 v26.4S, v8.4S, v9.4S // ..............................................................................................................................*...................................................................................................................... || ..................................................................................*............................................................................. - // trn1 v27.4S, v10.4S, v11.4S // ...........................................................................................................................................................................*......................................................................... || ..............................................................................................................*................................................. 
- // trn2 v28.4S, v10.4S, v11.4S // ............................................................................................................................................................................*........................................................................ || ...............................................................................................................*................................................ - // sqdmulh v24.8H, v25.8H, v7.H[1] // ...............................................................................................................................................*..................................................................................................... || ........................................................................................*....................................................................... - // srshr v24.8H, v24.8H, #11 // .............................................................................................................................................................*....................................................................................... || .................................................................................................*.............................................................. - // mls v25.8H, v24.8H, v7.H[0] // ...............................................................................................................................................................................................*..................................................... || ...............................................................................................................................*................................ 
- // sqdmulh v24.8H, v26.8H, v7.H[1] // ..........................................................................................................................................*.......................................................................................................... || ......................................................................................*......................................................................... - // srshr v24.8H, v24.8H, #11 // ...............................................................................................................................................................*..................................................................................... || ..................................................................................................*............................................................. - // mls v26.8H, v24.8H, v7.H[0] // ...................................................................................................................................................................................................*................................................. || .................................................................................................................................*.............................. - // sqdmulh v24.8H, v27.8H, v7.H[1] // ......................................................................................................................................................................................................*.............................................. || ...................................................................................................................................*............................ 
- // srshr v24.8H, v24.8H, #11 // ................................................................................................................................................................................................................*.................................... || ........................................................................................................................................*....................... - // mls v27.8H, v24.8H, v7.H[0] // ..............................................................................................................................................................................................................................*...................... || ...............................................................................................................................................*................ - // sqdmulh v24.8H, v28.8H, v7.H[1] // ............................................................................................................................................................................................*........................................................ || ............................................................................................................................*................................... - // srshr v24.8H, v24.8H, #11 // ....................................................................................................................................................................................................*................................................ || ..................................................................................................................................*............................. 
- // mls v28.8H, v24.8H, v7.H[0] // .........................................................................................................................................................................................................*........................................... || .....................................................................................................................................*.......................... - // vext x10, v25, 0 // .......................................................................................................................................................................................................*............................................. || ....................................................................................................................................*........................... - // vext x11, v25, 1 // ..........................................................................................................................................................................................................*.......................................... || .....................................................................................................................................*.......................... - // vext x12, v26, 0 // ............................................................................................................................................................................................................................*........................ || .............................................................................................................................................*.................. 
- // vext x13, v26, 1 // .............................................................................................................................................................................................................................*....................... || ..............................................................................................................................................*................. - // vext x14, v27, 0 // ............................................................................................................................................................................................................................................*........ || .........................................................................................................................................................*...... - // vext x15, v27, 1 // .........................................................................................................................................................................................................................................*........... || ........................................................................................................................................................*....... - // vext x16, v28, 0 // ..............................................................................................................................................................................................................................................*...... || ..........................................................................................................................................................*..... 
- // vext x17, v28, 1 // .............................................................................................................................................................................................................................................*....... || ..........................................................................................................................................................*..... - // str x10, [x1] , #64 // .................................................................................................................................................................................................................*................................... || ........................................................................................................................................*....................... - // str x14, [x1, #-56] // ..................................................................................................................................................................................................................................................*.. || .............................................................................................................................................................*.. - // str x12, [x1, #-48] // ................................................................................................................................................................................................................................*.................... || .................................................................................................................................................*.............. 
- // str x16, [x1, #-40] // ...................................................................................................................................................................................................................................................*. || ..............................................................................................................................................................*. - // str x11, [x1, #-32] // ...................................................................................................................................................................................................................*................................. || .........................................................................................................................................*...................... - // str x15, [x1, #-24] // ................................................................................................................................................................................................................................................*.... || ............................................................................................................................................................*... - // str x13, [x1, #-16] // ..................................................................................................................................................................................................................................*.................. || ..................................................................................................................................................*............. 
- // str x17, [x1, #-8] // ....................................................................................................................................................................................................................................................* || ...............................................................................................................................................................* - - subs count, count, #1 + // Instructions: 91 + // Expected cycles: 64 + // Expected IPC: 1.42 + // + // Cycle bound: 64.0 + // IPC bound: 1.42 + // + // Wall time: 69.09s + // User time: 69.09s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + ldr q26, [x1, #96] // ..e........................................................................................ + mul v27.8H, v27.8H, v6.8H // .............................................*............................................. + ldr q12, [x3], #16 // ....e...................................................................................... + ldr q10, [x1, #64] // e.......................................................................................... + ldr q11, [x4, #-64] // ...................................*....................................................... + trn1 v8.2D, v28.2D, v0.2D // ................................*.......................................................... + mls v27.8H, v29.8H, v7.H[0] // ..............................................*............................................ + ldr q31, [x1, #80] // .e......................................................................................... + ldr q1, [x1, #112] // ...e....................................................................................... 
+ ldr q22, [x4, #-16] // ......................................*.................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v15.8H, v26.8H, v12.H[0] // ......e.................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v0.8H, v1.8H, v12.H[1] // ..........e................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v16.8H, v8.8H, v27.8H // ................................................*.......................................... + sub v20.8H, v8.8H, v27.8H // ...............................................*........................................... + sqrdmulh v17.8H, v26.8H, v12.H[1] // .....e..................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v6.8H, v1.8H, v12.H[0] // ...........e............................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v0.8H, v7.H[0] // ............e.............................................................................. + // gap // ........................................................................................... + ldr q25, [x4, #-48] // ....................................*...................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v15.8H, v17.8H, v7.H[0] // .......e................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v21.8H, v16.8H, v25.8H // .................................................*......................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v9.8H, v31.8H, v6.8H // .............e............................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + add v23.8H, v31.8H, v6.8H // ..............e............................................................................ + mul v25.8H, v16.8H, v11.8H // ..................................................*........................................ + // gap // ........................................................................................... + sub v6.8H, v10.8H, v15.8H // ........e.................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v3.8H, v9.8H, v12.H[4] // .....................e..................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v25.8H, v21.8H, v7.H[0] // ...................................................*....................................... + add v21.8H, v10.8H, v15.8H // .........e................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v13.8H, v23.8H, v12.H[3] // ...............e........................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v24.2D, v4.2D, v18.2D // ...............................*........................................................... + sqrdmulh v15.8H, v20.8H, v22.8H // ......................................................*.................................... + // gap // ........................................................................................... + ldr q28, [x4, #-32] // .....................................*..................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v1.8H, v9.8H, v12.H[5] // ....................e...................................................................... + // gap // ........................................................................................... + add v27.8H, v24.8H, v30.8H // ...........................................*............................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v0.8H, v20.8H, v28.8H // .......................................................*................................... + sub v28.8H, v24.8H, v30.8H // ..........................................*................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v20.8H, v27.8H, v25.8H // .....................................................*..................................... + mls v0.8H, v15.8H, v7.H[0] // ........................................................*.................................. + sub v15.8H, v27.8H, v25.8H // ....................................................*...................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v3.8H, v1.8H, v7.H[0] // ......................e.................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v27.4S, v20.4S, v15.4S // ............................................................*.............................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v14.8H, v23.8H, v12.H[2] // ................e.......................................................................... + trn1 v11.4S, v20.4S, v15.4S // ...........................................................*............................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v15.8H, v28.8H, v0.8H // ..........................................................*................................ + mls v14.8H, v13.8H, v7.H[0] // .................e......................................................................... 
+ sub v28.8H, v28.8H, v0.8H // .........................................................*................................. + // gap // ........................................................................................... + sub v17.8H, v6.8H, v3.8H // .......................e................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v3.8H, v6.8H, v3.8H // ........................e.................................................................. + // gap // ........................................................................................... + sqdmulh v13.8H, v27.8H, v7.H[1] // ..................................................................*........................ + trn1 v26.4S, v15.4S, v28.4S // .............................................................*............................. + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v16.4S, v15.4S, v28.4S // ..............................................................*............................ + // gap // ........................................................................................... + sqdmulh v15.8H, v11.8H, v7.H[1] // ...............................................................*........................... + sub v24.8H, v21.8H, v14.8H // ..................e........................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ add v9.8H, v21.8H, v14.8H // ...................e....................................................................... + sqdmulh v31.8H, v26.8H, v7.H[1] // .....................................................................*..................... + ldr q14, [x4, #16] // ..................................e........................................................ + srshr v23.8H, v13.8H, #11 // ...................................................................*....................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v18.4S, v3.4S, v17.4S // ...........................e............................................................... + sqdmulh v25.8H, v16.8H, v7.H[1] // ........................................................................*.................. + srshr v20.8H, v15.8H, #11 // ................................................................*.......................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v27.8H, v23.8H, v7.H[0] // ....................................................................*...................... + // gap // ........................................................................................... + trn1 v4.4S, v9.4S, v24.4S // .........................e................................................................. + srshr v6.8H, v31.8H, #11 // ......................................................................*.................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + mls v11.8H, v20.8H, v7.H[0] // .................................................................*......................... + trn2 v28.4S, v9.4S, v24.4S // ..........................e................................................................ + // gap // ........................................................................................... + srshr v0.8H, v25.8H, #11 // .........................................................................*................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v26.8H, v6.8H, v7.H[0] // .......................................................................*................... + ldr q6, [x4], #(6*16) // .................................e......................................................... + // gap // ........................................................................................... + umov x12, v27.d[1] // ..............................................................................*............ + // gap // ........................................................................................... + trn2 v2.2D, v4.2D, v18.2D // .............................e............................................................. + mls v16.8H, v0.8H, v7.H[0] // ..........................................................................*................ + trn2 v0.4S, v3.4S, v17.4S // ............................e.............................................................. + umov x13, v27.d[0] // .............................................................................*............. + umov x22, v11.d[0] // ...........................................................................*............... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + umov x26, v11.d[1] // ............................................................................*.............. + // gap // ........................................................................................... + sqrdmulh v15.8H, v2.8H, v14.8H // .......................................e................................................... + str x12, [x1, #48] // .........................................................................................*. + umov x14, v26.d[0] // ...............................................................................*........... + trn2 v27.2D, v28.2D, v0.2D // ..............................e............................................................ + umov x15, v26.d[1] // ................................................................................*.......... + mul v30.8H, v2.8H, v6.8H // ........................................e.................................................. + str x13, [x1, #16] // .....................................................................................*..... + umov x27, v16.d[0] // .................................................................................*......... + umov x19, v16.d[1] // ..................................................................................*........ + str x22, [x1], #( 16*4) // ...................................................................................*....... + sqrdmulh v29.8H, v27.8H, v14.8H // ............................................e.............................................. + str x26, [x1, #-32] // .......................................................................................*... + // gap // ........................................................................................... 
+ str x14, [x1, #-56] // ....................................................................................*...... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v30.8H, v15.8H, v7.H[0] // .........................................e................................................. + str x15, [x1, #-24] // ........................................................................................*.. + // gap // ........................................................................................... + str x27, [x1, #-40] // ......................................................................................*.... + str x19, [x1, #-8] // ..........................................................................................* + // gap // ........................................................................................... + + // ----------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q8, [x1, #(16*0)] // ...e.......................................................................................'..~....................................................................................... + // ldr q9, [x1, #(16*1)] // .......e...................................................................................'......~................................................................................... 
+ // ldr q10, [x1, #(16*2)] // e..........................................................................................~.......................................................................................... + // ldr q11, [x1, #(16*3)] // ........e..................................................................................'.......~.................................................................................. + // ldr q0, [x3], #16 // ..e........................................................................................'.~........................................................................................ + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ..............e............................................................................'.............~............................................................................ + // mul v24.8h, v10.8h, v0.h[0] // ..........e................................................................................'.........~................................................................................ + // mls v24.8h, v27.8h, v7.h[0] // ..................e........................................................................'.................~........................................................................ + // sub v10.8h, v8.8h, v24.8h // .......................e...................................................................'......................~................................................................... + // add v8.8h, v8.8h, v24.8h // ..........................e................................................................'.........................~................................................................ + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...........e...............................................................................'..........~............................................................................... 
+ // mul v24.8h, v11.8h, v0.h[0] // ...............e...........................................................................'..............~........................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ................e..........................................................................'...............~.......................................................................... + // sub v11.8h, v9.8h, v24.8h // ....................e......................................................................'...................~...................................................................... + // add v9.8h, v9.8h, v24.8h // .....................e.....................................................................'....................~..................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...........................e...............................................................'..........................~............................................................... + // mul v24.8h, v9.8h, v0.h[2] // ........................................e..................................................'.......................................~.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................................e...............................................'..........................................~............................................... + // sub v9.8h, v8.8h, v24.8h // ...................................................e.......................................'..................................................~....................................... + // add v8.8h, v8.8h, v24.8h // ....................................................e......................................'...................................................~...................................... 
+ // sqrdmulh v27.8h, v11.8h, v0.h[5] // ...............................e...........................................................'..............................~........................................................... + // mul v24.8h, v11.8h, v0.h[4] // ........................e..................................................................'.......................~.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ......................................e....................................................'.....................................~.................................................... + // sub v11.8h, v10.8h, v24.8h // .............................................e.............................................'............................................~............................................. + // add v10.8h, v10.8h, v24.8h // ..............................................e............................................'.............................................~............................................ + // trn1 v25.4s, v8.4s, v9.4s // ............................................................e..............................'...........................................................~.............................. + // trn2 v26.4s, v8.4s, v9.4s // ...............................................................e...........................'..............................................................~........................... + // trn1 v27.4s, v10.4s, v11.4s // ........................................................e..................................'.......................................................~.................................. + // trn2 v28.4s, v10.4s, v11.4s // ......................................................................e....................'.....................................................................~.................... 
+ // trn2 v10.2d, v25.2d, v27.2d // ....................................................................e......................'...................................................................~...................... + // trn2 v11.2d, v26.2d, v28.2d // .............................................................................e.............'............................................................................~............. + // trn1 v8.2d, v25.2d, v27.2d // ............................~..............................................................'...........................*.............................................................. + // trn1 v9.2d, v26.2d, v28.2d // .....~.....................................................................................'....*..................................................................................... + // ldr q0, [ x4], #(6*16) // ..................................................................e........................'.................................................................~........................ + // ldr q4, [x4, #(-6*16 + 1*16)] // ......................................................e....................................'.....................................................~.................................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ....~......................................................................................'...*...................................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .................~.........................................................................'................*......................................................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ..............................~............................................................'.............................*............................................................ 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // .........~.................................................................................'........*................................................................................. + // sqrdmulh v27.8h, v10.8h, v4.8h // ..........................................................................e................'.........................................................................~................ + // mul v24.8h, v10.8h, v0.8h // ...............................................................................e...........'..............................................................................~........... + // mls v24.8h, v27.8h, v7.h[0] // .......................................................................................e...'......................................................................................~... + // sub v10.8h, v8.8h, v24.8h // ..................................~........................................................'.................................*........................................................ + // add v8.8h, v8.8h, v24.8h // ................................~..........................................................'...............................*.......................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ....................................................................................e......'...................................................................................~...... + // mul v24.8h, v11.8h, v0.8h // .~.........................................................................................'*......................................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ......~....................................................................................'.....*.................................................................................... 
+ // sub v11.8h, v9.8h, v24.8h // .............~.............................................................................'............*............................................................................. + // add v9.8h, v9.8h, v24.8h // ............~..............................................................................'...........*.............................................................................. + // sqrdmulh v27.8h, v9.8h, v5.8h // ...................~.......................................................................'..................*....................................................................... + // mul v24.8h, v9.8h, v1.8h // ......................~....................................................................'.....................*.................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................~.................................................................'........................*................................................................. + // sub v9.8h, v8.8h, v24.8h // .....................................~.....................................................'....................................*..................................................... + // add v8.8h, v8.8h, v24.8h // ...................................~.......................................................'..................................*....................................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // .............................~.............................................................'............................*............................................................. + // mul v24.8h, v11.8h, v2.8h // .................................~.........................................................'................................*......................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ....................................~......................................................'...................................*...................................................... + // sub v11.8h, v10.8h, v24.8h // ............................................~..............................................'...........................................*.............................................. + // add v10.8h, v10.8h, v24.8h // ..........................................~................................................'.........................................*................................................ + // trn1 v25.4s, v8.4s, v9.4s // .........................................~.................................................'........................................*................................................. + // trn2 v26.4s, v8.4s, v9.4s // .......................................~...................................................'......................................*................................................... + // trn1 v27.4s, v10.4s, v11.4s // ................................................~..........................................'...............................................*.......................................... + // trn2 v28.4s, v10.4s, v11.4s // .................................................~.........................................'................................................*......................................... + // sqdmulh v24.8h, v25.8h, v7.h[1] // ..................................................~........................................'.................................................*........................................ + // srshr v24.8h, v24.8h, #11 // ..........................................................~................................'.........................................................*................................ 
+ // mls v25.8h, v24.8h, v7.h[0] // ..............................................................~............................'.............................................................*............................ + // sqdmulh v24.8h, v26.8h, v7.h[1] // ...............................................~...........................................'..............................................*........................................... + // srshr v24.8h, v24.8h, #11 // .......................................................~...................................'......................................................*................................... + // mls v26.8h, v24.8h, v7.h[0] // ...........................................................~...............................'..........................................................*............................... + // sqdmulh v24.8h, v27.8h, v7.h[1] // .....................................................~.....................................'....................................................*..................................... + // srshr v24.8h, v24.8h, #11 // .............................................................~.............................'............................................................*............................. + // mls v27.8h, v24.8h, v7.h[0] // .................................................................~.........................'................................................................*......................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // .........................................................~.................................'........................................................*................................. + // srshr v24.8h, v24.8h, #11 // ................................................................~..........................'...............................................................*.......................... 
+ // mls v28.8h, v24.8h, v7.h[0] // .....................................................................~.....................'....................................................................*..................... + // umov x10, v25.d[0] // ........................................................................~..................'.......................................................................*.................. + // umov x11, v25.d[1] // .........................................................................~.................'........................................................................*................. + // umov x12, v26.d[0] // .......................................................................~...................'......................................................................*................... + // umov x13, v26.d[1] // ...................................................................~.......................'..................................................................*....................... + // umov x14, v27.d[0] // ............................................................................~..............'...........................................................................*.............. + // umov x15, v27.d[1] // ..............................................................................~............'.............................................................................*............ + // umov x16, v28.d[0] // .................................................................................~.........'................................................................................*......... + // umov x17, v28.d[1] // ..................................................................................~........'.................................................................................*........ 
+ // str x10, [x1], #( 16*4) // ...................................................................................~.......'..................................................................................*....... + // str x14, [x1, #(-16*4 + 8*1)] // ......................................................................................~....'.....................................................................................*.... + // str x12, [x1, #(-16*4 + 8*2)] // ................................................................................~..........'...............................................................................*.......... + // str x16, [x1, #(-16*4 + 8*3)] // .........................................................................................~.'........................................................................................*. + // str x11, [x1, #(-16*4 + 8*4)] // .....................................................................................~.....'....................................................................................*..... + // str x15, [x1, #(-16*4 + 8*5)] // ........................................................................................~..'.......................................................................................*.. + // str x13, [x1, #(-16*4 + 8*6)] // ...........................................................................~...............'..........................................................................*............... + // str x17, [x1, #(-16*4 + 8*7)] // ..........................................................................................~'.........................................................................................* + + sub count, count, #1 cbnz count, layer4567_start - sub v2.8H, v1.8H, v15.8H // .....*.............................. - mls v25.8H, v19.8H, v7.H[0] // *................................... 
- // gap // .................................... - trn2 v15.4S, v22.4S, v6.4S // .*.................................. - // gap // .................................... - // gap // .................................... - trn1 v29.4S, v22.4S, v6.4S // ..*................................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sub v28.8H, v2.8H, v25.8H // ........*........................... - add v2.8H, v2.8H, v25.8H // .........*.......................... - // gap // .................................... - sqdmulh v5.8H, v29.8H, v7.H[1] // ....*............................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - trn1 v8.4S, v2.4S, v28.4S // ..........*......................... - sqdmulh v24.8H, v15.8H, v7.H[1] // ...*................................ - // gap // .................................... - trn2 v19.4S, v2.4S, v28.4S // ...........*........................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - srshr v4.8H, v5.8H, #11 // ......*............................. - sqdmulh v5.8H, v8.8H, v7.H[1] // ................*................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - sqdmulh v11.8H, v19.8H, v7.H[1] // ............*....................... - srshr v9.8H, v24.8H, #11 // .......*............................ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v29.8H, v4.8H, v7.H[0] // .............*...................... - // gap // .................................... - // gap // .................................... - srshr v0.8H, v5.8H, #11 // ....................*............... - // gap // .................................... - // gap // .................................... - mls v15.8H, v9.8H, v7.H[0] // ..............*..................... - // gap // .................................... - // gap // .................................... - srshr v18.8H, v11.8H, #11 // ...............*.................... - // gap // .................................... - // gap // .................................... - mls v8.8H, v0.8H, v7.H[0] // .........................*.......... - // gap // .................................... - // gap // .................................... - vext x8, v29, 0 // .................*.................. - vext x11, v29, 1 // ...................*................ - // gap // .................................... - mls v19.8H, v18.8H, v7.H[0] // ..................*................. - // gap // .................................... - // gap // .................................... - vext x13, v15, 1 // ........................*........... - vext x12, v15, 0 // .......................*............ - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - str x8, [x1] , #64 // .....................*.............. - str x11, [x1, #-32] // ......................*............. 
- vext x14, v8, 0 // .............................*...... - vext x15, v8, 1 // ............................*....... - // gap // .................................... - // gap // .................................... - vext x17, v19, 1 // ..............................*..... - vext x16, v19, 0 // ...............................*.... - str x13, [x1, #-16] // ...........................*........ - str x12, [x1, #-48] // ..........................*......... - // gap // .................................... - // gap // .................................... - str x14, [x1, #-56] // .................................*.. - // gap // .................................... - // gap // .................................... - str x15, [x1, #-24] // ................................*... - // gap // .................................... - // gap // .................................... - str x16, [x1, #-40] // ..................................*. - str x17, [x1, #-8] // ...................................* - // gap // .................................... - - // original source code - // mls v25.8H, v19.8H, v7.H[0] // .*.................................. || *.............................. - // trn2 v29.4S, v22.4S, v6.4S // ..*................................. || .*............................. - // trn1 v30.4S, v22.4S, v6.4S // ...*................................ || ..*............................ - // sqdmulh v18.8H, v29.8H, v7.H[1] // ........*........................... || ........*...................... - // sqdmulh v13.8H, v30.8H, v7.H[1] // ......*............................. || ......*........................ - // sub v14.8H, v1.8H, v15.8H // *................................... || *.............................. - // srshr v28.8H, v13.8H, #11 // ..........*......................... || ...........*................... - // srshr v0.8H, v18.8H, #11 // .............*...................... || .............*................. 
- // sub v12.8H, v14.8H, v25.8H // ....*............................... || .....*......................... - // add v4.8H, v14.8H, v25.8H // .....*.............................. || .....*......................... - // trn1 v14.4S, v4.4S, v12.4S // .......*............................ || ........*...................... - // trn2 v4.4S, v4.4S, v12.4S // .........*.......................... || .........*..................... - // sqdmulh v9.8H, v4.8H, v7.H[1] // ............*....................... || .............*................. - // mls v30.8H, v28.8H, v7.H[0] // ..............*..................... || ...............*............... - // mls v29.8H, v0.8H, v7.H[0] // ................*................... || .................*............. - // srshr v17.8H, v9.8H, #11 // .................*.................. || ..................*............ - // sqdmulh v27.8H, v14.8H, v7.H[1] // ...........*........................ || ...........*................... - // vext x8, v30, 0 // ...................*................ || ....................*.......... - // mls v4.8H, v17.8H, v7.H[0] // .....................*.............. || .....................*......... - // vext x11, v30, 1 // ....................*............... || ....................*.......... - // srshr v6.8H, v27.8H, #11 // ...............*.................... || ................*.............. - // str x8, [x1] , #64 // ........................*........... || ........................*...... - // str x11, [x1, #-32] // .........................*.......... || ........................*...... - // vext x12, v29, 0 // .......................*............ || ......................*........ - // vext x13, v29, 1 // ......................*............. || ......................*........ - // mls v14.8H, v6.8H, v7.H[0] // ..................*................. || ...................*........... - // str x12, [x1, #-48] // ...............................*.... || ...........................*... 
- // str x13, [x1, #-16] // ..............................*..... || ..........................*.... - // vext x15, v14, 1 // ...........................*........ || .........................*..... - // vext x14, v14, 0 // ..........................*......... || ........................*...... - // vext x17, v4, 1 // ............................*....... || ..........................*.... - // vext x16, v4, 0 // .............................*...... || ..........................*.... - // str x15, [x1, #-24] // .................................*.. || .............................*. - // str x14, [x1, #-56] // ................................*... || ............................*.. - // str x16, [x1, #-40] // ..................................*. || ..............................* - // str x17, [x1, #-8] // ...................................* || ..............................* - + // Instructions: 54 + // Expected cycles: 52 + // Expected IPC: 1.04 + // + // Cycle bound: 52.0 + // IPC bound: 1.04 + // + // Wall time: 1.46s + // User time: 1.46s + // + // ----------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|--- + mul v31.8H, v27.8H, v6.8H // *..................................................... + ldr q27, [x4, #-64] // .*.................................................... + trn1 v15.2D, v4.2D, v18.2D // ...........*.......................................... + trn1 v16.2D, v28.2D, v0.2D // ..*................................................... + ldr q24, [x4, #-16] // ....*................................................. + ldr q21, [x4, #-48] // .......*.............................................. + mls v31.8H, v29.8H, v7.H[0] // ...*.................................................. + ldr q11, [x4, #-32] // .............*........................................ + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v13.8H, v16.8H, v31.8H // .....*................................................ + sub v0.8H, v16.8H, v31.8H // ......*............................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v14.8H, v13.8H, v21.8H // ........*............................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v31.8H, v0.8H, v24.8H // ............*......................................... + // gap // ...................................................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v6.8H, v13.8H, v27.8H // .........*............................................ + sub v27.8H, v15.8H, v30.8H // ................*..................................... + // gap // ...................................................... + add v15.8H, v15.8H, v30.8H // ..............*....................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v6.8H, v14.8H, v7.H[0] // ..........*........................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v14.8H, v0.8H, v11.8H // ...............*...................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v14.8H, v31.8H, v7.H[0] // ..................*................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ sub v16.8H, v15.8H, v6.8H // ...................*.................................. + add v15.8H, v15.8H, v6.8H // .................*.................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v4.8H, v27.8H, v14.8H // ......................*............................... + trn2 v11.4S, v15.4S, v16.4S // ....................*................................. + // gap // ...................................................... + trn1 v0.4S, v15.4S, v16.4S // .....................*................................ + sub v14.8H, v27.8H, v14.8H // .......................*.............................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v21.8H, v11.8H, v7.H[1] // ........................*............................. + trn2 v6.4S, v4.4S, v14.4S // ..........................*........................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v26.4S, v4.4S, v14.4S // .........................*............................ + sqdmulh v14.8H, v0.8H, v7.H[1] // ...........................*.......................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v16.8H, v6.8H, v7.H[1] // ..............................*....................... + srshr v28.8H, v21.8H, #11 // .............................*........................ + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v27.8H, v26.8H, v7.H[1] // ............................*......................... + // gap // ...................................................... + // gap // ...................................................... + srshr v15.8H, v14.8H, #11 // ...............................*...................... + // gap // ...................................................... + // gap // ...................................................... + mls v11.8H, v28.8H, v7.H[0] // ................................*..................... + // gap // ...................................................... + // gap // ...................................................... + srshr v16.8H, v16.8H, #11 // ...................................*.................. + // gap // ...................................................... + // gap // ...................................................... + mls v0.8H, v15.8H, v7.H[0] // ..................................*................... + // gap // ...................................................... + // gap // ...................................................... + srshr v15.8H, v27.8H, #11 // .................................*.................... + // gap // ...................................................... 
+ // gap // ...................................................... + mls v6.8H, v16.8H, v7.H[0] // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + umov x19, v11.d[1] // .....................................*................ + umov x12, v11.d[0] // .......................................*.............. + // gap // ...................................................... + mls v26.8H, v15.8H, v7.H[0] // ....................................*................. + // gap // ...................................................... + // gap // ...................................................... + umov x24, v0.d[1] // .........................................*............ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + str x19, [x1, #48] // ..........................................*........... + umov x19, v0.d[0] // ........................................*............. + umov x27, v6.d[1] // ...............................................*...... + umov x11, v6.d[0] // ..............................................*....... + str x12, [x1, #16] // .............................................*........ + // gap // ...................................................... + umov x13, v26.d[1] // ............................................*......... + umov x12, v26.d[0] // ...........................................*.......... + str x24, [x1, #32] // .................................................*.... + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + str x19, [x1], #( 16*4) // ................................................*..... + str x27, [x1, #-8] // .....................................................* + // gap // ...................................................... + str x11, [x1, #-40] // ....................................................*. + // gap // ...................................................... + // gap // ...................................................... + str x12, [x1, #-56] // ..................................................*... + str x13, [x1, #-24] // ...................................................*.. + // gap // ...................................................... + + // ------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|--- + // mul v27.8H, v27.8H, v6.8H // *..................................................... + // ldr q11, [x4, #-64] // .*.................................................... + // trn1 v8.2D, v28.2D, v0.2D // ...*.................................................. + // mls v27.8H, v29.8H, v7.H[0] // ......*............................................... + // ldr q22, [x4, #-16] // ....*................................................. + // add v16.8H, v8.8H, v27.8H // ........*............................................. + // sub v20.8H, v8.8H, v27.8H // .........*............................................ + // ldr q25, [x4, #-48] // .....*................................................ + // sqrdmulh v21.8H, v16.8H, v25.8H // ..........*........................................... + // mul v25.8H, v16.8H, v11.8H // ............*......................................... + // mls v25.8H, v21.8H, v7.H[0] // ...............*...................................... + // trn1 v24.2D, v4.2D, v18.2D // ..*................................................... 
+ // sqrdmulh v15.8H, v20.8H, v22.8H // ...........*.......................................... + // ldr q28, [x4, #-32] // .......*.............................................. + // add v27.8H, v24.8H, v30.8H // ..............*....................................... + // mul v0.8H, v20.8H, v28.8H // ................*..................................... + // sub v28.8H, v24.8H, v30.8H // .............*........................................ + // add v20.8H, v27.8H, v25.8H // ...................*.................................. + // mls v0.8H, v15.8H, v7.H[0] // .................*.................................... + // sub v15.8H, v27.8H, v25.8H // ..................*................................... + // trn2 v27.4S, v20.4S, v15.4S // .....................*................................ + // trn1 v11.4S, v20.4S, v15.4S // ......................*............................... + // add v15.8H, v28.8H, v0.8H // ....................*................................. + // sub v28.8H, v28.8H, v0.8H // .......................*.............................. + // sqdmulh v13.8H, v27.8H, v7.H[1] // ........................*............................. + // trn1 v26.4S, v15.4S, v28.4S // ..........................*........................... + // trn2 v16.4S, v15.4S, v28.4S // .........................*............................ + // sqdmulh v15.8H, v11.8H, v7.H[1] // ...........................*.......................... + // sqdmulh v31.8H, v26.8H, v7.H[1] // ..............................*....................... + // srshr v23.8H, v13.8H, #11 // .............................*........................ + // sqdmulh v25.8H, v16.8H, v7.H[1] // ............................*......................... + // srshr v20.8H, v15.8H, #11 // ...............................*...................... + // mls v27.8H, v23.8H, v7.H[0] // ................................*..................... + // srshr v6.8H, v31.8H, #11 // ...................................*.................. 
+ // mls v11.8H, v20.8H, v7.H[0] // ..................................*................... + // srshr v0.8H, v25.8H, #11 // .................................*.................... + // mls v26.8H, v6.8H, v7.H[0] // .......................................*.............. + // umov x12, v27.d[1] // .....................................*................ + // mls v16.8H, v0.8H, v7.H[0] // ....................................*................. + // umov x13, v27.d[0] // ......................................*............... + // umov x22, v11.d[0] // ..........................................*........... + // umov x26, v11.d[1] // ........................................*............. + // str x12, [x1, #48] // .........................................*............ + // umov x14, v26.d[0] // ...............................................*...... + // umov x15, v26.d[1] // ..............................................*....... + // str x13, [x1, #16] // .............................................*........ + // umov x27, v16.d[0] // ............................................*......... + // umov x19, v16.d[1] // ...........................................*.......... + // str x22, [x1], #( 16*4) // .................................................*.... + // str x26, [x1, #-32] // ................................................*..... + // str x14, [x1, #-56] // ....................................................*. + // str x15, [x1, #-24] // .....................................................* + // str x27, [x1, #-40] // ...................................................*.. + // str x19, [x1, #-8] // ..................................................*... 
+ pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm.s index 266283f0..7974b99c 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane @@ -45,27 +36,6 @@ xtmp1 .req x11 umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +53,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, 
\src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +70,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -119,21 +83,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -178,7 +142,7 @@ xtmp1 .req x11 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -189,7 +153,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -199,7 +163,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -207,7 +171,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro 
restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -218,19 +182,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -369,1684 +333,1548 @@ _ntt_kyber_123_4567_scalar_load_store_opt_m1_firestorm: load_roots_123 .p2align 2 - ldr x16, [x0, #0] // ................................*................ - ldr x17, [x0, #448] // .*............................................... - ldr x29, [x0, #128] // .......................*......................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr x11, [x0, #456] // ......*.......................................... - ldr x12, [x0, #192] // ..*.............................................. - ldr x20, [x0, #384] // *................................................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ldr x9, [x0, #136] // ............................*.................... 
- ldr x15, [x0, #256] // ....*............................................ - ldr x14, [x0, #320] // .....*........................................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ins v17.d[0], x16 // .........................................*....... - ins v29.d[0], x17 // ...........*..................................... - ldr x8, [x0, #200] // .........*....................................... - ins v27.d[0], x29 // .............................*................... - ldr x24, [x0, #392] // .......*......................................... - ldr x16, [x0, #64] // ...........................*..................... - // gap // ................................................. - // gap // ................................................. - ins v21.d[0], x12 // ..........*...................................... - ins v4.d[0], x20 // ........*........................................ - ldr x23, [x0, #264] // ...*............................................. - ldr x22, [x0, #72] // .................................*............... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ins v27.d[1], x9 // .......................................*......... - ins v24.d[0], x15 // .................*............................... - ldr x15, [x0, #8] // ......................................*.......... - ins v29.d[1], x11 // ..............*.................................. - ldr x26, [x0, #328] // ............*.................................... - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - ins v21.d[1], x8 // ................*................................ - ins v4.d[1], x24 // .............*................................... - ins v23.d[0], x14 // ...............*................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v8.8H, v29.8H, v0.H[0] // ..................*.............................. - sqrdmulh v22.8H, v29.8H, v0.H[1] // ...................*............................. - ins v24.d[1], x23 // ......................*.......................... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - sqrdmulh v18.8H, v4.8H, v0.H[1] // ........................*........................ - mul v11.8H, v4.8H, v0.H[0] // .....................*........................... - ins v23.d[1], x26 // ....................*............................ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mul v5.8H, v24.8H, v0.H[0] // ................................................* - // gap // ................................................. - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v8.8H, v22.8H, v7.H[0] // .........................*....................... - mul v31.8H, v23.8H, v0.H[0] // ..........................*...................... - sqrdmulh v22.8H, v23.8H, v0.H[1] // ..............................*.................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v11.8H, v18.8H, v7.H[0] // ..................................*.............. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v31.8H, v22.8H, v7.H[0] // ........................................*........ 
- add v15.8H, v21.8H, v8.8H // ...............................*................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - add v28.8H, v27.8H, v11.8H // ..............................................*.. - sub v12.8H, v27.8H, v11.8H // .............................................*... - sub v11.8H, v21.8H, v8.8H // ...........................................*..... - sqrdmulh v8.8H, v24.8H, v0.H[1] // ............................................*.... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - ins v24.d[0], x16 // ...................................*............. - mul v14.8H, v15.8H, v0.H[2] // .....................................*........... - sqrdmulh v30.8H, v15.8H, v0.H[3] // ....................................*............ - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. 
- // gap // ................................................. - // gap // ................................................. - ins v24.d[1], x22 // ..........................................*...... - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - mls v14.8H, v30.8H, v7.H[0] // ...............................................*. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - // gap // ................................................. - - // original source code - // ldr x9, [x0, #384] // .....*........................................... - // ldr x21, [x0, #448] // .*............................................... - // ldr x13, [x0, #192] // ....*............................................ - // ldr x12, [x0, #264] // .................*............................... - // ldr x23, [x0, #256] // .......*......................................... - // ldr x20, [x0, #320] // ........*........................................ - // ldr x15, [x0, #456] // ...*............................................. - // ldr x10, [x0, #392] // .............*................................... - // ins v3.d[0], x9 // ................*................................ - // ldr x28, [x0, #200] // ...........*..................................... 
- // ins v30.d[0], x13 // ...............*................................. - // ins v23.d[0], x21 // ..........*...................................... - // ldr x9, [x0, #328] // .......................*......................... - // ins v3.d[1], x10 // .........................*....................... - // ins v23.d[1], x15 // ......................*.......................... - // ins v24.d[0], x20 // ..........................*...................... - // ins v30.d[1], x28 // ........................*........................ - // ins v21.d[0], x23 // ....................*............................ - // mul v2.8H, v23.8H, v0.H[0] // ...........................*..................... - // sqrdmulh v13.8H, v23.8H, v0.H[1] // ............................*.................... - // ins v24.d[1], x9 // ................................*................ - // mul v28.8H, v3.8H, v0.H[0] // ...............................*................. - // ins v21.d[1], x12 // .............................*................... - // ldr x23, [x0, #128] // ..*.............................................. - // sqrdmulh v8.8H, v3.8H, v0.H[1] // ..............................*.................. - // mls v2.8H, v13.8H, v7.H[0] // ..................................*.............. - // mul v31.8H, v24.8H, v0.H[0] // ...................................*............. - // ldr x20, [x0, #64] // ..............*.................................. - // ldr x11, [x0, #136] // ......*.......................................... - // ins v9.d[0], x23 // ............*.................................... - // sqrdmulh v16.8H, v24.8H, v0.H[1] // ....................................*............ - // add v19.8H, v30.8H, v2.8H // .......................................*......... - // ldr x14, [x0, #0] // *................................................ - // ldr x24, [x0, #72] // ..................*.............................. 
- // mls v28.8H, v8.8H, v7.H[0] // .....................................*........... - // ins v24.d[0], x20 // ............................................*.... - // sqrdmulh v26.8H, v19.8H, v0.H[3] // ..............................................*.. - // mul v14.8H, v19.8H, v0.H[2] // .............................................*... - // ldr x15, [x0, #8] // .....................*........................... - // ins v9.d[1], x11 // ...................*............................. - // mls v31.8H, v16.8H, v7.H[0] // ......................................*.......... - // ins v17.d[0], x14 // .........*....................................... - // ins v24.d[1], x24 // ...............................................*. - // sub v11.8H, v30.8H, v2.8H // ..........................................*...... - // sqrdmulh v8.8H, v21.8H, v0.H[1] // ...........................................*..... - // sub v12.8H, v9.8H, v28.8H // .........................................*....... - // add v28.8H, v9.8H, v28.8H // ........................................*........ - // mls v14.8H, v26.8H, v7.H[0] // ................................................* - // mul v5.8H, v21.8H, v0.H[0] // .................................*............... + // Instructions: 33 + // Expected cycles: 16 + // Expected IPC: 2.06 + // + // Cycle bound: 16.0 + // IPC bound: 2.06 + // + // Wall time: 0.28s + // User time: 0.28s + // + // ------ original position -------> + // 0 25 + // |------------------------|------- + ldr q21, [x0, #0] // *................................ + ldr q19, [x0, #384] // .....*........................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + ldr q5, [x0, #448] // .*............................... + ldr q13, [x0, #192] // ..........*...................... 
+ ldr q18, [x0, #64] // .......................*......... + ldr q11, [x0, #320] // ..*.............................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v29.8H, v19.8H, v0.H[1] // .........*....................... + mul v28.8H, v19.8H, v0.H[0] // ............*.................... + mul v15.8H, v5.8H, v0.H[0] // .......*......................... + sqrdmulh v3.8H, v5.8H, v0.H[1] // ........*........................ + ldr q5, [x0, #256] // ...*............................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sqrdmulh v24.8H, v11.8H, v0.H[1] // ......*.......................... + mul v22.8H, v11.8H, v0.H[0] // .............*................... + ldr q19, [x0, #128] // ....*............................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v15.8H, v3.8H, v7.H[0] // ...........*..................... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v22.8H, v24.8H, v7.H[0] // ........................*........ + mls v28.8H, v29.8H, v7.H[0] // .................*............... + sqrdmulh v20.8H, v5.8H, v0.H[1] // ..................*.............. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v12.8H, v5.8H, v0.H[0] // ..............*.................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + sub v30.8H, v13.8H, v15.8H // ...............*................. + add v24.8H, v13.8H, v15.8H // ................*................ + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ sub v25.8H, v19.8H, v28.8H // ..........................*...... + sub v15.8H, v18.8H, v22.8H // ................................* + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v6.8H, v30.8H, v0.H[4] // ....................*............ + sqrdmulh v4.8H, v30.8H, v0.H[5] // .....................*........... + sqrdmulh v29.8H, v24.8H, v0.H[3] // ...................*............. + mul v5.8H, v24.8H, v0.H[2] // ......................*.......... + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + add v13.8H, v18.8H, v22.8H // .............................*... + mls v12.8H, v20.8H, v7.H[0] // .........................*....... + sqrdmulh v11.8H, v25.8H, v0.H[5] // ..............................*.. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mul v3.8H, v25.8H, v0.H[4] // ...............................*. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + // gap // ................................. + mls v6.8H, v4.8H, v7.H[0] // ............................*.... + mls v5.8H, v29.8H, v7.H[0] // ...........................*..... + // gap // ................................. + // gap // ................................. + // gap // ................................. 
+ // gap // ................................. + // gap // ................................. + // gap // ................................. + + // --------- new position ---------> + // 0 25 + // |------------------------|------- + // ldr q21, [x0, #0] // *................................ + // ldr q22, [x0, #448] // ..*.............................. + // ldr q2, [x0, #320] // .....*........................... + // ldr q18, [x0, #256] // ..........*...................... + // ldr q19, [x0, #128] // .............*................... + // ldr q23, [x0, #384] // .*............................... + // sqrdmulh v27.8H, v2.8H, v0.H[1] // ...........*..................... + // mul v17.8H, v22.8H, v0.H[0] // ........*........................ + // sqrdmulh v30.8H, v22.8H, v0.H[1] // .........*....................... + // sqrdmulh v22.8H, v23.8H, v0.H[1] // ......*.......................... + // ldr q24, [x0, #192] // ...*............................. + // mls v17.8H, v30.8H, v7.H[0] // ..............*.................. + // mul v28.8H, v23.8H, v0.H[0] // .......*......................... + // mul v23.8H, v2.8H, v0.H[0] // ............*.................... + // mul v12.8H, v18.8H, v0.H[0] // ..................*.............. + // sub v2.8H, v24.8H, v17.8H // ...................*............. + // add v3.8H, v24.8H, v17.8H // ....................*............ + // mls v28.8H, v22.8H, v7.H[0] // ................*................ + // sqrdmulh v17.8H, v18.8H, v0.H[1] // .................*............... + // sqrdmulh v26.8H, v3.8H, v0.H[3] // .........................*....... + // mul v6.8H, v2.8H, v0.H[4] // .......................*......... + // sqrdmulh v13.8H, v2.8H, v0.H[5] // ........................*........ + // mul v5.8H, v3.8H, v0.H[2] // ..........................*...... + // ldr q29, [x0, #64] // ....*............................ + // mls v23.8H, v27.8H, v7.H[0] // ...............*................. + // mls v12.8H, v17.8H, v7.H[0] // ............................*.... 
+ // sub v16.8H, v19.8H, v28.8H // .....................*........... + // mls v5.8H, v26.8H, v7.H[0] // ................................* + // mls v6.8H, v13.8H, v7.H[0] // ...............................*. + // add v13.8H, v29.8H, v23.8H // ...........................*..... + // sqrdmulh v11.8H, v16.8H, v0.H[5] // .............................*... + // mul v3.8H, v16.8H, v0.H[4] // ..............................*.. + // sub v15.8H, v29.8H, v23.8H // ......................*.......... sub count, count, #1 layer123_start: - sqrdmulh v26.8H, v11.8H, v0.H[5] // ....................................................................*............................... - // gap // .................................................................................................... - ldr x9, [x0, #400] // ........................e........................................................................... - ldr x21, [x0, #464] // ............................e....................................................................... - ldr x13, [x0, #208] // ............e....................................................................................... - sub v19.8H, v24.8H, v31.8H // ........................................*........................................................... - add v22.8H, v24.8H, v31.8H // .........................................*.......................................................... - mul v31.8H, v11.8H, v0.H[4] // ...................................................................*................................ - mul v6.8H, v12.8H, v0.H[4] // ..............................................................*..................................... - sqrdmulh v4.8H, v12.8H, v0.H[5] // ...............................................................*.................................... - mul v10.8H, v28.8H, v0.H[2] // ....................................................*............................................... 
- sqrdmulh v28.8H, v28.8H, v0.H[3] // .....................................................*.............................................. - // gap // .................................................................................................... - ldr x12, [x0, #280] // .................e.................................................................................. - ldr x23, [x0, #272] // ................e................................................................................... - ldr x20, [x0, #336] // ....................e............................................................................... - ins v17.d[1], x15 // ...*................................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - ldr x15, [x0, #472] // .............................e...................................................................... - mls v5.8H, v8.8H, v7.H[0] // ..................................*................................................................. - sub v8.8H, v22.8H, v14.8H // ............................................................*....................................... - add v13.8H, v22.8H, v14.8H // .............................................................*...................................... - ldr x10, [x0, #408] // .........................e.......................................................................... - ins v3.d[0], x9 // ..........................e......................................................................... - ldr x28, [x0, #216] // .............e...................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - ins v30.d[0], x13 // ..............e..................................................................................... - ins v23.d[0], x21 // ..............................e..................................................................... - mls v31.8H, v26.8H, v7.H[0] // .....................................................................*.............................. - // gap // .................................................................................................... - sqrdmulh v9.8H, v13.8H, v0.H[7] // .........................................................................*.......................... - mls v10.8H, v28.8H, v7.H[0] // ......................................................*............................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v27.8H, v8.8H, v1.H[0] // .............................................................................*...................... - sqrdmulh v18.8H, v8.8H, v1.H[1] // ..............................................................................*..................... - add v14.8H, v17.8H, v5.8H // ....................................*............................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- ldr x9, [x0, #344] // .....................e.............................................................................. - ins v3.d[1], x10 // ...........................e........................................................................ - ins v23.d[1], x15 // ...............................e.................................................................... - // gap // .................................................................................................... - mls v6.8H, v4.8H, v7.H[0] // ................................................................*................................... - ins v24.d[0], x20 // ......................e............................................................................. - ins v30.d[1], x28 // ...............e.................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v11.8H, v13.8H, v0.H[6] // ........................................................................*........................... - add v4.8H, v19.8H, v31.8H // .......................................................................*............................ - ins v21.d[0], x23 // ..................e................................................................................. - sub v26.8H, v19.8H, v31.8H // ......................................................................*............................. - // gap // .................................................................................................... 
- mul v2.8H, v23.8H, v0.H[0] // ...............................................e.................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v13.8H, v23.8H, v0.H[1] // ................................................e................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v24.d[1], x9 // .......................e............................................................................ - mul v28.8H, v3.8H, v0.H[0] // ..........................................e......................................................... - mul v20.8H, v4.8H, v1.H[2] // ..................................................................................*................. - sqrdmulh v4.8H, v4.8H, v1.H[3] // ...................................................................................*................ - ins v21.d[1], x12 // ...................e................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... 
- // gap // .................................................................................................... - ldr x23, [x0, #144] // ........e........................................................................................... - mul v22.8H, v26.8H, v1.H[4] // .......................................................................................*............ - mls v11.8H, v9.8H, v7.H[0] // ..........................................................................*......................... - sqrdmulh v8.8H, v3.8H, v0.H[1] // ...........................................e........................................................ - // gap // .................................................................................................... - sub v15.8H, v17.8H, v5.8H // ...................................*................................................................ - // gap // .................................................................................................... - mls v2.8H, v13.8H, v7.H[0] // .................................................e.................................................. - // gap // .................................................................................................... - add v12.8H, v14.8H, v10.8H // ........................................................*........................................... - // gap // .................................................................................................... - sqrdmulh v23.8H, v26.8H, v1.H[5] // ........................................................................................*........... - mul v31.8H, v24.8H, v0.H[0] // .....................................e.............................................................. - // gap // .................................................................................................... 
- mls v27.8H, v18.8H, v7.H[0] // ...............................................................................*.................... - // gap // .................................................................................................... - mls v20.8H, v4.8H, v7.H[0] // ....................................................................................*............... - sub v14.8H, v14.8H, v10.8H // .......................................................*............................................ - ldr x20, [x0, #80] // ....e............................................................................................... - ldr x11, [x0, #152] // .........e.......................................................................................... - add v26.8H, v15.8H, v6.8H // ..................................................................*................................. - sub v15.8H, v15.8H, v6.8H // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v9.d[0], x23 // ..........e......................................................................................... - sub v18.8H, v12.8H, v11.8H // ...........................................................................*........................ - sqrdmulh v16.8H, v24.8H, v0.H[1] // ......................................e............................................................. - // gap // .................................................................................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - add v19.8H, v30.8H, v2.8H // ...................................................e................................................ - add v3.8H, v12.8H, v11.8H // ............................................................................*....................... - mls v22.8H, v23.8H, v7.H[0] // .........................................................................................*.......... - ldr x14, [x0, #16] // e................................................................................................... - // gap // .................................................................................................... - ldr x24, [x0, #88] // .....e.............................................................................................. - mls v28.8H, v8.8H, v7.H[0] // ............................................e....................................................... - ins v24.d[0], x20 // ......e............................................................................................. - sub v8.8H, v26.8H, v20.8H // .....................................................................................*.............. - // gap // .................................................................................................... - sub v25.8H, v14.8H, v27.8H // ................................................................................*................... - str q18, [x0, #64] // .............................................................................................*...... - add v18.8H, v26.8H, v20.8H // ......................................................................................*............. 
- add v23.8H, v14.8H, v27.8H // .................................................................................*.................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v26.8H, v19.8H, v0.H[3] // ..........................................................e......................................... - mul v14.8H, v19.8H, v0.H[2] // .........................................................e.......................................... - str q3, [x0], #(16) // ............................................................................................*....... - ldr x15, [x0, #8] // .e.................................................................................................. - ins v9.d[1], x11 // ...........e........................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v31.8H, v16.8H, v7.H[0] // .......................................e............................................................ - sub v12.8H, v15.8H, v22.8H // ..........................................................................................*......... - add v4.8H, v15.8H, v22.8H // ...........................................................................................*........ - str q25, [x0, #176] // ...............................................................................................*.... - str q8, [x0, #304] // .................................................................................................*.. 
- ins v17.d[0], x14 // ..e................................................................................................. - str q23, [x0, #112] // ..............................................................................................*..... - // gap // .................................................................................................... - ins v24.d[1], x24 // .......e............................................................................................ - // gap // .................................................................................................... - str q18, [x0, #240] // ................................................................................................*... - sub v11.8H, v30.8H, v2.8H // ..................................................e................................................. - sqrdmulh v8.8H, v21.8H, v0.H[1] // .................................e.................................................................. - str q12, [x0, #432] // ...................................................................................................* - sub v12.8H, v9.8H, v28.8H // .............................................e...................................................... - add v28.8H, v9.8H, v28.8H // ..............................................e..................................................... - // gap // .................................................................................................... - mls v14.8H, v26.8H, v7.H[0] // ...........................................................e........................................ - // gap // .................................................................................................... - str q4, [x0, #368] // ..................................................................................................*. 
- mul v5.8H, v21.8H, v0.H[0] // ................................e................................................................... - - // original source code - // ldr x10, [x0, #0] // ....................................................................e..............................|....................................................................e............................. - // ldr x11, [x0, #(0+8)] // ................................................................................e..................|................................................................................e................. - // ins v8.d[0], x10 // .......................................................................................e...........|.......................................................................................e.......... - // ins v8.d[1], x11 // .............*.....................................................................................|.............*.................................................................................... - // ldr x10, [x0, #(1*(512/8))] // ..........................................................e........................................|..........................................................e....................................... - // ldr x11, [x0, #((1*(512/8))+8)] // .....................................................................e.............................|.....................................................................e............................ - // ins v9.d[0], x10 // .......................................................................e...........................|.......................................................................e.......................... - // ins v9.d[1], x11 // .........................................................................................e.........|.........................................................................................e........ 
- // ldr x10, [x0, #(2*(512/8))] // ..............................................e....................................................|..............................................e................................................... - // ldr x11, [x0, #((2*(512/8))+8)] // ...........................................................e.......................................|...........................................................e...................................... - // ins v10.d[0], x10 // ..............................................................e....................................|..............................................................e................................... - // ins v10.d[1], x11 // .................................................................................e.................|.................................................................................e................ - // ldr x10, [x0, #(3*(512/8))] // ..e................................................................................................|..e............................................................................................... - // ldr x11, [x0, #((3*(512/8))+8)] // ....................e..............................................................................|....................e............................................................................. - // ins v11.d[0], x10 // .....................e.............................................................................|.....................e............................................................................ - // ins v11.d[1], x11 // ..................................e................................................................|..................................e............................................................... 
- // ldr x10, [x0, #(4*(512/8))] // ...........e.......................................................................................|...........e...................................................................................... - // ldr x11, [x0, #((4*(512/8))+8)] // ..........e........................................................................................|..........e....................................................................................... - // ins v12.d[0], x10 // .....................................e.............................................................|.....................................e............................................................ - // ins v12.d[1], x11 // .............................................e.....................................................|.............................................e.................................................... - // ldr x10, [x0, #(5*(512/8))] // ............e......................................................................................|............e..................................................................................... - // ldr x11, [x0, #((5*(512/8))+8)] // .............................e.....................................................................|.............................e.................................................................... - // ins v13.d[0], x10 // .................................e.................................................................|.................................e................................................................ - // ins v13.d[1], x11 // .........................................e.........................................................|.........................................e........................................................ 
- // ldr x10, [x0, #(6*(512/8))] // e..................................................................................................|e................................................................................................. - // ldr x11, [x0, #((6*(512/8))+8)] // ..................e................................................................................|..................e............................................................................... - // ins v14.d[0], x10 // ...................e...............................................................................|...................e.............................................................................. - // ins v14.d[1], x11 // ..............................e....................................................................|..............................e................................................................... - // ldr x10, [x0, #(7*(512/8))] // .e.................................................................................................|.e................................................................................................ - // ldr x11, [x0, #((7*(512/8))+8)] // ..............e....................................................................................|..............e................................................................................... - // ins v15.d[0], x10 // ......................e............................................................................|......................e........................................................................... - // ins v15.d[1], x11 // ...............................e...................................................................|...............................e.................................................................. 
- // mul v24.8h, v12.8h, v0.h[0] // ..................................................................................................e|.................................................................................................. - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ............................................................................................e......|............................................................................................e..... - // mls v24.8h, v12.8h, v7.h[0] // ...............*...................................................................................|...............*.................................................................................. - // sub v12.8h, v8.8h, v24.8h // ..................................................*................................................|..................................................*............................................... - // add v8.8h, v8.8h, v24.8h // ............................*......................................................................|............................*..................................................................... - // mul v24.8h, v13.8h, v0.h[0] // ......................................................e............................................|......................................................e........................................... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ................................................................e..................................|................................................................e................................. - // mls v24.8h, v13.8h, v7.h[0] // ..................................................................................e................|..................................................................................e............... 
- // sub v13.8h, v9.8h, v24.8h // ...*...............................................................................................|...*.............................................................................................. - // add v9.8h, v9.8h, v24.8h // ....*..............................................................................................|....*............................................................................................. - // mul v24.8h, v14.8h, v0.h[0] // ..........................................e........................................................|..........................................e....................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[1] // .................................................e.................................................|.................................................e................................................ - // mls v24.8h, v14.8h, v7.h[0] // ......................................................................e............................|......................................................................e........................... - // sub v14.8h, v10.8h, v24.8h // ..............................................................................................e....|..............................................................................................e... - // add v10.8h, v10.8h, v24.8h // ...............................................................................................e...|...............................................................................................e.. - // mul v24.8h, v15.8h, v0.h[0] // .......................................e...........................................................|.......................................e.......................................................... 
- // sqrdmulh v15.8h, v15.8h, v0.h[1] // ........................................e..........................................................|........................................e......................................................... - // mls v24.8h, v15.8h, v7.h[0] // ...................................................e...............................................|...................................................e.............................................. - // sub v15.8h, v11.8h, v24.8h // ...........................................................................................e.......|...........................................................................................e...... - // add v11.8h, v11.8h, v24.8h // .................................................................e.................................|.................................................................e................................ - // mul v24.8h, v10.8h, v0.h[2] // ........*..........................................................................................|........*......................................................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // .........*.........................................................................................|.........*........................................................................................ - // mls v24.8h, v10.8h, v7.h[0] // .........................*.........................................................................|.........................*........................................................................ - // sub v10.8h, v8.8h, v24.8h // .........................................................*.........................................|.........................................................*........................................ 
- // add v8.8h, v8.8h, v24.8h // ....................................................*..............................................|....................................................*............................................. - // mul v24.8h, v11.8h, v0.h[2] // ..............................................................................e....................|..............................................................................e................... - // sqrdmulh v11.8h, v11.8h, v0.h[3] // .............................................................................e.....................|.............................................................................e.................... - // mls v24.8h, v11.8h, v7.h[0] // ................................................................................................e..|................................................................................................e. - // sub v11.8h, v9.8h, v24.8h // ................*..................................................................................|................*................................................................................. - // add v9.8h, v9.8h, v24.8h // .................*.................................................................................|.................*................................................................................ - // mul v24.8h, v14.8h, v0.h[4] // ......*............................................................................................|......*........................................................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // .......*...........................................................................................|.......*.......................................................................................... 
- // mls v24.8h, v14.8h, v7.h[0] // ................................*..................................................................|................................*................................................................. - // sub v14.8h, v12.8h, v24.8h // .............................................................*.....................................|.............................................................*.................................... - // add v12.8h, v12.8h, v24.8h // ............................................................*......................................|............................................................*..................................... - // mul v24.8h, v15.8h, v0.h[4] // .....*.............................................................................................|.....*............................................................................................ - // sqrdmulh v15.8h, v15.8h, v0.h[5] // ...................................................................................................*.................................................................................................. - // mls v24.8h, v15.8h, v7.h[0] // .......................*...........................................................................|.......................*.......................................................................... - // sub v15.8h, v13.8h, v24.8h // ......................................*............................................................|......................................*........................................................... - // add v13.8h, v13.8h, v24.8h // ....................................*..............................................................|....................................*............................................................. 
- // mul v24.8h, v9.8h, v0.h[6] // ...................................*...............................................................|...................................*.............................................................. - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ........................*..........................................................................|........................*......................................................................... - // mls v24.8h, v9.8h, v7.h[0] // ................................................*..................................................|................................................*................................................. - // sub v9.8h, v8.8h, v24.8h // ...............................................................*...................................|...............................................................*.................................. - // add v8.8h, v8.8h, v24.8h // ..................................................................*................................|..................................................................*............................... - // mul v24.8h, v11.8h, v1.h[0] // ..........................*........................................................................|..........................*....................................................................... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...........................*.......................................................................|...........................*...................................................................... - // mls v24.8h, v11.8h, v7.h[0] // .......................................................*...........................................|.......................................................*.......................................... 
- // sub v11.8h, v10.8h, v24.8h // .........................................................................*.........................|.........................................................................*........................ - // add v10.8h, v10.8h, v24.8h // ............................................................................*......................|............................................................................*..................... - // mul v24.8h, v13.8h, v1.h[2] // ...........................................*.......................................................|...........................................*...................................................... - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ............................................*......................................................|............................................*..................................................... - // mls v24.8h, v13.8h, v7.h[0] // ........................................................*..........................................|........................................................*......................................... - // sub v13.8h, v12.8h, v24.8h // ........................................................................*..........................|........................................................................*......................... - // add v12.8h, v12.8h, v24.8h // ...........................................................................*.......................|...........................................................................*...................... - // mul v24.8h, v15.8h, v1.h[4] // ...............................................*...................................................|...............................................*.................................................. 
- // sqrdmulh v15.8h, v15.8h, v1.h[5] // .....................................................*.............................................|.....................................................*............................................ - // mls v24.8h, v15.8h, v7.h[0] // ...................................................................*...............................|...................................................................*.............................. - // sub v15.8h, v14.8h, v24.8h // ...................................................................................*...............|...................................................................................*.............. - // add v14.8h, v14.8h, v24.8h // ....................................................................................*..............|....................................................................................*............. - // str q8, [x0], #(16) // ...............................................................................*...................|...............................................................................*.................. - // str q9, [x0, #(-16 + 1*(512/8))] // ..........................................................................*........................|..........................................................................*....................... - // str q10, [x0, #(-16 + 2*(512/8))] // ........................................................................................*..........|........................................................................................*......... - // str q11, [x0, #(-16 + 3*(512/8))] // .....................................................................................*.............|.....................................................................................*............ 
- // str q12, [x0, #(-16 + 4*(512/8))] // ..........................................................................................*........|..........................................................................................*....... - // str q13, [x0, #(-16 + 5*(512/8))] // ......................................................................................*............|......................................................................................*........... - // str q14, [x0, #(-16 + 6*(512/8))] // .................................................................................................*.|.................................................................................................* - // str q15, [x0, #(-16 + 7*(512/8))] // .............................................................................................*.....|.............................................................................................*.... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Cycle bound: 16.0 + // IPC bound: 4.75 + // + // Wall time: 3600.35s + // User time: 3600.35s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + add v29.8H, v21.8H, v12.8H // ............*............................................................... + sub v25.8H, v21.8H, v12.8H // ...........*................................................................ + ldr q21, [x0, #16] // e........................................................................... + ldr q22, [x0, #464] // .......e.................................................................... + ldr q2, [x0, #336] // .....e...................................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + add v14.8H, v19.8H, v28.8H // ......................*..................................................... + add v31.8H, v13.8H, v5.8H // .....................................*...................................... + add v27.8H, v15.8H, v6.8H // ...............................................*............................ + sub v15.8H, v15.8H, v6.8H // ..............................................*............................. + sub v9.8H, v13.8H, v5.8H // ....................................*....................................... + ldr q18, [x0, #272] // ....e....................................................................... + ldr q19, [x0, #144] // ..e......................................................................... + // gap // ............................................................................ + ldr q23, [x0, #400] // ......e..................................................................... + mls v3.8H, v11.8H, v7.H[0] // ........................................*................................... + sqrdmulh v28.8H, v14.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + mul v20.8H, v14.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v16.8H, v15.8H, v1.H[5] // ...............................................................*............ 
+ mul v26.8H, v9.8H, v1.H[0] // ......................................................*..................... + mul v10.8H, v27.8H, v1.H[2] // ...........................................................*................ + sqrdmulh v14.8H, v27.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.8H, v2.8H, v0.H[1] // .............e.............................................................. + mul v17.8H, v22.8H, v0.H[0] // ........................e................................................... + sqrdmulh v6.8H, v9.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v30.8H, v22.8H, v0.H[1] // .......................e.................................................... + add v13.8H, v25.8H, v3.8H // ..........................................*................................. + mul v9.8H, v31.8H, v0.H[6] // .................................................*.......................... + sqrdmulh v11.8H, v31.8H, v0.H[7] // ................................................*........................... + mls v20.8H, v28.8H, v7.H[0] // ..............................*............................................. 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v15.8H, v15.8H, v1.H[4] // ................................................................*........... + sub v25.8H, v25.8H, v3.8H // .........................................*.................................. + mls v10.8H, v14.8H, v7.H[0] // ............................................................*............... + sqrdmulh v22.8H, v23.8H, v0.H[1] // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q24, [x0, #208] // ...e........................................................................ + mls v17.8H, v30.8H, v7.H[0] // .........................e.................................................. + mul v28.8H, v23.8H, v0.H[0] // ...................e........................................................ + mls v26.8H, v6.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v9.8H, v11.8H, v7.H[0] // ..................................................*......................... + sub v5.8H, v29.8H, v20.8H // ...............................*............................................ + mul v23.8H, v2.8H, v0.H[0] // ..............e............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v12.8H, v18.8H, v0.H[0] // .........e.................................................................. + sub v6.8H, v13.8H, v10.8H // .............................................................*.............. + mls v15.8H, v16.8H, v7.H[0] // .................................................................*.......... + add v16.8H, v13.8H, v10.8H // ..............................................................*............. + add v14.8H, v29.8H, v20.8H // ................................*........................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v8.8H, v5.8H, v26.8H // ........................................................*................... + sub v2.8H, v24.8H, v17.8H // ..........................e................................................. + add v3.8H, v24.8H, v17.8H // ...........................e................................................ 
+ add v31.8H, v5.8H, v26.8H // .........................................................*.................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #256] // ........................................................................*... + str q6, [x0, #320] // .........................................................................*.. + sub v16.8H, v14.8H, v9.8H // ...................................................*........................ + mls v28.8H, v22.8H, v7.H[0] // ....................e....................................................... + sqrdmulh v17.8H, v18.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v26.8H, v3.8H, v0.H[3] // .................................e.......................................... + str q8, [x0, #192] // .......................................................................*.... + str q31, [x0, #128] // ......................................................................*..... + mul v6.8H, v2.8H, v0.H[4] // ............................................e............................... + sqrdmulh v13.8H, v2.8H, v0.H[5] // ...........................................e................................ + mul v5.8H, v3.8H, v0.H[2] // ..................................e......................................... 
+ // gap // ............................................................................ + ldr q29, [x0, #80] // .e.......................................................................... + str q16, [x0, #64] // .....................................................................*...... + add v20.8H, v14.8H, v9.8H // ....................................................*....................... + mls v23.8H, v27.8H, v7.H[0] // ...............e............................................................ + add v11.8H, v25.8H, v15.8H // ...................................................................*........ + sub v15.8H, v25.8H, v15.8H // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v12.8H, v17.8H, v7.H[0] // ..........e................................................................. + sub v16.8H, v19.8H, v28.8H // .....................e...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v5.8H, v26.8H, v7.H[0] // ...................................e........................................ 
+ str q15, [x0, #448] // ...........................................................................* + mls v6.8H, v13.8H, v7.H[0] // .............................................e.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q11, [x0, #384] // ..........................................................................*. + // gap // ............................................................................ + add v13.8H, v29.8H, v23.8H // .................e.......................................................... + sqrdmulh v11.8H, v16.8H, v0.H[5] // ......................................e..................................... + mul v3.8H, v16.8H, v0.H[4] // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v15.8H, v29.8H, v23.8H // ................e........................................................... + str q20, [x0], #(16) // ....................................................................*....... + + // ------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q8, [x0, #0] // e.........................................................................'.~......................................................................... 
+ // ldr q9, [x0, #(1*(512/8))] // .........................................................e................'..........................................................~................ + // ldr q10, [x0, #(2*(512/8))] // .........e................................................................'..........~................................................................ + // ldr q11, [x0, #(3*(512/8))] // ..............................e...........................................'...............................~........................................... + // ldr q12, [x0, #(4*(512/8))] // ........e.................................................................'.........~................................................................. + // ldr q13, [x0, #(5*(512/8))] // ..e.......................................................................'...~....................................................................... + // ldr q14, [x0, #(6*(512/8))] // ..........e...............................................................'...........~............................................................... + // ldr q15, [x0, #(7*(512/8))] // .e........................................................................'..~........................................................................ + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ..................................................e.......................'...................................................~....................... + // mul v24.8h, v12.8h, v0.h[0] // .....................................e....................................'......................................~.................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................................................e..........'................................................................~.......... 
+ // sub v12.8h, v8.8h, v24.8h // ..........................................................................'*.......................................................................... + // add v8.8h, v8.8h, v24.8h // ..........................................................................*........................................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..................e.......................................................'...................~....................................................... + // mul v24.8h, v13.8h, v0.h[0] // ....................................e.....................................'.....................................~..................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................................e.............'.............................................................~............. + // sub v13.8h, v9.8h, v24.8h // ........................................................................e.'.........................................................................~. + // add v9.8h, v9.8h, v24.8h // .....................................................................e....'......................................................................~.... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // .............................e............................................'..............................~............................................ + // mul v24.8h, v14.8h, v0.h[0] // ................................e.........................................'.................................~......................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................e........................'..................................................~........................ 
+ // sub v14.8h, v10.8h, v24.8h // ................................................................e.........'.................................................................~......... + // add v10.8h, v10.8h, v24.8h // ...~......................................................................'....*...................................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .....................e....................................................'......................~.................................................... + // mul v24.8h, v15.8h, v0.h[0] // ...................e......................................................'....................~...................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............................e..........................................'................................~.......................................... + // sub v15.8h, v11.8h, v24.8h // ...........................................e..............................'............................................~.............................. + // add v11.8h, v11.8h, v24.8h // ............................................e.............................'.............................................~............................. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ............~.............................................................'.............*............................................................. + // mul v24.8h, v10.8h, v0.h[2] // .............~............................................................'..............*............................................................ + // mls v24.8h, v27.8h, v7.h[0] // .........................~................................................'..........................*................................................ 
+ // sub v10.8h, v8.8h, v24.8h // ...................................~......................................'....................................*...................................... + // add v8.8h, v8.8h, v24.8h // .........................................~................................'..........................................*................................ + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ...................................................e......................'....................................................~...................... + // mul v24.8h, v11.8h, v0.h[2] // ........................................................e.................'.........................................................~................. + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e........'..................................................................~........ + // sub v11.8h, v9.8h, v24.8h // .......~..................................................................'........*.................................................................. + // add v9.8h, v9.8h, v24.8h // ....~.....................................................................'.....*..................................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ......................................................................e...'.......................................................................~... + // mul v24.8h, v14.8h, v0.h[4] // .......................................................................e..'........................................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // ...........~..............................................................'............*.............................................................. 
+ // sub v14.8h, v12.8h, v24.8h // ...........................~..............................................'............................*.............................................. + // add v12.8h, v12.8h, v24.8h // ......................~...................................................'.......................*................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .......................................................e..................'........................................................~.................. + // mul v24.8h, v15.8h, v0.h[4] // ......................................................e...................'.......................................................~................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................e......'....................................................................~...... + // sub v15.8h, v13.8h, v24.8h // ......~...................................................................'.......*................................................................... + // add v13.8h, v13.8h, v24.8h // .....~....................................................................'......*.................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ........................~.................................................'.........................*................................................. + // mul v24.8h, v9.8h, v0.h[6] // .......................~..................................................'........................*.................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..................................~.......................................'...................................*....................................... 
+ // sub v9.8h, v8.8h, v24.8h // ................................................~.........................'.................................................*......................... + // add v8.8h, v8.8h, v24.8h // ...........................................................~..............'............................................................*.............. + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ....................~.....................................................'.....................*..................................................... + // mul v24.8h, v11.8h, v1.h[0] // ...............~..........................................................'................*.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................~........................................'..................................*........................................ + // sub v11.8h, v10.8h, v24.8h // ..........................................~...............................'...........................................*............................... + // add v10.8h, v10.8h, v24.8h // .............................................~............................'..............................................*............................ + // sqrdmulh v27.8h, v13.8h, v1.h[3] // .................~........................................................'..................*........................................................ + // mul v24.8h, v13.8h, v1.h[2] // ................~.........................................................'.................*......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................~.............................................'.............................*............................................. 
+ // sub v13.8h, v12.8h, v24.8h // ......................................~...................................'.......................................*................................... + // add v12.8h, v12.8h, v24.8h // ........................................~.................................'.........................................*................................. + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..............~...........................................................'...............*........................................................... + // mul v24.8h, v15.8h, v1.h[4] // ..........................~...............................................'...........................*............................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................~..................................'........................................*.................................. + // sub v15.8h, v14.8h, v24.8h // ..............................................................~...........'...............................................................*........... + // add v14.8h, v14.8h, v24.8h // .............................................................~............'..............................................................*............ + // str q8, [x0], #(16) // .........................................................................~'..........................................................................* + // str q9, [x0, #(-16 + 1*(512/8))] // ..........................................................~...............'...........................................................*............... + // str q10, [x0, #(-16 + 2*(512/8))] // .....................................................~....................'......................................................*.................... 
+ // str q11, [x0, #(-16 + 3*(512/8))] // ....................................................~.....................'.....................................................*..................... + // str q12, [x0, #(-16 + 4*(512/8))] // ..............................................~...........................'...............................................*........................... + // str q13, [x0, #(-16 + 5*(512/8))] // ...............................................~..........................'................................................*.......................... + // str q14, [x0, #(-16 + 6*(512/8))] // ....................................................................~.....'.....................................................................*..... + // str q15, [x0, #(-16 + 7*(512/8))] // ..................................................................~.......'...................................................................*....... sub count, count, #1 cbnz count, layer123_start - mul v9.8H, v11.8H, v0.H[4] // ...*............................................... - mul v2.8H, v28.8H, v0.H[2] // ......*............................................ - sqrdmulh v11.8H, v11.8H, v0.H[5] // *.................................................. - add v15.8H, v24.8H, v31.8H // ..*................................................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - ins v17.d[1], x15 // ........*.......................................... - sqrdmulh v4.8H, v28.8H, v0.H[3] // .......*........................................... - mls v5.8H, v8.8H, v7.H[0] // .........*......................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v21.8H, v15.8H, v14.8H // ..........*........................................ - add v22.8H, v15.8H, v14.8H // ...........*....................................... - sub v26.8H, v24.8H, v31.8H // .*................................................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v3.8H, v12.8H, v0.H[4] // ....*.............................................. - sqrdmulh v28.8H, v12.8H, v0.H[5] // .....*............................................. - mls v9.8H, v11.8H, v7.H[0] // ............*...................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v18.8H, v21.8H, v1.H[0] // ...............*................................... - sqrdmulh v13.8H, v21.8H, v1.H[1] // ................*.................................. - sqrdmulh v20.8H, v22.8H, v0.H[7] // .............*..................................... - mul v21.8H, v22.8H, v0.H[6] // ...................*............................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- add v10.8H, v17.8H, v5.8H // .................*................................. - mls v2.8H, v4.8H, v7.H[0] // ..............*.................................... - sub v23.8H, v17.8H, v5.8H // ..........................*........................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v14.8H, v26.8H, v9.8H // .....................*............................. - add v5.8H, v26.8H, v9.8H // ....................*.............................. - mls v3.8H, v28.8H, v7.H[0] // ..................*................................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v18.8H, v13.8H, v7.H[0] // .............................*..................... - mls v21.8H, v20.8H, v7.H[0] // .........................*......................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mul v4.8H, v14.8H, v1.H[4] // ........................*.......................... - sqrdmulh v30.8H, v14.8H, v1.H[5] // ............................*...................... - sub v29.8H, v10.8H, v2.8H // ...............................*................... 
- add v15.8H, v10.8H, v2.8H // ...........................*....................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sqrdmulh v6.8H, v5.8H, v1.H[3] // .......................*........................... - mul v16.8H, v5.8H, v1.H[2] // ......................*............................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - add v2.8H, v15.8H, v21.8H // ...................................*............... - sub v19.8H, v15.8H, v21.8H // ..................................*................ - add v22.8H, v23.8H, v3.8H // ................................*.................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v4.8H, v30.8H, v7.H[0] // ....................................*.............. - sub v20.8H, v23.8H, v3.8H // .................................*................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- sub v11.8H, v29.8H, v18.8H // ......................................*............ - str q19, [x0, #64] // .......................................*........... - mls v16.8H, v6.8H, v7.H[0] // ..............................*.................... - add v5.8H, v29.8H, v18.8H // .........................................*......... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str q11, [x0, #192] // .............................................*..... - str q2, [x0], #(16) // ..........................................*........ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v14.8H, v20.8H, v4.8H // ...........................................*....... - add v27.8H, v20.8H, v4.8H // ............................................*...... - str q5, [x0, #112] // ...............................................*... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v13.8H, v22.8H, v16.8H // .....................................*............. - add v26.8H, v22.8H, v16.8H // ........................................*.......... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str q14, [x0, #432] // .................................................*. - str q27, [x0, #368] // ..................................................* - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str q26, [x0, #240] // ................................................*.. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str q13, [x0, #304] // ..............................................*.... - - // original source code - // sqrdmulh v26.8H, v11.8H, v0.H[5] // ..*................................................ - // sub v19.8H, v24.8H, v31.8H // .........*......................................... - // add v22.8H, v24.8H, v31.8H // ...*............................................... - // mul v31.8H, v11.8H, v0.H[4] // *.................................................. - // mul v6.8H, v12.8H, v0.H[4] // ..........*........................................ - // sqrdmulh v4.8H, v12.8H, v0.H[5] // ...........*....................................... 
- // mul v10.8H, v28.8H, v0.H[2] // .*................................................. - // sqrdmulh v28.8H, v28.8H, v0.H[3] // .....*............................................. - // ins v17.d[1], x15 // ....*.............................................. - // mls v5.8H, v8.8H, v7.H[0] // ......*............................................ - // sub v8.8H, v22.8H, v14.8H // .......*........................................... - // add v13.8H, v22.8H, v14.8H // ........*.......................................... - // mls v31.8H, v26.8H, v7.H[0] // ............*...................................... - // sqrdmulh v9.8H, v13.8H, v0.H[7] // ...............*................................... - // mls v10.8H, v28.8H, v7.H[0] // ..................*................................ - // mul v27.8H, v8.8H, v1.H[0] // .............*..................................... - // sqrdmulh v18.8H, v8.8H, v1.H[1] // ..............*.................................... - // add v14.8H, v17.8H, v5.8H // .................*................................. - // mls v6.8H, v4.8H, v7.H[0] // ......................*............................ - // mul v11.8H, v13.8H, v0.H[6] // ................*.................................. - // add v4.8H, v19.8H, v31.8H // .....................*............................. - // sub v26.8H, v19.8H, v31.8H // ....................*.............................. - // mul v20.8H, v4.8H, v1.H[2] // ..............................*.................... - // sqrdmulh v4.8H, v4.8H, v1.H[3] // .............................*..................... - // mul v22.8H, v26.8H, v1.H[4] // .........................*......................... - // mls v11.8H, v9.8H, v7.H[0] // ........................*.......................... - // sub v15.8H, v17.8H, v5.8H // ...................*............................... - // add v12.8H, v14.8H, v10.8H // ............................*...................... 
- // sqrdmulh v23.8H, v26.8H, v1.H[5] // ..........................*........................ - // mls v27.8H, v18.8H, v7.H[0] // .......................*........................... - // mls v20.8H, v4.8H, v7.H[0] // ......................................*............ - // sub v14.8H, v14.8H, v10.8H // ...........................*....................... - // add v26.8H, v15.8H, v6.8H // .................................*................. - // sub v15.8H, v15.8H, v6.8H // ...................................*............... - // sub v18.8H, v12.8H, v11.8H // ................................*.................. - // add v3.8H, v12.8H, v11.8H // ...............................*................... - // mls v22.8H, v23.8H, v7.H[0] // ..................................*................ - // sub v8.8H, v26.8H, v20.8H // .............................................*..... - // sub v25.8H, v14.8H, v27.8H // ....................................*.............. - // str q18, [x0, #64] // .....................................*............. - // add v18.8H, v26.8H, v20.8H // ..............................................*.... - // add v23.8H, v14.8H, v27.8H // .......................................*........... - // str q3, [x0], #(16) // .........................................*......... - // sub v12.8H, v15.8H, v22.8H // ..........................................*........ - // add v4.8H, v15.8H, v22.8H // ...........................................*....... - // str q25, [x0, #176] // ........................................*.......... - // str q8, [x0, #304] // ..................................................* - // str q23, [x0, #112] // ............................................*...... - // str q18, [x0, #240] // .................................................*. - // str q12, [x0, #432] // ...............................................*... - // str q4, [x0, #368] // ................................................*.. 
+ // Instructions: 43 + // Expected cycles: 14 + // Expected IPC: 3.07 + // + // Cycle bound: 14.0 + // IPC bound: 3.07 + // + // Wall time: 0.99s + // User time: 0.99s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + sub v26.8H, v13.8H, v5.8H // ......*.................................... + add v4.8H, v19.8H, v28.8H // ..*........................................ + mls v3.8H, v11.8H, v7.H[0] // .......*................................... + add v16.8H, v15.8H, v6.8H // ....*...................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v8.8H, v21.8H, v12.8H // *.......................................... + sub v21.8H, v21.8H, v12.8H // .*......................................... + add v23.8H, v13.8H, v5.8H // ...*....................................... + sub v10.8H, v15.8H, v6.8H // .....*..................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v19.8H, v4.8H, v0.H[3] // ........*.................................. + mul v27.8H, v4.8H, v0.H[2] // .........*................................. + mul v4.8H, v16.8H, v1.H[2] // ............*.............................. + sqrdmulh v31.8H, v16.8H, v1.H[3] // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v15.8H, v21.8H, v3.8H // ....................*...................... 
+ sqrdmulh v29.8H, v10.8H, v1.H[5] // ..........*................................ + mul v10.8H, v10.8H, v1.H[4] // ...................*....................... + sqrdmulh v28.8H, v26.8H, v1.H[1] // ..............*............................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v2.8H, v26.8H, v1.H[0] // ...........*............................... + add v26.8H, v21.8H, v3.8H // ...............*........................... + sqrdmulh v14.8H, v23.8H, v0.H[7] // .................*......................... + mul v20.8H, v23.8H, v0.H[6] // ................*.......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v4.8H, v31.8H, v7.H[0] // .....................*..................... + mls v27.8H, v19.8H, v7.H[0] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v10.8H, v29.8H, v7.H[0] // ..........................*................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ mls v2.8H, v28.8H, v7.H[0] // ......................*.................... + mls v20.8H, v14.8H, v7.H[0] // .......................*................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v17.8H, v26.8H, v4.8H // ...........................*............... + sub v6.8H, v26.8H, v4.8H // .........................*................. + sub v25.8H, v8.8H, v27.8H // ........................*.................. + add v16.8H, v8.8H, v27.8H // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v31.8H, v15.8H, v10.8H // ......................................*.... + sub v22.8H, v15.8H, v10.8H // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v21.8H, v16.8H, v20.8H // .................................*......... + str q6, [x0, #320] // ................................*.......... + sub v24.8H, v25.8H, v2.8H // .............................*............. + add v18.8H, v25.8H, v2.8H // ..............................*............ + add v29.8H, v16.8H, v20.8H // .....................................*..... + str q17, [x0, #256] // ...............................*........... + // gap // ........................................... 
+ // gap // ........................................... + str q31, [x0, #384] // .........................................*. + str q22, [x0, #448] // ........................................*.. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q29, [x0], #(16) // ..........................................* + str q24, [x0, #176] // ..................................*........ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q21, [x0, #48] // ....................................*...... + str q18, [x0, #112] // ...................................*....... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // add v29.8H, v21.8H, v12.8H // ....*...................................... + // sub v25.8H, v21.8H, v12.8H // .....*..................................... + // add v14.8H, v19.8H, v28.8H // .*......................................... + // add v31.8H, v13.8H, v5.8H // ......*.................................... + // add v27.8H, v15.8H, v6.8H // ...*....................................... 
+ // sub v15.8H, v15.8H, v6.8H // .......*................................... + // sub v9.8H, v13.8H, v5.8H // *.......................................... + // mls v3.8H, v11.8H, v7.H[0] // ..*........................................ + // sqrdmulh v28.8H, v14.8H, v0.H[3] // ........*.................................. + // mul v20.8H, v14.8H, v0.H[2] // .........*................................. + // sqrdmulh v16.8H, v15.8H, v1.H[5] // .............*............................. + // mul v26.8H, v9.8H, v1.H[0] // ................*.......................... + // mul v10.8H, v27.8H, v1.H[2] // ..........*................................ + // sqrdmulh v14.8H, v27.8H, v1.H[3] // ...........*............................... + // sqrdmulh v6.8H, v9.8H, v1.H[1] // ...............*........................... + // add v13.8H, v25.8H, v3.8H // .................*......................... + // mul v9.8H, v31.8H, v0.H[6] // ...................*....................... + // sqrdmulh v11.8H, v31.8H, v0.H[7] // ..................*........................ + // mls v20.8H, v28.8H, v7.H[0] // .....................*..................... + // mul v15.8H, v15.8H, v1.H[4] // ..............*............................ + // sub v25.8H, v25.8H, v3.8H // ............*.............................. + // mls v10.8H, v14.8H, v7.H[0] // ....................*...................... + // mls v26.8H, v6.8H, v7.H[0] // .......................*................... + // mls v9.8H, v11.8H, v7.H[0] // ........................*.................. + // sub v5.8H, v29.8H, v20.8H // ...........................*............... + // sub v6.8H, v13.8H, v10.8H // ..........................*................ + // mls v15.8H, v16.8H, v7.H[0] // ......................*.................... + // add v16.8H, v13.8H, v10.8H // .........................*................. + // add v14.8H, v29.8H, v20.8H // ............................*.............. + // sub v8.8H, v5.8H, v26.8H // .................................*......... 
+ // add v31.8H, v5.8H, v26.8H // ..................................*........ + // str q16, [x0, #256] // ....................................*...... + // str q6, [x0, #320] // ................................*.......... + // sub v16.8H, v14.8H, v9.8H // ...............................*........... + // str q8, [x0, #192] // ........................................*.. + // str q31, [x0, #128] // ..........................................* + // str q16, [x0, #64] // .........................................*. + // add v20.8H, v14.8H, v9.8H // ...................................*....... + // add v11.8H, v25.8H, v15.8H // .............................*............. + // sub v15.8H, v25.8H, v15.8H // ..............................*............ + // str q15, [x0, #448] // ......................................*.... + // str q11, [x0, #384] // .....................................*..... + // str q20, [x0], #(16) // .......................................*... restore inp, STACK0 mov count, #8 .p2align 2 - ldr x21, [x1, #32] // ..................*...................................................... - ldr x24, [x1, #48] // ..*...................................................................... - ldr x20, [x3], #16 // .*....................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ldr x19, [x3, #-8] // .......*................................................................. - ldr x22, [x1, #0] // ......*.................................................................. 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ldr x15, [x1, #56] // ....*.................................................................... - ldr x9, [x1, #40] // ..........................*.............................................. - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ldr x26, [x4], #(6*16) // .........*............................................................... - ins v10.d[0], x20 // ..........*.............................................................. - ins v23.d[0], x24 // ...........*............................................................. - ldr x24, [x1, #16] // *........................................................................ - ins v5.d[0], x21 // .............................*........................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- ins v31.d[0], x22 // .....................................*................................... - ldr x22, [x1, #8] // ...........................*............................................. - ldr x14, [x1, #24] // ........*................................................................ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v10.d[1], x19 // ......................*.................................................. - ldr x7, [x4, #-72] // ............*............................................................ - ins v23.d[1], x15 // .....................*................................................... - ldr x15, [x4, #-40] // .....................................................................*... - ins v5.d[1], x9 // ...................................*..................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v0.d[0], x24 // ..............*.......................................................... - ldr x19, [x4, #-32] // ..................................................................*...... - ldr x24, [x4, #-48] // .......................*................................................. - ldr x21, [x4, #-56] // ................*........................................................ - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - sqrdmulh v9.8H, v5.8H, v10.H[1] // ......................................*.................................. - sqrdmulh v3.8H, v23.8H, v10.H[1] // ............................*............................................ - mul v22.8H, v23.8H, v10.H[0] // ..............................*.......................................... - mul v4.8H, v5.8H, v10.H[0] // .......................................*................................. - ldr x10, [x4, #-80] // ...*..................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v0.d[1], x14 // .........................*............................................... - ins v31.d[1], x22 // ................................................*........................ - ldr x11, [x4, #-24] // ...................*..................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v29.d[0], x24 // ...............................*......................................... - ldr x24, [x4, #-88] // ....................*.................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - mls v22.8H, v3.8H, v7.H[0] // ..................................*...................................... - mls v4.8H, v9.8H, v7.H[0] // ...............................................*......................... - ins v23.d[0], x10 // ...............*......................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- ins v23.d[1], x7 // ........................*................................................ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - sub v13.8H, v31.8H, v4.8H // ...................................................*..................... - add v24.8H, v31.8H, v4.8H // ....................................................*.................... - sub v20.8H, v0.8H, v22.8H // ........................................*................................ - add v19.8H, v0.8H, v22.8H // .........................................*............................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - sqrdmulh v18.8H, v20.8H, v10.H[5] // ..........................................*.............................. - mul v3.8H, v20.8H, v10.H[4] // ...........................................*............................. - mul v12.8H, v19.8H, v10.H[2] // .............................................*........................... - sqrdmulh v22.8H, v19.8H, v10.H[3] // ..............................................*.......................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v20.d[0], x26 // .................................*....................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - mls v12.8H, v22.8H, v7.H[0] // .....................................................*................... - ins v20.d[1], x24 // ....................................*.................................... - mls v3.8H, v18.8H, v7.H[0] // .................................................*....................... - ldr x24, [x4, #-16] // .....*................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - add v22.8H, v13.8H, v3.8H // ......................................................*.................. - sub v28.8H, v13.8H, v3.8H // .......................................................*................. - sub v17.8H, v24.8H, v12.8H // .........................................................*............... - add v2.8H, v24.8H, v12.8H // ........................................................*................ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v10.d[0], x24 // ................................*........................................ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... 
- trn2 v4.4S, v22.4S, v28.4S // ..........................................................*.............. - trn1 v22.4S, v22.4S, v28.4S // ...........................................................*............. - trn1 v5.4S, v2.4S, v17.4S // ..............................................................*.......... - trn2 v25.4S, v2.4S, v17.4S // ............................................................*............ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ldr x24, [x4, #-64] // .............*........................................................... - trn1 v11.2D, v25.2D, v4.2D // ....................................................................*.... - trn2 v14.2D, v25.2D, v4.2D // .............................................................*........... - trn1 v13.2D, v5.2D, v22.2D // ...................................................................*..... - // gap // ......................................................................... - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - trn2 v24.2D, v5.2D, v22.2D // .................................................................*....... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - sqrdmulh v17.8H, v14.8H, v23.8H // ...............................................................*......... - ins v15.d[0], x24 // ............................................*............................ - ldr x24, [x4, #-8] // .................*....................................................... - mul v0.8H, v14.8H, v20.8H // ................................................................*........ - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - mul v27.8H, v24.8H, v20.8H // .......................................................................*. - // gap // ......................................................................... - // gap // ......................................................................... 
- // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - sqrdmulh v23.8H, v24.8H, v23.8H // ........................................................................* - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - ins v10.d[1], x24 // ..................................................*...................... - mls v0.8H, v17.8H, v7.H[0] // ......................................................................*.. - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - // gap // ......................................................................... - - // original source code - // ldr x28, [x1, #16] // ..........*.............................................................. 
- // ldr x15, [x3], #16 // ..*...................................................................... - // ldr x21, [x1, #48] // .*....................................................................... - // ldr x26, [x4, #16] // ............................*............................................ - // ldr x20, [x1, #56] // .....*................................................................... - // ldr x24, [x4, #80] // ..................................................*...................... - // ldr x16, [x1, #0] // ....*.................................................................... - // ldr x19, [x3, #-8] // ...*..................................................................... - // ldr x14, [x1, #24] // ..............*.......................................................... - // ldr x13, [x4], #(6*16) // .......*................................................................. - // ins v6.d[0], x15 // ........*................................................................ - // ins v1.d[0], x21 // .........*............................................................... - // ldr x22, [x4, #-72] // ................*........................................................ - // ldr x8, [x4, #-64] // ............................................................*............ - // ins v25.d[0], x28 // ....................*.................................................... - // ins v14.d[0], x26 // ....................................*.................................... - // ldr x21, [x4, #-56] // .......................*................................................. - // ldr x26, [x4, #-8] // ...................................................................*..... - // ldr x29, [x1, #32] // *........................................................................ - // ldr x11, [x4, #-24] // ...............................*......................................... 
- // ldr x23, [x4, #-88] // .................................*....................................... - // ins v1.d[1], x20 // .................*....................................................... - // ins v6.d[1], x19 // ...............*......................................................... - // ldr x7, [x4, #-48] // ......................*.................................................. - // ins v14.d[1], x22 // .....................................*................................... - // ins v25.d[1], x14 // .............................*........................................... - // ldr x10, [x1, #40] // ......*.................................................................. - // ldr x12, [x1, #8] // .............*........................................................... - // sqrdmulh v0.8H, v1.8H, v6.H[1] // .........................*............................................... - // ins v3.d[0], x29 // ...........*............................................................. - // mul v11.8H, v1.8H, v6.H[0] // ..........................*.............................................. - // ins v29.d[0], x7 // ................................*........................................ - // ins v10.d[0], x24 // .......................................................*................. - // ins v19.d[0], x13 // ..............................................*.......................... - // mls v11.8H, v0.8H, v7.H[0] // ..................................*...................................... - // ins v3.d[1], x10 // ...................*..................................................... - // ins v19.d[1], x23 // ................................................*........................ - // ins v1.d[0], x16 // ............*............................................................ - // sqrdmulh v16.8H, v3.8H, v6.H[1] // ........................*................................................ 
- // mul v12.8H, v3.8H, v6.H[0] // ...........................*............................................. - // sub v4.8H, v25.8H, v11.8H // ........................................*................................ - // add v20.8H, v25.8H, v11.8H // .........................................*............................... - // sqrdmulh v18.8H, v4.8H, v6.H[5] // ..........................................*.............................. - // mul v21.8H, v4.8H, v6.H[4] // ...........................................*............................. - // ins v15.d[0], x8 // ..................................................................*...... - // mul v26.8H, v20.8H, v6.H[2] // ............................................*............................ - // sqrdmulh v3.8H, v20.8H, v6.H[3] // .............................................*........................... - // mls v12.8H, v16.8H, v7.H[0] // ...................................*..................................... - // ins v1.d[1], x12 // ..............................*.......................................... - // mls v21.8H, v18.8H, v7.H[0] // .................................................*....................... - // ins v10.d[1], x26 // .......................................................................*. - // sub v13.8H, v1.8H, v12.8H // ......................................*.................................. - // add v12.8H, v1.8H, v12.8H // .......................................*................................. - // mls v26.8H, v3.8H, v7.H[0] // ...............................................*......................... - // add v3.8H, v13.8H, v21.8H // ...................................................*..................... - // sub v20.8H, v13.8H, v21.8H // ....................................................*.................... - // add v18.8H, v12.8H, v26.8H // ......................................................*.................. 
- // sub v21.8H, v12.8H, v26.8H // .....................................................*................... - // trn2 v9.4S, v3.4S, v20.4S // ........................................................*................ - // trn1 v11.4S, v3.4S, v20.4S // .........................................................*............... - // trn2 v3.4S, v18.4S, v21.4S // ...........................................................*............. - // trn2 v0.2D, v3.2D, v9.2D // ..............................................................*.......... - // trn1 v12.4S, v18.4S, v21.4S // ..........................................................*.............. - // sqrdmulh v22.8H, v0.8H, v14.8H // .................................................................*....... - // mul v0.8H, v0.8H, v19.8H // ....................................................................*.... - // trn2 v23.2D, v12.2D, v11.2D // ................................................................*........ - // ldr x19, [x4, #-32] // .....................*................................................... - // trn1 v13.2D, v12.2D, v11.2D // ...............................................................*......... - // trn1 v11.2D, v3.2D, v9.2D // .............................................................*........... - // ldr x15, [x4, #-40] // ..................*...................................................... - // mls v0.8H, v22.8H, v7.H[0] // ........................................................................* - // mul v27.8H, v23.8H, v19.8H // .....................................................................*... - // sqrdmulh v23.8H, v23.8H, v14.8H // ......................................................................*.. 
+ // Instructions: 41 + // Expected cycles: 28 + // Expected IPC: 1.46 + // + // Cycle bound: 28.0 + // IPC bound: 1.46 + // + // Wall time: 0.51s + // User time: 0.51s + // + // ---------- original position -----------> + // 0 25 + // |------------------------|--------------- + ldr q12, [x4, #80] // .......................................*. + ldr q31, [x1, #48] // *........................................ + ldr q11, [x3], #16 // ..*...................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q2, [x1, #32] // .*....................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q9, [x1, #16] // ....*.................................... + ldr q17, [x1, #0] // ...*..................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + ldr q25, [x4, #16] // ............................*............ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + mul v1.8H, v31.8H, v11.H[0] // .....*................................... + sqrdmulh v0.8H, v31.8H, v11.H[1] // ......*.................................. + ldr q31, [x4, #32] // .............................*........... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v6.8H, v2.8H, v11.H[0] // .......*................................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sqrdmulh v27.8H, v2.8H, v11.H[1] // ........*................................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v1.8H, v0.8H, v7.H[0] // .........*............................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v6.8H, v27.8H, v7.H[0] // ..........*.............................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v28.8H, v9.8H, v1.8H // ...........*............................. + sub v2.8H, v9.8H, v1.8H // ............*............................ + ldr q1, [x4], #(6*16) // .........................*............... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sqrdmulh v0.8H, v28.8H, v11.H[3] // .............*........................... + mul v4.8H, v28.8H, v11.H[2] // ..............*.......................... 
+ mul v18.8H, v2.8H, v11.H[4] // ...............*......................... + sqrdmulh v26.8H, v2.8H, v11.H[5] // ................*........................ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v21.8H, v17.8H, v6.8H // .................*....................... + sub v22.8H, v17.8H, v6.8H // ....................*.................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v4.8H, v0.8H, v7.H[0] // ..................*...................... + mls v18.8H, v26.8H, v7.H[0] // ...................*..................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v27.8H, v21.8H, v4.8H // .....................*................... + add v3.8H, v21.8H, v4.8H // ......................*.................. + sub v14.8H, v22.8H, v18.8H // .......................*................. + add v16.8H, v22.8H, v18.8H // ........................*................ + ldr q11, [x4, #-48] // ...............................*......... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v5.4S, v16.4S, v14.4S // ...........................*............. + trn2 v4.4S, v3.4S, v27.4S // ..........................*.............. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + trn1 v6.4S, v3.4S, v27.4S // ..................................*...... + trn1 v13.4S, v16.4S, v14.4S // ....................................*.... + ldr q14, [x4, #-32] // ...................................*..... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v2.2D, v4.2D, v5.2D // ..............................*.......... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn1 v17.2D, v4.2D, v5.2D // .....................................*... + trn2 v22.2D, v6.2D, v13.2D // ........................................* + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v9.8H, v2.8H, v1.8H // ................................*........ + sqrdmulh v16.8H, v2.8H, v25.8H // .................................*....... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v9.8H, v16.8H, v7.H[0] // ......................................*.. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + + // ------------- new position -------------> + // 0 25 + // |------------------------|--------------- + // ldr q4, [x1, #48] // .*....................................... + // ldr q21, [x1, #32] // ...*..................................... + // ldr q5, [x3], #16 // ..*...................................... + // ldr q17, [x1, #0] // .....*................................... + // ldr q10, [x1, #16] // ....*.................................... + // mul v13.8H, v4.8H, v5.H[0] // .......*................................. + // sqrdmulh v31.8H, v4.8H, v5.H[1] // ........*................................ + // mul v1.8H, v21.8H, v5.H[0] // ..........*.............................. 
+ // sqrdmulh v3.8H, v21.8H, v5.H[1] // ...........*............................. + // mls v13.8H, v31.8H, v7.H[0] // ............*............................ + // mls v1.8H, v3.8H, v7.H[0] // .............*........................... + // add v26.8H, v10.8H, v13.8H // ..............*.......................... + // sub v22.8H, v10.8H, v13.8H // ...............*......................... + // sqrdmulh v25.8H, v26.8H, v5.H[3] // .................*....................... + // mul v21.8H, v26.8H, v5.H[2] // ..................*...................... + // mul v26.8H, v22.8H, v5.H[4] // ...................*..................... + // sqrdmulh v13.8H, v22.8H, v5.H[5] // ....................*.................... + // add v22.8H, v17.8H, v1.8H // .....................*................... + // mls v21.8H, v25.8H, v7.H[0] // .......................*................. + // mls v26.8H, v13.8H, v7.H[0] // ........................*................ + // sub v13.8H, v17.8H, v1.8H // ......................*.................. + // sub v14.8H, v22.8H, v21.8H // .........................*............... + // add v27.8H, v22.8H, v21.8H // ..........................*.............. + // sub v8.8H, v13.8H, v26.8H // ...........................*............. + // add v3.8H, v13.8H, v26.8H // ............................*............ + // ldr q1, [x4], #(6*16) // ................*........................ + // trn2 v10.4S, v27.4S, v14.4S // ...............................*......... + // trn2 v2.4S, v3.4S, v8.4S // ..............................*.......... + // ldr q25, [x4, #-80] // ......*.................................. + // ldr q31, [x4, #-64] // .........*............................... + // trn2 v17.2D, v10.2D, v2.2D // ...................................*..... + // ldr q11, [x4, #-48] // .............................*........... + // mul v9.8H, v17.8H, v1.8H // ......................................*.. + // sqrdmulh v16.8H, v17.8H, v25.8H // .......................................*. 
+ // trn1 v6.4S, v27.4S, v14.4S // ................................*........ + // ldr q14, [x4, #-32] // ..................................*...... + // trn1 v13.4S, v3.4S, v8.4S // .................................*....... + // trn1 v17.2D, v10.2D, v2.2D // ....................................*.... + // mls v9.8H, v16.8H, v7.H[0] // ........................................* + // ldr q12, [x4, #-16] // *........................................ + // trn2 v22.2D, v6.2D, v13.2D // .....................................*... sub count, count, #1 layer4567_start: - ldr x28, [x1, #80] // ....e....................................................................................................................... - ins v19.d[0], x19 // ..................................................................*......................................................... - ins v29.d[1], x15 // ...............................................................*............................................................ - ins v15.d[1], x21 // ...........................................................*................................................................ - ldr x15, [x3], #16 // ................e........................................................................................................... - ldr x21, [x1, #112] // ............e............................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ldr x26, [x4, #16] // ....................................................e....................................................................... 
- mls v27.8H, v23.8H, v7.H[0] // ..........................................................................*................................................. - add v22.8H, v11.8H, v0.8H // .................................................................................*.......................................... - sub v23.8H, v11.8H, v0.8H // ................................................................................*........................................... - ldr x20, [x1, #120] // .............e.............................................................................................................. - ldr x24, [x4, #80] // ....................................................................e....................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ldr x16, [x1, #64] // e........................................................................................................................... - ldr x19, [x3, #-8] // .................e.......................................................................................................... - ins v19.d[1], x11 // ...................................................................*........................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - ldr x14, [x1, #88] // .....e...................................................................................................................... - ldr x13, [x4], #(6*16) // ................................................e........................................................................... - ins v6.d[0], x15 // ..................e......................................................................................................... - sqrdmulh v0.8H, v22.8H, v29.8H // ...................................................................................*........................................ - mul v4.8H, v22.8H, v15.8H // ..................................................................................*......................................... - ins v1.d[0], x21 // ..............e............................................................................................................. - ldr x22, [x4, #-72] // .....................................................e...................................................................... - ldr x8, [x4, #-64] // ........................................................e................................................................... - // gap // ............................................................................................................................ - ins v25.d[0], x28 // ......e..................................................................................................................... - ins v14.d[0], x26 // ......................................................e..................................................................... - ldr x21, [x4, #-56] // .........................................................e.................................................................. 
- ldr x26, [x4, #-8] // .....................................................................e...................................................... - mul v22.8H, v23.8H, v19.8H // .......................................................................................*.................................... - sqrdmulh v3.8H, v23.8H, v10.8H // ........................................................................................*................................... - ldr x29, [x1, #96] // ........e................................................................................................................... - // gap // ............................................................................................................................ - ldr x11, [x4, #-24] // .................................................................e.......................................................... - ldr x23, [x4, #-88] // .................................................e.......................................................................... - ins v1.d[1], x20 // ...............e............................................................................................................ - add v26.8H, v13.8H, v27.8H // ............................................................................*............................................... - sub v27.8H, v13.8H, v27.8H // ...........................................................................*................................................ - ins v6.d[1], x19 // ...................e........................................................................................................ - ldr x7, [x4, #-48] // ............................................................e............................................................... - // gap // ............................................................................................................................ 
- mls v4.8H, v0.8H, v7.H[0] // ....................................................................................*....................................... - ins v14.d[1], x22 // .......................................................e.................................................................... - ins v25.d[1], x14 // .......e.................................................................................................................... - ldr x10, [x1, #104] // .........e.................................................................................................................. - ldr x12, [x1, #72] // .e.......................................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqrdmulh v0.8H, v1.8H, v6.H[1] // ..........................e................................................................................................. - mls v22.8H, v3.8H, v7.H[0] // .........................................................................................*.................................. - ins v3.d[0], x29 // ..........e................................................................................................................. - mul v11.8H, v1.8H, v6.H[0] // .........................e.................................................................................................. - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v29.d[0], x7 // ..............................................................e............................................................. - ins v10.d[0], x24 // ......................................................................e..................................................... - ins v19.d[0], x13 // ..................................................e......................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v23.8H, v26.8H, v4.8H // ......................................................................................*..................................... - sub v4.8H, v26.8H, v4.8H // .....................................................................................*...................................... - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v31.8H, v27.8H, v22.8H // ...........................................................................................*................................ - sub v22.8H, v27.8H, v22.8H // ..........................................................................................*................................. - mls v11.8H, v0.8H, v7.H[0] // ...........................e................................................................................................ - ins v3.d[1], x10 // ...........e................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v19.d[1], x23 // ...................................................e........................................................................ 
- trn2 v2.4S, v23.4S, v4.4S // .............................................................................................*.............................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v1.d[0], x16 // ..e......................................................................................................................... - trn1 v28.4S, v23.4S, v4.4S // ............................................................................................*............................... - sqrdmulh v16.8H, v3.8H, v6.H[1] // .....................e...................................................................................................... - trn2 v8.4S, v31.4S, v22.4S // ...............................................................................................*............................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v23.4S, v31.4S, v22.4S // ..............................................................................................*............................. - mul v12.8H, v3.8H, v6.H[0] // ....................e....................................................................................................... - sub v4.8H, v25.8H, v11.8H // ............................e............................................................................................... - sqdmulh v3.8H, v2.8H, v7.H[1] // ...................................................................................................*........................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v20.8H, v25.8H, v11.8H // .............................e.............................................................................................. - sqdmulh v17.8H, v28.8H, v7.H[1] // ................................................................................................*........................... - sqdmulh v22.8H, v8.8H, v7.H[1] // .........................................................................................................*.................. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqrdmulh v18.8H, v4.8H, v6.H[5] // ....................................e....................................................................................... - mul v21.8H, v4.8H, v6.H[4] // ...................................e........................................................................................ - ins v15.d[0], x8 // ..........................................................e................................................................. - sqdmulh v4.8H, v23.8H, v7.H[1] // ......................................................................................................*..................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- srshr v27.8H, v3.8H, #11 // ....................................................................................................*....................... - mul v26.8H, v20.8H, v6.H[2] // ..............................e............................................................................................. - sqrdmulh v3.8H, v20.8H, v6.H[3] // ...............................e............................................................................................ - mls v12.8H, v16.8H, v7.H[0] // ......................e..................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - srshr v16.8H, v17.8H, #11 // .................................................................................................*.......................... - ins v1.d[1], x12 // ...e........................................................................................................................ - srshr v22.8H, v22.8H, #11 // ..........................................................................................................*................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v21.8H, v18.8H, v7.H[0] // .....................................e...................................................................................... - srshr v0.8H, v4.8H, #11 // .......................................................................................................*.................... - ins v10.d[1], x26 // .......................................................................e.................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v13.8H, v1.8H, v12.8H // .......................e.................................................................................................... - add v12.8H, v1.8H, v12.8H // ........................e................................................................................................... 
- mls v2.8H, v27.8H, v7.H[0] // .....................................................................................................*...................... - mls v26.8H, v3.8H, v7.H[0] // ................................e........................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v8.8H, v22.8H, v7.H[0] // ...........................................................................................................*................ - mls v28.8H, v16.8H, v7.H[0] // ..................................................................................................*......................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- add v3.8H, v13.8H, v21.8H // .......................................e.................................................................................... - sub v20.8H, v13.8H, v21.8H // ......................................e..................................................................................... - mls v23.8H, v0.8H, v7.H[0] // ........................................................................................................*................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v18.8H, v12.8H, v26.8H // ..................................e......................................................................................... - sub v21.8H, v12.8H, v26.8H // .................................e.......................................................................................... - umov x25, v2.d[0] // ..............................................................................................................*............. - umov x16, v2.d[1] // ...............................................................................................................*............ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v9.4S, v3.4S, v20.4S // ...........................................e................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x14, v23.d[1] // .................................................................................................................*.......... - trn1 v11.4S, v3.4S, v20.4S // ..........................................e................................................................................. - trn2 v3.4S, v18.4S, v21.4S // .........................................e.................................................................................. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x10, v28.d[1] // .............................................................................................................*.............. - umov x13, v28.d[0] // ............................................................................................................*............... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x20, v8.d[1] // ...................................................................................................................*........ - trn2 v0.2D, v3.2D, v9.2D // .............................................e.............................................................................. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - str x25, [x1, #16] // ......................................................................................................................*..... - str x16, [x1, #48] // ..........................................................................................................................*. - umov x24, v8.d[0] // ..................................................................................................................*......... - umov x16, v23.d[0] // ................................................................................................................*........... - trn1 v12.4S, v18.4S, v21.4S // ........................................e................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - sqrdmulh v22.8H, v0.8H, v14.8H // ..............................................................................e............................................. - mul v0.8H, v0.8H, v19.8H // .............................................................................e.............................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v23.2D, v12.2D, v11.2D // ............................................e............................................................................... - // gap // ............................................................................................................................ - str x14, [x1, #40] // .........................................................................................................................*.. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ldr x19, [x4, #-32] // ................................................................e........................................................... - str x13, [x1], #( 16*4) // ....................................................................................................................*....... - str x20, [x1, #-8] // ...........................................................................................................................* - trn1 v13.2D, v12.2D, v11.2D // ..............................................e............................................................................. - trn1 v11.2D, v3.2D, v9.2D // ...............................................e............................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ldr x15, [x4, #-40] // .............................................................e.............................................................. - mls v0.8H, v22.8H, v7.H[0] // ...............................................................................e............................................ - str x10, [x1, #-32] // ........................................................................................................................*... 
- mul v27.8H, v23.8H, v19.8H // ........................................................................e................................................... - sqrdmulh v23.8H, v23.8H, v14.8H // .........................................................................e.................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - str x24, [x1, #-40] // .......................................................................................................................*.... - str x16, [x1, #-56] // .....................................................................................................................*...... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- - // original source code - // ldr x10, [x1, #(16*0)] // ............e...............................................................................................................|...........e............................................................................................................... - // ldr x11, [x1, #((16*0)+8)] // .........................................e..................................................................................|........................................e.................................................................................. - // ins v8.d[0], x10 // .........................................................e..................................................................|........................................................e.................................................................. - // ins v8.d[1], x11 // .............................................................................e..............................................|............................................................................e.............................................. - // ldr x10, [x1, #(16*1)] // e...........................................................................................................................e........................................................................................................................... - // ldr x11, [x1, #((16*1)+8)] // ...............e............................................................................................................|..............e............................................................................................................ - // ins v9.d[0], x10 // .......................e....................................................................................................|......................e.................................................................................................... 
- // ins v9.d[1], x11 // .......................................e....................................................................................|......................................e.................................................................................... - // ldr x10, [x1, #(16*2)] // .............................e..............................................................................................|............................e.............................................................................................. - // ldr x11, [x1, #((16*2)+8)] // ........................................e...................................................................................|.......................................e................................................................................... - // ins v10.d[0], x10 // ............................................e...............................................................................|...........................................e............................................................................... - // ins v10.d[1], x11 // ......................................................e.....................................................................|.....................................................e..................................................................... - // ldr x10, [x1, #(16*3)] // .....e......................................................................................................................|....e...................................................................................................................... - // ldr x11, [x1, #((16*3)+8)] // ..........e.................................................................................................................|.........e................................................................................................................. 
- // ins v11.d[0], x10 // ....................e.......................................................................................................|...................e....................................................................................................... - // ins v11.d[1], x11 // ................................e...........................................................................................|...............................e........................................................................................... - // ldr x10, [x3], #16 // ....e.......................................................................................................................|...e....................................................................................................................... - // ldr x11, [x3, #(-16+8)] // .............e..............................................................................................................|............e.............................................................................................................. - // ins v0.d[0], x10 // .................e..........................................................................................................|................e.......................................................................................................... - // ins v0.d[1], x11 // ...................................e........................................................................................|..................................e........................................................................................ - // mul v24.8h, v10.8h, v0.h[0] // ..............................................................e.............................................................|.............................................................e............................................................. 
- // sqrdmulh v10.8h, v10.8h, v0.h[1] // ...........................................................e................................................................|..........................................................e................................................................ - // mls v24.8h, v10.8h, v7.h[0] // ...........................................................................e................................................|..........................................................................e................................................ - // sub v10.8h, v8.8h, v24.8h // ..................................................................................e.........................................|.................................................................................e......................................... - // add v8.8h, v8.8h, v24.8h // ...................................................................................e........................................|..................................................................................e........................................ - // mul v24.8h, v11.8h, v0.h[0] // .............................................e..............................................................................|............................................e.............................................................................. - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..........................................e.................................................................................|.........................................e................................................................................. - // mls v24.8h, v11.8h, v7.h[0] // .....................................................e......................................................................|....................................................e...................................................................... 
- // sub v11.8h, v9.8h, v24.8h // ...............................................................e............................................................|..............................................................e............................................................ - // add v9.8h, v9.8h, v24.8h // .................................................................e..........................................................|................................................................e.......................................................... - // mul v24.8h, v9.8h, v0.h[2] // .........................................................................e..................................................|........................................................................e.................................................. - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ..........................................................................e.................................................|.........................................................................e................................................. - // mls v24.8h, v9.8h, v7.h[0] // .....................................................................................e......................................|....................................................................................e...................................... - // sub v9.8h, v8.8h, v24.8h // ............................................................................................e...............................|...........................................................................................e............................... - // add v8.8h, v8.8h, v24.8h // ...........................................................................................e................................|..........................................................................................e................................ 
- // mul v24.8h, v11.8h, v0.h[4] // .....................................................................e......................................................|....................................................................e...................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ....................................................................e.......................................................|...................................................................e....................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...............................................................................e............................................|..............................................................................e............................................ - // sub v11.8h, v10.8h, v24.8h // .........................................................................................e..................................|........................................................................................e.................................. - // add v10.8h, v10.8h, v24.8h // ........................................................................................e...................................|.......................................................................................e................................... - // trn1 v25.4s, v8.4s, v9.4s // ...........................................................................................................e................|..........................................................................................................e................ - // trn2 v26.4s, v8.4s, v9.4s // ..................................................................................................e.........................|.................................................................................................e......................... 
- // trn1 v27.4s, v10.4s, v11.4s // .................................................................................................e..........................|................................................................................................e.......................... - // trn2 v28.4s, v10.4s, v11.4s // ...............................................................................................e............................|..............................................................................................e............................ - // trn2 v10.2d, v25.2d, v27.2d // ..............................................................................................................e.............|.............................................................................................................e............. - // trn2 v11.2d, v26.2d, v28.2d // ......................................................................................................e.....................|.....................................................................................................e..................... - // trn1 v8.2d, v25.2d, v27.2d // ...................................................................................................................e........|..................................................................................................................e........ - // trn1 v9.2d, v26.2d, v28.2d // ....................................................................................................................e.......|...................................................................................................................e....... - // ldr x10, [x4], #(6*16) // ................e...........................................................................................................|...............e........................................................................................................... 
- // ldr x11, [x4, #(-(6*16)+8)] // ...............................e............................................................................................|..............................e............................................................................................ - // ins v0.d[0], x10 // ................................................e...........................................................................|...............................................e........................................................................... - // ins v0.d[1], x11 // .......................................................e....................................................................|......................................................e.................................................................... - // ldr x10, [x4, #(-6*16 + 1*16)] // ......e.....................................................................................................................|.....e..................................................................................................................... - // ldr x11, [x4, #((-6*16 + 1*16)+8)] // .....................e......................................................................................................|....................e...................................................................................................... - // ins v4.d[0], x10 // ........................e...................................................................................................|.......................e................................................................................................... - // ins v4.d[1], x11 // ......................................e.....................................................................................|.....................................e..................................................................................... 
- // ldr x10, [x4, #(-6*16 + 2*16)] // ......................e.....................................................................................................|.....................e..................................................................................................... - // ldr x11, [x4, #((-6*16 + 2*16)+8)] // .........................e..................................................................................................|........................e.................................................................................................. - // ins v1.d[0], x10 // ......................................................................e.....................................................|.....................................................................e..................................................... - // ins v1.d[1], x11 // ...*........................................................................................................................|..*........................................................................................................................ - // ldr x10, [x4, #(-6*16 + 3*16)] // ....................................e.......................................................................................|...................................e....................................................................................... - // ldr x11, [x4, #((-6*16 + 3*16)+8)] // .....................................................................................................................e......|....................................................................................................................e...... - // ins v5.d[0], x10 // ..............................................e.............................................................................|.............................................e............................................................................. 
- // ins v5.d[1], x11 // ..*.........................................................................................................................|.*......................................................................................................................... - // ldr x10, [x4, #(-6*16 + 4*16)] // ................................................................................................................e...........|...............................................................................................................e........... - // ldr x11, [x4, #((-6*16 + 4*16)+8)] // ..............................e.............................................................................................|.............................e............................................................................................. - // ins v2.d[0], x10 // .*..........................................................................................................................|*.......................................................................................................................... - // ins v2.d[1], x11 // ..............*.............................................................................................................|.............*............................................................................................................. - // ldr x10, [x4, #(-6*16 + 5*16)] // ...........e................................................................................................................|..........e................................................................................................................ - // ldr x11, [x4, #((-6*16 + 5*16)+8)] // ..........................e.................................................................................................|.........................e................................................................................................. 
- // ins v6.d[0], x10 // ...............................................e............................................................................|..............................................e............................................................................ - // ins v6.d[1], x11 // .................................................................................e..........................................|................................................................................e.......................................... - // mul v24.8h, v10.8h, v0.8h // ........................................................................................................................e...|.......................................................................................................................e... - // sqrdmulh v10.8h, v10.8h, v4.8h // .........................................................................................................................e..|........................................................................................................................e.. - // mls v24.8h, v10.8h, v7.h[0] // .......*....................................................................................................................|......*.................................................................................................................... - // sub v10.8h, v8.8h, v24.8h // ..................................*.........................................................................................|.................................*......................................................................................... - // add v8.8h, v8.8h, v24.8h // .................................*..........................................................................................|................................*.......................................................................................... 
- // mul v24.8h, v11.8h, v0.8h // .............................................................................................................e..............|............................................................................................................e.............. - // sqrdmulh v11.8h, v11.8h, v4.8h // ............................................................................................................e...............|...........................................................................................................e............... - // mls v24.8h, v11.8h, v7.h[0] // ......................................................................................................................e.....|.....................................................................................................................e..... - // sub v11.8h, v9.8h, v24.8h // .........*..................................................................................................................|........*.................................................................................................................. - // add v9.8h, v9.8h, v24.8h // ........*...................................................................................................................|.......*................................................................................................................... - // mul v24.8h, v9.8h, v1.8h // ...................*........................................................................................................|..................*........................................................................................................ - // sqrdmulh v9.8h, v9.8h, v5.8h // ..................*.........................................................................................................|.................*......................................................................................................... 
- // mls v24.8h, v9.8h, v7.h[0] // .....................................*......................................................................................|....................................*...................................................................................... - // sub v9.8h, v8.8h, v24.8h // ..................................................*.........................................................................|.................................................*......................................................................... - // add v8.8h, v8.8h, v24.8h // .................................................*..........................................................................|................................................*.......................................................................... - // mul v24.8h, v11.8h, v2.8h // ...........................*................................................................................................|..........................*................................................................................................ - // sqrdmulh v11.8h, v11.8h, v6.8h // ............................*...............................................................................................|...........................*............................................................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...........................................*................................................................................|..........................................*................................................................................ - // sub v11.8h, v10.8h, v24.8h // ....................................................*.......................................................................|...................................................*....................................................................... 
- // add v10.8h, v10.8h, v24.8h // ...................................................*........................................................................|..................................................*........................................................................ - // trn1 v25.4s, v8.4s, v9.4s // ..........................................................*.................................................................|.........................................................*................................................................. - // trn2 v26.4s, v8.4s, v9.4s // ........................................................*...................................................................|.......................................................*................................................................... - // trn1 v27.4s, v10.4s, v11.4s // .............................................................*..............................................................|............................................................*.............................................................. - // trn2 v28.4s, v10.4s, v11.4s // ............................................................*...............................................................|...........................................................*............................................................... - // sqdmulh v24.8h, v25.8h, v7.h[1] // ..................................................................*.........................................................|.................................................................*......................................................... - // srshr v24.8h, v24.8h, #11 // ............................................................................*...............................................|...........................................................................*............................................... 
- // mls v25.8h, v24.8h, v7.h[0] // .......................................................................................*....................................|......................................................................................*.................................... - // sqdmulh v24.8h, v26.8h, v7.h[1] // ................................................................*...........................................................|...............................................................*........................................................... - // srshr v24.8h, v24.8h, #11 // ........................................................................*...................................................|.......................................................................*................................................... - // mls v26.8h, v24.8h, v7.h[0] // ....................................................................................*.......................................|...................................................................................*....................................... - // sqdmulh v24.8h, v27.8h, v7.h[1] // .......................................................................*....................................................|......................................................................*.................................................... - // srshr v24.8h, v24.8h, #11 // ................................................................................*...........................................|...............................................................................*........................................... - // mls v27.8h, v24.8h, v7.h[0] // ..........................................................................................*.................................|.........................................................................................*................................. 
- // sqdmulh v24.8h, v28.8h, v7.h[1] // ...................................................................*........................................................|..................................................................*........................................................ - // srshr v24.8h, v24.8h, #11 // ..............................................................................*.............................................|.............................................................................*............................................. - // mls v28.8h, v24.8h, v7.h[0] // ......................................................................................*.....................................|.....................................................................................*..................................... - // umov x10, v25.d[0] // ....................................................................................................*.......................|...................................................................................................*....................... - // umov x11, v25.d[1] // ...................................................................................................*........................|..................................................................................................*........................ - // umov x12, v26.d[0] // .............................................................................................*..............................|............................................................................................*.............................. - // umov x13, v26.d[1] // ..............................................................................................*.............................|.............................................................................................*............................. 
- // umov x14, v27.d[0] // ..........................................................................................................*.................|.........................................................................................................*................. - // umov x15, v27.d[1] // ................................................................................................*...........................|...............................................................................................*........................... - // umov x16, v28.d[0] // .........................................................................................................*..................|........................................................................................................*.................. - // umov x17, v28.d[1] // .....................................................................................................*......................|....................................................................................................*...................... - // str x10, [x1], #( 16*4) // .................................................................................................................*..........|................................................................................................................*.......... - // str x14, [x1, #(-16*4 + 8*1)] // ...........................................................................................................................*|..........................................................................................................................* - // str x12, [x1, #(-16*4 + 8*2)] // .......................................................................................................*....................|......................................................................................................*.................... 
- // str x16, [x1, #(-16*4 + 8*3)] // ..........................................................................................................................*.|.........................................................................................................................*. - // str x11, [x1, #(-16*4 + 8*4)] // .......................................................................................................................*....|......................................................................................................................*.... - // str x15, [x1, #(-16*4 + 8*5)] // ...............................................................................................................*............|..............................................................................................................*............ - // str x13, [x1, #(-16*4 + 8*6)] // ........................................................................................................*...................|.......................................................................................................*................... - // str x17, [x1, #(-16*4 + 8*7)] // ..................................................................................................................*.........|.................................................................................................................*......... + // Instructions: 91 + // Expected cycles: 31 + // Expected IPC: 2.94 + // + // Cycle bound: 31.0 + // IPC bound: 2.94 + // + // Wall time: 50.95s + // User time: 50.95s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + ldr q4, [x1, #112] // ...e....................................................................................... 
+ trn1 v29.2D, v6.2D, v13.2D // ...............................*........................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q21, [x1, #96] // ..e........................................................................................ + add v15.8H, v17.8H, v9.8H // ................................................*.......................................... + ldr q5, [x3], #16 // ....e...................................................................................... + sub v16.8H, v17.8H, v9.8H // ...............................................*........................................... + mul v26.8H, v22.8H, v1.8H // ........................................*.................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v6.8H, v22.8H, v25.8H // .......................................*................................................... + ldr q17, [x1, #64] // e.......................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v9.8H, v15.8H, v11.8H // .................................................*......................................... + mul v0.8H, v15.8H, v31.8H // ..................................................*........................................ + ldr q10, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v15.8H, v16.8H, v12.8H // ......................................................*.................................... + mls v26.8H, v6.8H, v7.H[0] // .........................................*................................................. + mul v16.8H, v16.8H, v14.8H // .......................................................*................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v13.8H, v4.8H, v5.H[0] // ...........e............................................................................... + sqrdmulh v31.8H, v4.8H, v5.H[1] // ..........e................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v0.8H, v9.8H, v7.H[0] // ...................................................*....................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + mul v1.8H, v21.8H, v5.H[0] // ......e.................................................................................... + sqrdmulh v3.8H, v21.8H, v5.H[1] // .....e..................................................................................... + mls v16.8H, v15.8H, v7.H[0] // ........................................................*.................................. + add v15.8H, v29.8H, v26.8H // ...........................................*............................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v6.8H, v29.8H, v26.8H // ..........................................*................................................ + mls v13.8H, v31.8H, v7.H[0] // ............e.............................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ add v27.8H, v15.8H, v0.8H // .....................................................*..................................... + sub v14.8H, v15.8H, v0.8H // ....................................................*...................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v1.8H, v3.8H, v7.H[0] // .......e................................................................................... + sub v15.8H, v6.8H, v16.8H // .........................................................*................................. + add v16.8H, v6.8H, v16.8H // ..........................................................*................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v26.8H, v10.8H, v13.8H // ..............e............................................................................ 
+ trn1 v11.4S, v27.4S, v14.4S // ...........................................................*............................... + trn2 v0.4S, v27.4S, v14.4S // ............................................................*.............................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v22.8H, v10.8H, v13.8H // .............e............................................................................. + trn1 v4.4S, v16.4S, v15.4S // .............................................................*............................. + trn2 v6.4S, v16.4S, v15.4S // ..............................................................*............................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v25.8H, v26.8H, v5.H[3] // ...............e........................................................................... + mul v21.8H, v26.8H, v5.H[2] // ................e.......................................................................... 
+ sqdmulh v14.8H, v11.8H, v7.H[1] // ...............................................................*........................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v27.8H, v0.8H, v7.H[1] // ..................................................................*........................ + mul v26.8H, v22.8H, v5.H[4] // .....................e..................................................................... + sqrdmulh v13.8H, v22.8H, v5.H[5] // ....................e...................................................................... + sqdmulh v16.8H, v4.8H, v7.H[1] // .....................................................................*..................... + sqdmulh v15.8H, v6.8H, v7.H[1] // ........................................................................*.................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v22.8H, v17.8H, v1.8H // .........e................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v21.8H, v25.8H, v7.H[0] // .................e......................................................................... + srshr v14.8H, v14.8H, #11 // ................................................................*.......................... + srshr v27.8H, v27.8H, #11 // ...................................................................*....................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v26.8H, v13.8H, v7.H[0] // ......................e.................................................................... + srshr v16.8H, v16.8H, #11 // ......................................................................*.................... + srshr v15.8H, v15.8H, #11 // .........................................................................*................. + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v13.8H, v17.8H, v1.8H // ........e.................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v0.8H, v27.8H, v7.H[0] // ....................................................................*...................... + mls v11.8H, v14.8H, v7.H[0] // .................................................................*......................... + sub v14.8H, v22.8H, v21.8H // ..................e........................................................................ + add v27.8H, v22.8H, v21.8H // ...................e....................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + mls v4.8H, v16.8H, v7.H[0] // .......................................................................*................... + mls v6.8H, v15.8H, v7.H[0] // ..........................................................................*................ + sub v8.8H, v13.8H, v26.8H // .......................e................................................................... + add v3.8H, v13.8H, v26.8H // ........................e.................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q1, [x4], #(6*16) // .................................e......................................................... + trn2 v10.4S, v27.4S, v14.4S // ..........................e................................................................ + umov x19, v0.d[0] // .............................................................................*............. 
+ umov x14, v0.d[1] // ..............................................................................*............ + trn2 v2.4S, v3.4S, v8.4S // ............................e.............................................................. + umov x12, v11.d[0] // ...........................................................................*............... + ldr q25, [x4, #-80] // ..................................e........................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x9, v4.d[1] // ................................................................................*.......... + ldr q31, [x4, #-64] // ...................................e....................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v17.2D, v10.2D, v2.2D // ..............................e............................................................ + umov x27, v6.d[0] // .................................................................................*......... + umov x11, v6.d[1] // ..................................................................................*........ 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x13, v11.d[1] // ............................................................................*.............. + umov x20, v4.d[0] // ...............................................................................*........... + ldr q11, [x4, #-48] // ....................................e...................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v9.8H, v17.8H, v1.8H // .............................................e............................................. + sqrdmulh v16.8H, v17.8H, v25.8H // ............................................e.............................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v6.4S, v27.4S, v14.4S // .........................e................................................................. + str x19, [x1, #16] // .....................................................................................*..... + str x14, [x1, #48] // .........................................................................................*. + ldr q14, [x4, #-32] // .....................................e..................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v13.4S, v3.4S, v8.4S // ...........................e............................................................... + trn1 v17.2D, v10.2D, v2.2D // ................................e.......................................................... + str x12, [x1], #( 16*4) // ...................................................................................*....... + str x9, [x1, #-24] // ........................................................................................*.. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + str x27, [x1, #-40] // ......................................................................................*.... + str x11, [x1, #-8] // ..........................................................................................* + mls v9.8H, v16.8H, v7.H[0] // ..............................................e............................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x13, [x1, #-32] // .......................................................................................*... + str x20, [x1, #-56] // ....................................................................................*...... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q12, [x4, #-16] // ......................................e.................................................... + trn2 v22.2D, v6.2D, v13.2D // .............................e............................................................. 
+ + // ---------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // ldr q8, [x1, #(16*0)] // ........e..................................................................................'.......~................................................................................ + // ldr q9, [x1, #(16*1)] // ...........e...............................................................................'..........~............................................................................. + // ldr q10, [x1, #(16*2)] // ..e........................................................................................'.~...................................................................................... + // ldr q11, [x1, #(16*3)] // e..........................................................................................~........................................................................................ + // ldr q0, [x3], #16 // ....e......................................................................................'...~.................................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ...................e.......................................................................'..................~..................................................................... + // mul v24.8h, v10.8h, v0.h[0] // ..................e........................................................................'.................~...................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..........................e................................................................'.........................~.............................................................. + // sub v10.8h, v8.8h, v24.8h // ..................................................e........................................'.................................................~...................................... + // add v8.8h, v8.8h, v24.8h // ...........................................e...............................................'..........................................~............................................. + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ................e..........................................................................'...............~........................................................................ + // mul v24.8h, v11.8h, v0.h[0] // ...............e...........................................................................'..............~......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................e...................................................................'......................~................................................................. + // sub v11.8h, v9.8h, v24.8h // ................................e..........................................................'...............................~........................................................ + // add v9.8h, v9.8h, v24.8h // .............................e.............................................................'............................~........................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...................................e.......................................................'..................................~..................................................... 
+ // mul v24.8h, v9.8h, v0.h[2] // ....................................e......................................................'...................................~.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................e..............................................'...........................................~............................................ + // sub v9.8h, v8.8h, v24.8h // .....................................................e.....................................'....................................................~................................... + // add v8.8h, v8.8h, v24.8h // ......................................................e....................................'.....................................................~.................................. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ........................................e..................................................'.......................................~................................................ + // mul v24.8h, v11.8h, v0.h[4] // .......................................e...................................................'......................................~................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...............................................e...........................................'..............................................~......................................... + // sub v11.8h, v10.8h, v24.8h // .........................................................e.................................'........................................................~............................... + // add v10.8h, v10.8h, v24.8h // ..........................................................e................................'.........................................................~.............................. 
+ // trn1 v25.4s, v8.4s, v9.4s // ............................................................................e..............'...........................................................................~............ + // trn2 v26.4s, v8.4s, v9.4s // ............................................................e..............................'...........................................................~............................ + // trn1 v27.4s, v10.4s, v11.4s // ................................................................................e..........'...............................................................................~........ + // trn2 v28.4s, v10.4s, v11.4s // ...............................................................e...........................'..............................................................~......................... + // trn2 v10.2d, v25.2d, v27.2d // ..........................................................................................e'........................................................................................ + // trn2 v11.2d, v26.2d, v28.2d // ....................................................................e......................'...................................................................~.................... + // trn1 v8.2d, v25.2d, v27.2d // .~.........................................................................................'*....................................................................................... + // trn1 v9.2d, v26.2d, v28.2d // .................................................................................e.........'................................................................................~....... + // ldr q0, [ x4], #(6*16) // ...........................................................e...............................'..........................................................~............................. 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // .................................................................e.........................'................................................................~....................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ...................................................................e.......................'..................................................................~..................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .........................................................................e.................'........................................................................~............... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ...............................................................................e...........'..............................................................................~......... + // ldr q6, [x4, #(-6*16 + 5*16)] // .........................................................................................e.'........................................................................................ + // sqrdmulh v27.8h, v10.8h, v4.8h // .......~...................................................................................'......*................................................................................. + // mul v24.8h, v10.8h, v0.8h // ......~....................................................................................'.....*.................................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .............~.............................................................................'............*........................................................................... + // sub v10.8h, v8.8h, v24.8h // ......................~....................................................................'.....................*.................................................................. 
+ // add v8.8h, v8.8h, v24.8h // .....................~.....................................................................'....................*................................................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ...........................................................................e...............'..........................................................................~............. + // mul v24.8h, v11.8h, v0.8h // ..........................................................................e................'.........................................................................~.............. + // mls v24.8h, v27.8h, v7.h[0] // ......................................................................................e....'.....................................................................................~.. + // sub v11.8h, v9.8h, v24.8h // .....~.....................................................................................'....*................................................................................... + // add v9.8h, v9.8h, v24.8h // ...~.......................................................................................'..*..................................................................................... + // sqrdmulh v27.8h, v9.8h, v5.8h // .........~.................................................................................'........*............................................................................... + // mul v24.8h, v9.8h, v1.8h // ..........~................................................................................'.........*.............................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .................~.........................................................................'................*....................................................................... 
+ // sub v9.8h, v8.8h, v24.8h // .........................~.................................................................'........................*............................................................... + // add v8.8h, v8.8h, v24.8h // ........................~..................................................................'.......................*................................................................ + // sqrdmulh v27.8h, v11.8h, v6.8h // ............~..............................................................................'...........*............................................................................ + // mul v24.8h, v11.8h, v2.8h // ..............~............................................................................'.............*.......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................~......................................................................'...................*.................................................................... + // sub v11.8h, v10.8h, v24.8h // ...........................~...............................................................'..........................*............................................................. + // add v10.8h, v10.8h, v24.8h // ............................~..............................................................'...........................*............................................................ + // trn1 v25.4s, v8.4s, v9.4s // ..............................~............................................................'.............................*.......................................................... + // trn2 v26.4s, v8.4s, v9.4s // ...............................~...........................................................'..............................*......................................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // .................................~.........................................................'................................*....................................................... + // trn2 v28.4s, v10.4s, v11.4s // ..................................~........................................................'.................................*...................................................... + // sqdmulh v24.8h, v25.8h, v7.h[1] // .....................................~.....................................................'....................................*................................................... + // srshr v24.8h, v24.8h, #11 // .............................................~.............................................'............................................*........................................... + // mls v25.8h, v24.8h, v7.h[0] // ....................................................~......................................'...................................................*.................................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // ......................................~....................................................'.....................................*.................................................. + // srshr v24.8h, v24.8h, #11 // ..............................................~............................................'.............................................*.......................................... + // mls v26.8h, v24.8h, v7.h[0] // ...................................................~.......................................'..................................................*..................................... + // sqdmulh v24.8h, v27.8h, v7.h[1] // .........................................~.................................................'........................................*............................................... 
+ // srshr v24.8h, v24.8h, #11 // ................................................~..........................................'...............................................*........................................ + // mls v27.8h, v24.8h, v7.h[0] // .......................................................~...................................'......................................................*................................. + // sqdmulh v24.8h, v28.8h, v7.h[1] // ..........................................~................................................'.........................................*.............................................. + // srshr v24.8h, v24.8h, #11 // .................................................~.........................................'................................................*....................................... + // mls v28.8h, v24.8h, v7.h[0] // ........................................................~..................................'.......................................................*................................ + // umov x10, v25.d[0] // ................................................................~..........................'...............................................................*........................ + // umov x11, v25.d[1] // .......................................................................~...................'......................................................................*................. + // umov x12, v26.d[0] // .............................................................~.............................'............................................................*........................... + // umov x13, v26.d[1] // ..............................................................~............................'.............................................................*.......................... 
+ // umov x14, v27.d[0] // ........................................................................~..................'.......................................................................*................ + // umov x15, v27.d[1] // ..................................................................~........................'.................................................................*...................... + // umov x16, v28.d[0] // .....................................................................~.....................'....................................................................*................... + // umov x17, v28.d[1] // ......................................................................~....................'.....................................................................*.................. + // str x10, [x1], #( 16*4) // ..................................................................................~........'.................................................................................*...... + // str x14, [x1, #(-16*4 + 8*1)] // ........................................................................................~..'.......................................................................................* + // str x12, [x1, #(-16*4 + 8*2)] // .............................................................................~.............'............................................................................*........... + // str x16, [x1, #(-16*4 + 8*3)] // ....................................................................................~......'...................................................................................*.... + // str x11, [x1, #(-16*4 + 8*4)] // .......................................................................................~...'......................................................................................*. 
+ // str x15, [x1, #(-16*4 + 8*5)] // ...................................................................................~.......'..................................................................................*..... + // str x13, [x1, #(-16*4 + 8*6)] // ..............................................................................~............'.............................................................................*.......... + // str x17, [x1, #(-16*4 + 8*7)] // .....................................................................................~.....'....................................................................................*... sub count, count, #1 cbnz count, layer4567_start - ins v26.d[0], x19 // *.................................................. - add v22.8H, v11.8H, v0.8H // ....*.............................................. - ins v15.d[1], x21 // ..*................................................ - ins v29.d[1], x15 // .*................................................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v18.8H, v11.8H, v0.8H // .....*............................................. - mls v27.8H, v23.8H, v7.H[0] // ...*............................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - ins v26.d[1], x11 // ......*............................................ - mul v23.8H, v22.8H, v15.8H // ........*.......................................... 
- sqrdmulh v20.8H, v22.8H, v29.8H // .......*........................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sqrdmulh v10.8H, v18.8H, v10.8H // ..........*........................................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sub v28.8H, v13.8H, v27.8H // ............*...................................... - add v13.8H, v13.8H, v27.8H // ...........*....................................... - mul v9.8H, v18.8H, v26.8H // .........*......................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v23.8H, v20.8H, v7.H[0] // .............*..................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v9.8H, v10.8H, v7.H[0] // ..............*.................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - add v29.8H, v13.8H, v23.8H // ...............*................................... - sub v15.8H, v13.8H, v23.8H // ................*.................................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - trn2 v0.4S, v29.4S, v15.4S // ...................*............................... - trn1 v29.4S, v29.4S, v15.4S // ....................*.............................. - add v8.8H, v28.8H, v9.8H // .................*................................. - sub v11.8H, v28.8H, v9.8H // ..................*................................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - trn1 v4.4S, v8.4S, v11.4S // ......................*............................ - trn2 v23.4S, v8.4S, v11.4S // .....................*............................. - sqdmulh v2.8H, v29.8H, v7.H[1] // ........................*.......................... - sqdmulh v19.8H, v0.8H, v7.H[1] // .......................*........................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - sqdmulh v5.8H, v4.8H, v7.H[1] // ..........................*........................ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - srshr v9.8H, v2.8H, #11 // ............................*...................... - srshr v15.8H, v19.8H, #11 // ...........................*....................... - sqdmulh v28.8H, v23.8H, v7.H[1] // .........................*......................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - srshr v18.8H, v5.8H, #11 // ..............................*.................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v0.8H, v15.8H, v7.H[0] // ...............................*................... - srshr v15.8H, v28.8H, #11 // .............................*..................... - mls v29.8H, v9.8H, v7.H[0] // .................................*................. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - mls v4.8H, v18.8H, v7.H[0] // ..................................*................ - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - umov x22, v0.d[0] // ...................................*............... - umov x12, v0.d[1] // ....................................*.............. - mls v23.8H, v15.8H, v7.H[0] // ................................*.................. - umov x20, v29.d[1] // ......................................*............ - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - umov x13, v29.d[0] // .......................................*........... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - umov x23, v4.d[1] // .....................................*............. - umov x7, v4.d[0] // ............................................*...... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- umov x19, v23.d[0] // ...........................................*....... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - umov x11, v23.d[1] // ........................................*.......... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str x12, [x1, #48] // ..........................................*........ - str x22, [x1, #16] // .........................................*......... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str x20, [x1, #32] // ................................................*.. - str x13, [x1], #( 16*4) // ..............................................*.... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... 
- // gap // ................................................... - // gap // ................................................... - str x23, [x1, #-24] // .............................................*..... - str x7, [x1, #-56] // ..................................................* - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str x19, [x1, #-40] // .................................................*. - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - // gap // ................................................... - str x11, [x1, #-8] // ...............................................*... - - // original source code - // ins v19.d[0], x19 // *.................................................. - // ins v29.d[1], x15 // ...*............................................... - // ins v15.d[1], x21 // ..*................................................ - // mls v27.8H, v23.8H, v7.H[0] // .....*............................................. - // add v22.8H, v11.8H, v0.8H // .*................................................. - // sub v23.8H, v11.8H, v0.8H // ....*.............................................. - // ins v19.d[1], x11 // ......*............................................ - // sqrdmulh v0.8H, v22.8H, v29.8H // ........*.......................................... - // mul v4.8H, v22.8H, v15.8H // .......*........................................... 
- // mul v22.8H, v23.8H, v19.8H // ............*...................................... - // sqrdmulh v3.8H, v23.8H, v10.8H // .........*......................................... - // add v26.8H, v13.8H, v27.8H // ...........*....................................... - // sub v27.8H, v13.8H, v27.8H // ..........*........................................ - // mls v4.8H, v0.8H, v7.H[0] // .............*..................................... - // mls v22.8H, v3.8H, v7.H[0] // ..............*.................................... - // add v23.8H, v26.8H, v4.8H // ...............*................................... - // sub v4.8H, v26.8H, v4.8H // ................*.................................. - // add v31.8H, v27.8H, v22.8H // ...................*............................... - // sub v22.8H, v27.8H, v22.8H // ....................*.............................. - // trn2 v2.4S, v23.4S, v4.4S // .................*................................. - // trn1 v28.4S, v23.4S, v4.4S // ..................*................................ - // trn2 v8.4S, v31.4S, v22.4S // ......................*............................ - // trn1 v23.4S, v31.4S, v22.4S // .....................*............................. - // sqdmulh v3.8H, v2.8H, v7.H[1] // ........................*.......................... - // sqdmulh v17.8H, v28.8H, v7.H[1] // .......................*........................... - // sqdmulh v22.8H, v8.8H, v7.H[1] // ............................*...................... - // sqdmulh v4.8H, v23.8H, v7.H[1] // .........................*......................... - // srshr v27.8H, v3.8H, #11 // ...........................*....................... - // srshr v16.8H, v17.8H, #11 // ..........................*........................ - // srshr v22.8H, v22.8H, #11 // ...............................*................... - // srshr v0.8H, v4.8H, #11 // .............................*..................... 
- // mls v2.8H, v27.8H, v7.H[0] // ..............................*.................... - // mls v8.8H, v22.8H, v7.H[0] // ....................................*.............. - // mls v28.8H, v16.8H, v7.H[0] // ................................*.................. - // mls v23.8H, v0.8H, v7.H[0] // .................................*................. - // umov x25, v2.d[0] // ..................................*................ - // umov x16, v2.d[1] // ...................................*............... - // umov x14, v23.d[1] // .......................................*........... - // umov x10, v28.d[1] // .....................................*............. - // umov x13, v28.d[0] // ......................................*............ - // umov x20, v8.d[1] // ..........................................*........ - // str x25, [x1, #16] // ............................................*...... - // str x16, [x1, #48] // ...........................................*....... - // umov x24, v8.d[0] // .........................................*......... - // umov x16, v23.d[0] // ........................................*.......... - // str x14, [x1, #40] // ...............................................*... - // str x13, [x1], #( 16*4) // ..............................................*.... - // str x20, [x1, #-8] // ..................................................* - // str x10, [x1, #-32] // .............................................*..... - // str x24, [x1, #-40] // .................................................*. - // str x16, [x1, #-56] // ................................................*.. + // Instructions: 50 + // Expected cycles: 30 + // Expected IPC: 1.67 + // + // Cycle bound: 30.0 + // IPC bound: 1.67 + // + // Wall time: 0.91s + // User time: 0.91s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + trn1 v15.2D, v6.2D, v13.2D // *................................................. 
+ mul v16.8H, v22.8H, v1.8H // ...*.............................................. + sqrdmulh v27.8H, v22.8H, v25.8H // ....*............................................. + add v6.8H, v17.8H, v9.8H // .*................................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v26.8H, v17.8H, v9.8H // ..*............................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v0.8H, v6.8H, v11.8H // .....*............................................ + mul v6.8H, v6.8H, v31.8H // ......*........................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v11.8H, v26.8H, v12.8H // .......*.......................................... + mul v14.8H, v26.8H, v14.8H // .........*........................................ + mls v16.8H, v27.8H, v7.H[0] // ........*......................................... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v6.8H, v0.8H, v7.H[0] // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v27.8H, v15.8H, v16.8H // ............*..................................... + sub v15.8H, v15.8H, v16.8H // .............*.................................... + mls v14.8H, v11.8H, v7.H[0] // ...........*...................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v16.8H, v27.8H, v6.8H // ..............*................................... + sub v27.8H, v27.8H, v6.8H // ...............*.................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v26.8H, v15.8H, v14.8H // ................*................................. + add v15.8H, v15.8H, v14.8H // .................*................................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v6.4S, v16.4S, v27.4S // ..................*............................... + trn2 v16.4S, v16.4S, v27.4S // ...................*.............................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ trn1 v27.4S, v15.4S, v26.4S // ....................*............................. + trn2 v15.4S, v15.4S, v26.4S // .....................*............................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqdmulh v14.8H, v6.8H, v7.H[1] // ......................*........................... + sqdmulh v26.8H, v16.8H, v7.H[1] // .......................*.......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqdmulh v0.8H, v27.8H, v7.H[1] // ........................*......................... + sqdmulh v11.8H, v15.8H, v7.H[1] // .........................*........................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + srshr v14.8H, v14.8H, #11 // ..........................*....................... + srshr v26.8H, v26.8H, #11 // ...........................*...................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + srshr v0.8H, v0.8H, #11 // ............................*..................... + srshr v11.8H, v11.8H, #11 // .............................*.................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v6.8H, v14.8H, v7.H[0] // ...............................*.................. + mls v16.8H, v26.8H, v7.H[0] // ..............................*................... 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v27.8H, v0.8H, v7.H[0] // ................................*................. + mls v15.8H, v11.8H, v7.H[0] // .................................*................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + umov x27, v6.d[0] // ....................................*............. + umov x19, v6.d[1] // ........................................*......... + umov x13, v16.d[0] // ..................................*............... + umov x12, v16.d[1] // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + umov x14, v27.d[1] // .....................................*............ + umov x15, v27.d[0] // .........................................*........ + umov x9, v15.d[0] // ......................................*........... + umov x23, v15.d[1] // .......................................*.......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + str x13, [x1, #16] // ..........................................*....... + str x12, [x1, #48] // ...........................................*...... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + str x27, [x1], #( 16*4) // ............................................*..... + str x19, [x1, #-32] // ................................................*. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + str x14, [x1, #-24] // .............................................*.... + str x9, [x1, #-40] // ..............................................*... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + str x15, [x1, #-56] // .................................................* + str x23, [x1, #-8] // ...............................................*.. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // trn1 v29.2D, v6.2D, v13.2D // *................................................. + // add v15.8H, v17.8H, v9.8H // ...*.............................................. + // sub v16.8H, v17.8H, v9.8H // ....*............................................. + // mul v26.8H, v22.8H, v1.8H // .*................................................ + // sqrdmulh v6.8H, v22.8H, v25.8H // ..*............................................... + // sqrdmulh v9.8H, v15.8H, v11.8H // .....*............................................ + // mul v0.8H, v15.8H, v31.8H // ......*........................................... + // sqrdmulh v15.8H, v16.8H, v12.8H // .......*.......................................... + // mls v26.8H, v6.8H, v7.H[0] // .........*........................................ + // mul v16.8H, v16.8H, v14.8H // ........*......................................... + // mls v0.8H, v9.8H, v7.H[0] // ..........*....................................... + // mls v16.8H, v15.8H, v7.H[0] // .............*.................................... + // add v15.8H, v29.8H, v26.8H // ...........*...................................... + // sub v6.8H, v29.8H, v26.8H // ............*..................................... + // add v27.8H, v15.8H, v0.8H // ..............*................................... + // sub v14.8H, v15.8H, v0.8H // ...............*.................................. + // sub v15.8H, v6.8H, v16.8H // ................*................................. + // add v16.8H, v6.8H, v16.8H // .................*................................ + // trn1 v11.4S, v27.4S, v14.4S // ..................*............................... 
+ // trn2 v0.4S, v27.4S, v14.4S // ...................*.............................. + // trn1 v4.4S, v16.4S, v15.4S // ....................*............................. + // trn2 v6.4S, v16.4S, v15.4S // .....................*............................ + // sqdmulh v14.8H, v11.8H, v7.H[1] // ......................*........................... + // sqdmulh v27.8H, v0.8H, v7.H[1] // .......................*.......................... + // sqdmulh v16.8H, v4.8H, v7.H[1] // ........................*......................... + // sqdmulh v15.8H, v6.8H, v7.H[1] // .........................*........................ + // srshr v14.8H, v14.8H, #11 // ..........................*....................... + // srshr v27.8H, v27.8H, #11 // ...........................*...................... + // srshr v16.8H, v16.8H, #11 // ............................*..................... + // srshr v15.8H, v15.8H, #11 // .............................*.................... + // mls v0.8H, v27.8H, v7.H[0] // ...............................*.................. + // mls v11.8H, v14.8H, v7.H[0] // ..............................*................... + // mls v4.8H, v16.8H, v7.H[0] // ................................*................. + // mls v6.8H, v15.8H, v7.H[0] // .................................*................ + // umov x19, v0.d[0] // ....................................*............. + // umov x14, v0.d[1] // .....................................*............ + // umov x12, v11.d[0] // ..................................*............... + // umov x9, v4.d[1] // ......................................*........... + // umov x27, v6.d[0] // ........................................*......... + // umov x11, v6.d[1] // .........................................*........ + // umov x13, v11.d[1] // ...................................*.............. + // umov x20, v4.d[0] // .......................................*.......... + // str x19, [x1, #16] // ..........................................*....... 
+ // str x14, [x1, #48] // ...........................................*...... + // str x12, [x1], #( 16*4) // ............................................*..... + // str x9, [x1, #-24] // ..............................................*... + // str x27, [x1, #-40] // ...............................................*.. + // str x11, [x1, #-8] // .................................................* + // str x13, [x1, #-32] // .............................................*.... + // str x20, [x1, #-56] // ................................................*. pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm.s index b4932a82..2b63f435 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm.s @@ -26,15 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- xtmp0 .req x10 xtmp1 .req x11 .macro vins vec_out, gpr_in, lane @@ -45,27 +36,6 @@ xtmp1 .req x11 umov \gpr_out\(), \vec_in\().d[\lane] .endm -.macro ldr_vo vec, base, offset - ldr xtmp0, [\base, #\offset] - ldr xtmp1, [\base, #(\offset+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro ldr_vi vec, base, inc - ldr xtmp0, [\base], #\inc - ldr xtmp1, [\base, #(-\inc+8)] - vins \vec, xtmp0, 0 - vins \vec, xtmp1, 1 -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -83,15 +53,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +70,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -119,21 +83,21 @@ xtmp1 .req x11 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - 
ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -178,7 +142,7 @@ xtmp1 .req x11 .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -189,7 +153,7 @@ xtmp1 .req x11 str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -199,7 +163,7 @@ xtmp1 .req x11 add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -207,7 +171,7 @@ xtmp1 .req x11 stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -218,19 +182,19 @@ xtmp1 .req x11 #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -369,1424 +333,1156 @@ _ntt_kyber_123_4567_scalar_load_store_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr x21, [x0, #320] // *........................ 
- ldr x13, [x0, #256] // .........*............... - // gap // ......................... - // gap // ......................... - ldr x22, [x0, #384] // ..*...................... - ldr x29, [x0, #264] // ................*........ - // gap // ......................... - // gap // ......................... - ldr x11, [x0, #72] // ......*.................. - // gap // ......................... - // gap // ......................... - // gap // ......................... - ins v29.d[0], x13 // ...........*............. - ldr x20, [x0, #392] // .*....................... - // gap // ......................... - // gap // ......................... - ldr x9, [x0, #328] // ...*..................... - ins v14.d[0], x22 // ........*................ - // gap // ......................... - // gap // ......................... - ins v30.d[0], x21 // ..........*.............. - ldr x26, [x0, #128] // ....*.................... - // gap // ......................... - // gap // ......................... - ins v14.d[1], x20 // ...............*......... - ldr x13, [x0, #8] // .....*................... - // gap // ......................... - // gap // ......................... - ins v29.d[1], x29 // ..................*...... - ldr x28, [x0, #64] // .......*................. - // gap // ......................... - // gap // ......................... - ldr x20, [x0, #200] // ............*............ - ins v30.d[1], x9 // ..............*.......... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ldr x15, [x0, #0] // .............*........... - sqrdmulh v26.8H, v29.8H, v0.H[1] // ......................*.. - // gap // ......................... - sqrdmulh v20.8H, v30.8H, v0.H[1] // ....................*.... - sqrdmulh v12.8H, v14.8H, v0.H[1] // .......................*. - // gap // ......................... - // gap // ......................... - mul v4.8H, v30.8H, v0.H[0] // ...................*..... 
- mul v30.8H, v29.8H, v0.H[0] // .....................*... - // gap // ......................... - // gap // ......................... - // gap // ......................... - ins v29.d[0], x26 // ........................* - // gap // ......................... - ins v11.d[0], x28 // .................*....... - - // original source code - // ldr x7, [x0, #320] // *........................ - // ldr x12, [x0, #392] // ......*.................. - // ldr x9, [x0, #384] // ..*...................... - // ldr x22, [x0, #328] // .......*................. - // ldr x23, [x0, #128] // ..........*.............. - // ldr x13, [x0, #8] // ............*............ - // ldr x11, [x0, #72] // ....*.................... - // ldr x19, [x0, #64] // ..............*.......... - // ins v14.d[0], x9 // ........*................ - // ldr x25, [x0, #256] // .*....................... - // ins v5.d[0], x7 // .........*............... - // ins v29.d[0], x25 // .....*................... - // ldr x20, [x0, #200] // ...............*......... - // ldr x15, [x0, #0] // .................*....... - // ins v5.d[1], x22 // ................*........ - // ins v14.d[1], x12 // ...........*............. - // ldr x17, [x0, #264] // ...*..................... - // ins v11.d[0], x19 // ........................* - // ins v29.d[1], x17 // .............*........... - // mul v4.8H, v5.8H, v0.H[0] // .....................*... - // sqrdmulh v20.8H, v5.8H, v0.H[1] // ...................*..... - // mul v30.8H, v29.8H, v0.H[0] // ......................*.. - // sqrdmulh v26.8H, v29.8H, v0.H[1] // ..................*...... - // sqrdmulh v12.8H, v14.8H, v0.H[1] // ....................*.... - // ins v29.d[0], x23 // .......................*. 
+ // Instructions: 55 + // Expected cycles: 28 + // Expected IPC: 1.96 + // + // Cycle bound: 28.0 + // IPC bound: 1.96 + // + // Wall time: 1.55s + // User time: 1.55s + // + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q14, [x0, #320] // ..*.................................................... + ldr q15, [x0, #448] // *...................................................... + // gap // ....................................................... + // gap // ....................................................... + ldr q6, [x0, #256] // ....*.................................................. + ldr q29, [x0, #384] // ...*................................................... + // gap // ....................................................... + // gap // ....................................................... + ldr q21, [x0, #128] // ......*................................................ + ldr q13, [x0, #192] // .....*................................................. + // gap // ....................................................... + // gap // ....................................................... + ldr q17, [x0, #64] // .........*............................................. + // gap // ....................................................... + // gap // ....................................................... + // gap // ....................................................... + mul v11.8H, v15.8H, v0.H[0] // .......*............................................... + sqrdmulh v27.8H, v15.8H, v0.H[1] // ........*.............................................. + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v12.8H, v14.8H, v0.H[1] // ............*.......................................... + mul v14.8H, v14.8H, v0.H[0] // .............*......................................... 
+ // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v25.8H, v6.8H, v0.H[1] // ................*...................................... + mul v6.8H, v6.8H, v0.H[0] // ..........................*............................ + // gap // ....................................................... + // gap // ....................................................... + mls v11.8H, v27.8H, v7.H[0] // ..............*........................................ + mul v3.8H, v29.8H, v0.H[0] // ..........*............................................ + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v15.8H, v29.8H, v0.H[1] // ...........*........................................... + mls v14.8H, v12.8H, v7.H[0] // ..................*.................................... + // gap // ....................................................... + ldr q16, [x0, #0] // .*..................................................... + mls v6.8H, v25.8H, v7.H[0] // ...............................*....................... + // gap // ....................................................... + // gap // ....................................................... + // gap // ....................................................... + sub v25.8H, v13.8H, v11.8H // ...................*................................... + add v27.8H, v13.8H, v11.8H // .................*..................................... + // gap // ....................................................... + // gap // ....................................................... + add v11.8H, v17.8H, v14.8H // ........................*.............................. + // gap // ....................................................... + // gap // ....................................................... 
+ mls v3.8H, v15.8H, v7.H[0] // ...............*....................................... + sub v15.8H, v16.8H, v6.8H // ................................................*...... + add v23.8H, v16.8H, v6.8H // ...........................................*........... + // gap // ....................................................... + // gap // ....................................................... + mul v16.8H, v27.8H, v0.H[2] // .....................*................................. + sqrdmulh v13.8H, v27.8H, v0.H[3] // ......................*................................ + // gap // ....................................................... + // gap // ....................................................... + add v6.8H, v21.8H, v3.8H // .............................*......................... + sub v26.8H, v21.8H, v3.8H // ....................*.................................. + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v27.8H, v25.8H, v0.H[5] // ..............................*........................ + mul v25.8H, v25.8H, v0.H[4] // ............................*.......................... + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v22.8H, v6.8H, v0.H[3] // ................................*...................... + mls v16.8H, v13.8H, v7.H[0] // ...........................*........................... + // gap // ....................................................... + // gap // ....................................................... + mul v31.8H, v26.8H, v0.H[4] // .......................*............................... + sqrdmulh v26.8H, v26.8H, v0.H[5] // .....................................*................. + // gap // ....................................................... + // gap // ....................................................... 
+ sub v2.8H, v17.8H, v14.8H // .........................*............................. + mls v25.8H, v27.8H, v7.H[0] // ....................................*.................. + // gap // ....................................................... + // gap // ....................................................... + mul v21.8H, v6.8H, v0.H[2] // ..................................*.................... + // gap // ....................................................... + // gap // ....................................................... + add v6.8H, v11.8H, v16.8H // ......................................*................ + mls v31.8H, v26.8H, v7.H[0] // ...............................................*....... + sub v27.8H, v11.8H, v16.8H // .................................*..................... + // gap // ....................................................... + // gap // ....................................................... + sub v16.8H, v2.8H, v25.8H // ............................................*.......... + add v26.8H, v2.8H, v25.8H // ..........................................*............ + // gap // ....................................................... + // gap // ....................................................... + mul v14.8H, v27.8H, v1.H[0] // ...................................*................... + sqrdmulh v27.8H, v27.8H, v1.H[1] // ........................................*.............. + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v11.8H, v26.8H, v1.H[3] // .............................................*......... + mls v21.8H, v22.8H, v7.H[0] // .......................................*............... + // gap // ....................................................... + // gap // ....................................................... + mul v20.8H, v26.8H, v1.H[2] // ..............................................*........ 
+ mul v29.8H, v6.8H, v0.H[6] // ..................................................*.... + // gap // ....................................................... + // gap // ....................................................... + sqrdmulh v30.8H, v6.8H, v0.H[7] // .........................................*............. + add v28.8H, v15.8H, v31.8H // ......................................................* + // gap // ....................................................... + // gap // ....................................................... + sub v6.8H, v15.8H, v31.8H // .....................................................*. + add v18.8H, v23.8H, v21.8H // ...................................................*... + // gap // ....................................................... + // gap // ....................................................... + mls v14.8H, v27.8H, v7.H[0] // .................................................*..... + mls v20.8H, v11.8H, v7.H[0] // ....................................................*.. + // gap // ....................................................... + // gap // ....................................................... + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q24, [x0, #448] // .*..................................................... + // ldr q13, [x0, #0] // .................*..................................... + // ldr q26, [x0, #320] // *...................................................... + // ldr q2, [x0, #384] // ...*................................................... + // ldr q30, [x0, #256] // ..*.................................................... + // ldr q17, [x0, #192] // .....*................................................. + // ldr q22, [x0, #128] // ....*.................................................. + // mul v31.8H, v24.8H, v0.H[0] // .......*............................................... 
+ // sqrdmulh v15.8H, v24.8H, v0.H[1] // ........*.............................................. + // ldr q8, [x0, #64] // ......*................................................ + // mul v23.8H, v2.8H, v0.H[0] // ..............*........................................ + // sqrdmulh v4.8H, v2.8H, v0.H[1] // ...............*....................................... + // sqrdmulh v25.8H, v26.8H, v0.H[1] // .........*............................................. + // mul v29.8H, v26.8H, v0.H[0] // ..........*............................................ + // mls v31.8H, v15.8H, v7.H[0] // .............*......................................... + // mls v23.8H, v4.8H, v7.H[0] // ......................*................................ + // sqrdmulh v21.8H, v30.8H, v0.H[1] // ...........*........................................... + // add v26.8H, v17.8H, v31.8H // ....................*.................................. + // mls v29.8H, v25.8H, v7.H[0] // ................*...................................... + // sub v18.8H, v17.8H, v31.8H // ...................*................................... + // sub v11.8H, v22.8H, v23.8H // ............................*.......................... + // mul v25.8H, v26.8H, v0.H[2] // .........................*............................. + // sqrdmulh v16.8H, v26.8H, v0.H[3] // ..........................*............................ + // mul v12.8H, v11.8H, v0.H[4] // .................................*..................... + // add v5.8H, v8.8H, v29.8H // .....................*................................. + // sub v27.8H, v8.8H, v29.8H // ...................................*................... + // mul v8.8H, v30.8H, v0.H[0] // ............*.......................................... + // mls v25.8H, v16.8H, v7.H[0] // ................................*...................... + // mul v3.8H, v18.8H, v0.H[4] // ..............................*........................ 
+ // add v14.8H, v22.8H, v23.8H // ...........................*........................... + // sqrdmulh v31.8H, v18.8H, v0.H[5] // .............................*......................... + // mls v8.8H, v21.8H, v7.H[0] // ..................*.................................... + // sqrdmulh v22.8H, v14.8H, v0.H[3] // ...............................*....................... + // sub v16.8H, v5.8H, v25.8H // ........................................*.............. + // mul v21.8H, v14.8H, v0.H[2] // .....................................*................. + // mul v14.8H, v16.8H, v1.H[0] // ...........................................*........... + // mls v3.8H, v31.8H, v7.H[0] // ....................................*.................. + // sqrdmulh v15.8H, v11.8H, v0.H[5] // ..................................*.................... + // add v6.8H, v5.8H, v25.8H // ......................................*................ + // mls v21.8H, v22.8H, v7.H[0] // ..............................................*........ + // sqrdmulh v4.8H, v16.8H, v1.H[1] // ............................................*.......... + // sqrdmulh v30.8H, v6.8H, v0.H[7] // .................................................*..... + // add v11.8H, v27.8H, v3.8H // ..........................................*............ + // add v23.8H, v13.8H, v8.8H // ........................*.............................. + // sub v16.8H, v27.8H, v3.8H // .........................................*............. + // sqrdmulh v26.8H, v11.8H, v1.H[3] // .............................................*......... + // mul v20.8H, v11.8H, v1.H[2] // ...............................................*....... + // mls v12.8H, v15.8H, v7.H[0] // .......................................*............... + // sub v28.8H, v13.8H, v8.8H // .......................*............................... + // mls v14.8H, v4.8H, v7.H[0] // .....................................................*. 
+ // mul v29.8H, v6.8H, v0.H[6] // ................................................*...... + // add v18.8H, v23.8H, v21.8H // ....................................................*.. + // mls v20.8H, v26.8H, v7.H[0] // ......................................................* + // sub v6.8H, v28.8H, v12.8H // ...................................................*... + // add v28.8H, v28.8H, v12.8H // ..................................................*.... sub count, count, #1 layer123_start: - mls v4.8H, v20.8H, v7.H[0] // .......................................*............................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... - ldr x7, [x0, #336] // ....................e............................................................................... - mls v30.8H, v26.8H, v7.H[0] // ..................................*................................................................. - ldr x10, [x0, #448] // ............................*....................................................................... - mul v22.8H, v14.8H, v0.H[0] // ..........................................*......................................................... - ldr x12, [x0, #408] // .........................e.......................................................................... - ins v11.d[1], x11 // .......*............................................................................................ - ldr x25, [x0, #136] // .........*.......................................................................................... - ldr x9, [x0, #400] // ........................e........................................................................... - ins v18.d[0], x15 // ..*................................................................................................. 
- ldr x22, [x0, #344] // .....................e.............................................................................. - ldr x15, [x0, #192] // ............*....................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v5.d[0], x10 // ..............................*..................................................................... - mls v22.8H, v12.8H, v7.H[0] // ............................................*....................................................... - ldr x23, [x0, #144] // ........e........................................................................................... - // gap // .................................................................................................... - ins v18.d[1], x13 // ...*................................................................................................ - // gap // .................................................................................................... - ldr x13, [x0, #24] // .e.................................................................................................. - // gap // .................................................................................................... - ldr x11, [x0, #88] // .....e.............................................................................................. - ins v29.d[1], x25 // ...........*........................................................................................ - // gap // .................................................................................................... - // gap // .................................................................................................... 
- add v3.8H, v18.8H, v30.8H // ....................................*............................................................... - // gap // .................................................................................................... - ldr x19, [x0, #80] // ....e............................................................................................... - // gap // .................................................................................................... - sub v31.8H, v29.8H, v22.8H // .............................................*...................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v21.8H, v11.8H, v4.8H // .........................................*.......................................................... - ins v14.d[0], x9 // ..........................e......................................................................... - add v9.8H, v29.8H, v22.8H // ..............................................*..................................................... - // gap // .................................................................................................... - ldr x17, [x0, #456] // .............................*...................................................................... - mul v27.8H, v31.8H, v0.H[4] // ..............................................................*..................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... 
- mul v23.8H, v9.8H, v0.H[2] // ....................................................*............................................... - // gap // .................................................................................................... - sqrdmulh v10.8H, v9.8H, v0.H[3] // .....................................................*.............................................. - // gap // .................................................................................................... - sqrdmulh v20.8H, v31.8H, v0.H[5] // ...............................................................*.................................... - ins v5.d[1], x17 // ...............................*.................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v22.8H, v11.8H, v4.8H // ........................................*........................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v9.8H, v5.8H, v0.H[1] // ................................................*................................................... - // gap // .................................................................................................... - mul v11.8H, v5.8H, v0.H[0] // ...............................................*.................................................... 
- mls v23.8H, v10.8H, v7.H[0] // ......................................................*............................................. - // gap // .................................................................................................... - ldr x25, [x0, #272] // ................e................................................................................... - ins v5.d[0], x7 // ......................e............................................................................. - ins v2.d[0], x15 // ..............*..................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v11.8H, v9.8H, v7.H[0] // .................................................*.................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - ins v2.d[1], x20 // ...............*.................................................................................... - ins v29.d[0], x25 // ..................e................................................................................. 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sub v15.8H, v18.8H, v30.8H // ...................................*................................................................ - ldr x20, [x0, #216] // .............e...................................................................................... - // gap // .................................................................................................... - add v9.8H, v2.8H, v11.8H // ...................................................*................................................ - ldr x15, [x0, #16] // e................................................................................................... - mls v27.8H, v20.8H, v7.H[0] // ................................................................*................................... - sub v16.8H, v2.8H, v11.8H // ..................................................*................................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v2.8H, v9.8H, v0.H[3] // ..........................................................*......................................... - mul v4.8H, v9.8H, v0.H[2] // .........................................................*.......................................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - mul v11.8H, v16.8H, v0.H[4] // ...................................................................*................................ - sqrdmulh v9.8H, v16.8H, v0.H[5] // ....................................................................*............................... - sub v24.8H, v3.8H, v23.8H // .......................................................*............................................ - sub v28.8H, v15.8H, v27.8H // .................................................................*.................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v30.8H, v15.8H, v27.8H // ..................................................................*................................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v4.8H, v2.8H, v7.H[0] // ...........................................................*........................................ - // gap // .................................................................................................... - ins v5.d[1], x22 // .......................e............................................................................ - // gap // .................................................................................................... - mls v11.8H, v9.8H, v7.H[0] // .....................................................................*.............................. 
- add v2.8H, v3.8H, v23.8H // ........................................................*........................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v15.8H, v21.8H, v4.8H // .............................................................*...................................... - sub v10.8H, v21.8H, v4.8H // ............................................................*....................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - add v9.8H, v22.8H, v11.8H // .......................................................................*............................ - sub v25.8H, v22.8H, v11.8H // ......................................................................*............................. - // gap // .................................................................................................... - // gap // .................................................................................................... - sqrdmulh v22.8H, v15.8H, v0.H[7] // .........................................................................*.......................... - mul v18.8H, v15.8H, v0.H[6] // ........................................................................*........................... 
- sqrdmulh v11.8H, v10.8H, v1.H[1] // ..............................................................................*..................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v21.8H, v10.8H, v1.H[0] // .............................................................................*...................... - sqrdmulh v27.8H, v9.8H, v1.H[3] // ...................................................................................*................ - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v31.8H, v9.8H, v1.H[2] // ..................................................................................*................. - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v18.8H, v22.8H, v7.H[0] // ..........................................................................*......................... - sqrdmulh v9.8H, v25.8H, v1.H[5] // ........................................................................................*........... - // gap // .................................................................................................... - ins v14.d[1], x12 // ...........................e........................................................................ - // gap // .................................................................................................... - mls v21.8H, v11.8H, v7.H[0] // ...............................................................................*.................... 
- // gap // .................................................................................................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v31.8H, v27.8H, v7.H[0] // ....................................................................................*............... - add v3.8H, v2.8H, v18.8H // ............................................................................*....................... - // gap // .................................................................................................... - // gap // .................................................................................................... - mul v22.8H, v25.8H, v1.H[4] // .......................................................................................*............ - ldr x17, [x0, #280] // .................e.................................................................................. - // gap // .................................................................................................... - sub v2.8H, v2.8H, v18.8H // ...........................................................................*........................ - add v13.8H, v24.8H, v21.8H // .................................................................................*.................. - ins v11.d[0], x19 // ......e............................................................................................. - // gap // .................................................................................................... - str q3, [x0], #(16) // ............................................................................................*....... - sub v20.8H, v30.8H, v31.8H // .....................................................................................*.............. 
- str q2, [x0, #48] // .............................................................................................*...... - // gap // .................................................................................................... - // gap // .................................................................................................... - mls v22.8H, v9.8H, v7.H[0] // .........................................................................................*.......... - sub v2.8H, v24.8H, v21.8H // ................................................................................*................... - // gap // .................................................................................................... - str q13, [x0, #112] // ..............................................................................................*..... - ins v29.d[1], x17 // ...................e................................................................................ - str q20, [x0, #304] // .................................................................................................*.. - // gap // .................................................................................................... - add v16.8H, v30.8H, v31.8H // ......................................................................................*............. - // gap // .................................................................................................... - // gap // .................................................................................................... - add v9.8H, v28.8H, v22.8H // ...........................................................................................*........ - str q2, [x0, #176] // ...............................................................................................*.... - sub v27.8H, v28.8H, v22.8H // ..........................................................................................*......... 
- // gap // .................................................................................................... - str q16, [x0, #240] // ................................................................................................*... - mul v4.8H, v5.8H, v0.H[0] // .....................................e.............................................................. - sqrdmulh v20.8H, v5.8H, v0.H[1] // ......................................e............................................................. - str q27, [x0, #432] // ...................................................................................................* - mul v30.8H, v29.8H, v0.H[0] // ................................e................................................................... - // gap // .................................................................................................... - sqrdmulh v26.8H, v29.8H, v0.H[1] // .................................e.................................................................. - sqrdmulh v12.8H, v14.8H, v0.H[1] // ...........................................e........................................................ - str q9, [x0, #368] // ..................................................................................................*. - // gap // .................................................................................................... - ins v29.d[0], x23 // ..........e......................................................................................... - - // original source code - // ldr x10, [x0, #0] // ...........................................e.......................................................|...........................................e...................................................... 
- // ldr x11, [x0, #(0+8)] // ...............e...................................................................................|...............e.................................................................................. - // ins v8.d[0], x10 // ........*..........................................................................................|........*......................................................................................... - // ins v8.d[1], x11 // ..............*....................................................................................|..............*................................................................................... - // ldr x10, [x0, #(1*(512/8))] // ...................e...............................................................................|...................e.............................................................................. - // ldr x11, [x0, #((1*(512/8))+8)] // ................e..................................................................................|................e................................................................................. - // ins v9.d[0], x10 // .............................................................................e.....................|.............................................................................e.................... - // ins v9.d[1], x11 // .....*.............................................................................................|.....*............................................................................................ - // ldr x10, [x0, #(2*(512/8))] // .............e.....................................................................................|.............e.................................................................................... 
- // ldr x11, [x0, #((2*(512/8))+8)] // ......*............................................................................................|......*........................................................................................... - // ins v10.d[0], x10 // ..................................................................................................e|.................................................................................................. - // ins v10.d[1], x11 // .................*.................................................................................|.................*................................................................................ - // ldr x10, [x0, #(3*(512/8))] // ..........*........................................................................................|..........*....................................................................................... - // ldr x11, [x0, #((3*(512/8))+8)] // .........................................e.........................................................|.........................................e........................................................ - // ins v11.d[0], x10 // ....................................*..............................................................|....................................*............................................................. - // ins v11.d[1], x11 // ......................................*............................................................|......................................*........................................................... - // ldr x10, [x0, #(4*(512/8))] // ..................................e................................................................|..................................e............................................................... 
- // ldr x11, [x0, #((4*(512/8))+8)] // ..........................................................................e........................|..........................................................................e....................... - // ins v12.d[0], x10 // .......................................e...........................................................|.......................................e.......................................................... - // ins v12.d[1], x11 // ....................................................................................e..............|....................................................................................e............. - // ldr x10, [x0, #(5*(512/8))] // e..................................................................................................|e................................................................................................. - // ldr x11, [x0, #((5*(512/8))+8)] // .........e.........................................................................................|.........e........................................................................................ - // ins v13.d[0], x10 // ...................................e...............................................................|...................................e.............................................................. - // ins v13.d[1], x11 // ......................................................e............................................|......................................................e........................................... - // ldr x10, [x0, #(6*(512/8))] // .......e...........................................................................................|.......e.......................................................................................... 
- // ldr x11, [x0, #((6*(512/8))+8)] // ....e..............................................................................................|....e............................................................................................. - // ins v14.d[0], x10 // ......................e............................................................................|......................e........................................................................... - // ins v14.d[1], x11 // .....................................................................e.............................|.....................................................................e............................ - // ldr x10, [x0, #(7*(512/8))] // ..*................................................................................................|..*............................................................................................... - // ldr x11, [x0, #((7*(512/8))+8)] // ........................*..........................................................................|........................*......................................................................... - // ins v15.d[0], x10 // ...........*.......................................................................................|...........*...................................................................................... - // ins v15.d[1], x11 // .............................*.....................................................................|.............................*.................................................................... - // mul v24.8h, v12.8h, v0.h[0] // ..............................................................................................e....|..............................................................................................e... 
- // sqrdmulh v12.8h, v12.8h, v0.h[1] // ...............................................................................................e...|...............................................................................................e.. - // mls v24.8h, v12.8h, v7.h[0] // .*.................................................................................................|.*................................................................................................ - // sub v12.8h, v8.8h, v24.8h // ........................................*..........................................................|........................................*......................................................... - // add v8.8h, v8.8h, v24.8h // ..................*................................................................................|..................*............................................................................... - // mul v24.8h, v13.8h, v0.h[0] // ...........................................................................................e.......|...........................................................................................e...... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ............................................................................................e......|............................................................................................e..... - // mls v24.8h, v13.8h, v7.h[0] // ...................................................................................................*.................................................................................................. - // sub v13.8h, v9.8h, v24.8h // ..............................*....................................................................|..............................*................................................................... 
- // add v9.8h, v9.8h, v24.8h // .....................*.............................................................................|.....................*............................................................................ - // mul v24.8h, v14.8h, v0.h[0] // ...*...............................................................................................|...*.............................................................................................. - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ................................................................................................e..|................................................................................................e. - // mls v24.8h, v14.8h, v7.h[0] // ............*......................................................................................|............*..................................................................................... - // sub v14.8h, v10.8h, v24.8h // ....................*..............................................................................|....................*............................................................................. - // add v10.8h, v10.8h, v24.8h // .......................*...........................................................................|.......................*.......................................................................... - // mul v24.8h, v15.8h, v0.h[0] // ................................*..................................................................|................................*................................................................. - // sqrdmulh v15.8h, v15.8h, v0.h[1] // ...............................*...................................................................|...............................*.................................................................. 
- // mls v24.8h, v15.8h, v7.h[0] // .....................................*.............................................................|.....................................*............................................................ - // sub v15.8h, v11.8h, v24.8h // .............................................*.....................................................|.............................................*.................................................... - // add v11.8h, v11.8h, v24.8h // ..........................................*........................................................|..........................................*....................................................... - // mul v24.8h, v10.8h, v0.h[2] // ..........................*........................................................................|..........................*....................................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ...........................*.......................................................................|...........................*...................................................................... - // mls v24.8h, v10.8h, v7.h[0] // .................................*.................................................................|.................................*................................................................ - // sub v10.8h, v8.8h, v24.8h // ..................................................*................................................|..................................................*............................................... - // add v8.8h, v8.8h, v24.8h // ........................................................*..........................................|........................................................*......................................... 
- // mul v24.8h, v11.8h, v0.h[2] // ...............................................*...................................................|...............................................*.................................................. - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ..............................................*....................................................|..............................................*................................................... - // mls v24.8h, v11.8h, v7.h[0] // .....................................................*.............................................|.....................................................*............................................ - // sub v11.8h, v9.8h, v24.8h // ..........................................................*........................................|..........................................................*....................................... - // add v9.8h, v9.8h, v24.8h // .........................................................*.........................................|.........................................................*........................................ - // mul v24.8h, v14.8h, v0.h[4] // .........................*.........................................................................|.........................*........................................................................ - // sqrdmulh v14.8h, v14.8h, v0.h[5] // ............................*......................................................................|............................*..................................................................... - // mls v24.8h, v14.8h, v7.h[0] // ............................................*......................................................|............................................*..................................................... 
- // sub v14.8h, v12.8h, v24.8h // ...................................................*...............................................|...................................................*.............................................. - // add v12.8h, v12.8h, v24.8h // ....................................................*..............................................|....................................................*............................................. - // mul v24.8h, v15.8h, v0.h[4] // ................................................*..................................................|................................................*................................................. - // sqrdmulh v15.8h, v15.8h, v0.h[5] // .................................................*.................................................|.................................................*................................................ - // mls v24.8h, v15.8h, v7.h[0] // .......................................................*...........................................|.......................................................*.......................................... - // sub v15.8h, v13.8h, v24.8h // ............................................................*......................................|............................................................*..................................... - // add v13.8h, v13.8h, v24.8h // ...........................................................*.......................................|...........................................................*...................................... - // mul v24.8h, v9.8h, v0.h[6] // ..............................................................*....................................|..............................................................*................................... 
- // sqrdmulh v9.8h, v9.8h, v0.h[7] // .............................................................*.....................................|.............................................................*.................................... - // mls v24.8h, v9.8h, v7.h[0] // ...................................................................*...............................|...................................................................*.............................. - // sub v9.8h, v8.8h, v24.8h // ...........................................................................*.......................|...........................................................................*...................... - // add v8.8h, v8.8h, v24.8h // ........................................................................*..........................|........................................................................*......................... - // mul v24.8h, v11.8h, v1.h[0] // ................................................................*..................................|................................................................*................................. - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...............................................................*...................................|...............................................................*.................................. - // mls v24.8h, v11.8h, v7.h[0] // ......................................................................*............................|......................................................................*........................... - // sub v11.8h, v10.8h, v24.8h // ..................................................................................*................|..................................................................................*............... 
- // add v10.8h, v10.8h, v24.8h // ............................................................................*......................|............................................................................*..................... - // mul v24.8h, v13.8h, v1.h[2] // ..................................................................*................................|..................................................................*............................... - // sqrdmulh v13.8h, v13.8h, v1.h[3] // .................................................................*.................................|.................................................................*................................ - // mls v24.8h, v13.8h, v7.h[0] // .......................................................................*...........................|.......................................................................*.......................... - // sub v13.8h, v12.8h, v24.8h // ...............................................................................*...................|...............................................................................*.................. - // add v12.8h, v12.8h, v24.8h // ......................................................................................*............|......................................................................................*........... - // mul v24.8h, v15.8h, v1.h[4] // .........................................................................*.........................|.........................................................................*........................ - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ....................................................................*..............................|....................................................................*............................. 
- // mls v24.8h, v15.8h, v7.h[0] // .................................................................................*.................|.................................................................................*................ - // sub v15.8h, v14.8h, v24.8h // .........................................................................................*.........|.........................................................................................*........ - // add v14.8h, v14.8h, v24.8h // .......................................................................................*...........|.......................................................................................*.......... - // str q8, [x0], #(16) // ..............................................................................*....................|..............................................................................*................... - // str q9, [x0, #(-16 + 1*(512/8))] // ................................................................................*..................|................................................................................*................. - // str q10, [x0, #(-16 + 2*(512/8))] // ...................................................................................*...............|...................................................................................*.............. - // str q11, [x0, #(-16 + 3*(512/8))] // ........................................................................................*..........|........................................................................................*......... - // str q12, [x0, #(-16 + 4*(512/8))] // ..........................................................................................*........|..........................................................................................*....... 
- // str q13, [x0, #(-16 + 5*(512/8))] // .....................................................................................*.............|.....................................................................................*............ - // str q14, [x0, #(-16 + 6*(512/8))] // .................................................................................................*.|.................................................................................................* - // str q15, [x0, #(-16 + 7*(512/8))] // .............................................................................................*.....|.............................................................................................*.... + // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Cycle bound: 30.0 + // IPC bound: 2.53 + // + // Wall time: 42.11s + // User time: 42.11s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q24, [x0, #464] // .......e.................................................................... + ldr q13, [x0, #16] // e........................................................................... + sub v11.8H, v23.8H, v21.8H // ...............................*............................................ + mls v29.8H, v30.8H, v7.H[0] // ..................................................*......................... + sqrdmulh v3.8H, v16.8H, v1.H[5] // ...............................................................*............ + ldr q26, [x0, #336] // .....e...................................................................... + ldr q2, [x0, #400] // ......e..................................................................... + sub v15.8H, v28.8H, v20.8H // .............................................................*.............. 
+ sub v9.8H, v11.8H, v14.8H // ........................................................*................... + ldr q30, [x0, #272] // ....e....................................................................... + ldr q17, [x0, #208] // ...e........................................................................ + mul v19.8H, v16.8H, v1.H[4] // ................................................................*........... + sub v16.8H, v18.8H, v29.8H // ...................................................*........................ + add v5.8H, v18.8H, v29.8H // ....................................................*....................... + str q15, [x0, #320] // .........................................................................*.. + ldr q22, [x0, #144] // ..e......................................................................... + mul v31.8H, v24.8H, v0.H[0] // ........................e................................................... + sqrdmulh v15.8H, v24.8H, v0.H[1] // .......................e.................................................... + ldr q8, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + mul v23.8H, v2.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v4.8H, v2.8H, v0.H[1] // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.8H, v3.8H, v7.H[0] // .................................................................*.......... + sqrdmulh v25.8H, v26.8H, v0.H[1] // .............e.............................................................. + // gap // ............................................................................ 
+ // gap // ............................................................................ + mul v29.8H, v26.8H, v0.H[0] // ..............e............................................................. + mls v31.8H, v15.8H, v7.H[0] // .........................e.................................................. + // gap // ............................................................................ + // gap // ............................................................................ + str q16, [x0, #64] // .....................................................................*...... + mls v23.8H, v4.8H, v7.H[0] // ....................e....................................................... + add v27.8H, v11.8H, v14.8H // .........................................................*.................. + // gap // ............................................................................ + sqrdmulh v21.8H, v30.8H, v0.H[1] // ........e................................................................... + str q9, [x0, #192] // .......................................................................*.... + // gap // ............................................................................ + sub v15.8H, v6.8H, v19.8H // ..................................................................*......... + str q27, [x0, #128] // ......................................................................*..... + add v26.8H, v17.8H, v31.8H // ...........................e................................................ + // gap // ............................................................................ + mls v29.8H, v25.8H, v7.H[0] // ...............e............................................................ + str q15, [x0, #448] // ...........................................................................* + sub v18.8H, v17.8H, v31.8H // ..........................e................................................. 
+ sub v11.8H, v22.8H, v23.8H // .....................e...................................................... + // gap // ............................................................................ + str q5, [x0], #(16) // ....................................................................*....... + mul v25.8H, v26.8H, v0.H[2] // ..................................e......................................... + sqrdmulh v16.8H, v26.8H, v0.H[3] // .................................e.......................................... + // gap // ............................................................................ + mul v12.8H, v11.8H, v0.H[4] // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v5.8H, v8.8H, v29.8H // .................e.......................................................... + sub v27.8H, v8.8H, v29.8H // ................e........................................................... + mul v8.8H, v30.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v25.8H, v16.8H, v7.H[0] // ...................................e........................................ + mul v3.8H, v18.8H, v0.H[4] // ............................................e............................... + // gap // ............................................................................ + // gap // ............................................................................ + add v14.8H, v22.8H, v23.8H // ......................e..................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + add v26.8H, v28.8H, v20.8H // ..............................................................*............. + sqrdmulh v31.8H, v18.8H, v0.H[5] // ...........................................e................................ + mls v8.8H, v21.8H, v7.H[0] // ..........e................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v14.8H, v0.H[3] // ............................e............................................... + str q26, [x0, #240] // ........................................................................*... + sub v16.8H, v5.8H, v25.8H // ....................................e....................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v21.8H, v14.8H, v0.H[2] // .............................e.............................................. + add v15.8H, v6.8H, v19.8H // ...................................................................*........ + mul v14.8H, v16.8H, v1.H[0] // ......................................................e..................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v3.8H, v31.8H, v7.H[0] // .............................................e.............................. + str q15, [x0, #368] // ..........................................................................*. 
+ sqrdmulh v15.8H, v11.8H, v0.H[5] // ......................................e..................................... + add v6.8H, v5.8H, v25.8H // .....................................e...................................... + // gap // ............................................................................ + mls v21.8H, v22.8H, v7.H[0] // ..............................e............................................. + // gap // ............................................................................ + sqrdmulh v4.8H, v16.8H, v1.H[1] // .....................................................e...................... + // gap // ............................................................................ + sqrdmulh v30.8H, v6.8H, v0.H[7] // ................................................e........................... + // gap // ............................................................................ + add v11.8H, v27.8H, v3.8H // ...............................................e............................ + // gap // ............................................................................ + add v23.8H, v13.8H, v8.8H // ............e............................................................... + // gap // ............................................................................ + sub v16.8H, v27.8H, v3.8H // ..............................................e............................. + // gap // ............................................................................ + sqrdmulh v26.8H, v11.8H, v1.H[3] // ..........................................................e................. + // gap // ............................................................................ + // gap // ............................................................................ + mul v20.8H, v11.8H, v1.H[2] // ...........................................................e................ 
+ mls v12.8H, v15.8H, v7.H[0] // ........................................e................................... + sub v28.8H, v13.8H, v8.8H // ...........e................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v14.8H, v4.8H, v7.H[0] // .......................................................e.................... + mul v29.8H, v6.8H, v0.H[6] // .................................................e.......................... + // gap // ............................................................................ + // gap // ............................................................................ + add v18.8H, v23.8H, v21.8H // ................................e........................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v20.8H, v26.8H, v7.H[0] // ............................................................e............... + sub v6.8H, v28.8H, v12.8H // .........................................e.................................. + add v28.8H, v28.8H, v12.8H // ..........................................e................................. + // gap // ............................................................................ + // gap // ............................................................................ 
+ + // ----------------------------------------------------------- new position ------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|-------- + // ldr q8, [x0, #0] // .e..........................................................................'~........................................................ + // ldr q9, [x0, #(1*(512/8))] // ..................e.........................................................'.................~....................................... + // ldr q10, [x0, #(2*(512/8))] // ...............e............................................................'..............~.......................................... + // ldr q11, [x0, #(3*(512/8))] // ..........e.................................................................'.........~............................................... + // ldr q12, [x0, #(4*(512/8))] // .........e..................................................................'........~................................................ + // ldr q13, [x0, #(5*(512/8))] // .....e......................................................................'....~.................................................... + // ldr q14, [x0, #(6*(512/8))] // ......e.....................................................................'.....~................................................... + // ldr q15, [x0, #(7*(512/8))] // e...........................................................................~......................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ............................e...............................................'...........................~............................. 
+ // mul v24.8h, v12.8h, v0.h[0] // ...........................................e................................'..........................................~.............. + // mls v24.8h, v27.8h, v7.h[0] // .................................................e..........................'................................................~........ + // sub v12.8h, v8.8h, v24.8h // .....................................................................e......'......................................................... + // add v8.8h, v8.8h, v24.8h // ................................................................e...........'......................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ......................e.....................................................'.....................~................................... + // mul v24.8h, v13.8h, v0.h[0] // .......................e....................................................'......................~.................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................e..........................................'................................~........................ + // sub v13.8h, v9.8h, v24.8h // ..........................................e.................................'.........................................~............... + // add v9.8h, v9.8h, v24.8h // .........................................e..................................'........................................~................ + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ....................e.......................................................'...................~..................................... + // mul v24.8h, v14.8h, v0.h[0] // ...................e........................................................'..................~...................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..........................e.................................................'.........................~............................... + // sub v14.8h, v10.8h, v24.8h // ....................................e.......................................'...................................~..................... + // add v10.8h, v10.8h, v24.8h // ..............................................e.............................'.............................................~........... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .................e..........................................................'................~........................................ + // mul v24.8h, v15.8h, v0.h[0] // ................e...........................................................'...............~......................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................e...................................................'.......................~................................. + // sub v15.8h, v11.8h, v24.8h // ...................................e........................................'..................................~...................... + // add v11.8h, v11.8h, v24.8h // ................................e...........................................'...............................~......................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ..................................................e.........................'.................................................~....... + // mul v24.8h, v10.8h, v0.h[2] // .....................................................e......................'....................................................~.... + // mls v24.8h, v27.8h, v7.h[0] // ............................................................e...............'......................................................... 
+ // sub v10.8h, v8.8h, v24.8h // ..~.........................................................................'.*....................................................... + // add v8.8h, v8.8h, v24.8h // ........................................................................e...'......................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // .......................................e....................................'......................................~.................. + // mul v24.8h, v11.8h, v0.h[2] // ......................................e.....................................'.....................................~................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................e...............................'...........................................~............. + // sub v11.8h, v9.8h, v24.8h // ....................................................e.......................'...................................................~..... + // add v9.8h, v9.8h, v24.8h // ...........................................................e................'......................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ..........................................................e.................'......................................................... + // mul v24.8h, v14.8h, v0.h[4] // ........................................e...................................'.......................................~................. + // mls v24.8h, v27.8h, v7.h[0] // ....................................................................e.......'......................................................... + // sub v14.8h, v12.8h, v24.8h // ..........................................................................e.'......................................................... 
+ // add v12.8h, v12.8h, v24.8h // ...........................................................................e'......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ................................................e...........................'...............................................~......... + // mul v24.8h, v15.8h, v0.h[4] // .............................................e..............................'............................................~............ + // mls v24.8h, v27.8h, v7.h[0] // ........................................................e...................'.......................................................~. + // sub v15.8h, v13.8h, v24.8h // .................................................................e..........'......................................................... + // add v13.8h, v13.8h, v24.8h // ...............................................................e............'......................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..............................................................e.............'......................................................... + // mul v24.8h, v9.8h, v0.h[6] // .......................................................................e....'......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...~........................................................................'..*...................................................... + // sub v9.8h, v8.8h, v24.8h // ............~...............................................................'...........*............................................. + // add v8.8h, v8.8h, v24.8h // .............~..............................................................'............*............................................ 
+ // sqrdmulh v27.8h, v11.8h, v1.h[1] // .............................................................e..............'......................................................... + // mul v24.8h, v11.8h, v1.h[0] // .......................................................e....................'......................................................~.. + // mls v24.8h, v27.8h, v7.h[0] // ......................................................................e.....'......................................................... + // sub v11.8h, v10.8h, v24.8h // ........~...................................................................'.......*................................................. + // add v10.8h, v10.8h, v24.8h // ...........................~................................................'..........................*.............................. + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ..................................................................e.........'......................................................... + // mul v24.8h, v13.8h, v1.h[2] // ...................................................................e........'......................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................................................e..'......................................................... + // sub v13.8h, v12.8h, v24.8h // .......~....................................................................'......*.................................................. + // add v12.8h, v12.8h, v24.8h // ...............................................~............................'..............................................*.......... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ....~.......................................................................'...*..................................................... 
+ // mul v24.8h, v15.8h, v1.h[4] // ...........~................................................................'..........*.............................................. + // mls v24.8h, v27.8h, v7.h[0] // .....................~......................................................'....................*.................................... + // sub v15.8h, v14.8h, v24.8h // ..............................~.............................................'.............................*........................... + // add v14.8h, v14.8h, v24.8h // ......................................................~.....................'.....................................................*... + // str q8, [x0], #(16) // .....................................~......................................'....................................*.................... + // str q9, [x0, #(-16 + 1*(512/8))] // .........................~..................................................'........................*................................ + // str q10, [x0, #(-16 + 2*(512/8))] // ...............................~............................................'..............................*.......................... + // str q11, [x0, #(-16 + 3*(512/8))] // .............................~..............................................'............................*............................ + // str q12, [x0, #(-16 + 4*(512/8))] // ...................................................~........................'..................................................*...... + // str q13, [x0, #(-16 + 5*(512/8))] // ..............~.............................................................'.............*........................................... 
+ // str q14, [x0, #(-16 + 6*(512/8))] // .........................................................~..................'........................................................* + // str q15, [x0, #(-16 + 7*(512/8))] // ..................................~.........................................'.................................*....................... sub count, count, #1 cbnz count, layer123_start - ldr x12, [x0, #136] // .....*..................................................................... - mls v4.8H, v20.8H, v7.H[0] // *.......................................................................... - mul v14.8H, v14.8H, v0.H[0] // ...*....................................................................... - ldr x21, [x0, #448] // ..*........................................................................ - ins v11.d[1], x11 // ....*...................................................................... - ins v28.d[0], x15 // ......*.................................................................... - // gap // ........................................................................... - ldr x10, [x0, #192] // .......*................................................................... - ldr x23, [x0, #456] // ................*.......................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - ins v29.d[1], x12 // ...........*............................................................... - ins v9.d[0], x21 // ........*.................................................................. - // gap // ........................................................................... - // gap // ........................................................................... 
- // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - mls v30.8H, v26.8H, v7.H[0] // .*......................................................................... - ins v9.d[1], x23 // .....................*..................................................... - // gap // ........................................................................... - // gap // ........................................................................... - mls v14.8H, v12.8H, v7.H[0] // .........*................................................................. - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v20.8H, v9.8H, v0.H[1] // .......................*................................................... - mul v3.8H, v9.8H, v0.H[0] // ........................*.................................................. - // gap // ........................................................................... - // gap // ........................................................................... - sub v22.8H, v11.8H, v4.8H // ......................*.................................................... - ins v12.d[0], x10 // ..........................*................................................ - // gap // ........................................................................... - // gap // ........................................................................... - ins v28.d[1], x13 // ..........*................................................................ 
- // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - mls v3.8H, v20.8H, v7.H[0] // ...........................*............................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - ins v12.d[1], x20 // ............................*.............................................. - sub v27.8H, v29.8H, v14.8H // .............*............................................................. - // gap // ........................................................................... - // gap // ........................................................................... - add v2.8H, v29.8H, v14.8H // ...............*........................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sub v9.8H, v12.8H, v3.8H // ................................*.......................................... - mul v21.8H, v27.8H, v0.H[4] // .................*......................................................... - // gap // ........................................................................... - // gap // ........................................................................... - add v20.8H, v12.8H, v3.8H // ..............................*............................................ - mul v12.8H, v2.8H, v0.H[2] // ..................*........................................................ 
- // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v13.8H, v9.8H, v0.H[5] // ....................................*...................................... - mul v9.8H, v9.8H, v0.H[4] // ...................................*....................................... - // gap // ........................................................................... - // gap // ........................................................................... - mul v24.8H, v20.8H, v0.H[2] // ..................................*........................................ - sqrdmulh v20.8H, v20.8H, v0.H[3] // .................................*......................................... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v26.8H, v2.8H, v0.H[3] // ...................*....................................................... - sqrdmulh v27.8H, v27.8H, v0.H[5] // ....................*...................................................... - // gap // ........................................................................... - // gap // ........................................................................... - add v3.8H, v28.8H, v30.8H // ............*.............................................................. - mls v9.8H, v13.8H, v7.H[0] // .........................................*................................. - // gap // ........................................................................... - // gap // ........................................................................... - mls v24.8H, v20.8H, v7.H[0] // ........................................*.................................. - add v5.8H, v11.8H, v4.8H // ..............*............................................................ 
- // gap // ........................................................................... - // gap // ........................................................................... - mls v21.8H, v27.8H, v7.H[0] // ...............................*........................................... - sub v23.8H, v28.8H, v30.8H // .............................*............................................. - // gap // ........................................................................... - // gap // ........................................................................... - add v2.8H, v22.8H, v9.8H // .............................................*............................. - sub v9.8H, v22.8H, v9.8H // ..............................................*............................ - // gap // ........................................................................... - // gap // ........................................................................... - add v18.8H, v5.8H, v24.8H // ...........................................*............................... - sub v8.8H, v5.8H, v24.8H // ............................................*.............................. - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v11.8H, v9.8H, v1.H[5] // ......................................................*.................... - mul v9.8H, v9.8H, v1.H[4] // ..........................................................*................ - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v20.8H, v2.8H, v1.H[3] // ...................................................*....................... - sub v5.8H, v23.8H, v21.8H // ......................................*.................................... 
- // gap // ........................................................................... - // gap // ........................................................................... - mul v2.8H, v2.8H, v1.H[2] // ....................................................*...................... - mls v12.8H, v26.8H, v7.H[0] // .........................*................................................. - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v27.8H, v8.8H, v1.H[1] // .................................................*......................... - mls v9.8H, v11.8H, v7.H[0] // ................................................................*.......... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v30.8H, v18.8H, v0.H[7] // ...............................................*........................... - mul v18.8H, v18.8H, v0.H[6] // ................................................*.......................... - // gap // ........................................................................... - // gap // ........................................................................... - add v11.8H, v23.8H, v21.8H // .......................................*................................... - mls v2.8H, v20.8H, v7.H[0] // ........................................................*.................. - // gap // ........................................................................... - // gap // ........................................................................... - add v10.8H, v5.8H, v9.8H // .....................................................................*..... - mul v22.8H, v8.8H, v1.H[0] // ..................................................*........................ 
- // gap // ........................................................................... - // gap // ........................................................................... - sub v9.8H, v5.8H, v9.8H // .......................................................................*... - mls v18.8H, v30.8H, v7.H[0] // .....................................................*..................... - // gap // ........................................................................... - // gap // ........................................................................... - sub v20.8H, v11.8H, v2.8H // ..............................................................*............ - str q10, [x0, #384] // ..........................................................................* - // gap // ........................................................................... - add v29.8H, v3.8H, v12.8H // ..........................................*................................ - add v19.8H, v11.8H, v2.8H // ....................................................................*...... - str q9, [x0, #448] // .........................................................................*. - mls v22.8H, v27.8H, v7.H[0] // .......................................................*................... - // gap // ........................................................................... - sub v11.8H, v29.8H, v18.8H // ...........................................................*............... - str q20, [x0, #320] // ...................................................................*....... - sub v16.8H, v3.8H, v12.8H // .....................................*..................................... - // gap // ........................................................................... - add v9.8H, v29.8H, v18.8H // .........................................................*................. - str q19, [x0, #256] // ........................................................................*.. 
- // gap // ........................................................................... - // gap // ........................................................................... - str q11, [x0, #64] // ...............................................................*........... - add v11.8H, v16.8H, v22.8H // ............................................................*.............. - // gap // ........................................................................... - // gap // ........................................................................... - str q9, [x0], #(16) // .............................................................*............. - sub v9.8H, v16.8H, v22.8H // .................................................................*......... - // gap // ........................................................................... - // gap // ........................................................................... - str q11, [x0, #112] // ..................................................................*........ - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - str q9, [x0, #176] // ......................................................................*.... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - - // original source code - // mls v4.8H, v20.8H, v7.H[0] // .*......................................................................... - // mls v30.8H, v26.8H, v7.H[0] // ..........*................................................................ 
- // ldr x10, [x0, #448] // ...*....................................................................... - // mul v22.8H, v14.8H, v0.H[0] // ..*........................................................................ - // ins v11.d[1], x11 // ....*...................................................................... - // ldr x25, [x0, #136] // *.......................................................................... - // ins v18.d[0], x15 // .....*..................................................................... - // ldr x15, [x0, #192] // ......*.................................................................... - // ins v5.d[0], x10 // .........*................................................................. - // mls v22.8H, v12.8H, v7.H[0] // ............*.............................................................. - // ins v18.d[1], x13 // .................*......................................................... - // ins v29.d[1], x25 // ........*.................................................................. - // add v3.8H, v18.8H, v30.8H // ................................*.......................................... - // sub v31.8H, v29.8H, v22.8H // ....................*...................................................... - // add v21.8H, v11.8H, v4.8H // ...................................*....................................... - // add v9.8H, v29.8H, v22.8H // .....................*..................................................... - // ldr x17, [x0, #456] // .......*................................................................... - // mul v27.8H, v31.8H, v0.H[4] // .......................*................................................... - // mul v23.8H, v9.8H, v0.H[2] // .........................*................................................. - // sqrdmulh v10.8H, v9.8H, v0.H[3] // ..............................*............................................ 
- // sqrdmulh v20.8H, v31.8H, v0.H[5] // ...............................*........................................... - // ins v5.d[1], x17 // ...........*............................................................... - // sub v22.8H, v11.8H, v4.8H // ...............*........................................................... - // sqrdmulh v9.8H, v5.8H, v0.H[1] // .............*............................................................. - // mul v11.8H, v5.8H, v0.H[0] // ..............*............................................................ - // mls v23.8H, v10.8H, v7.H[0] // ...............................................*........................... - // ins v2.d[0], x15 // ................*.......................................................... - // mls v11.8H, v9.8H, v7.H[0] // ..................*........................................................ - // ins v2.d[1], x20 // ...................*....................................................... - // sub v15.8H, v18.8H, v30.8H // .....................................*..................................... - // add v9.8H, v2.8H, v11.8H // ........................*.................................................. - // mls v27.8H, v20.8H, v7.H[0] // ....................................*...................................... - // sub v16.8H, v2.8H, v11.8H // ......................*.................................................... - // sqrdmulh v2.8H, v9.8H, v0.H[3] // .............................*............................................. - // mul v4.8H, v9.8H, v0.H[2] // ............................*.............................................. - // mul v11.8H, v16.8H, v0.H[4] // ...........................*............................................... - // sqrdmulh v9.8H, v16.8H, v0.H[5] // ..........................*................................................ - // sub v24.8H, v3.8H, v23.8H // ..................................................................*........ 
- // sub v28.8H, v15.8H, v27.8H // .............................................*............................. - // add v30.8H, v15.8H, v27.8H // ....................................................*...................... - // mls v4.8H, v2.8H, v7.H[0] // ..................................*........................................ - // mls v11.8H, v9.8H, v7.H[0] // .................................*......................................... - // add v2.8H, v3.8H, v23.8H // ............................................................*.............. - // add v15.8H, v21.8H, v4.8H // ........................................*.................................. - // sub v10.8H, v21.8H, v4.8H // .........................................*................................. - // add v9.8H, v22.8H, v11.8H // ......................................*.................................... - // sub v25.8H, v22.8H, v11.8H // .......................................*................................... - // sqrdmulh v22.8H, v15.8H, v0.H[7] // ..................................................*........................ - // mul v18.8H, v15.8H, v0.H[6] // ...................................................*....................... - // sqrdmulh v11.8H, v10.8H, v1.H[1] // ................................................*.......................... - // mul v21.8H, v10.8H, v1.H[0] // .......................................................*................... - // sqrdmulh v27.8H, v9.8H, v1.H[3] // ............................................*.............................. - // mul v31.8H, v9.8H, v1.H[2] // ..............................................*............................ - // mls v18.8H, v22.8H, v7.H[0] // .........................................................*................. - // sqrdmulh v9.8H, v25.8H, v1.H[5] // ..........................................*................................ 
- // mls v21.8H, v11.8H, v7.H[0] // ...............................................................*........... - // mls v31.8H, v27.8H, v7.H[0] // .....................................................*..................... - // add v3.8H, v2.8H, v18.8H // ...................................................................*....... - // mul v22.8H, v25.8H, v1.H[4] // ...........................................*............................... - // sub v2.8H, v2.8H, v18.8H // ................................................................*.......... - // add v13.8H, v24.8H, v21.8H // ......................................................................*.... - // str q3, [x0], #(16) // .......................................................................*... - // sub v20.8H, v30.8H, v31.8H // ..........................................................*................ - // str q2, [x0, #48] // .....................................................................*..... - // mls v22.8H, v9.8H, v7.H[0] // .................................................*......................... - // sub v2.8H, v24.8H, v21.8H // ........................................................................*.. - // str q13, [x0, #112] // .........................................................................*. - // str q20, [x0, #304] // .................................................................*......... - // add v16.8H, v30.8H, v31.8H // .............................................................*............. - // add v9.8H, v28.8H, v22.8H // ......................................................*.................... - // str q2, [x0, #176] // ..........................................................................* - // sub v27.8H, v28.8H, v22.8H // ........................................................*.................. - // str q16, [x0, #240] // ....................................................................*...... 
- // str q27, [x0, #432] // ..............................................................*............ - // str q9, [x0, #368] // ...........................................................*............... + // Instructions: 21 + // Expected cycles: 11 + // Expected IPC: 1.91 + // + // Cycle bound: 11.0 + // IPC bound: 1.91 + // + // Wall time: 0.18s + // User time: 0.18s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v11.8H, v16.8H, v1.H[4] // .....*........................ + sub v27.8H, v28.8H, v20.8H // ...*.......................... + // gap // .............................. + // gap // .............................. + sub v9.8H, v23.8H, v21.8H // *............................. + // gap // .............................. + sqrdmulh v13.8H, v16.8H, v1.H[5] // ..*........................... + // gap // .............................. + mls v29.8H, v30.8H, v7.H[0] // .*............................ + add v15.8H, v28.8H, v20.8H // .................*............ + // gap // .............................. + // gap // .............................. + str q27, [x0, #320] // ........*..................... + sub v31.8H, v9.8H, v14.8H // ....*......................... + // gap // .............................. + // gap // .............................. + mls v11.8H, v13.8H, v7.H[0] // .........*.................... + str q15, [x0, #256] // ..................*........... + add v27.8H, v9.8H, v14.8H // ...........*.................. + // gap // .............................. + sub v26.8H, v18.8H, v29.8H // ......*....................... + str q31, [x0, #192] // ............*................. + // gap // .............................. + add v15.8H, v18.8H, v29.8H // .......*...................... + str q27, [x0, #128] // ..............*............... + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q26, [x0, #64] // ..........*................... 
+ // gap // .............................. + sub v16.8H, v6.8H, v11.8H // .............*................ + // gap // .............................. + str q15, [x0], #(16) // ................*............. + add v15.8H, v6.8H, v11.8H // ...................*.......... + // gap // .............................. + // gap // .............................. + str q16, [x0, #432] // ...............*.............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q15, [x0, #368] // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sub v11.8H, v23.8H, v21.8H // ..*............................ + // mls v29.8H, v30.8H, v7.H[0] // ....*.......................... + // sqrdmulh v3.8H, v16.8H, v1.H[5] // ...*........................... + // sub v15.8H, v28.8H, v20.8H // .*............................. + // sub v9.8H, v11.8H, v14.8H // .......*....................... + // mul v19.8H, v16.8H, v1.H[4] // *.............................. + // sub v16.8H, v18.8H, v29.8H // ...........*................... + // add v5.8H, v18.8H, v29.8H // .............*................. + // str q15, [x0, #320] // ......*........................ + // mls v19.8H, v3.8H, v7.H[0] // ........*...................... + // str q16, [x0, #64] // ...............*............... + // add v27.8H, v11.8H, v14.8H // ..........*.................... + // str q9, [x0, #192] // ............*.................. + // sub v15.8H, v6.8H, v19.8H // ................*.............. + // str q27, [x0, #128] // ..............*................ + // str q15, [x0, #448] // ...................*........... + // str q5, [x0], #(16) // .................*............. + // add v26.8H, v28.8H, v20.8H // .....*......................... 
+ // str q26, [x0, #240] // .........*..................... + // add v15.8H, v6.8H, v19.8H // ..................*............ + // str q15, [x0, #368] // ....................*.......... restore inp, STACK0 mov count, #8 .p2align 2 - ldr x24, [x1, #32] // ............*..................... - ldr x26, [x1, #0] // ..*............................... - // gap // .................................. - // gap // .................................. - ldr x22, [x1, #40] // .*................................ - ldr x29, [x1, #16] // ................*................. - // gap // .................................. - // gap // .................................. - ldr x19, [x4], #(6*16) // ........*......................... - ldr x14, [x1, #48] // ..........*....................... - // gap // .................................. - // gap // .................................. - ldr x15, [x1, #8] // *................................. - ins v9.d[0], x24 // ...............*.................. - ldr x13, [x4, #-24] // .....*............................ - ins v31.d[0], x26 // ...................*.............. - ldr x12, [x4, #-40] // ...............................*.. - ldr x24, [x1, #56] // ....*............................. - // gap // .................................. - // gap // .................................. - ins v9.d[1], x22 // .................*................ - ins v4.d[0], x14 // ..................*............... - ldr x27, [x4, #-88] // ............................*..... - ldr x10, [x4, #-32] // .........*........................ - ldr x7, [x1, #24] // .......*.......................... - ldr x26, [x3], #16 // ...........*...................... - // gap // .................................. - // gap // .................................. - ins v22.d[0], x19 // .........................*........ - ins v4.d[1], x24 // ...........................*...... - ldr x20, [x3, #-8] // ......*........................... - // gap // .................................. 
- // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - ins v22.d[1], x27 // ................................*. - ins v3.d[0], x26 // .............*.................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - ins v12.d[0], x10 // ....................*............. - ins v3.d[1], x20 // ..............*................... - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - // gap // .................................. - ins v31.d[1], x15 // .....................*............ - ldr x15, [x4, #-48] // ...*.............................. - mul v28.8H, v9.8H, v3.H[0] // .......................*.......... - // gap // .................................. - sqrdmulh v2.8H, v9.8H, v3.H[1] // ........................*......... - // gap // .................................. - // gap // .................................. - // gap // .................................. - sqrdmulh v9.8H, v4.8H, v3.H[1] // ..............................*... - mul v18.8H, v4.8H, v3.H[0] // .................................* - // gap // .................................. - // gap // .................................. - ins v12.d[1], x13 // ......................*........... - ins v29.d[0], x15 // .............................*.... - ldr x15, [x4, #-64] // ..........................*....... - // gap // .................................. - - // original source code - // ldr x9, [x1, #8] // ......*........................... 
- // ldr x8, [x1, #40] // ..*............................... - // ldr x27, [x1, #0] // .*................................ - // ldr x11, [x4, #48] // ..........................*....... - // ldr x20, [x1, #56] // ...........*...................... - // ldr x25, [x4, #72] // ........*......................... - // ldr x13, [x3, #8] // ....................*............. - // ldr x7, [x1, #24] // ................*................. - // ldr x15, [x4], #(6*16) // ....*............................. - // ldr x17, [x4, #-32] // ...............*.................. - // ldr x23, [x1, #48] // .....*............................ - // ldr x28, [x3], #16 // .................*................ - // ldr x29, [x1, #32] // *................................. - // ins v3.d[0], x28 // ......................*........... - // ins v3.d[1], x13 // ........................*......... - // ins v5.d[0], x29 // .......*.......................... - // ldr x29, [x1, #16] // ...*.............................. - // ins v5.d[1], x8 // ............*..................... - // ins v21.d[0], x23 // .............*.................... - // ins v31.d[0], x27 // .........*........................ - // ins v12.d[0], x17 // .......................*.......... - // ins v31.d[1], x9 // .........................*........ - // ins v12.d[1], x25 // ...............................*.. - // mul v28.8H, v5.8H, v3.H[0] // ...........................*...... - // sqrdmulh v2.8H, v5.8H, v3.H[1] // ............................*..... - // ins v22.d[0], x15 // ..................*............... - // ldr x15, [x4, #-64] // .................................* - // ins v21.d[1], x20 // ...................*.............. - // ldr x23, [x4, #-88] // ..............*................... - // ins v29.d[0], x11 // ................................*. - // sqrdmulh v9.8H, v21.8H, v3.H[1] // .............................*.... - // ldr x12, [x4, #-40] // ..........*....................... 
- // ins v22.d[1], x23 // .....................*............ - // mul v18.8H, v21.8H, v3.H[0] // ..............................*... + // Instructions: 57 + // Expected cycles: 40 + // Expected IPC: 1.43 + // + // Cycle bound: 40.0 + // IPC bound: 1.43 + // + // Wall time: 2.55s + // User time: 2.55s + // + // ------------------ original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + ldr q2, [x3], #16 // .*....................................................... + ldr q16, [x1, #48] // *........................................................ + // gap // ......................................................... + // gap // ......................................................... + ldr q3, [x1, #32] // ..*...................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + ldr q28, [x4, #80] // ..........*.............................................. + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + ldr q1, [x4, #48] // .....*................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v22.8H, v16.8H, v2.H[1] // ......*.................................................. + mul v4.8H, v16.8H, v2.H[0] // .......*................................................. + ldr q29, [x4, #32] // ..........................................*.............. + // gap // ......................................................... 
+ sqrdmulh v11.8H, v3.8H, v2.H[1] // ...........*............................................. + ldr q0, [x1, #0] // ...*..................................................... + // gap // ......................................................... + // gap // ......................................................... + mul v15.8H, v3.8H, v2.H[0] // ............*............................................ + // gap // ......................................................... + // gap // ......................................................... + ldr q27, [x1, #16] // ....*.................................................... + mls v4.8H, v22.8H, v7.H[0] // .............*........................................... + ldr q30, [x4, #16] // .........*............................................... + // gap // ......................................................... + // gap // ......................................................... + ldr q21, [x4], #(6*16) // ........*................................................ + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v15.8H, v11.8H, v7.H[0] // ..............*.......................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + add v22.8H, v27.8H, v4.8H // ................*........................................ + sub v5.8H, v27.8H, v4.8H // ...............*......................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... 
+ // gap // ......................................................... + // gap // ......................................................... + mul v20.8H, v22.8H, v2.H[2] // ....................*.................................... + sqrdmulh v16.8H, v22.8H, v2.H[3] // .....................*................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v9.8H, v5.8H, v2.H[5] // .................*....................................... + mul v12.8H, v5.8H, v2.H[4] // ..................*...................................... + // gap // ......................................................... + // gap // ......................................................... + add v27.8H, v0.8H, v15.8H // ........................*................................ + sub v31.8H, v0.8H, v15.8H // ...................*..................................... + // gap // ......................................................... + // gap // ......................................................... + mls v20.8H, v16.8H, v7.H[0] // .......................*................................. + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v12.8H, v9.8H, v7.H[0] // ......................*.................................. + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... 
+ add v18.8H, v27.8H, v20.8H // ...........................*............................. + sub v23.8H, v27.8H, v20.8H // ............................*............................ + // gap // ......................................................... + // gap // ......................................................... + sub v17.8H, v31.8H, v12.8H // .........................*............................... + add v0.8H, v31.8H, v12.8H // ..........................*.............................. + // gap // ......................................................... + // gap // ......................................................... + trn2 v6.4S, v18.4S, v23.4S // ...............................*......................... + trn1 v16.4S, v18.4S, v23.4S // .................................*....................... + // gap // ......................................................... + // gap // ......................................................... + trn1 v3.4S, v0.4S, v17.4S // ..............................*.......................... + trn2 v23.4S, v0.4S, v17.4S // .............................*........................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + trn2 v4.2D, v6.2D, v23.2D // ..................................*...................... + trn1 v6.2D, v6.2D, v23.2D // .......................................*................. + // gap // ......................................................... + // gap // ......................................................... + trn2 v10.2D, v16.2D, v3.2D // ...................................*..................... 
+ // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v15.8H, v4.8H, v30.8H // ....................................*.................... + // gap // ......................................................... + // gap // ......................................................... + mul v18.8H, v4.8H, v21.8H // .....................................*................... + trn1 v26.2D, v16.2D, v3.2D // ..................................................*...... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v16.8H, v10.8H, v30.8H // ......................................*.................. + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v18.8H, v15.8H, v7.H[0] // .........................................*............... + mul v3.8H, v10.8H, v21.8H // ........................................*................ + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... 
+ // gap // ......................................................... + mls v3.8H, v16.8H, v7.H[0] // ............................................*............ + ldr q13, [x4, #-32] // ................................*........................ + add v21.8H, v6.8H, v18.8H // ...........................................*............. + // gap // ......................................................... + sub v5.8H, v6.8H, v18.8H // .............................................*........... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v17.8H, v21.8H, v1.8H // ..............................................*.......... + mul v9.8H, v21.8H, v29.8H // ...............................................*......... + // gap // ......................................................... + // gap // ......................................................... + add v22.8H, v26.8H, v3.8H // ....................................................*.... + sub v26.8H, v26.8H, v3.8H // ......................................................*.. + // gap // ......................................................... + // gap // ......................................................... + mul v16.8H, v5.8H, v13.8H // ................................................*........ + sqrdmulh v23.8H, v5.8H, v28.8H // .................................................*....... + // gap // ......................................................... + // gap // ......................................................... + mls v9.8H, v17.8H, v7.H[0] // ...................................................*..... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... 
+ // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v16.8H, v23.8H, v7.H[0] // .....................................................*... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + add v27.8H, v22.8H, v9.8H // .......................................................*. + sub v14.8H, v22.8H, v9.8H // ........................................................* + // gap // ......................................................... + // gap // ......................................................... + + // --------------------- new position ---------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // ldr q22, [x1, #48] // .*....................................................... + // ldr q8, [x3], #16 // *........................................................ + // ldr q28, [x1, #32] // ..*...................................................... + // ldr q23, [x1, #0] // .........*............................................... + // ldr q30, [x1, #16] // ...........*............................................. + // ldr q13, [x4, #48] // ....*.................................................... + // sqrdmulh v10.8H, v22.8H, v8.H[1] // .....*................................................... + // mul v6.8H, v22.8H, v8.H[0] // ......*.................................................. + // ldr q29, [x4], #(6*16) // ..............*.......................................... + // ldr q2, [x4, #-80] // .............*........................................... + // ldr q25, [x4, #-16] // ...*..................................................... 
+ // sqrdmulh v31.8H, v28.8H, v8.H[1] // ........*................................................ + // mul v22.8H, v28.8H, v8.H[0] // ..........*.............................................. + // mls v6.8H, v10.8H, v7.H[0] // ............*............................................ + // mls v22.8H, v31.8H, v7.H[0] // ...............*......................................... + // sub v15.8H, v30.8H, v6.8H // .................*....................................... + // add v6.8H, v30.8H, v6.8H // ................*........................................ + // sqrdmulh v27.8H, v15.8H, v8.H[5] // ....................*.................................... + // mul v15.8H, v15.8H, v8.H[4] // .....................*................................... + // sub v26.8H, v23.8H, v22.8H // .......................*................................. + // mul v17.8H, v6.8H, v8.H[2] // ..................*...................................... + // sqrdmulh v6.8H, v6.8H, v8.H[3] // ...................*..................................... + // mls v15.8H, v27.8H, v7.H[0] // .........................*............................... + // mls v17.8H, v6.8H, v7.H[0] // ........................*................................ + // add v27.8H, v23.8H, v22.8H // ......................*.................................. + // sub v16.8H, v26.8H, v15.8H // ............................*............................ + // add v15.8H, v26.8H, v15.8H // .............................*........................... + // add v22.8H, v27.8H, v17.8H // ..........................*.............................. + // sub v3.8H, v27.8H, v17.8H // ...........................*............................. + // trn2 v17.4S, v15.4S, v16.4S // .................................*....................... + // trn1 v8.4S, v15.4S, v16.4S // ................................*........................ + // trn2 v26.4S, v22.4S, v3.4S // ..............................*.......................... 
+ // ldr q31, [x4, #-32] // ............................................*............ + // trn1 v27.4S, v22.4S, v3.4S // ...............................*......................... + // trn2 v15.2D, v26.2D, v17.2D // ..................................*...................... + // trn2 v3.2D, v27.2D, v8.2D // ....................................*.................... + // sqrdmulh v16.8H, v15.8H, v2.8H // .....................................*................... + // mul v6.8H, v15.8H, v29.8H // ......................................*.................. + // sqrdmulh v15.8H, v3.8H, v2.8H // ........................................*................ + // trn1 v22.2D, v26.2D, v17.2D // ...................................*..................... + // mul v26.8H, v3.8H, v29.8H // ..........................................*.............. + // mls v6.8H, v16.8H, v7.H[0] // .........................................*............... + // ldr q0, [x4, #-64] // .......*................................................. + // add v16.8H, v22.8H, v6.8H // .............................................*........... + // mls v26.8H, v15.8H, v7.H[0] // ...........................................*............. + // sub v14.8H, v22.8H, v6.8H // ..............................................*.......... + // sqrdmulh v15.8H, v16.8H, v13.8H // ...............................................*......... + // mul v6.8H, v16.8H, v0.8H // ................................................*........ + // mul v16.8H, v14.8H, v31.8H // ...................................................*..... + // sqrdmulh v14.8H, v14.8H, v25.8H // ....................................................*.... + // trn1 v27.2D, v27.2D, v8.2D // .......................................*................. + // mls v6.8H, v15.8H, v7.H[0] // .....................................................*... + // add v28.8H, v27.8H, v26.8H // .................................................*....... 
+ // mls v16.8H, v14.8H, v7.H[0] // ......................................................*.. + // sub v26.8H, v27.8H, v26.8H // ..................................................*...... + // add v27.8H, v28.8H, v6.8H // .......................................................*. + // sub v14.8H, v28.8H, v6.8H // ........................................................* sub count, count, #1 layer4567_start: - ldr x9, [x1, #72] // .e.......................................................................................................................... - ldr x8, [x1, #104] // .........e.................................................................................................................. - ins v29.d[1], x12 // ...............................................................*............................................................ - ins v4.d[0], x15 // ..........................................................*................................................................. - ldr x15, [x4, #-80] // ....................................................*....................................................................... - ldr x27, [x1, #64] // e........................................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ldr x11, [x4, #48] // ............................................................e............................................................... - mls v18.8H, v9.8H, v7.H[0] // ...........................*................................................................................................ 
- ldr x20, [x1, #120] // .............e.............................................................................................................. - ins v9.d[0], x29 // ......*..................................................................................................................... - ldr x12, [x4, #-8] // .....................................................................*...................................................... - mls v28.8H, v2.8H, v7.H[0] // ......................*..................................................................................................... - // gap // ............................................................................................................................ - ldr x10, [x4, #-72] // .....................................................*...................................................................... - ldr x25, [x4, #72] // .................................................................e.......................................................... - ins v2.d[0], x15 // ......................................................*..................................................................... - ins v9.d[1], x7 // .......*.................................................................................................................... - ldr x13, [x3, #8] // .................e.......................................................................................................... - ldr x7, [x1, #88] // .....e...................................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- ldr x19, [x4, #-16] // ....................................................................*....................................................... - ins v2.d[1], x10 // .......................................................*.................................................................... - ldr x10, [x4, #-56] // .........................................................*.................................................................. - ldr x15, [x4], #(6*16) // ................................................e........................................................................... - sub v27.8H, v9.8H, v18.8H // ............................*............................................................................................... - ldr x17, [x4, #-32] // ................................................................e........................................................... - add v11.8H, v9.8H, v18.8H // .............................*.............................................................................................. - ldr x23, [x1, #112] // ............e............................................................................................................... - // gap // ............................................................................................................................ - sqrdmulh v9.8H, v27.8H, v3.H[5] // ....................................*....................................................................................... - mul v27.8H, v27.8H, v3.H[4] // ...................................*........................................................................................ - ldr x28, [x3], #16 // ................e........................................................................................................... - // gap // ............................................................................................................................ 
- sqrdmulh v20.8H, v11.8H, v3.H[3] // ...............................*............................................................................................ - mul v16.8H, v11.8H, v3.H[2] // ..............................*............................................................................................. - ldr x29, [x1, #96] // ........e................................................................................................................... - // gap // ............................................................................................................................ - ins v21.d[0], x19 // ......................................................................*..................................................... - sub v30.8H, v31.8H, v28.8H // .......................*.................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v27.8H, v9.8H, v7.H[0] // .....................................*...................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v16.8H, v20.8H, v7.H[0] // ................................*........................................................................................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v3.d[0], x28 // ..................e......................................................................................................... - add v18.8H, v31.8H, v28.8H // ........................*................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v1.8H, v30.8H, v27.8H // .......................................*.................................................................................... - // gap // ............................................................................................................................ - sub v9.8H, v30.8H, v27.8H // ......................................*..................................................................................... - // gap // ............................................................................................................................ - add v10.8H, v18.8H, v16.8H // ..................................*......................................................................................... - sub v18.8H, v18.8H, v16.8H // .................................*.......................................................................................... 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v19.4S, v1.4S, v9.4S // ...........................................*................................................................................ - ins v3.d[1], x13 // ...................e........................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v5.4S, v10.4S, v18.4S // .........................................*.................................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v28.4S, v1.4S, v9.4S // ..........................................*................................................................................. - trn1 v18.4S, v10.4S, v18.4S // ........................................*................................................................................... - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - ins v21.d[1], x12 // .......................................................................*.................................................... - trn2 v20.2D, v5.2D, v19.2D // .............................................*.............................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v27.2D, v18.2D, v28.2D // ............................................*............................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v9.8H, v20.8H, v22.8H // .............................................................................*.............................................. - sqrdmulh v11.8H, v20.8H, v2.8H // ..............................................................................*............................................. 
- sqrdmulh v30.8H, v27.8H, v2.8H // .........................................................................*.................................................. - mul v24.8H, v27.8H, v22.8H // ........................................................................*................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v19.2D, v5.2D, v19.2D // ...............................................*............................................................................ - ins v5.d[0], x29 // ..........e................................................................................................................. - ldr x29, [x1, #80] // ....e....................................................................................................................... - // gap // ............................................................................................................................ - mls v9.8H, v11.8H, v7.H[0] // ...............................................................................*............................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v24.8H, v30.8H, v7.H[0] // ..........................................................................*................................................. 
- ins v5.d[1], x8 // ...........e................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn1 v17.2D, v18.2D, v28.2D // ..............................................*............................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v4.d[1], x10 // ...........................................................*................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v16.8H, v19.8H, v9.8H // .................................................................................*.......................................... - sub v9.8H, v19.8H, v9.8H // ................................................................................*........................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - sqrdmulh v22.8H, v16.8H, v29.8H // ...................................................................................*........................................ - mul v2.8H, v16.8H, v4.8H // ..................................................................................*......................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v0.8H, v9.8H, v12.8H // .......................................................................................*.................................... - sqrdmulh v9.8H, v9.8H, v21.8H // ........................................................................................*................................... - // gap // ............................................................................................................................ - add v11.8H, v17.8H, v24.8H // ............................................................................*............................................... - ins v21.d[0], x23 // ..............e............................................................................................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- mls v2.8H, v22.8H, v7.H[0] // ....................................................................................*....................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v0.8H, v9.8H, v7.H[0] // .........................................................................................*.................................. - ins v31.d[0], x27 // ..e......................................................................................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sub v9.8H, v17.8H, v24.8H // ...........................................................................*................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - add v8.8H, v11.8H, v2.8H // ......................................................................................*..................................... 
- // gap // ............................................................................................................................ - sub v2.8H, v11.8H, v2.8H // .....................................................................................*...................................... - // gap // ............................................................................................................................ - add v10.8H, v9.8H, v0.8H // ...........................................................................................*................................ - sub v11.8H, v9.8H, v0.8H // ..........................................................................................*................................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v29.4S, v8.4S, v2.4S // .............................................................................................*.............................. - trn1 v30.4S, v8.4S, v2.4S // ............................................................................................*............................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - trn2 v20.4S, v10.4S, v11.4S // ...............................................................................................*............................ - trn1 v11.4S, v10.4S, v11.4S // ..............................................................................................*............................. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqdmulh v2.8H, v30.8H, v7.H[1] // ................................................................................................*........................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqdmulh v18.8H, v29.8H, v7.H[1] // ...................................................................................................*........................ - ins v12.d[0], x17 // ..................................................................e......................................................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - sqdmulh v9.8H, v11.8H, v7.H[1] // ......................................................................................................*..................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- sqdmulh v22.8H, v20.8H, v7.H[1] // .........................................................................................................*.................. - ins v31.d[1], x9 // ...e........................................................................................................................ - srshr v2.8H, v2.8H, #11 // .................................................................................................*.......................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - srshr v27.8H, v9.8H, #11 // .......................................................................................................*.................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - srshr v9.8H, v18.8H, #11 // ....................................................................................................*....................... - srshr v1.8H, v22.8H, #11 // ..........................................................................................................*................. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- mls v30.8H, v2.8H, v7.H[0] // ..................................................................................................*......................... - ins v12.d[1], x25 // ...................................................................e........................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v11.8H, v27.8H, v7.H[0] // ........................................................................................................*................... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v20.8H, v1.8H, v7.H[0] // ...........................................................................................................*................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mls v29.8H, v9.8H, v7.H[0] // .....................................................................................................*...................... - umov x24, v30.d[0] // ............................................................................................................*............... 
- umov x10, v30.d[1] // .............................................................................................................*.............. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x13, v29.d[0] // ..............................................................................................................*............. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x19, v20.d[0] // ..................................................................................................................*......... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - str x10, [x1, #32] // ........................................................................................................................*... - umov x10, v20.d[1] // ...................................................................................................................*........ - umov x25, v29.d[1] // ...............................................................................................................*............ - // gap // ............................................................................................................................ - str x24, [x1], #( 16*4) // ....................................................................................................................*....... - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v28.8H, v5.8H, v3.H[0] // ....................e....................................................................................................... - str x13, [x1, #-48] // ......................................................................................................................*..... - sqrdmulh v2.8H, v5.8H, v3.H[1] // .....................e...................................................................................................... - // gap // ............................................................................................................................ 
- ins v22.d[0], x15 // ..................................................e......................................................................... - str x19, [x1, #-40] // .......................................................................................................................*.... - ldr x15, [x4, #-64] // ........................................................e................................................................... - umov x13, v11.d[0] // ................................................................................................................*........... - str x25, [x1, #-16] // ..........................................................................................................................*. - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - umov x19, v11.d[1] // .................................................................................................................*.......... - ins v21.d[1], x20 // ...............e............................................................................................................ - str x10, [x1, #-8] // ...........................................................................................................................* - ldr x23, [x4, #-88] // .................................................e.......................................................................... - // gap // ............................................................................................................................ 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - ins v29.d[0], x11 // ..............................................................e............................................................. - sqrdmulh v9.8H, v21.8H, v3.H[1] // ..........................e................................................................................................. - str x13, [x1, #-56] // .....................................................................................................................*...... - ldr x12, [x4, #-40] // .............................................................e.............................................................. - ins v22.d[1], x23 // ...................................................e........................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - mul v18.8H, v21.8H, v3.H[0] // .........................e.................................................................................................. - str x19, [x1, #-24] // .........................................................................................................................*.. 
- // gap // ............................................................................................................................ - // gap // ............................................................................................................................ - - // original source code - // ldr x10, [x1, #(16*0)] // .....e......................................................................................................................|....e...................................................................................................................... - // ldr x11, [x1, #((16*0)+8)] // e...........................................................................................................................e........................................................................................................................... - // ins v8.d[0], x10 // ........................................................................e...................................................|.......................................................................e................................................... - // ins v8.d[1], x11 // .......................................................................................e....................................|......................................................................................e.................................... - // ldr x10, [x1, #(16*1)] // ........................................................e...................................................................|.......................................................e................................................................... - // ldr x11, [x1, #((16*1)+8)] // .................e..........................................................................................................|................e.......................................................................................................... 
- // ins v9.d[0], x10 // .........*..................................................................................................................|........*.................................................................................................................. - // ins v9.d[1], x11 // ...............*............................................................................................................|..............*............................................................................................................ - // ldr x10, [x1, #(16*2)] // ...............................e............................................................................................|..............................e............................................................................................ - // ldr x11, [x1, #((16*2)+8)] // .e..........................................................................................................................|e.......................................................................................................................... - // ins v10.d[0], x10 // .......................................................e....................................................................|......................................................e.................................................................... - // ins v10.d[1], x11 // ...........................................................e................................................................|..........................................................e................................................................ - // ldr x10, [x1, #(16*3)] // .........................e..................................................................................................|........................e.................................................................................................. 
- // ldr x11, [x1, #((16*3)+8)] // ........e...................................................................................................................|.......e................................................................................................................... - // ins v11.d[0], x10 // .....................................................................e......................................................|....................................................................e...................................................... - // ins v11.d[1], x11 // ..................................................................................................................e.........|.................................................................................................................e......... - // ldr x10, [x3], #16 // ............................e...............................................................................................|...........................e............................................................................................... - // ldr x11, [x3, #(-16+8)] // ................e...........................................................................................................|...............e........................................................................................................... - // ins v0.d[0], x10 // ....................................e.......................................................................................|...................................e....................................................................................... - // ins v0.d[1], x11 // ...........................................e................................................................................|..........................................e................................................................................ 
- // mul v24.8h, v10.8h, v0.h[0] // .........................................................................................................e..................|........................................................................................................e.................. - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ...........................................................................................................e................|..........................................................................................................e................ - // mls v24.8h, v10.8h, v7.h[0] // ...........*................................................................................................................|..........*................................................................................................................ - // sub v10.8h, v8.8h, v24.8h // .................................*..........................................................................................|................................*.......................................................................................... - // add v8.8h, v8.8h, v24.8h // .....................................*......................................................................................|....................................*...................................................................................... - // mul v24.8h, v11.8h, v0.h[0] // ..........................................................................................................................e.|.........................................................................................................................e. - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ......................................................................................................................e.....|.....................................................................................................................e..... 
- // mls v24.8h, v11.8h, v7.h[0] // .......*....................................................................................................................|......*.................................................................................................................... - // sub v11.8h, v9.8h, v24.8h // ......................*.....................................................................................................|.....................*..................................................................................................... - // add v9.8h, v9.8h, v24.8h // ........................*...................................................................................................|.......................*................................................................................................... - // mul v24.8h, v9.8h, v0.h[2] // ..............................*.............................................................................................|.............................*............................................................................................. - // sqrdmulh v9.8h, v9.8h, v0.h[3] // .............................*..............................................................................................|............................*.............................................................................................. - // mls v24.8h, v9.8h, v7.h[0] // ...................................*........................................................................................|..................................*........................................................................................ - // sub v9.8h, v8.8h, v24.8h // .........................................*..................................................................................|........................................*.................................................................................. 
- // add v8.8h, v8.8h, v24.8h // ........................................*...................................................................................|.......................................*................................................................................... - // mul v24.8h, v11.8h, v0.h[4] // ...........................*................................................................................................|..........................*................................................................................................ - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ..........................*.................................................................................................|.........................*................................................................................................. - // mls v24.8h, v11.8h, v7.h[0] // ..................................*.........................................................................................|.................................*......................................................................................... - // sub v11.8h, v10.8h, v24.8h // .......................................*....................................................................................|......................................*.................................................................................... - // add v10.8h, v10.8h, v24.8h // ......................................*.....................................................................................|.....................................*..................................................................................... - // trn1 v25.4s, v8.4s, v9.4s // ..............................................*.............................................................................|.............................................*............................................................................. 
- // trn2 v26.4s, v8.4s, v9.4s // ............................................*...............................................................................|...........................................*............................................................................... - // trn1 v27.4s, v10.4s, v11.4s // .............................................*..............................................................................|............................................*.............................................................................. - // trn2 v28.4s, v10.4s, v11.4s // ..........................................*.................................................................................|.........................................*................................................................................. - // trn2 v10.2d, v25.2d, v27.2d // .................................................*..........................................................................|................................................*.......................................................................... - // trn2 v11.2d, v26.2d, v28.2d // ................................................*...........................................................................|...............................................*........................................................................... - // trn1 v8.2d, v25.2d, v27.2d // ............................................................*...............................................................|...........................................................*............................................................... - // trn1 v9.2d, v26.2d, v28.2d // ......................................................*.....................................................................|.....................................................*..................................................................... 
- // ldr x10, [x4], #(6*16) // .....................e......................................................................................................|....................e...................................................................................................... - // ldr x11, [x4, #(-(6*16)+8)] // ....................................................................................................................e.......|...................................................................................................................e....... - // ins v0.d[0], x10 // ............................................................................................................e...............|...........................................................................................................e............... - // ins v0.d[1], x11 // .........................................................................................................................e..|........................................................................................................................e.. - // ldr x10, [x4, #(-6*16 + 1*16)] // ....*.......................................................................................................................|...*....................................................................................................................... - // ldr x11, [x4, #((-6*16 + 1*16)+8)] // ............*...............................................................................................................|...........*............................................................................................................... - // ins v4.d[0], x10 // ..............*.............................................................................................................|.............*............................................................................................................. 
- // ins v4.d[1], x11 // ...................*........................................................................................................|..................*........................................................................................................ - // ldr x10, [x4, #(-6*16 + 2*16)] // ..............................................................................................................e.............|.............................................................................................................e............. - // ldr x11, [x4, #((-6*16 + 2*16)+8)] // ....................*.......................................................................................................|...................*....................................................................................................... - // ins v1.d[0], x10 // ...*........................................................................................................................|..*........................................................................................................................ - // ins v1.d[1], x11 // .............................................................*..............................................................|............................................................*.............................................................. - // ldr x10, [x4, #(-6*16 + 3*16)] // ......e.....................................................................................................................|.....e..................................................................................................................... - // ldr x11, [x4, #((-6*16 + 3*16)+8)] // ........................................................................................................................e...|.......................................................................................................................e... 
- // ins v5.d[0], x10 // .....................................................................................................................e......|....................................................................................................................e...... - // ins v5.d[1], x11 // ..*.........................................................................................................................|.*......................................................................................................................... - // ldr x10, [x4, #(-6*16 + 4*16)] // .......................e....................................................................................................|......................e.................................................................................................... - // ldr x11, [x4, #((-6*16 + 4*16)+8)] // .............e..............................................................................................................|............e.............................................................................................................. - // ins v2.d[0], x10 // ....................................................................................e.......................................|...................................................................................e....................................... - // ins v2.d[1], x11 // .............................................................................................e..............................|............................................................................................e.............................. - // ldr x10, [x4, #(-6*16 + 5*16)] // ..................*.........................................................................................................|.................*......................................................................................................... 
- // ldr x11, [x4, #((-6*16 + 5*16)+8)] // ..........*.................................................................................................................|.........*................................................................................................................. - // ins v6.d[0], x10 // ................................*...........................................................................................|...............................*........................................................................................... - // ins v6.d[1], x11 // ...............................................*............................................................................|..............................................*............................................................................ - // mul v24.8h, v10.8h, v0.8h // .....................................................*......................................................................|....................................................*...................................................................... - // sqrdmulh v10.8h, v10.8h, v4.8h // ....................................................*.......................................................................|...................................................*....................................................................... - // mls v24.8h, v10.8h, v7.h[0] // ..........................................................*.................................................................|.........................................................*................................................................. - // sub v10.8h, v8.8h, v24.8h // .........................................................................*..................................................|........................................................................*.................................................. 
- // add v8.8h, v8.8h, v24.8h // ....................................................................*.......................................................|...................................................................*....................................................... - // mul v24.8h, v11.8h, v0.8h // ..................................................*.........................................................................|.................................................*......................................................................... - // sqrdmulh v11.8h, v11.8h, v4.8h // ...................................................*........................................................................|..................................................*........................................................................ - // mls v24.8h, v11.8h, v7.h[0] // .........................................................*..................................................................|........................................................*.................................................................. - // sub v11.8h, v9.8h, v24.8h // ...............................................................*............................................................|..............................................................*............................................................ - // add v9.8h, v9.8h, v24.8h // ..............................................................*.............................................................|.............................................................*............................................................. - // mul v24.8h, v9.8h, v1.8h // .................................................................*..........................................................|................................................................*.......................................................... 
- // sqrdmulh v9.8h, v9.8h, v5.8h // ................................................................*...........................................................|...............................................................*........................................................... - // mls v24.8h, v9.8h, v7.h[0] // ......................................................................*.....................................................|.....................................................................*..................................................... - // sub v9.8h, v8.8h, v24.8h // ...........................................................................*................................................|..........................................................................*................................................ - // add v8.8h, v8.8h, v24.8h // ..........................................................................*.................................................|.........................................................................*................................................. - // mul v24.8h, v11.8h, v2.8h // ..................................................................*.........................................................|.................................................................*......................................................... - // sqrdmulh v11.8h, v11.8h, v6.8h // ...................................................................*........................................................|..................................................................*........................................................ - // mls v24.8h, v11.8h, v7.h[0] // .......................................................................*....................................................|......................................................................*.................................................... 
- // sub v11.8h, v10.8h, v24.8h // .............................................................................*..............................................|............................................................................*.............................................. - // add v10.8h, v10.8h, v24.8h // ............................................................................*...............................................|...........................................................................*............................................... - // trn1 v25.4s, v8.4s, v9.4s // ...............................................................................*............................................|..............................................................................*............................................ - // trn2 v26.4s, v8.4s, v9.4s // ..............................................................................*.............................................|.............................................................................*............................................. - // trn1 v27.4s, v10.4s, v11.4s // .................................................................................*..........................................|................................................................................*.......................................... - // trn2 v28.4s, v10.4s, v11.4s // ................................................................................*...........................................|...............................................................................*........................................... - // sqdmulh v24.8h, v25.8h, v7.h[1] // ..................................................................................*.........................................|.................................................................................*......................................... 
- // srshr v24.8h, v24.8h, #11 // ........................................................................................*...................................|.......................................................................................*................................... - // mls v25.8h, v24.8h, v7.h[0] // ............................................................................................*...............................|...........................................................................................*............................... - // sqdmulh v24.8h, v26.8h, v7.h[1] // ...................................................................................*........................................|..................................................................................*........................................ - // srshr v24.8h, v24.8h, #11 // ..........................................................................................*.................................|.........................................................................................*................................. - // mls v26.8h, v24.8h, v7.h[0] // ................................................................................................*...........................|...............................................................................................*........................... - // sqdmulh v24.8h, v27.8h, v7.h[1] // .....................................................................................*......................................|....................................................................................*...................................... - // srshr v24.8h, v24.8h, #11 // .........................................................................................*..................................|........................................................................................*.................................. 
- // mls v27.8h, v24.8h, v7.h[0] // ..............................................................................................*.............................|.............................................................................................*............................. - // sqdmulh v24.8h, v28.8h, v7.h[1] // ......................................................................................*.....................................|.....................................................................................*..................................... - // srshr v24.8h, v24.8h, #11 // ...........................................................................................*................................|..........................................................................................*................................ - // mls v28.8h, v24.8h, v7.h[0] // ...............................................................................................*............................|..............................................................................................*............................ - // umov x10, v25.d[0] // .................................................................................................*..........................|................................................................................................*.......................... - // umov x11, v25.d[1] // ..................................................................................................*.........................|.................................................................................................*......................... - // umov x12, v26.d[0] // ...................................................................................................*........................|..................................................................................................*........................ 
- // umov x13, v26.d[1] // .......................................................................................................*....................|......................................................................................................*.................... - // umov x14, v27.d[0] // ...............................................................................................................*............|..............................................................................................................*............ - // umov x15, v27.d[1] // .................................................................................................................*..........|................................................................................................................*.......... - // umov x16, v28.d[0] // ....................................................................................................*.......................|...................................................................................................*....................... - // umov x17, v28.d[1] // ......................................................................................................*.....................|.....................................................................................................*..................... - // str x10, [x1], #( 16*4) // ........................................................................................................*...................|.......................................................................................................*................... - // str x14, [x1, #(-16*4 + 8*1)] // .......................................................................................................................*....|......................................................................................................................*.... 
- // str x12, [x1, #(-16*4 + 8*2)] // ..........................................................................................................*.................|.........................................................................................................*................. - // str x16, [x1, #(-16*4 + 8*3)] // .............................................................................................................*..............|............................................................................................................*.............. - // str x11, [x1, #(-16*4 + 8*4)] // .....................................................................................................*......................|....................................................................................................*...................... - // str x15, [x1, #(-16*4 + 8*5)] // ...........................................................................................................................*|..........................................................................................................................* - // str x13, [x1, #(-16*4 + 8*6)] // ................................................................................................................*...........|...............................................................................................................*........... - // str x17, [x1, #(-16*4 + 8*7)] // ...................................................................................................................*........|..................................................................................................................*........ 
+ // Instructions: 91 + // Expected cycles: 41 + // Expected IPC: 2.22 + // + // Cycle bound: 40.0 + // IPC bound: 2.27 + // + // Wall time: 3601.43s + // User time: 3601.43s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + sub v15.8H, v26.8H, v16.8H // .........................................................*................................. + add v13.8H, v26.8H, v16.8H // ..........................................................*................................ + ldr q22, [x1, #112] // ...e....................................................................................... + ldr q8, [x3], #16 // ....e...................................................................................... + trn1 v9.4S, v27.4S, v14.4S // ...........................................................*............................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v0.4S, v27.4S, v14.4S // ............................................................*.............................. + ldr q28, [x1, #96] // ..e........................................................................................ + trn1 v11.4S, v13.4S, v15.4S // .............................................................*............................. + ldr q23, [x1, #64] // e.......................................................................................... + // gap // ........................................................................................... + trn2 v19.4S, v13.4S, v15.4S // ..............................................................*............................ 
+ ldr q30, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + sqdmulh v26.8H, v0.8H, v7.H[1] // ..................................................................*........................ + ldr q13, [x4, #48] // ....................................e...................................................... + sqrdmulh v10.8H, v22.8H, v8.H[1] // ..........e................................................................................ + mul v6.8H, v22.8H, v8.H[0] // ...........e............................................................................... + ldr q29, [x4], #(6*16) // .................................e......................................................... + sqdmulh v16.8H, v9.8H, v7.H[1] // ...............................................................*........................... + ldr q2, [x4, #-80] // ..................................e........................................................ + ldr q25, [x4, #-16] // ......................................e.................................................... + sqdmulh v15.8H, v11.8H, v7.H[1] // .....................................................................*..................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v31.8H, v28.8H, v8.H[1] // .....e..................................................................................... + mul v22.8H, v28.8H, v8.H[0] // ......e.................................................................................... + sqdmulh v27.8H, v19.8H, v7.H[1] // ........................................................................*.................. 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v10.8H, v7.H[0] // ............e.............................................................................. + srshr v26.8H, v26.8H, #11 // ...................................................................*....................... + srshr v16.8H, v16.8H, #11 // ................................................................*.......................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v22.8H, v31.8H, v7.H[0] // .......e................................................................................... + srshr v14.8H, v15.8H, #11 // ......................................................................*.................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v15.8H, v30.8H, v6.8H // .............e............................................................................. + add v6.8H, v30.8H, v6.8H // ..............e............................................................................ + mls v9.8H, v16.8H, v7.H[0] // .................................................................*......................... + srshr v16.8H, v27.8H, #11 // .........................................................................*................. 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v27.8H, v15.8H, v8.H[5] // ....................e...................................................................... + mul v15.8H, v15.8H, v8.H[4] // .....................e..................................................................... + mls v0.8H, v26.8H, v7.H[0] // ....................................................................*...................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v26.8H, v23.8H, v22.8H // ........e.................................................................................. + mul v17.8H, v6.8H, v8.H[2] // ................e.......................................................................... + sqrdmulh v6.8H, v6.8H, v8.H[3] // ...............e........................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v19.8H, v16.8H, v7.H[0] // ..........................................................................*................ 
+ umov x15, v9.d[0] // ...........................................................................*............... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v15.8H, v27.8H, v7.H[0] // ......................e.................................................................... + umov x27, v0.d[0] // .............................................................................*............. + mls v17.8H, v6.8H, v7.H[0] // .................e......................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v27.8H, v23.8H, v22.8H // .........e................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v16.8H, v26.8H, v15.8H // .......................e................................................................... + add v15.8H, v26.8H, v15.8H // ........................e.................................................................. 
+ add v22.8H, v27.8H, v17.8H // ...................e....................................................................... + sub v3.8H, v27.8H, v17.8H // ..................e........................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + str x27, [x1, #16] // .....................................................................................*..... + // gap // ........................................................................................... + trn2 v17.4S, v15.4S, v16.4S // ............................e.............................................................. + trn1 v8.4S, v15.4S, v16.4S // ...........................e............................................................... + // gap // ........................................................................................... + trn2 v26.4S, v22.4S, v3.4S // ..........................e................................................................ + ldr q31, [x4, #-32] // .....................................e..................................................... + umov x27, v19.d[0] // .................................................................................*......... + // gap // ........................................................................................... + trn1 v27.4S, v22.4S, v3.4S // .........................e................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + trn2 v15.2D, v26.2D, v17.2D // ..............................e............................................................ + umov x9, v9.d[1] // ............................................................................*.............. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v3.2D, v27.2D, v8.2D // .............................e............................................................. + // gap // ........................................................................................... + sqrdmulh v16.8H, v15.8H, v2.8H // ............................................e.............................................. + mul v6.8H, v15.8H, v29.8H // .............................................e............................................. + // gap // ........................................................................................... + mls v11.8H, v14.8H, v7.H[0] // .......................................................................*................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v15.8H, v3.8H, v2.8H // .......................................e................................................... + trn1 v22.2D, v26.2D, v17.2D // ................................e.......................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + mul v26.8H, v3.8H, v29.8H // ........................................e.................................................. + umov x12, v0.d[1] // ..............................................................................*............ + // gap // ........................................................................................... + mls v6.8H, v16.8H, v7.H[0] // ..............................................e............................................ + ldr q0, [x4, #-64] // ...................................e....................................................... + str x15, [x1], #( 16*4) // ...................................................................................*....... + umov x13, v11.d[0] // ...............................................................................*........... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x19, v11.d[1] // ................................................................................*.......... + // gap // ........................................................................................... + str x9, [x1, #-32] // .......................................................................................*... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ add v16.8H, v22.8H, v6.8H // ................................................e.......................................... + mls v26.8H, v15.8H, v7.H[0] // .........................................e................................................. + sub v14.8H, v22.8H, v6.8H // ...............................................e........................................... + str x12, [x1, #-16] // .........................................................................................*. + // gap // ........................................................................................... + sqrdmulh v15.8H, v16.8H, v13.8H // .................................................e......................................... + // gap // ........................................................................................... + str x13, [x1, #-56] // ....................................................................................*...... + mul v6.8H, v16.8H, v0.8H // ..................................................e........................................ + str x19, [x1, #-24] // ........................................................................................*.. + // gap // ........................................................................................... + mul v16.8H, v14.8H, v31.8H // .......................................................e................................... + sqrdmulh v14.8H, v14.8H, v25.8H // ......................................................e.................................... + trn1 v27.2D, v27.2D, v8.2D // ...............................e........................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x11, v19.d[1] // ..................................................................................*........ 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v15.8H, v7.H[0] // ...................................................e....................................... + // gap // ........................................................................................... + add v28.8H, v27.8H, v26.8H // ...........................................e............................................... + // gap // ........................................................................................... + mls v16.8H, v14.8H, v7.H[0] // ........................................................e.................................. + // gap // ........................................................................................... + str x27, [x1, #-40] // ......................................................................................*.... + // gap // ........................................................................................... + sub v26.8H, v27.8H, v26.8H // ..........................................e................................................ + // gap // ........................................................................................... + add v27.8H, v28.8H, v6.8H // .....................................................e..................................... + str x11, [x1, #-8] // ..........................................................................................* + // gap // ........................................................................................... + sub v14.8H, v28.8H, v6.8H // ....................................................e...................................... 
+ + // ---------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--- + // ldr q8, [x1, #(16*0)] // ......e..................................................................................'.......~................................................................................. + // ldr q9, [x1, #(16*1)] // ........e................................................................................'.........~............................................................................... + // ldr q10, [x1, #(16*2)] // ....e....................................................................................'.....~................................................................................... + // ldr q11, [x1, #(16*3)] // e........................................................................................'.~....................................................................................... + // ldr q0, [x3], #16 // .e.......................................................................................'..~...................................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ..................e......................................................................'...................~..................................................................... + // mul v24.8h, v10.8h, v0.h[0] // ...................e.....................................................................'....................~.................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ........................e................................................................'.........................~............................................................... + // sub v10.8h, v8.8h, v24.8h // .................................e.......................................................'..................................~...................................................... + // add v8.8h, v8.8h, v24.8h // .........................................e...............................................'..........................................~.............................................. + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...........e.............................................................................'............~............................................................................ + // mul v24.8h, v11.8h, v0.h[0] // ............e............................................................................'.............~........................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .....................e...................................................................'......................~.................................................................. + // sub v11.8h, v9.8h, v24.8h // ..........................e..............................................................'...........................~............................................................. + // add v9.8h, v9.8h, v24.8h // ...........................e.............................................................'............................~............................................................ + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ...................................e.....................................................'....................................~.................................................... 
+ // mul v24.8h, v9.8h, v0.h[2] // ..................................e......................................................'...................................~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................e................................................'.........................................~............................................... + // sub v9.8h, v8.8h, v24.8h // .............................................e...........................................'..............................................~.......................................... + // add v8.8h, v8.8h, v24.8h // ............................................e............................................'.............................................~........................................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..............................e..........................................................'...............................~......................................................... + // mul v24.8h, v11.8h, v0.h[4] // ...............................e.........................................................'................................~........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ......................................e..................................................'.......................................~................................................. + // sub v11.8h, v10.8h, v24.8h // ..........................................e..............................................'...........................................~............................................. + // add v10.8h, v10.8h, v24.8h // ...........................................e.............................................'............................................~............................................ 
+ // trn1 v25.4s, v8.4s, v9.4s // ....................................................e....................................'.....................................................~................................... + // trn2 v26.4s, v8.4s, v9.4s // .................................................e.......................................'..................................................~...................................... + // trn1 v27.4s, v10.4s, v11.4s // ................................................e........................................'.................................................~....................................... + // trn2 v28.4s, v10.4s, v11.4s // ...............................................e.........................................'................................................~........................................ + // trn2 v10.2d, v25.2d, v27.2d // .......................................................e.................................'........................................................~................................ + // trn2 v11.2d, v26.2d, v28.2d // .....................................................e...................................'......................................................~.................................. + // trn1 v8.2d, v25.2d, v27.2d // ...............................................................................e.........'................................................................................~........ + // trn1 v9.2d, v26.2d, v28.2d // ............................................................e............................'.............................................................~........................... + // ldr q0, [ x4], #(6*16) // .............e...........................................................................'..............~.......................................................................... 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e.........................................................................'................~........................................................................ + // ldr q1, [ x4, #(-6*16 + 2*16)] // ................................................................e........................'.................................................................~....................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..........e..............................................................................'...........~............................................................................. + // ldr q2, [ x4, #(-6*16 + 4*16)] // ..................................................e......................................'...................................................~..................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ................e........................................................................'.................~....................................................................... + // sqrdmulh v27.8h, v10.8h, v4.8h // ...........................................................e.............................'............................................................~............................ + // mul v24.8h, v10.8h, v0.8h // .............................................................e...........................'..............................................................~.......................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................................e..................'.......................................................................~................. + // sub v10.8h, v8.8h, v24.8h // .....................................................................................e...'......................................................................................~.. 
+ // add v8.8h, v8.8h, v24.8h // ..................................................................................e......'...................................................................................~..... + // sqrdmulh v27.8h, v11.8h, v4.8h // ........................................................e................................'.........................................................~............................... + // mul v24.8h, v11.8h, v0.8h // .........................................................e...............................'..........................................................~.............................. + // mls v24.8h, v27.8h, v7.h[0] // ...............................................................e.........................'................................................................~........................ + // sub v11.8h, v9.8h, v24.8h // .......................................................................e.................'........................................................................~................ + // add v9.8h, v9.8h, v24.8h // .....................................................................e...................'......................................................................~.................. + // sqrdmulh v27.8h, v9.8h, v5.8h // .........................................................................e...............'..........................................................................~.............. + // mul v24.8h, v9.8h, v1.8h // ...........................................................................e.............'............................................................................~............ + // mls v24.8h, v27.8h, v7.h[0] // .................................................................................e.......'..................................................................................~...... 
+ // sub v9.8h, v8.8h, v24.8h // ........................................................................................e'......................................................................................... + // add v8.8h, v8.8h, v24.8h // ......................................................................................e..'.......................................................................................~. + // sqrdmulh v27.8h, v11.8h, v6.8h // ..............................................................................e..........'...............................................................................~......... + // mul v24.8h, v11.8h, v2.8h // .............................................................................e...........'..............................................................................~.......... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................................e.....'....................................................................................~.... + // sub v11.8h, v10.8h, v24.8h // .........................................................................................*......................................................................................... + // add v10.8h, v10.8h, v24.8h // .........................................................................................'*........................................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ..~......................................................................................'...*..................................................................................... + // trn2 v26.4s, v8.4s, v9.4s // ...~.....................................................................................'....*.................................................................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // .....~...................................................................................'......*.................................................................................. + // trn2 v28.4s, v10.4s, v11.4s // .......~.................................................................................'........*................................................................................ + // sqdmulh v24.8h, v25.8h, v7.h[1] // ..............~..........................................................................'...............*......................................................................... + // srshr v24.8h, v24.8h, #11 // .......................~.................................................................'........................*................................................................ + // mls v25.8h, v24.8h, v7.h[0] // ............................~............................................................'.............................*........................................................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // .........~...............................................................................'..........*.............................................................................. + // srshr v24.8h, v24.8h, #11 // ......................~..................................................................'.......................*................................................................. + // mls v26.8h, v24.8h, v7.h[0] // ................................~........................................................'.................................*....................................................... + // sqdmulh v24.8h, v27.8h, v7.h[1] // .................~.......................................................................'..................*...................................................................... 
+ // srshr v24.8h, v24.8h, #11 // .........................~...............................................................'..........................*.............................................................. + // mls v27.8h, v24.8h, v7.h[0] // ..........................................................~..............................'...........................................................*............................. + // sqdmulh v24.8h, v28.8h, v7.h[1] // ....................~....................................................................'.....................*................................................................... + // srshr v24.8h, v24.8h, #11 // .............................~...........................................................'..............................*.......................................................... + // mls v28.8h, v24.8h, v7.h[0] // ....................................~....................................................'.....................................*................................................... + // umov x10, v25.d[0] // .....................................~...................................................'......................................*.................................................. + // umov x11, v25.d[1] // ......................................................~..................................'.......................................................*................................. + // umov x12, v26.d[0] // .......................................~.................................................'........................................*................................................ + // umov x13, v26.d[1] // ..............................................................~..........................'...............................................................*......................... 
+ // umov x14, v27.d[0] // ..................................................................~......................'...................................................................*..................... + // umov x15, v27.d[1] // ...................................................................~.....................'....................................................................*.................... + // umov x16, v28.d[0] // ...................................................~.....................................'....................................................*.................................... + // umov x17, v28.d[1] // ................................................................................~........'.................................................................................*....... + // str x10, [x1], #( 16*4) // .................................................................~.......................'..................................................................*...................... + // str x14, [x1, #(-16*4 + 8*1)] // ..........................................................................~..............'...........................................................................*............. + // str x12, [x1, #(-16*4 + 8*2)] // ..............................................~..........................................'...............................................*......................................... + // str x16, [x1, #(-16*4 + 8*3)] // ....................................................................................~....'.....................................................................................*... + // str x11, [x1, #(-16*4 + 8*4)] // ....................................................................~....................'.....................................................................*................... 
+ // str x15, [x1, #(-16*4 + 8*5)] // ............................................................................~............'.............................................................................*........... + // str x13, [x1, #(-16*4 + 8*6)] // ........................................................................~................'.........................................................................*............... + // str x17, [x1, #(-16*4 + 8*7)] // .......................................................................................~.'........................................................................................* sub count, count, #1 cbnz count, layer4567_start - mls v28.8H, v2.8H, v7.H[0] // ......*................................................................................... - ldr x24, [x4, #-16] // ..........*............................................................................... - ldr x19, [x4, #-80] // ..*....................................................................................... - ins v20.d[0], x29 // ....*..................................................................................... - mls v18.8H, v9.8H, v7.H[0] // ...*...................................................................................... - ldr x20, [x4, #-72] // .......*.................................................................................. - ldr x10, [x4, #-56] // ............*............................................................................. - // gap // .......................................................................................... - ins v20.d[1], x7 // .........*................................................................................ - ins v14.d[0], x15 // .*........................................................................................ - // gap // .......................................................................................... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - ins v14.d[1], x10 // ...........................................*.............................................. - add v15.8H, v20.8H, v18.8H // ..............*........................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sub v9.8H, v20.8H, v18.8H // .............*............................................................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sqrdmulh v19.8H, v15.8H, v3.H[3] // .................*........................................................................ - mul v5.8H, v15.8H, v3.H[2] // ..................*....................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - mul v30.8H, v9.8H, v3.H[4] // ................*......................................................................... 
- sqrdmulh v26.8H, v9.8H, v3.H[5] // ...............*.......................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - ins v6.d[0], x19 // ........*................................................................................. - add v9.8H, v31.8H, v28.8H // .......................*.................................................................. - // gap // .......................................................................................... - // gap // .......................................................................................... - mls v5.8H, v19.8H, v7.H[0] // ......................*................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - ins v6.d[1], x20 // ...........*.............................................................................. - mls v30.8H, v26.8H, v7.H[0] // .....................*.................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sub v18.8H, v31.8H, v28.8H // ....................*..................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - add v20.8H, v9.8H, v5.8H // ..........................*............................................................... - sub v11.8H, v9.8H, v5.8H // ...........................*.............................................................. - // gap // .......................................................................................... - // gap // .......................................................................................... - sub v2.8H, v18.8H, v30.8H // .........................*................................................................ - add v27.8H, v18.8H, v30.8H // ........................*................................................................. - // gap // .......................................................................................... - // gap // .......................................................................................... - trn1 v9.4S, v20.4S, v11.4S // ...............................*.......................................................... - trn2 v20.4S, v20.4S, v11.4S // .............................*............................................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - trn1 v11.4S, v27.4S, v2.4S // ..............................*........................................................... - trn2 v2.4S, v27.4S, v2.4S // ............................*............................................................. - // gap // .......................................................................................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - trn1 v0.2D, v9.2D, v11.2D // ..........................................*............................................... - trn2 v10.2D, v20.2D, v2.2D // .................................*........................................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - trn1 v18.2D, v20.2D, v2.2D // .......................................*.................................................. - trn2 v21.2D, v9.2D, v11.2D // ..................................*....................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sqrdmulh v27.8H, v10.8H, v6.8H // ....................................*..................................................... - mul v11.8H, v10.8H, v22.8H // ...................................*...................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - mul v22.8H, v21.8H, v22.8H // ......................................*................................................... 
- sqrdmulh v9.8H, v21.8H, v6.8H // .....................................*.................................................... - // gap // .......................................................................................... - ldr x19, [x4, #-8] // .....*.................................................................................... - ins v6.d[0], x24 // ...................*...................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - mls v11.8H, v27.8H, v7.H[0] // ........................................*................................................. - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - ins v6.d[1], x19 // ................................*......................................................... - mls v22.8H, v9.8H, v7.H[0] // .........................................*................................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - ins v29.d[1], x12 // *......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - sub v16.8H, v18.8H, v11.8H // .............................................*............................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - add v2.8H, v0.8H, v22.8H // ..................................................*....................................... - add v28.8H, v18.8H, v11.8H // ............................................*............................................. - // gap // .......................................................................................... - // gap // .......................................................................................... - mul v26.8H, v16.8H, v12.8H // ................................................*......................................... - sqrdmulh v9.8H, v16.8H, v6.8H // .................................................*........................................ - // gap // .......................................................................................... - // gap // .......................................................................................... - sqrdmulh v20.8H, v28.8H, v29.8H // ..............................................*........................................... - mul v27.8H, v28.8H, v14.8H // ...............................................*.......................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - mls v26.8H, v9.8H, v7.H[0] // ....................................................*..................................... - sub v9.8H, v0.8H, v22.8H // .....................................................*.................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - mls v27.8H, v20.8H, v7.H[0] // ...................................................*...................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - add v11.8H, v9.8H, v26.8H // ........................................................*................................. - sub v26.8H, v9.8H, v26.8H // .........................................................*................................ - // gap // .......................................................................................... 
- // gap // .......................................................................................... - sub v20.8H, v2.8H, v27.8H // .......................................................*.................................. - add v29.8H, v2.8H, v27.8H // ......................................................*................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - trn2 v9.4S, v11.4S, v26.4S // ............................................................*............................. - trn1 v27.4S, v11.4S, v26.4S // .............................................................*............................ - // gap // .......................................................................................... - // gap // .......................................................................................... - trn1 v18.4S, v29.4S, v20.4S // ...........................................................*.............................. - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sqdmulh v2.8H, v27.8H, v7.H[1] // ................................................................*......................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sqdmulh v11.8H, v9.8H, v7.H[1] // .................................................................*........................ 
- sqdmulh v22.8H, v18.8H, v7.H[1] // ..............................................................*........................... - trn2 v30.4S, v29.4S, v20.4S // ..........................................................*............................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - srshr v11.8H, v11.8H, #11 // .....................................................................*.................... - srshr v2.8H, v2.8H, #11 // ...................................................................*...................... - // gap // .......................................................................................... - // gap // .......................................................................................... - srshr v22.8H, v22.8H, #11 // ..................................................................*....................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - sqdmulh v20.8H, v30.8H, v7.H[1] // ...............................................................*.......................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - mls v27.8H, v2.8H, v7.H[0] // .......................................................................*.................. - mls v9.8H, v11.8H, v7.H[0] // ........................................................................*................. - // gap // .......................................................................................... - // gap // .......................................................................................... - mls v18.8H, v22.8H, v7.H[0] // ......................................................................*................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - srshr v20.8H, v20.8H, #11 // ....................................................................*..................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - umov x15, v27.d[0] // ....................................................................................*..... - umov x19, v9.d[0] // .............................................................................*............ - // gap // .......................................................................................... - // gap // .......................................................................................... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - umov x26, v9.d[1] // ...............................................................................*.......... - mls v30.8H, v20.8H, v7.H[0] // .........................................................................*................ - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - umov x12, v18.d[1] // ...........................................................................*.............. - str x15, [x1, #8] // ........................................................................................*. - umov x15, v27.d[1] // ......................................................................................*... - // gap // .......................................................................................... - // gap // .......................................................................................... - str x19, [x1, #24] // ...................................................................................*...... - umov x10, v30.d[1] // ................................................................................*......... 
- // gap // .......................................................................................... - // gap // .......................................................................................... - str x26, [x1, #56] // .......................................................................................*.. - umov x13, v30.d[0] // ............................................................................*............. - // gap // .......................................................................................... - // gap // .......................................................................................... - umov x25, v18.d[0] // ..........................................................................*............... - str x12, [x1, #32] // ..............................................................................*........... - // gap // .......................................................................................... - // gap // .......................................................................................... - str x15, [x1, #40] // .........................................................................................* - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - str x10, [x1, #48] // .....................................................................................*.... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... 
- str x13, [x1, #16] // ..................................................................................*....... - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - str x25, [x1], #( 16*4) // .................................................................................*........ - // gap // .......................................................................................... - // gap // .......................................................................................... - // gap // .......................................................................................... - - // original source code - // ins v29.d[1], x12 // ...........................................*.............................................. - // ins v4.d[0], x15 // ........*................................................................................. - // ldr x15, [x4, #-80] // ..*....................................................................................... - // mls v18.8H, v9.8H, v7.H[0] // ....*..................................................................................... - // ins v9.d[0], x29 // ...*...................................................................................... - // ldr x12, [x4, #-8] // ......................................*................................................... - // mls v28.8H, v2.8H, v7.H[0] // *......................................................................................... - // ldr x10, [x4, #-72] // .....*.................................................................................... - // ins v2.d[0], x15 // ................*......................................................................... 
- // ins v9.d[1], x7 // .......*.................................................................................. - // ldr x19, [x4, #-16] // .*........................................................................................ - // ins v2.d[1], x10 // ...................*...................................................................... - // ldr x10, [x4, #-56] // ......*................................................................................... - // sub v27.8H, v9.8H, v18.8H // ...........*.............................................................................. - // add v11.8H, v9.8H, v18.8H // ..........*............................................................................... - // sqrdmulh v9.8H, v27.8H, v3.H[5] // ...............*.......................................................................... - // mul v27.8H, v27.8H, v3.H[4] // ..............*........................................................................... - // sqrdmulh v20.8H, v11.8H, v3.H[3] // ............*............................................................................. - // mul v16.8H, v11.8H, v3.H[2] // .............*............................................................................ - // ins v21.d[0], x19 // .......................................*.................................................. - // sub v30.8H, v31.8H, v28.8H // .....................*.................................................................... - // mls v27.8H, v9.8H, v7.H[0] // ....................*..................................................................... - // mls v16.8H, v20.8H, v7.H[0] // ..................*....................................................................... - // add v18.8H, v31.8H, v28.8H // .................*........................................................................ - // add v1.8H, v30.8H, v27.8H // .........................*................................................................ 
- // sub v9.8H, v30.8H, v27.8H // ........................*................................................................. - // add v10.8H, v18.8H, v16.8H // ......................*................................................................... - // sub v18.8H, v18.8H, v16.8H // .......................*.................................................................. - // trn2 v19.4S, v1.4S, v9.4S // .............................*............................................................ - // trn2 v5.4S, v10.4S, v18.4S // ...........................*.............................................................. - // trn1 v28.4S, v1.4S, v9.4S // ............................*............................................................. - // trn1 v18.4S, v10.4S, v18.4S // ..........................*............................................................... - // ins v21.d[1], x12 // .........................................*................................................ - // trn2 v20.2D, v5.2D, v19.2D // ...............................*.......................................................... - // trn2 v27.2D, v18.2D, v28.2D // .................................*........................................................ - // mul v9.8H, v20.8H, v22.8H // ...................................*...................................................... - // sqrdmulh v11.8H, v20.8H, v2.8H // ..................................*....................................................... - // sqrdmulh v30.8H, v27.8H, v2.8H // .....................................*.................................................... - // mul v24.8H, v27.8H, v22.8H // ....................................*..................................................... - // trn1 v19.2D, v5.2D, v19.2D // ................................*......................................................... 
- // mls v9.8H, v11.8H, v7.H[0] // ........................................*................................................. - // mls v24.8H, v30.8H, v7.H[0] // ..........................................*............................................... - // trn1 v17.2D, v18.2D, v28.2D // ..............................*........................................................... - // ins v4.d[1], x10 // .........*................................................................................ - // add v16.8H, v19.8H, v9.8H // ..............................................*........................................... - // sub v9.8H, v19.8H, v9.8H // ............................................*............................................. - // sqrdmulh v22.8H, v16.8H, v29.8H // .................................................*........................................ - // mul v2.8H, v16.8H, v4.8H // ..................................................*....................................... - // mul v0.8H, v9.8H, v12.8H // ...............................................*.......................................... - // sqrdmulh v9.8H, v9.8H, v21.8H // ................................................*......................................... - // add v11.8H, v17.8H, v24.8H // .............................................*............................................ - // mls v2.8H, v22.8H, v7.H[0] // .....................................................*.................................... - // mls v0.8H, v9.8H, v7.H[0] // ...................................................*...................................... - // sub v9.8H, v17.8H, v24.8H // ....................................................*..................................... - // add v8.8H, v11.8H, v2.8H // .........................................................*................................ 
- // sub v2.8H, v11.8H, v2.8H // ........................................................*................................. - // add v10.8H, v9.8H, v0.8H // ......................................................*................................... - // sub v11.8H, v9.8H, v0.8H // .......................................................*.................................. - // trn2 v29.4S, v8.4S, v2.4S // ................................................................*......................... - // trn1 v30.4S, v8.4S, v2.4S // ............................................................*............................. - // trn2 v20.4S, v10.4S, v11.4S // ..........................................................*............................... - // trn1 v11.4S, v10.4S, v11.4S // ...........................................................*.............................. - // sqdmulh v2.8H, v30.8H, v7.H[1] // ...............................................................*.......................... - // sqdmulh v18.8H, v29.8H, v7.H[1] // ....................................................................*..................... - // sqdmulh v9.8H, v11.8H, v7.H[1] // .............................................................*............................ - // sqdmulh v22.8H, v20.8H, v7.H[1] // ..............................................................*........................... - // srshr v2.8H, v2.8H, #11 // ...................................................................*...................... - // srshr v27.8H, v9.8H, #11 // ..................................................................*....................... - // srshr v9.8H, v18.8H, #11 // ........................................................................*................. - // srshr v1.8H, v22.8H, #11 // .................................................................*........................ 
- // mls v30.8H, v2.8H, v7.H[0] // .......................................................................*.................. - // mls v11.8H, v27.8H, v7.H[0] // .....................................................................*.................... - // mls v20.8H, v1.8H, v7.H[0] // ......................................................................*................... - // mls v29.8H, v9.8H, v7.H[0] // ............................................................................*............. - // umov x24, v30.d[0] // ....................................................................................*..... - // umov x10, v30.d[1] // .............................................................................*............ - // umov x13, v29.d[0] // ...................................................................................*...... - // umov x19, v20.d[0] // ..........................................................................*............... - // str x10, [x1, #32] // .....................................................................................*.... - // umov x10, v20.d[1] // ...........................................................................*.............. - // umov x25, v29.d[1] // .................................................................................*........ - // str x24, [x1], #( 16*4) // .........................................................................................* - // str x13, [x1, #-48] // ........................................................................................*. - // str x19, [x1, #-40] // ................................................................................*......... - // umov x13, v11.d[0] // .........................................................................*................ - // str x25, [x1, #-16] // .......................................................................................*.. 
- // umov x19, v11.d[1] // ...............................................................................*.......... - // str x10, [x1, #-8] // ..................................................................................*....... - // str x13, [x1, #-56] // ..............................................................................*........... - // str x19, [x1, #-24] // ......................................................................................*... + // Instructions: 34 + // Expected cycles: 24 + // Expected IPC: 1.42 + // + // Cycle bound: 24.0 + // IPC bound: 1.42 + // + // Wall time: 0.78s + // User time: 0.78s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + sub v6.8H, v26.8H, v16.8H // *................................. + trn1 v15.4S, v27.4S, v14.4S // ..*............................... + // gap // .................................. + // gap // .................................. + add v16.8H, v26.8H, v16.8H // .*................................ + trn2 v27.4S, v27.4S, v14.4S // ...*.............................. + // gap // .................................. + // gap // .................................. + sqdmulh v14.8H, v15.8H, v7.H[1] // .......*.......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + trn1 v26.4S, v16.4S, v6.4S // ....*............................. + trn2 v16.4S, v16.4S, v6.4S // .....*............................ + // gap // .................................. + // gap // .................................. + sqdmulh v6.8H, v27.8H, v7.H[1] // ......*........................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v14.8H, v14.8H, #11 // ...........*...................... + sqdmulh v0.8H, v26.8H, v7.H[1] // ........*......................... 
+ // gap // .................................. + // gap // .................................. + sqdmulh v11.8H, v16.8H, v7.H[1] // .........*........................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v6.8H, v6.8H, #11 // ..........*....................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v15.8H, v14.8H, v7.H[0] // .............*.................... + srshr v14.8H, v0.8H, #11 // ............*..................... + // gap // .................................. + // gap // .................................. + srshr v0.8H, v11.8H, #11 // ..............*................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v27.8H, v6.8H, v7.H[0] // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v26.8H, v14.8H, v7.H[0] // ......................*........... + umov x27, v15.d[0] // .................*................ + // gap // .................................. + // gap // .................................. + mls v16.8H, v0.8H, v7.H[0] // ................*................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x19, v15.d[1] // .....................*............ + umov x13, v27.d[0] // ..................*............... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ umov x12, v27.d[1] // .......................*.......... + str x27, [x1], #( 16*4) // ........................*......... + umov x27, v16.d[0] // ....................*............. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x14, v26.d[0] // .........................*........ + umov x15, v26.d[1] // ..........................*....... + str x13, [x1, #-48] // ...................*.............. + // gap // .................................. + str x19, [x1, #-32] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x19, v16.d[1] // ...............................*.. + str x12, [x1, #-16] // ............................*..... + // gap // .................................. + // gap // .................................. + str x27, [x1, #-40] // ................................*. + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x14, [x1, #-56] // .............................*.... + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x15, [x1, #-24] // ..............................*... + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x19, [x1, #-8] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // sub v15.8H, v26.8H, v16.8H // *................................. + // add v13.8H, v26.8H, v16.8H // ..*............................... + // trn1 v9.4S, v27.4S, v14.4S // .*................................ + // trn2 v0.4S, v27.4S, v14.4S // ...*.............................. + // trn1 v11.4S, v13.4S, v15.4S // .....*............................ + // trn2 v19.4S, v13.4S, v15.4S // ......*........................... + // sqdmulh v26.8H, v0.8H, v7.H[1] // .......*.......................... + // sqdmulh v16.8H, v9.8H, v7.H[1] // ....*............................. + // sqdmulh v15.8H, v11.8H, v7.H[1] // .........*........................ + // sqdmulh v27.8H, v19.8H, v7.H[1] // ..........*....................... + // srshr v26.8H, v26.8H, #11 // ...........*...................... + // srshr v16.8H, v16.8H, #11 // ........*......................... + // srshr v14.8H, v15.8H, #11 // .............*.................... + // mls v9.8H, v16.8H, v7.H[0] // ............*..................... + // srshr v16.8H, v27.8H, #11 // ..............*................... + // mls v0.8H, v26.8H, v7.H[0] // ...............*.................. + // mls v19.8H, v16.8H, v7.H[0] // ..................*............... + // umov x15, v9.d[0] // .................*................ + // umov x27, v0.d[0] // ....................*............. + // str x27, [x1, #16] // ..........................*....... + // umov x27, v19.d[0] // .......................*.......... + // umov x9, v9.d[1] // ...................*.............. + // mls v11.8H, v14.8H, v7.H[0] // ................*................. + // umov x12, v0.d[1] // .....................*............ + // str x15, [x1], #( 16*4) // ......................*........... + // umov x13, v11.d[0] // ........................*......... + // umov x19, v11.d[1] // .........................*........ + // str x9, [x1, #-32] // ...........................*...... 
+ // str x12, [x1, #-16] // .............................*.... + // str x13, [x1, #-56] // ...............................*.. + // str x19, [x1, #-24] // ................................*. + // umov x11, v19.d[1] // ............................*..... + // str x27, [x1, #-40] // ..............................*... + // str x11, [x1, #-8] // .................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a55.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a55.s index 2d2a5c28..c3fb882c 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a55.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a55.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -66,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -83,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -102,43 +73,43 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr 
qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm @@ -165,7 +136,7 @@ .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -176,7 +147,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -186,7 +157,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp 
d10, d11, [sp, #16*1] @@ -194,7 +165,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -205,19 +176,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -230,7 +201,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_store_opt_a55 - .global _ntt_kyber_123_4567_scalar_store_opt_a55 + .global _ntt_kyber_123_4567_scalar_store .p2align 4 const_addr: .short 3329 @@ -356,1081 +327,1169 @@ _ntt_kyber_123_4567_scalar_store_opt_a55: load_roots_123 .p2align 2 - ldr_vo v4, x0, 0 // *......... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v12, x0, 64 // .*........ - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v30, x0, 128 // ..*....... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v18, x0, 192 // ...*...... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v20, x0, 256 // ....*..... - // gap // .......... - // gap // .......... - // gap // .......... - ldr_vo v26, x0, 448 // .......*.. - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v5.8H, v20.8H, v0.H[1] // ......*... - // gap // .......... - ldr_vo v6, x0, 320 // .....*.... - // gap // .......... - // gap // .......... - // gap // .......... - sqrdmulh v24.8H, v26.8H, v0.H[1] // .........* - // gap // .......... - ldr_vo v25, x0, 384 // ........*. 
- // gap // .......... - - // original source code - // ldr_vo v4, x0, 0 // *......... || *................ - // ldr_vo v12, x0, 64 // .*........ || ..*.............. - // ldr_vo v30, x0, 128 // ..*....... || ....*............ - // ldr_vo v18, x0, 192 // ...*...... || ......*.......... - // ldr_vo v20, x0, 256 // ....*..... || ........*........ - // ldr_vo v6, x0, 320 // .......*.. || .............*... - // sqrdmulh v5.8H, v20.8H, v0.H[1] // ......*... || ............*.... - // ldr_vo v26, x0, 448 // .....*.... || ..........*...... - // ldr_vo v25, x0, 384 // .........* || ................* - // sqrdmulh v24.8H, v26.8H, v0.H[1] // ........*. || ...............*. - + // Instructions: 9 + // Expected cycles: 15 + // Expected IPC: 0.60 + // + // Cycle bound: 15.0 + // IPC bound: 0.60 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q10, [x0, #384] // *............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x0, #64] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x0, #128] // ..*........................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q12, [x0, #192] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q22, [x0, #256] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q16, [x0, #448] // ........*..................... + // gap // .............................. + // gap // .............................. 
+ // gap // .............................. + sqrdmulh v19.8H, v22.8H, v0.H[1] // ......*....................... + // gap // .............................. + mul v26.8H, v22.8H, v0.H[0] // .......*...................... + // gap // .............................. + ldr q31, [x0, #320] // .....*........................ + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q10, [x0, #384] // *.............................. + // ldr q13, [x0, #64] // .*............................. + // ldr q11, [x0, #128] // ..*............................ + // ldr q12, [x0, #192] // ...*........................... + // ldr q18, [x0, #256] // ....*.......................... + // ldr q31, [x0, #320] // ........*...................... + // sqrdmulh v19.8H, v18.8H, v0.H[1] // ......*........................ + // mul v26.8H, v18.8H, v0.H[0] // .......*....................... + // ldr q16, [x0, #448] // .....*......................... + sub count, count, #1 -.p2align 2 layer123_start: - mul v20.8H, v20.8H, v0.H[0] // ........*................................................................... - // gap // ............................................................................ - mul v13.8H, v6.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ - sqrdmulh v9.8H, v6.8H, v0.H[1] // ..............*............................................................. - // gap // ............................................................................ - mul v10.8H, v25.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - mls v20.8H, v5.8H, v7.H[0] // ..........*................................................................. 
- // gap // ............................................................................ - sqrdmulh v6.8H, v25.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - mls v13.8H, v9.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - mul v9.8H, v26.8H, v0.H[0] // .......................*.................................................... - // gap // ............................................................................ - sub v5.8H, v4.8H, v20.8H // ...........*................................................................ - // gap // ............................................................................ - mls v10.8H, v6.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - add v20.8H, v4.8H, v20.8H // ............*............................................................... - // gap // ............................................................................ - sub v6.8H, v12.8H, v13.8H // ................*........................................................... - // gap // ............................................................................ - add v13.8H, v12.8H, v13.8H // .................*.......................................................... - // gap // ............................................................................ - sub v26.8H, v30.8H, v10.8H // .....................*...................................................... - // gap // ............................................................................ - add v10.8H, v30.8H, v10.8H // ......................*..................................................... 
- // gap // ............................................................................ - mls v9.8H, v24.8H, v7.H[0] // .........................*.................................................. - // gap // ............................................................................ - mul v30.8H, v26.8H, v0.H[4] // ......................................*..................................... - // gap // ............................................................................ - sqrdmulh v26.8H, v26.8H, v0.H[5] // .......................................*.................................... - // gap // ............................................................................ - mul v4.8H, v10.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - sub v12.8H, v18.8H, v9.8H // ..........................*................................................. - // gap // ............................................................................ - add v9.8H, v18.8H, v9.8H // ...........................*................................................ - // gap // ............................................................................ - mls v30.8H, v26.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - sqrdmulh v10.8H, v10.8H, v0.H[3] // .............................*.............................................. - // gap // ............................................................................ - mul v26.8H, v12.8H, v0.H[4] // ...........................................*................................ - // gap // ............................................................................ - sqrdmulh v12.8H, v12.8H, v0.H[5] // ............................................*............................... 
- // gap // ............................................................................ - sub v18.8H, v5.8H, v30.8H // .........................................*.................................. - // gap // ............................................................................ - add v5.8H, v5.8H, v30.8H // ..........................................*................................. - // gap // ............................................................................ - mls v4.8H, v10.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - mul v10.8H, v9.8H, v0.H[2] // .................................*.......................................... - // gap // ............................................................................ - mls v26.8H, v12.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - sqrdmulh v9.8H, v9.8H, v0.H[3] // ..................................*......................................... - // gap // ............................................................................ - sub v30.8H, v20.8H, v4.8H // ...............................*............................................ - // gap // ............................................................................ - add v20.8H, v20.8H, v4.8H // ................................*........................................... - // gap // ............................................................................ - sub v4.8H, v6.8H, v26.8H // ..............................................*............................. - // gap // ............................................................................ - add v6.8H, v6.8H, v26.8H // ...............................................*............................ 
- // gap // ............................................................................ - mls v10.8H, v9.8H, v7.H[0] // ...................................*........................................ - // gap // ............................................................................ - mul v9.8H, v4.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - mul v26.8H, v6.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - sqrdmulh v6.8H, v6.8H, v1.H[3] // ...........................................................*................ - // gap // ............................................................................ - sub v12.8H, v13.8H, v10.8H // ....................................*....................................... - // gap // ............................................................................ - add v13.8H, v13.8H, v10.8H // .....................................*...................................... - // gap // ............................................................................ - sqrdmulh v10.8H, v4.8H, v1.H[5] // ................................................................*........... - // gap // ............................................................................ - mul v4.8H, v12.8H, v1.H[0] // .....................................................*...................... - // gap // ............................................................................ - mul v25.8H, v13.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - sqrdmulh v13.8H, v13.8H, v0.H[7] // .................................................*.......................... 
- // gap // ............................................................................ - sqrdmulh v12.8H, v12.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - mls v26.8H, v6.8H, v7.H[0] // ............................................................*............... - // gap // ............................................................................ - mls v9.8H, v10.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - mls v25.8H, v13.8H, v7.H[0] // ..................................................*......................... - // gap // ............................................................................ - mls v4.8H, v12.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - sub v13.8H, v5.8H, v26.8H // .............................................................*.............. - // gap // ............................................................................ - sub v10.8H, v18.8H, v9.8H // ..................................................................*......... - // gap // ............................................................................ - add v9.8H, v18.8H, v9.8H // ...................................................................*........ - // gap // ............................................................................ - add v6.8H, v5.8H, v26.8H // ..............................................................*............. - // gap // ............................................................................ - sub v5.8H, v20.8H, v25.8H // ...................................................*........................ 
- // gap // ............................................................................ - add v20.8H, v20.8H, v25.8H // ....................................................*....................... - // gap // ............................................................................ - sub v26.8H, v30.8H, v4.8H // ........................................................*................... - // gap // ............................................................................ - add v30.8H, v30.8H, v4.8H // .........................................................*.................. - // gap // ............................................................................ - str_vi v20, x0, 16 // ....................................................................*....... - // gap // ............................................................................ - ldr_vo v4, x0, 0 // e........................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v5, x0, 48 // .....................................................................*...... - // gap // ............................................................................ - ldr_vo v12, x0, 64 // .e.......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v30, x0, 112 // ......................................................................*..... - // gap // ............................................................................ 
- ldr_vo v30, x0, 128 // ..e......................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v26, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - ldr_vo v18, x0, 192 // ...e........................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v6, x0, 240 // ........................................................................*... - // gap // ............................................................................ - ldr_vo v20, x0, 256 // ....e....................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v13, x0, 304 // .........................................................................*.. - // gap // ............................................................................ - ldr_vo v6, x0, 320 // .....e...................................................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - str_vo v9, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - sqrdmulh v5.8H, v20.8H, v0.H[1] // .........e.................................................................. - // gap // ............................................................................ - str_vo v10, x0, 432 // ...........................................................................* - // gap // ............................................................................ - ldr_vo v26, x0, 448 // .......e.................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - ldr_vo v25, x0, 384 // ......e..................................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.8H, v26.8H, v0.H[1] // ........................e................................................... - // gap // ............................................................................ - - // original source code - // ldr_vo v8, x0, 0 // e......................................................................................... || e....................................................................................................... - // ldr_vo v9, x0, 64 // ..e....................................................................................... 
|| ...e.................................................................................................... - // ldr_vo v10, x0, 128 // ....e..................................................................................... || ......e................................................................................................. - // ldr_vo v11, x0, 192 // ......e................................................................................... || .........e.............................................................................................. - // ldr_vo v12, x0, 256 // ........e................................................................................. || ............e........................................................................................... - // ldr_vo v13, x0, 320 // ..........e............................................................................... || ...............e........................................................................................ - // ldr_vo v14, x0, 384 // ...............e.......................................................................... || ......................e................................................................................. - // ldr_vo v15, x0, 448 // ..............e........................................................................... || ....................e................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // .................*........................................................................ || .........................*.............................................................................. - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............e............................................................................. || ..................e..................................................................................... 
- // mls v24.8H, v12.8H, v7.H[0] // .....................*.................................................................... || .............................*.......................................................................... - // sub v12.8H, v8.8H, v24.8H // .........................*................................................................ || .................................*...................................................................... - // add v8.8H, v8.8H, v24.8H // ...........................*.............................................................. || ...................................*.................................................................... - // mul v24.8H, v13.8H, v0.H[0] // ..................*....................................................................... || ..........................*............................................................................. - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ...................*...................................................................... || ...........................*............................................................................ - // mls v24.8H, v13.8H, v7.H[0] // .......................*.................................................................. || ...............................*........................................................................ - // sub v13.8H, v9.8H, v24.8H // ............................*............................................................. || ....................................*................................................................... - // add v9.8H, v9.8H, v24.8H // .............................*............................................................ || .....................................*.................................................................. 
- // mul v24.8H, v14.8H, v0.H[0] // ....................*..................................................................... || ............................*........................................................................... - // sqrdmulh v14.8H, v14.8H, v0.H[1] // ......................*................................................................... || ..............................*......................................................................... - // mls v24.8H, v14.8H, v7.H[0] // ..........................*............................................................... || ..................................*..................................................................... - // sub v14.8H, v10.8H, v24.8H // ..............................*........................................................... || ......................................*................................................................. - // add v10.8H, v10.8H, v24.8H // ...............................*.......................................................... || .......................................*................................................................ - // mul v24.8H, v15.8H, v0.H[0] // ........................*................................................................. || ................................*....................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ................e......................................................................... || ........................e............................................................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................*......................................................... || ........................................*............................................................... 
- // sub v15.8H, v11.8H, v24.8H // ....................................*..................................................... || ............................................*........................................................... - // add v11.8H, v11.8H, v24.8H // .....................................*.................................................... || .............................................*.......................................................... - // mul v24.8H, v10.8H, v0.H[2] // ...................................*...................................................... || ...........................................*............................................................ - // sqrdmulh v10.8H, v10.8H, v0.H[3] // .......................................*.................................................. || ...............................................*........................................................ - // mls v24.8H, v10.8H, v7.H[0] // ............................................*............................................. || ....................................................*................................................... - // sub v10.8H, v8.8H, v24.8H // ................................................*......................................... || ........................................................*............................................... - // add v8.8H, v8.8H, v24.8H // .................................................*........................................ || .........................................................*.............................................. - // mul v24.8H, v11.8H, v0.H[2] // .............................................*............................................ || .....................................................*.................................................. 
- // sqrdmulh v11.8H, v11.8H, v0.H[3] // ...............................................*.......................................... || .......................................................*................................................ - // mls v24.8H, v11.8H, v7.H[0] // ....................................................*..................................... || ............................................................*........................................... - // sub v11.8H, v9.8H, v24.8H // ........................................................*................................. || ................................................................*....................................... - // add v9.8H, v9.8H, v24.8H // .........................................................*................................ || .................................................................*...................................... - // mul v24.8H, v14.8H, v0.H[4] // .................................*........................................................ || .........................................*.............................................................. - // sqrdmulh v14.8H, v14.8H, v0.H[5] // ..................................*....................................................... || ..........................................*............................................................. - // mls v24.8H, v14.8H, v7.H[0] // ......................................*................................................... || ..............................................*......................................................... - // sub v14.8H, v12.8H, v24.8H // ..........................................*............................................... || ..................................................*..................................................... 
- // add v12.8H, v12.8H, v24.8H // ...........................................*.............................................. || ...................................................*.................................................... - // mul v24.8H, v15.8H, v0.H[4] // ........................................*................................................. || ................................................*....................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // .........................................*................................................ || .................................................*...................................................... - // mls v24.8H, v15.8H, v7.H[0] // ..............................................*........................................... || ......................................................*................................................. - // sub v15.8H, v13.8H, v24.8H // ..................................................*....................................... || ..........................................................*............................................. - // add v13.8H, v13.8H, v24.8H // ...................................................*...................................... || ...........................................................*............................................ - // mul v24.8H, v9.8H, v0.H[6] // ............................................................*............................. || ....................................................................*................................... - // sqrdmulh v9.8H, v9.8H, v0.H[7] // .............................................................*............................ || .....................................................................*.................................. 
- // mls v24.8H, v9.8H, v7.H[0] // .................................................................*........................ || .........................................................................*.............................. - // sub v9.8H, v8.8H, v24.8H // .......................................................................*.................. || ...............................................................................*........................ - // add v8.8H, v8.8H, v24.8H // ........................................................................*................. || ................................................................................*....................... - // mul v24.8H, v11.8H, v1.H[0] // ...........................................................*.............................. || ...................................................................*.................................... - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..............................................................*........................... || ......................................................................*................................. - // mls v24.8H, v11.8H, v7.H[0] // ..................................................................*....................... || ..........................................................................*............................. - // sub v11.8H, v10.8H, v24.8H // .........................................................................*................ || .................................................................................*...................... - // add v10.8H, v10.8H, v24.8H // ..........................................................................*............... || ..................................................................................*..................... 
- // mul v24.8H, v13.8H, v1.H[2] // ......................................................*................................... || ..............................................................*......................................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................*.................................. || ...............................................................*........................................ - // mls v24.8H, v13.8H, v7.H[0] // ...............................................................*.......................... || .......................................................................*................................ - // sub v13.8H, v12.8H, v24.8H // ...................................................................*...................... || ...........................................................................*............................ - // add v12.8H, v12.8H, v24.8H // ......................................................................*................... || ..............................................................................*......................... - // mul v24.8H, v15.8H, v1.H[4] // .....................................................*.................................... || .............................................................*.......................................... - // sqrdmulh v15.8H, v15.8H, v1.H[5] // ..........................................................*............................... || ..................................................................*..................................... - // mls v24.8H, v15.8H, v7.H[0] // ................................................................*......................... || ........................................................................*............................... 
- // sub v15.8H, v14.8H, v24.8H // ....................................................................*..................... || ............................................................................*........................... - // add v14.8H, v14.8H, v24.8H // .....................................................................*.................... || .............................................................................*.......................... - // str_vi v8, x0, 16 // ...........................................................................*.............. || ...................................................................................*.................... - // str_vo v9, x0, 48 // .............................................................................*............ || ......................................................................................*................. - // str_vo v10, x0, 112 // ...............................................................................*.......... || .........................................................................................*.............. - // str_vo v11, x0, 176 // .................................................................................*........ || ............................................................................................*........... - // str_vo v12, x0, 240 // ...................................................................................*...... || ...............................................................................................*........ - // str_vo v13, x0, 304 // .....................................................................................*.... || ..................................................................................................*..... - // str_vo v14, x0, 368 // .......................................................................................*.. 
|| .....................................................................................................*.. - // str_vo v15, x0, 432 // .........................................................................................* || .......................................................................................................* - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 84 + // Expected IPC: 0.90 + // + // Cycle bound: 84.0 + // IPC bound: 0.90 + // + // Wall time: 6.12s + // User time: 6.12s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q29, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v18.8H, v31.8H, v0.H[1] // .............*.............................................................. + // gap // ............................................................................ + mul v6.8H, v31.8H, v0.H[0] // ..............*............................................................. + // gap // ............................................................................ + sqrdmulh v25.8H, v16.8H, v0.H[1] // .......................*.................................................... + // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[0] // ........................*................................................... + // gap // ............................................................................ 
+ sqrdmulh v31.8H, v10.8H, v0.H[1] // ..................*......................................................... + // gap // ............................................................................ + mls v6.8H, v18.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + mul v18.8H, v10.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + mls v16.8H, v25.8H, v7.H[0] // .........................*.................................................. + // gap // ............................................................................ + mls v26.8H, v19.8H, v7.H[0] // ..........*................................................................. + // gap // ............................................................................ + sub v19.8H, v13.8H, v6.8H // ................*........................................................... + // gap // ............................................................................ + mls v18.8H, v31.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + sub v25.8H, v12.8H, v16.8H // ..........................*................................................. + // gap // ............................................................................ + sub v8.8H, v29.8H, v26.8H // ...........*................................................................ + // gap // ............................................................................ + add v5.8H, v29.8H, v26.8H // ............*............................................................... + // gap // ............................................................................ 
+ sub v26.8H, v11.8H, v18.8H // .....................*...................................................... + // gap // ............................................................................ + add v18.8H, v11.8H, v18.8H // ......................*..................................................... + // gap // ............................................................................ + mul v29.8H, v25.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + sqrdmulh v2.8H, v26.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + mul v9.8H, v26.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sqrdmulh v11.8H, v18.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + mul v18.8H, v18.8H, v0.H[2] // .............................*.............................................. + // gap // ............................................................................ + add v16.8H, v12.8H, v16.8H // ...........................*................................................ + // gap // ............................................................................ + mls v9.8H, v2.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v25.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ 
+ mls v18.8H, v11.8H, v7.H[0] // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v11.8H, v16.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + ldr q10, [x0, #400] // ......e..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v2.8H, v5.8H, v18.8H // ...............................*............................................ + // gap // ............................................................................ + add v18.8H, v5.8H, v18.8H // ................................*........................................... + // gap // ............................................................................ + mls v29.8H, v20.8H, v7.H[0] // .............................................*.............................. + // gap // ............................................................................ + add v6.8H, v13.8H, v6.8H // .................*.......................................................... + // gap // ............................................................................ + mul v16.8H, v16.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ + add v31.8H, v8.8H, v9.8H // ..........................................*................................. + // gap // ............................................................................ 
+ add v25.8H, v19.8H, v29.8H // ...............................................*............................ + // gap // ............................................................................ + sub v26.8H, v19.8H, v29.8H // ..............................................*............................. + // gap // ............................................................................ + mls v16.8H, v11.8H, v7.H[0] // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v19.8H, v25.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + mul v25.8H, v25.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + sqrdmulh v29.8H, v26.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + sub v3.8H, v6.8H, v16.8H // ....................................*....................................... + // gap // ............................................................................ + add v6.8H, v6.8H, v16.8H // .....................................*...................................... + // gap // ............................................................................ + mul v16.8H, v26.8H, v1.H[4] // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v20.8H, v3.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ 
+ sqrdmulh v26.8H, v6.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + mul v11.8H, v6.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + mul v12.8H, v3.8H, v1.H[0] // ......................................................*..................... + // gap // ............................................................................ + mls v25.8H, v19.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + mls v16.8H, v29.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + mls v11.8H, v26.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + mls v12.8H, v20.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v29.8H, v31.8H, v25.8H // .............................................................*.............. + // gap // ............................................................................ + add v31.8H, v31.8H, v25.8H // ..............................................................*............. + // gap // ............................................................................ + sub v25.8H, v18.8H, v11.8H // ...................................................*........................ + // gap // ............................................................................ 
+ add v18.8H, v18.8H, v11.8H // ....................................................*....................... + // gap // ............................................................................ + sub v11.8H, v8.8H, v9.8H // .........................................*.................................. + // gap // ............................................................................ + add v26.8H, v2.8H, v12.8H // .........................................................*.................. + // gap // ............................................................................ + sub v19.8H, v2.8H, v12.8H // ........................................................*................... + // gap // ............................................................................ + str q18, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sub v6.8H, v11.8H, v16.8H // ..................................................................*......... + // gap // ............................................................................ + str q25, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + add v20.8H, v11.8H, v16.8H // ...................................................................*........ + // gap // ............................................................................ + ldr q13, [x0, #64] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ str q26, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + ldr q11, [x0, #128] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + ldr q12, [x0, #192] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q31, [x0, #240] // ........................................................................*... + // gap // ............................................................................ + ldr q18, [x0, #256] // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q29, [x0, #304] // .........................................................................*.. + // gap // ............................................................................ + ldr q31, [x0, #320] // .....e...................................................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q20, [x0, #368] // ..........................................................................*. + // gap // ............................................................................ + sqrdmulh v19.8H, v18.8H, v0.H[1] // ........e................................................................... + // gap // ............................................................................ + str q6, [x0, #432] // ...........................................................................* + // gap // ............................................................................ + mul v26.8H, v18.8H, v0.H[0] // .........e.................................................................. + // gap // ............................................................................ + ldr q16, [x0, #448] // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + + // ------------------------------------------------------ new position ------------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|---------------------- + // ldr q8, [x0, #0] // .................................................*......................................................................... 
+ // ldr q9, [x0, #(1*(512/8))] // ...................................e.............'.............................................................~........... + // ldr q10, [x0, #(2*(512/8))] // .....................................e...........'...............................................................~......... + // ldr q11, [x0, #(3*(512/8))] // .......................................e.........'.................................................................~....... + // ldr q12, [x0, #(4*(512/8))] // .........................................e.......'...................................................................~..... + // ldr q13, [x0, #(5*(512/8))] // ...........................................e.....'.....................................................................~... + // ldr q14, [x0, #(6*(512/8))] // e................................................'..........................~.............................................. + // ldr q15, [x0, #(7*(512/8))] // ................................................e'......................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // .............................................e...'.......................................................................~. + // mul v24.8h, v12.8h, v0.h[0] // ...............................................e.'......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................'........*................................................................ + // sub v12.8h, v8.8h, v24.8h // .................................................'............*............................................................ + // add v8.8h, v8.8h, v24.8h // .................................................'.............*........................................................... 
+ // sqrdmulh v27.8h, v13.8h, v0.h[1] // .................................................'*........................................................................ + // mul v24.8h, v13.8h, v0.h[0] // .................................................'.*....................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................'.....*................................................................... + // sub v13.8h, v9.8h, v24.8h // .................................................'.........*............................................................... + // add v9.8h, v9.8h, v24.8h // ....~............................................'..............................*.......................................... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // .................................................'....*.................................................................... + // mul v24.8h, v14.8h, v0.h[0] // .................................................'......*.................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................................'..........*.............................................................. + // sub v14.8h, v10.8h, v24.8h // .................................................'..............*.......................................................... + // add v10.8h, v10.8h, v24.8h // .................................................'...............*......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .................................................'..*...................................................................... + // mul v24.8h, v15.8h, v0.h[0] // .................................................'...*..................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .................................................'.......*................................................................. + // sub v15.8h, v11.8h, v24.8h // .................................................'...........*............................................................. + // add v11.8h, v11.8h, v24.8h // .................................................'.....................*................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // .................................................'...................*..................................................... + // mul v24.8h, v10.8h, v0.h[2] // .................................................'....................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................'........................*................................................ + // sub v10.8h, v8.8h, v24.8h // .~...............................................'...........................*............................................. + // add v8.8h, v8.8h, v24.8h // ..~..............................................'............................*............................................ + // sqrdmulh v27.8h, v11.8h, v0.h[3] // .................................................'.........................*............................................... + // mul v24.8h, v11.8h, v0.h[2] // .....~...........................................'...............................*......................................... + // mls v24.8h, v27.8h, v7.h[0] // .........~.......................................'...................................*..................................... + // sub v11.8h, v9.8h, v24.8h // .............~...................................'.......................................*................................. 
+ // add v9.8h, v9.8h, v24.8h // ..............~..................................'........................................*................................ + // sqrdmulh v27.8h, v14.8h, v0.h[5] // .................................................'.................*....................................................... + // mul v24.8h, v14.8h, v0.h[4] // .................................................'..................*...................................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................'......................*.................................................. + // sub v14.8h, v12.8h, v24.8h // ............................~....................'......................................................*.................. + // add v12.8h, v12.8h, v24.8h // ......~..........................................'................................*........................................ + // sqrdmulh v27.8h, v15.8h, v0.h[5] // .................................................'.......................*................................................. + // mul v24.8h, v15.8h, v0.h[4] // .................................................'................*........................................................ + // mls v24.8h, v27.8h, v7.h[0] // ...~.............................................'.............................*........................................... + // sub v15.8h, v13.8h, v24.8h // ........~........................................'..................................*...................................... + // add v13.8h, v13.8h, v24.8h // .......~.........................................'.................................*....................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // .................~...............................'...........................................*............................. 
+ // mul v24.8h, v9.8h, v0.h[6] // ..................~..............................'............................................*............................ + // mls v24.8h, v27.8h, v7.h[0] // ......................~..........................'................................................*........................ + // sub v9.8h, v8.8h, v24.8h // ..........................~......................'....................................................*.................... + // add v8.8h, v8.8h, v24.8h // ...........................~.....................'.....................................................*................... + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ................~................................'..........................................*.............................. + // mul v24.8h, v11.8h, v1.h[0] // ...................~.............................'.............................................*........................... + // mls v24.8h, v27.8h, v7.h[0] // .......................~.........................'.................................................*....................... + // sub v11.8h, v10.8h, v24.8h // ..............................~..................'........................................................*................ + // add v10.8h, v10.8h, v24.8h // .............................~...................'.......................................................*................. + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ..........~......................................'....................................*.................................... + // mul v24.8h, v13.8h, v1.h[2] // ...........~.....................................'.....................................*................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................~............................'..............................................*.......................... 
+ // sub v13.8h, v12.8h, v24.8h // ........................~........................'..................................................*...................... + // add v12.8h, v12.8h, v24.8h // .........................~.......................'...................................................*..................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ............~....................................'......................................*.................................. + // mul v24.8h, v15.8h, v1.h[4] // ...............~.................................'.........................................*............................... + // mls v24.8h, v27.8h, v7.h[0] // .....................~...........................'...............................................*......................... + // sub v15.8h, v14.8h, v24.8h // ................................~................'..........................................................*.............. + // add v14.8h, v14.8h, v24.8h // ..................................~..............'............................................................*............ + // str q8, [x0], #(16) // ...............................~.................'.........................................................*............... + // str q9, [x0, #(-16 + 1*(512/8))] // .................................~...............'...........................................................*............. + // str q10, [x0, #(-16 + 2*(512/8))] // ....................................~............'..............................................................*.......... + // str q11, [x0, #(-16 + 3*(512/8))] // ......................................~..........'................................................................*........ + // str q12, [x0, #(-16 + 4*(512/8))] // ........................................~........'..................................................................*...... 
+ // str q13, [x0, #(-16 + 5*(512/8))] // ..........................................~......'....................................................................*.... + // str q14, [x0, #(-16 + 6*(512/8))] // ............................................~....'......................................................................*.. + // str q15, [x0, #(-16 + 7*(512/8))] // ..............................................~..'........................................................................* + + sub count, count, #1 cbnz count, layer123_start - mul v29.8H, v26.8H, v0.H[0] // .......*.......................................................... - // gap // .................................................................. - mul v14.8H, v20.8H, v0.H[0] // *................................................................. - // gap // .................................................................. - mul v3.8H, v6.8H, v0.H[0] // .*................................................................ - // gap // .................................................................. - sqrdmulh v8.8H, v6.8H, v0.H[1] // ..*............................................................... - // gap // .................................................................. - mls v29.8H, v24.8H, v7.H[0] // ...............*.................................................. - // gap // .................................................................. - mul v20.8H, v25.8H, v0.H[0] // ...*.............................................................. - // gap // .................................................................. - sqrdmulh v22.8H, v25.8H, v0.H[1] // .....*............................................................ - // gap // .................................................................. - mls v3.8H, v8.8H, v7.H[0] // ......*........................................................... - // gap // .................................................................. 
- add v26.8H, v18.8H, v29.8H // ....................*............................................. - // gap // .................................................................. - sub v8.8H, v18.8H, v29.8H // ...................*.............................................. - // gap // .................................................................. - mls v20.8H, v22.8H, v7.H[0] // .........*........................................................ - // gap // .................................................................. - mul v19.8H, v26.8H, v0.H[2] // ............................*..................................... - // gap // .................................................................. - mls v14.8H, v5.8H, v7.H[0] // ....*............................................................. - // gap // .................................................................. - sub v10.8H, v12.8H, v3.8H // ...........*...................................................... - // gap // .................................................................. - add v23.8H, v12.8H, v3.8H // ............*..................................................... - // gap // .................................................................. - sub v5.8H, v30.8H, v20.8H // .............*.................................................... - // gap // .................................................................. - add v20.8H, v30.8H, v20.8H // ..............*................................................... - // gap // .................................................................. - sub v17.8H, v4.8H, v14.8H // ........*......................................................... - // gap // .................................................................. - mul v24.8H, v8.8H, v0.H[4] // .......................*.......................................... - // gap // .................................................................. 
- mul v11.8H, v20.8H, v0.H[2] // ..................*............................................... - // gap // .................................................................. - sqrdmulh v27.8H, v20.8H, v0.H[3] // ......................*........................................... - // gap // .................................................................. - add v20.8H, v4.8H, v14.8H // ..........*....................................................... - // gap // .................................................................. - sqrdmulh v15.8H, v5.8H, v0.H[5] // .................*................................................ - // gap // .................................................................. - sqrdmulh v13.8H, v8.8H, v0.H[5] // ........................*......................................... - // gap // .................................................................. - mls v11.8H, v27.8H, v7.H[0] // ...........................*...................................... - // gap // .................................................................. - mul v22.8H, v5.8H, v0.H[4] // ................*................................................. - // gap // .................................................................. - sqrdmulh v16.8H, v26.8H, v0.H[3] // ..............................*................................... - // gap // .................................................................. - mls v24.8H, v13.8H, v7.H[0] // .............................*.................................... - // gap // .................................................................. - sub v8.8H, v20.8H, v11.8H // ...............................*.................................. - // gap // .................................................................. - mls v22.8H, v15.8H, v7.H[0] // .....................*............................................ - // gap // .................................................................. 
- mls v19.8H, v16.8H, v7.H[0] // ...................................*.............................. - // gap // .................................................................. - add v11.8H, v20.8H, v11.8H // ................................*................................. - // gap // .................................................................. - sub v25.8H, v10.8H, v24.8H // .................................*................................ - // gap // .................................................................. - sub v9.8H, v17.8H, v22.8H // .........................*........................................ - // gap // .................................................................. - add v14.8H, v23.8H, v19.8H // ........................................*......................... - // gap // .................................................................. - sub v19.8H, v23.8H, v19.8H // .......................................*.......................... - // gap // .................................................................. - add v16.8H, v17.8H, v22.8H // ..........................*....................................... - // gap // .................................................................. - add v20.8H, v10.8H, v24.8H // ..................................*............................... - // gap // .................................................................. - mul v6.8H, v25.8H, v1.H[4] // ....................................*............................. - // gap // .................................................................. - sqrdmulh v25.8H, v25.8H, v1.H[5] // .........................................*........................ - // gap // .................................................................. - mul v3.8H, v14.8H, v0.H[6] // ...........................................*...................... - // gap // .................................................................. 
- sqrdmulh v31.8H, v14.8H, v0.H[7] // ............................................*..................... - // gap // .................................................................. - mul v2.8H, v20.8H, v1.H[2] // .....................................*............................ - // gap // .................................................................. - sqrdmulh v17.8H, v19.8H, v1.H[1] // .............................................*.................... - // gap // .................................................................. - mul v19.8H, v19.8H, v1.H[0] // ..........................................*....................... - // gap // .................................................................. - mls v3.8H, v31.8H, v7.H[0] // ................................................*................. - // gap // .................................................................. - sqrdmulh v18.8H, v20.8H, v1.H[3] // ......................................*........................... - // gap // .................................................................. - mls v6.8H, v25.8H, v7.H[0] // ...............................................*.................. - // gap // .................................................................. - mls v19.8H, v17.8H, v7.H[0] // .................................................*................ - // gap // .................................................................. - add v20.8H, v11.8H, v3.8H // .......................................................*.......... - // gap // .................................................................. - mls v2.8H, v18.8H, v7.H[0] // ..............................................*................... - // gap // .................................................................. - sub v5.8H, v11.8H, v3.8H // ......................................................*........... - // gap // .................................................................. 
- str_vi v20, x0, 16 // ..........................................................*....... - // gap // .................................................................. - add v23.8H, v8.8H, v19.8H // .........................................................*........ - // gap // .................................................................. - str_vo v5, x0, 48 // ...........................................................*...... - // gap // .................................................................. - sub v19.8H, v8.8H, v19.8H // ........................................................*......... - // gap // .................................................................. - str_vo v23, x0, 112 // ............................................................*..... - // gap // .................................................................. - add v14.8H, v16.8H, v2.8H // .....................................................*............ - // gap // .................................................................. - str_vo v19, x0, 176 // .............................................................*.... - // gap // .................................................................. - sub v23.8H, v16.8H, v2.8H // ..................................................*............... - // gap // .................................................................. - str_vo v14, x0, 240 // ..............................................................*... - // gap // .................................................................. - add v19.8H, v9.8H, v6.8H // ....................................................*............. - // gap // .................................................................. - str_vo v23, x0, 304 // ...............................................................*.. - // gap // .................................................................. 
- sub v20.8H, v9.8H, v6.8H // ...................................................*.............. - // gap // .................................................................. - str_vo v19, x0, 368 // ................................................................*. - // gap // .................................................................. - // gap // .................................................................. - // gap // .................................................................. - str_vo v20, x0, 432 // .................................................................* - // gap // .................................................................. - - // original source code - // mul v20.8H, v20.8H, v0.H[0] // .*................................................................ || .*................................................................. - // mul v13.8H, v6.8H, v0.H[0] // ..*............................................................... || ..*................................................................ - // sqrdmulh v9.8H, v6.8H, v0.H[1] // ...*.............................................................. || ...*............................................................... - // mul v10.8H, v25.8H, v0.H[0] // .....*............................................................ || .....*............................................................. - // mls v20.8H, v5.8H, v7.H[0] // ............*..................................................... || ............*...................................................... - // sqrdmulh v6.8H, v25.8H, v0.H[1] // ......*........................................................... || ......*............................................................ - // mls v13.8H, v9.8H, v7.H[0] // .......*.......................................................... || .......*........................................................... 
- // mul v9.8H, v26.8H, v0.H[0] // *................................................................. || *.................................................................. - // sub v5.8H, v4.8H, v20.8H // .................*................................................ || .................*................................................. - // mls v10.8H, v6.8H, v7.H[0] // ..........*....................................................... || ..........*........................................................ - // add v20.8H, v4.8H, v20.8H // .....................*............................................ || .....................*............................................. - // sub v6.8H, v12.8H, v13.8H // .............*.................................................... || .............*..................................................... - // add v13.8H, v12.8H, v13.8H // ..............*................................................... || ..............*.................................................... - // sub v26.8H, v30.8H, v10.8H // ...............*.................................................. || ...............*................................................... - // add v10.8H, v30.8H, v10.8H // ................*................................................. || ................*.................................................. - // mls v9.8H, v24.8H, v7.H[0] // ....*............................................................. || ....*.............................................................. - // mul v30.8H, v26.8H, v0.H[4] // .........................*........................................ || .........................*......................................... - // sqrdmulh v26.8H, v26.8H, v0.H[5] // ......................*........................................... || ......................*............................................ 
- // mul v4.8H, v10.8H, v0.H[2] // ...................*.............................................. || ...................*............................................... - // sub v12.8H, v18.8H, v9.8H // .........*........................................................ || .........*......................................................... - // add v9.8H, v18.8H, v9.8H // ........*......................................................... || ........*.......................................................... - // mls v30.8H, v26.8H, v7.H[0] // .............................*.................................... || .............................*..................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // ....................*............................................. || ....................*.............................................. - // mul v26.8H, v12.8H, v0.H[4] // ..................*............................................... || ..................*................................................ - // sqrdmulh v12.8H, v12.8H, v0.H[5] // .......................*.......................................... || .......................*........................................... - // sub v18.8H, v5.8H, v30.8H // .................................*................................ || .................................*................................. - // add v5.8H, v5.8H, v30.8H // ....................................*............................. || ....................................*.............................. - // mls v4.8H, v10.8H, v7.H[0] // ........................*......................................... || ........................*.......................................... - // mul v10.8H, v9.8H, v0.H[2] // ...........*...................................................... || ...........*....................................................... 
- // mls v26.8H, v12.8H, v7.H[0] // ...........................*...................................... || ...........................*....................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ..........................*....................................... || ..........................*........................................ - // sub v30.8H, v20.8H, v4.8H // ............................*..................................... || ............................*...................................... - // add v20.8H, v20.8H, v4.8H // ...............................*.................................. || ...............................*................................... - // sub v4.8H, v6.8H, v26.8H // ................................*................................. || ................................*.................................. - // add v6.8H, v6.8H, v26.8H // .....................................*............................ || .....................................*............................. - // mls v10.8H, v9.8H, v7.H[0] // ..............................*................................... || ..............................*.................................... - // mul v9.8H, v4.8H, v1.H[4] // ......................................*........................... || ......................................*............................ - // mul v26.8H, v6.8H, v1.H[2] // ..........................................*....................... || ..........................................*........................ - // sqrdmulh v6.8H, v6.8H, v1.H[3] // ..............................................*................... || ..............................................*.................... - // sub v12.8H, v13.8H, v10.8H // ...................................*.............................. || ...................................*............................... 
- // add v13.8H, v13.8H, v10.8H // ..................................*............................... || ..................................*................................ - // sqrdmulh v10.8H, v4.8H, v1.H[5] // .......................................*.......................... || .......................................*........................... - // mul v4.8H, v12.8H, v1.H[0] // ............................................*..................... || ............................................*...................... - // mul v25.8H, v13.8H, v0.H[6] // ........................................*......................... || ........................................*.......................... - // sqrdmulh v13.8H, v13.8H, v0.H[7] // .........................................*........................ || .........................................*......................... - // sqrdmulh v12.8H, v12.8H, v1.H[1] // ...........................................*...................... || ...........................................*....................... - // mls v26.8H, v6.8H, v7.H[0] // ..................................................*............... || ..................................................*................ - // mls v9.8H, v10.8H, v7.H[0] // ...............................................*.................. || ...............................................*................... - // mls v25.8H, v13.8H, v7.H[0] // .............................................*.................... || .............................................*..................... - // mls v4.8H, v12.8H, v7.H[0] // ................................................*................. || ................................................*.................. - // sub v13.8H, v5.8H, v26.8H // ...........................................................*...... || ...........................................................*....... 
- // sub v10.8H, v18.8H, v9.8H // ...............................................................*.. || ...............................................................*... - // add v9.8H, v18.8H, v9.8H // .............................................................*.... || .............................................................*..... - // add v6.8H, v5.8H, v26.8H // .........................................................*........ || .........................................................*......... - // sub v5.8H, v20.8H, v25.8H // ...................................................*.............. || ...................................................*............... - // add v20.8H, v20.8H, v25.8H // .................................................*................ || .................................................*................. - // sub v26.8H, v30.8H, v4.8H // .......................................................*.......... || .......................................................*........... - // add v30.8H, v30.8H, v4.8H // .....................................................*............ || .....................................................*............. - // str_vi v20, x0, 16 // ....................................................*............. || ....................................................*.............. - // str_vo v5, x0, 48 // ......................................................*........... || ......................................................*............ - // str_vo v30, x0, 112 // ........................................................*......... || ........................................................*.......... - // str_vo v26, x0, 176 // ..........................................................*....... || ..........................................................*........ - // str_vo v6, x0, 240 // ............................................................*..... 
|| ............................................................*...... - // str_vo v13, x0, 304 // ..............................................................*... || ..............................................................*.... - // str_vo v9, x0, 368 // ................................................................*. || ................................................................*.. - // str_vo v10, x0, 432 // .................................................................* || ..................................................................* - + // Instructions: 67 + // Expected cycles: 69 + // Expected IPC: 0.97 + // + // Cycle bound: 69.0 + // IPC bound: 0.97 + // + // Wall time: 18.88s + // User time: 18.88s + // + // ----------------------- original position ------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + ldr q14, [x0, #0] // *.................................................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + sqrdmulh v6.8H, v31.8H, v0.H[1] // .*................................................................. + // gap // ................................................................... + mul v15.8H, v31.8H, v0.H[0] // ..*................................................................ + // gap // ................................................................... + sqrdmulh v29.8H, v16.8H, v0.H[1] // ...*............................................................... + // gap // ................................................................... + mul v16.8H, v16.8H, v0.H[0] // ....*.............................................................. + // gap // ................................................................... 
+ sqrdmulh v27.8H, v10.8H, v0.H[1] // .....*............................................................. + // gap // ................................................................... + mul v18.8H, v10.8H, v0.H[0] // .......*........................................................... + // gap // ................................................................... + mls v15.8H, v6.8H, v7.H[0] // ......*............................................................ + // gap // ................................................................... + mls v16.8H, v29.8H, v7.H[0] // ........*.......................................................... + // gap // ................................................................... + mls v26.8H, v19.8H, v7.H[0] // .........*......................................................... + // gap // ................................................................... + mls v18.8H, v27.8H, v7.H[0] // ...........*....................................................... + // gap // ................................................................... + sub v30.8H, v13.8H, v15.8H // ..........*........................................................ + // gap // ................................................................... + sub v20.8H, v12.8H, v16.8H // ............*...................................................... + // gap // ................................................................... + sub v8.8H, v14.8H, v26.8H // .............*..................................................... + // gap // ................................................................... + add v6.8H, v11.8H, v18.8H // ................*.................................................. + // gap // ................................................................... + add v26.8H, v14.8H, v26.8H // ..............*.................................................... + // gap // ................................................................... 
+ sub v25.8H, v11.8H, v18.8H // ...............*................................................... + // gap // ................................................................... + sqrdmulh v18.8H, v6.8H, v0.H[3] // ....................*.............................................. + // gap // ................................................................... + mul v31.8H, v6.8H, v0.H[2] // .....................*............................................. + // gap // ................................................................... + sqrdmulh v29.8H, v25.8H, v0.H[5] // ..................*................................................ + // gap // ................................................................... + mul v10.8H, v25.8H, v0.H[4] // ...................*............................................... + // gap // ................................................................... + mul v19.8H, v20.8H, v0.H[4] // .................*................................................. + // gap // ................................................................... + mls v31.8H, v18.8H, v7.H[0] // .........................*......................................... + // gap // ................................................................... + sqrdmulh v18.8H, v20.8H, v0.H[5] // ........................*.......................................... + // gap // ................................................................... + mls v10.8H, v29.8H, v7.H[0] // .......................*........................................... + // gap // ................................................................... + add v6.8H, v12.8H, v16.8H // ......................*............................................ + // gap // ................................................................... + add v11.8H, v26.8H, v31.8H // ............................*...................................... + // gap // ................................................................... 
+ sub v12.8H, v26.8H, v31.8H // ...........................*....................................... + // gap // ................................................................... + mul v31.8H, v6.8H, v0.H[2] // ...............................*................................... + // gap // ................................................................... + mls v19.8H, v18.8H, v7.H[0] // .............................*..................................... + // gap // ................................................................... + sqrdmulh v29.8H, v6.8H, v0.H[3] // ..........................*........................................ + // gap // ................................................................... + add v26.8H, v13.8H, v15.8H // ..............................*.................................... + // gap // ................................................................... + add v20.8H, v8.8H, v10.8H // ................................*.................................. + // gap // ................................................................... + add v18.8H, v30.8H, v19.8H // .................................*................................. + // gap // ................................................................... + mls v31.8H, v29.8H, v7.H[0] // ...................................*............................... + // gap // ................................................................... + sub v6.8H, v30.8H, v19.8H // ..................................*................................ + // gap // ................................................................... + sqrdmulh v19.8H, v18.8H, v1.H[3] // ....................................*.............................. + // gap // ................................................................... + mul v16.8H, v18.8H, v1.H[2] // .....................................*............................. + // gap // ................................................................... 
+ add v29.8H, v26.8H, v31.8H // ........................................*.......................... + // gap // ................................................................... + sub v31.8H, v26.8H, v31.8H // .......................................*........................... + // gap // ................................................................... + mul v26.8H, v6.8H, v1.H[4] // .........................................*......................... + // gap // ................................................................... + sqrdmulh v18.8H, v29.8H, v0.H[7] // ...........................................*....................... + // gap // ................................................................... + mul v29.8H, v29.8H, v0.H[6] // ............................................*...................... + // gap // ................................................................... + sqrdmulh v25.8H, v6.8H, v1.H[5] // ......................................*............................ + // gap // ................................................................... + sqrdmulh v6.8H, v31.8H, v1.H[1] // ..........................................*........................ + // gap // ................................................................... + mul v31.8H, v31.8H, v1.H[0] // .............................................*..................... + // gap // ................................................................... + mls v29.8H, v18.8H, v7.H[0] // ................................................*.................. + // gap // ................................................................... + mls v26.8H, v25.8H, v7.H[0] // ...............................................*................... + // gap // ................................................................... + mls v16.8H, v19.8H, v7.H[0] // ..............................................*.................... 
+ // gap // ................................................................... + mls v31.8H, v6.8H, v7.H[0] // .................................................*................. + // gap // ................................................................... + add v18.8H, v11.8H, v29.8H // .....................................................*............. + // gap // ................................................................... + sub v29.8H, v11.8H, v29.8H // ....................................................*.............. + // gap // ................................................................... + add v6.8H, v20.8H, v16.8H // ...................................................*............... + // gap // ................................................................... + str q18, [x0], #(16) // .........................................................*......... + // gap // ................................................................... + sub v18.8H, v12.8H, v31.8H // ........................................................*.......... + // gap // ................................................................... + str q29, [x0, #48] // ...........................................................*....... + // gap // ................................................................... + add v29.8H, v12.8H, v31.8H // .......................................................*........... + // gap // ................................................................... + str q18, [x0, #176] // ..............................................................*.... + // gap // ................................................................... + sub v18.8H, v8.8H, v10.8H // ......................................................*............ + // gap // ................................................................... + str q29, [x0, #112] // .............................................................*..... 
+ // gap // ................................................................... + sub v31.8H, v20.8H, v16.8H // ..................................................*................ + // gap // ................................................................... + str q6, [x0, #240] // ...............................................................*... + // gap // ................................................................... + sub v29.8H, v18.8H, v26.8H // ..........................................................*........ + // gap // ................................................................... + str q31, [x0, #304] // ................................................................*.. + // gap // ................................................................... + add v18.8H, v18.8H, v26.8H // ............................................................*...... + // gap // ................................................................... + str q29, [x0, #432] // ..................................................................* + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + str q18, [x0, #368] // .................................................................*. + // gap // ................................................................... + + // -------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + // ldr q29, [x0, #0] // *.................................................................. + // sqrdmulh v18.8H, v31.8H, v0.H[1] // .*................................................................. + // mul v6.8H, v31.8H, v0.H[0] // ..*................................................................ 
+ // sqrdmulh v25.8H, v16.8H, v0.H[1] // ...*............................................................... + // mul v16.8H, v16.8H, v0.H[0] // ....*.............................................................. + // sqrdmulh v31.8H, v10.8H, v0.H[1] // .....*............................................................. + // mls v6.8H, v18.8H, v7.H[0] // .......*........................................................... + // mul v18.8H, v10.8H, v0.H[0] // ......*............................................................ + // mls v16.8H, v25.8H, v7.H[0] // ........*.......................................................... + // mls v26.8H, v19.8H, v7.H[0] // .........*......................................................... + // sub v19.8H, v13.8H, v6.8H // ...........*....................................................... + // mls v18.8H, v31.8H, v7.H[0] // ..........*........................................................ + // sub v25.8H, v12.8H, v16.8H // ............*...................................................... + // sub v8.8H, v29.8H, v26.8H // .............*..................................................... + // add v5.8H, v29.8H, v26.8H // ...............*................................................... + // sub v26.8H, v11.8H, v18.8H // ................*.................................................. + // add v18.8H, v11.8H, v18.8H // ..............*.................................................... + // mul v29.8H, v25.8H, v0.H[4] // .....................*............................................. + // sqrdmulh v2.8H, v26.8H, v0.H[5] // ...................*............................................... + // mul v9.8H, v26.8H, v0.H[4] // ....................*.............................................. + // sqrdmulh v11.8H, v18.8H, v0.H[3] // .................*................................................. + // mul v18.8H, v18.8H, v0.H[2] // ..................*................................................ 
+ // add v16.8H, v12.8H, v16.8H // .........................*......................................... + // mls v9.8H, v2.8H, v7.H[0] // ........................*.......................................... + // sqrdmulh v20.8H, v25.8H, v0.H[5] // .......................*........................................... + // mls v18.8H, v11.8H, v7.H[0] // ......................*............................................ + // sqrdmulh v11.8H, v16.8H, v0.H[3] // ..............................*.................................... + // sub v2.8H, v5.8H, v18.8H // ...........................*....................................... + // add v18.8H, v5.8H, v18.8H // ..........................*........................................ + // mls v29.8H, v20.8H, v7.H[0] // .............................*..................................... + // add v6.8H, v13.8H, v6.8H // ...............................*................................... + // mul v16.8H, v16.8H, v0.H[2] // ............................*...................................... + // add v31.8H, v8.8H, v9.8H // ................................*.................................. + // add v25.8H, v19.8H, v29.8H // .................................*................................. + // sub v26.8H, v19.8H, v29.8H // ...................................*............................... + // mls v16.8H, v11.8H, v7.H[0] // ..................................*................................ + // sqrdmulh v19.8H, v25.8H, v1.H[3] // ....................................*.............................. + // mul v25.8H, v25.8H, v1.H[2] // .....................................*............................. + // sqrdmulh v29.8H, v26.8H, v1.H[5] // ...........................................*....................... + // sub v3.8H, v6.8H, v16.8H // .......................................*........................... + // add v6.8H, v6.8H, v16.8H // ......................................*............................ 
+ // mul v16.8H, v26.8H, v1.H[4] // ........................................*.......................... + // sqrdmulh v20.8H, v3.8H, v1.H[1] // ............................................*...................... + // sqrdmulh v26.8H, v6.8H, v0.H[7] // .........................................*......................... + // mul v11.8H, v6.8H, v0.H[6] // ..........................................*........................ + // mul v12.8H, v3.8H, v1.H[0] // .............................................*..................... + // mls v25.8H, v19.8H, v7.H[0] // ................................................*.................. + // mls v16.8H, v29.8H, v7.H[0] // ...............................................*................... + // mls v11.8H, v26.8H, v7.H[0] // ..............................................*.................... + // mls v12.8H, v20.8H, v7.H[0] // .................................................*................. + // sub v29.8H, v31.8H, v25.8H // ............................................................*...... + // add v31.8H, v31.8H, v25.8H // ....................................................*.............. + // sub v25.8H, v18.8H, v11.8H // ...................................................*............... + // add v18.8H, v18.8H, v11.8H // ..................................................*................ + // sub v11.8H, v8.8H, v9.8H // ..........................................................*........ + // add v26.8H, v2.8H, v12.8H // ........................................................*.......... + // sub v19.8H, v2.8H, v12.8H // ......................................................*............ + // str q18, [x0], #(16) // .....................................................*............. + // sub v6.8H, v11.8H, v16.8H // ..............................................................*.... + // str q25, [x0, #48] // .......................................................*........... 
+ // add v20.8H, v11.8H, v16.8H // ................................................................*.. + // str q26, [x0, #112] // ...........................................................*....... + // str q19, [x0, #176] // .........................................................*......... + // str q31, [x0, #240] // .............................................................*..... + // str q29, [x0, #304] // ...............................................................*... + // str q20, [x0, #368] // ..................................................................* + // str q6, [x0, #432] // .................................................................*. + restore inp, STACK0 mov count, #8 .p2align 2 - // gap // ........................................................................... - ldr_vi v3, x3, 16 // ...*....................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - ldr_vo v13, x1, 48 // ..*........................................................................ - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - ldr_vo v17, x1, 32 // ....*...................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v18.8H, v13.8H, v3.H[1] // ......*.................................................................... 
- // gap // ........................................................................... - mul v9.8H, v13.8H, v3.H[0] // .....*..................................................................... - // gap // ........................................................................... - ldr_vo v20, x1, 16 // .*......................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v15.8H, v17.8H, v3.H[1] // ........*.................................................................. - // gap // ........................................................................... - mls v9.8H, v18.8H, v7.H[0] // ..........*................................................................ - // gap // ........................................................................... - mul v23.8H, v17.8H, v3.H[0] // .......*................................................................... - // gap // ........................................................................... - ldr_vo v0, x1, 0 // *.......................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sub v1.8H, v20.8H, v9.8H // ................*.......................................................... - // gap // ........................................................................... - mls v23.8H, v15.8H, v7.H[0] // ...........*............................................................... - // gap // ........................................................................... 
- add v8.8H, v20.8H, v9.8H // .............*............................................................. - // gap // ........................................................................... - sqrdmulh v17.8H, v1.8H, v3.H[5] // ....................*...................................................... - // gap // ........................................................................... - mul v13.8H, v1.8H, v3.H[4] // ...................*....................................................... - // gap // ........................................................................... - sub v19.8H, v0.8H, v23.8H // ..............*............................................................ - // gap // ........................................................................... - sqrdmulh v16.8H, v8.8H, v3.H[3] // ..................*........................................................ - // gap // ........................................................................... - mul v10.8H, v8.8H, v3.H[2] // .................*......................................................... - // gap // ........................................................................... - mls v13.8H, v17.8H, v7.H[0] // .......................*................................................... - // gap // ........................................................................... - ldr_vo v20, x4, 16 // ............*.............................................................. - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - mls v10.8H, v16.8H, v7.H[0] // ......................*.................................................... - // gap // ........................................................................... 
- add v17.8H, v0.8H, v23.8H // ...............*........................................................... - // gap // ........................................................................... - add v31.8H, v19.8H, v13.8H // ............................*.............................................. - // gap // ........................................................................... - sub v3.8H, v19.8H, v13.8H // ...........................*............................................... - // gap // ........................................................................... - add v11.8H, v17.8H, v10.8H // ..........................*................................................ - // gap // ........................................................................... - sub v21.8H, v17.8H, v10.8H // .........................*................................................. - // gap // ........................................................................... - trn2 v29.4S, v31.4S, v3.4S // ...............................*........................................... - // gap // ........................................................................... - trn1 v26.4S, v31.4S, v3.4S // ..................................*........................................ - // gap // ........................................................................... - trn2 v9.4S, v11.4S, v21.4S // ..............................*............................................ - // gap // ........................................................................... - ldr_vi v12, x4, 96 // .........*................................................................. - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... 
- trn2 v1.2D, v9.2D, v29.2D // .................................*......................................... - // gap // ........................................................................... - trn1 v22.4S, v11.4S, v21.4S // .............................*............................................. - // gap // ........................................................................... - mul v24.8H, v1.8H, v12.8H // ...................................*....................................... - // gap // ........................................................................... - sqrdmulh v17.8H, v1.8H, v20.8H // ....................................*...................................... - // gap // ........................................................................... - trn2 v21.2D, v22.2D, v26.2D // .....................................*..................................... - // gap // ........................................................................... - trn1 v4.2D, v9.2D, v29.2D // ......................................*.................................... - // gap // ........................................................................... - sqrdmulh v16.8H, v21.8H, v20.8H // .......................................*................................... - // gap // ........................................................................... - mls v24.8H, v17.8H, v7.H[0] // ........................................*.................................. - // gap // ........................................................................... - mul v23.8H, v21.8H, v12.8H // .........................................*................................. - // gap // ........................................................................... - ldr_vo v20, x4, -64 // .....................*..................................................... - // gap // ........................................................................... 
- // gap // ........................................................................... - // gap // ........................................................................... - add v5.8H, v4.8H, v24.8H // .............................................*............................. - // gap // ........................................................................... - mls v23.8H, v16.8H, v7.H[0] // ............................................*.............................. - // gap // ........................................................................... - sub v10.8H, v4.8H, v24.8H // ...........................................*............................... - // gap // ........................................................................... - mul v9.8H, v5.8H, v20.8H // ................................................*.......................... - // gap // ........................................................................... - ldr_vo v20, x4, -48 // ........................*.................................................. - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - ldr_vo v21, x4, -32 // ................................*.......................................... - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - sqrdmulh v17.8H, v5.8H, v20.8H // .................................................*......................... - // gap // ........................................................................... - trn1 v15.2D, v22.2D, v26.2D // ..................................................*........................ 
- // gap // ........................................................................... - ldr_vo v20, x4, -16 // ..........................................*................................ - // gap // ........................................................................... - // gap // ........................................................................... - // gap // ........................................................................... - mls v9.8H, v17.8H, v7.H[0] // .....................................................*..................... - // gap // ........................................................................... - add v26.8H, v15.8H, v23.8H // ......................................................*.................... - // gap // ........................................................................... - mul v6.8H, v10.8H, v21.8H // ..............................................*............................ - // gap // ........................................................................... - sqrdmulh v30.8H, v10.8H, v20.8H // ...............................................*........................... - // gap // ........................................................................... - sub v13.8H, v26.8H, v9.8H // .........................................................*................. - // gap // ........................................................................... - add v20.8H, v26.8H, v9.8H // ..........................................................*................ - // gap // ........................................................................... - sub v10.8H, v15.8H, v23.8H // ....................................................*...................... - // gap // ........................................................................... - mls v6.8H, v30.8H, v7.H[0] // ...................................................*....................... 
- // gap // ........................................................................... - trn1 v9.4S, v20.4S, v13.4S // .............................................................*............. - // gap // ........................................................................... - trn2 v20.4S, v20.4S, v13.4S // ..............................................................*............ - // gap // ........................................................................... - sqdmulh v17.8H, v9.8H, v7.H[1] // ...............................................................*........... - // gap // ........................................................................... - sub v5.8H, v10.8H, v6.8H // .......................................................*................... - // gap // ........................................................................... - add v6.8H, v10.8H, v6.8H // ........................................................*.................. - // gap // ........................................................................... - sqdmulh v26.8H, v20.8H, v7.H[1] // ................................................................*.......... - // gap // ........................................................................... - srshr v29.8H, v17.8H, #11 // ...................................................................*....... - // gap // ........................................................................... - trn1 v10.4S, v6.4S, v5.4S // ...........................................................*............... - // gap // ........................................................................... - trn2 v13.4S, v6.4S, v5.4S // ............................................................*.............. - // gap // ........................................................................... - sqdmulh v0.8H, v10.8H, v7.H[1] // .................................................................*......... 
- // gap // ........................................................................... - sqdmulh v5.8H, v13.8H, v7.H[1] // ..................................................................*........ - // gap // ........................................................................... - mls v9.8H, v29.8H, v7.H[0] // ......................................................................*.... - // gap // ........................................................................... - srshr v3.8H, v26.8H, #11 // ....................................................................*...... - // gap // ........................................................................... - srshr v6.8H, v0.8H, #11 // .....................................................................*..... - // gap // ........................................................................... - srshr v16.8H, v5.8H, #11 // .......................................................................*... - // gap // ........................................................................... - mls v20.8H, v3.8H, v7.H[0] // ........................................................................*.. - // gap // ........................................................................... - mls v10.8H, v6.8H, v7.H[0] // .........................................................................*. - // gap // ........................................................................... - mls v13.8H, v16.8H, v7.H[0] // ..........................................................................* - // gap // ........................................................................... - - // original source code - // ldr_vo v20, x1, 0 // .........*................................................................. || .............*........................................................................ - // ldr_vo v13, x1, 16 // .....*..................................................................... 
|| ........*............................................................................. - // ldr_vo v9, x1, 48 // .*......................................................................... || ..*................................................................................... - // ldr_vi v10, x3, 16 // *.......................................................................... || *..................................................................................... - // ldr_vo v6, x1, 32 // ..*........................................................................ || ....*................................................................................. - // mul v5.8H, v9.8H, v10.H[0] // ....*...................................................................... || .......*.............................................................................. - // sqrdmulh v9.8H, v9.8H, v10.H[1] // ...*....................................................................... || ......*............................................................................... - // mul v0.8H, v6.8H, v10.H[0] // ........*.................................................................. || ............*......................................................................... - // sqrdmulh v6.8H, v6.8H, v10.H[1] // ......*.................................................................... || ..........*........................................................................... - // ldr_vi v26, x4, 96 // .............................*............................................. || ...................................*.................................................. - // mls v5.8H, v9.8H, v7.H[0] // .......*................................................................... || ...........*.......................................................................... - // mls v0.8H, v6.8H, v7.H[0] // ...........*............................................................... 
|| ................*..................................................................... - // ldr_vo v9, x4, -80 // ...................*....................................................... || ........................*............................................................. - // add v6.8H, v13.8H, v5.8H // ............*.............................................................. || .................*.................................................................... - // sub v30.8H, v20.8H, v0.8H // ...............*........................................................... || ....................*................................................................. - // add v20.8H, v20.8H, v0.8H // .....................*..................................................... || ...........................*.......................................................... - // sub v13.8H, v13.8H, v5.8H // ..........*................................................................ || ...............*...................................................................... - // mul v5.8H, v6.8H, v10.H[2] // .................*......................................................... || ......................*............................................................... - // sqrdmulh v6.8H, v6.8H, v10.H[3] // ................*.......................................................... || .....................*................................................................ - // mul v0.8H, v13.8H, v10.H[4] // ..............*............................................................ || ...................*.................................................................. - // sqrdmulh v13.8H, v13.8H, v10.H[5] // .............*............................................................. || ..................*................................................................... 
- // ldr_vo v10, x4, -64 // .......................................*................................... || ..............................................*....................................... - // mls v5.8H, v6.8H, v7.H[0] // ....................*...................................................... || ..........................*........................................................... - // mls v0.8H, v13.8H, v7.H[0] // ..................*........................................................ || .......................*.............................................................. - // ldr_vo v13, x4, -48 // ............................................*.............................. || ....................................................*................................. - // sub v6.8H, v20.8H, v5.8H // .........................*................................................. || ...............................*...................................................... - // add v20.8H, v20.8H, v5.8H // ........................*.................................................. || ..............................*....................................................... - // sub v5.8H, v30.8H, v0.8H // .......................*................................................... || .............................*........................................................ - // add v0.8H, v30.8H, v0.8H // ......................*.................................................... || ............................*......................................................... - // trn1 v30.4S, v20.4S, v6.4S // ...............................*........................................... || ......................................*............................................... - // trn2 v20.4S, v20.4S, v6.4S // ............................*.............................................. || ..................................*................................................... 
- // trn2 v6.4S, v0.4S, v5.4S // ..........................*................................................ || ................................*..................................................... - // ldr_vo v4, x4, -32 // .............................................*............................. || ......................................................*............................... - // trn2 v12.2D, v20.2D, v6.2D // ..............................*............................................ || .....................................*................................................ - // trn1 v5.4S, v0.4S, v5.4S // ...........................*............................................... || .................................*.................................................... - // mul v0.8H, v12.8H, v26.8H // ................................*.......................................... || .......................................*.............................................. - // sqrdmulh v12.8H, v12.8H, v9.8H // .................................*......................................... || ........................................*............................................. - // trn2 v18.2D, v30.2D, v5.2D // ..................................*........................................ || .........................................*............................................ - // trn1 v20.2D, v20.2D, v6.2D // ...................................*....................................... || ..........................................*........................................... - // sqrdmulh v9.8H, v18.8H, v9.8H // ....................................*...................................... || ...........................................*.......................................... - // mls v0.8H, v12.8H, v7.H[0] // .....................................*..................................... 
|| ............................................*......................................... - // mul v6.8H, v18.8H, v26.8H // ......................................*.................................... || .............................................*........................................ - // ldr_vo v26, x4, -16 // ................................................*.......................... || ..........................................................*........................... - // sub v12.8H, v20.8H, v0.8H // ..........................................*................................ || ..................................................*................................... - // mls v6.8H, v9.8H, v7.H[0] // .........................................*................................. || .................................................*.................................... - // add v20.8H, v20.8H, v0.8H // ........................................*.................................. || ................................................*..................................... - // mul v9.8H, v12.8H, v4.8H // ...................................................*....................... || ..............................................................*....................... - // sqrdmulh v0.8H, v12.8H, v26.8H // ....................................................*...................... || ...............................................................*...................... - // mul v10.8H, v20.8H, v10.8H // ...........................................*............................... || ...................................................*.................................. - // sqrdmulh v20.8H, v20.8H, v13.8H // ..............................................*............................ || ........................................................*............................. 
- // trn1 v13.2D, v30.2D, v5.2D // ...............................................*........................... || .........................................................*............................ - // mls v9.8H, v0.8H, v7.H[0] // ........................................................*.................. || ...................................................................*.................. - // sub v5.8H, v13.8H, v6.8H // .......................................................*................... || ..................................................................*................... - // mls v10.8H, v20.8H, v7.H[0] // .................................................*......................... || ............................................................*......................... - // add v20.8H, v13.8H, v6.8H // ..................................................*........................ || .............................................................*........................ - // sub v13.8H, v5.8H, v9.8H // ............................................................*.............. || .......................................................................*.............. - // add v9.8H, v5.8H, v9.8H // .............................................................*............. || ........................................................................*............. - // sub v6.8H, v20.8H, v10.8H // .....................................................*..................... || ................................................................*..................... - // add v20.8H, v20.8H, v10.8H // ......................................................*.................... || .................................................................*.................... - // trn1 v10.4S, v9.4S, v13.4S // ................................................................*.......... 
|| ...........................................................................*.......... - // trn2 v13.4S, v9.4S, v13.4S // .................................................................*......... || ............................................................................*......... - // trn1 v9.4S, v20.4S, v6.4S // .........................................................*................. || ....................................................................*................. - // trn2 v20.4S, v20.4S, v6.4S // ..........................................................*................ || .....................................................................*................ - // sqdmulh v6.8H, v9.8H, v7.H[1] // ...........................................................*............... || ......................................................................*............... - // sqdmulh v5.8H, v20.8H, v7.H[1] // ..............................................................*............ || .........................................................................*............ - // sqdmulh v0.8H, v10.8H, v7.H[1] // ..................................................................*........ || .............................................................................*........ - // sqdmulh v26.8H, v13.8H, v7.H[1] // ...................................................................*....... || ..............................................................................*....... - // srshr v6.8H, v6.8H, #11 // ...............................................................*........... || ..........................................................................*........... - // srshr v5.8H, v5.8H, #11 // .....................................................................*..... || ................................................................................*..... 
- // srshr v0.8H, v0.8H, #11 // ......................................................................*.... || .................................................................................*.... - // mls v9.8H, v6.8H, v7.H[0] // ....................................................................*...... || ...............................................................................*...... - // srshr v6.8H, v26.8H, #11 // .......................................................................*... || ..................................................................................*... - // mls v20.8H, v5.8H, v7.H[0] // ........................................................................*.. || ...................................................................................*.. - // mls v10.8H, v0.8H, v7.H[0] // .........................................................................*. || ....................................................................................*. - // mls v13.8H, v6.8H, v7.H[0] // ..........................................................................* || .....................................................................................* - + // Instructions: 67 + // Expected cycles: 79 + // Expected IPC: 0.85 + // + // Cycle bound: 79.0 + // IPC bound: 0.85 + // + // Wall time: 19.62s + // User time: 19.62s + // + // ----------------------- original position ------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + ldr q15, [x1, #48] // .*................................................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + ldr q6, [x3], #16 // *.................................................................. 
+ // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + ldr q18, [x1, #32] // ..*................................................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v30.8H, v15.8H, v6.H[0] // ....*.............................................................. + // gap // ................................................................... + sqrdmulh v11.8H, v15.8H, v6.H[1] // ...*............................................................... + // gap // ................................................................... + sqrdmulh v31.8H, v18.8H, v6.H[1] // ........*.......................................................... + // gap // ................................................................... + ldr q4, [x1, #16] // ......*............................................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v30.8H, v11.8H, v7.H[0] // .......*........................................................... + // gap // ................................................................... + mul v0.8H, v18.8H, v6.H[0] // .....*............................................................. + // gap // ................................................................... + ldr q24, [x1, #0] // .........*......................................................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + sub v9.8H, v4.8H, v30.8H // ..........*........................................................ + // gap // ................................................................... + mls v0.8H, v31.8H, v7.H[0] // ............*...................................................... + // gap // ................................................................... + add v8.8H, v4.8H, v30.8H // ...........*....................................................... + // gap // ................................................................... + sqrdmulh v1.8H, v9.8H, v6.H[5] // .............*..................................................... + // gap // ................................................................... + mul v17.8H, v9.8H, v6.H[4] // ..............*.................................................... + // gap // ................................................................... + sqrdmulh v15.8H, v8.8H, v6.H[3] // ................*.................................................. + // gap // ................................................................... + mul v19.8H, v8.8H, v6.H[2] // ...............*................................................... + // gap // ................................................................... + ldr q6, [x4, #16] // .................................*................................. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v17.8H, v1.8H, v7.H[0] // ..................*................................................ + // gap // ................................................................... 
+ sub v23.8H, v24.8H, v0.8H // ...................*............................................... + // gap // ................................................................... + mls v19.8H, v15.8H, v7.H[0] // ....................*.............................................. + // gap // ................................................................... + add v9.8H, v24.8H, v0.8H // .................*................................................. + // gap // ................................................................... + add v15.8H, v23.8H, v17.8H // ......................*............................................ + // gap // ................................................................... + sub v18.8H, v23.8H, v17.8H // .....................*............................................. + // gap // ................................................................... + add v14.8H, v9.8H, v19.8H // ........................*.......................................... + // gap // ................................................................... + sub v3.8H, v9.8H, v19.8H // .......................*........................................... + // gap // ................................................................... + trn2 v1.4S, v15.4S, v18.4S // ..........................*........................................ + // gap // ................................................................... + trn1 v15.4S, v15.4S, v18.4S // .........................*......................................... + // gap // ................................................................... + trn2 v9.4S, v14.4S, v3.4S // ...........................*....................................... + // gap // ................................................................... + ldr q24, [x4], #(6*16) // .............................*..................................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + trn2 v17.2D, v9.2D, v1.2D // ...............................*................................... + // gap // ................................................................... + trn1 v30.4S, v14.4S, v3.4S // ............................*...................................... + // gap // ................................................................... + sqrdmulh v5.8H, v17.8H, v6.8H // .....................................*............................. + // gap // ................................................................... + mul v10.8H, v17.8H, v24.8H // ....................................*.............................. + // gap // ................................................................... + trn2 v17.2D, v30.2D, v15.2D // ..............................*.................................... + // gap // ................................................................... + ldr q11, [x4, #-64] // .............................................*..................... + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mls v10.8H, v5.8H, v7.H[0] // ........................................*.......................... + // gap // ................................................................... + trn1 v1.2D, v9.2D, v1.2D // .........................................*......................... + // gap // ................................................................... + ldr q2, [x4, #-48] // ..............................................*.................... + // gap // ................................................................... 
+ // gap // ................................................................... + // gap // ................................................................... + add v28.8H, v1.8H, v10.8H // ............................................*...................... + // gap // ................................................................... + ldr q25, [x4, #-16] // ..................................*................................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + mul v29.8H, v28.8H, v11.8H // ...............................................*................... + // gap // ................................................................... + sqrdmulh v13.8H, v28.8H, v2.8H // ................................................*.................. + // gap // ................................................................... + sqrdmulh v23.8H, v17.8H, v6.8H // ...................................*............................... + // gap // ................................................................... + mul v31.8H, v17.8H, v24.8H // ................................*.................................. + // gap // ................................................................... + sub v9.8H, v1.8H, v10.8H // ...........................................*....................... + // gap // ................................................................... + ldr q18, [x4, #-32] // ..................................................*................ + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... 
+ sqrdmulh v27.8H, v9.8H, v25.8H // ....................................................*.............. + // gap // ................................................................... + mls v31.8H, v23.8H, v7.H[0] // ......................................*............................ + // gap // ................................................................... + mul v20.8H, v9.8H, v18.8H // .....................................................*............. + // gap // ................................................................... + trn1 v0.2D, v30.2D, v15.2D // .......................................*........................... + // gap // ................................................................... + mls v29.8H, v13.8H, v7.H[0] // ...................................................*............... + // gap // ................................................................... + add v15.8H, v0.8H, v31.8H // .................................................*................. + // gap // ................................................................... + mls v20.8H, v27.8H, v7.H[0] // ........................................................*.......... + // gap // ................................................................... + sub v18.8H, v0.8H, v31.8H // ..........................................*........................ + // gap // ................................................................... + add v21.8H, v15.8H, v29.8H // .......................................................*........... + // gap // ................................................................... + sub v13.8H, v15.8H, v29.8H // ......................................................*............ + // gap // ................................................................... + sub v5.8H, v18.8H, v20.8H // ...........................................................*....... + // gap // ................................................................... 
+ add v9.8H, v18.8H, v20.8H // ..........................................................*........ + // gap // ................................................................... + trn2 v0.4S, v21.4S, v13.4S // .................................................................*. + // gap // ................................................................... + trn1 v26.4S, v21.4S, v13.4S // .........................................................*......... + // gap // ................................................................... + trn1 v25.4S, v9.4S, v5.4S // ............................................................*...... + // gap // ................................................................... + sqdmulh v15.8H, v26.8H, v7.H[1] // .............................................................*..... + // gap // ................................................................... + sqdmulh v1.8H, v25.8H, v7.H[1] // ..............................................................*.... + // gap // ................................................................... + trn2 v31.4S, v9.4S, v5.4S // ................................................................*.. + // gap // ................................................................... + // gap // ................................................................... + // gap // ................................................................... + srshr v9.8H, v15.8H, #11 // ...............................................................*... + // gap // ................................................................... + srshr v16.8H, v1.8H, #11 // ..................................................................* + // gap // ................................................................... 
+ + // -------------------------- new position --------------------------> + // 0 25 50 + // |------------------------|------------------------|---------------- + // ldr q11, [x3], #16 // .*................................................................. + // ldr q17, [x1, #48] // *.................................................................. + // ldr q23, [x1, #32] // ..*................................................................ + // sqrdmulh v18.8H, v17.8H, v11.H[1] // ....*.............................................................. + // mul v15.8H, v17.8H, v11.H[0] // ...*............................................................... + // mul v10.8H, v23.8H, v11.H[0] // ........*.......................................................... + // ldr q3, [x1, #16] // ......*............................................................ + // mls v15.8H, v18.8H, v7.H[0] // .......*........................................................... + // sqrdmulh v30.8H, v23.8H, v11.H[1] // .....*............................................................. + // ldr q19, [x1, #0] // .........*......................................................... + // sub v13.8H, v3.8H, v15.8H // ..........*........................................................ + // add v15.8H, v3.8H, v15.8H // ............*...................................................... + // mls v10.8H, v30.8H, v7.H[0] // ...........*....................................................... + // sqrdmulh v20.8H, v13.8H, v11.H[5] // .............*..................................................... + // mul v1.8H, v13.8H, v11.H[4] // ..............*.................................................... + // mul v5.8H, v15.8H, v11.H[2] // ................*.................................................. + // sqrdmulh v12.8H, v15.8H, v11.H[3] // ...............*................................................... 
+ // add v28.8H, v19.8H, v10.8H // .....................*............................................. + // mls v1.8H, v20.8H, v7.H[0] // ..................*................................................ + // sub v19.8H, v19.8H, v10.8H // ...................*............................................... + // mls v5.8H, v12.8H, v7.H[0] // ....................*.............................................. + // sub v21.8H, v19.8H, v1.8H // .......................*........................................... + // add v11.8H, v19.8H, v1.8H // ......................*............................................ + // sub v24.8H, v28.8H, v5.8H // .........................*......................................... + // add v23.8H, v28.8H, v5.8H // ........................*.......................................... + // trn1 v10.4S, v11.4S, v21.4S // ...........................*....................................... + // trn2 v18.4S, v11.4S, v21.4S // ..........................*........................................ + // trn2 v29.4S, v23.4S, v24.4S // ............................*...................................... + // trn1 v13.4S, v23.4S, v24.4S // ...............................*................................... + // ldr q21, [x4], #(6*16) // .............................*..................................... + // trn2 v30.2D, v13.2D, v10.2D // ..................................*................................ + // trn2 v2.2D, v29.2D, v18.2D // ..............................*.................................... + // mul v22.8H, v30.8H, v21.8H // ............................................*...................... + // ldr q23, [x4, #-80] // .................*................................................. + // ldr q14, [x4, #-16] // ........................................*.......................... + // sqrdmulh v3.8H, v30.8H, v23.8H // ...........................................*....................... 
+ // mul v30.8H, v2.8H, v21.8H // .................................*................................. + // sqrdmulh v15.8H, v2.8H, v23.8H // ................................*.................................. + // mls v22.8H, v3.8H, v7.H[0] // ................................................*.................. + // trn1 v2.2D, v13.2D, v10.2D // ..................................................*................ + // mls v30.8H, v15.8H, v7.H[0] // ....................................*.............................. + // trn1 v8.2D, v29.2D, v18.2D // .....................................*............................. + // sub v12.8H, v2.8H, v22.8H // ......................................................*............ + // sub v6.8H, v8.8H, v30.8H // .............................................*..................... + // add v1.8H, v8.8H, v30.8H // .......................................*........................... + // ldr q25, [x4, #-64] // ...................................*............................... + // ldr q21, [x4, #-48] // ......................................*............................ + // mul v29.8H, v1.8H, v25.8H // .........................................*......................... + // sqrdmulh v20.8H, v1.8H, v21.8H // ..........................................*........................ + // add v30.8H, v2.8H, v22.8H // ....................................................*.............. + // ldr q28, [x4, #-32] // ..............................................*.................... + // mls v29.8H, v20.8H, v7.H[0] // ...................................................*............... + // sqrdmulh v20.8H, v6.8H, v14.8H // ...............................................*................... + // mul v11.8H, v6.8H, v28.8H // .................................................*................. + // sub v24.8H, v30.8H, v29.8H // ........................................................*.......... 
+ // add v8.8H, v30.8H, v29.8H // .......................................................*........... + // mls v11.8H, v20.8H, v7.H[0] // .....................................................*............. + // trn1 v26.4S, v8.4S, v24.4S // ............................................................*...... + // add v6.8H, v12.8H, v11.8H // ..........................................................*........ + // sub v29.8H, v12.8H, v11.8H // .........................................................*......... + // trn1 v25.4S, v6.4S, v29.4S // .............................................................*..... + // sqdmulh v4.8H, v26.8H, v7.H[1] // ..............................................................*.... + // sqdmulh v21.8H, v25.8H, v7.H[1] // ...............................................................*... + // srshr v9.8H, v4.8H, #11 // .................................................................*. + // trn2 v31.4S, v6.4S, v29.4S // ................................................................*.. + // trn2 v0.4S, v8.4S, v24.4S // ...........................................................*....... + // srshr v16.8H, v21.8H, #11 // ..................................................................* + sub count, count, #1 -.p2align 2 layer4567_start: - vext x10, v9, 0 // ...........................................................................*............... - vext x11, v9, 1 // ............................................................................*.............. - vext x12, v20, 0 // .............................................................................*............. - vext x13, v20, 1 // ..............................................................................*............ - vext x14, v10, 0 // ...............................................................................*........... - vext x15, v10, 1 // ................................................................................*.......... 
- vext x16, v13, 0 // .................................................................................*......... - vext x17, v13, 1 // ..................................................................................*........ - ldr_vo v20, x1, 64 // e.......................................................................................... - str x10, [x1] , #64 // ...................................................................................*....... - str x14, [x1, #-56] // ....................................................................................*...... - // gap // ........................................................................................... - ldr_vo v13, x1, 16 // .e......................................................................................... - str x12, [x1, #-48] // .....................................................................................*..... - str x16, [x1, #-40] // ......................................................................................*.... - // gap // ........................................................................................... - ldr_vo v9, x1, 48 // ...e....................................................................................... - str x11, [x1, #-32] // .......................................................................................*... - str x15, [x1, #-24] // ........................................................................................*.. - // gap // ........................................................................................... - ldr_vi v10, x3, 16 // ....e...................................................................................... - str x13, [x1, #-16] // .........................................................................................*. 
- str x17, [x1, #-8] // ..........................................................................................* - // gap // ........................................................................................... - ldr_vo v6, x1, 32 // ..e........................................................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v5.8H, v9.8H, v10.H[0] // ..........e................................................................................ - // gap // ........................................................................................... - sqrdmulh v9.8H, v9.8H, v10.H[1] // ...........e............................................................................... - // gap // ........................................................................................... - mul v0.8H, v6.8H, v10.H[0] // .....e..................................................................................... - // gap // ........................................................................................... - sqrdmulh v6.8H, v6.8H, v10.H[1] // ......e.................................................................................... - // gap // ........................................................................................... - ldr_vi v26, x4, 96 // .................................e......................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- mls v5.8H, v9.8H, v7.H[0] // ............e.............................................................................. - // gap // ........................................................................................... - mls v0.8H, v6.8H, v7.H[0] // .......e................................................................................... - // gap // ........................................................................................... - ldr_vo v9, x4, -80 // ..................................e........................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - add v6.8H, v13.8H, v5.8H // ..............e............................................................................ - // gap // ........................................................................................... - sub v30.8H, v20.8H, v0.8H // ........e.................................................................................. - // gap // ........................................................................................... - add v20.8H, v20.8H, v0.8H // .........e................................................................................. - // gap // ........................................................................................... - sub v13.8H, v13.8H, v5.8H // .............e............................................................................. - // gap // ........................................................................................... - mul v5.8H, v6.8H, v10.H[2] // ...............e........................................................................... 
- // gap // ........................................................................................... - sqrdmulh v6.8H, v6.8H, v10.H[3] // ................e.......................................................................... - // gap // ........................................................................................... - mul v0.8H, v13.8H, v10.H[4] // ....................e...................................................................... - // gap // ........................................................................................... - sqrdmulh v13.8H, v13.8H, v10.H[5] // .....................e..................................................................... - // gap // ........................................................................................... - ldr_vo v10, x4, -64 // ...................................e....................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v5.8H, v6.8H, v7.H[0] // .................e......................................................................... - // gap // ........................................................................................... - mls v0.8H, v13.8H, v7.H[0] // ......................e.................................................................... - // gap // ........................................................................................... - ldr_vo v13, x4, -48 // ....................................e...................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - sub v6.8H, v20.8H, v5.8H // ..................e........................................................................ - // gap // ........................................................................................... - add v20.8H, v20.8H, v5.8H // ...................e....................................................................... - // gap // ........................................................................................... - sub v5.8H, v30.8H, v0.8H // .......................e................................................................... - // gap // ........................................................................................... - add v0.8H, v30.8H, v0.8H // ........................e.................................................................. - // gap // ........................................................................................... - trn1 v30.4S, v20.4S, v6.4S // .........................e................................................................. - // gap // ........................................................................................... - trn2 v20.4S, v20.4S, v6.4S // ..........................e................................................................ - // gap // ........................................................................................... - trn2 v6.4S, v0.4S, v5.4S // ............................e.............................................................. - // gap // ........................................................................................... - ldr_vo v4, x4, -32 // .....................................e..................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn2 v12.2D, v20.2D, v6.2D // ..............................e............................................................ - // gap // ........................................................................................... - trn1 v5.4S, v0.4S, v5.4S // ...........................e............................................................... - // gap // ........................................................................................... - mul v0.8H, v12.8H, v26.8H // ............................................e.............................................. - // gap // ........................................................................................... - sqrdmulh v12.8H, v12.8H, v9.8H // .............................................e............................................. - // gap // ........................................................................................... - trn2 v18.2D, v30.2D, v5.2D // .............................e............................................................. - // gap // ........................................................................................... - trn1 v20.2D, v20.2D, v6.2D // ................................e.......................................................... - // gap // ........................................................................................... - sqrdmulh v9.8H, v18.8H, v9.8H // ........................................e.................................................. - // gap // ........................................................................................... 
- mls v0.8H, v12.8H, v7.H[0] // ..............................................e............................................ - // gap // ........................................................................................... - mul v6.8H, v18.8H, v26.8H // .......................................e................................................... - // gap // ........................................................................................... - ldr_vo v26, x4, -16 // ......................................e.................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v12.8H, v20.8H, v0.8H // ...............................................e........................................... - // gap // ........................................................................................... - mls v6.8H, v9.8H, v7.H[0] // .........................................e................................................. - // gap // ........................................................................................... - add v20.8H, v20.8H, v0.8H // ................................................e.......................................... - // gap // ........................................................................................... - mul v9.8H, v12.8H, v4.8H // ......................................................e.................................... - // gap // ........................................................................................... - sqrdmulh v0.8H, v12.8H, v26.8H // .......................................................e................................... 
- // gap // ........................................................................................... - mul v10.8H, v20.8H, v10.8H // .................................................e......................................... - // gap // ........................................................................................... - sqrdmulh v20.8H, v20.8H, v13.8H // ..................................................e........................................ - // gap // ........................................................................................... - trn1 v13.2D, v30.2D, v5.2D // ...............................e........................................................... - // gap // ........................................................................................... - mls v9.8H, v0.8H, v7.H[0] // ........................................................e.................................. - // gap // ........................................................................................... - sub v5.8H, v13.8H, v6.8H // ..........................................e................................................ - // gap // ........................................................................................... - mls v10.8H, v20.8H, v7.H[0] // ...................................................e....................................... - // gap // ........................................................................................... - add v20.8H, v13.8H, v6.8H // ...........................................e............................................... - // gap // ........................................................................................... - sub v13.8H, v5.8H, v9.8H // .........................................................e................................. - // gap // ........................................................................................... 
- add v9.8H, v5.8H, v9.8H // ..........................................................e................................ - // gap // ........................................................................................... - sub v6.8H, v20.8H, v10.8H // ....................................................e...................................... - // gap // ........................................................................................... - add v20.8H, v20.8H, v10.8H // .....................................................e..................................... - // gap // ........................................................................................... - trn1 v10.4S, v9.4S, v13.4S // .............................................................e............................. - // gap // ........................................................................................... - trn2 v13.4S, v9.4S, v13.4S // ..............................................................e............................ - // gap // ........................................................................................... - trn1 v9.4S, v20.4S, v6.4S // ...........................................................e............................... - // gap // ........................................................................................... - trn2 v20.4S, v20.4S, v6.4S // ............................................................e.............................. - // gap // ........................................................................................... - sqdmulh v6.8H, v9.8H, v7.H[1] // ...............................................................e........................... - // gap // ........................................................................................... - sqdmulh v5.8H, v20.8H, v7.H[1] // ..................................................................e........................ 
- // gap // ........................................................................................... - sqdmulh v0.8H, v10.8H, v7.H[1] // .....................................................................e..................... - // gap // ........................................................................................... - sqdmulh v26.8H, v13.8H, v7.H[1] // ........................................................................e.................. - // gap // ........................................................................................... - srshr v6.8H, v6.8H, #11 // ................................................................e.......................... - // gap // ........................................................................................... - srshr v5.8H, v5.8H, #11 // ...................................................................e....................... - // gap // ........................................................................................... - srshr v0.8H, v0.8H, #11 // ......................................................................e.................... - // gap // ........................................................................................... - mls v9.8H, v6.8H, v7.H[0] // .................................................................e......................... - // gap // ........................................................................................... - srshr v6.8H, v26.8H, #11 // .........................................................................e................. - // gap // ........................................................................................... - mls v20.8H, v5.8H, v7.H[0] // ....................................................................e...................... - // gap // ........................................................................................... 
- mls v10.8H, v0.8H, v7.H[0] // .......................................................................e................... - // gap // ........................................................................................... - mls v13.8H, v6.8H, v7.H[0] // ..........................................................................e................ - // gap // ........................................................................................... - - // original source code - // ldr_vo v8, x1, 0 // e...................................................................................................... || e................................................................................................. - // ldr_vo v9, x1, 16 // ...e................................................................................................... || ..e............................................................................................... - // ldr_vo v10, x1, 32 // ............e.......................................................................................... || ........e......................................................................................... - // ldr_vo v11, x1, 48 // ......e................................................................................................ || ....e............................................................................................. - // ldr_vi v0, x3, 16 // .........e............................................................................................. || ......e........................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // ...............e....................................................................................... || ............e..................................................................................... 
- // sqrdmulh v10.8H, v10.8H, v0.H[1] // ................e...................................................................................... || .............e.................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ...................e................................................................................... || .................e................................................................................ - // sub v10.8H, v8.8H, v24.8H // ......................e................................................................................ || .....................e............................................................................ - // add v8.8H, v8.8H, v24.8H // .......................e............................................................................... || ......................e........................................................................... - // mul v24.8H, v11.8H, v0.H[0] // .............e......................................................................................... || ..........e....................................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[1] // ..............e........................................................................................ || ...........e...................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ..................e.................................................................................... || ................e................................................................................. - // sub v11.8H, v9.8H, v24.8H // ........................e.............................................................................. || .......................e.......................................................................... 
- // add v9.8H, v9.8H, v24.8H // .....................e................................................................................. || ....................e............................................................................. - // mul v24.8H, v9.8H, v0.H[2] // .........................e............................................................................. || ........................e......................................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ..........................e............................................................................ || .........................e........................................................................ - // mls v24.8H, v9.8H, v7.H[0] // ..............................e........................................................................ || ..............................e................................................................... - // sub v9.8H, v8.8H, v24.8H // .................................e..................................................................... || ..................................e............................................................... - // add v8.8H, v8.8H, v24.8H // ..................................e.................................................................... || ...................................e.............................................................. - // mul v24.8H, v11.8H, v0.H[4] // ...........................e........................................................................... || ..........................e....................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ............................e.......................................................................... || ...........................e...................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ...............................e....................................................................... || ...............................e.................................................................. - // sub v11.8H, v10.8H, v24.8H // ...................................e................................................................... || ....................................e............................................................. - // add v10.8H, v10.8H, v24.8H // ....................................e.................................................................. || .....................................e............................................................ - // trn1 v25.4S, v8.4S, v9.4S // .....................................e................................................................. || ......................................e........................................................... - // trn2 v26.4S, v8.4S, v9.4S // ......................................e................................................................ || .......................................e.......................................................... - // trn1 v27.4S, v10.4S, v11.4S // ..........................................e............................................................ || ............................................e..................................................... - // trn2 v28.4S, v10.4S, v11.4S // .......................................e............................................................... || ........................................e......................................................... - // trn2 v10.2D, v25.2D, v27.2D // .............................................e......................................................... || ...............................................e.................................................. 
- // trn2 v11.2D, v26.2D, v28.2D // .........................................e............................................................. || ...........................................e...................................................... - // trn1 v8.2D, v25.2D, v27.2D // ..........................................................e............................................ || .............................................................e.................................... - // trn1 v9.2D, v26.2D, v28.2D // ..............................................e........................................................ || ................................................e................................................. - // ldr_vi v0, x4, 96 // .................e..................................................................................... || ..............e................................................................................... - // ldr_vo v4, x4, -80 // ....................e.................................................................................. || ..................e............................................................................... - // ldr_vo v1, x4, -64 // .............................e......................................................................... || ............................e..................................................................... - // ldr_vo v5, x4, -48 // ................................e...................................................................... || ................................e................................................................. - // ldr_vo v2, x4, -32 // ........................................e.............................................................. || .........................................e........................................................ 
- // ldr_vo v6, x4, -16 // ..................................................e.................................................... || ....................................................e............................................. - // mul v24.8H, v10.8H, v0.8H // .................................................e..................................................... || ...................................................e.............................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // ...............................................e....................................................... || .................................................e................................................ - // mls v24.8H, v10.8H, v7.H[0] // ....................................................e.................................................. || .......................................................e.......................................... - // sub v10.8H, v8.8H, v24.8H // ............................................................e.......................................... || ...............................................................e.................................. - // add v8.8H, v8.8H, v24.8H // ..............................................................e........................................ || .................................................................e................................ - // mul v24.8H, v11.8H, v0.8H // ...........................................e........................................................... || .............................................e.................................................... - // sqrdmulh v11.8H, v11.8H, v4.8H // ............................................e.......................................................... || ..............................................e................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ................................................e...................................................... || ..................................................e............................................... - // sub v11.8H, v9.8H, v24.8H // ...................................................e................................................... || ......................................................e........................................... - // add v9.8H, v9.8H, v24.8H // .....................................................e................................................. || ........................................................e......................................... - // mul v24.8H, v9.8H, v1.8H // ........................................................e.............................................. || ...........................................................e...................................... - // sqrdmulh v9.8H, v9.8H, v5.8H // .........................................................e............................................. || ............................................................e..................................... - // mls v24.8H, v9.8H, v7.H[0] // .............................................................e......................................... || ................................................................e................................. - // sub v9.8H, v8.8H, v24.8H // .................................................................e..................................... || ....................................................................e............................. - // add v8.8H, v8.8H, v24.8H // ..................................................................e.................................... || .....................................................................e............................ 
- // mul v24.8H, v11.8H, v2.8H // ......................................................e................................................ || .........................................................e........................................ - // sqrdmulh v11.8H, v11.8H, v6.8H // .......................................................e............................................... || ..........................................................e....................................... - // mls v24.8H, v11.8H, v7.H[0] // ...........................................................e........................................... || ..............................................................e................................... - // sub v11.8H, v10.8H, v24.8H // ...............................................................e....................................... || ..................................................................e............................... - // add v10.8H, v10.8H, v24.8H // ................................................................e...................................... || ...................................................................e.............................. - // trn1 v25.4S, v8.4S, v9.4S // .....................................................................e................................. || ........................................................................e......................... - // trn2 v26.4S, v8.4S, v9.4S // ......................................................................e................................ || .........................................................................e........................ - // trn1 v27.4S, v10.4S, v11.4S // ...................................................................e................................... || ......................................................................e........................... 
- // trn2 v28.4S, v10.4S, v11.4S // ....................................................................e.................................. || .......................................................................e.......................... - // sqdmulh v24.8H, v25.8H, v7.H[1] // .......................................................................e............................... || ..........................................................................e....................... - // srshr v24.8H, v24.8H, #11 // ...........................................................................e........................... || ..............................................................................e................... - // mls v25.8H, v24.8H, v7.H[0] // ..............................................................................e........................ || .................................................................................e................ - // sqdmulh v24.8H, v26.8H, v7.H[1] // ........................................................................e.............................. || ...........................................................................e...................... - // srshr v24.8H, v24.8H, #11 // ............................................................................e.......................... || ...............................................................................e.................. - // mls v26.8H, v24.8H, v7.H[0] // ................................................................................e...................... || ...................................................................................e.............. - // sqdmulh v24.8H, v27.8H, v7.H[1] // .........................................................................e............................. || ............................................................................e..................... 
- // srshr v24.8H, v24.8H, #11 // .............................................................................e......................... || ................................................................................e................. - // mls v27.8H, v24.8H, v7.H[0] // .................................................................................e..................... || ....................................................................................e............. - // sqdmulh v24.8H, v28.8H, v7.H[1] // ..........................................................................e............................ || .............................................................................e.................... - // srshr v24.8H, v24.8H, #11 // ...............................................................................e....................... || ..................................................................................e............... - // mls v28.8H, v24.8H, v7.H[0] // ..................................................................................e.................... || .....................................................................................e............ - // vext x10, v25, 0 // ...................................................................................*................... || ......................................................................................*........... - // vext x11, v25, 1 // ....................................................................................*.................. || ......................................................................................*........... - // vext x12, v26, 0 // .....................................................................................*................. || .......................................................................................*.......... 
- // vext x13, v26, 1 // ......................................................................................*................ || .......................................................................................*.......... - // vext x14, v27, 0 // .......................................................................................*............... || ........................................................................................*......... - // vext x15, v27, 1 // ........................................................................................*.............. || ........................................................................................*......... - // vext x16, v28, 0 // .........................................................................................*............. || .........................................................................................*........ - // vext x17, v28, 1 // ..........................................................................................*............ || .........................................................................................*........ - // str x10, [x1] , #64 // ............................................................................................*.......... || ..........................................................................................*....... - // str x14, [x1, #-56] // .............................................................................................*......... || ...........................................................................................*...... - // str x12, [x1, #-48] // ...............................................................................................*....... || ............................................................................................*..... - // str x16, [x1, #-40] // ................................................................................................*...... 
|| .............................................................................................*.... - // str x11, [x1, #-32] // ..................................................................................................*.... || ..............................................................................................*... - // str x15, [x1, #-24] // ...................................................................................................*... || ...............................................................................................*.. - // str x13, [x1, #-16] // .....................................................................................................*. || ................................................................................................*. - // str x17, [x1, #-8] // ......................................................................................................* || .................................................................................................* - - subs count, count, #1 + // Instructions: 91 + // Expected cycles: 90 + // Expected IPC: 1.01 + // + // Cycle bound: 90.0 + // IPC bound: 1.01 + // + // Wall time: 45.80s + // User time: 45.80s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + ldr q11, [x3], #16 // ....e...................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q17, [x1, #112] // ...e....................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q23, [x1, #96] // ..e........................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v18.8H, v17.8H, v11.H[1] // ..........e................................................................................ + // gap // ........................................................................................... + mul v15.8H, v17.8H, v11.H[0] // ...........e............................................................................... + // gap // ........................................................................................... + mul v10.8H, v23.8H, v11.H[0] // ......e.................................................................................... + // gap // ........................................................................................... + ldr q3, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v15.8H, v18.8H, v7.H[0] // ............e.............................................................................. 
+ // gap // ........................................................................................... + sqrdmulh v30.8H, v23.8H, v11.H[1] // .....e..................................................................................... + // gap // ........................................................................................... + ldr q19, [x1, #64] // e.......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v13.8H, v3.8H, v15.8H // .............e............................................................................. + // gap // ........................................................................................... + add v15.8H, v3.8H, v15.8H // ..............e............................................................................ + // gap // ........................................................................................... + mls v10.8H, v30.8H, v7.H[0] // .......e................................................................................... + // gap // ........................................................................................... + sqrdmulh v20.8H, v13.8H, v11.H[5] // ....................e...................................................................... + // gap // ........................................................................................... + mul v1.8H, v13.8H, v11.H[4] // .....................e..................................................................... + // gap // ........................................................................................... 
+ mul v5.8H, v15.8H, v11.H[2] // ................e.......................................................................... + // gap // ........................................................................................... + sqrdmulh v12.8H, v15.8H, v11.H[3] // ...............e........................................................................... + // gap // ........................................................................................... + add v28.8H, v19.8H, v10.8H // .........e................................................................................. + // gap // ........................................................................................... + mls v1.8H, v20.8H, v7.H[0] // ......................e.................................................................... + // gap // ........................................................................................... + sub v19.8H, v19.8H, v10.8H // ........e.................................................................................. + // gap // ........................................................................................... + mls v5.8H, v12.8H, v7.H[0] // .................e......................................................................... + // gap // ........................................................................................... + mls v25.8H, v16.8H, v7.H[0] // .......................................................................*................... + // gap // ........................................................................................... + sub v21.8H, v19.8H, v1.8H // .......................e................................................................... + // gap // ........................................................................................... + add v11.8H, v19.8H, v1.8H // ........................e.................................................................. 
+ // gap // ........................................................................................... + sub v24.8H, v28.8H, v5.8H // ..................e........................................................................ + // gap // ........................................................................................... + add v23.8H, v28.8H, v5.8H // ...................e....................................................................... + // gap // ........................................................................................... + trn1 v10.4S, v11.4S, v21.4S // ...........................e............................................................... + // gap // ........................................................................................... + trn2 v18.4S, v11.4S, v21.4S // ............................e.............................................................. + // gap // ........................................................................................... + trn2 v29.4S, v23.4S, v24.4S // ..........................e................................................................ + // gap // ........................................................................................... + trn1 v13.4S, v23.4S, v24.4S // .........................e................................................................. + // gap // ........................................................................................... + ldr q21, [x4], #(6*16) // .................................e......................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ trn2 v30.2D, v13.2D, v10.2D // .............................e............................................................. + // gap // ........................................................................................... + trn2 v2.2D, v29.2D, v18.2D // ..............................e............................................................ + // gap // ........................................................................................... + mul v22.8H, v30.8H, v21.8H // ........................................e.................................................. + // gap // ........................................................................................... + ldr q23, [x4, #-80] // ..................................e........................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q14, [x4, #-16] // ......................................e.................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v3.8H, v30.8H, v23.8H // .......................................e................................................... + // gap // ........................................................................................... + mul v30.8H, v2.8H, v21.8H // .............................................e............................................. 
+ // gap // ........................................................................................... + sqrdmulh v15.8H, v2.8H, v23.8H // ............................................e.............................................. + // gap // ........................................................................................... + sqdmulh v19.8H, v31.8H, v7.H[1] // ........................................................................*.................. + // gap // ........................................................................................... + mls v22.8H, v3.8H, v7.H[0] // .........................................e................................................. + // gap // ........................................................................................... + trn1 v2.2D, v13.2D, v10.2D // ...............................e........................................................... + // gap // ........................................................................................... + mls v30.8H, v15.8H, v7.H[0] // ..............................................e............................................ + // gap // ........................................................................................... + trn1 v8.2D, v29.2D, v18.2D // ................................e.......................................................... + // gap // ........................................................................................... + sub v12.8H, v2.8H, v22.8H // ..........................................e................................................ + // gap // ........................................................................................... + umov x23, v25.d[0] // ...............................................................................*........... + umov x29, v25.d[1] // ................................................................................*.......... 
+ sub v6.8H, v8.8H, v30.8H // ...............................................e........................................... + // gap // ........................................................................................... + add v1.8H, v8.8H, v30.8H // ................................................e.......................................... + // gap // ........................................................................................... + ldr q25, [x4, #-64] // ...................................e....................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q21, [x4, #-48] // ....................................e...................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v18.8H, v0.8H, v7.H[1] // ..................................................................*........................ + // gap // ........................................................................................... + mul v29.8H, v1.8H, v25.8H // ..................................................e........................................ + // gap // ........................................................................................... + sqrdmulh v20.8H, v1.8H, v21.8H // .................................................e......................................... 
+ // gap // ........................................................................................... + add v30.8H, v2.8H, v22.8H // ...........................................e............................................... + // gap // ........................................................................................... + ldr q28, [x4, #-32] // .....................................e..................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v29.8H, v20.8H, v7.H[0] // ...................................................e....................................... + // gap // ........................................................................................... + sqrdmulh v20.8H, v6.8H, v14.8H // ......................................................e.................................... + // gap // ........................................................................................... + mul v11.8H, v6.8H, v28.8H // .......................................................e................................... + // gap // ........................................................................................... + mls v26.8H, v9.8H, v7.H[0] // .................................................................*......................... + // gap // ........................................................................................... + sub v24.8H, v30.8H, v29.8H // ....................................................e...................................... + // gap // ........................................................................................... 
+ add v8.8H, v30.8H, v29.8H // .....................................................e..................................... + // gap // ........................................................................................... + mls v11.8H, v20.8H, v7.H[0] // ........................................................e.................................. + // gap // ........................................................................................... + umov x18, v26.d[0] // ...........................................................................*............... + umov x19, v26.d[1] // ............................................................................*.............. + trn1 v26.4S, v8.4S, v24.4S // ...........................................................e............................... + // gap // ........................................................................................... + srshr v4.8H, v18.8H, #11 // ...................................................................*....................... + str x19, [x1, #32] // .......................................................................................*... + add v6.8H, v12.8H, v11.8H // ..........................................................e................................ + // gap // ........................................................................................... + sub v29.8H, v12.8H, v11.8H // .........................................................e................................. + // gap // ........................................................................................... + srshr v18.8H, v19.8H, #11 // .........................................................................*................. + // gap // ........................................................................................... + mls v0.8H, v4.8H, v7.H[0] // ....................................................................*...................... 
+ // gap // ........................................................................................... + trn1 v25.4S, v6.4S, v29.4S // .............................................................e............................. + // gap // ........................................................................................... + sqdmulh v4.8H, v26.8H, v7.H[1] // ...............................................................e........................... + str x23, [x1, #8] // ....................................................................................*...... + mls v31.8H, v18.8H, v7.H[0] // ..........................................................................*................ + str x18, [x1], #( 16*4) // ...................................................................................*....... + umov x21, v0.d[0] // .............................................................................*............. + umov x18, v0.d[1] // ..............................................................................*............ + sqdmulh v21.8H, v25.8H, v7.H[1] // .....................................................................e..................... + str x29, [x1, #-24] // ........................................................................................*.. + srshr v9.8H, v4.8H, #11 // ................................................................e.......................... + str x18, [x1, #-16] // .........................................................................................*. + umov x14, v31.d[0] // .................................................................................*......... + umov x18, v31.d[1] // ..................................................................................*........ + trn2 v31.4S, v6.4S, v29.4S // ..............................................................e............................ 
+ str x21, [x1, #-48] // .....................................................................................*..... + trn2 v0.4S, v8.4S, v24.4S // ............................................................e.............................. + str x18, [x1, #-8] // ..........................................................................................* + srshr v16.8H, v21.8H, #11 // ......................................................................e.................... + str x14, [x1, #-40] // ......................................................................................*.... + + // ----------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q8, [x1, #(16*0)] // .........e.................................................................................'........~................................................................................. + // ldr q9, [x1, #(16*1)] // ......e....................................................................................'.....~.................................................................................... + // ldr q10, [x1, #(16*2)] // ..e........................................................................................'.~........................................................................................ + // ldr q11, [x1, #(16*3)] // .e.........................................................................................'~......................................................................................... 
+ // ldr q0, [x3], #16 // e..........................................................................................~.......................................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ........e..................................................................................'.......~.................................................................................. + // mul v24.8h, v10.8h, v0.h[0] // .....e.....................................................................................'....~..................................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............e..............................................................................'...........~.............................................................................. + // sub v10.8h, v8.8h, v24.8h // ...................e.......................................................................'..................~....................................................................... + // add v8.8h, v8.8h, v24.8h // .................e.........................................................................'................~......................................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...e.......................................................................................'..~....................................................................................... + // mul v24.8h, v11.8h, v0.h[0] // ....e......................................................................................'...~...................................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......e...................................................................................'......~................................................................................... 
+ // sub v11.8h, v9.8h, v24.8h // ..........e................................................................................'.........~................................................................................ + // add v9.8h, v9.8h, v24.8h // ...........e...............................................................................'..........~............................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ................e..........................................................................'...............~.......................................................................... + // mul v24.8h, v9.8h, v0.h[2] // ...............e...........................................................................'..............~........................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................e......................................................................'...................~...................................................................... + // sub v9.8h, v8.8h, v24.8h // ........................e..................................................................'.......................~.................................................................. + // add v8.8h, v8.8h, v24.8h // .........................e.................................................................'........................~................................................................. + // sqrdmulh v27.8h, v11.8h, v0.h[5] // .............e.............................................................................'............~............................................................................. + // mul v24.8h, v11.8h, v0.h[4] // ..............e............................................................................'.............~............................................................................ 
+ // mls v24.8h, v27.8h, v7.h[0] // ..................e........................................................................'.................~........................................................................ + // sub v11.8h, v10.8h, v24.8h // ......................e....................................................................'.....................~.................................................................... + // add v10.8h, v10.8h, v24.8h // .......................e...................................................................'......................~................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .............................e.............................................................'............................~............................................................. + // trn2 v26.4s, v8.4s, v9.4s // ............................e..............................................................'...........................~.............................................................. + // trn1 v27.4s, v10.4s, v11.4s // ..........................e................................................................'.........................~................................................................ + // trn2 v28.4s, v10.4s, v11.4s // ...........................e...............................................................'..........................~............................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...............................e...........................................................'..............................~........................................................... + // trn2 v11.2d, v26.2d, v28.2d // ................................e..........................................................'...............................~.......................................................... 
+ // trn1 v8.2d, v25.2d, v27.2d // .........................................e.................................................'........................................~................................................. + // trn1 v9.2d, v26.2d, v28.2d // ...........................................e...............................................'..........................................~............................................... + // ldr q0, [ x4], #(6*16) // ..............................e............................................................'.............................~............................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // ..................................e........................................................'.................................~........................................................ + // ldr q1, [ x4, #(-6*16 + 2*16)] // .................................................e.........................................'................................................~......................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..................................................e........................................'.................................................~........................................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // .......................................................e...................................'......................................................~................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ...................................e.......................................................'..................................~....................................................... + // sqrdmulh v27.8h, v10.8h, v4.8h // ....................................e......................................................'...................................~...................................................... 
+ // mul v24.8h, v10.8h, v0.8h // .................................e.........................................................'................................~......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................e..................................................'.......................................~.................................................. + // sub v10.8h, v8.8h, v24.8h // ............................................e..............................................'...........................................~.............................................. + // add v8.8h, v8.8h, v24.8h // ......................................................e....................................'.....................................................~.................................... + // sqrdmulh v27.8h, v11.8h, v4.8h // ......................................e....................................................'.....................................~.................................................... + // mul v24.8h, v11.8h, v0.8h // .....................................e.....................................................'....................................~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................e................................................'.........................................~................................................ + // sub v11.8h, v9.8h, v24.8h // ...............................................e...........................................'..............................................~........................................... + // add v9.8h, v9.8h, v24.8h // ................................................e..........................................'...............................................~.......................................... 
+ // sqrdmulh v27.8h, v9.8h, v5.8h // .....................................................e.....................................'....................................................~..................................... + // mul v24.8h, v9.8h, v1.8h // ....................................................e......................................'...................................................~...................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................................e..................................'.......................................................~.................................. + // sub v9.8h, v8.8h, v24.8h // ............................................................e..............................'...........................................................~.............................. + // add v8.8h, v8.8h, v24.8h // .............................................................e.............................'............................................................~............................. + // sqrdmulh v27.8h, v11.8h, v6.8h // .........................................................e.................................'........................................................~................................. + // mul v24.8h, v11.8h, v2.8h // ..........................................................e................................'.........................................................~................................ + // mls v24.8h, v27.8h, v7.h[0] // ..............................................................e............................'.............................................................~............................ + // sub v11.8h, v10.8h, v24.8h // .....................................................................e.....................'....................................................................~..................... 
+ // add v10.8h, v10.8h, v24.8h // ....................................................................e......................'...................................................................~...................... + // trn1 v25.4s, v8.4s, v9.4s // .................................................................e.........................'................................................................~......................... + // trn2 v26.4s, v8.4s, v9.4s // .......................................................................................e...'......................................................................................~... + // trn1 v27.4s, v10.4s, v11.4s // ........................................................................e..................'.......................................................................~.................. + // trn2 v28.4s, v10.4s, v11.4s // .....................................................................................e.....'....................................................................................~..... + // sqdmulh v24.8h, v25.8h, v7.h[1] // .........................................................................e.................'........................................................................~................. + // srshr v24.8h, v24.8h, #11 // .................................................................................e.........'................................................................................~......... + // mls v25.8h, v24.8h, v7.h[0] // ...........................................................~...............................'..........................................................*............................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // ...................................................~.......................................'..................................................*....................................... 
+ // srshr v24.8h, v24.8h, #11 // ..................................................................~........................'.................................................................*........................ + // mls v26.8h, v24.8h, v7.h[0] // .......................................................................~...................'......................................................................*................... + // sqdmulh v24.8h, v27.8h, v7.h[1] // ...............................................................................e...........'..............................................................................~........... + // srshr v24.8h, v24.8h, #11 // .........................................................................................e.'........................................................................................~. + // mls v27.8h, v24.8h, v7.h[0] // .....................~.....................................................................'....................*..................................................................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // .......................................~...................................................'......................................*................................................... + // srshr v24.8h, v24.8h, #11 // ......................................................................~....................'.....................................................................*.................... + // mls v28.8h, v24.8h, v7.h[0] // ...........................................................................~...............'..........................................................................*............... + // umov x10, v25.d[0] // ...............................................................~...........................'..............................................................*........................... 
+ // umov x11, v25.d[1] // ................................................................~..........................'...............................................................*.......................... + // umov x12, v26.d[0] // .............................................................................~.............'............................................................................*............. + // umov x13, v26.d[1] // ..............................................................................~............'.............................................................................*............ + // umov x14, v27.d[0] // .............................................~.............................................'............................................*............................................. + // umov x15, v27.d[1] // ..............................................~............................................'.............................................*............................................ + // umov x16, v28.d[0] // ...................................................................................~.......'..................................................................................*....... + // umov x17, v28.d[1] // ....................................................................................~......'...................................................................................*...... + // str x10, [x1], #( 16*4) // ............................................................................~..............'...........................................................................*.............. + // str x14, [x1, #(-16*4 + 8*1)] // ..........................................................................~................'.........................................................................*................ 
+ // str x12, [x1, #(-16*4 + 8*2)] // ......................................................................................~....'.....................................................................................*.... + // str x16, [x1, #(-16*4 + 8*3)] // ..........................................................................................~'.........................................................................................* + // str x11, [x1, #(-16*4 + 8*4)] // ...................................................................~.......................'..................................................................*....................... + // str x15, [x1, #(-16*4 + 8*5)] // ................................................................................~..........'...............................................................................*.......... + // str x13, [x1, #(-16*4 + 8*6)] // ..................................................................................~........'.................................................................................*........ + // str x17, [x1, #(-16*4 + 8*7)] // ........................................................................................~..'.......................................................................................*.. + + sub count, count, #1 cbnz count, layer4567_start - vext x10, v9, 0 // *............... - // gap // ................ - vext x11, v9, 1 // .*.............. - // gap // ................ - vext x12, v20, 0 // ..*............. - str x10, [x1] , #64 // ........*....... - vext x13, v20, 1 // ...*............ - str x11, [x1, #-32] // ............*... - vext x14, v10, 0 // ....*........... - str x12, [x1, #-48] // ..........*..... - vext x15, v10, 1 // .....*.......... - str x13, [x1, #-16] // ..............*. - vext x16, v13, 0 // ......*......... - str x14, [x1, #-56] // .........*...... - vext x17, v13, 1 // .......*........ - str x15, [x1, #-24] // .............*.. 
- str x16, [x1, #-40] // ...........*.... - // gap // ................ - str x17, [x1, #-8] // ...............* - // gap // ................ - - // original source code - // vext x10, v9, 0 // *............... || *......... - // vext x11, v9, 1 // .*.............. || .*........ - // vext x12, v20, 0 // ..*............. || ..*....... - // vext x13, v20, 1 // ....*........... || ...*...... - // vext x14, v10, 0 // ......*......... || ....*..... - // vext x15, v10, 1 // ........*....... || .....*.... - // vext x16, v13, 0 // ..........*..... || ......*... - // vext x17, v13, 1 // ............*... || .......*.. - // str x10, [x1] , #64 // ...*............ || ..*....... - // str x14, [x1, #-56] // ...........*.... || ......*... - // str x12, [x1, #-48] // .......*........ || ....*..... - // str x16, [x1, #-40] // ..............*. || ........*. - // str x11, [x1, #-32] // .....*.......... || ...*...... - // str x15, [x1, #-24] // .............*.. || .......*.. - // str x13, [x1, #-16] // .........*...... || .....*.... - // str x17, [x1, #-8] // ...............* || .........* - + // Instructions: 24 + // Expected cycles: 17 + // Expected IPC: 1.41 + // + // Cycle bound: 17.0 + // IPC bound: 1.41 + // + // Wall time: 0.12s + // User time: 0.12s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sqdmulh v18.8H, v31.8H, v7.H[1] // .*............................ + // gap // .............................. + mls v25.8H, v16.8H, v7.H[0] // *............................. + // gap // .............................. + sqdmulh v29.8H, v0.8H, v7.H[1] // ....*......................... + // gap // .............................. + mls v26.8H, v9.8H, v7.H[0] // .....*........................ + // gap // .............................. + srshr v27.8H, v18.8H, #11 // ..........*................... + // gap // .............................. + umov x22, v25.d[0] // ..*........................... + umov x18, v25.d[1] // ...*.......................... 
+ srshr v18.8H, v29.8H, #11 // ........*..................... + // gap // .............................. + mls v31.8H, v27.8H, v7.H[0] // .............*................ + str x22, [x1, #8] // ............*................. + str x18, [x1, #40] // .................*............ + umov x18, v26.d[0] // ......*....................... + mls v0.8H, v18.8H, v7.H[0] // ...........*.................. + // gap // .............................. + str x18, [x1], #( 16*4) // ..............*............... + umov x18, v26.d[1] // .......*...................... + umov x12, v31.d[1] // ....................*......... + umov x10, v31.d[0] // ...................*.......... + str x18, [x1, #-32] // .........*.................... + // gap // .............................. + str x12, [x1, #-8] // ......................*....... + umov x18, v0.d[0] // ...............*.............. + umov x25, v0.d[1] // ................*............. + str x10, [x1, #-40] // .......................*...... + str x18, [x1, #-48] // .....................*........ + // gap // .............................. + str x25, [x1, #-16] // ..................*........... + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v25.8H, v16.8H, v7.H[0] // .*............................. + // sqdmulh v19.8H, v31.8H, v7.H[1] // *.............................. + // umov x23, v25.d[0] // .....*......................... + // umov x29, v25.d[1] // ......*........................ + // sqdmulh v18.8H, v0.8H, v7.H[1] // ..*............................ + // mls v26.8H, v9.8H, v7.H[0] // ...*........................... + // umov x18, v26.d[0] // ...........*................... + // umov x19, v26.d[1] // ..............*................ + // srshr v4.8H, v18.8H, #11 // .......*....................... + // str x19, [x1, #32] // .................*............. + // srshr v18.8H, v19.8H, #11 // ....*.......................... 
+ // mls v0.8H, v4.8H, v7.H[0] // ............*.................. + // str x23, [x1, #8] // .........*..................... + // mls v31.8H, v18.8H, v7.H[0] // ........*...................... + // str x18, [x1], #( 16*4) // .............*................. + // umov x21, v0.d[0] // ...................*........... + // umov x18, v0.d[1] // ....................*.......... + // str x29, [x1, #-24] // ..........*.................... + // str x18, [x1, #-16] // .......................*....... + // umov x14, v31.d[0] // ................*.............. + // umov x18, v31.d[1] // ...............*............... + // str x21, [x1, #-48] // ......................*........ + // str x18, [x1, #-8] // ..................*............ + // str x14, [x1, #-40] // .....................*......... + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a72.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a72.s index 9e29feff..1f889bff 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a72.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_a72.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. -// -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset // slothy:no-unfold - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc // slothy:no-unfold - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset // slothy:no-unfold - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc // slothy:no-unfold - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -66,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -83,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -97,48 +68,48 @@ .macro barrett_reduce a vqdmulhq tmp, \a, consts, 1 - srshr tmp.8H, tmp.8H, #11 + srshr tmp.8h, tmp.8h, #11 vmlsq \a, tmp, consts, 0 .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, 
[r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data - trn1 t0.4s, \data\()0\().4s, \data\()1\().4s - trn2 t1.4s, \data\()0\().4s, \data\()1\().4s - trn1 t2.4s, \data\()2\().4s, \data\()3\().4s - trn2 t3.4s, \data\()2\().4s, \data\()3\().4s + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s - trn2 \data\()2\().2d, t0.2d, t2.2d - trn2 \data\()3\().2d, t1.2d, t3.2d - trn1 \data\()0\().2d, t0.2d, t2.2d - trn1 \data\()1\().2d, t1.2d, t3.2d + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d .endm .macro transpose_single data_out, data_in - trn1 \data_out\()0\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn2 \data_out\()1\().4s, \data_in\()0\().4s, \data_in\()1\().4s - trn1 \data_out\()2\().4s, \data_in\()2\().4s, \data_in\()3\().4s - trn2 \data_out\()3\().4s, \data_in\()2\().4s, \data_in\()3\().4s + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro vext gpr_out, vec_in, lane // slothy:no-unfold +.macro vext gpr_out, vec_in, lane umov \gpr_out\(), \vec_in\().d[\lane] .endm @@ -165,7 +136,7 @@ .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -176,7 +147,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -186,7 +157,7 @@ add sp, sp, #(16*6) .endm 
-.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -194,7 +165,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -205,19 +176,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -230,7 +201,7 @@ roots: .text .global ntt_kyber_123_4567_scalar_store_opt_a72 - .global _ntt_kyber_123_4567_scalar_store_opt_a72 + .global _ntt_kyber_123_4567_scalar_store .p2align 4 const_addr: .short 3329 @@ -356,1330 +327,1418 @@ _ntt_kyber_123_4567_scalar_store_opt_a72: load_roots_123 .p2align 2 - ldr_vo v23, x0, 192 // ..*......... - ldr_vo v19, x0, 448 // .*.......... - // gap // ............ - ldr_vo v5, x0, 256 // *........... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v25.8H, v19.8H, v0.H[1] // ....*....... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v19.8H, v19.8H, v0.H[0] // ...*........ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v16.8H, v5.8H, v0.H[1] // ........*... - // gap // ............ - // gap // ............ 
- // gap // ............ - // gap // ............ - // gap // ............ - mls v19.8H, v25.8H, v7.H[0] // .....*...... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v20.8H, v5.8H, v0.H[0] // ......*..... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mls v20.8H, v16.8H, v7.H[0] // ...........* - // gap // ............ - // gap // ............ - add v16.8H, v23.8H, v19.8H // .......*.... - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - sqrdmulh v8.8H, v16.8H, v0.H[3] // .........*.. - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - // gap // ............ - mul v2.8H, v16.8H, v0.H[2] // ..........*. - // gap // ............ - // gap // ............ - - // original source code - // ldr_vo v4, x0, 256 // ..*......... || .*................... - // ldr_vo v27, x0, 448 // .*.......... || *.................... - // ldr_vo v23, x0, 192 // *........... || *.................... - // mul v19.8H, v27.8H, v0.H[0] // ....*....... || ......*.............. - // sqrdmulh v8.8H, v27.8H, v0.H[1] // ...*........ || ....*................ - // mls v19.8H, v8.8H, v7.H[0] // ......*..... || ..........*.......... - // mul v20.8H, v4.8H, v0.H[0] // .......*.... || ............*........ - // add v21.8H, v23.8H, v19.8H // .........*.. || ...............*..... - // sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*...... || ........*............ - // sqrdmulh v8.8H, v21.8H, v0.H[3] // ..........*. || ..................*.. - // mul v2.8H, v21.8H, v0.H[2] // ...........* || ....................* - // mls v20.8H, v24.8H, v7.H[0] // ........*... || ..............*...... 
- + // Instructions: 7 + // Expected cycles: 11 + // Expected IPC: 0.64 + // + // Cycle bound: 11.0 + // IPC bound: 0.64 + // + // Wall time: 0.05s + // User time: 0.05s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q16, [x0, #448] // *............................. + // gap // .............................. + // gap // .............................. + ldr q3, [x0, #192] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q15, [x0, #320] // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v6.8H, v16.8H, v0.H[1] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v16.8H, v16.8H, v0.H[0] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mul v25.8H, v15.8H, v0.H[0] // ...*.......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v11.8H, v15.8H, v0.H[1] // .....*........................ + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q2, [x0, #448] // *.............................. 
+ // ldr q22, [x0, #320] // ..*............................ + // ldr q3, [x0, #192] // .*............................. + // mul v25.8H, v22.8H, v0.H[0] // .....*......................... + // sqrdmulh v6.8H, v2.8H, v0.H[1] // ...*........................... + // sqrdmulh v11.8H, v22.8H, v0.H[1] // ......*........................ + // mul v16.8H, v2.8H, v0.H[0] // ....*.......................... + sub count, count, #1 -.p2align 2 layer123_start: - ldr_vo v16, x0, 0 // *........................................................................... - sub v19.8H, v23.8H, v19.8H // ..........................*................................................. - ldr_vo v5, x0, 128 // ..*......................................................................... - ldr_vo v21, x0, 384 // ......*..................................................................... - ldr_vo v24, x0, 64 // .*.......................................................................... - mls v2.8H, v8.8H, v7.H[0] // ...................................*........................................ - ldr_vo v4, x0, 272 // ....e....................................................................... - ldr_vo v8, x0, 320 // .....*...................................................................... - // gap // ............................................................................ - mul v18.8H, v19.8H, v0.H[4] // ...........................................*................................ - ldr_vo v27, x0, 464 // .......e.................................................................... - ldr_vo v23, x0, 208 // ...e........................................................................ - sub v14.8H, v16.8H, v20.8H // ...........*................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- add v16.8H, v16.8H, v20.8H // ............*............................................................... - sqrdmulh v19.8H, v19.8H, v0.H[5] // ............................................*............................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v20.8H, v21.8H, v0.H[1] // ...................*........................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v21.8H, v21.8H, v0.H[0] // ..................*......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v29.8H, v8.8H, v0.H[1] // ..............*............................................................. - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v21.8H, v20.8H, v7.H[0] // ....................*....................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v8.8H, v8.8H, v0.H[0] // .............*.............................................................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v8.8H, v29.8H, v7.H[0] // ...............*............................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sub v20.8H, v5.8H, v21.8H // .....................*...................................................... - // gap // ............................................................................ - // gap // ............................................................................ 
- add v5.8H, v5.8H, v21.8H // ......................*..................................................... - mls v18.8H, v19.8H, v7.H[0] // .............................................*.............................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v19.8H, v27.8H, v0.H[0] // .......................e.................................................... - // gap // ............................................................................ - // gap // ............................................................................ - add v21.8H, v24.8H, v8.8H // .................*.......................................................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v24.8H, v24.8H, v8.8H // ................*........................................................... - sqrdmulh v8.8H, v27.8H, v0.H[1] // ........................e................................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v27.8H, v20.8H, v0.H[5] // .......................................*.................................... - sub v29.8H, v21.8H, v2.8H // ....................................*....................................... 
- // gap // ............................................................................ - add v21.8H, v21.8H, v2.8H // .....................................*...................................... - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v2.8H, v5.8H, v0.H[3] // .............................*.............................................. - sub v25.8H, v24.8H, v18.8H // ..............................................*............................. - // gap // ............................................................................ - add v24.8H, v24.8H, v18.8H // ...............................................*............................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v18.8H, v20.8H, v0.H[4] // ......................................*..................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v18.8H, v27.8H, v7.H[0] // ........................................*................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - mul v5.8H, v5.8H, v0.H[2] // ............................*............................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v2.8H, v7.H[0] // ..............................*............................................. - // gap // ............................................................................ - // gap // ............................................................................ - sub v2.8H, v14.8H, v18.8H // .........................................*.................................. - // gap // ............................................................................ - // gap // ............................................................................ - add v18.8H, v14.8H, v18.8H // ..........................................*................................. - mul v27.8H, v21.8H, v0.H[6] // ................................................*........................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v21.8H, v21.8H, v0.H[7] // .................................................*.......................... - // gap // ............................................................................ 
- // gap // ............................................................................ - sub v14.8H, v16.8H, v5.8H // ...............................*............................................ - // gap // ............................................................................ - // gap // ............................................................................ - add v16.8H, v16.8H, v5.8H // ................................*........................................... - mul v5.8H, v29.8H, v1.H[0] // .....................................................*...................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v20.8H, v29.8H, v1.H[1] // ......................................................*..................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v27.8H, v21.8H, v7.H[0] // ..................................................*......................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - mul v21.8H, v24.8H, v1.H[2] // ..........................................................*................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v5.8H, v20.8H, v7.H[0] // .......................................................*.................... - // gap // ............................................................................ - // gap // ............................................................................ - sub v20.8H, v16.8H, v27.8H // ...................................................*........................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.8H, v24.8H, v1.H[3] // ...........................................................*................ - add v16.8H, v16.8H, v27.8H // ....................................................*....................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - str_vo v20, x0, 64 // .....................................................................*...... 
- mul v27.8H, v25.8H, v1.H[4] // ...............................................................*............ - // gap // ............................................................................ - str_vi v16, x0, 16 // ....................................................................*....... - sub v16.8H, v14.8H, v5.8H // ........................................................*................... - // gap // ............................................................................ - sqrdmulh v20.8H, v25.8H, v1.H[5] // ................................................................*........... - add v5.8H, v14.8H, v5.8H // .........................................................*.................. - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v21.8H, v24.8H, v7.H[0] // ............................................................*............... - str_vo v16, x0, 176 // .......................................................................*.... - // gap // ............................................................................ - str_vo v5, x0, 112 // ......................................................................*..... - // gap // ............................................................................ - // gap // ............................................................................ - mls v19.8H, v8.8H, v7.H[0] // .........................e.................................................. - // gap // ............................................................................ - // gap // ............................................................................ 
- // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mls v27.8H, v20.8H, v7.H[0] // .................................................................*.......... - // gap // ............................................................................ - // gap // ............................................................................ - sub v16.8H, v18.8H, v21.8H // .............................................................*.............. - // gap // ............................................................................ - // gap // ............................................................................ - add v5.8H, v18.8H, v21.8H // ..............................................................*............. - mul v20.8H, v4.8H, v0.H[0] // ........e................................................................... - // gap // ............................................................................ - add v21.8H, v23.8H, v19.8H // ...........................e................................................ - // gap // ............................................................................ - // gap // ............................................................................ - sqrdmulh v24.8H, v4.8H, v0.H[1] // .........e.................................................................. - str_vo v16, x0, 304 // .........................................................................*.. - // gap // ............................................................................ - str_vo v5, x0, 240 // ........................................................................*... - sub v16.8H, v2.8H, v27.8H // ..................................................................*......... 
- // gap // ............................................................................ - add v5.8H, v2.8H, v27.8H // ...................................................................*........ - sqrdmulh v8.8H, v21.8H, v0.H[3] // ..................................e......................................... - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - // gap // ............................................................................ - mul v2.8H, v21.8H, v0.H[2] // .................................e.......................................... - str_vo v16, x0, 432 // ...........................................................................* - // gap // ............................................................................ - str_vo v5, x0, 368 // ..........................................................................*. - // gap // ............................................................................ - // gap // ............................................................................ - mls v20.8H, v24.8H, v7.H[0] // ..........e................................................................. - // gap // ............................................................................ - // gap // ............................................................................ - - // original source code - // ldr_vo v8, x0, 0 // ......................................................................*.......................................................................... || ......................................................................*...................................................................... 
- // ldr_vo v9, x0, 64 // ..........................................................................*...................................................................... || .......................................................................*..................................................................... - // ldr_vo v10, x0, 128 // ........................................................................*........................................................................ || ......................................................................*...................................................................... - // ldr_vo v11, x0, 192 // ....e............................................................................................................................................ || .e........................................................................................................................................... - // ldr_vo v12, x0, 256 // e................................................................................................................................................ || e............................................................................................................................................ - // ldr_vo v13, x0, 320 // .............................................................................*................................................................... || ........................................................................*.................................................................... - // ldr_vo v14, x0, 384 // .........................................................................*....................................................................... || .......................................................................*..................................................................... 
- // ldr_vo v15, x0, 448 // ...e............................................................................................................................................. || .e........................................................................................................................................... - // mul v24.8H, v12.8H, v0.H[0] // ..........................................................e...................................................................................... || .............................................................e............................................................................... - // sqrdmulh v12.8H, v12.8H, v0.H[1] // ............................................................e.................................................................................... || ...............................................................e............................................................................. - // mls v24.8H, v12.8H, v7.H[0] // .....................................................................e........................................................................... || .....................................................................e....................................................................... - // sub v12.8H, v8.8H, v24.8H // .................................................................................*............................................................... || ..........................................................................*.................................................................. - // add v8.8H, v8.8H, v24.8H // ..................................................................................*.............................................................. || ...........................................................................*................................................................. 
- // mul v24.8H, v13.8H, v0.H[0] // ........................................................................................*........................................................ || .....................................................................................*....................................................... - // sqrdmulh v13.8H, v13.8H, v0.H[1] // ......................................................................................*.......................................................... || .................................................................................*........................................................... - // mls v24.8H, v13.8H, v7.H[0] // .........................................................................................*....................................................... || .......................................................................................*..................................................... - // sub v13.8H, v9.8H, v24.8H // ...............................................................................................*................................................. || .............................................................................................*............................................... - // add v9.8H, v9.8H, v24.8H // ..............................................................................................*.................................................. || ............................................................................................*................................................ - // mul v24.8H, v14.8H, v0.H[0] // .....................................................................................*........................................................... || ...............................................................................*............................................................. 
- // sqrdmulh v14.8H, v14.8H, v0.H[1] // ....................................................................................*............................................................ || .............................................................................*............................................................... - // mls v24.8H, v14.8H, v7.H[0] // .......................................................................................*......................................................... || ...................................................................................*......................................................... - // sub v14.8H, v10.8H, v24.8H // ..........................................................................................*...................................................... || ........................................................................................*.................................................... - // add v10.8H, v10.8H, v24.8H // ...........................................................................................*..................................................... || .........................................................................................*................................................... - // mul v24.8H, v15.8H, v0.H[0] // .................e............................................................................................................................... || ...................e......................................................................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[1] // ....................e............................................................................................................................ || .....................e....................................................................................................................... 
- // mls v24.8H, v15.8H, v7.H[0] // ......................................................e.......................................................................................... || .........................................................e................................................................................... - // sub v15.8H, v11.8H, v24.8H // .......................................................................*......................................................................... || ......................................................................*...................................................................... - // add v11.8H, v11.8H, v24.8H // ...........................................................e..................................................................................... || ..............................................................e.............................................................................. - // mul v24.8H, v10.8H, v0.H[2] // .........................................................................................................*....................................... || .......................................................................................................*..................................... - // sqrdmulh v10.8H, v10.8H, v0.H[3] // ....................................................................................................*............................................ || .................................................................................................*........................................... - // mls v24.8H, v10.8H, v7.H[0] // ..........................................................................................................*...................................... || .........................................................................................................*................................... 
- // sub v10.8H, v8.8H, v24.8H // ...............................................................................................................*................................. || ..............................................................................................................*.............................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................................*................................ || ...............................................................................................................*............................. - // mul v24.8H, v11.8H, v0.H[2] // ..................................................................e.............................................................................. || ...................................................................e......................................................................... - // sqrdmulh v11.8H, v11.8H, v0.H[3] // .................................................................e............................................................................... || .................................................................e........................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...........................................................................*..................................................................... || .......................................................................*..................................................................... - // sub v11.8H, v9.8H, v24.8H // ..................................................................................................*.............................................. || ...............................................................................................*............................................. 
- // add v9.8H, v9.8H, v24.8H // ...................................................................................................*............................................. || ................................................................................................*............................................ - // mul v24.8H, v14.8H, v0.H[4] // .......................................................................................................*......................................... || ...................................................................................................*......................................... - // sqrdmulh v14.8H, v14.8H, v0.H[5] // .................................................................................................*............................................... || ...............................................................................................*............................................. - // mls v24.8H, v14.8H, v7.H[0] // ........................................................................................................*........................................ || .....................................................................................................*....................................... - // sub v14.8H, v12.8H, v24.8H // ...........................................................................................................*..................................... || ..........................................................................................................*.................................. - // add v12.8H, v12.8H, v24.8H // ............................................................................................................*.................................... || ...........................................................................................................*................................. 
- // mul v24.8H, v15.8H, v0.H[4] // ..............................................................................*.................................................................. || .........................................................................*................................................................... - // sqrdmulh v15.8H, v15.8H, v0.H[5] // ...................................................................................*............................................................. || ...........................................................................*................................................................. - // mls v24.8H, v15.8H, v7.H[0] // ............................................................................................*.................................................... || .........................................................................................*................................................... - // sub v15.8H, v13.8H, v24.8H // .....................................................................................................*........................................... || .................................................................................................*........................................... - // add v13.8H, v13.8H, v24.8H // ......................................................................................................*.......................................... || ..................................................................................................*.......................................... - // mul v24.8H, v9.8H, v0.H[6] // .............................................................................................................*................................... || ...........................................................................................................*................................. 
- // sqrdmulh v9.8H, v9.8H, v0.H[7] // ..............................................................................................................*.................................. || .............................................................................................................*............................... - // mls v24.8H, v9.8H, v7.H[0] // ...................................................................................................................*............................. || ...................................................................................................................*......................... - // sub v9.8H, v8.8H, v24.8H // ......................................................................................................................*.......................... || ........................................................................................................................*.................... - // add v8.8H, v8.8H, v24.8H // ........................................................................................................................*........................ || .........................................................................................................................*................... - // mul v24.8H, v11.8H, v1.H[0] // .................................................................................................................*............................... || ...............................................................................................................*............................. - // sqrdmulh v11.8H, v11.8H, v1.H[1] // ..................................................................................................................*.............................. || .................................................................................................................*........................... 
- // mls v24.8H, v11.8H, v7.H[0] // .....................................................................................................................*........................... || .......................................................................................................................*..................... - // sub v11.8H, v10.8H, v24.8H // ............................................................................................................................*.................... || ............................................................................................................................*................ - // add v10.8H, v10.8H, v24.8H // ..............................................................................................................................*.................. || .............................................................................................................................*............... - // mul v24.8H, v13.8H, v1.H[2] // ....................................................................................................................*............................ || .....................................................................................................................*....................... - // sqrdmulh v13.8H, v13.8H, v1.H[3] // .......................................................................................................................*......................... || .........................................................................................................................*................... - // mls v24.8H, v13.8H, v7.H[0] // ...............................................................................................................................*................. || ...............................................................................................................................*............. 
- // sub v13.8H, v12.8H, v24.8H // ....................................................................................................................................*............ || ....................................................................................................................................*........ - // add v12.8H, v12.8H, v24.8H // .....................................................................................................................................*........... || .....................................................................................................................................*....... - // mul v24.8H, v15.8H, v1.H[4] // ..........................................................................................................................*...................... || ...........................................................................................................................*................. - // sqrdmulh v15.8H, v15.8H, v1.H[5] // .............................................................................................................................*................... || .............................................................................................................................*............... - // mls v24.8H, v15.8H, v7.H[0] // ...................................................................................................................................*............. || ...................................................................................................................................*......... - // sub v15.8H, v14.8H, v24.8H // ...........................................................................................................................................*..... || ........................................................................................................................................*.... 
- // add v14.8H, v14.8H, v24.8H // ............................................................................................................................................*.... || .........................................................................................................................................*... - // str_vi v8, x0, 16 // ...........................................................................................................................*..................... || ............................................................................................................................*................ - // str_vo v9, x0, 48 // .........................................................................................................................*....................... || ...........................................................................................................................*................. - // str_vo v10, x0, 112 // .................................................................................................................................*............... || ................................................................................................................................*............ - // str_vo v11, x0, 176 // ................................................................................................................................*................ || ...............................................................................................................................*............. - // str_vo v12, x0, 240 // ..........................................................................................................................................*...... || ........................................................................................................................................*.... 
- // str_vo v13, x0, 304 // .........................................................................................................................................*....... || .......................................................................................................................................*..... - // str_vo v14, x0, 368 // ................................................................................................................................................* || ............................................................................................................................................* - // str_vo v15, x0, 432 // ...............................................................................................................................................*. || ...........................................................................................................................................*. - - subs count, count, #1 + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 54.95s + // User time: 54.95s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q31, [x0, #64] // .*.......................................................................... + ldr q17, [x0, #256] // ....*....................................................................... + mls v16.8H, v6.8H, v7.H[0] // .........................*.................................................. + ldr q2, [x0, #464] // .......e.................................................................... + ldr q18, [x0, #128] // ..*......................................................................... + // gap // ............................................................................ 
+ mls v25.8H, v11.8H, v7.H[0] // ...............*............................................................ + ldr q26, [x0, #384] // ......*..................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v17.8H, v0.H[1] // ........*................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v15.8H, v3.8H, v16.8H // ...........................*................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v14.8H, v26.8H, v0.H[1] // ..................*......................................................... + sub v27.8H, v3.8H, v16.8H // ..........................*................................................. + // gap // ............................................................................ + sub v6.8H, v31.8H, v25.8H // ................*........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v9.8H, v15.8H, v0.H[2] // ..................................*......................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v16.8H, v26.8H, v0.H[0] // ...................*........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v16.8H, v14.8H, v7.H[0] // ....................*....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v28.8H, v15.8H, v0.H[3] // .................................*.......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mul v15.8H, v27.8H, v0.H[4] // ............................................*............................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v13.8H, v18.8H, v16.8H // .....................*...................................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v11.8H, v18.8H, v16.8H // ......................*..................................................... + sqrdmulh v16.8H, v27.8H, v0.H[5] // ...........................................*................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v14.8H, v13.8H, v0.H[5] // ......................................*..................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v3.8H, v11.8H, v0.H[2] // .............................*.............................................. 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v9.8H, v28.8H, v7.H[0] // ...................................*........................................ + ldr q28, [x0, #0] // *........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v15.8H, v16.8H, v7.H[0] // .............................................*.............................. + add v16.8H, v31.8H, v25.8H // .................*.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v31.8H, v17.8H, v0.H[0] // .........*.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v26.8H, v16.8H, v9.8H // .....................................*...................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + sub v8.8H, v16.8H, v9.8H // ....................................*....................................... + mls v31.8H, v22.8H, v7.H[0] // ..........*................................................................. + ldr q22, [x0, #336] // .....e...................................................................... + add v27.8H, v6.8H, v15.8H // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v6.8H, v6.8H, v15.8H // ..............................................*............................. + sqrdmulh v16.8H, v26.8H, v0.H[7] // ................................................*........................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v15.8H, v27.8H, v1.H[2] // ...........................................................*................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v25.8H, v28.8H, v31.8H // ...........*................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ add v20.8H, v28.8H, v31.8H // ............*............................................................... + mul v26.8H, v26.8H, v0.H[6] // .................................................*.......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.8H, v16.8H, v7.H[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v16.8H, v27.8H, v1.H[3] // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v13.8H, v13.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v13.8H, v14.8H, v7.H[0] // ........................................*................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v15.8H, v16.8H, v7.H[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v31.8H, v11.8H, v0.H[3] // ............................*............................................... + // gap // ............................................................................ + // gap // ............................................................................ + add v11.8H, v25.8H, v13.8H // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ 
+ sqrdmulh v14.8H, v8.8H, v1.H[1] // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v27.8H, v11.8H, v15.8H // .............................................................*.............. + sqrdmulh v16.8H, v6.8H, v1.H[5] // ...............................................................*............ + // gap // ............................................................................ + add v15.8H, v11.8H, v15.8H // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + mls v3.8H, v31.8H, v7.H[0] // ..............................*............................................. + sub v11.8H, v25.8H, v13.8H // .........................................*.................................. + // gap // ............................................................................ + str q27, [x0, #320] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + str q15, [x0, #256] // ........................................................................*... + mul v15.8H, v6.8H, v1.H[4] // ................................................................*........... 
+ // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v15.8H, v16.8H, v7.H[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + add v13.8H, v20.8H, v3.8H // ................................*........................................... + // gap // ............................................................................ + // gap // ............................................................................ + mul v27.8H, v8.8H, v1.H[0] // ......................................................*..................... + sub v31.8H, v20.8H, v3.8H // ...............................*............................................ + ldr q3, [x0, #208] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v6.8H, v13.8H, v26.8H // ....................................................*....................... + mls v27.8H, v14.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v16.8H, v11.8H, v15.8H // ..................................................................*......... 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v25.8H, v22.8H, v0.H[0] // ..............e............................................................. + add v15.8H, v11.8H, v15.8H // ...................................................................*........ + // gap // ............................................................................ + sub v14.8H, v13.8H, v26.8H // ...................................................*........................ + str q6, [x0], #(16) // ....................................................................*....... + // gap // ............................................................................ + sqrdmulh v6.8H, v2.8H, v0.H[1] // .......................e.................................................... + // gap // ............................................................................ + // gap // ............................................................................ + str q15, [x0, #368] // ..........................................................................*. + add v15.8H, v31.8H, v27.8H // .........................................................*.................. + // gap // ............................................................................ + sqrdmulh v11.8H, v22.8H, v0.H[1] // .............e.............................................................. + sub v27.8H, v31.8H, v27.8H // ........................................................*................... + str q14, [x0, #48] // .....................................................................*...... + // gap // ............................................................................ + str q16, [x0, #432] // ...........................................................................* + // gap // ............................................................................ 
+ mul v16.8H, v2.8H, v0.H[0] // ........................e................................................... + str q15, [x0, #112] // ......................................................................*..... + // gap // ............................................................................ + str q27, [x0, #176] // .......................................................................*.... + // gap // ............................................................................ + // gap // ............................................................................ + + // ------------------------------------------------------------------- new position -------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + // ldr q8, [x0, #0] // ....................~....................................................'......................*.................................................... + // ldr q9, [x0, #(1*(512/8))] // .........................................................................*........................................................................... + // ldr q10, [x0, #(2*(512/8))] // .~.......................................................................'...*....................................................................... + // ldr q11, [x0, #(3*(512/8))] // .......................................................e.................'.........................................................~................. + // ldr q12, [x0, #(4*(512/8))] // .........................................................................'*.......................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // ...........................e.............................................'.............................~............................................. + // ldr q14, [x0, #(6*(512/8))] // ...~.....................................................................'.....*..................................................................... + // ldr q15, [x0, #(7*(512/8))] // e........................................................................'..~........................................................................ + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ....~....................................................................'......*.................................................................... + // mul v24.8h, v12.8h, v0.h[0] // .......................~.................................................'.........................*................................................. + // mls v24.8h, v27.8h, v7.h[0] // ..........................~..............................................'............................*.............................................. + // sub v12.8h, v8.8h, v24.8h // ................................~........................................'..................................*........................................ + // add v8.8h, v8.8h, v24.8h // .................................~.......................................'...................................*....................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..................................................................e......'....................................................................~...... + // mul v24.8h, v13.8h, v0.h[0] // ...........................................................e.............'.............................................................~............. 
+ // mls v24.8h, v27.8h, v7.h[0] // ..~......................................................................'....*...................................................................... + // sub v13.8h, v9.8h, v24.8h // ........~................................................................'..........*................................................................ + // add v9.8h, v9.8h, v24.8h // ......................~..................................................'........................*.................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ......~..................................................................'........*.................................................................. + // mul v24.8h, v14.8h, v0.h[0] // ..........~..............................................................'............*.............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ...........~.............................................................'.............*............................................................. + // sub v14.8h, v10.8h, v24.8h // ..............~..........................................................'................*.......................................................... + // add v10.8h, v10.8h, v24.8h // ...............~.........................................................'.................*......................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ...............................................................e.........'.................................................................~......... + // mul v24.8h, v15.8h, v0.h[0] // ......................................................................e..'........................................................................~.. 
+ // mls v24.8h, v27.8h, v7.h[0] // .........................................................................'.*......................................................................... + // sub v15.8h, v11.8h, v24.8h // .......~.................................................................'.........*................................................................. + // add v11.8h, v11.8h, v24.8h // .....~...................................................................'.......*................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ........................................~................................'..........................................*................................ + // mul v24.8h, v10.8h, v0.h[2] // ..................~......................................................'....................*...................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................~..........................'................................................*.......................... + // sub v10.8h, v8.8h, v24.8h // ......................................................~..................'........................................................*.................. + // add v8.8h, v8.8h, v24.8h // ....................................................~....................'......................................................*.................... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ............~............................................................'..............*............................................................ + // mul v24.8h, v11.8h, v0.h[2] // .........~...............................................................'...........*............................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ...................~.....................................................'.....................*..................................................... + // sub v11.8h, v9.8h, v24.8h // .........................~...............................................'...........................*............................................... + // add v9.8h, v9.8h, v24.8h // ........................~................................................'..........................*................................................ + // sqrdmulh v27.8h, v14.8h, v0.h[5] // .................~.......................................................'...................*....................................................... + // mul v24.8h, v14.8h, v0.h[4] // .....................................~...................................'.......................................*................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................~..................................'........................................*.................................. + // sub v14.8h, v12.8h, v24.8h // ...............................................~.........................'.................................................*......................... + // add v12.8h, v12.8h, v24.8h // .........................................~...............................'...........................................*............................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ................~........................................................'..................*........................................................ + // mul v24.8h, v15.8h, v0.h[4] // .............~...........................................................'...............*........................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .....................~...................................................'.......................*................................................... + // sub v15.8h, v13.8h, v24.8h // .............................~...........................................'...............................*........................................... + // add v13.8h, v13.8h, v24.8h // ............................~............................................'..............................*............................................ + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..............................~..........................................'................................*.......................................... + // mul v24.8h, v9.8h, v0.h[6] // ..................................~......................................'....................................*...................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................~.....................................'.....................................*..................................... + // sub v9.8h, v8.8h, v24.8h // .............................................................~...........'...............................................................*........... + // add v8.8h, v8.8h, v24.8h // ........................................................~................'..........................................................*................ + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ..........................................~..............................'............................................*.............................. + // mul v24.8h, v11.8h, v1.h[0] // .....................................................~...................'.......................................................*................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .........................................................~...............'...........................................................*............... + // sub v11.8h, v10.8h, v24.8h // ...................................................................~.....'.....................................................................*..... + // add v10.8h, v10.8h, v24.8h // .................................................................~.......'...................................................................*....... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ....................................~....................................'......................................*.................................... + // mul v24.8h, v13.8h, v1.h[2] // ...............................~.........................................'.................................*......................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................~.................................'.........................................*................................. + // sub v13.8h, v12.8h, v24.8h // ...........................................~.............................'.............................................*............................. + // add v12.8h, v12.8h, v24.8h // .............................................~...........................'...............................................*........................... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ............................................~............................'..............................................*............................ + // mul v24.8h, v15.8h, v1.h[4] // ..................................................~......................'....................................................*...................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ...................................................~.....................'.....................................................*..................... + // sub v15.8h, v14.8h, v24.8h // ..........................................................~..............'............................................................*.............. + // add v14.8h, v14.8h, v24.8h // ............................................................~............'..............................................................*............ + // str q8, [x0], #(16) // ..............................................................~..........'................................................................*.......... + // str q9, [x0, #(-16 + 1*(512/8))] // ....................................................................~....'......................................................................*.... + // str q10, [x0, #(-16 + 2*(512/8))] // .......................................................................~.'.........................................................................*. + // str q11, [x0, #(-16 + 3*(512/8))] // ........................................................................~'..........................................................................* + // str q12, [x0, #(-16 + 4*(512/8))] // .................................................~.......................'...................................................*....................... + // str q13, [x0, #(-16 + 5*(512/8))] // ................................................~........................'..................................................*........................ + // str q14, [x0, #(-16 + 6*(512/8))] // ................................................................~........'..................................................................*........ 
+ // str q15, [x0, #(-16 + 7*(512/8))] // .....................................................................~...'.......................................................................*... + + sub count, count, #1 cbnz count, layer123_start - sub v19.8H, v23.8H, v19.8H // .*.............................................................. - mls v2.8H, v8.8H, v7.H[0] // .....*.......................................................... - ldr_vo v16, x0, 0 // *............................................................... - ldr_vo v5, x0, 384 // ...*............................................................ - ldr_vo v21, x0, 128 // ..*............................................................. - // gap // ................................................................ - ldr_vo v24, x0, 320 // ......*......................................................... - ldr_vo v8, x0, 64 // ....*........................................................... - // gap // ................................................................ - mul v4.8H, v19.8H, v0.H[4] // .......*........................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v18.8H, v16.8H, v20.8H // ........*....................................................... - // gap // ................................................................ - // gap // ................................................................ - add v16.8H, v16.8H, v20.8H // .........*...................................................... - sqrdmulh v19.8H, v19.8H, v0.H[5] // ..........*..................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- // gap // ................................................................ - sqrdmulh v27.8H, v5.8H, v0.H[1] // ...........*.................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v5.8H, v5.8H, v0.H[0] // ............*................................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v23.8H, v24.8H, v0.H[1] // .............*.................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v5.8H, v27.8H, v7.H[0] // ..............*................................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- // gap // ................................................................ - mul v24.8H, v24.8H, v0.H[0] // ...............*................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v24.8H, v23.8H, v7.H[0] // ................*............................................... - // gap // ................................................................ - // gap // ................................................................ - sub v27.8H, v21.8H, v5.8H // .................*.............................................. - // gap // ................................................................ - // gap // ................................................................ - add v5.8H, v21.8H, v5.8H // ..................*............................................. - mls v4.8H, v19.8H, v7.H[0] // ...................*............................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v19.8H, v27.8H, v0.H[5] // ......................*......................................... - // gap // ................................................................ - // gap // ................................................................ - add v21.8H, v8.8H, v24.8H // ....................*........................................... - // gap // ................................................................ 
- // gap // ................................................................ - sub v24.8H, v8.8H, v24.8H // .....................*.......................................... - sqrdmulh v8.8H, v5.8H, v0.H[3] // .........................*...................................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v27.8H, v27.8H, v0.H[4] // ............................*................................... - sub v23.8H, v21.8H, v2.8H // .......................*........................................ - // gap // ................................................................ - add v21.8H, v21.8H, v2.8H // ........................*....................................... - // gap // ................................................................ - // gap // ................................................................ - mls v27.8H, v19.8H, v7.H[0] // .............................*.................................. - sub v19.8H, v24.8H, v4.8H // ..........................*..................................... - // gap // ................................................................ - add v24.8H, v24.8H, v4.8H // ...........................*.................................... - // gap // ................................................................ - // gap // ................................................................ - mul v5.8H, v5.8H, v0.H[2] // ..............................*................................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - mls v5.8H, v8.8H, v7.H[0] // ...............................*................................ - // gap // ................................................................ - // gap // ................................................................ - sub v2.8H, v18.8H, v27.8H // ................................*............................... - // gap // ................................................................ - // gap // ................................................................ - add v8.8H, v18.8H, v27.8H // .................................*.............................. - mul v4.8H, v21.8H, v0.H[6] // ..................................*............................. - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v21.8H, v21.8H, v0.H[7] // ...................................*............................ - // gap // ................................................................ - // gap // ................................................................ - sub v18.8H, v16.8H, v5.8H // ....................................*........................... - // gap // ................................................................ - // gap // ................................................................ - add v16.8H, v16.8H, v5.8H // .....................................*.......................... - mul v5.8H, v23.8H, v1.H[0] // ......................................*......................... - // gap // ................................................................ 
- // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v27.8H, v23.8H, v1.H[1] // .......................................*........................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v4.8H, v21.8H, v7.H[0] // ........................................*....................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mul v21.8H, v24.8H, v1.H[2] // .........................................*...................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v5.8H, v27.8H, v7.H[0] // ..........................................*..................... - // gap // ................................................................ - // gap // ................................................................ 
- add v27.8H, v16.8H, v4.8H // .............................................*.................. - // gap // ................................................................ - // gap // ................................................................ - sqrdmulh v24.8H, v24.8H, v1.H[3] // ............................................*................... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str_vi v27, x0, 16 // ................................................*............... - sqrdmulh v27.8H, v19.8H, v1.H[5] // ..................................................*............. - // gap // ................................................................ - sub v23.8H, v18.8H, v5.8H // .................................................*.............. - // gap // ................................................................ - // gap // ................................................................ - mul v19.8H, v19.8H, v1.H[4] // ...............................................*................ - add v5.8H, v18.8H, v5.8H // ...................................................*............ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - mls v21.8H, v24.8H, v7.H[0] // ....................................................*........... - str_vo v23, x0, 176 // .....................................................*.......... 
- // gap // ................................................................ - str_vo v5, x0, 112 // ......................................................*......... - // gap // ................................................................ - // gap // ................................................................ - mls v19.8H, v27.8H, v7.H[0] // .......................................................*........ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v16.8H, v16.8H, v4.8H // ...........................................*.................... - // gap // ................................................................ - // gap // ................................................................ - sub v5.8H, v8.8H, v21.8H // ........................................................*....... - add v21.8H, v8.8H, v21.8H // .........................................................*...... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - sub v24.8H, v2.8H, v19.8H // ............................................................*... - add v19.8H, v2.8H, v19.8H // .............................................................*.. - str_vo v16, x0, 48 // ..............................................*................. - str_vo v5, x0, 304 // ..........................................................*..... 
- str_vo v21, x0, 240 // ...........................................................*.... - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - // gap // ................................................................ - str_vo v24, x0, 432 // ..............................................................*. - str_vo v19, x0, 368 // ...............................................................* - // gap // ................................................................ - - // original source code - // ldr_vo v16, x0, 0 // ..*............................................................. || *............................................................... - // sub v19.8H, v23.8H, v19.8H // *............................................................... || *............................................................... - // ldr_vo v5, x0, 128 // ....*........................................................... || .*.............................................................. - // ldr_vo v21, x0, 384 // ...*............................................................ || .*.............................................................. - // ldr_vo v24, x0, 64 // ......*......................................................... || ..*............................................................. - // mls v2.8H, v8.8H, v7.H[0] // .*.............................................................. || *............................................................... - // ldr_vo v8, x0, 320 // .....*.......................................................... || ..*............................................................. - // mul v18.8H, v19.8H, v0.H[4] // .......*........................................................ || ...*............................................................ 
- // sub v14.8H, v16.8H, v20.8H // ........*....................................................... || ....*........................................................... - // add v16.8H, v16.8H, v20.8H // .........*...................................................... || .....*.......................................................... - // sqrdmulh v19.8H, v19.8H, v0.H[5] // ..........*..................................................... || .....*.......................................................... - // sqrdmulh v20.8H, v21.8H, v0.H[1] // ...........*.................................................... || .......*........................................................ - // mul v21.8H, v21.8H, v0.H[0] // ............*................................................... || .........*...................................................... - // sqrdmulh v29.8H, v8.8H, v0.H[1] // .............*.................................................. || ...........*.................................................... - // mls v21.8H, v20.8H, v7.H[0] // ..............*................................................. || .............*.................................................. - // mul v8.8H, v8.8H, v0.H[0] // ...............*................................................ || ...............*................................................ - // mls v8.8H, v29.8H, v7.H[0] // ................*............................................... || .................*.............................................. - // sub v20.8H, v5.8H, v21.8H // .................*.............................................. || ..................*............................................. - // add v5.8H, v5.8H, v21.8H // ..................*............................................. || ...................*............................................ - // mls v18.8H, v19.8H, v7.H[0] // ...................*............................................ 
|| ...................*............................................ - // add v21.8H, v24.8H, v8.8H // .....................*.......................................... || ......................*......................................... - // sub v24.8H, v24.8H, v8.8H // ......................*......................................... || .......................*........................................ - // sqrdmulh v27.8H, v20.8H, v0.H[5] // ....................*........................................... || .....................*.......................................... - // sub v29.8H, v21.8H, v2.8H // .........................*...................................... || .........................*...................................... - // add v21.8H, v21.8H, v2.8H // ..........................*..................................... || ..........................*..................................... - // sqrdmulh v2.8H, v5.8H, v0.H[3] // .......................*........................................ || .......................*........................................ - // sub v25.8H, v24.8H, v18.8H // ............................*................................... || ...........................*.................................... - // add v24.8H, v24.8H, v18.8H // .............................*.................................. || ............................*................................... - // mul v18.8H, v20.8H, v0.H[4] // ........................*....................................... || .........................*...................................... - // mls v18.8H, v27.8H, v7.H[0] // ...........................*.................................... || ...........................*.................................... - // mul v5.8H, v5.8H, v0.H[2] // ..............................*................................. || .............................*.................................. 
- // mls v5.8H, v2.8H, v7.H[0] // ...............................*................................ || ...............................*................................ - // sub v2.8H, v14.8H, v18.8H // ................................*............................... || ................................*............................... - // add v18.8H, v14.8H, v18.8H // .................................*.............................. || .................................*.............................. - // mul v27.8H, v21.8H, v0.H[6] // ..................................*............................. || .................................*.............................. - // sqrdmulh v21.8H, v21.8H, v0.H[7] // ...................................*............................ || ...................................*............................ - // sub v14.8H, v16.8H, v5.8H // ....................................*........................... || ....................................*........................... - // add v16.8H, v16.8H, v5.8H // .....................................*.......................... || .....................................*.......................... - // mul v5.8H, v29.8H, v1.H[0] // ......................................*......................... || .....................................*.......................... - // sqrdmulh v20.8H, v29.8H, v1.H[1] // .......................................*........................ || .......................................*........................ - // mls v27.8H, v21.8H, v7.H[0] // ........................................*....................... || .........................................*...................... - // mul v21.8H, v24.8H, v1.H[2] // .........................................*...................... || ...........................................*.................... - // mls v5.8H, v20.8H, v7.H[0] // ..........................................*..................... 
|| .............................................*.................. - // sub v20.8H, v16.8H, v27.8H // ......................................................*......... || .........................................................*...... - // sqrdmulh v24.8H, v24.8H, v1.H[3] // ............................................*................... || ...............................................*................ - // add v16.8H, v16.8H, v27.8H // ...........................................*.................... || ..............................................*................. - // str_vo v20, x0, 64 // ...........................................................*.... || ............................................................*... - // mul v27.8H, v25.8H, v1.H[4] // ................................................*............... || ...................................................*............ - // str_vi v16, x0, 16 // .............................................*.................. || .................................................*.............. - // sub v16.8H, v14.8H, v5.8H // ...............................................*................ || ..................................................*............. - // sqrdmulh v20.8H, v25.8H, v1.H[5] // ..............................................*................. || .................................................*.............. - // add v5.8H, v14.8H, v5.8H // .................................................*.............. || ...................................................*............ - // mls v21.8H, v24.8H, v7.H[0] // ..................................................*............. || .....................................................*.......... - // str_vo v16, x0, 176 // ...................................................*............ || .....................................................*.......... - // str_vo v5, x0, 112 // ....................................................*........... 
|| ......................................................*......... - // mls v27.8H, v20.8H, v7.H[0] // .....................................................*.......... || .......................................................*........ - // sub v16.8H, v18.8H, v21.8H // .......................................................*........ || ..........................................................*..... - // add v5.8H, v18.8H, v21.8H // ........................................................*....... || ..........................................................*..... - // str_vo v16, x0, 304 // ............................................................*... || .............................................................*.. - // str_vo v5, x0, 240 // .............................................................*.. || .............................................................*.. - // sub v16.8H, v2.8H, v27.8H // .........................................................*...... || ............................................................*... - // add v5.8H, v2.8H, v27.8H // ..........................................................*..... || ............................................................*... - // str_vo v16, x0, 432 // ..............................................................*. || ...............................................................* - // str_vo v5, x0, 368 // ...............................................................* || ...............................................................* - + // Instructions: 69 + // Expected cycles: 71 + // Expected IPC: 0.97 + // + // Cycle bound: 71.0 + // IPC bound: 0.97 + // + // Wall time: 2.84s + // User time: 2.84s + // + // ------------------------ original position -------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------ + mls v16.8H, v6.8H, v7.H[0] // ..*.................................................................. 
+ ldr q15, [x0, #128] // ...*................................................................. + ldr q5, [x0, #256] // .*................................................................... + ldr q23, [x0, #384] // .....*............................................................... + ldr q26, [x0, #0] // ......................*.............................................. + // gap // ..................................................................... + mls v25.8H, v11.8H, v7.H[0] // ....*................................................................ + ldr q29, [x0, #64] // *.................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v14.8H, v5.8H, v0.H[1] // ......*.............................................................. + // gap // ..................................................................... + // gap // ..................................................................... + sub v10.8H, v3.8H, v16.8H // .........*........................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v18.8H, v5.8H, v0.H[0] // .........................*........................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... 
+ // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v6.8H, v23.8H, v0.H[1] // ........*............................................................ + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v18.8H, v14.8H, v7.H[0] // ............................*........................................ + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v8.8H, v23.8H, v0.H[0] // ............*........................................................ + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v8.8H, v6.8H, v7.H[0] // .............*....................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sub v2.8H, v26.8H, v18.8H // .................................*................................... 
+ // gap // ..................................................................... + // gap // ..................................................................... + add v11.8H, v3.8H, v16.8H // .......*............................................................. + mul v22.8H, v10.8H, v0.H[4] // ...............*..................................................... + // gap // ..................................................................... + sub v31.8H, v29.8H, v25.8H // ..........*.......................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v16.8H, v10.8H, v0.H[5] // ..................*.................................................. + add v3.8H, v29.8H, v25.8H // ........................*............................................ + // gap // ..................................................................... + add v13.8H, v15.8H, v8.8H // .................*................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v12.8H, v11.8H, v0.H[3] // ..............*...................................................... + sub v27.8H, v15.8H, v8.8H // ................*.................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v20.8H, v13.8H, v0.H[2] // ....................*................................................ + // gap // ..................................................................... 
+ // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v22.8H, v16.8H, v7.H[0] // .......................*............................................. + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v28.8H, v11.8H, v0.H[2] // ...........*......................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v28.8H, v12.8H, v7.H[0] // .....................*............................................... + // gap // ..................................................................... + // gap // ..................................................................... + add v23.8H, v31.8H, v22.8H // .............................*....................................... + // gap // ..................................................................... + // gap // ..................................................................... + sub v4.8H, v31.8H, v22.8H // ..............................*...................................... 
+ // gap // ..................................................................... + sqrdmulh v22.8H, v27.8H, v0.H[5] // ...................*................................................. + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v8.8H, v27.8H, v0.H[4] // ......................................*.............................. + // gap // ..................................................................... + // gap // ..................................................................... + add v29.8H, v3.8H, v28.8H // ..........................*.......................................... + // gap // ..................................................................... + // gap // ..................................................................... + mul v27.8H, v23.8H, v1.H[2] // ................................*.................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v8.8H, v22.8H, v7.H[0] // .......................................*............................. + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... 
+ sqrdmulh v15.8H, v13.8H, v0.H[3] // .........................................*........................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v25.8H, v23.8H, v1.H[3] // .....................................*............................... + // gap // ..................................................................... + // gap // ..................................................................... + sub v12.8H, v2.8H, v8.8H // ................................................*.................... + // gap // ..................................................................... + // gap // ..................................................................... + sqrdmulh v30.8H, v29.8H, v0.H[7] // ...............................*..................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v20.8H, v15.8H, v7.H[0] // ...............................................*..................... + add v15.8H, v26.8H, v18.8H // ..................................*.................................. + // gap // ..................................................................... + add v18.8H, v2.8H, v8.8H // ..........................................*.......................... 
+ // gap // ..................................................................... + // gap // ..................................................................... + mul v8.8H, v29.8H, v0.H[6] // ...................................*................................. + sub v29.8H, v3.8H, v28.8H // ...........................*......................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v8.8H, v30.8H, v7.H[0] // ....................................*................................ + // gap // ..................................................................... + // gap // ..................................................................... + sub v13.8H, v15.8H, v20.8H // .......................................................*............. + // gap // ..................................................................... + // gap // ..................................................................... + mls v27.8H, v25.8H, v7.H[0] // ........................................*............................ + add v26.8H, v15.8H, v20.8H // .....................................................*............... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... 
+ sqrdmulh v14.8H, v29.8H, v1.H[1] // ...........................................*......................... + add v10.8H, v26.8H, v8.8H // ........................................................*............ + // gap // ..................................................................... + // gap // ..................................................................... + sub v15.8H, v26.8H, v8.8H // ............................................................*........ + sqrdmulh v16.8H, v4.8H, v1.H[5] // .............................................*....................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + sub v21.8H, v18.8H, v27.8H // ............................................*........................ + add v2.8H, v18.8H, v27.8H // ..............................................*...................... + mul v23.8H, v29.8H, v1.H[0] // ......................................................*.............. + str q10, [x0], #(16) // .............................................................*....... + str q15, [x0, #48] // .................................................................*... + // gap // ..................................................................... + // gap // ..................................................................... + str q21, [x0, #304] // .................................................*................... + mul v17.8H, v4.8H, v1.H[4] // ...................................................*................. + // gap // ..................................................................... + str q2, [x0, #240] // ..................................................*.................. + // gap // ..................................................................... 
+ // gap // ..................................................................... + mls v17.8H, v16.8H, v7.H[0] // ....................................................*................ + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + mls v23.8H, v14.8H, v7.H[0] // .........................................................*........... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + add v28.8H, v12.8H, v17.8H // ...........................................................*......... + sub v12.8H, v12.8H, v17.8H // ..........................................................*.......... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + add v27.8H, v13.8H, v23.8H // ...............................................................*..... 
+ sub v11.8H, v13.8H, v23.8H // ................................................................*.... + // gap // ..................................................................... + str q28, [x0, #368] // ..............................................................*...... + str q12, [x0, #432] // ..................................................................*.. + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + // gap // ..................................................................... + str q11, [x0, #176] // ....................................................................* + str q27, [x0, #112] // ...................................................................*. + // gap // ..................................................................... + + // --------------------------- new position ---------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------ + // ldr q31, [x0, #64] // ......*.............................................................. + // ldr q17, [x0, #256] // ..*.................................................................. + // mls v16.8H, v6.8H, v7.H[0] // *.................................................................... + // ldr q18, [x0, #128] // .*................................................................... + // mls v25.8H, v11.8H, v7.H[0] // .....*............................................................... + // ldr q26, [x0, #384] // ...*................................................................. + // sqrdmulh v22.8H, v17.8H, v0.H[1] // .......*............................................................. + // add v15.8H, v3.8H, v16.8H // ...............*..................................................... 
+ // sqrdmulh v14.8H, v26.8H, v0.H[1] // ..........*.......................................................... + // sub v27.8H, v3.8H, v16.8H // ........*............................................................ + // sub v6.8H, v31.8H, v25.8H // .................*................................................... + // mul v9.8H, v15.8H, v0.H[2] // .........................*........................................... + // mul v16.8H, v26.8H, v0.H[0] // ............*........................................................ + // mls v16.8H, v14.8H, v7.H[0] // .............*....................................................... + // sqrdmulh v28.8H, v15.8H, v0.H[3] // .....................*............................................... + // mul v15.8H, v27.8H, v0.H[4] // ................*.................................................... + // sub v13.8H, v18.8H, v16.8H // ......................*.............................................. + // add v11.8H, v18.8H, v16.8H // ....................*................................................ + // sqrdmulh v16.8H, v27.8H, v0.H[5] // ..................*.................................................. + // sqrdmulh v14.8H, v13.8H, v0.H[5] // .............................*....................................... + // mul v3.8H, v11.8H, v0.H[2] // .......................*............................................. + // mls v9.8H, v28.8H, v7.H[0] // ..........................*.......................................... + // ldr q28, [x0, #0] // ....*................................................................ + // mls v15.8H, v16.8H, v7.H[0] // ........................*............................................ + // add v16.8H, v31.8H, v25.8H // ...................*................................................. + // mul v31.8H, v17.8H, v0.H[0] // .........*........................................................... 
+ // add v26.8H, v16.8H, v9.8H // ...............................*..................................... + // sub v8.8H, v16.8H, v9.8H // ..........................................*.......................... + // mls v31.8H, v22.8H, v7.H[0] // ...........*......................................................... + // add v27.8H, v6.8H, v15.8H // ...........................*......................................... + // sub v6.8H, v6.8H, v15.8H // ............................*........................................ + // sqrdmulh v16.8H, v26.8H, v0.H[7] // .....................................*............................... + // mul v15.8H, v27.8H, v1.H[2] // ................................*.................................... + // sub v25.8H, v28.8H, v31.8H // ..............*...................................................... + // add v20.8H, v28.8H, v31.8H // .......................................*............................. + // mul v26.8H, v26.8H, v0.H[6] // .........................................*........................... + // mls v26.8H, v16.8H, v7.H[0] // ...........................................*......................... + // sqrdmulh v16.8H, v27.8H, v1.H[3] // ...................................*................................. + // mul v13.8H, v13.8H, v0.H[4] // ..............................*...................................... + // mls v13.8H, v14.8H, v7.H[0] // .................................*................................... + // mls v15.8H, v16.8H, v7.H[0] // .............................................*....................... + // sqrdmulh v31.8H, v11.8H, v0.H[3] // ..................................*.................................. + // add v11.8H, v25.8H, v13.8H // ........................................*............................ + // sqrdmulh v14.8H, v8.8H, v1.H[1] // ...............................................*..................... 
+ // sub v27.8H, v11.8H, v15.8H // ...................................................*................. + // sqrdmulh v16.8H, v6.8H, v1.H[5] // ..................................................*.................. + // add v15.8H, v11.8H, v15.8H // ....................................................*................ + // mls v3.8H, v31.8H, v7.H[0] // ......................................*.............................. + // sub v11.8H, v25.8H, v13.8H // ....................................*................................ + // str q27, [x0, #320] // ........................................................*............ + // str q15, [x0, #256] // ..........................................................*.......... + // mul v15.8H, v6.8H, v1.H[4] // .........................................................*........... + // mls v15.8H, v16.8H, v7.H[0] // ...........................................................*......... + // add v13.8H, v20.8H, v3.8H // ..............................................*...................... + // mul v27.8H, v8.8H, v1.H[0] // .....................................................*............... + // sub v31.8H, v20.8H, v3.8H // ............................................*........................ + // add v6.8H, v13.8H, v26.8H // ................................................*.................... + // mls v27.8H, v14.8H, v7.H[0] // ............................................................*........ + // sub v16.8H, v11.8H, v15.8H // ..............................................................*...... + // add v15.8H, v11.8H, v15.8H // .............................................................*....... + // sub v14.8H, v13.8H, v26.8H // .................................................*................... + // str q6, [x0], #(16) // ......................................................*.............. + // str q15, [x0, #368] // .................................................................*... 
+ // add v15.8H, v31.8H, v27.8H // ...............................................................*..... + // sub v27.8H, v31.8H, v27.8H // ................................................................*.... + // str q14, [x0, #48] // .......................................................*............. + // str q16, [x0, #432] // ..................................................................*.. + // str q15, [x0, #112] // ....................................................................* + // str q27, [x0, #176] // ...................................................................*. + restore inp, STACK0 mov count, #8 .p2align 2 - ldr_vi v10, x3, 16 // .*................................................... - ldr_vo v14, x1, 48 // *.................................................... - // gap // ..................................................... - ldr_vo v21, x1, 16 // ....*................................................ - ldr_vo v13, x4, 48 // ..*.................................................. - // gap // ..................................................... - ldr_vo v23, x1, 32 // ...*................................................. - ldr_vo v19, x1, 0 // ..........*.......................................... - // gap // ..................................................... - ldr_vo v0, x4, 16 // ........*............................................ - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v27.8H, v14.8H, v10.H[1] // ......*.............................................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... 
- mul v5.8H, v14.8H, v10.H[0] // .......*............................................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v11.8H, v23.8H, v10.H[1] // .........*........................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v5.8H, v27.8H, v7.H[0] // ...........*......................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v1.8H, v23.8H, v10.H[0] // ............*........................................ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - add v22.8H, v21.8H, v5.8H // ..............*...................................... 
- sub v16.8H, v21.8H, v5.8H // ...............*..................................... - // gap // ..................................................... - mls v1.8H, v11.8H, v7.H[0] // .............*....................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v11.8H, v22.8H, v10.H[3] // .................*................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v23.8H, v16.8H, v10.H[5] // ..................*.................................. - // gap // ..................................................... - // gap // ..................................................... - add v28.8H, v19.8H, v1.8H // ....................*................................ - // gap // ..................................................... - // gap // ..................................................... - mul v31.8H, v22.8H, v10.H[2] // ......................*.............................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v31.8H, v11.8H, v7.H[0] // .......................*............................. 
- // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v5.8H, v16.8H, v10.H[4] // .....................*............................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v5.8H, v23.8H, v7.H[0] // ........................*............................ - // gap // ..................................................... - // gap // ..................................................... - add v24.8H, v28.8H, v31.8H // .........................*........................... - // gap // ..................................................... - // gap // ..................................................... - sub v11.8H, v28.8H, v31.8H // ..........................*.......................... - sub v19.8H, v19.8H, v1.8H // ...................*................................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sub v28.8H, v19.8H, v5.8H // ............................*........................ - add v23.8H, v19.8H, v5.8H // .............................*....................... 
- ldr_vi v5, x4, 96 // ................*.................................... - trn2 v17.4S, v24.4S, v11.4S // ..............................*...................... - // gap // ..................................................... - // gap // ..................................................... - trn1 v24.4S, v24.4S, v11.4S // ................................*.................... - ldr_vo v11, x4, -64 // .....*............................................... - // gap // ..................................................... - trn2 v1.4S, v23.4S, v28.4S // ...............................*..................... - // gap // ..................................................... - // gap // ..................................................... - trn1 v19.4S, v23.4S, v28.4S // ..................................*.................. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - trn2 v16.2D, v17.2D, v1.2D // .................................*................... - // gap // ..................................................... - // gap // ..................................................... - trn2 v28.2D, v24.2D, v19.2D // ......................................*.............. - // gap // ..................................................... - // gap // ..................................................... - trn1 v30.2D, v24.2D, v19.2D // ..........................................*.......... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v19.8H, v16.8H, v0.8H // .....................................*............... - // gap // ..................................................... 
- // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v27.8H, v16.8H, v5.8H // .......................................*............. - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v27.8H, v19.8H, v7.H[0] // .........................................*........... - trn1 v19.2D, v17.2D, v1.2D // ....................................*................ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v25.8H, v28.8H, v0.8H // ........................................*............ - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v31.8H, v28.8H, v5.8H // ...........................................*......... - ldr_vo v28, x4, -16 // ...........................*......................... - // gap // ..................................................... 
- ldr_vo v5, x4, -32 // ...................................*................. - sub v2.8H, v19.8H, v27.8H // ..............................................*...... - // gap // ..................................................... - add v16.8H, v19.8H, v27.8H // ............................................*........ - // gap // ..................................................... - // gap // ..................................................... - mls v31.8H, v25.8H, v7.H[0] // .............................................*....... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v0.8H, v16.8H, v11.8H // ................................................*.... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v19.8H, v16.8H, v13.8H // ...............................................*..... - // gap // ..................................................... - // gap // ..................................................... - add v13.8H, v30.8H, v31.8H // .................................................*... - // gap // ..................................................... - // gap // ..................................................... - sqrdmulh v14.8H, v2.8H, v28.8H // ..................................................*.. - // gap // ..................................................... - // gap // ..................................................... 
- // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mul v6.8H, v2.8H, v5.8H // ....................................................* - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - // gap // ..................................................... - mls v0.8H, v19.8H, v7.H[0] // ...................................................*. - // gap // ..................................................... - // gap // ..................................................... - - // original source code - // ldr_vo v23, x1, 48 // .*................................................... || *................................................................ - // ldr_vi v11, x3, 16 // *.................................................... || *................................................................ - // ldr_vo v10, x4, 48 // ...*................................................. || .*............................................................... - // ldr_vo v14, x1, 32 // ....*................................................ || ..*.............................................................. - // ldr_vo v26, x1, 16 // ..*.................................................. || .*............................................................... - // ldr_vo v5, x4, 32 // ..............................*...................... || ...................................*............................. - // sqrdmulh v20.8H, v23.8H, v11.H[1] // .......*............................................. || ....*............................................................ 
- // mul v24.8H, v23.8H, v11.H[0] // ........*............................................ || ......*.......................................................... - // ldr_vo v3, x4, 16 // ......*.............................................. || ...*............................................................. - // sqrdmulh v8.8H, v14.8H, v11.H[1] // .........*........................................... || ........*........................................................ - // ldr_vo v22, x1, 0 // .....*............................................... || ..*.............................................................. - // mls v24.8H, v20.8H, v7.H[0] // ..........*.......................................... || ..........*...................................................... - // mul v28.8H, v14.8H, v11.H[0] // ...........*......................................... || ............*.................................................... - // mls v28.8H, v8.8H, v7.H[0] // ..............*...................................... || ................*................................................ - // add v1.8H, v26.8H, v24.8H // ............*........................................ || ...............*................................................. - // sub v30.8H, v26.8H, v24.8H // .............*....................................... || ...............*................................................. - // ldr_vi v23, x4, 96 // ...........................*......................... || .................................*............................... - // sqrdmulh v17.8H, v1.8H, v11.H[3] // ...............*..................................... || ..................*.............................................. - // sqrdmulh v2.8H, v30.8H, v11.H[5] // ................*.................................... || ....................*............................................ - // sub v31.8H, v22.8H, v28.8H // ........................*............................ 
|| ..............................*.................................. - // add v26.8H, v22.8H, v28.8H // .................*................................... || .....................*........................................... - // mul v15.8H, v30.8H, v11.H[4] // ....................*................................ || ..........................*...................................... - // mul v1.8H, v1.8H, v11.H[2] // ..................*.................................. || ......................*.......................................... - // mls v1.8H, v17.8H, v7.H[0] // ...................*................................. || ........................*........................................ - // mls v15.8H, v2.8H, v7.H[0] // .....................*............................... || ............................*.................................... - // add v24.8H, v26.8H, v1.8H // ......................*.............................. || .............................*................................... - // sub v12.8H, v26.8H, v1.8H // .......................*............................. || ..............................*.................................. - // ldr_vo v11, x4, -16 // ..........................................*.......... || ...................................................*............. - // sub v26.8H, v31.8H, v15.8H // .........................*........................... || .................................*............................... - // add v20.8H, v31.8H, v15.8H // ..........................*.......................... || .................................*............................... - // trn2 v8.4S, v24.4S, v12.4S // ............................*........................ || ..................................*.............................. - // trn2 v18.4S, v20.4S, v26.4S // ...............................*..................... || ....................................*............................ 
- // trn1 v12.4S, v24.4S, v12.4S // .............................*....................... || ...................................*............................. - // trn2 v0.2D, v8.2D, v18.2D // .................................*................... || .......................................*......................... - // trn1 v17.4S, v20.4S, v26.4S // ................................*.................... || .....................................*........................... - // ldr_vo v16, x4, -32 // ...........................................*......... || ....................................................*............ - // trn1 v14.2D, v8.2D, v18.2D // .......................................*............. || ...............................................*................. - // sqrdmulh v28.8H, v0.8H, v3.8H // ....................................*................ || ..........................................*...................... - // trn2 v1.2D, v12.2D, v17.2D // ..................................*.................. || ........................................*........................ - // mul v4.8H, v0.8H, v23.8H // .....................................*............... || ............................................*.................... - // sqrdmulh v26.8H, v1.8H, v3.8H // ........................................*............ || .................................................*............... - // mls v4.8H, v28.8H, v7.H[0] // ......................................*.............. || ...............................................*................. - // trn1 v30.2D, v12.2D, v17.2D // ...................................*................. || .........................................*....................... - // mul v31.8H, v1.8H, v23.8H // .........................................*........... || ...................................................*............. - // add v2.8H, v14.8H, v4.8H // .............................................*....... 
|| .....................................................*........... - // mls v31.8H, v26.8H, v7.H[0] // ..............................................*...... || ......................................................*.......... - // sub v28.8H, v14.8H, v4.8H // ............................................*........ || ....................................................*............ - // sqrdmulh v4.8H, v2.8H, v10.8H // ................................................*.... || ..........................................................*...... - // mul v0.8H, v2.8H, v5.8H // ...............................................*..... || ........................................................*........ - // add v13.8H, v30.8H, v31.8H // .................................................*... || ...........................................................*..... - // sqrdmulh v14.8H, v28.8H, v11.8H // ..................................................*.. || ............................................................*.... - // mls v0.8H, v4.8H, v7.H[0] // ....................................................* || ................................................................* - // mul v6.8H, v28.8H, v16.8H // ...................................................*. || ..............................................................*.. - + // Instructions: 38 + // Expected cycles: 49 + // Expected IPC: 0.78 + // + // Cycle bound: 49.0 + // IPC bound: 0.78 + // + // Wall time: 0.74s + // User time: 0.74s + // + // --------- original position ---------> + // 0 25 + // |------------------------|------------ + ldr q1, [x1, #48] // *..................................... + // gap // ...................................... + ldr q15, [x3], #16 // .*.................................... + ldr q6, [x1, #16] // ...*.................................. + ldr q23, [x4], #(6*16) // ...........................*.......... + // gap // ...................................... 
+ ldr q18, [x4, #-80] // ..............................*....... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v26.8H, v1.8H, v15.H[1] // ....*................................. + ldr q28, [x1, #32] // ..*................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v27.8H, v1.8H, v15.H[0] // ......*............................... + // gap // ...................................... + // gap // ...................................... + ldr q16, [x1, #0] // ................*..................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v21.8H, v28.8H, v15.H[1] // .....*................................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v27.8H, v26.8H, v7.H[0] // .......*.............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v8.8H, v28.8H, v15.H[0] // ........*............................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ mls v8.8H, v21.8H, v7.H[0] // ...............*...................... + // gap // ...................................... + // gap // ...................................... + add v21.8H, v6.8H, v27.8H // ..........*........................... + // gap // ...................................... + // gap // ...................................... + sub v14.8H, v6.8H, v27.8H // .........*............................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v28.8H, v21.8H, v15.H[3] // ..............*....................... + // gap // ...................................... + // gap // ...................................... + sub v1.8H, v16.8H, v8.8H // ...................*.................. + // gap // ...................................... + // gap // ...................................... + add v12.8H, v16.8H, v8.8H // ....................*................. + sqrdmulh v27.8H, v14.8H, v15.H[5] // ............*......................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v0.8H, v14.8H, v15.H[4] // ...........*.......................... + ldr q14, [x4, #-48] // ........................*............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v15.8H, v21.8H, v15.H[2] // .............*........................ + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + mls v0.8H, v27.8H, v7.H[0] // .................*.................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v15.8H, v28.8H, v7.H[0] // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + add v16.8H, v1.8H, v0.8H // .........................*............ + sub v22.8H, v1.8H, v0.8H // .....................*................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v27.8H, v12.8H, v15.8H // ......................*............... + add v29.8H, v12.8H, v15.8H // .......................*.............. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn1 v13.4S, v29.4S, v27.4S // ..........................*........... + trn1 v1.4S, v16.4S, v22.4S // ............................*......... + // gap // ...................................... + trn2 v0.4S, v29.4S, v27.4S // ................................*..... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn2 v3.4S, v16.4S, v22.4S // .............................*........ + trn2 v19.2D, v13.2D, v1.2D // ...............................*...... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + trn2 v27.2D, v0.2D, v3.2D // ..................................*... + // gap // ...................................... + sqrdmulh v15.8H, v19.8H, v18.8H // .................................*.... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sqrdmulh v29.8H, v27.8H, v18.8H // ....................................*. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v19.8H, v19.8H, v23.8H // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v19.8H, v15.8H, v7.H[0] // .....................................* + // gap // ...................................... + // gap // ...................................... 
+ + // ----------- new position ------------> + // 0 25 + // |------------------------|------------ + // ldr q16, [x1, #48] // *..................................... + // ldr q8, [x3], #16 // .*.................................... + // ldr q6, [x1, #32] // ......*............................... + // ldr q28, [x1, #16] // ..*................................... + // sqrdmulh v0.8H, v16.8H, v8.H[1] // .....*................................ + // sqrdmulh v17.8H, v6.8H, v8.H[1] // .........*............................ + // mul v24.8H, v16.8H, v8.H[0] // .......*.............................. + // mls v24.8H, v0.8H, v7.H[0] // ..........*........................... + // mul v15.8H, v6.8H, v8.H[0] // ...........*.......................... + // sub v2.8H, v28.8H, v24.8H // ..............*....................... + // add v28.8H, v28.8H, v24.8H // .............*........................ + // mul v3.8H, v2.8H, v8.H[4] // ...................*.................. + // sqrdmulh v1.8H, v2.8H, v8.H[5] // ..................*................... + // mul v14.8H, v28.8H, v8.H[2] // .....................*................ + // sqrdmulh v13.8H, v28.8H, v8.H[3] // ...............*...................... + // mls v15.8H, v17.8H, v7.H[0] // ............*......................... + // ldr q0, [x1, #0] // ........*............................. + // mls v3.8H, v1.8H, v7.H[0] // ......................*............... + // mls v14.8H, v13.8H, v7.H[0] // .......................*.............. + // sub v21.8H, v0.8H, v15.8H // ................*..................... + // add v28.8H, v0.8H, v15.8H // .................*.................... + // sub v2.8H, v21.8H, v3.8H // .........................*............ + // sub v29.8H, v28.8H, v14.8H // ..........................*........... + // add v17.8H, v28.8H, v14.8H // ...........................*.......... + // ldr q14, [x4, #48] // ....................*................. + // add v3.8H, v21.8H, v3.8H // ........................*............. 
+ // trn1 v13.4S, v17.4S, v29.4S // ............................*......... + // ldr q23, [x4], #(6*16) // ...*.................................. + // trn1 v1.4S, v3.4S, v2.4S // .............................*........ + // trn2 v3.4S, v3.4S, v2.4S // ...............................*...... + // ldr q6, [x4, #-80] // ....*................................. + // trn2 v2.2D, v13.2D, v1.2D // ................................*..... + // trn2 v0.4S, v17.4S, v29.4S // ..............................*....... + // sqrdmulh v15.8H, v2.8H, v6.8H // ..................................*... + // trn2 v27.2D, v0.2D, v3.2D // .................................*.... + // mul v19.8H, v2.8H, v23.8H // ....................................*. + // sqrdmulh v29.8H, v27.8H, v6.8H // ...................................*.. + // mls v19.8H, v15.8H, v7.H[0] // .....................................* + sub count, count, #1 -.p2align 2 layer4567_start: - sub v25.8H, v30.8H, v31.8H // ..........................................*................................................ - ldr_vo v23, x1, 112 // ...e....................................................................................... - ldr_vi v11, x3, 16 // ....e...................................................................................... - ldr_vo v10, x4, 48 // ....................................e...................................................... - mls v6.8H, v14.8H, v7.H[0] // ........................................................*.................................. - ldr_vo v14, x1, 96 // ..e........................................................................................ - add v18.8H, v13.8H, v0.8H // .....................................................*..................................... - ldr_vo v26, x1, 80 // .e......................................................................................... - // gap // ........................................................................................... 
- ldr_vo v5, x4, 32 // ...................................e....................................................... - sub v16.8H, v13.8H, v0.8H // ....................................................*...................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v20.8H, v23.8H, v11.H[1] // ...........e............................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v19.4S, v18.4S, v16.4S // ...........................................................*............................... - mul v24.8H, v23.8H, v11.H[0] // ..........e................................................................................ - ldr_vo v3, x4, 16 // ..................................e........................................................ - // gap // ........................................................................................... - trn2 v27.4S, v18.4S, v16.4S // ............................................................*.............................. - // gap // ........................................................................................... - sub v29.8H, v25.8H, v6.8H // .........................................................*................................. - sqrdmulh v8.8H, v14.8H, v11.H[1] // ......e.................................................................................... 
- // gap // ........................................................................................... - add v2.8H, v25.8H, v6.8H // ..........................................................*................................ - ldr_vo v22, x1, 64 // e.......................................................................................... - // gap // ........................................................................................... - mls v24.8H, v20.8H, v7.H[0] // ............e.............................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v28.8H, v14.8H, v11.H[0] // .....e..................................................................................... - trn1 v9.4S, v2.4S, v29.4S // .............................................................*............................. - // gap // ........................................................................................... - trn2 v29.4S, v2.4S, v29.4S // ..............................................................*............................ - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v28.8H, v8.8H, v7.H[0] // .......e................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - add v1.8H, v26.8H, v24.8H // ..............e............................................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v30.8H, v26.8H, v24.8H // .............e............................................................................. - sqdmulh v13.8H, v29.8H, v7.H[1] // ........................................................................*.................. - // gap // ........................................................................................... - ldr_vi v23, x4, 96 // .................................e......................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v17.8H, v1.8H, v11.H[3] // ................e.......................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v2.8H, v30.8H, v11.H[5] // .....................e..................................................................... 
- sub v31.8H, v22.8H, v28.8H // ........e.................................................................................. - // gap // ........................................................................................... - srshr v16.8H, v13.8H, #11 // .........................................................................*................. - // gap // ........................................................................................... - // gap // ........................................................................................... - add v26.8H, v22.8H, v28.8H // .........e................................................................................. - mul v15.8H, v30.8H, v11.H[4] // ....................e...................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v1.8H, v1.8H, v11.H[2] // ...............e........................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v1.8H, v17.8H, v7.H[0] // .................e......................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v15.8H, v2.8H, v7.H[0] // ......................e.................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqdmulh v0.8H, v27.8H, v7.H[1] // ..................................................................*........................ - // gap // ........................................................................................... - // gap // ........................................................................................... - add v24.8H, v26.8H, v1.8H // ...................e....................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v12.8H, v26.8H, v1.8H // ..................e........................................................................ 
- ldr_vo v11, x4, -16 // ......................................e.................................................... - // gap // ........................................................................................... - sqdmulh v14.8H, v19.8H, v7.H[1] // ...............................................................*........................... - sub v26.8H, v31.8H, v15.8H // .......................e................................................................... - // gap // ........................................................................................... - add v20.8H, v31.8H, v15.8H // ........................e.................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v29.8H, v16.8H, v7.H[0] // ..........................................................................*................ - srshr v30.8H, v0.8H, #11 // ...................................................................*....................... - // gap // ........................................................................................... - trn2 v8.4S, v24.4S, v12.4S // ..........................e................................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - sqdmulh v16.8H, v9.8H, v7.H[1] // .....................................................................*..................... - trn2 v18.4S, v20.4S, v26.4S // ............................e.............................................................. - // gap // ........................................................................................... 
- srshr v22.8H, v14.8H, #11 // ................................................................*.......................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v12.4S, v24.4S, v12.4S // .........................e................................................................. - // gap // ........................................................................................... - mls v27.8H, v30.8H, v7.H[0] // ....................................................................*...................... - trn2 v0.2D, v8.2D, v18.2D // ..............................e............................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v17.4S, v20.4S, v26.4S // ...........................e............................................................... - // gap // ........................................................................................... - mls v19.8H, v22.8H, v7.H[0] // .................................................................*......................... - srshr v30.8H, v16.8H, #11 // ......................................................................*.................... - ldr_vo v16, x4, -32 // .....................................e..................................................... - // gap // ........................................................................................... - trn1 v14.2D, v8.2D, v18.2D // ................................e.......................................................... - sqrdmulh v28.8H, v0.8H, v3.8H // .............................................e............................................. 
- // gap // ........................................................................................... - trn2 v1.2D, v12.2D, v17.2D // .............................e............................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v4.8H, v0.8H, v23.8H // ............................................e.............................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v26.8H, v1.8H, v3.8H // ........................................e.................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v4.8H, v28.8H, v7.H[0] // ..............................................e............................................ - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v9.8H, v30.8H, v7.H[0] // .......................................................................*................... - vext x10, v19, 0 // ...........................................................................*............... - trn1 v30.2D, v12.2D, v17.2D // ...............................e........................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v31.8H, v1.8H, v23.8H // .......................................e................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - add v2.8H, v14.8H, v4.8H // ................................................e.......................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - vext x11, v19, 1 // ............................................................................*.............. - mls v31.8H, v26.8H, v7.H[0] // .........................................e................................................. 
- str x10, [x1] , #64 // ...................................................................................*....... - sub v28.8H, v14.8H, v4.8H // ...............................................e........................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - vext x17, v29, 1 // ..................................................................................*........ - vext x16, v29, 0 // .................................................................................*......... - sqrdmulh v4.8H, v2.8H, v10.8H // ..................................................e........................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - vext x13, v27, 1 // ..............................................................................*............ - vext x12, v27, 0 // .............................................................................*............. - mul v0.8H, v2.8H, v5.8H // .................................................e......................................... - vext x14, v9, 0 // ...............................................................................*........... - add v13.8H, v30.8H, v31.8H // ...........................................e............................................... - str x11, [x1, #-32] // .......................................................................................*... - sqrdmulh v14.8H, v28.8H, v11.8H // .......................................................e................................... 
- vext x15, v9, 1 // ................................................................................*.......... - str x16, [x1, #-40] // ......................................................................................*.... - str x17, [x1, #-8] // ..........................................................................................* - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v0.8H, v4.8H, v7.H[0] // ...................................................e....................................... - str x12, [x1, #-48] // .....................................................................................*..... - // gap // ........................................................................................... - str x14, [x1, #-56] // ....................................................................................*...... - str x13, [x1, #-16] // .........................................................................................*. - // gap // ........................................................................................... - mul v6.8H, v28.8H, v16.8H // ......................................................e.................................... - str x15, [x1, #-24] // ........................................................................................*.. - // gap // ........................................................................................... - - // original source code - // ldr_vo v8, x1, 0 // .................e................................................................................................................................................................... || .........e.......................................................................................................................... 
- // ldr_vo v9, x1, 16 // ......e.............................................................................................................................................................................. || ..e................................................................................................................................. - // ldr_vo v10, x1, 32 // ....e................................................................................................................................................................................ || .e.................................................................................................................................. - // ldr_vo v11, x1, 48 // e.................................................................................................................................................................................... || e................................................................................................................................... - // ldr_vi v0, x3, 16 // .e................................................................................................................................................................................... || e................................................................................................................................... - // mul v24.8H, v10.8H, v0.H[0] // ...................e................................................................................................................................................................. || ............e....................................................................................................................... - // sqrdmulh v10.8H, v10.8H, v0.H[1] // ...............e..................................................................................................................................................................... 
|| ........e........................................................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ......................e.............................................................................................................................................................. || ..............e..................................................................................................................... - // sub v10.8H, v8.8H, v24.8H // .............................e....................................................................................................................................................... || ....................e............................................................................................................... - // add v8.8H, v8.8H, v24.8H // ...............................e..................................................................................................................................................... || ......................e............................................................................................................. - // mul v24.8H, v11.8H, v0.H[0] // ...........e......................................................................................................................................................................... || ......e............................................................................................................................. - // sqrdmulh v11.8H, v11.8H, v0.H[1] // .........e........................................................................................................................................................................... || ....e............................................................................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ..................e.................................................................................................................................................................. || ..........e......................................................................................................................... - // sub v11.8H, v9.8H, v24.8H // ........................e............................................................................................................................................................ || ................e................................................................................................................... - // add v9.8H, v9.8H, v24.8H // .......................e............................................................................................................................................................. || ...............e.................................................................................................................... - // mul v24.8H, v9.8H, v0.H[2] // .................................e................................................................................................................................................... || ........................e........................................................................................................... - // sqrdmulh v9.8H, v9.8H, v0.H[3] // ...........................e......................................................................................................................................................... || ..................e................................................................................................................. - // mls v24.8H, v9.8H, v7.H[0] // ..................................e.................................................................................................................................................. 
|| ..........................e......................................................................................................... - // sub v9.8H, v8.8H, v24.8H // ......................................e.............................................................................................................................................. || ................................e................................................................................................... - // add v8.8H, v8.8H, v24.8H // .....................................e............................................................................................................................................... || ...............................e.................................................................................................... - // mul v24.8H, v11.8H, v0.H[4] // ................................e.................................................................................................................................................... || ......................e............................................................................................................. - // sqrdmulh v11.8H, v11.8H, v0.H[5] // ............................e........................................................................................................................................................ || ....................e............................................................................................................... - // mls v24.8H, v11.8H, v7.H[0] // ...................................e................................................................................................................................................. || ............................e....................................................................................................... 
- // sub v11.8H, v10.8H, v24.8H // .........................................e........................................................................................................................................... || .................................e.................................................................................................. - // add v10.8H, v10.8H, v24.8H // ..........................................e.......................................................................................................................................... || ..................................e................................................................................................. - // trn1 v25.4S, v8.4S, v9.4S // .................................................e................................................................................................................................... || .......................................e............................................................................................ - // trn2 v26.4S, v8.4S, v9.4S // .............................................e....................................................................................................................................... || ....................................e............................................................................................... - // trn1 v27.4S, v10.4S, v11.4S // ....................................................e................................................................................................................................ || .........................................e.......................................................................................... - // trn2 v28.4S, v10.4S, v11.4S // ...............................................e..................................................................................................................................... 
|| .....................................e.............................................................................................. - // trn2 v10.2D, v25.2D, v27.2D // ..........................................................e.......................................................................................................................... || ............................................e....................................................................................... - // trn2 v11.2D, v26.2D, v28.2D // ...................................................e................................................................................................................................. || ........................................e........................................................................................... - // trn1 v8.2D, v25.2D, v27.2D // ................................................................e.................................................................................................................... || ...................................................e................................................................................ - // trn1 v9.2D, v26.2D, v28.2D // ........................................................e............................................................................................................................ || ...........................................e........................................................................................ - // ldr_vi v0, x4, 96 // ..........................e.......................................................................................................................................................... || .................e.................................................................................................................. 
- // ldr_vo v4, x4, -80 // ............e........................................................................................................................................................................ || ......e............................................................................................................................. - // ldr_vo v1, x4, -64 // .......e............................................................................................................................................................................. || ...e................................................................................................................................ - // ldr_vo v5, x4, -48 // ..e.................................................................................................................................................................................. || .e.................................................................................................................................. - // ldr_vo v2, x4, -32 // .......................................................e............................................................................................................................. || ..........................................e......................................................................................... - // ldr_vo v6, x4, -16 // .......................................e............................................................................................................................................. || ................................e................................................................................................... - // mul v24.8H, v10.8H, v0.8H // .................................................................e................................................................................................................... 
|| .....................................................e.............................................................................. - // sqrdmulh v10.8H, v10.8H, v4.8H // ............................................................e........................................................................................................................ || ...............................................e.................................................................................... - // mls v24.8H, v10.8H, v7.H[0] // ....................................................................e................................................................................................................ || .......................................................e............................................................................ - // sub v10.8H, v8.8H, v24.8H // ..........................................................................................*.......................................................................................... || ..................................................................*................................................................. - // add v8.8H, v8.8H, v24.8H // ..............................................................................e...................................................................................................... || ............................................................e....................................................................... - // mul v24.8H, v11.8H, v0.8H // ...........................................................e......................................................................................................................... || .............................................e...................................................................................... 
- // sqrdmulh v11.8H, v11.8H, v4.8H // .........................................................e........................................................................................................................... || ...........................................e........................................................................................ - // mls v24.8H, v11.8H, v7.H[0] // .............................................................e....................................................................................................................... || .................................................e.................................................................................. - // sub v11.8H, v9.8H, v24.8H // ......................................................................e.............................................................................................................. || ........................................................e........................................................................... - // add v9.8H, v9.8H, v24.8H // ..................................................................e.................................................................................................................. || ......................................................e............................................................................. - // mul v24.8H, v9.8H, v1.8H // ............................................................................e........................................................................................................ || ...........................................................e........................................................................ - // sqrdmulh v9.8H, v9.8H, v5.8H // .........................................................................e........................................................................................................... 
|| .........................................................e.......................................................................... - // mls v24.8H, v9.8H, v7.H[0] // ....................................................................................e................................................................................................ || ...............................................................e.................................................................... - // sub v9.8H, v8.8H, v24.8H // ...................................................................................................*................................................................................. || .....................................................................*.............................................................. - // add v8.8H, v8.8H, v24.8H // ................................................................................................*.................................................................................... || ....................................................................*............................................................... - // mul v24.8H, v11.8H, v2.8H // ........................................................................................e............................................................................................ || .................................................................e.................................................................. - // sqrdmulh v11.8H, v11.8H, v6.8H // ................................................................................e.................................................................................................... || .............................................................e...................................................................... 
- // mls v24.8H, v11.8H, v7.H[0] // ..............................................................................................*...................................................................................... || ...................................................................*................................................................ - // sub v11.8H, v10.8H, v24.8H // .........................................................................................................*........................................................................... || ..........................................................................*......................................................... - // add v10.8H, v10.8H, v24.8H // ...........................................................................................................*......................................................................... || ...........................................................................*........................................................ - // trn1 v25.4S, v8.4S, v9.4S // .....................................................................................................*............................................................................... || ........................................................................*........................................................... - // trn2 v26.4S, v8.4S, v9.4S // ........................................................................................................*............................................................................ || .........................................................................*.......................................................... - // trn1 v27.4S, v10.4S, v11.4S // ...............................................................................................................*..................................................................... 
|| ..............................................................................*..................................................... - // trn2 v28.4S, v10.4S, v11.4S // ................................................................................................................*.................................................................... || ...............................................................................*.................................................... - // sqdmulh v24.8H, v25.8H, v7.H[1] // ...................................................................................................................................*................................................. || ...................................................................................................*................................ - // srshr v24.8H, v24.8H, #11 // ...........................................................................................................................................*......................................... || ........................................................................................................*........................... - // mls v25.8H, v24.8H, v7.H[0] // ................................................................................................................................................*.................................... || ...........................................................................................................*........................ - // sqdmulh v24.8H, v26.8H, v7.H[1] // ...............................................................................................................................*..................................................... || ................................................................................................*................................... 
- // srshr v24.8H, v24.8H, #11 // .......................................................................................................................................*............................................. || .....................................................................................................*.............................. - // mls v26.8H, v24.8H, v7.H[0] // .............................................................................................................................................*....................................... || .........................................................................................................*.......................... - // sqdmulh v24.8H, v27.8H, v7.H[1] // .........................................................................................................................................*........................................... || .......................................................................................................*............................ - // srshr v24.8H, v24.8H, #11 // .................................................................................................................................................*................................... || ............................................................................................................*....................... - // mls v27.8H, v24.8H, v7.H[0] // .........................................................................................................................................................*........................... || .....................................................................................................................*.............. - // sqdmulh v24.8H, v28.8H, v7.H[1] // ....................................................................................................................*................................................................ 
|| ..................................................................................*................................................. - // srshr v24.8H, v24.8H, #11 // .........................................................................................................................*........................................................... || .......................................................................................*............................................ - // mls v28.8H, v24.8H, v7.H[0] // ......................................................................................................................................*.............................................. || .....................................................................................................*.............................. - // vext x10, v25, 0 // ..........................................................................................................................................................*.......................... || .....................................................................................................................*.............. - // vext x11, v25, 1 // ..............................................................................................................................................................*...................... || .........................................................................................................................*.......... - // vext x12, v26, 0 // ......................................................................................................................................................................*.............. || .............................................................................................................................*...... 
- // vext x13, v26, 1 // .....................................................................................................................................................................*............... || .............................................................................................................................*...... - // vext x14, v27, 0 // ........................................................................................................................................................................*............ || ..............................................................................................................................*..... - // vext x15, v27, 1 // ............................................................................................................................................................................*........ || ...............................................................................................................................*.... - // vext x16, v28, 0 // ...................................................................................................................................................................*................. || ...........................................................................................................................*........ - // vext x17, v28, 1 // ..................................................................................................................................................................*.................. || ...........................................................................................................................*........ - // str x10, [x1] , #64 // ................................................................................................................................................................*.................... 
|| .........................................................................................................................*.......... - // str x14, [x1, #-56] // .................................................................................................................................................................................*... || ..................................................................................................................................*. - // str x12, [x1, #-48] // ................................................................................................................................................................................*.... || .................................................................................................................................*.. - // str x16, [x1, #-40] // .............................................................................................................................................................................*....... || ...............................................................................................................................*.... - // str x11, [x1, #-32] // ..........................................................................................................................................................................*.......... || ..............................................................................................................................*..... 
- // str x15, [x1, #-24] // ....................................................................................................................................................................................* || ...................................................................................................................................* - // str x13, [x1, #-16] // ..................................................................................................................................................................................*.. || ..................................................................................................................................*. - // str x17, [x1, #-8] // ..............................................................................................................................................................................*...... || ................................................................................................................................*... - - subs count, count, #1 + // Instructions: 91 + // Expected cycles: 64 + // Expected IPC: 1.42 + // + // Cycle bound: 64.0 + // IPC bound: 1.42 + // + // Wall time: 39.05s + // User time: 39.05s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + mul v15.8H, v27.8H, v23.8H // .............................................*............................................. + ldr q16, [x1, #112] // ...e....................................................................................... + ldr q8, [x3], #16 // ....e...................................................................................... + trn1 v23.2D, v0.2D, v3.2D // ................................*.......................................................... 
+ ldr q22, [x4, #-16] // ......................................*.................................................... + ldr q6, [x1, #96] // ..e........................................................................................ + trn1 v25.2D, v13.2D, v1.2D // ...............................*........................................................... + mls v15.8H, v29.8H, v7.H[0] // ..............................................*............................................ + ldr q28, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v0.8H, v16.8H, v8.H[1] // ..........e................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v17.8H, v6.8H, v8.H[1] // .....e..................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + sub v26.8H, v23.8H, v15.8H // ...............................................*........................................... + mul v24.8H, v16.8H, v8.H[0] // ...........e............................................................................... + add v16.8H, v23.8H, v15.8H // ................................................*.......................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v24.8H, v0.8H, v7.H[0] // ............e.............................................................................. + // gap // ........................................................................................... + ldr q27, [x4, #-64] // ...................................*....................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v15.8H, v6.8H, v8.H[0] // ......e.................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + mul v6.8H, v16.8H, v27.8H // ..................................................*........................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v2.8H, v28.8H, v24.8H // .............e............................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + add v28.8H, v28.8H, v24.8H // ..............e............................................................................ + sqrdmulh v16.8H, v16.8H, v14.8H // .................................................*......................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v3.8H, v2.8H, v8.H[4] // .....................e..................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v27.8H, v26.8H, v22.8H // ......................................................*.................................... + // gap // ........................................................................................... + ldr q0, [x4, #-32] // .....................................*..................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v16.8H, v7.H[0] // ...................................................*....................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v16.8H, v26.8H, v0.8H // .......................................................*................................... + // gap // ........................................................................................... + add v26.8H, v25.8H, v19.8H // ...........................................*............................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v1.8H, v2.8H, v8.H[5] // ....................e...................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v31.8H, v26.8H, v6.8H // ....................................................*...................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v16.8H, v27.8H, v7.H[0] // ........................................................*.................................. + add v27.8H, v26.8H, v6.8H // .....................................................*..................................... + // gap // ........................................................................................... + sub v0.8H, v25.8H, v19.8H // ..........................................*................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v14.8H, v28.8H, v8.H[2] // ................e.......................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ trn1 v11.4S, v27.4S, v31.4S // ...........................................................*............................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v13.8H, v28.8H, v8.H[3] // ...............e........................................................................... + trn2 v27.4S, v27.4S, v31.4S // ............................................................*.............................. + // gap // ........................................................................................... + sub v6.8H, v0.8H, v16.8H // .........................................................*................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v15.8H, v17.8H, v7.H[0] // .......e................................................................................... + add v16.8H, v0.8H, v16.8H // ..........................................................*................................ + ldr q0, [x1, #64] // e.......................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v3.8H, v1.8H, v7.H[0] // ......................e.................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v26.4S, v16.4S, v6.4S // .............................................................*............................. + mls v14.8H, v13.8H, v7.H[0] // .................e......................................................................... + trn2 v16.4S, v16.4S, v6.4S // ..............................................................*............................ + // gap // ........................................................................................... + sub v21.8H, v0.8H, v15.8H // ........e.................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + add v28.8H, v0.8H, v15.8H // .........e................................................................................. + // gap // ........................................................................................... + sqdmulh v6.8H, v11.8H, v7.H[1] // ...............................................................*........................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v15.8H, v27.8H, v7.H[1] // ..................................................................*........................ 
+ sub v2.8H, v21.8H, v3.8H // .......................e................................................................... + // gap // ........................................................................................... + sub v29.8H, v28.8H, v14.8H // ..................e........................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + add v17.8H, v28.8H, v14.8H // ...................e....................................................................... + ldr q14, [x4, #48] // ....................................e...................................................... + sqdmulh v31.8H, v26.8H, v7.H[1] // .....................................................................*..................... + srshr v6.8H, v6.8H, #11 // ................................................................*.......................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v25.8H, v16.8H, v7.H[1] // ........................................................................*.................. + add v3.8H, v21.8H, v3.8H // ........................e.................................................................. + srshr v23.8H, v15.8H, #11 // ...................................................................*....................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ mls v11.8H, v6.8H, v7.H[0] // .................................................................*......................... + trn1 v13.4S, v17.4S, v29.4S // .........................e................................................................. + // gap // ........................................................................................... + srshr v6.8H, v31.8H, #11 // ......................................................................*.................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v27.8H, v23.8H, v7.H[0] // ....................................................................*...................... + ldr q23, [x4], #(6*16) // .................................e......................................................... + trn1 v1.4S, v3.4S, v2.4S // ...........................e............................................................... + srshr v0.8H, v25.8H, #11 // .........................................................................*................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v26.8H, v6.8H, v7.H[0] // .......................................................................*................... + trn2 v3.4S, v3.4S, v2.4S // ............................e.............................................................. + ldr q6, [x4, #-80] // ..................................e........................................................ + umov x25, v11.d[0] // ...........................................................................*............... + trn2 v2.2D, v13.2D, v1.2D // .............................e............................................................. 
+ // gap // ........................................................................................... + mls v16.8H, v0.8H, v7.H[0] // ..........................................................................*................ + trn2 v0.4S, v17.4S, v29.4S // ..........................e................................................................ + umov x18, v11.d[1] // ............................................................................*.............. + umov x13, v27.d[0] // .............................................................................*............. + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x12, v27.d[1] // ..............................................................................*............ + sqrdmulh v15.8H, v2.8H, v6.8H // .......................................e................................................... + // gap // ........................................................................................... + str x25, [x1], #( 16*4) // ...................................................................................*....... + umov x14, v26.d[0] // ...............................................................................*........... + trn2 v27.2D, v0.2D, v3.2D // ..............................e............................................................ + umov x15, v26.d[1] // ................................................................................*.......... + mul v19.8H, v2.8H, v23.8H // ........................................e.................................................. + str x18, [x1, #-32] // .......................................................................................*... + umov x27, v16.d[0] // .................................................................................*......... 
+ umov x19, v16.d[1] // ..................................................................................*........ + str x13, [x1, #-48] // .....................................................................................*..... + sqrdmulh v29.8H, v27.8H, v6.8H // ............................................e.............................................. + str x12, [x1, #-16] // .........................................................................................*. + // gap // ........................................................................................... + str x14, [x1, #-56] // ....................................................................................*...... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v19.8H, v15.8H, v7.H[0] // .........................................e................................................. + str x15, [x1, #-24] // ........................................................................................*.. + // gap // ........................................................................................... + str x27, [x1, #-40] // ......................................................................................*.... + str x19, [x1, #-8] // ..........................................................................................* + // gap // ........................................................................................... 
+ + // ----------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----- + // ldr q8, [x1, #(16*0)] // ......................................e...................................................'......................................~................................................... + // ldr q9, [x1, #(16*1)] // .......e..................................................................................'.......~.................................................................................. + // ldr q10, [x1, #(16*2)] // ....e.....................................................................................'....~..................................................................................... + // ldr q11, [x1, #(16*3)] // e.........................................................................................'~......................................................................................... + // ldr q0, [x3], #16 // .e........................................................................................'.~........................................................................................ + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .........e................................................................................'.........~................................................................................ + // mul v24.8h, v10.8h, v0.h[0] // ...............e..........................................................................'...............~.......................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ....................................e.....................................................'....................................~..................................................... + // sub v10.8h, v8.8h, v24.8h // ...........................................e..............................................'...........................................~.............................................. + // add v8.8h, v8.8h, v24.8h // ............................................e.............................................'............................................~............................................. + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ........e.................................................................................'........~................................................................................. + // mul v24.8h, v11.8h, v0.h[0] // ...........e..............................................................................'...........~.............................................................................. + // mls v24.8h, v27.8h, v7.h[0] // .............e............................................................................'.............~............................................................................ + // sub v11.8h, v9.8h, v24.8h // .................e........................................................................'.................~........................................................................ + // add v9.8h, v9.8h, v24.8h // ..................e.......................................................................'..................~....................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // .................................e........................................................'.................................~........................................................ 
+ // mul v24.8h, v9.8h, v0.h[2] // ...............................e..........................................................'...............................~.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................................e................................................'.........................................~................................................ + // sub v9.8h, v8.8h, v24.8h // ................................................e.........................................'................................................~......................................... + // add v8.8h, v8.8h, v24.8h // .................................................e........................................'.................................................~........................................ + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..........................e...............................................................'..........................~............................................................... + // mul v24.8h, v11.8h, v0.h[4] // ....................e.....................................................................'....................~..................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................................e..................................................'.......................................~.................................................. + // sub v11.8h, v10.8h, v24.8h // ...............................................e..........................................'...............................................~.......................................... + // add v10.8h, v10.8h, v24.8h // ......................................................e...................................'......................................................~................................... 
+ // trn1 v25.4s, v8.4s, v9.4s // .........................................................e................................'.........................................................~................................ + // trn2 v26.4s, v8.4s, v9.4s // .....................................................................e....................'.....................................................................~.................... + // trn1 v27.4s, v10.4s, v11.4s // .............................................................e............................'.............................................................~............................ + // trn2 v28.4s, v10.4s, v11.4s // ................................................................e.........................'................................................................~......................... + // trn2 v10.2d, v25.2d, v27.2d // ...................................................................e......................'...................................................................~...................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................................................e.............'............................................................................~............. + // trn1 v8.2d, v25.2d, v27.2d // .....~....................................................................................'.....*.................................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..~.......................................................................................'..*....................................................................................... + // ldr q0, [ x4], #(6*16) // ............................................................e.............................'............................................................~............................. 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // .................................................................e........................'.................................................................~........................ + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..............~...........................................................................'..............*........................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ..................................................e.......................................'..................................................~....................................... + // ldr q2, [ x4, #(-6*16 + 4*16)] // ......................~...................................................................'......................*................................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ...~......................................................................................'...*...................................................................................... + // sqrdmulh v27.8h, v10.8h, v4.8h // .........................................................................e................'.........................................................................~................ + // mul v24.8h, v10.8h, v0.8h // ..............................................................................e...........'..............................................................................~........... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................................................e...'......................................................................................~... + // sub v10.8h, v8.8h, v24.8h // ..............................~...........................................................'..............................*........................................................... 
+ // add v8.8h, v8.8h, v24.8h // .........................~................................................................'.........................*................................................................ + // sqrdmulh v27.8h, v11.8h, v4.8h // ...................................................................................e......'...................................................................................~...... + // mul v24.8h, v11.8h, v0.8h // ..........................................................................................*.......................................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ......~...................................................................................'......*................................................................................... + // sub v11.8h, v9.8h, v24.8h // ..........~...............................................................................'..........*............................................................................... + // add v9.8h, v9.8h, v24.8h // ............~.............................................................................'............*............................................................................. + // sqrdmulh v27.8h, v9.8h, v5.8h // ...................~......................................................................'...................*...................................................................... + // mul v24.8h, v9.8h, v1.8h // ................~.........................................................................'................*......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .......................~..................................................................'.......................*.................................................................. 
+ // sub v9.8h, v8.8h, v24.8h // ...........................~..............................................................'...........................*.............................................................. + // add v8.8h, v8.8h, v24.8h // .............................~............................................................'.............................*............................................................ + // sqrdmulh v27.8h, v11.8h, v6.8h // .....................~....................................................................'.....................*.................................................................... + // mul v24.8h, v11.8h, v2.8h // ........................~.................................................................'........................*................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ............................~.............................................................'............................*............................................................. + // sub v11.8h, v10.8h, v24.8h // ...................................~......................................................'...................................*...................................................... + // add v10.8h, v10.8h, v24.8h // .....................................~....................................................'.....................................*.................................................... + // trn1 v25.4s, v8.4s, v9.4s // ................................~.........................................................'................................*......................................................... + // trn2 v26.4s, v8.4s, v9.4s // ..................................~.......................................................'..................................*....................................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // ........................................~.................................................'........................................*................................................. + // trn2 v28.4s, v10.4s, v11.4s // ..........................................~...............................................'..........................................*............................................... + // sqdmulh v24.8h, v25.8h, v7.h[1] // .............................................~............................................'.............................................*............................................ + // srshr v24.8h, v24.8h, #11 // ....................................................~.....................................'....................................................*..................................... + // mls v25.8h, v24.8h, v7.h[0] // ........................................................~.................................'........................................................*................................. + // sqdmulh v24.8h, v26.8h, v7.h[1] // ..............................................~...........................................'..............................................*........................................... + // srshr v24.8h, v24.8h, #11 // .......................................................~..................................'.......................................................*.................................. + // mls v26.8h, v24.8h, v7.h[0] // ...........................................................~..............................'...........................................................*.............................. + // sqdmulh v24.8h, v27.8h, v7.h[1] // ...................................................~......................................'...................................................*...................................... 
+ // srshr v24.8h, v24.8h, #11 // ..........................................................~...............................'..........................................................*............................... + // mls v27.8h, v24.8h, v7.h[0] // ...............................................................~..........................'...............................................................*.......................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // .....................................................~....................................'.....................................................*.................................... + // srshr v24.8h, v24.8h, #11 // ..............................................................~...........................'..............................................................*........................... + // mls v28.8h, v24.8h, v7.h[0] // ....................................................................~.....................'....................................................................*..................... + // umov x10, v25.d[0] // ..................................................................~.......................'..................................................................*....................... + // umov x11, v25.d[1] // ......................................................................~...................'......................................................................*................... + // umov x12, v26.d[0] // .......................................................................~..................'.......................................................................*.................. + // umov x13, v26.d[1] // ........................................................................~.................'........................................................................*................. 
+ // umov x14, v27.d[0] // ...........................................................................~..............'...........................................................................*.............. + // umov x15, v27.d[1] // .............................................................................~............'.............................................................................*............ + // umov x16, v28.d[0] // ................................................................................~.........'................................................................................*......... + // umov x17, v28.d[1] // .................................................................................~........'.................................................................................*........ + // str x10, [x1], #( 16*4) // ..........................................................................~...............'..........................................................................*............... + // str x14, [x1, #(-16*4 + 8*1)] // .....................................................................................~....'.....................................................................................*.... + // str x12, [x1, #(-16*4 + 8*2)] // ..................................................................................~.......'..................................................................................*....... + // str x16, [x1, #(-16*4 + 8*3)] // ........................................................................................~.'........................................................................................*. + // str x11, [x1, #(-16*4 + 8*4)] // ...............................................................................~..........'...............................................................................*.......... 
+ // str x15, [x1, #(-16*4 + 8*5)] // .......................................................................................~..'.......................................................................................*.. + // str x13, [x1, #(-16*4 + 8*6)] // ....................................................................................~.....'....................................................................................*..... + // str x17, [x1, #(-16*4 + 8*7)] // .........................................................................................~'.........................................................................................* + + sub count, count, #1 cbnz count, layer4567_start - mls v6.8H, v14.8H, v7.H[0] // .*.................................... - add v19.8H, v13.8H, v0.8H // ..*................................... - // gap // ...................................... - sub v16.8H, v13.8H, v0.8H // ...*.................................. - // gap // ...................................... - // gap // ...................................... - sub v1.8H, v30.8H, v31.8H // *..................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - trn1 v5.4S, v19.4S, v16.4S // ....*................................. - trn2 v19.4S, v19.4S, v16.4S // .....*................................ - // gap // ...................................... - sub v16.8H, v1.8H, v6.8H // ......*............................... - add v1.8H, v1.8H, v6.8H // .......*.............................. - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- sqdmulh v21.8H, v5.8H, v7.H[1] // .............*........................ - // gap // ...................................... - // gap // ...................................... - trn2 v24.4S, v1.4S, v16.4S // .........*............................ - // gap // ...................................... - // gap // ...................................... - trn1 v16.4S, v1.4S, v16.4S // ........*............................. - sqdmulh v1.8H, v19.8H, v7.H[1] // ............*......................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - sqdmulh v2.8H, v24.8H, v7.H[1] // ..........*........................... - // gap // ...................................... - // gap // ...................................... - srshr v21.8H, v21.8H, #11 // .................*.................... - // gap // ...................................... - // gap // ...................................... - sqdmulh v8.8H, v16.8H, v7.H[1] // ................*..................... - // gap // ...................................... - // gap // ...................................... - srshr v1.8H, v1.8H, #11 // ...............*...................... - // gap // ...................................... - // gap // ...................................... - mls v5.8H, v21.8H, v7.H[0] // ...................*.................. - // gap // ...................................... - // gap // ...................................... - srshr v21.8H, v2.8H, #11 // ...........*.......................... - // gap // ...................................... - // gap // ...................................... - mls v19.8H, v1.8H, v7.H[0] // ..................*................... - // gap // ...................................... - // gap // ...................................... - srshr v1.8H, v8.8H, #11 // ....................*................. 
- // gap // ...................................... - // gap // ...................................... - mls v24.8H, v21.8H, v7.H[0] // ..............*....................... - // gap // ...................................... - // gap // ...................................... - vext x10, v5, 0 // ......................*............... - vext x11, v5, 1 // .......................*.............. - // gap // ...................................... - mls v16.8H, v1.8H, v7.H[0] // .....................*................ - // gap // ...................................... - // gap // ...................................... - vext x13, v19, 1 // ...........................*.......... - vext x12, v19, 0 // ............................*......... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - vext x17, v24, 1 // .........................*............ - vext x16, v24, 0 // ..........................*........... - str x10, [x1] , #64 // ........................*............. - str x11, [x1, #-32] // ..............................*....... - // gap // ...................................... - // gap // ...................................... - vext x14, v16, 0 // .............................*........ - vext x15, v16, 1 // ...............................*...... - str x12, [x1, #-48] // ..................................*... - str x13, [x1, #-16] // ....................................*. - // gap // ...................................... - // gap // ...................................... - str x16, [x1, #-40] // ................................*..... - str x17, [x1, #-8] // .................................*.... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... - // gap // ...................................... 
- str x14, [x1, #-56] // ...................................*.. - str x15, [x1, #-24] // .....................................* - // gap // ...................................... - - // original source code - // sub v25.8H, v30.8H, v31.8H // ...*.................................. || ..*............................ - // mls v6.8H, v14.8H, v7.H[0] // *..................................... || *.............................. - // add v18.8H, v13.8H, v0.8H // .*.................................... || *.............................. - // sub v16.8H, v13.8H, v0.8H // ..*................................... || .*............................. - // trn1 v19.4S, v18.4S, v16.4S // ....*................................. || ....*.......................... - // trn2 v27.4S, v18.4S, v16.4S // .....*................................ || ....*.......................... - // sub v29.8H, v25.8H, v6.8H // ......*............................... || .....*......................... - // add v2.8H, v25.8H, v6.8H // .......*.............................. || .....*......................... - // trn1 v9.4S, v2.4S, v29.4S // ..........*........................... || .........*..................... - // trn2 v29.4S, v2.4S, v29.4S // .........*............................ || ........*...................... - // sqdmulh v13.8H, v29.8H, v7.H[1] // ............*......................... || ...........*................... - // srshr v16.8H, v13.8H, #11 // .................*.................... || ................*.............. - // sqdmulh v0.8H, v27.8H, v7.H[1] // ...........*.......................... || .........*..................... - // sqdmulh v14.8H, v19.8H, v7.H[1] // ........*............................. || .......*....................... - // mls v29.8H, v16.8H, v7.H[0] // ....................*................. || ...................*........... - // srshr v30.8H, v0.8H, #11 // ...............*...................... || ..............*................ 
- // sqdmulh v16.8H, v9.8H, v7.H[1] // ..............*....................... || .............*................. - // srshr v22.8H, v14.8H, #11 // .............*........................ || ............*.................. - // mls v27.8H, v30.8H, v7.H[0] // ..................*................... || .................*............. - // mls v19.8H, v22.8H, v7.H[0] // ................*..................... || ...............*............... - // srshr v30.8H, v16.8H, #11 // ...................*.................. || ..................*............ - // mls v9.8H, v30.8H, v7.H[0] // .......................*.............. || .....................*......... - // vext x10, v19, 0 // .....................*................ || ....................*.......... - // vext x11, v19, 1 // ......................*............... || ....................*.......... - // str x10, [x1] , #64 // ............................*......... || ........................*...... - // vext x17, v29, 1 // ..........................*........... || ........................*...... - // vext x16, v29, 0 // ...........................*.......... || ........................*...... - // vext x13, v27, 1 // ........................*............. || ......................*........ - // vext x12, v27, 0 // .........................*............ || ......................*........ - // vext x14, v9, 0 // ..............................*....... || ..........................*.... - // str x11, [x1, #-32] // .............................*........ || .........................*..... - // vext x15, v9, 1 // ...............................*...... || ..........................*.... - // str x16, [x1, #-40] // ..................................*... || ............................*.. - // str x17, [x1, #-8] // ...................................*.. || ............................*.. - // str x12, [x1, #-48] // ................................*..... || ..........................*.... 
- // str x14, [x1, #-56] // ....................................*. || ..............................* - // str x13, [x1, #-16] // .................................*.... || ...........................*... - // str x15, [x1, #-24] // .....................................* || ..............................* - + // Instructions: 53 + // Expected cycles: 52 + // Expected IPC: 1.02 + // + // Cycle bound: 52.0 + // IPC bound: 1.02 + // + // Wall time: 1.03s + // User time: 1.03s + // + // ---------------- original position -----------------> + // 0 25 50 + // |------------------------|------------------------|-- + mul v15.8H, v27.8H, v23.8H // *.................................................... + trn1 v16.2D, v0.2D, v3.2D // .*................................................... + ldr q5, [x4, #-16] // ..*.................................................. + trn1 v0.2D, v13.2D, v1.2D // ...*................................................. + ldr q4, [x4, #-64] // .......*............................................. + // gap // ..................................................... + mls v15.8H, v29.8H, v7.H[0] // ....*................................................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + add v29.8H, v0.8H, v19.8H // ..............*...................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + sub v26.8H, v16.8H, v15.8H // .....*............................................... + add v6.8H, v16.8H, v15.8H // ......*.............................................. + ldr q15, [x4, #-32] // ...........*......................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v27.8H, v6.8H, v14.8H // .........*........................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqrdmulh v16.8H, v26.8H, v5.8H // ..........*.......................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v6.8H, v6.8H, v4.8H // ........*............................................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v6.8H, v27.8H, v7.H[0] // ............*........................................ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mul v26.8H, v26.8H, v15.8H // .............*....................................... + sub v15.8H, v0.8H, v19.8H // ..................*.................................. + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + mls v26.8H, v16.8H, v7.H[0] // ................*.................................... + // gap // ..................................................... + // gap // ..................................................... + sub v27.8H, v29.8H, v6.8H // ...............*..................................... + // gap // ..................................................... + // gap // ..................................................... + add v16.8H, v29.8H, v6.8H // .................*................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... 
+ trn1 v6.4S, v16.4S, v27.4S // ...................*................................. + add v14.8H, v15.8H, v26.8H // ......................*.............................. + // gap // ..................................................... + sub v26.8H, v15.8H, v26.8H // .....................*............................... + // gap // ..................................................... + // gap // ..................................................... + trn2 v16.4S, v16.4S, v27.4S // ....................*................................ + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v0.8H, v6.8H, v7.H[1] // .........................*........................... + // gap // ..................................................... + // gap // ..................................................... + trn2 v15.4S, v14.4S, v26.4S // ........................*............................ + // gap // ..................................................... + // gap // ..................................................... + trn1 v27.4S, v14.4S, v26.4S // .......................*............................. + sqdmulh v14.8H, v16.8H, v7.H[1] // ..........................*.......................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + sqdmulh v26.8H, v15.8H, v7.H[1] // .............................*....................... + // gap // ..................................................... + // gap // ..................................................... + srshr v0.8H, v0.8H, #11 // ............................*........................ + // gap // ..................................................... + // gap // ..................................................... 
+ sqdmulh v11.8H, v27.8H, v7.H[1] // ...........................*......................... + // gap // ..................................................... + // gap // ..................................................... + srshr v14.8H, v14.8H, #11 // ..............................*...................... + // gap // ..................................................... + // gap // ..................................................... + mls v6.8H, v0.8H, v7.H[0] // ...............................*..................... + // gap // ..................................................... + // gap // ..................................................... + srshr v26.8H, v26.8H, #11 // ..................................*.................. + // gap // ..................................................... + // gap // ..................................................... + mls v16.8H, v14.8H, v7.H[0] // .................................*................... + // gap // ..................................................... + // gap // ..................................................... + srshr v14.8H, v11.8H, #11 // ................................*.................... + // gap // ..................................................... + // gap // ..................................................... + mls v15.8H, v26.8H, v7.H[0] // .....................................*............... + // gap // ..................................................... + // gap // ..................................................... + umov x27, v6.d[0] // ....................................*................ + umov x19, v6.d[1] // ......................................*.............. + // gap // ..................................................... + mls v27.8H, v14.8H, v7.H[0] // ...................................*................. + // gap // ..................................................... + // gap // ..................................................... 
+ umov x13, v16.d[0] // .......................................*............. + umov x12, v16.d[1] // ........................................*............ + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + // gap // ..................................................... + str x27, [x1], #( 16*4) // .........................................*........... + str x19, [x1, #-32] // ............................................*........ + umov x27, v15.d[0] // .............................................*....... + umov x19, v15.d[1] // ..............................................*...... + // gap // ..................................................... + // gap // ..................................................... + umov x14, v27.d[0] // ..........................................*.......... + umov x15, v27.d[1] // ...........................................*......... + str x13, [x1, #-48] // ...............................................*..... + str x12, [x1, #-16] // ................................................*.... + // gap // ..................................................... + // gap // ..................................................... + str x27, [x1, #-40] // ...................................................*. + // gap // ..................................................... + // gap // ..................................................... + str x19, [x1, #-8] // ....................................................* + // gap // ..................................................... + // gap // ..................................................... + str x14, [x1, #-56] // .................................................*... + str x15, [x1, #-24] // ..................................................*.. + // gap // ..................................................... 
+ + // ------------------- new position -------------------> + // 0 25 50 + // |------------------------|------------------------|-- + // mul v15.8H, v27.8H, v23.8H // *.................................................... + // trn1 v23.2D, v0.2D, v3.2D // .*................................................... + // ldr q22, [x4, #-16] // ..*.................................................. + // trn1 v25.2D, v13.2D, v1.2D // ...*................................................. + // mls v15.8H, v29.8H, v7.H[0] // .....*............................................... + // sub v26.8H, v23.8H, v15.8H // .......*............................................. + // add v16.8H, v23.8H, v15.8H // ........*............................................ + // ldr q27, [x4, #-64] // ....*................................................ + // mul v6.8H, v16.8H, v27.8H // ............*........................................ + // sqrdmulh v16.8H, v16.8H, v14.8H // ..........*.......................................... + // sqrdmulh v27.8H, v26.8H, v22.8H // ...........*......................................... + // ldr q0, [x4, #-32] // .........*........................................... + // mls v6.8H, v16.8H, v7.H[0] // .............*....................................... + // mul v16.8H, v26.8H, v0.8H // ..............*...................................... + // add v26.8H, v25.8H, v19.8H // ......*.............................................. + // sub v31.8H, v26.8H, v6.8H // .................*................................... + // mls v16.8H, v27.8H, v7.H[0] // ................*.................................... + // add v27.8H, v26.8H, v6.8H // ..................*.................................. + // sub v0.8H, v25.8H, v19.8H // ...............*..................................... + // trn1 v11.4S, v27.4S, v31.4S // ...................*................................. + // trn2 v27.4S, v27.4S, v31.4S // ......................*.............................. 
+ // sub v6.8H, v0.8H, v16.8H // .....................*............................... + // add v16.8H, v0.8H, v16.8H // ....................*................................ + // trn1 v26.4S, v16.4S, v6.4S // .........................*........................... + // trn2 v16.4S, v16.4S, v6.4S // ........................*............................ + // sqdmulh v6.8H, v11.8H, v7.H[1] // .......................*............................. + // sqdmulh v15.8H, v27.8H, v7.H[1] // ..........................*.......................... + // sqdmulh v31.8H, v26.8H, v7.H[1] // .............................*....................... + // srshr v6.8H, v6.8H, #11 // ............................*........................ + // sqdmulh v25.8H, v16.8H, v7.H[1] // ...........................*......................... + // srshr v23.8H, v15.8H, #11 // ..............................*...................... + // mls v11.8H, v6.8H, v7.H[0] // ...............................*..................... + // srshr v6.8H, v31.8H, #11 // ..................................*.................. + // mls v27.8H, v23.8H, v7.H[0] // .................................*................... + // srshr v0.8H, v25.8H, #11 // ................................*.................... + // mls v26.8H, v6.8H, v7.H[0] // ......................................*.............. + // umov x25, v11.d[0] // ....................................*................ + // mls v16.8H, v0.8H, v7.H[0] // ...................................*................. + // umov x18, v11.d[1] // .....................................*............... + // umov x13, v27.d[0] // .......................................*............. + // umov x12, v27.d[1] // ........................................*............ + // str x25, [x1], #( 16*4) // .........................................*........... + // umov x14, v26.d[0] // .............................................*....... 
+ // umov x15, v26.d[1] // ..............................................*...... + // str x18, [x1, #-32] // ..........................................*.......... + // umov x27, v16.d[0] // ...........................................*......... + // umov x19, v16.d[1] // ............................................*........ + // str x13, [x1, #-48] // ...............................................*..... + // str x12, [x1, #-16] // ................................................*.... + // str x14, [x1, #-56] // ...................................................*. + // str x15, [x1, #-24] // ....................................................* + // str x27, [x1, #-40] // .................................................*... + // str x19, [x1, #-8] // ..................................................*.. + pop_stack ret \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_firestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_firestorm.s index 95a964d3..55066e36 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_firestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_firestorm.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. 
- -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -66,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -83,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -102,21 +73,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] 
.endm .macro transpose4 data @@ -165,7 +136,7 @@ .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -176,7 +147,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -186,7 +157,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -194,7 +165,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -205,19 +176,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // @slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -356,1466 +327,1548 @@ _ntt_kyber_123_4567_scalar_store_opt_m1_firestorm: load_roots_123 .p2align 2 - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - ldr q26, [x0, #448] // ..*................................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - ldr q10, [x0, #384] // *................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - ldr q21, [x0, #320] // .*.................................. - ldr q29, [x0, #128] // ....*............................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - ldr q13, [x0, #192] // .....*.............................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v2.8H, v26.8H, v0.H[0] // ..........*......................... - ldr q5, [x0, #256] // .......*............................ - // gap // .................................... - sqrdmulh v4.8H, v26.8H, v0.H[1] // .........*.......................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v18.8H, v10.8H, v0.H[0] // ...........*........................ - sqrdmulh v26.8H, v10.8H, v0.H[1] // ........*........................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - sqrdmulh v31.8H, v21.8H, v0.H[1] // ..............*..................... - mul v21.8H, v21.8H, v0.H[0] // ............*....................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v2.8H, v4.8H, v7.H[0] // ...............*.................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v15.8H, v5.8H, v0.H[0] // ....................*............... - mls v18.8H, v26.8H, v7.H[0] // ................*................... - sqrdmulh v20.8H, v5.8H, v0.H[1] // .............*...................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v21.8H, v31.8H, v7.H[0] // .................*.................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - add v19.8H, v13.8H, v2.8H // ...................*................ - sub v30.8H, v13.8H, v2.8H // ..................*................. 
- // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mls v15.8H, v20.8H, v7.H[0] // ...........................*........ - sub v14.8H, v29.8H, v18.8H // .....................*.............. - add v25.8H, v29.8H, v18.8H // ......................*............. - ldr q2, [x0, #64] // ......*............................. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sqrdmulh v8.8H, v30.8H, v0.H[5] // ........................*........... - mul v27.8H, v19.8H, v0.H[2] // .........................*.......... - sqrdmulh v31.8H, v19.8H, v0.H[3] // .......................*............ - mul v4.8H, v30.8H, v0.H[4] // ..........................*......... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - sqrdmulh v22.8H, v25.8H, v0.H[3] // ................................*... - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... - mul v6.8H, v14.8H, v0.H[4] // .............................*...... - sqrdmulh v10.8H, v14.8H, v0.H[5] // ..............................*..... - mul v14.8H, v25.8H, v0.H[2] // ...................................* - ldr q20, [x0, #0] // ...*................................ - // gap // .................................... - // gap // .................................... 
- // gap // .................................... - // gap // .................................... - mls v4.8H, v8.8H, v7.H[0] // .................................*.. - sub v28.8H, v2.8H, v21.8H // ............................*....... - add v8.8H, v2.8H, v21.8H // ...............................*.... - mls v27.8H, v31.8H, v7.H[0] // ..................................*. - // gap // .................................... - // gap // .................................... - // gap // .................................... - // gap // .................................... + // Instructions: 34 + // Expected cycles: 16 + // Expected IPC: 2.12 + // + // Cycle bound: 16.0 + // IPC bound: 2.12 + // + // Wall time: 0.37s + // User time: 0.37s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + ldr q15, [x0, #320] // .*................................ + ldr q16, [x0, #448] // ..*............................... + ldr q27, [x0, #128] // *................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q14, [x0, #384] // ...*.............................. + ldr q6, [x0, #256] // ..............*................... + ldr q26, [x0, #192] // ....*............................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q11, [x0, #64] // ...................*.............. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v13.8H, v15.8H, v0.H[1] // .......*.......................... + mul v15.8H, v15.8H, v0.H[0] // ........*......................... + sqrdmulh v31.8H, v16.8H, v0.H[1] // .....*............................ + mul v16.8H, v16.8H, v0.H[0] // ......*........................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v22.8H, v14.8H, v0.H[1] // .........*........................ + mul v3.8H, v14.8H, v0.H[0] // ..........*....................... + sqrdmulh v19.8H, v6.8H, v0.H[1] // ............................*..... + mul v25.8H, v6.8H, v0.H[0] // .............................*.... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v15.8H, v13.8H, v7.H[0] // ...........*...................... + mls v16.8H, v31.8H, v7.H[0] // ............*..................... + // gap // .................................. 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v3.8H, v22.8H, v7.H[0] // .............*.................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sub v14.8H, v26.8H, v16.8H // ...............*.................. + add v16.8H, v26.8H, v16.8H // ................*................. + sub v28.8H, v11.8H, v15.8H // ..............................*... + add v9.8H, v11.8H, v15.8H // ...............................*.. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v6.8H, v27.8H, v3.8H // .................*................ + sub v27.8H, v27.8H, v3.8H // ..................*............... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v26.8H, v14.8H, v0.H[5] // ......................*........... 
+ mul v21.8H, v14.8H, v0.H[4] // .......................*.......... + mul v15.8H, v16.8H, v0.H[2] // ....................*............. + sqrdmulh v14.8H, v16.8H, v0.H[3] // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + sqrdmulh v16.8H, v6.8H, v0.H[3] // ........................*......... + mul v5.8H, v6.8H, v0.H[2] // .........................*........ + sqrdmulh v24.8H, v27.8H, v0.H[5] // ................................*. + mul v4.8H, v27.8H, v0.H[4] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v21.8H, v26.8H, v7.H[0] // ..........................*....... + mls v15.8H, v14.8H, v7.H[0] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. - // original source code - // ldr q24, [x0, #384] // .*.................................. - // ldr q21, [x0, #320] // ..*................................. - // ldr q15, [x0, #448] // *................................... - // ldr q20, [x0, #0] // ...............................*.... 
- // ldr q30, [x0, #128] // ...*................................ - // ldr q17, [x0, #192] // ....*............................... - // ldr q11, [x0, #64] // ......................*............. - // ldr q2, [x0, #256] // ......*............................. - // sqrdmulh v10.8H, v24.8H, v0.H[1] // .........*.......................... - // sqrdmulh v19.8H, v15.8H, v0.H[1] // .......*............................ - // mul v15.8H, v15.8H, v0.H[0] // .....*.............................. - // mul v31.8H, v24.8H, v0.H[0] // ........*........................... - // mul v5.8H, v21.8H, v0.H[0] // ...........*........................ - // sqrdmulh v8.8H, v2.8H, v0.H[1] // ...............*.................... - // sqrdmulh v23.8H, v21.8H, v0.H[1] // ..........*......................... - // mls v15.8H, v19.8H, v7.H[0] // ............*....................... - // mls v31.8H, v10.8H, v7.H[0] // ..............*..................... - // mls v5.8H, v23.8H, v7.H[0] // ................*................... - // sub v19.8H, v17.8H, v15.8H // ..................*................. - // add v17.8H, v17.8H, v15.8H // .................*.................. - // mul v15.8H, v2.8H, v0.H[0] // .............*...................... - // sub v10.8H, v30.8H, v31.8H // ....................*............... - // add v3.8H, v30.8H, v31.8H // .....................*.............. - // sqrdmulh v9.8H, v17.8H, v0.H[3] // .........................*.......... - // sqrdmulh v23.8H, v19.8H, v0.H[5] // .......................*............ - // mul v27.8H, v17.8H, v0.H[2] // ........................*........... - // mul v4.8H, v19.8H, v0.H[4] // ..........................*......... - // mls v15.8H, v8.8H, v7.H[0] // ...................*................ - // sub v28.8H, v11.8H, v5.8H // .................................*.. - // mul v6.8H, v10.8H, v0.H[4] // ............................*....... - // sqrdmulh v10.8H, v10.8H, v0.H[5] // .............................*...... 
- // add v8.8H, v11.8H, v5.8H // ..................................*. - // sqrdmulh v22.8H, v3.8H, v0.H[3] // ...........................*........ - // mls v4.8H, v23.8H, v7.H[0] // ................................*... - // mls v27.8H, v9.8H, v7.H[0] // ...................................* - // mul v14.8H, v3.8H, v0.H[2] // ..............................*..... + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // ldr q29, [x0, #128] // ..*............................... + // ldr q17, [x0, #320] // *................................. + // ldr q3, [x0, #448] // .*................................ + // ldr q22, [x0, #384] // ...*.............................. + // ldr q8, [x0, #192] // .....*............................ + // sqrdmulh v14.8H, v3.8H, v0.H[1] // .........*........................ + // mul v26.8H, v3.8H, v0.H[0] // ..........*....................... + // sqrdmulh v21.8H, v17.8H, v0.H[1] // .......*.......................... + // mul v30.8H, v17.8H, v0.H[0] // ........*......................... + // sqrdmulh v2.8H, v22.8H, v0.H[1] // ...........*...................... + // mul v12.8H, v22.8H, v0.H[0] // ............*..................... + // mls v30.8H, v21.8H, v7.H[0] // ...............*.................. + // mls v26.8H, v14.8H, v7.H[0] // ................*................. + // mls v12.8H, v2.8H, v7.H[0] // .................*................ + // ldr q25, [x0, #256] // ....*............................. + // sub v13.8H, v8.8H, v26.8H // ..................*............... + // add v2.8H, v8.8H, v26.8H // ...................*.............. + // add v27.8H, v29.8H, v12.8H // ......................*........... + // sub v3.8H, v29.8H, v12.8H // .......................*.......... + // ldr q12, [x0, #64] // ......*........................... + // mul v15.8H, v2.8H, v0.H[2] // ..........................*....... + // sqrdmulh v19.8H, v2.8H, v0.H[3] // ...........................*...... 
+ // sqrdmulh v6.8H, v13.8H, v0.H[5] // ........................*......... + // mul v21.8H, v13.8H, v0.H[4] // .........................*........ + // sqrdmulh v16.8H, v27.8H, v0.H[3] // ............................*..... + // mul v5.8H, v27.8H, v0.H[2] // .............................*.... + // mls v21.8H, v6.8H, v7.H[0] // ................................*. + // mls v15.8H, v19.8H, v7.H[0] // .................................* + // sqrdmulh v19.8H, v25.8H, v0.H[1] // .............*.................... + // mul v25.8H, v25.8H, v0.H[0] // ..............*................... + // sub v28.8H, v12.8H, v30.8H // ....................*............. + // add v9.8H, v12.8H, v30.8H // .....................*............ + // sqrdmulh v24.8H, v3.8H, v0.H[5] // ..............................*... + // mul v4.8H, v3.8H, v0.H[4] // ...............................*.. sub count, count, #1 layer123_start: - ldr q24, [x0, #400] // ......e..................................................................... - ldr q21, [x0, #336] // .....e...................................................................... - sub v9.8H, v20.8H, v15.8H // ...........*................................................................ - add v12.8H, v20.8H, v15.8H // ............*............................................................... + // Instructions: 76 + // Expected cycles: 17 + // Expected IPC: 4.47 + // + // Cycle bound: 16.0 + // IPC bound: 4.75 + // + // Wall time: 3600.46s + // User time: 3600.46s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + ldr q29, [x0, #144] // ..e......................................................................... + ldr q17, [x0, #336] // .....e...................................................................... // gap // ............................................................................ 
- ldr q15, [x0, #464] // .......e.................................................................... - mls v6.8H, v10.8H, v7.H[0] // ........................................*................................... // gap // ............................................................................ - ldr q20, [x0, #16] // e........................................................................... - add v26.8H, v28.8H, v4.8H // ...............................................*............................ - add v13.8H, v8.8H, v27.8H // .....................................*...................................... - ldr q30, [x0, #144] // ..e......................................................................... - sub v18.8H, v8.8H, v27.8H // ....................................*....................................... - sub v4.8H, v28.8H, v4.8H // ..............................................*............................. - ldr q17, [x0, #208] // ...e........................................................................ + ldr q3, [x0, #464] // .......e.................................................................... // gap // ............................................................................ - mls v14.8H, v22.8H, v7.H[0] // ..............................*............................................. - ldr q11, [x0, #80] // .e.......................................................................... - ldr q2, [x0, #272] // ....e....................................................................... + mls v5.8H, v16.8H, v7.H[0] // ..............................*............................................. // gap // ............................................................................ + sub v31.8H, v28.8H, v21.8H // ..............................................*............................. + ldr q22, [x0, #400] // ......e..................................................................... 
+ add v13.8H, v28.8H, v21.8H // ...............................................*............................ + ldr q20, [x0, #0] // *........................................................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + sub v18.8H, v9.8H, v15.8H // ....................................*....................................... + mls v4.8H, v24.8H, v7.H[0] // ........................................*................................... + add v24.8H, v9.8H, v15.8H // .....................................*...................................... + ldr q8, [x0, #208] // ...e........................................................................ // gap // ............................................................................ - mul v28.8H, v4.8H, v1.H[4] // ...............................................................*............ - mul v27.8H, v13.8H, v0.H[6] // ................................................*........................... // gap // ............................................................................ - sqrdmulh v3.8H, v13.8H, v0.H[7] // .................................................*.......................... // gap // ............................................................................ + mls v25.8H, v19.8H, v7.H[0] // ..........*................................................................. // gap // ............................................................................ + sqrdmulh v27.8H, v18.8H, v1.H[1] // .....................................................*...................... + mul v23.8H, v18.8H, v1.H[0] // ......................................................*..................... 
+ sqrdmulh v6.8H, v13.8H, v1.H[3] // ..........................................................*................. + mul v10.8H, v13.8H, v1.H[2] // ...........................................................*................ // gap // ............................................................................ - sqrdmulh v23.8H, v4.8H, v1.H[5] // ................................................................*........... - sqrdmulh v10.8H, v24.8H, v0.H[1] // ...................e........................................................ - sqrdmulh v19.8H, v15.8H, v0.H[1] // ........................e................................................... - mul v15.8H, v15.8H, v0.H[0] // .......................e.................................................... - sub v4.8H, v9.8H, v6.8H // .........................................*.................................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + sqrdmulh v14.8H, v3.8H, v0.H[1] // .......................e.................................................... + mul v26.8H, v3.8H, v0.H[0] // ........................e................................................... + sqrdmulh v21.8H, v17.8H, v0.H[1] // .............e.............................................................. + mul v30.8H, v17.8H, v0.H[0] // ..............e............................................................. // gap // ............................................................................ - mul v31.8H, v24.8H, v0.H[0] // ..................e......................................................... - mul v5.8H, v21.8H, v0.H[0] // .............e.............................................................. 
- sqrdmulh v22.8H, v26.8H, v1.H[3] // ...........................................................*................ - mul v26.8H, v26.8H, v1.H[2] // ..........................................................*................. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + sqrdmulh v2.8H, v22.8H, v0.H[1] // ..................e......................................................... + mul v12.8H, v22.8H, v0.H[0] // ...................e........................................................ + sub v22.8H, v20.8H, v25.8H // ...........*................................................................ + add v11.8H, v20.8H, v25.8H // ............*............................................................... // gap // ............................................................................ - sqrdmulh v8.8H, v2.8H, v0.H[1] // .........e.................................................................. - mls v27.8H, v3.8H, v7.H[0] // ..................................................*......................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v23.8H, v27.8H, v7.H[0] // .......................................................*.................... + mls v10.8H, v6.8H, v7.H[0] // ............................................................*............... + sqrdmulh v3.8H, v24.8H, v0.H[7] // ................................................*........................... + mul v16.8H, v24.8H, v0.H[6] // .................................................*.......................... 
// gap // ............................................................................ - mls v28.8H, v23.8H, v7.H[0] // .................................................................*.......... - sqrdmulh v23.8H, v21.8H, v0.H[1] // ..............e............................................................. - mls v15.8H, v19.8H, v7.H[0] // .........................e.................................................. - mul v24.8H, v18.8H, v1.H[0] // .....................................................*...................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v30.8H, v21.8H, v7.H[0] // ...............e............................................................ + mls v26.8H, v14.8H, v7.H[0] // .........................e.................................................. + sub v27.8H, v11.8H, v5.8H // ...............................*............................................ // gap // ............................................................................ - sqrdmulh v16.8H, v18.8H, v1.H[1] // ......................................................*..................... - add v25.8H, v12.8H, v14.8H // ................................*........................................... - sub v21.8H, v12.8H, v14.8H // ...............................*............................................ - add v12.8H, v9.8H, v6.8H // ..........................................*................................. - mls v31.8H, v10.8H, v7.H[0] // ....................e....................................................... // gap // ............................................................................ // gap // ............................................................................ 
// gap // ............................................................................ + add v25.8H, v22.8H, v4.8H // ..........................................*................................. + mul v20.8H, v31.8H, v1.H[4] // ................................................................*........... + sqrdmulh v9.8H, v31.8H, v1.H[5] // ...............................................................*............ + mls v12.8H, v2.8H, v7.H[0] // ....................e....................................................... // gap // ............................................................................ - mls v26.8H, v22.8H, v7.H[0] // ............................................................*............... - mls v5.8H, v23.8H, v7.H[0] // ...............e............................................................ - add v23.8H, v4.8H, v28.8H // ...................................................................*........ // gap // ............................................................................ // gap // ............................................................................ - sub v10.8H, v4.8H, v28.8H // ..................................................................*......... // gap // ............................................................................ // gap // ............................................................................ - sub v22.8H, v25.8H, v27.8H // ...................................................*........................ - mls v24.8H, v16.8H, v7.H[0] // .......................................................*.................... - sub v19.8H, v17.8H, v15.8H // ..........................e................................................. - add v17.8H, v17.8H, v15.8H // ...........................e................................................ + add v14.8H, v25.8H, v10.8H // ..............................................................*............. 
+ sub v31.8H, v25.8H, v10.8H // .............................................................*.............. // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + mls v16.8H, v3.8H, v7.H[0] // ..................................................*......................... + ldr q25, [x0, #272] // ....e....................................................................... + add v6.8H, v27.8H, v23.8H // .........................................................*.................. + sub v13.8H, v8.8H, v26.8H // ..........................e................................................. + add v2.8H, v8.8H, v26.8H // ...........................e................................................ + sub v10.8H, v22.8H, v4.8H // .........................................*.................................. + sub v23.8H, v27.8H, v23.8H // ........................................................*................... // gap // ............................................................................ - mul v15.8H, v2.8H, v0.H[0] // ........e................................................................... - str q22, [x0, #64] // .....................................................................*...... - str q10, [x0, #448] // ...........................................................................* - add v22.8H, v25.8H, v27.8H // ....................................................*....................... // gap // ............................................................................ - sub v14.8H, v12.8H, v26.8H // .............................................................*.............. - sub v10.8H, v30.8H, v31.8H // .....................e...................................................... 
// gap // ............................................................................ - add v3.8H, v30.8H, v31.8H // ......................e..................................................... - str q23, [x0, #384] // ..........................................................................*. // gap // ............................................................................ - sqrdmulh v9.8H, v17.8H, v0.H[3] // ..................................e......................................... - sqrdmulh v23.8H, v19.8H, v0.H[5] // ............................................e............................... - mul v27.8H, v17.8H, v0.H[2] // .................................e.......................................... + str q6, [x0, #128] // ......................................................................*..... + add v27.8H, v29.8H, v12.8H // ......................e..................................................... + sub v3.8H, v29.8H, v12.8H // .....................e...................................................... + ldr q12, [x0, #80] // .e.......................................................................... + mls v20.8H, v9.8H, v7.H[0] // .................................................................*.......... + add v29.8H, v11.8H, v5.8H // ................................*........................................... // gap // ............................................................................ // gap // ............................................................................ - mul v4.8H, v19.8H, v0.H[4] // ...........................................e................................ - mls v15.8H, v8.8H, v7.H[0] // ..........e................................................................. - str q22, [x0], #(16) // ....................................................................*....... - str q14, [x0, #304] // .........................................................................*.. 
- add v22.8H, v21.8H, v24.8H // .........................................................*.................. - add v26.8H, v12.8H, v26.8H // ..............................................................*............. - sub v13.8H, v21.8H, v24.8H // ........................................................*................... + str q31, [x0, #320] // .........................................................................*.. + mul v15.8H, v2.8H, v0.H[2] // ..................................e......................................... + sqrdmulh v19.8H, v2.8H, v0.H[3] // .................................e.......................................... + str q23, [x0, #192] // .......................................................................*.... + sqrdmulh v6.8H, v13.8H, v0.H[5] // ...........................................e................................ + mul v21.8H, v13.8H, v0.H[4] // ............................................e............................... // gap // ............................................................................ // gap // ............................................................................ - sub v28.8H, v11.8H, v5.8H // ................e........................................................... + add v22.8H, v29.8H, v16.8H // ....................................................*....................... + sub v13.8H, v29.8H, v16.8H // ...................................................*........................ + str q14, [x0, #256] // ........................................................................*... + sqrdmulh v16.8H, v27.8H, v0.H[3] // ............................e............................................... // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ 
// gap // ............................................................................ + sub v24.8H, v10.8H, v20.8H // ..................................................................*......... + add v18.8H, v10.8H, v20.8H // ...................................................................*........ + mul v5.8H, v27.8H, v0.H[2] // .............................e.............................................. // gap // ............................................................................ - mul v6.8H, v10.8H, v0.H[4] // ......................................e..................................... - sqrdmulh v10.8H, v10.8H, v0.H[5] // .......................................e.................................... - add v8.8H, v11.8H, v5.8H // .................e.......................................................... - str q22, [x0, #112] // ......................................................................*..... - str q13, [x0, #176] // .......................................................................*.... - sqrdmulh v22.8H, v3.8H, v0.H[3] // .............................e.............................................. - mls v4.8H, v23.8H, v7.H[0] // .............................................e.............................. - mls v27.8H, v9.8H, v7.H[0] // ...................................e........................................ // gap // ............................................................................ // gap // ............................................................................ - mul v14.8H, v3.8H, v0.H[2] // ............................e............................................... - str q26, [x0, #240] // ........................................................................*... // gap // ............................................................................ // gap // ............................................................................ 
+ mls v21.8H, v6.8H, v7.H[0] // .............................................e.............................. + mls v15.8H, v19.8H, v7.H[0] // ...................................e........................................ + sqrdmulh v19.8H, v25.8H, v0.H[1] // ........e................................................................... + mul v25.8H, v25.8H, v0.H[0] // .........e.................................................................. + str q13, [x0, #64] // .....................................................................*...... + str q22, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ // gap // ............................................................................ + sub v28.8H, v12.8H, v30.8H // ................e........................................................... + add v9.8H, v12.8H, v30.8H // .................e.......................................................... + str q24, [x0, #432] // ...........................................................................* + sqrdmulh v24.8H, v3.8H, v0.H[5] // ......................................e..................................... + mul v4.8H, v3.8H, v0.H[4] // .......................................e.................................... + str q18, [x0, #368] // ..........................................................................*. // gap // ............................................................................ // gap // ............................................................................ - // original source code - // ldr q8, [x0, #0] // ......e.....................................................................|.....e..................................................................... 
- // ldr q9, [x0, #(1*(512/8))] // ..............e.............................................................|.............e............................................................. - // ldr q10, [x0, #(2*(512/8))] // .........e..................................................................|........e.................................................................. - // ldr q11, [x0, #(3*(512/8))] // ............e...............................................................|...........e............................................................... - // ldr q12, [x0, #(4*(512/8))] // ...............e............................................................|..............e............................................................ - // ldr q13, [x0, #(5*(512/8))] // .e..........................................................................|e.......................................................................... - // ldr q14, [x0, #(6*(512/8))] // e...........................................................................e........................................................................... - // ldr q15, [x0, #(7*(512/8))] // ....e.......................................................................|...e....................................................................... - // mul v24.8h, v12.8h, v0.h[0] // ...............................................e............................|..............................................e............................ - // sqrdmulh v12.8h, v12.8h, v0.h[1] // ............................e...............................................|...........................e............................................... - // mls v24.8h, v12.8h, v7.h[0] // ...........................................................e................|..........................................................e................ 
- // sub v12.8h, v8.8h, v24.8h // ..*.........................................................................|.*......................................................................... - // add v8.8h, v8.8h, v24.8h // ...*........................................................................|..*........................................................................ - // mul v24.8h, v13.8h, v0.h[0] // .........................e..................................................|........................e.................................................. - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ...............................e............................................|..............................e............................................ - // mls v24.8h, v13.8h, v7.h[0] // ........................................e...................................|.......................................e................................... - // sub v13.8h, v9.8h, v24.8h // .................................................................e..........|................................................................e.......... - // add v9.8h, v9.8h, v24.8h // ....................................................................e.......|...................................................................e....... - // mul v24.8h, v14.8h, v0.h[0] // ........................e...................................................|.......................e................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ....................e.......................................................|...................e....................................................... - // mls v24.8h, v14.8h, v7.h[0] // ......................................e.....................................|.....................................e..................................... 
- // sub v14.8h, v10.8h, v24.8h // ....................................................e.......................|...................................................e....................... - // add v10.8h, v10.8h, v24.8h // .....................................................e......................|....................................................e...................... - // mul v24.8h, v15.8h, v0.h[0] // ......................e.....................................................|.....................e..................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[1] // .....................e......................................................|....................e...................................................... - // mls v24.8h, v15.8h, v7.h[0] // ................................e...........................................|...............................e........................................... - // sub v15.8h, v11.8h, v24.8h // .............................................e..............................|............................................e.............................. - // add v11.8h, v11.8h, v24.8h // ..............................................e.............................|.............................................e............................. - // mul v24.8h, v10.8h, v0.h[2] // ..........................................................................e.|.........................................................................e. - // sqrdmulh v10.8h, v10.8h, v0.h[3] // .......................................................................e....|......................................................................e.... - // mls v24.8h, v10.8h, v7.h[0] // .............*..............................................................|............*.............................................................. 
- // sub v10.8h, v8.8h, v24.8h // ....................................*.......................................|...................................*....................................... - // add v8.8h, v8.8h, v24.8h // ...................................*........................................|..................................*........................................ - // mul v24.8h, v11.8h, v0.h[2] // .........................................................e..................|........................................................e.................. - // sqrdmulh v11.8h, v11.8h, v0.h[3] // .......................................................e....................|......................................................e.................... - // mls v24.8h, v11.8h, v7.h[0] // .........................................................................e..|........................................................................e.. - // sub v11.8h, v9.8h, v24.8h // ..........*.................................................................|.........*................................................................. - // add v9.8h, v9.8h, v24.8h // ........*...................................................................|.......*................................................................... - // mul v24.8h, v14.8h, v0.h[4] // ..................................................................e.........|.................................................................e......... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // ...................................................................e........|..................................................................e........ - // mls v24.8h, v14.8h, v7.h[0] // .....*......................................................................|....*...................................................................... 
- // sub v14.8h, v12.8h, v24.8h // .......................*....................................................|......................*.................................................... - // add v12.8h, v12.8h, v24.8h // .....................................*......................................|....................................*...................................... - // mul v24.8h, v15.8h, v0.h[4] // ..........................................................e.................|.........................................................e................. - // sqrdmulh v15.8h, v15.8h, v0.h[5] // ........................................................e...................|.......................................................e................... - // mls v24.8h, v15.8h, v7.h[0] // ........................................................................e...|.......................................................................e... - // sub v15.8h, v13.8h, v24.8h // ...........*................................................................|..........*................................................................ - // add v13.8h, v13.8h, v24.8h // .......*....................................................................|......*.................................................................... - // mul v24.8h, v9.8h, v0.h[6] // .................*..........................................................|................*.......................................................... - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ..................*.........................................................|.................*......................................................... - // mls v24.8h, v9.8h, v7.h[0] // .............................*..............................................|............................*.............................................. 
- // sub v9.8h, v8.8h, v24.8h // ...........................................*................................|..........................................*................................ - // add v8.8h, v8.8h, v24.8h // ..................................................*.........................|.................................................*......................... - // mul v24.8h, v11.8h, v1.h[0] // .................................*..........................................|................................*.......................................... - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ..................................*.........................................|.................................*......................................... - // mls v24.8h, v11.8h, v7.h[0] // ............................................*...............................|...........................................*............................... - // sub v11.8h, v10.8h, v24.8h // ................................................................*...........|...............................................................*........... - // add v10.8h, v10.8h, v24.8h // ..............................................................*.............|.............................................................*............. - // mul v24.8h, v13.8h, v1.h[2] // ...........................*................................................|..........................*................................................ - // sqrdmulh v13.8h, v13.8h, v1.h[3] // ..........................*.................................................|.........................*................................................. - // mls v24.8h, v13.8h, v7.h[0] // .......................................*....................................|......................................*.................................... 
- // sub v13.8h, v12.8h, v24.8h // ...................................................*........................|..................................................*........................ - // add v12.8h, v12.8h, v24.8h // ...............................................................*............|..............................................................*............ - // mul v24.8h, v15.8h, v1.h[4] // ................*...........................................................|...............*........................................................... - // sqrdmulh v15.8h, v15.8h, v1.h[5] // ...................*........................................................|..................*........................................................ - // mls v24.8h, v15.8h, v7.h[0] // ..............................*.............................................|.............................*............................................. - // sub v15.8h, v14.8h, v24.8h // ..........................................*.................................|.........................................*................................. - // add v14.8h, v14.8h, v24.8h // .........................................*..................................|........................................*.................................. - // str q8, [x0], #(16) // ............................................................*...............|...........................................................*............... - // str q9, [x0, #(-16 + 1*(512/8))] // ................................................*...........................|...............................................*........................... - // str q10, [x0, #(-16 + 2*(512/8))] // .....................................................................*......|....................................................................*...... 
- // str q11, [x0, #(-16 + 3*(512/8))] // ......................................................................*.....|.....................................................................*..... - // str q12, [x0, #(-16 + 4*(512/8))] // ...........................................................................*|..........................................................................* - // str q13, [x0, #(-16 + 5*(512/8))] // .............................................................*..............|............................................................*.............. - // str q14, [x0, #(-16 + 6*(512/8))] // ......................................................*.....................|.....................................................*..................... - // str q15, [x0, #(-16 + 7*(512/8))] // .................................................*..........................|................................................*.......................... + // -------------------------------------------------------------------- new position ---------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q8, [x0, #0] // .......~....................................................................'......*.................................................................... + // ldr q9, [x0, #(1*(512/8))] // ................................................e...........................'...............................................~........................... + // ldr q10, [x0, #(2*(512/8))] // e...........................................................................~........................................................................... 
+ // ldr q11, [x0, #(3*(512/8))] // ...........e................................................................'..........~................................................................ + // ldr q12, [x0, #(4*(512/8))] // .......................................e....................................'......................................~.................................... + // ldr q13, [x0, #(5*(512/8))] // .e..........................................................................'~.......................................................................... + // ldr q14, [x0, #(6*(512/8))] // .....e......................................................................'....~...................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..e.........................................................................'.~......................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ..................................................................e.........'.................................................................~......... + // mul v24.8h, v12.8h, v0.h[0] // ...................................................................e........'..................................................................~........ + // mls v24.8h, v27.8h, v7.h[0] // ............~...............................................................'...........*............................................................... + // sub v12.8h, v8.8h, v24.8h // .......................~....................................................'......................*.................................................... + // add v8.8h, v8.8h, v24.8h // ........................~...................................................'.......................*................................................... 
+ // sqrdmulh v27.8h, v13.8h, v0.h[1] // ...................e........................................................'..................~........................................................ + // mul v24.8h, v13.8h, v0.h[0] // ....................e.......................................................'...................~....................................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................e..............................................'............................~.............................................. + // sub v13.8h, v9.8h, v24.8h // ......................................................................e.....'.....................................................................~..... + // add v9.8h, v9.8h, v24.8h // .......................................................................e....'......................................................................~.... + // sqrdmulh v27.8h, v14.8h, v0.h[1] // .....................e......................................................'....................~...................................................... + // mul v24.8h, v14.8h, v0.h[0] // ......................e.....................................................'.....................~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................e........................................'..................................~........................................ + // sub v14.8h, v10.8h, v24.8h // ...............................................e............................'..............................................~............................ + // add v10.8h, v10.8h, v24.8h // ..............................................e.............................'.............................................~............................. 
+ // sqrdmulh v27.8h, v15.8h, v0.h[1] // .................e..........................................................'................~.......................................................... + // mul v24.8h, v15.8h, v0.h[0] // ..................e.........................................................'.................~......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................e.............................................'.............................~............................................. + // sub v15.8h, v11.8h, v24.8h // .........................................e..................................'........................................~.................................. + // add v11.8h, v11.8h, v24.8h // ..........................................e.................................'.........................................~................................. + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ............................................................e...............'...........................................................~............... + // mul v24.8h, v10.8h, v0.h[2] // ...............................................................e............'..............................................................~............ + // mls v24.8h, v27.8h, v7.h[0] // ...~........................................................................'..*........................................................................ + // sub v10.8h, v8.8h, v24.8h // ...............................~............................................'..............................*............................................ + // add v8.8h, v8.8h, v24.8h // ..................................................~.........................'.................................................*......................... 
+ // sqrdmulh v27.8h, v11.8h, v0.h[3] // .....................................................e......................'....................................................~...................... + // mul v24.8h, v11.8h, v0.h[2] // ....................................................e.......................'...................................................~....................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e..........'................................................................~.......... + // sub v11.8h, v9.8h, v24.8h // ........~...................................................................'.......*................................................................... + // add v9.8h, v9.8h, v24.8h // ..........~.................................................................'.........*................................................................. + // sqrdmulh v27.8h, v14.8h, v0.h[5] // .........................................................................e..'........................................................................~.. + // mul v24.8h, v14.8h, v0.h[4] // ..........................................................................e.'.........................................................................~. + // mls v24.8h, v27.8h, v7.h[0] // .........~..................................................................'........*.................................................................. + // sub v14.8h, v12.8h, v24.8h // ...........................................~................................'..........................................*................................ + // add v12.8h, v12.8h, v24.8h // ................................~...........................................'...............................*........................................... 
+ // sqrdmulh v27.8h, v15.8h, v0.h[5] // .......................................................e....................'......................................................~.................... + // mul v24.8h, v15.8h, v0.h[4] // ........................................................e...................'.......................................................~................... + // mls v24.8h, v27.8h, v7.h[0] // ................................................................e...........'...............................................................~........... + // sub v15.8h, v13.8h, v24.8h // ....~.......................................................................'...*....................................................................... + // add v13.8h, v13.8h, v24.8h // ......~.....................................................................'.....*..................................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ...........................~................................................'..........................*................................................ + // mul v24.8h, v9.8h, v0.h[6] // ............................~...............................................'...........................*............................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................................~.....................................'.....................................*..................................... + // sub v9.8h, v8.8h, v24.8h // ..........................................................~.................'.........................................................*................. + // add v8.8h, v8.8h, v24.8h // .........................................................~..................'........................................................*.................. 
+ // sqrdmulh v27.8h, v11.8h, v1.h[1] // .............~..............................................................'............*.............................................................. + // mul v24.8h, v11.8h, v1.h[0] // ..............~.............................................................'.............*............................................................. + // mls v24.8h, v27.8h, v7.h[0] // .........................~..................................................'........................*.................................................. + // sub v11.8h, v10.8h, v24.8h // ............................................~...............................'...........................................*............................... + // add v10.8h, v10.8h, v24.8h // ........................................~...................................'.......................................*................................... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ...............~............................................................'..............*............................................................ + // mul v24.8h, v13.8h, v1.h[2] // ................~...........................................................'...............*........................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................~.................................................'.........................*................................................. + // sub v13.8h, v12.8h, v24.8h // .....................................~......................................'....................................*...................................... + // add v12.8h, v12.8h, v24.8h // ....................................~.......................................'...................................*....................................... 
+ // sqrdmulh v27.8h, v15.8h, v1.h[5] // ..................................~.........................................'.................................*......................................... + // mul v24.8h, v15.8h, v1.h[4] // .................................~..........................................'................................*.......................................... + // mls v24.8h, v27.8h, v7.h[0] // .................................................~..........................'................................................*.......................... + // sub v15.8h, v14.8h, v24.8h // .............................................................~..............'............................................................*.............. + // add v14.8h, v14.8h, v24.8h // ..............................................................~.............'.............................................................*............. + // str q8, [x0], #(16) // .....................................................................~......'....................................................................*...... + // str q9, [x0, #(-16 + 1*(512/8))] // ....................................................................~.......'...................................................................*....... + // str q10, [x0, #(-16 + 2*(512/8))] // .............................................~..............................'............................................*.............................. + // str q11, [x0, #(-16 + 3*(512/8))] // ......................................................~.....................'.....................................................*..................... + // str q12, [x0, #(-16 + 4*(512/8))] // ...........................................................~................'..........................................................*................ 
+ // str q13, [x0, #(-16 + 5*(512/8))] // ...................................................~........................'..................................................*........................ + // str q14, [x0, #(-16 + 6*(512/8))] // ...........................................................................~'..........................................................................* + // str q15, [x0, #(-16 + 7*(512/8))] // ........................................................................~...'.......................................................................*... sub count, count, #1 cbnz count, layer123_start - mls v6.8H, v10.8H, v7.H[0] // ..*..................................... - sub v25.8H, v20.8H, v15.8H // *....................................... - add v31.8H, v20.8H, v15.8H // .*...................................... - sub v29.8H, v28.8H, v4.8H // ......*................................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v12.8H, v28.8H, v4.8H // ...*.................................... - mls v14.8H, v22.8H, v7.H[0] // .......*................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - sub v20.8H, v8.8H, v27.8H // .....*.................................. - add v5.8H, v8.8H, v27.8H // ....*................................... - mul v18.8H, v29.8H, v1.H[4] // ........*............................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - sqrdmulh v11.8H, v29.8H, v1.H[5] // ...........*............................ - sqrdmulh v26.8H, v20.8H, v1.H[1] // ..................*..................... - sqrdmulh v16.8H, v12.8H, v1.H[3] // .............*.......................... - mul v8.8H, v12.8H, v1.H[2] // ..............*......................... - mul v21.8H, v20.8H, v1.H[0] // .................*...................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mul v10.8H, v5.8H, v0.H[6] // .........*.............................. - sqrdmulh v9.8H, v5.8H, v0.H[7] // ..........*............................. - add v23.8H, v25.8H, v6.8H // .....................*.................. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v18.8H, v11.8H, v7.H[0] // ................*....................... - sub v13.8H, v25.8H, v6.8H // ............*........................... - add v22.8H, v31.8H, v14.8H // ...................*.................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - mls v21.8H, v26.8H, v7.H[0] // ..........................*............. - mls v8.8H, v16.8H, v7.H[0] // ......................*................. - sub v16.8H, v31.8H, v14.8H // ....................*................... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - // gap // ........................................ - mls v10.8H, v9.8H, v7.H[0] // ...............*........................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v3.8H, v13.8H, v18.8H // .......................*................ - sub v28.8H, v13.8H, v18.8H // ........................*............... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - add v12.8H, v23.8H, v8.8H // ...................................*.... - sub v26.8H, v23.8H, v8.8H // ..............................*......... - add v17.8H, v16.8H, v21.8H // ..................................*..... - sub v11.8H, v16.8H, v21.8H // ....................................*... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - str q3, [x0, #384] // ...............................*........ - sub v13.8H, v22.8H, v10.8H // .........................*.............. - add v20.8H, v22.8H, v10.8H // .............................*.......... - str q28, [x0, #448] // ............................*........... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ 
- // gap // ........................................ - str q12, [x0, #256] // .......................................* - str q17, [x0, #128] // .....................................*.. - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - str q26, [x0, #320] // .................................*...... - str q20, [x0], #(16) // ................................*....... - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - str q11, [x0, #176] // ......................................*. - str q13, [x0, #48] // ...........................*............ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ - // gap // ........................................ + // Instructions: 42 + // Expected cycles: 14 + // Expected IPC: 3.00 + // + // Cycle bound: 14.0 + // IPC bound: 3.00 + // + // Wall time: 0.85s + // User time: 0.85s + // + // ----------- original position -----------> + // 0 25 + // |------------------------|---------------- + mls v5.8H, v16.8H, v7.H[0] // *......................................... + mls v4.8H, v24.8H, v7.H[0] // .....*.................................... + mls v25.8H, v19.8H, v7.H[0] // .......*.................................. + add v16.8H, v28.8H, v21.8H // ..*....................................... 
+ ldr q27, [x0, #0] // ...*...................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v14.8H, v28.8H, v21.8H // .*........................................ + add v6.8H, v9.8H, v15.8H // ......*................................... + sub v15.8H, v9.8H, v15.8H // ....*..................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqrdmulh v26.8H, v16.8H, v1.H[3] // ..........*............................... + mul v16.8H, v16.8H, v1.H[2] // ...........*.............................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sqrdmulh v11.8H, v6.8H, v0.H[7] // ................*......................... + mul v6.8H, v6.8H, v0.H[6] // .................*........................ + sqrdmulh v13.8H, v15.8H, v1.H[1] // ........*................................. + mul v15.8H, v15.8H, v1.H[0] // .........*................................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v31.8H, v27.8H, v25.8H // .............*............................ + sub v27.8H, v27.8H, v25.8H // ............*............................. + mul v25.8H, v14.8H, v1.H[4] // ....................*..................... 
+ sqrdmulh v14.8H, v14.8H, v1.H[5] // .....................*.................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v16.8H, v26.8H, v7.H[0] // ...............*.......................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v22.8H, v27.8H, v4.8H // ...................*...................... + add v26.8H, v31.8H, v5.8H // ..............................*........... + sub v31.8H, v31.8H, v5.8H // ..................*....................... + mls v6.8H, v11.8H, v7.H[0] // ........................*................. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v27.8H, v27.8H, v4.8H // ..........................*............... + mls v15.8H, v13.8H, v7.H[0] // ..............*........................... + mls v25.8H, v14.8H, v7.H[0] // .............................*............ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v14.8H, v22.8H, v16.8H // ......................*................... + sub v16.8H, v22.8H, v16.8H // .......................*.................. + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v11.8H, v26.8H, v6.8H // .................................*........ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v6.8H, v26.8H, v6.8H // ..................................*....... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + add v26.8H, v31.8H, v15.8H // .........................*................ + sub v15.8H, v31.8H, v15.8H // ...........................*.............. + str q14, [x0, #256] // ...................................*...... + str q16, [x0, #320] // ...............................*.......... + // gap // .......................................... + // gap // .......................................... + sub v16.8H, v27.8H, v25.8H // ....................................*..... + add v27.8H, v27.8H, v25.8H // .....................................*.... + str q6, [x0, #64] // ......................................*... + str q11, [x0], #(16) // .......................................*.. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q26, [x0, #112] // ............................*............. + str q15, [x0, #176] // ................................*......... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q27, [x0, #368] // .........................................* + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q16, [x0, #432] // ........................................*. - // original source code - // sub v9.8H, v20.8H, v15.8H // .*...................................... - // add v12.8H, v20.8H, v15.8H // ..*..................................... - // mls v6.8H, v10.8H, v7.H[0] // *....................................... - // add v26.8H, v28.8H, v4.8H // ....*................................... - // add v13.8H, v8.8H, v27.8H // .......*................................ - // sub v18.8H, v8.8H, v27.8H // ......*................................. - // sub v4.8H, v28.8H, v4.8H // ...*.................................... - // mls v14.8H, v22.8H, v7.H[0] // .....*.................................. - // mul v28.8H, v4.8H, v1.H[4] // ........*............................... - // mul v27.8H, v13.8H, v0.H[6] // ..............*......................... - // sqrdmulh v3.8H, v13.8H, v0.H[7] // ...............*........................ - // sqrdmulh v23.8H, v4.8H, v1.H[5] // .........*.............................. - // sub v4.8H, v9.8H, v6.8H // ..................*..................... - // sqrdmulh v22.8H, v26.8H, v1.H[3] // ...........*............................ - // mul v26.8H, v26.8H, v1.H[2] // ............*........................... - // mls v27.8H, v3.8H, v7.H[0] // .......................*................ 
- // mls v28.8H, v23.8H, v7.H[0] // .................*...................... - // mul v24.8H, v18.8H, v1.H[0] // .............*.......................... - // sqrdmulh v16.8H, v18.8H, v1.H[1] // ..........*............................. - // add v25.8H, v12.8H, v14.8H // ...................*.................... - // sub v21.8H, v12.8H, v14.8H // ......................*................. - // add v12.8H, v9.8H, v6.8H // ................*....................... - // mls v26.8H, v22.8H, v7.H[0] // .....................*.................. - // add v23.8H, v4.8H, v28.8H // ........................*............... - // sub v10.8H, v4.8H, v28.8H // .........................*.............. - // sub v22.8H, v25.8H, v27.8H // ...............................*........ - // mls v24.8H, v16.8H, v7.H[0] // ....................*................... - // str q22, [x0, #64] // .......................................* - // str q10, [x0, #448] // .................................*...... - // add v22.8H, v25.8H, v27.8H // ................................*....... - // sub v14.8H, v12.8H, v26.8H // ...........................*............ - // str q23, [x0, #384] // ..............................*......... - // str q22, [x0], #(16) // .....................................*.. - // str q14, [x0, #304] // ....................................*... - // add v22.8H, v21.8H, v24.8H // ............................*........... - // add v26.8H, v12.8H, v26.8H // ..........................*............. - // sub v13.8H, v21.8H, v24.8H // .............................*.......... - // str q22, [x0, #112] // ...................................*.... - // str q13, [x0, #176] // ......................................*. - // str q26, [x0, #240] // ..................................*..... + // ------------- new position --------------> + // 0 25 + // |------------------------|---------------- + // mls v5.8H, v16.8H, v7.H[0] // *......................................... 
+ // sub v31.8H, v28.8H, v21.8H // .....*.................................... + // add v13.8H, v28.8H, v21.8H // ...*...................................... + // ldr q20, [x0, #0] // ....*..................................... + // sub v18.8H, v9.8H, v15.8H // .......*.................................. + // mls v4.8H, v24.8H, v7.H[0] // .*........................................ + // add v24.8H, v9.8H, v15.8H // ......*................................... + // mls v25.8H, v19.8H, v7.H[0] // ..*....................................... + // sqrdmulh v27.8H, v18.8H, v1.H[1] // ............*............................. + // mul v23.8H, v18.8H, v1.H[0] // .............*............................ + // sqrdmulh v6.8H, v13.8H, v1.H[3] // ........*................................. + // mul v10.8H, v13.8H, v1.H[2] // .........*................................ + // sub v22.8H, v20.8H, v25.8H // ...............*.......................... + // add v11.8H, v20.8H, v25.8H // ..............*........................... + // mls v23.8H, v27.8H, v7.H[0] // ........................*................. + // mls v10.8H, v6.8H, v7.H[0] // ..................*....................... + // sqrdmulh v3.8H, v24.8H, v0.H[7] // ..........*............................... + // mul v16.8H, v24.8H, v0.H[6] // ...........*.............................. + // sub v27.8H, v11.8H, v5.8H // .....................*.................... + // add v25.8H, v22.8H, v4.8H // ...................*...................... + // mul v20.8H, v31.8H, v1.H[4] // ................*......................... + // sqrdmulh v9.8H, v31.8H, v1.H[5] // .................*........................ + // add v14.8H, v25.8H, v10.8H // ..........................*............... + // sub v31.8H, v25.8H, v10.8H // ...........................*.............. + // mls v16.8H, v3.8H, v7.H[0] // ......................*................... + // add v6.8H, v27.8H, v23.8H // ..............................*........... 
+ // sub v10.8H, v22.8H, v4.8H // .......................*.................. + // sub v23.8H, v27.8H, v23.8H // ...............................*.......... + // str q6, [x0, #128] // ......................................*... + // mls v20.8H, v9.8H, v7.H[0] // .........................*................ + // add v29.8H, v11.8H, v5.8H // ....................*..................... + // str q31, [x0, #320] // .................................*........ + // str q23, [x0, #192] // .......................................*.. + // add v22.8H, v29.8H, v16.8H // ............................*............. + // sub v13.8H, v29.8H, v16.8H // .............................*............ + // str q14, [x0, #256] // ................................*......... + // sub v24.8H, v10.8H, v20.8H // ..................................*....... + // add v18.8H, v10.8H, v20.8H // ...................................*...... + // str q13, [x0, #64] // ....................................*..... + // str q22, [x0], #(16) // .....................................*.... + // str q24, [x0, #432] // .........................................* + // str q18, [x0, #368] // ........................................*. restore inp, STACK0 mov count, #8 .p2align 2 - ldr q5, [x3], #16 // *.............................................. - ldr q15, [x1, #48] // .*............................................. - ldr q18, [x1, #32] // ...*........................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - ldr q25, [x1, #0] // ..*............................................ - ldr q21, [x4, #32] // ......*........................................ - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - ldr q20, [x4, #80] // .........*..................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - ldr q17, [x1, #16] // ....*.......................................... - sqrdmulh v14.8H, v15.8H, v5.H[1] // ........*...................................... - mul v31.8H, v15.8H, v5.H[0] // .......*....................................... - mul v29.8H, v18.8H, v5.H[0] // ..........*.................................... - sqrdmulh v9.8H, v18.8H, v5.H[1] // ...........*................................... - ldr q30, [x4], #(6*16) // .........................*..................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v29.8H, v9.8H, v7.H[0] // ...............*............................... - mls v31.8H, v14.8H, v7.H[0] // ............*.................................. - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - add v1.8H, v25.8H, v29.8H // .....................*......................... - sub v11.8H, v25.8H, v29.8H // ....................*.......................... - sub v29.8H, v17.8H, v31.8H // .............*................................. - add v18.8H, v17.8H, v31.8H // ..............*................................ - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mul v10.8H, v29.8H, v5.H[4] // ................*.............................. - sqrdmulh v24.8H, v29.8H, v5.H[5] // .................*............................. - sqrdmulh v2.8H, v18.8H, v5.H[3] // ..................*............................ - mul v9.8H, v18.8H, v5.H[2] // ...................*........................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v9.8H, v2.8H, v7.H[0] // ........................*...................... - mls v10.8H, v24.8H, v7.H[0] // ......................*........................ - ldr q2, [x4, #-32] // ........................................*...... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v31.8H, v11.8H, v10.8H // ............................*.................. - add v15.8H, v11.8H, v10.8H // .............................*................. - add v10.8H, v1.8H, v9.8H // ...........................*................... - sub v5.8H, v1.8H, v9.8H // ..........................*.................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - trn1 v4.4S, v10.4S, v5.4S // .................................*............. - trn2 v8.4S, v10.4S, v5.4S // ................................*.............. - trn2 v23.4S, v15.4S, v31.4S // ...............................*............... - trn1 v9.4S, v15.4S, v31.4S // ..............................*................ - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - ldr q18, [x4, #-80] // .......................*....................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - trn2 v11.2D, v4.2D, v9.2D // ...................................*........... - trn1 v25.2D, v4.2D, v9.2D // .........................................*..... - trn2 v10.2D, v8.2D, v23.2D // ..................................*............ - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - trn1 v23.2D, v8.2D, v23.2D // ...........................................*... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sqrdmulh v28.8H, v10.8H, v18.8H // ....................................*.......... - mul v5.8H, v10.8H, v30.8H // .....................................*......... - sqrdmulh v8.8H, v11.8H, v18.8H // .......................................*....... - mul v11.8H, v11.8H, v30.8H // ......................................*........ 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - mls v11.8H, v8.8H, v7.H[0] // ............................................*.. - mls v5.8H, v28.8H, v7.H[0] // ..........................................*.... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... 
- // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - sub v22.8H, v23.8H, v5.8H // ..............................................* - add v23.8H, v23.8H, v5.8H // .............................................*. - ldr q5, [x4, #-48] // .....*......................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... - // gap // ............................................... + // Instructions: 43 + // Expected cycles: 28 + // Expected IPC: 1.54 + // + // Cycle bound: 28.0 + // IPC bound: 1.54 + // + // Wall time: 0.64s + // User time: 0.64s + // + // ----------- original position ------------> + // 0 25 + // |------------------------|----------------- + ldr q13, [x1, #48] // *.......................................... + ldr q27, [x1, #32] // ...*....................................... + ldr q12, [x3], #16 // ..*........................................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + ldr q31, [x1, #0] // .*......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q11, [x4, #16] // ...............................*........... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q0, [x4, #80] // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v8.8H, v13.8H, v12.H[1] // .....*..................................... + mul v25.8H, v13.8H, v12.H[0] // ......*.................................... + mul v20.8H, v27.8H, v12.H[0] // ........*.................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ sqrdmulh v29.8H, v27.8H, v12.H[1] // .......*................................... + ldr q13, [x1, #16] // ....*...................................... + ldr q28, [x4], #(6*16) // .........*................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v25.8H, v8.8H, v7.H[0] // ..........*................................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v20.8H, v29.8H, v7.H[0] // ...........*............................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v9.8H, v13.8H, v25.8H // .............*............................. + sub v29.8H, v13.8H, v25.8H // ............*.............................. + ldr q13, [x4, #-48] // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v27.8H, v31.8H, v20.8H // ...............*........................... + add v21.8H, v31.8H, v20.8H // ................*.......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v31.8H, v29.8H, v12.H[5] // ....................*...................... + mul v29.8H, v29.8H, v12.H[4] // ...................*....................... + sqrdmulh v20.8H, v9.8H, v12.H[3] // .................*......................... + mul v22.8H, v9.8H, v12.H[2] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v29.8H, v31.8H, v7.H[0] // ......................*.................... + mls v22.8H, v20.8H, v7.H[0] // .....................*..................... + ldr q31, [x4, #-64] // .......................*................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v20.8H, v21.8H, v22.8H // .........................*................. + sub v2.8H, v27.8H, v29.8H // ..........................*................ + add v6.8H, v27.8H, v29.8H // ...........................*............... + add v8.8H, v21.8H, v22.8H // ........................*.................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v15.4S, v6.4S, v2.4S // ................................*.......... + trn2 v24.4S, v8.4S, v20.4S // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v16.4S, v6.4S, v2.4S // .................................*......... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v25.2D, v24.2D, v15.2D // ..................................*........ + trn1 v15.2D, v24.2D, v15.2D // ......................................*.... + trn1 v24.4S, v8.4S, v20.4S // ..............................*............ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v20.8H, v25.8H, v11.8H // ....................................*...... + mul v21.8H, v25.8H, v28.8H // .....................................*..... + trn2 v19.2D, v24.2D, v16.2D // ...................................*....... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + mul v2.8H, v19.8H, v28.8H // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v14.8H, v19.8H, v11.8H // ........................................*.. + mls v21.8H, v20.8H, v7.H[0] // ..........................................* + ldr q11, [x4, #-32] // ..............*............................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... - // original source code - // ldr q12, [x3], #16 // *.............................................. - // ldr q26, [x1, #48] // .*............................................. - // ldr q9, [x1, #0] // ...*........................................... - // ldr q16, [x1, #32] // ..*............................................ - // ldr q28, [x1, #16] // ......*........................................ - // ldr q5, [x4, #48] // ..............................................* - // ldr q21, [x4, #32] // ....*.......................................... - // mul v11.8H, v26.8H, v12.H[0] // ........*...................................... - // sqrdmulh v26.8H, v26.8H, v12.H[1] // .......*....................................... - // ldr q20, [x4, #80] // .....*......................................... - // mul v14.8H, v16.8H, v12.H[0] // .........*..................................... 
- // sqrdmulh v19.8H, v16.8H, v12.H[1] // ..........*.................................... - // mls v11.8H, v26.8H, v7.H[0] // .............*................................. - // sub v10.8H, v28.8H, v11.8H // ................*.............................. - // add v13.8H, v28.8H, v11.8H // .................*............................. - // mls v14.8H, v19.8H, v7.H[0] // ............*.................................. - // mul v30.8H, v10.8H, v12.H[4] // ..................*............................ - // sqrdmulh v10.8H, v10.8H, v12.H[5] // ...................*........................... - // sqrdmulh v11.8H, v13.8H, v12.H[3] // ....................*.......................... - // mul v12.8H, v13.8H, v12.H[2] // .....................*......................... - // sub v15.8H, v9.8H, v14.8H // ...............*............................... - // add v24.8H, v9.8H, v14.8H // ..............*................................ - // mls v30.8H, v10.8H, v7.H[0] // .......................*....................... - // ldr q10, [x4, #16] // .................................*............. - // mls v12.8H, v11.8H, v7.H[0] // ......................*........................ - // ldr q4, [x4], #(6*16) // ...........*................................... - // sub v11.8H, v24.8H, v12.8H // ............................*.................. - // add v3.8H, v24.8H, v12.8H // ...........................*................... - // sub v19.8H, v15.8H, v30.8H // .........................*..................... - // add v24.8H, v15.8H, v30.8H // ..........................*.................... - // trn1 v28.4S, v24.4S, v19.4S // ................................*.............. - // trn2 v14.4S, v24.4S, v19.4S // ...............................*............... - // trn2 v13.4S, v3.4S, v11.4S // ..............................*................ - // trn1 v15.4S, v3.4S, v11.4S // .............................*................. 
- // trn2 v29.2D, v13.2D, v14.2D // ....................................*.......... - // trn2 v3.2D, v15.2D, v28.2D // ..................................*............ - // sqrdmulh v23.8H, v29.8H, v10.8H // ......................................*........ - // mul v22.8H, v29.8H, v4.8H // .......................................*....... - // mul v11.8H, v3.8H, v4.8H // .........................................*..... - // sqrdmulh v4.8H, v3.8H, v10.8H // ........................................*...... - // ldr q2, [x4, #-32] // ........................*...................... - // trn1 v25.2D, v15.2D, v28.2D // ...................................*........... - // mls v22.8H, v23.8H, v7.H[0] // ...........................................*... - // trn1 v13.2D, v13.2D, v14.2D // .....................................*......... - // mls v11.8H, v4.8H, v7.H[0] // ..........................................*.... - // add v23.8H, v13.8H, v22.8H // .............................................*. - // sub v22.8H, v13.8H, v22.8H // ............................................*.. + // -------------- new position --------------> + // 0 25 + // |------------------------|----------------- + // ldr q27, [x1, #48] // *.......................................... + // ldr q22, [x1, #0] // ...*....................................... + // ldr q3, [x3], #16 // ..*........................................ + // ldr q16, [x1, #32] // .*......................................... + // ldr q8, [x1, #16] // ..........*................................ + // sqrdmulh v28.8H, v27.8H, v3.H[1] // ......*.................................... + // mul v27.8H, v27.8H, v3.H[0] // .......*................................... + // sqrdmulh v24.8H, v16.8H, v3.H[1] // .........*................................. + // mul v16.8H, v16.8H, v3.H[0] // ........*.................................. + // ldr q26, [x4], #(6*16) // ...........*............................... 
+ // mls v27.8H, v28.8H, v7.H[0] // ............*.............................. + // mls v16.8H, v24.8H, v7.H[0] // .............*............................. + // sub v4.8H, v8.8H, v27.8H // ...............*........................... + // add v21.8H, v8.8H, v27.8H // ..............*............................ + // ldr q11, [x4, #-32] // ..........................................* + // sub v20.8H, v22.8H, v16.8H // .................*......................... + // add v10.8H, v22.8H, v16.8H // ..................*........................ + // sqrdmulh v25.8H, v21.8H, v3.H[3] // .....................*..................... + // mul v16.8H, v21.8H, v3.H[2] // ......................*.................... + // mul v13.8H, v4.8H, v3.H[4] // ....................*...................... + // sqrdmulh v31.8H, v4.8H, v3.H[5] // ...................*....................... + // mls v16.8H, v25.8H, v7.H[0] // ........................*.................. + // mls v13.8H, v31.8H, v7.H[0] // .......................*................... + // ldr q31, [x4, #-64] // .........................*................. + // add v27.8H, v10.8H, v16.8H // .............................*............. + // sub v21.8H, v10.8H, v16.8H // ..........................*................ + // sub v4.8H, v20.8H, v13.8H // ...........................*............... + // add v16.8H, v20.8H, v13.8H // ............................*.............. + // ldr q13, [x4, #-48] // ................*.......................... + // trn2 v18.4S, v27.4S, v21.4S // ...............................*........... + // trn1 v24.4S, v27.4S, v21.4S // ...................................*....... + // ldr q0, [x4, #-80] // ....*...................................... + // trn2 v14.4S, v16.4S, v4.4S // ..............................*............ + // trn1 v16.4S, v16.4S, v4.4S // ................................*.......... + // trn2 v8.2D, v18.2D, v14.2D // .................................*......... 
+ // trn2 v29.2D, v24.2D, v16.2D // ......................................*.... + // sqrdmulh v27.8H, v8.8H, v0.8H // ....................................*...... + // mul v21.8H, v8.8H, v26.8H // .....................................*..... + // trn1 v15.2D, v18.2D, v14.2D // ..................................*........ + // mul v2.8H, v29.8H, v26.8H // .......................................*... + // sqrdmulh v14.8H, v29.8H, v0.8H // ........................................*.. + // ldr q0, [x4, #-16] // .....*..................................... + // mls v21.8H, v27.8H, v7.H[0] // .........................................*. sub count, count, #1 layer4567_start: - sub v0.8H, v25.8H, v11.8H // ..........................................*................................................ - ldr q12, [x3], #16 // ....e...................................................................................... - ldr q26, [x1, #112] // ...e....................................................................................... - // gap // ........................................................................................... - ldr q9, [x1, #64] // e.......................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v4.8H, v22.8H, v2.8H // ......................................................*.................................... - sqrdmulh v1.8H, v22.8H, v20.8H // .......................................................*................................... - mul v22.8H, v23.8H, v21.8H // .................................................*......................................... 
- sqrdmulh v18.8H, v23.8H, v5.8H // ..................................................*........................................ - ldr q16, [x1, #96] // ..e........................................................................................ - // gap // ........................................................................................... - ldr q28, [x1, #80] // .e......................................................................................... - // gap // ........................................................................................... - ldr q5, [x4, #48] // ....................................e...................................................... - add v23.8H, v25.8H, v11.8H // ...........................................*............................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - ldr q21, [x4, #32] // ...................................e....................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v11.8H, v26.8H, v12.H[0] // ..........e................................................................................ - sqrdmulh v26.8H, v26.8H, v12.H[1] // ...........e............................................................................... - mls v4.8H, v1.8H, v7.H[0] // ........................................................*.................................. - ldr q20, [x4, #80] // ......................................e.................................................... - mls v22.8H, v18.8H, v7.H[0] // ...................................................*....................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v14.8H, v16.8H, v12.H[0] // .....e..................................................................................... - sqrdmulh v19.8H, v16.8H, v12.H[1] // ......e.................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - add v3.8H, v23.8H, v22.8H // .....................................................*..................................... - mls v11.8H, v26.8H, v7.H[0] // ............e.............................................................................. - sub v30.8H, v23.8H, v22.8H // ....................................................*...................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- sub v22.8H, v0.8H, v4.8H // .........................................................*................................. - add v23.8H, v0.8H, v4.8H // ..........................................................*................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v6.4S, v3.4S, v30.4S // ...........................................................*............................... - trn2 v27.4S, v3.4S, v30.4S // ............................................................*.............................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn2 v0.4S, v23.4S, v22.4S // ..............................................................*............................ - sub v10.8H, v28.8H, v11.8H // .............e............................................................................. 
- add v13.8H, v28.8H, v11.8H // ..............e............................................................................ - trn1 v26.4S, v23.4S, v22.4S // .............................................................*............................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v14.8H, v19.8H, v7.H[0] // .......e................................................................................... - sqdmulh v3.8H, v6.8H, v7.H[1] // ...............................................................*........................... - sqdmulh v4.8H, v27.8H, v7.H[1] // ..................................................................*........................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v30.8H, v10.8H, v12.H[4] // ....................e...................................................................... - sqrdmulh v10.8H, v10.8H, v12.H[5] // .....................e..................................................................... - sqrdmulh v11.8H, v13.8H, v12.H[3] // ................e.......................................................................... 
- mul v12.8H, v13.8H, v12.H[2] // ...............e........................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqdmulh v22.8H, v26.8H, v7.H[1] // .....................................................................*..................... - sqdmulh v23.8H, v0.8H, v7.H[1] // ........................................................................*.................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - srshr v3.8H, v3.8H, #11 // ................................................................*.......................... - srshr v4.8H, v4.8H, #11 // ...................................................................*....................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v15.8H, v9.8H, v14.8H // ........e.................................................................................. - add v24.8H, v9.8H, v14.8H // .........e................................................................................. - mls v30.8H, v10.8H, v7.H[0] // ......................e.................................................................... - ldr q10, [x4, #16] // ..................................e........................................................ - mls v12.8H, v11.8H, v7.H[0] // .................e......................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - srshr v22.8H, v22.8H, #11 // ......................................................................*.................... - srshr v23.8H, v23.8H, #11 // .........................................................................*................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - mls v6.8H, v3.8H, v7.H[0] // .................................................................*......................... - mls v27.8H, v4.8H, v7.H[0] // ....................................................................*...................... - ldr q4, [x4], #(6*16) // .................................e......................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v11.8H, v24.8H, v12.8H // ..................e........................................................................ - add v3.8H, v24.8H, v12.8H // ...................e....................................................................... - sub v19.8H, v15.8H, v30.8H // .......................e................................................................... - add v24.8H, v15.8H, v30.8H // ........................e.................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - mls v0.8H, v23.8H, v7.H[0] // ..........................................................................*................ - mls v26.8H, v22.8H, v7.H[0] // .......................................................................*................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v28.4S, v24.4S, v19.4S // ...........................e............................................................... - trn2 v14.4S, v24.4S, v19.4S // ............................e.............................................................. - umov x16, v6.d[1] // ............................................................................*.............. - trn2 v13.4S, v3.4S, v11.4S // ..........................e................................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x10, v6.d[0] // ...........................................................................*............... 
- trn1 v15.4S, v3.4S, v11.4S // .........................e................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x25, v27.d[0] // .............................................................................*............. - umov x20, v27.d[1] // ..............................................................................*............ - trn2 v29.2D, v13.2D, v14.2D // ..............................e............................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x21, v26.d[1] // ................................................................................*.......... - umov x14, v26.d[0] // ...............................................................................*........... - trn2 v3.2D, v15.2D, v28.2D // .............................e............................................................. 
- // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v23.8H, v29.8H, v10.8H // .............................................e............................................. - mul v22.8H, v29.8H, v4.8H // ............................................e.............................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mul v11.8H, v3.8H, v4.8H // .......................................e................................................... - sqrdmulh v4.8H, v3.8H, v10.8H // ........................................e.................................................. - str x16, [x1, #32] // .......................................................................................*... - umov x24, v0.d[1] // ..................................................................................*........ - umov x15, v0.d[0] // .................................................................................*......... 
- ldr q2, [x4, #-32] // .....................................e..................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - str x25, [x1, #16] // .....................................................................................*..... - trn1 v25.2D, v15.2D, v28.2D // ...............................e........................................................... - str x10, [x1], #( 16*4) // ...................................................................................*....... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - str x20, [x1, #-16] // .........................................................................................*. - mls v22.8H, v23.8H, v7.H[0] // ..............................................e............................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - trn1 v13.2D, v13.2D, v14.2D // ................................e.......................................................... - str x21, [x1, #-24] // ........................................................................................*.. - str x14, [x1, #-56] // ....................................................................................*...... - mls v11.8H, v4.8H, v7.H[0] // .........................................e................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... 
- str x24, [x1, #-8] // ..........................................................................................* - str x15, [x1, #-40] // ......................................................................................*.... - add v23.8H, v13.8H, v22.8H // ................................................e.......................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v22.8H, v13.8H, v22.8H // ...............................................e........................................... + // Instructions: 91 + // Expected cycles: 31 + // Expected IPC: 2.94 + // + // Cycle bound: 31.0 + // IPC bound: 2.94 + // + // Wall time: 93.86s + // User time: 93.86s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + ldr q27, [x1, #112] // ...e....................................................................................... + ldr q22, [x1, #64] // e.......................................................................................... + add v20.8H, v15.8H, v21.8H // ................................................*.......................................... + sub v28.8H, v15.8H, v21.8H // ...............................................*........................................... + ldr q3, [x3], #16 // ....e...................................................................................... + // gap // ........................................................................................... 
+ trn1 v25.2D, v24.2D, v16.2D // ...............................*........................................................... + mls v2.8H, v14.8H, v7.H[0] // .........................................*................................................. + ldr q16, [x1, #96] // ..e........................................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q8, [x1, #80] // .e......................................................................................... + sqrdmulh v26.8H, v20.8H, v13.8H // .................................................*......................................... + mul v6.8H, v20.8H, v31.8H // ..................................................*........................................ + sqrdmulh v14.8H, v28.8H, v0.8H // ......................................................*.................................... + mul v15.8H, v28.8H, v11.8H // .......................................................*................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + add v0.8H, v25.8H, v2.8H // ...........................................*............................................... + sub v13.8H, v25.8H, v2.8H // ..........................................*................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v28.8H, v27.8H, v3.H[1] // ..........e................................................................................ + mul v27.8H, v27.8H, v3.H[0] // ...........e............................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v26.8H, v7.H[0] // ...................................................*....................................... 
+ sqrdmulh v24.8H, v16.8H, v3.H[1] // .....e..................................................................................... + mul v16.8H, v16.8H, v3.H[0] // ......e.................................................................................... + mls v15.8H, v14.8H, v7.H[0] // ........................................................*.................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + ldr q26, [x4], #(6*16) // .................................e......................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v27.8H, v28.8H, v7.H[0] // ............e.............................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v16.8H, v24.8H, v7.H[0] // .......e................................................................................... + sub v11.8H, v0.8H, v6.8H // ....................................................*...................................... + add v6.8H, v0.8H, v6.8H // .....................................................*..................................... + sub v28.8H, v13.8H, v15.8H // .........................................................*................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v15.8H, v13.8H, v15.8H // ..........................................................*................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v4.8H, v8.8H, v27.8H // .............e............................................................................. + add v21.8H, v8.8H, v27.8H // ..............e............................................................................ + trn1 v0.4S, v6.4S, v11.4S // ...........................................................*............................... + trn2 v2.4S, v6.4S, v11.4S // ............................................................*.............................. + ldr q11, [x4, #-32] // .....................................e..................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v20.8H, v22.8H, v16.8H // ........e.................................................................................. + trn1 v9.4S, v15.4S, v28.4S // .............................................................*............................. + trn2 v15.4S, v15.4S, v28.4S // ..............................................................*............................ + add v10.8H, v22.8H, v16.8H // .........e................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v25.8H, v21.8H, v3.H[3] // ...............e........................................................................... + mul v16.8H, v21.8H, v3.H[2] // ................e.......................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v14.8H, v2.8H, v7.H[1] // ..................................................................*........................ + sqdmulh v6.8H, v0.8H, v7.H[1] // ...............................................................*........................... + mul v13.8H, v4.8H, v3.H[4] // .....................e..................................................................... + sqrdmulh v31.8H, v4.8H, v3.H[5] // ....................e...................................................................... + sqdmulh v27.8H, v9.8H, v7.H[1] // .....................................................................*..................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqdmulh v21.8H, v15.8H, v7.H[1] // ........................................................................*.................. + mls v16.8H, v25.8H, v7.H[0] // .................e......................................................................... + srshr v30.8H, v6.8H, #11 // ................................................................*.......................... + srshr v14.8H, v14.8H, #11 // ...................................................................*....................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v13.8H, v31.8H, v7.H[0] // ......................e.................................................................... 
+ srshr v6.8H, v27.8H, #11 // ......................................................................*.................... + ldr q31, [x4, #-64] // ...................................e....................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + srshr v18.8H, v21.8H, #11 // .........................................................................*................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v0.8H, v30.8H, v7.H[0] // .................................................................*......................... + mls v2.8H, v14.8H, v7.H[0] // ....................................................................*...................... + add v27.8H, v10.8H, v16.8H // ...................e....................................................................... 
+ sub v21.8H, v10.8H, v16.8H // ..................e........................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sub v4.8H, v20.8H, v13.8H // .......................e................................................................... + add v16.8H, v20.8H, v13.8H // ........................e.................................................................. + ldr q13, [x4, #-48] // ....................................e...................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v9.8H, v6.8H, v7.H[0] // .......................................................................*................... + mls v15.8H, v18.8H, v7.H[0] // ..........................................................................*................ + trn2 v18.4S, v27.4S, v21.4S // ..........................e................................................................ + trn1 v24.4S, v27.4S, v21.4S // .........................e................................................................. 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x23, v0.d[0] // ...........................................................................*............... + umov x9, v0.d[1] // ............................................................................*.............. + ldr q0, [x4, #-80] // ..................................e........................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x14, v2.d[1] // ..............................................................................*............ + trn2 v14.4S, v16.4S, v4.4S // ............................e.............................................................. + trn1 v16.4S, v16.4S, v4.4S // ...........................e............................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + umov x15, v2.d[0] // .............................................................................*............. + trn2 v8.2D, v18.2D, v14.2D // ..............................e............................................................ + umov x12, v9.d[0] // ...............................................................................*........... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x19, v15.d[0] // .................................................................................*......... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v29.2D, v24.2D, v16.2D // .............................e............................................................. 
+ umov x13, v9.d[1] // ................................................................................*.......... + umov x27, v15.d[1] // ..................................................................................*........ + sqrdmulh v27.8H, v8.8H, v0.8H // ............................................e.............................................. + mul v21.8H, v8.8H, v26.8H // .............................................e............................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v15.2D, v18.2D, v14.2D // ................................e.......................................................... + str x23, [x1], #( 16*4) // ...................................................................................*....... + str x9, [x1, #-32] // .......................................................................................*... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v2.8H, v29.8H, v26.8H // ........................................e.................................................. 
+ str x14, [x1, #-16] // .........................................................................................*. + sqrdmulh v14.8H, v29.8H, v0.8H // .......................................e................................................... + ldr q0, [x4, #-16] // ......................................e.................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x15, [x1, #-48] // .....................................................................................*..... + str x19, [x1, #-40] // ......................................................................................*.... + mls v21.8H, v27.8H, v7.H[0] // ..............................................e............................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x12, [x1, #-56] // ....................................................................................*...... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x13, [x1, #-24] // ........................................................................................*.. + str x27, [x1, #-8] // ..........................................................................................* + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... - // original source code - // ldr q8, [x1, #(16*0)] // ..e.......................................................................................|..e..................................................................................... - // ldr q9, [x1, #(16*1)] // ........e.................................................................................|........e............................................................................... 
- // ldr q10, [x1, #(16*2)] // .......e..................................................................................|.......e................................................................................ - // ldr q11, [x1, #(16*3)] // .e........................................................................................|.e...................................................................................... - // ldr q0, [x3], #16 // e.........................................................................................|e....................................................................................... - // mul v24.8h, v10.8h, v0.h[0] // .................e........................................................................|.................e...................................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ..................e.......................................................................|..................e..................................................................... - // mls v24.8h, v10.8h, v7.h[0] // ..............................e...........................................................|..............................e......................................................... - // sub v10.8h, v8.8h, v24.8h // .........................................e................................................|.........................................e.............................................. - // add v8.8h, v8.8h, v24.8h // ..........................................e...............................................|..........................................e............................................. - // mul v24.8h, v11.8h, v0.h[0] // ............e.............................................................................|............e........................................................................... 
- // sqrdmulh v11.8h, v11.8h, v0.h[1] // .............e............................................................................|.............e.......................................................................... - // mls v24.8h, v11.8h, v7.h[0] // ....................e.....................................................................|....................e................................................................... - // sub v11.8h, v9.8h, v24.8h // ...........................e..............................................................|...........................e............................................................ - // add v9.8h, v9.8h, v24.8h // ............................e.............................................................|............................e........................................................... - // mul v24.8h, v9.8h, v0.h[2] // ....................................e.....................................................|....................................e................................................... - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ...................................e......................................................|...................................e.................................................... - // mls v24.8h, v9.8h, v7.h[0] // .............................................e............................................|.............................................e.......................................... - // sub v9.8h, v8.8h, v24.8h // ...................................................e......................................|...................................................e.................................... - // add v8.8h, v8.8h, v24.8h // ....................................................e.....................................|....................................................e................................... 
- // mul v24.8h, v11.8h, v0.h[4] // .................................e........................................................|.................................e...................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[5] // ..................................e.......................................................|..................................e..................................................... - // mls v24.8h, v11.8h, v7.h[0] // ...........................................e..............................................|...........................................e............................................ - // sub v11.8h, v10.8h, v24.8h // .....................................................e....................................|.....................................................e.................................. - // add v10.8h, v10.8h, v24.8h // ......................................................e...................................|......................................................e................................. - // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e...........................|..............................................................e......................... - // trn2 v26.4s, v8.4s, v9.4s // ............................................................e.............................|............................................................e........................... - // trn1 v27.4s, v10.4s, v11.4s // .........................................................e................................|.........................................................e.............................. - // trn2 v28.4s, v10.4s, v11.4s // ..........................................................e...............................|..........................................................e............................. 
- // trn2 v10.2d, v25.2d, v27.2d // ....................................................................e.....................|....................................................................e................... - // trn2 v11.2d, v26.2d, v28.2d // .................................................................e........................|.................................................................e...................... - // trn1 v8.2d, v25.2d, v27.2d // ..............................................................................e...........|..............................................................................e......... - // trn1 v9.2d, v26.2d, v28.2d // ..................................................................................e.......|..................................................................................e..... - // ldr q0, [x4], #(6*16) // ..................................................e.......................................|..................................................e..................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ............................................e.............................................|............................................e........................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ...........e..............................................................................|...........e............................................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // .........e................................................................................|.........e.............................................................................. - // ldr q2, [x4, #(-6*16 + 4*16)] // ............................................................................e.............|............................................................................e........... 
- // ldr q6, [x4, #(-6*16 + 5*16)] // ...............e..........................................................................|...............e........................................................................ - // mul v24.8h, v10.8h, v0.8h // .......................................................................e..................|.......................................................................e................ - // sqrdmulh v10.8h, v10.8h, v4.8h // ........................................................................e.................|........................................................................e............... - // mls v24.8h, v10.8h, v7.h[0] // .....................................................................................e....|.....................................................................................e.. - // sub v10.8h, v8.8h, v24.8h // ..........................................................................................*........................................................................................ - // add v8.8h, v8.8h, v24.8h // ..........*...............................................................................|..........*............................................................................. - // mul v24.8h, v11.8h, v0.8h // ......................................................................e...................|......................................................................e................. - // sqrdmulh v11.8h, v11.8h, v4.8h // .....................................................................e....................|.....................................................................e.................. - // mls v24.8h, v11.8h, v7.h[0] // .................................................................................e........|.................................................................................e...... 
- // sub v11.8h, v9.8h, v24.8h // .........................................................................................e|........................................................................................ - // add v9.8h, v9.8h, v24.8h // ........................................................................................e.|........................................................................................ - // mul v24.8h, v9.8h, v1.8h // .....*....................................................................................|.....*.................................................................................. - // sqrdmulh v9.8h, v9.8h, v5.8h // ......*...................................................................................|......*................................................................................. - // mls v24.8h, v9.8h, v7.h[0] // ................*.........................................................................|................*....................................................................... - // sub v9.8h, v8.8h, v24.8h // .....................*....................................................................|.....................*.................................................................. - // add v8.8h, v8.8h, v24.8h // ...................*......................................................................|...................*.................................................................... - // mul v24.8h, v11.8h, v2.8h // ...*......................................................................................|...*.................................................................................... - // sqrdmulh v11.8h, v11.8h, v6.8h // ....*.....................................................................................|....*................................................................................... 
- // mls v24.8h, v11.8h, v7.h[0] // ..............*...........................................................................|..............*......................................................................... - // sub v11.8h, v10.8h, v24.8h // ......................*...................................................................|......................*................................................................. - // add v10.8h, v10.8h, v24.8h // .......................*..................................................................|.......................*................................................................ - // trn1 v25.4s, v8.4s, v9.4s // ........................*.................................................................|........................*............................................................... - // trn2 v26.4s, v8.4s, v9.4s // .........................*................................................................|.........................*.............................................................. - // trn1 v27.4s, v10.4s, v11.4s // .............................*............................................................|.............................*.......................................................... - // trn2 v28.4s, v10.4s, v11.4s // ..........................*...............................................................|..........................*............................................................. - // sqdmulh v24.8h, v25.8h, v7.h[1] // ...............................*..........................................................|...............................*........................................................ - // srshr v24.8h, v24.8h, #11 // .......................................*..................................................|.......................................*................................................ 
- // mls v25.8h, v24.8h, v7.h[0] // ................................................*.........................................|................................................*....................................... - // sqdmulh v24.8h, v26.8h, v7.h[1] // ................................*.........................................................|................................*....................................................... - // srshr v24.8h, v24.8h, #11 // ........................................*.................................................|........................................*............................................... - // mls v26.8h, v24.8h, v7.h[0] // .................................................*........................................|.................................................*...................................... - // sqdmulh v24.8h, v27.8h, v7.h[1] // .....................................*....................................................|.....................................*.................................................. - // srshr v24.8h, v24.8h, #11 // ..............................................*...........................................|..............................................*......................................... - // mls v27.8h, v24.8h, v7.h[0] // ........................................................*.................................|........................................................*............................... - // sqdmulh v24.8h, v28.8h, v7.h[1] // ......................................*...................................................|......................................*................................................. - // srshr v24.8h, v24.8h, #11 // ...............................................*..........................................|...............................................*........................................ 
- // mls v28.8h, v24.8h, v7.h[0] // .......................................................*..................................|.......................................................*................................ - // umov x10, v25.d[0] // .............................................................*............................|.............................................................*.......................... - // umov x11, v25.d[1] // ...........................................................*..............................|...........................................................*............................ - // umov x12, v26.d[0] // ...............................................................*..........................|...............................................................*........................ - // umov x13, v26.d[1] // ................................................................*.........................|................................................................*....................... - // umov x14, v27.d[0] // ...................................................................*......................|...................................................................*.................... - // umov x15, v27.d[1] // ..................................................................*.......................|..................................................................*..................... - // umov x16, v28.d[0] // ...........................................................................*..............|...........................................................................*............ - // umov x17, v28.d[1] // ..........................................................................*...............|..........................................................................*............. 
- // str x10, [x1], #( 16*4) // ...............................................................................*..........|...............................................................................*........ - // str x14, [x1, #(-16*4 + 8*1)] // ....................................................................................*.....|....................................................................................*... - // str x12, [x1, #(-16*4 + 8*2)] // .............................................................................*............|.............................................................................*.......... - // str x16, [x1, #(-16*4 + 8*3)] // .......................................................................................*..|.......................................................................................* - // str x11, [x1, #(-16*4 + 8*4)] // .........................................................................*................|.........................................................................*.............. - // str x15, [x1, #(-16*4 + 8*5)] // ...................................................................................*......|...................................................................................*.... - // str x13, [x1, #(-16*4 + 8*6)] // ................................................................................*.........|................................................................................*....... - // str x17, [x1, #(-16*4 + 8*7)] // ......................................................................................*...|......................................................................................*. 
+ // ----------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------ + // ldr q8, [x1, #(16*0)] // .e.........................................................................................'~......................................................................................... + // ldr q9, [x1, #(16*1)] // ........e..................................................................................'.......~.................................................................................. + // ldr q10, [x1, #(16*2)] // .......e...................................................................................'......~................................................................................... + // ldr q11, [x1, #(16*3)] // e..........................................................................................~.......................................................................................... + // ldr q0, [x3], #16 // ....e......................................................................................'...~...................................................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // ..................e........................................................................'.................~........................................................................ + // mul v24.8h, v10.8h, v0.h[0] // ...................e.......................................................................'..................~....................................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // .......................e...................................................................'......................~................................................................... + // sub v10.8h, v8.8h, v24.8h // .................................e.........................................................'................................~......................................................... + // add v8.8h, v8.8h, v24.8h // ....................................e......................................................'...................................~...................................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...............e...........................................................................'..............~........................................................................... + // mul v24.8h, v11.8h, v0.h[0] // ................e..........................................................................'...............~.......................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ......................e....................................................................'.....................~.................................................................... + // sub v11.8h, v9.8h, v24.8h // ............................e..............................................................'...........................~.............................................................. + // add v9.8h, v9.8h, v24.8h // .............................e.............................................................'............................~............................................................. + // sqrdmulh v27.8h, v9.8h, v0.h[3] // .....................................e.....................................................'....................................~..................................................... 
+ // mul v24.8h, v9.8h, v0.h[2] // ......................................e....................................................'.....................................~.................................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................................e.............................................'............................................~............................................. + // sub v9.8h, v8.8h, v24.8h // .......................................................e...................................'......................................................~................................... + // add v8.8h, v8.8h, v24.8h // ......................................................e....................................'.....................................................~.................................... + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..........................................e................................................'.........................................~................................................ + // mul v24.8h, v11.8h, v0.h[4] // .........................................e.................................................'........................................~................................................. + // mls v24.8h, v27.8h, v7.h[0] // ................................................e..........................................'...............................................~.......................................... + // sub v11.8h, v10.8h, v24.8h // ........................................................e..................................'.......................................................~.................................. + // add v10.8h, v10.8h, v24.8h // .........................................................e.................................'........................................................~................................. 
+ // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e............................'.............................................................~............................ + // trn2 v26.4s, v8.4s, v9.4s // .............................................................e.............................'............................................................~............................. + // trn1 v27.4s, v10.4s, v11.4s // ....................................................................e......................'...................................................................~...................... + // trn2 v28.4s, v10.4s, v11.4s // ...................................................................e.......................'..................................................................~....................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................................................e.................'........................................................................~................. + // trn2 v11.2d, v26.2d, v28.2d // ......................................................................e....................'.....................................................................~.................... + // trn1 v8.2d, v25.2d, v27.2d // .....~.....................................................................................'....*..................................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..............................................................................e............'.............................................................................~............ + // ldr q0, [ x4], #(6*16) // .....................e.....................................................................'....................~..................................................................... 
+ // ldr q4, [x4, #(-6*16 + 1*16)] // .................................................................e.........................'................................................................~......................... + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..................................................e........................................'.................................................~........................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ..........................................................e................................'.........................................................~................................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // ................................e..........................................................'...............................~.......................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ....................................................................................e......'...................................................................................~...... + // sqrdmulh v27.8h, v10.8h, v4.8h // ...................................................................................e.......'..................................................................................~....... + // mul v24.8h, v10.8h, v0.8h // .................................................................................e.........'................................................................................~......... + // mls v24.8h, v27.8h, v7.h[0] // ......~....................................................................................'.....*.................................................................................... + // sub v10.8h, v8.8h, v24.8h // ..............~............................................................................'.............*............................................................................ 
+ // add v8.8h, v8.8h, v24.8h // .............~.............................................................................'............*............................................................................. + // sqrdmulh v27.8h, v11.8h, v4.8h // ............................................................................e..............'...........................................................................~.............. + // mul v24.8h, v11.8h, v0.8h // .............................................................................e.............'............................................................................~............. + // mls v24.8h, v27.8h, v7.h[0] // .......................................................................................e...'......................................................................................~... + // sub v11.8h, v9.8h, v24.8h // ...~.......................................................................................'..*....................................................................................... + // add v9.8h, v9.8h, v24.8h // ..~........................................................................................'.*........................................................................................ + // sqrdmulh v27.8h, v9.8h, v5.8h // .........~.................................................................................'........*................................................................................. + // mul v24.8h, v9.8h, v1.8h // ..........~................................................................................'.........*................................................................................ + // mls v24.8h, v27.8h, v7.h[0] // .................~.........................................................................'................*......................................................................... 
+ // sub v9.8h, v8.8h, v24.8h // ........................~..................................................................'.......................*.................................................................. + // add v8.8h, v8.8h, v24.8h // .........................~.................................................................'........................*................................................................. + // sqrdmulh v27.8h, v11.8h, v6.8h // ...........~...............................................................................'..........*............................................................................... + // mul v24.8h, v11.8h, v2.8h // ............~..............................................................................'...........*.............................................................................. + // mls v24.8h, v27.8h, v7.h[0] // ....................~......................................................................'...................*...................................................................... + // sub v11.8h, v10.8h, v24.8h // ..........................~................................................................'.........................*................................................................ + // add v10.8h, v10.8h, v24.8h // ...........................~...............................................................'..........................*............................................................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................~............................................................'.............................*............................................................ + // trn2 v26.4s, v8.4s, v9.4s // ...............................~...........................................................'..............................*........................................................... 
+ // trn1 v27.4s, v10.4s, v11.4s // ..................................~........................................................'.................................*........................................................ + // trn2 v28.4s, v10.4s, v11.4s // ...................................~.......................................................'..................................*....................................................... + // sqdmulh v24.8h, v25.8h, v7.h[1] // ........................................~..................................................'.......................................*.................................................. + // srshr v24.8h, v24.8h, #11 // ..............................................~............................................'.............................................*............................................ + // mls v25.8h, v24.8h, v7.h[0] // ....................................................~......................................'...................................................*...................................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // .......................................~...................................................'......................................*................................................... + // srshr v24.8h, v24.8h, #11 // ...............................................~...........................................'..............................................*........................................... + // mls v26.8h, v24.8h, v7.h[0] // .....................................................~.....................................'....................................................*..................................... + // sqdmulh v24.8h, v27.8h, v7.h[1] // ...........................................~...............................................'..........................................*............................................... 
+ // srshr v24.8h, v24.8h, #11 // .................................................~.........................................'................................................*......................................... + // mls v27.8h, v24.8h, v7.h[0] // ...........................................................~...............................'..........................................................*............................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // ............................................~..............................................'...........................................*.............................................. + // srshr v24.8h, v24.8h, #11 // ...................................................~.......................................'..................................................*....................................... + // mls v28.8h, v24.8h, v7.h[0] // ............................................................~..............................'...........................................................*.............................. + // umov x10, v25.d[0] // ...............................................................~...........................'..............................................................*........................... + // umov x11, v25.d[1] // ................................................................~..........................'...............................................................*.......................... + // umov x12, v26.d[0] // .....................................................................~.....................'....................................................................*..................... + // umov x13, v26.d[1] // ..................................................................~........................'.................................................................*........................ 
+ // umov x14, v27.d[0] // .......................................................................~...................'......................................................................*................... + // umov x15, v27.d[1] // ..........................................................................~................'.........................................................................*................ + // umov x16, v28.d[0] // ........................................................................~..................'.......................................................................*.................. + // umov x17, v28.d[1] // ...........................................................................~...............'..........................................................................*............... + // str x10, [x1], #( 16*4) // ...............................................................................~...........'..............................................................................*........... + // str x14, [x1, #(-16*4 + 8*1)] // ........................................................................................~..'.......................................................................................*.. + // str x12, [x1, #(-16*4 + 8*2)] // .....................................................................................~.....'....................................................................................*..... + // str x16, [x1, #(-16*4 + 8*3)] // ......................................................................................~....'.....................................................................................*.... + // str x11, [x1, #(-16*4 + 8*4)] // ................................................................................~..........'...............................................................................*.......... 
+ // str x15, [x1, #(-16*4 + 8*5)] // .........................................................................................~.'........................................................................................*. + // str x13, [x1, #(-16*4 + 8*6)] // ..................................................................................~........'.................................................................................*........ + // str x17, [x1, #(-16*4 + 8*7)] // ..........................................................................................~'.........................................................................................* sub count, count, #1 cbnz count, layer4567_start - add v12.8H, v25.8H, v11.8H // .....*...................................... - sub v11.8H, v25.8H, v11.8H // *........................................... - sqrdmulh v29.8H, v22.8H, v20.8H // ..*......................................... - mul v1.8H, v22.8H, v2.8H // .*.......................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mul v19.8H, v23.8H, v21.8H // ...*........................................ - sqrdmulh v4.8H, v23.8H, v5.8H // ....*....................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v1.8H, v29.8H, v7.H[0] // ......*..................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v19.8H, v4.8H, v7.H[0] // .......*.................................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sub v3.8H, v11.8H, v1.8H // ..........*................................. - add v24.8H, v11.8H, v1.8H // ...........*................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - add v14.8H, v12.8H, v19.8H // ........*................................... - sub v12.8H, v12.8H, v19.8H // .........*.................................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn1 v2.4S, v24.4S, v3.4S // ...............*............................ - trn2 v26.4S, v24.4S, v3.4S // ..............*............................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - trn2 v5.4S, v14.4S, v12.4S // .............*.............................. - trn1 v29.4S, v14.4S, v12.4S // ............*............................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqdmulh v31.8H, v26.8H, v7.H[1] // ...................*........................ - sqdmulh v0.8H, v2.8H, v7.H[1] // ..................*......................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqdmulh v9.8H, v29.8H, v7.H[1] // ................*........................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - sqdmulh v15.8H, v5.8H, v7.H[1] // .................*.......................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - srshr v1.8H, v31.8H, #11 // .......................*.................... - srshr v25.8H, v0.8H, #11 // ......................*..................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - srshr v30.8H, v15.8H, #11 // .....................*...................... - srshr v3.8H, v9.8H, #11 // ....................*....................... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v26.8H, v1.8H, v7.H[0] // ..........................*................. - mls v2.8H, v25.8H, v7.H[0] // ...........................*................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - mls v29.8H, v3.8H, v7.H[0] // ........................*................... - mls v5.8H, v30.8H, v7.H[0] // .........................*.................. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - umov x13, v2.d[1] // ................................*........... - umov x28, v2.d[0] // .................................*.......... - umov x29, v26.d[0] // ....................................*....... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - umov x23, v5.d[0] // ..............................*............. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - umov x22, v5.d[1] // ...............................*............ - umov x20, v26.d[1] // ...................................*........ - umov x11, v29.d[0] // .............................*.............. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - umov x25, v29.d[1] // ............................*............... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str x28, [x1, #8] // .........................................*.. - str x29, [x1, #24] // ...........................................* - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str x23, [x1, #16] // .....................................*...... - str x13, [x1, #40] // ........................................*... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - str x11, [x1], #( 16*4) // ......................................*..... - str x20, [x1, #-8] // ..........................................*. - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ 
- str x25, [x1, #-32] // ..................................*......... - str x22, [x1, #-16] // .......................................*.... - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ - // gap // ............................................ + // Instructions: 48 + // Expected cycles: 30 + // Expected IPC: 1.60 + // + // Cycle bound: 30.0 + // IPC bound: 1.60 + // + // Wall time: 0.75s + // User time: 0.75s + // + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + trn1 v16.2D, v24.2D, v16.2D // ..*............................................. + mls v2.8H, v14.8H, v7.H[0] // ...*............................................ + add v27.8H, v15.8H, v21.8H // *............................................... + sub v15.8H, v15.8H, v21.8H // .*.............................................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sqrdmulh v14.8H, v27.8H, v13.8H // ....*........................................... + mul v27.8H, v27.8H, v31.8H // .....*.......................................... 
+ sqrdmulh v6.8H, v15.8H, v0.8H // ......*......................................... + mul v15.8H, v15.8H, v11.8H // .......*........................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + add v26.8H, v16.8H, v2.8H // ........*....................................... + sub v16.8H, v16.8H, v2.8H // .........*...................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v27.8H, v14.8H, v7.H[0] // ..........*..................................... + mls v15.8H, v6.8H, v7.H[0] // ...........*.................................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sub v14.8H, v26.8H, v27.8H // ............*................................... + add v27.8H, v26.8H, v27.8H // .............*.................................. + sub v6.8H, v16.8H, v15.8H // ..............*................................. + add v15.8H, v16.8H, v15.8H // ...............*................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + trn1 v16.4S, v27.4S, v14.4S // ................*............................... + trn2 v27.4S, v27.4S, v14.4S // .................*.............................. + trn1 v14.4S, v15.4S, v6.4S // ..................*............................. + trn2 v15.4S, v15.4S, v6.4S // ...................*............................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + sqdmulh v6.8H, v27.8H, v7.H[1] // ....................*........................... + sqdmulh v26.8H, v16.8H, v7.H[1] // .....................*.......................... + sqdmulh v0.8H, v14.8H, v7.H[1] // ......................*......................... + sqdmulh v11.8H, v15.8H, v7.H[1] // .......................*........................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + srshr v6.8H, v6.8H, #11 // .........................*...................... + srshr v26.8H, v26.8H, #11 // ........................*....................... + srshr v0.8H, v0.8H, #11 // ..........................*..................... + srshr v11.8H, v11.8H, #11 // ...........................*.................... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + mls v14.8H, v0.8H, v7.H[0] // ..............................*................. + mls v15.8H, v11.8H, v7.H[0] // ...............................*................ + mls v16.8H, v26.8H, v7.H[0] // ............................*................... + mls v27.8H, v6.8H, v7.H[0] // .............................*.................. + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + umov x27, v16.d[0] // ................................*............... + umov x19, v16.d[1] // .................................*.............. 
+ umov x13, v27.d[1] // ..................................*............. + umov x12, v27.d[0] // ...................................*............ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + umov x14, v14.d[0] // ....................................*........... + umov x15, v14.d[1] // ......................................*......... + umov x9, v15.d[0] // .....................................*.......... + umov x23, v15.d[1] // .......................................*........ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + str x27, [x1], #( 16*4) // ........................................*....... + str x19, [x1, #-32] // .........................................*...... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + str x13, [x1, #-16] // ..........................................*..... + str x12, [x1, #-48] // ...........................................*.... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + str x14, [x1, #-56] // .............................................*.. + str x9, [x1, #-40] // ............................................*... + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ 
+ str x15, [x1, #-24] // ..............................................*. + str x23, [x1, #-8] // ...............................................* + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ + // gap // ................................................ - // original source code - // sub v0.8H, v25.8H, v11.8H // .*.......................................... - // mul v4.8H, v22.8H, v2.8H // ...*........................................ - // sqrdmulh v1.8H, v22.8H, v20.8H // ..*......................................... - // mul v22.8H, v23.8H, v21.8H // ....*....................................... - // sqrdmulh v18.8H, v23.8H, v5.8H // .....*...................................... - // add v23.8H, v25.8H, v11.8H // *........................................... - // mls v4.8H, v1.8H, v7.H[0] // ......*..................................... - // mls v22.8H, v18.8H, v7.H[0] // .......*.................................... - // add v3.8H, v23.8H, v22.8H // ..........*................................. - // sub v30.8H, v23.8H, v22.8H // ...........*................................ - // sub v22.8H, v0.8H, v4.8H // ........*................................... - // add v23.8H, v0.8H, v4.8H // .........*.................................. - // trn1 v6.4S, v3.4S, v30.4S // ...............*............................ - // trn2 v27.4S, v3.4S, v30.4S // ..............*............................. - // trn2 v0.4S, v23.4S, v22.4S // .............*.............................. - // trn1 v26.4S, v23.4S, v22.4S // ............*............................... - // sqdmulh v3.8H, v6.8H, v7.H[1] // ..................*......................... - // sqdmulh v4.8H, v27.8H, v7.H[1] // ...................*........................ 
- // sqdmulh v22.8H, v26.8H, v7.H[1] // .................*.......................... - // sqdmulh v23.8H, v0.8H, v7.H[1] // ................*........................... - // srshr v3.8H, v3.8H, #11 // .......................*.................... - // srshr v4.8H, v4.8H, #11 // ......................*..................... - // srshr v22.8H, v22.8H, #11 // .....................*...................... - // srshr v23.8H, v23.8H, #11 // ....................*....................... - // mls v6.8H, v3.8H, v7.H[0] // ..........................*................. - // mls v27.8H, v4.8H, v7.H[0] // ...........................*................ - // mls v0.8H, v23.8H, v7.H[0] // ........................*................... - // mls v26.8H, v22.8H, v7.H[0] // .........................*.................. - // umov x16, v6.d[1] // ...................................*........ - // umov x10, v6.d[0] // ..................................*......... - // umov x25, v27.d[0] // ...............................*............ - // umov x20, v27.d[1] // ................................*........... - // umov x21, v26.d[1] // ............................*............... - // umov x14, v26.d[0] // .............................*.............. - // str x16, [x1, #32] // ..........................................*. - // umov x24, v0.d[1] // .................................*.......... - // umov x15, v0.d[0] // ..............................*............. - // str x25, [x1, #16] // ......................................*..... - // str x10, [x1], #( 16*4) // ........................................*... - // str x20, [x1, #-16] // ...........................................* - // str x21, [x1, #-24] // .......................................*.... - // str x14, [x1, #-56] // ....................................*....... - // str x24, [x1, #-8] // .........................................*.. - // str x15, [x1, #-40] // .....................................*...... 
+ // ---------------- new position -----------------> + // 0 25 + // |------------------------|---------------------- + // add v20.8H, v15.8H, v21.8H // ..*............................................. + // sub v28.8H, v15.8H, v21.8H // ...*............................................ + // trn1 v25.2D, v24.2D, v16.2D // *............................................... + // mls v2.8H, v14.8H, v7.H[0] // .*.............................................. + // sqrdmulh v26.8H, v20.8H, v13.8H // ....*........................................... + // mul v6.8H, v20.8H, v31.8H // .....*.......................................... + // sqrdmulh v14.8H, v28.8H, v0.8H // ......*......................................... + // mul v15.8H, v28.8H, v11.8H // .......*........................................ + // add v0.8H, v25.8H, v2.8H // ........*....................................... + // sub v13.8H, v25.8H, v2.8H // .........*...................................... + // mls v6.8H, v26.8H, v7.H[0] // ..........*..................................... + // mls v15.8H, v14.8H, v7.H[0] // ...........*.................................... + // sub v11.8H, v0.8H, v6.8H // ............*................................... + // add v6.8H, v0.8H, v6.8H // .............*.................................. + // sub v28.8H, v13.8H, v15.8H // ..............*................................. + // add v15.8H, v13.8H, v15.8H // ...............*................................ + // trn1 v0.4S, v6.4S, v11.4S // ................*............................... + // trn2 v2.4S, v6.4S, v11.4S // .................*.............................. + // trn1 v9.4S, v15.4S, v28.4S // ..................*............................. + // trn2 v15.4S, v15.4S, v28.4S // ...................*............................ + // sqdmulh v14.8H, v2.8H, v7.H[1] // ....................*........................... + // sqdmulh v6.8H, v0.8H, v7.H[1] // .....................*.......................... 
+ // sqdmulh v27.8H, v9.8H, v7.H[1] // ......................*......................... + // sqdmulh v21.8H, v15.8H, v7.H[1] // .......................*........................ + // srshr v30.8H, v6.8H, #11 // .........................*...................... + // srshr v14.8H, v14.8H, #11 // ........................*....................... + // srshr v6.8H, v27.8H, #11 // ..........................*..................... + // srshr v18.8H, v21.8H, #11 // ...........................*.................... + // mls v0.8H, v30.8H, v7.H[0] // ..............................*................. + // mls v2.8H, v14.8H, v7.H[0] // ...............................*................ + // mls v9.8H, v6.8H, v7.H[0] // ............................*................... + // mls v15.8H, v18.8H, v7.H[0] // .............................*.................. + // umov x23, v0.d[0] // ................................*............... + // umov x9, v0.d[1] // .................................*.............. + // umov x14, v2.d[1] // ..................................*............. + // umov x15, v2.d[0] // ...................................*............ + // umov x12, v9.d[0] // ....................................*........... + // umov x19, v15.d[0] // ......................................*......... + // umov x13, v9.d[1] // .....................................*.......... + // umov x27, v15.d[1] // .......................................*........ + // str x23, [x1], #( 16*4) // ........................................*....... + // str x9, [x1, #-32] // .........................................*...... + // str x14, [x1, #-16] // ..........................................*..... + // str x15, [x1, #-48] // ...........................................*.... + // str x19, [x1, #-40] // .............................................*.. + // str x12, [x1, #-56] // ............................................*... + // str x13, [x1, #-24] // ..............................................*. 
+ // str x27, [x1, #-8] // ...............................................* pop_stack diff --git a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_icestorm.s b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_icestorm.s index a438574e..98bf87e6 100644 --- a/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_icestorm.s +++ b/examples/opt/aarch64/ntt_kyber_123_4567_scalar_store_opt_m1_icestorm.s @@ -26,29 +26,6 @@ // Needed to provide ASM_LOAD directive #include -// NOTE -// We use a lot of trivial macros to simplify the parsing burden for Slothy -// The macros are not unfolded by Slothy and thus interpreted as instructions, -// which are easier to parse due to e.g. the lack of size specifiers and simpler -// syntax for pre and post increment for loads and stores. - -// Eventually, NeLight should include a proper parser for AArch64, -// but for initial investigations, the below is enough. - -.macro ldr_vo vec, base, offset - ldr qform_\vec, [\base, #\offset] -.endm -.macro ldr_vi vec, base, inc - ldr qform_\vec, [\base], #\inc -.endm - -.macro str_vo vec, base, offset - str qform_\vec, [\base, #\offset] -.endm -.macro str_vi vec, base, inc - str qform_\vec, [\base], #\inc -.endm - .macro vqrdmulh d,a,b sqrdmulh \d\().8h, \a\().8h, \b\().8h .endm @@ -66,15 +43,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -83,12 +60,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 
-.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h @@ -102,21 +73,21 @@ .endm .macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 + ldr qform_root0, [r_ptr0], #32 + ldr qform_root1, [r_ptr0, #-16] .endm .macro load_next_roots_45 - ldr_vi root0, r_ptr0, 16 + ldr qform_root0, [r_ptr0], #16 .endm .macro load_next_roots_67 - ldr_vi root0, r_ptr1, (6*16) - ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) - ldr_vo root1, r_ptr1, (-6*16 + 2*16) - ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) - ldr_vo root2, r_ptr1, (-6*16 + 4*16) - ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) + ldr qform_root0, [ r_ptr1], #(6*16) + ldr qform_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr qform_root1, [ r_ptr1, #(-6*16 + 2*16)] + ldr qform_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr qform_root2, [ r_ptr1, #(-6*16 + 4*16)] + ldr qform_root2_tw, [r_ptr1, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -165,7 +136,7 @@ .endm -.macro save_gprs // slothy:no-unfold +.macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] stp x19, x20, [sp, #16*0] @@ -176,7 +147,7 @@ str x29, [sp, #16*5] .endm -.macro restore_gprs // slothy:no-unfold +.macro restore_gprs // @slothy:no-unfold ldp x19, x20, [sp, #16*0] ldp x21, x22, [sp, #16*1] ldp x23, x24, [sp, #16*2] @@ -186,7 +157,7 @@ add sp, sp, #(16*6) .endm -.macro save_vregs // slothy:no-unfold +.macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] stp d10, d11, [sp, #16*1] @@ -194,7 +165,7 @@ stp d14, d15, [sp, #16*3] .endm -.macro restore_vregs // slothy:no-unfold +.macro restore_vregs // @slothy:no-unfold ldp d8, d9, [sp, #16*0] ldp d10, d11, [sp, #16*1] ldp d12, d13, [sp, #16*2] @@ -205,19 +176,19 @@ #define STACK_SIZE 16 #define STACK0 0 -.macro restore a, loc // slothy:no-unfold +.macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm -.macro save loc, a // slothy:no-unfold +.macro save loc, a // 
@slothy:no-unfold str \a, [sp, #\loc\()] .endm -.macro push_stack // slothy:no-unfold +.macro push_stack // @slothy:no-unfold save_gprs save_vregs sub sp, sp, #STACK_SIZE .endm -.macro pop_stack // slothy:no-unfold +.macro pop_stack // @slothy:no-unfold add sp, sp, #STACK_SIZE restore_vregs restore_gprs @@ -356,1066 +327,1156 @@ _ntt_kyber_123_4567_scalar_store_opt_m1_icestorm: load_roots_123 .p2align 2 - ldr q8, [x0, #320] // ...*............... - // gap // ................... - // gap // ................... - ldr q9, [x0, #256] // .*................. - ldr q23, [x0, #448] // *.................. - // gap // ................... - // gap // ................... - // gap // ................... - ldr q31, [x0, #192] // .......*........... - // gap // ................... - // gap // ................... - // gap // ................... - ldr q24, [x0, #64] // .........*......... - // gap // ................... - // gap // ................... - // gap // ................... - sqrdmulh v13.8H, v9.8H, v0.H[1] // .....*............. - mul v9.8H, v9.8H, v0.H[0] // ....*.............. - // gap // ................... - ldr q11, [x0, #384] // ..*................ - ldr q16, [x0, #128] // ........*.......... - // gap // ................... - // gap // ................... - sqrdmulh v15.8H, v23.8H, v0.H[1] // ......*............ - mul v10.8H, v23.8H, v0.H[0] // ..........*........ - ldr q30, [x0, #0] // ...........*....... - // gap // ................... - // gap // ................... - mls v9.8H, v13.8H, v7.H[0] // ............*...... - mul v29.8H, v8.8H, v0.H[0] // ..................* - // gap // ................... - // gap // ................... - sqrdmulh v20.8H, v11.8H, v0.H[1] // .............*..... - // gap // ................... - // gap // ................... - // gap // ................... - mul v28.8H, v11.8H, v0.H[0] // ...............*... - mls v10.8H, v15.8H, v7.H[0] // ..............*.... - // gap // ................... - // gap // ................... 
- add v23.8H, v30.8H, v9.8H // .................*. - sub v15.8H, v30.8H, v9.8H // ................*.. - // gap // ................... - // gap // ................... + // Instructions: 58 + // Expected cycles: 30 + // Expected IPC: 1.93 + // + // Cycle bound: 29.0 + // IPC bound: 2.00 + // + // Wall time: 3600.21s + // User time: 3600.21s + // + // ------------------- original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------- + ldr q15, [x0, #448] // *......................................................... + ldr q16, [x0, #384] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + ldr q12, [x0, #256] // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q26, [x0, #320] // ...*...................................................... + ldr q4, [x0, #64] // .........*................................................ + ldr q30, [x0, #0] // ....*..................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q28, [x0, #192] // ...........*.............................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q5, [x0, #128] // ..........*............................................... + sqrdmulh v14.8H, v16.8H, v0.H[1] // ........*................................................. + mul v16.8H, v16.8H, v0.H[0] // .......*.................................................. 
+ // gap // .......................................................... + sqrdmulh v17.8H, v15.8H, v0.H[1] // .....*.................................................... + mul v27.8H, v15.8H, v0.H[0] // ......*................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v15.8H, v26.8H, v0.H[1] // ............*............................................. + mul v25.8H, v26.8H, v0.H[0] // .............*............................................ + // gap // .......................................................... + // gap // .......................................................... + mls v16.8H, v14.8H, v7.H[0] // ................*......................................... + sqrdmulh v14.8H, v12.8H, v0.H[1] // .................*........................................ + // gap // .......................................................... + // gap // .......................................................... + mls v27.8H, v17.8H, v7.H[0] // ..............*........................................... + mul v3.8H, v12.8H, v0.H[0] // ...............*.......................................... + // gap // .......................................................... + // gap // .......................................................... + mls v25.8H, v15.8H, v7.H[0] // ..................*....................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v24.8H, v5.8H, v16.8H // .......................*.................................. + add v13.8H, v5.8H, v16.8H // ......................*................................... + // gap // .......................................................... 
+ // gap // .......................................................... + sub v15.8H, v28.8H, v27.8H // ...................*...................................... + add v27.8H, v28.8H, v27.8H // ....................*..................................... + // gap // .......................................................... + // gap // .......................................................... + add v26.8H, v4.8H, v25.8H // .............................*............................ + mul v31.8H, v24.8H, v0.H[4] // ...........................*.............................. + // gap // .......................................................... + // gap // .......................................................... + mul v21.8H, v15.8H, v0.H[4] // .........................*................................ + sqrdmulh v16.8H, v15.8H, v0.H[5] // ..........................*............................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v15.8H, v24.8H, v0.H[5] // ............................*............................. + mul v10.8H, v13.8H, v0.H[2] // ..............................*........................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v2.8H, v13.8H, v0.H[3] // ..........................................*............... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v6.8H, v27.8H, v0.H[3] // ........................*................................. + mls v21.8H, v16.8H, v7.H[0] // ...............................*.......................... + mul v27.8H, v27.8H, v0.H[2] // .................................*........................ + // gap // .......................................................... 
+ // gap // .......................................................... + mls v31.8H, v15.8H, v7.H[0] // ..................................*....................... + mls v3.8H, v14.8H, v7.H[0] // .....................*.................................... + // gap // .......................................................... + // gap // .......................................................... + sub v15.8H, v4.8H, v25.8H // ................................*......................... + // gap // .......................................................... + mls v10.8H, v2.8H, v7.H[0] // .................................................*........ + // gap // .......................................................... + mls v27.8H, v6.8H, v7.H[0] // .......................................*.................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v16.8H, v15.8H, v21.8H // .....................................*.................... + sub v17.8H, v30.8H, v3.8H // ...................................*...................... + // gap // .......................................................... + // gap // .......................................................... + add v11.8H, v15.8H, v21.8H // ......................................*................... + add v13.8H, v30.8H, v3.8H // ....................................*..................... + // gap // .......................................................... + // gap // .......................................................... + add v15.8H, v26.8H, v27.8H // ..............................................*........... + sub v27.8H, v26.8H, v27.8H // .............................................*............ + // gap // .......................................................... + // gap // .......................................................... 
+ sqrdmulh v6.8H, v16.8H, v1.H[5] // ........................................*................. + mul v16.8H, v16.8H, v1.H[4] // .........................................*................ + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v28.8H, v27.8H, v1.H[1] // ....................................................*..... + mul v26.8H, v15.8H, v0.H[6] // ...................................................*...... + // gap // .......................................................... + // gap // .......................................................... + mul v14.8H, v27.8H, v1.H[0] // .....................................................*.... + sqrdmulh v27.8H, v15.8H, v0.H[7] // ..................................................*....... + // gap // .......................................................... + // gap // .......................................................... + mls v16.8H, v6.8H, v7.H[0] // ...............................................*.......... + mul v2.8H, v11.8H, v1.H[2] // ...........................................*.............. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v18.8H, v11.8H, v1.H[3] // ............................................*............. + sub v25.8H, v17.8H, v31.8H // ................................................*......... + // gap // .......................................................... + mls v26.8H, v27.8H, v7.H[0] // .........................................................* + sub v6.8H, v13.8H, v10.8H // ........................................................*. + // gap // .......................................................... + // gap // .......................................................... 
+ add v29.8H, v13.8H, v10.8H // ......................................................*... + // gap // .......................................................... + sub v15.8H, v25.8H, v16.8H // .......................................................*.. + // gap // .......................................................... - // original source code - // ldr q3, [x0, #448] // ..*................ - // ldr q22, [x0, #256] // .*................. - // ldr q5, [x0, #384] // .......*........... - // ldr q8, [x0, #320] // *.................. - // mul v21.8H, v22.8H, v0.H[0] // ......*............ - // sqrdmulh v22.8H, v22.8H, v0.H[1] // .....*............. - // sqrdmulh v28.8H, v3.8H, v0.H[1] // .........*......... - // ldr q31, [x0, #192] // ...*............... - // ldr q16, [x0, #128] // ........*.......... - // ldr q24, [x0, #64] // ....*.............. - // mul v10.8H, v3.8H, v0.H[0] // ..........*........ - // ldr q19, [x0, #0] // ...........*....... - // mls v21.8H, v22.8H, v7.H[0] // ............*...... - // sqrdmulh v20.8H, v5.8H, v0.H[1] // ..............*.... - // mls v10.8H, v28.8H, v7.H[0] // ................*.. - // mul v28.8H, v5.8H, v0.H[0] // ...............*... - // sub v15.8H, v19.8H, v21.8H // ..................* - // add v23.8H, v19.8H, v21.8H // .................*. - // mul v29.8H, v8.8H, v0.H[0] // .............*..... + // --------------------- new position ----------------------> + // 0 25 50 + // |------------------------|------------------------|------- + // ldr q22, [x0, #448] // *......................................................... + // ldr q23, [x0, #384] // .*........................................................ + // ldr q30, [x0, #256] // ..*....................................................... + // ldr q4, [x0, #320] // ...*...................................................... + // ldr q20, [x0, #0] // .....*.................................................... 
+ // sqrdmulh v16.8H, v22.8H, v0.H[1] // ..........*............................................... + // mul v25.8H, v22.8H, v0.H[0] // ...........*.............................................. + // mul v15.8H, v23.8H, v0.H[0] // .........*................................................ + // sqrdmulh v3.8H, v23.8H, v0.H[1] // ........*................................................. + // ldr q8, [x0, #64] // ....*..................................................... + // ldr q29, [x0, #128] // .......*.................................................. + // ldr q12, [x0, #192] // ......*................................................... + // sqrdmulh v22.8H, v4.8H, v0.H[1] // ............*............................................. + // mul v17.8H, v4.8H, v0.H[0] // .............*............................................ + // mls v25.8H, v16.8H, v7.H[0] // ................*......................................... + // mul v9.8H, v30.8H, v0.H[0] // .................*........................................ + // mls v15.8H, v3.8H, v7.H[0] // ..............*........................................... + // sqrdmulh v31.8H, v30.8H, v0.H[1] // ...............*.......................................... + // mls v17.8H, v22.8H, v7.H[0] // ..................*....................................... + // sub v26.8H, v12.8H, v25.8H // .....................*.................................... + // add v16.8H, v12.8H, v25.8H // ......................*................................... + // mls v9.8H, v31.8H, v7.H[0] // ..................................*....................... + // add v11.8H, v29.8H, v15.8H // ....................*..................................... + // sub v13.8H, v29.8H, v15.8H // ...................*...................................... + // sqrdmulh v15.8H, v16.8H, v0.H[3] // ..............................*........................... + // mul v22.8H, v26.8H, v0.H[4] // .........................*................................ 
+ // sqrdmulh v18.8H, v26.8H, v0.H[5] // ..........................*............................... + // mul v31.8H, v13.8H, v0.H[4] // ........................*................................. + // sqrdmulh v5.8H, v13.8H, v0.H[5] // ...........................*.............................. + // add v23.8H, v8.8H, v17.8H // .......................*.................................. + // mul v3.8H, v11.8H, v0.H[2] // ............................*............................. + // mls v22.8H, v18.8H, v7.H[0] // ...............................*.......................... + // sub v10.8H, v8.8H, v17.8H // ...................................*...................... + // mul v26.8H, v16.8H, v0.H[2] // ................................*......................... + // mls v31.8H, v5.8H, v7.H[0] // .................................*........................ + // sub v17.8H, v20.8H, v9.8H // .......................................*.................. + // add v19.8H, v20.8H, v9.8H // .........................................*................ + // sub v29.8H, v10.8H, v22.8H // ......................................*................... + // add v8.8H, v10.8H, v22.8H // ........................................*................. + // mls v26.8H, v15.8H, v7.H[0] // .....................................*.................... + // sqrdmulh v15.8H, v29.8H, v1.H[5] // ............................................*............. + // mul v16.8H, v29.8H, v1.H[4] // .............................................*............ + // sqrdmulh v6.8H, v11.8H, v0.H[3] // .............................*............................ + // mul v2.8H, v8.8H, v1.H[2] // ...................................................*...... + // sqrdmulh v18.8H, v8.8H, v1.H[3] // ....................................................*..... + // sub v22.8H, v23.8H, v26.8H // ...........................................*.............. + // add v27.8H, v23.8H, v26.8H // ..........................................*............... 
+ // mls v16.8H, v15.8H, v7.H[0] // ..................................................*....... + // sub v25.8H, v17.8H, v31.8H // .....................................................*.... + // mls v3.8H, v6.8H, v7.H[0] // ....................................*..................... + // sqrdmulh v11.8H, v27.8H, v0.H[7] // .................................................*........ + // mul v26.8H, v27.8H, v0.H[6] // ...............................................*.......... + // sqrdmulh v28.8H, v22.8H, v1.H[1] // ..............................................*........... + // mul v14.8H, v22.8H, v1.H[0] // ................................................*......... + // add v29.8H, v19.8H, v3.8H // ........................................................*. + // sub v15.8H, v25.8H, v16.8H // .........................................................* + // sub v6.8H, v19.8H, v3.8H // .......................................................*.. + // mls v26.8H, v11.8H, v7.H[0] // ......................................................*... sub count, count, #1 layer123_start: - sub v27.8H, v31.8H, v10.8H // ..........................*................................................. - sqrdmulh v25.8H, v8.8H, v0.H[1] // ..............*............................................................. - ldr q3, [x0, #464] // .......e.................................................................... - ldr q22, [x0, #272] // ....e....................................................................... - mls v28.8H, v20.8H, v7.H[0] // ....................*....................................................... - ldr q5, [x0, #400] // ......e..................................................................... - add v17.8H, v31.8H, v10.8H // ...........................*................................................ 
+ // Instructions: 76 + // Expected cycles: 30 + // Expected IPC: 2.53 + // + // Cycle bound: 30.0 + // IPC bound: 2.53 + // + // Wall time: 74.24s + // User time: 74.24s + // + // ---------------------------- original position ----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------| + mls v2.8H, v18.8H, v7.H[0] // ............................................................*............... // gap // ............................................................................ - mul v2.8H, v27.8H, v0.H[4] // ...........................................*................................ - sqrdmulh v18.8H, v27.8H, v0.H[5] // ............................................*............................... - ldr q8, [x0, #336] // .....e...................................................................... + ldr q22, [x0, #464] // .......e.................................................................... + mls v14.8H, v28.8H, v7.H[0] // .......................................................*.................... + add v16.8H, v25.8H, v16.8H // ...................................................................*........ + add v13.8H, v17.8H, v31.8H // ..........................................*................................. + ldr q23, [x0, #400] // ......e..................................................................... // gap // ............................................................................ - sqrdmulh v27.8H, v17.8H, v0.H[3] // ..................................*......................................... + ldr q30, [x0, #272] // ....e....................................................................... + ldr q4, [x0, #336] // .....e...................................................................... + sub v11.8H, v29.8H, v26.8H // ...................................................*........................ 
+ add v21.8H, v29.8H, v26.8H // ....................................................*....................... + add v27.8H, v13.8H, v2.8H // ..............................................................*............. + str q16, [x0, #384] // ..........................................................................*. + add v24.8H, v6.8H, v14.8H // .........................................................*.................. // gap // ............................................................................ + ldr q20, [x0, #16] // e........................................................................... // gap // ............................................................................ - mls v29.8H, v25.8H, v7.H[0] // ...............*............................................................ - mul v9.8H, v17.8H, v0.H[2] // .................................*.......................................... - add v11.8H, v16.8H, v28.8H // ......................*..................................................... + sqrdmulh v16.8H, v22.8H, v0.H[1] // .......................e.................................................... + mul v25.8H, v22.8H, v0.H[0] // ........................e................................................... + str q15, [x0, #448] // ...........................................................................* + mul v15.8H, v23.8H, v0.H[0] // ...................e........................................................ + sqrdmulh v3.8H, v23.8H, v0.H[1] // ..................e......................................................... + ldr q8, [x0, #80] // .e.......................................................................... + ldr q29, [x0, #144] // ..e......................................................................... + ldr q12, [x0, #208] // ...e........................................................................ + sqrdmulh v22.8H, v4.8H, v0.H[1] // .............e.............................................................. 
+ mul v17.8H, v4.8H, v0.H[0] // ..............e............................................................. + str q11, [x0, #64] // .....................................................................*...... + mls v25.8H, v16.8H, v7.H[0] // .........................e.................................................. + mul v9.8H, v30.8H, v0.H[0] // .........e.................................................................. // gap // ............................................................................ + mls v15.8H, v3.8H, v7.H[0] // ....................e....................................................... + sqrdmulh v31.8H, v30.8H, v0.H[1] // ........e................................................................... // gap // ............................................................................ - mls v2.8H, v18.8H, v7.H[0] // .............................................*.............................. - sub v17.8H, v16.8H, v28.8H // .....................*...................................................... // gap // ............................................................................ + sub v11.8H, v13.8H, v2.8H // .............................................................*.............. // gap // ............................................................................ - sub v18.8H, v24.8H, v29.8H // ................*........................................................... // gap // ............................................................................ + mls v17.8H, v22.8H, v7.H[0] // ...............e............................................................ // gap // ............................................................................ - mul v26.8H, v11.8H, v0.H[2] // ............................*............................................... - mls v9.8H, v27.8H, v7.H[0] // ...................................*........................................ 
- mul v20.8H, v17.8H, v0.H[4] // ......................................*..................................... // gap // ............................................................................ + sub v26.8H, v12.8H, v25.8H // ..........................e................................................. + add v16.8H, v12.8H, v25.8H // ...........................e................................................ + str q11, [x0, #320] // .........................................................................*.. + mls v9.8H, v31.8H, v7.H[0] // ..........e................................................................. // gap // ............................................................................ - add v10.8H, v24.8H, v29.8H // .................*.......................................................... + add v11.8H, v29.8H, v15.8H // ......................e..................................................... + sub v13.8H, v29.8H, v15.8H // .....................e...................................................... // gap // ............................................................................ // gap // ............................................................................ - sub v28.8H, v18.8H, v2.8H // ..............................................*............................. - add v29.8H, v18.8H, v2.8H // ...............................................*............................ + sqrdmulh v15.8H, v16.8H, v0.H[3] // .................................e.......................................... + mul v22.8H, v26.8H, v0.H[4] // ............................................e............................... + sqrdmulh v18.8H, v26.8H, v0.H[5] // ...........................................e................................ // gap // ............................................................................ // gap // ............................................................................ 
- mul v21.8H, v22.8H, v0.H[0] // ........e................................................................... - sub v27.8H, v10.8H, v9.8H // ....................................*....................................... - mul v25.8H, v28.8H, v1.H[4] // ...............................................................*............ + mul v31.8H, v13.8H, v0.H[4] // .......................................e.................................... + sqrdmulh v5.8H, v13.8H, v0.H[5] // ......................................e..................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v30.8H, v11.8H, v0.H[3] // .............................*.............................................. - sqrdmulh v18.8H, v28.8H, v1.H[5] // ................................................................*........... // gap // ............................................................................ // gap // ............................................................................ - add v11.8H, v10.8H, v9.8H // .....................................*...................................... + add v23.8H, v8.8H, v17.8H // .................e.......................................................... + mul v3.8H, v11.8H, v0.H[2] // .............................e.............................................. + mls v22.8H, v18.8H, v7.H[0] // .............................................e.............................. + sub v10.8H, v8.8H, v17.8H // ................e........................................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v2.8H, v27.8H, v1.H[1] // ......................................................*..................... 
- mul v19.8H, v27.8H, v1.H[0] // .....................................................*...................... + mul v26.8H, v16.8H, v0.H[2] // ..................................e......................................... + mls v31.8H, v5.8H, v7.H[0] // ........................................e................................... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v22.8H, v22.8H, v0.H[1] // .........e.................................................................. - sqrdmulh v14.8H, v17.8H, v0.H[5] // .......................................*.................................... - mls v26.8H, v30.8H, v7.H[0] // ..............................*............................................. + sub v14.8H, v6.8H, v14.8H // ........................................................*................... + sub v17.8H, v20.8H, v9.8H // ...........e................................................................ // gap // ............................................................................ // gap // ............................................................................ // gap // ............................................................................ + add v19.8H, v20.8H, v9.8H // ............e............................................................... + sub v29.8H, v10.8H, v22.8H // ..............................................e............................. // gap // ............................................................................ - sqrdmulh v28.8H, v3.8H, v0.H[1] // ........................e................................................... - mul v6.8H, v11.8H, v0.H[6] // ................................................*........................... - sqrdmulh v16.8H, v11.8H, v0.H[7] // .................................................*.......................... 
- mls v19.8H, v2.8H, v7.H[0] // .......................................................*.................... + add v8.8H, v10.8H, v22.8H // ...............................................e............................ // gap // ............................................................................ // gap // ............................................................................ - mls v20.8H, v14.8H, v7.H[0] // ........................................*................................... - sub v2.8H, v23.8H, v26.8H // ...............................*............................................ + mls v26.8H, v15.8H, v7.H[0] // ...................................e........................................ // gap // ............................................................................ + str q27, [x0, #256] // ........................................................................*... + sqrdmulh v15.8H, v29.8H, v1.H[5] // ...............................................................e............ + mul v16.8H, v29.8H, v1.H[4] // ................................................................e........... + sqrdmulh v6.8H, v11.8H, v0.H[3] // ............................e............................................... + mul v2.8H, v8.8H, v1.H[2] // ...........................................................e................ // gap // ............................................................................ - mls v25.8H, v18.8H, v7.H[0] // .................................................................*.......... - mul v18.8H, v29.8H, v1.H[2] // ..........................................................*................. // gap // ............................................................................ + sqrdmulh v18.8H, v8.8H, v1.H[3] // ..........................................................e................. // gap // ............................................................................ 
- sub v24.8H, v2.8H, v19.8H // ........................................................*................... + sub v22.8H, v23.8H, v26.8H // ....................................e....................................... + str q14, [x0, #192] // .......................................................................*.... // gap // ............................................................................ // gap // ............................................................................ - sqrdmulh v4.8H, v29.8H, v1.H[3] // ...........................................................*................ - sub v27.8H, v15.8H, v20.8H // .........................................*.................................. - mls v6.8H, v16.8H, v7.H[0] // ..................................................*......................... + add v27.8H, v23.8H, v26.8H // .....................................e...................................... + mls v16.8H, v15.8H, v7.H[0] // .................................................................e.......... + sub v25.8H, v17.8H, v31.8H // .........................................e.................................. // gap // ............................................................................ // gap // ............................................................................ - add v29.8H, v23.8H, v26.8H // ................................*........................................... - str q24, [x0, #192] // .......................................................................*.... - add v24.8H, v2.8H, v19.8H // .........................................................*.................. - ldr q31, [x0, #208] // ...e........................................................................ - mls v18.8H, v4.8H, v7.H[0] // ............................................................*............... - sub v11.8H, v27.8H, v25.8H // ..................................................................*......... 
+ mls v3.8H, v6.8H, v7.H[0] // ..............................e............................................. // gap // ............................................................................ - ldr q16, [x0, #144] // ..e......................................................................... - add v2.8H, v15.8H, v20.8H // ..........................................*................................. - str q24, [x0, #128] // ......................................................................*..... - add v4.8H, v29.8H, v6.8H // ....................................................*....................... - ldr q24, [x0, #80] // .e.......................................................................... - mul v10.8H, v3.8H, v0.H[0] // .......................e.................................................... - str q11, [x0, #448] // ...........................................................................* - add v11.8H, v27.8H, v25.8H // ...................................................................*........ - ldr q19, [x0, #16] // e........................................................................... - mls v21.8H, v22.8H, v7.H[0] // ..........e................................................................. - str q4, [x0], #(16) // ....................................................................*....... - sub v15.8H, v2.8H, v18.8H // .............................................................*.............. // gap // ............................................................................ - str q11, [x0, #368] // ..........................................................................*. - sqrdmulh v20.8H, v5.8H, v0.H[1] // ...................e........................................................ + sqrdmulh v11.8H, v27.8H, v0.H[7] // ................................................e........................... + mul v26.8H, v27.8H, v0.H[6] // .................................................e.......................... 
// gap // ............................................................................ - add v30.8H, v2.8H, v18.8H // ..............................................................*............. - mls v10.8H, v28.8H, v7.H[0] // .........................e.................................................. - sub v11.8H, v29.8H, v6.8H // ...................................................*........................ - str q15, [x0, #304] // .........................................................................*.. // gap // ............................................................................ - mul v28.8H, v5.8H, v0.H[0] // ..................e......................................................... - str q30, [x0, #240] // ........................................................................*... + sqrdmulh v28.8H, v22.8H, v1.H[1] // .....................................................e...................... + mul v14.8H, v22.8H, v1.H[0] // ......................................................e..................... // gap // ............................................................................ - sub v15.8H, v19.8H, v21.8H // ...........e................................................................ - add v23.8H, v19.8H, v21.8H // ............e............................................................... - str q11, [x0, #48] // .....................................................................*...... + add v29.8H, v19.8H, v3.8H // ................................e........................................... + sub v15.8H, v25.8H, v16.8H // ..................................................................e......... + str q21, [x0], #(16) // ....................................................................*....... // gap // ............................................................................ - mul v29.8H, v8.8H, v0.H[0] // .............e.............................................................. 
+ sub v6.8H, v19.8H, v3.8H // ...............................e............................................ + mls v26.8H, v11.8H, v7.H[0] // ..................................................e......................... + str q24, [x0, #112] // ......................................................................*..... - // original source code - // ldr q8, [x0, #0] // ..........................................................e...............|...........................................................e.............. - // ldr q9, [x0, #(1*(512/8))] // ......................................................e...................|.......................................................e.................. - // ldr q10, [x0, #(2*(512/8))] // ..................................................e.......................|...................................................e...................... - // ldr q11, [x0, #(3*(512/8))] // ...............................................e..........................|................................................e......................... - // ldr q12, [x0, #(4*(512/8))] // .e........................................................................|..e....................................................................... - // ldr q13, [x0, #(5*(512/8))] // .......e..................................................................|........e................................................................. - // ldr q14, [x0, #(6*(512/8))] // ...e......................................................................|....e..................................................................... - // ldr q15, [x0, #(7*(512/8))] // e.........................................................................|.e........................................................................ 
- // mul v24.8h, v12.8h, v0.h[0] // .....................e....................................................|......................e................................................... - // sqrdmulh v12.8h, v12.8h, v0.h[1] // .............................e............................................|..............................e........................................... - // mls v24.8h, v12.8h, v7.h[0] // ...........................................................e..............|............................................................e............. - // sub v12.8h, v8.8h, v24.8h // ......................................................................e...|.......................................................................e.. - // add v8.8h, v8.8h, v24.8h // .......................................................................e..|........................................................................e. - // mul v24.8h, v13.8h, v0.h[0] // .........................................................................e|.......................................................................... - // sqrdmulh v13.8h, v13.8h, v0.h[1] // ..........................................................................|*......................................................................... - // mls v24.8h, v13.8h, v7.h[0] // .........*................................................................|..........*............................................................... - // sub v13.8h, v9.8h, v24.8h // ..............*...........................................................|...............*.......................................................... - // add v9.8h, v9.8h, v24.8h // ..................*.......................................................|...................*...................................................... 
- // mul v24.8h, v14.8h, v0.h[0] // ....................................................................e.....|.....................................................................e.... - // sqrdmulh v14.8h, v14.8h, v0.h[1] // ...............................................................e..........|................................................................e......... - // mls v24.8h, v14.8h, v7.h[0] // ..*.......................................................................|...*...................................................................... - // sub v14.8h, v10.8h, v24.8h // .............*............................................................|..............*........................................................... - // add v10.8h, v10.8h, v24.8h // ...........*..............................................................|............*............................................................. - // mul v24.8h, v15.8h, v0.h[0] // .......................................................e..................|........................................................e................. - // sqrdmulh v15.8h, v15.8h, v0.h[1] // ................................e.........................................|.................................e........................................ - // mls v24.8h, v15.8h, v7.h[0] // .................................................................e........|..................................................................e....... - // sub v15.8h, v11.8h, v24.8h // ..........................................................................*.......................................................................... - // add v11.8h, v11.8h, v24.8h // ....*.....................................................................|.....*.................................................................... 
- // mul v24.8h, v10.8h, v0.h[2] // ...............*..........................................................|................*......................................................... - // sqrdmulh v10.8h, v10.8h, v0.h[3] // ........................*.................................................|.........................*................................................ - // mls v24.8h, v10.8h, v7.h[0] // ...............................*..........................................|................................*......................................... - // sub v10.8h, v8.8h, v24.8h // .....................................*....................................|......................................*................................... - // add v8.8h, v8.8h, v24.8h // ............................................*.............................|.............................................*............................ - // mul v24.8h, v11.8h, v0.h[2] // ..........*...............................................................|...........*.............................................................. - // sqrdmulh v11.8h, v11.8h, v0.h[3] // ........*.................................................................|.........*................................................................ - // mls v24.8h, v11.8h, v7.h[0] // ................*.........................................................|.................*........................................................ - // sub v11.8h, v9.8h, v24.8h // ......................*...................................................|.......................*.................................................. - // add v9.8h, v9.8h, v24.8h // ..........................*...............................................|...........................*.............................................. 
- // mul v24.8h, v14.8h, v0.h[4] // .................*........................................................|..................*....................................................... - // sqrdmulh v14.8h, v14.8h, v0.h[5] // ..............................*...........................................|...............................*.......................................... - // mls v24.8h, v14.8h, v7.h[0] // ....................................*.....................................|.....................................*.................................... - // sub v14.8h, v12.8h, v24.8h // ..........................................*...............................|...........................................*.............................. - // add v12.8h, v12.8h, v24.8h // ...................................................*......................|....................................................*..................... - // mul v24.8h, v15.8h, v0.h[4] // .....*....................................................................|......*................................................................... - // sqrdmulh v15.8h, v15.8h, v0.h[5] // ......*...................................................................|.......*.................................................................. - // mls v24.8h, v15.8h, v7.h[0] // ............*.............................................................|.............*............................................................ - // sub v15.8h, v13.8h, v24.8h // ...................*......................................................|....................*..................................................... - // add v13.8h, v13.8h, v24.8h // ....................*.....................................................|.....................*.................................................... 
- // mul v24.8h, v9.8h, v0.h[6] // .................................*........................................|..................................*....................................... - // sqrdmulh v9.8h, v9.8h, v0.h[7] // ..................................*.......................................|...................................*...................................... - // mls v24.8h, v9.8h, v7.h[0] // ...........................................*..............................|............................................*............................. - // sub v9.8h, v8.8h, v24.8h // ..................................................................*.......|...................................................................*...... - // add v8.8h, v8.8h, v24.8h // .....................................................*....................|......................................................*................... - // mul v24.8h, v11.8h, v1.h[0] // ............................*.............................................|.............................*............................................ - // sqrdmulh v11.8h, v11.8h, v1.h[1] // ...........................*..............................................|............................*............................................. - // mls v24.8h, v11.8h, v7.h[0] // ...................................*......................................|....................................*..................................... - // sub v11.8h, v10.8h, v24.8h // ........................................*.................................|.........................................*................................ - // add v10.8h, v10.8h, v24.8h // ..............................................*...........................|...............................................*.......................... 
- // mul v24.8h, v13.8h, v1.h[2] // .......................................*..................................|........................................*................................. - // sqrdmulh v13.8h, v13.8h, v1.h[3] // .........................................*................................|..........................................*............................... - // mls v24.8h, v13.8h, v7.h[0] // ................................................*.........................|.................................................*........................ - // sub v13.8h, v12.8h, v24.8h // .............................................................*............|..............................................................*........... - // add v12.8h, v12.8h, v24.8h // ................................................................*.........|.................................................................*........ - // mul v24.8h, v15.8h, v1.h[4] // .......................*..................................................|........................*................................................. - // sqrdmulh v15.8h, v15.8h, v1.h[5] // .........................*................................................|..........................*............................................... - // mls v24.8h, v15.8h, v7.h[0] // ......................................*...................................|.......................................*.................................. - // sub v15.8h, v14.8h, v24.8h // .................................................*........................|..................................................*....................... - // add v14.8h, v14.8h, v24.8h // .........................................................*................|..........................................................*............... 
- // str q8, [x0], #(16) // ............................................................*.............|.............................................................*............ - // str q9, [x0, #(-16 + 1*(512/8))] // ........................................................................*.|.........................................................................* - // str q10, [x0, #(-16 + 2*(512/8))] // ....................................................*.....................|.....................................................*.................... - // str q11, [x0, #(-16 + 3*(512/8))] // .............................................*............................|..............................................*........................... - // str q12, [x0, #(-16 + 4*(512/8))] // .....................................................................*....|......................................................................*... - // str q13, [x0, #(-16 + 5*(512/8))] // ...................................................................*......|....................................................................*..... - // str q14, [x0, #(-16 + 6*(512/8))] // ..............................................................*...........|...............................................................*.......... - // str q15, [x0, #(-16 + 7*(512/8))] // ........................................................*.................|.........................................................*................ 
+ // -------------------------------------------------------------------- new position --------------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x0, #0] // ............e..............................................................'............~.............................................................. + // ldr q9, [x0, #(1*(512/8))] // ..................e........................................................'..................~........................................................ + // ldr q10, [x0, #(2*(512/8))] // ...................e.......................................................'...................~....................................................... + // ldr q11, [x0, #(3*(512/8))] // ....................e......................................................'....................~...................................................... + // ldr q12, [x0, #(4*(512/8))] // .....e.....................................................................'.....~..................................................................... + // ldr q13, [x0, #(5*(512/8))] // ......e....................................................................'......~.................................................................... + // ldr q14, [x0, #(6*(512/8))] // ....e......................................................................'....~...................................................................... + // ldr q15, [x0, #(7*(512/8))] // e..........................................................................'~.......................................................................... 
+ // sqrdmulh v27.8h, v12.8h, v0.h[1] // ...........................e...............................................'...........................~............................................... + // mul v24.8h, v12.8h, v0.h[0] // .........................e.................................................'.........................~................................................. + // mls v24.8h, v27.8h, v7.h[0] // .................................e.........................................'.................................~......................................... + // sub v12.8h, v8.8h, v24.8h // ................................................e..........................'................................................~.......................... + // add v8.8h, v8.8h, v24.8h // .................................................e.........................'.................................................~......................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // .....................e.....................................................'.....................~..................................................... + // mul v24.8h, v13.8h, v0.h[0] // ......................e....................................................'......................~.................................................... + // mls v24.8h, v27.8h, v7.h[0] // .............................e.............................................'.............................~............................................. + // sub v13.8h, v9.8h, v24.8h // ............................................e..............................'............................................~.............................. + // add v9.8h, v9.8h, v24.8h // .........................................e.................................'.........................................~................................. 
+ // sqrdmulh v27.8h, v14.8h, v0.h[1] // .................e.........................................................'.................~......................................................... + // mul v24.8h, v14.8h, v0.h[0] // ................e..........................................................'................~.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................e................................................'..........................~................................................ + // sub v14.8h, v10.8h, v24.8h // ...................................e.......................................'...................................~....................................... + // add v10.8h, v10.8h, v24.8h // ..................................e........................................'..................................~........................................ + // sqrdmulh v27.8h, v15.8h, v0.h[1] // .............e.............................................................'.............~............................................................. + // mul v24.8h, v15.8h, v0.h[0] // ..............e............................................................'..............~............................................................ + // mls v24.8h, v27.8h, v7.h[0] // ........................e..................................................'........................~.................................................. + // sub v15.8h, v11.8h, v24.8h // ..............................e............................................'..............................~............................................ + // add v11.8h, v11.8h, v24.8h // ...............................e...........................................'...............................~........................................... 
+ // sqrdmulh v27.8h, v10.8h, v0.h[3] // ........................................................e..................'........................................................~.................. + // mul v24.8h, v10.8h, v0.h[2] // ..........................................e................................'..........................................~................................ + // mls v24.8h, v27.8h, v7.h[0] // ................................................................e..........'................................................................~.......... + // sub v10.8h, v8.8h, v24.8h // ........................................................................e..'........................................................................~.. + // add v8.8h, v8.8h, v24.8h // .....................................................................e.....'.....................................................................~..... + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ....................................e......................................'....................................~...................................... + // mul v24.8h, v11.8h, v0.h[2] // .............................................e.............................'.............................................~............................. + // mls v24.8h, v27.8h, v7.h[0] // ....................................................e......................'....................................................~...................... + // sub v11.8h, v9.8h, v24.8h // ...........................................................e...............'...........................................................~............... + // add v9.8h, v9.8h, v24.8h // .............................................................e.............'.............................................................~............. 
+ // sqrdmulh v27.8h, v14.8h, v0.h[5] // ........................................e..................................'........................................~.................................. + // mul v24.8h, v14.8h, v0.h[4] // .......................................e...................................'.......................................~................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................e............................'..............................................~............................ + // sub v14.8h, v12.8h, v24.8h // ...............................................................e...........'...............................................................~........... + // add v12.8h, v12.8h, v24.8h // ...~.......................................................................'...*....................................................................... + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ......................................e....................................'......................................~.................................... + // mul v24.8h, v15.8h, v0.h[4] // .....................................e.....................................'.....................................~..................................... + // mls v24.8h, v27.8h, v7.h[0] // ...........................................e...............................'...........................................~............................... + // sub v15.8h, v13.8h, v24.8h // ..................................................e........................'..................................................~........................ + // add v13.8h, v13.8h, v24.8h // ...................................................e.......................'...................................................~....................... 
+ // sqrdmulh v27.8h, v9.8h, v0.h[7] // .................................................................e.........'.................................................................~......... + // mul v24.8h, v9.8h, v0.h[6] // ..................................................................e........'..................................................................~........ + // mls v24.8h, v27.8h, v7.h[0] // .........................................................................e.'.........................................................................~. + // sub v9.8h, v8.8h, v24.8h // .......~...................................................................'.......*................................................................... + // add v8.8h, v8.8h, v24.8h // ........~..................................................................'........*.................................................................. + // sqrdmulh v27.8h, v11.8h, v1.h[1] // ...................................................................e.......'...................................................................~....... + // mul v24.8h, v11.8h, v1.h[0] // ....................................................................e......'....................................................................~...... + // mls v24.8h, v27.8h, v7.h[0] // .~.........................................................................'.*......................................................................... + // sub v11.8h, v10.8h, v24.8h // ...............................................~...........................'...............................................*........................... + // add v10.8h, v10.8h, v24.8h // ...........~...............................................................'...........*............................................................... 
+ // sqrdmulh v27.8h, v13.8h, v1.h[3] // ..........................................................e................'..........................................................~................ + // mul v24.8h, v13.8h, v1.h[2] // .........................................................e.................'.........................................................~................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................................................................*........................................................................... + // sub v13.8h, v12.8h, v24.8h // ............................~..............................................'............................*.............................................. + // add v12.8h, v12.8h, v24.8h // .........~.................................................................'.........*................................................................. + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ......................................................e....................'......................................................~.................... + // mul v24.8h, v15.8h, v1.h[4] // .......................................................e...................'.......................................................~................... + // mls v24.8h, v27.8h, v7.h[0] // ..............................................................e............'..............................................................~............ + // sub v15.8h, v14.8h, v24.8h // ......................................................................e....'......................................................................~.... + // add v14.8h, v14.8h, v24.8h // ..~........................................................................'..*........................................................................ 
+ // str q8, [x0], #(16) // .......................................................................~...'.......................................................................*... + // str q9, [x0, #(-16 + 1*(512/8))] // .......................~...................................................'.......................*................................................... + // str q10, [x0, #(-16 + 2*(512/8))] // ..........................................................................~'..........................................................................* + // str q11, [x0, #(-16 + 3*(512/8))] // ............................................................~..............'............................................................*.............. + // str q12, [x0, #(-16 + 4*(512/8))] // .....................................................~.....................'.....................................................*..................... + // str q13, [x0, #(-16 + 5*(512/8))] // ................................~..........................................'................................*.......................................... + // str q14, [x0, #(-16 + 6*(512/8))] // ..........~................................................................'..........*................................................................ + // str q15, [x0, #(-16 + 7*(512/8))] // ...............~...........................................................'...............*........................................................... sub count, count, #1 cbnz count, layer123_start - sub v26.8H, v31.8H, v10.8H // *........................................................ - sqrdmulh v27.8H, v8.8H, v0.H[1] // .*....................................................... 
+ // Instructions: 18 + // Expected cycles: 9 + // Expected IPC: 2.00 + // + // Cycle bound: 9.0 + // IPC bound: 2.00 + // + // Wall time: 0.11s + // User time: 0.11s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + sub v21.8H, v29.8H, v26.8H // ....*......................... + mls v14.8H, v28.8H, v7.H[0] // .*............................ + str q15, [x0, #448] // .........*.................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + add v28.8H, v25.8H, v16.8H // ..*........................... + // gap // .............................. + mls v2.8H, v18.8H, v7.H[0] // *............................. + str q21, [x0, #64] // ..........*................... + add v21.8H, v29.8H, v26.8H // .....*........................ + // gap // .............................. + str q28, [x0, #384] // .......*...................... + sub v20.8H, v6.8H, v14.8H // .............*................ + add v28.8H, v17.8H, v31.8H // ...*.......................... + // gap // .............................. + str q21, [x0], #(16) // ................*............. + add v21.8H, v6.8H, v14.8H // ........*..................... + // gap // .............................. + // gap // .............................. + str q20, [x0, #176] // ...............*.............. + sub v20.8H, v28.8H, v2.8H // ...........*.................. + // gap // .............................. + // gap // .............................. + add v28.8H, v28.8H, v2.8H // ......*....................... + str q21, [x0, #112] // .................*............ + // gap // .............................. + // gap // .............................. + str q20, [x0, #304] // ............*................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q28, [x0, #240] // ..............*............... 
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // mls v2.8H, v18.8H, v7.H[0] // ....*.......................... + // mls v14.8H, v28.8H, v7.H[0] // .*............................. + // add v16.8H, v25.8H, v16.8H // ...*........................... + // add v13.8H, v17.8H, v31.8H // .........*..................... + // sub v11.8H, v29.8H, v26.8H // *.............................. + // add v21.8H, v29.8H, v26.8H // ......*........................ + // add v27.8H, v13.8H, v2.8H // ..............*................ + // str q16, [x0, #384] // .......*....................... + // add v24.8H, v6.8H, v14.8H // ...........*................... + // str q15, [x0, #448] // ..*............................ + // str q11, [x0, #64] // .....*......................... + // sub v11.8H, v13.8H, v2.8H // .............*................. + // str q11, [x0, #320] // ................*.............. + // sub v14.8H, v6.8H, v14.8H // ........*...................... + // str q27, [x0, #256] // .................*............. + // str q14, [x0, #192] // ............*.................. + // str q21, [x0], #(16) // ..........*.................... + // str q24, [x0, #112] // ...............*............... + + + restore inp, STACK0 + mov count, #8 + + .p2align 2 + // Instructions: 57 + // Expected cycles: 40 + // Expected IPC: 1.43 + // + // Cycle bound: 40.0 + // IPC bound: 1.43 + // + // Wall time: 2.28s + // User time: 2.28s + // + // ------------------ original position -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + ldr q4, [x3], #16 // ..*...................................................... + ldr q15, [x1, #48] // ...*..................................................... // gap // ......................................................... 
// gap // ......................................................... - mls v28.8H, v20.8H, v7.H[0] // ..*...................................................... - add v6.8H, v31.8H, v10.8H // ...*..................................................... + ldr q6, [x1, #16] // .............*........................................... // gap // ......................................................... // gap // ......................................................... - mul v14.8H, v26.8H, v0.H[4] // ....*.................................................... - sqrdmulh v12.8H, v26.8H, v0.H[5] // .....*................................................... // gap // ......................................................... + ldr q13, [x1, #32] // .*....................................................... // gap // ......................................................... - mls v29.8H, v27.8H, v7.H[0] // .......*................................................. - sqrdmulh v3.8H, v6.8H, v0.H[3] // ......*.................................................. // gap // ......................................................... // gap // ......................................................... - mul v18.8H, v6.8H, v0.H[2] // ........*................................................ - add v17.8H, v16.8H, v28.8H // .........*............................................... + ldr q28, [x1, #0] // ......*.................................................. // gap // ......................................................... // gap // ......................................................... - sub v10.8H, v16.8H, v28.8H // ...........*............................................. - mls v14.8H, v12.8H, v7.H[0] // ..........*.............................................. // gap // ......................................................... + sqrdmulh v25.8H, v15.8H, v4.H[1] // .........*............................................... 
+ mul v10.8H, v15.8H, v4.H[0] // ..........*.............................................. + ldr q3, [x4, #48] // .......*................................................. // gap // ......................................................... - sub v9.8H, v24.8H, v29.8H // ............*............................................ - mul v31.8H, v17.8H, v0.H[2] // .............*........................................... // gap // ......................................................... // gap // ......................................................... - sqrdmulh v11.8H, v10.8H, v0.H[5] // ..........................*.............................. - mls v18.8H, v3.8H, v7.H[0] // ..............*.......................................... + ldr q20, [x4, #16] // ....*.................................................... // gap // ......................................................... + ldr q12, [x4], #(6*16) // .....*................................................... + mul v1.8H, v13.8H, v4.H[0] // ...........*............................................. + sqrdmulh v30.8H, v13.8H, v4.H[1] // ............*............................................ // gap // ......................................................... - add v2.8H, v9.8H, v14.8H // ..................*...................................... - mul v5.8H, v10.8H, v0.H[4] // ...............*......................................... + mls v10.8H, v25.8H, v7.H[0] // ..............*.......................................... + ldr q13, [x4, #-16] // *........................................................ // gap // ......................................................... // gap // ......................................................... - add v8.8H, v24.8H, v29.8H // ................*........................................ - sub v12.8H, v9.8H, v14.8H // .................*....................................... // gap // ......................................................... 
// gap // ......................................................... - mul v22.8H, v2.8H, v1.H[2] // ..................................*...................... - sqrdmulh v19.8H, v2.8H, v1.H[3] // ....................................*.................... // gap // ......................................................... // gap // ......................................................... - sqrdmulh v20.8H, v12.8H, v1.H[5] // ......................*.................................. - mls v5.8H, v11.8H, v7.H[0] // ...............................*......................... // gap // ......................................................... + mls v1.8H, v30.8H, v7.H[0] // ...............*......................................... // gap // ......................................................... - mul v27.8H, v12.8H, v1.H[4] // ....................*.................................... - sub v4.8H, v8.8H, v18.8H // ...................*..................................... // gap // ......................................................... + sub v15.8H, v6.8H, v10.8H // ................*........................................ // gap // ......................................................... - sqrdmulh v26.8H, v17.8H, v0.H[3] // .....................*................................... - mls v22.8H, v19.8H, v7.H[0] // ..........................................*.............. // gap // ......................................................... // gap // ......................................................... - mul v25.8H, v4.8H, v1.H[0] // .........................*............................... - add v11.8H, v15.8H, v5.8H // ............................................*............ // gap // ......................................................... // gap // ......................................................... - mls v27.8H, v20.8H, v7.H[0] // .................................*....................... 
- add v8.8H, v8.8H, v18.8H // .......................*................................. // gap // ......................................................... + add v26.8H, v6.8H, v10.8H // ..................*...................................... + sqrdmulh v10.8H, v15.8H, v4.H[5] // ....................*.................................... + mul v27.8H, v15.8H, v4.H[4] // ...................*..................................... // gap // ......................................................... - sub v12.8H, v11.8H, v22.8H // ..................................................*...... - sqrdmulh v2.8H, v4.8H, v1.H[1] // ........................*................................ // gap // ......................................................... + mul v11.8H, v26.8H, v4.H[2] // ......................*.................................. // gap // ......................................................... - mul v30.8H, v8.8H, v0.H[6] // ............................*............................ - mls v31.8H, v26.8H, v7.H[0] // ...........................*............................. // gap // ......................................................... + sqrdmulh v15.8H, v26.8H, v4.H[3] // .....................*................................... + sub v16.8H, v28.8H, v1.8H // .......................*................................. + add v14.8H, v28.8H, v1.8H // .................*....................................... // gap // ......................................................... - sub v24.8H, v15.8H, v5.8H // .....................................*................... - str q12, [x0, #320] // ......................................................*.. - sqrdmulh v18.8H, v8.8H, v0.H[7] // .............................*........................... // gap // ......................................................... - mls v25.8H, v2.8H, v7.H[0] // ..............................*.......................... 
- add v6.8H, v11.8H, v22.8H // ....................................................*.... + mls v27.8H, v10.8H, v7.H[0] // ........................*................................ // gap // ......................................................... // gap // ......................................................... - add v22.8H, v24.8H, v27.8H // ................................................*........ - sub v12.8H, v23.8H, v31.8H // ................................*........................ // gap // ......................................................... // gap // ......................................................... - sub v9.8H, v24.8H, v27.8H // ...........................................*............. - mls v30.8H, v18.8H, v7.H[0] // ......................................*.................. - str q6, [x0, #256] // .......................................................*. // gap // ......................................................... - add v21.8H, v23.8H, v31.8H // .......................................*................. - add v26.8H, v12.8H, v25.8H // .........................................*............... - str q22, [x0, #384] // ...................................................*..... // gap // ......................................................... - sub v8.8H, v12.8H, v25.8H // ...................................*..................... - str q9, [x0, #448] // ...............................................*......... + mls v11.8H, v15.8H, v7.H[0] // .........................*............................... // gap // ......................................................... // gap // ......................................................... - str q26, [x0, #128] // .............................................*........... - add v13.8H, v21.8H, v30.8H // ..............................................*.......... - sub v12.8H, v21.8H, v30.8H // .....................................................*... 
// gap // ......................................................... - str q8, [x0, #192] // ........................................*................ // gap // ......................................................... + add v30.8H, v16.8H, v27.8H // ..........................*.............................. + sub v0.8H, v16.8H, v27.8H // ...........................*............................. // gap // ......................................................... // gap // ......................................................... - str q13, [x0], #(16) // .................................................*....... + add v15.8H, v14.8H, v11.8H // .............................*........................... // gap // ......................................................... // gap // ......................................................... + sub v19.8H, v14.8H, v11.8H // ............................*............................ + trn1 v18.4S, v30.4S, v0.4S // ..............................*.......................... // gap // ......................................................... - str q12, [x0, #48] // ........................................................* // gap // ......................................................... // gap // ......................................................... + trn2 v22.4S, v30.4S, v0.4S // ...............................*......................... // gap // ......................................................... + // gap // ......................................................... + trn2 v17.4S, v15.4S, v19.4S // ................................*........................ + // gap // ......................................................... + trn1 v23.4S, v15.4S, v19.4S // .................................*....................... + // gap // ......................................................... + // gap // ......................................................... 
+ trn1 v21.2D, v17.2D, v22.2D // .........................................*............... + // gap // ......................................................... + // gap // ......................................................... + trn2 v26.2D, v17.2D, v22.2D // ..................................*...................... + trn2 v19.2D, v23.2D, v18.2D // ...................................*..................... + trn1 v2.2D, v23.2D, v18.2D // ..................................................*...... + // gap // ......................................................... + // gap // ......................................................... + mul v0.8H, v26.8H, v12.8H // .....................................*................... + // gap // ......................................................... + ldr q22, [x4, #-64] // ........*................................................ + sqrdmulh v5.8H, v26.8H, v20.8H // ....................................*.................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v30.8H, v19.8H, v20.8H // ......................................*.................. + mul v15.8H, v19.8H, v12.8H // ........................................*................ + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v0.8H, v5.8H, v7.H[0] // .......................................*................. + ldr q9, [x4, #-32] // ..........................................*.............. 
+ // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mls v15.8H, v30.8H, v7.H[0] // .............................................*........... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sub v12.8H, v21.8H, v0.8H // ...........................................*............. + add v24.8H, v21.8H, v0.8H // ............................................*............ + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sqrdmulh v21.8H, v12.8H, v13.8H // ...............................................*......... + // gap // ......................................................... + // gap // ......................................................... + mul v17.8H, v12.8H, v9.8H // ..............................................*.......... + sqrdmulh v10.8H, v24.8H, v3.8H // .................................................*....... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + mul v27.8H, v24.8H, v22.8H // ................................................*........ + // gap // ......................................................... + // gap // ......................................................... 
+ // gap // ......................................................... + add v26.8H, v2.8H, v15.8H // ......................................................*.. + // gap // ......................................................... + // gap // ......................................................... + mls v17.8H, v21.8H, v7.H[0] // ...................................................*..... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sub v0.8H, v2.8H, v15.8H // .....................................................*... + mls v27.8H, v10.8H, v7.H[0] // ....................................................*.... + // gap // ......................................................... + // gap // ......................................................... + // gap // ......................................................... + sub v16.8H, v0.8H, v17.8H // ........................................................* + // gap // ......................................................... + // gap // ......................................................... + add v24.8H, v0.8H, v17.8H // .......................................................*. - // original source code - // sub v27.8H, v31.8H, v10.8H // *........................................................ - // sqrdmulh v25.8H, v8.8H, v0.H[1] // .*....................................................... - // mls v28.8H, v20.8H, v7.H[0] // ..*...................................................... - // add v17.8H, v31.8H, v10.8H // ...*..................................................... - // mul v2.8H, v27.8H, v0.H[4] // ....*.................................................... - // sqrdmulh v18.8H, v27.8H, v0.H[5] // .....*................................................... 
- // sqrdmulh v27.8H, v17.8H, v0.H[3] // .......*................................................. - // mls v29.8H, v25.8H, v7.H[0] // ......*.................................................. - // mul v9.8H, v17.8H, v0.H[2] // ........*................................................ - // add v11.8H, v16.8H, v28.8H // .........*............................................... - // mls v2.8H, v18.8H, v7.H[0] // ...........*............................................. - // sub v17.8H, v16.8H, v28.8H // ..........*.............................................. - // sub v18.8H, v24.8H, v29.8H // ............*............................................ - // mul v26.8H, v11.8H, v0.H[2] // .............*........................................... - // mls v9.8H, v27.8H, v7.H[0] // ...............*......................................... - // mul v20.8H, v17.8H, v0.H[4] // .................*....................................... - // add v10.8H, v24.8H, v29.8H // ..................*...................................... - // sub v28.8H, v18.8H, v2.8H // ...................*..................................... - // add v29.8H, v18.8H, v2.8H // ................*........................................ - // sub v27.8H, v10.8H, v9.8H // .........................*............................... - // mul v25.8H, v28.8H, v1.H[4] // ........................*................................ - // sqrdmulh v30.8H, v11.8H, v0.H[3] // ..........................*.............................. - // sqrdmulh v18.8H, v28.8H, v1.H[5] // ......................*.................................. - // add v11.8H, v10.8H, v9.8H // ...............................*......................... - // sqrdmulh v2.8H, v27.8H, v1.H[1] // .................................*....................... - // mul v19.8H, v27.8H, v1.H[0] // ............................*............................ - // sqrdmulh v14.8H, v17.8H, v0.H[5] // ..............*.......................................... 
- // mls v26.8H, v30.8H, v7.H[0] // ...................................*..................... - // mul v6.8H, v11.8H, v0.H[6] // ..................................*...................... - // sqrdmulh v16.8H, v11.8H, v0.H[7] // ......................................*.................. - // mls v19.8H, v2.8H, v7.H[0] // .......................................*................. - // mls v20.8H, v14.8H, v7.H[0] // .......................*................................. - // sub v2.8H, v23.8H, v26.8H // ..........................................*.............. - // mls v25.8H, v18.8H, v7.H[0] // ..............................*.......................... - // mul v18.8H, v29.8H, v1.H[2] // ....................*.................................... - // sub v24.8H, v2.8H, v19.8H // .................................................*....... - // sqrdmulh v4.8H, v29.8H, v1.H[3] // .....................*................................... - // sub v27.8H, v15.8H, v20.8H // ....................................*.................... - // mls v6.8H, v16.8H, v7.H[0] // ............................................*............ - // add v29.8H, v23.8H, v26.8H // ..............................................*.......... - // str q24, [x0, #192] // ......................................................*.. - // add v24.8H, v2.8H, v19.8H // ...............................................*......... - // mls v18.8H, v4.8H, v7.H[0] // ...........................*............................. - // sub v11.8H, v27.8H, v25.8H // ...........................................*............. - // add v2.8H, v15.8H, v20.8H // .............................*........................... - // str q24, [x0, #128] // ...................................................*..... - // add v4.8H, v29.8H, v6.8H // ....................................................*.... - // str q11, [x0, #448] // ..................................................*...... 
- // add v11.8H, v27.8H, v25.8H // .........................................*............... - // str q4, [x0], #(16) // .......................................................*. - // sub v15.8H, v2.8H, v18.8H // ................................*........................ - // str q11, [x0, #368] // ................................................*........ - // add v30.8H, v2.8H, v18.8H // ........................................*................ - // sub v11.8H, v29.8H, v6.8H // .....................................................*... - // str q15, [x0, #304] // .....................................*................... - // str q30, [x0, #240] // .............................................*........... - // str q11, [x0, #48] // ........................................................* - - - restore inp, STACK0 - mov count, #8 - - .p2align 2 - ldr q0, [x3], #16 // *.......................................................... - // gap // ........................................................... - // gap // ........................................................... - ldr q8, [x1, #48] // .*......................................................... - ldr q21, [x4, #80] // ...*....................................................... - ldr q19, [x1, #32] // ..*........................................................ - // gap // ........................................................... - // gap // ........................................................... - ldr q5, [x4, #16] // ........*.................................................. - ldr q27, [x4, #32] // .....*..................................................... - // gap // ........................................................... - // gap // ........................................................... - ldr q9, [x1, #0] // ............*.............................................. - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v14.8H, v8.8H, v0.H[1] // .........*................................................. - mul v30.8H, v8.8H, v0.H[0] // ..........*................................................ - ldr q12, [x1, #16] // ...........*............................................... - // gap // ........................................................... - sqrdmulh v20.8H, v19.8H, v0.H[1] // ...............*........................................... - ldr q16, [x4, #64] // ....*...................................................... - // gap // ........................................................... - // gap // ........................................................... - mul v17.8H, v19.8H, v0.H[0] // ..............*............................................ - ldr q2, [x4, #48] // ......*.................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v30.8H, v14.8H, v7.H[0] // .............*............................................. - ldr q3, [x4], #(6*16) // .......*................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v17.8H, v20.8H, v7.H[0] // .................*......................................... - // gap // ........................................................... - // gap // ........................................................... 
- // gap // ........................................................... - sub v1.8H, v12.8H, v30.8H // ................*.......................................... - add v18.8H, v12.8H, v30.8H // ..................*........................................ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v14.8H, v1.8H, v0.H[5] // ...................*....................................... - mul v22.8H, v1.8H, v0.H[4] // ....................*...................................... - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v15.8H, v18.8H, v0.H[3] // .....................*..................................... - mul v28.8H, v18.8H, v0.H[2] // ......................*.................................... - // gap // ........................................................... - // gap // ........................................................... - sub v25.8H, v9.8H, v17.8H // .......................*................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v22.8H, v14.8H, v7.H[0] // ........................*.................................. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... 
- mls v28.8H, v15.8H, v7.H[0] // .........................*................................. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v14.8H, v9.8H, v17.8H // ..........................*................................ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v24.8H, v25.8H, v22.8H // ...........................*............................... - sub v6.8H, v25.8H, v22.8H // ............................*.............................. - // gap // ........................................................... - // gap // ........................................................... - sub v11.8H, v14.8H, v28.8H // .............................*............................. - add v22.8H, v14.8H, v28.8H // ..............................*............................ - // gap // ........................................................... - // gap // ........................................................... - trn1 v18.4S, v24.4S, v6.4S // .................................*......................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - trn2 v0.4S, v24.4S, v6.4S // ...............................*........................... - trn2 v19.4S, v22.4S, v11.4S // ................................*.......................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... 
- trn1 v28.4S, v22.4S, v11.4S // ..................................*........................ - // gap // ........................................................... - // gap // ........................................................... - trn2 v13.2D, v19.2D, v0.2D // ....................................*...................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - trn1 v14.2D, v19.2D, v0.2D // ...................................*....................... - trn2 v23.2D, v28.2D, v18.2D // ......................................*.................... - // gap // ........................................................... - // gap // ........................................................... - mul v0.8H, v13.8H, v3.8H // .......................................*................... - sqrdmulh v29.8H, v13.8H, v5.8H // ........................................*.................. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mul v1.8H, v23.8H, v3.8H // ..........................................*................ - sqrdmulh v31.8H, v23.8H, v5.8H // .........................................*................. - // gap // ........................................................... - // gap // ........................................................... - mls v0.8H, v29.8H, v7.H[0] // ...........................................*............... - // gap // ........................................................... 
- // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - trn1 v10.2D, v28.2D, v18.2D // .....................................*..................... - mls v1.8H, v31.8H, v7.H[0] // ............................................*.............. - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - sub v26.8H, v14.8H, v0.8H // ..............................................*............ - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v20.8H, v14.8H, v0.8H // .............................................*............. - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v31.8H, v26.8H, v21.8H // ..................................................*........ - mul v24.8H, v26.8H, v16.8H // .................................................*......... - // gap // ........................................................... - // gap // ........................................................... - sqrdmulh v15.8H, v20.8H, v2.8H // ...............................................*........... - mul v20.8H, v20.8H, v27.8H // ................................................*.......... - // gap // ........................................................... 
- // gap // ........................................................... - sub v3.8H, v10.8H, v1.8H // .....................................................*..... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v24.8H, v31.8H, v7.H[0] // ......................................................*.... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - mls v20.8H, v15.8H, v7.H[0] // ....................................................*...... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v19.8H, v10.8H, v1.8H // ...................................................*....... - // gap // ........................................................... - // gap // ........................................................... - // gap // ........................................................... - add v11.8H, v3.8H, v24.8H // .........................................................*. - sub v9.8H, v3.8H, v24.8H // ..........................................................* - // gap // ........................................................... - // gap // ........................................................... - sub v22.8H, v19.8H, v20.8H // .......................................................*... - add v2.8H, v19.8H, v20.8H // ........................................................*.. - // gap // ........................................................... - // gap // ........................................................... 
- - // original source code - // ldr q13, [x3], #16 // *.......................................................... - // ldr q6, [x1, #48] // .*......................................................... - // ldr q31, [x1, #32] // ...*....................................................... - // ldr q28, [x4, #80] // ..*........................................................ - // ldr q0, [x4, #64] // ...........*............................................... - // ldr q3, [x4, #32] // .....*..................................................... - // ldr q30, [x4, #48] // .............*............................................. - // ldr q10, [x4], #(6*16) // ...............*........................................... - // ldr q29, [x4, #-80] // ....*...................................................... - // sqrdmulh v21.8H, v6.8H, v13.H[1] // .......*................................................... - // mul v4.8H, v6.8H, v13.H[0] // ........*.................................................. - // ldr q24, [x1, #16] // .........*................................................. - // ldr q15, [x1, #0] // ......*.................................................... - // mls v4.8H, v21.8H, v7.H[0] // ..............*............................................ - // mul v21.8H, v31.8H, v13.H[0] // ............*.............................................. - // sqrdmulh v20.8H, v31.8H, v13.H[1] // ..........*................................................ - // sub v11.8H, v24.8H, v4.8H // .................*......................................... - // mls v21.8H, v20.8H, v7.H[0] // ................*.......................................... - // add v2.8H, v24.8H, v4.8H // ..................*........................................ - // sqrdmulh v9.8H, v11.8H, v13.H[5] // ...................*....................................... - // mul v11.8H, v11.8H, v13.H[4] // ....................*...................................... 
- // sqrdmulh v20.8H, v2.8H, v13.H[3] // .....................*..................................... - // mul v4.8H, v2.8H, v13.H[2] // ......................*.................................... - // sub v23.8H, v15.8H, v21.8H // .......................*................................... - // mls v11.8H, v9.8H, v7.H[0] // ........................*.................................. - // mls v4.8H, v20.8H, v7.H[0] // .........................*................................. - // add v13.8H, v15.8H, v21.8H // ..........................*................................ - // add v2.8H, v23.8H, v11.8H // ...........................*............................... - // sub v9.8H, v23.8H, v11.8H // ............................*.............................. - // sub v21.8H, v13.8H, v4.8H // .............................*............................. - // add v11.8H, v13.8H, v4.8H // ..............................*............................ - // trn2 v12.4S, v2.4S, v9.4S // ................................*.......................... - // trn2 v4.4S, v11.4S, v21.4S // .................................*......................... - // trn1 v20.4S, v2.4S, v9.4S // ...............................*........................... - // trn1 v16.4S, v11.4S, v21.4S // ..................................*........................ - // trn1 v2.2D, v4.2D, v12.2D // ....................................*...................... - // trn2 v24.2D, v4.2D, v12.2D // ...................................*....................... - // trn1 v21.2D, v16.2D, v20.2D // ...........................................*............... - // trn2 v4.2D, v16.2D, v20.2D // .....................................*..................... - // mul v11.8H, v24.8H, v10.8H // ......................................*.................... - // sqrdmulh v20.8H, v24.8H, v29.8H // .......................................*................... 
- // sqrdmulh v9.8H, v4.8H, v29.8H // .........................................*................. - // mul v29.8H, v4.8H, v10.8H // ........................................*.................. - // mls v11.8H, v20.8H, v7.H[0] // ..........................................*................ - // mls v29.8H, v9.8H, v7.H[0] // ............................................*.............. - // add v9.8H, v2.8H, v11.8H // ..............................................*............ - // sub v11.8H, v2.8H, v11.8H // .............................................*............. - // sqrdmulh v30.8H, v9.8H, v30.8H // .................................................*......... - // mul v13.8H, v9.8H, v3.8H // ..................................................*........ - // mul v20.8H, v11.8H, v0.8H // ................................................*.......... - // sqrdmulh v9.8H, v11.8H, v28.8H // ...............................................*........... - // add v11.8H, v21.8H, v29.8H // ......................................................*.... - // mls v13.8H, v30.8H, v7.H[0] // .....................................................*..... - // sub v27.8H, v21.8H, v29.8H // ...................................................*....... - // mls v20.8H, v9.8H, v7.H[0] // ....................................................*...... - // sub v22.8H, v11.8H, v13.8H // .........................................................*. - // add v2.8H, v11.8H, v13.8H // ..........................................................* - // add v11.8H, v27.8H, v20.8H // .......................................................*... - // sub v9.8H, v27.8H, v20.8H // ........................................................*.. + // --------------------- new position ---------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // ldr q9, [x4, #80] // .............*........................................... 
+ // ldr q12, [x1, #32] // ...*..................................................... + // ldr q5, [x3], #16 // *........................................................ + // ldr q25, [x1, #48] // .*....................................................... + // ldr q31, [x4, #16] // ........*................................................ + // ldr q22, [x4], #(6*16) // .........*............................................... + // ldr q11, [x1, #0] // ....*.................................................... + // ldr q13, [x4, #-48] // .......*................................................. + // ldr q4, [x4, #-64] // ......................................*.................. + // sqrdmulh v26.8H, v25.8H, v5.H[1] // .....*................................................... + // mul v30.8H, v25.8H, v5.H[0] // ......*.................................................. + // mul v19.8H, v12.8H, v5.H[0] // ..........*.............................................. + // sqrdmulh v29.8H, v12.8H, v5.H[1] // ...........*............................................. + // ldr q0, [x1, #16] // ..*...................................................... + // mls v30.8H, v26.8H, v7.H[0] // ............*............................................ + // mls v19.8H, v29.8H, v7.H[0] // ..............*.......................................... + // sub v16.8H, v0.8H, v30.8H // ...............*......................................... + // add v12.8H, v11.8H, v19.8H // ......................*.................................. + // add v15.8H, v0.8H, v30.8H // ................*........................................ + // mul v8.8H, v16.8H, v5.H[4] // ..................*...................................... + // sqrdmulh v26.8H, v16.8H, v5.H[5] // .................*....................................... + // sqrdmulh v1.8H, v15.8H, v5.H[3] // ....................*.................................... 
+ // mul v15.8H, v15.8H, v5.H[2] // ...................*..................................... + // sub v21.8H, v11.8H, v19.8H // .....................*................................... + // mls v8.8H, v26.8H, v7.H[0] // .......................*................................. + // mls v15.8H, v1.8H, v7.H[0] // ........................*................................ + // add v17.8H, v21.8H, v8.8H // .........................*............................... + // sub v16.8H, v21.8H, v8.8H // ..........................*.............................. + // sub v28.8H, v12.8H, v15.8H // ............................*............................ + // add v24.8H, v12.8H, v15.8H // ...........................*............................. + // trn1 v26.4S, v17.4S, v16.4S // .............................*........................... + // trn2 v15.4S, v17.4S, v16.4S // ..............................*.......................... + // trn2 v11.4S, v24.4S, v28.4S // ...............................*......................... + // trn1 v14.4S, v24.4S, v28.4S // ................................*........................ + // trn2 v23.2D, v11.2D, v15.2D // ..................................*...................... + // trn2 v10.2D, v14.2D, v26.2D // ...................................*..................... + // sqrdmulh v16.8H, v23.8H, v31.8H // .......................................*................. + // mul v28.8H, v23.8H, v22.8H // .....................................*................... + // sqrdmulh v12.8H, v10.8H, v31.8H // ........................................*................ + // mls v28.8H, v16.8H, v7.H[0] // ..........................................*.............. + // mul v0.8H, v10.8H, v22.8H // .........................................*............... + // trn1 v3.2D, v11.2D, v15.2D // .................................*....................... + // ldr q11, [x4, #-32] // ...........................................*............. 
+ // sub v21.8H, v3.8H, v28.8H // .............................................*........... + // add v6.8H, v3.8H, v28.8H // ..............................................*.......... + // mls v0.8H, v12.8H, v7.H[0] // ............................................*............ + // mul v15.8H, v21.8H, v11.8H // ................................................*........ + // sqrdmulh v16.8H, v21.8H, v9.8H // ...............................................*......... + // mul v27.8H, v6.8H, v4.8H // ..................................................*...... + // sqrdmulh v6.8H, v6.8H, v13.8H // .................................................*....... + // trn1 v2.2D, v14.2D, v26.2D // ....................................*.................... + // mls v15.8H, v16.8H, v7.H[0] // ....................................................*.... + // mls v27.8H, v6.8H, v7.H[0] // ......................................................*.. + // sub v16.8H, v2.8H, v0.8H // .....................................................*... + // add v26.8H, v2.8H, v0.8H // ...................................................*..... + // add v24.8H, v16.8H, v15.8H // ........................................................* + // sub v16.8H, v16.8H, v15.8H // .......................................................*. sub count, count, #1 layer4567_start: - trn1 v27.4S, v2.4S, v22.4S // ...........................................................*............................... - trn2 v18.4S, v2.4S, v22.4S // ............................................................*.............................. - ldr q13, [x3], #16 // ....e...................................................................................... - ldr q6, [x1, #112] // ...e....................................................................................... - ldr q31, [x1, #96] // ..e........................................................................................ 
- trn2 v5.4S, v11.4S, v9.4S // ..............................................................*............................ - trn1 v22.4S, v11.4S, v9.4S // .............................................................*............................. - ldr q28, [x4, #80] // ......................................e.................................................... - sqdmulh v9.8H, v18.8H, v7.H[1] // ..................................................................*........................ - sqdmulh v2.8H, v27.8H, v7.H[1] // ...............................................................*........................... - ldr q0, [x4, #64] // .....................................e..................................................... - ldr q3, [x4, #32] // ...................................e....................................................... - sqdmulh v23.8H, v5.8H, v7.H[1] // ........................................................................*.................. - ldr q30, [x4, #48] // ....................................e...................................................... - ldr q10, [x4], #(6*16) // .................................e......................................................... - sqdmulh v11.8H, v22.8H, v7.H[1] // .....................................................................*..................... - ldr q29, [x4, #-80] // ..................................e........................................................ - sqrdmulh v21.8H, v6.8H, v13.H[1] // ...........e............................................................................... - mul v4.8H, v6.8H, v13.H[0] // ..........e................................................................................ - ldr q24, [x1, #80] // .e......................................................................................... - ldr q15, [x1, #64] // e.......................................................................................... 
- srshr v20.8H, v9.8H, #11 // ...................................................................*....................... - srshr v2.8H, v2.8H, #11 // ................................................................*.......................... - // gap // ........................................................................................... - srshr v9.8H, v23.8H, #11 // .........................................................................*................. - srshr v25.8H, v11.8H, #11 // ......................................................................*.................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v4.8H, v21.8H, v7.H[0] // ............e.............................................................................. - mul v21.8H, v31.8H, v13.H[0] // .....e..................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v18.8H, v20.8H, v7.H[0] // ....................................................................*...................... - sqrdmulh v20.8H, v31.8H, v13.H[1] // ......e.................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v5.8H, v9.8H, v7.H[0] // ..........................................................................*................ - mls v22.8H, v25.8H, v7.H[0] // .......................................................................*................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - mls v27.8H, v2.8H, v7.H[0] // .................................................................*......................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v11.8H, v24.8H, v4.8H // .............e............................................................................. - mls v21.8H, v20.8H, v7.H[0] // .......e................................................................................... - // gap // ........................................................................................... - add v2.8H, v24.8H, v4.8H // ..............e............................................................................ - // gap // ........................................................................................... - sqrdmulh v9.8H, v11.8H, v13.H[5] // .....................e..................................................................... - mul v11.8H, v11.8H, v13.H[4] // ....................e...................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sqrdmulh v20.8H, v2.8H, v13.H[3] // ................e.......................................................................... - mul v4.8H, v2.8H, v13.H[2] // ...............e........................................................................... - // gap // ........................................................................................... 
- // gap // ........................................................................................... - umov x25, v27.d[0] // ...........................................................................*............... - sub v23.8H, v15.8H, v21.8H // ........e.................................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v11.8H, v9.8H, v7.H[0] // ......................e.................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v4.8H, v20.8H, v7.H[0] // .................e......................................................................... - umov x15, v22.d[1] // ................................................................................*.......... - // gap // ........................................................................................... - // gap // ........................................................................................... - add v13.8H, v15.8H, v21.8H // .........e................................................................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - str x25, [x1], #( 16*4) // ...................................................................................*....... 
- add v2.8H, v23.8H, v11.8H // ........................e.................................................................. - sub v9.8H, v23.8H, v11.8H // .......................e................................................................... - // gap // ........................................................................................... - sub v21.8H, v13.8H, v4.8H // ..................e........................................................................ - add v11.8H, v13.8H, v4.8H // ...................e....................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - str x15, [x1, #-24] // ........................................................................................*.. - umov x15, v5.d[0] // .................................................................................*......... - trn2 v12.4S, v2.4S, v9.4S // ............................e.............................................................. - // gap // ........................................................................................... - trn2 v4.4S, v11.4S, v21.4S // ..........................e................................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v20.4S, v2.4S, v9.4S // ...........................e............................................................... - trn1 v16.4S, v11.4S, v21.4S // .........................e................................................................. 
- // gap // ........................................................................................... - // gap // ........................................................................................... - trn1 v2.2D, v4.2D, v12.2D // ................................e.......................................................... - trn2 v24.2D, v4.2D, v12.2D // ..............................e............................................................ - // gap // ........................................................................................... - // gap // ........................................................................................... - str x15, [x1, #-40] // ......................................................................................*.... - trn1 v21.2D, v16.2D, v20.2D // ...............................e........................................................... - trn2 v4.2D, v16.2D, v20.2D // .............................e............................................................. - // gap // ........................................................................................... - mul v11.8H, v24.8H, v10.8H // ............................................e.............................................. - sqrdmulh v20.8H, v24.8H, v29.8H // .............................................e............................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x10, v5.d[1] // ..................................................................................*........ - sqrdmulh v9.8H, v4.8H, v29.8H // ........................................e.................................................. - // gap // ........................................................................................... 
- // gap // ........................................................................................... - // gap // ........................................................................................... - mul v29.8H, v4.8H, v10.8H // .......................................e................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x15, v27.d[1] // ............................................................................*.............. - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v11.8H, v20.8H, v7.H[0] // ..............................................e............................................ - umov x19, v22.d[0] // ...............................................................................*........... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - mls v29.8H, v9.8H, v7.H[0] // .........................................e................................................. - str x10, [x1, #-8] // ..........................................................................................* - // gap // ........................................................................................... - // gap // ........................................................................................... - umov x10, v18.d[1] // ..............................................................................*............ 
- add v9.8H, v2.8H, v11.8H // ................................................e.......................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - str x15, [x1, #-32] // .......................................................................................*... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v11.8H, v2.8H, v11.8H // ...............................................e........................................... - str x19, [x1, #-56] // ....................................................................................*...... - sqrdmulh v30.8H, v9.8H, v30.8H // ..................................................e........................................ - mul v13.8H, v9.8H, v3.8H // .................................................e......................................... - // gap // ........................................................................................... - mul v20.8H, v11.8H, v0.8H // ......................................................e.................................... - sqrdmulh v9.8H, v11.8H, v28.8H // .......................................................e................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - add v11.8H, v21.8H, v29.8H // ...........................................e............................................... - umov x15, v18.d[0] // .............................................................................*............. 
- str x10, [x1, #-16] // .........................................................................................*. - // gap // ........................................................................................... - mls v13.8H, v30.8H, v7.H[0] // ...................................................e....................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v27.8H, v21.8H, v29.8H // ..........................................e................................................ - mls v20.8H, v9.8H, v7.H[0] // ........................................................e.................................. - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - // gap // ........................................................................................... - sub v22.8H, v11.8H, v13.8H // ....................................................e...................................... - str x15, [x1, #-48] // .....................................................................................*..... - add v2.8H, v11.8H, v13.8H // .....................................................e..................................... - // gap // ........................................................................................... 
- add v11.8H, v27.8H, v20.8H // ..........................................................e................................ - sub v9.8H, v27.8H, v20.8H // .........................................................e................................. - // gap // ........................................................................................... - // gap // ........................................................................................... + // Instructions: 91 + // Expected cycles: 41 + // Expected IPC: 2.22 + // + // Cycle bound: 40.0 + // IPC bound: 2.27 + // + // Wall time: 3601.55s + // User time: 3601.55s + // + // ----------------------------------- original position ------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--------------- + add v0.8H, v26.8H, v27.8H // .....................................................*..................................... + sub v11.8H, v26.8H, v27.8H // ....................................................*...................................... + ldr q9, [x4, #80] // ......................................e.................................................... + ldr q12, [x1, #96] // ..e........................................................................................ + trn2 v2.4S, v24.4S, v16.4S // ..............................................................*............................ + trn1 v6.4S, v24.4S, v16.4S // .............................................................*............................. + ldr q5, [x3], #16 // ....e...................................................................................... + ldr q25, [x1, #112] // ...e....................................................................................... + ldr q31, [x4, #16] // ..................................e........................................................ 
+ trn2 v27.4S, v0.4S, v11.4S // ............................................................*.............................. + trn1 v20.4S, v0.4S, v11.4S // ...........................................................*............................... + ldr q22, [x4], #(6*16) // .................................e......................................................... + ldr q11, [x1, #64] // e.......................................................................................... + sqdmulh v16.8H, v6.8H, v7.H[1] // .....................................................................*..................... + ldr q13, [x4, #-48] // ....................................e...................................................... + sqdmulh v15.8H, v2.8H, v7.H[1] // ........................................................................*.................. + ldr q4, [x4, #-64] // ...................................e....................................................... + // gap // ........................................................................................... + sqdmulh v0.8H, v20.8H, v7.H[1] // ...............................................................*........................... + sqdmulh v14.8H, v27.8H, v7.H[1] // ..................................................................*........................ + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v26.8H, v25.8H, v5.H[1] // ..........e................................................................................ + mul v30.8H, v25.8H, v5.H[0] // ...........e............................................................................... + mul v19.8H, v12.8H, v5.H[0] // ......e.................................................................................... 
+ sqrdmulh v29.8H, v12.8H, v5.H[1] // .....e..................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + srshr v8.8H, v0.8H, #11 // ................................................................*.......................... + srshr v14.8H, v14.8H, #11 // ...................................................................*....................... + ldr q0, [x1, #80] // .e......................................................................................... + // gap // ........................................................................................... + srshr v16.8H, v16.8H, #11 // ......................................................................*.................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v30.8H, v26.8H, v7.H[0] // ............e.............................................................................. + mls v19.8H, v29.8H, v7.H[0] // .......e................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + srshr v25.8H, v15.8H, #11 // .........................................................................*................. + mls v27.8H, v14.8H, v7.H[0] // ....................................................................*...................... + mls v20.8H, v8.8H, v7.H[0] // .................................................................*......................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v6.8H, v16.8H, v7.H[0] // .......................................................................*................... + sub v16.8H, v0.8H, v30.8H // .............e............................................................................. + add v12.8H, v11.8H, v19.8H // .........e................................................................................. + add v15.8H, v0.8H, v30.8H // ..............e............................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v8.8H, v16.8H, v5.H[4] // .....................e..................................................................... + sqrdmulh v26.8H, v16.8H, v5.H[5] // ....................e...................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + sqrdmulh v1.8H, v15.8H, v5.H[3] // ...............e........................................................................... + mul v15.8H, v15.8H, v5.H[2] // ................e.......................................................................... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + sub v21.8H, v11.8H, v19.8H // ........e.................................................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x27, v27.d[1] // ..............................................................................*............ + mls v8.8H, v26.8H, v7.H[0] // ......................e.................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + umov x15, v20.d[0] // ...........................................................................*............... + mls v15.8H, v1.8H, v7.H[0] // .................e......................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v2.8H, v25.8H, v7.H[0] // ..........................................................................*................ + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x27, [x1, #48] // .........................................................................................*. 
+ add v17.8H, v21.8H, v8.8H // ........................e.................................................................. + // gap // ........................................................................................... + sub v16.8H, v21.8H, v8.8H // .......................e................................................................... + sub v28.8H, v12.8H, v15.8H // ..................e........................................................................ + add v24.8H, v12.8H, v15.8H // ...................e....................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v26.4S, v17.4S, v16.4S // ...........................e............................................................... + str x15, [x1], #( 16*4) // ...................................................................................*....... + trn2 v15.4S, v17.4S, v16.4S // ............................e.............................................................. + // gap // ........................................................................................... + umov x19, v20.d[1] // ............................................................................*.............. + trn2 v11.4S, v24.4S, v28.4S // ..........................e................................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + trn1 v14.4S, v24.4S, v28.4S // .........................e................................................................. + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + // gap // ........................................................................................... + umov x28, v27.d[0] // .............................................................................*............. + trn2 v23.2D, v11.2D, v15.2D // ..............................e............................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + trn2 v10.2D, v14.2D, v26.2D // .............................e............................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x19, [x1, #-32] // .......................................................................................*... + sqrdmulh v16.8H, v23.8H, v31.8H // ............................................e.............................................. + mul v28.8H, v23.8H, v22.8H // .............................................e............................................. + // gap // ........................................................................................... + sqrdmulh v12.8H, v10.8H, v31.8H // .......................................e................................................... + umov x27, v6.d[1] // ................................................................................*.......... + // gap // ........................................................................................... 
+ // gap // ........................................................................................... + umov x11, v2.d[0] // .................................................................................*......... + str x28, [x1, #-48] // .....................................................................................*..... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v28.8H, v16.8H, v7.H[0] // ..............................................e............................................ + // gap // ........................................................................................... + mul v0.8H, v10.8H, v22.8H // ........................................e.................................................. + trn1 v3.2D, v11.2D, v15.2D // ................................e.......................................................... + ldr q11, [x4, #-32] // .....................................e..................................................... + // gap // ........................................................................................... + str x27, [x1, #-24] // ........................................................................................*.. + umov x27, v6.d[0] // ...............................................................................*........... + // gap // ........................................................................................... + // gap // ........................................................................................... 
+ str x11, [x1, #-40] // ......................................................................................*.... + sub v21.8H, v3.8H, v28.8H // ...............................................e........................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v6.8H, v3.8H, v28.8H // ................................................e.......................................... + mls v0.8H, v12.8H, v7.H[0] // .........................................e................................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + mul v15.8H, v21.8H, v11.8H // .......................................................e................................... + sqrdmulh v16.8H, v21.8H, v9.8H // ......................................................e.................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + str x27, [x1, #-56] // ....................................................................................*...... + mul v27.8H, v6.8H, v4.8H // ..................................................e........................................ + sqrdmulh v6.8H, v6.8H, v13.8H // .................................................e......................................... + // gap // ........................................................................................... + umov x27, v2.d[1] // ..................................................................................*........ 
+ trn1 v2.2D, v14.2D, v26.2D // ...............................e........................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v15.8H, v16.8H, v7.H[0] // ........................................................e.................................. + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + mls v27.8H, v6.8H, v7.H[0] // ...................................................e....................................... + sub v16.8H, v2.8H, v0.8H // ..........................................e................................................ + // gap // ........................................................................................... + // gap // ........................................................................................... + add v26.8H, v2.8H, v0.8H // ...........................................e............................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + // gap // ........................................................................................... + add v24.8H, v16.8H, v15.8H // ..........................................................e................................ + sub v16.8H, v16.8H, v15.8H // .........................................................e................................. 
+ str x27, [x1, #-8] // ..........................................................................................* + // gap // ........................................................................................... - // original source code - // ldr q8, [x1, #(16*0)] // ..................e......................................................................|...................e................................................................... - // ldr q9, [x1, #(16*1)] // .................e.......................................................................|..................e.................................................................... - // ldr q10, [x1, #(16*2)] // ..e......................................................................................|...e................................................................................... - // ldr q11, [x1, #(16*3)] // .e.......................................................................................|..e.................................................................................... - // ldr q0, [x3], #16 // e........................................................................................|.e..................................................................................... - // mul v24.8h, v10.8h, v0.h[0] // ........................e................................................................|.........................e............................................................. - // sqrdmulh v10.8h, v10.8h, v0.h[1] // ..........................e..............................................................|...........................e........................................................... - // mls v24.8h, v10.8h, v7.h[0] // ...............................e.........................................................|................................e...................................................... 
- // sub v10.8h, v8.8h, v24.8h // ......................................e..................................................|.......................................e............................................... - // add v8.8h, v8.8h, v24.8h // ..........................................e..............................................|...........................................e........................................... - // mul v24.8h, v11.8h, v0.h[0] // ................e........................................................................|.................e..................................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[1] // ...............e.........................................................................|................e...................................................................... - // mls v24.8h, v11.8h, v7.h[0] // .......................e.................................................................|........................e.............................................................. - // sub v11.8h, v9.8h, v24.8h // ..............................e..........................................................|...............................e....................................................... - // add v9.8h, v9.8h, v24.8h // ................................e........................................................|.................................e..................................................... - // mul v24.8h, v9.8h, v0.h[2] // ....................................e....................................................|.....................................e................................................. - // sqrdmulh v9.8h, v9.8h, v0.h[3] // ...................................e.....................................................|....................................e.................................................. 
- // mls v24.8h, v9.8h, v7.h[0] // ........................................e................................................|.........................................e............................................. - // sub v9.8h, v8.8h, v24.8h // ..............................................e..........................................|...............................................e....................................... - // add v8.8h, v8.8h, v24.8h // ...............................................e.........................................|................................................e...................................... - // mul v24.8h, v11.8h, v0.h[4] // ..................................e......................................................|...................................e................................................... - // sqrdmulh v11.8h, v11.8h, v0.h[5] // .................................e.......................................................|..................................e.................................................... - // mls v24.8h, v11.8h, v7.h[0] // .......................................e.................................................|........................................e.............................................. - // sub v11.8h, v10.8h, v24.8h // .............................................e...........................................|..............................................e........................................ - // add v10.8h, v10.8h, v24.8h // ............................................e............................................|.............................................e......................................... - // trn1 v25.4s, v8.4s, v9.4s // .....................................................e...................................|......................................................e................................ 
- // trn2 v26.4s, v8.4s, v9.4s // ...................................................e.....................................|....................................................e.................................. - // trn1 v27.4s, v10.4s, v11.4s // ....................................................e....................................|.....................................................e................................. - // trn2 v28.4s, v10.4s, v11.4s // ..................................................e......................................|...................................................e................................... - // trn2 v10.2d, v25.2d, v27.2d // ..........................................................e..............................|...........................................................e........................... - // trn2 v11.2d, v26.2d, v28.2d // .......................................................e.................................|........................................................e.............................. - // trn1 v8.2d, v25.2d, v27.2d // .........................................................e...............................|..........................................................e............................ - // trn1 v9.2d, v26.2d, v28.2d // ......................................................e..................................|.......................................................e............................... - // ldr q0, [x4], #(6*16) // ............e............................................................................|.............e......................................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ..............e..........................................................................|...............e....................................................................... 
- // ldr q1, [x4, #(-6*16 + 2*16)] // .........e...............................................................................|..........e............................................................................ - // ldr q5, [x4, #(-6*16 + 3*16)] // ...........e.............................................................................|............e.......................................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ........e................................................................................|.........e............................................................................. - // ldr q6, [x4, #(-6*16 + 5*16)] // .....e...................................................................................|......e................................................................................ - // mul v24.8h, v10.8h, v0.8h // ...............................................................e.........................|................................................................e...................... - // sqrdmulh v10.8h, v10.8h, v4.8h // ..............................................................e..........................|...............................................................e....................... - // mls v24.8h, v10.8h, v7.h[0] // ...................................................................e.....................|....................................................................e.................. - // sub v10.8h, v8.8h, v24.8h // ..................................................................................e......|...................................................................................e... - // add v8.8h, v8.8h, v24.8h // ..............................................................................e..........|...............................................................................e....... 
- // mul v24.8h, v11.8h, v0.8h // ...........................................................e.............................|............................................................e.......................... - // sqrdmulh v11.8h, v11.8h, v4.8h // ............................................................e............................|.............................................................e......................... - // mls v24.8h, v11.8h, v7.h[0] // .................................................................e.......................|..................................................................e.................... - // sub v11.8h, v9.8h, v24.8h // ........................................................................e................|.........................................................................e............. - // add v9.8h, v9.8h, v24.8h // ......................................................................e..................|.......................................................................e............... - // mul v24.8h, v9.8h, v1.8h // ...........................................................................e.............|............................................................................e.......... - // sqrdmulh v9.8h, v9.8h, v5.8h // ..........................................................................e..............|...........................................................................e........... - // mls v24.8h, v9.8h, v7.h[0] // .................................................................................e.......|..................................................................................e.... - // sub v9.8h, v8.8h, v24.8h // ....................................................................................e....|.....................................................................................e. 
- // add v8.8h, v8.8h, v24.8h // ......................................................................................e..|....................................................................................... - // mul v24.8h, v11.8h, v2.8h // ............................................................................e............|.............................................................................e......... - // sqrdmulh v11.8h, v11.8h, v6.8h // .............................................................................e...........|..............................................................................e........ - // mls v24.8h, v11.8h, v7.h[0] // ...................................................................................e.....|....................................................................................e.. - // sub v11.8h, v10.8h, v24.8h // ........................................................................................e|....................................................................................... - // add v10.8h, v10.8h, v24.8h // .......................................................................................e.|....................................................................................... - // trn1 v25.4s, v8.4s, v9.4s // .........................................................................................*....................................................................................... - // trn2 v26.4s, v8.4s, v9.4s // .........................................................................................|*...................................................................................... - // trn1 v27.4s, v10.4s, v11.4s // ....*....................................................................................|.....*................................................................................. 
- // trn2 v28.4s, v10.4s, v11.4s // ...*.....................................................................................|....*.................................................................................. - // sqdmulh v24.8h, v25.8h, v7.h[1] // .......*.................................................................................|........*.............................................................................. - // srshr v24.8h, v24.8h, #11 // ....................*....................................................................|.....................*................................................................. - // mls v25.8h, v24.8h, v7.h[0] // .............................*...........................................................|..............................*........................................................ - // sqdmulh v24.8h, v26.8h, v7.h[1] // ......*..................................................................................|.......*............................................................................... - // srshr v24.8h, v24.8h, #11 // ...................*.....................................................................|....................*.................................................................. - // mls v26.8h, v24.8h, v7.h[0] // .........................*...............................................................|..........................*............................................................ - // sqdmulh v24.8h, v27.8h, v7.h[1] // .............*...........................................................................|..............*........................................................................ - // srshr v24.8h, v24.8h, #11 // ......................*..................................................................|.......................*............................................................... 
- // mls v27.8h, v24.8h, v7.h[0] // ............................*............................................................|.............................*......................................................... - // sqdmulh v24.8h, v28.8h, v7.h[1] // ..........*..............................................................................|...........*........................................................................... - // srshr v24.8h, v24.8h, #11 // .....................*...................................................................|......................*................................................................ - // mls v28.8h, v24.8h, v7.h[0] // ...........................*.............................................................|............................*.......................................................... - // umov x10, v25.d[0] // .....................................*...................................................|......................................*................................................ - // umov x11, v25.d[1] // ................................................................*........................|.................................................................*..................... - // umov x12, v26.d[0] // ...............................................................................*.........|................................................................................*...... - // umov x13, v26.d[1] // .....................................................................*...................|......................................................................*................ - // umov x14, v27.d[0] // ..................................................................*......................|...................................................................*................... 
- // umov x15, v27.d[1] // .........................................*...............................................|..........................................*............................................ - // umov x16, v28.d[0] // .................................................*.......................................|..................................................*.................................... - // umov x17, v28.d[1] // .............................................................*...........................|..............................................................*........................ - // str x10, [x1], #( 16*4) // ...........................................*.............................................|............................................*.......................................... - // str x14, [x1, #(-16*4 + 8*1)] // .........................................................................*...............|..........................................................................*............ - // str x12, [x1, #(-16*4 + 8*2)] // .....................................................................................*...|......................................................................................* - // str x16, [x1, #(-16*4 + 8*3)] // ........................................................*................................|.........................................................*............................. - // str x11, [x1, #(-16*4 + 8*4)] // .......................................................................*.................|........................................................................*.............. - // str x15, [x1, #(-16*4 + 8*5)] // ................................................*........................................|.................................................*..................................... 
- // str x13, [x1, #(-16*4 + 8*6)] // ................................................................................*........|.................................................................................*..... - // str x17, [x1, #(-16*4 + 8*7)] // ....................................................................*....................|.....................................................................*................. + // ---------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---- + // ldr q8, [x1, #(16*0)] // ..........e..............................................................................'...........~.............................................................................. + // ldr q9, [x1, #(16*1)] // .......................e.................................................................'........................~................................................................. + // ldr q10, [x1, #(16*2)] // .e.......................................................................................'..~....................................................................................... + // ldr q11, [x1, #(16*3)] // .....e...................................................................................'......~................................................................................... + // ldr q0, [x3], #16 // ....e....................................................................................'.....~.................................................................................... 
+ // sqrdmulh v27.8h, v10.8h, v0.h[1] // ....................e....................................................................'.....................~.................................................................... + // mul v24.8h, v10.8h, v0.h[0] // ...................e.....................................................................'....................~..................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................e..............................................................'...........................~.............................................................. + // sub v10.8h, v8.8h, v24.8h // ......................................e..................................................'.......................................~.................................................. + // add v8.8h, v8.8h, v24.8h // ................................e........................................................'.................................~........................................................ + // sqrdmulh v27.8h, v11.8h, v0.h[1] // .................e.......................................................................'..................~....................................................................... + // mul v24.8h, v11.8h, v0.h[0] // ..................e......................................................................'...................~...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // .........................e...............................................................'..........................~............................................................... + // sub v11.8h, v9.8h, v24.8h // ...............................e.........................................................'................................~......................................................... 
+ // add v9.8h, v9.8h, v24.8h // .................................e.......................................................'..................................~....................................................... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ....................................e....................................................'.....................................~.................................................... + // mul v24.8h, v9.8h, v0.h[2] // .....................................e...................................................'......................................~................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........................................e..............................................'...........................................~.............................................. + // sub v9.8h, v8.8h, v24.8h // ...............................................e.........................................'................................................~......................................... + // add v8.8h, v8.8h, v24.8h // ................................................e........................................'.................................................~........................................ + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ...................................e.....................................................'....................................~..................................................... + // mul v24.8h, v11.8h, v0.h[4] // ..................................e......................................................'...................................~...................................................... + // mls v24.8h, v27.8h, v7.h[0] // ........................................e................................................'.........................................~................................................ 
+ // sub v11.8h, v10.8h, v24.8h // ..............................................e..........................................'...............................................~.......................................... + // add v10.8h, v10.8h, v24.8h // .............................................e...........................................'..............................................~........................................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................e..................................'.......................................................~.................................. + // trn2 v26.4s, v8.4s, v9.4s // .....................................................e...................................'......................................................~................................... + // trn1 v27.4s, v10.4s, v11.4s // .................................................e.......................................'..................................................~....................................... + // trn2 v28.4s, v10.4s, v11.4s // ...................................................e.....................................'....................................................~..................................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................................e...............................'..........................................................~............................... + // trn2 v11.2d, v26.2d, v28.2d // ........................................................e................................'.........................................................~................................ + // trn1 v8.2d, v25.2d, v27.2d // .................................................................................e.......'..................................................................................~....... 
+ // trn1 v9.2d, v26.2d, v28.2d // ...................................................................e.....................'....................................................................~..................... + // ldr q0, [ x4], #(6*16) // .........e...............................................................................'..........~............................................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // ......e..................................................................................'.......~.................................................................................. + // ldr q1, [ x4, #(-6*16 + 2*16)] // ..............e..........................................................................'...............~.......................................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ............e............................................................................'.............~............................................................................ + // ldr q2, [ x4, #(-6*16 + 4*16)] // ....................................................................e....................'.....................................................................~.................... + // ldr q6, [x4, #(-6*16 + 5*16)] // e........................................................................................'.~........................................................................................ + // sqrdmulh v27.8h, v10.8h, v4.8h // .............................................................e...........................'..............................................................~........................... + // mul v24.8h, v10.8h, v0.8h // ..................................................................e......................'...................................................................~...................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ..........................................................................e..............'...........................................................................~.............. + // sub v10.8h, v8.8h, v24.8h // ....................................................................................e....'.....................................................................................~.... + // add v8.8h, v8.8h, v24.8h // .....................................................................................e...'......................................................................................~... + // sqrdmulh v27.8h, v11.8h, v4.8h // ...........................................................e.............................'............................................................~............................. + // mul v24.8h, v11.8h, v0.8h // ............................................................e............................'.............................................................~............................ + // mls v24.8h, v27.8h, v7.h[0] // .................................................................e.......................'..................................................................~....................... + // sub v11.8h, v9.8h, v24.8h // ........................................................................e................'.........................................................................~................ + // add v9.8h, v9.8h, v24.8h // .........................................................................e...............'..........................................................................~............... + // sqrdmulh v27.8h, v9.8h, v5.8h // ...............................................................................e.........'................................................................................~......... 
+ // mul v24.8h, v9.8h, v1.8h // ..............................................................................e..........'...............................................................................~.......... + // mls v24.8h, v27.8h, v7.h[0] // ...................................................................................e.....'....................................................................................~..... + // sub v9.8h, v8.8h, v24.8h // .........................................................................................'*......................................................................................... + // add v8.8h, v8.8h, v24.8h // .........................................................................................*.......................................................................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ............................................................................e............'.............................................................................~............ + // mul v24.8h, v11.8h, v2.8h // ...........................................................................e.............'............................................................................~............. + // mls v24.8h, v27.8h, v7.h[0] // ..................................................................................e......'...................................................................................~...... + // sub v11.8h, v10.8h, v24.8h // .......................................................................................e.'........................................................................................~. + // add v10.8h, v10.8h, v24.8h // ......................................................................................e..'.......................................................................................~.. 
+ // trn1 v25.4s, v8.4s, v9.4s // ........~................................................................................'.........*................................................................................ + // trn2 v26.4s, v8.4s, v9.4s // .......~.................................................................................'........*................................................................................. + // trn1 v27.4s, v10.4s, v11.4s // ...~.....................................................................................'....*..................................................................................... + // trn2 v28.4s, v10.4s, v11.4s // ..~......................................................................................'...*...................................................................................... + // sqdmulh v24.8h, v25.8h, v7.h[1] // ...............~.........................................................................'................*......................................................................... + // srshr v24.8h, v24.8h, #11 // .....................~...................................................................'......................*................................................................... + // mls v25.8h, v24.8h, v7.h[0] // .............................~...........................................................'..............................*........................................................... + // sqdmulh v24.8h, v26.8h, v7.h[1] // ................~........................................................................'.................*........................................................................ + // srshr v24.8h, v24.8h, #11 // ......................~..................................................................'.......................*.................................................................. 
+ // mls v26.8h, v24.8h, v7.h[0] // ............................~............................................................'.............................*............................................................ + // sqdmulh v24.8h, v27.8h, v7.h[1] // ...........~.............................................................................'............*............................................................................. + // srshr v24.8h, v24.8h, #11 // ........................~................................................................'.........................*................................................................ + // mls v27.8h, v24.8h, v7.h[0] // ..............................~..........................................................'...............................*.......................................................... + // sqdmulh v24.8h, v28.8h, v7.h[1] // .............~...........................................................................'..............*........................................................................... + // srshr v24.8h, v24.8h, #11 // ...........................~.............................................................'............................*............................................................. + // mls v28.8h, v24.8h, v7.h[0] // ...........................................~.............................................'............................................*............................................. + // umov x10, v25.d[0] // .........................................~...............................................'..........................................*............................................... + // umov x11, v25.d[1] // ....................................................~....................................'.....................................................*.................................... 
+ // umov x12, v26.d[0] // .......................................................~.................................'........................................................*................................. + // umov x13, v26.d[1] // .......................................~.................................................'........................................*................................................. + // umov x14, v27.d[0] // ......................................................................~..................'.......................................................................*.................. + // umov x15, v27.d[1] // ..............................................................~..........................'...............................................................*.......................... + // umov x16, v28.d[0] // ...............................................................~.........................'................................................................*......................... + // umov x17, v28.d[1] // ................................................................................~........'.................................................................................*........ + // str x10, [x1], #( 16*4) // ..................................................~......................................'...................................................*...................................... + // str x14, [x1, #(-16*4 + 8*1)] // .............................................................................~...........'..............................................................................*........... + // str x12, [x1, #(-16*4 + 8*2)] // ................................................................~........................'.................................................................*........................ 
+ // str x16, [x1, #(-16*4 + 8*3)] // .......................................................................~.................'........................................................................*................. + // str x11, [x1, #(-16*4 + 8*4)] // ..........................................................~..............................'...........................................................*.............................. + // str x15, [x1, #(-16*4 + 8*5)] // .....................................................................~...................'......................................................................*................... + // str x13, [x1, #(-16*4 + 8*6)] // ............................................~............................................'.............................................*............................................ + // str x17, [x1, #(-16*4 + 8*7)] // ........................................................................................~'.........................................................................................* sub count, count, #1 cbnz count, layer4567_start - trn1 v20.4S, v2.4S, v22.4S // *............................... - trn2 v2.4S, v2.4S, v22.4S // .*.............................. - // gap // ................................ - // gap // ................................ - trn2 v22.4S, v11.4S, v9.4S // ..*............................. - trn1 v9.4S, v11.4S, v9.4S // ...*............................ - // gap // ................................ - // gap // ................................ - sqdmulh v11.8H, v2.8H, v7.H[1] // ....*........................... - sqdmulh v27.8H, v20.8H, v7.H[1] // .....*.......................... - // gap // ................................ - // gap // ................................ - sqdmulh v18.8H, v22.8H, v7.H[1] // ......*......................... - sqdmulh v30.8H, v9.8H, v7.H[1] // .......*........................ - // gap // ................................ 
- // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - srshr v11.8H, v11.8H, #11 // ........*....................... - srshr v27.8H, v27.8H, #11 // .........*...................... - // gap // ................................ - // gap // ................................ - srshr v18.8H, v18.8H, #11 // ..........*..................... - srshr v30.8H, v30.8H, #11 // ...........*.................... - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - mls v20.8H, v27.8H, v7.H[0] // ...............*................ - mls v2.8H, v11.8H, v7.H[0] // ............*................... - // gap // ................................ - // gap // ................................ - mls v22.8H, v18.8H, v7.H[0] // .............*.................. - mls v9.8H, v30.8H, v7.H[0] // ..............*................. - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - umov x15, v20.d[0] // ................*............... - umov x10, v20.d[1] // .......................*........ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - umov x19, v9.d[1] // .................*.............. - umov x13, v22.d[0] // ....................*........... 
- // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - // gap // ................................ - umov x12, v22.d[1] // ......................*......... - umov x25, v9.d[0] // ........................*....... - str x15, [x1], #( 16*4) // ..................*............. - // gap // ................................ - str x10, [x1, #-32] // ...........................*.... - // gap // ................................ - // gap // ................................ - // gap // ................................ - umov x15, v2.d[1] // ..........................*..... - umov x10, v2.d[0] // .............................*.. - str x19, [x1, #-24] // ...................*............ - // gap // ................................ - str x13, [x1, #-40] // .....................*.......... - // gap // ................................ - // gap // ................................ - // gap // ................................ - str x12, [x1, #-8] // .........................*...... - // gap // ................................ - // gap // ................................ - // gap // ................................ - str x25, [x1, #-56] // ............................*... - // gap // ................................ - // gap // ................................ - // gap // ................................ - str x15, [x1, #-16] // ..............................*. - // gap // ................................ - // gap // ................................ - // gap // ................................ - str x10, [x1, #-48] // ...............................* - // gap // ................................ - // gap // ................................ - // gap // ................................ 
+ // Instructions: 34 + // Expected cycles: 24 + // Expected IPC: 1.42 + // + // Cycle bound: 24.0 + // IPC bound: 1.42 + // + // Wall time: 0.70s + // User time: 0.70s + // + // ------- original position -------> + // 0 25 + // |------------------------|-------- + add v15.8H, v26.8H, v27.8H // *................................. + sub v27.8H, v26.8H, v27.8H // .*................................ + // gap // .................................. + // gap // .................................. + trn2 v14.4S, v24.4S, v16.4S // ..*............................... + trn1 v16.4S, v24.4S, v16.4S // ...*.............................. + // gap // .................................. + // gap // .................................. + trn2 v6.4S, v15.4S, v27.4S // ....*............................. + trn1 v15.4S, v15.4S, v27.4S // .....*............................ + // gap // .................................. + // gap // .................................. + sqdmulh v27.8H, v16.8H, v7.H[1] // ......*........................... + sqdmulh v26.8H, v14.8H, v7.H[1] // .......*.......................... + // gap // .................................. + // gap // .................................. + sqdmulh v0.8H, v15.8H, v7.H[1] // ........*......................... + sqdmulh v11.8H, v6.8H, v7.H[1] // .........*........................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + srshr v27.8H, v27.8H, #11 // ............*..................... + srshr v26.8H, v26.8H, #11 // .............*.................... + // gap // .................................. + // gap // .................................. + srshr v0.8H, v0.8H, #11 // ..........*....................... + srshr v11.8H, v11.8H, #11 // ...........*...................... 
+ // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + mls v14.8H, v26.8H, v7.H[0] // ...................*.............. + mls v16.8H, v27.8H, v7.H[0] // ................*................. + // gap // .................................. + // gap // .................................. + mls v6.8H, v11.8H, v7.H[0] // ..............*................... + mls v15.8H, v0.8H, v7.H[0] // ...............*.................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x27, v16.d[1] // .........................*........ + umov x19, v14.d[0] // ..........................*....... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x13, v6.d[1] // .................*................ + umov x12, v15.d[0] // ..................*............... + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x14, v6.d[0] // .......................*.......... + umov x15, v15.d[1] // ......................*........... + str x27, [x1, #40] // ............................*..... + // gap // .................................. 
+ str x19, [x1, #24] // ..............................*... + // gap // .................................. + // gap // .................................. + // gap // .................................. + umov x27, v14.d[1] // ................................*. + umov x19, v16.d[0] // .............................*.... + str x13, [x1, #48] // ....................*............. + // gap // .................................. + str x12, [x1], #( 16*4) // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x15, [x1, #-32] // ........................*......... + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x14, [x1, #-48] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x19, [x1, #-56] // ...............................*.. + // gap // .................................. + // gap // .................................. + // gap // .................................. + str x27, [x1, #-8] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. - // original source code - // trn1 v27.4S, v2.4S, v22.4S // *............................... - // trn2 v18.4S, v2.4S, v22.4S // .*.............................. - // trn2 v5.4S, v11.4S, v9.4S // ..*............................. - // trn1 v22.4S, v11.4S, v9.4S // ...*............................ - // sqdmulh v9.8H, v18.8H, v7.H[1] // ....*........................... - // sqdmulh v2.8H, v27.8H, v7.H[1] // .....*.......................... - // sqdmulh v23.8H, v5.8H, v7.H[1] // ......*......................... 
- // sqdmulh v11.8H, v22.8H, v7.H[1] // .......*........................ - // srshr v20.8H, v9.8H, #11 // ........*....................... - // srshr v2.8H, v2.8H, #11 // .........*...................... - // srshr v9.8H, v23.8H, #11 // ..........*..................... - // srshr v25.8H, v11.8H, #11 // ...........*.................... - // mls v18.8H, v20.8H, v7.H[0] // .............*.................. - // mls v5.8H, v9.8H, v7.H[0] // ..............*................. - // mls v22.8H, v25.8H, v7.H[0] // ...............*................ - // mls v27.8H, v2.8H, v7.H[0] // ............*................... - // umov x25, v27.d[0] // ................*............... - // umov x15, v22.d[1] // ..................*............. - // str x25, [x1], #( 16*4) // ......................*......... - // str x15, [x1, #-24] // ..........................*..... - // umov x15, v5.d[0] // ...................*............ - // str x15, [x1, #-40] // ...........................*.... - // umov x10, v5.d[1] // ....................*........... - // umov x15, v27.d[1] // .................*.............. - // umov x19, v22.d[0] // .....................*.......... - // str x10, [x1, #-8] // ............................*... - // umov x10, v18.d[1] // ........................*....... - // str x15, [x1, #-32] // .......................*........ - // str x19, [x1, #-56] // .............................*.. - // umov x15, v18.d[0] // .........................*...... - // str x10, [x1, #-16] // ..............................*. - // str x15, [x1, #-48] // ...............................* + // --------- new position ----------> + // 0 25 + // |------------------------|-------- + // add v0.8H, v26.8H, v27.8H // *................................. + // sub v11.8H, v26.8H, v27.8H // .*................................ + // trn2 v2.4S, v24.4S, v16.4S // ..*............................... + // trn1 v6.4S, v24.4S, v16.4S // ...*.............................. 
+ // trn2 v27.4S, v0.4S, v11.4S // ....*............................. + // trn1 v20.4S, v0.4S, v11.4S // .....*............................ + // sqdmulh v16.8H, v6.8H, v7.H[1] // ......*........................... + // sqdmulh v15.8H, v2.8H, v7.H[1] // .......*.......................... + // sqdmulh v0.8H, v20.8H, v7.H[1] // ........*......................... + // sqdmulh v14.8H, v27.8H, v7.H[1] // .........*........................ + // srshr v8.8H, v0.8H, #11 // ............*..................... + // srshr v14.8H, v14.8H, #11 // .............*.................... + // srshr v16.8H, v16.8H, #11 // ..........*....................... + // srshr v25.8H, v15.8H, #11 // ...........*...................... + // mls v27.8H, v14.8H, v7.H[0] // ................*................. + // mls v20.8H, v8.8H, v7.H[0] // .................*................ + // mls v6.8H, v16.8H, v7.H[0] // ...............*.................. + // umov x27, v27.d[1] // ....................*............. + // umov x15, v20.d[0] // .....................*............ + // mls v2.8H, v25.8H, v7.H[0] // ..............*................... + // str x27, [x1, #48] // ............................*..... + // str x15, [x1], #( 16*4) // .............................*.... + // umov x19, v20.d[1] // .......................*.......... + // umov x28, v27.d[0] // ......................*........... + // str x19, [x1, #-32] // ..............................*... + // umov x27, v6.d[1] // ..................*............... + // umov x11, v2.d[0] // ...................*.............. + // str x28, [x1, #-48] // ...............................*.. + // str x27, [x1, #-24] // ........................*......... + // umov x27, v6.d[0] // ...........................*...... + // str x11, [x1, #-40] // .........................*........ + // str x27, [x1, #-56] // ................................*. + // umov x27, v2.d[1] // ..........................*....... + // str x27, [x1, #-8] // .................................* pop_stack